harness-evolver 0.2.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/install.js CHANGED
@@ -1,26 +1,53 @@
  #!/usr/bin/env node
  /**
   * Harness Evolver installer.
- * Detects Claude Code, copies skills/agents/tools to the right locations.
+ * Interactive setup with runtime selection, global/local choice.
   *
   * Usage: npx harness-evolver@latest
   */
 
  const fs = require("fs");
  const path = require("path");
+ const readline = require("readline");
  const { execSync } = require("child_process");
 
+ const VERSION = require("../package.json").version;
  const PLUGIN_ROOT = path.resolve(__dirname, "..");
  const HOME = process.env.HOME || process.env.USERPROFILE;
 
- const CLAUDE_DIR = path.join(HOME, ".claude");
- const COMMANDS_DIR = path.join(CLAUDE_DIR, "commands", "harness-evolver");
- const AGENTS_DIR = path.join(CLAUDE_DIR, "agents");
- const TOOLS_DIR = path.join(HOME, ".harness-evolver", "tools");
- const EXAMPLES_DIR = path.join(HOME, ".harness-evolver", "examples");
-
- function log(msg) {
-   console.log(`  ${msg}`);
+ // ANSI colors
+ const CYAN = "\x1b[36m";
+ const GREEN = "\x1b[32m";
+ const YELLOW = "\x1b[33m";
+ const RED = "\x1b[31m";
+ const DIM = "\x1b[2m";
+ const BOLD = "\x1b[1m";
+ const RESET = "\x1b[0m";
+
+ const LOGO = `
+ ${CYAN}██╗  ██╗ █████╗ ██████╗ ███╗  ██╗███████╗███████╗███████╗
+ ██║ ██║██╔══██╗██╔══██╗████╗ ██║██╔════╝██╔════╝██╔════╝
+ ███████║███████║██████╔╝██╔██╗ ██║█████╗ ███████╗███████╗
+ ██╔══██║██╔══██║██╔══██╗██║╚██╗██║██╔══╝ ╚════██║╚════██║
+ ██║ ██║██║ ██║██║ ██║██║ ╚████║███████╗███████║███████║
+ ╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═══╝╚══════╝╚══════╝╚══════╝
+ ${BOLD}███████╗██╗ ██╗ ██████╗ ██╗ ██╗ ██╗███████╗██████╗
+ ██╔════╝██║ ██║██╔═══██╗██║ ██║ ██║██╔════╝██╔══██╗
+ █████╗ ██║ ██║██║ ██║██║ ██║ ██║█████╗ ██████╔╝
+ ██╔══╝ ╚██╗ ██╔╝██║ ██║██║ ╚██╗ ██╔╝██╔══╝ ██╔══██╗
+ ███████╗ ╚████╔╝ ╚██████╔╝███████╗╚████╔╝ ███████╗██║ ██║
+ ╚══════╝ ╚═══╝ ╚═════╝ ╚══════╝ ╚═══╝ ╚══════╝╚═╝ ╚═╝${RESET}
+ `;
+
+ const RUNTIMES = [
+   { name: "Claude Code", dir: ".claude", detected: () => fs.existsSync(path.join(HOME, ".claude")) },
+   { name: "Cursor", dir: ".cursor", detected: () => fs.existsSync(path.join(HOME, ".cursor")) },
+   { name: "Codex", dir: ".codex", detected: () => fs.existsSync(path.join(HOME, ".codex")) },
+   { name: "Windsurf", dir: ".windsurf", detected: () => fs.existsSync(path.join(HOME, ".windsurf")) },
+ ];
+
+ function ask(rl, question) {
+   return new Promise((resolve) => rl.question(question, resolve));
  }
 
  function copyDir(src, dest) {
@@ -50,76 +77,152 @@ function checkPython() {
    }
  }
 
- function main() {
-   console.log("\n Harness Evolver v0.1.0\n");
+ function installForRuntime(runtimeDir, scope) {
+   const baseDir = scope === "local"
+     ? path.join(process.cwd(), runtimeDir)
+     : path.join(HOME, runtimeDir);
 
-   if (!checkPython()) {
-     console.error(" ERROR: python3 not found in PATH. Install Python 3.8+ first.");
-     process.exit(1);
-   }
-   log("\u2713 python3 found");
+   const commandsDir = path.join(baseDir, "commands", "harness-evolver");
+   const agentsDir = path.join(baseDir, "agents");
 
-   if (!fs.existsSync(CLAUDE_DIR)) {
-     console.error(` ERROR: Claude Code directory not found at ${CLAUDE_DIR}`);
-     console.error(" Install Claude Code first: https://claude.ai/code");
-     process.exit(1);
-   }
-   log("\u2713 Claude Code detected");
-
-   // Copy skills
+   // Skills
    const skillsSource = path.join(PLUGIN_ROOT, "skills");
    if (fs.existsSync(skillsSource)) {
      for (const skill of fs.readdirSync(skillsSource, { withFileTypes: true })) {
        if (skill.isDirectory()) {
-         const src = path.join(skillsSource, skill.name);
-         const dest = path.join(COMMANDS_DIR, skill.name);
-         copyDir(src, dest);
-         log(`  skill: ${skill.name}`);
+         copyDir(path.join(skillsSource, skill.name), path.join(commandsDir, skill.name));
+         console.log(`  ${GREEN}✓${RESET} Installed skill: ${skill.name}`);
        }
      }
    }
 
-   // Copy agents
+   // Agents
    const agentsSource = path.join(PLUGIN_ROOT, "agents");
    if (fs.existsSync(agentsSource)) {
-     fs.mkdirSync(AGENTS_DIR, { recursive: true });
+     fs.mkdirSync(agentsDir, { recursive: true });
      for (const agent of fs.readdirSync(agentsSource)) {
-       copyFile(
-         path.join(agentsSource, agent),
-         path.join(AGENTS_DIR, agent)
-       );
-       log(`  agent: ${agent}`);
+       copyFile(path.join(agentsSource, agent), path.join(agentsDir, agent));
+       console.log(`  ${GREEN}✓${RESET} Installed agent: ${agent}`);
      }
    }
+ }
 
-   // Copy tools
+ function installTools() {
+   const toolsDir = path.join(HOME, ".harness-evolver", "tools");
    const toolsSource = path.join(PLUGIN_ROOT, "tools");
    if (fs.existsSync(toolsSource)) {
-     fs.mkdirSync(TOOLS_DIR, { recursive: true });
+     fs.mkdirSync(toolsDir, { recursive: true });
      for (const tool of fs.readdirSync(toolsSource)) {
        if (tool.endsWith(".py")) {
-         copyFile(
-           path.join(toolsSource, tool),
-           path.join(TOOLS_DIR, tool)
-         );
-         log(`  tool: ${tool}`);
+         copyFile(path.join(toolsSource, tool), path.join(toolsDir, tool));
+         console.log(`  ${GREEN}✓${RESET} Installed tool: ${tool}`);
        }
      }
    }
+ }
 
-   // Copy examples
+ function installExamples() {
+   const examplesDir = path.join(HOME, ".harness-evolver", "examples");
    const examplesSource = path.join(PLUGIN_ROOT, "examples");
    if (fs.existsSync(examplesSource)) {
-     copyDir(examplesSource, EXAMPLES_DIR);
-     log("  examples: classifier");
+     copyDir(examplesSource, examplesDir);
+     console.log(`  ${GREEN}✓${RESET} Installed examples: classifier`);
+   }
+ }
+
+ async function main() {
+   console.log(LOGO);
+   console.log(`  ${DIM}Harness Evolver v${VERSION}${RESET}`);
+   console.log(`  ${DIM}Meta-Harness-style autonomous harness optimization${RESET}`);
+   console.log();
+
+   // Check python
+   if (!checkPython()) {
+     console.error(`  ${RED}ERROR:${RESET} python3 not found in PATH. Install Python 3.8+ first.`);
+     process.exit(1);
+   }
+   console.log(`  ${GREEN}✓${RESET} python3 found`);
+
+   // Detect runtimes
+   const available = RUNTIMES.filter((r) => r.detected());
+   if (available.length === 0) {
+     console.error(`\n  ${RED}ERROR:${RESET} No supported runtime detected.`);
+     console.error(`  Install Claude Code, Cursor, Codex, or Windsurf first.`);
+     process.exit(1);
    }
 
-   console.log("\n \u2713 Installed successfully!\n");
-   console.log(" Next steps:");
-   console.log(" 1. Copy an example: cp -r ~/.harness-evolver/examples/classifier ./my-project");
-   console.log(" 2. cd my-project");
-   console.log(" 3. /harness-evolve-init --harness harness.py --eval eval.py --tasks tasks/");
-   console.log(" 4. /harness-evolve --iterations 5\n");
+   const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
+
+   // Runtime selection
+   console.log(`\n  ${YELLOW}Which runtime(s) would you like to install for?${RESET}\n`);
+   available.forEach((r, i) => {
+     console.log(`    ${i + 1}) ${r.name.padEnd(14)} (~/${r.dir})`);
+   });
+   if (available.length > 1) {
+     console.log(`    ${available.length + 1}) All`);
+     console.log(`\n  ${DIM}Select multiple: 1,2 or 1 2${RESET}`);
+   }
+
+   const defaultChoice = "1";
+   const runtimeAnswer = await ask(rl, `\n  ${YELLOW}Choice [${defaultChoice}]:${RESET} `);
+   const runtimeInput = (runtimeAnswer.trim() || defaultChoice);
+
+   let selectedRuntimes;
+   if (runtimeInput === String(available.length + 1)) {
+     selectedRuntimes = available;
+   } else {
+     const indices = runtimeInput.split(/[,\s]+/).map((s) => parseInt(s, 10) - 1);
+     selectedRuntimes = indices
+       .filter((i) => i >= 0 && i < available.length)
+       .map((i) => available[i]);
+   }
+
+   if (selectedRuntimes.length === 0) {
+     selectedRuntimes = [available[0]];
+   }
+
+   // Scope selection
+   console.log(`\n  ${YELLOW}Where would you like to install?${RESET}\n`);
+   console.log(`    1) Global (~/${selectedRuntimes[0].dir}) - available in all projects`);
+   console.log(`    2) Local  (./${selectedRuntimes[0].dir}) - this project only`);
+
+   const scopeAnswer = await ask(rl, `\n  ${YELLOW}Choice [1]:${RESET} `);
+   const scope = (scopeAnswer.trim() === "2") ? "local" : "global";
+
+   console.log();
+
+   // Install for each selected runtime
+   for (const runtime of selectedRuntimes) {
+     const target = scope === "local" ? `./${runtime.dir}` : `~/${runtime.dir}`;
+     console.log(`  Installing for ${CYAN}${runtime.name}${RESET} to ${target}`);
+     console.log();
+     installForRuntime(runtime.dir, scope);
+   }
+
+   // Tools and examples are always global
+   installTools();
+   installExamples();
+
+   // Write version file
+   const versionPath = path.join(HOME, ".harness-evolver", "VERSION");
+   fs.mkdirSync(path.dirname(versionPath), { recursive: true });
+   fs.writeFileSync(versionPath, VERSION);
+   console.log(`  ${GREEN}✓${RESET} Wrote VERSION (${VERSION})`);
+
+   console.log(`\n  ${GREEN}Done!${RESET} Open a project in Claude Code and run ${CYAN}/harness-evolver:init${RESET}`);
+   console.log(`\n  ${DIM}Quick start with example:${RESET}`);
+   console.log(`    cp -r ~/.harness-evolver/examples/classifier ./my-project`);
+   console.log(`    cd my-project && claude`);
+   console.log(`    /harness-evolver:init`);
+   console.log(`    /harness-evolver:evolve`);
+
+   console.log(`\n  ${DIM}GitHub: https://github.com/raphaelchristi/harness-evolver${RESET}`);
+   console.log();
+
+   rl.close();
  }
 
- main();
+ main().catch((err) => {
+   console.error(`  ${RED}ERROR:${RESET} ${err.message}`);
+   process.exit(1);
+ });
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "harness-evolver",
-   "version": "0.2.1",
+   "version": "0.5.0",
    "description": "Meta-Harness-style autonomous harness optimization for Claude Code",
    "author": "Raphael Valdetaro Christi Cordeiro",
    "license": "MIT",
@@ -0,0 +1,73 @@
+ ---
+ name: compare
+ description: "Use when the user wants to compare two harness versions, understand what changed between iterations, see why one version scored better than another, or debug a regression."
+ argument-hint: "<vA> <vB>"
+ allowed-tools: [Read, Bash, Glob, Grep]
+ ---
+
+ # /harness-evolver:compare
+
+ Compare two harness versions side by side.
+
+ ## Arguments
+
+ - `vA` — first version (e.g., `v001`, `baseline`)
+ - `vB` — second version (e.g., `v003`)
+
+ If only one version is given, compare it against the current best.
+ If no versions are given, compare the two most recent.
+
+ ## What To Do
+
+ ### 1. Code Diff
+
+ ```bash
+ diff .harness-evolver/harnesses/{vA}/harness.py .harness-evolver/harnesses/{vB}/harness.py
+ ```
+
+ If config changed:
+ ```bash
+ diff .harness-evolver/harnesses/{vA}/config.json .harness-evolver/harnesses/{vB}/config.json
+ ```
+
+ ### 2. Score Comparison
+
+ ```bash
+ cat .harness-evolver/harnesses/{vA}/scores.json
+ cat .harness-evolver/harnesses/{vB}/scores.json
+ ```
+
+ Report: combined_score delta, per-task wins/losses.
+
+ ### 3. Per-Task Analysis
+
+ For tasks where scores diverge, show what each version produced:
+
+ ```bash
+ cat .harness-evolver/harnesses/{vA}/traces/task_{ID}/output.json
+ cat .harness-evolver/harnesses/{vB}/traces/task_{ID}/output.json
+ ```
+
+ ### 4. Proposal Context
+
+ ```bash
+ cat .harness-evolver/harnesses/{vB}/proposal.md
+ ```
+
+ Show what the proposer intended and whether the result matched expectations.
+
+ ## Report Format
+
+ ```
+ v001 (0.62) vs v003 (0.71) — +0.09 improvement
+
+ Code changes:
+ + Added few-shot examples (3 examples)
+ ~ Changed prompt template
+ - Removed retry logic
+
+ Per-task:
+ task_001: 1.0 → 1.0 (unchanged)
+ task_007: 0.0 → 1.0 (FIXED — was cardiac, now correctly classified)
+ task_008: 1.0 → 0.0 (REGRESSION — was neurological, now wrong)
+ ```
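
The score comparison in step 2 of the new `compare` skill is mechanical enough to script. A minimal sketch, assuming the `scores.json` layout the generated eval writes (a top-level `combined_score` plus a `per_task` map whose entries carry a `score` field) — not part of the package, just an illustration of what the skill asks the agent to compute:

```python
#!/usr/bin/env python3
"""Sketch: combined-score delta and per-task wins/losses for two versions."""
import json
import sys

def load(version):
    # scores.json layout assumed from the eval template shipped with init
    with open(f".harness-evolver/harnesses/{version}/scores.json") as f:
        return json.load(f)

a_name, b_name = sys.argv[1], sys.argv[2]  # e.g., v001 v003
a, b = load(a_name), load(b_name)

delta = b["combined_score"] - a["combined_score"]
print(f"{a_name} ({a['combined_score']:.2f}) vs {b_name} ({b['combined_score']:.2f}) — {delta:+.2f}")

for task_id in sorted(set(a["per_task"]) | set(b["per_task"])):
    sa = a["per_task"].get(task_id, {}).get("score", 0.0)
    sb = b["per_task"].get(task_id, {}).get("score", 0.0)
    tag = "unchanged" if sa == sb else ("FIXED" if sb > sa else "REGRESSION")
    print(f"  {task_id}: {sa} → {sb} ({tag})")
```
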
@@ -0,0 +1,53 @@
+ ---
+ name: deploy
+ description: "Use when the user wants to use the best evolved harness in their project, promote a version to production, copy the winning harness back, or is done evolving and wants to apply the result."
+ argument-hint: "[version]"
+ allowed-tools: [Read, Write, Bash, Glob]
+ ---
+
+ # /harness-evolver:deploy
+
+ Promote the best (or specified) harness version back to the user's project.
+
+ ## Arguments
+
+ - `version` — optional. If not given, deploys the best version from `summary.json`.
+
+ ## What To Do
+
+ ### 1. Identify Best Version
+
+ ```bash
+ python3 -c "import json; s=json.load(open('.harness-evolver/summary.json')); print(s['best']['version'], s['best']['combined_score'])"
+ ```
+
+ Or use the user-specified version.
+
+ ### 2. Show What Will Be Deployed
+
+ ```bash
+ cat .harness-evolver/harnesses/{version}/proposal.md
+ cat .harness-evolver/harnesses/{version}/scores.json
+ ```
+
+ Report: version, score, improvement over baseline, what changed.
+
+ ### 3. Ask for Confirmation
+
+ > Deploy `{version}` (score: {score}, +{delta} over baseline) to your project?
+ > This will copy `harness.py` and `config.json` to the project root.
+
+ ### 4. Copy Files
+
+ ```bash
+ cp .harness-evolver/harnesses/{version}/harness.py ./harness.py
+ cp .harness-evolver/harnesses/{version}/config.json ./config.json  # if exists
+ ```
+
+ If the original entry point had a different name (e.g., `graph.py`), ask the user where to put it.
+
+ ### 5. Report
+
+ - What was copied and where
+ - Score improvement: baseline → deployed version
+ - Suggest: review the diff before committing
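
Steps 1 and 4 of `deploy` condense into a few lines. A minimal sketch, assuming only the `summary.json` shape the one-liner above relies on (`best.version`, `best.combined_score`); the confirmation step is left to the agent:

```python
#!/usr/bin/env python3
"""Sketch: resolve the version to deploy and copy its files to the project root."""
import json
import shutil
import sys
from pathlib import Path

base = Path(".harness-evolver")
with open(base / "summary.json") as f:
    summary = json.load(f)

# Use the user-specified version, else fall back to the recorded best
version = sys.argv[1] if len(sys.argv) > 1 else summary["best"]["version"]
src = base / "harnesses" / version

print(f"Deploying {version} (recorded best: {summary['best']['combined_score']})")
shutil.copy(src / "harness.py", "harness.py")
if (src / "config.json").exists():  # config is optional
    shutil.copy(src / "config.json", "config.json")
```
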
@@ -0,0 +1,96 @@
+ ---
+ name: diagnose
+ description: "Use when the user wants to understand why a specific harness version failed, investigate a regression, analyze trace data, or debug a low score. Also use when the user says 'why did v003 fail' or 'what went wrong'."
+ argument-hint: "[version]"
+ allowed-tools: [Read, Bash, Glob, Grep]
+ ---
+
+ # /harness-evolver:diagnose
+
+ Deep analysis of a harness version's execution traces and scores.
+
+ ## Arguments
+
+ - `version` — version to diagnose (e.g., `v003`). If not given, diagnose the worst or most recent regression.
+
+ ## Resolve Tool Path
+
+ ```bash
+ TOOLS=$([ -d ".harness-evolver/tools" ] && echo ".harness-evolver/tools" || echo "$HOME/.harness-evolver/tools")
+ ```
+
+ ## What To Do
+
+ ### 1. Identify the Version
+
+ If not specified, find the worst or most recent regression:
+
+ ```bash
+ python3 $TOOLS/state.py show --base-dir .harness-evolver
+ cat .harness-evolver/summary.json
+ ```
+
+ ### 2. Score Breakdown
+
+ ```bash
+ cat .harness-evolver/harnesses/{version}/scores.json
+ ```
+
+ Identify which tasks failed (`score: 0.0`) and which passed.
+
+ ### 3. Trace Analysis (failed tasks)
+
+ For each failed task:
+
+ ```bash
+ cat .harness-evolver/harnesses/{version}/traces/{task_id}/input.json
+ cat .harness-evolver/harnesses/{version}/traces/{task_id}/output.json
+ ```
+
+ Look for patterns: wrong format? wrong category? empty output? crash?
+
+ ### 4. Error Search
+
+ ```bash
+ grep -r "error\|Error\|FAIL\|exception\|Traceback" .harness-evolver/harnesses/{version}/traces/
+ cat .harness-evolver/harnesses/{version}/traces/stderr.log
+ ```
+
+ ### 5. Compare with Parent
+
+ Read the proposal to find the parent version:
+
+ ```bash
+ cat .harness-evolver/harnesses/{version}/proposal.md
+ ```
+
+ Then diff:
+
+ ```bash
+ diff .harness-evolver/harnesses/{parent}/harness.py .harness-evolver/harnesses/{version}/harness.py
+ ```
+
+ ### 6. LangSmith (if available)
+
+ If `langsmith-cli` is installed and LangSmith is configured:
+
+ ```bash
+ langsmith-cli --json runs list --project harness-evolver-{version} --failed --fields id,name,error,inputs
+ langsmith-cli --json runs stats --project harness-evolver-{version}
+ ```
+
+ ### 7. Report
+
+ ```
+ Diagnosis: v003 (score: 0.31) — REGRESSION from v001 (0.62)
+
+ Root cause: Prompt template change broke JSON parsing
+ - 4/10 tasks returned malformed output
+ - stderr shows: json.JSONDecodeError on 4 tasks
+ - The change on line 42 removed the "Reply with ONLY..." instruction
+
+ Affected tasks: task_002, task_005, task_007, task_010
+ Unaffected tasks: task_001, task_003, task_004, task_006, task_008, task_009
+
+ Recommendation: Revert the prompt change, keep the retry logic from v002
+ ```
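
The failed-task triage in steps 2–3 of `diagnose` can also be scripted. A minimal sketch, assuming a per-task score of 0.0 marks a failure and that each task's raw output lands in `traces/{task_id}/output.json`, as the commands above imply:

```python
#!/usr/bin/env python3
"""Sketch: list failing tasks for a version and surface their raw outputs."""
import json
import sys
from pathlib import Path

version = sys.argv[1]  # e.g., v003
vdir = Path(f".harness-evolver/harnesses/{version}")
scores = json.loads((vdir / "scores.json").read_text())

failed = [tid for tid, r in scores["per_task"].items() if r.get("score", 0.0) == 0.0]
print(f"{version}: {len(failed)} failed task(s): {', '.join(failed)}")

for tid in failed:
    out = vdir / "traces" / tid / "output.json"
    if not out.exists():
        print(f"  {tid}: no output written (crash or timeout?)")
        continue
    try:
        print(f"  {tid}: output = {json.loads(out.read_text())}")
    except json.JSONDecodeError as e:  # malformed output is itself a clue
        print(f"  {tid}: malformed JSON ({e})")
```
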
@@ -0,0 +1,94 @@
+ ---
+ name: evolve
+ description: "Use when the user wants to run the optimization loop, improve harness performance, evolve the harness, or iterate on harness quality. Requires .harness-evolver/ to exist (run harness-evolver:init first)."
+ argument-hint: "[--iterations N]"
+ allowed-tools: [Read, Write, Edit, Bash, Glob, Grep, Agent]
+ ---
+
+ # /harness-evolver:evolve
+
+ Run the autonomous propose-evaluate-iterate loop.
+
+ ## Prerequisites
+
+ `.harness-evolver/summary.json` must exist. If not, tell the user to run `harness-evolver:init`.
+
+ ## Resolve Tool Path
+
+ ```bash
+ TOOLS=$([ -d ".harness-evolver/tools" ] && echo ".harness-evolver/tools" || echo "$HOME/.harness-evolver/tools")
+ ```
+
+ ## Parse Arguments
+
+ - `--iterations N` (default: 10)
+ - Read `config.json` for `evolution.stagnation_limit` (default: 3) and `evolution.target_score`
+
+ ## The Loop
+
+ For each iteration:
+
+ ### 1. Get Next Version
+
+ ```bash
+ python3 -c "import json; s=json.load(open('.harness-evolver/summary.json')); print(f'v{s[\"iterations\"]+1:03d}')"
+ ```
+
+ ### 2. Propose
+
+ Spawn the `harness-evolver-proposer` agent:
+
+ > You are proposing iteration {i}. Create version {version} in `.harness-evolver/harnesses/{version}/`.
+ > Working directory contains `.harness-evolver/` with all prior candidates and traces.
+
+ The proposer creates: `harness.py`, `config.json`, `proposal.md`.
+
+ ### 3. Validate
+
+ ```bash
+ python3 $TOOLS/evaluate.py validate \
+   --harness .harness-evolver/harnesses/{version}/harness.py \
+   --config .harness-evolver/harnesses/{version}/config.json
+ ```
+
+ If it fails: one retry via the proposer. If it still fails: score 0.0, continue.
+
+ ### 4. Evaluate
+
+ ```bash
+ python3 $TOOLS/evaluate.py run \
+   --harness .harness-evolver/harnesses/{version}/harness.py \
+   --config .harness-evolver/harnesses/{version}/config.json \
+   --tasks-dir .harness-evolver/eval/tasks/ \
+   --eval .harness-evolver/eval/eval.py \
+   --traces-dir .harness-evolver/harnesses/{version}/traces/ \
+   --scores .harness-evolver/harnesses/{version}/scores.json \
+   --timeout 60
+ ```
+
+ ### 5. Update State
+
+ ```bash
+ python3 $TOOLS/state.py update \
+   --base-dir .harness-evolver \
+   --version {version} \
+   --scores .harness-evolver/harnesses/{version}/scores.json \
+   --proposal .harness-evolver/harnesses/{version}/proposal.md
+ ```
+
+ ### 6. Report
+
+ Read `summary.json`. Print: `Iteration {i}/{N}: {version} scored {score} (best: {best} at {best_score})`
+
+ ### 7. Check Stop Conditions
+
+ - **Stagnation**: last 3 scores within 1% of each other → stop
+ - **Target**: `combined_score >= target_score` → stop
+ - **N reached**: done
+
+ ## When Loop Ends — Final Report
+
+ - Best version and score
+ - Improvement over baseline (absolute and %)
+ - Total iterations run
+ - Suggest: "The best harness is at `.harness-evolver/harnesses/{best}/harness.py`. Copy it to your project."
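
For concreteness, here is the stagnation check from step 7 as code. A minimal sketch: the `history` field name is an assumption about `summary.json` (the old skill only says to "read `summary.json` history"), and "within 1%" is read here as an absolute spread of at most 0.01:

```python
#!/usr/bin/env python3
"""Sketch: stop-condition check over the recorded iteration scores."""
import json

def stagnated(scores, window=3, tolerance=0.01):
    """True if the last `window` scores all sit within `tolerance` of each other."""
    if len(scores) < window:
        return False
    recent = scores[-window:]
    return max(recent) - min(recent) <= tolerance

with open(".harness-evolver/summary.json") as f:
    summary = json.load(f)

# "history" is a hypothetical field name for the per-iteration records
scores = [h["combined_score"] for h in summary.get("history", [])]

if stagnated(scores):
    print("Stagnation: last 3 scores within 1% of each other — stop the loop.")
```
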
@@ -0,0 +1,66 @@
+ ---
+ name: init
+ description: "Use when the user wants to set up harness optimization in their project, optimize an LLM agent, improve a harness, or mentions harness-evolver for the first time in a project without .harness-evolver/ directory."
+ argument-hint: "[directory]"
+ allowed-tools: [Read, Write, Edit, Bash, Glob, Grep, Agent]
+ ---
+
+ # /harness-evolver:init
+
+ Set up the Harness Evolver in a project. Scans the codebase, identifies the entry point, creates missing artifacts, runs baseline evaluation.
+
+ ## Resolve Tool Path
+
+ ```bash
+ TOOLS=$([ -d ".harness-evolver/tools" ] && echo ".harness-evolver/tools" || echo "$HOME/.harness-evolver/tools")
+ ```
+
+ Use the `$TOOLS` prefix for all tool calls below.
+
+ ## Phase 1: Scan
+
+ ```bash
+ find . -maxdepth 3 -type f -name "*.py" | head -30
+ python3 $TOOLS/detect_stack.py .
+ ```
+
+ Look for:
+ - Entry points: files with `if __name__`, or named `main.py`, `app.py`, `agent.py`, `graph.py`, `pipeline.py`, `bot.py`
+ - Existing eval: `eval.py`, `score.py`, `judge.py`
+ - Existing tasks: directories with JSON files containing `id` + `input` fields
+ - Config: `config.json`, `config.yaml`, `.env`
+
+ ## Phase 2: Create What's Missing
+
+ Three artifacts are needed. For each, use the existing one if found; create it if not.
+
+ **Harness** (`harness.py`): If the user's entry point doesn't match our CLI interface (`--input`, `--output`, `--traces-dir`, `--config`), create a thin wrapper that imports their code. Read their entry point first to understand the I/O format. Ask if unsure.
+
+ **Eval** (`eval.py`): Ask the user what "correct" means for their domain. Generate the simplest eval that gives signal. Even rough scoring works — the evolver iterates.
+
+ **Tasks** (`tasks/`): If no test data exists, ask the user for 5-10 example input/output pairs. Each task is `{"id": "task_001", "input": "...", "expected": "...", "metadata": {}}`.
+
+ ## Phase 3: Run Init
+
+ ```bash
+ python3 $TOOLS/init.py [directory] \
+   --harness harness.py --eval eval.py --tasks tasks/ \
+   --tools-dir $TOOLS
+ ```
+
+ Add `--harness-config config.json` if a config exists.
+
+ ## After Init — Report
+
+ - What was detected vs created
+ - Stack + integrations (LangSmith, Context7)
+ - Baseline score
+ - Next: `harness-evolver:evolve` to start
+
+ ## Gotchas
+
+ - The harness must write valid JSON to `--output`. If the user's code returns non-JSON, the wrapper must serialize it.
+ - Tasks must have unique `id` fields. Duplicate IDs cause silent eval errors.
+ - The `expected` field is never shown to the harness — only the eval script sees it.
+ - If `.harness-evolver/` already exists, warn before overwriting.
+ - If no Python files exist in CWD, the user is probably in the wrong directory.
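
Turning user-supplied example pairs into task files (Phase 2's third artifact) takes only a few lines. A minimal sketch with hypothetical example pairs (the cardiac/neurological examples echo the classifier example used elsewhere in these skills); only the task format and uniqueness rule above are taken as given:

```python
#!/usr/bin/env python3
"""Sketch: write tasks/ files in the {"id", "input", "expected", "metadata"} format."""
import json
from pathlib import Path

examples = [  # hypothetical user-provided (input, expected) pairs
    ("chest pain and shortness of breath", "cardiac"),
    ("sudden numbness on one side of the body", "neurological"),
]

tasks_dir = Path("tasks")
tasks_dir.mkdir(exist_ok=True)

for i, (inp, expected) in enumerate(examples, start=1):
    # Sequential ids keep them unique; duplicates cause silent eval errors (see Gotchas)
    task = {"id": f"task_{i:03d}", "input": inp, "expected": expected, "metadata": {}}
    (tasks_dir / f"{task['id']}.json").write_text(json.dumps(task, indent=2))
    print(f"wrote tasks/{task['id']}.json")
```
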
@@ -0,0 +1,34 @@
+ ---
+ name: status
+ description: "Use when the user asks about evolution progress, current scores, best harness version, how many iterations ran, or whether the loop is stagnating. Also use when the user says 'status', 'progress', or 'how is it going'."
+ allowed-tools: [Read, Bash]
+ ---
+
+ # /harness-evolver:status
+
+ Show evolution progress.
+
+ ## Resolve Tool Path
+
+ ```bash
+ TOOLS=$([ -d ".harness-evolver/tools" ] && echo ".harness-evolver/tools" || echo "$HOME/.harness-evolver/tools")
+ ```
+
+ ## What To Do
+
+ If `.harness-evolver/` does not exist, tell the user to run `harness-evolver:init` first.
+
+ Otherwise:
+
+ ```bash
+ python3 $TOOLS/state.py show --base-dir .harness-evolver
+ ```
+
+ Then read and display `.harness-evolver/STATE.md` for the full history table.
+
+ ## If User Wants More Detail
+
+ - Scores per task: `cat .harness-evolver/harnesses/{version}/scores.json`
+ - What changed: `cat .harness-evolver/harnesses/{version}/proposal.md`
+ - Compare two versions: `diff .harness-evolver/harnesses/{vA}/harness.py .harness-evolver/harnesses/{vB}/harness.py`
+ - Full history: `cat .harness-evolver/PROPOSER_HISTORY.md`
@@ -1,93 +0,0 @@
- ---
- name: harness-evolve
- description: "Run the harness evolution loop. Autonomously proposes, evaluates, and iterates on harness designs using full execution traces as feedback."
- argument-hint: "[--iterations N] [--candidates-per-iter N]"
- allowed-tools: [Read, Write, Edit, Bash, Glob, Grep, Agent]
- ---
-
- # /harness-evolve
-
- Run the Meta-Harness optimization loop.
-
- ## Arguments
-
- - `--iterations N` (default: 10) — number of evolution iterations
- - `--candidates-per-iter N` (default: 1) — harnesses per iteration
-
- ## Prerequisites
-
- Run `/harness-evolve-init` first. The `.harness-evolver/` directory must exist with a valid `summary.json`.
-
- ## The Loop
-
- For each iteration i from 1 to N:
-
- ### 1. PROPOSE
-
- Determine the next version number by reading `summary.json`:
-
- ```bash
- python3 -c "import json; s=json.load(open('.harness-evolver/summary.json')); print(f'v{s[\"iterations\"]+1:03d}')"
- ```
-
- Spawn the `harness-evolver-proposer` agent with this prompt:
-
- > You are proposing iteration {i}. Create version {version_number} in `.harness-evolver/harnesses/{version_number}/`.
- > Working directory contains `.harness-evolver/` with all prior candidates and traces.
-
- The proposer agent will create:
- - `.harness-evolver/harnesses/v{NNN}/harness.py`
- - `.harness-evolver/harnesses/v{NNN}/config.json`
- - `.harness-evolver/harnesses/v{NNN}/proposal.md`
-
- ### 2. VALIDATE
-
- ```bash
- python3 ~/.harness-evolver/tools/evaluate.py validate \
-   --harness .harness-evolver/harnesses/v{NNN}/harness.py \
-   --config .harness-evolver/harnesses/v{NNN}/config.json
- ```
-
- If validation fails, ask the proposer to fix (1 retry). If it fails again, set score to 0.0 and continue.
-
- ### 3. EVALUATE
-
- ```bash
- python3 ~/.harness-evolver/tools/evaluate.py run \
-   --harness .harness-evolver/harnesses/v{NNN}/harness.py \
-   --config .harness-evolver/harnesses/v{NNN}/config.json \
-   --tasks-dir .harness-evolver/eval/tasks/ \
-   --eval .harness-evolver/eval/eval.py \
-   --traces-dir .harness-evolver/harnesses/v{NNN}/traces/ \
-   --scores .harness-evolver/harnesses/v{NNN}/scores.json \
-   --timeout 60
- ```
-
- ### 4. UPDATE STATE
-
- ```bash
- python3 ~/.harness-evolver/tools/state.py update \
-   --base-dir .harness-evolver \
-   --version v{NNN} \
-   --scores .harness-evolver/harnesses/v{NNN}/scores.json \
-   --proposal .harness-evolver/harnesses/v{NNN}/proposal.md
- ```
-
- ### 5. REPORT
-
- Read the updated `summary.json` and report:
- - `Iteration {i}/{N}: v{NNN} scored {score} (best: v{best} at {best_score})`
- - If regression (score < parent score): warn
- - If new best: celebrate
-
- ### Stop Conditions
-
- - All N iterations completed
- - **Stagnation**: 3 consecutive iterations without >1% improvement. Read `summary.json` history to check.
- - **Target reached**: if `config.json` has `target_score` set and achieved.
-
- When stopping, report final summary: best version, score, number of iterations, improvement over baseline.
-
- ## Tool Path Resolution
-
- Check `.harness-evolver/tools/` first (local override), then `~/.harness-evolver/tools/` (global install).
@@ -1,198 +0,0 @@
- ---
- name: harness-evolve-init
- description: "Initialize harness evolution in the current project. Scans the codebase, identifies the entry point, and helps create harness wrapper, eval script, and test cases if they don't exist."
- argument-hint: "[directory]"
- allowed-tools: [Read, Write, Edit, Bash, Glob, Grep, Agent]
- ---
-
- # /harness-evolve-init
-
- Initialize the Harness Evolver for this project.
-
- ## Usage
-
- ```
- /harness-evolve-init              # setup in current directory
- /harness-evolve-init ./my-project # setup in a specific directory
- ```
-
- ## Your Job: A 3-Phase Setup Wizard
-
- You are the intelligent layer. The init.py tool is dumb — it takes paths. Your job is to figure out what to pass it, creating files if needed.
-
- ### Phase 1: SCAN the project
-
- Read the project structure to understand what exists:
-
- ```bash
- find . -maxdepth 3 -type f -name "*.py" | head -30
- ls -la
- ```
-
- Look for:
- - **Entry point candidates:** `main.py`, `app.py`, `agent.py`, `graph.py`, `pipeline.py`, `bot.py`, `run.py`, or any file with `if __name__` block
- - **Existing eval/test files:** `eval.py`, `test_*.py`, `score.py`, `judge.py`
- - **Existing test data:** `tasks/`, `tests/`, `data/`, `examples/`, `fixtures/`, any dir with JSON/JSONL files
- - **Config files:** `config.json`, `config.yaml`, `.env`
- - **Framework clues:** imports of langchain, langgraph, openai, anthropic, crewai, etc.
-
- Also run stack detection:
- ```bash
- python3 ~/.harness-evolver/tools/detect_stack.py .
- ```
-
- ### Phase 2: CREATE what's missing
-
- There are 3 artifacts needed. For each, check if it exists or needs to be created.
-
- #### A. The Harness (`harness.py`)
-
- **If `harness.py` already exists with the right interface** (`--input`, `--output`): use it directly.
-
- **If the project has an entry point but NOT in our format:** Create a `harness.py` wrapper.
-
- Read the entry point to understand its input/output format, then generate a wrapper:
-
- ```python
- #!/usr/bin/env python3
- """Harness wrapper for [project name]. Generated by harness-evolve-init."""
-
- import argparse
- import json
- import sys
-
- # Import the actual project code
- from [entry_module] import [main_function_or_class]
-
- def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--input", required=True)
-     parser.add_argument("--output", required=True)
-     parser.add_argument("--traces-dir", default=None)
-     parser.add_argument("--config", default=None)
-     args = parser.parse_args()
-
-     task = json.load(open(args.input))
-     config = json.load(open(args.config)) if args.config else {}
-
-     # Call the actual project code
-     result = [call_the_project](task["input"], **config)
-
-     json.dump({"id": task["id"], "output": str(result)}, open(args.output, "w"))
-
- if __name__ == "__main__":
-     main()
- ```
-
- Adapt this template based on what you learned from reading the entry point. Ask the user to confirm if you're unsure about the input/output mapping.
-
- #### B. The Eval (`eval.py`)
-
- **If `eval.py` exists:** use it.
-
- **If not:** Ask the user what "correct" means for their use case, then generate one:
-
- - **Classification/extraction:** exact match or fuzzy match
- - **Chatbot/QA:** LLM-as-judge (requires API key) or keyword matching
- - **Code generation:** execution-based (run the code, check output)
- - **RAG:** relevance scoring
-
- Start with the simplest eval that works. The evolver can iterate on the harness without a perfect eval — even a rough eval gives signal.
-
- ```python
- #!/usr/bin/env python3
- """Eval script for [project]. Generated by harness-evolve-init."""
-
- import argparse
- import json
- import os
-
- def main():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--results-dir", required=True)
-     parser.add_argument("--tasks-dir", required=True)
-     parser.add_argument("--scores", required=True)
-     args = parser.parse_args()
-
-     correct, total = 0, 0
-     per_task = {}
-
-     for fname in sorted(os.listdir(args.tasks_dir)):
-         if not fname.endswith(".json"):
-             continue
-         with open(os.path.join(args.tasks_dir, fname)) as f:
-             task = json.load(f)
-         task_id = task["id"]
-
-         result_path = os.path.join(args.results_dir, fname)
-         if not os.path.exists(result_path):
-             per_task[task_id] = {"score": 0.0, "error": "no output"}
-             total += 1
-             continue
-
-         with open(result_path) as f:
-             result = json.load(f)
-
-         # ADAPT THIS: define what "correct" means for this project
-         expected = task.get("expected", "")
-         actual = result.get("output", "")
-         match = actual.lower().strip() == expected.lower().strip()
-
-         per_task[task_id] = {"score": 1.0 if match else 0.0, "expected": expected, "actual": actual}
-         correct += int(match)
-         total += 1
-
-     accuracy = correct / total if total > 0 else 0.0
-     json.dump({
-         "combined_score": accuracy,
-         "accuracy": accuracy,
-         "total_tasks": total,
-         "correct": correct,
-         "per_task": per_task,
-     }, open(args.scores, "w"), indent=2)
-
- if __name__ == "__main__":
-     main()
- ```
-
- #### C. Test Cases (`tasks/`)
-
- **If `tasks/` exists with JSON files:** use it.
-
- **If the project has test data in another format:** Convert it to our format.
-
- **If no test data exists:** Help create 5-10 test cases. Ask the user:
- > "What are typical inputs to your system? And what are the expected outputs? Give me 3-5 examples and I'll create the task files."
-
- Each task file is:
- ```json
- {"id": "task_001", "input": "...", "expected": "...", "metadata": {}}
- ```
-
- ### Phase 3: RUN init.py
-
- Once all 3 artifacts exist, run:
-
- ```bash
- python3 ~/.harness-evolver/tools/init.py \
-   --harness harness.py \
-   --eval eval.py \
-   --tasks tasks/ \
-   --tools-dir ~/.harness-evolver/tools
- ```
-
- If a harness config exists, add `--harness-config config.json`.
-
- If `~/.harness-evolver/tools/init.py` does not exist, check `.harness-evolver/tools/init.py`.
-
- ### After init completes, report:
-
- - What was detected vs created
- - Stack detected (libraries)
- - Integrations available (LangSmith, Context7)
- - Baseline score
- - Next step: `/harness-evolve` to start the optimization loop
-
- ## Key Principle
-
- **Don't ask the user to restructure their project.** You adapt to them. If they have a LangGraph graph in `src/graph.py`, you create a thin wrapper — you don't ask them to rename it to `harness.py`. The wrapper IS the harness.
@@ -1,25 +0,0 @@
- ---
- name: harness-evolve-status
- description: "Show the current status of harness evolution: best score, iteration count, progress history."
- allowed-tools: [Read, Bash]
- ---
-
- # /harness-evolve-status
-
- Show the current evolution status.
-
- ## What To Do
-
- ```bash
- python3 ~/.harness-evolver/tools/state.py show --base-dir .harness-evolver
- ```
-
- If that doesn't exist, try:
-
- ```bash
- python3 .harness-evolver/tools/state.py show --base-dir .harness-evolver
- ```
-
- Also read and display the contents of `.harness-evolver/STATE.md` for the full status table.
-
- If `.harness-evolver/` doesn't exist, tell the user to run `/harness-evolve-init` first.