harness-evolver 0.2.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/install.js +155 -52
- package/package.json +1 -1
- package/skills/compare/SKILL.md +73 -0
- package/skills/deploy/SKILL.md +53 -0
- package/skills/diagnose/SKILL.md +96 -0
- package/skills/evolve/SKILL.md +94 -0
- package/skills/init/SKILL.md +66 -0
- package/skills/status/SKILL.md +34 -0
- package/skills/harness-evolve/SKILL.md +0 -93
- package/skills/harness-evolve-init/SKILL.md +0 -198
- package/skills/harness-evolve-status/SKILL.md +0 -25
package/bin/install.js
CHANGED
|
@@ -1,26 +1,53 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
/**
|
|
3
3
|
* Harness Evolver installer.
|
|
4
|
-
*
|
|
4
|
+
* Interactive setup with runtime selection, global/local choice.
|
|
5
5
|
*
|
|
6
6
|
* Usage: npx harness-evolver@latest
|
|
7
7
|
*/
|
|
8
8
|
|
|
9
9
|
const fs = require("fs");
|
|
10
10
|
const path = require("path");
|
|
11
|
+
const readline = require("readline");
|
|
11
12
|
const { execSync } = require("child_process");
|
|
12
13
|
|
|
14
|
+
const VERSION = require("../package.json").version;
|
|
13
15
|
const PLUGIN_ROOT = path.resolve(__dirname, "..");
|
|
14
16
|
const HOME = process.env.HOME || process.env.USERPROFILE;
|
|
15
17
|
|
|
16
|
-
|
|
17
|
-
const
|
|
18
|
-
const
|
|
19
|
-
const
|
|
20
|
-
const
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
18
|
+
// ANSI colors
|
|
19
|
+
const CYAN = "\x1b[36m";
|
|
20
|
+
const GREEN = "\x1b[32m";
|
|
21
|
+
const YELLOW = "\x1b[33m";
|
|
22
|
+
const RED = "\x1b[31m";
|
|
23
|
+
const DIM = "\x1b[2m";
|
|
24
|
+
const BOLD = "\x1b[1m";
|
|
25
|
+
const RESET = "\x1b[0m";
|
|
26
|
+
|
|
27
|
+
const LOGO = `
|
|
28
|
+
${CYAN} ██╗ ██╗ █████╗ ██████╗ ███╗ ██╗███████╗███████╗███████╗
|
|
29
|
+
██║ ██║██╔══██╗██╔══██╗████╗ ██║██╔════╝██╔════╝██╔════╝
|
|
30
|
+
███████║███████║██████╔╝██╔██╗ ██║█████╗ ███████╗███████╗
|
|
31
|
+
██╔══██║██╔══██║██╔══██╗██║╚██╗██║██╔══╝ ╚════██║╚════██║
|
|
32
|
+
██║ ██║██║ ██║██║ ██║██║ ╚████║███████╗███████║███████║
|
|
33
|
+
╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═══╝╚══════╝╚══════╝╚══════╝
|
|
34
|
+
${BOLD}███████╗██╗ ██╗ ██████╗ ██╗ ██╗ ██╗███████╗██████╗
|
|
35
|
+
██╔════╝██║ ██║██╔═══██╗██║ ██║ ██║██╔════╝██╔══██╗
|
|
36
|
+
█████╗ ██║ ██║██║ ██║██║ ██║ ██║█████╗ ██████╔╝
|
|
37
|
+
██╔══╝ ╚██╗ ██╔╝██║ ██║██║ ╚██╗ ██╔╝██╔══╝ ██╔══██╗
|
|
38
|
+
███████╗ ╚████╔╝ ╚██████╔╝███████╗╚████╔╝ ███████╗██║ ██║
|
|
39
|
+
╚══════╝ ╚═══╝ ╚═════╝ ╚══════╝ ╚═══╝ ╚══════╝╚═╝ ╚═╝${RESET}
|
|
40
|
+
`;
|
|
41
|
+
|
|
42
|
+
const RUNTIMES = [
|
|
43
|
+
{ name: "Claude Code", dir: ".claude", detected: () => fs.existsSync(path.join(HOME, ".claude")) },
|
|
44
|
+
{ name: "Cursor", dir: ".cursor", detected: () => fs.existsSync(path.join(HOME, ".cursor")) },
|
|
45
|
+
{ name: "Codex", dir: ".codex", detected: () => fs.existsSync(path.join(HOME, ".codex")) },
|
|
46
|
+
{ name: "Windsurf", dir: ".windsurf", detected: () => fs.existsSync(path.join(HOME, ".windsurf")) },
|
|
47
|
+
];
|
|
48
|
+
|
|
49
|
+
function ask(rl, question) {
|
|
50
|
+
return new Promise((resolve) => rl.question(question, resolve));
|
|
24
51
|
}
|
|
25
52
|
|
|
26
53
|
function copyDir(src, dest) {
|
|
@@ -50,76 +77,152 @@ function checkPython() {
|
|
|
50
77
|
}
|
|
51
78
|
}
|
|
52
79
|
|
|
53
|
-
function
|
|
54
|
-
|
|
80
|
+
function installForRuntime(runtimeDir, scope) {
|
|
81
|
+
const baseDir = scope === "local"
|
|
82
|
+
? path.join(process.cwd(), runtimeDir)
|
|
83
|
+
: path.join(HOME, runtimeDir);
|
|
55
84
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
process.exit(1);
|
|
59
|
-
}
|
|
60
|
-
log("\u2713 python3 found");
|
|
85
|
+
const commandsDir = path.join(baseDir, "commands", "harness-evolver");
|
|
86
|
+
const agentsDir = path.join(baseDir, "agents");
|
|
61
87
|
|
|
62
|
-
|
|
63
|
-
console.error(` ERROR: Claude Code directory not found at ${CLAUDE_DIR}`);
|
|
64
|
-
console.error(" Install Claude Code first: https://claude.ai/code");
|
|
65
|
-
process.exit(1);
|
|
66
|
-
}
|
|
67
|
-
log("\u2713 Claude Code detected");
|
|
68
|
-
|
|
69
|
-
// Copy skills
|
|
88
|
+
// Skills
|
|
70
89
|
const skillsSource = path.join(PLUGIN_ROOT, "skills");
|
|
71
90
|
if (fs.existsSync(skillsSource)) {
|
|
72
91
|
for (const skill of fs.readdirSync(skillsSource, { withFileTypes: true })) {
|
|
73
92
|
if (skill.isDirectory()) {
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
copyDir(src, dest);
|
|
77
|
-
log(` skill: ${skill.name}`);
|
|
93
|
+
copyDir(path.join(skillsSource, skill.name), path.join(commandsDir, skill.name));
|
|
94
|
+
console.log(` ${GREEN}✓${RESET} Installed skill: ${skill.name}`);
|
|
78
95
|
}
|
|
79
96
|
}
|
|
80
97
|
}
|
|
81
98
|
|
|
82
|
-
//
|
|
99
|
+
// Agents
|
|
83
100
|
const agentsSource = path.join(PLUGIN_ROOT, "agents");
|
|
84
101
|
if (fs.existsSync(agentsSource)) {
|
|
85
|
-
fs.mkdirSync(
|
|
102
|
+
fs.mkdirSync(agentsDir, { recursive: true });
|
|
86
103
|
for (const agent of fs.readdirSync(agentsSource)) {
|
|
87
|
-
copyFile(
|
|
88
|
-
|
|
89
|
-
path.join(AGENTS_DIR, agent)
|
|
90
|
-
);
|
|
91
|
-
log(` agent: ${agent}`);
|
|
104
|
+
copyFile(path.join(agentsSource, agent), path.join(agentsDir, agent));
|
|
105
|
+
console.log(` ${GREEN}✓${RESET} Installed agent: ${agent}`);
|
|
92
106
|
}
|
|
93
107
|
}
|
|
108
|
+
}
|
|
94
109
|
|
|
95
|
-
|
|
110
|
+
function installTools() {
|
|
111
|
+
const toolsDir = path.join(HOME, ".harness-evolver", "tools");
|
|
96
112
|
const toolsSource = path.join(PLUGIN_ROOT, "tools");
|
|
97
113
|
if (fs.existsSync(toolsSource)) {
|
|
98
|
-
fs.mkdirSync(
|
|
114
|
+
fs.mkdirSync(toolsDir, { recursive: true });
|
|
99
115
|
for (const tool of fs.readdirSync(toolsSource)) {
|
|
100
116
|
if (tool.endsWith(".py")) {
|
|
101
|
-
copyFile(
|
|
102
|
-
|
|
103
|
-
path.join(TOOLS_DIR, tool)
|
|
104
|
-
);
|
|
105
|
-
log(` tool: ${tool}`);
|
|
117
|
+
copyFile(path.join(toolsSource, tool), path.join(toolsDir, tool));
|
|
118
|
+
console.log(` ${GREEN}✓${RESET} Installed tool: ${tool}`);
|
|
106
119
|
}
|
|
107
120
|
}
|
|
108
121
|
}
|
|
122
|
+
}
|
|
109
123
|
|
|
110
|
-
|
|
124
|
+
function installExamples() {
|
|
125
|
+
const examplesDir = path.join(HOME, ".harness-evolver", "examples");
|
|
111
126
|
const examplesSource = path.join(PLUGIN_ROOT, "examples");
|
|
112
127
|
if (fs.existsSync(examplesSource)) {
|
|
113
|
-
copyDir(examplesSource,
|
|
114
|
-
log(
|
|
128
|
+
copyDir(examplesSource, examplesDir);
|
|
129
|
+
console.log(` ${GREEN}✓${RESET} Installed examples: classifier`);
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
async function main() {
|
|
134
|
+
console.log(LOGO);
|
|
135
|
+
console.log(` ${DIM}Harness Evolver v${VERSION}${RESET}`);
|
|
136
|
+
console.log(` ${DIM}Meta-Harness-style autonomous harness optimization${RESET}`);
|
|
137
|
+
console.log();
|
|
138
|
+
|
|
139
|
+
// Check python
|
|
140
|
+
if (!checkPython()) {
|
|
141
|
+
console.error(` ${RED}ERROR:${RESET} python3 not found in PATH. Install Python 3.8+ first.`);
|
|
142
|
+
process.exit(1);
|
|
143
|
+
}
|
|
144
|
+
console.log(` ${GREEN}✓${RESET} python3 found`);
|
|
145
|
+
|
|
146
|
+
// Detect runtimes
|
|
147
|
+
const available = RUNTIMES.filter((r) => r.detected());
|
|
148
|
+
if (available.length === 0) {
|
|
149
|
+
console.error(`\n ${RED}ERROR:${RESET} No supported runtime detected.`);
|
|
150
|
+
console.error(` Install Claude Code, Cursor, Codex, or Windsurf first.`);
|
|
151
|
+
process.exit(1);
|
|
115
152
|
}
|
|
116
153
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
console.log(
|
|
121
|
-
|
|
122
|
-
|
|
154
|
+
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
|
|
155
|
+
|
|
156
|
+
// Runtime selection
|
|
157
|
+
console.log(`\n ${YELLOW}Which runtime(s) would you like to install for?${RESET}\n`);
|
|
158
|
+
available.forEach((r, i) => {
|
|
159
|
+
console.log(` ${i + 1}) ${r.name.padEnd(14)} (~/${r.dir})`);
|
|
160
|
+
});
|
|
161
|
+
if (available.length > 1) {
|
|
162
|
+
console.log(` ${available.length + 1}) All`);
|
|
163
|
+
console.log(`\n ${DIM}Select multiple: 1,2 or 1 2${RESET}`);
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
const defaultChoice = "1";
|
|
167
|
+
const runtimeAnswer = await ask(rl, `\n ${YELLOW}Choice [${defaultChoice}]:${RESET} `);
|
|
168
|
+
const runtimeInput = (runtimeAnswer.trim() || defaultChoice);
|
|
169
|
+
|
|
170
|
+
let selectedRuntimes;
|
|
171
|
+
if (runtimeInput === String(available.length + 1)) {
|
|
172
|
+
selectedRuntimes = available;
|
|
173
|
+
} else {
|
|
174
|
+
const indices = runtimeInput.split(/[,\s]+/).map((s) => parseInt(s, 10) - 1);
|
|
175
|
+
selectedRuntimes = indices
|
|
176
|
+
.filter((i) => i >= 0 && i < available.length)
|
|
177
|
+
.map((i) => available[i]);
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
if (selectedRuntimes.length === 0) {
|
|
181
|
+
selectedRuntimes = [available[0]];
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
// Scope selection
|
|
185
|
+
console.log(`\n ${YELLOW}Where would you like to install?${RESET}\n`);
|
|
186
|
+
console.log(` 1) Global (~/${selectedRuntimes[0].dir}) - available in all projects`);
|
|
187
|
+
console.log(` 2) Local (./${selectedRuntimes[0].dir}) - this project only`);
|
|
188
|
+
|
|
189
|
+
const scopeAnswer = await ask(rl, `\n ${YELLOW}Choice [1]:${RESET} `);
|
|
190
|
+
const scope = (scopeAnswer.trim() === "2") ? "local" : "global";
|
|
191
|
+
|
|
192
|
+
console.log();
|
|
193
|
+
|
|
194
|
+
// Install for each selected runtime
|
|
195
|
+
for (const runtime of selectedRuntimes) {
|
|
196
|
+
const target = scope === "local" ? `./${runtime.dir}` : `~/${runtime.dir}`;
|
|
197
|
+
console.log(` Installing for ${CYAN}${runtime.name}${RESET} to ${target}`);
|
|
198
|
+
console.log();
|
|
199
|
+
installForRuntime(runtime.dir, scope);
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
// Tools and examples are always global
|
|
203
|
+
installTools();
|
|
204
|
+
installExamples();
|
|
205
|
+
|
|
206
|
+
// Write version file
|
|
207
|
+
const versionPath = path.join(HOME, ".harness-evolver", "VERSION");
|
|
208
|
+
fs.mkdirSync(path.dirname(versionPath), { recursive: true });
|
|
209
|
+
fs.writeFileSync(versionPath, VERSION);
|
|
210
|
+
console.log(` ${GREEN}✓${RESET} Wrote VERSION (${VERSION})`);
|
|
211
|
+
|
|
212
|
+
console.log(`\n ${GREEN}Done!${RESET} Open a project in Claude Code and run ${CYAN}/harness-evolver:init${RESET}`);
|
|
213
|
+
console.log(`\n ${DIM}Quick start with example:${RESET}`);
|
|
214
|
+
console.log(` cp -r ~/.harness-evolver/examples/classifier ./my-project`);
|
|
215
|
+
console.log(` cd my-project && claude`);
|
|
216
|
+
console.log(` /harness-evolver:init`);
|
|
217
|
+
console.log(` /harness-evolver:evolve`);
|
|
218
|
+
|
|
219
|
+
console.log(`\n ${DIM}GitHub: https://github.com/raphaelchristi/harness-evolver${RESET}`);
|
|
220
|
+
console.log();
|
|
221
|
+
|
|
222
|
+
rl.close();
|
|
123
223
|
}
|
|
124
224
|
|
|
125
|
-
main()
|
|
225
|
+
main().catch((err) => {
|
|
226
|
+
console.error(` ${RED}ERROR:${RESET} ${err.message}`);
|
|
227
|
+
process.exit(1);
|
|
228
|
+
});
|
package/package.json
CHANGED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: compare
|
|
3
|
+
description: "Use when the user wants to compare two harness versions, understand what changed between iterations, see why one version scored better than another, or debug a regression."
|
|
4
|
+
argument-hint: "<vA> <vB>"
|
|
5
|
+
allowed-tools: [Read, Bash, Glob, Grep]
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# /harness-evolver:compare
|
|
9
|
+
|
|
10
|
+
Compare two harness versions side by side.
|
|
11
|
+
|
|
12
|
+
## Arguments
|
|
13
|
+
|
|
14
|
+
- `vA` — first version (e.g., `v001`, `baseline`)
|
|
15
|
+
- `vB` — second version (e.g., `v003`)
|
|
16
|
+
|
|
17
|
+
If only one version is given, compare it against the current best.
|
|
18
|
+
If no versions are given, compare the two most recent versions.
|
|
19
|
+
|
|
20
|
+
## What To Do
|
|
21
|
+
|
|
22
|
+
### 1. Code Diff
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
diff .harness-evolver/harnesses/{vA}/harness.py .harness-evolver/harnesses/{vB}/harness.py
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
If config changed:
|
|
29
|
+
```bash
|
|
30
|
+
diff .harness-evolver/harnesses/{vA}/config.json .harness-evolver/harnesses/{vB}/config.json
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### 2. Score Comparison
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
cat .harness-evolver/harnesses/{vA}/scores.json
|
|
37
|
+
cat .harness-evolver/harnesses/{vB}/scores.json
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Report: combined_score delta, per-task wins/losses.
|
|
41
|
+
|
|
42
|
+
### 3. Per-Task Analysis
|
|
43
|
+
|
|
44
|
+
For tasks where scores diverge, show what each version produced:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
cat .harness-evolver/harnesses/{vA}/traces/task_{ID}/output.json
|
|
48
|
+
cat .harness-evolver/harnesses/{vB}/traces/task_{ID}/output.json
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### 4. Proposal Context
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
cat .harness-evolver/harnesses/{vB}/proposal.md
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Show what the proposer intended and whether the result matched expectations.
|
|
58
|
+
|
|
59
|
+
## Report Format
|
|
60
|
+
|
|
61
|
+
```
|
|
62
|
+
v001 (0.62) vs v003 (0.71) — +0.09 improvement
|
|
63
|
+
|
|
64
|
+
Code changes:
|
|
65
|
+
+ Added few-shot examples (3 examples)
|
|
66
|
+
~ Changed prompt template
|
|
67
|
+
- Removed retry logic
|
|
68
|
+
|
|
69
|
+
Per-task:
|
|
70
|
+
task_001: 1.0 → 1.0 (unchanged)
|
|
71
|
+
task_007: 0.0 → 1.0 (FIXED — was cardiac, now correctly classified)
|
|
72
|
+
task_008: 1.0 → 0.0 (REGRESSION — was neurological, now wrong)
|
|
73
|
+
```
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: deploy
|
|
3
|
+
description: "Use when the user wants to use the best evolved harness in their project, promote a version to production, copy the winning harness back, or is done evolving and wants to apply the result."
|
|
4
|
+
argument-hint: "[version]"
|
|
5
|
+
allowed-tools: [Read, Write, Bash, Glob]
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# /harness-evolver:deploy
|
|
9
|
+
|
|
10
|
+
Promote the best (or specified) harness version back to the user's project.
|
|
11
|
+
|
|
12
|
+
## Arguments
|
|
13
|
+
|
|
14
|
+
- `version` — optional. If not given, deploys the best version from `summary.json`.
|
|
15
|
+
|
|
16
|
+
## What To Do
|
|
17
|
+
|
|
18
|
+
### 1. Identify Best Version
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
python3 -c "import json; s=json.load(open('.harness-evolver/summary.json')); print(s['best']['version'], s['best']['combined_score'])"
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Or use the user-specified version.
|
|
25
|
+
|
|
26
|
+
### 2. Show What Will Be Deployed
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
cat .harness-evolver/harnesses/{version}/proposal.md
|
|
30
|
+
cat .harness-evolver/harnesses/{version}/scores.json
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Report: version, score, improvement over baseline, what changed.
|
|
34
|
+
|
|
35
|
+
### 3. Ask for Confirmation
|
|
36
|
+
|
|
37
|
+
> Deploy `{version}` (score: {score}, +{delta} over baseline) to your project?
|
|
38
|
+
> This will copy `harness.py` and `config.json` to the project root.
|
|
39
|
+
|
|
40
|
+
### 4. Copy Files
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
cp .harness-evolver/harnesses/{version}/harness.py ./harness.py
|
|
44
|
+
cp .harness-evolver/harnesses/{version}/config.json ./config.json # if exists
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
If the original entry point had a different name (e.g., `graph.py`), ask the user where to put it.
|
|
48
|
+
|
|
49
|
+
### 5. Report
|
|
50
|
+
|
|
51
|
+
- What was copied and where
|
|
52
|
+
- Score improvement: baseline → deployed version
|
|
53
|
+
- Suggest: review the diff before committing
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: diagnose
|
|
3
|
+
description: "Use when the user wants to understand why a specific harness version failed, investigate a regression, analyze trace data, or debug a low score. Also use when the user says 'why did v003 fail' or 'what went wrong'."
|
|
4
|
+
argument-hint: "[version]"
|
|
5
|
+
allowed-tools: [Read, Bash, Glob, Grep]
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# /harness-evolver:diagnose
|
|
9
|
+
|
|
10
|
+
Deep analysis of a harness version's execution traces and scores.
|
|
11
|
+
|
|
12
|
+
## Arguments
|
|
13
|
+
|
|
14
|
+
- `version` — version to diagnose (e.g., `v003`). If not given, diagnose the worst or most recent regression.
|
|
15
|
+
|
|
16
|
+
## Resolve Tool Path
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
TOOLS=$([ -d ".harness-evolver/tools" ] && echo ".harness-evolver/tools" || echo "$HOME/.harness-evolver/tools")
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## What To Do
|
|
23
|
+
|
|
24
|
+
### 1. Identify the Version
|
|
25
|
+
|
|
26
|
+
If not specified, find the worst or most recent regression:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
python3 $TOOLS/state.py show --base-dir .harness-evolver
|
|
30
|
+
cat .harness-evolver/summary.json
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### 2. Score Breakdown
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
cat .harness-evolver/harnesses/{version}/scores.json
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Identify which tasks failed (`score: 0.0`) and which passed.
|
|
40
|
+
|
|
41
|
+
### 3. Trace Analysis (failed tasks)
|
|
42
|
+
|
|
43
|
+
For each failed task:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
cat .harness-evolver/harnesses/{version}/traces/{task_id}/input.json
|
|
47
|
+
cat .harness-evolver/harnesses/{version}/traces/{task_id}/output.json
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
Look for patterns: wrong format? wrong category? empty output? crash?
|
|
51
|
+
|
|
52
|
+
### 4. Error Search
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
grep -r "error\|Error\|FAIL\|exception\|Traceback" .harness-evolver/harnesses/{version}/traces/
|
|
56
|
+
cat .harness-evolver/harnesses/{version}/traces/stderr.log
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### 5. Compare with Parent
|
|
60
|
+
|
|
61
|
+
Read the proposal to find the parent version:
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
cat .harness-evolver/harnesses/{version}/proposal.md
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Then diff:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
diff .harness-evolver/harnesses/{parent}/harness.py .harness-evolver/harnesses/{version}/harness.py
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### 6. LangSmith (if available)
|
|
74
|
+
|
|
75
|
+
If `langsmith-cli` is installed and LangSmith is configured:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
langsmith-cli --json runs list --project harness-evolver-{version} --failed --fields id,name,error,inputs
|
|
79
|
+
langsmith-cli --json runs stats --project harness-evolver-{version}
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### 7. Report
|
|
83
|
+
|
|
84
|
+
```
|
|
85
|
+
Diagnosis: v003 (score: 0.31) — REGRESSION from v001 (0.62)
|
|
86
|
+
|
|
87
|
+
Root cause: Prompt template change broke JSON parsing
|
|
88
|
+
- 4/10 tasks returned malformed output
|
|
89
|
+
- stderr shows: json.JSONDecodeError on 4 tasks
|
|
90
|
+
- The change on line 42 removed the "Reply with ONLY..." instruction
|
|
91
|
+
|
|
92
|
+
Affected tasks: task_002, task_005, task_007, task_010
|
|
93
|
+
Unaffected tasks: task_001, task_003, task_004, task_006, task_008, task_009
|
|
94
|
+
|
|
95
|
+
Recommendation: Revert the prompt change, keep the retry logic from v002
|
|
96
|
+
```
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: evolve
|
|
3
|
+
description: "Use when the user wants to run the optimization loop, improve harness performance, evolve the harness, or iterate on harness quality. Requires .harness-evolver/ to exist (run harness-evolver:init first)."
|
|
4
|
+
argument-hint: "[--iterations N]"
|
|
5
|
+
allowed-tools: [Read, Write, Edit, Bash, Glob, Grep, Agent]
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# /harness-evolver:evolve
|
|
9
|
+
|
|
10
|
+
Run the autonomous propose-evaluate-iterate loop.
|
|
11
|
+
|
|
12
|
+
## Prerequisites
|
|
13
|
+
|
|
14
|
+
`.harness-evolver/summary.json` must exist. If it does not, tell the user to run `harness-evolver:init` first.
|
|
15
|
+
|
|
16
|
+
## Resolve Tool Path
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
TOOLS=$([ -d ".harness-evolver/tools" ] && echo ".harness-evolver/tools" || echo "$HOME/.harness-evolver/tools")
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Parse Arguments
|
|
23
|
+
|
|
24
|
+
- `--iterations N` (default: 10)
|
|
25
|
+
- Read `config.json` for `evolution.stagnation_limit` (default: 3) and `evolution.target_score`
|
|
26
|
+
|
|
27
|
+
## The Loop
|
|
28
|
+
|
|
29
|
+
For each iteration:
|
|
30
|
+
|
|
31
|
+
### 1. Get Next Version
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
python3 -c "import json; s=json.load(open('.harness-evolver/summary.json')); print(f'v{s[\"iterations\"]+1:03d}')"
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### 2. Propose
|
|
38
|
+
|
|
39
|
+
Spawn the `harness-evolver-proposer` agent:
|
|
40
|
+
|
|
41
|
+
> You are proposing iteration {i}. Create version {version} in `.harness-evolver/harnesses/{version}/`.
|
|
42
|
+
> Working directory contains `.harness-evolver/` with all prior candidates and traces.
|
|
43
|
+
|
|
44
|
+
The proposer creates: `harness.py`, `config.json`, `proposal.md`.
|
|
45
|
+
|
|
46
|
+
### 3. Validate
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
python3 $TOOLS/evaluate.py validate \
|
|
50
|
+
--harness .harness-evolver/harnesses/{version}/harness.py \
|
|
51
|
+
--config .harness-evolver/harnesses/{version}/config.json
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
If validation fails, ask the proposer to fix it (one retry). If it fails again, record a score of 0.0 for this version and continue to the next iteration.
|
|
55
|
+
|
|
56
|
+
### 4. Evaluate
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
python3 $TOOLS/evaluate.py run \
|
|
60
|
+
--harness .harness-evolver/harnesses/{version}/harness.py \
|
|
61
|
+
--config .harness-evolver/harnesses/{version}/config.json \
|
|
62
|
+
--tasks-dir .harness-evolver/eval/tasks/ \
|
|
63
|
+
--eval .harness-evolver/eval/eval.py \
|
|
64
|
+
--traces-dir .harness-evolver/harnesses/{version}/traces/ \
|
|
65
|
+
--scores .harness-evolver/harnesses/{version}/scores.json \
|
|
66
|
+
--timeout 60
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### 5. Update State
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
python3 $TOOLS/state.py update \
|
|
73
|
+
--base-dir .harness-evolver \
|
|
74
|
+
--version {version} \
|
|
75
|
+
--scores .harness-evolver/harnesses/{version}/scores.json \
|
|
76
|
+
--proposal .harness-evolver/harnesses/{version}/proposal.md
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### 6. Report
|
|
80
|
+
|
|
81
|
+
Read `summary.json`. Print: `Iteration {i}/{N}: {version} scored {score} (best: {best} at {best_score})`
|
|
82
|
+
|
|
83
|
+
### 7. Check Stop Conditions
|
|
84
|
+
|
|
85
|
+
- **Stagnation**: the last `stagnation_limit` scores (default: 3) are within 1% of each other → stop
|
|
86
|
+
- **Target**: `target_score` is set in `config.json` and `combined_score >= target_score` → stop
|
|
87
|
+
- **N reached**: done
|
|
88
|
+
|
|
89
|
+
## When Loop Ends — Final Report
|
|
90
|
+
|
|
91
|
+
- Best version and score
|
|
92
|
+
- Improvement over baseline (absolute and %)
|
|
93
|
+
- Total iterations run
|
|
94
|
+
- Suggest: "The best harness is at `.harness-evolver/harnesses/{best}/harness.py`. Copy it to your project."
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: init
|
|
3
|
+
description: "Use when the user wants to set up harness optimization in their project, optimize an LLM agent, improve a harness, or mentions harness-evolver for the first time in a project without .harness-evolver/ directory."
|
|
4
|
+
argument-hint: "[directory]"
|
|
5
|
+
allowed-tools: [Read, Write, Edit, Bash, Glob, Grep, Agent]
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# /harness-evolver:init
|
|
9
|
+
|
|
10
|
+
Set up the Harness Evolver in a project. Scans the codebase, identifies the entry point, creates missing artifacts, runs baseline evaluation.
|
|
11
|
+
|
|
12
|
+
## Resolve Tool Path
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
TOOLS=$([ -d ".harness-evolver/tools" ] && echo ".harness-evolver/tools" || echo "$HOME/.harness-evolver/tools")
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
Use `$TOOLS` prefix for all tool calls below.
|
|
19
|
+
|
|
20
|
+
## Phase 1: Scan
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
find . -maxdepth 3 -type f -name "*.py" | head -30
|
|
24
|
+
python3 $TOOLS/detect_stack.py .
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Look for:
|
|
28
|
+
- Entry points: files with `if __name__`, or named `main.py`, `app.py`, `agent.py`, `graph.py`, `pipeline.py`, `bot.py`
|
|
29
|
+
- Existing eval: `eval.py`, `score.py`, `judge.py`
|
|
30
|
+
- Existing tasks: directories with JSON files containing `id` + `input` fields
|
|
31
|
+
- Config: `config.json`, `config.yaml`, `.env`
|
|
32
|
+
|
|
33
|
+
## Phase 2: Create What's Missing
|
|
34
|
+
|
|
35
|
+
Three artifacts are needed. For each one, use the existing file if it is found; otherwise create it.
|
|
36
|
+
|
|
37
|
+
**Harness** (`harness.py`): If user's entry point doesn't match our CLI interface (`--input`, `--output`, `--traces-dir`, `--config`), create a thin wrapper that imports their code. Read their entry point first to understand the I/O format. Ask if unsure.
|
|
38
|
+
|
|
39
|
+
**Eval** (`eval.py`): Ask the user what "correct" means for their domain. Generate the simplest eval that gives signal. Even rough scoring works — the evolver iterates.
|
|
40
|
+
|
|
41
|
+
**Tasks** (`tasks/`): If no test data exists, ask the user for 5-10 example input/output pairs. Each task is `{"id": "task_001", "input": "...", "expected": "...", "metadata": {}}`.
|
|
42
|
+
|
|
43
|
+
## Phase 3: Run Init
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
python3 $TOOLS/init.py [directory] \
|
|
47
|
+
--harness harness.py --eval eval.py --tasks tasks/ \
|
|
48
|
+
--tools-dir $TOOLS
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Add `--harness-config config.json` if a config exists.
|
|
52
|
+
|
|
53
|
+
## After Init — Report
|
|
54
|
+
|
|
55
|
+
- What was detected vs created
|
|
56
|
+
- Stack + integrations (LangSmith, Context7)
|
|
57
|
+
- Baseline score
|
|
58
|
+
- Next: `harness-evolver:evolve` to start
|
|
59
|
+
|
|
60
|
+
## Gotchas
|
|
61
|
+
|
|
62
|
+
- The harness must write valid JSON to `--output`. If the user's code returns non-JSON, the wrapper must serialize it.
|
|
63
|
+
- Tasks must have unique `id` fields. Duplicate IDs cause silent eval errors.
|
|
64
|
+
- The `expected` field is never shown to the harness — only the eval script sees it.
|
|
65
|
+
- If `.harness-evolver/` already exists, warn before overwriting.
|
|
66
|
+
- If no Python files exist in CWD, the user is probably in the wrong directory.
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: status
|
|
3
|
+
description: "Use when the user asks about evolution progress, current scores, best harness version, how many iterations ran, or whether the loop is stagnating. Also use when the user says 'status', 'progress', or 'how is it going'."
|
|
4
|
+
allowed-tools: [Read, Bash]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# /harness-evolver:status
|
|
8
|
+
|
|
9
|
+
Show evolution progress.
|
|
10
|
+
|
|
11
|
+
## Resolve Tool Path
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
TOOLS=$([ -d ".harness-evolver/tools" ] && echo ".harness-evolver/tools" || echo "$HOME/.harness-evolver/tools")
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## What To Do
|
|
18
|
+
|
|
19
|
+
If `.harness-evolver/` does not exist, tell the user to run `harness-evolver:init` first.
|
|
20
|
+
|
|
21
|
+
Otherwise:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
python3 $TOOLS/state.py show --base-dir .harness-evolver
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Then read and display `.harness-evolver/STATE.md` for the full history table.
|
|
28
|
+
|
|
29
|
+
## If User Wants More Detail
|
|
30
|
+
|
|
31
|
+
- Scores per task: `cat .harness-evolver/harnesses/{version}/scores.json`
|
|
32
|
+
- What changed: `cat .harness-evolver/harnesses/{version}/proposal.md`
|
|
33
|
+
- Compare two versions: `diff .harness-evolver/harnesses/{vA}/harness.py .harness-evolver/harnesses/{vB}/harness.py`
|
|
34
|
+
- Full history: `cat .harness-evolver/PROPOSER_HISTORY.md`
|
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: harness-evolve
|
|
3
|
-
description: "Run the harness evolution loop. Autonomously proposes, evaluates, and iterates on harness designs using full execution traces as feedback."
|
|
4
|
-
argument-hint: "[--iterations N] [--candidates-per-iter N]"
|
|
5
|
-
allowed-tools: [Read, Write, Edit, Bash, Glob, Grep, Agent]
|
|
6
|
-
---
|
|
7
|
-
|
|
8
|
-
# /harness-evolve
|
|
9
|
-
|
|
10
|
-
Run the Meta-Harness optimization loop.
|
|
11
|
-
|
|
12
|
-
## Arguments
|
|
13
|
-
|
|
14
|
-
- `--iterations N` (default: 10) — number of evolution iterations
|
|
15
|
-
- `--candidates-per-iter N` (default: 1) — harnesses per iteration
|
|
16
|
-
|
|
17
|
-
## Prerequisites
|
|
18
|
-
|
|
19
|
-
Run `/harness-evolve-init` first. The `.harness-evolver/` directory must exist with a valid `summary.json`.
|
|
20
|
-
|
|
21
|
-
## The Loop
|
|
22
|
-
|
|
23
|
-
For each iteration i from 1 to N:
|
|
24
|
-
|
|
25
|
-
### 1. PROPOSE
|
|
26
|
-
|
|
27
|
-
Determine the next version number by reading `summary.json`:
|
|
28
|
-
|
|
29
|
-
```bash
|
|
30
|
-
python3 -c "import json; s=json.load(open('.harness-evolver/summary.json')); print(f'v{s[\"iterations\"]+1:03d}')"
|
|
31
|
-
```
|
|
32
|
-
|
|
33
|
-
Spawn the `harness-evolver-proposer` agent with this prompt:
|
|
34
|
-
|
|
35
|
-
> You are proposing iteration {i}. Create version {version_number} in `.harness-evolver/harnesses/{version_number}/`.
|
|
36
|
-
> Working directory contains `.harness-evolver/` with all prior candidates and traces.
|
|
37
|
-
|
|
38
|
-
The proposer agent will create:
|
|
39
|
-
- `.harness-evolver/harnesses/v{NNN}/harness.py`
|
|
40
|
-
- `.harness-evolver/harnesses/v{NNN}/config.json`
|
|
41
|
-
- `.harness-evolver/harnesses/v{NNN}/proposal.md`
|
|
42
|
-
|
|
43
|
-
### 2. VALIDATE
|
|
44
|
-
|
|
45
|
-
```bash
|
|
46
|
-
python3 ~/.harness-evolver/tools/evaluate.py validate \
|
|
47
|
-
--harness .harness-evolver/harnesses/v{NNN}/harness.py \
|
|
48
|
-
--config .harness-evolver/harnesses/v{NNN}/config.json
|
|
49
|
-
```
|
|
50
|
-
|
|
51
|
-
If validation fails, ask the proposer to fix (1 retry). If it fails again, set score to 0.0 and continue.
|
|
52
|
-
|
|
53
|
-
### 3. EVALUATE
|
|
54
|
-
|
|
55
|
-
```bash
|
|
56
|
-
python3 ~/.harness-evolver/tools/evaluate.py run \
|
|
57
|
-
--harness .harness-evolver/harnesses/v{NNN}/harness.py \
|
|
58
|
-
--config .harness-evolver/harnesses/v{NNN}/config.json \
|
|
59
|
-
--tasks-dir .harness-evolver/eval/tasks/ \
|
|
60
|
-
--eval .harness-evolver/eval/eval.py \
|
|
61
|
-
--traces-dir .harness-evolver/harnesses/v{NNN}/traces/ \
|
|
62
|
-
--scores .harness-evolver/harnesses/v{NNN}/scores.json \
|
|
63
|
-
--timeout 60
|
|
64
|
-
```
|
|
65
|
-
|
|
66
|
-
### 4. UPDATE STATE
|
|
67
|
-
|
|
68
|
-
```bash
|
|
69
|
-
python3 ~/.harness-evolver/tools/state.py update \
|
|
70
|
-
--base-dir .harness-evolver \
|
|
71
|
-
--version v{NNN} \
|
|
72
|
-
--scores .harness-evolver/harnesses/v{NNN}/scores.json \
|
|
73
|
-
--proposal .harness-evolver/harnesses/v{NNN}/proposal.md
|
|
74
|
-
```
|
|
75
|
-
|
|
76
|
-
### 5. REPORT
|
|
77
|
-
|
|
78
|
-
Read the updated `summary.json` and report:
|
|
79
|
-
- `Iteration {i}/{N}: v{NNN} scored {score} (best: v{best} at {best_score})`
|
|
80
|
-
- If regression (score < parent score): warn
|
|
81
|
-
- If new best: celebrate
|
|
82
|
-
|
|
83
|
-
### Stop Conditions
|
|
84
|
-
|
|
85
|
-
- All N iterations completed
|
|
86
|
-
- **Stagnation**: 3 consecutive iterations without >1% improvement. Read `summary.json` history to check.
|
|
87
|
-
- **Target reached**: if `config.json` has `target_score` set and achieved.
|
|
88
|
-
|
|
89
|
-
When stopping, report final summary: best version, score, number of iterations, improvement over baseline.
|
|
90
|
-
|
|
91
|
-
## Tool Path Resolution
|
|
92
|
-
|
|
93
|
-
Check `.harness-evolver/tools/` first (local override), then `~/.harness-evolver/tools/` (global install).
|
|
@@ -1,198 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: harness-evolve-init
|
|
3
|
-
description: "Initialize harness evolution in the current project. Scans the codebase, identifies the entry point, and helps create harness wrapper, eval script, and test cases if they don't exist."
|
|
4
|
-
argument-hint: "[directory]"
|
|
5
|
-
allowed-tools: [Read, Write, Edit, Bash, Glob, Grep, Agent]
|
|
6
|
-
---
|
|
7
|
-
|
|
8
|
-
# /harness-evolve-init
|
|
9
|
-
|
|
10
|
-
Initialize the Harness Evolver for this project.
|
|
11
|
-
|
|
12
|
-
## Usage
|
|
13
|
-
|
|
14
|
-
```
|
|
15
|
-
/harness-evolve-init # setup in current directory
|
|
16
|
-
/harness-evolve-init ./my-project # setup in a specific directory
|
|
17
|
-
```
|
|
18
|
-
|
|
19
|
-
## Your Job: A 3-Phase Setup Wizard
|
|
20
|
-
|
|
21
|
-
You are the intelligent layer. The init.py tool is dumb — it takes paths. Your job is to figure out what to pass it, creating files if needed.
|
|
22
|
-
|
|
23
|
-
### Phase 1: SCAN the project
|
|
24
|
-
|
|
25
|
-
Read the project structure to understand what exists:
|
|
26
|
-
|
|
27
|
-
```bash
|
|
28
|
-
find . -maxdepth 3 -type f -name "*.py" | head -30
|
|
29
|
-
ls -la
|
|
30
|
-
```
|
|
31
|
-
|
|
32
|
-
Look for:
|
|
33
|
-
- **Entry point candidates:** `main.py`, `app.py`, `agent.py`, `graph.py`, `pipeline.py`, `bot.py`, `run.py`, or any file with an `if __name__ == "__main__":` block
|
|
34
|
-
- **Existing eval/test files:** `eval.py`, `test_*.py`, `score.py`, `judge.py`
|
|
35
|
-
- **Existing test data:** `tasks/`, `tests/`, `data/`, `examples/`, `fixtures/`, any dir with JSON/JSONL files
|
|
36
|
-
- **Config files:** `config.json`, `config.yaml`, `.env`
|
|
37
|
-
- **Framework clues:** imports of langchain, langgraph, openai, anthropic, crewai, etc.
|
|
38
|
-
|
|
39
|
-
Also run stack detection:
|
|
40
|
-
```bash
|
|
41
|
-
python3 ~/.harness-evolver/tools/detect_stack.py .
|
|
42
|
-
```
|
|
43
|
-
|
|
44
|
-
### Phase 2: CREATE what's missing
|
|
45
|
-
|
|
46
|
-
There are 3 artifacts needed. For each, check if it exists or needs to be created.
|
|
47
|
-
|
|
48
|
-
#### A. The Harness (`harness.py`)
|
|
49
|
-
|
|
50
|
-
**If `harness.py` already exists with the right interface** (`--input`, `--output`): use it directly.
|
|
51
|
-
|
|
52
|
-
**If the project has an entry point but NOT in our format:** Create a `harness.py` wrapper.
|
|
53
|
-
|
|
54
|
-
Read the entry point to understand its input/output format, then generate a wrapper:
|
|
55
|
-
|
|
56
|
-
```python
|
|
57
|
-
#!/usr/bin/env python3
|
|
58
|
-
"""Harness wrapper for [project name]. Generated by harness-evolve-init."""
|
|
59
|
-
|
|
60
|
-
import argparse
|
|
61
|
-
import json
|
|
62
|
-
import sys
|
|
63
|
-
|
|
64
|
-
# Import the actual project code
|
|
65
|
-
from [entry_module] import [main_function_or_class]
|
|
66
|
-
|
|
67
|
-
def main():
|
|
68
|
-
parser = argparse.ArgumentParser()
|
|
69
|
-
parser.add_argument("--input", required=True)
|
|
70
|
-
parser.add_argument("--output", required=True)
|
|
71
|
-
parser.add_argument("--traces-dir", default=None)
|
|
72
|
-
parser.add_argument("--config", default=None)
|
|
73
|
-
args = parser.parse_args()
|
|
74
|
-
|
|
75
|
-
task = json.load(open(args.input))
|
|
76
|
-
config = json.load(open(args.config)) if args.config else {}
|
|
77
|
-
|
|
78
|
-
# Call the actual project code
|
|
79
|
-
result = [call_the_project](task["input"], **config)
|
|
80
|
-
|
|
81
|
-
json.dump({"id": task["id"], "output": str(result)}, open(args.output, "w"))
|
|
82
|
-
|
|
83
|
-
if __name__ == "__main__":
|
|
84
|
-
main()
|
|
85
|
-
```
|
|
86
|
-
|
|
87
|
-
Adapt this template based on what you learned from reading the entry point. Ask the user to confirm if you're unsure about the input/output mapping.
|
|
88
|
-
|
|
89
|
-
#### B. The Eval (`eval.py`)
|
|
90
|
-
|
|
91
|
-
**If `eval.py` exists:** use it.
|
|
92
|
-
|
|
93
|
-
**If not:** Ask the user what "correct" means for their use case, then generate one:
|
|
94
|
-
|
|
95
|
-
- **Classification/extraction:** exact match or fuzzy match
|
|
96
|
-
- **Chatbot/QA:** LLM-as-judge (requires API key) or keyword matching
|
|
97
|
-
- **Code generation:** execution-based (run the code, check output)
|
|
98
|
-
- **RAG:** relevance scoring
|
|
99
|
-
|
|
100
|
-
Start with the simplest eval that works. The evolver can iterate on the harness without a perfect eval — even a rough eval gives signal.
|
|
101
|
-
|
|
102
|
-
```python
|
|
103
|
-
#!/usr/bin/env python3
|
|
104
|
-
"""Eval script for [project]. Generated by harness-evolve-init."""
|
|
105
|
-
|
|
106
|
-
import argparse
|
|
107
|
-
import json
|
|
108
|
-
import os
|
|
109
|
-
|
|
110
|
-
def main():
|
|
111
|
-
parser = argparse.ArgumentParser()
|
|
112
|
-
parser.add_argument("--results-dir", required=True)
|
|
113
|
-
parser.add_argument("--tasks-dir", required=True)
|
|
114
|
-
parser.add_argument("--scores", required=True)
|
|
115
|
-
args = parser.parse_args()
|
|
116
|
-
|
|
117
|
-
correct, total = 0, 0
|
|
118
|
-
per_task = {}
|
|
119
|
-
|
|
120
|
-
for fname in sorted(os.listdir(args.tasks_dir)):
|
|
121
|
-
if not fname.endswith(".json"):
|
|
122
|
-
continue
|
|
123
|
-
with open(os.path.join(args.tasks_dir, fname)) as f:
|
|
124
|
-
task = json.load(f)
|
|
125
|
-
task_id = task["id"]
|
|
126
|
-
|
|
127
|
-
result_path = os.path.join(args.results_dir, fname)
|
|
128
|
-
if not os.path.exists(result_path):
|
|
129
|
-
per_task[task_id] = {"score": 0.0, "error": "no output"}
|
|
130
|
-
total += 1
|
|
131
|
-
continue
|
|
132
|
-
|
|
133
|
-
with open(result_path) as f:
|
|
134
|
-
result = json.load(f)
|
|
135
|
-
|
|
136
|
-
# ADAPT THIS: define what "correct" means for this project
|
|
137
|
-
expected = task.get("expected", "")
|
|
138
|
-
actual = result.get("output", "")
|
|
139
|
-
match = actual.lower().strip() == expected.lower().strip()
|
|
140
|
-
|
|
141
|
-
per_task[task_id] = {"score": 1.0 if match else 0.0, "expected": expected, "actual": actual}
|
|
142
|
-
correct += int(match)
|
|
143
|
-
total += 1
|
|
144
|
-
|
|
145
|
-
accuracy = correct / total if total > 0 else 0.0
|
|
146
|
-
json.dump({
|
|
147
|
-
"combined_score": accuracy,
|
|
148
|
-
"accuracy": accuracy,
|
|
149
|
-
"total_tasks": total,
|
|
150
|
-
"correct": correct,
|
|
151
|
-
"per_task": per_task,
|
|
152
|
-
}, open(args.scores, "w"), indent=2)
|
|
153
|
-
|
|
154
|
-
if __name__ == "__main__":
|
|
155
|
-
main()
|
|
156
|
-
```
|
|
157
|
-
|
|
158
|
-
#### C. Test Cases (`tasks/`)
|
|
159
|
-
|
|
160
|
-
**If `tasks/` exists with JSON files:** use it.
|
|
161
|
-
|
|
162
|
-
**If the project has test data in another format:** Convert it to our format.
|
|
163
|
-
|
|
164
|
-
**If no test data exists:** Help create 5-10 test cases. Ask the user:
|
|
165
|
-
> "What are typical inputs to your system? And what are the expected outputs? Give me 3-5 examples and I'll create the task files."
|
|
166
|
-
|
|
167
|
-
Each task file is:
|
|
168
|
-
```json
|
|
169
|
-
{"id": "task_001", "input": "...", "expected": "...", "metadata": {}}
|
|
170
|
-
```
|
|
171
|
-
|
|
172
|
-
### Phase 3: RUN init.py
|
|
173
|
-
|
|
174
|
-
Once all 3 artifacts exist, run:
|
|
175
|
-
|
|
176
|
-
```bash
|
|
177
|
-
python3 ~/.harness-evolver/tools/init.py \
|
|
178
|
-
--harness harness.py \
|
|
179
|
-
--eval eval.py \
|
|
180
|
-
--tasks tasks/ \
|
|
181
|
-
--tools-dir ~/.harness-evolver/tools
|
|
182
|
-
```
|
|
183
|
-
|
|
184
|
-
If a harness config exists, add `--harness-config config.json`.
|
|
185
|
-
|
|
186
|
-
If `~/.harness-evolver/tools/init.py` does not exist, check `.harness-evolver/tools/init.py`.
|
|
187
|
-
|
|
188
|
-
### After init completes, report:
|
|
189
|
-
|
|
190
|
-
- What was detected vs created
|
|
191
|
-
- Stack detected (libraries)
|
|
192
|
-
- Integrations available (LangSmith, Context7)
|
|
193
|
-
- Baseline score
|
|
194
|
-
- Next step: `/harness-evolve` to start the optimization loop
|
|
195
|
-
|
|
196
|
-
## Key Principle
|
|
197
|
-
|
|
198
|
-
**Don't ask the user to restructure their project.** You adapt to them. If they have a LangGraph graph in `src/graph.py`, you create a thin wrapper — you don't ask them to rename it to `harness.py`. The wrapper IS the harness.
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: harness-evolve-status
|
|
3
|
-
description: "Show the current status of harness evolution: best score, iteration count, progress history."
|
|
4
|
-
allowed-tools: [Read, Bash]
|
|
5
|
-
---
|
|
6
|
-
|
|
7
|
-
# /harness-evolve-status
|
|
8
|
-
|
|
9
|
-
Show the current evolution status.
|
|
10
|
-
|
|
11
|
-
## What To Do
|
|
12
|
-
|
|
13
|
-
```bash
|
|
14
|
-
python3 ~/.harness-evolver/tools/state.py show --base-dir .harness-evolver
|
|
15
|
-
```
|
|
16
|
-
|
|
17
|
-
If `~/.harness-evolver/tools/state.py` doesn't exist, try the project-local copy:
|
|
18
|
-
|
|
19
|
-
```bash
|
|
20
|
-
python3 .harness-evolver/tools/state.py show --base-dir .harness-evolver
|
|
21
|
-
```
|
|
22
|
-
|
|
23
|
-
Also read and display the contents of `.harness-evolver/STATE.md` for the full status table.
|
|
24
|
-
|
|
25
|
-
If `.harness-evolver/` doesn't exist, tell the user to run `/harness-evolve-init` first.
|