harness-evolver 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +252 -0
- package/agents/harness-evolver-proposer.md +163 -0
- package/bin/install.js +125 -0
- package/examples/classifier/README.md +25 -0
- package/examples/classifier/config.json +3 -0
- package/examples/classifier/eval.py +58 -0
- package/examples/classifier/harness.py +111 -0
- package/examples/classifier/tasks/task_001.json +1 -0
- package/examples/classifier/tasks/task_002.json +1 -0
- package/examples/classifier/tasks/task_003.json +1 -0
- package/examples/classifier/tasks/task_004.json +1 -0
- package/examples/classifier/tasks/task_005.json +1 -0
- package/examples/classifier/tasks/task_006.json +1 -0
- package/examples/classifier/tasks/task_007.json +1 -0
- package/examples/classifier/tasks/task_008.json +1 -0
- package/examples/classifier/tasks/task_009.json +1 -0
- package/examples/classifier/tasks/task_010.json +1 -0
- package/package.json +29 -0
- package/skills/harness-evolve/SKILL.md +93 -0
- package/skills/harness-evolve-init/SKILL.md +53 -0
- package/skills/harness-evolve-status/SKILL.md +25 -0
- package/tools/__pycache__/detect_stack.cpython-313.pyc +0 -0
- package/tools/__pycache__/langsmith_adapter.cpython-313.pyc +0 -0
- package/tools/__pycache__/langsmith_api.cpython-313.pyc +0 -0
- package/tools/__pycache__/trace_logger.cpython-313.pyc +0 -0
- package/tools/detect_stack.py +173 -0
- package/tools/evaluate.py +214 -0
- package/tools/init.py +231 -0
- package/tools/state.py +219 -0
- package/tools/trace_logger.py +42 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Raphael Valdetaro Christi Cordeiro
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
1
|
+
# Harness Evolver
|
|
2
|
+
|
|
3
|
+
End-to-end optimization of LLM agent harnesses, inspired by [Meta-Harness](https://yoonholee.com/meta-harness/) (Lee et al., 2026).
|
|
4
|
+
|
|
5
|
+
**The harness is the 80% factor.** Changing just the scaffolding around a fixed LLM can produce a [6x performance gap](https://arxiv.org/abs/2603.28052) on the same benchmark. Harness Evolver automates the search for better harnesses using an autonomous propose-evaluate-iterate loop with full execution traces as feedback.
|
|
6
|
+
|
|
7
|
+
## Why
|
|
8
|
+
|
|
9
|
+
Manual harness engineering is slow and doesn't scale. Existing optimizers work in prompt-space (OPRO, TextGrad, GEPA) or use compressed summaries. Meta-Harness showed that **code-space search with full diagnostic context** (10M+ tokens of traces) outperforms all of them by 10+ points.
|
|
10
|
+
|
|
11
|
+
Harness Evolver brings that approach to any domain as a Claude Code plugin.
|
|
12
|
+
|
|
13
|
+
## Install
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
# Via npx (recommended)
|
|
17
|
+
npx harness-evolver@latest
|
|
18
|
+
|
|
19
|
+
# Or as a Claude Code plugin
|
|
20
|
+
/plugin install harness-evolver
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
# 1. Copy the example into a working directory
|
|
27
|
+
cp -r ~/.harness-evolver/examples/classifier ./my-classifier
|
|
28
|
+
cd my-classifier
|
|
29
|
+
|
|
30
|
+
# 2. Initialize (validates harness, evaluates baseline)
|
|
31
|
+
/harness-evolve-init --harness harness.py --eval eval.py --tasks tasks/
|
|
32
|
+
|
|
33
|
+
# 3. Run the evolution loop
|
|
34
|
+
/harness-evolve --iterations 5
|
|
35
|
+
|
|
36
|
+
# 4. Check progress anytime
|
|
37
|
+
/harness-evolve-status
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
The classifier example runs in mock mode (no API key needed) and demonstrates the full loop in under 2 minutes.
|
|
41
|
+
|
|
42
|
+
## How It Works
|
|
43
|
+
|
|
44
|
+
```
|
|
45
|
+
┌─────────────────────────────┐
|
|
46
|
+
│ /harness-evolve │
|
|
47
|
+
│ (orchestrator skill) │
|
|
48
|
+
└──────────┬──────────────────┘
|
|
49
|
+
│
|
|
50
|
+
┌────────────────┼────────────────┐
|
|
51
|
+
▼ ▼ ▼
|
|
52
|
+
┌──────────┐ ┌────────────┐ ┌──────────┐
|
|
53
|
+
│ PROPOSE │ │ EVALUATE │ │ UPDATE │
|
|
54
|
+
│ proposer │ │ evaluate.py│ │ state.py │
|
|
55
|
+
│ agent │ │ + eval.py │ │ │
|
|
56
|
+
└──────────┘ └────────────┘ └──────────┘
|
|
57
|
+
│ │ │
|
|
58
|
+
▼ ▼ ▼
|
|
59
|
+
harnesses/ traces/ summary.json
|
|
60
|
+
v{N}/ per-task STATE.md
|
|
61
|
+
harness.py stdout/stderr PROPOSER_HISTORY.md
|
|
62
|
+
proposal.md timing.json
|
|
63
|
+
scores.json
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
1. **Propose** — A proposer agent (Claude Code subagent) reads all prior candidates' code, execution traces, and scores. It diagnoses failure modes via counterfactual analysis and writes a new harness.
|
|
67
|
+
2. **Evaluate** — The harness runs against every task. Traces are captured per-task (input, output, stdout, stderr, timing). The user's eval script scores the results.
|
|
68
|
+
3. **Update** — State files are updated with the new score, parent lineage, and regression detection.
|
|
69
|
+
4. **Repeat** — The loop continues until N iterations, stagnation (3 rounds without >1% improvement), or a target score is reached.
|
|
70
|
+
|
|
71
|
+
## The Harness Contract
|
|
72
|
+
|
|
73
|
+
A harness is **any executable** that accepts:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
python3 harness.py --input task.json --output result.json [--traces-dir DIR] [--config config.json]
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
- `--input`: JSON with `{id, input, metadata}` (never sees expected answers)
|
|
80
|
+
- `--output`: JSON with `{id, output}`
|
|
81
|
+
- `--traces-dir`: optional directory for the harness to write rich traces
|
|
82
|
+
- `--config`: optional JSON with evolvable parameters (model, temperature, etc.)
|
|
83
|
+
|
|
84
|
+
The eval script is also any executable:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
python3 eval.py --results-dir results/ --tasks-dir tasks/ --scores scores.json
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
This means Harness Evolver works with **any language, any framework, any domain**.
|
|
91
|
+
|
|
92
|
+
## Project Structure
|
|
93
|
+
|
|
94
|
+
```
|
|
95
|
+
.harness-evolver/ # Created in your project by /harness-evolve-init
|
|
96
|
+
├── config.json # Project config (harness cmd, eval cmd, evolution params)
|
|
97
|
+
├── summary.json # Source of truth (versions, scores, parents)
|
|
98
|
+
├── STATE.md # Human-readable status (generated)
|
|
99
|
+
├── PROPOSER_HISTORY.md # Log of all proposals and outcomes
|
|
100
|
+
├── baseline/ # Original harness (read-only reference)
|
|
101
|
+
│ ├── harness.py
|
|
102
|
+
│ └── config.json
|
|
103
|
+
├── eval/
|
|
104
|
+
│ ├── eval.py # Scoring script
|
|
105
|
+
│ └── tasks/ # Test cases (JSON files)
|
|
106
|
+
└── harnesses/
|
|
107
|
+
└── v001/
|
|
108
|
+
├── harness.py # Candidate code
|
|
109
|
+
├── config.json # Evolvable parameters
|
|
110
|
+
├── proposal.md # Proposer's reasoning
|
|
111
|
+
├── scores.json # Evaluation results
|
|
112
|
+
└── traces/ # Full execution traces
|
|
113
|
+
├── stdout.log
|
|
114
|
+
├── stderr.log
|
|
115
|
+
├── timing.json
|
|
116
|
+
└── task_001/
|
|
117
|
+
├── input.json # What the harness received
|
|
118
|
+
└── output.json # What the harness returned
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## Plugin Architecture
|
|
122
|
+
|
|
123
|
+
Three-layer design inspired by [GSD](https://github.com/gsd-build/get-shit-done):
|
|
124
|
+
|
|
125
|
+
```
|
|
126
|
+
Layer 1: Skills + Agents (markdown) → AI orchestration
|
|
127
|
+
Layer 2: Tools (Python stdlib-only) → Deterministic operations
|
|
128
|
+
Layer 3: Installer (Node.js) → Distribution via npx
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
| Component | Files | Purpose |
|
|
132
|
+
|---|---|---|
|
|
133
|
+
| **Skills** | `skills/harness-evolve-init/`, `skills/harness-evolve/`, `skills/harness-evolve-status/` | Slash commands that orchestrate the loop |
|
|
134
|
+
| **Agent** | `agents/harness-evolver-proposer.md` | The proposer — 4-phase workflow (orient, diagnose, propose, document) with 6 rules |
|
|
135
|
+
| **Tools** | `tools/evaluate.py`, `tools/state.py`, `tools/init.py`, `tools/detect_stack.py`, `tools/trace_logger.py` | CLI tools called via subprocess — zero LLM tokens spent on deterministic work |
|
|
136
|
+
| **Installer** | `bin/install.js`, `package.json` | Copies skills/agents/tools to the right locations |
|
|
137
|
+
| **Example** | `examples/classifier/` | 10-task medical classifier with mock mode |
|
|
138
|
+
|
|
139
|
+
## Integrations
|
|
140
|
+
|
|
141
|
+
### LangSmith (optional)
|
|
142
|
+
|
|
143
|
+
If `LANGSMITH_API_KEY` is set, the plugin automatically:
|
|
144
|
+
- Enables `LANGCHAIN_TRACING_V2` for auto-tracing of LangChain/LangGraph harnesses
|
|
145
|
+
- Detects [langsmith-cli](https://github.com/gigaverse-app/langsmith-cli) for the proposer to query traces directly
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
# Setup
|
|
149
|
+
export LANGSMITH_API_KEY=lsv2_...
|
|
150
|
+
uv tool install langsmith-cli && langsmith-cli auth login
|
|
151
|
+
|
|
152
|
+
# The proposer can then do:
|
|
153
|
+
langsmith-cli --json runs list --project harness-evolver-v003 --failed --fields id,name,error
|
|
154
|
+
langsmith-cli --json runs stats --project harness-evolver-v003
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
No custom API client — the proposer uses `langsmith-cli` like it uses `grep` and `diff`.
|
|
158
|
+
|
|
159
|
+
### Context7 (optional)
|
|
160
|
+
|
|
161
|
+
The plugin detects the harness's technology stack via AST analysis (17 libraries supported) and instructs the proposer to consult current documentation before proposing API changes.
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
# Setup
|
|
165
|
+
claude mcp add context7 -- npx -y @upstash/context7-mcp@latest
|
|
166
|
+
|
|
167
|
+
# The proposer automatically:
|
|
168
|
+
# 1. Reads config.json → stack.detected (e.g., LangChain, ChromaDB)
|
|
169
|
+
# 2. Queries Context7 for current docs before writing code
|
|
170
|
+
# 3. Annotates proposal.md with "API verified via Context7"
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Without Context7, the proposer uses model knowledge and annotates "API not verified against current docs."
|
|
174
|
+
|
|
175
|
+
### LangChain Docs MCP (optional)
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
claude mcp add docs-langchain --transport http https://docs.langchain.com/mcp
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
Complements Context7 with LangChain/LangGraph/LangSmith-specific documentation search.
|
|
182
|
+
|
|
183
|
+
## The Proposer
|
|
184
|
+
|
|
185
|
+
The proposer agent is the core of the system. It follows a 4-phase workflow derived from the Meta-Harness paper:
|
|
186
|
+
|
|
187
|
+
| Phase | Context % | What it does |
|
|
188
|
+
|---|---|---|
|
|
189
|
+
| **Orient** | ~6% | Read `summary.json` and `PROPOSER_HISTORY.md`. Decide which 2-3 versions to investigate. |
|
|
190
|
+
| **Diagnose** | ~80% | Deep trace analysis on selected versions. grep for errors, diff between good/bad versions, counterfactual diagnosis. |
|
|
191
|
+
| **Propose** | ~10% | Write new `harness.py` + `config.json`. Prefer additive changes after regressions. |
|
|
192
|
+
| **Document** | ~4% | Write `proposal.md` with evidence. Append to `PROPOSER_HISTORY.md`. |
|
|
193
|
+
|
|
194
|
+
**6 rules:**
|
|
195
|
+
1. Every change motivated by evidence (cite task ID, trace line, or score delta)
|
|
196
|
+
2. After regression, prefer additive changes
|
|
197
|
+
3. Don't repeat past mistakes (read PROPOSER_HISTORY.md)
|
|
198
|
+
4. One hypothesis at a time when possible
|
|
199
|
+
5. Maintain the CLI interface
|
|
200
|
+
6. Prefer readable harnesses over defensive ones
|
|
201
|
+
|
|
202
|
+
## Supported Libraries (Stack Detection)
|
|
203
|
+
|
|
204
|
+
The AST-based stack detector recognizes 17 libraries:
|
|
205
|
+
|
|
206
|
+
| Category | Libraries |
|
|
207
|
+
|---|---|
|
|
208
|
+
| **AI Frameworks** | LangChain, LangGraph, LlamaIndex, OpenAI, Anthropic, DSPy, CrewAI, AutoGen |
|
|
209
|
+
| **Vector Stores** | ChromaDB, Pinecone, Qdrant, Weaviate |
|
|
210
|
+
| **Web** | FastAPI, Flask, Pydantic |
|
|
211
|
+
| **Data** | Pandas, NumPy |
|
|
212
|
+
|
|
213
|
+
## Development
|
|
214
|
+
|
|
215
|
+
```bash
|
|
216
|
+
# Run all tests (41 tests, stdlib-only, no pip install needed)
|
|
217
|
+
python3 -m unittest discover -s tests -v
|
|
218
|
+
|
|
219
|
+
# Test the example manually
|
|
220
|
+
cd examples/classifier
|
|
221
|
+
python3 harness.py --input tasks/task_001.json --output /tmp/result.json --config config.json
|
|
222
|
+
cat /tmp/result.json
|
|
223
|
+
|
|
224
|
+
# Run the installer locally
|
|
225
|
+
node bin/install.js
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
## Comparison with Related Work
|
|
229
|
+
|
|
230
|
+
| | Meta-Harness (paper) | A-Evolve | ECC /evolve | **Harness Evolver** |
|
|
231
|
+
|---|---|---|---|---|
|
|
232
|
+
| **Format** | Paper artifact | Framework (Docker) | Plugin (passive) | **Plugin (active)** |
|
|
233
|
+
| **Search space** | Code-space | Code-space | Prompt-space | **Code-space** |
|
|
234
|
+
| **Context/iter** | 10M tokens | Variable | N/A | **Full filesystem** |
|
|
235
|
+
| **Domain** | TerminalBench-2 | Coding benchmarks | Dev workflow | **Any domain** |
|
|
236
|
+
| **Install** | Manual Python | Docker CLI | `/plugin install` | **`npx` or `/plugin install`** |
|
|
237
|
+
| **LangSmith** | No | No | No | **Yes (langsmith-cli)** |
|
|
238
|
+
| **Context7** | No | No | No | **Yes (MCP)** |
|
|
239
|
+
|
|
240
|
+
## References
|
|
241
|
+
|
|
242
|
+
- [Meta-Harness: End-to-End Optimization of Model Harnesses](https://arxiv.org/abs/2603.28052) — Lee et al., 2026
|
|
243
|
+
- [GSD (Get Shit Done)](https://github.com/gsd-build/get-shit-done) — CLI architecture inspiration
|
|
244
|
+
- [LangSmith CLI](https://github.com/gigaverse-app/langsmith-cli) — Trace analysis for the proposer
|
|
245
|
+
- [Context7](https://github.com/upstash/context7) — Documentation lookup via MCP
|
|
246
|
+
- [Design Spec](docs/specs/2026-03-31-harness-evolver-design.md)
|
|
247
|
+
- [LangSmith Integration Spec](docs/specs/2026-03-31-langsmith-integration.md)
|
|
248
|
+
- [Context7 Integration Spec](docs/specs/2026-03-31-context7-integration.md)
|
|
249
|
+
|
|
250
|
+
## License
|
|
251
|
+
|
|
252
|
+
MIT
|
|
package/agents/harness-evolver-proposer.md
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: harness-evolver-proposer
|
|
3
|
+
description: |
|
|
4
|
+
Use this agent when the harness-evolve skill needs to propose a new harness candidate.
|
|
5
|
+
This agent navigates the .harness-evolver/ filesystem to diagnose failures in prior
|
|
6
|
+
candidates and propose an improved harness. It is the core of the Meta-Harness optimization loop.
|
|
7
|
+
model: opus
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Harness Evolver — Proposer Agent
|
|
11
|
+
|
|
12
|
+
You are the proposer in a Meta-Harness optimization loop. Your job is to analyze all prior harness candidates — their code, execution traces, and scores — and propose a new harness that improves on them.
|
|
13
|
+
|
|
14
|
+
## Context
|
|
15
|
+
|
|
16
|
+
You are working inside a `.harness-evolver/` directory with this structure:
|
|
17
|
+
|
|
18
|
+
```
|
|
19
|
+
.harness-evolver/
|
|
20
|
+
├── summary.json # Panorama: all versions, scores, parents
|
|
21
|
+
├── PROPOSER_HISTORY.md # Your prior decisions and their outcomes
|
|
22
|
+
├── config.json # Project config (harness command, eval command, etc.)
|
|
23
|
+
├── baseline/
|
|
24
|
+
│ ├── harness.py # Original harness (read-only reference)
|
|
25
|
+
│ └── config.json # Original config
|
|
26
|
+
├── eval/
|
|
27
|
+
│ ├── eval.py # Scoring script (DO NOT MODIFY)
|
|
28
|
+
│ └── tasks/ # Test cases (DO NOT MODIFY)
|
|
29
|
+
└── harnesses/
|
|
30
|
+
└── v001/
|
|
31
|
+
├── harness.py # Candidate code
|
|
32
|
+
├── config.json # Candidate params
|
|
33
|
+
├── proposal.md # Why this version exists
|
|
34
|
+
├── scores.json # How it scored
|
|
35
|
+
└── traces/
|
|
36
|
+
├── stdout.log # Raw stdout from harness runs
|
|
37
|
+
├── stderr.log # Raw stderr
|
|
38
|
+
├── timing.json # Per-task timing
|
|
39
|
+
└── task_001/
|
|
40
|
+
├── input.json # What the harness received
|
|
41
|
+
├── output.json # What the harness returned
|
|
42
|
+
└── extra/ # Optional traces from harness
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Your Workflow
|
|
46
|
+
|
|
47
|
+
### Phase 1: ORIENT (read summary, identify focus)
|
|
48
|
+
|
|
49
|
+
1. Read `summary.json` to see all versions, scores, and parent lineage.
|
|
50
|
+
2. Read `PROPOSER_HISTORY.md` to see what you've tried before and what worked or failed.
|
|
51
|
+
3. Decide which 2-3 versions to investigate deeply:
|
|
52
|
+
- (a) The current best candidate
|
|
53
|
+
- (b) The most recent regression (if any)
|
|
54
|
+
- (c) A version with a different failure mode
|
|
55
|
+
|
|
56
|
+
### Phase 2: DIAGNOSE (deep trace analysis)
|
|
57
|
+
|
|
58
|
+
Investigate the selected versions. Use standard tools:
|
|
59
|
+
- `cat .harness-evolver/harnesses/v{N}/scores.json` — see per-task results
|
|
60
|
+
- `cat .harness-evolver/harnesses/v{N}/traces/task_XXX/output.json` — see what went wrong
|
|
61
|
+
- `cat .harness-evolver/harnesses/v{N}/traces/stderr.log` — look for errors
|
|
62
|
+
- `diff .harness-evolver/harnesses/v{A}/harness.py .harness-evolver/harnesses/v{B}/harness.py` — compare
|
|
63
|
+
- `grep -r "error\|Error\|FAIL\|exception" .harness-evolver/harnesses/v{N}/traces/`
|
|
64
|
+
|
|
65
|
+
Ask yourself:
|
|
66
|
+
- Which tasks fail? Is there a pattern?
|
|
67
|
+
- What changed between a version that passed and one that failed?
|
|
68
|
+
- Is this a code bug, a prompt issue, a retrieval problem, or a parameter problem?
|
|
69
|
+
|
|
70
|
+
**Do NOT read traces of all versions.** Focus on 2-3. Use summary.json to filter.
|
|
71
|
+
|
|
72
|
+
### Phase 3: PROPOSE (write new harness)
|
|
73
|
+
|
|
74
|
+
Based on your diagnosis, create a new version directory and write:
|
|
75
|
+
|
|
76
|
+
1. `harnesses/v{NEXT}/harness.py` — the new harness code
|
|
77
|
+
2. `harnesses/v{NEXT}/config.json` — parameters (copy from parent, modify if needed)
|
|
78
|
+
3. `harnesses/v{NEXT}/proposal.md` — your reasoning (MUST include "Based on v{PARENT}")
|
|
79
|
+
|
|
80
|
+
**The harness MUST maintain this CLI interface:**
|
|
81
|
+
```
|
|
82
|
+
python3 harness.py --input INPUT.json --output OUTPUT.json [--traces-dir DIR] [--config CONFIG.json]
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### Phase 4: DOCUMENT
|
|
86
|
+
|
|
87
|
+
Write a clear `proposal.md` that includes:
|
|
88
|
+
- `Based on v{PARENT}` on the first line
|
|
89
|
+
- What failure modes you identified
|
|
90
|
+
- What specific changes you made and why
|
|
91
|
+
- What you expect to improve
|
|
92
|
+
|
|
93
|
+
Append a summary to `PROPOSER_HISTORY.md`.
|
|
94
|
+
|
|
95
|
+
## Rules
|
|
96
|
+
|
|
97
|
+
1. **Every change motivated by evidence.** Cite the task ID, trace line, or score delta that justifies the change. Never change code "to see what happens."
|
|
98
|
+
|
|
99
|
+
2. **After a regression, prefer additive changes.** If the last version regressed, make smaller, safer modifications. Don't combine multiple changes.
|
|
100
|
+
|
|
101
|
+
3. **Don't repeat past mistakes.** Read PROPOSER_HISTORY.md. If an approach already failed (e.g., "changed prompt template, broke JSON parsing"), don't try a similar approach without strong justification.
|
|
102
|
+
|
|
103
|
+
4. **One hypothesis at a time when possible.** Changing A+B+C simultaneously makes it impossible to diagnose which helped or hurt. If you must make multiple changes, document each clearly.
|
|
104
|
+
|
|
105
|
+
5. **Maintain the interface.** The harness must accept --input, --output, --traces-dir, --config. Breaking the interface breaks the entire loop.
|
|
106
|
+
|
|
107
|
+
6. **Prefer readable harnesses over defensive ones.** If the harness has grown past 2x the baseline size without proportional score improvement, consider simplifying. Accumulated try/catch blocks, redundant fallbacks, and growing if-chains are a code smell in evolved harnesses.
|
|
108
|
+
|
|
109
|
+
## Documentation Lookup (if Context7 available)
|
|
110
|
+
|
|
111
|
+
- Read `config.json` field `stack.detected` to see which libraries the harness uses.
|
|
112
|
+
- BEFORE writing code that uses a library from the detected stack,
|
|
113
|
+
use the `resolve-library-id` tool with the `context7_id` from the config, then
|
|
114
|
+
`get-library-docs` to fetch documentation relevant to your proposed change.
|
|
115
|
+
- If Context7 is NOT available, proceed with model knowledge
|
|
116
|
+
but note in `proposal.md`: "API not verified against current docs."
|
|
117
|
+
- Do NOT look up docs for every line of code — only when proposing
|
|
118
|
+
changes that involve specific APIs (new imports, new methods, new parameters).
|
|
119
|
+
|
|
120
|
+
## What You Do NOT Do
|
|
121
|
+
|
|
122
|
+
- Do NOT run the evaluation. The evolve skill handles that after you propose.
|
|
123
|
+
- Do NOT modify anything in `eval/` — the eval set and scoring are fixed.
|
|
124
|
+
- Do NOT modify `baseline/` — it is your immutable reference.
|
|
125
|
+
- Do NOT modify any prior version's files — history is immutable.
|
|
126
|
+
- Do NOT create files outside of `harnesses/v{NEXT}/` and `PROPOSER_HISTORY.md`.
|
|
127
|
+
|
|
128
|
+
## LangSmith Traces (when langsmith-cli is available)
|
|
129
|
+
|
|
130
|
+
If LangSmith tracing is enabled (check `config.json` field `eval.langsmith.enabled`),
|
|
131
|
+
each harness run is automatically traced to a LangSmith project named
|
|
132
|
+
`{project_prefix}-v{NNN}`.
|
|
133
|
+
|
|
134
|
+
Use `langsmith-cli` to query traces directly:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
# Find failures in this version
|
|
138
|
+
langsmith-cli --json runs list --project harness-evolver-v{N} --failed --fields id,name,error,inputs
|
|
139
|
+
|
|
140
|
+
# Aggregate stats (error rate, latency p50/p95/p99)
|
|
141
|
+
langsmith-cli --json runs stats --project harness-evolver-v{N}
|
|
142
|
+
|
|
143
|
+
# Search for specific error patterns
|
|
144
|
+
langsmith-cli --json runs list --grep "pattern" --grep-in error --project harness-evolver-v{N} --fields id,error
|
|
145
|
+
|
|
146
|
+
# Compare two versions
|
|
147
|
+
langsmith-cli --json runs stats --project harness-evolver-v{A}
|
|
148
|
+
langsmith-cli --json runs stats --project harness-evolver-v{B}
|
|
149
|
+
|
|
150
|
+
# Get full details of latest failure
|
|
151
|
+
langsmith-cli --json runs get-latest --project harness-evolver-v{N} --failed
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
ALWAYS use `--json` as the first flag and `--fields` to limit output size.
|
|
155
|
+
If `langsmith-cli` is not available, fall back to local traces in `traces/` as usual.
|
|
156
|
+
|
|
157
|
+
## Output
|
|
158
|
+
|
|
159
|
+
When done, report what you created:
|
|
160
|
+
- Version number (e.g., "v003")
|
|
161
|
+
- Parent version
|
|
162
|
+
- 1-sentence summary of the change
|
|
163
|
+
- Expected impact on score
|
package/bin/install.js
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Harness Evolver installer.
|
|
4
|
+
* Detects Claude Code, copies skills/agents/tools to the right locations.
|
|
5
|
+
*
|
|
6
|
+
* Usage: npx harness-evolver@latest
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
const fs = require("fs");
|
|
10
|
+
const path = require("path");
|
|
11
|
+
const { execSync } = require("child_process");
|
|
12
|
+
|
|
13
|
+
const PLUGIN_ROOT = path.resolve(__dirname, "..");
|
|
14
|
+
const HOME = process.env.HOME || process.env.USERPROFILE;
|
|
15
|
+
|
|
16
|
+
const CLAUDE_DIR = path.join(HOME, ".claude");
|
|
17
|
+
const COMMANDS_DIR = path.join(CLAUDE_DIR, "commands", "harness-evolver");
|
|
18
|
+
const AGENTS_DIR = path.join(CLAUDE_DIR, "agents");
|
|
19
|
+
const TOOLS_DIR = path.join(HOME, ".harness-evolver", "tools");
|
|
20
|
+
const EXAMPLES_DIR = path.join(HOME, ".harness-evolver", "examples");
|
|
21
|
+
|
|
22
|
+
// Print an installer status line, indented to line up under the banner.
function log(msg) {
  console.log(` ${msg}`);
}
|
|
25
|
+
|
|
26
|
+
function copyDir(src, dest) {
|
|
27
|
+
fs.mkdirSync(dest, { recursive: true });
|
|
28
|
+
for (const entry of fs.readdirSync(src, { withFileTypes: true })) {
|
|
29
|
+
const srcPath = path.join(src, entry.name);
|
|
30
|
+
const destPath = path.join(dest, entry.name);
|
|
31
|
+
if (entry.isDirectory()) {
|
|
32
|
+
copyDir(srcPath, destPath);
|
|
33
|
+
} else {
|
|
34
|
+
fs.copyFileSync(srcPath, destPath);
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
function copyFile(src, dest) {
|
|
40
|
+
fs.mkdirSync(path.dirname(dest), { recursive: true });
|
|
41
|
+
fs.copyFileSync(src, dest);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
// Probe for a `python3` binary on PATH; returns true when the
// version check runs without throwing.
function checkPython() {
  try {
    execSync("python3 --version", { stdio: "pipe" });
  } catch {
    return false;
  }
  return true;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Installer entry point.
 *
 * Verifies prerequisites (a `python3` binary on PATH and an existing
 * Claude Code directory at CLAUDE_DIR), then copies the plugin's
 * skills, agents, Python tools, and examples into the home-directory
 * locations defined at the top of this file. Exits with code 1 when a
 * prerequisite is missing.
 */
function main() {
  console.log("\n Harness Evolver v0.1.0\n");

  // Hard requirement: the tools are stdlib Python scripts run via python3.
  if (!checkPython()) {
    console.error(" ERROR: python3 not found in PATH. Install Python 3.8+ first.");
    process.exit(1);
  }
  log("\u2713 python3 found");

  // Without ~/.claude there is nowhere to install the slash commands.
  if (!fs.existsSync(CLAUDE_DIR)) {
    console.error(` ERROR: Claude Code directory not found at ${CLAUDE_DIR}`);
    console.error(" Install Claude Code first: https://claude.ai/code");
    process.exit(1);
  }
  log("\u2713 Claude Code detected");

  // Copy skills — each skill directory becomes a slash-command dir
  // under COMMANDS_DIR (~/.claude/commands/harness-evolver/).
  const skillsSource = path.join(PLUGIN_ROOT, "skills");
  if (fs.existsSync(skillsSource)) {
    for (const skill of fs.readdirSync(skillsSource, { withFileTypes: true })) {
      if (skill.isDirectory()) {
        const src = path.join(skillsSource, skill.name);
        const dest = path.join(COMMANDS_DIR, skill.name);
        copyDir(src, dest);
        log(` skill: ${skill.name}`);
      }
    }
  }

  // Copy agents — flat markdown files into AGENTS_DIR (~/.claude/agents/).
  const agentsSource = path.join(PLUGIN_ROOT, "agents");
  if (fs.existsSync(agentsSource)) {
    fs.mkdirSync(AGENTS_DIR, { recursive: true });
    for (const agent of fs.readdirSync(agentsSource)) {
      copyFile(
        path.join(agentsSource, agent),
        path.join(AGENTS_DIR, agent)
      );
      log(` agent: ${agent}`);
    }
  }

  // Copy tools — only .py sources; skips anything else in tools/
  // (e.g. __pycache__ artifacts shipped in the package).
  const toolsSource = path.join(PLUGIN_ROOT, "tools");
  if (fs.existsSync(toolsSource)) {
    fs.mkdirSync(TOOLS_DIR, { recursive: true });
    for (const tool of fs.readdirSync(toolsSource)) {
      if (tool.endsWith(".py")) {
        copyFile(
          path.join(toolsSource, tool),
          path.join(TOOLS_DIR, tool)
        );
        log(` tool: ${tool}`);
      }
    }
  }

  // Copy examples — whole tree into EXAMPLES_DIR (~/.harness-evolver/examples/).
  const examplesSource = path.join(PLUGIN_ROOT, "examples");
  if (fs.existsSync(examplesSource)) {
    copyDir(examplesSource, EXAMPLES_DIR);
    log(" examples: classifier");
  }

  console.log("\n \u2713 Installed successfully!\n");
  console.log(" Next steps:");
  console.log(" 1. Copy an example: cp -r ~/.harness-evolver/examples/classifier ./my-project");
  console.log(" 2. cd my-project");
  console.log(" 3. /harness-evolve-init --harness harness.py --eval eval.py --tasks tasks/");
  console.log(" 4. /harness-evolve --iterations 5\n");
}
|
|
124
|
+
|
|
125
|
+
main();
|
|
package/examples/classifier/README.md
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Classifier Example
|
|
2
|
+
|
|
3
|
+
Medical symptom classifier — deliberately naive, designed to be improved by the evolver.
|
|
4
|
+
|
|
5
|
+
## Quick Start (Mock Mode — No API Key)
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
/harness-evolve-init --harness harness.py --eval eval.py --tasks tasks/
|
|
9
|
+
/harness-evolve --iterations 5
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
## With LLM
|
|
13
|
+
|
|
14
|
+
Edit `config.json`:
|
|
15
|
+
```json
|
|
16
|
+
{
|
|
17
|
+
"mock": false,
|
|
18
|
+
"api_key": "sk-ant-...",
|
|
19
|
+
"model": "claude-haiku-4-5-20251001"
|
|
20
|
+
}
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Categories
|
|
24
|
+
|
|
25
|
+
respiratory, cardiac, gastrointestinal, neurological, musculoskeletal, dermatological
|
|
package/examples/classifier/eval.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Exact match accuracy scorer for the classifier example."""
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def main():
    """Score harness results by exact (case-insensitive) match against expected labels.

    Reads every ``*.json`` task file in ``--tasks-dir``, pairs it with the
    same-named file in ``--results-dir``, and writes an aggregate scores JSON
    to ``--scores`` with per-task detail. A missing result file counts as a
    0.0 score for that task.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--results-dir", required=True,
                        help="Directory of harness output JSON files")
    parser.add_argument("--tasks-dir", required=True,
                        help="Directory of task JSON files (each with an 'expected' label)")
    parser.add_argument("--scores", required=True,
                        help="Path to write the aggregate scores JSON")
    args = parser.parse_args()

    correct = 0
    total = 0
    per_task = {}

    for fname in sorted(os.listdir(args.tasks_dir)):
        if not fname.endswith(".json"):
            continue
        task_path = os.path.join(args.tasks_dir, fname)
        # Context managers close file handles deterministically; the original
        # left every handle to the garbage collector.
        with open(task_path, encoding="utf-8") as f:
            task = json.load(f)
        task_id = task["id"]

        result_path = os.path.join(args.results_dir, fname)
        if not os.path.exists(result_path):
            # Harness produced no output for this task: score it 0 and keep going.
            per_task[task_id] = {"score": 0.0, "error": "no output file"}
            total += 1
            continue

        with open(result_path, encoding="utf-8") as f:
            result = json.load(f)
        expected = task["expected"].lower().strip()
        # str() guards against a harness emitting a non-string "output"
        # (the original crashed the whole eval on .lower() in that case).
        actual = str(result.get("output", "")).lower().strip()
        match = actual == expected

        per_task[task_id] = {
            "score": 1.0 if match else 0.0,
            "expected": expected,
            "actual": actual,
        }
        correct += int(match)
        total += 1

    accuracy = correct / total if total > 0 else 0.0
    scores = {
        "combined_score": accuracy,  # generic key read by the evolve loop
        "accuracy": accuracy,
        "total_tasks": total,
        "correct": correct,
        "per_task": per_task,
    }
    # Explicit close (via `with`) guarantees the scores file is flushed
    # before the process that spawned this script reads it.
    with open(args.scores, "w", encoding="utf-8") as f:
        json.dump(scores, f, indent=2)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
if __name__ == "__main__":
|
|
58
|
+
main()
|