harness-evolver 4.3.0 → 4.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +15 -9
- package/package.json +1 -1
- package/skills/evolve/SKILL.md +28 -33
- package/tools/run_eval.py +6 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "harness-evolver",
|
|
3
3
|
"description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
|
|
4
|
-
"version": "4.3.0",
|
|
4
|
+
"version": "4.3.1",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Raphael Valdetaro"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -91,8 +91,12 @@ claude
|
|
|
91
91
|
<td>Cross-iteration memory consolidation inspired by Claude Code's autoDream. Tracks which approaches win, which failures recur, and promotes insights after 2+ occurrences.</td>
|
|
92
92
|
</tr>
|
|
93
93
|
<tr>
|
|
94
|
+
<td><b>Dataset Health</b></td>
|
|
95
|
+
<td>Pre-flight dataset quality check: size adequacy, difficulty distribution, dead example detection, production coverage analysis, train/held-out splits. Auto-corrects issues before evolution starts.</td>
|
|
96
|
+
</tr>
|
|
97
|
+
<tr>
|
|
94
98
|
<td><b>Smart Gating</b></td>
|
|
95
|
-
<td>
|
|
99
|
+
<td>Claude assesses gate conditions directly — score plateau, target reached, diminishing returns. No hardcoded thresholds. State validation ensures config hasn't diverged from LangSmith.</td>
|
|
96
100
|
</tr>
|
|
97
101
|
<tr>
|
|
98
102
|
<td><b>Background Mode</b></td>
|
|
@@ -107,6 +111,7 @@ claude
|
|
|
107
111
|
| Command | What it does |
|
|
108
112
|
|---|---|
|
|
109
113
|
| `/evolver:setup` | Explore project, configure LangSmith (dataset, evaluators), run baseline |
|
|
114
|
+
| `/evolver:health` | Check dataset quality (size, difficulty, coverage, splits), auto-correct issues |
|
|
110
115
|
| `/evolver:evolve` | Run the optimization loop (dynamic self-organizing proposers in worktrees) |
|
|
111
116
|
| `/evolver:status` | Show progress, scores, history |
|
|
112
117
|
| `/evolver:deploy` | Tag, push, clean up temporary files |
|
|
@@ -132,10 +137,11 @@ claude
|
|
|
132
137
|
/evolver:evolve
|
|
133
138
|
|
|
|
134
139
|
+- 0.5 Validate state (skeptical memory — check .evolver.json vs LangSmith)
|
|
140
|
+
+- 0.6 /evolver:health — dataset quality check + auto-correct
|
|
135
141
|
+- 1. Read state (.evolver.json + LangSmith experiments)
|
|
136
142
|
+- 1.5 Gather trace insights (cluster errors, tokens, latency)
|
|
137
|
-
+- 1.8 Analyze per-task failures
|
|
138
|
-
+- 1.8a
|
|
143
|
+
+- 1.8 Analyze per-task failures (train split only — proposers don't see held-out)
|
|
144
|
+
+- 1.8a Claude generates strategy.md + lenses.json from analysis data
|
|
139
145
|
+- 1.9 Prepare shared proposer context (KV cache-optimized prefix)
|
|
140
146
|
+- 2. Spawn N self-organizing proposers in parallel (each in a git worktree)
|
|
141
147
|
+- 3. Run target for each candidate (code-based evaluators)
|
|
@@ -144,10 +150,10 @@ claude
|
|
|
144
150
|
+- 5. Merge winning worktree into main branch
|
|
145
151
|
+- 5.5 Regression tracking (auto-add guard examples to dataset)
|
|
146
152
|
+- 6. Report results
|
|
147
|
-
+- 6.2
|
|
153
|
+
+- 6.2 Consolidator agent updates evolution memory (runs in background)
|
|
148
154
|
+- 6.5 Auto-trigger Active Critic (detect + fix evaluator gaming)
|
|
149
155
|
+- 7. Auto-trigger ULTRAPLAN Architect (opus model, deep analysis)
|
|
150
|
-
+- 8.
|
|
156
|
+
+- 8. Claude assesses gate conditions (plateau, target, diminishing returns)
|
|
151
157
|
```
|
|
152
158
|
|
|
153
159
|
---
|
|
@@ -159,7 +165,8 @@ Plugin hook (SessionStart)
|
|
|
159
165
|
└→ Creates venv, installs langsmith + langsmith-cli, exports env vars
|
|
160
166
|
|
|
161
167
|
Skills (markdown)
|
|
162
|
-
├── /evolver:setup → explores project, runs setup.py
|
|
168
|
+
├── /evolver:setup → explores project, smart defaults, runs setup.py
|
|
169
|
+
├── /evolver:health → dataset quality check + auto-correct
|
|
163
170
|
├── /evolver:evolve → orchestrates the evolution loop
|
|
164
171
|
├── /evolver:status → reads .evolver.json + LangSmith
|
|
165
172
|
└── /evolver:deploy → tags and pushes
|
|
@@ -179,10 +186,8 @@ Tools (Python + langsmith SDK)
|
|
|
179
186
|
├── trace_insights.py → clusters errors from traces
|
|
180
187
|
├── seed_from_traces.py → imports production traces
|
|
181
188
|
├── validate_state.py → validates config vs LangSmith state
|
|
182
|
-
├──
|
|
189
|
+
├── dataset_health.py → dataset quality diagnostic (size, difficulty, coverage, splits)
|
|
183
190
|
├── regression_tracker.py → tracks regressions, adds guard examples
|
|
184
|
-
├── consolidate.py → cross-iteration memory consolidation
|
|
185
|
-
├── synthesize_strategy.py→ generates strategy document + investigation lenses
|
|
186
191
|
├── add_evaluator.py → programmatically adds evaluators
|
|
187
192
|
└── adversarial_inject.py → detects memorization, injects adversarial tests
|
|
188
193
|
```
|
|
@@ -221,6 +226,7 @@ LangSmith traces **any** AI framework. The evolver works with all of them:
|
|
|
221
226
|
- [Darwin Godel Machine](https://sakana.ai/dgm/) — Sakana AI
|
|
222
227
|
- [AlphaEvolve](https://deepmind.google/blog/alphaevolve/) — DeepMind
|
|
223
228
|
- [LangSmith Evaluation](https://docs.smith.langchain.com/evaluation) — LangChain
|
|
229
|
+
- [Harnessing Claude's Intelligence](https://claude.com/blog/harnessing-claudes-intelligence) — Martin, Anthropic, 2026
|
|
224
230
|
- [Traces Start the Agent Improvement Loop](https://www.langchain.com/conceptual-guides/traces-start-agent-improvement-loop) — LangChain
|
|
225
231
|
|
|
226
232
|
---
|
package/package.json
CHANGED
package/skills/evolve/SKILL.md
CHANGED
|
@@ -156,44 +156,36 @@ For each iteration:
|
|
|
156
156
|
python3 -c "import json; c=json.load(open('.evolver.json')); print(f'v{c[\"iterations\"]+1:03d}')"
|
|
157
157
|
```
|
|
158
158
|
|
|
159
|
-
### 1.5. Gather
|
|
159
|
+
### 1.5. Gather Analysis Data (Parallel)
|
|
160
160
|
|
|
161
|
-
Read the best experiment from config. If null (no baseline was run), skip
|
|
161
|
+
Read the best experiment from config. If null (no baseline was run), skip data gathering — proposers will work from code analysis only:
|
|
162
162
|
|
|
163
163
|
```bash
|
|
164
164
|
BEST=$(python3 -c "import json; b=json.load(open('.evolver.json')).get('best_experiment'); print(b if b else '')")
|
|
165
|
+
PROD=$(python3 -c "import json; c=json.load(open('.evolver.json')); print(c.get('production_project',''))")
|
|
166
|
+
|
|
165
167
|
if [ -n "$BEST" ]; then
|
|
168
|
+
# Run all data gathering in parallel — these are independent API calls
|
|
166
169
|
$EVOLVER_PY $TOOLS/trace_insights.py \
|
|
167
170
|
--from-experiment "$BEST" \
|
|
168
|
-
--output trace_insights.json 2>/dev/null
|
|
169
|
-
fi
|
|
170
|
-
```
|
|
171
|
+
--output trace_insights.json 2>/dev/null &
|
|
171
172
|
|
|
172
|
-
|
|
173
|
+
$EVOLVER_PY $TOOLS/read_results.py \
|
|
174
|
+
--experiment "$BEST" \
|
|
175
|
+
--config .evolver.json \
|
|
176
|
+
--split train \
|
|
177
|
+
--output best_results.json 2>/dev/null &
|
|
178
|
+
fi
|
|
173
179
|
|
|
174
|
-
```bash
|
|
175
|
-
PROD=$(python3 -c "import json; c=json.load(open('.evolver.json')); print(c.get('production_project',''))")
|
|
176
180
|
if [ -n "$PROD" ] && [ ! -f "production_seed.json" ]; then
|
|
177
181
|
$EVOLVER_PY $TOOLS/seed_from_traces.py \
|
|
178
182
|
--project "$PROD" \
|
|
179
183
|
--output-md production_seed.md \
|
|
180
184
|
--output-json production_seed.json \
|
|
181
|
-
--limit 100 2>/dev/null
|
|
185
|
+
--limit 100 2>/dev/null &
|
|
182
186
|
fi
|
|
183
|
-
```
|
|
184
187
|
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
If `$BEST` is set (not the first iteration without baseline), read results and cluster failures:
|
|
188
|
-
|
|
189
|
-
```bash
|
|
190
|
-
if [ -n "$BEST" ]; then
|
|
191
|
-
$EVOLVER_PY $TOOLS/read_results.py \
|
|
192
|
-
--experiment "$BEST" \
|
|
193
|
-
--config .evolver.json \
|
|
194
|
-
--split train \
|
|
195
|
-
--output best_results.json 2>/dev/null
|
|
196
|
-
fi
|
|
188
|
+
wait # Wait for all data gathering to complete
|
|
197
189
|
```
|
|
198
190
|
|
|
199
191
|
If `best_results.json` exists, parse it to find failing examples (score < 0.7). Group by metadata or error pattern.
|
|
@@ -338,20 +330,23 @@ done
|
|
|
338
330
|
|
|
339
331
|
Only run evaluation (Step 3) for proposers that committed changes (not abstained, not stuck).
|
|
340
332
|
|
|
341
|
-
### 3. Run Target for Each Candidate
|
|
333
|
+
### 3. Run Target for Each Candidate (Parallel)
|
|
342
334
|
|
|
343
|
-
|
|
335
|
+
Run evaluations for ALL candidates simultaneously — they're independent:
|
|
344
336
|
|
|
345
337
|
```bash
|
|
346
|
-
#
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
338
|
+
# Launch all evaluations in parallel
|
|
339
|
+
for WORKTREE in {worktree_paths_with_commits}; do
|
|
340
|
+
WORKTREE_PROJECT="$WORKTREE"
|
|
341
|
+
[ -n "$PROJECT_DIR" ] && WORKTREE_PROJECT="$WORKTREE/$PROJECT_DIR"
|
|
342
|
+
|
|
343
|
+
$EVOLVER_PY $TOOLS/run_eval.py \
|
|
344
|
+
--config "$WORKTREE_PROJECT/.evolver.json" \
|
|
345
|
+
--worktree-path "$WORKTREE_PROJECT" \
|
|
346
|
+
--experiment-prefix v{NNN}-{lens_id} \
|
|
347
|
+
--timeout 120 &
|
|
348
|
+
done
|
|
349
|
+
wait # Wait for all evaluations to complete
|
|
355
350
|
```
|
|
356
351
|
|
|
357
352
|
Each candidate becomes a separate LangSmith experiment. This step runs the agent and applies code-based evaluators (has_output, token_efficiency) only.
|
package/tools/run_eval.py
CHANGED
|
@@ -166,11 +166,14 @@ def main():
|
|
|
166
166
|
parser.add_argument("--worktree-path", required=True, help="Path to the candidate's worktree")
|
|
167
167
|
parser.add_argument("--experiment-prefix", required=True, help="Experiment name prefix (e.g. v001a)")
|
|
168
168
|
parser.add_argument("--timeout", type=int, default=120, help="Per-task timeout in seconds")
|
|
169
|
+
parser.add_argument("--concurrency", type=int, default=None, help="Max concurrent evaluations (default: from config or 1)")
|
|
169
170
|
args = parser.parse_args()
|
|
170
171
|
|
|
171
172
|
with open(args.config) as f:
|
|
172
173
|
config = json.load(f)
|
|
173
174
|
|
|
175
|
+
concurrency = args.concurrency or config.get("eval_concurrency", 1)
|
|
176
|
+
|
|
174
177
|
os.environ["EVAL_TASK_TIMEOUT"] = str(args.timeout)
|
|
175
178
|
ensure_langsmith_api_key()
|
|
176
179
|
|
|
@@ -188,6 +191,8 @@ def main():
|
|
|
188
191
|
print(f" Dataset: {config['dataset']}")
|
|
189
192
|
print(f" Worktree: {args.worktree_path}")
|
|
190
193
|
print(f" Code evaluators: {['has_output'] + code_evaluators}")
|
|
194
|
+
if concurrency > 1:
|
|
195
|
+
print(f" Concurrency: {concurrency} parallel evaluations")
|
|
191
196
|
if llm_evaluators:
|
|
192
197
|
print(f" Pending LLM evaluators (agent): {llm_evaluators}")
|
|
193
198
|
|
|
@@ -197,7 +202,7 @@ def main():
|
|
|
197
202
|
data=config["dataset"],
|
|
198
203
|
evaluators=evaluators,
|
|
199
204
|
experiment_prefix=args.experiment_prefix,
|
|
200
|
-
max_concurrency=
|
|
205
|
+
max_concurrency=concurrency,
|
|
201
206
|
)
|
|
202
207
|
|
|
203
208
|
experiment_name = results.experiment_name
|