harness-evolver 4.0.1 → 4.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +4 -4
- package/agents/evolver-proposer.md +0 -8
- package/package.json +1 -1
- package/skills/evolve/SKILL.md +4 -59
- package/tools/synthesize_strategy.py +51 -5
|
package/.claude-plugin/plugin.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "harness-evolver",
|
|
3
3
|
"description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
|
|
4
|
-
"version": "4.0.1",
|
|
4
|
+
"version": "4.0.3",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Raphael Valdetaro"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -95,8 +95,8 @@ claude
|
|
|
95
95
|
<td>Three-gate iteration triggers (score plateau, cost budget, convergence detection) replace blind N-iteration loops. State validation ensures config hasn't diverged from LangSmith.</td>
|
|
96
96
|
</tr>
|
|
97
97
|
<tr>
|
|
98
|
-
<td><b>
|
|
99
|
-
<td>
|
|
98
|
+
<td><b>Background Mode</b></td>
|
|
99
|
+
<td>Run all iterations in background while you continue working. Get notified on completion or significant improvements.</td>
|
|
100
100
|
</tr>
|
|
101
101
|
</table>
|
|
102
102
|
|
|
@@ -137,7 +137,7 @@ claude
|
|
|
137
137
|
+- 1.8 Analyze per-task failures (adaptive briefings)
|
|
138
138
|
+- 1.8a Synthesize strategy document (coordinator synthesis)
|
|
139
139
|
+- 1.9 Prepare shared proposer context (KV cache-optimized prefix)
|
|
140
|
-
+- 2. Spawn 5 proposers in parallel (
|
|
140
|
+
+- 2. Spawn 5 proposers in parallel (each in a git worktree)
|
|
141
141
|
+- 3. Run target for each candidate (code-based evaluators)
|
|
142
142
|
+- 3.5 Spawn evaluator agent (LLM-as-judge via langsmith-cli)
|
|
143
143
|
+- 4. Compare experiments -> select winner + per-task champion
|
|
@@ -165,7 +165,7 @@ Skills (markdown)
|
|
|
165
165
|
└── /evolver:deploy → tags and pushes
|
|
166
166
|
|
|
167
167
|
Agents (markdown)
|
|
168
|
-
├── Proposer (x5) → modifies code in
|
|
168
|
+
├── Proposer (x5) → modifies code in isolated git worktrees
|
|
169
169
|
├── Evaluator → LLM-as-judge via langsmith-cli
|
|
170
170
|
├── Critic → detects gaming + implements stricter evaluators
|
|
171
171
|
├── Architect → ULTRAPLAN deep analysis (opus model)
|
|
package/agents/evolver-proposer.md
CHANGED
|
@@ -148,14 +148,6 @@ Prioritize changes that fix real production failures over synthetic test failure
|
|
|
148
148
|
4. **Commit your changes** — uncommitted changes are lost when the worktree is cleaned up
|
|
149
149
|
5. **Write proposal.md** — the evolve skill reads this to understand what you did
|
|
150
150
|
|
|
151
|
-
## Tool Restrictions
|
|
152
|
-
|
|
153
|
-
Your available tools may be restricted based on your strategy:
|
|
154
|
-
- **Exploit/Crossover/Failure-targeted**: Edit-only (no Write). Focus on modifying existing files.
|
|
155
|
-
- **Explore**: Full access including Write. You may create new files if your approach requires it.
|
|
156
|
-
|
|
157
|
-
If you need to create a file but only have Edit, restructure your approach to modify existing files instead.
|
|
158
|
-
|
|
159
151
|
## Return Protocol
|
|
160
152
|
|
|
161
153
|
When done, end your response with:
|
package/package.json
CHANGED
package/skills/evolve/SKILL.md
CHANGED
|
@@ -87,8 +87,7 @@ If iterations > 3, offer execution mode:
|
|
|
87
87
|
"multiSelect": false,
|
|
88
88
|
"options": [
|
|
89
89
|
{"label": "Interactive", "description": "I'll watch. Show results after each iteration."},
|
|
90
|
-
{"label": "Background", "description": "Run all iterations in background. Notify on completion or significant improvement."}
|
|
91
|
-
{"label": "Scheduled", "description": "Schedule iterations to run on a cron (e.g., nightly optimization)."}
|
|
90
|
+
{"label": "Background", "description": "Run all iterations in background. Notify on completion or significant improvement."}
|
|
92
91
|
]
|
|
93
92
|
}
|
|
94
93
|
]
|
|
@@ -98,35 +97,6 @@ If iterations > 3, offer execution mode:
|
|
|
98
97
|
**If "Background" selected:**
|
|
99
98
|
Run the evolution loop as a background task. Use the `run_in_background` parameter on the main loop execution.
|
|
100
99
|
|
|
101
|
-
**If "Scheduled" selected:**
|
|
102
|
-
Ask for schedule via AskUserQuestion:
|
|
103
|
-
```json
|
|
104
|
-
{
|
|
105
|
-
"questions": [
|
|
106
|
-
{
|
|
107
|
-
"question": "Schedule?",
|
|
108
|
-
"header": "Cron Schedule",
|
|
109
|
-
"multiSelect": false,
|
|
110
|
-
"options": [
|
|
111
|
-
{"label": "Every 6 hours", "description": "Run 1 iteration every 6 hours"},
|
|
112
|
-
{"label": "Nightly (2 AM)", "description": "Run iterations overnight"},
|
|
113
|
-
{"label": "Custom", "description": "Enter a cron expression"}
|
|
114
|
-
]
|
|
115
|
-
}
|
|
116
|
-
]
|
|
117
|
-
}
|
|
118
|
-
```
|
|
119
|
-
|
|
120
|
-
Then create a cron trigger:
|
|
121
|
-
```
|
|
122
|
-
Use CronCreate tool to schedule:
|
|
123
|
-
- command: "/evolver:evolve --iterations 1 --no-interactive"
|
|
124
|
-
- schedule: {selected_cron}
|
|
125
|
-
- description: "Harness Evolver: scheduled optimization iteration"
|
|
126
|
-
```
|
|
127
|
-
|
|
128
|
-
Report: "Scheduled evolution iterations. Use `/evolver:status` to check progress. Cancel with CronDelete."
|
|
129
|
-
|
|
130
100
|
## The Loop
|
|
131
101
|
|
|
132
102
|
Read config:
|
|
@@ -218,10 +188,11 @@ $EVOLVER_PY $TOOLS/synthesize_strategy.py \
|
|
|
218
188
|
--trace-insights trace_insights.json \
|
|
219
189
|
--best-results best_results.json \
|
|
220
190
|
--evolution-memory evolution_memory.json \
|
|
191
|
+
--production-seed production_seed.json \
|
|
221
192
|
--output strategy.md 2>/dev/null
|
|
222
193
|
```
|
|
223
194
|
|
|
224
|
-
The `strategy.md` file is included in the proposer `<files_to_read>` block via the shared context (Step 1.9).
|
|
195
|
+
The `strategy.md` file is included in the proposer `<files_to_read>` block via the shared context (Step 1.9). It synthesizes trace analysis, evolution memory, and production data into an actionable document. Proposers also receive `production_seed.json` directly for access to raw production traces.
|
|
225
196
|
|
|
226
197
|
### 1.9. Prepare Shared Proposer Context
|
|
227
198
|
|
|
@@ -233,6 +204,7 @@ SHARED_FILES_BLOCK="<files_to_read>
|
|
|
233
204
|
- .evolver.json
|
|
234
205
|
- strategy.md (if exists)
|
|
235
206
|
- evolution_memory.md (if exists)
|
|
207
|
+
- production_seed.json (if exists)
|
|
236
208
|
- {entry_point_file}
|
|
237
209
|
</files_to_read>"
|
|
238
210
|
|
|
@@ -311,33 +283,6 @@ APPROACH: {failure_targeted_or_efficiency}
|
|
|
311
283
|
{adaptive_briefing_e}
|
|
312
284
|
```
|
|
313
285
|
|
|
314
|
-
**Tool restrictions per strategy:**
|
|
315
|
-
|
|
316
|
-
| Strategy | Allowed Tools | Rationale |
|
|
317
|
-
|----------|--------------|-----------|
|
|
318
|
-
| Exploit (A) | Read, Edit, Bash, Glob, Grep | No Write — can't create new files, only edit existing |
|
|
319
|
-
| Explore (B) | Read, Write, Edit, Bash, Glob, Grep | Full access — may need new files for new architecture |
|
|
320
|
-
| Crossover (C) | Read, Edit, Bash, Glob, Grep | No Write — combines existing patterns, doesn't create |
|
|
321
|
-
| Failure-targeted (D, E) | Read, Edit, Bash, Glob, Grep | No Write — focused fixes on specific files |
|
|
322
|
-
|
|
323
|
-
Apply via the `tools` parameter in each Agent() call. Example for exploit:
|
|
324
|
-
```
|
|
325
|
-
Agent(
|
|
326
|
-
subagent_type: "evolver-proposer",
|
|
327
|
-
tools: ["Read", "Edit", "Bash", "Glob", "Grep"],
|
|
328
|
-
...
|
|
329
|
-
)
|
|
330
|
-
```
|
|
331
|
-
|
|
332
|
-
For explore:
|
|
333
|
-
```
|
|
334
|
-
Agent(
|
|
335
|
-
subagent_type: "evolver-proposer",
|
|
336
|
-
tools: ["Read", "Write", "Edit", "Bash", "Glob", "Grep"],
|
|
337
|
-
...
|
|
338
|
-
)
|
|
339
|
-
```
|
|
340
|
-
|
|
341
286
|
Wait for all 5 to complete.
|
|
342
287
|
|
|
343
288
|
**Stuck proposer detection**: If any proposer hasn't completed after 10 minutes, it may be stuck in a loop. The Claude Code runtime handles this via the agent's turn limit. If a proposer returns without committing changes, skip it — don't retry.
|
|
package/tools/synthesize_strategy.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
"""Synthesize evolution strategy document from trace analysis.
|
|
3
3
|
|
|
4
|
-
Reads trace_insights.json, best_results.json,
|
|
5
|
-
to produce a targeted strategy document with
|
|
6
|
-
|
|
4
|
+
Reads trace_insights.json, best_results.json, evolution_memory.json,
|
|
5
|
+
and production_seed.json to produce a targeted strategy document with
|
|
6
|
+
specific file paths and concrete change recommendations for proposers.
|
|
7
7
|
|
|
8
8
|
Usage:
|
|
9
9
|
python3 synthesize_strategy.py \
|
|
@@ -11,6 +11,7 @@ Usage:
|
|
|
11
11
|
--trace-insights trace_insights.json \
|
|
12
12
|
--best-results best_results.json \
|
|
13
13
|
--evolution-memory evolution_memory.json \
|
|
14
|
+
--production-seed production_seed.json \
|
|
14
15
|
--output strategy.md
|
|
15
16
|
"""
|
|
16
17
|
|
|
@@ -42,7 +43,7 @@ def identify_target_files(config):
|
|
|
42
43
|
return target_files
|
|
43
44
|
|
|
44
45
|
|
|
45
|
-
def synthesize(config, insights, results, memory):
|
|
46
|
+
def synthesize(config, insights, results, memory, production=None):
|
|
46
47
|
"""Produce strategy recommendations."""
|
|
47
48
|
strategy = {
|
|
48
49
|
"primary_targets": [],
|
|
@@ -94,6 +95,28 @@ def synthesize(config, insights, results, memory):
|
|
|
94
95
|
for eid, data in failing[:10]
|
|
95
96
|
]
|
|
96
97
|
|
|
98
|
+
# Production trace data
|
|
99
|
+
if production:
|
|
100
|
+
prod_data = {}
|
|
101
|
+
stats = production.get("stats", {})
|
|
102
|
+
if stats:
|
|
103
|
+
prod_data["total_traces"] = stats.get("total_traces", 0)
|
|
104
|
+
prod_data["error_rate"] = stats.get("error_rate", 0)
|
|
105
|
+
categories = production.get("categories", [])
|
|
106
|
+
if categories:
|
|
107
|
+
prod_data["traffic_distribution"] = categories[:10]
|
|
108
|
+
neg = production.get("negative_feedback_inputs", [])
|
|
109
|
+
if neg:
|
|
110
|
+
prod_data["negative_feedback"] = neg[:5]
|
|
111
|
+
errors = production.get("error_patterns", production.get("errors", []))
|
|
112
|
+
if errors:
|
|
113
|
+
prod_data["production_errors"] = errors[:5] if isinstance(errors, list) else []
|
|
114
|
+
slow = production.get("slow_queries", [])
|
|
115
|
+
if slow:
|
|
116
|
+
prod_data["slow_queries"] = slow[:5]
|
|
117
|
+
if prod_data:
|
|
118
|
+
strategy["production"] = prod_data
|
|
119
|
+
|
|
97
120
|
return strategy
|
|
98
121
|
|
|
99
122
|
|
|
@@ -142,6 +165,27 @@ def format_strategy_md(strategy, config):
|
|
|
142
165
|
lines.append(f"- `{ex['example_id']}` (score: {score:.2f}): {preview}{error}")
|
|
143
166
|
lines.append("")
|
|
144
167
|
|
|
168
|
+
prod = strategy.get("production", {})
|
|
169
|
+
if prod:
|
|
170
|
+
lines.append("## Production Insights")
|
|
171
|
+
if prod.get("total_traces"):
|
|
172
|
+
lines.append(f"- **Traces**: {prod['total_traces']} total, {prod.get('error_rate', 0):.1%} error rate")
|
|
173
|
+
if prod.get("traffic_distribution"):
|
|
174
|
+
lines.append(f"- **Traffic**: {', '.join(str(c) for c in prod['traffic_distribution'][:5])}")
|
|
175
|
+
if prod.get("negative_feedback"):
|
|
176
|
+
lines.append("- **Negative feedback inputs**:")
|
|
177
|
+
for nf in prod["negative_feedback"]:
|
|
178
|
+
lines.append(f" - {str(nf)[:120]}")
|
|
179
|
+
if prod.get("production_errors"):
|
|
180
|
+
lines.append("- **Production errors**:")
|
|
181
|
+
for pe in prod["production_errors"]:
|
|
182
|
+
lines.append(f" - {str(pe)[:120]}")
|
|
183
|
+
if prod.get("slow_queries"):
|
|
184
|
+
lines.append("- **Slow queries**:")
|
|
185
|
+
for sq in prod["slow_queries"]:
|
|
186
|
+
lines.append(f" - {str(sq)[:120]}")
|
|
187
|
+
lines.append("")
|
|
188
|
+
|
|
145
189
|
return "\n".join(lines)
|
|
146
190
|
|
|
147
191
|
|
|
@@ -151,6 +195,7 @@ def main():
|
|
|
151
195
|
parser.add_argument("--trace-insights", default="trace_insights.json")
|
|
152
196
|
parser.add_argument("--best-results", default="best_results.json")
|
|
153
197
|
parser.add_argument("--evolution-memory", default="evolution_memory.json")
|
|
198
|
+
parser.add_argument("--production-seed", default="production_seed.json")
|
|
154
199
|
parser.add_argument("--output", default="strategy.md")
|
|
155
200
|
args = parser.parse_args()
|
|
156
201
|
|
|
@@ -160,8 +205,9 @@ def main():
|
|
|
160
205
|
insights = load_json_safe(args.trace_insights)
|
|
161
206
|
results = load_json_safe(args.best_results)
|
|
162
207
|
memory = load_json_safe(args.evolution_memory)
|
|
208
|
+
production = load_json_safe(args.production_seed)
|
|
163
209
|
|
|
164
|
-
strategy = synthesize(config, insights, results, memory)
|
|
210
|
+
strategy = synthesize(config, insights, results, memory, production)
|
|
165
211
|
|
|
166
212
|
md = format_strategy_md(strategy, config)
|
|
167
213
|
with open(args.output, "w") as f:
|