harness-evolver 4.2.9 → 4.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +15 -9
- package/agents/evolver-proposer.md +7 -46
- package/package.json +1 -1
- package/skills/evolve/SKILL.md +74 -271
- package/skills/health/SKILL.md +120 -0
- package/skills/setup/SKILL.md +66 -64
- package/tools/run_eval.py +6 -1
- package/tools/seed_from_traces.py +24 -105
package/.claude-plugin/plugin.json
CHANGED
@@ -1,7 +1,7 @@
 {
   "name": "harness-evolver",
   "description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
-  "version": "4.2.9",
+  "version": "4.3.1",
   "author": {
     "name": "Raphael Valdetaro"
   },
package/README.md
CHANGED
@@ -91,8 +91,12 @@ claude
   <td>Cross-iteration memory consolidation inspired by Claude Code's autoDream. Tracks which approaches win, which failures recur, and promotes insights after 2+ occurrences.</td>
 </tr>
 <tr>
+  <td><b>Dataset Health</b></td>
+  <td>Pre-flight dataset quality check: size adequacy, difficulty distribution, dead example detection, production coverage analysis, train/held-out splits. Auto-corrects issues before evolution starts.</td>
+</tr>
+<tr>
   <td><b>Smart Gating</b></td>
-  <td>
+  <td>Claude assesses gate conditions directly — score plateau, target reached, diminishing returns. No hardcoded thresholds. State validation ensures config hasn't diverged from LangSmith.</td>
 </tr>
 <tr>
   <td><b>Background Mode</b></td>

@@ -107,6 +111,7 @@ claude
 | Command | What it does |
 |---|---|
 | `/evolver:setup` | Explore project, configure LangSmith (dataset, evaluators), run baseline |
+| `/evolver:health` | Check dataset quality (size, difficulty, coverage, splits), auto-correct issues |
 | `/evolver:evolve` | Run the optimization loop (dynamic self-organizing proposers in worktrees) |
 | `/evolver:status` | Show progress, scores, history |
 | `/evolver:deploy` | Tag, push, clean up temporary files |

@@ -132,10 +137,11 @@ claude
 /evolver:evolve
 
 +- 0.5 Validate state (skeptical memory — check .evolver.json vs LangSmith)
++- 0.6 /evolver:health — dataset quality check + auto-correct
 +- 1. Read state (.evolver.json + LangSmith experiments)
 +- 1.5 Gather trace insights (cluster errors, tokens, latency)
-+- 1.8 Analyze per-task failures
-+- 1.8a
++- 1.8 Analyze per-task failures (train split only — proposers don't see held-out)
++- 1.8a Claude generates strategy.md + lenses.json from analysis data
 +- 1.9 Prepare shared proposer context (KV cache-optimized prefix)
 +- 2. Spawn N self-organizing proposers in parallel (each in a git worktree)
 +- 3. Run target for each candidate (code-based evaluators)

@@ -144,10 +150,10 @@ claude
 +- 5. Merge winning worktree into main branch
 +- 5.5 Regression tracking (auto-add guard examples to dataset)
 +- 6. Report results
-+- 6.2
++- 6.2 Consolidator agent updates evolution memory (runs in background)
 +- 6.5 Auto-trigger Active Critic (detect + fix evaluator gaming)
 +- 7. Auto-trigger ULTRAPLAN Architect (opus model, deep analysis)
-+- 8.
++- 8. Claude assesses gate conditions (plateau, target, diminishing returns)
 ```
 
 ---

@@ -159,7 +165,8 @@ Plugin hook (SessionStart)
 └→ Creates venv, installs langsmith + langsmith-cli, exports env vars
 
 Skills (markdown)
-├── /evolver:setup → explores project, runs setup.py
+├── /evolver:setup → explores project, smart defaults, runs setup.py
+├── /evolver:health → dataset quality check + auto-correct
 ├── /evolver:evolve → orchestrates the evolution loop
 ├── /evolver:status → reads .evolver.json + LangSmith
 └── /evolver:deploy → tags and pushes

@@ -179,10 +186,8 @@ Tools (Python + langsmith SDK)
 ├── trace_insights.py → clusters errors from traces
 ├── seed_from_traces.py → imports production traces
 ├── validate_state.py → validates config vs LangSmith state
-├──
+├── dataset_health.py → dataset quality diagnostic (size, difficulty, coverage, splits)
 ├── regression_tracker.py → tracks regressions, adds guard examples
-├── consolidate.py → cross-iteration memory consolidation
-├── synthesize_strategy.py→ generates strategy document + investigation lenses
 ├── add_evaluator.py → programmatically adds evaluators
 └── adversarial_inject.py → detects memorization, injects adversarial tests
 ```

@@ -221,6 +226,7 @@ LangSmith traces **any** AI framework. The evolver works with all of them:
 - [Darwin Godel Machine](https://sakana.ai/dgm/) — Sakana AI
 - [AlphaEvolve](https://deepmind.google/blog/alphaevolve/) — DeepMind
 - [LangSmith Evaluation](https://docs.smith.langchain.com/evaluation) — LangChain
+- [Harnessing Claude's Intelligence](https://claude.com/blog/harnessing-claudes-intelligence) — Martin, Anthropic, 2026
 - [Traces Start the Agent Improvement Loop](https://www.langchain.com/conceptual-guides/traces-start-agent-improvement-loop) — LangChain
 
 ---
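The loop above is driven by the `.evolver.json` state file. A minimal sketch of reading the fields that this diff references (illustrative only; the real file may carry additional keys, and the `best_score`/`target_score` names are taken from the gate-check wording in the skills below):

```python
import json

# Illustrative read of .evolver.json; only fields referenced elsewhere in this diff are shown.
config = json.load(open(".evolver.json"))

next_version = f"v{config['iterations'] + 1:03d}"          # e.g. v004, as computed in the evolve skill
best_experiment = config.get("best_experiment")             # None before a baseline run
production_project = config.get("production_project", "")   # optional, feeds seed_from_traces.py
concurrency = config.get("eval_concurrency", 1)             # new in 4.3.1, used by run_eval.py

if config.get("best_score", 0.0) >= config.get("target_score", 1.0):
    print(f"{next_version}: target reached for dataset {config['dataset']}")
```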
package/agents/evolver-proposer.md
CHANGED
@@ -22,14 +22,7 @@ Your prompt contains `<files_to_read>`, `<context>`, and `<lens>` blocks. You MU
 
 ## Turn Budget
 
-
-- Spend early turns reading context and investigating your lens question
-- Spend middle turns implementing changes and consulting documentation
-- Reserve final turns for committing and writing proposal.md
-
-**If you're past turn 12 and haven't started implementing**, simplify your approach. A small, focused change that works is better than an ambitious change that's incomplete.
-
-**Context management**: After turn 8, avoid re-reading files you've already read. Reference your earlier analysis instead of re-running Glob/Grep searches.
+Most proposals need **10-15 turns**. Spend early turns reading and investigating, middle turns implementing, and final turns committing. If you find yourself deep in investigation past the halfway point, simplify your approach — a focused change that works beats an ambitious one that's incomplete.
 
 ## Lens Protocol
 

@@ -44,19 +37,7 @@ You are NOT constrained to the lens topic. The lens gives you a starting perspec
 
 ## Your Workflow
 
-
-
-**Orient** — Read .evolver.json, strategy.md, evolution_memory.md. Understand the framework, entry point, evaluators, current score, and what has been tried before.
-
-**Investigate** — Read trace_insights.json and best_results.json. Understand which examples fail and why. If production_seed.json exists, understand real-world usage patterns. Focus on data relevant to your lens question.
-
-**Decide** — Based on investigation, decide what to change. Consider:
-- **Prompts**: system prompts, few-shot examples, output format instructions
-- **Routing**: how queries are dispatched to different handlers
-- **Tools**: tool definitions, tool selection logic
-- **Architecture**: agent topology, chain structure, graph edges
-- **Error handling**: retry logic, fallback strategies, timeout handling
-- **Model selection**: which model for which task
+Read the available context files (.evolver.json, strategy.md, evolution_memory.md, trace_insights.json, best_results.json, production_seed.json). Investigate your lens question. Decide what to change and implement it.
 
 ## Self-Abstention
 

@@ -74,34 +55,14 @@ To abstain, skip implementation and write only a `proposal.md`:
 
 Then end with the return protocol using `ABSTAIN` as your approach.
 
-
-
-**Before writing ANY code**, you MUST consult Context7 for every library you'll be modifying or using. This is NOT optional.
-
-**Step 1 — Identify libraries from the code you read:**
-Read the imports in the files you're about to modify. For each framework/library (LangGraph, OpenAI, Anthropic, CrewAI, etc.):
-
-**Step 2 — Resolve library ID:**
-```
-resolve-library-id(libraryName: "langgraph", query: "what you're trying to do")
-```
-This returns up to 10 matches. Pick the one with the highest relevance.
-
-**Step 3 — Query docs for your specific task:**
-```
-get-library-docs(libraryId: "/langchain-ai/langgraph", query: "conditional edges StateGraph", topic: "routing")
-```
-Ask about the SPECIFIC API you're going to use or change.
+## Consult Documentation
 
-
-- About to modify a StateGraph? → `query: "StateGraph add_conditional_edges"`
-- Changing prompt template? → `query: "ChatPromptTemplate from_messages"` for langchain
-- Adding a tool? → `query: "StructuredTool create tool definition"` for langchain
-- Changing model? → `query: "ChatOpenAI model parameters temperature"` for openai
+Before modifying library APIs (LangGraph, OpenAI, Anthropic, etc.), consult Context7 to verify you're using current patterns:
 
-
+1. `resolve-library-id(libraryName: "langgraph")`
+2. `get-library-docs(libraryId: "/langchain-ai/langgraph", query: "your specific API question")`
 
-
+If Context7 MCP is not available, note in proposal.md that API patterns were not verified.
 
 ### Commit and Document
 
package/package.json
CHANGED
package/skills/evolve/SKILL.md
CHANGED
@@ -131,119 +131,7 @@ If critical issues found, ask user whether to continue or fix first via AskUserQ
 
 ### 0.6. Dataset Health Check
 
-
-
-```bash
-$EVOLVER_PY $TOOLS/dataset_health.py \
-  --config .evolver.json \
-  --production-seed production_seed.json \
-  --output health_report.json 2>/dev/null
-```
-
-Read `health_report.json`. Print summary:
-```bash
-python3 -c "
-import json, os
-if os.path.exists('health_report.json'):
-    r = json.load(open('health_report.json'))
-    print(f'Dataset Health: {r[\"health_score\"]}/10 ({r[\"example_count\"]} examples)')
-    for issue in r.get('issues', []):
-        print(f' [{issue[\"severity\"]}] {issue[\"message\"]}')
-"
-```
-
-### 0.7. Auto-Correct Dataset Issues
-
-If `health_report.json` has corrections, apply them automatically:
-
-```bash
-CORRECTIONS=$(python3 -c "
-import json, os
-if os.path.exists('health_report.json'):
-    r = json.load(open('health_report.json'))
-    for c in r.get('corrections', []):
-        print(c['action'])
-" 2>/dev/null)
-```
-
-For each correction:
-
-**If `create_splits`**: Run inline Python to assign 70/30 splits:
-```bash
-$EVOLVER_PY -c "
-from langsmith import Client
-import json, random
-client = Client()
-config = json.load(open('.evolver.json'))
-examples = list(client.list_examples(dataset_name=config['dataset']))
-random.shuffle(examples)
-sp = int(len(examples) * 0.7)
-for ex in examples[:sp]:
-    client.update_example(ex.id, split='train')
-for ex in examples[sp:]:
-    client.update_example(ex.id, split='held_out')
-print(f'Assigned splits: {sp} train, {len(examples)-sp} held_out')
-"
-```
-
-**If `generate_hard`**: Spawn testgen agent with hard-mode instruction:
-```
-Agent(
-  subagent_type: "evolver-testgen",
-  description: "Generate hard examples to rebalance dataset",
-  prompt: |
-    <objective>
-    The dataset is skewed toward easy examples. Generate {count} HARD examples
-    that the current agent is likely to fail on.
-    Focus on: edge cases, adversarial inputs, complex multi-step queries,
-    ambiguous questions, and inputs that require deep reasoning.
-    </objective>
-    <files_to_read>
-    - .evolver.json
-    - strategy.md (if exists)
-    - production_seed.json (if exists)
-    </files_to_read>
-)
-```
-
-**If `fill_coverage`**: Spawn testgen agent with coverage-fill instruction:
-```
-Agent(
-  subagent_type: "evolver-testgen",
-  description: "Generate examples for missing categories",
-  prompt: |
-    <objective>
-    The dataset is missing these production categories: {categories}.
-    Generate 5 examples per missing category.
-    Use production_seed.json for real-world patterns in these categories.
-    </objective>
-    <files_to_read>
-    - .evolver.json
-    - production_seed.json (if exists)
-    </files_to_read>
-)
-```
-
-**If `retire_dead`**: Move dead examples to retired split:
-```bash
-$EVOLVER_PY -c "
-from langsmith import Client
-import json
-client = Client()
-report = json.load(open('health_report.json'))
-dead_ids = report.get('dead_examples', {}).get('ids', [])
-config = json.load(open('.evolver.json'))
-examples = {str(e.id): e for e in client.list_examples(dataset_name=config['dataset'])}
-retired = 0
-for eid in dead_ids:
-    if eid in examples:
-        client.update_example(examples[eid].id, split='retired')
-        retired += 1
-print(f'Retired {retired} dead examples')
-"
-```
-
-After corrections, log what was done. Do NOT re-run health check (corrections may need an experiment cycle to show effect).
+Invoke `/evolver:health` to check and auto-correct dataset issues. If health_report.json shows critical issues that couldn't be auto-corrected, ask user whether to proceed via AskUserQuestion.
 
 ### 0.8. Resolve Project Directory
 

@@ -268,66 +156,75 @@ For each iteration:
 python3 -c "import json; c=json.load(open('.evolver.json')); print(f'v{c[\"iterations\"]+1:03d}')"
 ```
 
-### 1.5. Gather
+### 1.5. Gather Analysis Data (Parallel)
 
-Read the best experiment from config. If null (no baseline was run), skip
+Read the best experiment from config. If null (no baseline was run), skip data gathering — proposers will work from code analysis only:
 
 ```bash
 BEST=$(python3 -c "import json; b=json.load(open('.evolver.json')).get('best_experiment'); print(b if b else '')")
+PROD=$(python3 -c "import json; c=json.load(open('.evolver.json')); print(c.get('production_project',''))")
+
 if [ -n "$BEST" ]; then
+  # Run all data gathering in parallel — these are independent API calls
   $EVOLVER_PY $TOOLS/trace_insights.py \
     --from-experiment "$BEST" \
-    --output trace_insights.json 2>/dev/null
-fi
-```
+    --output trace_insights.json 2>/dev/null &
 
-
+  $EVOLVER_PY $TOOLS/read_results.py \
+    --experiment "$BEST" \
+    --config .evolver.json \
+    --split train \
+    --output best_results.json 2>/dev/null &
+fi
 
-```bash
-PROD=$(python3 -c "import json; c=json.load(open('.evolver.json')); print(c.get('production_project',''))")
 if [ -n "$PROD" ] && [ ! -f "production_seed.json" ]; then
   $EVOLVER_PY $TOOLS/seed_from_traces.py \
-    --project "$PROD"
+    --project "$PROD" \
     --output-md production_seed.md \
     --output-json production_seed.json \
-    --limit 100 2>/dev/null
+    --limit 100 2>/dev/null &
 fi
-```
 
-
-
-If `$BEST` is set (not the first iteration without baseline), read results and cluster failures:
-
-```bash
-if [ -n "$BEST" ]; then
-  $EVOLVER_PY $TOOLS/read_results.py \
-    --experiment "$BEST" \
-    --config .evolver.json \
-    --split train \
-    --output best_results.json 2>/dev/null
-fi
+wait  # Wait for all data gathering to complete
 ```
 
 If `best_results.json` exists, parse it to find failing examples (score < 0.7). Group by metadata or error pattern.
-This failure data feeds into
+This failure data feeds into the strategy and lens generation step (1.8a).
 If no best_results.json (first iteration without baseline), all proposers work from code analysis only — no failure data available.
 
-### 1.8a.
+### 1.8a. Generate Strategy and Lenses
 
-
+Read the available analysis files:
+- `trace_insights.json` (error clusters, token analysis)
+- `best_results.json` (per-task scores and failures)
+- `evolution_memory.json` / `evolution_memory.md` (cross-iteration insights)
+- `production_seed.json` (real-world traffic patterns, if exists)
 
-
-
-
-
-
-
-
-
-
+Based on this data, generate two files:
+
+**`strategy.md`** — A concise strategy document with: target files, failure clusters (prioritized), recommended approaches (from evolution memory), approaches to avoid, top failing examples, and production insights.
+
+**`lenses.json`** — Investigation questions for proposers, format:
+```json
+{
+  "generated_at": "ISO timestamp",
+  "lens_count": N,
+  "lenses": [
+    {"id": 1, "question": "...", "source": "failure_cluster|architecture|production|evolution_memory|uniform_failure|open", "severity": "critical|high|medium", "context": {}},
+    ...
+  ]
+}
 ```
 
-
+Lens generation rules:
+- One lens per distinct failure cluster (max 3)
+- One architecture lens if high-severity structural issues exist
+- One production lens if production data shows problems
+- One evolution memory lens if a pattern won 2+ times
+- One persistent failure lens if a pattern recurred 3+ iterations
+- If all examples fail with same error, one "uniform_failure" lens
+- Always include one "open" lens
+- Sort by severity (critical > high > medium), cap at max_proposers from config (default 5)
 
 ### 1.9. Prepare Shared Proposer Context
 
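A minimal sketch of how the lens rules above could be applied mechanically (illustrative only: the skill has Claude write lenses.json directly, and the failure-cluster field names used here are assumptions, not part of the package):

```python
import json
from datetime import datetime, timezone

def build_lenses(failure_clusters, max_proposers=5):
    """Illustrative only: turn failure clusters into a lenses.json document."""
    severity_rank = {"critical": 0, "high": 1, "medium": 2}
    lenses = []
    # One lens per distinct failure cluster, capped at 3
    for cluster in failure_clusters[:3]:
        lenses.append({
            "question": f"Why do examples in cluster '{cluster['name']}' fail?",
            "source": "failure_cluster",
            "severity": cluster.get("severity", "medium"),
            "context": {"examples": cluster.get("example_ids", [])},
        })
    # Always include one open-ended lens
    lenses.append({"question": "What single change would most improve the score?",
                   "source": "open", "severity": "medium", "context": {}})
    # Sort by severity and cap at max_proposers from config
    lenses.sort(key=lambda l: severity_rank.get(l["severity"], 3))
    lenses = lenses[:max_proposers]
    for i, lens in enumerate(lenses, 1):
        lens["id"] = i
    return {"generated_at": datetime.now(timezone.utc).isoformat(),
            "lens_count": len(lenses), "lenses": lenses}

print(json.dumps(build_lenses([{"name": "timeout", "severity": "critical"}]), indent=2))
```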
@@ -433,20 +330,23 @@ done
 
 Only run evaluation (Step 3) for proposers that committed changes (not abstained, not stuck).
 
-### 3. Run Target for Each Candidate
+### 3. Run Target for Each Candidate (Parallel)
 
-
+Run evaluations for ALL candidates simultaneously — they're independent:
 
 ```bash
-#
-
-
-
-
-
-
-
-
+# Launch all evaluations in parallel
+for WORKTREE in {worktree_paths_with_commits}; do
+  WORKTREE_PROJECT="$WORKTREE"
+  [ -n "$PROJECT_DIR" ] && WORKTREE_PROJECT="$WORKTREE/$PROJECT_DIR"
+
+  $EVOLVER_PY $TOOLS/run_eval.py \
+    --config "$WORKTREE_PROJECT/.evolver.json" \
+    --worktree-path "$WORKTREE_PROJECT" \
+    --experiment-prefix v{NNN}-{lens_id} \
+    --timeout 120 &
+done
+wait  # Wait for all evaluations to complete
 ```
 
 Each candidate becomes a separate LangSmith experiment. This step runs the agent and applies code-based evaluators (has_output, token_efficiency) only.

@@ -473,27 +373,7 @@ Then spawn ONE evaluator agent that scores ALL candidates in a single pass. This
 Agent(
   subagent_type: "evolver-evaluator",
   description: "Evaluate all candidates for iteration v{NNN}",
-  prompt:
-    <experiment>
-    Evaluate the following experiments (one per candidate):
-    {list all experiment names from proposers that committed changes — skip abstained}
-    </experiment>
-
-    <evaluators>
-    Apply these evaluators to each run in each experiment:
-    - {llm_evaluator_list, e.g. "correctness", "conciseness"}
-    </evaluators>
-
-    <context>
-    Agent type: {framework} agent
-    Domain: {description from .evolver.json or entry point context}
-    Entry point: {entry_point}
-
-    For each experiment:
-    1. Read all runs via: langsmith-cli --json runs list --project "{experiment_name}" --fields id,inputs,outputs,error --is-root true --limit 200
-    2. Judge each run's output against the input
-    3. Write scores via: langsmith-cli --json feedback create {run_id} --key {evaluator} --score {0.0|1.0} --comment "{reason}" --source model
-    </context>
+  prompt: "Experiments to evaluate: {comma-separated experiment names from non-abstained proposers}. Evaluators: {llm_evaluator_list}. Framework: {framework}. Entry point: {entry_point}."
 )
 ```
 

@@ -592,45 +472,18 @@ Print: `Iteration {i}/{N}: v{NNN} scored {score} (best: {best} at {best_score})`
 
 ### 6.2. Consolidate Evolution Memory
 
-Spawn the consolidator agent
+Spawn the consolidator agent (runs in background — doesn't block the next iteration):
 
 ```
 Agent(
   subagent_type: "evolver-consolidator",
   description: "Consolidate evolution memory after iteration v{NNN}",
   run_in_background: true,
-  prompt:
-    <objective>
-    Consolidate learnings from iteration v{NNN}.
-    Run the consolidation tool and review its output.
-    </objective>
-
-    <tools_path>
-    TOOLS={tools_path}
-    EVOLVER_PY={evolver_py_path}
-    </tools_path>
-
-    <instructions>
-    Run: $EVOLVER_PY $TOOLS/consolidate.py \
-      --config .evolver.json \
-      --comparison-files comparison.json \
-      --output evolution_memory.md \
-      --output-json evolution_memory.json
-
-    Then read the output and verify insights are accurate.
-    </instructions>
-
-    <files_to_read>
-    - .evolver.json
-    - comparison.json
-    - trace_insights.json (if exists)
-    - regression_report.json (if exists)
-    - evolution_memory.md (if exists)
-    </files_to_read>
+  prompt: "Update evolution_memory.md with learnings from this iteration. Read .evolver.json, comparison.json, trace_insights.json, regression_report.json (if exists), and current evolution_memory.md (if exists). Track what worked, what failed, and promote insights that recur across iterations."
 )
 ```
 
-The `evolution_memory.md` file will be
+The `evolution_memory.md` file will be available for proposer briefings in subsequent iterations.
 
 ### 6.5. Auto-trigger Active Critic
 

@@ -639,25 +492,8 @@ If score jumped >0.3 from previous iteration OR reached target in <3 iterations:
 ```
 Agent(
   subagent_type: "evolver-critic",
-  description: "
-  prompt:
-    <objective>
-    EVAL GAMING CHECK: Score jumped from {prev_score} to {score}.
-    Check if the LangSmith evaluators are being gamed.
-    If gaming detected, add stricter evaluators using $TOOLS/add_evaluator.py.
-    </objective>
-
-    <tools_path>
-    TOOLS={tools_path}
-    EVOLVER_PY={evolver_py_path}
-    </tools_path>
-
-    <files_to_read>
-    - .evolver.json
-    - comparison.json
-    - trace_insights.json
-    - evolution_memory.md (if exists)
-    </files_to_read>
+  description: "Check evaluator gaming after score jump",
+  prompt: "Score jumped from {prev_score} to {score}. Check if LangSmith evaluators are being gamed. Read .evolver.json, comparison.json, trace_insights.json, evolution_memory.md. If gaming detected, add stricter evaluators using $EVOLVER_PY $TOOLS/add_evaluator.py."
 )
 ```
 

@@ -674,55 +510,22 @@ If 3 consecutive iterations within 1% OR score dropped:
 Agent(
   subagent_type: "evolver-architect",
   model: "opus",
-  description: "
-  prompt:
-    <objective>
-    The evolution loop has stagnated after {iterations} iterations.
-    Scores: {last_3_scores}.
-    Perform deep architectural analysis and recommend structural changes.
-    Use extended thinking — you have more compute budget than normal agents.
-    </objective>
-
-    <tools_path>
-    TOOLS={tools_path}
-    EVOLVER_PY={evolver_py_path}
-    </tools_path>
-
-    <files_to_read>
-    - .evolver.json
-    - trace_insights.json
-    - evolution_memory.md (if exists)
-    - evolution_memory.json (if exists)
-    - strategy.md (if exists)
-    - {entry point and all related source files}
-    </files_to_read>
+  description: "Deep topology analysis after stagnation",
+  prompt: "Evolution stagnated after {iterations} iterations. Scores: {last_3_scores}. Analyze architecture and recommend structural changes. Read .evolver.json, trace_insights.json, evolution_memory.md, strategy.md, and the entry point source files. Use $EVOLVER_PY $TOOLS/analyze_architecture.py for AST analysis if helpful."
 )
 ```
 
 After architect completes, include `architecture.md` in proposer `<files_to_read>` for next iteration.
 
-### 8. Gate Check
-
-Before starting the next iteration, run the gate check:
+### 8. Gate Check
 
-
-GATE_RESULT=$($EVOLVER_PY $TOOLS/iteration_gate.py --config .evolver.json 2>/dev/null)
-PROCEED=$(echo "$GATE_RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('proceed', True))")
-```
-
-If `PROCEED` is `False`, check suggestions:
-
-```bash
-SUGGEST=$(echo "$GATE_RESULT" | python3 -c "import sys,json; s=json.load(sys.stdin).get('suggestions',[]); print(s[0] if s else '')")
-```
+Read `.evolver.json` history and assess whether to continue:
 
-- If
--
--
+- **Score plateau**: If last 3 scores are within 2% of each other, evolution may have converged. Consider triggering architect (Step 7) or stopping.
+- **Target reached**: If `best_score >= target_score`, stop and report success.
+- **Diminishing returns**: If average improvement over last 5 iterations is less than 0.5%, consider stopping.
 
-
-- **Target**: `score >= target_score` → stop
-- **N reached**: all requested iterations done → stop
+If stopping, skip to the final report. If continuing, proceed to next iteration.
 
 ## When Loop Ends — Final Report
 
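A minimal sketch of the gate conditions from step 8 as code (illustrative only; the skill has Claude assess these directly from `.evolver.json`, and the `score_history` key used here is an assumption rather than a documented field):

```python
import json

def assess_gate(config_path=".evolver.json"):
    """Illustrative gate check: plateau, target reached, diminishing returns."""
    config = json.load(open(config_path))
    scores = config.get("score_history", [])          # assumed key, not guaranteed
    best = max(scores) if scores else 0.0
    target = config.get("target_score", 1.0)

    if best >= target:
        return "stop: target reached"
    if len(scores) >= 3 and max(scores[-3:]) - min(scores[-3:]) <= 0.02:
        return "plateau: consider architect or stopping"
    if len(scores) >= 6 and (scores[-1] - scores[-6]) / 5 < 0.005:
        return "stop: diminishing returns"
    return "continue"
```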
package/skills/health/SKILL.md
ADDED
@@ -0,0 +1,120 @@
+---
+name: evolver:health
+description: "Use when the user wants to check dataset quality, diagnose eval issues, or before running evolve. Checks size, difficulty distribution, dead examples, coverage, and splits. Auto-corrects issues found."
+allowed-tools: [Read, Write, Edit, Bash, Glob, Grep, Agent, AskUserQuestion]
+---
+
+# /evolver:health
+
+Check eval dataset quality and auto-correct issues. Can be run independently or is invoked by `/evolver:evolve` before the iteration loop.
+
+## Prerequisites
+
+`.evolver.json` must exist. If not, tell user to run `/evolver:setup`.
+
+## Resolve Tool Path and Python
+
+```bash
+TOOLS="${EVOLVER_TOOLS:-$([ -d ".evolver/tools" ] && echo ".evolver/tools" || echo "$HOME/.evolver/tools")}"
+EVOLVER_PY="${EVOLVER_PY:-$([ -f "$HOME/.evolver/venv/bin/python" ] && echo "$HOME/.evolver/venv/bin/python" || echo "python3")}"
+```
+
+## 1. Run Health Diagnostic
+
+```bash
+$EVOLVER_PY $TOOLS/dataset_health.py \
+  --config .evolver.json \
+  --production-seed production_seed.json \
+  --output health_report.json 2>/dev/null
+```
+
+Print summary:
+```bash
+python3 -c "
+import json, os
+if os.path.exists('health_report.json'):
+    r = json.load(open('health_report.json'))
+    print(f'Dataset Health: {r[\"health_score\"]}/10 ({r[\"example_count\"]} examples)')
+    for issue in r.get('issues', []):
+        print(f' [{issue[\"severity\"]}] {issue[\"message\"]}')
+    if not r.get('issues'):
+        print(' No issues found.')
+"
+```
+
+## 2. Auto-Correct Issues
+
+If `health_report.json` has corrections, apply them automatically:
+
+```bash
+CORRECTIONS=$(python3 -c "
+import json, os
+if os.path.exists('health_report.json'):
+    r = json.load(open('health_report.json'))
+    for c in r.get('corrections', []):
+        print(c['action'])
+" 2>/dev/null)
+```
+
+For each correction:
+
+**If `create_splits`**: Assign 70/30 train/held_out splits:
+```bash
+$EVOLVER_PY -c "
+from langsmith import Client
+import json, random
+client = Client()
+config = json.load(open('.evolver.json'))
+examples = list(client.list_examples(dataset_name=config['dataset']))
+random.shuffle(examples)
+sp = int(len(examples) * 0.7)
+for ex in examples[:sp]:
+    client.update_example(ex.id, split='train')
+for ex in examples[sp:]:
+    client.update_example(ex.id, split='held_out')
+print(f'Assigned splits: {sp} train, {len(examples)-sp} held_out')
+"
+```
+
+**If `generate_hard`**: Spawn testgen agent to generate hard examples:
+```
+Agent(
+  subagent_type: "evolver-testgen",
+  description: "Generate hard examples to rebalance dataset",
+  prompt: "The dataset is skewed toward easy examples. Generate {count} HARD examples that the current agent is likely to fail on. Focus on edge cases, adversarial inputs, and complex multi-step queries. Read .evolver.json and production_seed.json for context."
+)
+```
+
+**If `fill_coverage`**: Spawn testgen agent for missing categories:
+```
+Agent(
+  subagent_type: "evolver-testgen",
+  description: "Generate examples for missing categories",
+  prompt: "The dataset is missing these production categories: {categories}. Generate 5 examples per missing category. Read .evolver.json and production_seed.json for context."
+)
+```
+
+**If `retire_dead`**: Move dead examples to retired split:
+```bash
+$EVOLVER_PY -c "
+from langsmith import Client
+import json
+client = Client()
+report = json.load(open('health_report.json'))
+dead_ids = report.get('dead_examples', {}).get('ids', [])
+config = json.load(open('.evolver.json'))
+examples = {str(e.id): e for e in client.list_examples(dataset_name=config['dataset'])}
+retired = 0
+for eid in dead_ids:
+    if eid in examples:
+        client.update_example(examples[eid].id, split='retired')
+        retired += 1
+print(f'Retired {retired} dead examples')
+"
+```
+
+After corrections, log what was done.
+
+## 3. Report
+
+Print final health status. If critical issues remain that couldn't be auto-corrected, warn the user.
package/skills/setup/SKILL.md
CHANGED
@@ -86,82 +86,84 @@ The runner writes `{"input": "user question..."}` to a temp `.json` file and rep
 
 If no placeholder and no `--input` flag detected, the runner appends `--input <path> --output <path>`.
 
-## Phase 2: Confirm
+## Phase 2: Confirm Configuration (interactive)
 
-
-
-```json
-{
-  "questions": [{
-    "question": "Here's what I detected. Does this look right?\n\nEntry point: {path}\nFramework: {framework}\nRun command: {command}\nLangSmith: {status}",
-    "header": "Confirm",
-    "multiSelect": false,
-    "options": [
-      {"label": "Looks good, proceed", "description": "Continue with detected configuration"},
-      {"label": "Let me adjust", "description": "I'll provide correct paths and commands"},
-      {"label": "Wrong directory", "description": "I need to cd somewhere else first"}
-    ]
-  }]
-}
-```
-
-## Phase 3: What to Optimize (interactive)
+Present all detected configuration in one view with smart defaults and ask for confirmation.
 
 Use AskUserQuestion:
 
 ```json
 {
   "questions": [{
-    "question": "
-    "header": "
-    "multiSelect": true,
-    "options": [
-      {"label": "Accuracy", "description": "Correctness of outputs — LLM-as-judge evaluator"},
-      {"label": "Latency", "description": "Response time — track and minimize"},
-      {"label": "Token efficiency", "description": "Fewer tokens for same quality"},
-      {"label": "Error handling", "description": "Reduce failures, timeouts, crashes"}
-    ]
-  }]
-}
-```
-
-Map selections to evaluator configuration for setup.py.
-
-## Phase 4: Test Data Source (interactive)
-
-Use AskUserQuestion with **preview**:
-
-```json
-{
-  "questions": [{
-    "question": "Where should test inputs come from?",
-    "header": "Test data",
+    "question": "Here's the configuration for your project:\n\n**Entry point**: {command}\n**Framework**: {framework}\n**Python**: {venv_path or 'system python3'}\n**Optimization goals**: accuracy (correctness evaluator)\n**Test data**: generate 30 examples with AI\n\nDoes this look good?",
+    "header": "Setup Configuration",
     "multiSelect": false,
     "options": [
-      {
-
-
-
-      },
-      {
-        "label": "Generate from code",
-        "description": "AI generates test inputs by analyzing your code",
-        "preview": "## Generate from Code\n\nThe testgen agent reads your source code and generates\n30 diverse test inputs:\n- 40% standard cases\n- 20% edge cases\n- 20% cross-domain\n- 20% adversarial\n\nOutputs are scored by LLM-as-judge."
-      },
-      {
-        "label": "I have test data",
-        "description": "Point to an existing file with test inputs",
-        "preview": "## Provide Test Data\n\nSupported formats:\n- JSON array of inputs\n- JSON with {\"inputs\": {...}} objects\n- CSV with input columns\n\nExample:\n```json\n[\n  {\"input\": \"What is Python?\"},\n  {\"input\": \"Explain quantum computing\"}\n]\n```"
-      }
+      {"label": "Looks good, proceed", "description": "Use these settings and start setup"},
+      {"label": "Customize goals", "description": "Choose different optimization goals"},
+      {"label": "I have test data", "description": "Use existing JSON file or LangSmith project"},
+      {"label": "Let me adjust everything", "description": "Change entry point, framework, goals, and data source"}
     ]
   }]
 }
 ```
 
-If "
-
-
-
+**If "Looks good, proceed"**: Use defaults — goals=accuracy, data=generate 30 with testgen. Skip straight to Phase 3.
+
+**If "Customize goals"**: Ask the goals question, then proceed to Phase 3 with testgen as default data source.
+
+Use AskUserQuestion:
+
+```json
+{
+  "questions": [{
+    "question": "What do you want to optimize?",
+    "header": "Goals",
+    "multiSelect": true,
+    "options": [
+      {"label": "Accuracy", "description": "Correctness of outputs — LLM-as-judge evaluator"},
+      {"label": "Latency", "description": "Response time — track and minimize"},
+      {"label": "Token efficiency", "description": "Fewer tokens for same quality"},
+      {"label": "Error handling", "description": "Reduce failures, timeouts, crashes"}
+    ]
+  }]
+}
+```
+
+Map selections to evaluator configuration for setup.py.
+
+**If "I have test data"**: Ask the data source question, then proceed to Phase 3 with accuracy as default goal.
+
+Use AskUserQuestion with **preview**:
+
+```json
+{
+  "questions": [{
+    "question": "Where should test inputs come from?",
+    "header": "Test data",
+    "multiSelect": false,
+    "options": [
+      {
+        "label": "Import from LangSmith",
+        "description": "Use real production traces as test inputs",
+        "preview": "## Import from LangSmith\n\nFetches up to 100 recent traces from your production project.\nPrioritizes traces with negative feedback.\nCreates a LangSmith Dataset with real user inputs.\n\nRequires: an existing LangSmith project with traces."
+      },
+      {
+        "label": "I have a file",
+        "description": "Point to an existing file with test inputs",
+        "preview": "## Provide Test Data\n\nSupported formats:\n- JSON array of inputs\n- JSON with {\"inputs\": {...}} objects\n- CSV with input columns\n\nExample:\n```json\n[\n  {\"input\": \"What is Python?\"},\n  {\"input\": \"Explain quantum computing\"}\n]\n```"
+      }
+    ]
+  }]
+}
+```
+
+If "Import from LangSmith": discover projects and ask which one (same as v2 Phase 1.9).
+If "I have a file": ask for file path.
+
+**If "Let me adjust everything"**: Ask all three original questions in sequence — confirm detection (entry point, framework, run command), then goals, then data source — using the question formats above.
+
+## Phase 3: Run Setup
 
 Build the setup.py command based on all gathered information:
 

@@ -178,7 +180,7 @@ $EVOLVER_PY $TOOLS/setup.py \
 
 If "Generate from code" was selected AND no test data file exists, first spawn the testgen agent to generate inputs, then pass the generated file to setup.py.
 
-## Phase
+## Phase 4: Generate Test Data (if needed)
 
 If testgen is needed, spawn it:
 

@@ -205,7 +207,7 @@ Agent(
 
 Then pass `--dataset-from-file test_inputs.json` to setup.py.
 
-## Phase
+## Phase 5: Report
 
 ```
 Setup complete!
package/tools/run_eval.py
CHANGED
@@ -166,11 +166,14 @@ def main():
     parser.add_argument("--worktree-path", required=True, help="Path to the candidate's worktree")
     parser.add_argument("--experiment-prefix", required=True, help="Experiment name prefix (e.g. v001a)")
     parser.add_argument("--timeout", type=int, default=120, help="Per-task timeout in seconds")
+    parser.add_argument("--concurrency", type=int, default=None, help="Max concurrent evaluations (default: from config or 1)")
     args = parser.parse_args()
 
     with open(args.config) as f:
         config = json.load(f)
 
+    concurrency = args.concurrency or config.get("eval_concurrency", 1)
+
     os.environ["EVAL_TASK_TIMEOUT"] = str(args.timeout)
     ensure_langsmith_api_key()
 

@@ -188,6 +191,8 @@ def main():
     print(f" Dataset: {config['dataset']}")
     print(f" Worktree: {args.worktree_path}")
    print(f" Code evaluators: {['has_output'] + code_evaluators}")
+    if concurrency > 1:
+        print(f" Concurrency: {concurrency} parallel evaluations")
     if llm_evaluators:
         print(f" Pending LLM evaluators (agent): {llm_evaluators}")
 

@@ -197,7 +202,7 @@
         data=config["dataset"],
         evaluators=evaluators,
         experiment_prefix=args.experiment_prefix,
-        max_concurrency=
+        max_concurrency=concurrency,
     )
 
     experiment_name = results.experiment_name
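For context, the call receiving `max_concurrency` appears to be LangSmith's `evaluate()` entry point (its name is not visible in the hunk above). A minimal, self-contained sketch of how the new concurrency value feeds it, with placeholder target and dataset names (the real script builds these from the worktree and `.evolver.json`):

```python
from langsmith import evaluate

def run_candidate(inputs: dict) -> dict:
    # Placeholder target; run_eval.py invokes the worktree's entry point here.
    return {"output": "..."}

concurrency = 4  # from --concurrency, or config["eval_concurrency"], defaulting to 1

results = evaluate(
    run_candidate,
    data="my-dataset",                 # config["dataset"] in run_eval.py
    evaluators=[],                     # code-based evaluators (has_output, token_efficiency)
    experiment_prefix="v001-1",
    max_concurrency=concurrency,       # the value added in 4.3.1
)
print(results.experiment_name)
```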
package/tools/seed_from_traces.py
CHANGED
@@ -1,8 +1,7 @@
 #!/usr/bin/env python3
 """Fetch and summarize production LangSmith traces for Harness Evolver.
 
-
-production traces and produce:
+Uses the LangSmith Python SDK to fetch production traces and produce:
 1. A markdown seed file for the testgen agent (production_seed.md)
 2. A JSON summary for programmatic use (production_seed.json)
 

@@ -11,85 +10,18 @@ Usage:
     --project ceppem-langgraph \
     --output-md production_seed.md \
     --output-json production_seed.json \
-    [--api-key-env LANGSMITH_API_KEY] \
     [--limit 100]
 
-
+Requires: pip install langsmith
 """
 
 import argparse
 import json
 import os
 import sys
-import urllib.parse
-import urllib.request
 from collections import Counter
 from datetime import datetime, timezone
 
-LANGSMITH_API_BASE = "https://api.smith.langchain.com/api/v1"
-
-
-def langsmith_request(endpoint, api_key, method="GET", body=None, params=None):
-    """Make a request to the LangSmith REST API."""
-    url = f"{LANGSMITH_API_BASE}/{endpoint}"
-    if params:
-        url += "?" + urllib.parse.urlencode(params)
-
-    headers = {
-        "x-api-key": api_key,
-        "Accept": "application/json",
-    }
-
-    data = None
-    if body is not None:
-        headers["Content-Type"] = "application/json"
-        data = json.dumps(body).encode("utf-8")
-
-    req = urllib.request.Request(url, data=data, headers=headers, method=method)
-    try:
-        with urllib.request.urlopen(req, timeout=30) as resp:
-            return json.loads(resp.read())
-    except urllib.error.HTTPError as e:
-        body_text = ""
-        try:
-            body_text = e.read().decode("utf-8", errors="replace")[:500]
-        except Exception:
-            pass
-        print(f"LangSmith API error {e.code}: {body_text}", file=sys.stderr)
-        return None
-    except Exception as e:
-        print(f"LangSmith API request failed: {e}", file=sys.stderr)
-        return None
-
-
-def fetch_runs(project_name, api_key, limit=100):
-    """Fetch recent root runs from a LangSmith project."""
-    # Try POST /runs/query first (newer API)
-    body = {
-        "project_name": project_name,
-        "is_root": True,
-        "limit": limit,
-    }
-    result = langsmith_request("runs/query", api_key, method="POST", body=body)
-    if result and isinstance(result, dict):
-        return result.get("runs", result.get("results", []))
-    if result and isinstance(result, list):
-        return result
-
-    # Fallback: GET /runs with query params
-    params = {
-        "project_name": project_name,
-        "is_root": "true",
-        "limit": str(limit),
-    }
-    result = langsmith_request("runs", api_key, params=params)
-    if result and isinstance(result, list):
-        return result
-    if result and isinstance(result, dict):
-        return result.get("runs", result.get("results", []))
-
-    return []
-
 
 def extract_input(run):
     """Extract user input from a run's inputs field."""

@@ -396,48 +328,35 @@ def generate_json_summary(analysis, project_name):
 def main():
     parser = argparse.ArgumentParser(description="Fetch and summarize production LangSmith traces")
     parser.add_argument("--project", required=True, help="LangSmith project name")
-    parser.add_argument("--api-key-env", default="LANGSMITH_API_KEY",
-                        help="Env var containing API key (default: LANGSMITH_API_KEY)")
     parser.add_argument("--limit", type=int, default=100, help="Max traces to fetch (default: 100)")
     parser.add_argument("--output-md", required=True, help="Output path for markdown seed")
     parser.add_argument("--output-json", required=True, help="Output path for JSON summary")
-
-
+    # Kept for backwards compatibility — silently ignored (SDK is now the only mode)
+    parser.add_argument("--use-sdk", action="store_true", help=argparse.SUPPRESS)
     args = parser.parse_args()
 
     print(f"Fetching up to {args.limit} traces from LangSmith project '{args.project}'...")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            }
-            runs.append(run_dict)
-    except ImportError:
-        print("langsmith package not installed. Use --use-sdk with pip install langsmith", file=sys.stderr)
-        sys.exit(1)
-    else:
-        api_key = os.environ.get(args.api_key_env, "")
-        if not api_key:
-            print(f"No API key found in ${args.api_key_env} — cannot fetch production traces", file=sys.stderr)
-            sys.exit(1)
-        runs = fetch_runs(args.project, api_key, args.limit)
+    from langsmith import Client
+    client = Client()
+    raw_runs = list(client.list_runs(
+        project_name=args.project, is_root=True, limit=args.limit,
+    ))
+    # Convert SDK run objects to dicts matching our analysis format
+    runs = []
+    for r in raw_runs:
+        run_dict = {
+            "id": str(r.id),
+            "name": r.name,
+            "inputs": r.inputs,
+            "outputs": r.outputs,
+            "error": r.error,
+            "total_tokens": r.total_tokens,
+            "feedback_stats": None,
+            "start_time": r.start_time.isoformat() if r.start_time else None,
+            "end_time": r.end_time.isoformat() if r.end_time else None,
+        }
+        runs.append(run_dict)
 
     if not runs:
         print("No traces found. The project may be empty or the name may be wrong.")