harness-evolver 4.3.0 → 4.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +15 -9
- package/package.json +1 -1
- package/skills/evolve/SKILL.md +55 -40
- package/skills/status/SKILL.md +15 -19
- package/tools/evolution_chart.py +312 -0
- package/tools/run_eval.py +6 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "harness-evolver",
|
|
3
3
|
"description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
|
|
4
|
-
"version": "4.
|
|
4
|
+
"version": "4.4.0",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Raphael Valdetaro"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -91,8 +91,12 @@ claude
|
|
|
91
91
|
<td>Cross-iteration memory consolidation inspired by Claude Code's autoDream. Tracks which approaches win, which failures recur, and promotes insights after 2+ occurrences.</td>
|
|
92
92
|
</tr>
|
|
93
93
|
<tr>
|
|
94
|
+
<td><b>Dataset Health</b></td>
|
|
95
|
+
<td>Pre-flight dataset quality check: size adequacy, difficulty distribution, dead example detection, production coverage analysis, train/held-out splits. Auto-corrects issues before evolution starts.</td>
|
|
96
|
+
</tr>
|
|
97
|
+
<tr>
|
|
94
98
|
<td><b>Smart Gating</b></td>
|
|
95
|
-
<td>
|
|
99
|
+
<td>Claude assesses gate conditions directly — score plateau, target reached, diminishing returns. No hardcoded thresholds. State validation ensures config hasn't diverged from LangSmith.</td>
|
|
96
100
|
</tr>
|
|
97
101
|
<tr>
|
|
98
102
|
<td><b>Background Mode</b></td>
|
|
@@ -107,6 +111,7 @@ claude
|
|
|
107
111
|
| Command | What it does |
|
|
108
112
|
|---|---|
|
|
109
113
|
| `/evolver:setup` | Explore project, configure LangSmith (dataset, evaluators), run baseline |
|
|
114
|
+
| `/evolver:health` | Check dataset quality (size, difficulty, coverage, splits), auto-correct issues |
|
|
110
115
|
| `/evolver:evolve` | Run the optimization loop (dynamic self-organizing proposers in worktrees) |
|
|
111
116
|
| `/evolver:status` | Show progress, scores, history |
|
|
112
117
|
| `/evolver:deploy` | Tag, push, clean up temporary files |
|
|
@@ -132,10 +137,11 @@ claude
|
|
|
132
137
|
/evolver:evolve
|
|
133
138
|
|
|
|
134
139
|
+- 0.5 Validate state (skeptical memory — check .evolver.json vs LangSmith)
|
|
140
|
+
+- 0.6 /evolver:health — dataset quality check + auto-correct
|
|
135
141
|
+- 1. Read state (.evolver.json + LangSmith experiments)
|
|
136
142
|
+- 1.5 Gather trace insights (cluster errors, tokens, latency)
|
|
137
|
-
+- 1.8 Analyze per-task failures
|
|
138
|
-
+- 1.8a
|
|
143
|
+
+- 1.8 Analyze per-task failures (train split only — proposers don't see held-out)
|
|
144
|
+
+- 1.8a Claude generates strategy.md + lenses.json from analysis data
|
|
139
145
|
+- 1.9 Prepare shared proposer context (KV cache-optimized prefix)
|
|
140
146
|
+- 2. Spawn N self-organizing proposers in parallel (each in a git worktree)
|
|
141
147
|
+- 3. Run target for each candidate (code-based evaluators)
|
|
@@ -144,10 +150,10 @@ claude
|
|
|
144
150
|
+- 5. Merge winning worktree into main branch
|
|
145
151
|
+- 5.5 Regression tracking (auto-add guard examples to dataset)
|
|
146
152
|
+- 6. Report results
|
|
147
|
-
+- 6.2
|
|
153
|
+
+- 6.2 Consolidator agent updates evolution memory (runs in background)
|
|
148
154
|
+- 6.5 Auto-trigger Active Critic (detect + fix evaluator gaming)
|
|
149
155
|
+- 7. Auto-trigger ULTRAPLAN Architect (opus model, deep analysis)
|
|
150
|
-
+- 8.
|
|
156
|
+
+- 8. Claude assesses gate conditions (plateau, target, diminishing returns)
|
|
151
157
|
```
|
|
152
158
|
|
|
153
159
|
---
|
|
@@ -159,7 +165,8 @@ Plugin hook (SessionStart)
|
|
|
159
165
|
└→ Creates venv, installs langsmith + langsmith-cli, exports env vars
|
|
160
166
|
|
|
161
167
|
Skills (markdown)
|
|
162
|
-
├── /evolver:setup → explores project, runs setup.py
|
|
168
|
+
├── /evolver:setup → explores project, smart defaults, runs setup.py
|
|
169
|
+
├── /evolver:health → dataset quality check + auto-correct
|
|
163
170
|
├── /evolver:evolve → orchestrates the evolution loop
|
|
164
171
|
├── /evolver:status → reads .evolver.json + LangSmith
|
|
165
172
|
└── /evolver:deploy → tags and pushes
|
|
@@ -179,10 +186,8 @@ Tools (Python + langsmith SDK)
|
|
|
179
186
|
├── trace_insights.py → clusters errors from traces
|
|
180
187
|
├── seed_from_traces.py → imports production traces
|
|
181
188
|
├── validate_state.py → validates config vs LangSmith state
|
|
182
|
-
├──
|
|
189
|
+
├── dataset_health.py → dataset quality diagnostic (size, difficulty, coverage, splits)
|
|
183
190
|
├── regression_tracker.py → tracks regressions, adds guard examples
|
|
184
|
-
├── consolidate.py → cross-iteration memory consolidation
|
|
185
|
-
├── synthesize_strategy.py→ generates strategy document + investigation lenses
|
|
186
191
|
├── add_evaluator.py → programmatically adds evaluators
|
|
187
192
|
└── adversarial_inject.py → detects memorization, injects adversarial tests
|
|
188
193
|
```
|
|
@@ -221,6 +226,7 @@ LangSmith traces **any** AI framework. The evolver works with all of them:
|
|
|
221
226
|
- [Darwin Godel Machine](https://sakana.ai/dgm/) — Sakana AI
|
|
222
227
|
- [AlphaEvolve](https://deepmind.google/blog/alphaevolve/) — DeepMind
|
|
223
228
|
- [LangSmith Evaluation](https://docs.smith.langchain.com/evaluation) — LangChain
|
|
229
|
+
- [Harnessing Claude's Intelligence](https://claude.com/blog/harnessing-claudes-intelligence) — Martin, Anthropic, 2026
|
|
224
230
|
- [Traces Start the Agent Improvement Loop](https://www.langchain.com/conceptual-guides/traces-start-agent-improvement-loop) — LangChain
|
|
225
231
|
|
|
226
232
|
---
|
package/package.json
CHANGED
package/skills/evolve/SKILL.md
CHANGED
|
@@ -156,44 +156,36 @@ For each iteration:
|
|
|
156
156
|
python3 -c "import json; c=json.load(open('.evolver.json')); print(f'v{c[\"iterations\"]+1:03d}')"
|
|
157
157
|
```
|
|
158
158
|
|
|
159
|
-
### 1.5. Gather
|
|
159
|
+
### 1.5. Gather Analysis Data (Parallel)
|
|
160
160
|
|
|
161
|
-
Read the best experiment from config. If null (no baseline was run), skip
|
|
161
|
+
Read the best experiment from config. If null (no baseline was run), skip data gathering — proposers will work from code analysis only:
|
|
162
162
|
|
|
163
163
|
```bash
|
|
164
164
|
BEST=$(python3 -c "import json; b=json.load(open('.evolver.json')).get('best_experiment'); print(b if b else '')")
|
|
165
|
+
PROD=$(python3 -c "import json; c=json.load(open('.evolver.json')); print(c.get('production_project',''))")
|
|
166
|
+
|
|
165
167
|
if [ -n "$BEST" ]; then
|
|
168
|
+
# Run all data gathering in parallel — these are independent API calls
|
|
166
169
|
$EVOLVER_PY $TOOLS/trace_insights.py \
|
|
167
170
|
--from-experiment "$BEST" \
|
|
168
|
-
--output trace_insights.json 2>/dev/null
|
|
169
|
-
fi
|
|
170
|
-
```
|
|
171
|
+
--output trace_insights.json 2>/dev/null &
|
|
171
172
|
|
|
172
|
-
|
|
173
|
+
$EVOLVER_PY $TOOLS/read_results.py \
|
|
174
|
+
--experiment "$BEST" \
|
|
175
|
+
--config .evolver.json \
|
|
176
|
+
--split train \
|
|
177
|
+
--output best_results.json 2>/dev/null &
|
|
178
|
+
fi
|
|
173
179
|
|
|
174
|
-
```bash
|
|
175
|
-
PROD=$(python3 -c "import json; c=json.load(open('.evolver.json')); print(c.get('production_project',''))")
|
|
176
180
|
if [ -n "$PROD" ] && [ ! -f "production_seed.json" ]; then
|
|
177
181
|
$EVOLVER_PY $TOOLS/seed_from_traces.py \
|
|
178
182
|
--project "$PROD" \
|
|
179
183
|
--output-md production_seed.md \
|
|
180
184
|
--output-json production_seed.json \
|
|
181
|
-
--limit 100 2>/dev/null
|
|
185
|
+
--limit 100 2>/dev/null &
|
|
182
186
|
fi
|
|
183
|
-
```
|
|
184
187
|
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
If `$BEST` is set (not the first iteration without baseline), read results and cluster failures:
|
|
188
|
-
|
|
189
|
-
```bash
|
|
190
|
-
if [ -n "$BEST" ]; then
|
|
191
|
-
$EVOLVER_PY $TOOLS/read_results.py \
|
|
192
|
-
--experiment "$BEST" \
|
|
193
|
-
--config .evolver.json \
|
|
194
|
-
--split train \
|
|
195
|
-
--output best_results.json 2>/dev/null
|
|
196
|
-
fi
|
|
188
|
+
wait # Wait for all data gathering to complete
|
|
197
189
|
```
|
|
198
190
|
|
|
199
191
|
If `best_results.json` exists, parse it to find failing examples (score < 0.7). Group by metadata or error pattern.
|
|
@@ -338,20 +330,23 @@ done
|
|
|
338
330
|
|
|
339
331
|
Only run evaluation (Step 3) for proposers that committed changes (not abstained, not stuck).
|
|
340
332
|
|
|
341
|
-
### 3. Run Target for Each Candidate
|
|
333
|
+
### 3. Run Target for Each Candidate (Parallel)
|
|
342
334
|
|
|
343
|
-
|
|
335
|
+
Run evaluations for ALL candidates simultaneously — they're independent:
|
|
344
336
|
|
|
345
337
|
```bash
|
|
346
|
-
#
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
338
|
+
# Launch all evaluations in parallel
|
|
339
|
+
for WORKTREE in {worktree_paths_with_commits}; do
|
|
340
|
+
WORKTREE_PROJECT="$WORKTREE"
|
|
341
|
+
[ -n "$PROJECT_DIR" ] && WORKTREE_PROJECT="$WORKTREE/$PROJECT_DIR"
|
|
342
|
+
|
|
343
|
+
$EVOLVER_PY $TOOLS/run_eval.py \
|
|
344
|
+
--config "$WORKTREE_PROJECT/.evolver.json" \
|
|
345
|
+
--worktree-path "$WORKTREE_PROJECT" \
|
|
346
|
+
--experiment-prefix v{NNN}-{lens_id} \
|
|
347
|
+
--timeout 120 &
|
|
348
|
+
done
|
|
349
|
+
wait # Wait for all evaluations to complete
|
|
355
350
|
```
|
|
356
351
|
|
|
357
352
|
Each candidate becomes a separate LangSmith experiment. This step runs the agent and applies code-based evaluators (has_output, token_efficiency) only.
|
|
@@ -410,7 +405,15 @@ WINNER_BRANCH={winning_worktree_branch}
|
|
|
410
405
|
git merge $WINNER_BRANCH --no-edit -m "evolve: merge v{NNN}-{lens_id} (score: {score})"
|
|
411
406
|
```
|
|
412
407
|
|
|
413
|
-
Update `.evolver.json
|
|
408
|
+
Update `.evolver.json` with enriched history entry:
|
|
409
|
+
|
|
410
|
+
Extract winner metrics for the chart:
|
|
411
|
+
- `tokens`, `latency_ms`, `errors` → from `comparison.all_candidates` for the winner (`errors` is stored under the `error_count` key of the history entry)
|
|
412
|
+
- `passing`, `total` → count per_example scores ≥0.5 vs total from best_results.json (re-read for winner experiment)
|
|
413
|
+
- `per_evaluator` → average each evaluator's scores across per_example from best_results.json
|
|
414
|
+
- `approach` → first line of `## Approach` section from winner's proposal.md
|
|
415
|
+
- `lens` → the `source` field from the winning proposer's lens in lenses.json
|
|
416
|
+
|
|
414
417
|
```python
|
|
415
418
|
import json
|
|
416
419
|
c = json.load(open('.evolver.json'))
|
|
@@ -420,7 +423,15 @@ c['iterations'] = c['iterations'] + 1
|
|
|
420
423
|
c['history'].append({
|
|
421
424
|
'version': 'v{NNN}',
|
|
422
425
|
'experiment': '{winner_experiment}',
|
|
423
|
-
'score': {winner_score}
|
|
426
|
+
'score': {winner_score},
|
|
427
|
+
'tokens': {winner_tokens},
|
|
428
|
+
'latency_ms': {winner_latency_ms},
|
|
429
|
+
'error_count': {winner_errors},
|
|
430
|
+
'passing': {winner_passing},
|
|
431
|
+
'total': {winner_total},
|
|
432
|
+
'per_evaluator': {winner_per_evaluator_dict},
|
|
433
|
+
'approach': '{approach_from_proposal_md}',
|
|
434
|
+
'lens': '{lens_source}'
|
|
424
435
|
})
|
|
425
436
|
json.dump(c, open('.evolver.json', 'w'), indent=2)
|
|
426
437
|
```
|
|
@@ -534,9 +545,13 @@ If stopping, skip to the final report. If continuing, proceed to next iteration.
|
|
|
534
545
|
|
|
535
546
|
## When Loop Ends — Final Report
|
|
536
547
|
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
548
|
+
Display the evolution chart:
|
|
549
|
+
|
|
550
|
+
```bash
|
|
551
|
+
$EVOLVER_PY $TOOLS/evolution_chart.py --config .evolver.json
|
|
552
|
+
```
|
|
553
|
+
|
|
554
|
+
Then add:
|
|
555
|
+
- LangSmith experiment URL for the best experiment (construct from project name)
|
|
556
|
+
- `git log --oneline` from baseline to current HEAD (key changes summary)
|
|
542
557
|
- Suggest: `/evolver:deploy` to finalize
|
package/skills/status/SKILL.md
CHANGED
|
@@ -10,27 +10,23 @@ Show current evolution progress.
|
|
|
10
10
|
|
|
11
11
|
## What To Do
|
|
12
12
|
|
|
13
|
-
|
|
13
|
+
### Resolve Tool Path
|
|
14
14
|
|
|
15
15
|
```bash
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
c = json.load(open('.evolver.json'))
|
|
19
|
-
print(f'Project: {c[\"project\"]}')
|
|
20
|
-
print(f'Dataset: {c[\"dataset\"]}')
|
|
21
|
-
print(f'Framework: {c[\"framework\"]}')
|
|
22
|
-
print(f'Evaluators: {c[\"evaluators\"]}')
|
|
23
|
-
print(f'Iterations: {c[\"iterations\"]}')
|
|
24
|
-
print(f'Best: {c[\"best_experiment\"]} (score: {c[\"best_score\"]:.3f})')
|
|
25
|
-
print(f'Baseline: {c[\"history\"][0][\"score\"]:.3f}' if c['history'] else 'No baseline')
|
|
26
|
-
print()
|
|
27
|
-
print('History:')
|
|
28
|
-
for h in c.get('history', []):
|
|
29
|
-
print(f' {h[\"version\"]}: {h[\"score\"]:.3f}')
|
|
30
|
-
"
|
|
16
|
+
TOOLS="${EVOLVER_TOOLS:-$([ -d ".evolver/tools" ] && echo ".evolver/tools" || echo "$HOME/.evolver/tools")}"
|
|
17
|
+
EVOLVER_PY="${EVOLVER_PY:-$([ -f "$HOME/.evolver/venv/bin/python" ] && echo "$HOME/.evolver/venv/bin/python" || echo "python3")}"
|
|
31
18
|
```
|
|
32
19
|
|
|
33
|
-
|
|
34
|
-
Detect regression: if current best is lower than a previous best, warn.
|
|
20
|
+
### Display Chart
|
|
35
21
|
|
|
36
|
-
|
|
22
|
+
```bash
|
|
23
|
+
$EVOLVER_PY $TOOLS/evolution_chart.py --config .evolver.json
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### Additional Analysis
|
|
27
|
+
|
|
28
|
+
After displaying the chart:
|
|
29
|
+
|
|
30
|
+
- Detect stagnation: if the last 3 scores are within 1% of each other, warn and suggest `/evolver:evolve` with architect trigger.
|
|
31
|
+
- Detect regression: if current best is lower than a previous best, warn.
|
|
32
|
+
- Print LangSmith experiment URL for the best experiment if available.
|
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Evolution chart — ASCII visualization of agent optimization progress.
|
|
3
|
+
|
|
4
|
+
Reads .evolver.json history and optionally best_results.json to render
|
|
5
|
+
a rich terminal chart with score progression, per-evaluator breakdown,
|
|
6
|
+
change narrative, and horizontal bar chart.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python3 evolution_chart.py --config .evolver.json
|
|
10
|
+
python3 evolution_chart.py --config .evolver.json --no-color
|
|
11
|
+
|
|
12
|
+
Stdlib-only — no langsmith dependency.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
import sys
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Colors:
    """Palette of ANSI escape sequences used by the chart renderers.

    Every attribute is a ready-to-interpolate string. When the palette is
    created with a falsy ``enabled`` flag, all attributes are empty strings,
    so callers can embed them in f-strings unconditionally.
    """

    # (attribute name, SGR escape sequence)
    _CODES = (
        ('G', '\033[32m'),    # green
        ('R', '\033[31m'),    # red
        ('Y', '\033[33m'),    # yellow
        ('C', '\033[36m'),    # cyan
        ('B', '\033[1m'),     # bold
        ('D', '\033[90m'),    # bright black (dim grey)
        ('RST', '\033[0m'),   # reset all attributes
    )

    def __init__(self, enabled=True):
        """Populate the palette attributes.

        Args:
            enabled: When truthy, attributes hold the escape codes; otherwise
                they are all ``''``.
        """
        for attr, code in self._CODES:
            setattr(self, attr, code if enabled else '')
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def sparkline(values):
    """Render a numeric series as a one-line string of block glyphs.

    Each value is scaled linearly between the series minimum and maximum and
    mapped onto one of nine glyphs (a space for the minimum up to a full
    block for the maximum).

    Args:
        values: Sequence of numbers.

    Returns:
        A string with one glyph per value, or ``''`` for an empty sequence.
        When all values are equal, every glyph is the lowest one (a space).
    """
    if not values:
        return ''

    glyphs = ' ▁▂▃▄▅▆▇█'
    low = min(values)
    # A zero span (flat series) falls back to 1 to avoid division by zero.
    span = (max(values) - low) or 1

    out = []
    for value in values:
        level = int((value - low) / span * 8)
        out.append(glyphs[min(8, level)])
    return ''.join(out)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def hbar(val, width, c):
    """Render a horizontal bar of exactly ``width`` cells.

    Args:
        val: Fraction of the bar to fill. Values outside ``[0, 1]`` are
            clamped so the bar never grows beyond ``width`` cells.
        width: Total number of cells in the bar.
        c: Palette object exposing ``G``, ``D`` and ``RST`` string attributes.

    Returns:
        The filled cells coloured with ``c.G``, the remainder with ``c.D``,
        terminated with ``c.RST``.
    """
    # Without the clamp, val > 1 yields more than `width` filled cells and a
    # negative remainder, which silently multiplies to an empty string.
    filled = max(0, min(width, round(val * width)))
    return f'{c.G}{"█" * filled}{c.D}{"░" * (width - filled)}{c.RST}'
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def fmt_tokens(t):
    """Format a token count compactly for table display.

    Args:
        t: Token count; any falsy value (``0``, ``None``) means "no data".

    Returns:
        ``'—'`` for falsy input, ``'<x.x>M'`` from one million upwards,
        ``'<x.x>k'`` from one thousand upwards, otherwise ``str(t)``.
    """
    if not t:
        return '—'
    # Largest unit first so the first matching threshold wins.
    for threshold, suffix in ((1_000_000, 'M'), (1000, 'k')):
        if t >= threshold:
            return f'{t / threshold:.1f}{suffix}'
    return str(t)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def trend_icon(delta, is_best, c):
    """Pick a coloured one-glyph marker for a score change.

    Args:
        delta: Score change relative to the previous entry.
        is_best: Whether the caller flagged this entry as the best one.
        c: Palette object exposing ``G``, ``R``, ``Y`` and ``RST`` attributes.

    Returns:
        A green star for a flagged entry that did not drop, a green up-arrow
        for any other increase, a red down-arrow for a drop larger than
        0.01, and a yellow dash for everything else (no change or a drop of
        at most 0.01).
    """
    if is_best and delta >= 0:
        colour, glyph = c.G, '★'
    elif delta > 0:
        colour, glyph = c.G, '▲'
    elif delta < -0.01:
        colour, glyph = c.R, '▼'
    else:
        colour, glyph = c.Y, '━'
    return f'{colour}{glyph}{c.RST}'
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def render_header(config, history, scores, c):
    """Render the boxed report header.

    Args:
        config: Parsed config dict; ``project``, ``dataset``, ``evaluators``
            and ``num_examples`` are read with fallbacks.
        history: Non-empty list of history entries; the first entry's
            ``total`` (if present) is shown as the example count.
        scores: Scores in history order; ``scores[0]`` is the baseline.
        c: Palette object exposing ``C``, ``B``, ``D``, ``G`` and ``RST``.

    Returns:
        The header box as a newline-joined string.
    """
    W = 70

    project = config.get('project', 'unknown')
    dataset = config.get('dataset', 'unknown')
    evals_str = ' · '.join(config.get('evaluators', []))
    total = history[0].get('total', config.get('num_examples', '?'))

    base_score = scores[0]
    best_score = max(scores)
    # The first history entry is the baseline, so it is not an iteration.
    iters = len(history) - 1
    if base_score > 0:
        pct = (best_score - base_score) / base_score * 100
    else:
        pct = 0
    spark = sparkline(scores)

    wall = f'{c.C}║{c.RST}'
    dataset_pad = " " * max(0, W - 22 - len(dataset) - len(str(total)))
    trend_pad = " " * max(0, W - 40 - len(spark))

    rows = [
        f' {c.C}╔{"═" * W}╗{c.RST}',
        f' {wall} {c.B}EVOLUTION REPORT{c.RST}{" " * (W - 18)}{wall}',
        f' {wall} {project:<{W - 16}}{c.D}{iters} iterations{c.RST} {wall}',
        f' {wall} {c.D}dataset{c.RST} {dataset} ({total} examples){dataset_pad}{wall}',
        f' {wall} {c.D}evals{c.RST} {evals_str:<{W - 11}}{wall}',
        f' {wall} {c.D}trend{c.RST} {spark} {base_score:.3f} → {c.G}{c.B}{best_score:.3f}{c.RST} {c.G}(+{pct:.1f}%){c.RST}{trend_pad}{wall}',
        f' {c.C}╚{"═" * W}╝{c.RST}',
    ]
    return '\n'.join(rows)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def render_score_table(history, scores, c):
    """Render the per-version score progression table.

    Args:
        history: List of history entries. Each must contain ``version`` and
            ``score``; ``passing``, ``total``, ``error_count`` (falling back
            to ``errors``), ``tokens`` and ``latency_ms`` are optional.
        scores: Scores in history order; ``scores[0]`` is the baseline.
        c: Palette object exposing ``G``, ``R``, ``Y``, ``B``, ``D``, ``RST``.

    Returns:
        The table (title, column header and one row per entry) as a
        newline-joined string.
    """
    base = scores[0]
    best = max(scores)
    W = 70
    rule = f' {c.D}{"─" * W}{c.RST}'
    last = len(history) - 1

    lines = [
        f' {c.B}SCORE PROGRESSION{c.RST}',
        rule,
        f' {c.D}{"Version":<10}{"Score":>6}{"Δ":>8}{"vs Base":>9}{"Pass":>7}{"Err":>5}{"Tokens":>8}{"Latency":>9}{c.RST}',
        rule,
    ]

    for i, h in enumerate(history):
        v = h['version']
        s = h['score']
        passing = h.get('passing')
        total = h.get('total')
        errors = h.get('error_count', h.get('errors'))
        tokens = h.get('tokens', 0)
        latency = h.get('latency_ms', 0)

        # Pad the plain text BEFORE wrapping it in colour codes: a width
        # spec applied to a string that already contains escape sequences
        # counts the invisible characters and so adds no padding at all.
        s_txt = f'{s:.3f}'.rjust(6)
        s_str = f'{c.G}{c.B}{s_txt}{c.RST}' if s == best else s_txt

        if i == 0:
            # Baseline row: nothing to compare against.
            d_str = f'{c.D}{"—":>7}{c.RST}'
            p_str = f'{c.D}{"—":>8}{c.RST}'
            icon = ''
        else:
            d = s - history[i - 1]['score']
            pct = ((s - base) / base * 100) if base > 0 else 0
            dc = c.G if d > 0 else (c.R if d < 0 else c.Y)
            d_str = f'{dc}{d:>+7.3f}{c.RST}'
            p_str = f'{dc}{pct:>+7.1f}%{c.RST}'
            # Only the final row can receive the "best" marker.
            icon = trend_icon(d, i == last and s == best, c)

        if passing is not None and total is not None:
            pass_str = f'{passing}/{total}'
        else:
            pass_str = '—'

        if errors is not None:
            e_txt = str(errors).rjust(3)
            e_str = f'{c.R}{e_txt}{c.RST}' if errors > 0 else f'{c.G}{e_txt}{c.RST}'
        else:
            e_str = '—'.rjust(3)

        tok_str = fmt_tokens(tokens)
        lat_str = f'{latency}ms' if latency else '—'

        lines.append(f' {v:<10}{s_str} {d_str} {p_str} {pass_str:>5} {e_str} {tok_str:>6} {lat_str:>6} {icon}')

    return '\n'.join(lines)
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def render_evaluator_breakdown(history, config, best_results, c):
    """Render the per-evaluator section of the report.

    Two data sources are supported. If any history entry carries a
    ``per_evaluator`` mapping, one row per configured evaluator shows the
    first and last recorded values plus a bar and sparkline. Otherwise, if
    ``best_results`` is available, the per-example scores in it are averaged
    per evaluator.

    Args:
        history: List of history entries (dicts).
        config: Parsed config dict; ``evaluators`` lists the names to show.
        best_results: Parsed best-results dict or ``None``; its
            ``per_example`` values are expected to hold a ``scores`` mapping.
        c: Palette object exposing ``G``, ``R``, ``Y``, ``B``, ``D``, ``RST``.

    Returns:
        The section as a newline-joined string, or ``None`` when there are no
        configured evaluators or no data to show.
    """
    evaluators = config.get('evaluators', [])
    if not evaluators:
        return None

    has_per_eval = any(h.get('per_evaluator') for h in history)
    if not has_per_eval and not best_results:
        return None

    W = 70
    rule = f' {c.D}{"─" * W}{c.RST}'
    lines = [f' {c.B}PER-EVALUATOR BREAKDOWN{c.RST}', rule]

    if has_per_eval:
        lines.append(f' {c.D}{"Evaluator":<20}{"Base":>6}{"Best":>7}{"Δ":>7} {"":20} Trend{c.RST}')
        lines.append(rule)

        for ev in evaluators:
            # `or {}` tolerates entries whose per_evaluator is present but null.
            vals = [(h.get('per_evaluator') or {}).get(ev, 0) for h in history]
            bv = vals[0]
            # NOTE(review): the "Best" column shows the most recent entry,
            # not the maximum over history — confirm this is intended.
            best_v = vals[-1]
            delta = best_v - bv
            # Same three-way colouring as the score table: an unchanged
            # value is neutral (yellow), not a regression (red).
            dc = c.G if delta > 0 else (c.R if delta < 0 else c.Y)
            spark_ev = sparkline(vals)

            lines.append(
                f' {ev:<20}{bv:>5.2f} → {dc}{c.B}{best_v:.2f}{c.RST}'
                f' {dc}{delta:>+6.2f}{c.RST}'
                f' {hbar(best_v, 20, c)}'
                f' {spark_ev}'
            )
    else:
        # best_results is truthy here (guaranteed by the early return above).
        lines.append(f' {c.D}{"Evaluator":<20}{"Avg Score":>10} {"":20}{c.RST}')
        lines.append(rule)

        eval_scores = {}
        for ex_data in best_results.get('per_example', {}).values():
            for ev_name, ev_score in ex_data.get('scores', {}).items():
                # A missing (null) score cannot be averaged; skip it.
                if ev_score is not None:
                    eval_scores.setdefault(ev_name, []).append(ev_score)

        for ev in evaluators:
            if ev in eval_scores:
                avg = sum(eval_scores[ev]) / len(eval_scores[ev])
                lines.append(f' {ev:<20}{avg:>9.3f} {hbar(avg, 20, c)}')

    return '\n'.join(lines)
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def render_what_changed(history, c):
    """Render the narrative section listing what each iteration changed.

    Args:
        history: List of history entries; each needs ``version`` and
            ``score``, and may carry ``approach`` and ``lens``.
        c: Palette object exposing ``G``, ``R``, ``Y``, ``B``, ``D``, ``RST``.

    Returns:
        One line per non-baseline entry as a newline-joined string, or
        ``None`` when no entry after the first has an ``approach``.
    """
    if not any(entry.get('approach') for entry in history[1:]):
        return None

    W = 70
    best_score = max(entry['score'] for entry in history)
    final_idx = len(history) - 1

    out = [
        f' {c.B}WHAT CHANGED{c.RST}',
        f' {c.D}{"─" * W}{c.RST}',
    ]

    # Walk consecutive (previous, current) pairs; the baseline has no row.
    for idx, (prev, h) in enumerate(zip(history, history[1:]), start=1):
        d = h['score'] - prev['score']
        if d > 0:
            dc = c.G
        elif d < 0:
            dc = c.R
        else:
            dc = c.Y

        icon = trend_icon(d, idx == final_idx and h['score'] == best_score, c)
        # Truncate the description to the 42-character column width.
        approach = (h.get('approach') or '—')[:42]
        lens = h.get('lens', '')
        lens_str = f' {c.D}[{lens}]{c.RST}' if lens else ''

        out.append(f' {h["version"]:<6} {icon} {dc}{d:>+.3f}{c.RST} {approach:<42}{lens_str}')

    return '\n'.join(out)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def render_bar_chart(history, scores, c):
    """Render one horizontal score bar per history entry on a 0–1 axis.

    Args:
        history: List of history entries; each needs ``version`` and
            ``score``.
        scores: Scores in history order; the first maximum marks the best
            entry.
        c: Palette object exposing ``G``, ``C``, ``B``, ``D`` and ``RST``.

    Returns:
        The chart (title, bars and axis legend) as a newline-joined string.
    """
    best = max(scores)
    best_idx = scores.index(best)
    BAR_W = 40
    W = 70

    lines = [
        f' {c.B}SCORE CHART{c.RST}',
        f' {c.D}{"─" * W}{c.RST}',
    ]

    for i, h in enumerate(history):
        v = h['version']
        s = h['score']
        # The axis only spans 0..1, so clamp the fill: a score above 1 would
        # otherwise overflow the bar, and a negative remainder silently
        # multiplies to an empty string.
        filled = max(0, min(BAR_W, round(s * BAR_W)))

        if i == best_idx:
            bar_colour = c.G
            score_str = f'{c.G}{c.B}{s:.3f}{c.RST}'
        elif i == 0:
            # The baseline is cyan unless it is also the best entry.
            bar_colour = c.C
            score_str = f'{c.C}{s:.3f}{c.RST}'
        else:
            bar_colour = c.G
            score_str = f'{s:.3f}'

        bar_str = f'{bar_colour}{"█" * filled}{c.D}{"░" * (BAR_W - filled)}{c.RST}'
        lines.append(f' {v:<10}{bar_str} {score_str}')

    # Axis ticks and labels under the bars.
    lines.append(f' {c.D}{" " * 10}|{" " * 9}|{" " * 9}|{" " * 9}|{" " * 9}|{c.RST}')
    lines.append(f' {c.D}{" " * 10}0{" " * 8}.25{" " * 7}.50{" " * 7}.75{" " * 8}1.0{c.RST}')

    return '\n'.join(lines)
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def main():
    """Command-line entry point: load the config and render the chart.

    Reads the config given by ``--config`` and, if it exists, the
    best-results file (``--best-results``, or ``best_results.json`` next to
    the config). The rendered sections go to stdout, or to the ``--output``
    file when given. Exits with status 1 when the config file is missing or
    has no history.

    ANSI colours are used only when ``--no-color`` is absent, stdout is a
    terminal and no ``--output`` file was requested.
    """
    parser = argparse.ArgumentParser(description="Render evolution progress chart")
    parser.add_argument("--config", default=".evolver.json", help="Path to .evolver.json")
    parser.add_argument("--best-results", default=None, help="Path to best_results.json (auto-detected if not set)")
    parser.add_argument("--no-color", action="store_true", help="Disable ANSI colors")
    parser.add_argument("--output", default=None, help="Write output to file instead of stdout")
    args = parser.parse_args()

    if not os.path.exists(args.config):
        print(f"Config not found: {args.config}", file=sys.stderr)
        sys.exit(1)

    # JSON is read as UTF-8 explicitly instead of relying on the locale's
    # default encoding.
    with open(args.config, encoding='utf-8') as f:
        config = json.load(f)

    history = config.get('history', [])
    if not history:
        print("No history data in config.", file=sys.stderr)
        sys.exit(1)

    best_results = None
    br_path = args.best_results or os.path.join(os.path.dirname(args.config) or '.', 'best_results.json')
    if os.path.exists(br_path):
        with open(br_path, encoding='utf-8') as f:
            best_results = json.load(f)

    use_color = not args.no_color and sys.stdout.isatty() and args.output is None
    c = Colors(enabled=use_color)

    scores = [h['score'] for h in history]

    # Renderers return None when they have nothing to show; those sections
    # are dropped below while the blank separators are kept.
    sections = [
        '',
        render_header(config, history, scores, c),
        '',
        render_score_table(history, scores, c),
        '',
        render_evaluator_breakdown(history, config, best_results, c),
        '',
        render_what_changed(history, c),
        '',
        render_bar_chart(history, scores, c),
        '',
    ]

    output = '\n'.join(s for s in sections if s is not None)

    if args.output:
        # The chart consists of box-drawing and block glyphs, which a
        # non-UTF-8 locale encoding cannot represent; write UTF-8 always.
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output + '\n')
        print(f"Chart written to {args.output}", file=sys.stderr)
    else:
        print(output)


if __name__ == "__main__":
    main()
|
package/tools/run_eval.py
CHANGED
|
@@ -166,11 +166,14 @@ def main():
|
|
|
166
166
|
parser.add_argument("--worktree-path", required=True, help="Path to the candidate's worktree")
|
|
167
167
|
parser.add_argument("--experiment-prefix", required=True, help="Experiment name prefix (e.g. v001a)")
|
|
168
168
|
parser.add_argument("--timeout", type=int, default=120, help="Per-task timeout in seconds")
|
|
169
|
+
parser.add_argument("--concurrency", type=int, default=None, help="Max concurrent evaluations (default: from config or 1)")
|
|
169
170
|
args = parser.parse_args()
|
|
170
171
|
|
|
171
172
|
with open(args.config) as f:
|
|
172
173
|
config = json.load(f)
|
|
173
174
|
|
|
175
|
+
concurrency = args.concurrency or config.get("eval_concurrency", 1)
|
|
176
|
+
|
|
174
177
|
os.environ["EVAL_TASK_TIMEOUT"] = str(args.timeout)
|
|
175
178
|
ensure_langsmith_api_key()
|
|
176
179
|
|
|
@@ -188,6 +191,8 @@ def main():
|
|
|
188
191
|
print(f" Dataset: {config['dataset']}")
|
|
189
192
|
print(f" Worktree: {args.worktree_path}")
|
|
190
193
|
print(f" Code evaluators: {['has_output'] + code_evaluators}")
|
|
194
|
+
if concurrency > 1:
|
|
195
|
+
print(f" Concurrency: {concurrency} parallel evaluations")
|
|
191
196
|
if llm_evaluators:
|
|
192
197
|
print(f" Pending LLM evaluators (agent): {llm_evaluators}")
|
|
193
198
|
|
|
@@ -197,7 +202,7 @@ def main():
|
|
|
197
202
|
data=config["dataset"],
|
|
198
203
|
evaluators=evaluators,
|
|
199
204
|
experiment_prefix=args.experiment_prefix,
|
|
200
|
-
max_concurrency=
|
|
205
|
+
max_concurrency=concurrency,
|
|
201
206
|
)
|
|
202
207
|
|
|
203
208
|
experiment_name = results.experiment_name
|