harness-evolver 4.3.1 → 4.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "harness-evolver",
3
3
  "description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
4
- "version": "4.3.1",
4
+ "version": "4.4.0",
5
5
  "author": {
6
6
  "name": "Raphael Valdetaro"
7
7
  },
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "harness-evolver",
3
- "version": "4.3.1",
3
+ "version": "4.4.0",
4
4
  "description": "LangSmith-native autonomous agent optimization for Claude Code",
5
5
  "author": "Raphael Valdetaro",
6
6
  "license": "MIT",
@@ -405,7 +405,15 @@ WINNER_BRANCH={winning_worktree_branch}
405
405
  git merge $WINNER_BRANCH --no-edit -m "evolve: merge v{NNN}-{lens_id} (score: {score})"
406
406
  ```
407
407
 
408
- Update `.evolver.json`:
408
+ Update `.evolver.json` with enriched history entry:
409
+
410
+ Extract winner metrics for the chart:
411
+ - `tokens`, `latency_ms`, `errors` (stored in the history entry as `error_count`) → from `comparison.all_candidates` for the winner
412
+ - `passing`, `total` → count per_example scores ≥0.5 vs total from best_results.json (re-read for winner experiment)
413
+ - `per_evaluator` → average each evaluator's scores across per_example from best_results.json
414
+ - `approach` → first line of `## Approach` section from winner's proposal.md
415
+ - `lens` → the `source` field from the winning proposer's lens in lenses.json
416
+
409
417
  ```python
410
418
  import json
411
419
  c = json.load(open('.evolver.json'))
@@ -415,7 +423,15 @@ c['iterations'] = c['iterations'] + 1
415
423
  c['history'].append({
416
424
  'version': 'v{NNN}',
417
425
  'experiment': '{winner_experiment}',
418
- 'score': {winner_score}
426
+ 'score': {winner_score},
427
+ 'tokens': {winner_tokens},
428
+ 'latency_ms': {winner_latency_ms},
429
+ 'error_count': {winner_errors},
430
+ 'passing': {winner_passing},
431
+ 'total': {winner_total},
432
+ 'per_evaluator': {winner_per_evaluator_dict},
433
+ 'approach': '{approach_from_proposal_md}',
434
+ 'lens': '{lens_source}'
419
435
  })
420
436
  json.dump(c, open('.evolver.json', 'w'), indent=2)
421
437
  ```
@@ -529,9 +545,13 @@ If stopping, skip to the final report. If continuing, proceed to next iteration.
529
545
 
530
546
  ## When Loop Ends — Final Report
531
547
 
532
- - Best version and score
533
- - Improvement over baseline (absolute and %)
534
- - Total iterations run
535
- - Key changes made (git log from baseline to current)
536
- - LangSmith experiment URLs for comparison
548
+ Display the evolution chart:
549
+
550
+ ```bash
551
+ $EVOLVER_PY $TOOLS/evolution_chart.py --config .evolver.json
552
+ ```
553
+
554
+ Then add:
555
+ - LangSmith experiment URL for the best experiment (construct from project name)
556
+ - `git log --oneline` from baseline to current HEAD (key changes summary)
537
557
  - Suggest: `/evolver:deploy` to finalize
@@ -10,27 +10,23 @@ Show current evolution progress.
10
10
 
11
11
  ## What To Do
12
12
 
13
- Read `.evolver.json` and report:
13
+ ### Resolve Tool Path
14
14
 
15
15
  ```bash
16
- python3 -c "
17
- import json
18
- c = json.load(open('.evolver.json'))
19
- print(f'Project: {c[\"project\"]}')
20
- print(f'Dataset: {c[\"dataset\"]}')
21
- print(f'Framework: {c[\"framework\"]}')
22
- print(f'Evaluators: {c[\"evaluators\"]}')
23
- print(f'Iterations: {c[\"iterations\"]}')
24
- print(f'Best: {c[\"best_experiment\"]} (score: {c[\"best_score\"]:.3f})')
25
- print(f'Baseline: {c[\"history\"][0][\"score\"]:.3f}' if c['history'] else 'No baseline')
26
- print()
27
- print('History:')
28
- for h in c.get('history', []):
29
- print(f' {h[\"version\"]}: {h[\"score\"]:.3f}')
30
- "
16
+ TOOLS="${EVOLVER_TOOLS:-$([ -d ".evolver/tools" ] && echo ".evolver/tools" || echo "$HOME/.evolver/tools")}"
17
+ EVOLVER_PY="${EVOLVER_PY:-$([ -f "$HOME/.evolver/venv/bin/python" ] && echo "$HOME/.evolver/venv/bin/python" || echo "python3")}"
31
18
  ```
32
19
 
33
- Detect stagnation: if last 3 scores are within 1% of each other, warn.
34
- Detect regression: if current best is lower than a previous best, warn.
20
+ ### Display Chart
35
21
 
36
- Print LangSmith URL for the best experiment if available.
22
+ ```bash
23
+ $EVOLVER_PY $TOOLS/evolution_chart.py --config .evolver.json
24
+ ```
25
+
26
+ ### Additional Analysis
27
+
28
+ After displaying the chart:
29
+
30
+ - Detect stagnation: if the last 3 scores are within 1% of each other, warn and suggest `/evolver:evolve` with architect trigger.
31
+ - Detect regression: if current best is lower than a previous best, warn.
32
+ - Print LangSmith experiment URL for the best experiment if available.
@@ -0,0 +1,312 @@
1
+ #!/usr/bin/env python3
2
+ """Evolution chart — ASCII visualization of agent optimization progress.
3
+
4
+ Reads .evolver.json history and optionally best_results.json to render
5
+ a rich terminal chart with score progression, per-evaluator breakdown,
6
+ change narrative, and horizontal bar chart.
7
+
8
+ Usage:
9
+ python3 evolution_chart.py --config .evolver.json
10
+ python3 evolution_chart.py --config .evolver.json --no-color
11
+
12
+ Stdlib-only — no langsmith dependency.
13
+ """
14
+
15
+ import argparse
16
+ import json
17
+ import os
18
+ import sys
19
+
20
+
21
class Colors:
    """Holder for the ANSI escape sequences used by the chart renderers.

    Every attribute is an escape code when ``enabled`` is truthy and an empty
    string otherwise, so renderers can interpolate them unconditionally.

    Attributes:
        G, R, Y, C: foreground colour codes (SGR 32, 31, 33, 36).
        B: SGR 1.
        D: SGR 90.
        RST: SGR 0 (reset).
    """

    def __init__(self, enabled=True):
        palette = (
            ('G', '\033[32m'),
            ('R', '\033[31m'),
            ('Y', '\033[33m'),
            ('C', '\033[36m'),
            ('B', '\033[1m'),
            ('D', '\033[90m'),
            ('RST', '\033[0m'),
        )
        for attr, code in palette:
            setattr(self, attr, code if enabled else '')
34
+
35
+
36
def sparkline(values):
    """Render a sequence of numbers as a one-line block-character sparkline.

    Values are scaled linearly between the minimum and maximum of ``values``
    onto nine glyph levels; the lowest level is a blank space.  When all
    values are equal the range falls back to 1, so every cell is the lowest
    level.

    Returns:
        A string with one glyph per input value, or ``''`` for empty input.
    """
    if not values:
        return ''

    glyphs = ' ▁▂▃▄▅▆▇█'
    low = min(values)
    span = (max(values) - low) or 1

    cells = []
    for value in values:
        level = int((value - low) / span * 8)
        cells.append(glyphs[min(8, level)])
    return ''.join(cells)
43
+
44
+
45
def hbar(val, width, c):
    """Return a fixed-width horizontal bar for a fractional value.

    Args:
        val: Fraction of the bar to fill; values outside ``[0, 1]`` are
            clamped so the bar never exceeds or underflows ``width``.
        width: Total number of cells in the bar.
        c: Colour holder providing ``G``, ``D`` and ``RST`` attributes.

    Returns:
        A string of ``width`` cells: filled cells (``█``) followed by empty
        cells (``░``), wrapped in the colour codes from ``c``.
    """
    # Clamp: without this, val > 1 produced more than `width` filled cells and
    # val < 0 produced more than `width` empty cells, breaking table layout.
    filled = max(0, min(width, round(val * width)))
    return f'{c.G}{"█" * filled}{c.D}{"░" * (width - filled)}{c.RST}'
48
+
49
+
50
def fmt_tokens(t):
    """Format a token count compactly for table display.

    Returns ``'—'`` for any falsy value (``0``, ``None``), a one-decimal
    ``M`` figure from one million upwards, a one-decimal ``k`` figure from
    one thousand upwards, and ``str(t)`` otherwise.
    """
    if not t:
        return '—'
    # Checked largest-first so a value picks the biggest unit it reaches.
    for floor, suffix in ((1_000_000, 'M'), (1000, 'k')):
        if t >= floor:
            return f'{t / floor:.1f}{suffix}'
    return str(t)
58
+
59
+
60
def trend_icon(delta, is_best, c):
    """Pick a coloured one-character marker for a score change.

    Args:
        delta: Score change relative to the previous entry.
        is_best: Whether the caller considers this entry the best one.
        c: Colour holder providing ``G``, ``R``, ``Y`` and ``RST``.

    Returns:
        Green ``★`` when ``is_best`` and ``delta >= 0``; green ``▲`` for any
        other positive delta; red ``▼`` when delta is below ``-0.01``;
        yellow ``━`` for everything else (zero or a drop of at most 0.01).
    """
    if is_best and delta >= 0:
        colour, glyph = c.G, '★'
    elif delta > 0:
        colour, glyph = c.G, '▲'
    elif delta < -0.01:
        colour, glyph = c.R, '▼'
    else:
        colour, glyph = c.Y, '━'
    return f'{colour}{glyph}{c.RST}'
70
+
71
+
72
def render_header(config, history, scores, c):
    """Build the boxed "EVOLUTION REPORT" banner.

    Args:
        config: Parsed config dict; ``project``, ``dataset``, ``evaluators``
            and ``num_examples`` are read with fallbacks.
        history: Non-empty list of history entries; the first entry's
            ``total`` is preferred over ``config['num_examples']``.
        scores: Scores in history order; ``scores[0]`` is the baseline.
        c: Colour holder.

    Returns:
        Seven newline-joined lines: top border, title, project/iteration
        count, dataset, evaluators, trend (sparkline, baseline → best and
        percentage gain), bottom border.
    """
    project = config.get('project', 'unknown')
    dataset = config.get('dataset', 'unknown')
    evals_str = ' · '.join(config.get('evaluators', []))
    total = history[0].get('total', config.get('num_examples', '?'))
    base_score = scores[0]
    best_score = max(scores)
    # The first history entry is the baseline, so it is not an iteration.
    iters = len(history) - 1
    # Relative gain over baseline; reported as 0 when the baseline is not positive.
    pct = ((best_score - base_score) / base_score * 100) if base_score > 0 else 0
    spark = sparkline(scores)

    W = 70
    border = '═' * W
    dataset_pad = ' ' * max(0, W - 22 - len(dataset) - len(str(total)))
    trend_pad = ' ' * max(0, W - 40 - len(spark))

    rows = [
        f' {c.C}╔{border}╗{c.RST}',
        f' {c.C}║{c.RST} {c.B}EVOLUTION REPORT{c.RST}{" " * (W - 18)}{c.C}║{c.RST}',
        f' {c.C}║{c.RST} {project:<{W - 16}}{c.D}{iters} iterations{c.RST} {c.C}║{c.RST}',
        f' {c.C}║{c.RST} {c.D}dataset{c.RST} {dataset} ({total} examples){dataset_pad}{c.C}║{c.RST}',
        f' {c.C}║{c.RST} {c.D}evals{c.RST} {evals_str:<{W - 11}}{c.C}║{c.RST}',
        f' {c.C}║{c.RST} {c.D}trend{c.RST} {spark} {base_score:.3f} → {c.G}{c.B}{best_score:.3f}{c.RST} {c.G}(+{pct:.1f}%){c.RST}{trend_pad}{c.C}║{c.RST}',
        f' {c.C}╚{border}╝{c.RST}',
    ]
    return '\n'.join(rows)
94
+
95
+
96
def render_score_table(history, scores, c):
    """Build the "SCORE PROGRESSION" table, one row per history entry.

    Args:
        history: List of history entries.  ``version`` and ``score`` are
            required; ``passing``, ``total``, ``error_count`` (or legacy
            ``errors``), ``tokens`` and ``latency_ms`` are optional and
            shown as ``—`` when missing.
        scores: Scores in history order; ``scores[0]`` is the baseline.
        c: Colour holder.

    Returns:
        The newline-joined table: title, rule, column header, rule, rows.
    """
    base = scores[0]
    best = max(scores)
    W = 70
    rule = f' {c.D}{"─" * W}{c.RST}'

    lines = [
        f' {c.B}SCORE PROGRESSION{c.RST}',
        rule,
        f' {c.D}{"Version":<10}{"Score":>6}{"Δ":>8}{"vs Base":>9}{"Pass":>7}{"Err":>5}{"Tokens":>8}{"Latency":>9}{c.RST}',
        rule,
    ]

    last = len(history) - 1
    for i, h in enumerate(history):
        v = h['version']
        s = h['score']
        passing = h.get('passing')
        total = h.get('total')
        errors = h.get('error_count', h.get('errors'))
        tokens = h.get('tokens', 0)
        latency = h.get('latency_ms', 0)

        # Pad the plain text BEFORE wrapping it in colour codes.  A width
        # specifier counts escape characters too, so padding an already
        # coloured string left highlighted cells narrower than plain ones.
        s_cell = f'{s:>6.3f}'
        s_str = f'{c.G}{c.B}{s_cell}{c.RST}' if s == best else s_cell

        if i == 0:
            # The baseline row has nothing to compare against.
            d_str = f'{c.D}{"—":>7}{c.RST}'
            p_str = f'{c.D}{"—":>8}{c.RST}'
            icon = ''
        else:
            d = s - history[i - 1]['score']
            pct = ((s - base) / base * 100) if base > 0 else 0
            dc = c.G if d > 0 else (c.R if d < 0 else c.Y)
            d_str = f'{dc}{d:>+7.3f}{c.RST}'
            p_str = f'{dc}{pct:>+7.1f}%{c.RST}'
            # The star is reserved for the latest entry when it is also the best.
            icon = trend_icon(d, i == last and s == best, c)

        if passing is not None and total is not None:
            pass_str = f'{passing}/{total}'
        else:
            pass_str = '—'

        if errors is not None:
            e_cell = str(errors).rjust(3)
            e_str = f'{c.R}{e_cell}{c.RST}' if errors > 0 else f'{c.G}{e_cell}{c.RST}'
        else:
            e_str = '—'.rjust(3)

        tok_str = fmt_tokens(tokens)
        lat_str = f'{latency}ms' if latency else '—'

        lines.append(f' {v:<10}{s_str} {d_str} {p_str} {pass_str:>5} {e_str} {tok_str:>6} {lat_str:>6} {icon}')

    return '\n'.join(lines)
146
+
147
+
148
def render_evaluator_breakdown(history, config, best_results, c):
    """Build the "PER-EVALUATOR BREAKDOWN" section.

    Two data sources are supported.  If any history entry carries a
    non-empty ``per_evaluator`` mapping, each configured evaluator gets a
    baseline → latest row with delta, bar and sparkline.  Otherwise, if
    ``best_results`` is provided, per-example scores are averaged per
    evaluator.

    Args:
        history: List of history entries.
        config: Parsed config dict; ``evaluators`` lists the evaluator names.
        best_results: Parsed best_results.json or ``None``; only its
            ``per_example`` → ``scores`` mappings are read.
        c: Colour holder.

    Returns:
        The newline-joined section, or ``None`` when there are no configured
        evaluators or no data source is available.
    """
    evaluators = config.get('evaluators', [])
    if not evaluators:
        return None

    has_per_eval = any(h.get('per_evaluator') for h in history)
    if not has_per_eval and not best_results:
        return None

    W = 70
    rule = f' {c.D}{"─" * W}{c.RST}'
    lines = [f' {c.B}PER-EVALUATOR BREAKDOWN{c.RST}', rule]

    if has_per_eval:
        lines.append(f' {c.D}{"Evaluator":<20}{"Base":>6}{"Best":>7}{"Δ":>7} {"":20} Trend{c.RST}')
        lines.append(rule)

        for ev in evaluators:
            # `or {}` tolerates entries whose per_evaluator is explicitly
            # null, matching the truthiness probe above; entries without
            # data for this evaluator count as 0.
            vals = [(h.get('per_evaluator') or {}).get(ev, 0) for h in history]
            bv = vals[0]
            # The "Best" column shows the value from the last history entry.
            best_v = vals[-1]
            delta = best_v - bv
            # Unchanged values are yellow, as in the other tables; red is
            # reserved for actual drops.
            dc = c.G if delta > 0 else (c.R if delta < 0 else c.Y)

            lines.append(
                f' {ev:<20}{bv:>5.2f} → {dc}{c.B}{best_v:.2f}{c.RST}'
                f' {dc}{delta:>+6.2f}{c.RST}'
                f' {hbar(best_v, 20, c)}'
                f' {sparkline(vals)}'
            )
    else:
        # Reached only with a truthy best_results (see the early return).
        lines.append(f' {c.D}{"Evaluator":<20}{"Avg Score":>10} {"":20}{c.RST}')
        lines.append(rule)

        eval_scores = {}
        for ex_data in best_results.get('per_example', {}).values():
            for ev_name, ev_score in ex_data.get('scores', {}).items():
                eval_scores.setdefault(ev_name, []).append(ev_score)

        for ev in evaluators:
            if ev in eval_scores:
                avg = sum(eval_scores[ev]) / len(eval_scores[ev])
                lines.append(f' {ev:<20}{avg:>9.3f} {hbar(avg, 20, c)}')

    return '\n'.join(lines)
196
+
197
+
198
def render_what_changed(history, c):
    """Build the "WHAT CHANGED" narrative, one row per non-baseline entry.

    Each row shows the version, a trend marker, the score delta against the
    previous entry, the ``approach`` text truncated to 42 characters (``—``
    when absent) and, if present, the ``lens`` in brackets.

    Returns:
        The newline-joined section, or ``None`` when no entry after the
        first has an ``approach``.
    """
    if not any(entry.get('approach') for entry in history[1:]):
        return None

    W = 70
    best_score = max(entry['score'] for entry in history)
    last = len(history) - 1

    rows = [
        f' {c.B}WHAT CHANGED{c.RST}',
        f' {c.D}{"─" * W}{c.RST}',
    ]

    # Walk consecutive (previous, current) pairs; this skips the baseline.
    for i, (prev, h) in enumerate(zip(history, history[1:]), start=1):
        d = h['score'] - prev['score']
        if d > 0:
            dc = c.G
        elif d < 0:
            dc = c.R
        else:
            dc = c.Y
        icon = trend_icon(d, i == last and h['score'] == best_score, c)
        approach = (h.get('approach') or '—')[:42]
        lens = h.get('lens', '')
        lens_str = f' {c.D}[{lens}]{c.RST}' if lens else ''
        rows.append(f' {h["version"]:<6} {icon} {dc}{d:>+.3f}{c.RST} {approach:<42}{lens_str}')

    return '\n'.join(rows)
221
+
222
+
223
def render_bar_chart(history, scores, c):
    """Build the "SCORE CHART" section: one horizontal bar per history entry.

    Args:
        history: List of history entries with ``version`` and ``score``.
        scores: Scores in history order, used to locate the best entry
            (the first occurrence of the maximum).
        c: Colour holder.

    Returns:
        The newline-joined section: title, rule, one bar row per entry and a
        two-line 0 … 1.0 axis.  Bars are drawn on a 0–1 scale; scores
        outside that range are clamped to an empty or full bar.
    """
    best_idx = scores.index(max(scores))
    BAR_W = 40
    W = 70

    lines = [
        f' {c.B}SCORE CHART{c.RST}',
        f' {c.D}{"─" * W}{c.RST}',
    ]

    for i, h in enumerate(history):
        v = h['version']
        s = h['score']
        # Clamp so a score above 1 or below 0 cannot make the bar longer
        # than BAR_W and push it past the axis.
        filled = max(0, min(BAR_W, round(s * BAR_W)))

        if i == best_idx:
            fill_color = c.G
            score_str = f'{c.G}{c.B}{s:.3f}{c.RST}'
        elif i == 0:
            # The baseline is cyan unless it is also the best entry.
            fill_color = c.C
            score_str = f'{c.C}{s:.3f}{c.RST}'
        else:
            fill_color = c.G
            score_str = f'{s:.3f}'

        bar_str = f'{fill_color}{"█" * filled}{c.D}{"░" * (BAR_W - filled)}{c.RST}'
        lines.append(f' {v:<10}{bar_str} {score_str}')

    lines.append(f' {c.D}{" " * 10}|{" " * 9}|{" " * 9}|{" " * 9}|{" " * 9}|{c.RST}')
    lines.append(f' {c.D}{" " * 10}0{" " * 8}.25{" " * 7}.50{" " * 7}.75{" " * 8}1.0{c.RST}')

    return '\n'.join(lines)
254
+
255
+
256
def main():
    """Command-line entry point: load the config and print or save the chart.

    Exits with status 1 (message on stderr) when the config file is missing
    or has no ``history``.  ``best_results.json`` is read from
    ``--best-results`` or, by default, from the config file's directory, and
    is skipped silently when absent.  Colours are used only when writing to
    an interactive stdout without ``--no-color`` or ``--output``.
    """
    parser = argparse.ArgumentParser(description="Render evolution progress chart")
    parser.add_argument("--config", default=".evolver.json", help="Path to .evolver.json")
    parser.add_argument("--best-results", default=None, help="Path to best_results.json (auto-detected if not set)")
    parser.add_argument("--no-color", action="store_true", help="Disable ANSI colors")
    parser.add_argument("--output", default=None, help="Write output to file instead of stdout")
    args = parser.parse_args()

    if not os.path.exists(args.config):
        print(f"Config not found: {args.config}", file=sys.stderr)
        sys.exit(1)

    # Explicit UTF-8: the locale default encoding is not UTF-8 everywhere.
    with open(args.config, encoding='utf-8') as f:
        config = json.load(f)

    history = config.get('history', [])
    if not history:
        print("No history data in config.", file=sys.stderr)
        sys.exit(1)

    best_results = None
    br_path = args.best_results or os.path.join(os.path.dirname(args.config) or '.', 'best_results.json')
    if os.path.exists(br_path):
        with open(br_path, encoding='utf-8') as f:
            best_results = json.load(f)

    use_color = not args.no_color and sys.stdout.isatty() and args.output is None
    c = Colors(enabled=use_color)

    scores = [h['score'] for h in history]

    sections = [
        '',
        render_header(config, history, scores, c),
        '',
        render_score_table(history, scores, c),
        '',
        render_evaluator_breakdown(history, config, best_results, c),
        '',
        render_what_changed(history, c),
        '',
        render_bar_chart(history, scores, c),
        '',
    ]

    # Optional sections return None when they have nothing to show.
    output = '\n'.join(s for s in sections if s is not None)

    if args.output:
        # The chart is made of box-drawing and block characters, so the file
        # must be written as UTF-8; a legacy locale encoding would raise
        # UnicodeEncodeError here.
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(output + '\n')
        print(f"Chart written to {args.output}", file=sys.stderr)
    else:
        print(output)


if __name__ == "__main__":
    main()