harness-evolver 2.9.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -117
- package/agents/evolver-architect.md +53 -0
- package/agents/evolver-critic.md +44 -0
- package/agents/evolver-proposer.md +128 -0
- package/agents/evolver-testgen.md +67 -0
- package/bin/install.js +181 -171
- package/package.json +7 -7
- package/skills/deploy/SKILL.md +49 -56
- package/skills/evolve/SKILL.md +180 -700
- package/skills/setup/SKILL.md +182 -0
- package/skills/status/SKILL.md +23 -21
- package/tools/read_results.py +240 -0
- package/tools/run_eval.py +202 -0
- package/tools/seed_from_traces.py +36 -8
- package/tools/setup.py +393 -0
- package/tools/trace_insights.py +86 -14
- package/agents/harness-evolver-architect.md +0 -173
- package/agents/harness-evolver-critic.md +0 -132
- package/agents/harness-evolver-judge.md +0 -110
- package/agents/harness-evolver-proposer.md +0 -317
- package/agents/harness-evolver-testgen.md +0 -112
- package/examples/classifier/README.md +0 -25
- package/examples/classifier/config.json +0 -3
- package/examples/classifier/eval.py +0 -58
- package/examples/classifier/harness.py +0 -111
- package/examples/classifier/tasks/task_001.json +0 -1
- package/examples/classifier/tasks/task_002.json +0 -1
- package/examples/classifier/tasks/task_003.json +0 -1
- package/examples/classifier/tasks/task_004.json +0 -1
- package/examples/classifier/tasks/task_005.json +0 -1
- package/examples/classifier/tasks/task_006.json +0 -1
- package/examples/classifier/tasks/task_007.json +0 -1
- package/examples/classifier/tasks/task_008.json +0 -1
- package/examples/classifier/tasks/task_009.json +0 -1
- package/examples/classifier/tasks/task_010.json +0 -1
- package/skills/architect/SKILL.md +0 -93
- package/skills/compare/SKILL.md +0 -73
- package/skills/critic/SKILL.md +0 -67
- package/skills/diagnose/SKILL.md +0 -96
- package/skills/import-traces/SKILL.md +0 -102
- package/skills/init/SKILL.md +0 -253
- package/tools/__pycache__/detect_stack.cpython-313.pyc +0 -0
- package/tools/__pycache__/init.cpython-313.pyc +0 -0
- package/tools/__pycache__/seed_from_traces.cpython-313.pyc +0 -0
- package/tools/__pycache__/trace_logger.cpython-313.pyc +0 -0
- package/tools/eval_llm_judge.py +0 -233
- package/tools/eval_passthrough.py +0 -55
- package/tools/evaluate.py +0 -255
- package/tools/import_traces.py +0 -229
- package/tools/init.py +0 -531
- package/tools/llm_api.py +0 -125
- package/tools/state.py +0 -219
- package/tools/test_growth.py +0 -230
- package/tools/trace_logger.py +0 -42
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: evolver:setup
|
|
3
|
+
description: "Use when the user wants to set up the evolver in their project, optimize an LLM agent, improve agent performance, or mentions evolver for the first time in a project without .evolver.json."
|
|
4
|
+
argument-hint: "[directory]"
|
|
5
|
+
allowed-tools: [Read, Write, Edit, Bash, Glob, Grep, Agent, AskUserQuestion]
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# /evolver:setup
|
|
9
|
+
|
|
10
|
+
Set up the Harness Evolver v3 in a project. Explores the codebase, configures LangSmith, runs baseline evaluation.
|
|
11
|
+
|
|
12
|
+
## Prerequisites
|
|
13
|
+
|
|
14
|
+
- `LANGSMITH_API_KEY` must be set. If not: "Set your LangSmith API key: `export LANGSMITH_API_KEY=lsv2_pt_...`"
|
|
15
|
+
- Python 3.10+ with `langsmith` and `openevals` packages. If missing:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install langsmith openevals 2>/dev/null || uv pip install langsmith openevals
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Resolve Tool Path
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
TOOLS=$([ -d ".evolver/tools" ] && echo ".evolver/tools" || echo "$HOME/.evolver/tools")
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Phase 1: Explore Project (automatic)
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
find . -maxdepth 3 -type f -name "*.py" | head -30
|
|
31
|
+
python3 $TOOLS/detect_stack.py .
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Look for:
|
|
35
|
+
- Entry points: files with `if __name__`, or named `main.py`, `app.py`, `agent.py`, `graph.py`, `pipeline.py`
|
|
36
|
+
- Framework: LangGraph, CrewAI, OpenAI SDK, Anthropic SDK, etc.
|
|
37
|
+
- Existing LangSmith config: `LANGCHAIN_PROJECT` / `LANGSMITH_PROJECT` in env or `.env`
|
|
38
|
+
- Existing test data: JSON files with inputs, CSV files, etc.
|
|
39
|
+
- Dependencies: `requirements.txt`, `pyproject.toml`
|
|
40
|
+
|
|
41
|
+
Identify the **run command** — how to execute the agent. Examples:
|
|
42
|
+
- `python main.py` (if it accepts `--input` flag)
|
|
43
|
+
- `python -c "from agent import run; import json,sys; print(json.dumps(run(json.load(open(sys.argv[1])))))"`
|
|
44
|
+
|
|
45
|
+
## Phase 2: Confirm Detection (interactive)
|
|
46
|
+
|
|
47
|
+
Use AskUserQuestion:
|
|
48
|
+
|
|
49
|
+
```json
|
|
50
|
+
{
|
|
51
|
+
"questions": [{
|
|
52
|
+
"question": "Here's what I detected. Does this look right?\n\nEntry point: {path}\nFramework: {framework}\nRun command: {command}\nLangSmith: {status}",
|
|
53
|
+
"header": "Confirm",
|
|
54
|
+
"multiSelect": false,
|
|
55
|
+
"options": [
|
|
56
|
+
{"label": "Looks good, proceed", "description": "Continue with detected configuration"},
|
|
57
|
+
{"label": "Let me adjust", "description": "I'll provide correct paths and commands"},
|
|
58
|
+
{"label": "Wrong directory", "description": "I need to cd somewhere else first"}
|
|
59
|
+
]
|
|
60
|
+
}]
|
|
61
|
+
}
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Phase 3: What to Optimize (interactive)
|
|
65
|
+
|
|
66
|
+
Use AskUserQuestion:
|
|
67
|
+
|
|
68
|
+
```json
|
|
69
|
+
{
|
|
70
|
+
"questions": [{
|
|
71
|
+
"question": "What do you want to optimize?",
|
|
72
|
+
"header": "Goals",
|
|
73
|
+
"multiSelect": true,
|
|
74
|
+
"options": [
|
|
75
|
+
{"label": "Accuracy", "description": "Correctness of outputs — LLM-as-judge evaluator"},
|
|
76
|
+
{"label": "Latency", "description": "Response time — track and minimize"},
|
|
77
|
+
{"label": "Token efficiency", "description": "Fewer tokens for same quality"},
|
|
78
|
+
{"label": "Error handling", "description": "Reduce failures, timeouts, crashes"}
|
|
79
|
+
]
|
|
80
|
+
}]
|
|
81
|
+
}
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Map selections to evaluator configuration for setup.py.
|
|
85
|
+
|
|
86
|
+
## Phase 4: Test Data Source (interactive)
|
|
87
|
+
|
|
88
|
+
Use AskUserQuestion with **preview**:
|
|
89
|
+
|
|
90
|
+
```json
|
|
91
|
+
{
|
|
92
|
+
"questions": [{
|
|
93
|
+
"question": "Where should test inputs come from?",
|
|
94
|
+
"header": "Test data",
|
|
95
|
+
"multiSelect": false,
|
|
96
|
+
"options": [
|
|
97
|
+
{
|
|
98
|
+
"label": "Import from LangSmith",
|
|
99
|
+
"description": "Use real production traces as test inputs",
|
|
100
|
+
"preview": "## Import from LangSmith\n\nFetches up to 100 recent traces from your production project.\nPrioritizes traces with negative feedback.\nCreates a LangSmith Dataset with real user inputs.\n\nRequires: an existing LangSmith project with traces."
|
|
101
|
+
},
|
|
102
|
+
{
|
|
103
|
+
"label": "Generate from code",
|
|
104
|
+
"description": "AI generates test inputs by analyzing your code",
|
|
105
|
+
"preview": "## Generate from Code\n\nThe testgen agent reads your source code and generates\n30 diverse test inputs:\n- 40% standard cases\n- 20% edge cases\n- 20% cross-domain\n- 20% adversarial\n\nOutputs are scored by LLM-as-judge."
|
|
106
|
+
},
|
|
107
|
+
{
|
|
108
|
+
"label": "I have test data",
|
|
109
|
+
"description": "Point to an existing file with test inputs",
|
|
110
|
+
"preview": "## Provide Test Data\n\nSupported formats:\n- JSON array of inputs\n- JSON with {\"inputs\": {...}} objects\n- CSV with input columns\n\nExample:\n```json\n[\n {\"input\": \"What is Python?\"},\n {\"input\": \"Explain quantum computing\"}\n]\n```"
|
|
111
|
+
}
|
|
112
|
+
]
|
|
113
|
+
}]
|
|
114
|
+
}
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
If "Import from LangSmith": discover projects and ask which one (same as v2 Phase 1.9).
|
|
118
|
+
If "I have test data": ask for file path.
|
|
119
|
+
|
|
120
|
+
## Phase 5: Run Setup
|
|
121
|
+
|
|
122
|
+
Build the setup.py command based on all gathered information:
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
python3 $TOOLS/setup.py \
|
|
126
|
+
--project-name "{project_name}" \
|
|
127
|
+
--entry-point "{run_command}" \
|
|
128
|
+
--framework "{framework}" \
|
|
129
|
+
--goals "{goals_csv}" \
|
|
130
|
+
${DATASET_FROM_FILE:+--dataset-from-file "$DATASET_FROM_FILE"} \
|
|
131
|
+
${DATASET_FROM_LANGSMITH:+--dataset-from-langsmith "$DATASET_FROM_LANGSMITH"} \
|
|
132
|
+
${PRODUCTION_PROJECT:+--production-project "$PRODUCTION_PROJECT"}
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
If "Generate from code" was selected AND no test data file exists, first spawn the testgen agent to generate inputs, then pass the generated file to setup.py.
|
|
136
|
+
|
|
137
|
+
## Phase 6: Generate Test Data (if needed)
|
|
138
|
+
|
|
139
|
+
If testgen is needed, spawn it:
|
|
140
|
+
|
|
141
|
+
```
|
|
142
|
+
Agent(
|
|
143
|
+
subagent_type: "evolver-testgen",
|
|
144
|
+
description: "TestGen: generate test inputs",
|
|
145
|
+
prompt: |
|
|
146
|
+
<objective>
|
|
147
|
+
Generate 30 diverse test inputs for this project.
|
|
148
|
+
Write them as a JSON array to test_inputs.json.
|
|
149
|
+
</objective>
|
|
150
|
+
|
|
151
|
+
<files_to_read>
|
|
152
|
+
{all .py files discovered in Phase 1}
|
|
153
|
+
</files_to_read>
|
|
154
|
+
|
|
155
|
+
<output>
|
|
156
|
+
Create test_inputs.json with format:
|
|
157
|
+
[{"input": "..."}, {"input": "..."}, ...]
|
|
158
|
+
</output>
|
|
159
|
+
)
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
Then pass `--dataset-from-file test_inputs.json` to setup.py.
|
|
163
|
+
|
|
164
|
+
## Phase 7: Report
|
|
165
|
+
|
|
166
|
+
```
|
|
167
|
+
Setup complete!
|
|
168
|
+
Project: evolver-{name}
|
|
169
|
+
Dataset: {name}-eval-v1 ({N} examples)
|
|
170
|
+
Evaluators: {list}
|
|
171
|
+
Baseline score: {score}
|
|
172
|
+
Config: .evolver.json
|
|
173
|
+
|
|
174
|
+
Next: run /evolver:evolve to start optimizing.
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Gotchas
|
|
178
|
+
|
|
179
|
+
- If `.evolver.json` already exists, ask before overwriting.
|
|
180
|
+
- If the agent needs a venv, the run command should activate it: `cd {dir} && .venv/bin/python main.py`
|
|
181
|
+
- If LangSmith connection fails, check API key and network.
|
|
182
|
+
- The setup installs `langsmith` and `openevals` if missing.
|
package/skills/status/SKILL.md
CHANGED
|
@@ -1,34 +1,36 @@
|
|
|
1
1
|
---
|
|
2
|
-
name:
|
|
3
|
-
description: "Use when the user asks about evolution progress, current scores, best
|
|
2
|
+
name: evolver:status
|
|
3
|
+
description: "Use when the user asks about evolution progress, current scores, best version, how many iterations ran, or whether the loop is stagnating."
|
|
4
4
|
allowed-tools: [Read, Bash]
|
|
5
5
|
---
|
|
6
6
|
|
|
7
|
-
# /
|
|
7
|
+
# /evolver:status
|
|
8
8
|
|
|
9
|
-
Show evolution progress.
|
|
10
|
-
|
|
11
|
-
## Resolve Tool Path
|
|
12
|
-
|
|
13
|
-
```bash
|
|
14
|
-
TOOLS=$([ -d ".harness-evolver/tools" ] && echo ".harness-evolver/tools" || echo "$HOME/.harness-evolver/tools")
|
|
15
|
-
```
|
|
9
|
+
Show current evolution progress.
|
|
16
10
|
|
|
17
11
|
## What To Do
|
|
18
12
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
Otherwise:
|
|
13
|
+
Read `.evolver.json` and report:
|
|
22
14
|
|
|
23
15
|
```bash
|
|
24
|
-
python3
|
|
16
|
+
python3 -c "
|
|
17
|
+
import json
|
|
18
|
+
c = json.load(open('.evolver.json'))
|
|
19
|
+
print(f'Project: {c[\"project\"]}')
|
|
20
|
+
print(f'Dataset: {c[\"dataset\"]}')
|
|
21
|
+
print(f'Framework: {c[\"framework\"]}')
|
|
22
|
+
print(f'Evaluators: {c[\"evaluators\"]}')
|
|
23
|
+
print(f'Iterations: {c[\"iterations\"]}')
|
|
24
|
+
print(f'Best: {c[\"best_experiment\"]} (score: {c[\"best_score\"]:.3f})')
|
|
25
|
+
print(f'Baseline: {c[\"history\"][0][\"score\"]:.3f}' if c['history'] else 'No baseline')
|
|
26
|
+
print()
|
|
27
|
+
print('History:')
|
|
28
|
+
for h in c.get('history', []):
|
|
29
|
+
print(f' {h[\"version\"]}: {h[\"score\"]:.3f}')
|
|
30
|
+
"
|
|
25
31
|
```
|
|
26
32
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
## If User Wants More Detail
|
|
33
|
+
Detect stagnation: if last 3 scores are within 1% of each other, warn.
|
|
34
|
+
Detect regression: if current best is lower than a previous best, warn.
|
|
30
35
|
|
|
31
|
-
|
|
32
|
-
- What changed: `cat .harness-evolver/harnesses/{version}/proposal.md`
|
|
33
|
-
- Compare two versions: `diff .harness-evolver/harnesses/{vA}/harness.py .harness-evolver/harnesses/{vB}/harness.py`
|
|
34
|
-
- Full history: `cat .harness-evolver/PROPOSER_HISTORY.md`
|
|
36
|
+
Print LangSmith URL for the best experiment if available.
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Read LangSmith experiment results for Harness Evolver v3.
|
|
3
|
+
|
|
4
|
+
Reads experiment results from LangSmith and formats them for agents
|
|
5
|
+
(proposer, critic, architect). Handles comparison between candidates.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
python3 read_results.py \
|
|
9
|
+
--experiments v001a,v001b,v001c,v001d,v001e \
|
|
10
|
+
--config .evolver.json \
|
|
11
|
+
[--output results.json]
|
|
12
|
+
|
|
13
|
+
python3 read_results.py \
|
|
14
|
+
--experiment v001a \
|
|
15
|
+
--config .evolver.json \
|
|
16
|
+
--format markdown
|
|
17
|
+
|
|
18
|
+
Requires: pip install langsmith
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
import argparse
|
|
22
|
+
import json
|
|
23
|
+
import os
|
|
24
|
+
import sys
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def read_experiment(client, experiment_name):
    """Read results from a single LangSmith experiment.

    Returns a summary dict with combined_score, token/latency/error totals
    and a per-example breakdown; returns None when the experiment has no
    runs. On API failure, returns a dict carrying "error" and a
    combined_score of 0.0 so callers can still rank it.
    """
    try:
        # Root runs only: one run per dataset example.
        runs = list(client.list_runs(
            project_name=experiment_name,
            is_root=True,
            limit=200,
        ))

        if not runs:
            return None

        # Fetch feedback for every run in a single API call (the previous
        # per-run list_feedback call was an N+1 request pattern), then
        # group scores by run id.
        feedback_by_run = {}
        for fb in client.list_feedback(run_ids=[run.id for run in runs]):
            if fb.score is not None:
                feedback_by_run.setdefault(str(fb.run_id), {})[fb.key] = fb.score

        per_example = {}
        total_tokens = 0
        total_latency_ms = 0
        errors = 0

        for run in runs:
            # Key by the dataset example when available so candidates can
            # be compared per-task; fall back to the run id.
            example_id = str(run.reference_example_id or run.id)
            tokens = run.total_tokens or 0
            total_tokens += tokens

            latency_ms = 0
            if run.end_time and run.start_time:
                latency_ms = int((run.end_time - run.start_time).total_seconds() * 1000)
            total_latency_ms += latency_ms

            if run.error:
                errors += 1

            scores = feedback_by_run.get(str(run.id), {})

            per_example[example_id] = {
                # Mean across evaluator feedback keys; 0.0 when unscored.
                "score": sum(scores.values()) / len(scores) if scores else 0.0,
                "scores": scores,
                "tokens": tokens,
                "latency_ms": latency_ms,
                "error": run.error[:200] if run.error else None,
                "input_preview": str(run.inputs)[:200] if run.inputs else "",
                "output_preview": str(run.outputs)[:200] if run.outputs else "",
            }

        num_examples = len(per_example)
        all_scores = [v["score"] for v in per_example.values()]
        combined_score = sum(all_scores) / len(all_scores) if all_scores else 0.0

        return {
            "experiment": experiment_name,
            "combined_score": combined_score,
            "num_examples": num_examples,
            "total_tokens": total_tokens,
            "avg_latency_ms": total_latency_ms // max(num_examples, 1),
            "error_count": errors,
            "error_rate": errors / max(num_examples, 1),
            "per_example": per_example,
        }

    except Exception as e:
        # Surface the failure to the caller instead of crashing the whole
        # comparison; score 0.0 keeps the ranking code simple.
        return {"experiment": experiment_name, "error": str(e), "combined_score": 0.0}
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def compare_experiments(results_list):
    """Compare multiple experiment results.

    Picks the overall winner by combined_score, plus a per-task "champion":
    the runner-up that beats the winner on the largest number of individual
    examples. Returns None for an empty input list.
    """
    if not results_list:
        return None

    # Prefer experiments that completed with a nonzero score; if none did,
    # rank whatever we have (including errored ones).
    usable = [r for r in results_list if "error" not in r and r.get("combined_score", 0) > 0]
    if not usable:
        usable = results_list

    winner = max(usable, key=lambda r: r.get("combined_score", 0))
    baseline = winner.get("per_example", {})

    # Count, for each non-winner, how many individual examples it beat the
    # winner on.
    task_wins = {}
    for candidate in usable:
        if candidate["experiment"] == winner["experiment"]:
            continue
        beats = sum(
            1
            for eid, entry in candidate.get("per_example", {}).items()
            if entry.get("score", 0) > baseline.get(eid, {}).get("score", 0)
        )
        if beats:
            task_wins[candidate["experiment"]] = beats

    champion = None
    if task_wins:
        best_name = max(task_wins, key=task_wins.get)
        champion = {"experiment": best_name, "task_wins": task_wins[best_name]}

    summary = [
        {
            "experiment": r["experiment"],
            "score": r.get("combined_score", 0),
            "tokens": r.get("total_tokens", 0),
            "latency_ms": r.get("avg_latency_ms", 0),
            "errors": r.get("error_count", 0),
        }
        for r in results_list
    ]

    return {
        "winner": {
            "experiment": winner["experiment"],
            "score": winner["combined_score"],
        },
        "champion": champion,
        "all_candidates": summary,
    }
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def format_markdown(results):
    """Render a single experiment's results as markdown for agent consumption.

    Produces a header with aggregate metrics, followed by a "Failing
    Examples" section listing every example scoring below 0.5 (worst first).
    """
    out = [f"# Experiment Results: {results['experiment']}", ""]
    out += [
        f"**Combined Score**: {results.get('combined_score', 0):.3f}",
        f"**Examples**: {results.get('num_examples', 0)}",
        f"**Total Tokens**: {results.get('total_tokens', 0)}",
        f"**Avg Latency**: {results.get('avg_latency_ms', 0)}ms",
        f"**Errors**: {results.get('error_count', 0)} ({results.get('error_rate', 0):.1%})",
        "",
    ]

    per_example = results.get("per_example", {})
    if per_example:
        # Only examples below the 0.5 threshold are worth showing to agents.
        failing = {k: v for k, v in per_example.items() if v.get("score", 0) < 0.5}
        if failing:
            out.append("## Failing Examples")
            out.append("")
            # Worst scores first so the proposer sees the biggest gaps.
            for eid, data in sorted(failing.items(), key=lambda item: item[1].get("score", 0)):
                out.append(f"- **{eid}**: score={data['score']:.2f}")
                if data.get("error"):
                    out.append(f"  Error: {data['error']}")
                out.append(f"  Input: {data.get('input_preview', 'N/A')}")
            out.append("")

    return "\n".join(out)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def main():
    """CLI entry point: read one experiment, or compare several."""
    parser = argparse.ArgumentParser(description="Read LangSmith experiment results")
    parser.add_argument("--experiments", default=None, help="Comma-separated experiment names to compare")
    parser.add_argument("--experiment", default=None, help="Single experiment to read")
    parser.add_argument("--config", default=".evolver.json", help="Path to .evolver.json")
    parser.add_argument("--output", default=None, help="Output JSON path")
    parser.add_argument("--format", default="json", choices=["json", "markdown"], help="Output format")
    args = parser.parse_args()

    from langsmith import Client
    client = Client()

    if args.experiment:
        # Single-experiment mode: dump one result, optionally as markdown.
        data = read_experiment(client, args.experiment)
        if not data:
            print(f"No results found for experiment: {args.experiment}", file=sys.stderr)
            sys.exit(1)

        if args.format == "markdown":
            rendered = format_markdown(data)
        else:
            rendered = json.dumps(data, indent=2, default=str)

        if args.output:
            with open(args.output, "w") as f:
                f.write(rendered)
        print(rendered)

    elif args.experiments:
        # Comparison mode: read every named experiment, then rank them.
        gathered = []
        for name in (e.strip() for e in args.experiments.split(",")):
            print(f"Reading experiment: {name}...", file=sys.stderr)
            data = read_experiment(client, name)
            if data:
                gathered.append(data)

        if not gathered:
            print("No experiment results found.", file=sys.stderr)
            sys.exit(1)

        rendered = json.dumps({
            "comparison": compare_experiments(gathered),
            "experiments": gathered,
        }, indent=2, default=str)

        if args.output:
            with open(args.output, "w") as f:
                f.write(rendered)
        print(rendered)

    else:
        print("Provide --experiment or --experiments", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Run LangSmith evaluation for a candidate in a worktree.
|
|
3
|
+
|
|
4
|
+
Wraps client.evaluate() — runs the user's agent against the dataset
|
|
5
|
+
with configured evaluators, from within a specific directory (worktree).
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
python3 run_eval.py \
|
|
9
|
+
--config .evolver.json \
|
|
10
|
+
--worktree-path /tmp/worktree-abc \
|
|
11
|
+
--experiment-prefix v001a \
|
|
12
|
+
[--timeout 120]
|
|
13
|
+
|
|
14
|
+
Requires: pip install langsmith openevals
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import json
|
|
19
|
+
import os
|
|
20
|
+
import subprocess
|
|
21
|
+
import sys
|
|
22
|
+
import tempfile
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def make_target(entry_point, cwd):
    """Create a LangSmith evaluation target that shells out to the agent.

    entry_point is the user's run command; "{input}" is replaced with a
    temp-file path holding the inputs JSON, "{input_json}" with the raw
    JSON string, and if neither placeholder is present, "--input"/"--output"
    flags are appended. The returned callable runs the command in *cwd* and
    returns the parsed output (output file, then stdout, then an error dict).
    """
    def target(inputs):
        input_json = json.dumps(inputs)
        # mkstemp instead of the deprecated tempfile.mktemp: the file is
        # created atomically, so there is no symlink/reuse race on the path.
        fd, input_path = tempfile.mkstemp(suffix=".json")
        output_path = input_path + ".out"

        with os.fdopen(fd, "w") as f:
            f.write(input_json)

        try:
            cmd = entry_point
            if "{input}" in cmd:
                cmd = cmd.replace("{input}", input_path)
            elif "{input_json}" in cmd:
                cmd = cmd.replace("{input_json}", input_json)
            else:
                cmd = f"{cmd} --input {input_path} --output {output_path}"

            env = os.environ.copy()
            # Ensure traces go to the evolver project
            env["LANGSMITH_TRACING"] = "true"

            # NOTE(review): shell=True runs the user-configured command
            # string through the shell by design; entry_point comes from
            # .evolver.json, not from untrusted input.
            result = subprocess.run(
                cmd, shell=True, capture_output=True, text=True,
                timeout=int(os.environ.get("EVAL_TASK_TIMEOUT", "120")),
                cwd=cwd, env=env,
            )

            # Preferred channel: a JSON output file written by the agent.
            if os.path.exists(output_path):
                with open(output_path) as f:
                    try:
                        return json.load(f)
                    except json.JSONDecodeError:
                        pass

            # Fallback: parse stdout as JSON, else wrap it as plain text.
            if result.stdout.strip():
                try:
                    return json.loads(result.stdout)
                except json.JSONDecodeError:
                    return {"output": result.stdout.strip()}

            # No output at all plus a nonzero exit: report stderr.
            if result.returncode != 0:
                return {"output": "", "error": result.stderr.strip()[:500]}

            return {"output": ""}

        except subprocess.TimeoutExpired:
            return {"output": "", "error": f"TIMEOUT after {os.environ.get('EVAL_TASK_TIMEOUT', '120')}s"}
        except Exception as e:
            return {"output": "", "error": str(e)}
        finally:
            # Best-effort cleanup of both temp files.
            for p in (input_path, output_path):
                try:
                    os.remove(p)
                except OSError:
                    pass

    return target
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def load_evaluators(evaluator_keys):
    """Build the evaluator list for client.evaluate() from config key names.

    Supported keys: "correctness" and "conciseness" (openevals LLM-as-judge)
    plus the local "latency" and "token_efficiency" heuristics. Unknown keys
    are silently skipped, matching the original behavior.
    """
    evaluators = []
    for key in evaluator_keys:
        if key == "correctness":
            # openevals is imported lazily so evaluator sets that use only
            # the local heuristics work even when openevals is not installed.
            from openevals.llm import create_llm_as_judge
            from openevals.prompts import CORRECTNESS_PROMPT
            evaluators.append(create_llm_as_judge(
                prompt=CORRECTNESS_PROMPT,
                feedback_key="correctness",
                model="openai:gpt-4.1-mini",
            ))
        elif key == "conciseness":
            from openevals.llm import create_llm_as_judge
            from openevals.prompts import CONCISENESS_PROMPT
            evaluators.append(create_llm_as_judge(
                prompt=CONCISENESS_PROMPT,
                feedback_key="conciseness",
                model="openai:gpt-4.1-mini",
            ))
        elif key == "latency":
            def latency_eval(inputs, outputs, **kwargs):
                # NOTE(review): feedback key is "has_output", not "latency" —
                # presumably latency itself comes from run timing; confirm
                # before renaming the key.
                return {"key": "has_output", "score": 1.0 if outputs else 0.0}
            evaluators.append(latency_eval)
        elif key == "token_efficiency":
            def token_eval(inputs, outputs, **kwargs):
                # Proxy score: outputs up to ~2000 chars get 1.0; longer
                # outputs decay as 2000/len.
                output_text = str(outputs.get("output", outputs.get("answer", "")))
                score = min(1.0, 2000 / max(len(output_text), 1))
                return {"key": "token_efficiency", "score": score}
            evaluators.append(token_eval)

    return evaluators
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def main():
    """CLI entry point: evaluate one candidate worktree against the dataset.

    Reads the evolver config for the entry point, dataset name and evaluator
    keys, runs client.evaluate() with the worktree as the working directory,
    and prints a machine-readable JSON summary line followed by a human
    summary. Exits 1 (after printing an error JSON) when evaluation fails.
    """
    parser = argparse.ArgumentParser(description="Run LangSmith evaluation for a candidate")
    parser.add_argument("--config", default=".evolver.json", help="Path to .evolver.json")
    parser.add_argument("--worktree-path", required=True, help="Path to the candidate's worktree")
    parser.add_argument("--experiment-prefix", required=True, help="Experiment name prefix (e.g. v001a)")
    parser.add_argument("--timeout", type=int, default=120, help="Per-task timeout in seconds")
    args = parser.parse_args()

    with open(args.config) as f:
        config = json.load(f)

    # The per-task timeout travels via the environment because make_target's
    # inner target() reads EVAL_TASK_TIMEOUT at call time.
    os.environ["EVAL_TASK_TIMEOUT"] = str(args.timeout)

    from langsmith import Client
    client = Client()

    target = make_target(config["entry_point"], args.worktree_path)
    evaluators = load_evaluators(config["evaluators"])

    print(f"Running evaluation: {args.experiment_prefix}")
    print(f"  Dataset: {config['dataset']}")
    print(f"  Worktree: {args.worktree_path}")
    print(f"  Evaluators: {config['evaluators']}")

    try:
        results = client.evaluate(
            target,
            data=config["dataset"],
            evaluators=evaluators,
            experiment_prefix=args.experiment_prefix,
            max_concurrency=1,
        )

        experiment_name = results.experiment_name

        # Calculate mean score
        # NOTE(review): assumes each item of `results` exposes
        # `evaluation_results` as a dict with a "results" list of
        # {"score": ...} entries, and an `example` with an `id` — confirm
        # against the installed langsmith SDK version.
        scores = []
        per_example = {}
        for result in results:
            example_scores = []
            if result.evaluation_results and result.evaluation_results.get("results"):
                for er in result.evaluation_results["results"]:
                    if er.get("score") is not None:
                        example_scores.append(er["score"])
                        scores.append(er["score"])

            example_id = str(result.example.id) if result.example else "unknown"
            per_example[example_id] = {
                # Per-example mean across evaluators; 0.0 when unscored.
                "score": sum(example_scores) / len(example_scores) if example_scores else 0.0,
                "num_evaluators": len(example_scores),
            }

        # Overall mean over every individual evaluator score (not the
        # per-example means), so examples with more evaluators weigh more.
        mean_score = sum(scores) / len(scores) if scores else 0.0

        output = {
            "experiment": experiment_name,
            "prefix": args.experiment_prefix,
            "combined_score": mean_score,
            "num_examples": len(per_example),
            "num_scores": len(scores),
            "per_example": per_example,
        }

        # Machine-readable line first (consumed by the evolve loop), then a
        # human-readable summary.
        print(json.dumps(output))
        print(f"\nEvaluation complete: {mean_score:.3f} ({len(per_example)} examples)")

    except Exception as e:
        print(f"Evaluation failed: {e}", file=sys.stderr)
        # Emit the same JSON shape with a zero score so the caller can still
        # parse and rank the failed candidate.
        output = {
            "experiment": None,
            "prefix": args.experiment_prefix,
            "combined_score": 0.0,
            "error": str(e),
        }
        print(json.dumps(output))
        sys.exit(1)


if __name__ == "__main__":
    main()
|