harness-evolver 2.9.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/README.md +62 -117
  2. package/agents/evolver-architect.md +53 -0
  3. package/agents/evolver-critic.md +44 -0
  4. package/agents/evolver-proposer.md +128 -0
  5. package/agents/evolver-testgen.md +67 -0
  6. package/bin/install.js +181 -171
  7. package/package.json +7 -7
  8. package/skills/deploy/SKILL.md +49 -56
  9. package/skills/evolve/SKILL.md +180 -700
  10. package/skills/setup/SKILL.md +182 -0
  11. package/skills/status/SKILL.md +23 -21
  12. package/tools/read_results.py +240 -0
  13. package/tools/run_eval.py +202 -0
  14. package/tools/seed_from_traces.py +36 -8
  15. package/tools/setup.py +393 -0
  16. package/tools/trace_insights.py +86 -14
  17. package/agents/harness-evolver-architect.md +0 -173
  18. package/agents/harness-evolver-critic.md +0 -132
  19. package/agents/harness-evolver-judge.md +0 -110
  20. package/agents/harness-evolver-proposer.md +0 -317
  21. package/agents/harness-evolver-testgen.md +0 -112
  22. package/examples/classifier/README.md +0 -25
  23. package/examples/classifier/config.json +0 -3
  24. package/examples/classifier/eval.py +0 -58
  25. package/examples/classifier/harness.py +0 -111
  26. package/examples/classifier/tasks/task_001.json +0 -1
  27. package/examples/classifier/tasks/task_002.json +0 -1
  28. package/examples/classifier/tasks/task_003.json +0 -1
  29. package/examples/classifier/tasks/task_004.json +0 -1
  30. package/examples/classifier/tasks/task_005.json +0 -1
  31. package/examples/classifier/tasks/task_006.json +0 -1
  32. package/examples/classifier/tasks/task_007.json +0 -1
  33. package/examples/classifier/tasks/task_008.json +0 -1
  34. package/examples/classifier/tasks/task_009.json +0 -1
  35. package/examples/classifier/tasks/task_010.json +0 -1
  36. package/skills/architect/SKILL.md +0 -93
  37. package/skills/compare/SKILL.md +0 -73
  38. package/skills/critic/SKILL.md +0 -67
  39. package/skills/diagnose/SKILL.md +0 -96
  40. package/skills/import-traces/SKILL.md +0 -102
  41. package/skills/init/SKILL.md +0 -253
  42. package/tools/__pycache__/detect_stack.cpython-313.pyc +0 -0
  43. package/tools/__pycache__/init.cpython-313.pyc +0 -0
  44. package/tools/__pycache__/seed_from_traces.cpython-313.pyc +0 -0
  45. package/tools/__pycache__/trace_logger.cpython-313.pyc +0 -0
  46. package/tools/eval_llm_judge.py +0 -233
  47. package/tools/eval_passthrough.py +0 -55
  48. package/tools/evaluate.py +0 -255
  49. package/tools/import_traces.py +0 -229
  50. package/tools/init.py +0 -531
  51. package/tools/llm_api.py +0 -125
  52. package/tools/state.py +0 -219
  53. package/tools/test_growth.py +0 -230
  54. package/tools/trace_logger.py +0 -42
@@ -0,0 +1,182 @@
1
+ ---
2
+ name: evolver:setup
3
+ description: "Use when the user wants to set up the evolver in their project, optimize an LLM agent, improve agent performance, or mentions evolver for the first time in a project without .evolver.json."
4
+ argument-hint: "[directory]"
5
+ allowed-tools: [Read, Write, Edit, Bash, Glob, Grep, Agent, AskUserQuestion]
6
+ ---
7
+
8
+ # /evolver:setup
9
+
10
+ Set up the Harness Evolver v3 in a project. Explores the codebase, configures LangSmith, runs baseline evaluation.
11
+
12
+ ## Prerequisites
13
+
14
+ - `LANGSMITH_API_KEY` must be set. If not: "Set your LangSmith API key: `export LANGSMITH_API_KEY=lsv2_pt_...`"
15
+ - Python 3.10+ with `langsmith` and `openevals` packages. If missing:
16
+
17
+ ```bash
18
+ pip install langsmith openevals 2>/dev/null || uv pip install langsmith openevals
19
+ ```
20
+
21
+ ## Resolve Tool Path
22
+
23
+ ```bash
24
+ TOOLS=$([ -d ".evolver/tools" ] && echo ".evolver/tools" || echo "$HOME/.evolver/tools")
25
+ ```
26
+
27
+ ## Phase 1: Explore Project (automatic)
28
+
29
+ ```bash
30
+ find . -maxdepth 3 -type f -name "*.py" | head -30
31
+ python3 $TOOLS/detect_stack.py .
32
+ ```
33
+
34
+ Look for:
35
+ - Entry points: files with `if __name__`, or named `main.py`, `app.py`, `agent.py`, `graph.py`, `pipeline.py`
36
+ - Framework: LangGraph, CrewAI, OpenAI SDK, Anthropic SDK, etc.
37
+ - Existing LangSmith config: `LANGCHAIN_PROJECT` / `LANGSMITH_PROJECT` in env or `.env`
38
+ - Existing test data: JSON files with inputs, CSV files, etc.
39
+ - Dependencies: `requirements.txt`, `pyproject.toml`
40
+
41
+ Identify the **run command** — how to execute the agent. Examples:
42
+ - `python main.py` (if it accepts `--input` flag)
43
+ - `python -c "from agent import run; import json,sys; print(json.dumps(run(json.load(open(sys.argv[1])))))"`
44
+
45
+ ## Phase 2: Confirm Detection (interactive)
46
+
47
+ Use AskUserQuestion:
48
+
49
+ ```json
50
+ {
51
+ "questions": [{
52
+ "question": "Here's what I detected. Does this look right?\n\nEntry point: {path}\nFramework: {framework}\nRun command: {command}\nLangSmith: {status}",
53
+ "header": "Confirm",
54
+ "multiSelect": false,
55
+ "options": [
56
+ {"label": "Looks good, proceed", "description": "Continue with detected configuration"},
57
+ {"label": "Let me adjust", "description": "I'll provide correct paths and commands"},
58
+ {"label": "Wrong directory", "description": "I need to cd somewhere else first"}
59
+ ]
60
+ }]
61
+ }
62
+ ```
63
+
64
+ ## Phase 3: What to Optimize (interactive)
65
+
66
+ Use AskUserQuestion:
67
+
68
+ ```json
69
+ {
70
+ "questions": [{
71
+ "question": "What do you want to optimize?",
72
+ "header": "Goals",
73
+ "multiSelect": true,
74
+ "options": [
75
+ {"label": "Accuracy", "description": "Correctness of outputs — LLM-as-judge evaluator"},
76
+ {"label": "Latency", "description": "Response time — track and minimize"},
77
+ {"label": "Token efficiency", "description": "Fewer tokens for same quality"},
78
+ {"label": "Error handling", "description": "Reduce failures, timeouts, crashes"}
79
+ ]
80
+ }]
81
+ }
82
+ ```
83
+
84
+ Map selections to evaluator configuration for setup.py.
85
+
86
+ ## Phase 4: Test Data Source (interactive)
87
+
88
+ Use AskUserQuestion with **preview**:
89
+
90
+ ```json
91
+ {
92
+ "questions": [{
93
+ "question": "Where should test inputs come from?",
94
+ "header": "Test data",
95
+ "multiSelect": false,
96
+ "options": [
97
+ {
98
+ "label": "Import from LangSmith",
99
+ "description": "Use real production traces as test inputs",
100
+ "preview": "## Import from LangSmith\n\nFetches up to 100 recent traces from your production project.\nPrioritizes traces with negative feedback.\nCreates a LangSmith Dataset with real user inputs.\n\nRequires: an existing LangSmith project with traces."
101
+ },
102
+ {
103
+ "label": "Generate from code",
104
+ "description": "AI generates test inputs by analyzing your code",
105
+ "preview": "## Generate from Code\n\nThe testgen agent reads your source code and generates\n30 diverse test inputs:\n- 40% standard cases\n- 20% edge cases\n- 20% cross-domain\n- 20% adversarial\n\nOutputs are scored by LLM-as-judge."
106
+ },
107
+ {
108
+ "label": "I have test data",
109
+ "description": "Point to an existing file with test inputs",
110
+ "preview": "## Provide Test Data\n\nSupported formats:\n- JSON array of inputs\n- JSON with {\"inputs\": {...}} objects\n- CSV with input columns\n\nExample:\n```json\n[\n {\"input\": \"What is Python?\"},\n {\"input\": \"Explain quantum computing\"}\n]\n```"
111
+ }
112
+ ]
113
+ }]
114
+ }
115
+ ```
116
+
117
+ If "Import from LangSmith": discover projects and ask which one (same as v2 Phase 1.9).
118
+ If "I have test data": ask for file path.
119
+
120
+ ## Phase 5: Run Setup
121
+
122
+ Build the setup.py command based on all gathered information:
123
+
124
+ ```bash
125
+ python3 $TOOLS/setup.py \
126
+ --project-name "{project_name}" \
127
+ --entry-point "{run_command}" \
128
+ --framework "{framework}" \
129
+ --goals "{goals_csv}" \
130
+ ${DATASET_FROM_FILE:+--dataset-from-file "$DATASET_FROM_FILE"} \
131
+ ${DATASET_FROM_LANGSMITH:+--dataset-from-langsmith "$DATASET_FROM_LANGSMITH"} \
132
+ ${PRODUCTION_PROJECT:+--production-project "$PRODUCTION_PROJECT"}
133
+ ```
134
+
135
+ If "Generate from code" was selected AND no test data file exists, first spawn the testgen agent to generate inputs, then pass the generated file to setup.py.
136
+
137
+ ## Phase 6: Generate Test Data (if needed)
138
+
139
+ If testgen is needed, spawn it:
140
+
141
+ ```
142
+ Agent(
143
+ subagent_type: "evolver-testgen",
144
+ description: "TestGen: generate test inputs",
145
+ prompt: |
146
+ <objective>
147
+ Generate 30 diverse test inputs for this project.
148
+ Write them as a JSON array to test_inputs.json.
149
+ </objective>
150
+
151
+ <files_to_read>
152
+ {all .py files discovered in Phase 1}
153
+ </files_to_read>
154
+
155
+ <output>
156
+ Create test_inputs.json with format:
157
+ [{"input": "..."}, {"input": "..."}, ...]
158
+ </output>
159
+ )
160
+ ```
161
+
162
+ Then pass `--dataset-from-file test_inputs.json` to setup.py.
163
+
164
+ ## Phase 7: Report
165
+
166
+ ```
167
+ Setup complete!
168
+ Project: evolver-{name}
169
+ Dataset: {name}-eval-v1 ({N} examples)
170
+ Evaluators: {list}
171
+ Baseline score: {score}
172
+ Config: .evolver.json
173
+
174
+ Next: run /evolver:evolve to start optimizing.
175
+ ```
176
+
177
+ ## Gotchas
178
+
179
+ - If `.evolver.json` already exists, ask before overwriting.
180
+ - If the agent needs a venv, the run command should activate it: `cd {dir} && .venv/bin/python main.py`
181
+ - If LangSmith connection fails, check API key and network.
182
+ - The setup installs `langsmith` and `openevals` if missing.
@@ -1,34 +1,36 @@
1
1
  ---
2
- name: harness-evolver:status
3
- description: "Use when the user asks about evolution progress, current scores, best harness version, how many iterations ran, or whether the loop is stagnating. Also use when the user says 'status', 'progress', or 'how is it going'."
2
+ name: evolver:status
3
+ description: "Use when the user asks about evolution progress, current scores, best version, how many iterations ran, or whether the loop is stagnating."
4
4
  allowed-tools: [Read, Bash]
5
5
  ---
6
6
 
7
- # /harness-evolve-status
7
+ # /evolver:status
8
8
 
9
- Show evolution progress.
10
-
11
- ## Resolve Tool Path
12
-
13
- ```bash
14
- TOOLS=$([ -d ".harness-evolver/tools" ] && echo ".harness-evolver/tools" || echo "$HOME/.harness-evolver/tools")
15
- ```
9
+ Show current evolution progress.
16
10
 
17
11
  ## What To Do
18
12
 
19
- If `.harness-evolver/` does not exist, tell user to run `harness-evolver:init` first.
20
-
21
- Otherwise:
13
+ Read `.evolver.json` and report:
22
14
 
23
15
  ```bash
24
- python3 $TOOLS/state.py show --base-dir .harness-evolver
16
+ python3 -c "
17
+ import json
18
+ c = json.load(open('.evolver.json'))
19
+ print(f'Project: {c[\"project\"]}')
20
+ print(f'Dataset: {c[\"dataset\"]}')
21
+ print(f'Framework: {c[\"framework\"]}')
22
+ print(f'Evaluators: {c[\"evaluators\"]}')
23
+ print(f'Iterations: {c[\"iterations\"]}')
24
+ print(f'Best: {c[\"best_experiment\"]} (score: {c[\"best_score\"]:.3f})')
25
+ print(f'Baseline: {c[\"history\"][0][\"score\"]:.3f}' if c.get('history') else 'No baseline')
26
+ print()
27
+ print('History:')
28
+ for h in c.get('history', []):
29
+ print(f' {h[\"version\"]}: {h[\"score\"]:.3f}')
30
+ "
25
31
  ```
26
32
 
27
- Then read and display `.harness-evolver/STATE.md` for the full history table.
28
-
29
- ## If User Wants More Detail
33
+ Detect stagnation: if last 3 scores are within 1% of each other, warn.
34
+ Detect regression: if current best is lower than a previous best, warn.
30
35
 
31
- - Scores per task: `cat .harness-evolver/harnesses/{version}/scores.json`
32
- - What changed: `cat .harness-evolver/harnesses/{version}/proposal.md`
33
- - Compare two versions: `diff .harness-evolver/harnesses/{vA}/harness.py .harness-evolver/harnesses/{vB}/harness.py`
34
- - Full history: `cat .harness-evolver/PROPOSER_HISTORY.md`
36
+ Print LangSmith URL for the best experiment if available.
@@ -0,0 +1,240 @@
1
+ #!/usr/bin/env python3
2
+ """Read LangSmith experiment results for Harness Evolver v3.
3
+
4
+ Reads experiment results from LangSmith and formats them for agents
5
+ (proposer, critic, architect). Handles comparison between candidates.
6
+
7
+ Usage:
8
+ python3 read_results.py \
9
+ --experiments v001a,v001b,v001c,v001d,v001e \
10
+ --config .evolver.json \
11
+ [--output results.json]
12
+
13
+ python3 read_results.py \
14
+ --experiment v001a \
15
+ --config .evolver.json \
16
+ --format markdown
17
+
18
+ Requires: pip install langsmith
19
+ """
20
+
21
+ import argparse
22
+ import json
23
+ import os
24
+ import sys
25
+
26
+
27
def read_experiment(client, experiment_name):
    """Fetch one LangSmith experiment and summarize its root runs.

    Returns a summary dict (combined score, token/latency totals, error
    stats and per-example detail), None when the experiment has no runs,
    or an error stub dict when any LangSmith call raises.
    """
    try:
        root_runs = list(client.list_runs(
            project_name=experiment_name,
            is_root=True,
            limit=200,
        ))

        if not root_runs:
            return None

        examples = {}
        token_sum = 0
        latency_sum_ms = 0
        error_count = 0

        for run in root_runs:
            key = str(run.reference_example_id or run.id)

            run_tokens = run.total_tokens or 0
            token_sum += run_tokens

            # Wall-clock latency; 0 when either timestamp is missing.
            if run.end_time and run.start_time:
                run_latency = int((run.end_time - run.start_time).total_seconds() * 1000)
            else:
                run_latency = 0
            latency_sum_ms += run_latency

            if run.error:
                error_count += 1

            # Feedback entries carry the evaluator scores for this run.
            fb_scores = {
                fb.key: fb.score
                for fb in client.list_feedback(run_ids=[run.id])
                if fb.score is not None
            }

            examples[key] = {
                "score": sum(fb_scores.values()) / len(fb_scores) if fb_scores else 0.0,
                "scores": fb_scores,
                "tokens": run_tokens,
                "latency_ms": run_latency,
                "error": run.error[:200] if run.error else None,
                "input_preview": str(run.inputs)[:200] if run.inputs else "",
                "output_preview": str(run.outputs)[:200] if run.outputs else "",
            }

        n = len(examples)
        mean_score = sum(v["score"] for v in examples.values()) / n if n else 0.0

        return {
            "experiment": experiment_name,
            "combined_score": mean_score,
            "num_examples": n,
            "total_tokens": token_sum,
            "avg_latency_ms": latency_sum_ms // max(n, 1),
            "error_count": error_count,
            "error_rate": error_count / max(n, 1),
            "per_example": examples,
        }

    except Exception as e:
        return {"experiment": experiment_name, "error": str(e), "combined_score": 0.0}
93
+
94
+
95
def compare_experiments(results_list):
    """Pick the overall winner and the per-task "champion" among candidates.

    The winner has the highest combined score; the champion is the other
    candidate that beats the winner on the largest number of individual
    examples (None when no candidate beats the winner anywhere).
    Returns None for an empty input list.
    """
    if not results_list:
        return None

    # Prefer candidates that succeeded and actually scored something;
    # fall back to the full list when none qualify.
    contenders = [
        r for r in results_list
        if "error" not in r and r.get("combined_score", 0) > 0
    ]
    if not contenders:
        contenders = results_list

    winner = max(contenders, key=lambda r: r.get("combined_score", 0))
    winner_per_example = winner.get("per_example", {})

    # For each non-winner, count the examples it wins outright.
    wins_by_experiment = {}
    for candidate in contenders:
        if candidate["experiment"] == winner["experiment"]:
            continue
        beaten = sum(
            1
            for example_id, detail in candidate.get("per_example", {}).items()
            if detail.get("score", 0) > winner_per_example.get(example_id, {}).get("score", 0)
        )
        if beaten > 0:
            wins_by_experiment[candidate["experiment"]] = beaten

    champion = None
    if wins_by_experiment:
        best_name = max(wins_by_experiment, key=wins_by_experiment.get)
        champion = {
            "experiment": best_name,
            "task_wins": wins_by_experiment[best_name],
        }

    return {
        "winner": {
            "experiment": winner["experiment"],
            "score": winner["combined_score"],
        },
        "champion": champion,
        "all_candidates": [
            {
                "experiment": r["experiment"],
                "score": r.get("combined_score", 0),
                "tokens": r.get("total_tokens", 0),
                "latency_ms": r.get("avg_latency_ms", 0),
                "errors": r.get("error_count", 0),
            }
            for r in results_list
        ],
    }
+
150
+
151
def format_markdown(results):
    """Render one experiment summary as markdown for agent consumption."""
    out = [f"# Experiment Results: {results['experiment']}", ""]

    # Headline metrics.
    out += [
        f"**Combined Score**: {results.get('combined_score', 0):.3f}",
        f"**Examples**: {results.get('num_examples', 0)}",
        f"**Total Tokens**: {results.get('total_tokens', 0)}",
        f"**Avg Latency**: {results.get('avg_latency_ms', 0)}ms",
        f"**Errors**: {results.get('error_count', 0)} ({results.get('error_rate', 0):.1%})",
        "",
    ]

    # List failing examples (score < 0.5), worst first, so the agents can
    # focus on them.
    per_example = results.get("per_example", {})
    failing = {k: v for k, v in per_example.items() if v.get("score", 0) < 0.5}
    if failing:
        out.append("## Failing Examples")
        out.append("")
        for eid, detail in sorted(failing.items(), key=lambda item: item[1].get("score", 0)):
            out.append(f"- **{eid}**: score={detail['score']:.2f}")
            if detail.get("error"):
                out.append(f" Error: {detail['error']}")
            out.append(f" Input: {detail.get('input_preview', 'N/A')}")
        out.append("")

    return "\n".join(out)
177
+
178
+
179
def main():
    """CLI entry point: read one experiment or compare several."""
    parser = argparse.ArgumentParser(description="Read LangSmith experiment results")
    parser.add_argument("--experiments", default=None, help="Comma-separated experiment names to compare")
    parser.add_argument("--experiment", default=None, help="Single experiment to read")
    parser.add_argument("--config", default=".evolver.json", help="Path to .evolver.json")
    parser.add_argument("--output", default=None, help="Output JSON path")
    parser.add_argument("--format", default="json", choices=["json", "markdown"], help="Output format")
    args = parser.parse_args()

    # Deferred import: the third-party client is only needed past argparse.
    from langsmith import Client
    client = Client()

    def emit(text):
        # Mirror the report to a file when requested; always echo to stdout.
        if args.output:
            with open(args.output, "w") as f:
                f.write(text)
        print(text)

    if args.experiment:
        # Single-experiment mode.
        summary = read_experiment(client, args.experiment)
        if not summary:
            print(f"No results found for experiment: {args.experiment}", file=sys.stderr)
            sys.exit(1)

        if args.format == "markdown":
            emit(format_markdown(summary))
        else:
            emit(json.dumps(summary, indent=2, default=str))
        return

    if args.experiments:
        # Comparison mode: read every named experiment, then rank them.
        names = [part.strip() for part in args.experiments.split(",")]
        summaries = []
        for name in names:
            print(f"Reading experiment: {name}...", file=sys.stderr)
            summary = read_experiment(client, name)
            if summary:
                summaries.append(summary)

        if not summaries:
            print("No experiment results found.", file=sys.stderr)
            sys.exit(1)

        emit(json.dumps({
            "comparison": compare_experiments(summaries),
            "experiments": summaries,
        }, indent=2, default=str))
        return

    print("Provide --experiment or --experiments", file=sys.stderr)
    sys.exit(1)
+ sys.exit(1)
237
+
238
+
239
+ if __name__ == "__main__":
240
+ main()
@@ -0,0 +1,202 @@
1
+ #!/usr/bin/env python3
2
+ """Run LangSmith evaluation for a candidate in a worktree.
3
+
4
+ Wraps client.evaluate() — runs the user's agent against the dataset
5
+ with configured evaluators, from within a specific directory (worktree).
6
+
7
+ Usage:
8
+ python3 run_eval.py \
9
+ --config .evolver.json \
10
+ --worktree-path /tmp/worktree-abc \
11
+ --experiment-prefix v001a \
12
+ [--timeout 120]
13
+
14
+ Requires: pip install langsmith openevals
15
+ """
16
+
17
+ import argparse
18
+ import json
19
+ import os
20
+ import subprocess
21
+ import sys
22
+ import tempfile
23
+
24
+
25
+ def make_target(entry_point, cwd):
26
+ """Create a target function that runs the agent from a specific directory."""
27
+ def target(inputs):
28
+ input_json = json.dumps(inputs)
29
+ input_path = tempfile.mktemp(suffix=".json")
30
+ output_path = input_path + ".out"
31
+
32
+ with open(input_path, "w") as f:
33
+ f.write(input_json)
34
+
35
+ try:
36
+ cmd = entry_point
37
+ if "{input}" in cmd:
38
+ cmd = cmd.replace("{input}", input_path)
39
+ elif "{input_json}" in cmd:
40
+ cmd = cmd.replace("{input_json}", input_json)
41
+ else:
42
+ cmd = f"{cmd} --input {input_path} --output {output_path}"
43
+
44
+ env = os.environ.copy()
45
+ # Ensure traces go to the evolver project
46
+ env["LANGSMITH_TRACING"] = "true"
47
+
48
+ result = subprocess.run(
49
+ cmd, shell=True, capture_output=True, text=True,
50
+ timeout=int(os.environ.get("EVAL_TASK_TIMEOUT", "120")),
51
+ cwd=cwd, env=env,
52
+ )
53
+
54
+ # Read output file if it exists
55
+ if os.path.exists(output_path):
56
+ with open(output_path) as f:
57
+ try:
58
+ return json.load(f)
59
+ except json.JSONDecodeError:
60
+ pass
61
+
62
+ # Fallback: parse stdout
63
+ if result.stdout.strip():
64
+ try:
65
+ return json.loads(result.stdout)
66
+ except json.JSONDecodeError:
67
+ return {"output": result.stdout.strip()}
68
+
69
+ # Accept segfault (139) if output was produced
70
+ if result.returncode != 0:
71
+ return {"output": "", "error": result.stderr.strip()[:500]}
72
+
73
+ return {"output": ""}
74
+
75
+ except subprocess.TimeoutExpired:
76
+ return {"output": "", "error": f"TIMEOUT after {os.environ.get('EVAL_TASK_TIMEOUT', '120')}s"}
77
+ except Exception as e:
78
+ return {"output": "", "error": str(e)}
79
+ finally:
80
+ for p in [input_path, output_path]:
81
+ if os.path.exists(p):
82
+ try:
83
+ os.remove(p)
84
+ except OSError:
85
+ pass
86
+
87
+ return target
88
+
89
+
90
def load_evaluators(evaluator_keys):
    """Load evaluator callables by key name.

    Known keys: "correctness" and "conciseness" (LLM-as-judge via
    openevals), "latency" (binary has-output check) and "token_efficiency"
    (shorter output scores higher, capped at 1.0). Unknown keys are
    silently skipped, matching the original behavior.

    Returns a list of evaluator callables for client.evaluate().
    """
    evaluators = []
    for key in evaluator_keys:
        if key in ("correctness", "conciseness"):
            # Import lazily so configs using only the built-in evaluators
            # work without the third-party openevals package installed.
            from openevals.llm import create_llm_as_judge
            from openevals.prompts import CORRECTNESS_PROMPT, CONCISENESS_PROMPT

            prompt = CORRECTNESS_PROMPT if key == "correctness" else CONCISENESS_PROMPT
            evaluators.append(create_llm_as_judge(
                prompt=prompt,
                feedback_key=key,
                model="openai:gpt-4.1-mini",
            ))
        elif key == "latency":
            def latency_eval(inputs, outputs, **kwargs):
                # Proxy metric: did the run produce any output at all?
                return {"key": "has_output", "score": 1.0 if outputs else 0.0}
            evaluators.append(latency_eval)
        elif key == "token_efficiency":
            def token_eval(inputs, outputs, **kwargs):
                output_text = str(outputs.get("output", outputs.get("answer", "")))
                # Full score up to ~2000 chars, then inversely proportional.
                score = min(1.0, 2000 / max(len(output_text), 1))
                return {"key": "token_efficiency", "score": score}
            evaluators.append(token_eval)

    return evaluators
121
+
122
+
123
def main():
    """CLI entry point: evaluate one candidate worktree against the dataset.

    Reads the evolver config for the entry point, dataset name and
    evaluator keys, runs ``client.evaluate()`` with the worktree as cwd,
    and prints a machine-readable JSON summary line to stdout. On failure,
    prints an error stub JSON and exits 1.
    """
    parser = argparse.ArgumentParser(description="Run LangSmith evaluation for a candidate")
    parser.add_argument("--config", default=".evolver.json", help="Path to .evolver.json")
    parser.add_argument("--worktree-path", required=True, help="Path to the candidate's worktree")
    parser.add_argument("--experiment-prefix", required=True, help="Experiment name prefix (e.g. v001a)")
    parser.add_argument("--timeout", type=int, default=120, help="Per-task timeout in seconds")
    args = parser.parse_args()

    with open(args.config) as f:
        config = json.load(f)

    # The target closure reads this env var at call time (see make_target).
    os.environ["EVAL_TASK_TIMEOUT"] = str(args.timeout)

    from langsmith import Client
    client = Client()

    target = make_target(config["entry_point"], args.worktree_path)
    evaluators = load_evaluators(config["evaluators"])

    print(f"Running evaluation: {args.experiment_prefix}")
    print(f" Dataset: {config['dataset']}")
    print(f" Worktree: {args.worktree_path}")
    print(f" Evaluators: {config['evaluators']}")

    try:
        # max_concurrency=1: run dataset examples serially.
        results = client.evaluate(
            target,
            data=config["dataset"],
            evaluators=evaluators,
            experiment_prefix=args.experiment_prefix,
            max_concurrency=1,
        )

        experiment_name = results.experiment_name

        # Calculate mean score
        scores = []
        per_example = {}
        for result in results:
            example_scores = []
            # NOTE(review): assumes each entry under evaluation_results["results"]
            # is dict-like with a "score" key — confirm against the installed
            # langsmith client version.
            if result.evaluation_results and result.evaluation_results.get("results"):
                for er in result.evaluation_results["results"]:
                    if er.get("score") is not None:
                        example_scores.append(er["score"])
                        scores.append(er["score"])

            example_id = str(result.example.id) if result.example else "unknown"
            per_example[example_id] = {
                "score": sum(example_scores) / len(example_scores) if example_scores else 0.0,
                "num_evaluators": len(example_scores),
            }

        # Mean over all individual evaluator scores (not per-example means).
        mean_score = sum(scores) / len(scores) if scores else 0.0

        output = {
            "experiment": experiment_name,
            "prefix": args.experiment_prefix,
            "combined_score": mean_score,
            "num_examples": len(per_example),
            "num_scores": len(scores),
            "per_example": per_example,
        }

        # Machine-readable line first (callers parse stdout), then a summary.
        print(json.dumps(output))
        print(f"\nEvaluation complete: {mean_score:.3f} ({len(per_example)} examples)")

    except Exception as e:
        print(f"Evaluation failed: {e}", file=sys.stderr)
        # Emit a parseable stub so the caller still gets structured output.
        output = {
            "experiment": None,
            "prefix": args.experiment_prefix,
            "combined_score": 0.0,
            "error": str(e),
        }
        print(json.dumps(output))
        sys.exit(1)
199
+
200
+
201
+ if __name__ == "__main__":
202
+ main()