harness-evolver 2.9.1 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -117
- package/agents/evolver-architect.md +53 -0
- package/agents/evolver-critic.md +44 -0
- package/agents/evolver-proposer.md +128 -0
- package/agents/evolver-testgen.md +67 -0
- package/bin/install.js +181 -171
- package/package.json +7 -7
- package/skills/deploy/SKILL.md +49 -56
- package/skills/evolve/SKILL.md +156 -687
- package/skills/setup/SKILL.md +182 -0
- package/skills/status/SKILL.md +23 -21
- package/tools/read_results.py +240 -0
- package/tools/run_eval.py +202 -0
- package/tools/seed_from_traces.py +36 -8
- package/tools/setup.py +393 -0
- package/tools/trace_insights.py +86 -14
- package/agents/harness-evolver-architect.md +0 -173
- package/agents/harness-evolver-critic.md +0 -132
- package/agents/harness-evolver-judge.md +0 -110
- package/agents/harness-evolver-proposer.md +0 -317
- package/agents/harness-evolver-testgen.md +0 -112
- package/examples/classifier/README.md +0 -25
- package/examples/classifier/config.json +0 -3
- package/examples/classifier/eval.py +0 -58
- package/examples/classifier/harness.py +0 -111
- package/examples/classifier/tasks/task_001.json +0 -1
- package/examples/classifier/tasks/task_002.json +0 -1
- package/examples/classifier/tasks/task_003.json +0 -1
- package/examples/classifier/tasks/task_004.json +0 -1
- package/examples/classifier/tasks/task_005.json +0 -1
- package/examples/classifier/tasks/task_006.json +0 -1
- package/examples/classifier/tasks/task_007.json +0 -1
- package/examples/classifier/tasks/task_008.json +0 -1
- package/examples/classifier/tasks/task_009.json +0 -1
- package/examples/classifier/tasks/task_010.json +0 -1
- package/skills/architect/SKILL.md +0 -93
- package/skills/compare/SKILL.md +0 -73
- package/skills/critic/SKILL.md +0 -67
- package/skills/diagnose/SKILL.md +0 -96
- package/skills/import-traces/SKILL.md +0 -102
- package/skills/init/SKILL.md +0 -293
- package/tools/__pycache__/detect_stack.cpython-313.pyc +0 -0
- package/tools/__pycache__/init.cpython-313.pyc +0 -0
- package/tools/__pycache__/seed_from_traces.cpython-313.pyc +0 -0
- package/tools/__pycache__/trace_logger.cpython-313.pyc +0 -0
- package/tools/eval_llm_judge.py +0 -233
- package/tools/eval_passthrough.py +0 -55
- package/tools/evaluate.py +0 -255
- package/tools/import_traces.py +0 -229
- package/tools/init.py +0 -531
- package/tools/llm_api.py +0 -125
- package/tools/state.py +0 -219
- package/tools/test_growth.py +0 -230
- package/tools/trace_logger.py +0 -42
package/skills/init/SKILL.md
DELETED
|
@@ -1,293 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: harness-evolver:init
|
|
3
|
-
description: "Use when the user wants to set up harness optimization in their project, optimize an LLM agent, improve a harness, or mentions harness-evolver for the first time in a project without .harness-evolver/ directory."
|
|
4
|
-
argument-hint: "[directory]"
|
|
5
|
-
allowed-tools: [Read, Write, Edit, Bash, Glob, Grep, Agent, AskUserQuestion]
|
|
6
|
-
---
|
|
7
|
-
|
|
8
|
-
# /harness-evolver:init
|
|
9
|
-
|
|
10
|
-
Set up the Harness Evolver in a project. Scans the codebase, identifies the entry point, creates missing artifacts, runs baseline evaluation.
|
|
11
|
-
|
|
12
|
-
## Resolve Tool Path
|
|
13
|
-
|
|
14
|
-
```bash
|
|
15
|
-
TOOLS=$([ -d ".harness-evolver/tools" ] && echo ".harness-evolver/tools" || echo "$HOME/.harness-evolver/tools")
|
|
16
|
-
```
|
|
17
|
-
|
|
18
|
-
Use `$TOOLS` prefix for all tool calls below.
|
|
19
|
-
|
|
20
|
-
## Phase 1: Scan
|
|
21
|
-
|
|
22
|
-
```bash
|
|
23
|
-
find . -maxdepth 3 -type f -name "*.py" | head -30
|
|
24
|
-
python3 $TOOLS/detect_stack.py .
|
|
25
|
-
```
|
|
26
|
-
|
|
27
|
-
Look for:
|
|
28
|
-
- Entry points: files with `if __name__`, or named `main.py`, `app.py`, `agent.py`, `graph.py`, `pipeline.py`, `bot.py`
|
|
29
|
-
- Existing eval: `eval.py`, `score.py`, `judge.py`
|
|
30
|
-
- Existing tasks: directories with JSON files containing `id` + `input` fields
|
|
31
|
-
- Config: `config.json`, `config.yaml`, `.env`
|
|
32
|
-
|
|
33
|
-
## Phase 1.5: Confirm Detection (Interactive)
|
|
34
|
-
|
|
35
|
-
After scanning, present what was found and ask the user to confirm before proceeding.
|
|
36
|
-
|
|
37
|
-
Use AskUserQuestion:
|
|
38
|
-
|
|
39
|
-
```
|
|
40
|
-
Question: "Here's what I detected. Does this look right?"
|
|
41
|
-
Header: "Confirm"
|
|
42
|
-
Options:
|
|
43
|
-
- "Looks good, proceed" — Continue with detected paths
|
|
44
|
-
- "Let me adjust paths" — User will provide correct paths
|
|
45
|
-
- "Start over in different directory" — Abort and let user cd elsewhere
|
|
46
|
-
|
|
47
|
-
Show in the question description:
|
|
48
|
-
- Harness: {path or "not found"}
|
|
49
|
-
- Eval: {path or "not found — will use LLM-as-judge"}
|
|
50
|
-
- Tasks: {path with N files, or "not found — will generate"}
|
|
51
|
-
- Stack: {detected frameworks or "none detected"}
|
|
52
|
-
- Architecture: {topology or "unknown"}
|
|
53
|
-
```
|
|
54
|
-
|
|
55
|
-
If user chose "Let me adjust paths", ask which paths to change and update accordingly.
|
|
56
|
-
|
|
57
|
-
## Phase 1.8: Eval Mode (Interactive — only if NO eval found)
|
|
58
|
-
|
|
59
|
-
If no eval.py was detected, ask the user which evaluation mode to use.
|
|
60
|
-
|
|
61
|
-
Use AskUserQuestion with **preview** (single-select with side-by-side preview):
|
|
62
|
-
|
|
63
|
-
```json
|
|
64
|
-
{
|
|
65
|
-
"questions": [{
|
|
66
|
-
"question": "No eval script found. How should outputs be scored?",
|
|
67
|
-
"header": "Eval mode",
|
|
68
|
-
"multiSelect": false,
|
|
69
|
-
"options": [
|
|
70
|
-
{
|
|
71
|
-
"label": "LLM-as-judge (zero-config)",
|
|
72
|
-
"description": "Claude Code scores outputs automatically. No expected answers needed.",
|
|
73
|
-
"preview": "## LLM-as-Judge\n\nScoring dimensions:\n- **Accuracy** (40%) — correctness of output\n- **Completeness** (20%) — covers all aspects\n- **Relevance** (20%) — focused on the question\n- **No-Hallucination** (20%) — supported by facts\n\nEach scored 1-5, normalized to 0.0-1.0.\n\n**Requirements:** None. Works with any task format.\n\n```json\n{\"id\": \"task_001\", \"input\": \"your question\"}\n```"
|
|
74
|
-
},
|
|
75
|
-
{
|
|
76
|
-
"label": "Keyword matching",
|
|
77
|
-
"description": "Check if expected substrings appear in the output. Requires 'expected' field.",
|
|
78
|
-
"preview": "## Keyword Matching\n\nSimple deterministic scoring:\n- Score 1.0 if ALL expected keywords found in output\n- Score 0.0 otherwise\n\n**Requirements:** Tasks must include `expected` field:\n\n```json\n{\n \"id\": \"task_001\",\n \"input\": \"What is the capital of France?\",\n \"expected\": \"Paris\"\n}\n```\n\nFast, deterministic, no LLM calls during eval."
|
|
79
|
-
},
|
|
80
|
-
{
|
|
81
|
-
"label": "I'll provide my own eval.py",
|
|
82
|
-
"description": "Pause setup. You write the eval script following the contract.",
|
|
83
|
-
"preview": "## Custom Eval Contract\n\nYour eval.py must accept:\n```\npython3 eval.py \\\n --results-dir DIR \\\n --tasks-dir DIR \\\n --scores OUTPUT.json\n```\n\nMust write scores.json:\n```json\n{\n \"combined_score\": 0.85,\n \"per_task\": {\n \"task_001\": {\"score\": 0.9},\n \"task_002\": {\"score\": 0.8}\n }\n}\n```\n\nScores must be 0.0 to 1.0."
|
|
84
|
-
}
|
|
85
|
-
]
|
|
86
|
-
}]
|
|
87
|
-
}
|
|
88
|
-
```
|
|
89
|
-
|
|
90
|
-
If "LLM-as-judge": copy eval_passthrough.py as eval.py.
|
|
91
|
-
If "Keyword matching": create a simple keyword eval (check if expected substrings appear in output).
|
|
92
|
-
If "I'll provide my own": print instructions for the eval contract and wait.
|
|
93
|
-
|
|
94
|
-
## Phase 1.9: LangSmith Project (Interactive — only if LANGSMITH_API_KEY detected)
|
|
95
|
-
|
|
96
|
-
If a LangSmith API key is available, discover projects and ask which one has production traces:
|
|
97
|
-
|
|
98
|
-
```bash
|
|
99
|
-
langsmith-cli --json projects list --limit 10 2>/dev/null
|
|
100
|
-
```
|
|
101
|
-
|
|
102
|
-
Use AskUserQuestion with **preview** (single-select with side-by-side). Build options dynamically from the discovered projects:
|
|
103
|
-
|
|
104
|
-
```json
|
|
105
|
-
{
|
|
106
|
-
"questions": [{
|
|
107
|
-
"question": "LangSmith detected. Which project has your production traces?",
|
|
108
|
-
"header": "LangSmith",
|
|
109
|
-
"multiSelect": false,
|
|
110
|
-
"options": [
|
|
111
|
-
{
|
|
112
|
-
"label": "{project_name_1}",
|
|
113
|
-
"description": "{run_count} runs, last active {date}",
|
|
114
|
-
"preview": "## {project_name_1}\n\n- **Runs:** {run_count}\n- **Last active:** {date}\n- **Created:** {created_date}\n\nSelecting this project will:\n1. Fetch up to 100 recent traces\n2. Analyze traffic distribution and error patterns\n3. Generate production_seed.md for testgen\n4. Proposers will see real usage data"
|
|
115
|
-
},
|
|
116
|
-
{
|
|
117
|
-
"label": "{project_name_2}",
|
|
118
|
-
"description": "{run_count} runs, last active {date}",
|
|
119
|
-
"preview": "## {project_name_2}\n\n- **Runs:** {run_count}\n- **Last active:** {date}\n- **Created:** {created_date}\n\n(same explanation)"
|
|
120
|
-
},
|
|
121
|
-
{
|
|
122
|
-
"label": "Skip",
|
|
123
|
-
"description": "Don't use production traces",
|
|
124
|
-
"preview": "## Skip Production Traces\n\nThe evolver will work without production data:\n- Testgen generates synthetic tasks from code analysis\n- No real-world traffic distribution\n- No production error patterns\n\nYou can import traces later with:\n`/harness-evolver:import-traces`"
|
|
125
|
-
}
|
|
126
|
-
]
|
|
127
|
-
}]
|
|
128
|
-
}
|
|
129
|
-
```
|
|
130
|
-
|
|
131
|
-
Build the options from the `langsmith-cli` output. Use up to 3 projects (sorted by most recent activity) + the "Skip" option. Fill in actual values for run_count, date, etc.
|
|
132
|
-
|
|
133
|
-
If a project is selected, pass it as `--langsmith-project` to init.py.
|
|
134
|
-
|
|
135
|
-
## Phase 2: Create What's Missing
|
|
136
|
-
|
|
137
|
-
Three artifacts needed. For each — use existing if found, create if not.
|
|
138
|
-
|
|
139
|
-
**Harness** (`harness.py`): If user's entry point doesn't match our CLI interface (`--input`, `--output`, `--traces-dir`, `--config`), create a thin wrapper that imports their code. Read their entry point first to understand the I/O format. Ask if unsure.
|
|
140
|
-
|
|
141
|
-
**Eval** (`eval.py`): If an eval script exists, use it. If the user already chose an eval mode in Phase 1.8, follow that choice.
|
|
142
|
-
|
|
143
|
-
If NO eval exists and no mode was chosen yet:
|
|
144
|
-
- Copy `eval_passthrough.py` from `$TOOLS/eval_passthrough.py` as the project's eval.py:
|
|
145
|
-
```bash
|
|
146
|
-
cp $TOOLS/eval_passthrough.py eval.py
|
|
147
|
-
```
|
|
148
|
-
- This passthrough eval collects outputs for the judge subagent to score during evolve.
|
|
149
|
-
- Print: "No eval found. Using LLM-as-judge (Claude Code scores outputs directly)."
|
|
150
|
-
|
|
151
|
-
**Tasks** (`tasks/`): If test tasks exist, use them.
|
|
152
|
-
|
|
153
|
-
If NO tasks exist, generate them. First, identify all relevant source files:
|
|
154
|
-
|
|
155
|
-
```bash
|
|
156
|
-
find . -name "*.py" -not -path "./.venv/*" -not -path "./.harness-evolver/*" | head -10
|
|
157
|
-
find . -name "*.json" -o -name "*.md" -o -name "*.txt" -o -name "*.yaml" -o -name "*.yml" | grep -v .venv | grep -v .harness-evolver | head -10
|
|
158
|
-
```
|
|
159
|
-
|
|
160
|
-
Then spawn testgen subagent with CONCRETE file paths (not placeholders):
|
|
161
|
-
|
|
162
|
-
```
|
|
163
|
-
Agent(
|
|
164
|
-
subagent_type: "harness-evolver-testgen",
|
|
165
|
-
description: "TestGen: generate 30 test cases",
|
|
166
|
-
prompt: |
|
|
167
|
-
<objective>
|
|
168
|
-
Generate 30 diverse test cases for this project. Write them to the tasks/ directory
|
|
169
|
-
in the current working directory.
|
|
170
|
-
</objective>
|
|
171
|
-
|
|
172
|
-
<project_context>
|
|
173
|
-
This project is at: {absolute path to project root}
|
|
174
|
-
Entry point: {the harness/agent file you identified, e.g., crew.py or pipeline/moderator.py}
|
|
175
|
-
Framework: {what you detected — CrewAI, LangGraph, etc.}
|
|
176
|
-
</project_context>
|
|
177
|
-
|
|
178
|
-
<files_to_read>
|
|
179
|
-
{LIST EVERY .py file and data file you found above — use ABSOLUTE PATHS}
|
|
180
|
-
Example:
|
|
181
|
-
- /home/rp/Desktop/test-crewai/crew.py
|
|
182
|
-
- /home/rp/Desktop/test-crewai/README.md
|
|
183
|
-
</files_to_read>
|
|
184
|
-
|
|
185
|
-
<production_traces>
|
|
186
|
-
{IF .harness-evolver/production_seed.md EXISTS, paste its full contents here.
|
|
187
|
-
This file contains real production inputs, traffic distribution, error patterns,
|
|
188
|
-
and user feedback from LangSmith. Use it to generate REALISTIC test cases that
|
|
189
|
-
match actual usage patterns instead of synthetic ones.
|
|
190
|
-
|
|
191
|
-
If the file does not exist, omit this entire block.}
|
|
192
|
-
</production_traces>
|
|
193
|
-
|
|
194
|
-
<output>
|
|
195
|
-
Create directory tasks/ (at project root) with 30 files: task_001.json through task_030.json.
|
|
196
|
-
Format: {"id": "task_001", "input": "...", "metadata": {"difficulty": "easy|medium|hard", "type": "standard|edge|cross_domain|adversarial"}}
|
|
197
|
-
No "expected" field needed — the judge subagent will score outputs.
|
|
198
|
-
Distribution: 40% standard, 20% edge, 20% cross-domain, 20% adversarial.
|
|
199
|
-
If production traces are available, match the real traffic distribution instead of uniform.
|
|
200
|
-
</output>
|
|
201
|
-
)
|
|
202
|
-
```
|
|
203
|
-
|
|
204
|
-
Wait for `## TESTGEN COMPLETE`. If the subagent fails or returns with no tasks, generate them yourself inline (fallback).
|
|
205
|
-
|
|
206
|
-
Print: "Generated {N} test cases from code analysis."
|
|
207
|
-
|
|
208
|
-
If `.harness-evolver/production_seed.md` exists, also print:
|
|
209
|
-
"Tasks enriched with production trace data from LangSmith."
|
|
210
|
-
|
|
211
|
-
## Phase 3: Run Init
|
|
212
|
-
|
|
213
|
-
First, check if the project has a LangSmith production project configured:
|
|
214
|
-
|
|
215
|
-
```bash
|
|
216
|
-
# Auto-detect from env vars or .env
|
|
217
|
-
PROD_PROJECT=$(python3 -c "
|
|
218
|
-
import os
|
|
219
|
-
for v in ('LANGCHAIN_PROJECT', 'LANGSMITH_PROJECT'):
|
|
220
|
-
p = os.environ.get(v, '')
|
|
221
|
-
if p: print(p); exit()
|
|
222
|
-
for f in ('.env', '.env.local'):
|
|
223
|
-
if os.path.exists(f):
|
|
224
|
-
for line in open(f):
|
|
225
|
-
line = line.strip()
|
|
226
|
-
if '=' in line and not line.startswith('#'):
|
|
227
|
-
k, _, val = line.partition('=')
|
|
228
|
-
if k.strip() in ('LANGCHAIN_PROJECT', 'LANGSMITH_PROJECT'):
|
|
229
|
-
print(val.strip().strip('\"').strip(\"'\"))
|
|
230
|
-
exit()
|
|
231
|
-
" 2>/dev/null)
|
|
232
|
-
```
|
|
233
|
-
|
|
234
|
-
```bash
|
|
235
|
-
python3 $TOOLS/init.py [directory] \
|
|
236
|
-
--harness harness.py --eval eval.py --tasks tasks/ \
|
|
237
|
-
--tools-dir $TOOLS \
|
|
238
|
-
${PROD_PROJECT:+--langsmith-project "$PROD_PROJECT"}
|
|
239
|
-
```
|
|
240
|
-
|
|
241
|
-
Add `--harness-config config.json` if a config exists.
|
|
242
|
-
|
|
243
|
-
For **LLM-powered agents** that make real API calls (LangGraph, CrewAI, etc.) and take
|
|
244
|
-
more than 30 seconds per invocation, increase the validation timeout:
|
|
245
|
-
|
|
246
|
-
```bash
|
|
247
|
-
python3 $TOOLS/init.py [directory] \
|
|
248
|
-
--harness harness.py --eval eval.py --tasks tasks/ \
|
|
249
|
-
--tools-dir $TOOLS \
|
|
250
|
-
--validation-timeout 120
|
|
251
|
-
```
|
|
252
|
-
|
|
253
|
-
If validation keeps timing out but you've verified the harness works manually, skip it:
|
|
254
|
-
|
|
255
|
-
```bash
|
|
256
|
-
python3 $TOOLS/init.py [directory] \
|
|
257
|
-
--harness harness.py --eval eval.py --tasks tasks/ \
|
|
258
|
-
--tools-dir $TOOLS \
|
|
259
|
-
--skip-validation
|
|
260
|
-
```
|
|
261
|
-
|
|
262
|
-
## After Init — Report
|
|
263
|
-
|
|
264
|
-
- What was detected vs created
|
|
265
|
-
- Stack + integrations (LangSmith, Context7)
|
|
266
|
-
- Baseline score
|
|
267
|
-
- Next: run `/harness-evolver:evolve` to start
|
|
268
|
-
|
|
269
|
-
## Architecture Hint
|
|
270
|
-
|
|
271
|
-
After init completes, run a quick architecture analysis:
|
|
272
|
-
|
|
273
|
-
```bash
|
|
274
|
-
python3 $TOOLS/analyze_architecture.py --harness .harness-evolver/baseline/harness.py
|
|
275
|
-
```
|
|
276
|
-
|
|
277
|
-
If the analysis suggests the current topology may not be optimal for the task complexity, mention it:
|
|
278
|
-
|
|
279
|
-
> Architecture note: Current topology is "{topology}". For tasks with {characteristics},
|
|
280
|
-
> consider running `/harness-evolver:architect` for a detailed recommendation.
|
|
281
|
-
|
|
282
|
-
This is advisory only — do not spawn the architect agent.
|
|
283
|
-
|
|
284
|
-
## Gotchas
|
|
285
|
-
|
|
286
|
-
- The harness must write valid JSON to `--output`. If the user's code returns non-JSON, the wrapper must serialize it.
|
|
287
|
-
- Tasks must have unique `id` fields. Duplicate IDs cause silent eval errors.
|
|
288
|
-
- The `expected` field is never shown to the harness — only the eval script sees it.
|
|
289
|
-
- If `.harness-evolver/` already exists, warn before overwriting.
|
|
290
|
-
- If no Python files exist in CWD, the user is probably in the wrong directory.
|
|
291
|
-
- **Monorepo / venv mismatch**: In monorepos with dedicated venvs per app, the system `python3` may differ from the project's Python version. The harness wrapper should re-exec with the correct venv Python. The tools now use `sys.executable` instead of hardcoded `python3`.
|
|
292
|
-
- **Stale site-packages**: If the project uses editable installs (`pip install -e .`), packages in `site-packages/` may have stale copies of data files (e.g. registry YAMLs). Run `uv pip install -e . --force-reinstall --no-deps` to sync.
|
|
293
|
-
- **Validation timeout**: LLM agents making real API calls typically take 15-60s per invocation. Use `--validation-timeout 120` or `--skip-validation` to handle this.
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
package/tools/eval_llm_judge.py
DELETED
|
@@ -1,233 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""LLM-as-judge evaluation script for Harness Evolver.
|
|
3
|
-
|
|
4
|
-
Scores harness outputs using an LLM judge across multiple quality dimensions:
|
|
5
|
-
accuracy, completeness, relevance, no_hallucination.
|
|
6
|
-
|
|
7
|
-
CLI interface matches existing evals: --results-dir, --tasks-dir, --scores.
|
|
8
|
-
Stdlib-only. No external dependencies.
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
import argparse
|
|
12
|
-
import json
|
|
13
|
-
import os
|
|
14
|
-
import re
|
|
15
|
-
import sys
|
|
16
|
-
|
|
17
|
-
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
18
|
-
from llm_api import detect_provider, call_llm
|
|
19
|
-
|
|
20
|
-
# Relative weight of each judged dimension in the combined score.
WEIGHTS = dict(
    accuracy=0.4,
    completeness=0.2,
    relevance=0.2,
    no_hallucination=0.2,
)

# Dimension names, in the order they are scored and reported.
# Derived from WEIGHTS so the two can never drift apart.
DIMENSIONS = list(WEIGHTS)
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def build_judge_prompt(task, result):
    """Build the evaluation prompt for the LLM judge.

    Args:
        task: Task dict. ``input`` is shown as the question; ``expected``,
            when present and not ``None``, is shown as a reference answer.
        result: Harness result dict. ``output`` is the text to evaluate.

    Returns:
        The full prompt as one newline-joined string, ending with the JSON
        template the judge is asked to fill in.
    """
    def _text(value):
        # A JSON null must read as "nothing", not as the literal text "None",
        # otherwise the judge is asked to grade the word "None".
        return "" if value is None else str(value)

    prompt_parts = [
        "You are an expert evaluator. Assess the quality of the following output.",
        "",
        "QUESTION/INPUT:",
        _text(task.get("input")),
        "",
        "OUTPUT TO EVALUATE:",
        _text(result.get("output")),
    ]

    # Only include a reference section when there is an actual reference.
    if task.get("expected") is not None:
        prompt_parts.extend([
            "",
            "REFERENCE ANSWER:",
            str(task["expected"]),
        ])

    prompt_parts.extend([
        "",
        "Score each dimension from 1 (worst) to 5 (best):",
        "- accuracy: Is the output factually correct and properly addresses the input?",
        "- completeness: Does it cover all relevant aspects?",
        "- relevance: Is it focused and on-topic?",
        "- no_hallucination: Does it avoid fabricating information not supported by context?",
        "",
        "Think step by step, then respond with ONLY this JSON:",
        '{"reasoning": "your analysis", "accuracy": N, "completeness": N, "relevance": N, "no_hallucination": N}',
    ])

    return "\n".join(prompt_parts)
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
def extract_json_scores(response):
    """Extract the scoring JSON object from an LLM response.

    Three strategies are tried in order, and the first one that yields a JSON
    object containing an ``"accuracy"`` key wins:

    1. parse the whole (stripped) response as JSON;
    2. parse the first object found inside a markdown code fence;
    3. parse the first flat ``{...}`` span that has a numeric ``"accuracy"``.

    Args:
        response: Raw text returned by the judge model.

    Returns:
        The parsed dict, or ``None`` when no usable score object is found
        (including when ``response`` is not a string).
    """
    if not isinstance(response, str):
        return None

    def _parse(text):
        try:
            data = json.loads(text)
        except (json.JSONDecodeError, ValueError):
            return None
        # Valid JSON is not necessarily an object: a bare number would make
        # the membership test raise TypeError, and a list or string such as
        # ["accuracy"] would wrongly pass it.
        if isinstance(data, dict) and "accuracy" in data:
            return data
        return None

    # Try direct parse
    data = _parse(response.strip())
    if data is not None:
        return data

    # Try extracting from markdown fences
    fence_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response, re.DOTALL)
    if fence_match:
        data = _parse(fence_match.group(1))
        if data is not None:
            return data

    # Try regex extraction for JSON with accuracy key
    json_match = re.search(r'\{[^{}]*"accuracy"\s*:\s*\d[^{}]*\}', response)
    if json_match:
        data = _parse(json_match.group(0))
        if data is not None:
            return data

    return None
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
def normalize_score(raw_score):
    """Normalize a 1-5 score to the 0.0-1.0 range.

    The value is truncated to an integer and clamped to ``[1, 5]`` before
    scaling. Values that cannot be interpreted as a finite number (``None``,
    ``"N/A"``, NaN, infinity) are treated as the worst score, so one malformed
    judge reply cannot abort an entire evaluation run.

    Args:
        raw_score: Score as produced by the judge (int, float or numeric string).

    Returns:
        A float in ``[0.0, 1.0]``: 1 maps to 0.0 and 5 maps to 1.0.
    """
    try:
        value = int(raw_score)
    except (TypeError, ValueError, OverflowError):
        # int() rejects strings like "4.5"; go through float() before giving up.
        try:
            value = int(float(raw_score))
        except (TypeError, ValueError, OverflowError):
            value = 1
    clamped = max(1, min(5, value))
    return (clamped - 1) / 4.0
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
def compute_combined_score(scores_dict):
    """Return the weighted sum of the normalized per-dimension scores.

    Each dimension listed in DIMENSIONS is multiplied by its entry in WEIGHTS;
    a dimension missing from ``scores_dict`` contributes 0.0.
    """
    weighted_terms = [
        scores_dict.get(name, 0.0) * WEIGHTS[name] for name in DIMENSIONS
    ]
    # Plain left-to-right accumulation (not the built-in sum) so float
    # rounding stays the same on every Python version.
    combined = 0.0
    for term in weighted_terms:
        combined += term
    return combined
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
def evaluate_task(provider, api_key, model, task, result):
    """Judge one task's output with the LLM and return its per-task score dict.

    The returned dict always has ``score`` (0.0-1.0, rounded to 4 places), the
    four raw dimension scores and ``reasoning``. When the LLM call raises or
    its reply cannot be parsed, ``score`` is 0.0, every dimension is 1 and an
    extra ``error`` key describes the failure.
    """
    def _failure(reasoning, error):
        # Shared shape for both failure modes: zero score, all dimensions at 1.
        return {
            "score": 0.0,
            "accuracy": 1, "completeness": 1, "relevance": 1, "no_hallucination": 1,
            "reasoning": reasoning,
            "error": error,
        }

    judge_prompt = build_judge_prompt(task, result)

    try:
        reply = call_llm(provider, api_key, model, judge_prompt, max_tokens=2048)
    except Exception as exc:
        return _failure(f"LLM call failed: {exc}", str(exc))

    verdict = extract_json_scores(reply)
    if verdict is None:
        return _failure(f"Failed to parse judge response: {reply[:200]}", "parse_failed")

    # Keep the raw 1-5 values for reporting; normalize a copy for the weighted score.
    raw_scores = {}
    unit_scores = {}
    for name in DIMENSIONS:
        raw_scores[name] = verdict.get(name, 1)
        unit_scores[name] = normalize_score(raw_scores[name])

    weighted = compute_combined_score(unit_scores)

    return {
        "score": round(weighted, 4),
        "accuracy": raw_scores["accuracy"],
        "completeness": raw_scores["completeness"],
        "relevance": raw_scores["relevance"],
        "no_hallucination": raw_scores["no_hallucination"],
        "reasoning": verdict.get("reasoning", ""),
    }
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
def main():
    """Score every task's harness output with the LLM judge and write a scores JSON.

    Command-line arguments:
        --results-dir: directory with harness output JSON files, looked up
            under the same file name as the task file.
        --tasks-dir: directory with task JSON files (each must have an ``id``).
        --scores: output path for the scores JSON; its parent directory is
            created if needed.

    A task whose result file is missing, unreadable or not a JSON object is
    judged with an empty output instead of aborting the run. Exits with
    status 1 when the tasks directory contains no ``.json`` files.
    """
    parser = argparse.ArgumentParser(description="LLM-as-judge evaluation")
    parser.add_argument("--results-dir", required=True,
                        help="Directory with harness output JSON files")
    parser.add_argument("--tasks-dir", required=True,
                        help="Directory with task JSON files")
    parser.add_argument("--scores", required=True,
                        help="Output path for scores JSON")
    args = parser.parse_args()

    # Detect LLM provider
    provider, api_key, model = detect_provider()

    # Collect tasks
    task_files = sorted(f for f in os.listdir(args.tasks_dir) if f.endswith(".json"))
    if not task_files:
        print(f"FAIL: no .json task files in {args.tasks_dir}", file=sys.stderr)
        sys.exit(1)

    per_task = {}
    dimension_totals = {dim: 0.0 for dim in DIMENSIONS}
    total_combined = 0.0
    total_tasks = 0

    for task_file in task_files:
        # Load task (explicit UTF-8: JSON must not depend on the locale encoding)
        task_path = os.path.join(args.tasks_dir, task_file)
        with open(task_path, encoding="utf-8") as f:
            task = json.load(f)
        task_id = task["id"]

        # Load result; one bad output file must not abort the whole run.
        result_path = os.path.join(args.results_dir, task_file)
        if os.path.exists(result_path):
            try:
                with open(result_path, encoding="utf-8") as f:
                    result = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError) as exc:
                print(f"WARN: unreadable result for {task_id}: {exc}", file=sys.stderr)
                result = {"id": task_id, "output": "", "error": f"unreadable output file: {exc}"}
            else:
                if not isinstance(result, dict):
                    print(f"WARN: result for {task_id} is not a JSON object", file=sys.stderr)
                    result = {"id": task_id, "output": "", "error": "output is not a JSON object"}
        else:
            result = {"id": task_id, "output": "", "error": "no output file"}

        # Evaluate
        task_scores = evaluate_task(provider, api_key, model, task, result)
        per_task[task_id] = task_scores

        # Accumulate (per-task entries hold raw 1-5 values, so normalize again here)
        total_combined += task_scores["score"]
        for dim in DIMENSIONS:
            dimension_totals[dim] += normalize_score(task_scores[dim])
        total_tasks += 1

    # Compute averages
    if total_tasks > 0:
        combined_score = round(total_combined / total_tasks, 4)
        avg_dimensions = {
            dim: round(dimension_totals[dim] / total_tasks, 4) for dim in DIMENSIONS
        }
    else:
        combined_score = 0.0
        avg_dimensions = {dim: 0.0 for dim in DIMENSIONS}

    scores = {
        "combined_score": combined_score,
        "eval_type": "llm-judge",
        "judge_provider": provider,
        "judge_model": model,
        "dimensions": avg_dimensions,
        "weights": WEIGHTS,
        "total_tasks": total_tasks,
        "per_task": per_task,
    }

    # Write scores
    os.makedirs(os.path.dirname(os.path.abspath(args.scores)), exist_ok=True)
    with open(args.scores, "w", encoding="utf-8") as f:
        json.dump(scores, f, indent=2)

    print(f"LLM judge evaluation complete. combined_score: {combined_score} "
          f"({total_tasks} tasks, provider: {provider}/{model})")


if __name__ == "__main__":
    main()
|
|
@@ -1,55 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""Passthrough eval — collects outputs for judge subagent scoring.
|
|
3
|
-
|
|
4
|
-
When no custom eval.py exists, this is used as the default. It does NOT score
|
|
5
|
-
outputs — it collects them and marks them for the judge subagent to evaluate.
|
|
6
|
-
The evolve skill detects eval_type=pending-judge and spawns the judge agent.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
import argparse
|
|
10
|
-
import json
|
|
11
|
-
import os
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def main():
    """Collect task inputs and harness outputs into a pending-judge scores file.

    Nothing is scored here: every task gets ``score`` -1 and the file is
    written with ``combined_score`` -1 and ``eval_type`` "pending-judge".
    Inputs and outputs are truncated to 500 characters each.

    Command-line arguments:
        --results-dir: directory with harness output JSON files, looked up
            under the same file name as the task file; a missing file is
            recorded as an empty output.
        --tasks-dir: directory with task JSON files (each must have an ``id``).
        --scores: output path for the scores JSON; its parent directory is
            created if needed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--results-dir", required=True)
    parser.add_argument("--tasks-dir", required=True)
    parser.add_argument("--scores", required=True)
    args = parser.parse_args()

    per_task = {}
    for fname in sorted(os.listdir(args.tasks_dir)):
        if not fname.endswith(".json"):
            continue
        # Explicit UTF-8: JSON must not depend on the locale encoding.
        with open(os.path.join(args.tasks_dir, fname), encoding="utf-8") as f:
            task = json.load(f)
        task_id = task["id"]

        result_path = os.path.join(args.results_dir, fname)
        output = ""
        if os.path.exists(result_path):
            with open(result_path, encoding="utf-8") as f:
                result = json.load(f)
            output = str(result.get("output", ""))

        per_task[task_id] = {
            "score": -1,
            "input": str(task.get("input", ""))[:500],
            "output": output[:500],
        }

    scores = {
        "combined_score": -1,
        "eval_type": "pending-judge",
        "total_tasks": len(per_task),
        "per_task": per_task,
    }

    # Create the destination directory so a fresh scores path does not fail.
    os.makedirs(os.path.dirname(os.path.abspath(args.scores)), exist_ok=True)
    with open(args.scores, "w", encoding="utf-8") as f:
        json.dump(scores, f, indent=2)

    print(f"Collected {len(per_task)} task outputs for judge scoring.")


if __name__ == "__main__":
    main()
|