harness-evolver 4.5.0 → 4.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/package.json +1 -1
- package/skills/evolve/SKILL.md +69 -2
- package/tools/run_eval.py +41 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "harness-evolver",
|
|
3
3
|
"description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
|
|
4
|
-
"version": "4.5.0",
|
|
4
|
+
"version": "4.5.1",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Raphael Valdetaro"
|
|
7
7
|
},
|
package/package.json
CHANGED
package/skills/evolve/SKILL.md
CHANGED
|
@@ -133,6 +133,61 @@ If critical issues found, ask user whether to continue or fix first via AskUserQ
|
|
|
133
133
|
|
|
134
134
|
Invoke `/evolver:health` to check and auto-correct dataset issues. If health_report.json shows critical issues that couldn't be auto-corrected, ask user whether to proceed via AskUserQuestion.
|
|
135
135
|
|
|
136
|
+
### 0.7. Ensure Baseline Has LLM-Judge Scores
|
|
137
|
+
|
|
138
|
+
The baseline experiment (from setup) only runs code-based evaluators (has_output, token_efficiency). Without LLM-judge scores, the baseline score is inflated — any agent that produces text gets 1.0, making gate checks stop evolution prematurely.
|
|
139
|
+
|
|
140
|
+
Check if LLM evaluators are configured and the baseline needs scoring:
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
LLM_EVALS=$(python3 -c "import json; c=json.load(open('.evolver.json')); llm=[k for k in c['evaluators'] if k in ('correctness','conciseness')]; print(','.join(llm) if llm else '')")
|
|
144
|
+
BASELINE=$(python3 -c "import json; print(json.load(open('.evolver.json')).get('baseline_experiment', ''))")
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
If `LLM_EVALS` is non-empty and `BASELINE` exists, check if LLM scores already exist:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
HAS_LLM_SCORES=$($EVOLVER_PY $TOOLS/read_results.py --experiment "$BASELINE" --config .evolver.json 2>/dev/null | python3 -c "
|
|
151
|
+
import sys, json
|
|
152
|
+
try:
|
|
153
|
+
r = json.load(sys.stdin)
|
|
154
|
+
scored_keys = set()
|
|
155
|
+
for ex in r.get('per_example', {}).values():
|
|
156
|
+
scored_keys.update(ex.get('scores', {}).keys())
|
|
157
|
+
llm_keys = set('correctness,conciseness'.split(','))
|
|
158
|
+
configured = set(k for k in llm_keys if k in '$LLM_EVALS'.split(','))
|
|
159
|
+
print('yes' if configured.issubset(scored_keys) else 'no')
|
|
160
|
+
except: print('no')
|
|
161
|
+
")
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
If `HAS_LLM_SCORES` is "no", trigger the evaluator agent on the baseline:
|
|
165
|
+
|
|
166
|
+
```
|
|
167
|
+
Agent(
|
|
168
|
+
subagent_type: "evolver-evaluator",
|
|
169
|
+
description: "Score baseline with LLM-judge",
|
|
170
|
+
prompt: "Experiments to evaluate: {baseline_experiment}. Evaluators: {llm_evaluator_list}. Framework: {framework}. Entry point: {entry_point}. Dataset: {dataset_name}. NOTE: This is the baseline — score it fairly so evolution has a meaningful starting point. Some examples have expected_behavior rubrics in their metadata — fetch example metadata and use rubrics for scoring when available."
|
|
171
|
+
)
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
After the evaluator completes, re-read the baseline score and update `.evolver.json`:
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
$EVOLVER_PY $TOOLS/read_results.py --experiment "$BASELINE" --config .evolver.json --output best_results.json 2>/dev/null
|
|
178
|
+
python3 -c "
|
|
179
|
+
import json
|
|
180
|
+
br = json.load(open('best_results.json'))
|
|
181
|
+
c = json.load(open('.evolver.json'))
|
|
182
|
+
new_score = br.get('combined_score', c['best_score'])
|
|
183
|
+
c['best_score'] = new_score
|
|
184
|
+
if c.get('history'):
|
|
185
|
+
c['history'][0]['score'] = new_score
|
|
186
|
+
json.dump(c, open('.evolver.json', 'w'), indent=2)
|
|
187
|
+
print(f'Baseline re-scored with LLM-judge: {new_score:.3f}')
|
|
188
|
+
"
|
|
189
|
+
```
|
|
190
|
+
|
|
136
191
|
### 0.8. Resolve Project Directory
|
|
137
192
|
|
|
138
193
|
If the project is in a subdirectory of the git repo (e.g., `playground/react-agent/`), worktrees replicate the full repo structure. Read `project_dir` from `.evolver.json` to resolve paths correctly:
|
|
@@ -340,10 +395,22 @@ Only run evaluation (Step 3) for proposers that committed changes (not abstained
|
|
|
340
395
|
|
|
341
396
|
### 3. Run Target for Each Candidate (Parallel)
|
|
342
397
|
|
|
343
|
-
|
|
398
|
+
First, copy config files into each worktree (untracked files aren't replicated by git — this was the #1 bug in all real-world runs):
|
|
399
|
+
|
|
400
|
+
```bash
|
|
401
|
+
for WORKTREE in {worktree_paths_with_commits}; do
|
|
402
|
+
WORKTREE_PROJECT="$WORKTREE"
|
|
403
|
+
[ -n "$PROJECT_DIR" ] && WORKTREE_PROJECT="$WORKTREE/$PROJECT_DIR"
|
|
404
|
+
|
|
405
|
+
# Copy untracked config files needed by run_eval.py and the agent
|
|
406
|
+
cp .evolver.json "$WORKTREE_PROJECT/.evolver.json" 2>/dev/null
|
|
407
|
+
[ -f .env ] && cp .env "$WORKTREE_PROJECT/.env" 2>/dev/null
|
|
408
|
+
done
|
|
409
|
+
```
|
|
410
|
+
|
|
411
|
+
Then run evaluations for ALL candidates simultaneously:
|
|
344
412
|
|
|
345
413
|
```bash
|
|
346
|
-
# Launch all evaluations in parallel
|
|
347
414
|
for WORKTREE in {worktree_paths_with_commits}; do
|
|
348
415
|
WORKTREE_PROJECT="$WORKTREE"
|
|
349
416
|
[ -n "$PROJECT_DIR" ] && WORKTREE_PROJECT="$WORKTREE/$PROJECT_DIR"
|
package/tools/run_eval.py
CHANGED
|
@@ -72,7 +72,20 @@ def make_target(entry_point, cwd):
|
|
|
72
72
|
|
|
73
73
|
try:
|
|
74
74
|
cmd = entry_point
|
|
75
|
-
|
|
75
|
+
|
|
76
|
+
# {input_text}: extract plain text from inputs dict (for agents expecting --query "text")
|
|
77
|
+
if "{input_text}" in cmd:
|
|
78
|
+
import shlex
|
|
79
|
+
text = ""
|
|
80
|
+
for key in ("input", "question", "query", "prompt", "text", "user_input"):
|
|
81
|
+
if key in inputs and isinstance(inputs[key], str):
|
|
82
|
+
text = inputs[key]
|
|
83
|
+
break
|
|
84
|
+
if not text and inputs:
|
|
85
|
+
first_val = next(iter(inputs.values()), "")
|
|
86
|
+
text = str(first_val) if not isinstance(first_val, str) else first_val
|
|
87
|
+
cmd = cmd.replace("{input_text}", shlex.quote(text))
|
|
88
|
+
elif "{input}" in cmd:
|
|
76
89
|
# Placeholder: replace with path to JSON file
|
|
77
90
|
cmd = cmd.replace("{input}", input_path)
|
|
78
91
|
elif "{input_json}" in cmd:
|
|
@@ -167,6 +180,7 @@ def main():
|
|
|
167
180
|
parser.add_argument("--experiment-prefix", required=True, help="Experiment name prefix (e.g. v001a)")
|
|
168
181
|
parser.add_argument("--timeout", type=int, default=120, help="Per-task timeout in seconds")
|
|
169
182
|
parser.add_argument("--concurrency", type=int, default=None, help="Max concurrent evaluations (default: from config or 1)")
|
|
183
|
+
parser.add_argument("--no-canary", action="store_true", help="Skip canary preflight check")
|
|
170
184
|
args = parser.parse_args()
|
|
171
185
|
|
|
172
186
|
with open(args.config) as f:
|
|
@@ -187,6 +201,32 @@ def main():
|
|
|
187
201
|
llm_evaluators = [k for k in config["evaluators"] if k in ("correctness", "conciseness")]
|
|
188
202
|
code_evaluators = [k for k in config["evaluators"] if k not in ("correctness", "conciseness")]
|
|
189
203
|
|
|
204
|
+
# Canary run: verify agent works before burning through full dataset
|
|
205
|
+
if not args.no_canary:
|
|
206
|
+
print(" Canary: running 1 example preflight...", file=sys.stderr)
|
|
207
|
+
try:
|
|
208
|
+
canary_examples = list(client.list_examples(dataset_name=config["dataset"], limit=1))
|
|
209
|
+
if canary_examples:
|
|
210
|
+
canary_result = target(canary_examples[0].inputs)
|
|
211
|
+
canary_output = canary_result.get("output", "")
|
|
212
|
+
canary_error = canary_result.get("error", "")
|
|
213
|
+
if not canary_output and canary_error:
|
|
214
|
+
print(f" CANARY FAILED: Agent produced no output.", file=sys.stderr)
|
|
215
|
+
print(f" Error: {canary_error}", file=sys.stderr)
|
|
216
|
+
print(f" Fix the agent before running full evaluation.", file=sys.stderr)
|
|
217
|
+
output = {
|
|
218
|
+
"experiment": None,
|
|
219
|
+
"prefix": args.experiment_prefix,
|
|
220
|
+
"combined_score": 0.0,
|
|
221
|
+
"error": f"Canary failed: {canary_error[:200]}",
|
|
222
|
+
}
|
|
223
|
+
print(json.dumps(output))
|
|
224
|
+
sys.exit(2)
|
|
225
|
+
else:
|
|
226
|
+
print(f" Canary passed: got output ({len(str(canary_output))} chars)", file=sys.stderr)
|
|
227
|
+
except Exception as e:
|
|
228
|
+
print(f" Canary check failed: {e} (proceeding anyway)", file=sys.stderr)
|
|
229
|
+
|
|
190
230
|
print(f"Running evaluation: {args.experiment_prefix}")
|
|
191
231
|
print(f" Dataset: {config['dataset']}")
|
|
192
232
|
print(f" Worktree: {args.worktree_path}")
|