harness-evolver 4.2.1 → 4.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/package.json +1 -1
- package/skills/evolve/SKILL.md +26 -3
- package/tools/adversarial_inject.py +1 -1
- package/tools/dataset_health.py +2 -2
- package/tools/read_results.py +1 -1
- package/tools/regression_tracker.py +1 -1
- package/tools/setup.py +13 -0
- package/tools/synthesize_strategy.py +39 -0
- package/tools/trace_insights.py +1 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "harness-evolver",
|
|
3
3
|
"description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
|
|
4
|
-
"version": "4.2.1",
|
|
4
|
+
"version": "4.2.3",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Raphael Valdetaro"
|
|
7
7
|
},
|
package/package.json
CHANGED
package/skills/evolve/SKILL.md
CHANGED
|
@@ -243,6 +243,21 @@ print(f'Retired {retired} dead examples')
|
|
|
243
243
|
|
|
244
244
|
After corrections, log what was done. Do NOT re-run health check (corrections may need an experiment cycle to show effect).
|
|
245
245
|
|
|
246
|
+
### 0.8. Resolve Project Directory
|
|
247
|
+
|
|
248
|
+
If the project is in a subdirectory of the git repo (e.g., `playground/react-agent/`), worktrees replicate the full repo structure. Read `project_dir` from `.evolver.json` to resolve paths correctly:
|
|
249
|
+
|
|
250
|
+
```bash
|
|
251
|
+
PROJECT_DIR=$(python3 -c "import json; print(json.load(open('.evolver.json')).get('project_dir', ''))")
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
If `PROJECT_DIR` is non-empty, all worktree paths must include it:
|
|
255
|
+
- Config in worktree: `{worktree_path}/{PROJECT_DIR}/.evolver.json`
|
|
256
|
+
- CWD in worktree: `{worktree_path}/{PROJECT_DIR}`
|
|
257
|
+
- proposal.md in worktree: `{worktree_path}/{PROJECT_DIR}/proposal.md`
|
|
258
|
+
|
|
259
|
+
If `PROJECT_DIR` is empty (project at git root), paths are unchanged: `{worktree_path}/.evolver.json`, etc.
|
|
260
|
+
|
|
246
261
|
For each iteration:
|
|
247
262
|
|
|
248
263
|
### 1. Get Next Version
|
|
@@ -402,7 +417,11 @@ After all proposers complete, check which ones committed and which abstained:
|
|
|
402
417
|
|
|
403
418
|
```bash
|
|
404
419
|
for WORKTREE in {worktree_paths}; do
|
|
405
|
-
|
|
420
|
+
# Resolve project path within worktree
|
|
421
|
+
WT_PROJECT="$WORKTREE"
|
|
422
|
+
[ -n "$PROJECT_DIR" ] && WT_PROJECT="$WORKTREE/$PROJECT_DIR"
|
|
423
|
+
|
|
424
|
+
if [ -f "$WT_PROJECT/proposal.md" ] && grep -q "## ABSTAIN" "$WT_PROJECT/proposal.md" 2>/dev/null; then
|
|
406
425
|
echo "Proposer in $WORKTREE abstained — skipping evaluation"
|
|
407
426
|
elif [ $(cd "$WORKTREE" && git log --oneline -1 --since="10 minutes ago" 2>/dev/null | wc -l) -eq 0 ]; then
|
|
408
427
|
echo "Proposer in $WORKTREE made no commits — skipping"
|
|
@@ -417,9 +436,13 @@ Only run evaluation (Step 3) for proposers that committed changes (not abstained
|
|
|
417
436
|
For each worktree that has changes (proposer committed something):
|
|
418
437
|
|
|
419
438
|
```bash
|
|
439
|
+
# If PROJECT_DIR is set, resolve paths into the worktree subdirectory
|
|
440
|
+
WORKTREE_PROJECT="{worktree_path}"
|
|
441
|
+
[ -n "$PROJECT_DIR" ] && WORKTREE_PROJECT="{worktree_path}/{PROJECT_DIR}"
|
|
442
|
+
|
|
420
443
|
$EVOLVER_PY $TOOLS/run_eval.py \
|
|
421
|
-
--config
|
|
422
|
-
--worktree-path
|
|
444
|
+
--config "$WORKTREE_PROJECT/.evolver.json" \
|
|
445
|
+
--worktree-path "$WORKTREE_PROJECT" \
|
|
423
446
|
--experiment-prefix v{NNN}-{lens_id} \
|
|
424
447
|
--timeout 120
|
|
425
448
|
```
|
|
@@ -59,7 +59,7 @@ def detect_memorization(client, experiment_name, dataset_name):
|
|
|
59
59
|
"""Check if agent outputs are suspiciously similar to reference outputs."""
|
|
60
60
|
suspicious = []
|
|
61
61
|
try:
|
|
62
|
-
runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=
|
|
62
|
+
runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=100))
|
|
63
63
|
examples = {str(e.id): e for e in client.list_examples(dataset_name=dataset_name, limit=500)}
|
|
64
64
|
|
|
65
65
|
for run in runs:
|
package/tools/dataset_health.py
CHANGED
|
@@ -68,7 +68,7 @@ def check_difficulty(client, config):
|
|
|
68
68
|
return None
|
|
69
69
|
|
|
70
70
|
try:
|
|
71
|
-
runs = list(client.list_runs(project_name=best_exp, is_root=True, limit=
|
|
71
|
+
runs = list(client.list_runs(project_name=best_exp, is_root=True, limit=100))
|
|
72
72
|
if not runs:
|
|
73
73
|
return None
|
|
74
74
|
|
|
@@ -129,7 +129,7 @@ def check_dead_examples(client, config):
|
|
|
129
129
|
|
|
130
130
|
for exp_name in recent_exps:
|
|
131
131
|
try:
|
|
132
|
-
runs = list(client.list_runs(project_name=exp_name, is_root=True, limit=
|
|
132
|
+
runs = list(client.list_runs(project_name=exp_name, is_root=True, limit=100))
|
|
133
133
|
all_run_ids = [run.id for run in runs]
|
|
134
134
|
if not all_run_ids:
|
|
135
135
|
continue
|
package/tools/read_results.py
CHANGED
|
@@ -60,7 +60,7 @@ def get_per_example_scores(client, experiment_name):
|
|
|
60
60
|
"""Get per-example scores from an experiment."""
|
|
61
61
|
scores = {}
|
|
62
62
|
try:
|
|
63
|
-
runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=
|
|
63
|
+
runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=100))
|
|
64
64
|
all_run_ids = [run.id for run in runs]
|
|
65
65
|
all_feedbacks = list(client.list_feedback(run_ids=all_run_ids))
|
|
66
66
|
fb_map = {}
|
package/tools/setup.py
CHANGED
|
@@ -511,12 +511,25 @@ def main():
|
|
|
511
511
|
else:
|
|
512
512
|
print("Skipping baseline (--skip-baseline)")
|
|
513
513
|
|
|
514
|
+
# Compute project_dir relative to git root (for worktree path resolution)
|
|
515
|
+
project_dir = ""
|
|
516
|
+
try:
|
|
517
|
+
git_prefix = subprocess.run(
|
|
518
|
+
["git", "rev-parse", "--show-prefix"],
|
|
519
|
+
capture_output=True, text=True, timeout=5,
|
|
520
|
+
)
|
|
521
|
+
if git_prefix.returncode == 0:
|
|
522
|
+
project_dir = git_prefix.stdout.strip().rstrip("/")
|
|
523
|
+
except Exception:
|
|
524
|
+
pass
|
|
525
|
+
|
|
514
526
|
# Write config
|
|
515
527
|
config = {
|
|
516
528
|
"version": "3.0.0",
|
|
517
529
|
"project": project_name,
|
|
518
530
|
"dataset": dataset_name,
|
|
519
531
|
"dataset_id": str(dataset.id) if dataset else None,
|
|
532
|
+
"project_dir": project_dir,
|
|
520
533
|
"entry_point": args.entry_point,
|
|
521
534
|
"evaluators": evaluator_keys,
|
|
522
535
|
"optimization_goals": goals,
|
|
@@ -216,6 +216,45 @@ def generate_lenses(strategy, config, insights, results, memory, production, max
|
|
|
216
216
|
})
|
|
217
217
|
break # at most 1 persistent failure lens
|
|
218
218
|
|
|
219
|
+
# Uniform failure lens — when there are failing examples but no cluster lenses were generated
|
|
220
|
+
# (e.g., all examples fail with same error like "python: not found")
|
|
221
|
+
failing_examples = strategy.get("failing_examples", [])
|
|
222
|
+
if failing_examples and not any(l["source"] == "failure_cluster" for l in lenses):
|
|
223
|
+
# Check if all errors are the same
|
|
224
|
+
errors = [ex.get("error", "") for ex in failing_examples if ex.get("error")]
|
|
225
|
+
common_error = errors[0] if errors and len(set(errors)) == 1 else None
|
|
226
|
+
if common_error:
|
|
227
|
+
lens_id += 1
|
|
228
|
+
lenses.append({
|
|
229
|
+
"id": lens_id,
|
|
230
|
+
"question": f"All {len(failing_examples)} examples fail with the same error: \"{common_error[:150]}\". Is this a code bug, configuration issue, or environment problem? What's the fix?",
|
|
231
|
+
"source": "uniform_failure",
|
|
232
|
+
"severity": "critical",
|
|
233
|
+
"context": {"error": common_error[:300], "count": len(failing_examples)},
|
|
234
|
+
})
|
|
235
|
+
else:
|
|
236
|
+
# Diverse errors but no clusters — create a general failure lens
|
|
237
|
+
lens_id += 1
|
|
238
|
+
lenses.append({
|
|
239
|
+
"id": lens_id,
|
|
240
|
+
"question": f"{len(failing_examples)} examples are failing with various errors. What are the root causes and what changes would fix the most failures?",
|
|
241
|
+
"source": "failure_analysis",
|
|
242
|
+
"severity": "high",
|
|
243
|
+
"context": {"count": len(failing_examples)},
|
|
244
|
+
})
|
|
245
|
+
|
|
246
|
+
# Input diversity lens — when we have failing examples, suggest investigating by input type
|
|
247
|
+
if failing_examples and len(failing_examples) >= 5 and len(lenses) < max_lenses - 1:
|
|
248
|
+
previews = [ex.get("input_preview", "")[:50] for ex in failing_examples[:5]]
|
|
249
|
+
lens_id += 1
|
|
250
|
+
lenses.append({
|
|
251
|
+
"id": lens_id,
|
|
252
|
+
"question": f"The agent fails on diverse inputs like: {'; '.join(previews[:3])}. Are there different failure modes for different input types?",
|
|
253
|
+
"source": "input_diversity",
|
|
254
|
+
"severity": "medium",
|
|
255
|
+
"context": {"sample_inputs": previews},
|
|
256
|
+
})
|
|
257
|
+
|
|
219
258
|
# Open lens (always included)
|
|
220
259
|
lens_id += 1
|
|
221
260
|
lenses.append({
|
package/tools/trace_insights.py
CHANGED