harness-evolver 3.1.0 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +20 -0
- package/hooks/hooks.json +15 -0
- package/hooks/session-start.sh +71 -0
- package/package.json +4 -2
- package/skills/evolve/SKILL.md +19 -13
- package/skills/setup/SKILL.md +3 -2
- package/tools/setup.py +35 -1
package/.claude-plugin/plugin.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "harness-evolver",
|
|
3
|
+
"description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
|
|
4
|
+
"version": "3.2.0",
|
|
5
|
+
"author": {
|
|
6
|
+
"name": "Raphael Valdetaro"
|
|
7
|
+
},
|
|
8
|
+
"homepage": "https://github.com/raphaelchristi/harness-evolver",
|
|
9
|
+
"repository": "https://github.com/raphaelchristi/harness-evolver",
|
|
10
|
+
"license": "MIT",
|
|
11
|
+
"keywords": [
|
|
12
|
+
"langsmith",
|
|
13
|
+
"optimization",
|
|
14
|
+
"evolution",
|
|
15
|
+
"llm",
|
|
16
|
+
"agent",
|
|
17
|
+
"evaluator",
|
|
18
|
+
"langsmith-cli"
|
|
19
|
+
]
|
|
20
|
+
}
|
package/hooks/hooks.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"description": "Harness Evolver — ensures Python deps and env vars are available each session",
|
|
3
|
+
"hooks": {
|
|
4
|
+
"SessionStart": [
|
|
5
|
+
{
|
|
6
|
+
"hooks": [
|
|
7
|
+
{
|
|
8
|
+
"type": "command",
|
|
9
|
+
"command": "bash \"${CLAUDE_PLUGIN_ROOT}/hooks/session-start.sh\""
|
|
10
|
+
}
|
|
11
|
+
]
|
|
12
|
+
}
|
|
13
|
+
]
|
|
14
|
+
}
|
|
15
|
+
}
|
package/hooks/session-start.sh
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Harness Evolver — SessionStart hook
# Ensures Python venv, langsmith, langsmith-cli, and env vars are ready.
# Runs silently on every session start. Installs deps only if missing.
#
# Every provisioning step is best-effort: under `set -e` an unguarded failure
# (offline install, missing python3, no key in the credentials file) would
# abort the hook before the env-var export at the bottom, so each step that
# may legitimately fail is explicitly guarded.

set -euo pipefail

# Resolve paths — plugin root is set by Claude Code
PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-}"
PLUGIN_DATA="${CLAUDE_PLUGIN_DATA:-}"
# Read once with a default so `set -u` never trips on an unset variable.
ENV_FILE="${CLAUDE_ENV_FILE:-}"

# Fallback: if running outside plugin system (npx install), use legacy paths
if [ -z "$PLUGIN_ROOT" ]; then
  PLUGIN_ROOT="$HOME/.evolver"
  PLUGIN_DATA="$HOME/.evolver"
elif [ -z "$PLUGIN_DATA" ]; then
  # Plugin root known but no data dir provided: without this the venv path
  # would collapse to "/venv". Use the legacy data location instead.
  PLUGIN_DATA="$HOME/.evolver"
fi

TOOLS_DIR="$PLUGIN_ROOT/tools"
VENV_DIR="$PLUGIN_DATA/venv"
VENV_PY="$VENV_DIR/bin/python"

# --- 1. Create venv if missing ---
if [ ! -f "$VENV_PY" ]; then
  if command -v uv >/dev/null 2>&1; then
    uv venv "$VENV_DIR" >/dev/null 2>&1 || true
  else
    python3 -m venv "$VENV_DIR" >/dev/null 2>&1 || true
  fi
fi

# --- 2. Install langsmith if missing ---
# Skipped entirely when the venv could not be created above.
if [ -f "$VENV_PY" ]; then
  "$VENV_PY" -c "import langsmith" 2>/dev/null || {
    if command -v uv >/dev/null 2>&1; then
      uv pip install --python "$VENV_PY" langsmith >/dev/null 2>&1
    else
      "$VENV_DIR/bin/pip" install --upgrade langsmith >/dev/null 2>&1 || \
        "$VENV_PY" -m pip install --upgrade langsmith >/dev/null 2>&1
    fi
  } || true
fi

# --- 3. Install langsmith-cli if missing ---
command -v langsmith-cli >/dev/null 2>&1 || {
  if command -v uv >/dev/null 2>&1; then
    uv tool install langsmith-cli >/dev/null 2>&1
  else
    pip install langsmith-cli >/dev/null 2>&1 || pip3 install langsmith-cli >/dev/null 2>&1
  fi
} || true

# --- 4. Load API key from credentials file if not in env ---
# Only meaningful when there is an env file to write the key into.
if [ -n "$ENV_FILE" ] && [ -z "${LANGSMITH_API_KEY:-}" ]; then
  if [ "$(uname)" = "Darwin" ]; then
    CREDS="$HOME/Library/Application Support/langsmith-cli/credentials"
  else
    CREDS="$HOME/.config/langsmith-cli/credentials"
  fi
  if [ -f "$CREDS" ]; then
    # grep exits 1 when the key line is absent; with pipefail + errexit that
    # would terminate the hook, so the pipeline is allowed to fail.
    KEY=$(grep '^LANGSMITH_API_KEY=' "$CREDS" 2>/dev/null | head -1 | cut -d= -f2- || true)
    if [ -n "$KEY" ]; then
      echo "export LANGSMITH_API_KEY=\"$KEY\"" >> "$ENV_FILE"
    fi
  fi
fi

# --- 5. Export env vars for skills ---
if [ -n "$ENV_FILE" ]; then
  echo "export EVOLVER_TOOLS=\"$TOOLS_DIR\"" >> "$ENV_FILE"
  echo "export EVOLVER_PY=\"$VENV_PY\"" >> "$ENV_FILE"
fi
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "harness-evolver",
|
|
3
|
-
"version": "3.1.0",
|
|
3
|
+
"version": "3.2.0",
|
|
4
4
|
"description": "LangSmith-native autonomous agent optimization for Claude Code",
|
|
5
5
|
"author": "Raphael Valdetaro",
|
|
6
6
|
"license": "MIT",
|
|
@@ -24,6 +24,8 @@
|
|
|
24
24
|
"bin/",
|
|
25
25
|
"skills/",
|
|
26
26
|
"agents/",
|
|
27
|
-
"tools/"
|
|
27
|
+
"tools/",
|
|
28
|
+
"hooks/",
|
|
29
|
+
".claude-plugin/"
|
|
28
30
|
]
|
|
29
31
|
}
|
package/skills/evolve/SKILL.md
CHANGED
|
@@ -16,8 +16,9 @@ Run the autonomous propose-evaluate-iterate loop using LangSmith as the evaluati
|
|
|
16
16
|
## Resolve Tool Path and Python
|
|
17
17
|
|
|
18
18
|
```bash
|
|
19
|
-
|
|
20
|
-
|
|
19
|
+
# Prefer env vars set by plugin hook; fallback to legacy npx paths
|
|
20
|
+
TOOLS="${EVOLVER_TOOLS:-$([ -d ".evolver/tools" ] && echo ".evolver/tools" || echo "$HOME/.evolver/tools")}"
|
|
21
|
+
EVOLVER_PY="${EVOLVER_PY:-$([ -f "$HOME/.evolver/venv/bin/python" ] && echo "$HOME/.evolver/venv/bin/python" || echo "python3")}"
|
|
21
22
|
```
|
|
22
23
|
|
|
23
24
|
Use `$EVOLVER_PY` instead of `python3` for ALL tool invocations.
|
|
@@ -75,13 +76,15 @@ python3 -c "import json; c=json.load(open('.evolver.json')); print(f'v{c[\"itera
|
|
|
75
76
|
|
|
76
77
|
### 1.5. Gather Trace Insights
|
|
77
78
|
|
|
78
|
-
|
|
79
|
+
Read the best experiment from config. If null (no baseline was run), skip trace insights for this iteration — proposers will work blind on the first pass:
|
|
79
80
|
|
|
80
81
|
```bash
|
|
81
|
-
BEST=$(python3 -c "import json;
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
82
|
+
BEST=$(python3 -c "import json; b=json.load(open('.evolver.json')).get('best_experiment'); print(b if b else '')")
|
|
83
|
+
if [ -n "$BEST" ]; then
|
|
84
|
+
$EVOLVER_PY $TOOLS/trace_insights.py \
|
|
85
|
+
--from-experiment "$BEST" \
|
|
86
|
+
--output trace_insights.json 2>/dev/null
|
|
87
|
+
fi
|
|
85
88
|
```
|
|
86
89
|
|
|
87
90
|
If a production project is configured, also gather production insights:
|
|
@@ -99,17 +102,20 @@ fi
|
|
|
99
102
|
|
|
100
103
|
### 1.8. Analyze Per-Task Failures
|
|
101
104
|
|
|
102
|
-
|
|
105
|
+
If `$BEST` is set (not the first iteration without baseline), read results and cluster failures:
|
|
103
106
|
|
|
104
107
|
```bash
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
108
|
+
if [ -n "$BEST" ]; then
|
|
109
|
+
$EVOLVER_PY $TOOLS/read_results.py \
|
|
110
|
+
--experiment "$BEST" \
|
|
111
|
+
--config .evolver.json \
|
|
112
|
+
--output best_results.json 2>/dev/null
|
|
113
|
+
fi
|
|
109
114
|
```
|
|
110
115
|
|
|
111
|
-
|
|
116
|
+
If `best_results.json` exists, parse it to find failing examples (score < 0.7). Group by metadata or error pattern.
|
|
112
117
|
Generate adaptive briefings for Candidates D and E (same logic as v2).
|
|
118
|
+
If no best_results.json (first iteration without baseline), all proposers work from code analysis only — no failure data available.
|
|
113
119
|
|
|
114
120
|
### 2. Spawn 5 Proposers in Parallel
|
|
115
121
|
|
package/skills/setup/SKILL.md
CHANGED
|
@@ -38,8 +38,9 @@ The tools auto-load the key from the credentials file, but the env var takes pre
|
|
|
38
38
|
## Resolve Tool Path and Python
|
|
39
39
|
|
|
40
40
|
```bash
|
|
41
|
-
|
|
42
|
-
|
|
41
|
+
# Prefer env vars set by plugin hook; fallback to legacy npx paths
|
|
42
|
+
TOOLS="${EVOLVER_TOOLS:-$([ -d ".evolver/tools" ] && echo ".evolver/tools" || echo "$HOME/.evolver/tools")}"
|
|
43
|
+
EVOLVER_PY="${EVOLVER_PY:-$([ -f "$HOME/.evolver/venv/bin/python" ] && echo "$HOME/.evolver/venv/bin/python" || echo "python3")}"
|
|
43
44
|
```
|
|
44
45
|
|
|
45
46
|
Use `$EVOLVER_PY` instead of `python3` for ALL tool invocations. This ensures the venv with langsmith is used.
|
package/tools/setup.py
CHANGED
|
@@ -87,6 +87,29 @@ def check_dependencies():
|
|
|
87
87
|
return missing
|
|
88
88
|
|
|
89
89
|
|
|
90
|
+
def resolve_dataset_name(client, base_name):
    """Find an available dataset name by auto-incrementing the version suffix.

    Tries ``{base_name}-eval-v1``, ``-v2``, ... ``-v99`` and returns the first
    candidate that is not among the names reported by
    ``client.list_datasets()``. If all 99 versions are taken, falls back to a
    UTC-timestamp suffix.

    Args:
        client: Object exposing ``list_datasets()``, which yields items with a
            ``name`` attribute.
        base_name: Prefix used to build the candidate dataset names.

    Returns:
        Tuple ``(resolved_name, version_number)``. ``version_number`` is 0
        when the timestamp fallback was used.
    """
    existing = set()
    try:
        for ds in client.list_datasets():
            existing.add(ds.name)
    except Exception as exc:
        # Best-effort: a failed listing must not block setup, but the user
        # should know the chosen name was not checked against the complete
        # set of datasets (names collected before the failure are kept).
        print(
            f"Warning: could not list existing datasets ({exc}); "
            "dataset name may collide with an existing one",
            file=sys.stderr,
        )

    for v in range(1, 100):
        candidate = f"{base_name}-eval-v{v}"
        if candidate not in existing:
            return candidate, v

    # Fallback: timestamp-based
    ts = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
    return f"{base_name}-eval-{ts}", 0
|
|
111
|
+
|
|
112
|
+
|
|
90
113
|
def create_dataset_from_file(client, dataset_name, file_path):
|
|
91
114
|
"""Create a LangSmith dataset from a JSON file of inputs."""
|
|
92
115
|
with open(file_path) as f:
|
|
@@ -320,6 +343,7 @@ def main():
|
|
|
320
343
|
parser.add_argument("--dataset-from-file", default=None, help="Create dataset from JSON file")
|
|
321
344
|
parser.add_argument("--dataset-from-langsmith", default=None, help="Create dataset from LangSmith project")
|
|
322
345
|
parser.add_argument("--production-project", default=None, help="Production LangSmith project")
|
|
346
|
+
parser.add_argument("--dataset-name", default=None, help="Explicit dataset name (skip auto-versioning)")
|
|
323
347
|
parser.add_argument("--evaluators", default=None, help="Comma-separated evaluator names")
|
|
324
348
|
parser.add_argument("--skip-baseline", action="store_true", help="Skip baseline evaluation")
|
|
325
349
|
parser.add_argument("--output", default=".evolver.json", help="Output config path")
|
|
@@ -351,9 +375,19 @@ def main():
|
|
|
351
375
|
sys.exit(1)
|
|
352
376
|
|
|
353
377
|
project_name = f"evolver-{args.project_name}"
|
|
354
|
-
dataset_name = f"{args.project_name}-eval-v1"
|
|
355
378
|
goals = [g.strip() for g in args.goals.split(",")]
|
|
356
379
|
|
|
380
|
+
# Resolve dataset name (explicit or auto-versioned)
|
|
381
|
+
if args.dataset_name:
|
|
382
|
+
dataset_name = args.dataset_name
|
|
383
|
+
print(f"Using explicit dataset name: '{dataset_name}'")
|
|
384
|
+
else:
|
|
385
|
+
dataset_name, version = resolve_dataset_name(client, args.project_name)
|
|
386
|
+
if version > 1:
|
|
387
|
+
print(f"Dataset name auto-versioned to '{dataset_name}' (v1-v{version-1} already exist)")
|
|
388
|
+
else:
|
|
389
|
+
print(f"Dataset: '{dataset_name}'")
|
|
390
|
+
|
|
357
391
|
# Create dataset
|
|
358
392
|
print(f"Creating dataset '{dataset_name}'...")
|
|
359
393
|
if args.dataset_from_file:
|