harness-evolver 3.1.0 → 3.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
1
+ {
2
+ "name": "harness-evolver",
3
+ "description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
4
+ "version": "3.2.0",
5
+ "author": {
6
+ "name": "Raphael Valdetaro"
7
+ },
8
+ "homepage": "https://github.com/raphaelchristi/harness-evolver",
9
+ "repository": "https://github.com/raphaelchristi/harness-evolver",
10
+ "license": "MIT",
11
+ "keywords": [
12
+ "langsmith",
13
+ "optimization",
14
+ "evolution",
15
+ "llm",
16
+ "agent",
17
+ "evaluator",
18
+ "langsmith-cli"
19
+ ]
20
+ }
@@ -0,0 +1,15 @@
1
+ {
2
+ "description": "Harness Evolver — ensures Python deps and env vars are available each session",
3
+ "hooks": {
4
+ "SessionStart": [
5
+ {
6
+ "hooks": [
7
+ {
8
+ "type": "command",
9
+ "command": "bash \"${CLAUDE_PLUGIN_ROOT}/hooks/session-start.sh\""
10
+ }
11
+ ]
12
+ }
13
+ ]
14
+ }
15
+ }
@@ -0,0 +1,71 @@
1
#!/usr/bin/env bash
# Harness Evolver — SessionStart hook
# Ensures Python venv, langsmith, langsmith-cli, and env vars are ready.
# Runs silently on every session start. Installs deps only if missing.
#
# Every provisioning step is best-effort: a failed install must never abort
# the hook, because the final step (exporting EVOLVER_TOOLS / EVOLVER_PY) is
# what the skills rely on. Under `set -e` that means each fallible command
# is explicitly followed by `|| true`.

set -euo pipefail

# Resolve paths — plugin root is set by Claude Code
PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-}"
PLUGIN_DATA="${CLAUDE_PLUGIN_DATA:-}"
# May legitimately be unset (e.g. legacy npx install); read it once with a
# default so later uses are safe under `set -u`.
ENV_FILE="${CLAUDE_ENV_FILE:-}"

# Fallback: if running outside plugin system (npx install), use legacy paths
if [ -z "$PLUGIN_ROOT" ]; then
    PLUGIN_ROOT="$HOME/.evolver"
    PLUGIN_DATA="$HOME/.evolver"
fi

# Plugin root known but no data dir provided: avoid resolving the venv to
# "/venv" by falling back to the legacy data location.
if [ -z "$PLUGIN_DATA" ]; then
    PLUGIN_DATA="$HOME/.evolver"
fi

TOOLS_DIR="$PLUGIN_ROOT/tools"
VENV_DIR="$PLUGIN_DATA/venv"
VENV_PY="$VENV_DIR/bin/python"

# --- 1. Create venv if missing ---
if [ ! -f "$VENV_PY" ]; then
    if command -v uv >/dev/null 2>&1; then
        uv venv "$VENV_DIR" >/dev/null 2>&1 || true
    else
        python3 -m venv "$VENV_DIR" >/dev/null 2>&1 || true
    fi
fi

# --- 2. Install langsmith if missing ---
if [ -f "$VENV_PY" ] && ! "$VENV_PY" -c "import langsmith" 2>/dev/null; then
    if command -v uv >/dev/null 2>&1; then
        uv pip install --python "$VENV_PY" langsmith >/dev/null 2>&1 || true
    else
        "$VENV_DIR/bin/pip" install --upgrade langsmith >/dev/null 2>&1 || \
            "$VENV_PY" -m pip install --upgrade langsmith >/dev/null 2>&1 || \
            true
    fi
fi

# --- 3. Install langsmith-cli if missing ---
command -v langsmith-cli >/dev/null 2>&1 || {
    if command -v uv >/dev/null 2>&1; then
        uv tool install langsmith-cli >/dev/null 2>&1
    else
        pip install langsmith-cli >/dev/null 2>&1 || pip3 install langsmith-cli >/dev/null 2>&1
    fi
} || true

# --- 4. Load API key from credentials file if not in env ---
# Only attempted when there is an env file to write the key into.
if [ -z "${LANGSMITH_API_KEY:-}" ] && [ -n "$ENV_FILE" ]; then
    if [ "$(uname)" = "Darwin" ]; then
        CREDS="$HOME/Library/Application Support/langsmith-cli/credentials"
    else
        CREDS="$HOME/.config/langsmith-cli/credentials"
    fi
    if [ -f "$CREDS" ]; then
        # `|| true`: with pipefail, a grep that matches nothing would
        # otherwise fail the assignment and abort the script under `set -e`.
        KEY=$(grep '^LANGSMITH_API_KEY=' "$CREDS" 2>/dev/null | head -1 | cut -d= -f2- || true)
        if [ -n "$KEY" ]; then
            echo "export LANGSMITH_API_KEY=\"$KEY\"" >> "$ENV_FILE"
        fi
    fi
fi

# --- 5. Export env vars for skills ---
if [ -n "$ENV_FILE" ]; then
    echo "export EVOLVER_TOOLS=\"$TOOLS_DIR\"" >> "$ENV_FILE"
    echo "export EVOLVER_PY=\"$VENV_PY\"" >> "$ENV_FILE"
fi
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "harness-evolver",
3
- "version": "3.1.0",
3
+ "version": "3.2.0",
4
4
  "description": "LangSmith-native autonomous agent optimization for Claude Code",
5
5
  "author": "Raphael Valdetaro",
6
6
  "license": "MIT",
@@ -24,6 +24,8 @@
24
24
  "bin/",
25
25
  "skills/",
26
26
  "agents/",
27
- "tools/"
27
+ "tools/",
28
+ "hooks/",
29
+ ".claude-plugin/"
28
30
  ]
29
31
  }
@@ -16,8 +16,9 @@ Run the autonomous propose-evaluate-iterate loop using LangSmith as the evaluati
16
16
  ## Resolve Tool Path and Python
17
17
 
18
18
  ```bash
19
- TOOLS=$([ -d ".evolver/tools" ] && echo ".evolver/tools" || echo "$HOME/.evolver/tools")
20
- EVOLVER_PY=$([ -f "$HOME/.evolver/venv/bin/python" ] && echo "$HOME/.evolver/venv/bin/python" || echo "python3")
19
+ # Prefer env vars set by plugin hook; fallback to legacy npx paths
20
+ TOOLS="${EVOLVER_TOOLS:-$([ -d ".evolver/tools" ] && echo ".evolver/tools" || echo "$HOME/.evolver/tools")}"
21
+ EVOLVER_PY="${EVOLVER_PY:-$([ -f "$HOME/.evolver/venv/bin/python" ] && echo "$HOME/.evolver/venv/bin/python" || echo "python3")}"
21
22
  ```
22
23
 
23
24
  Use `$EVOLVER_PY` instead of `python3` for ALL tool invocations.
@@ -75,13 +76,15 @@ python3 -c "import json; c=json.load(open('.evolver.json')); print(f'v{c[\"itera
75
76
 
76
77
  ### 1.5. Gather Trace Insights
77
78
 
78
- Run trace insights from the best experiment:
79
+ Read the best experiment from config. If null (no baseline was run), skip trace insights for this iteration — proposers will work blind on the first pass:
79
80
 
80
81
  ```bash
81
- BEST=$(python3 -c "import json; print(json.load(open('.evolver.json'))['best_experiment'])")
82
- $EVOLVER_PY $TOOLS/trace_insights.py \
83
- --from-experiment "$BEST" \
84
- --output trace_insights.json 2>/dev/null
82
+ BEST=$(python3 -c "import json; b=json.load(open('.evolver.json')).get('best_experiment'); print(b if b else '')")
83
+ if [ -n "$BEST" ]; then
84
+ $EVOLVER_PY $TOOLS/trace_insights.py \
85
+ --from-experiment "$BEST" \
86
+ --output trace_insights.json 2>/dev/null
87
+ fi
85
88
  ```
86
89
 
87
90
  If a production project is configured, also gather production insights:
@@ -99,17 +102,20 @@ fi
99
102
 
100
103
  ### 1.8. Analyze Per-Task Failures
101
104
 
102
- Read the best experiment results and cluster failures:
105
+ If `$BEST` is set (not the first iteration without baseline), read results and cluster failures:
103
106
 
104
107
  ```bash
105
- $EVOLVER_PY $TOOLS/read_results.py \
106
- --experiment "$BEST" \
107
- --config .evolver.json \
108
- --output best_results.json 2>/dev/null
108
+ if [ -n "$BEST" ]; then
109
+ $EVOLVER_PY $TOOLS/read_results.py \
110
+ --experiment "$BEST" \
111
+ --config .evolver.json \
112
+ --output best_results.json 2>/dev/null
113
+ fi
109
114
  ```
110
115
 
111
- Parse `best_results.json` to find failing examples (score < 0.7). Group by metadata or error pattern.
116
+ If `best_results.json` exists, parse it to find failing examples (score < 0.7). Group by metadata or error pattern.
112
117
  Generate adaptive briefings for Candidates D and E (same logic as v2).
118
+ If `best_results.json` does not exist (first iteration without baseline), all proposers work from code analysis only — no failure data is available.
113
119
 
114
120
  ### 2. Spawn 5 Proposers in Parallel
115
121
 
@@ -38,8 +38,9 @@ The tools auto-load the key from the credentials file, but the env var takes pre
38
38
  ## Resolve Tool Path and Python
39
39
 
40
40
  ```bash
41
- TOOLS=$([ -d ".evolver/tools" ] && echo ".evolver/tools" || echo "$HOME/.evolver/tools")
42
- EVOLVER_PY=$([ -f "$HOME/.evolver/venv/bin/python" ] && echo "$HOME/.evolver/venv/bin/python" || echo "python3")
41
+ # Prefer env vars set by plugin hook; fallback to legacy npx paths
42
+ TOOLS="${EVOLVER_TOOLS:-$([ -d ".evolver/tools" ] && echo ".evolver/tools" || echo "$HOME/.evolver/tools")}"
43
+ EVOLVER_PY="${EVOLVER_PY:-$([ -f "$HOME/.evolver/venv/bin/python" ] && echo "$HOME/.evolver/venv/bin/python" || echo "python3")}"
43
44
  ```
44
45
 
45
46
  Use `$EVOLVER_PY` instead of `python3` for ALL tool invocations. This ensures the venv with langsmith is used.
package/tools/setup.py CHANGED
@@ -87,6 +87,29 @@ def check_dependencies():
87
87
  return missing
88
88
 
89
89
 
90
def resolve_dataset_name(client, base_name):
    """Find an available dataset name by auto-incrementing the version suffix.

    Tries ``{base_name}-eval-v1``, ``-v2``, ... up to ``-v99`` and returns the
    first candidate that is not among the names yielded by
    ``client.list_datasets()``. If all 99 versions are taken, falls back to a
    UTC-timestamp-based name.

    Listing is best-effort: if ``client.list_datasets()`` raises, a warning is
    printed to stderr and resolution continues with whatever names were
    collected before the failure (possibly none), so the returned name may
    collide with an existing dataset.

    Args:
        client: Object exposing ``list_datasets()``, which yields items that
            have a ``name`` attribute.
        base_name: Prefix used to build the candidate dataset names.

    Returns:
        A ``(resolved_name, version_number)`` tuple. ``version_number`` is
        ``0`` when the timestamp fallback was used.
    """
    existing = set()
    try:
        for ds in client.list_datasets():
            existing.add(ds.name)
    except Exception as exc:
        # Keep going (the caller can still attempt creation), but do not hide
        # the failure: a silently empty listing would always resolve to v1.
        print(
            f"Warning: could not list existing datasets ({exc}); "
            "auto-versioned dataset name may collide with an existing one",
            file=sys.stderr,
        )

    for v in range(1, 100):
        candidate = f"{base_name}-eval-v{v}"
        if candidate not in existing:
            return candidate, v

    # Fallback: timestamp-based
    ts = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
    return f"{base_name}-eval-{ts}", 0
111
+
112
+
90
113
  def create_dataset_from_file(client, dataset_name, file_path):
91
114
  """Create a LangSmith dataset from a JSON file of inputs."""
92
115
  with open(file_path) as f:
@@ -320,6 +343,7 @@ def main():
320
343
  parser.add_argument("--dataset-from-file", default=None, help="Create dataset from JSON file")
321
344
  parser.add_argument("--dataset-from-langsmith", default=None, help="Create dataset from LangSmith project")
322
345
  parser.add_argument("--production-project", default=None, help="Production LangSmith project")
346
+ parser.add_argument("--dataset-name", default=None, help="Explicit dataset name (skip auto-versioning)")
323
347
  parser.add_argument("--evaluators", default=None, help="Comma-separated evaluator names")
324
348
  parser.add_argument("--skip-baseline", action="store_true", help="Skip baseline evaluation")
325
349
  parser.add_argument("--output", default=".evolver.json", help="Output config path")
@@ -351,9 +375,19 @@ def main():
351
375
  sys.exit(1)
352
376
 
353
377
  project_name = f"evolver-{args.project_name}"
354
- dataset_name = f"{args.project_name}-eval-v1"
355
378
  goals = [g.strip() for g in args.goals.split(",")]
356
379
 
380
+ # Resolve dataset name (explicit or auto-versioned)
381
+ if args.dataset_name:
382
+ dataset_name = args.dataset_name
383
+ print(f"Using explicit dataset name: '{dataset_name}'")
384
+ else:
385
+ dataset_name, version = resolve_dataset_name(client, args.project_name)
386
+ if version > 1:
387
+ print(f"Dataset name auto-versioned to '{dataset_name}' (v1-v{version-1} already exist)")
388
+ else:
389
+ print(f"Dataset: '{dataset_name}'")
390
+
357
391
  # Create dataset
358
392
  print(f"Creating dataset '{dataset_name}'...")
359
393
  if args.dataset_from_file: