harness-evolver 3.1.1 → 3.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +20 -0
- package/README.md +50 -9
- package/agents/evolver-evaluator.md +2 -2
- package/agents/evolver-proposer.md +2 -1
- package/hooks/hooks.json +15 -0
- package/hooks/session-start.sh +71 -0
- package/package.json +4 -2
- package/skills/evolve/SKILL.md +4 -3
- package/skills/setup/SKILL.md +11 -5
- package/tools/read_results.py +14 -1
- package/tools/run_eval.py +33 -6
- package/tools/setup.py +2 -0
- package/tools/trace_insights.py +37 -0
|
package/.claude-plugin/plugin.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "harness-evolver",
|
|
3
|
+
"description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
|
|
4
|
+
"version": "3.2.1",
|
|
5
|
+
"author": {
|
|
6
|
+
"name": "Raphael Valdetaro"
|
|
7
|
+
},
|
|
8
|
+
"homepage": "https://github.com/raphaelchristi/harness-evolver",
|
|
9
|
+
"repository": "https://github.com/raphaelchristi/harness-evolver",
|
|
10
|
+
"license": "MIT",
|
|
11
|
+
"keywords": [
|
|
12
|
+
"langsmith",
|
|
13
|
+
"optimization",
|
|
14
|
+
"evolution",
|
|
15
|
+
"llm",
|
|
16
|
+
"agent",
|
|
17
|
+
"evaluator",
|
|
18
|
+
"langsmith-cli"
|
|
19
|
+
]
|
|
20
|
+
}
|
package/README.md
CHANGED
|
@@ -19,11 +19,24 @@ Inspired by [Meta-Harness](https://yoonholee.com/meta-harness/) (Lee et al., 202
|
|
|
19
19
|
|
|
20
20
|
## Install
|
|
21
21
|
|
|
22
|
+
### Claude Code Plugin (recommended)
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
/plugin marketplace add raphaelchristi/harness-evolver-marketplace
|
|
26
|
+
/plugin install harness-evolver
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Updates are automatic. Python dependencies (langsmith, langsmith-cli) are installed on first session start via hook.
|
|
30
|
+
|
|
31
|
+
### npx (first-time setup or non-Claude Code runtimes)
|
|
32
|
+
|
|
22
33
|
```bash
|
|
23
34
|
npx harness-evolver@latest
|
|
24
35
|
```
|
|
25
36
|
|
|
26
|
-
|
|
37
|
+
Interactive installer that configures LangSmith API key, creates Python venv, and installs all dependencies. Works with Claude Code, Cursor, Codex, and Windsurf.
|
|
38
|
+
|
|
39
|
+
> **Both install paths work together.** Use npx for initial setup (API key, venv), then the plugin marketplace handles updates automatically.
|
|
27
40
|
|
|
28
41
|
---
|
|
29
42
|
|
|
@@ -58,6 +71,10 @@ claude
|
|
|
58
71
|
<td>Each iteration spawns 5 parallel agents: exploit, explore, crossover, and 2 failure-targeted. Strategies adapt based on per-task analysis. Quality-diversity selection preserves per-task champions.</td>
|
|
59
72
|
</tr>
|
|
60
73
|
<tr>
|
|
74
|
+
<td><b>Agent-Based Evaluation</b></td>
|
|
75
|
+
<td>The evaluator agent reads experiment outputs via langsmith-cli, judges correctness using the same Claude model powering the other agents, and writes scores back. No OpenAI API key or openevals dependency needed.</td>
|
|
76
|
+
</tr>
|
|
77
|
+
<tr>
|
|
61
78
|
<td><b>Production Traces</b></td>
|
|
62
79
|
<td>Auto-discovers existing LangSmith production projects. Uses real user inputs for test generation and real error patterns for targeted optimization.</td>
|
|
63
80
|
</tr>
|
|
@@ -89,10 +106,10 @@ claude
|
|
|
89
106
|
| Agent | Role | Color |
|
|
90
107
|
|---|---|---|
|
|
91
108
|
| **Proposer** | Modifies agent code in isolated worktrees based on trace analysis | Green |
|
|
109
|
+
| **Evaluator** | LLM-as-judge — reads outputs via langsmith-cli, scores correctness | Yellow |
|
|
92
110
|
| **Architect** | Recommends multi-agent topology changes | Blue |
|
|
93
111
|
| **Critic** | Validates evaluator quality, detects gaming | Red |
|
|
94
112
|
| **TestGen** | Generates test inputs for LangSmith datasets | Cyan |
|
|
95
|
-
| **Evaluator** | LLM-as-judge — reads outputs via langsmith-cli, scores correctness | Yellow |
|
|
96
113
|
|
|
97
114
|
---
|
|
98
115
|
|
|
@@ -118,19 +135,43 @@ claude
|
|
|
118
135
|
|
|
119
136
|
---
|
|
120
137
|
|
|
138
|
+
## Architecture
|
|
139
|
+
|
|
140
|
+
```
|
|
141
|
+
Plugin hook (SessionStart)
|
|
142
|
+
└→ Creates venv, installs langsmith + langsmith-cli, exports env vars
|
|
143
|
+
|
|
144
|
+
Skills (markdown)
|
|
145
|
+
├── /evolver:setup → explores project, runs setup.py
|
|
146
|
+
├── /evolver:evolve → orchestrates the evolution loop
|
|
147
|
+
├── /evolver:status → reads .evolver.json + LangSmith
|
|
148
|
+
└── /evolver:deploy → tags and pushes
|
|
149
|
+
|
|
150
|
+
Agents (markdown)
|
|
151
|
+
├── Proposer (x5) → modifies code in git worktrees
|
|
152
|
+
├── Evaluator → LLM-as-judge via langsmith-cli
|
|
153
|
+
├── Critic → detects evaluator gaming
|
|
154
|
+
├── Architect → recommends topology changes
|
|
155
|
+
└── TestGen → generates test inputs
|
|
156
|
+
|
|
157
|
+
Tools (Python + langsmith SDK)
|
|
158
|
+
├── setup.py → creates datasets, configures evaluators
|
|
159
|
+
├── run_eval.py → runs target against dataset
|
|
160
|
+
├── read_results.py → compares experiments
|
|
161
|
+
├── trace_insights.py → clusters errors from traces
|
|
162
|
+
└── seed_from_traces.py → imports production traces
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
121
167
|
## Requirements
|
|
122
168
|
|
|
123
169
|
- **LangSmith account** + `LANGSMITH_API_KEY`
|
|
124
|
-
- **Python 3.10+**
|
|
125
|
-
- **langsmith-cli** (`uv tool install langsmith-cli`) — required for evaluator agent
|
|
170
|
+
- **Python 3.10+**
|
|
126
171
|
- **Git** (for worktree-based isolation)
|
|
127
172
|
- **Claude Code** (or Cursor/Codex/Windsurf)
|
|
128
173
|
|
|
129
|
-
|
|
130
|
-
export LANGSMITH_API_KEY="lsv2_pt_..."
|
|
131
|
-
pip install langsmith
|
|
132
|
-
uv tool install langsmith-cli
|
|
133
|
-
```
|
|
174
|
+
Dependencies (`langsmith`, `langsmith-cli`) are installed automatically by the plugin hook or the npx installer.
|
|
134
175
|
|
|
135
176
|
---
|
|
136
177
|
|
|
package/agents/evolver-evaluator.md
CHANGED
|
@@ -37,7 +37,7 @@ You interact with LangSmith exclusively through `langsmith-cli`. Always use `--j
|
|
|
37
37
|
langsmith-cli --json runs list \
|
|
38
38
|
--project "{experiment_name}" \
|
|
39
39
|
--fields id,inputs,outputs,error,reference_example_id \
|
|
40
|
-
--is-root \
|
|
40
|
+
--is-root true \
|
|
41
41
|
--limit 200
|
|
42
42
|
```
|
|
43
43
|
|
|
@@ -72,7 +72,7 @@ Fetch all runs from the experiment. Save the output to a file for reference:
|
|
|
72
72
|
langsmith-cli --json runs list \
|
|
73
73
|
--project "{experiment_name}" \
|
|
74
74
|
--fields id,inputs,outputs,error,reference_example_id \
|
|
75
|
-
--is-root --limit 200 \
|
|
75
|
+
--is-root true --limit 200 \
|
|
76
76
|
--output experiment_runs.jsonl
|
|
77
77
|
```
|
|
78
78
|
|
|
package/agents/evolver-proposer.md
CHANGED
|
@@ -97,9 +97,10 @@ Ask about the SPECIFIC API you're going to use or change.
|
|
|
97
97
|
|
|
98
98
|
1. **Commit all changes** with a descriptive message:
|
|
99
99
|
```bash
|
|
100
|
-
git add -A
|
|
100
|
+
git add -A -- ':!.venv' ':!venv' ':!node_modules'
|
|
101
101
|
git commit -m "evolver: {brief description of changes}"
|
|
102
102
|
```
|
|
103
|
+
**CRITICAL**: Never commit `.venv`, `venv`, or `node_modules`. Symlinks to these in worktrees will break the main branch if merged.
|
|
103
104
|
|
|
104
105
|
2. **Write proposal.md** explaining:
|
|
105
106
|
- What you changed and why
|
package/hooks/hooks.json
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"description": "Harness Evolver — ensures Python deps and env vars are available each session",
|
|
3
|
+
"hooks": {
|
|
4
|
+
"SessionStart": [
|
|
5
|
+
{
|
|
6
|
+
"hooks": [
|
|
7
|
+
{
|
|
8
|
+
"type": "command",
|
|
9
|
+
"command": "bash \"${CLAUDE_PLUGIN_ROOT}/hooks/session-start.sh\""
|
|
10
|
+
}
|
|
11
|
+
]
|
|
12
|
+
}
|
|
13
|
+
]
|
|
14
|
+
}
|
|
15
|
+
}
|
|
package/hooks/session-start.sh
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Harness Evolver — SessionStart hook
|
|
3
|
+
# Ensures Python venv, langsmith, langsmith-cli, and env vars are ready.
|
|
4
|
+
# Runs silently on every session start. Installs deps only if missing.
|
|
5
|
+
|
|
6
|
+
set -euo pipefail
|
|
7
|
+
|
|
8
|
+
# Resolve paths — plugin root is set by Claude Code
|
|
9
|
+
PLUGIN_ROOT="${CLAUDE_PLUGIN_ROOT:-}"
|
|
10
|
+
PLUGIN_DATA="${CLAUDE_PLUGIN_DATA:-}"
|
|
11
|
+
|
|
12
|
+
# Fallback: if running outside plugin system (npx install), use legacy paths
|
|
13
|
+
if [ -z "$PLUGIN_ROOT" ]; then
|
|
14
|
+
PLUGIN_ROOT="$HOME/.evolver"
|
|
15
|
+
PLUGIN_DATA="$HOME/.evolver"
|
|
16
|
+
fi
|
|
17
|
+
|
|
18
|
+
TOOLS_DIR="$PLUGIN_ROOT/tools"
|
|
19
|
+
VENV_DIR="$PLUGIN_DATA/venv"
|
|
20
|
+
VENV_PY="$VENV_DIR/bin/python"
|
|
21
|
+
|
|
22
|
+
# --- 1. Create venv if missing ---
|
|
23
|
+
if [ ! -f "$VENV_PY" ]; then
|
|
24
|
+
if command -v uv >/dev/null 2>&1; then
|
|
25
|
+
uv venv "$VENV_DIR" >/dev/null 2>&1
|
|
26
|
+
else
|
|
27
|
+
python3 -m venv "$VENV_DIR" >/dev/null 2>&1
|
|
28
|
+
fi
|
|
29
|
+
fi
|
|
30
|
+
|
|
31
|
+
# --- 2. Install langsmith if missing ---
|
|
32
|
+
if [ -f "$VENV_PY" ]; then
|
|
33
|
+
"$VENV_PY" -c "import langsmith" 2>/dev/null || {
|
|
34
|
+
if command -v uv >/dev/null 2>&1; then
|
|
35
|
+
uv pip install --python "$VENV_PY" langsmith >/dev/null 2>&1
|
|
36
|
+
else
|
|
37
|
+
"$VENV_DIR/bin/pip" install --upgrade langsmith >/dev/null 2>&1 || \
|
|
38
|
+
"$VENV_PY" -m pip install --upgrade langsmith >/dev/null 2>&1
|
|
39
|
+
fi
|
|
40
|
+
}
|
|
41
|
+
fi
|
|
42
|
+
|
|
43
|
+
# --- 3. Install langsmith-cli if missing ---
|
|
44
|
+
command -v langsmith-cli >/dev/null 2>&1 || {
|
|
45
|
+
if command -v uv >/dev/null 2>&1; then
|
|
46
|
+
uv tool install langsmith-cli >/dev/null 2>&1
|
|
47
|
+
else
|
|
48
|
+
pip install langsmith-cli >/dev/null 2>&1 || pip3 install langsmith-cli >/dev/null 2>&1
|
|
49
|
+
fi
|
|
50
|
+
} || true
|
|
51
|
+
|
|
52
|
+
# --- 4. Load API key from credentials file if not in env ---
|
|
53
|
+
if [ -z "${LANGSMITH_API_KEY:-}" ]; then
|
|
54
|
+
if [ "$(uname)" = "Darwin" ]; then
|
|
55
|
+
CREDS="$HOME/Library/Application Support/langsmith-cli/credentials"
|
|
56
|
+
else
|
|
57
|
+
CREDS="$HOME/.config/langsmith-cli/credentials"
|
|
58
|
+
fi
|
|
59
|
+
if [ -f "$CREDS" ]; then
|
|
60
|
+
KEY=$(grep '^LANGSMITH_API_KEY=' "$CREDS" 2>/dev/null | head -1 | cut -d= -f2-)
|
|
61
|
+
if [ -n "$KEY" ]; then
|
|
62
|
+
echo "export LANGSMITH_API_KEY=\"$KEY\"" >> "$CLAUDE_ENV_FILE"
|
|
63
|
+
fi
|
|
64
|
+
fi
|
|
65
|
+
fi
|
|
66
|
+
|
|
67
|
+
# --- 5. Export env vars for skills ---
|
|
68
|
+
if [ -n "${CLAUDE_ENV_FILE:-}" ]; then
|
|
69
|
+
echo "export EVOLVER_TOOLS=\"$TOOLS_DIR\"" >> "$CLAUDE_ENV_FILE"
|
|
70
|
+
echo "export EVOLVER_PY=\"$VENV_PY\"" >> "$CLAUDE_ENV_FILE"
|
|
71
|
+
fi
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "harness-evolver",
|
|
3
|
-
"version": "3.1.1",
|
|
3
|
+
"version": "3.2.1",
|
|
4
4
|
"description": "LangSmith-native autonomous agent optimization for Claude Code",
|
|
5
5
|
"author": "Raphael Valdetaro",
|
|
6
6
|
"license": "MIT",
|
|
@@ -24,6 +24,8 @@
|
|
|
24
24
|
"bin/",
|
|
25
25
|
"skills/",
|
|
26
26
|
"agents/",
|
|
27
|
-
"tools/"
|
|
27
|
+
"tools/",
|
|
28
|
+
"hooks/",
|
|
29
|
+
".claude-plugin/"
|
|
28
30
|
]
|
|
29
31
|
}
|
package/skills/evolve/SKILL.md
CHANGED
|
@@ -16,8 +16,9 @@ Run the autonomous propose-evaluate-iterate loop using LangSmith as the evaluati
|
|
|
16
16
|
## Resolve Tool Path and Python
|
|
17
17
|
|
|
18
18
|
```bash
|
|
19
|
-
|
|
20
|
-
|
|
19
|
+
# Prefer env vars set by plugin hook; fallback to legacy npx paths
|
|
20
|
+
TOOLS="${EVOLVER_TOOLS:-$([ -d ".evolver/tools" ] && echo ".evolver/tools" || echo "$HOME/.evolver/tools")}"
|
|
21
|
+
EVOLVER_PY="${EVOLVER_PY:-$([ -f "$HOME/.evolver/venv/bin/python" ] && echo "$HOME/.evolver/venv/bin/python" || echo "python3")}"
|
|
21
22
|
```
|
|
22
23
|
|
|
23
24
|
Use `$EVOLVER_PY` instead of `python3` for ALL tool invocations.
|
|
@@ -234,7 +235,7 @@ Agent(
|
|
|
234
235
|
Entry point: {entry_point}
|
|
235
236
|
|
|
236
237
|
For each experiment:
|
|
237
|
-
1. Read all runs via: langsmith-cli --json runs list --project "{experiment_name}" --fields id,inputs,outputs,error --is-root --limit 200
|
|
238
|
+
1. Read all runs via: langsmith-cli --json runs list --project "{experiment_name}" --fields id,inputs,outputs,error --is-root true --limit 200
|
|
238
239
|
2. Judge each run's output against the input
|
|
239
240
|
3. Write scores via: langsmith-cli --json feedback create {run_id} --key {evaluator} --score {0.0|1.0} --comment "{reason}" --source model
|
|
240
241
|
</context>
|
package/skills/setup/SKILL.md
CHANGED
|
@@ -38,8 +38,9 @@ The tools auto-load the key from the credentials file, but the env var takes pre
|
|
|
38
38
|
## Resolve Tool Path and Python
|
|
39
39
|
|
|
40
40
|
```bash
|
|
41
|
-
|
|
42
|
-
|
|
41
|
+
# Prefer env vars set by plugin hook; fallback to legacy npx paths
|
|
42
|
+
TOOLS="${EVOLVER_TOOLS:-$([ -d ".evolver/tools" ] && echo ".evolver/tools" || echo "$HOME/.evolver/tools")}"
|
|
43
|
+
EVOLVER_PY="${EVOLVER_PY:-$([ -f "$HOME/.evolver/venv/bin/python" ] && echo "$HOME/.evolver/venv/bin/python" || echo "python3")}"
|
|
43
44
|
```
|
|
44
45
|
|
|
45
46
|
Use `$EVOLVER_PY` instead of `python3` for ALL tool invocations. This ensures the venv with langsmith is used.
|
|
@@ -60,9 +61,14 @@ Look for:
|
|
|
60
61
|
|
|
61
62
|
To identify the **framework**, read the entry point file and its immediate imports. The proposer agents will use Context7 MCP for detailed documentation lookup — you don't need to detect every library, just identify the main framework (LangGraph, CrewAI, OpenAI Agents SDK, etc.) from the imports you see.
|
|
62
63
|
|
|
63
|
-
Identify the **run command** — how to execute the agent:
|
|
64
|
-
- `python main.py`
|
|
65
|
-
-
|
|
64
|
+
Identify the **run command** — how to execute the agent. Use `{input}` as a placeholder for the JSON file path:
|
|
65
|
+
- `python main.py {input}` — agent reads JSON file from positional arg
|
|
66
|
+
- `python main.py --input {input}` — agent reads JSON file from `--input` flag
|
|
67
|
+
- `python main.py --query {input_json}` — agent receives inline JSON string
|
|
68
|
+
|
|
69
|
+
The runner writes `{"input": "user question..."}` to a temp `.json` file and replaces `{input}` with the file path. If the entry point already contains `--input` (without placeholder), the runner appends the file path as the next argument.
|
|
70
|
+
|
|
71
|
+
If no placeholder and no `--input` flag detected, the runner appends `--input <path> --output <path>`.
|
|
66
72
|
|
|
67
73
|
## Phase 2: Confirm Detection (interactive)
|
|
68
74
|
|
package/tools/read_results.py
CHANGED
|
@@ -26,7 +26,7 @@ import sys
|
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
def ensure_langsmith_api_key():
|
|
29
|
-
"""Load LANGSMITH_API_KEY from credentials file if not in env."""
|
|
29
|
+
"""Load LANGSMITH_API_KEY from credentials file or .env if not in env."""
|
|
30
30
|
if os.environ.get("LANGSMITH_API_KEY"):
|
|
31
31
|
return True
|
|
32
32
|
if platform.system() == "Darwin":
|
|
@@ -45,6 +45,19 @@ def ensure_langsmith_api_key():
|
|
|
45
45
|
return True
|
|
46
46
|
except OSError:
|
|
47
47
|
pass
|
|
48
|
+
# Also check .env in current directory
|
|
49
|
+
if os.path.exists(".env"):
|
|
50
|
+
try:
|
|
51
|
+
with open(".env") as f:
|
|
52
|
+
for line in f:
|
|
53
|
+
line = line.strip()
|
|
54
|
+
if line.startswith("LANGSMITH_API_KEY=") and not line.startswith("#"):
|
|
55
|
+
key = line.split("=", 1)[1].strip().strip("'\"")
|
|
56
|
+
if key:
|
|
57
|
+
os.environ["LANGSMITH_API_KEY"] = key
|
|
58
|
+
return True
|
|
59
|
+
except OSError:
|
|
60
|
+
pass
|
|
48
61
|
return False
|
|
49
62
|
|
|
50
63
|
|
package/tools/run_eval.py
CHANGED
|
@@ -73,10 +73,16 @@ def make_target(entry_point, cwd):
|
|
|
73
73
|
try:
|
|
74
74
|
cmd = entry_point
|
|
75
75
|
if "{input}" in cmd:
|
|
76
|
+
# Placeholder: replace with path to JSON file
|
|
76
77
|
cmd = cmd.replace("{input}", input_path)
|
|
77
78
|
elif "{input_json}" in cmd:
|
|
79
|
+
# Placeholder: replace with inline JSON string
|
|
78
80
|
cmd = cmd.replace("{input_json}", input_json)
|
|
81
|
+
elif "--input" in cmd or "-i " in cmd:
|
|
82
|
+
# Entry point already has --input flag — pass the file path as next arg
|
|
83
|
+
cmd = f"{cmd} {input_path}"
|
|
79
84
|
else:
|
|
85
|
+
# Default: append --input and --output flags
|
|
80
86
|
cmd = f"{cmd} --input {input_path} --output {output_path}"
|
|
81
87
|
|
|
82
88
|
env = os.environ.copy()
|
|
@@ -197,17 +203,38 @@ def main():
|
|
|
197
203
|
experiment_name = results.experiment_name
|
|
198
204
|
|
|
199
205
|
# Calculate mean score from code-based evaluators only
|
|
206
|
+
# langsmith>=0.7.x returns dicts, older versions return dataclasses
|
|
200
207
|
scores = []
|
|
201
208
|
per_example = {}
|
|
202
209
|
for result in results:
|
|
203
210
|
example_scores = []
|
|
204
|
-
if result.evaluation_results and result.evaluation_results.get("results"):
|
|
205
|
-
for er in result.evaluation_results["results"]:
|
|
206
|
-
if er.get("score") is not None:
|
|
207
|
-
example_scores.append(er["score"])
|
|
208
|
-
scores.append(er["score"])
|
|
209
211
|
|
|
210
|
-
|
|
212
|
+
# Handle both dict and object results (SDK version compat)
|
|
213
|
+
if isinstance(result, dict):
|
|
214
|
+
eval_results = result.get("evaluation_results", {})
|
|
215
|
+
if isinstance(eval_results, dict):
|
|
216
|
+
eval_list = eval_results.get("results", [])
|
|
217
|
+
else:
|
|
218
|
+
eval_list = getattr(eval_results, "results", []) or []
|
|
219
|
+
example_obj = result.get("example")
|
|
220
|
+
example_id = str(example_obj.get("id", "unknown") if isinstance(example_obj, dict) else getattr(example_obj, "id", "unknown"))
|
|
221
|
+
else:
|
|
222
|
+
eval_results = getattr(result, "evaluation_results", None)
|
|
223
|
+
if isinstance(eval_results, dict):
|
|
224
|
+
eval_list = eval_results.get("results", [])
|
|
225
|
+
elif eval_results:
|
|
226
|
+
eval_list = getattr(eval_results, "results", []) or []
|
|
227
|
+
else:
|
|
228
|
+
eval_list = []
|
|
229
|
+
example_obj = getattr(result, "example", None)
|
|
230
|
+
example_id = str(getattr(example_obj, "id", "unknown") if example_obj else "unknown")
|
|
231
|
+
|
|
232
|
+
for er in eval_list:
|
|
233
|
+
score_val = er.get("score") if isinstance(er, dict) else getattr(er, "score", None)
|
|
234
|
+
if score_val is not None:
|
|
235
|
+
example_scores.append(score_val)
|
|
236
|
+
scores.append(score_val)
|
|
237
|
+
|
|
211
238
|
per_example[example_id] = {
|
|
212
239
|
"score": sum(example_scores) / len(example_scores) if example_scores else 0.0,
|
|
213
240
|
"num_evaluators": len(example_scores),
|
package/tools/setup.py
CHANGED
|
@@ -267,6 +267,8 @@ def make_target(entry_point, cwd=None):
|
|
|
267
267
|
cmd = cmd.replace("{input}", input_path)
|
|
268
268
|
elif "{input_json}" in cmd:
|
|
269
269
|
cmd = cmd.replace("{input_json}", input_json)
|
|
270
|
+
elif "--input" in cmd or "-i " in cmd:
|
|
271
|
+
cmd = f"{cmd} {input_path}"
|
|
270
272
|
else:
|
|
271
273
|
cmd = f"{cmd} --input {input_path} --output {output_path}"
|
|
272
274
|
|
package/tools/trace_insights.py
CHANGED
|
@@ -23,10 +23,46 @@ Requires: pip install langsmith (for SDK mode)
|
|
|
23
23
|
import argparse
|
|
24
24
|
import json
|
|
25
25
|
import os
|
|
26
|
+
import platform
|
|
26
27
|
import sys
|
|
27
28
|
from datetime import datetime, timezone
|
|
28
29
|
|
|
29
30
|
|
|
31
|
+
def ensure_langsmith_api_key():
|
|
32
|
+
"""Load LANGSMITH_API_KEY from credentials file or .env if not in env."""
|
|
33
|
+
if os.environ.get("LANGSMITH_API_KEY"):
|
|
34
|
+
return True
|
|
35
|
+
if platform.system() == "Darwin":
|
|
36
|
+
creds_path = os.path.expanduser("~/Library/Application Support/langsmith-cli/credentials")
|
|
37
|
+
else:
|
|
38
|
+
creds_path = os.path.expanduser("~/.config/langsmith-cli/credentials")
|
|
39
|
+
if os.path.exists(creds_path):
|
|
40
|
+
try:
|
|
41
|
+
with open(creds_path) as f:
|
|
42
|
+
for line in f:
|
|
43
|
+
line = line.strip()
|
|
44
|
+
if line.startswith("LANGSMITH_API_KEY="):
|
|
45
|
+
key = line.split("=", 1)[1].strip()
|
|
46
|
+
if key:
|
|
47
|
+
os.environ["LANGSMITH_API_KEY"] = key
|
|
48
|
+
return True
|
|
49
|
+
except OSError:
|
|
50
|
+
pass
|
|
51
|
+
if os.path.exists(".env"):
|
|
52
|
+
try:
|
|
53
|
+
with open(".env") as f:
|
|
54
|
+
for line in f:
|
|
55
|
+
line = line.strip()
|
|
56
|
+
if line.startswith("LANGSMITH_API_KEY=") and not line.startswith("#"):
|
|
57
|
+
key = line.split("=", 1)[1].strip().strip("'\"")
|
|
58
|
+
if key:
|
|
59
|
+
os.environ["LANGSMITH_API_KEY"] = key
|
|
60
|
+
return True
|
|
61
|
+
except OSError:
|
|
62
|
+
pass
|
|
63
|
+
return False
|
|
64
|
+
|
|
65
|
+
|
|
30
66
|
def load_json(path):
|
|
31
67
|
"""Load JSON file, return None if missing or invalid."""
|
|
32
68
|
if not path or not os.path.exists(path):
|
|
@@ -260,6 +296,7 @@ def identify_top_issues(error_clusters, response_analysis, score_cross_ref):
|
|
|
260
296
|
def fetch_runs_from_langsmith(project_name, experiment_name=None, limit=50):
|
|
261
297
|
"""Fetch runs directly from LangSmith SDK (v3 mode)."""
|
|
262
298
|
try:
|
|
299
|
+
ensure_langsmith_api_key()
|
|
263
300
|
from langsmith import Client
|
|
264
301
|
client = Client()
|
|
265
302
|
|