harness-evolver 3.2.0 → 3.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/README.md +50 -9
- package/agents/evolver-evaluator.md +2 -2
- package/agents/evolver-proposer.md +2 -1
- package/bin/install.js +288 -110
- package/package.json +1 -1
- package/skills/evolve/SKILL.md +1 -1
- package/skills/setup/SKILL.md +8 -3
- package/tools/read_results.py +14 -1
- package/tools/run_eval.py +33 -6
- package/tools/setup.py +2 -0
- package/tools/trace_insights.py +37 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "harness-evolver",
|
|
3
3
|
"description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
|
|
4
|
-
"version": "3.2.0",
|
|
4
|
+
"version": "3.3.0",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Raphael Valdetaro"
|
|
7
7
|
},
|
package/README.md
CHANGED
|
@@ -19,11 +19,24 @@ Inspired by [Meta-Harness](https://yoonholee.com/meta-harness/) (Lee et al., 202
|
|
|
19
19
|
|
|
20
20
|
## Install
|
|
21
21
|
|
|
22
|
+
### Claude Code Plugin (recommended)
|
|
23
|
+
|
|
24
|
+
```
|
|
25
|
+
/plugin marketplace add raphaelchristi/harness-evolver-marketplace
|
|
26
|
+
/plugin install harness-evolver
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Updates are automatic. Python dependencies (langsmith, langsmith-cli) are installed on first session start via hook.
|
|
30
|
+
|
|
31
|
+
### npx (first-time setup or non-Claude Code runtimes)
|
|
32
|
+
|
|
22
33
|
```bash
|
|
23
34
|
npx harness-evolver@latest
|
|
24
35
|
```
|
|
25
36
|
|
|
26
|
-
|
|
37
|
+
Interactive installer that configures LangSmith API key, creates Python venv, and installs all dependencies. Works with Claude Code, Cursor, Codex, and Windsurf.
|
|
38
|
+
|
|
39
|
+
> **Both install paths work together.** Use npx for initial setup (API key, venv), then the plugin marketplace handles updates automatically.
|
|
27
40
|
|
|
28
41
|
---
|
|
29
42
|
|
|
@@ -58,6 +71,10 @@ claude
|
|
|
58
71
|
<td>Each iteration spawns 5 parallel agents: exploit, explore, crossover, and 2 failure-targeted. Strategies adapt based on per-task analysis. Quality-diversity selection preserves per-task champions.</td>
|
|
59
72
|
</tr>
|
|
60
73
|
<tr>
|
|
74
|
+
<td><b>Agent-Based Evaluation</b></td>
|
|
75
|
+
<td>The evaluator agent reads experiment outputs via langsmith-cli, judges correctness using the same Claude model powering the other agents, and writes scores back. No OpenAI API key or openevals dependency needed.</td>
|
|
76
|
+
</tr>
|
|
77
|
+
<tr>
|
|
61
78
|
<td><b>Production Traces</b></td>
|
|
62
79
|
<td>Auto-discovers existing LangSmith production projects. Uses real user inputs for test generation and real error patterns for targeted optimization.</td>
|
|
63
80
|
</tr>
|
|
@@ -89,10 +106,10 @@ claude
|
|
|
89
106
|
| Agent | Role | Color |
|
|
90
107
|
|---|---|---|
|
|
91
108
|
| **Proposer** | Modifies agent code in isolated worktrees based on trace analysis | Green |
|
|
109
|
+
| **Evaluator** | LLM-as-judge — reads outputs via langsmith-cli, scores correctness | Yellow |
|
|
92
110
|
| **Architect** | Recommends multi-agent topology changes | Blue |
|
|
93
111
|
| **Critic** | Validates evaluator quality, detects gaming | Red |
|
|
94
112
|
| **TestGen** | Generates test inputs for LangSmith datasets | Cyan |
|
|
95
|
-
| **Evaluator** | LLM-as-judge — reads outputs via langsmith-cli, scores correctness | Yellow |
|
|
96
113
|
|
|
97
114
|
---
|
|
98
115
|
|
|
@@ -118,19 +135,43 @@ claude
|
|
|
118
135
|
|
|
119
136
|
---
|
|
120
137
|
|
|
138
|
+
## Architecture
|
|
139
|
+
|
|
140
|
+
```
|
|
141
|
+
Plugin hook (SessionStart)
|
|
142
|
+
└→ Creates venv, installs langsmith + langsmith-cli, exports env vars
|
|
143
|
+
|
|
144
|
+
Skills (markdown)
|
|
145
|
+
├── /evolver:setup → explores project, runs setup.py
|
|
146
|
+
├── /evolver:evolve → orchestrates the evolution loop
|
|
147
|
+
├── /evolver:status → reads .evolver.json + LangSmith
|
|
148
|
+
└── /evolver:deploy → tags and pushes
|
|
149
|
+
|
|
150
|
+
Agents (markdown)
|
|
151
|
+
├── Proposer (x5) → modifies code in git worktrees
|
|
152
|
+
├── Evaluator → LLM-as-judge via langsmith-cli
|
|
153
|
+
├── Critic → detects evaluator gaming
|
|
154
|
+
├── Architect → recommends topology changes
|
|
155
|
+
└── TestGen → generates test inputs
|
|
156
|
+
|
|
157
|
+
Tools (Python + langsmith SDK)
|
|
158
|
+
├── setup.py → creates datasets, configures evaluators
|
|
159
|
+
├── run_eval.py → runs target against dataset
|
|
160
|
+
├── read_results.py → compares experiments
|
|
161
|
+
├── trace_insights.py → clusters errors from traces
|
|
162
|
+
└── seed_from_traces.py → imports production traces
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
121
167
|
## Requirements
|
|
122
168
|
|
|
123
169
|
- **LangSmith account** + `LANGSMITH_API_KEY`
|
|
124
|
-
- **Python 3.10+**
|
|
125
|
-
- **langsmith-cli** (`uv tool install langsmith-cli`) — required for evaluator agent
|
|
170
|
+
- **Python 3.10+**
|
|
126
171
|
- **Git** (for worktree-based isolation)
|
|
127
172
|
- **Claude Code** (or Cursor/Codex/Windsurf)
|
|
128
173
|
|
|
129
|
-
|
|
130
|
-
export LANGSMITH_API_KEY="lsv2_pt_..."
|
|
131
|
-
pip install langsmith
|
|
132
|
-
uv tool install langsmith-cli
|
|
133
|
-
```
|
|
174
|
+
Dependencies (`langsmith`, `langsmith-cli`) are installed automatically by the plugin hook or the npx installer.
|
|
134
175
|
|
|
135
176
|
---
|
|
136
177
|
|
|
@@ -37,7 +37,7 @@ You interact with LangSmith exclusively through `langsmith-cli`. Always use `--j
|
|
|
37
37
|
langsmith-cli --json runs list \
|
|
38
38
|
--project "{experiment_name}" \
|
|
39
39
|
--fields id,inputs,outputs,error,reference_example_id \
|
|
40
|
-
--is-root \
|
|
40
|
+
--is-root true \
|
|
41
41
|
--limit 200
|
|
42
42
|
```
|
|
43
43
|
|
|
@@ -72,7 +72,7 @@ Fetch all runs from the experiment. Save the output to a file for reference:
|
|
|
72
72
|
langsmith-cli --json runs list \
|
|
73
73
|
--project "{experiment_name}" \
|
|
74
74
|
--fields id,inputs,outputs,error,reference_example_id \
|
|
75
|
-
--is-root --limit 200 \
|
|
75
|
+
--is-root true --limit 200 \
|
|
76
76
|
--output experiment_runs.jsonl
|
|
77
77
|
```
|
|
78
78
|
|
|
@@ -97,9 +97,10 @@ Ask about the SPECIFIC API you're going to use or change.
|
|
|
97
97
|
|
|
98
98
|
1. **Commit all changes** with a descriptive message:
|
|
99
99
|
```bash
|
|
100
|
-
git add -A
|
|
100
|
+
git add -A -- ':!.venv' ':!venv' ':!node_modules'
|
|
101
101
|
git commit -m "evolver: {brief description of changes}"
|
|
102
102
|
```
|
|
103
|
+
**CRITICAL**: Never commit `.venv`, `venv`, or `node_modules`. Symlinks to these in worktrees will break the main branch if merged.
|
|
103
104
|
|
|
104
105
|
2. **Write proposal.md** explaining:
|
|
105
106
|
- What you changed and why
|
package/bin/install.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
/**
|
|
3
|
-
* Harness Evolver
|
|
4
|
-
* Copies skills/agents/tools to runtime directories
|
|
3
|
+
* Harness Evolver installer.
|
|
4
|
+
* Copies skills/agents/tools to runtime directories.
|
|
5
5
|
* Installs Python dependencies (langsmith) and langsmith-cli.
|
|
6
6
|
*
|
|
7
7
|
* Usage: npx harness-evolver@latest
|
|
@@ -16,20 +16,123 @@ const VERSION = require("../package.json").version;
|
|
|
16
16
|
const PLUGIN_ROOT = path.resolve(__dirname, "..");
|
|
17
17
|
const HOME = process.env.HOME || process.env.USERPROFILE;
|
|
18
18
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
const
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
19
|
+
// ─── Colors (zero dependencies, inline ANSI) ───────────────────────────────
|
|
20
|
+
|
|
21
|
+
const isColorSupported =
|
|
22
|
+
process.env.FORCE_COLOR !== "0" &&
|
|
23
|
+
!process.env.NO_COLOR &&
|
|
24
|
+
(process.env.FORCE_COLOR !== undefined || process.stdout.isTTY);
|
|
25
|
+
|
|
26
|
+
function ansi(code) {
|
|
27
|
+
return isColorSupported ? `\x1b[${code}m` : "";
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
const reset = ansi("0");
|
|
31
|
+
const bold = ansi("1");
|
|
32
|
+
const dim = ansi("2");
|
|
33
|
+
const red = ansi("31");
|
|
34
|
+
const green = ansi("32");
|
|
35
|
+
const yellow = ansi("33");
|
|
36
|
+
const cyan = ansi("36");
|
|
37
|
+
const gray = ansi("90");
|
|
38
|
+
const bgCyan = ansi("46");
|
|
39
|
+
const black = ansi("30");
|
|
40
|
+
|
|
41
|
+
const c = {
|
|
42
|
+
bold: (s) => `${bold}${s}${reset}`,
|
|
43
|
+
dim: (s) => `${dim}${s}${reset}`,
|
|
44
|
+
red: (s) => `${red}${s}${reset}`,
|
|
45
|
+
green: (s) => `${green}${s}${reset}`,
|
|
46
|
+
yellow: (s) => `${yellow}${s}${reset}`,
|
|
47
|
+
cyan: (s) => `${cyan}${s}${reset}`,
|
|
48
|
+
gray: (s) => `${gray}${s}${reset}`,
|
|
49
|
+
bgCyan: (s) => `${bgCyan}${black}${s}${reset}`,
|
|
50
|
+
};
|
|
51
|
+
|
|
52
|
+
// ─── Symbols ────────────────────────────────────────────────────────────────
|
|
53
|
+
|
|
54
|
+
const S = {
|
|
55
|
+
bar: "\u2502", // │
|
|
56
|
+
barEnd: "\u2514", // └
|
|
57
|
+
barStart: "\u250C", // ┌
|
|
58
|
+
step: "\u25C7", // ◇
|
|
59
|
+
stepActive: "\u25C6",// ◆
|
|
60
|
+
stepDone: "\u25CF", // ●
|
|
61
|
+
stepError: "\u25A0", // ■
|
|
62
|
+
};
|
|
63
|
+
|
|
64
|
+
// ─── UI helpers (clack-style) ───────────────────────────────────────────────
|
|
65
|
+
|
|
66
|
+
function barLine(content = "") {
|
|
67
|
+
console.log(`${c.gray(S.bar)} ${content}`);
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
function barEmpty() {
|
|
71
|
+
console.log(`${c.gray(S.bar)}`);
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
function header(label) {
|
|
75
|
+
console.log();
|
|
76
|
+
console.log(`${c.gray(S.barStart)} ${c.bgCyan(` ${label} `)}`);
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
function footer(message) {
|
|
80
|
+
if (message) {
|
|
81
|
+
console.log(`${c.gray(S.barEnd)} ${message}`);
|
|
82
|
+
} else {
|
|
83
|
+
console.log(`${c.gray(S.barEnd)}`);
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
function step(content) {
|
|
88
|
+
console.log(`${c.gray(S.step)} ${content}`);
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
function stepDone(content) {
|
|
92
|
+
console.log(`${c.green(S.stepDone)} ${content}`);
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
function stepError(content) {
|
|
96
|
+
console.log(`${c.red(S.stepError)} ${content}`);
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
function stepPrompt(content) {
|
|
100
|
+
console.log(`${c.cyan(S.stepActive)} ${content}`);
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
// ─── Banner (gradient dark → light) ─────────────────────────────────────────
|
|
104
|
+
|
|
105
|
+
const BANNER_LINES = [
|
|
106
|
+
"\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2557 \u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2557 \u2588\u2588\u2557 \u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2588\u2588\u2588\u2588\u2557 ",
|
|
107
|
+
"\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255D\u2588\u2588\u2551 \u2588\u2588\u2551\u2588\u2588\u2554\u2550\u2550\u2550\u2588\u2588\u2557\u2588\u2588\u2551 \u2588\u2588\u2551 \u2588\u2588\u2551\u2588\u2588\u2554\u2550\u2550\u2550\u2550\u255D\u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557",
|
|
108
|
+
"\u2588\u2588\u2588\u2588\u2588\u2557 \u255A\u2588\u2588\u2557 \u2588\u2588\u2554\u255D\u2588\u2588\u2551 \u2588\u2588\u2551\u2588\u2588\u2551 \u255A\u2588\u2588\u2557 \u2588\u2588\u2554\u255D\u2588\u2588\u2588\u2588\u2588\u2557 \u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255D",
|
|
109
|
+
"\u2588\u2588\u2554\u2550\u2550\u255D \u255A\u2588\u2588\u2588\u2588\u2554\u255D \u2588\u2588\u2551 \u2588\u2588\u2551\u2588\u2588\u2551 \u255A\u2588\u2588\u2588\u2588\u2554\u255D \u2588\u2588\u2554\u2550\u2550\u255D \u2588\u2588\u2554\u2550\u2550\u2588\u2588\u2557",
|
|
110
|
+
"\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u255A\u2588\u2588\u2554\u255D \u255A\u2588\u2588\u2588\u2588\u2588\u2588\u2554\u255D\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557 \u255A\u2588\u2588\u2554\u255D \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2557\u2588\u2588\u2551 \u2588\u2588\u2551",
|
|
111
|
+
"\u255A\u2550\u2550\u2550\u2550\u2550\u2550\u255D \u255A\u2550\u255D \u255A\u2550\u2550\u2550\u2550\u2550\u255D \u255A\u2550\u2550\u2550\u2550\u2550\u2550\u255D \u255A\u2550\u255D \u255A\u2550\u2550\u2550\u2550\u2550\u2550\u255D\u255A\u2550\u255D \u255A\u2550\u255D",
|
|
112
|
+
];
|
|
113
|
+
|
|
114
|
+
const GRADIENT = [
|
|
115
|
+
[60, 60, 60],
|
|
116
|
+
[90, 90, 90],
|
|
117
|
+
[125, 125, 125],
|
|
118
|
+
[160, 160, 160],
|
|
119
|
+
[200, 200, 200],
|
|
120
|
+
[240, 240, 240],
|
|
121
|
+
];
|
|
122
|
+
|
|
123
|
+
function rgb(r, g, b) {
|
|
124
|
+
return isColorSupported ? `\x1b[38;2;${r};${g};${b}m` : "";
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
function banner() {
|
|
128
|
+
console.log();
|
|
129
|
+
for (let i = 0; i < BANNER_LINES.length; i++) {
|
|
130
|
+
const [r, g, b] = GRADIENT[i];
|
|
131
|
+
console.log(`${rgb(r, g, b)}${BANNER_LINES[i]}${reset}`);
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
// ─── Utilities ──────────────────────────────────────────────────────────────
|
|
33
136
|
|
|
34
137
|
function ask(rl, question) {
|
|
35
138
|
return new Promise((resolve) => rl.question(question, resolve));
|
|
@@ -72,6 +175,8 @@ function checkCommand(cmd) {
|
|
|
72
175
|
}
|
|
73
176
|
}
|
|
74
177
|
|
|
178
|
+
// ─── Install logic ──────────────────────────────────────────────────────────
|
|
179
|
+
|
|
75
180
|
function cleanPreviousInstall(runtimeDir, scope) {
|
|
76
181
|
const baseDir = scope === "local"
|
|
77
182
|
? path.join(process.cwd(), runtimeDir)
|
|
@@ -81,7 +186,6 @@ function cleanPreviousInstall(runtimeDir, scope) {
|
|
|
81
186
|
const agentsDir = path.join(baseDir, "agents");
|
|
82
187
|
let cleaned = 0;
|
|
83
188
|
|
|
84
|
-
// Remove ALL evolver/harness-evolver skills (any version)
|
|
85
189
|
if (fs.existsSync(skillsDir)) {
|
|
86
190
|
const ours = ["setup", "evolve", "deploy", "status",
|
|
87
191
|
"init", "architect", "compare", "critic", "diagnose",
|
|
@@ -100,7 +204,6 @@ function cleanPreviousInstall(runtimeDir, scope) {
|
|
|
100
204
|
}
|
|
101
205
|
}
|
|
102
206
|
|
|
103
|
-
// Remove ALL evolver/harness-evolver agents
|
|
104
207
|
if (fs.existsSync(agentsDir)) {
|
|
105
208
|
for (const f of fs.readdirSync(agentsDir)) {
|
|
106
209
|
if (f.startsWith("evolver-") || f.startsWith("harness-evolver-")) {
|
|
@@ -110,14 +213,12 @@ function cleanPreviousInstall(runtimeDir, scope) {
|
|
|
110
213
|
}
|
|
111
214
|
}
|
|
112
215
|
|
|
113
|
-
// Remove old commands/ directory (v1)
|
|
114
216
|
const oldCommandsDir = path.join(baseDir, "commands", "harness-evolver");
|
|
115
217
|
if (fs.existsSync(oldCommandsDir)) {
|
|
116
218
|
fs.rmSync(oldCommandsDir, { recursive: true, force: true });
|
|
117
219
|
cleaned++;
|
|
118
220
|
}
|
|
119
221
|
|
|
120
|
-
// Remove old tools directories
|
|
121
222
|
for (const toolsPath of [
|
|
122
223
|
path.join(HOME, ".evolver", "tools"),
|
|
123
224
|
path.join(HOME, ".harness-evolver"),
|
|
@@ -129,10 +230,39 @@ function cleanPreviousInstall(runtimeDir, scope) {
|
|
|
129
230
|
}
|
|
130
231
|
|
|
131
232
|
if (cleaned > 0) {
|
|
132
|
-
|
|
233
|
+
barLine(c.dim(`Cleaned ${cleaned} items from previous install`));
|
|
133
234
|
}
|
|
134
235
|
}
|
|
135
236
|
|
|
237
|
+
function countInstallables() {
|
|
238
|
+
let skills = 0;
|
|
239
|
+
let agents = 0;
|
|
240
|
+
let tools = 0;
|
|
241
|
+
|
|
242
|
+
const skillsSource = path.join(PLUGIN_ROOT, "skills");
|
|
243
|
+
if (fs.existsSync(skillsSource)) {
|
|
244
|
+
for (const s of fs.readdirSync(skillsSource, { withFileTypes: true })) {
|
|
245
|
+
if (s.isDirectory() && fs.existsSync(path.join(skillsSource, s.name, "SKILL.md"))) skills++;
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
const agentsSource = path.join(PLUGIN_ROOT, "agents");
|
|
250
|
+
if (fs.existsSync(agentsSource)) {
|
|
251
|
+
for (const a of fs.readdirSync(agentsSource)) {
|
|
252
|
+
if (a.endsWith(".md")) agents++;
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
const toolsSource = path.join(PLUGIN_ROOT, "tools");
|
|
257
|
+
if (fs.existsSync(toolsSource)) {
|
|
258
|
+
for (const t of fs.readdirSync(toolsSource)) {
|
|
259
|
+
if (t.endsWith(".py")) tools++;
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
return { skills, agents, tools };
|
|
264
|
+
}
|
|
265
|
+
|
|
136
266
|
function installSkillsAndAgents(runtimeDir, scope) {
|
|
137
267
|
const baseDir = scope === "local"
|
|
138
268
|
? path.join(process.cwd(), runtimeDir)
|
|
@@ -140,8 +270,8 @@ function installSkillsAndAgents(runtimeDir, scope) {
|
|
|
140
270
|
|
|
141
271
|
const skillsDir = path.join(baseDir, "skills");
|
|
142
272
|
const agentsDir = path.join(baseDir, "agents");
|
|
273
|
+
let installed = 0;
|
|
143
274
|
|
|
144
|
-
// Skills — read SKILL.md name field, use directory name for filesystem
|
|
145
275
|
const skillsSource = path.join(PLUGIN_ROOT, "skills");
|
|
146
276
|
if (fs.existsSync(skillsSource)) {
|
|
147
277
|
for (const skill of fs.readdirSync(skillsSource, { withFileTypes: true })) {
|
|
@@ -150,18 +280,17 @@ function installSkillsAndAgents(runtimeDir, scope) {
|
|
|
150
280
|
const skillMd = path.join(src, "SKILL.md");
|
|
151
281
|
if (!fs.existsSync(skillMd)) continue;
|
|
152
282
|
|
|
153
|
-
// Read the skill name from frontmatter
|
|
154
283
|
const content = fs.readFileSync(skillMd, "utf8");
|
|
155
284
|
const nameMatch = content.match(/^name:\s*(.+)$/m);
|
|
156
285
|
const skillName = nameMatch ? nameMatch[1].trim() : skill.name;
|
|
157
286
|
|
|
158
287
|
const dest = path.join(skillsDir, skill.name);
|
|
159
288
|
copyDir(src, dest);
|
|
160
|
-
|
|
289
|
+
barLine(`${c.green("\u2714")} ${skillName}`);
|
|
290
|
+
installed++;
|
|
161
291
|
}
|
|
162
292
|
}
|
|
163
293
|
|
|
164
|
-
// Agents
|
|
165
294
|
const agentsSource = path.join(PLUGIN_ROOT, "agents");
|
|
166
295
|
if (fs.existsSync(agentsSource)) {
|
|
167
296
|
fs.mkdirSync(agentsDir, { recursive: true });
|
|
@@ -169,9 +298,12 @@ function installSkillsAndAgents(runtimeDir, scope) {
|
|
|
169
298
|
if (!agent.endsWith(".md")) continue;
|
|
170
299
|
copyFile(path.join(agentsSource, agent), path.join(agentsDir, agent));
|
|
171
300
|
const agentName = agent.replace(".md", "");
|
|
172
|
-
|
|
301
|
+
barLine(`${c.green("\u2714")} agent: ${agentName}`);
|
|
302
|
+
installed++;
|
|
173
303
|
}
|
|
174
304
|
}
|
|
305
|
+
|
|
306
|
+
return installed;
|
|
175
307
|
}
|
|
176
308
|
|
|
177
309
|
function installTools() {
|
|
@@ -185,8 +317,9 @@ function installTools() {
|
|
|
185
317
|
copyFile(path.join(toolsSource, tool), path.join(toolsDir, tool));
|
|
186
318
|
count++;
|
|
187
319
|
}
|
|
188
|
-
|
|
320
|
+
return count;
|
|
189
321
|
}
|
|
322
|
+
return 0;
|
|
190
323
|
}
|
|
191
324
|
|
|
192
325
|
function installPythonDeps() {
|
|
@@ -194,11 +327,10 @@ function installPythonDeps() {
|
|
|
194
327
|
const venvPython = path.join(venvDir, "bin", "python");
|
|
195
328
|
const venvPip = path.join(venvDir, "bin", "pip");
|
|
196
329
|
|
|
197
|
-
|
|
330
|
+
step("Setting up Python environment...");
|
|
198
331
|
|
|
199
|
-
// Create venv if it doesn't exist
|
|
200
332
|
if (!fs.existsSync(venvPython)) {
|
|
201
|
-
|
|
333
|
+
barLine("Creating isolated venv at ~/.evolver/venv/");
|
|
202
334
|
const venvCommands = [
|
|
203
335
|
`uv venv "${venvDir}"`,
|
|
204
336
|
`python3 -m venv "${venvDir}"`,
|
|
@@ -214,119 +346,123 @@ function installPythonDeps() {
|
|
|
214
346
|
}
|
|
215
347
|
}
|
|
216
348
|
if (!created) {
|
|
217
|
-
|
|
218
|
-
|
|
349
|
+
stepError("Failed to create venv");
|
|
350
|
+
barLine(c.dim(`Run manually: python3 -m venv ~/.evolver/venv`));
|
|
219
351
|
return false;
|
|
220
352
|
}
|
|
221
|
-
|
|
353
|
+
stepDone("venv created");
|
|
222
354
|
} else {
|
|
223
|
-
|
|
355
|
+
stepDone("venv exists at ~/.evolver/venv/");
|
|
224
356
|
}
|
|
225
357
|
|
|
226
|
-
|
|
358
|
+
barEmpty();
|
|
359
|
+
|
|
227
360
|
const installCommands = [
|
|
228
361
|
`uv pip install --python "${venvPython}" langsmith`,
|
|
229
362
|
`"${venvPip}" install --upgrade langsmith`,
|
|
230
363
|
`"${venvPython}" -m pip install --upgrade langsmith`,
|
|
231
364
|
];
|
|
232
365
|
|
|
366
|
+
step("Installing langsmith...");
|
|
233
367
|
for (const cmd of installCommands) {
|
|
234
368
|
try {
|
|
235
369
|
execSync(cmd, { stdio: "pipe", timeout: 120000 });
|
|
236
|
-
|
|
370
|
+
stepDone("langsmith installed in venv");
|
|
237
371
|
return true;
|
|
238
372
|
} catch {
|
|
239
373
|
continue;
|
|
240
374
|
}
|
|
241
375
|
}
|
|
242
376
|
|
|
243
|
-
|
|
244
|
-
|
|
377
|
+
stepError("Could not install langsmith");
|
|
378
|
+
barLine(c.dim("Run manually: ~/.evolver/venv/bin/pip install langsmith"));
|
|
245
379
|
return false;
|
|
246
380
|
}
|
|
247
381
|
|
|
248
382
|
async function configureLangSmith(rl) {
|
|
249
|
-
console.log(`\n ${BOLD}${GREEN}LangSmith Configuration${RESET} ${DIM}(required)${RESET}\n`);
|
|
250
|
-
|
|
251
383
|
const langsmithCredsDir = process.platform === "darwin"
|
|
252
384
|
? path.join(HOME, "Library", "Application Support", "langsmith-cli")
|
|
253
385
|
: path.join(HOME, ".config", "langsmith-cli");
|
|
254
386
|
const langsmithCredsFile = path.join(langsmithCredsDir, "credentials");
|
|
255
387
|
const hasLangsmithCli = checkCommand("langsmith-cli --version");
|
|
256
388
|
|
|
257
|
-
// --- Step 1: API Key ---
|
|
258
389
|
let hasKey = false;
|
|
259
390
|
|
|
391
|
+
barEmpty();
|
|
392
|
+
step(c.bold("LangSmith API Key") + " " + c.dim("(required)"));
|
|
393
|
+
|
|
260
394
|
if (process.env.LANGSMITH_API_KEY) {
|
|
261
|
-
|
|
395
|
+
stepDone("LANGSMITH_API_KEY found in environment");
|
|
262
396
|
hasKey = true;
|
|
263
397
|
} else if (fs.existsSync(langsmithCredsFile)) {
|
|
264
398
|
try {
|
|
265
399
|
const content = fs.readFileSync(langsmithCredsFile, "utf8");
|
|
266
400
|
if (content.includes("LANGSMITH_API_KEY=lsv2_")) {
|
|
267
|
-
|
|
401
|
+
stepDone("API key found in credentials file");
|
|
268
402
|
hasKey = true;
|
|
269
403
|
}
|
|
270
404
|
} catch {}
|
|
271
405
|
}
|
|
272
406
|
|
|
273
407
|
if (!hasKey) {
|
|
274
|
-
|
|
275
|
-
|
|
408
|
+
barLine(c.dim("Get yours at https://smith.langchain.com/settings"));
|
|
409
|
+
barLine(c.dim("LangSmith is required. The evolver won't work without it."));
|
|
410
|
+
barEmpty();
|
|
276
411
|
|
|
277
|
-
// Keep asking until they provide a key or explicitly skip
|
|
278
412
|
let attempts = 0;
|
|
279
413
|
while (!hasKey && attempts < 3) {
|
|
280
|
-
const apiKey = await ask(rl,
|
|
414
|
+
const apiKey = await ask(rl, `${c.cyan(S.stepActive)} Paste your LangSmith API key (lsv2_pt_...): `);
|
|
281
415
|
const key = apiKey.trim();
|
|
282
416
|
|
|
283
417
|
if (key && key.startsWith("lsv2_")) {
|
|
284
418
|
try {
|
|
285
419
|
fs.mkdirSync(langsmithCredsDir, { recursive: true });
|
|
286
420
|
fs.writeFileSync(langsmithCredsFile, `LANGSMITH_API_KEY=${key}\n`);
|
|
287
|
-
|
|
421
|
+
stepDone("API key saved");
|
|
288
422
|
hasKey = true;
|
|
289
423
|
} catch {
|
|
290
|
-
|
|
291
|
-
|
|
424
|
+
stepError("Failed to save");
|
|
425
|
+
barLine(c.dim(`Add to your shell: export LANGSMITH_API_KEY=${key}`));
|
|
426
|
+
hasKey = true;
|
|
292
427
|
}
|
|
293
428
|
} else if (key) {
|
|
294
|
-
|
|
429
|
+
barLine(c.yellow("Invalid \u2014 LangSmith keys start with lsv2_"));
|
|
295
430
|
attempts++;
|
|
296
431
|
} else {
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
console.log(` Run: ${DIM}export LANGSMITH_API_KEY=lsv2_pt_your_key${RESET}\n`);
|
|
432
|
+
stepError("No API key configured");
|
|
433
|
+
barLine(c.dim("/evolver:setup will not work until you set LANGSMITH_API_KEY"));
|
|
434
|
+
barLine(c.dim("Run: export LANGSMITH_API_KEY=lsv2_pt_your_key"));
|
|
301
435
|
break;
|
|
302
436
|
}
|
|
303
437
|
}
|
|
304
438
|
}
|
|
305
439
|
|
|
306
|
-
|
|
440
|
+
barEmpty();
|
|
441
|
+
step(c.bold("langsmith-cli") + " " + c.dim("(required for LLM-as-judge)"));
|
|
442
|
+
|
|
307
443
|
if (hasLangsmithCli) {
|
|
308
|
-
|
|
444
|
+
stepDone("langsmith-cli installed");
|
|
309
445
|
} else {
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
console.log(`\n Installing langsmith-cli...`);
|
|
446
|
+
barLine(c.dim("The evaluator agent uses it to read experiment outputs and write scores"));
|
|
447
|
+
step("Installing langsmith-cli...");
|
|
313
448
|
try {
|
|
314
449
|
execSync("uv tool install langsmith-cli 2>/dev/null || pip install langsmith-cli 2>/dev/null || pip3 install langsmith-cli", { stdio: "pipe", timeout: 60000 });
|
|
315
|
-
|
|
450
|
+
stepDone("langsmith-cli installed");
|
|
316
451
|
|
|
317
|
-
// If we have a key, auto-authenticate
|
|
318
452
|
if (hasKey && fs.existsSync(langsmithCredsFile)) {
|
|
319
|
-
|
|
453
|
+
stepDone("langsmith-cli auto-authenticated");
|
|
320
454
|
}
|
|
321
455
|
} catch {
|
|
322
|
-
|
|
323
|
-
|
|
456
|
+
stepError("Could not install langsmith-cli");
|
|
457
|
+
barLine(c.dim("Install manually: uv tool install langsmith-cli"));
|
|
324
458
|
}
|
|
325
459
|
}
|
|
326
460
|
}
|
|
327
461
|
|
|
328
462
|
async function configureOptionalIntegrations(rl) {
|
|
329
|
-
|
|
463
|
+
barEmpty();
|
|
464
|
+
step(c.bold("Optional Integrations"));
|
|
465
|
+
barEmpty();
|
|
330
466
|
|
|
331
467
|
// Context7 MCP
|
|
332
468
|
const hasContext7 = (() => {
|
|
@@ -342,20 +478,24 @@ async function configureOptionalIntegrations(rl) {
|
|
|
342
478
|
})();
|
|
343
479
|
|
|
344
480
|
if (hasContext7) {
|
|
345
|
-
|
|
481
|
+
stepDone("Context7 MCP already configured");
|
|
346
482
|
} else {
|
|
347
|
-
|
|
348
|
-
const c7Answer = await ask(rl,
|
|
483
|
+
barLine(c.bold("Context7 MCP") + " \u2014 " + c.dim("up-to-date library documentation"));
|
|
484
|
+
const c7Answer = await ask(rl, `${c.cyan(S.stepActive)} Install Context7 MCP? [y/N]: `);
|
|
349
485
|
if (c7Answer.trim().toLowerCase() === "y") {
|
|
486
|
+
step("Installing Context7 MCP...");
|
|
350
487
|
try {
|
|
351
488
|
execSync("claude mcp add context7 -- npx -y @upstash/context7-mcp@latest", { stdio: "inherit" });
|
|
352
|
-
|
|
489
|
+
stepDone("Context7 MCP configured");
|
|
353
490
|
} catch {
|
|
354
|
-
|
|
491
|
+
stepError("Failed to install Context7 MCP");
|
|
492
|
+
barLine(c.dim("Run manually: claude mcp add context7 -- npx -y @upstash/context7-mcp@latest"));
|
|
355
493
|
}
|
|
356
494
|
}
|
|
357
495
|
}
|
|
358
496
|
|
|
497
|
+
barEmpty();
|
|
498
|
+
|
|
359
499
|
// LangChain Docs MCP
|
|
360
500
|
const hasLcDocs = (() => {
|
|
361
501
|
try {
|
|
@@ -370,38 +510,50 @@ async function configureOptionalIntegrations(rl) {
|
|
|
370
510
|
})();
|
|
371
511
|
|
|
372
512
|
if (hasLcDocs) {
|
|
373
|
-
|
|
513
|
+
stepDone("LangChain Docs MCP already configured");
|
|
374
514
|
} else {
|
|
375
|
-
|
|
376
|
-
const lcAnswer = await ask(rl,
|
|
515
|
+
barLine(c.bold("LangChain Docs MCP") + " \u2014 " + c.dim("LangChain/LangGraph/LangSmith docs"));
|
|
516
|
+
const lcAnswer = await ask(rl, `${c.cyan(S.stepActive)} Install LangChain Docs MCP? [y/N]: `);
|
|
377
517
|
if (lcAnswer.trim().toLowerCase() === "y") {
|
|
518
|
+
step("Installing LangChain Docs MCP...");
|
|
378
519
|
try {
|
|
379
520
|
execSync("claude mcp add docs-langchain --transport http https://docs.langchain.com/mcp", { stdio: "inherit" });
|
|
380
|
-
|
|
521
|
+
stepDone("LangChain Docs MCP configured");
|
|
381
522
|
} catch {
|
|
382
|
-
|
|
523
|
+
stepError("Failed to install LangChain Docs MCP");
|
|
524
|
+
barLine(c.dim("Run manually: claude mcp add docs-langchain --transport http https://docs.langchain.com/mcp"));
|
|
383
525
|
}
|
|
384
526
|
}
|
|
385
527
|
}
|
|
386
528
|
}
|
|
387
529
|
|
|
530
|
+
// ─── Main ───────────────────────────────────────────────────────────────────
|
|
531
|
+
|
|
388
532
|
async function main() {
|
|
389
|
-
|
|
533
|
+
banner();
|
|
390
534
|
|
|
391
|
-
|
|
535
|
+
header("harness-evolver");
|
|
536
|
+
step(`Source: ${c.dim(`v${VERSION} \u2014 LangSmith-native agent optimization`)}`);
|
|
537
|
+
|
|
538
|
+
// Version check
|
|
392
539
|
try {
|
|
393
540
|
const latest = execSync("npm view harness-evolver version", { stdio: "pipe", timeout: 5000 }).toString().trim();
|
|
394
541
|
if (latest && latest !== VERSION) {
|
|
395
|
-
|
|
396
|
-
|
|
542
|
+
barEmpty();
|
|
543
|
+
stepError(`You're running v${VERSION} but v${c.cyan(latest)} is available`);
|
|
544
|
+
barLine(c.dim(`Run: npx harness-evolver@${latest}`));
|
|
397
545
|
}
|
|
398
546
|
} catch {}
|
|
399
547
|
|
|
548
|
+
barEmpty();
|
|
549
|
+
|
|
550
|
+
// Python check
|
|
400
551
|
if (!checkPython()) {
|
|
401
|
-
|
|
552
|
+
stepError("python3 not found. Install Python 3.10+ first.");
|
|
553
|
+
footer();
|
|
402
554
|
process.exit(1);
|
|
403
555
|
}
|
|
404
|
-
|
|
556
|
+
stepDone("python3 found");
|
|
405
557
|
|
|
406
558
|
// Detect runtimes
|
|
407
559
|
const RUNTIMES = [
|
|
@@ -412,22 +564,25 @@ async function main() {
|
|
|
412
564
|
].filter(r => fs.existsSync(path.join(HOME, r.dir)));
|
|
413
565
|
|
|
414
566
|
if (RUNTIMES.length === 0) {
|
|
415
|
-
|
|
416
|
-
|
|
567
|
+
stepError("No supported runtime detected");
|
|
568
|
+
barLine(c.dim("Install Claude Code, Cursor, Codex, or Windsurf first"));
|
|
569
|
+
footer();
|
|
417
570
|
process.exit(1);
|
|
418
571
|
}
|
|
419
572
|
|
|
420
573
|
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
|
|
421
574
|
|
|
422
575
|
// Runtime selection
|
|
423
|
-
|
|
424
|
-
|
|
576
|
+
barEmpty();
|
|
577
|
+
stepPrompt("Which runtime(s) to install for?");
|
|
578
|
+
barEmpty();
|
|
579
|
+
RUNTIMES.forEach((r, i) => barLine(` ${c.bold(String(i + 1))} ${r.name.padEnd(14)} ${c.dim(`~/${r.dir}`)}`));
|
|
425
580
|
if (RUNTIMES.length > 1) {
|
|
426
|
-
|
|
427
|
-
|
|
581
|
+
barLine(` ${c.bold(String(RUNTIMES.length + 1))} All`);
|
|
582
|
+
barLine(c.dim("Select multiple: 1,2 or 1 2"));
|
|
428
583
|
}
|
|
429
584
|
|
|
430
|
-
const runtimeAnswer = await ask(rl,
|
|
585
|
+
const runtimeAnswer = await ask(rl, `${c.cyan(S.stepActive)} Choice [1]: `);
|
|
431
586
|
const runtimeInput = (runtimeAnswer.trim() || "1");
|
|
432
587
|
|
|
433
588
|
let selected;
|
|
@@ -439,31 +594,48 @@ async function main() {
|
|
|
439
594
|
}
|
|
440
595
|
if (selected.length === 0) selected = [RUNTIMES[0]];
|
|
441
596
|
|
|
597
|
+
stepDone(`Target: ${c.cyan(selected.map(r => r.name).join(", "))}`);
|
|
598
|
+
|
|
442
599
|
// Scope selection
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
600
|
+
barEmpty();
|
|
601
|
+
stepPrompt("Where to install?");
|
|
602
|
+
barEmpty();
|
|
603
|
+
barLine(` ${c.bold("1")} Global ${c.dim(`(~/${selected[0].dir})`)}`);
|
|
604
|
+
barLine(` ${c.bold("2")} Local ${c.dim(`(./${selected[0].dir})`)}`);
|
|
446
605
|
|
|
447
|
-
const scopeAnswer = await ask(rl,
|
|
606
|
+
const scopeAnswer = await ask(rl, `${c.cyan(S.stepActive)} Choice [1]: `);
|
|
448
607
|
const scope = (scopeAnswer.trim() === "2") ? "local" : "global";
|
|
449
608
|
|
|
450
|
-
|
|
451
|
-
|
|
609
|
+
stepDone(`Scope: ${c.cyan(scope)}`);
|
|
610
|
+
|
|
611
|
+
// Discover what we're installing
|
|
612
|
+
const counts = countInstallables();
|
|
613
|
+
barEmpty();
|
|
614
|
+
step(`Found ${c.bold(`${counts.skills} skills, ${counts.agents} agents, ${counts.tools} tools`)}`);
|
|
615
|
+
|
|
616
|
+
// Clean previous install
|
|
617
|
+
barEmpty();
|
|
618
|
+
step("Cleaning previous install...");
|
|
452
619
|
for (const runtime of selected) {
|
|
453
620
|
cleanPreviousInstall(runtime.dir, scope);
|
|
454
621
|
}
|
|
622
|
+
stepDone("Clean");
|
|
455
623
|
|
|
456
624
|
// Install skills + agents
|
|
457
|
-
|
|
625
|
+
barEmpty();
|
|
458
626
|
for (const runtime of selected) {
|
|
459
|
-
|
|
627
|
+
step(`Installing to ${c.bold(runtime.name)}...`);
|
|
628
|
+
barEmpty();
|
|
460
629
|
installSkillsAndAgents(runtime.dir, scope);
|
|
461
|
-
|
|
630
|
+
barEmpty();
|
|
631
|
+
stepDone(`${c.cyan(runtime.name)} ready`);
|
|
462
632
|
}
|
|
463
633
|
|
|
464
|
-
// Install tools
|
|
465
|
-
|
|
466
|
-
|
|
634
|
+
// Install tools
|
|
635
|
+
barEmpty();
|
|
636
|
+
step("Installing tools...");
|
|
637
|
+
const toolCount = installTools();
|
|
638
|
+
stepDone(`${toolCount} tools installed to ~/.evolver/tools/`);
|
|
467
639
|
|
|
468
640
|
// Version marker
|
|
469
641
|
const versionPath = path.join(HOME, ".evolver", "VERSION");
|
|
@@ -471,27 +643,33 @@ async function main() {
|
|
|
471
643
|
fs.writeFileSync(versionPath, VERSION);
|
|
472
644
|
|
|
473
645
|
// Install Python deps
|
|
646
|
+
barEmpty();
|
|
474
647
|
installPythonDeps();
|
|
475
648
|
|
|
476
|
-
// Configure LangSmith
|
|
649
|
+
// Configure LangSmith
|
|
477
650
|
await configureLangSmith(rl);
|
|
478
651
|
|
|
479
652
|
// Optional integrations
|
|
480
653
|
await configureOptionalIntegrations(rl);
|
|
481
654
|
|
|
482
655
|
// Done
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
656
|
+
barEmpty();
|
|
657
|
+
stepDone(c.green("Done.") + " Restart your agent tools to load the plugin.");
|
|
658
|
+
barEmpty();
|
|
659
|
+
barLine(c.dim("Commands:"));
|
|
660
|
+
barLine(` ${c.cyan("/evolver:setup")} \u2014 configure LangSmith for your project`);
|
|
661
|
+
barLine(` ${c.cyan("/evolver:evolve")} \u2014 run the optimization loop`);
|
|
662
|
+
barLine(` ${c.cyan("/evolver:status")} \u2014 check progress`);
|
|
663
|
+
barLine(` ${c.cyan("/evolver:deploy")} \u2014 finalize and push`);
|
|
664
|
+
barEmpty();
|
|
665
|
+
barLine(c.dim("GitHub: https://github.com/raphaelchristi/harness-evolver"));
|
|
666
|
+
footer();
|
|
490
667
|
|
|
491
668
|
rl.close();
|
|
492
669
|
}
|
|
493
670
|
|
|
494
671
|
main().catch(err => {
|
|
495
|
-
|
|
672
|
+
stepError(err.message);
|
|
673
|
+
footer();
|
|
496
674
|
process.exit(1);
|
|
497
675
|
});
|
package/package.json
CHANGED
package/skills/evolve/SKILL.md
CHANGED
|
@@ -235,7 +235,7 @@ Agent(
|
|
|
235
235
|
Entry point: {entry_point}
|
|
236
236
|
|
|
237
237
|
For each experiment:
|
|
238
|
-
1. Read all runs via: langsmith-cli --json runs list --project "{experiment_name}" --fields id,inputs,outputs,error --is-root --limit 200
|
|
238
|
+
1. Read all runs via: langsmith-cli --json runs list --project "{experiment_name}" --fields id,inputs,outputs,error --is-root true --limit 200
|
|
239
239
|
2. Judge each run's output against the input
|
|
240
240
|
3. Write scores via: langsmith-cli --json feedback create {run_id} --key {evaluator} --score {0.0|1.0} --comment "{reason}" --source model
|
|
241
241
|
</context>
|
package/skills/setup/SKILL.md
CHANGED
|
@@ -61,9 +61,14 @@ Look for:
|
|
|
61
61
|
|
|
62
62
|
To identify the **framework**, read the entry point file and its immediate imports. The proposer agents will use Context7 MCP for detailed documentation lookup — you don't need to detect every library, just identify the main framework (LangGraph, CrewAI, OpenAI Agents SDK, etc.) from the imports you see.
|
|
63
63
|
|
|
64
|
-
Identify the **run command** — how to execute the agent:
|
|
65
|
-
- `python main.py`
|
|
66
|
-
-
|
|
64
|
+
Identify the **run command** — how to execute the agent. Use `{input}` as a placeholder for the JSON file path, or `{input_json}` for an inline JSON string:
|
|
65
|
+
- `python main.py {input}` — agent reads JSON file from positional arg
|
|
66
|
+
- `python main.py --input {input}` — agent reads JSON file from `--input` flag
|
|
67
|
+
- `python main.py --query {input_json}` — agent receives inline JSON string
|
|
68
|
+
|
|
69
|
+
The runner writes `{"input": "user question..."}` to a temp `.json` file and replaces `{input}` with the file path. If the entry point already contains `--input` or `-i` (without a placeholder), the runner appends the file path as the next argument.
|
|
70
|
+
|
|
71
|
+
If neither a placeholder nor an `--input` flag is detected, the runner appends `--input <path> --output <path>`.
|
|
67
72
|
|
|
68
73
|
## Phase 2: Confirm Detection (interactive)
|
|
69
74
|
|
package/tools/read_results.py
CHANGED
|
@@ -26,7 +26,7 @@ import sys
|
|
|
26
26
|
|
|
27
27
|
|
|
28
28
|
def ensure_langsmith_api_key():
|
|
29
|
-
"""Load LANGSMITH_API_KEY from credentials file if not in env."""
|
|
29
|
+
"""Load LANGSMITH_API_KEY from credentials file or .env if not in env."""
|
|
30
30
|
if os.environ.get("LANGSMITH_API_KEY"):
|
|
31
31
|
return True
|
|
32
32
|
if platform.system() == "Darwin":
|
|
@@ -45,6 +45,19 @@ def ensure_langsmith_api_key():
|
|
|
45
45
|
return True
|
|
46
46
|
except OSError:
|
|
47
47
|
pass
|
|
48
|
+
# Also check .env in current directory
|
|
49
|
+
if os.path.exists(".env"):
|
|
50
|
+
try:
|
|
51
|
+
with open(".env") as f:
|
|
52
|
+
for line in f:
|
|
53
|
+
line = line.strip()
|
|
54
|
+
if line.startswith("LANGSMITH_API_KEY=") and not line.startswith("#"):
|
|
55
|
+
key = line.split("=", 1)[1].strip().strip("'\"")
|
|
56
|
+
if key:
|
|
57
|
+
os.environ["LANGSMITH_API_KEY"] = key
|
|
58
|
+
return True
|
|
59
|
+
except OSError:
|
|
60
|
+
pass
|
|
48
61
|
return False
|
|
49
62
|
|
|
50
63
|
|
package/tools/run_eval.py
CHANGED
|
@@ -73,10 +73,16 @@ def make_target(entry_point, cwd):
|
|
|
73
73
|
try:
|
|
74
74
|
cmd = entry_point
|
|
75
75
|
if "{input}" in cmd:
|
|
76
|
+
# Placeholder: replace with path to JSON file
|
|
76
77
|
cmd = cmd.replace("{input}", input_path)
|
|
77
78
|
elif "{input_json}" in cmd:
|
|
79
|
+
# Placeholder: replace with inline JSON string
|
|
78
80
|
cmd = cmd.replace("{input_json}", input_json)
|
|
81
|
+
elif "--input" in cmd or "-i " in cmd:
|
|
82
|
+
# Entry point already has --input flag — pass the file path as next arg
|
|
83
|
+
cmd = f"{cmd} {input_path}"
|
|
79
84
|
else:
|
|
85
|
+
# Default: append --input and --output flags
|
|
80
86
|
cmd = f"{cmd} --input {input_path} --output {output_path}"
|
|
81
87
|
|
|
82
88
|
env = os.environ.copy()
|
|
@@ -197,17 +203,38 @@ def main():
|
|
|
197
203
|
experiment_name = results.experiment_name
|
|
198
204
|
|
|
199
205
|
# Calculate mean score from code-based evaluators only
|
|
206
|
+
# langsmith>=0.7.x returns dicts, older versions return dataclasses
|
|
200
207
|
scores = []
|
|
201
208
|
per_example = {}
|
|
202
209
|
for result in results:
|
|
203
210
|
example_scores = []
|
|
204
|
-
if result.evaluation_results and result.evaluation_results.get("results"):
|
|
205
|
-
for er in result.evaluation_results["results"]:
|
|
206
|
-
if er.get("score") is not None:
|
|
207
|
-
example_scores.append(er["score"])
|
|
208
|
-
scores.append(er["score"])
|
|
209
211
|
|
|
210
|
-
|
|
212
|
+
# Handle both dict and object results (SDK version compat)
|
|
213
|
+
if isinstance(result, dict):
|
|
214
|
+
eval_results = result.get("evaluation_results", {})
|
|
215
|
+
if isinstance(eval_results, dict):
|
|
216
|
+
eval_list = eval_results.get("results", [])
|
|
217
|
+
else:
|
|
218
|
+
eval_list = getattr(eval_results, "results", []) or []
|
|
219
|
+
example_obj = result.get("example")
|
|
220
|
+
example_id = str(example_obj.get("id", "unknown") if isinstance(example_obj, dict) else getattr(example_obj, "id", "unknown"))
|
|
221
|
+
else:
|
|
222
|
+
eval_results = getattr(result, "evaluation_results", None)
|
|
223
|
+
if isinstance(eval_results, dict):
|
|
224
|
+
eval_list = eval_results.get("results", [])
|
|
225
|
+
elif eval_results:
|
|
226
|
+
eval_list = getattr(eval_results, "results", []) or []
|
|
227
|
+
else:
|
|
228
|
+
eval_list = []
|
|
229
|
+
example_obj = getattr(result, "example", None)
|
|
230
|
+
example_id = str(getattr(example_obj, "id", "unknown") if example_obj else "unknown")
|
|
231
|
+
|
|
232
|
+
for er in eval_list:
|
|
233
|
+
score_val = er.get("score") if isinstance(er, dict) else getattr(er, "score", None)
|
|
234
|
+
if score_val is not None:
|
|
235
|
+
example_scores.append(score_val)
|
|
236
|
+
scores.append(score_val)
|
|
237
|
+
|
|
211
238
|
per_example[example_id] = {
|
|
212
239
|
"score": sum(example_scores) / len(example_scores) if example_scores else 0.0,
|
|
213
240
|
"num_evaluators": len(example_scores),
|
package/tools/setup.py
CHANGED
|
@@ -267,6 +267,8 @@ def make_target(entry_point, cwd=None):
|
|
|
267
267
|
cmd = cmd.replace("{input}", input_path)
|
|
268
268
|
elif "{input_json}" in cmd:
|
|
269
269
|
cmd = cmd.replace("{input_json}", input_json)
|
|
270
|
+
elif "--input" in cmd or "-i " in cmd:
|
|
271
|
+
cmd = f"{cmd} {input_path}"
|
|
270
272
|
else:
|
|
271
273
|
cmd = f"{cmd} --input {input_path} --output {output_path}"
|
|
272
274
|
|
package/tools/trace_insights.py
CHANGED
|
@@ -23,10 +23,46 @@ Requires: pip install langsmith (for SDK mode)
|
|
|
23
23
|
import argparse
|
|
24
24
|
import json
|
|
25
25
|
import os
|
|
26
|
+
import platform
|
|
26
27
|
import sys
|
|
27
28
|
from datetime import datetime, timezone
|
|
28
29
|
|
|
29
30
|
|
|
31
|
+
def ensure_langsmith_api_key():
|
|
32
|
+
"""Load LANGSMITH_API_KEY from credentials file or .env if not in env."""
|
|
33
|
+
if os.environ.get("LANGSMITH_API_KEY"):
|
|
34
|
+
return True
|
|
35
|
+
if platform.system() == "Darwin":
|
|
36
|
+
creds_path = os.path.expanduser("~/Library/Application Support/langsmith-cli/credentials")
|
|
37
|
+
else:
|
|
38
|
+
creds_path = os.path.expanduser("~/.config/langsmith-cli/credentials")
|
|
39
|
+
if os.path.exists(creds_path):
|
|
40
|
+
try:
|
|
41
|
+
with open(creds_path) as f:
|
|
42
|
+
for line in f:
|
|
43
|
+
line = line.strip()
|
|
44
|
+
if line.startswith("LANGSMITH_API_KEY="):
|
|
45
|
+
key = line.split("=", 1)[1].strip()
|
|
46
|
+
if key:
|
|
47
|
+
os.environ["LANGSMITH_API_KEY"] = key
|
|
48
|
+
return True
|
|
49
|
+
except OSError:
|
|
50
|
+
pass
|
|
51
|
+
if os.path.exists(".env"):
|
|
52
|
+
try:
|
|
53
|
+
with open(".env") as f:
|
|
54
|
+
for line in f:
|
|
55
|
+
line = line.strip()
|
|
56
|
+
if line.startswith("LANGSMITH_API_KEY=") and not line.startswith("#"):
|
|
57
|
+
key = line.split("=", 1)[1].strip().strip("'\"")
|
|
58
|
+
if key:
|
|
59
|
+
os.environ["LANGSMITH_API_KEY"] = key
|
|
60
|
+
return True
|
|
61
|
+
except OSError:
|
|
62
|
+
pass
|
|
63
|
+
return False
|
|
64
|
+
|
|
65
|
+
|
|
30
66
|
def load_json(path):
|
|
31
67
|
"""Load JSON file, return None if missing or invalid."""
|
|
32
68
|
if not path or not os.path.exists(path):
|
|
@@ -260,6 +296,7 @@ def identify_top_issues(error_clusters, response_analysis, score_cross_ref):
|
|
|
260
296
|
def fetch_runs_from_langsmith(project_name, experiment_name=None, limit=50):
|
|
261
297
|
"""Fetch runs directly from LangSmith SDK (v3 mode)."""
|
|
262
298
|
try:
|
|
299
|
+
ensure_langsmith_api_key()
|
|
263
300
|
from langsmith import Client
|
|
264
301
|
client = Client()
|
|
265
302
|
|