harness-evolver 4.2.8 → 4.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/agents/evolver-proposer.md +7 -46
- package/bin/install.js +32 -21
- package/package.json +1 -1
- package/skills/evolve/SKILL.md +46 -238
- package/skills/health/SKILL.md +120 -0
- package/skills/setup/SKILL.md +66 -64
- package/tools/seed_from_traces.py +24 -105
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "harness-evolver",
|
|
3
3
|
"description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
|
|
4
|
-
"version": "4.2.8",
|
|
4
|
+
"version": "4.3.0",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Raphael Valdetaro"
|
|
7
7
|
},
|
|
@@ -22,14 +22,7 @@ Your prompt contains `<files_to_read>`, `<context>`, and `<lens>` blocks. You MU
|
|
|
22
22
|
|
|
23
23
|
## Turn Budget
|
|
24
24
|
|
|
25
|
-
|
|
26
|
-
- Spend early turns reading context and investigating your lens question
|
|
27
|
-
- Spend middle turns implementing changes and consulting documentation
|
|
28
|
-
- Reserve final turns for committing and writing proposal.md
|
|
29
|
-
|
|
30
|
-
**If you're past turn 12 and haven't started implementing**, simplify your approach. A small, focused change that works is better than an ambitious change that's incomplete.
|
|
31
|
-
|
|
32
|
-
**Context management**: After turn 8, avoid re-reading files you've already read. Reference your earlier analysis instead of re-running Glob/Grep searches.
|
|
25
|
+
Most proposals need **10-15 turns**. Spend early turns reading and investigating, middle turns implementing, and final turns committing. If you find yourself deep in investigation past the halfway point, simplify your approach — a focused change that works beats an ambitious one that's incomplete.
|
|
33
26
|
|
|
34
27
|
## Lens Protocol
|
|
35
28
|
|
|
@@ -44,19 +37,7 @@ You are NOT constrained to the lens topic. The lens gives you a starting perspec
|
|
|
44
37
|
|
|
45
38
|
## Your Workflow
|
|
46
39
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
**Orient** — Read .evolver.json, strategy.md, evolution_memory.md. Understand the framework, entry point, evaluators, current score, and what has been tried before.
|
|
50
|
-
|
|
51
|
-
**Investigate** — Read trace_insights.json and best_results.json. Understand which examples fail and why. If production_seed.json exists, understand real-world usage patterns. Focus on data relevant to your lens question.
|
|
52
|
-
|
|
53
|
-
**Decide** — Based on investigation, decide what to change. Consider:
|
|
54
|
-
- **Prompts**: system prompts, few-shot examples, output format instructions
|
|
55
|
-
- **Routing**: how queries are dispatched to different handlers
|
|
56
|
-
- **Tools**: tool definitions, tool selection logic
|
|
57
|
-
- **Architecture**: agent topology, chain structure, graph edges
|
|
58
|
-
- **Error handling**: retry logic, fallback strategies, timeout handling
|
|
59
|
-
- **Model selection**: which model for which task
|
|
40
|
+
Read the available context files (.evolver.json, strategy.md, evolution_memory.md, trace_insights.json, best_results.json, production_seed.json). Investigate your lens question. Decide what to change and implement it.
|
|
60
41
|
|
|
61
42
|
## Self-Abstention
|
|
62
43
|
|
|
@@ -74,34 +55,14 @@ To abstain, skip implementation and write only a `proposal.md`:
|
|
|
74
55
|
|
|
75
56
|
Then end with the return protocol using `ABSTAIN` as your approach.
|
|
76
57
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
**Before writing ANY code**, you MUST consult Context7 for every library you'll be modifying or using. This is NOT optional.
|
|
80
|
-
|
|
81
|
-
**Step 1 — Identify libraries from the code you read:**
|
|
82
|
-
Read the imports in the files you're about to modify. For each framework/library (LangGraph, OpenAI, Anthropic, CrewAI, etc.):
|
|
83
|
-
|
|
84
|
-
**Step 2 — Resolve library ID:**
|
|
85
|
-
```
|
|
86
|
-
resolve-library-id(libraryName: "langgraph", query: "what you're trying to do")
|
|
87
|
-
```
|
|
88
|
-
This returns up to 10 matches. Pick the one with the highest relevance.
|
|
89
|
-
|
|
90
|
-
**Step 3 — Query docs for your specific task:**
|
|
91
|
-
```
|
|
92
|
-
get-library-docs(libraryId: "/langchain-ai/langgraph", query: "conditional edges StateGraph", topic: "routing")
|
|
93
|
-
```
|
|
94
|
-
Ask about the SPECIFIC API you're going to use or change.
|
|
58
|
+
## Consult Documentation
|
|
95
59
|
|
|
96
|
-
|
|
97
|
-
- About to modify a StateGraph? → `query: "StateGraph add_conditional_edges"`
|
|
98
|
-
- Changing prompt template? → `query: "ChatPromptTemplate from_messages"` for langchain
|
|
99
|
-
- Adding a tool? → `query: "StructuredTool create tool definition"` for langchain
|
|
100
|
-
- Changing model? → `query: "ChatOpenAI model parameters temperature"` for openai
|
|
60
|
+
Before modifying library APIs (LangGraph, OpenAI, Anthropic, etc.), consult Context7 to verify you're using current patterns:
|
|
101
61
|
|
|
102
|
-
|
|
62
|
+
1. `resolve-library-id(libraryName: "langgraph")`
|
|
63
|
+
2. `get-library-docs(libraryId: "/langchain-ai/langgraph", query: "your specific API question")`
|
|
103
64
|
|
|
104
|
-
|
|
65
|
+
If Context7 MCP is not available, note in proposal.md that API patterns were not verified.
|
|
105
66
|
|
|
106
67
|
### Commit and Document
|
|
107
68
|
|
package/bin/install.js
CHANGED
|
@@ -481,21 +481,40 @@ async function configureOptionalIntegrations(rl, nonInteractive) {
|
|
|
481
481
|
step(c.bold("Optional Integrations"));
|
|
482
482
|
barEmpty();
|
|
483
483
|
|
|
484
|
-
//
|
|
485
|
-
|
|
484
|
+
// Helper: check if an MCP server is configured anywhere
|
|
485
|
+
function hasMcpServer(...names) {
|
|
486
486
|
try {
|
|
487
|
-
// Check settings.json / .claude.json
|
|
488
487
|
for (const p of [path.join(HOME, ".claude", "settings.json"), path.join(HOME, ".claude.json")]) {
|
|
489
|
-
if (fs.existsSync(p))
|
|
490
|
-
|
|
491
|
-
|
|
488
|
+
if (!fs.existsSync(p)) continue;
|
|
489
|
+
const s = JSON.parse(fs.readFileSync(p, "utf8"));
|
|
490
|
+
// Check top-level mcpServers
|
|
491
|
+
if (s.mcpServers) {
|
|
492
|
+
for (const name of names) {
|
|
493
|
+
if (s.mcpServers[name]) return true;
|
|
494
|
+
}
|
|
495
|
+
}
|
|
496
|
+
// Check per-project mcpServers (claude mcp add saves here)
|
|
497
|
+
if (s.projects) {
|
|
498
|
+
for (const proj of Object.values(s.projects)) {
|
|
499
|
+
if (proj && proj.mcpServers) {
|
|
500
|
+
for (const name of names) {
|
|
501
|
+
if (proj.mcpServers[name]) return true;
|
|
502
|
+
}
|
|
503
|
+
}
|
|
504
|
+
}
|
|
492
505
|
}
|
|
493
506
|
}
|
|
494
|
-
// Check plugin marketplace install
|
|
495
|
-
const pluginMcp = path.join(HOME, ".claude", "plugins", "marketplaces", "claude-plugins-official", "external_plugins", "context7", ".mcp.json");
|
|
496
|
-
if (fs.existsSync(pluginMcp)) return true;
|
|
497
507
|
} catch {}
|
|
498
508
|
return false;
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
// Context7 MCP — check settings, per-project, AND plugin marketplace
|
|
512
|
+
const hasContext7 = (() => {
|
|
513
|
+
if (hasMcpServer("context7", "Context7")) return true;
|
|
514
|
+
// Check plugin marketplace install
|
|
515
|
+
const pluginMcp = path.join(HOME, ".claude", "plugins", "marketplaces", "claude-plugins-official", "external_plugins", "context7", ".mcp.json");
|
|
516
|
+
if (fs.existsSync(pluginMcp)) return true;
|
|
517
|
+
return false;
|
|
499
518
|
})();
|
|
500
519
|
|
|
501
520
|
if (hasContext7) {
|
|
@@ -517,19 +536,11 @@ async function configureOptionalIntegrations(rl, nonInteractive) {
|
|
|
517
536
|
|
|
518
537
|
barEmpty();
|
|
519
538
|
|
|
520
|
-
// LangChain Docs MCP — check settings
|
|
539
|
+
// LangChain Docs MCP — check settings, per-project, AND plugin marketplace
|
|
521
540
|
const hasLcDocs = (() => {
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
const s = JSON.parse(fs.readFileSync(p, "utf8"));
|
|
526
|
-
if (s.mcpServers && (s.mcpServers["docs-langchain"] || s.mcpServers["LangChain Docs"])) return true;
|
|
527
|
-
}
|
|
528
|
-
}
|
|
529
|
-
// Check plugin marketplace install
|
|
530
|
-
const pluginMcp = path.join(HOME, ".claude", "plugins", "marketplaces", "claude-plugins-official", "external_plugins", "docs-langchain", ".mcp.json");
|
|
531
|
-
if (fs.existsSync(pluginMcp)) return true;
|
|
532
|
-
} catch {}
|
|
541
|
+
if (hasMcpServer("docs-langchain", "LangChain Docs")) return true;
|
|
542
|
+
const pluginMcp = path.join(HOME, ".claude", "plugins", "marketplaces", "claude-plugins-official", "external_plugins", "docs-langchain", ".mcp.json");
|
|
543
|
+
if (fs.existsSync(pluginMcp)) return true;
|
|
533
544
|
return false;
|
|
534
545
|
})();
|
|
535
546
|
|
package/package.json
CHANGED
package/skills/evolve/SKILL.md
CHANGED
|
@@ -131,119 +131,7 @@ If critical issues found, ask user whether to continue or fix first via AskUserQ
|
|
|
131
131
|
|
|
132
132
|
### 0.6. Dataset Health Check
|
|
133
133
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
```bash
|
|
137
|
-
$EVOLVER_PY $TOOLS/dataset_health.py \
|
|
138
|
-
--config .evolver.json \
|
|
139
|
-
--production-seed production_seed.json \
|
|
140
|
-
--output health_report.json 2>/dev/null
|
|
141
|
-
```
|
|
142
|
-
|
|
143
|
-
Read `health_report.json`. Print summary:
|
|
144
|
-
```bash
|
|
145
|
-
python3 -c "
|
|
146
|
-
import json, os
|
|
147
|
-
if os.path.exists('health_report.json'):
|
|
148
|
-
r = json.load(open('health_report.json'))
|
|
149
|
-
print(f'Dataset Health: {r[\"health_score\"]}/10 ({r[\"example_count\"]} examples)')
|
|
150
|
-
for issue in r.get('issues', []):
|
|
151
|
-
print(f' [{issue[\"severity\"]}] {issue[\"message\"]}')
|
|
152
|
-
"
|
|
153
|
-
```
|
|
154
|
-
|
|
155
|
-
### 0.7. Auto-Correct Dataset Issues
|
|
156
|
-
|
|
157
|
-
If `health_report.json` has corrections, apply them automatically:
|
|
158
|
-
|
|
159
|
-
```bash
|
|
160
|
-
CORRECTIONS=$(python3 -c "
|
|
161
|
-
import json, os
|
|
162
|
-
if os.path.exists('health_report.json'):
|
|
163
|
-
r = json.load(open('health_report.json'))
|
|
164
|
-
for c in r.get('corrections', []):
|
|
165
|
-
print(c['action'])
|
|
166
|
-
" 2>/dev/null)
|
|
167
|
-
```
|
|
168
|
-
|
|
169
|
-
For each correction:
|
|
170
|
-
|
|
171
|
-
**If `create_splits`**: Run inline Python to assign 70/30 splits:
|
|
172
|
-
```bash
|
|
173
|
-
$EVOLVER_PY -c "
|
|
174
|
-
from langsmith import Client
|
|
175
|
-
import json, random
|
|
176
|
-
client = Client()
|
|
177
|
-
config = json.load(open('.evolver.json'))
|
|
178
|
-
examples = list(client.list_examples(dataset_name=config['dataset']))
|
|
179
|
-
random.shuffle(examples)
|
|
180
|
-
sp = int(len(examples) * 0.7)
|
|
181
|
-
for ex in examples[:sp]:
|
|
182
|
-
client.update_example(ex.id, split='train')
|
|
183
|
-
for ex in examples[sp:]:
|
|
184
|
-
client.update_example(ex.id, split='held_out')
|
|
185
|
-
print(f'Assigned splits: {sp} train, {len(examples)-sp} held_out')
|
|
186
|
-
"
|
|
187
|
-
```
|
|
188
|
-
|
|
189
|
-
**If `generate_hard`**: Spawn testgen agent with hard-mode instruction:
|
|
190
|
-
```
|
|
191
|
-
Agent(
|
|
192
|
-
subagent_type: "evolver-testgen",
|
|
193
|
-
description: "Generate hard examples to rebalance dataset",
|
|
194
|
-
prompt: |
|
|
195
|
-
<objective>
|
|
196
|
-
The dataset is skewed toward easy examples. Generate {count} HARD examples
|
|
197
|
-
that the current agent is likely to fail on.
|
|
198
|
-
Focus on: edge cases, adversarial inputs, complex multi-step queries,
|
|
199
|
-
ambiguous questions, and inputs that require deep reasoning.
|
|
200
|
-
</objective>
|
|
201
|
-
<files_to_read>
|
|
202
|
-
- .evolver.json
|
|
203
|
-
- strategy.md (if exists)
|
|
204
|
-
- production_seed.json (if exists)
|
|
205
|
-
</files_to_read>
|
|
206
|
-
)
|
|
207
|
-
```
|
|
208
|
-
|
|
209
|
-
**If `fill_coverage`**: Spawn testgen agent with coverage-fill instruction:
|
|
210
|
-
```
|
|
211
|
-
Agent(
|
|
212
|
-
subagent_type: "evolver-testgen",
|
|
213
|
-
description: "Generate examples for missing categories",
|
|
214
|
-
prompt: |
|
|
215
|
-
<objective>
|
|
216
|
-
The dataset is missing these production categories: {categories}.
|
|
217
|
-
Generate 5 examples per missing category.
|
|
218
|
-
Use production_seed.json for real-world patterns in these categories.
|
|
219
|
-
</objective>
|
|
220
|
-
<files_to_read>
|
|
221
|
-
- .evolver.json
|
|
222
|
-
- production_seed.json (if exists)
|
|
223
|
-
</files_to_read>
|
|
224
|
-
)
|
|
225
|
-
```
|
|
226
|
-
|
|
227
|
-
**If `retire_dead`**: Move dead examples to retired split:
|
|
228
|
-
```bash
|
|
229
|
-
$EVOLVER_PY -c "
|
|
230
|
-
from langsmith import Client
|
|
231
|
-
import json
|
|
232
|
-
client = Client()
|
|
233
|
-
report = json.load(open('health_report.json'))
|
|
234
|
-
dead_ids = report.get('dead_examples', {}).get('ids', [])
|
|
235
|
-
config = json.load(open('.evolver.json'))
|
|
236
|
-
examples = {str(e.id): e for e in client.list_examples(dataset_name=config['dataset'])}
|
|
237
|
-
retired = 0
|
|
238
|
-
for eid in dead_ids:
|
|
239
|
-
if eid in examples:
|
|
240
|
-
client.update_example(examples[eid].id, split='retired')
|
|
241
|
-
retired += 1
|
|
242
|
-
print(f'Retired {retired} dead examples')
|
|
243
|
-
"
|
|
244
|
-
```
|
|
245
|
-
|
|
246
|
-
After corrections, log what was done. Do NOT re-run health check (corrections may need an experiment cycle to show effect).
|
|
134
|
+
Invoke `/evolver:health` to check and auto-correct dataset issues. If health_report.json shows critical issues that couldn't be auto-corrected, ask user whether to proceed via AskUserQuestion.
|
|
247
135
|
|
|
248
136
|
### 0.8. Resolve Project Directory
|
|
249
137
|
|
|
@@ -287,7 +175,7 @@ If a production project is configured, also gather production insights:
|
|
|
287
175
|
PROD=$(python3 -c "import json; c=json.load(open('.evolver.json')); print(c.get('production_project',''))")
|
|
288
176
|
if [ -n "$PROD" ] && [ ! -f "production_seed.json" ]; then
|
|
289
177
|
$EVOLVER_PY $TOOLS/seed_from_traces.py \
|
|
290
|
-
--project "$PROD"
|
|
178
|
+
--project "$PROD" \
|
|
291
179
|
--output-md production_seed.md \
|
|
292
180
|
--output-json production_seed.json \
|
|
293
181
|
--limit 100 2>/dev/null
|
|
@@ -309,25 +197,42 @@ fi
|
|
|
309
197
|
```
|
|
310
198
|
|
|
311
199
|
If `best_results.json` exists, parse it to find failing examples (score < 0.7). Group by metadata or error pattern.
|
|
312
|
-
This failure data feeds into
|
|
200
|
+
This failure data feeds into the strategy and lens generation step (1.8a).
|
|
313
201
|
If no best_results.json (first iteration without baseline), all proposers work from code analysis only — no failure data available.
|
|
314
202
|
|
|
315
|
-
### 1.8a.
|
|
203
|
+
### 1.8a. Generate Strategy and Lenses
|
|
316
204
|
|
|
317
|
-
|
|
205
|
+
Read the available analysis files:
|
|
206
|
+
- `trace_insights.json` (error clusters, token analysis)
|
|
207
|
+
- `best_results.json` (per-task scores and failures)
|
|
208
|
+
- `evolution_memory.json` / `evolution_memory.md` (cross-iteration insights)
|
|
209
|
+
- `production_seed.json` (real-world traffic patterns, if exists)
|
|
318
210
|
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
211
|
+
Based on this data, generate two files:
|
|
212
|
+
|
|
213
|
+
**`strategy.md`** — A concise strategy document with: target files, failure clusters (prioritized), recommended approaches (from evolution memory), approaches to avoid, top failing examples, and production insights.
|
|
214
|
+
|
|
215
|
+
**`lenses.json`** — Investigation questions for proposers, format:
|
|
216
|
+
```json
|
|
217
|
+
{
|
|
218
|
+
"generated_at": "ISO timestamp",
|
|
219
|
+
"lens_count": N,
|
|
220
|
+
"lenses": [
|
|
221
|
+
{"id": 1, "question": "...", "source": "failure_cluster|architecture|production|evolution_memory|uniform_failure|open", "severity": "critical|high|medium", "context": {}},
|
|
222
|
+
...
|
|
223
|
+
]
|
|
224
|
+
}
|
|
328
225
|
```
|
|
329
226
|
|
|
330
|
-
|
|
227
|
+
Lens generation rules:
|
|
228
|
+
- One lens per distinct failure cluster (max 3)
|
|
229
|
+
- One architecture lens if high-severity structural issues exist
|
|
230
|
+
- One production lens if production data shows problems
|
|
231
|
+
- One evolution memory lens if a pattern won 2+ times
|
|
232
|
+
- One persistent failure lens if a pattern recurred 3+ iterations
|
|
233
|
+
- If all examples fail with same error, one "uniform_failure" lens
|
|
234
|
+
- Always include one "open" lens
|
|
235
|
+
- Sort by severity (critical > high > medium), cap at max_proposers from config (default 5)
|
|
331
236
|
|
|
332
237
|
### 1.9. Prepare Shared Proposer Context
|
|
333
238
|
|
|
@@ -473,27 +378,7 @@ Then spawn ONE evaluator agent that scores ALL candidates in a single pass. This
|
|
|
473
378
|
Agent(
|
|
474
379
|
subagent_type: "evolver-evaluator",
|
|
475
380
|
description: "Evaluate all candidates for iteration v{NNN}",
|
|
476
|
-
prompt:
|
|
477
|
-
<experiment>
|
|
478
|
-
Evaluate the following experiments (one per candidate):
|
|
479
|
-
{list all experiment names from proposers that committed changes — skip abstained}
|
|
480
|
-
</experiment>
|
|
481
|
-
|
|
482
|
-
<evaluators>
|
|
483
|
-
Apply these evaluators to each run in each experiment:
|
|
484
|
-
- {llm_evaluator_list, e.g. "correctness", "conciseness"}
|
|
485
|
-
</evaluators>
|
|
486
|
-
|
|
487
|
-
<context>
|
|
488
|
-
Agent type: {framework} agent
|
|
489
|
-
Domain: {description from .evolver.json or entry point context}
|
|
490
|
-
Entry point: {entry_point}
|
|
491
|
-
|
|
492
|
-
For each experiment:
|
|
493
|
-
1. Read all runs via: langsmith-cli --json runs list --project "{experiment_name}" --fields id,inputs,outputs,error --is-root true --limit 200
|
|
494
|
-
2. Judge each run's output against the input
|
|
495
|
-
3. Write scores via: langsmith-cli --json feedback create {run_id} --key {evaluator} --score {0.0|1.0} --comment "{reason}" --source model
|
|
496
|
-
</context>
|
|
381
|
+
prompt: "Experiments to evaluate: {comma-separated experiment names from non-abstained proposers}. Evaluators: {llm_evaluator_list}. Framework: {framework}. Entry point: {entry_point}."
|
|
497
382
|
)
|
|
498
383
|
```
|
|
499
384
|
|
|
@@ -592,45 +477,18 @@ Print: `Iteration {i}/{N}: v{NNN} scored {score} (best: {best} at {best_score})`
|
|
|
592
477
|
|
|
593
478
|
### 6.2. Consolidate Evolution Memory
|
|
594
479
|
|
|
595
|
-
Spawn the consolidator agent
|
|
480
|
+
Spawn the consolidator agent (runs in background — doesn't block the next iteration):
|
|
596
481
|
|
|
597
482
|
```
|
|
598
483
|
Agent(
|
|
599
484
|
subagent_type: "evolver-consolidator",
|
|
600
485
|
description: "Consolidate evolution memory after iteration v{NNN}",
|
|
601
486
|
run_in_background: true,
|
|
602
|
-
prompt:
|
|
603
|
-
<objective>
|
|
604
|
-
Consolidate learnings from iteration v{NNN}.
|
|
605
|
-
Run the consolidation tool and review its output.
|
|
606
|
-
</objective>
|
|
607
|
-
|
|
608
|
-
<tools_path>
|
|
609
|
-
TOOLS={tools_path}
|
|
610
|
-
EVOLVER_PY={evolver_py_path}
|
|
611
|
-
</tools_path>
|
|
612
|
-
|
|
613
|
-
<instructions>
|
|
614
|
-
Run: $EVOLVER_PY $TOOLS/consolidate.py \
|
|
615
|
-
--config .evolver.json \
|
|
616
|
-
--comparison-files comparison.json \
|
|
617
|
-
--output evolution_memory.md \
|
|
618
|
-
--output-json evolution_memory.json
|
|
619
|
-
|
|
620
|
-
Then read the output and verify insights are accurate.
|
|
621
|
-
</instructions>
|
|
622
|
-
|
|
623
|
-
<files_to_read>
|
|
624
|
-
- .evolver.json
|
|
625
|
-
- comparison.json
|
|
626
|
-
- trace_insights.json (if exists)
|
|
627
|
-
- regression_report.json (if exists)
|
|
628
|
-
- evolution_memory.md (if exists)
|
|
629
|
-
</files_to_read>
|
|
487
|
+
prompt: "Update evolution_memory.md with learnings from this iteration. Read .evolver.json, comparison.json, trace_insights.json, regression_report.json (if exists), and current evolution_memory.md (if exists). Track what worked, what failed, and promote insights that recur across iterations."
|
|
630
488
|
)
|
|
631
489
|
```
|
|
632
490
|
|
|
633
|
-
The `evolution_memory.md` file will be
|
|
491
|
+
The `evolution_memory.md` file will be available for proposer briefings in subsequent iterations.
|
|
634
492
|
|
|
635
493
|
### 6.5. Auto-trigger Active Critic
|
|
636
494
|
|
|
@@ -639,25 +497,8 @@ If score jumped >0.3 from previous iteration OR reached target in <3 iterations:
|
|
|
639
497
|
```
|
|
640
498
|
Agent(
|
|
641
499
|
subagent_type: "evolver-critic",
|
|
642
|
-
description: "
|
|
643
|
-
prompt:
|
|
644
|
-
<objective>
|
|
645
|
-
EVAL GAMING CHECK: Score jumped from {prev_score} to {score}.
|
|
646
|
-
Check if the LangSmith evaluators are being gamed.
|
|
647
|
-
If gaming detected, add stricter evaluators using $TOOLS/add_evaluator.py.
|
|
648
|
-
</objective>
|
|
649
|
-
|
|
650
|
-
<tools_path>
|
|
651
|
-
TOOLS={tools_path}
|
|
652
|
-
EVOLVER_PY={evolver_py_path}
|
|
653
|
-
</tools_path>
|
|
654
|
-
|
|
655
|
-
<files_to_read>
|
|
656
|
-
- .evolver.json
|
|
657
|
-
- comparison.json
|
|
658
|
-
- trace_insights.json
|
|
659
|
-
- evolution_memory.md (if exists)
|
|
660
|
-
</files_to_read>
|
|
500
|
+
description: "Check evaluator gaming after score jump",
|
|
501
|
+
prompt: "Score jumped from {prev_score} to {score}. Check if LangSmith evaluators are being gamed. Read .evolver.json, comparison.json, trace_insights.json, evolution_memory.md. If gaming detected, add stricter evaluators using $EVOLVER_PY $TOOLS/add_evaluator.py."
|
|
661
502
|
)
|
|
662
503
|
```
|
|
663
504
|
|
|
@@ -674,55 +515,22 @@ If 3 consecutive iterations within 1% OR score dropped:
|
|
|
674
515
|
Agent(
|
|
675
516
|
subagent_type: "evolver-architect",
|
|
676
517
|
model: "opus",
|
|
677
|
-
description: "
|
|
678
|
-
prompt:
|
|
679
|
-
<objective>
|
|
680
|
-
The evolution loop has stagnated after {iterations} iterations.
|
|
681
|
-
Scores: {last_3_scores}.
|
|
682
|
-
Perform deep architectural analysis and recommend structural changes.
|
|
683
|
-
Use extended thinking — you have more compute budget than normal agents.
|
|
684
|
-
</objective>
|
|
685
|
-
|
|
686
|
-
<tools_path>
|
|
687
|
-
TOOLS={tools_path}
|
|
688
|
-
EVOLVER_PY={evolver_py_path}
|
|
689
|
-
</tools_path>
|
|
690
|
-
|
|
691
|
-
<files_to_read>
|
|
692
|
-
- .evolver.json
|
|
693
|
-
- trace_insights.json
|
|
694
|
-
- evolution_memory.md (if exists)
|
|
695
|
-
- evolution_memory.json (if exists)
|
|
696
|
-
- strategy.md (if exists)
|
|
697
|
-
- {entry point and all related source files}
|
|
698
|
-
</files_to_read>
|
|
518
|
+
description: "Deep topology analysis after stagnation",
|
|
519
|
+
prompt: "Evolution stagnated after {iterations} iterations. Scores: {last_3_scores}. Analyze architecture and recommend structural changes. Read .evolver.json, trace_insights.json, evolution_memory.md, strategy.md, and the entry point source files. Use $EVOLVER_PY $TOOLS/analyze_architecture.py for AST analysis if helpful."
|
|
699
520
|
)
|
|
700
521
|
```
|
|
701
522
|
|
|
702
523
|
After architect completes, include `architecture.md` in proposer `<files_to_read>` for next iteration.
|
|
703
524
|
|
|
704
|
-
### 8. Gate Check
|
|
705
|
-
|
|
706
|
-
Before starting the next iteration, run the gate check:
|
|
525
|
+
### 8. Gate Check
|
|
707
526
|
|
|
708
|
-
|
|
709
|
-
GATE_RESULT=$($EVOLVER_PY $TOOLS/iteration_gate.py --config .evolver.json 2>/dev/null)
|
|
710
|
-
PROCEED=$(echo "$GATE_RESULT" | python3 -c "import sys,json; print(json.load(sys.stdin).get('proceed', True))")
|
|
711
|
-
```
|
|
712
|
-
|
|
713
|
-
If `PROCEED` is `False`, check suggestions:
|
|
714
|
-
|
|
715
|
-
```bash
|
|
716
|
-
SUGGEST=$(echo "$GATE_RESULT" | python3 -c "import sys,json; s=json.load(sys.stdin).get('suggestions',[]); print(s[0] if s else '')")
|
|
717
|
-
```
|
|
527
|
+
Read `.evolver.json` history and assess whether to continue:
|
|
718
528
|
|
|
719
|
-
- If
|
|
720
|
-
-
|
|
721
|
-
-
|
|
529
|
+
- **Score plateau**: If last 3 scores are within 2% of each other, evolution may have converged. Consider triggering architect (Step 7) or stopping.
|
|
530
|
+
- **Target reached**: If `best_score >= target_score`, stop and report success.
|
|
531
|
+
- **Diminishing returns**: If average improvement over last 5 iterations is less than 0.5%, consider stopping.
|
|
722
532
|
|
|
723
|
-
|
|
724
|
-
- **Target**: `score >= target_score` → stop
|
|
725
|
-
- **N reached**: all requested iterations done → stop
|
|
533
|
+
If stopping, skip to the final report. If continuing, proceed to next iteration.
|
|
726
534
|
|
|
727
535
|
## When Loop Ends — Final Report
|
|
728
536
|
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: evolver:health
|
|
3
|
+
description: "Use when the user wants to check dataset quality, diagnose eval issues, or before running evolve. Checks size, difficulty distribution, dead examples, coverage, and splits. Auto-corrects issues found."
|
|
4
|
+
allowed-tools: [Read, Write, Edit, Bash, Glob, Grep, Agent, AskUserQuestion]
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# /evolver:health
|
|
8
|
+
|
|
9
|
+
Check eval dataset quality and auto-correct issues. Can be run independently or is invoked by `/evolver:evolve` before the iteration loop.
|
|
10
|
+
|
|
11
|
+
## Prerequisites
|
|
12
|
+
|
|
13
|
+
`.evolver.json` must exist. If not, tell user to run `/evolver:setup`.
|
|
14
|
+
|
|
15
|
+
## Resolve Tool Path and Python
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
TOOLS="${EVOLVER_TOOLS:-$([ -d ".evolver/tools" ] && echo ".evolver/tools" || echo "$HOME/.evolver/tools")}"
|
|
19
|
+
EVOLVER_PY="${EVOLVER_PY:-$([ -f "$HOME/.evolver/venv/bin/python" ] && echo "$HOME/.evolver/venv/bin/python" || echo "python3")}"
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## 1. Run Health Diagnostic
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
$EVOLVER_PY $TOOLS/dataset_health.py \
|
|
26
|
+
--config .evolver.json \
|
|
27
|
+
--production-seed production_seed.json \
|
|
28
|
+
--output health_report.json 2>/dev/null
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Print summary:
|
|
32
|
+
```bash
|
|
33
|
+
python3 -c "
|
|
34
|
+
import json, os
|
|
35
|
+
if os.path.exists('health_report.json'):
|
|
36
|
+
r = json.load(open('health_report.json'))
|
|
37
|
+
print(f'Dataset Health: {r[\"health_score\"]}/10 ({r[\"example_count\"]} examples)')
|
|
38
|
+
for issue in r.get('issues', []):
|
|
39
|
+
print(f' [{issue[\"severity\"]}] {issue[\"message\"]}')
|
|
40
|
+
if not r.get('issues'):
|
|
41
|
+
print(' No issues found.')
|
|
42
|
+
"
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## 2. Auto-Correct Issues
|
|
46
|
+
|
|
47
|
+
If `health_report.json` has corrections, apply them automatically:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
CORRECTIONS=$(python3 -c "
|
|
51
|
+
import json, os
|
|
52
|
+
if os.path.exists('health_report.json'):
|
|
53
|
+
r = json.load(open('health_report.json'))
|
|
54
|
+
for c in r.get('corrections', []):
|
|
55
|
+
print(c['action'])
|
|
56
|
+
" 2>/dev/null)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
For each correction:
|
|
60
|
+
|
|
61
|
+
**If `create_splits`**: Assign 70/30 train/held_out splits:
|
|
62
|
+
```bash
|
|
63
|
+
$EVOLVER_PY -c "
|
|
64
|
+
from langsmith import Client
|
|
65
|
+
import json, random
|
|
66
|
+
client = Client()
|
|
67
|
+
config = json.load(open('.evolver.json'))
|
|
68
|
+
examples = list(client.list_examples(dataset_name=config['dataset']))
|
|
69
|
+
random.shuffle(examples)
|
|
70
|
+
sp = int(len(examples) * 0.7)
|
|
71
|
+
for ex in examples[:sp]:
|
|
72
|
+
client.update_example(ex.id, split='train')
|
|
73
|
+
for ex in examples[sp:]:
|
|
74
|
+
client.update_example(ex.id, split='held_out')
|
|
75
|
+
print(f'Assigned splits: {sp} train, {len(examples)-sp} held_out')
|
|
76
|
+
"
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
**If `generate_hard`**: Spawn testgen agent to generate hard examples:
|
|
80
|
+
```
|
|
81
|
+
Agent(
|
|
82
|
+
subagent_type: "evolver-testgen",
|
|
83
|
+
description: "Generate hard examples to rebalance dataset",
|
|
84
|
+
prompt: "The dataset is skewed toward easy examples. Generate {count} HARD examples that the current agent is likely to fail on. Focus on edge cases, adversarial inputs, and complex multi-step queries. Read .evolver.json and production_seed.json for context."
|
|
85
|
+
)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
**If `fill_coverage`**: Spawn testgen agent for missing categories:
|
|
89
|
+
```
|
|
90
|
+
Agent(
|
|
91
|
+
subagent_type: "evolver-testgen",
|
|
92
|
+
description: "Generate examples for missing categories",
|
|
93
|
+
prompt: "The dataset is missing these production categories: {categories}. Generate 5 examples per missing category. Read .evolver.json and production_seed.json for context."
|
|
94
|
+
)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
**If `retire_dead`**: Move dead examples to retired split:
|
|
98
|
+
```bash
|
|
99
|
+
$EVOLVER_PY -c "
|
|
100
|
+
from langsmith import Client
|
|
101
|
+
import json
|
|
102
|
+
client = Client()
|
|
103
|
+
report = json.load(open('health_report.json'))
|
|
104
|
+
dead_ids = report.get('dead_examples', {}).get('ids', [])
|
|
105
|
+
config = json.load(open('.evolver.json'))
|
|
106
|
+
examples = {str(e.id): e for e in client.list_examples(dataset_name=config['dataset'])}
|
|
107
|
+
retired = 0
|
|
108
|
+
for eid in dead_ids:
|
|
109
|
+
if eid in examples:
|
|
110
|
+
client.update_example(examples[eid].id, split='retired')
|
|
111
|
+
retired += 1
|
|
112
|
+
print(f'Retired {retired} dead examples')
|
|
113
|
+
"
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
After corrections, log what was done.
|
|
117
|
+
|
|
118
|
+
## 3. Report
|
|
119
|
+
|
|
120
|
+
Print final health status. If critical issues remain that couldn't be auto-corrected, warn the user.
|
package/skills/setup/SKILL.md
CHANGED
|
@@ -86,82 +86,84 @@ The runner writes `{"input": "user question..."}` to a temp `.json` file and rep
|
|
|
86
86
|
|
|
87
87
|
If no placeholder and no `--input` flag is detected, the runner appends `--input <path> --output <path>`.
|
|
88
88
|
|
|
89
|
-
## Phase 2: Confirm
|
|
89
|
+
## Phase 2: Confirm Configuration (interactive)
|
|
90
90
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
```json
|
|
94
|
-
{
|
|
95
|
-
"questions": [{
|
|
96
|
-
"question": "Here's what I detected. Does this look right?\n\nEntry point: {path}\nFramework: {framework}\nRun command: {command}\nLangSmith: {status}",
|
|
97
|
-
"header": "Confirm",
|
|
98
|
-
"multiSelect": false,
|
|
99
|
-
"options": [
|
|
100
|
-
{"label": "Looks good, proceed", "description": "Continue with detected configuration"},
|
|
101
|
-
{"label": "Let me adjust", "description": "I'll provide correct paths and commands"},
|
|
102
|
-
{"label": "Wrong directory", "description": "I need to cd somewhere else first"}
|
|
103
|
-
]
|
|
104
|
-
}]
|
|
105
|
-
}
|
|
106
|
-
```
|
|
107
|
-
|
|
108
|
-
## Phase 3: What to Optimize (interactive)
|
|
91
|
+
Present all detected configuration in one view with smart defaults and ask for confirmation.
|
|
109
92
|
|
|
110
93
|
Use AskUserQuestion:
|
|
111
94
|
|
|
112
95
|
```json
|
|
113
96
|
{
|
|
114
97
|
"questions": [{
|
|
115
|
-
"question": "
|
|
116
|
-
"header": "
|
|
117
|
-
"multiSelect": true,
|
|
118
|
-
"options": [
|
|
119
|
-
{"label": "Accuracy", "description": "Correctness of outputs — LLM-as-judge evaluator"},
|
|
120
|
-
{"label": "Latency", "description": "Response time — track and minimize"},
|
|
121
|
-
{"label": "Token efficiency", "description": "Fewer tokens for same quality"},
|
|
122
|
-
{"label": "Error handling", "description": "Reduce failures, timeouts, crashes"}
|
|
123
|
-
]
|
|
124
|
-
}]
|
|
125
|
-
}
|
|
126
|
-
```
|
|
127
|
-
|
|
128
|
-
Map selections to evaluator configuration for setup.py.
|
|
129
|
-
|
|
130
|
-
## Phase 4: Test Data Source (interactive)
|
|
131
|
-
|
|
132
|
-
Use AskUserQuestion with **preview**:
|
|
133
|
-
|
|
134
|
-
```json
|
|
135
|
-
{
|
|
136
|
-
"questions": [{
|
|
137
|
-
"question": "Where should test inputs come from?",
|
|
138
|
-
"header": "Test data",
|
|
98
|
+
"question": "Here's the configuration for your project:\n\n**Entry point**: {command}\n**Framework**: {framework}\n**Python**: {venv_path or 'system python3'}\n**Optimization goals**: accuracy (correctness evaluator)\n**Test data**: generate 30 examples with AI\n\nDoes this look good?",
|
|
99
|
+
"header": "Setup Configuration",
|
|
139
100
|
"multiSelect": false,
|
|
140
101
|
"options": [
|
|
141
|
-
{
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
},
|
|
146
|
-
{
|
|
147
|
-
"label": "Generate from code",
|
|
148
|
-
"description": "AI generates test inputs by analyzing your code",
|
|
149
|
-
"preview": "## Generate from Code\n\nThe testgen agent reads your source code and generates\n30 diverse test inputs:\n- 40% standard cases\n- 20% edge cases\n- 20% cross-domain\n- 20% adversarial\n\nOutputs are scored by LLM-as-judge."
|
|
150
|
-
},
|
|
151
|
-
{
|
|
152
|
-
"label": "I have test data",
|
|
153
|
-
"description": "Point to an existing file with test inputs",
|
|
154
|
-
"preview": "## Provide Test Data\n\nSupported formats:\n- JSON array of inputs\n- JSON with {\"inputs\": {...}} objects\n- CSV with input columns\n\nExample:\n```json\n[\n {\"input\": \"What is Python?\"},\n {\"input\": \"Explain quantum computing\"}\n]\n```"
|
|
155
|
-
}
|
|
102
|
+
{"label": "Looks good, proceed", "description": "Use these settings and start setup"},
|
|
103
|
+
{"label": "Customize goals", "description": "Choose different optimization goals"},
|
|
104
|
+
{"label": "I have test data", "description": "Use existing JSON file or LangSmith project"},
|
|
105
|
+
{"label": "Let me adjust everything", "description": "Change entry point, framework, goals, and data source"}
|
|
156
106
|
]
|
|
157
107
|
}]
|
|
158
108
|
}
|
|
159
109
|
```
|
|
160
110
|
|
|
161
|
-
If "
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
111
|
+
**If "Looks good, proceed"**: Use defaults — goals=accuracy, data=generate 30 with testgen. Skip straight to Phase 3.
|
|
112
|
+
|
|
113
|
+
**If "Customize goals"**: Ask the goals question, then proceed to Phase 3 with testgen as default data source.
|
|
114
|
+
|
|
115
|
+
Use AskUserQuestion:
|
|
116
|
+
|
|
117
|
+
```json
|
|
118
|
+
{
|
|
119
|
+
"questions": [{
|
|
120
|
+
"question": "What do you want to optimize?",
|
|
121
|
+
"header": "Goals",
|
|
122
|
+
"multiSelect": true,
|
|
123
|
+
"options": [
|
|
124
|
+
{"label": "Accuracy", "description": "Correctness of outputs — LLM-as-judge evaluator"},
|
|
125
|
+
{"label": "Latency", "description": "Response time — track and minimize"},
|
|
126
|
+
{"label": "Token efficiency", "description": "Fewer tokens for same quality"},
|
|
127
|
+
{"label": "Error handling", "description": "Reduce failures, timeouts, crashes"}
|
|
128
|
+
]
|
|
129
|
+
}]
|
|
130
|
+
}
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
Map selections to evaluator configuration for setup.py.
|
|
134
|
+
|
|
135
|
+
**If "I have test data"**: Ask the data source question, then proceed to Phase 3 with accuracy as default goal.
|
|
136
|
+
|
|
137
|
+
Use AskUserQuestion with **preview**:
|
|
138
|
+
|
|
139
|
+
```json
|
|
140
|
+
{
|
|
141
|
+
"questions": [{
|
|
142
|
+
"question": "Where should test inputs come from?",
|
|
143
|
+
"header": "Test data",
|
|
144
|
+
"multiSelect": false,
|
|
145
|
+
"options": [
|
|
146
|
+
{
|
|
147
|
+
"label": "Import from LangSmith",
|
|
148
|
+
"description": "Use real production traces as test inputs",
|
|
149
|
+
"preview": "## Import from LangSmith\n\nFetches up to 100 recent traces from your production project.\nPrioritizes traces with negative feedback.\nCreates a LangSmith Dataset with real user inputs.\n\nRequires: an existing LangSmith project with traces."
|
|
150
|
+
},
|
|
151
|
+
{
|
|
152
|
+
"label": "I have a file",
|
|
153
|
+
"description": "Point to an existing file with test inputs",
|
|
154
|
+
"preview": "## Provide Test Data\n\nSupported formats:\n- JSON array of inputs\n- JSON with {\"inputs\": {...}} objects\n- CSV with input columns\n\nExample:\n```json\n[\n {\"input\": \"What is Python?\"},\n {\"input\": \"Explain quantum computing\"}\n]\n```"
|
|
155
|
+
}
|
|
156
|
+
]
|
|
157
|
+
}]
|
|
158
|
+
}
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
If "Import from LangSmith": discover projects and ask which one (same as v2 Phase 1.9).
|
|
162
|
+
If "I have a file": ask for file path.
|
|
163
|
+
|
|
164
|
+
**If "Let me adjust everything"**: Ask three questions in sequence — first confirm the detected entry point, framework, and run command (letting the user correct any of them), then ask the goals question, then the data source question. Use the goals and data source question formats shown above.
|
|
165
|
+
|
|
166
|
+
## Phase 3: Run Setup
|
|
165
167
|
|
|
166
168
|
Build the setup.py command based on all gathered information:
|
|
167
169
|
|
|
@@ -178,7 +180,7 @@ $EVOLVER_PY $TOOLS/setup.py \
|
|
|
178
180
|
|
|
179
181
|
If test data is to be generated with AI (the default when the user did not choose "Import from LangSmith" or "I have a file") AND no test data file exists, first spawn the testgen agent to generate inputs, then pass the generated file to setup.py.
|
|
180
182
|
|
|
181
|
-
## Phase
|
|
183
|
+
## Phase 4: Generate Test Data (if needed)
|
|
182
184
|
|
|
183
185
|
If testgen is needed, spawn it:
|
|
184
186
|
|
|
@@ -205,7 +207,7 @@ Agent(
|
|
|
205
207
|
|
|
206
208
|
Then pass `--dataset-from-file test_inputs.json` to setup.py.
|
|
207
209
|
|
|
208
|
-
## Phase
|
|
210
|
+
## Phase 5: Report
|
|
209
211
|
|
|
210
212
|
```
|
|
211
213
|
Setup complete!
|
|
@@ -1,8 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
"""Fetch and summarize production LangSmith traces for Harness Evolver.
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
production traces and produce:
|
|
4
|
+
Uses the LangSmith Python SDK to fetch production traces and produce:
|
|
6
5
|
1. A markdown seed file for the testgen agent (production_seed.md)
|
|
7
6
|
2. A JSON summary for programmatic use (production_seed.json)
|
|
8
7
|
|
|
@@ -11,85 +10,18 @@ Usage:
|
|
|
11
10
|
--project ceppem-langgraph \
|
|
12
11
|
--output-md production_seed.md \
|
|
13
12
|
--output-json production_seed.json \
|
|
14
|
-
[--api-key-env LANGSMITH_API_KEY] \
|
|
15
13
|
[--limit 100]
|
|
16
14
|
|
|
17
|
-
|
|
15
|
+
Requires: pip install langsmith
|
|
18
16
|
"""
|
|
19
17
|
|
|
20
18
|
import argparse
|
|
21
19
|
import json
|
|
22
20
|
import os
|
|
23
21
|
import sys
|
|
24
|
-
import urllib.parse
|
|
25
|
-
import urllib.request
|
|
26
22
|
from collections import Counter
|
|
27
23
|
from datetime import datetime, timezone
|
|
28
24
|
|
|
29
|
-
LANGSMITH_API_BASE = "https://api.smith.langchain.com/api/v1"
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
def langsmith_request(endpoint, api_key, method="GET", body=None, params=None):
|
|
33
|
-
"""Make a request to the LangSmith REST API."""
|
|
34
|
-
url = f"{LANGSMITH_API_BASE}/{endpoint}"
|
|
35
|
-
if params:
|
|
36
|
-
url += "?" + urllib.parse.urlencode(params)
|
|
37
|
-
|
|
38
|
-
headers = {
|
|
39
|
-
"x-api-key": api_key,
|
|
40
|
-
"Accept": "application/json",
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
data = None
|
|
44
|
-
if body is not None:
|
|
45
|
-
headers["Content-Type"] = "application/json"
|
|
46
|
-
data = json.dumps(body).encode("utf-8")
|
|
47
|
-
|
|
48
|
-
req = urllib.request.Request(url, data=data, headers=headers, method=method)
|
|
49
|
-
try:
|
|
50
|
-
with urllib.request.urlopen(req, timeout=30) as resp:
|
|
51
|
-
return json.loads(resp.read())
|
|
52
|
-
except urllib.error.HTTPError as e:
|
|
53
|
-
body_text = ""
|
|
54
|
-
try:
|
|
55
|
-
body_text = e.read().decode("utf-8", errors="replace")[:500]
|
|
56
|
-
except Exception:
|
|
57
|
-
pass
|
|
58
|
-
print(f"LangSmith API error {e.code}: {body_text}", file=sys.stderr)
|
|
59
|
-
return None
|
|
60
|
-
except Exception as e:
|
|
61
|
-
print(f"LangSmith API request failed: {e}", file=sys.stderr)
|
|
62
|
-
return None
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def fetch_runs(project_name, api_key, limit=100):
|
|
66
|
-
"""Fetch recent root runs from a LangSmith project."""
|
|
67
|
-
# Try POST /runs/query first (newer API)
|
|
68
|
-
body = {
|
|
69
|
-
"project_name": project_name,
|
|
70
|
-
"is_root": True,
|
|
71
|
-
"limit": limit,
|
|
72
|
-
}
|
|
73
|
-
result = langsmith_request("runs/query", api_key, method="POST", body=body)
|
|
74
|
-
if result and isinstance(result, dict):
|
|
75
|
-
return result.get("runs", result.get("results", []))
|
|
76
|
-
if result and isinstance(result, list):
|
|
77
|
-
return result
|
|
78
|
-
|
|
79
|
-
# Fallback: GET /runs with query params
|
|
80
|
-
params = {
|
|
81
|
-
"project_name": project_name,
|
|
82
|
-
"is_root": "true",
|
|
83
|
-
"limit": str(limit),
|
|
84
|
-
}
|
|
85
|
-
result = langsmith_request("runs", api_key, params=params)
|
|
86
|
-
if result and isinstance(result, list):
|
|
87
|
-
return result
|
|
88
|
-
if result and isinstance(result, dict):
|
|
89
|
-
return result.get("runs", result.get("results", []))
|
|
90
|
-
|
|
91
|
-
return []
|
|
92
|
-
|
|
93
25
|
|
|
94
26
|
def extract_input(run):
|
|
95
27
|
"""Extract user input from a run's inputs field."""
|
|
@@ -396,48 +328,35 @@ def generate_json_summary(analysis, project_name):
|
|
|
396
328
|
def main():
|
|
397
329
|
parser = argparse.ArgumentParser(description="Fetch and summarize production LangSmith traces")
|
|
398
330
|
parser.add_argument("--project", required=True, help="LangSmith project name")
|
|
399
|
-
parser.add_argument("--api-key-env", default="LANGSMITH_API_KEY",
|
|
400
|
-
help="Env var containing API key (default: LANGSMITH_API_KEY)")
|
|
401
331
|
parser.add_argument("--limit", type=int, default=100, help="Max traces to fetch (default: 100)")
|
|
402
332
|
parser.add_argument("--output-md", required=True, help="Output path for markdown seed")
|
|
403
333
|
parser.add_argument("--output-json", required=True, help="Output path for JSON summary")
|
|
404
|
-
|
|
405
|
-
|
|
334
|
+
# Kept for backwards compatibility — silently ignored (SDK is now the only mode)
|
|
335
|
+
parser.add_argument("--use-sdk", action="store_true", help=argparse.SUPPRESS)
|
|
406
336
|
args = parser.parse_args()
|
|
407
337
|
|
|
408
338
|
print(f"Fetching up to {args.limit} traces from LangSmith project '{args.project}'...")
|
|
409
339
|
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
}
|
|
431
|
-
runs.append(run_dict)
|
|
432
|
-
except ImportError:
|
|
433
|
-
print("langsmith package not installed. Use --use-sdk with pip install langsmith", file=sys.stderr)
|
|
434
|
-
sys.exit(1)
|
|
435
|
-
else:
|
|
436
|
-
api_key = os.environ.get(args.api_key_env, "")
|
|
437
|
-
if not api_key:
|
|
438
|
-
print(f"No API key found in ${args.api_key_env} — cannot fetch production traces", file=sys.stderr)
|
|
439
|
-
sys.exit(1)
|
|
440
|
-
runs = fetch_runs(args.project, api_key, args.limit)
|
|
340
|
+
from langsmith import Client
|
|
341
|
+
client = Client()
|
|
342
|
+
raw_runs = list(client.list_runs(
|
|
343
|
+
project_name=args.project, is_root=True, limit=args.limit,
|
|
344
|
+
))
|
|
345
|
+
# Convert SDK run objects to dicts matching our analysis format
|
|
346
|
+
runs = []
|
|
347
|
+
for r in raw_runs:
|
|
348
|
+
run_dict = {
|
|
349
|
+
"id": str(r.id),
|
|
350
|
+
"name": r.name,
|
|
351
|
+
"inputs": r.inputs,
|
|
352
|
+
"outputs": r.outputs,
|
|
353
|
+
"error": r.error,
|
|
354
|
+
"total_tokens": r.total_tokens,
|
|
355
|
+
"feedback_stats": None,
|
|
356
|
+
"start_time": r.start_time.isoformat() if r.start_time else None,
|
|
357
|
+
"end_time": r.end_time.isoformat() if r.end_time else None,
|
|
358
|
+
}
|
|
359
|
+
runs.append(run_dict)
|
|
441
360
|
|
|
442
361
|
if not runs:
|
|
443
362
|
print("No traces found. The project may be empty or the name may be wrong.")
|