harness-evolver 2.9.1 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -117
- package/agents/evolver-architect.md +53 -0
- package/agents/evolver-critic.md +44 -0
- package/agents/evolver-proposer.md +128 -0
- package/agents/evolver-testgen.md +67 -0
- package/bin/install.js +181 -171
- package/package.json +7 -7
- package/skills/deploy/SKILL.md +49 -56
- package/skills/evolve/SKILL.md +156 -687
- package/skills/setup/SKILL.md +182 -0
- package/skills/status/SKILL.md +23 -21
- package/tools/read_results.py +240 -0
- package/tools/run_eval.py +202 -0
- package/tools/seed_from_traces.py +36 -8
- package/tools/setup.py +393 -0
- package/tools/trace_insights.py +86 -14
- package/agents/harness-evolver-architect.md +0 -173
- package/agents/harness-evolver-critic.md +0 -132
- package/agents/harness-evolver-judge.md +0 -110
- package/agents/harness-evolver-proposer.md +0 -317
- package/agents/harness-evolver-testgen.md +0 -112
- package/examples/classifier/README.md +0 -25
- package/examples/classifier/config.json +0 -3
- package/examples/classifier/eval.py +0 -58
- package/examples/classifier/harness.py +0 -111
- package/examples/classifier/tasks/task_001.json +0 -1
- package/examples/classifier/tasks/task_002.json +0 -1
- package/examples/classifier/tasks/task_003.json +0 -1
- package/examples/classifier/tasks/task_004.json +0 -1
- package/examples/classifier/tasks/task_005.json +0 -1
- package/examples/classifier/tasks/task_006.json +0 -1
- package/examples/classifier/tasks/task_007.json +0 -1
- package/examples/classifier/tasks/task_008.json +0 -1
- package/examples/classifier/tasks/task_009.json +0 -1
- package/examples/classifier/tasks/task_010.json +0 -1
- package/skills/architect/SKILL.md +0 -93
- package/skills/compare/SKILL.md +0 -73
- package/skills/critic/SKILL.md +0 -67
- package/skills/diagnose/SKILL.md +0 -96
- package/skills/import-traces/SKILL.md +0 -102
- package/skills/init/SKILL.md +0 -293
- package/tools/__pycache__/detect_stack.cpython-313.pyc +0 -0
- package/tools/__pycache__/init.cpython-313.pyc +0 -0
- package/tools/__pycache__/seed_from_traces.cpython-313.pyc +0 -0
- package/tools/__pycache__/trace_logger.cpython-313.pyc +0 -0
- package/tools/eval_llm_judge.py +0 -233
- package/tools/eval_passthrough.py +0 -55
- package/tools/evaluate.py +0 -255
- package/tools/import_traces.py +0 -229
- package/tools/init.py +0 -531
- package/tools/llm_api.py +0 -125
- package/tools/state.py +0 -219
- package/tools/test_growth.py +0 -230
- package/tools/trace_logger.py +0 -42
|
@@ -9,8 +9,8 @@ production traces and produce:
|
|
|
9
9
|
Usage:
|
|
10
10
|
python3 seed_from_traces.py \
|
|
11
11
|
--project ceppem-langgraph \
|
|
12
|
-
--output-md
|
|
13
|
-
--output-json
|
|
12
|
+
--output-md production_seed.md \
|
|
13
|
+
--output-json production_seed.json \
|
|
14
14
|
[--api-key-env LANGSMITH_API_KEY] \
|
|
15
15
|
[--limit 100]
|
|
16
16
|
|
|
@@ -401,15 +401,43 @@ def main():
|
|
|
401
401
|
parser.add_argument("--limit", type=int, default=100, help="Max traces to fetch (default: 100)")
|
|
402
402
|
parser.add_argument("--output-md", required=True, help="Output path for markdown seed")
|
|
403
403
|
parser.add_argument("--output-json", required=True, help="Output path for JSON summary")
|
|
404
|
+
parser.add_argument("--use-sdk", action="store_true",
|
|
405
|
+
help="Use langsmith Python SDK instead of REST API (v3 mode)")
|
|
404
406
|
args = parser.parse_args()
|
|
405
407
|
|
|
406
|
-
api_key = os.environ.get(args.api_key_env, "")
|
|
407
|
-
if not api_key:
|
|
408
|
-
print(f"No API key found in ${args.api_key_env} — cannot fetch production traces", file=sys.stderr)
|
|
409
|
-
sys.exit(1)
|
|
410
|
-
|
|
411
408
|
print(f"Fetching up to {args.limit} traces from LangSmith project '{args.project}'...")
|
|
412
|
-
|
|
409
|
+
|
|
410
|
+
if args.use_sdk:
|
|
411
|
+
try:
|
|
412
|
+
from langsmith import Client
|
|
413
|
+
client = Client()
|
|
414
|
+
raw_runs = list(client.list_runs(
|
|
415
|
+
project_name=args.project, is_root=True, limit=args.limit,
|
|
416
|
+
))
|
|
417
|
+
# Convert SDK run objects to dicts matching our format
|
|
418
|
+
runs = []
|
|
419
|
+
for r in raw_runs:
|
|
420
|
+
run_dict = {
|
|
421
|
+
"id": str(r.id),
|
|
422
|
+
"name": r.name,
|
|
423
|
+
"inputs": r.inputs,
|
|
424
|
+
"outputs": r.outputs,
|
|
425
|
+
"error": r.error,
|
|
426
|
+
"total_tokens": r.total_tokens,
|
|
427
|
+
"feedback_stats": None,
|
|
428
|
+
"start_time": r.start_time.isoformat() if r.start_time else None,
|
|
429
|
+
"end_time": r.end_time.isoformat() if r.end_time else None,
|
|
430
|
+
}
|
|
431
|
+
runs.append(run_dict)
|
|
432
|
+
except ImportError:
|
|
433
|
+
print("langsmith package not installed. Use --use-sdk with pip install langsmith", file=sys.stderr)
|
|
434
|
+
sys.exit(1)
|
|
435
|
+
else:
|
|
436
|
+
api_key = os.environ.get(args.api_key_env, "")
|
|
437
|
+
if not api_key:
|
|
438
|
+
print(f"No API key found in ${args.api_key_env} — cannot fetch production traces", file=sys.stderr)
|
|
439
|
+
sys.exit(1)
|
|
440
|
+
runs = fetch_runs(args.project, api_key, args.limit)
|
|
413
441
|
|
|
414
442
|
if not runs:
|
|
415
443
|
print("No traces found. The project may be empty or the name may be wrong.")
|
package/tools/setup.py
ADDED
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""LangSmith Setup for Harness Evolver v3.
|
|
3
|
+
|
|
4
|
+
Configures the LangSmith environment for evolution:
|
|
5
|
+
- Creates/connects to a LangSmith project
|
|
6
|
+
- Creates a dataset from test inputs, production traces, or generated data
|
|
7
|
+
- Configures evaluators based on optimization goals
|
|
8
|
+
- Runs baseline evaluation
|
|
9
|
+
- Writes .evolver.json config
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
python3 setup.py \
|
|
13
|
+
--project-name my-agent \
|
|
14
|
+
--entry-point "python main.py" \
|
|
15
|
+
--framework langgraph \
|
|
16
|
+
--goals accuracy,latency \
|
|
17
|
+
[--dataset-from-file inputs.json] \
|
|
18
|
+
[--dataset-from-langsmith production-project] \
|
|
19
|
+
[--production-project my-prod-project] \
|
|
20
|
+
[--evaluators correctness,conciseness]
|
|
21
|
+
|
|
22
|
+
Requires: pip install langsmith openevals
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import argparse
|
|
26
|
+
import json
|
|
27
|
+
import os
|
|
28
|
+
import subprocess
|
|
29
|
+
import sys
|
|
30
|
+
import tempfile
|
|
31
|
+
from datetime import datetime, timezone
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def check_dependencies():
    """Report which required third-party packages cannot be imported.

    Returns:
        A list holding the names of the packages whose import raised
        ``ImportError``, checked in the order ``langsmith`` then
        ``openevals``. The list is empty when both import cleanly.
    """
    unavailable = []
    for package in ("langsmith", "openevals"):
        try:
            __import__(package)
        except ImportError:
            unavailable.append(package)
    return unavailable
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def create_dataset_from_file(client, dataset_name, file_path):
    """Create a LangSmith dataset from a JSON file of inputs.

    The file may contain a list of items, or a dict wrapping the list under
    an ``"examples"`` or ``"tasks"`` key (a dict with neither key is treated
    as a single item). Each item may be:

    * a string, stored as ``{"input": <string>}``;
    * a dict with ``"inputs"``, ``"input"`` or ``"question"``;
    * any other dict, whose keys become the inputs, except for the
      bookkeeping keys ``"outputs"``, ``"expected"`` and ``"metadata"``.

    Items of any other type are skipped.

    Args:
        client: Object exposing ``create_dataset(dataset_name=, description=)``
            and ``create_examples(dataset_id=, examples=)``.
        dataset_name: Name given to the new dataset.
        file_path: Path to the JSON file to read.

    Returns:
        A ``(dataset, example_count)`` tuple.
    """
    # Keys that describe an example rather than feed the agent. They must not
    # end up in the inputs, otherwise the expected answer is handed to the
    # agent under evaluation.
    reserved_keys = ("outputs", "expected", "metadata")

    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, dict):
        data = data.get("examples", data.get("tasks", [data]))

    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description=f"Evaluation dataset created from {os.path.basename(file_path)}",
    )

    examples = []
    for item in data:
        if isinstance(item, str):
            examples.append({"inputs": {"input": item}})
            continue
        if not isinstance(item, dict):
            continue

        # Support both {"input": "..."} and {"inputs": {"question": "..."}} formats
        if "inputs" in item:
            ex = {"inputs": item["inputs"]}
        elif "input" in item:
            ex = {"inputs": {"input": item["input"]}}
        elif "question" in item:
            ex = {"inputs": {"question": item["question"]}}
        else:
            ex = {
                "inputs": {
                    key: value
                    for key, value in item.items()
                    if key not in reserved_keys
                },
            }

        # Include expected outputs if present
        if "outputs" in item:
            ex["outputs"] = item["outputs"]
        elif "expected" in item:
            ex["outputs"] = {"expected": item["expected"]}

        # Include metadata
        if "metadata" in item:
            ex["metadata"] = item["metadata"]

        examples.append(ex)

    if examples:
        client.create_examples(dataset_id=dataset.id, examples=examples)

    return dataset, len(examples)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def create_dataset_from_langsmith(client, dataset_name, source_project, limit=100):
    """Build a dataset out of root runs recorded in another LangSmith project.

    Args:
        client: Object exposing ``list_runs``, ``create_dataset`` and
            ``create_examples``.
        dataset_name: Name given to the new dataset.
        source_project: Project whose root runs are read.
        limit: Maximum number of runs requested from ``list_runs``.

    Returns:
        ``(None, 0)`` when the source project yields no runs (no dataset is
        created in that case); otherwise ``(dataset, example_count)``, where
        only runs with truthy ``inputs`` are turned into examples.
    """
    root_runs = list(client.list_runs(
        project_name=source_project,
        is_root=True,
        limit=limit,
    ))
    if not root_runs:
        return None, 0

    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description=f"Evaluation dataset from production traces ({source_project})",
    )

    def to_example(run):
        # Outputs are attached only when the run actually recorded some.
        example = {"inputs": run.inputs}
        if run.outputs:
            example["outputs"] = run.outputs
        return example

    examples = [to_example(run) for run in root_runs if run.inputs]
    if examples:
        client.create_examples(dataset_id=dataset.id, examples=examples)

    return dataset, len(examples)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def create_empty_dataset(client, dataset_name):
    """Create a dataset with no examples and return it.

    The description marks it as awaiting test generation.
    """
    return client.create_dataset(
        dataset_name=dataset_name,
        description="Evaluation dataset (pending test generation)",
    )
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def get_evaluators(goals, evaluator_names=None, model="openai:gpt-4.1-mini"):
    """Build the evaluator list for the given optimization goals.

    Args:
        goals: Iterable of goal names. ``"accuracy"`` and ``"conciseness"``
            select LLM judges (unless ``evaluator_names`` is given);
            ``"latency"`` and ``"token_efficiency"`` add code-based
            evaluators.
        evaluator_names: Optional comma-separated judge names that override
            the goal-derived selection. Recognized names are
            ``correctness``/``accuracy`` and ``conciseness``/``brevity``;
            anything else is ignored.
        model: Model identifier handed to every LLM judge.

    Returns:
        ``(evaluators, evaluator_keys)``: the evaluator callables and the
        parallel list of keys describing them.
    """
    from openevals.llm import create_llm_as_judge
    from openevals.prompts import CORRECTNESS_PROMPT, CONCISENESS_PROMPT

    # Goal name -> judge name used when no explicit evaluator list is given.
    goal_map = {
        "accuracy": "correctness",
        "conciseness": "conciseness",
    }
    # Accepted judge name (including aliases) -> (feedback key, prompt).
    judges = {
        "correctness": ("correctness", CORRECTNESS_PROMPT),
        "accuracy": ("correctness", CORRECTNESS_PROMPT),
        "conciseness": ("conciseness", CONCISENESS_PROMPT),
        "brevity": ("conciseness", CONCISENESS_PROMPT),
    }

    if evaluator_names:
        names = [n.strip() for n in evaluator_names.split(",")]
    else:
        names = [goal_map[goal] for goal in goals if goal in goal_map]
        if not names:
            names = ["correctness"]  # default

    evaluators = []
    evaluator_keys = []

    for name in names:
        judge = judges.get(name)
        if judge is None:
            continue
        feedback_key, prompt = judge
        # An alias of an already-configured judge (e.g. "correctness,accuracy")
        # must not create a second, identical LLM judge.
        if feedback_key in evaluator_keys:
            continue
        evaluators.append(create_llm_as_judge(
            prompt=prompt,
            feedback_key=feedback_key,
            model=model,
        ))
        evaluator_keys.append(feedback_key)

    # Code-based evaluators for latency/tokens
    if "latency" in goals:
        def latency_eval(inputs, outputs, **kwargs):
            # Latency is captured in traces, not scored here
            # NOTE(review): the emitted feedback key is "has_output" while the
            # key recorded below is "latency" - confirm which one consumers use.
            return {"key": "has_output", "score": 1.0 if outputs else 0.0}
        evaluators.append(latency_eval)
        evaluator_keys.append("latency")

    if "token_efficiency" in goals:
        def token_eval(inputs, outputs, **kwargs):
            # A missing or non-dict output is scored like an empty answer
            # instead of raising inside the evaluator.
            payload = outputs if isinstance(outputs, dict) else {}
            output_text = str(payload.get("output", payload.get("answer", "")))
            # Penalize very long outputs (>2000 chars)
            score = min(1.0, 2000 / max(len(output_text), 1))
            return {"key": "token_efficiency", "score": score}
        evaluators.append(token_eval)
        evaluator_keys.append("token_efficiency")

    return evaluators, evaluator_keys
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def make_target(entry_point, cwd=None, timeout=120):
    """Create a target function that runs the user's agent as a subprocess.

    Args:
        entry_point: Shell command for the agent. ``{input}`` is replaced by
            the path of a temporary JSON file holding the inputs;
            ``{input_json}`` by the JSON text itself. With neither
            placeholder, ``--input <path> --output <path>`` is appended.
        cwd: Working directory for the subprocess.
        timeout: Seconds to wait before giving up on the subprocess.

    Returns:
        A function ``target(inputs)`` that returns, in order of preference:
        the JSON content of the output file, stdout parsed as JSON, stdout
        as ``{"output": <text>}``, or ``{"output": "", "error": ...}``.
        Failures (timeout or any other exception) are reported through the
        ``"error"`` key rather than raised.
    """
    def target(inputs):
        input_json = json.dumps(inputs)
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".json", delete=False, encoding="utf-8",
        ) as f:
            f.write(input_json)
            input_path = f.name

        output_path = input_path + ".out"
        try:
            # Build command — supports {input} placeholder
            cmd = entry_point
            if "{input}" in cmd:
                cmd = cmd.replace("{input}", input_path)
            elif "{input_json}" in cmd:
                # NOTE(review): the JSON text is spliced into a shell command
                # unescaped; inputs containing quotes or shell metacharacters
                # can break or alter the command. Prefer the {input} form.
                cmd = cmd.replace("{input_json}", input_json)
            else:
                cmd = f"{cmd} --input {input_path} --output {output_path}"

            result = subprocess.run(
                cmd, shell=True, capture_output=True, text=True,
                timeout=timeout, cwd=cwd,
            )

            # Try to read output file (read as UTF-8 regardless of locale)
            if os.path.exists(output_path):
                with open(output_path, encoding="utf-8") as f:
                    return json.load(f)

            # Fallback: parse stdout as JSON
            if result.stdout.strip():
                try:
                    return json.loads(result.stdout)
                except json.JSONDecodeError:
                    return {"output": result.stdout.strip()}

            return {"output": "", "error": result.stderr.strip() if result.returncode != 0 else None}

        except subprocess.TimeoutExpired:
            return {"output": "", "error": f"TIMEOUT after {timeout}s"}
        except Exception as e:
            return {"output": "", "error": str(e)}
        finally:
            # Always remove the temporary files, whatever the outcome.
            for p in (input_path, output_path):
                if os.path.exists(p):
                    os.remove(p)

    return target
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def run_baseline(client, dataset_name, entry_point, evaluators):
    """Run the baseline evaluation and return the experiment name and score.

    Args:
        client: Object exposing ``evaluate(target, data=, evaluators=,
            experiment_prefix=, max_concurrency=)``.
        dataset_name: Name of the dataset to evaluate against.
        entry_point: Agent command, wrapped with ``make_target``.
        evaluators: Evaluator callables forwarded to ``client.evaluate``.

    Returns:
        ``(experiment_name, mean_score)`` where ``mean_score`` is the mean of
        every non-``None`` evaluator score across all result rows, or ``0.0``
        when no score was produced.
    """
    target = make_target(entry_point)

    results = client.evaluate(
        target,
        data=dataset_name,
        evaluators=evaluators,
        experiment_prefix="baseline",
        max_concurrency=1,
    )

    experiment_name = results.experiment_name

    # Calculate mean score from results. Rows and evaluation results may be
    # mappings or objects depending on the SDK type, so every field is read
    # through a helper that accepts both.
    scores = []
    for row in results:
        evaluation = _read_field(row, "evaluation_results")
        for er in _read_field(evaluation, "results") or []:
            score = _read_field(er, "score")
            if score is not None:
                scores.append(score)

    mean_score = sum(scores) / len(scores) if scores else 0.0

    return experiment_name, mean_score


def _read_field(obj, name):
    """Return ``obj[name]`` for dicts or ``obj.name`` otherwise; ``None`` if absent."""
    if isinstance(obj, dict):
        return obj.get(name)
    return getattr(obj, name, None)
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def main():
    """Command-line entry point: set up LangSmith and write the evolver config.

    Parses the arguments, checks that the required packages import, verifies
    the LangSmith connection, creates the evaluation dataset (from a file,
    from another project's traces, or empty), builds the evaluators,
    optionally runs a baseline evaluation, and writes the resulting
    configuration as JSON to ``--output``.

    Exits with status 1 when packages are missing or the connection check
    fails. A failing baseline evaluation is reported and the score stays 0.0.
    """
    parser = argparse.ArgumentParser(description="Setup LangSmith for Harness Evolver v3")
    parser.add_argument("--project-name", required=True, help="Name for the evolver project")
    parser.add_argument("--entry-point", required=True, help="Command to run the agent")
    parser.add_argument("--framework", default="unknown", help="Detected framework")
    parser.add_argument("--goals", default="accuracy", help="Comma-separated optimization goals")
    parser.add_argument("--dataset-from-file", default=None, help="Create dataset from JSON file")
    parser.add_argument("--dataset-from-langsmith", default=None, help="Create dataset from LangSmith project")
    parser.add_argument("--production-project", default=None, help="Production LangSmith project")
    parser.add_argument("--evaluators", default=None, help="Comma-separated evaluator names")
    parser.add_argument("--skip-baseline", action="store_true", help="Skip baseline evaluation")
    parser.add_argument("--output", default=".evolver.json", help="Output config path")
    args = parser.parse_args()

    # Check dependencies
    missing = check_dependencies()
    if missing:
        print(f"Missing packages: {', '.join(missing)}", file=sys.stderr)
        print(f"Install with: pip install {' '.join(missing)}", file=sys.stderr)
        sys.exit(1)

    from langsmith import Client
    client = Client()

    # Verify connection. The listing is lazy, so pull one item to make sure a
    # request is really sent before reporting success.
    try:
        next(iter(client.list_datasets(limit=1)), None)
        print("LangSmith connection verified.")
    except Exception as e:
        print(f"Failed to connect to LangSmith: {e}", file=sys.stderr)
        print("Check LANGSMITH_API_KEY is set correctly.", file=sys.stderr)
        sys.exit(1)

    project_name = f"evolver-{args.project_name}"
    dataset_name = f"{args.project_name}-eval-v1"
    # Drop empty tokens produced by stray or trailing commas.
    goals = [g.strip() for g in args.goals.split(",") if g.strip()]

    # Create dataset
    print(f"Creating dataset '{dataset_name}'...")
    if args.dataset_from_file:
        dataset, count = create_dataset_from_file(client, dataset_name, args.dataset_from_file)
        print(f"  Created from file: {count} examples")
    elif args.dataset_from_langsmith:
        dataset, count = create_dataset_from_langsmith(
            client, dataset_name, args.dataset_from_langsmith,
        )
        if not dataset:
            print("  No traces found in source project. Creating empty dataset.")
            dataset = create_empty_dataset(client, dataset_name)
            count = 0
        else:
            print(f"  Created from LangSmith traces: {count} examples")
    else:
        dataset = create_empty_dataset(client, dataset_name)
        count = 0
        print("  Created empty dataset (testgen will populate)")

    # Configure evaluators
    print(f"Configuring evaluators for goals: {goals}")
    evaluators, evaluator_keys = get_evaluators(goals, args.evaluators)
    print(f"  Active evaluators: {evaluator_keys}")

    # Run baseline
    baseline_experiment = None
    baseline_score = 0.0
    if not args.skip_baseline and count > 0:
        print(f"Running baseline evaluation ({count} examples)...")
        try:
            baseline_experiment, baseline_score = run_baseline(
                client, dataset_name, args.entry_point, evaluators,
            )
            print(f"  Baseline score: {baseline_score:.3f}")
            print(f"  Experiment: {baseline_experiment}")
        except Exception as e:
            # Best effort: setup still completes without a baseline.
            print(f"  Baseline evaluation failed: {e}", file=sys.stderr)
            print("  Continuing with score 0.0")
    elif count == 0:
        print("Skipping baseline (no examples in dataset yet)")
    else:
        print("Skipping baseline (--skip-baseline)")

    # Write config
    config = {
        "version": "3.0.0",
        "project": project_name,
        "dataset": dataset_name,
        "dataset_id": str(dataset.id) if dataset else None,
        "entry_point": args.entry_point,
        "evaluators": evaluator_keys,
        "optimization_goals": goals,
        "production_project": args.production_project,
        "baseline_experiment": baseline_experiment,
        "best_experiment": baseline_experiment,
        "best_score": baseline_score,
        "iterations": 0,
        "framework": args.framework,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "history": [{
            "version": "baseline",
            "experiment": baseline_experiment,
            "score": baseline_score,
        }] if baseline_experiment else [],
    }

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)

    print(f"\nSetup complete. Config saved to {args.output}")
    print(f"  Project: {project_name}")
    print(f"  Dataset: {dataset_name} ({count} examples)")
    print(f"  Evaluators: {evaluator_keys}")
    if baseline_experiment:
        print(f"  Baseline: {baseline_score:.3f}")
    print("\nNext: run /evolver:evolve")
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
if __name__ == "__main__":
|
|
393
|
+
main()
|
package/tools/trace_insights.py
CHANGED
|
@@ -5,15 +5,19 @@ Analyzes LangSmith traces + per-task scores to produce structured insights.
|
|
|
5
5
|
Clusters errors, analyzes token usage, cross-references with scores,
|
|
6
6
|
and generates data-driven hypotheses.
|
|
7
7
|
|
|
8
|
-
Usage:
|
|
8
|
+
Usage (v3 — SDK mode):
|
|
9
9
|
python3 trace_insights.py \
|
|
10
|
-
--
|
|
11
|
-
--
|
|
12
|
-
--tasks-dir .harness-evolver/eval/tasks/ \
|
|
13
|
-
--output .harness-evolver/trace_insights.json \
|
|
14
|
-
[--langsmith-stats .harness-evolver/langsmith_stats.json]
|
|
10
|
+
--from-experiment "v003-2026-04-01" \
|
|
11
|
+
--output trace_insights.json
|
|
15
12
|
|
|
16
|
-
|
|
13
|
+
Usage (legacy — file mode):
|
|
14
|
+
python3 trace_insights.py \
|
|
15
|
+
--langsmith-runs langsmith_runs.json \
|
|
16
|
+
--scores scores.json \
|
|
17
|
+
--tasks-dir tasks/ \
|
|
18
|
+
--output trace_insights.json
|
|
19
|
+
|
|
20
|
+
Requires: pip install langsmith (for SDK mode)
|
|
17
21
|
"""
|
|
18
22
|
|
|
19
23
|
import argparse
|
|
@@ -253,18 +257,85 @@ def identify_top_issues(error_clusters, response_analysis, score_cross_ref):
|
|
|
253
257
|
return issues
|
|
254
258
|
|
|
255
259
|
|
|
260
|
+
def fetch_runs_from_langsmith(project_name, experiment_name=None, limit=50):
    """Fetch root runs through the LangSmith SDK and summarize them (v3 mode).

    Args:
        project_name: Project to read when no experiment name is given.
        experiment_name: Takes precedence over ``project_name`` when truthy.
        limit: Maximum number of runs requested.

    Returns:
        A list of dicts with the keys ``name``, ``tokens``, ``error`` (first
        200 characters, or ``None``) and ``llm_response`` (first 300
        characters of the stringified outputs, or ``""``). Any failure,
        including a missing ``langsmith`` package, is printed to stderr and
        yields an empty list.
    """
    try:
        from langsmith import Client

        client = Client()
        root_runs = list(client.list_runs(
            project_name=experiment_name or project_name,
            is_root=True,
            limit=limit,
        ))
        return [
            {
                "name": run.name or "unknown",
                "tokens": run.total_tokens or 0,
                "error": run.error[:200] if run.error else None,
                "llm_response": str(run.outputs)[:300] if run.outputs else "",
            }
            for run in root_runs
        ]
    except Exception as e:
        print(f"Failed to fetch from LangSmith: {e}", file=sys.stderr)
        return []
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def fetch_scores_from_experiment(experiment_name):
    """Collect per-example scores for a LangSmith experiment (v3 mode).

    Up to 200 root runs are read. Each run's score is the mean of its
    feedback scores that are not ``None`` (``0.0`` when it has none), keyed
    by the run's reference example id, or by the run id when that is unset.

    Returns:
        ``{"combined_score": <mean of per-task scores>, "per_task": {...}}``,
        or ``None`` after printing the error to stderr when anything fails,
        including a missing ``langsmith`` package.
    """
    try:
        from langsmith import Client

        client = Client()
        root_runs = list(client.list_runs(
            project_name=experiment_name,
            is_root=True,
            limit=200,
        ))

        def mean(values):
            return sum(values) / len(values) if values else 0.0

        per_task = {}
        for run in root_runs:
            task_key = str(run.reference_example_id or run.id)
            graded = [
                fb.score
                for fb in client.list_feedback(run_ids=[run.id])
                if fb.score is not None
            ]
            per_task[task_key] = {"score": mean(graded)}

        combined = mean([entry["score"] for entry in per_task.values()])
        return {"combined_score": combined, "per_task": per_task}
    except Exception as e:
        print(f"Failed to fetch experiment scores: {e}", file=sys.stderr)
        return None
|
|
316
|
+
|
|
317
|
+
|
|
256
318
|
def main():
|
|
257
319
|
parser = argparse.ArgumentParser(description="Generate trace insights from LangSmith data + scores")
|
|
258
|
-
parser.add_argument("--langsmith-runs",
|
|
320
|
+
parser.add_argument("--langsmith-runs", default=None, help="Path to langsmith_runs.json (v2 mode)")
|
|
259
321
|
parser.add_argument("--langsmith-stats", help="Path to langsmith_stats.json (optional)")
|
|
260
|
-
parser.add_argument("--scores",
|
|
261
|
-
parser.add_argument("--tasks-dir",
|
|
322
|
+
parser.add_argument("--scores", default=None, help="Path to scores.json (v2 mode)")
|
|
323
|
+
parser.add_argument("--tasks-dir", default=None, help="Path to eval/tasks/ directory (v2 mode)")
|
|
324
|
+
parser.add_argument("--from-project", default=None, help="LangSmith project name (v3 mode)")
|
|
325
|
+
parser.add_argument("--from-experiment", default=None, help="LangSmith experiment name (v3 mode)")
|
|
262
326
|
parser.add_argument("--output", required=True, help="Output path for trace_insights.json")
|
|
263
327
|
args = parser.parse_args()
|
|
264
328
|
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
329
|
+
# v3 mode: fetch directly from LangSmith
|
|
330
|
+
if args.from_project or args.from_experiment:
|
|
331
|
+
runs = fetch_runs_from_langsmith(args.from_project, args.from_experiment)
|
|
332
|
+
scores_data = fetch_scores_from_experiment(args.from_experiment) if args.from_experiment else None
|
|
333
|
+
stats = None
|
|
334
|
+
else:
|
|
335
|
+
# v2 mode: read from local files
|
|
336
|
+
runs = load_json(args.langsmith_runs)
|
|
337
|
+
stats = load_json(args.langsmith_stats)
|
|
338
|
+
scores_data = load_json(args.scores)
|
|
268
339
|
|
|
269
340
|
if not runs and not scores_data:
|
|
270
341
|
# Nothing to analyze — write minimal insights
|
|
@@ -291,7 +362,8 @@ def main():
|
|
|
291
362
|
response_analysis = analyze_responses(runs)
|
|
292
363
|
|
|
293
364
|
# Phase 2: Cross-reference with scores
|
|
294
|
-
|
|
365
|
+
tasks_dir = getattr(args, "tasks_dir", None)
|
|
366
|
+
score_cross_ref = cross_reference_scores(runs, scores_data, tasks_dir)
|
|
295
367
|
token_score_corr = correlate_tokens_scores(runs, scores_data)
|
|
296
368
|
|
|
297
369
|
# Phase 3: Generate hypotheses
|