harness-evolver 2.9.1 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/README.md +62 -117
  2. package/agents/evolver-architect.md +53 -0
  3. package/agents/evolver-critic.md +44 -0
  4. package/agents/evolver-proposer.md +128 -0
  5. package/agents/evolver-testgen.md +67 -0
  6. package/bin/install.js +181 -171
  7. package/package.json +7 -7
  8. package/skills/deploy/SKILL.md +49 -56
  9. package/skills/evolve/SKILL.md +156 -687
  10. package/skills/setup/SKILL.md +182 -0
  11. package/skills/status/SKILL.md +23 -21
  12. package/tools/read_results.py +240 -0
  13. package/tools/run_eval.py +202 -0
  14. package/tools/seed_from_traces.py +36 -8
  15. package/tools/setup.py +393 -0
  16. package/tools/trace_insights.py +86 -14
  17. package/agents/harness-evolver-architect.md +0 -173
  18. package/agents/harness-evolver-critic.md +0 -132
  19. package/agents/harness-evolver-judge.md +0 -110
  20. package/agents/harness-evolver-proposer.md +0 -317
  21. package/agents/harness-evolver-testgen.md +0 -112
  22. package/examples/classifier/README.md +0 -25
  23. package/examples/classifier/config.json +0 -3
  24. package/examples/classifier/eval.py +0 -58
  25. package/examples/classifier/harness.py +0 -111
  26. package/examples/classifier/tasks/task_001.json +0 -1
  27. package/examples/classifier/tasks/task_002.json +0 -1
  28. package/examples/classifier/tasks/task_003.json +0 -1
  29. package/examples/classifier/tasks/task_004.json +0 -1
  30. package/examples/classifier/tasks/task_005.json +0 -1
  31. package/examples/classifier/tasks/task_006.json +0 -1
  32. package/examples/classifier/tasks/task_007.json +0 -1
  33. package/examples/classifier/tasks/task_008.json +0 -1
  34. package/examples/classifier/tasks/task_009.json +0 -1
  35. package/examples/classifier/tasks/task_010.json +0 -1
  36. package/skills/architect/SKILL.md +0 -93
  37. package/skills/compare/SKILL.md +0 -73
  38. package/skills/critic/SKILL.md +0 -67
  39. package/skills/diagnose/SKILL.md +0 -96
  40. package/skills/import-traces/SKILL.md +0 -102
  41. package/skills/init/SKILL.md +0 -293
  42. package/tools/__pycache__/detect_stack.cpython-313.pyc +0 -0
  43. package/tools/__pycache__/init.cpython-313.pyc +0 -0
  44. package/tools/__pycache__/seed_from_traces.cpython-313.pyc +0 -0
  45. package/tools/__pycache__/trace_logger.cpython-313.pyc +0 -0
  46. package/tools/eval_llm_judge.py +0 -233
  47. package/tools/eval_passthrough.py +0 -55
  48. package/tools/evaluate.py +0 -255
  49. package/tools/import_traces.py +0 -229
  50. package/tools/init.py +0 -531
  51. package/tools/llm_api.py +0 -125
  52. package/tools/state.py +0 -219
  53. package/tools/test_growth.py +0 -230
  54. package/tools/trace_logger.py +0 -42
@@ -9,8 +9,8 @@ production traces and produce:
9
9
  Usage:
10
10
  python3 seed_from_traces.py \
11
11
  --project ceppem-langgraph \
12
- --output-md .harness-evolver/production_seed.md \
13
- --output-json .harness-evolver/production_seed.json \
12
+ --output-md production_seed.md \
13
+ --output-json production_seed.json \
14
14
  [--api-key-env LANGSMITH_API_KEY] \
15
15
  [--limit 100]
16
16
 
@@ -401,15 +401,43 @@ def main():
401
401
  parser.add_argument("--limit", type=int, default=100, help="Max traces to fetch (default: 100)")
402
402
  parser.add_argument("--output-md", required=True, help="Output path for markdown seed")
403
403
  parser.add_argument("--output-json", required=True, help="Output path for JSON summary")
404
+ parser.add_argument("--use-sdk", action="store_true",
405
+ help="Use langsmith Python SDK instead of REST API (v3 mode)")
404
406
  args = parser.parse_args()
405
407
 
406
- api_key = os.environ.get(args.api_key_env, "")
407
- if not api_key:
408
- print(f"No API key found in ${args.api_key_env} — cannot fetch production traces", file=sys.stderr)
409
- sys.exit(1)
410
-
411
408
  print(f"Fetching up to {args.limit} traces from LangSmith project '{args.project}'...")
412
- runs = fetch_runs(args.project, api_key, args.limit)
409
+
410
+ if args.use_sdk:
411
+ try:
412
+ from langsmith import Client
413
+ client = Client()
414
+ raw_runs = list(client.list_runs(
415
+ project_name=args.project, is_root=True, limit=args.limit,
416
+ ))
417
+ # Convert SDK run objects to dicts matching our format
418
+ runs = []
419
+ for r in raw_runs:
420
+ run_dict = {
421
+ "id": str(r.id),
422
+ "name": r.name,
423
+ "inputs": r.inputs,
424
+ "outputs": r.outputs,
425
+ "error": r.error,
426
+ "total_tokens": r.total_tokens,
427
+ "feedback_stats": None,
428
+ "start_time": r.start_time.isoformat() if r.start_time else None,
429
+ "end_time": r.end_time.isoformat() if r.end_time else None,
430
+ }
431
+ runs.append(run_dict)
432
+ except ImportError:
433
+ print("langsmith package not installed. Use --use-sdk with pip install langsmith", file=sys.stderr)
434
+ sys.exit(1)
435
+ else:
436
+ api_key = os.environ.get(args.api_key_env, "")
437
+ if not api_key:
438
+ print(f"No API key found in ${args.api_key_env} — cannot fetch production traces", file=sys.stderr)
439
+ sys.exit(1)
440
+ runs = fetch_runs(args.project, api_key, args.limit)
413
441
 
414
442
  if not runs:
415
443
  print("No traces found. The project may be empty or the name may be wrong.")
package/tools/setup.py ADDED
@@ -0,0 +1,393 @@
1
+ #!/usr/bin/env python3
2
+ """LangSmith Setup for Harness Evolver v3.
3
+
4
+ Configures the LangSmith environment for evolution:
5
+ - Creates/connects to a LangSmith project
6
+ - Creates a dataset from test inputs, production traces, or generated data
7
+ - Configures evaluators based on optimization goals
8
+ - Runs baseline evaluation
9
+ - Writes .evolver.json config
10
+
11
+ Usage:
12
+ python3 setup.py \
13
+ --project-name my-agent \
14
+ --entry-point "python main.py" \
15
+ --framework langgraph \
16
+ --goals accuracy,latency \
17
+ [--dataset-from-file inputs.json] \
18
+ [--dataset-from-langsmith production-project] \
19
+ [--production-project my-prod-project] \
20
+ [--evaluators correctness,conciseness]
21
+
22
+ Requires: pip install langsmith openevals
23
+ """
24
+
25
+ import argparse
26
+ import json
27
+ import os
28
+ import subprocess
29
+ import sys
30
+ import tempfile
31
+ from datetime import datetime, timezone
32
+
33
+
34
def check_dependencies():
    """Return the names of required third-party packages that are not importable.

    Checks for ``langsmith`` and ``openevals``; an empty list means the
    environment is ready.
    """
    missing = []
    for package in ("langsmith", "openevals"):
        try:
            __import__(package)  # noqa: F401 — probe only, result unused
        except ImportError:
            missing.append(package)
    return missing
46
+
47
+
48
def create_dataset_from_file(client, dataset_name, file_path):
    """Create a LangSmith dataset from a JSON file of inputs.

    The file may hold a list of strings/dicts, or a dict wrapping that list
    under "examples" or "tasks".  Returns (dataset, example_count).
    """

    def convert(item):
        # Bare strings become a single-field input example.
        if isinstance(item, str):
            return {"inputs": {"input": item}}
        if not isinstance(item, dict):
            return None  # anything else is silently skipped, as before

        # Support {"inputs": {...}}, {"input": "..."}, {"question": "..."},
        # or a raw dict used directly as the inputs payload.
        if "inputs" in item:
            example = {"inputs": item["inputs"]}
        elif "input" in item:
            example = {"inputs": {"input": item["input"]}}
        elif "question" in item:
            example = {"inputs": {"question": item["question"]}}
        else:
            example = {"inputs": item}

        # Carry over expected outputs when present.
        if "outputs" in item:
            example["outputs"] = item["outputs"]
        elif "expected" in item:
            example["outputs"] = {"expected": item["expected"]}

        if "metadata" in item:
            example["metadata"] = item["metadata"]
        return example

    with open(file_path) as fh:
        data = json.load(fh)

    if isinstance(data, dict):
        data = data.get("examples", data.get("tasks", [data]))

    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description=f"Evaluation dataset created from {os.path.basename(file_path)}",
    )

    examples = [ex for ex in (convert(item) for item in data) if ex is not None]
    if examples:
        client.create_examples(dataset_id=dataset.id, examples=examples)

    return dataset, len(examples)
92
+
93
+
94
def create_dataset_from_langsmith(client, dataset_name, source_project, limit=100):
    """Seed an evaluation dataset from root runs of an existing project.

    Returns (dataset, example_count); (None, 0) when the source project has
    no root runs.
    """
    traces = list(client.list_runs(
        project_name=source_project,
        is_root=True,
        limit=limit,
    ))
    if not traces:
        return None, 0

    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description=f"Evaluation dataset from production traces ({source_project})",
    )

    examples = []
    for trace in traces:
        if not trace.inputs:
            continue  # a run without inputs cannot become an example
        example = {"inputs": trace.inputs}
        if trace.outputs:
            example["outputs"] = trace.outputs
        examples.append(example)

    if examples:
        client.create_examples(dataset_id=dataset.id, examples=examples)

    return dataset, len(examples)
122
+
123
+
124
def create_empty_dataset(client, dataset_name):
    """Create a dataset with no examples; the testgen agent fills it later."""
    return client.create_dataset(
        dataset_name=dataset_name,
        description="Evaluation dataset (pending test generation)",
    )
131
+
132
+
133
def get_evaluators(goals, evaluator_names=None):
    """Build the evaluator list for the given optimization goals.

    Args:
        goals: list of goal strings ("accuracy", "latency", "conciseness",
            "token_efficiency", ...).
        evaluator_names: optional comma-separated override for the LLM-judge
            evaluator names; when None, judges are derived from ``goals``
            (falling back to "correctness" if none match).

    Returns:
        (evaluators, evaluator_keys): callables suitable for
        ``client.evaluate()`` and the feedback keys they emit.
    """
    evaluators = []
    evaluator_keys = []

    # Goals that map to an LLM-as-judge evaluator name.
    goal_map = {"accuracy": "correctness", "conciseness": "conciseness"}

    if evaluator_names:
        names = [n.strip() for n in evaluator_names.split(",")]
    else:
        names = [goal_map[goal] for goal in goals if goal in goal_map]
        if not names:
            names = ["correctness"]  # default judge

    # Only the LLM judges need openevals — import lazily so latency/token-only
    # configurations work without it installed.
    llm_names = [n for n in names if n in ("correctness", "accuracy", "conciseness", "brevity")]
    if llm_names:
        from openevals.llm import create_llm_as_judge
        from openevals.prompts import CORRECTNESS_PROMPT, CONCISENESS_PROMPT

        for name in llm_names:
            if name in ("correctness", "accuracy"):
                prompt, key = CORRECTNESS_PROMPT, "correctness"
            else:
                prompt, key = CONCISENESS_PROMPT, "conciseness"
            evaluators.append(create_llm_as_judge(
                prompt=prompt,
                feedback_key=key,
                model="openai:gpt-4.1-mini",
            ))
            evaluator_keys.append(key)

    # Code-based evaluators for latency/tokens.
    if "latency" in goals:
        def latency_eval(inputs, outputs, **kwargs):
            # Latency itself is read from traces; this only records whether the
            # run produced output.  The feedback key now matches the entry in
            # evaluator_keys (it previously reported "has_output", which nothing
            # downstream looked up).
            return {"key": "latency", "score": 1.0 if outputs else 0.0}
        evaluators.append(latency_eval)
        evaluator_keys.append("latency")

    if "token_efficiency" in goals:
        def token_eval(inputs, outputs, **kwargs):
            output_text = str(outputs.get("output", outputs.get("answer", "")))
            # Full score up to 2000 chars, then decays proportionally.
            score = min(1.0, 2000 / max(len(output_text), 1))
            return {"key": "token_efficiency", "score": score}
        evaluators.append(token_eval)
        evaluator_keys.append("token_efficiency")

    return evaluators, evaluator_keys
191
+
192
+
193
def make_target(entry_point, cwd=None):
    """Create a ``target(inputs)`` callable that runs the user's agent command.

    The entry point may contain "{input}" (replaced with the path of a temp
    JSON file holding the inputs) or "{input_json}" (replaced with the
    shell-quoted JSON payload itself); otherwise "--input <file> --output
    <file>" is appended.  The result is read from the output file, else
    parsed from stdout, else wrapped as {"output": ..., "error": ...}.
    """
    def target(inputs):
        import shlex  # local import: only needed for {input_json} quoting

        input_json = json.dumps(inputs)
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
            f.write(input_json)
            input_path = f.name

        output_path = input_path + ".out"
        try:
            # Build command — supports {input} / {input_json} placeholders.
            cmd = entry_point
            if "{input}" in cmd:
                cmd = cmd.replace("{input}", input_path)
            elif "{input_json}" in cmd:
                # Quote the payload: raw JSON contains spaces, braces and
                # quotes that the shell would otherwise split and strip,
                # corrupting the argument before the agent ever sees it.
                cmd = cmd.replace("{input_json}", shlex.quote(input_json))
            else:
                cmd = f"{cmd} --input {input_path} --output {output_path}"

            result = subprocess.run(
                cmd, shell=True, capture_output=True, text=True,
                timeout=120, cwd=cwd,
            )

            # Prefer an explicit output file if the agent wrote one.
            if os.path.exists(output_path):
                with open(output_path) as f:
                    return json.load(f)

            # Fallback: parse stdout as JSON, else wrap it verbatim.
            if result.stdout.strip():
                try:
                    return json.loads(result.stdout)
                except json.JSONDecodeError:
                    return {"output": result.stdout.strip()}

            return {"output": "", "error": result.stderr.strip() if result.returncode != 0 else None}

        except subprocess.TimeoutExpired:
            return {"output": "", "error": "TIMEOUT after 120s"}
        except Exception as e:
            return {"output": "", "error": str(e)}
        finally:
            # Always clean up the temp input/output files.
            for p in (input_path, output_path):
                if os.path.exists(p):
                    os.remove(p)

    return target
241
+
242
+
243
def run_baseline(client, dataset_name, entry_point, evaluators):
    """Run the baseline evaluation against the dataset.

    Args:
        client: LangSmith client.
        dataset_name: dataset to evaluate against.
        entry_point: agent command (wrapped by ``make_target``).
        evaluators: evaluator callables from ``get_evaluators``.

    Returns:
        (experiment_name, mean_score): the experiment created by
        ``client.evaluate`` and the mean of every per-evaluator score across
        all rows (0.0 when no scores were produced).
    """
    target = make_target(entry_point)

    results = client.evaluate(
        target,
        data=dataset_name,
        evaluators=evaluators,
        experiment_prefix="baseline",
        max_concurrency=1,
    )

    # NOTE(review): a previous version also fetched project stats via
    # client.read_project here, but the result was never used — dropped.

    # Mean of all individual evaluator scores across rows.
    scores = []
    for row in results:
        eval_results = row.evaluation_results
        if eval_results and eval_results.get("results"):
            for er in eval_results["results"]:
                if er.get("score") is not None:
                    scores.append(er["score"])

    mean_score = sum(scores) / len(scores) if scores else 0.0
    return results.experiment_name, mean_score
274
+
275
+
276
def main():
    """Set up LangSmith for evolution: dataset, evaluators, baseline, config."""
    parser = argparse.ArgumentParser(description="Setup LangSmith for Harness Evolver v3")
    parser.add_argument("--project-name", required=True, help="Name for the evolver project")
    parser.add_argument("--entry-point", required=True, help="Command to run the agent")
    parser.add_argument("--framework", default="unknown", help="Detected framework")
    parser.add_argument("--goals", default="accuracy", help="Comma-separated optimization goals")
    parser.add_argument("--dataset-from-file", default=None, help="Create dataset from JSON file")
    parser.add_argument("--dataset-from-langsmith", default=None, help="Create dataset from LangSmith project")
    parser.add_argument("--production-project", default=None, help="Production LangSmith project")
    parser.add_argument("--evaluators", default=None, help="Comma-separated evaluator names")
    parser.add_argument("--skip-baseline", action="store_true", help="Skip baseline evaluation")
    parser.add_argument("--output", default=".evolver.json", help="Output config path")
    args = parser.parse_args()

    # Fail fast if langsmith/openevals are missing.
    missing = check_dependencies()
    if missing:
        print(f"Missing packages: {', '.join(missing)}", file=sys.stderr)
        print(f"Install with: pip install {' '.join(missing)}", file=sys.stderr)
        sys.exit(1)

    from langsmith import Client
    client = Client()

    # Verify the connection before creating anything.
    try:
        client.list_datasets(limit=1)
        print("LangSmith connection verified.")
    except Exception as e:
        print(f"Failed to connect to LangSmith: {e}", file=sys.stderr)
        print("Check LANGSMITH_API_KEY is set correctly.", file=sys.stderr)
        sys.exit(1)

    project_name = f"evolver-{args.project_name}"
    dataset_name = f"{args.project_name}-eval-v1"
    goals = [g.strip() for g in args.goals.split(",")]

    # Create the dataset: from a file, from production traces, or empty
    # (to be populated by the testgen agent).
    print(f"Creating dataset '{dataset_name}'...")
    if args.dataset_from_file:
        dataset, count = create_dataset_from_file(client, dataset_name, args.dataset_from_file)
        print(f" Created from file: {count} examples")
    elif args.dataset_from_langsmith:
        dataset, count = create_dataset_from_langsmith(
            client, dataset_name, args.dataset_from_langsmith,
        )
        if not dataset:
            print(" No traces found in source project. Creating empty dataset.")
            dataset = create_empty_dataset(client, dataset_name)
            count = 0
        else:
            print(f" Created from LangSmith traces: {count} examples")
    else:
        dataset = create_empty_dataset(client, dataset_name)
        count = 0
        print(" Created empty dataset (testgen will populate)")

    # Configure evaluators from goals (or explicit --evaluators override).
    print(f"Configuring evaluators for goals: {goals}")
    evaluators, evaluator_keys = get_evaluators(goals, args.evaluators)
    print(f" Active evaluators: {evaluator_keys}")

    # Run the baseline evaluation unless skipped or the dataset is empty.
    baseline_experiment = None
    baseline_score = 0.0
    if not args.skip_baseline and count > 0:
        print(f"Running baseline evaluation ({count} examples)...")
        try:
            baseline_experiment, baseline_score = run_baseline(
                client, dataset_name, args.entry_point, evaluators,
            )
            print(f" Baseline score: {baseline_score:.3f}")
            print(f" Experiment: {baseline_experiment}")
        except Exception as e:
            # Best-effort: a failed baseline shouldn't abort setup.
            print(f" Baseline evaluation failed: {e}", file=sys.stderr)
            print(" Continuing with score 0.0")
    elif count == 0:
        print("Skipping baseline (no examples in dataset yet)")
    else:
        print("Skipping baseline (--skip-baseline)")

    # Persist the evolver config; "best_*" starts at the baseline.
    config = {
        "version": "3.0.0",
        "project": project_name,
        "dataset": dataset_name,
        "dataset_id": str(dataset.id) if dataset else None,
        "entry_point": args.entry_point,
        "evaluators": evaluator_keys,
        "optimization_goals": goals,
        "production_project": args.production_project,
        "baseline_experiment": baseline_experiment,
        "best_experiment": baseline_experiment,
        "best_score": baseline_score,
        "iterations": 0,
        "framework": args.framework,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "history": [{
            "version": "baseline",
            "experiment": baseline_experiment,
            "score": baseline_score,
        }] if baseline_experiment else [],
    }

    with open(args.output, "w") as f:
        json.dump(config, f, indent=2)

    print(f"\nSetup complete. Config saved to {args.output}")
    print(f" Project: {project_name}")
    print(f" Dataset: {dataset_name} ({count} examples)")
    print(f" Evaluators: {evaluator_keys}")
    if baseline_experiment:
        print(f" Baseline: {baseline_score:.3f}")
    # Plain string: the previous f-string had no placeholders (F541).
    print("\nNext: run /evolver:evolve")


if __name__ == "__main__":
    main()
@@ -5,15 +5,19 @@ Analyzes LangSmith traces + per-task scores to produce structured insights.
5
5
  Clusters errors, analyzes token usage, cross-references with scores,
6
6
  and generates data-driven hypotheses.
7
7
 
8
- Usage:
8
+ Usage (v3 — SDK mode):
9
9
  python3 trace_insights.py \
10
- --langsmith-runs .harness-evolver/langsmith_runs.json \
11
- --scores .harness-evolver/harnesses/v002/scores.json \
12
- --tasks-dir .harness-evolver/eval/tasks/ \
13
- --output .harness-evolver/trace_insights.json \
14
- [--langsmith-stats .harness-evolver/langsmith_stats.json]
10
+ --from-experiment "v003-2026-04-01" \
11
+ --output trace_insights.json
15
12
 
16
- Stdlib-only. No external dependencies.
13
+ Usage (legacy file mode):
14
+ python3 trace_insights.py \
15
+ --langsmith-runs langsmith_runs.json \
16
+ --scores scores.json \
17
+ --tasks-dir tasks/ \
18
+ --output trace_insights.json
19
+
20
+ Requires: pip install langsmith (for SDK mode)
17
21
  """
18
22
 
19
23
  import argparse
@@ -253,18 +257,85 @@ def identify_top_issues(error_clusters, response_analysis, score_cross_ref):
253
257
  return issues
254
258
 
255
259
 
260
def fetch_runs_from_langsmith(project_name, experiment_name=None, limit=50):
    """Pull root runs straight from the LangSmith SDK (v3 mode).

    The experiment name takes precedence over the project name as the source.
    Returns a list of slim run dicts; on any failure (SDK not installed,
    auth, network) a message is printed to stderr and [] is returned.
    """
    try:
        from langsmith import Client

        source = experiment_name or project_name
        fetched = Client().list_runs(
            project_name=source,
            is_root=True,
            limit=limit,
        )
        # Trim each run down to the fields the insight phases consume;
        # error and response text are truncated to keep entries small.
        return [
            {
                "name": run.name or "unknown",
                "tokens": run.total_tokens or 0,
                "error": run.error[:200] if run.error else None,
                "llm_response": str(run.outputs)[:300] if run.outputs else "",
            }
            for run in fetched
        ]
    except Exception as e:
        print(f"Failed to fetch from LangSmith: {e}", file=sys.stderr)
        return []
287
+
288
+
289
def fetch_scores_from_experiment(experiment_name):
    """Collect per-example feedback scores for a LangSmith experiment (v3).

    Returns {"combined_score": float, "per_task": {example_id: {"score": float}}},
    or None when the SDK is unavailable or the fetch fails.
    """
    def mean(values):
        return sum(values) / len(values) if values else 0.0

    try:
        from langsmith import Client
        client = Client()

        per_task = {}
        for run in client.list_runs(
            project_name=experiment_name,
            is_root=True,
            limit=200,
        ):
            # Key by the dataset example when available, else by the run id.
            task_id = str(run.reference_example_id or run.id)
            feedback = client.list_feedback(run_ids=[run.id])
            scores = [fb.score for fb in feedback if fb.score is not None]
            per_task[task_id] = {"score": mean(scores)}

        combined = mean([entry["score"] for entry in per_task.values()])
        return {"combined_score": combined, "per_task": per_task}
    except Exception as e:
        print(f"Failed to fetch experiment scores: {e}", file=sys.stderr)
        return None
316
+
317
+
256
318
  def main():
257
319
  parser = argparse.ArgumentParser(description="Generate trace insights from LangSmith data + scores")
258
- parser.add_argument("--langsmith-runs", required=True, help="Path to langsmith_runs.json")
320
+ parser.add_argument("--langsmith-runs", default=None, help="Path to langsmith_runs.json (v2 mode)")
259
321
  parser.add_argument("--langsmith-stats", help="Path to langsmith_stats.json (optional)")
260
- parser.add_argument("--scores", required=True, help="Path to best version's scores.json")
261
- parser.add_argument("--tasks-dir", required=True, help="Path to eval/tasks/ directory")
322
+ parser.add_argument("--scores", default=None, help="Path to scores.json (v2 mode)")
323
+ parser.add_argument("--tasks-dir", default=None, help="Path to eval/tasks/ directory (v2 mode)")
324
+ parser.add_argument("--from-project", default=None, help="LangSmith project name (v3 mode)")
325
+ parser.add_argument("--from-experiment", default=None, help="LangSmith experiment name (v3 mode)")
262
326
  parser.add_argument("--output", required=True, help="Output path for trace_insights.json")
263
327
  args = parser.parse_args()
264
328
 
265
- runs = load_json(args.langsmith_runs)
266
- stats = load_json(args.langsmith_stats)
267
- scores_data = load_json(args.scores)
329
+ # v3 mode: fetch directly from LangSmith
330
+ if args.from_project or args.from_experiment:
331
+ runs = fetch_runs_from_langsmith(args.from_project, args.from_experiment)
332
+ scores_data = fetch_scores_from_experiment(args.from_experiment) if args.from_experiment else None
333
+ stats = None
334
+ else:
335
+ # v2 mode: read from local files
336
+ runs = load_json(args.langsmith_runs)
337
+ stats = load_json(args.langsmith_stats)
338
+ scores_data = load_json(args.scores)
268
339
 
269
340
  if not runs and not scores_data:
270
341
  # Nothing to analyze — write minimal insights
@@ -291,7 +362,8 @@ def main():
291
362
  response_analysis = analyze_responses(runs)
292
363
 
293
364
  # Phase 2: Cross-reference with scores
294
- score_cross_ref = cross_reference_scores(runs, scores_data, args.tasks_dir)
365
+ tasks_dir = getattr(args, "tasks_dir", None)
366
+ score_cross_ref = cross_reference_scores(runs, scores_data, tasks_dir)
295
367
  token_score_corr = correlate_tokens_scores(runs, scores_data)
296
368
 
297
369
  # Phase 3: Generate hypotheses