harness-evolver 2.9.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -117
- package/agents/evolver-architect.md +53 -0
- package/agents/evolver-critic.md +44 -0
- package/agents/evolver-proposer.md +128 -0
- package/agents/evolver-testgen.md +67 -0
- package/bin/install.js +181 -171
- package/package.json +7 -7
- package/skills/deploy/SKILL.md +49 -56
- package/skills/evolve/SKILL.md +180 -700
- package/skills/setup/SKILL.md +182 -0
- package/skills/status/SKILL.md +23 -21
- package/tools/read_results.py +240 -0
- package/tools/run_eval.py +202 -0
- package/tools/seed_from_traces.py +36 -8
- package/tools/setup.py +393 -0
- package/tools/trace_insights.py +86 -14
- package/agents/harness-evolver-architect.md +0 -173
- package/agents/harness-evolver-critic.md +0 -132
- package/agents/harness-evolver-judge.md +0 -110
- package/agents/harness-evolver-proposer.md +0 -317
- package/agents/harness-evolver-testgen.md +0 -112
- package/examples/classifier/README.md +0 -25
- package/examples/classifier/config.json +0 -3
- package/examples/classifier/eval.py +0 -58
- package/examples/classifier/harness.py +0 -111
- package/examples/classifier/tasks/task_001.json +0 -1
- package/examples/classifier/tasks/task_002.json +0 -1
- package/examples/classifier/tasks/task_003.json +0 -1
- package/examples/classifier/tasks/task_004.json +0 -1
- package/examples/classifier/tasks/task_005.json +0 -1
- package/examples/classifier/tasks/task_006.json +0 -1
- package/examples/classifier/tasks/task_007.json +0 -1
- package/examples/classifier/tasks/task_008.json +0 -1
- package/examples/classifier/tasks/task_009.json +0 -1
- package/examples/classifier/tasks/task_010.json +0 -1
- package/skills/architect/SKILL.md +0 -93
- package/skills/compare/SKILL.md +0 -73
- package/skills/critic/SKILL.md +0 -67
- package/skills/diagnose/SKILL.md +0 -96
- package/skills/import-traces/SKILL.md +0 -102
- package/skills/init/SKILL.md +0 -253
- package/tools/__pycache__/detect_stack.cpython-313.pyc +0 -0
- package/tools/__pycache__/init.cpython-313.pyc +0 -0
- package/tools/__pycache__/seed_from_traces.cpython-313.pyc +0 -0
- package/tools/__pycache__/trace_logger.cpython-313.pyc +0 -0
- package/tools/eval_llm_judge.py +0 -233
- package/tools/eval_passthrough.py +0 -55
- package/tools/evaluate.py +0 -255
- package/tools/import_traces.py +0 -229
- package/tools/init.py +0 -531
- package/tools/llm_api.py +0 -125
- package/tools/state.py +0 -219
- package/tools/test_growth.py +0 -230
- package/tools/trace_logger.py +0 -42
package/tools/import_traces.py
DELETED
|
@@ -1,229 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""Import LangSmith Traces as Eval Tasks for Harness Evolver.
|
|
3
|
-
|
|
4
|
-
Transforms LangSmith trace JSON (from langsmith-cli) into task JSON files
|
|
5
|
-
for the evaluation set. Prioritizes traces with negative feedback.
|
|
6
|
-
|
|
7
|
-
Usage:
|
|
8
|
-
python3 import_traces.py \
|
|
9
|
-
--traces-json /tmp/langsmith_traces.json \
|
|
10
|
-
--output-dir .harness-evolver/eval/tasks/ \
|
|
11
|
-
--prefix imported \
|
|
12
|
-
[--max-tasks 30]
|
|
13
|
-
|
|
14
|
-
Stdlib-only. No external dependencies.
|
|
15
|
-
"""
|
|
16
|
-
|
|
17
|
-
import argparse
|
|
18
|
-
import hashlib
|
|
19
|
-
import json
|
|
20
|
-
import os
|
|
21
|
-
import re
|
|
22
|
-
import sys
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def load_json(path):
    """Load and parse a JSON file, returning None on any failure.

    Args:
        path: Filesystem path to a JSON file. May be None or empty.

    Returns:
        The parsed JSON value, or None when the path is falsy, the file
        is missing or unreadable, or its content is not valid JSON/UTF-8.
    """
    if not path or not os.path.exists(path):
        return None
    try:
        # Explicit encoding avoids platform-dependent defaults (e.g. cp1252
        # on Windows) mangling UTF-8 trace exports.
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except (json.JSONDecodeError, OSError, UnicodeDecodeError):
        # Swallow read/parse errors deliberately: callers treat None as
        # "nothing to load" rather than a fatal condition.
        return None
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def extract_input_from_trace(run):
    """Pull the user-facing input text out of a LangSmith run's inputs.

    Recognizes several LangChain serialization shapes:
    - the whole inputs payload as a plain string
    - direct string fields ("input", "question", "query", ...)
    - {"messages": [...]} message lists, optionally batched as [[...]]
    - a JSON-stringified fallback when nothing else matches
    """
    inputs = run.get("inputs", {})
    if not inputs:
        return None

    if isinstance(inputs, str):
        return inputs

    # Well-known scalar fields, checked in priority order.
    for key in ("input", "question", "query", "prompt", "text", "user_input"):
        if key in inputs:
            candidate = inputs[key]
            if isinstance(candidate, str):
                return candidate

    # LangChain messages format.
    messages = inputs.get("messages") or inputs.get("input")
    if isinstance(messages, list):
        if messages and isinstance(messages[0], list):
            # Batched form [[msg1, msg2]]: unwrap the first conversation.
            messages = messages[0]
        for msg in messages:
            if isinstance(msg, str):
                if msg:
                    return msg
                continue
            if not isinstance(msg, dict):
                continue
            # {"type": "human", "content": "..."} or {"role": "user", ...}
            is_human = (
                msg.get("type") in ("human", "HumanMessage")
                or msg.get("role") == "user"
            )
            if not is_human:
                continue
            content = msg.get("content", "")
            if isinstance(content, str) and content:
                return content
            if isinstance(content, list):
                # Multi-modal content: [{"type": "text", "text": "..."}]
                for part in content:
                    if isinstance(part, dict) and part.get("type") == "text":
                        return part.get("text", "")

    # Last resort: serialize everything, capped to keep tasks small.
    serialized = json.dumps(inputs)
    if len(serialized) > 20:  # only if there's meaningful content
        return serialized[:2000]

    return None
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
def extract_feedback(run):
    """Classify user feedback on a run as "negative", "positive", or None."""
    stats = run.get("feedback_stats") or run.get("feedback") or {}
    if not stats:
        return None

    # feedback_stats shape: {"thumbs_up": N, "thumbs_down": N}, with
    # "positive"/"negative" accepted as fallback aliases.
    if isinstance(stats, dict):
        ups = stats.get("thumbs_up", 0) or stats.get("positive", 0)
        downs = stats.get("thumbs_down", 0) or stats.get("negative", 0)
        # Any downvote marks the run negative, even alongside upvotes.
        if downs > 0:
            return "negative"
        if ups > 0:
            return "positive"
    return None
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
def infer_difficulty(text):
    """Heuristically bucket an input as "easy", "medium", or "hard".

    Short single-question inputs are easy; long, multi-question, or
    multi-sentence inputs are hard; everything else is medium.
    """
    if not text:
        return "medium"

    size = len(text)
    question_marks = text.count("?")
    sentence_count = len(re.split(r"[.!?]+", text))

    if size < 50 and question_marks <= 1:
        return "easy"
    if size > 500 or question_marks > 2 or sentence_count > 5:
        return "hard"
    return "medium"
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
def short_id(run_id):
    """Derive a stable 8-character hex ID from an arbitrary run ID.

    MD5 is used purely as a cheap deterministic hash, not for security.
    """
    digest = hashlib.md5(str(run_id).encode()).hexdigest()
    return digest[:8]
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
def main():
    """CLI entry point: convert LangSmith trace JSON into eval task files.

    Loads the trace dump, optionally sorts negative-feedback and errored
    runs to the front, deduplicates against previously imported tasks
    (via the langsmith_run_id recorded in each task's metadata), and
    writes one task JSON file per trace into --output-dir. Prints a
    JSON summary line followed by a human-readable report.
    """
    parser = argparse.ArgumentParser(description="Import LangSmith traces as eval tasks")
    parser.add_argument("--traces-json", required=True, help="Path to langsmith-cli JSON output")
    parser.add_argument("--output-dir", required=True, help="Directory to write task JSON files")
    parser.add_argument("--prefix", default="imported", help="Prefix for task IDs (default: imported)")
    parser.add_argument("--max-tasks", type=int, default=30, help="Max tasks to import (default: 30)")
    # BooleanOptionalAction keeps --prioritize-negative working and adds
    # --no-prioritize-negative; the previous `store_true` + default=True
    # combination made the flag impossible to turn off.
    parser.add_argument("--prioritize-negative", action=argparse.BooleanOptionalAction, default=True,
                        help="Import negative-feedback traces first (default: true)")
    args = parser.parse_args()

    traces = load_json(args.traces_json)
    if not traces:
        print("No traces found or invalid JSON — nothing to import")
        return

    if isinstance(traces, dict):
        # Might be wrapped in {"runs": [...]} or {"data": [...]}
        traces = traces.get("runs", traces.get("data", [traces]))

    if not isinstance(traces, list):
        print("Unexpected traces format — expected a JSON array")
        return

    # Sort: negative feedback first, then errors, then the rest.
    if args.prioritize_negative:
        def priority(run):
            if extract_feedback(run) == "negative":
                return 0
            if run.get("error"):
                return 1
            return 2
        traces.sort(key=priority)

    os.makedirs(args.output_dir, exist_ok=True)

    # Collect run IDs of previously imported tasks to avoid duplicates.
    existing_run_ids = set()
    for fname in os.listdir(args.output_dir):
        if fname.endswith(".json"):
            task = load_json(os.path.join(args.output_dir, fname))
            if task and task.get("metadata", {}).get("langsmith_run_id"):
                existing_run_ids.add(task["metadata"]["langsmith_run_id"])

    imported = 0
    skipped_no_input = 0
    skipped_duplicate = 0
    negative_count = 0

    for run in traces:
        if imported >= args.max_tasks:
            break

        run_id = str(run.get("id", ""))
        if run_id in existing_run_ids:
            skipped_duplicate += 1
            continue

        user_input = extract_input_from_trace(run)
        # Require a minimally meaningful input to make a useful task.
        if not user_input or len(user_input.strip()) < 5:
            skipped_no_input += 1
            continue

        feedback = extract_feedback(run)
        has_error = bool(run.get("error"))
        task_id = f"{args.prefix}_{short_id(run_id)}"

        task = {
            "id": task_id,
            "input": user_input.strip(),
            "metadata": {
                "difficulty": infer_difficulty(user_input),
                "category": run.get("name", "unknown"),
                "type": "production",
                "source": "imported",
                "langsmith_run_id": run_id,
                "had_error": has_error,
                "user_feedback": feedback,
            },
        }

        out_path = os.path.join(args.output_dir, f"{task_id}.json")
        # Explicit UTF-8 so trace text round-trips on every platform.
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(task, f, indent=2)

        imported += 1
        if feedback == "negative":
            negative_count += 1

    # Machine-readable summary first, then the human report.
    summary = {
        "imported": imported,
        "negative_feedback": negative_count,
        "skipped_no_input": skipped_no_input,
        "skipped_duplicate": skipped_duplicate,
        "total_traces": len(traces),
    }
    print(json.dumps(summary))
    print(f"Imported {imported} production traces as tasks ({negative_count} with negative feedback)")
    if skipped_duplicate:
        print(f" Skipped {skipped_duplicate} already-imported traces")
    if skipped_no_input:
        print(f" Skipped {skipped_no_input} traces with no extractable input")


if __name__ == "__main__":
    main()
|