harness-evolver 2.9.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/README.md +62 -117
  2. package/agents/evolver-architect.md +53 -0
  3. package/agents/evolver-critic.md +44 -0
  4. package/agents/evolver-proposer.md +128 -0
  5. package/agents/evolver-testgen.md +67 -0
  6. package/bin/install.js +181 -171
  7. package/package.json +7 -7
  8. package/skills/deploy/SKILL.md +49 -56
  9. package/skills/evolve/SKILL.md +180 -700
  10. package/skills/setup/SKILL.md +182 -0
  11. package/skills/status/SKILL.md +23 -21
  12. package/tools/read_results.py +240 -0
  13. package/tools/run_eval.py +202 -0
  14. package/tools/seed_from_traces.py +36 -8
  15. package/tools/setup.py +393 -0
  16. package/tools/trace_insights.py +86 -14
  17. package/agents/harness-evolver-architect.md +0 -173
  18. package/agents/harness-evolver-critic.md +0 -132
  19. package/agents/harness-evolver-judge.md +0 -110
  20. package/agents/harness-evolver-proposer.md +0 -317
  21. package/agents/harness-evolver-testgen.md +0 -112
  22. package/examples/classifier/README.md +0 -25
  23. package/examples/classifier/config.json +0 -3
  24. package/examples/classifier/eval.py +0 -58
  25. package/examples/classifier/harness.py +0 -111
  26. package/examples/classifier/tasks/task_001.json +0 -1
  27. package/examples/classifier/tasks/task_002.json +0 -1
  28. package/examples/classifier/tasks/task_003.json +0 -1
  29. package/examples/classifier/tasks/task_004.json +0 -1
  30. package/examples/classifier/tasks/task_005.json +0 -1
  31. package/examples/classifier/tasks/task_006.json +0 -1
  32. package/examples/classifier/tasks/task_007.json +0 -1
  33. package/examples/classifier/tasks/task_008.json +0 -1
  34. package/examples/classifier/tasks/task_009.json +0 -1
  35. package/examples/classifier/tasks/task_010.json +0 -1
  36. package/skills/architect/SKILL.md +0 -93
  37. package/skills/compare/SKILL.md +0 -73
  38. package/skills/critic/SKILL.md +0 -67
  39. package/skills/diagnose/SKILL.md +0 -96
  40. package/skills/import-traces/SKILL.md +0 -102
  41. package/skills/init/SKILL.md +0 -253
  42. package/tools/__pycache__/detect_stack.cpython-313.pyc +0 -0
  43. package/tools/__pycache__/init.cpython-313.pyc +0 -0
  44. package/tools/__pycache__/seed_from_traces.cpython-313.pyc +0 -0
  45. package/tools/__pycache__/trace_logger.cpython-313.pyc +0 -0
  46. package/tools/eval_llm_judge.py +0 -233
  47. package/tools/eval_passthrough.py +0 -55
  48. package/tools/evaluate.py +0 -255
  49. package/tools/import_traces.py +0 -229
  50. package/tools/init.py +0 -531
  51. package/tools/llm_api.py +0 -125
  52. package/tools/state.py +0 -219
  53. package/tools/test_growth.py +0 -230
  54. package/tools/trace_logger.py +0 -42
@@ -1,229 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Import LangSmith Traces as Eval Tasks for Harness Evolver.
3
-
4
- Transforms LangSmith trace JSON (from langsmith-cli) into task JSON files
5
- for the evaluation set. Prioritizes traces with negative feedback.
6
-
7
- Usage:
8
- python3 import_traces.py \
9
- --traces-json /tmp/langsmith_traces.json \
10
- --output-dir .harness-evolver/eval/tasks/ \
11
- --prefix imported \
12
- [--max-tasks 30]
13
-
14
- Stdlib-only. No external dependencies.
15
- """
16
-
17
- import argparse
18
- import hashlib
19
- import json
20
- import os
21
- import re
22
- import sys
23
-
24
-
25
def load_json(path):
    """Parse the JSON file at *path*.

    Returns the decoded object, or None when the path is empty, the file
    does not exist, or the contents cannot be read or parsed as JSON.
    """
    if not (path and os.path.exists(path)):
        return None
    try:
        with open(path) as handle:
            return json.load(handle)
    except (OSError, json.JSONDecodeError):
        return None
34
-
35
-
36
def extract_input_from_trace(run):
    """Extract the user input text from a LangSmith run's ``inputs`` field.

    Handles multiple LangChain serialization formats:
    - a plain string ``inputs``
    - direct fields: ``{"input": "..."}``, ``{"question": "..."}``, etc.
    - message lists: ``{"messages": [[HumanMessage, ...]]}`` (batched or flat)
    - ``inputs`` itself being a bare list of messages

    Returns the extracted text, a JSON stringification of ``inputs``
    (truncated to 2000 chars) as a fallback, or None when nothing usable
    is present.
    """
    inputs = run.get("inputs", {})
    if not inputs:
        return None

    if isinstance(inputs, str):
        return inputs

    messages = None
    if isinstance(inputs, dict):
        # Direct scalar fields take precedence.
        for key in ("input", "question", "query", "prompt", "text", "user_input"):
            if key in inputs and isinstance(inputs[key], str):
                return inputs[key]
        messages = inputs.get("messages") or inputs.get("input")
    elif isinstance(inputs, list):
        # BUGFIX: some exporters put the message list directly in "inputs";
        # the previous code called .get() on it and raised AttributeError.
        messages = inputs

    # LangChain messages format.
    if isinstance(messages, list):
        # Might be [[msg1, msg2]] (batched) or [msg1, msg2].
        if messages and isinstance(messages[0], list):
            messages = messages[0]
        for msg in messages:
            if isinstance(msg, dict):
                # {"type": "human", "content": "..."} or {"role": "user", ...}
                if msg.get("type") in ("human", "HumanMessage") or msg.get("role") == "user":
                    content = msg.get("content", "")
                    if isinstance(content, str) and content:
                        return content
                    if isinstance(content, list):
                        # Multi-modal: [{"type": "text", "text": "..."}]
                        for part in content:
                            if isinstance(part, dict) and part.get("type") == "text":
                                return part.get("text", "")
            elif isinstance(msg, str) and msg:
                return msg

    # Fallback: stringify the whole inputs structure.
    flat = json.dumps(inputs)
    if len(flat) > 20:  # Only if there's meaningful content
        return flat[:2000]

    return None
83
-
84
-
85
def extract_feedback(run):
    """Classify a LangSmith run's user feedback.

    Returns "negative" if any thumbs-down/negative votes are present,
    "positive" if any thumbs-up/positive votes are present, otherwise None.
    Negative feedback wins when both exist.
    """
    stats = run.get("feedback_stats") or run.get("feedback") or {}
    if not stats:
        return None
    if not isinstance(stats, dict):
        return None

    # feedback_stats format: {"thumbs_up": N, "thumbs_down": N}
    positive = stats.get("thumbs_up", 0) or stats.get("positive", 0)
    negative = stats.get("thumbs_down", 0) or stats.get("negative", 0)
    if negative > 0:
        return "negative"
    if positive > 0:
        return "positive"
    return None
100
-
101
-
102
def infer_difficulty(text):
    """Heuristically grade input text as "easy", "medium", or "hard".

    Short, at-most-one-question inputs are easy; long, multi-question,
    or multi-sentence inputs are hard; everything else (including empty
    input) is medium.
    """
    if not text:
        return "medium"

    size = len(text)
    question_marks = text.count("?")
    sentence_count = len(re.split(r"[.!?]+", text))

    # Easy check runs first so a short single question never lands in "hard".
    if size < 50 and question_marks <= 1:
        return "easy"
    if size > 500 or question_marks > 2 or sentence_count > 5:
        return "hard"
    return "medium"
116
-
117
-
118
def short_id(run_id):
    """Derive a stable 8-char hex ID from *run_id*.

    Uses an MD5 prefix purely for deterministic, non-cryptographic
    identification of runs.
    """
    digest = hashlib.md5(str(run_id).encode())
    return digest.hexdigest()[:8]
121
-
122
-
123
def main():
    """CLI entry point: convert a LangSmith trace dump into eval task files.

    Loads the trace JSON, optionally sorts negative-feedback and errored
    runs to the front, then writes one task JSON file per usable trace
    into the output directory — skipping already-imported runs and traces
    with no extractable input. Prints a JSON summary plus human-readable
    counts when done.
    """
    parser = argparse.ArgumentParser(description="Import LangSmith traces as eval tasks")
    parser.add_argument("--traces-json", required=True, help="Path to langsmith-cli JSON output")
    parser.add_argument("--output-dir", required=True, help="Directory to write task JSON files")
    parser.add_argument("--prefix", default="imported", help="Prefix for task IDs (default: imported)")
    parser.add_argument("--max-tasks", type=int, default=30, help="Max tasks to import (default: 30)")
    # BUGFIX: the flag was store_true with default=True, so prioritization
    # could never be disabled. Keep the original flag (now a harmless no-op
    # confirmation) and add an explicit opt-out sharing the same dest.
    parser.add_argument("--prioritize-negative", dest="prioritize_negative",
                        action="store_true", default=True,
                        help="Import negative-feedback traces first (default: true)")
    parser.add_argument("--no-prioritize-negative", dest="prioritize_negative",
                        action="store_false",
                        help="Keep traces in their original order")
    args = parser.parse_args()

    traces = load_json(args.traces_json)
    if not traces:
        print("No traces found or invalid JSON — nothing to import")
        return

    if isinstance(traces, dict):
        # Might be wrapped in {"runs": [...]} or {"data": [...]}.
        traces = traces.get("runs", traces.get("data", [traces]))

    if not isinstance(traces, list):
        print("Unexpected traces format — expected a JSON array")
        return

    if args.prioritize_negative:
        # Sort order: negative feedback first, then errored runs, then the rest.
        def priority(run):
            if extract_feedback(run) == "negative":
                return 0
            if run.get("error"):
                return 1
            return 2

        traces.sort(key=priority)

    os.makedirs(args.output_dir, exist_ok=True)

    # Collect run IDs of previously imported tasks to avoid duplicates.
    existing_run_ids = set()
    for fname in os.listdir(args.output_dir):
        if fname.endswith(".json"):
            task = load_json(os.path.join(args.output_dir, fname))
            if task and task.get("metadata", {}).get("langsmith_run_id"):
                existing_run_ids.add(task["metadata"]["langsmith_run_id"])

    imported = 0
    skipped_no_input = 0
    skipped_duplicate = 0
    negative_count = 0

    for run in traces:
        if imported >= args.max_tasks:
            break

        run_id = str(run.get("id", ""))
        if run_id in existing_run_ids:
            skipped_duplicate += 1
            continue

        user_input = extract_input_from_trace(run)
        # Require at least a few characters of real content.
        if not user_input or len(user_input.strip()) < 5:
            skipped_no_input += 1
            continue

        feedback = extract_feedback(run)
        has_error = bool(run.get("error"))
        task_id = f"{args.prefix}_{short_id(run_id)}"

        task = {
            "id": task_id,
            "input": user_input.strip(),
            "metadata": {
                "difficulty": infer_difficulty(user_input),
                "category": run.get("name", "unknown"),
                "type": "production",
                "source": "imported",
                "langsmith_run_id": run_id,
                "had_error": has_error,
                "user_feedback": feedback,
            },
        }

        out_path = os.path.join(args.output_dir, f"{task_id}.json")
        with open(out_path, "w") as f:
            json.dump(task, f, indent=2)

        imported += 1
        if feedback == "negative":
            negative_count += 1

    summary = {
        "imported": imported,
        "negative_feedback": negative_count,
        "skipped_no_input": skipped_no_input,
        "skipped_duplicate": skipped_duplicate,
        "total_traces": len(traces),
    }
    print(json.dumps(summary))
    print(f"Imported {imported} production traces as tasks ({negative_count} with negative feedback)")
    if skipped_duplicate:
        print(f" Skipped {skipped_duplicate} already-imported traces")
    if skipped_no_input:
        print(f" Skipped {skipped_no_input} traces with no extractable input")
226
-
227
-
228
- if __name__ == "__main__":
229
- main()