harness-evolver 2.9.1 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/README.md +62 -117
  2. package/agents/evolver-architect.md +53 -0
  3. package/agents/evolver-critic.md +44 -0
  4. package/agents/evolver-proposer.md +128 -0
  5. package/agents/evolver-testgen.md +67 -0
  6. package/bin/install.js +181 -171
  7. package/package.json +7 -7
  8. package/skills/deploy/SKILL.md +49 -56
  9. package/skills/evolve/SKILL.md +156 -687
  10. package/skills/setup/SKILL.md +182 -0
  11. package/skills/status/SKILL.md +23 -21
  12. package/tools/read_results.py +240 -0
  13. package/tools/run_eval.py +202 -0
  14. package/tools/seed_from_traces.py +36 -8
  15. package/tools/setup.py +393 -0
  16. package/tools/trace_insights.py +86 -14
  17. package/agents/harness-evolver-architect.md +0 -173
  18. package/agents/harness-evolver-critic.md +0 -132
  19. package/agents/harness-evolver-judge.md +0 -110
  20. package/agents/harness-evolver-proposer.md +0 -317
  21. package/agents/harness-evolver-testgen.md +0 -112
  22. package/examples/classifier/README.md +0 -25
  23. package/examples/classifier/config.json +0 -3
  24. package/examples/classifier/eval.py +0 -58
  25. package/examples/classifier/harness.py +0 -111
  26. package/examples/classifier/tasks/task_001.json +0 -1
  27. package/examples/classifier/tasks/task_002.json +0 -1
  28. package/examples/classifier/tasks/task_003.json +0 -1
  29. package/examples/classifier/tasks/task_004.json +0 -1
  30. package/examples/classifier/tasks/task_005.json +0 -1
  31. package/examples/classifier/tasks/task_006.json +0 -1
  32. package/examples/classifier/tasks/task_007.json +0 -1
  33. package/examples/classifier/tasks/task_008.json +0 -1
  34. package/examples/classifier/tasks/task_009.json +0 -1
  35. package/examples/classifier/tasks/task_010.json +0 -1
  36. package/skills/architect/SKILL.md +0 -93
  37. package/skills/compare/SKILL.md +0 -73
  38. package/skills/critic/SKILL.md +0 -67
  39. package/skills/diagnose/SKILL.md +0 -96
  40. package/skills/import-traces/SKILL.md +0 -102
  41. package/skills/init/SKILL.md +0 -293
  42. package/tools/__pycache__/detect_stack.cpython-313.pyc +0 -0
  43. package/tools/__pycache__/init.cpython-313.pyc +0 -0
  44. package/tools/__pycache__/seed_from_traces.cpython-313.pyc +0 -0
  45. package/tools/__pycache__/trace_logger.cpython-313.pyc +0 -0
  46. package/tools/eval_llm_judge.py +0 -233
  47. package/tools/eval_passthrough.py +0 -55
  48. package/tools/evaluate.py +0 -255
  49. package/tools/import_traces.py +0 -229
  50. package/tools/init.py +0 -531
  51. package/tools/llm_api.py +0 -125
  52. package/tools/state.py +0 -219
  53. package/tools/test_growth.py +0 -230
  54. package/tools/trace_logger.py +0 -42
package/tools/evaluate.py DELETED
@@ -1,255 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Evaluation orchestrator for Harness Evolver.
3
-
4
- Commands:
5
- validate --harness PATH [--config PATH] [--timeout SECONDS]
6
- run --harness PATH --tasks-dir PATH --eval PATH --traces-dir PATH --scores PATH
7
- [--config PATH] [--timeout SECONDS]
8
-
9
- Runs harness per task, captures traces (stdout/stderr/timing), then calls user's eval script.
10
- Stdlib-only. No external dependencies.
11
- """
12
-
13
- import argparse
14
- import json
15
- import os
16
- import shutil
17
- import subprocess
18
- import sys
19
- import tempfile
20
- import time
21
-
22
-
23
- def _resolve_python():
24
- """Resolve the Python interpreter to use for subprocesses.
25
-
26
- Prefers the current interpreter (sys.executable) over a hardcoded 'python3'.
27
- This is critical in monorepo setups where the harness may need a specific
28
- venv Python (e.g. Python 3.12) while the system 'python3' is a different
29
- version (e.g. 3.14) with incompatible site-packages.
30
- """
31
- exe = sys.executable
32
- if exe and os.path.isfile(exe):
33
- return exe
34
- return "python3"
35
-
36
-
37
- def _run_harness_on_task(harness, config, task_input_path, output_path, task_traces_dir, timeout, env=None):
38
- """Run the harness on a single task. Returns (success, elapsed_ms, stdout, stderr)."""
39
- cmd = [_resolve_python(), harness, "--input", task_input_path, "--output", output_path]
40
- if task_traces_dir:
41
- extra_dir = os.path.join(task_traces_dir, "extra")
42
- os.makedirs(extra_dir, exist_ok=True)
43
- cmd.extend(["--traces-dir", extra_dir])
44
- if config and os.path.exists(config):
45
- cmd.extend(["--config", config])
46
-
47
- start = time.time()
48
- try:
49
- result = subprocess.run(
50
- cmd, capture_output=True, text=True, timeout=timeout, env=env,
51
- )
52
- elapsed_ms = (time.time() - start) * 1000
53
- # Accept exit code 0 (success) or check if output file exists for non-zero exits.
54
- # LLM agents with C extensions (numpy, httpx) often segfault (exit 139) during
55
- # Python shutdown AFTER writing correct output.
56
- success = result.returncode == 0
57
- if not success and os.path.exists(output_path):
58
- try:
59
- with open(output_path) as f:
60
- json.load(f)
61
- # Valid JSON output exists despite non-zero exit — treat as success
62
- success = True
63
- except (json.JSONDecodeError, OSError):
64
- pass
65
- return success, elapsed_ms, result.stdout, result.stderr
66
- except subprocess.TimeoutExpired:
67
- elapsed_ms = (time.time() - start) * 1000
68
- return False, elapsed_ms, "", f"TIMEOUT after {timeout}s"
69
- except Exception as e:
70
- elapsed_ms = (time.time() - start) * 1000
71
- return False, elapsed_ms, "", str(e)
72
-
73
-
74
- def cmd_validate(args):
75
- harness = args.harness
76
- config = getattr(args, "config", None)
77
- timeout = getattr(args, "timeout", 30) or 30
78
-
79
- if not os.path.exists(harness):
80
- print(f"FAIL: harness not found: {harness}", file=sys.stderr)
81
- sys.exit(1)
82
-
83
- with tempfile.TemporaryDirectory() as tmpdir:
84
- dummy_task = {"id": "validation", "input": "test input for validation", "metadata": {}}
85
- input_path = os.path.join(tmpdir, "input.json")
86
- output_path = os.path.join(tmpdir, "output.json")
87
- with open(input_path, "w") as f:
88
- json.dump(dummy_task, f)
89
-
90
- success, elapsed, stdout, stderr = _run_harness_on_task(
91
- harness, config, input_path, output_path, None, timeout=timeout,
92
- )
93
-
94
- if not success:
95
- hint = ""
96
- if "TIMEOUT" in stderr:
97
- hint = (f"\nHint: validation timed out after {timeout}s. "
98
- "For LLM-powered agents that make real API calls, "
99
- "use --timeout to increase the limit: "
100
- f"evaluate.py validate --harness {harness} --timeout 120")
101
- print(f"FAIL: harness exited with error.\nstderr: {stderr}{hint}", file=sys.stderr)
102
- sys.exit(1)
103
-
104
- if not os.path.exists(output_path):
105
- print("FAIL: harness did not create output file.", file=sys.stderr)
106
- sys.exit(1)
107
-
108
- try:
109
- with open(output_path) as f:
110
- output = json.load(f)
111
- except (json.JSONDecodeError, ValueError) as e:
112
- print(f"FAIL: output is not valid JSON: {e}", file=sys.stderr)
113
- sys.exit(1)
114
-
115
- if "id" not in output or "output" not in output:
116
- print(f"FAIL: output missing 'id' or 'output' fields. Got: {output}", file=sys.stderr)
117
- sys.exit(1)
118
-
119
- print(f"OK: harness validated in {elapsed:.0f}ms. Output: {output}")
120
-
121
-
122
- def cmd_run(args):
123
- harness = args.harness
124
- config = getattr(args, "config", None)
125
- tasks_dir = args.tasks_dir
126
- eval_script = getattr(args, "eval")
127
- traces_dir = args.traces_dir
128
- scores_path = args.scores
129
- timeout = args.timeout
130
-
131
- os.makedirs(traces_dir, exist_ok=True)
132
-
133
- task_files = sorted(f for f in os.listdir(tasks_dir) if f.endswith(".json"))
134
- if not task_files:
135
- print(f"FAIL: no .json task files in {tasks_dir}", file=sys.stderr)
136
- sys.exit(1)
137
-
138
- all_stdout = []
139
- all_stderr = []
140
- timing = {"per_task": {}}
141
- results_dir = tempfile.mkdtemp()
142
-
143
- # LangSmith: setup auto-tracing env vars if configured
144
- langsmith_env = None
145
- project_config_path = os.path.join(os.path.dirname(os.path.dirname(traces_dir)), "config.json")
146
- if os.path.exists(project_config_path):
147
- with open(project_config_path) as f:
148
- project_config = json.load(f)
149
- ls = project_config.get("eval", {}).get("langsmith", {})
150
- if ls.get("enabled"):
151
- api_key = os.environ.get(ls.get("api_key_env", "LANGSMITH_API_KEY"), "")
152
- if api_key:
153
- version = os.path.basename(os.path.dirname(traces_dir))
154
- ls_project = f"{ls.get('project_prefix', 'harness-evolver')}-{version}"
155
- langsmith_env = {
156
- **os.environ,
157
- "LANGCHAIN_TRACING_V2": "true",
158
- "LANGCHAIN_API_KEY": api_key,
159
- "LANGCHAIN_PROJECT": ls_project,
160
- }
161
- # Write the project name so the evolve skill knows where to find traces
162
- ls_project_file = os.path.join(os.path.dirname(os.path.dirname(traces_dir)), "langsmith_project.txt")
163
- with open(ls_project_file, "w") as f:
164
- f.write(ls_project)
165
-
166
- for task_file in task_files:
167
- task_path = os.path.join(tasks_dir, task_file)
168
- with open(task_path) as f:
169
- task = json.load(f)
170
- task_id = task["id"]
171
-
172
- task_input = {k: v for k, v in task.items() if k != "expected"}
173
-
174
- task_traces_dir = os.path.join(traces_dir, task_id)
175
- os.makedirs(task_traces_dir, exist_ok=True)
176
-
177
- input_path = os.path.join(task_traces_dir, "input.json")
178
- with open(input_path, "w") as f:
179
- json.dump(task_input, f, indent=2)
180
-
181
- output_path = os.path.join(results_dir, task_file)
182
-
183
- success, elapsed_ms, stdout, stderr = _run_harness_on_task(
184
- harness, config, input_path, output_path, task_traces_dir, timeout,
185
- env=langsmith_env,
186
- )
187
-
188
- if os.path.exists(output_path):
189
- shutil.copy2(output_path, os.path.join(task_traces_dir, "output.json"))
190
- else:
191
- with open(os.path.join(task_traces_dir, "output.json"), "w") as f:
192
- json.dump({"id": task_id, "output": "", "error": "harness failed"}, f)
193
-
194
- timing["per_task"][task_id] = round(elapsed_ms, 1)
195
- all_stdout.append(f"--- {task_id} ---\n{stdout}")
196
- all_stderr.append(f"--- {task_id} ---\n{stderr}")
197
-
198
- timing["total_ms"] = round(sum(timing["per_task"].values()), 1)
199
- with open(os.path.join(traces_dir, "timing.json"), "w") as f:
200
- json.dump(timing, f, indent=2)
201
- with open(os.path.join(traces_dir, "stdout.log"), "w") as f:
202
- f.write("\n".join(all_stdout))
203
- with open(os.path.join(traces_dir, "stderr.log"), "w") as f:
204
- f.write("\n".join(all_stderr))
205
-
206
- eval_cmd = [
207
- _resolve_python(), eval_script,
208
- "--results-dir", results_dir,
209
- "--tasks-dir", tasks_dir,
210
- "--scores", scores_path,
211
- ]
212
- result = subprocess.run(eval_cmd, capture_output=True, text=True, timeout=120)
213
- if result.returncode != 0:
214
- print(f"FAIL: eval script failed.\nstderr: {result.stderr}", file=sys.stderr)
215
- sys.exit(1)
216
-
217
- if os.path.exists(scores_path):
218
- scores = json.load(open(scores_path))
219
- print(f"Evaluation complete. combined_score: {scores.get('combined_score', 'N/A')}")
220
- else:
221
- print("WARNING: eval script did not produce scores file.", file=sys.stderr)
222
-
223
-
224
- def main():
225
- parser = argparse.ArgumentParser(description="Harness Evolver evaluation orchestrator")
226
- sub = parser.add_subparsers(dest="command")
227
-
228
- p_val = sub.add_parser("validate")
229
- p_val.add_argument("--harness", required=True)
230
- p_val.add_argument("--config", default=None)
231
- p_val.add_argument("--timeout", type=int, default=30,
232
- help="Validation timeout in seconds (default: 30). "
233
- "Increase for LLM-powered agents that make real API calls.")
234
-
235
- p_run = sub.add_parser("run")
236
- p_run.add_argument("--harness", required=True)
237
- p_run.add_argument("--config", default=None)
238
- p_run.add_argument("--tasks-dir", required=True)
239
- p_run.add_argument("--eval", required=True)
240
- p_run.add_argument("--traces-dir", required=True)
241
- p_run.add_argument("--scores", required=True)
242
- p_run.add_argument("--timeout", type=int, default=60)
243
-
244
- args = parser.parse_args()
245
- if args.command == "validate":
246
- cmd_validate(args)
247
- elif args.command == "run":
248
- cmd_run(args)
249
- else:
250
- parser.print_help()
251
- sys.exit(1)
252
-
253
-
254
- if __name__ == "__main__":
255
- main()
package/tools/import_traces.py DELETED
@@ -1,229 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Import LangSmith Traces as Eval Tasks for Harness Evolver.
3
-
4
- Transforms LangSmith trace JSON (from langsmith-cli) into task JSON files
5
- for the evaluation set. Prioritizes traces with negative feedback.
6
-
7
- Usage:
8
- python3 import_traces.py \
9
- --traces-json /tmp/langsmith_traces.json \
10
- --output-dir .harness-evolver/eval/tasks/ \
11
- --prefix imported \
12
- [--max-tasks 30]
13
-
14
- Stdlib-only. No external dependencies.
15
- """
16
-
17
- import argparse
18
- import hashlib
19
- import json
20
- import os
21
- import re
22
- import sys
23
-
24
-
25
- def load_json(path):
26
- """Load JSON file, return None if missing or invalid."""
27
- if not path or not os.path.exists(path):
28
- return None
29
- try:
30
- with open(path) as f:
31
- return json.load(f)
32
- except (json.JSONDecodeError, OSError):
33
- return None
34
-
35
-
36
- def extract_input_from_trace(run):
37
- """Extract the user input from a LangSmith run's inputs field.
38
-
39
- Handles multiple LangChain serialization formats:
40
- - Direct {"input": "..."} field
41
- - {"messages": [[HumanMessage, ...]]} format
42
- - {"question": "..."} or {"query": "..."} fields
43
- """
44
- inputs = run.get("inputs", {})
45
- if not inputs:
46
- return None
47
-
48
- if isinstance(inputs, str):
49
- return inputs
50
-
51
- # Direct input field
52
- for key in ("input", "question", "query", "prompt", "text", "user_input"):
53
- if key in inputs and isinstance(inputs[key], str):
54
- return inputs[key]
55
-
56
- # LangChain messages format
57
- messages = inputs.get("messages") or inputs.get("input")
58
- if isinstance(messages, list):
59
- # Might be [[msg1, msg2]] (batched) or [msg1, msg2]
60
- if messages and isinstance(messages[0], list):
61
- messages = messages[0]
62
- for msg in messages:
63
- if isinstance(msg, dict):
64
- # {"type": "human", "content": "..."}
65
- if msg.get("type") in ("human", "HumanMessage") or msg.get("role") == "user":
66
- content = msg.get("content", "")
67
- if isinstance(content, str) and content:
68
- return content
69
- if isinstance(content, list):
70
- # Multi-modal: [{"type": "text", "text": "..."}]
71
- for part in content:
72
- if isinstance(part, dict) and part.get("type") == "text":
73
- return part.get("text", "")
74
- elif isinstance(msg, str) and msg:
75
- return msg
76
-
77
- # Fallback: stringify the whole inputs
78
- flat = json.dumps(inputs)
79
- if len(flat) > 20: # Only if there's meaningful content
80
- return flat[:2000]
81
-
82
- return None
83
-
84
-
85
- def extract_feedback(run):
86
- """Extract user feedback from a LangSmith run."""
87
- feedback = run.get("feedback_stats") or run.get("feedback") or {}
88
- if not feedback:
89
- return None
90
-
91
- # feedback_stats format: {"thumbs_up": N, "thumbs_down": N}
92
- if isinstance(feedback, dict):
93
- up = feedback.get("thumbs_up", 0) or feedback.get("positive", 0)
94
- down = feedback.get("thumbs_down", 0) or feedback.get("negative", 0)
95
- if down > 0:
96
- return "negative"
97
- if up > 0:
98
- return "positive"
99
- return None
100
-
101
-
102
- def infer_difficulty(text):
103
- """Infer difficulty from input characteristics."""
104
- if not text:
105
- return "medium"
106
- length = len(text)
107
- # Count question marks, clauses, etc.
108
- questions = text.count("?")
109
- sentences = len(re.split(r"[.!?]+", text))
110
-
111
- if length < 50 and questions <= 1:
112
- return "easy"
113
- if length > 500 or questions > 2 or sentences > 5:
114
- return "hard"
115
- return "medium"
116
-
117
-
118
- def short_id(run_id):
119
- """Create a short deterministic ID from a full run ID."""
120
- return hashlib.md5(str(run_id).encode()).hexdigest()[:8]
121
-
122
-
123
- def main():
124
- parser = argparse.ArgumentParser(description="Import LangSmith traces as eval tasks")
125
- parser.add_argument("--traces-json", required=True, help="Path to langsmith-cli JSON output")
126
- parser.add_argument("--output-dir", required=True, help="Directory to write task JSON files")
127
- parser.add_argument("--prefix", default="imported", help="Prefix for task IDs (default: imported)")
128
- parser.add_argument("--max-tasks", type=int, default=30, help="Max tasks to import (default: 30)")
129
- parser.add_argument("--prioritize-negative", action="store_true", default=True,
130
- help="Import negative-feedback traces first (default: true)")
131
- args = parser.parse_args()
132
-
133
- traces = load_json(args.traces_json)
134
- if not traces:
135
- print("No traces found or invalid JSON — nothing to import")
136
- return
137
-
138
- if isinstance(traces, dict):
139
- # Might be wrapped in {"runs": [...]}
140
- traces = traces.get("runs", traces.get("data", [traces]))
141
-
142
- if not isinstance(traces, list):
143
- print("Unexpected traces format — expected a JSON array")
144
- return
145
-
146
- # Sort: negative feedback first, then errors, then the rest
147
- if args.prioritize_negative:
148
- def priority(run):
149
- fb = extract_feedback(run)
150
- has_error = bool(run.get("error"))
151
- if fb == "negative":
152
- return 0
153
- if has_error:
154
- return 1
155
- return 2
156
- traces.sort(key=priority)
157
-
158
- os.makedirs(args.output_dir, exist_ok=True)
159
-
160
- # Check for existing imported tasks to avoid duplicates
161
- existing_run_ids = set()
162
- for fname in os.listdir(args.output_dir):
163
- if fname.endswith(".json"):
164
- task = load_json(os.path.join(args.output_dir, fname))
165
- if task and task.get("metadata", {}).get("langsmith_run_id"):
166
- existing_run_ids.add(task["metadata"]["langsmith_run_id"])
167
-
168
- imported = 0
169
- skipped_no_input = 0
170
- skipped_duplicate = 0
171
- negative_count = 0
172
-
173
- for run in traces:
174
- if imported >= args.max_tasks:
175
- break
176
-
177
- run_id = str(run.get("id", ""))
178
- if run_id in existing_run_ids:
179
- skipped_duplicate += 1
180
- continue
181
-
182
- user_input = extract_input_from_trace(run)
183
- if not user_input or len(user_input.strip()) < 5:
184
- skipped_no_input += 1
185
- continue
186
-
187
- feedback = extract_feedback(run)
188
- has_error = bool(run.get("error"))
189
- task_id = f"{args.prefix}_{short_id(run_id)}"
190
-
191
- task = {
192
- "id": task_id,
193
- "input": user_input.strip(),
194
- "metadata": {
195
- "difficulty": infer_difficulty(user_input),
196
- "category": run.get("name", "unknown"),
197
- "type": "production",
198
- "source": "imported",
199
- "langsmith_run_id": run_id,
200
- "had_error": has_error,
201
- "user_feedback": feedback,
202
- },
203
- }
204
-
205
- out_path = os.path.join(args.output_dir, f"{task_id}.json")
206
- with open(out_path, "w") as f:
207
- json.dump(task, f, indent=2)
208
-
209
- imported += 1
210
- if feedback == "negative":
211
- negative_count += 1
212
-
213
- summary = {
214
- "imported": imported,
215
- "negative_feedback": negative_count,
216
- "skipped_no_input": skipped_no_input,
217
- "skipped_duplicate": skipped_duplicate,
218
- "total_traces": len(traces),
219
- }
220
- print(json.dumps(summary))
221
- print(f"Imported {imported} production traces as tasks ({negative_count} with negative feedback)")
222
- if skipped_duplicate:
223
- print(f" Skipped {skipped_duplicate} already-imported traces")
224
- if skipped_no_input:
225
- print(f" Skipped {skipped_no_input} traces with no extractable input")
226
-
227
-
228
- if __name__ == "__main__":
229
- main()