harness-evolver 2.9.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/README.md +62 -117
  2. package/agents/evolver-architect.md +53 -0
  3. package/agents/evolver-critic.md +44 -0
  4. package/agents/evolver-proposer.md +128 -0
  5. package/agents/evolver-testgen.md +67 -0
  6. package/bin/install.js +181 -171
  7. package/package.json +7 -7
  8. package/skills/deploy/SKILL.md +49 -56
  9. package/skills/evolve/SKILL.md +180 -700
  10. package/skills/setup/SKILL.md +182 -0
  11. package/skills/status/SKILL.md +23 -21
  12. package/tools/read_results.py +240 -0
  13. package/tools/run_eval.py +202 -0
  14. package/tools/seed_from_traces.py +36 -8
  15. package/tools/setup.py +393 -0
  16. package/tools/trace_insights.py +86 -14
  17. package/agents/harness-evolver-architect.md +0 -173
  18. package/agents/harness-evolver-critic.md +0 -132
  19. package/agents/harness-evolver-judge.md +0 -110
  20. package/agents/harness-evolver-proposer.md +0 -317
  21. package/agents/harness-evolver-testgen.md +0 -112
  22. package/examples/classifier/README.md +0 -25
  23. package/examples/classifier/config.json +0 -3
  24. package/examples/classifier/eval.py +0 -58
  25. package/examples/classifier/harness.py +0 -111
  26. package/examples/classifier/tasks/task_001.json +0 -1
  27. package/examples/classifier/tasks/task_002.json +0 -1
  28. package/examples/classifier/tasks/task_003.json +0 -1
  29. package/examples/classifier/tasks/task_004.json +0 -1
  30. package/examples/classifier/tasks/task_005.json +0 -1
  31. package/examples/classifier/tasks/task_006.json +0 -1
  32. package/examples/classifier/tasks/task_007.json +0 -1
  33. package/examples/classifier/tasks/task_008.json +0 -1
  34. package/examples/classifier/tasks/task_009.json +0 -1
  35. package/examples/classifier/tasks/task_010.json +0 -1
  36. package/skills/architect/SKILL.md +0 -93
  37. package/skills/compare/SKILL.md +0 -73
  38. package/skills/critic/SKILL.md +0 -67
  39. package/skills/diagnose/SKILL.md +0 -96
  40. package/skills/import-traces/SKILL.md +0 -102
  41. package/skills/init/SKILL.md +0 -253
  42. package/tools/__pycache__/detect_stack.cpython-313.pyc +0 -0
  43. package/tools/__pycache__/init.cpython-313.pyc +0 -0
  44. package/tools/__pycache__/seed_from_traces.cpython-313.pyc +0 -0
  45. package/tools/__pycache__/trace_logger.cpython-313.pyc +0 -0
  46. package/tools/eval_llm_judge.py +0 -233
  47. package/tools/eval_passthrough.py +0 -55
  48. package/tools/evaluate.py +0 -255
  49. package/tools/import_traces.py +0 -229
  50. package/tools/init.py +0 -531
  51. package/tools/llm_api.py +0 -125
  52. package/tools/state.py +0 -219
  53. package/tools/test_growth.py +0 -230
  54. package/tools/trace_logger.py +0 -42
@@ -1,233 +0,0 @@
1
- #!/usr/bin/env python3
2
- """LLM-as-judge evaluation script for Harness Evolver.
3
-
4
- Scores harness outputs using an LLM judge across multiple quality dimensions:
5
- accuracy, completeness, relevance, no_hallucination.
6
-
7
- CLI interface matches existing evals: --results-dir, --tasks-dir, --scores.
8
- Stdlib-only. No external dependencies.
9
- """
10
-
11
- import argparse
12
- import json
13
- import os
14
- import re
15
- import sys
16
-
17
- sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
18
- from llm_api import detect_provider, call_llm
19
-
20
# Quality dimensions the LLM judge scores, each on a 1-5 scale.
DIMENSIONS = ["accuracy", "completeness", "relevance", "no_hallucination"]

# Relative weight of each dimension in the combined score (weights sum to 1.0).
# Accuracy dominates because factual correctness matters most.
WEIGHTS = {
    "accuracy": 0.4,
    "completeness": 0.2,
    "relevance": 0.2,
    "no_hallucination": 0.2,
}
28
-
29
-
30
def build_judge_prompt(task, result):
    """Assemble the judge prompt for one task/output pair.

    Includes the task input, the harness output, an optional reference
    answer (when the task carries an "expected" field), and the scoring
    rubric with the required JSON response shape.
    """
    lines = []
    lines.append("You are an expert evaluator. Assess the quality of the following output.")
    lines.append("")
    lines.append("QUESTION/INPUT:")
    lines.append(str(task.get("input", "")))
    lines.append("")
    lines.append("OUTPUT TO EVALUATE:")
    lines.append(str(result.get("output", "")))

    # Only show a reference answer when the task provides one.
    if "expected" in task:
        lines.append("")
        lines.append("REFERENCE ANSWER:")
        lines.append(str(task["expected"]))

    lines += [
        "",
        "Score each dimension from 1 (worst) to 5 (best):",
        "- accuracy: Is the output factually correct and properly addresses the input?",
        "- completeness: Does it cover all relevant aspects?",
        "- relevance: Is it focused and on-topic?",
        "- no_hallucination: Does it avoid fabricating information not supported by context?",
        "",
        "Think step by step, then respond with ONLY this JSON:",
        '{"reasoning": "your analysis", "accuracy": N, "completeness": N, "relevance": N, "no_hallucination": N}',
    ]
    return "\n".join(lines)
62
-
63
-
64
def extract_json_scores(response):
    """Extract the scoring JSON from an LLM judge response.

    Tries, in order: parsing the whole response, a ```json fenced block,
    then a regex for a flat object containing an "accuracy" key.

    Returns the parsed dict, or None when no usable scores object is found.
    """

    def _as_scores(text):
        # Parse one candidate and validate its shape. A judge may reply
        # with a bare scalar or a list, so require a dict before probing
        # for "accuracy" (the original `"accuracy" in data` raised
        # TypeError on non-container JSON like `5`).
        try:
            data = json.loads(text)
        except (json.JSONDecodeError, ValueError):
            return None
        if isinstance(data, dict) and "accuracy" in data:
            return data
        return None

    # 1. The whole response is the JSON object.
    data = _as_scores(response.strip())
    if data is not None:
        return data

    # 2. JSON inside a markdown code fence.
    fence_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response, re.DOTALL)
    if fence_match:
        data = _as_scores(fence_match.group(1))
        if data is not None:
            return data

    # 3. A bare flat JSON object with an "accuracy" key anywhere in the text.
    json_match = re.search(r'\{[^{}]*"accuracy"\s*:\s*\d[^{}]*\}', response)
    if json_match:
        data = _as_scores(json_match.group(0))
        if data is not None:
            return data

    return None
95
-
96
-
97
def normalize_score(raw_score):
    """Normalize a 1-5 judge score to the 0.0-1.0 range.

    Out-of-range values are clamped into [1, 5]. Judge output is not fully
    trusted: numeric strings (e.g. "4") are coerced via float, and anything
    that cannot be interpreted as a number falls back to the minimum score
    of 1 instead of raising and aborting the whole evaluation run (the
    original `int(raw_score)` crashed on values like "4.5" or None).
    """
    try:
        value = int(float(raw_score))
    except (TypeError, ValueError):
        value = 1
    clamped = max(1, min(5, value))
    return (clamped - 1) / 4.0
101
-
102
-
103
def compute_combined_score(scores_dict):
    """Return the weighted sum of the normalized dimension scores.

    Dimensions missing from scores_dict contribute 0.0.
    """
    return sum(scores_dict.get(dim, 0.0) * WEIGHTS[dim] for dim in DIMENSIONS)
109
-
110
-
111
def evaluate_task(provider, api_key, model, task, result):
    """Score a single task's output with the LLM judge.

    Returns a per-task dict holding the weighted combined "score" (0.0-1.0),
    the raw 1-5 dimension scores, and the judge's reasoning. LLM-call or
    parse failures degrade to a zero score with an "error" field rather
    than aborting the run.
    """

    def _failure(reason, err):
        # Floor every dimension at 1 so downstream averaging stays well-defined.
        return {
            "score": 0.0,
            "accuracy": 1, "completeness": 1, "relevance": 1, "no_hallucination": 1,
            "reasoning": reason,
            "error": err,
        }

    prompt = build_judge_prompt(task, result)

    try:
        response = call_llm(provider, api_key, model, prompt, max_tokens=2048)
    except Exception as e:
        return _failure(f"LLM call failed: {e}", str(e))

    parsed = extract_json_scores(response)
    if parsed is None:
        return _failure(f"Failed to parse judge response: {response[:200]}", "parse_failed")

    # Raw 1-5 scores (missing dimensions default to the floor of 1),
    # normalized to 0-1 for the weighted combination.
    raw = {dim: parsed.get(dim, 1) for dim in DIMENSIONS}
    normalized = {dim: normalize_score(raw[dim]) for dim in DIMENSIONS}
    combined = compute_combined_score(normalized)

    return {
        "score": round(combined, 4),
        "accuracy": raw["accuracy"],
        "completeness": raw["completeness"],
        "relevance": raw["relevance"],
        "no_hallucination": raw["no_hallucination"],
        "reasoning": parsed.get("reasoning", ""),
    }
151
-
152
-
153
def main():
    """CLI entry point: judge every task's output and write a scores JSON.

    Expects one result file per task file (matched by filename) under
    --results-dir; a missing result is judged as an empty output.
    """
    parser = argparse.ArgumentParser(description="LLM-as-judge evaluation")
    parser.add_argument("--results-dir", required=True,
                        help="Directory with harness output JSON files")
    parser.add_argument("--tasks-dir", required=True,
                        help="Directory with task JSON files")
    parser.add_argument("--scores", required=True,
                        help="Output path for scores JSON")
    args = parser.parse_args()

    # Pick which LLM backend, key, and model to judge with.
    provider, api_key, model = detect_provider()

    task_files = sorted(f for f in os.listdir(args.tasks_dir) if f.endswith(".json"))
    if not task_files:
        print(f"FAIL: no .json task files in {args.tasks_dir}", file=sys.stderr)
        sys.exit(1)

    per_task = {}
    dimension_totals = dict.fromkeys(DIMENSIONS, 0.0)
    total_combined = 0.0
    total_tasks = 0

    for task_file in task_files:
        with open(os.path.join(args.tasks_dir, task_file)) as f:
            task = json.load(f)
        task_id = task["id"]

        # The harness result shares the task's filename.
        result_path = os.path.join(args.results_dir, task_file)
        if os.path.exists(result_path):
            with open(result_path) as f:
                result = json.load(f)
        else:
            result = {"id": task_id, "output": "", "error": "no output file"}

        task_scores = evaluate_task(provider, api_key, model, task, result)
        per_task[task_id] = task_scores

        # Accumulate combined and per-dimension totals for averaging.
        total_combined += task_scores["score"]
        for dim in DIMENSIONS:
            dimension_totals[dim] += normalize_score(task_scores[dim])
        total_tasks += 1

    if total_tasks > 0:
        combined_score = round(total_combined / total_tasks, 4)
        avg_dimensions = {
            dim: round(dimension_totals[dim] / total_tasks, 4) for dim in DIMENSIONS
        }
    else:
        combined_score = 0.0
        avg_dimensions = dict.fromkeys(DIMENSIONS, 0.0)

    scores = {
        "combined_score": combined_score,
        "eval_type": "llm-judge",
        "judge_provider": provider,
        "judge_model": model,
        "dimensions": avg_dimensions,
        "weights": WEIGHTS,
        "total_tasks": total_tasks,
        "per_task": per_task,
    }

    os.makedirs(os.path.dirname(os.path.abspath(args.scores)), exist_ok=True)
    with open(args.scores, "w") as f:
        json.dump(scores, f, indent=2)

    print(f"LLM judge evaluation complete. combined_score: {combined_score} "
          f"({total_tasks} tasks, provider: {provider}/{model})")


if __name__ == "__main__":
    main()
@@ -1,55 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Passthrough eval — collects outputs for judge subagent scoring.
3
-
4
- When no custom eval.py exists, this is used as the default. It does NOT score
5
- outputs — it collects them and marks them for the judge subagent to evaluate.
6
- The evolve skill detects eval_type=pending-judge and spawns the judge agent.
7
- """
8
-
9
- import argparse
10
- import json
11
- import os
12
-
13
-
14
def main():
    """Collect per-task outputs and emit a pending-judge scores file.

    All scores are the sentinel -1 (unscored); the evolve skill detects
    eval_type=pending-judge and hands the collected outputs to the judge
    subagent for actual scoring.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--results-dir", required=True)
    parser.add_argument("--tasks-dir", required=True)
    parser.add_argument("--scores", required=True)
    args = parser.parse_args()

    per_task = {}
    for fname in sorted(os.listdir(args.tasks_dir)):
        if not fname.endswith(".json"):
            continue
        with open(os.path.join(args.tasks_dir, fname)) as f:
            task = json.load(f)

        # Result files share the task filename; a missing result is an
        # empty output.
        output = ""
        result_path = os.path.join(args.results_dir, fname)
        if os.path.exists(result_path):
            with open(result_path) as f:
                output = str(json.load(f).get("output", ""))

        # Truncate input/output to keep the scores file compact for the judge.
        per_task[task["id"]] = {
            "score": -1,
            "input": str(task.get("input", ""))[:500],
            "output": output[:500],
        }

    scores = {
        "combined_score": -1,
        "eval_type": "pending-judge",
        "total_tasks": len(per_task),
        "per_task": per_task,
    }
    with open(args.scores, "w") as f:
        json.dump(scores, f, indent=2)

    print(f"Collected {len(per_task)} task outputs for judge scoring.")


if __name__ == "__main__":
    main()
package/tools/evaluate.py DELETED
@@ -1,255 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Evaluation orchestrator for Harness Evolver.
3
-
4
- Commands:
5
- validate --harness PATH [--config PATH] [--timeout SECONDS]
6
- run --harness PATH --tasks-dir PATH --eval PATH --traces-dir PATH --scores PATH
7
- [--config PATH] [--timeout SECONDS]
8
-
9
- Runs harness per task, captures traces (stdout/stderr/timing), then calls user's eval script.
10
- Stdlib-only. No external dependencies.
11
- """
12
-
13
- import argparse
14
- import json
15
- import os
16
- import shutil
17
- import subprocess
18
- import sys
19
- import tempfile
20
- import time
21
-
22
-
23
- def _resolve_python():
24
- """Resolve the Python interpreter to use for subprocesses.
25
-
26
- Prefers the current interpreter (sys.executable) over a hardcoded 'python3'.
27
- This is critical in monorepo setups where the harness may need a specific
28
- venv Python (e.g. Python 3.12) while the system 'python3' is a different
29
- version (e.g. 3.14) with incompatible site-packages.
30
- """
31
- exe = sys.executable
32
- if exe and os.path.isfile(exe):
33
- return exe
34
- return "python3"
35
-
36
-
37
def _run_harness_on_task(harness, config, task_input_path, output_path, task_traces_dir, timeout, env=None):
    """Run the harness once for a single task.

    Returns (success, elapsed_ms, stdout, stderr). Timeouts and launch
    errors are reported as failed results rather than raised.
    """
    cmd = [_resolve_python(), harness, "--input", task_input_path, "--output", output_path]
    if task_traces_dir:
        extra_dir = os.path.join(task_traces_dir, "extra")
        os.makedirs(extra_dir, exist_ok=True)
        cmd.extend(["--traces-dir", extra_dir])
    if config and os.path.exists(config):
        cmd.extend(["--config", config])

    started = time.time()
    try:
        proc = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout, env=env,
        )
    except subprocess.TimeoutExpired:
        return False, (time.time() - started) * 1000, "", f"TIMEOUT after {timeout}s"
    except Exception as e:
        return False, (time.time() - started) * 1000, "", str(e)

    elapsed_ms = (time.time() - started) * 1000
    success = proc.returncode == 0
    # LLM agents with C extensions (numpy, httpx) can segfault during
    # Python shutdown (e.g. exit 139) AFTER writing correct output, so a
    # non-zero exit with valid JSON in the output file still counts as
    # success.
    if not success and os.path.exists(output_path):
        try:
            with open(output_path) as f:
                json.load(f)
            success = True
        except (json.JSONDecodeError, OSError):
            pass
    return success, elapsed_ms, proc.stdout, proc.stderr
72
-
73
-
74
def cmd_validate(args):
    """Smoke-test the harness against a dummy task and check its output contract.

    Exits non-zero with a FAIL message when the harness errors, times out,
    writes no output file, emits invalid JSON, or omits the required
    'id'/'output' fields.
    """
    harness = args.harness
    config = getattr(args, "config", None)
    timeout = getattr(args, "timeout", 30) or 30

    if not os.path.exists(harness):
        print(f"FAIL: harness not found: {harness}", file=sys.stderr)
        sys.exit(1)

    with tempfile.TemporaryDirectory() as tmpdir:
        input_path = os.path.join(tmpdir, "input.json")
        output_path = os.path.join(tmpdir, "output.json")
        dummy_task = {"id": "validation", "input": "test input for validation", "metadata": {}}
        with open(input_path, "w") as f:
            json.dump(dummy_task, f)

        success, elapsed, stdout, stderr = _run_harness_on_task(
            harness, config, input_path, output_path, None, timeout=timeout,
        )

        if not success:
            # Point at --timeout when the failure was a timeout, since
            # LLM-backed harnesses routinely exceed the 30s default.
            hint = ""
            if "TIMEOUT" in stderr:
                hint = (f"\nHint: validation timed out after {timeout}s. "
                        "For LLM-powered agents that make real API calls, "
                        "use --timeout to increase the limit: "
                        f"evaluate.py validate --harness {harness} --timeout 120")
            print(f"FAIL: harness exited with error.\nstderr: {stderr}{hint}", file=sys.stderr)
            sys.exit(1)

        if not os.path.exists(output_path):
            print("FAIL: harness did not create output file.", file=sys.stderr)
            sys.exit(1)

        try:
            with open(output_path) as f:
                output = json.load(f)
        except (json.JSONDecodeError, ValueError) as e:
            print(f"FAIL: output is not valid JSON: {e}", file=sys.stderr)
            sys.exit(1)

        if "id" not in output or "output" not in output:
            print(f"FAIL: output missing 'id' or 'output' fields. Got: {output}", file=sys.stderr)
            sys.exit(1)

        print(f"OK: harness validated in {elapsed:.0f}ms. Output: {output}")
120
-
121
-
122
def _build_langsmith_env(traces_dir):
    """Return a subprocess env dict with LangSmith auto-tracing vars, or None.

    Reads config.json from the project root (two levels above traces_dir).
    When eval.langsmith.enabled is set and the configured API-key env var is
    present, returns os.environ extended with LANGCHAIN tracing vars pointed
    at a per-version project, and records that project name in
    langsmith_project.txt so the evolve skill knows where to find traces.
    """
    project_root = os.path.dirname(os.path.dirname(traces_dir))
    project_config_path = os.path.join(project_root, "config.json")
    if not os.path.exists(project_config_path):
        return None
    with open(project_config_path) as f:
        project_config = json.load(f)
    ls = project_config.get("eval", {}).get("langsmith", {})
    if not ls.get("enabled"):
        return None
    api_key = os.environ.get(ls.get("api_key_env", "LANGSMITH_API_KEY"), "")
    if not api_key:
        return None
    # traces_dir's parent directory name is the harness version label.
    version = os.path.basename(os.path.dirname(traces_dir))
    ls_project = f"{ls.get('project_prefix', 'harness-evolver')}-{version}"
    with open(os.path.join(project_root, "langsmith_project.txt"), "w") as f:
        f.write(ls_project)
    return {
        **os.environ,
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_API_KEY": api_key,
        "LANGCHAIN_PROJECT": ls_project,
    }


def cmd_run(args):
    """Run the harness over every task, capture traces, then score with the eval script.

    Per-task traces (input/output copies, timing, stdout/stderr logs) are
    written under --traces-dir. Harness outputs are collected in a temporary
    results directory handed to the user's eval script, which writes the
    scores JSON to --scores. Exits non-zero when no tasks exist or the eval
    script fails or times out.
    """
    harness = args.harness
    config = getattr(args, "config", None)
    tasks_dir = args.tasks_dir
    eval_script = getattr(args, "eval")
    traces_dir = args.traces_dir
    scores_path = args.scores
    timeout = args.timeout

    os.makedirs(traces_dir, exist_ok=True)

    task_files = sorted(f for f in os.listdir(tasks_dir) if f.endswith(".json"))
    if not task_files:
        print(f"FAIL: no .json task files in {tasks_dir}", file=sys.stderr)
        sys.exit(1)

    all_stdout = []
    all_stderr = []
    timing = {"per_task": {}}
    results_dir = tempfile.mkdtemp()

    langsmith_env = _build_langsmith_env(traces_dir)

    try:
        for task_file in task_files:
            with open(os.path.join(tasks_dir, task_file)) as f:
                task = json.load(f)
            task_id = task["id"]

            # Strip the gold answer so the harness can't see it.
            task_input = {k: v for k, v in task.items() if k != "expected"}

            task_traces_dir = os.path.join(traces_dir, task_id)
            os.makedirs(task_traces_dir, exist_ok=True)

            input_path = os.path.join(task_traces_dir, "input.json")
            with open(input_path, "w") as f:
                json.dump(task_input, f, indent=2)

            output_path = os.path.join(results_dir, task_file)

            success, elapsed_ms, stdout, stderr = _run_harness_on_task(
                harness, config, input_path, output_path, task_traces_dir, timeout,
                env=langsmith_env,
            )

            # Keep a copy of the output alongside the traces; record an
            # explicit failure stub when the harness produced nothing.
            if os.path.exists(output_path):
                shutil.copy2(output_path, os.path.join(task_traces_dir, "output.json"))
            else:
                with open(os.path.join(task_traces_dir, "output.json"), "w") as f:
                    json.dump({"id": task_id, "output": "", "error": "harness failed"}, f)

            timing["per_task"][task_id] = round(elapsed_ms, 1)
            all_stdout.append(f"--- {task_id} ---\n{stdout}")
            all_stderr.append(f"--- {task_id} ---\n{stderr}")

        timing["total_ms"] = round(sum(timing["per_task"].values()), 1)
        with open(os.path.join(traces_dir, "timing.json"), "w") as f:
            json.dump(timing, f, indent=2)
        with open(os.path.join(traces_dir, "stdout.log"), "w") as f:
            f.write("\n".join(all_stdout))
        with open(os.path.join(traces_dir, "stderr.log"), "w") as f:
            f.write("\n".join(all_stderr))

        eval_cmd = [
            _resolve_python(), eval_script,
            "--results-dir", results_dir,
            "--tasks-dir", tasks_dir,
            "--scores", scores_path,
        ]
        try:
            result = subprocess.run(eval_cmd, capture_output=True, text=True, timeout=120)
        except subprocess.TimeoutExpired:
            # Fail cleanly instead of dumping a raw traceback.
            print("FAIL: eval script timed out after 120s.", file=sys.stderr)
            sys.exit(1)
        if result.returncode != 0:
            print(f"FAIL: eval script failed.\nstderr: {result.stderr}", file=sys.stderr)
            sys.exit(1)
    finally:
        # The results dir is only an intermediate hand-off to the eval
        # script; remove it instead of leaking one temp dir per run.
        shutil.rmtree(results_dir, ignore_errors=True)

    if os.path.exists(scores_path):
        with open(scores_path) as f:
            scores = json.load(f)
        print(f"Evaluation complete. combined_score: {scores.get('combined_score', 'N/A')}")
    else:
        print("WARNING: eval script did not produce scores file.", file=sys.stderr)
222
-
223
-
224
def main():
    """Parse CLI arguments and dispatch to the validate/run subcommand."""
    parser = argparse.ArgumentParser(description="Harness Evolver evaluation orchestrator")
    sub = parser.add_subparsers(dest="command")

    p_val = sub.add_parser("validate")
    p_val.add_argument("--harness", required=True)
    p_val.add_argument("--config", default=None)
    p_val.add_argument("--timeout", type=int, default=30,
                       help="Validation timeout in seconds (default: 30). "
                            "Increase for LLM-powered agents that make real API calls.")

    p_run = sub.add_parser("run")
    p_run.add_argument("--harness", required=True)
    p_run.add_argument("--config", default=None)
    p_run.add_argument("--tasks-dir", required=True)
    p_run.add_argument("--eval", required=True)
    p_run.add_argument("--traces-dir", required=True)
    p_run.add_argument("--scores", required=True)
    p_run.add_argument("--timeout", type=int, default=60)

    args = parser.parse_args()

    # Dispatch table instead of an if/elif chain; unknown or missing
    # subcommand prints usage and exits non-zero.
    handlers = {"validate": cmd_validate, "run": cmd_run}
    handler = handlers.get(args.command)
    if handler is None:
        parser.print_help()
        sys.exit(1)
    handler(args)


if __name__ == "__main__":
    main()