harness-evolver 2.9.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -117
- package/agents/evolver-architect.md +53 -0
- package/agents/evolver-critic.md +44 -0
- package/agents/evolver-proposer.md +128 -0
- package/agents/evolver-testgen.md +67 -0
- package/bin/install.js +181 -171
- package/package.json +7 -7
- package/skills/deploy/SKILL.md +49 -56
- package/skills/evolve/SKILL.md +180 -700
- package/skills/setup/SKILL.md +182 -0
- package/skills/status/SKILL.md +23 -21
- package/tools/read_results.py +240 -0
- package/tools/run_eval.py +202 -0
- package/tools/seed_from_traces.py +36 -8
- package/tools/setup.py +393 -0
- package/tools/trace_insights.py +86 -14
- package/agents/harness-evolver-architect.md +0 -173
- package/agents/harness-evolver-critic.md +0 -132
- package/agents/harness-evolver-judge.md +0 -110
- package/agents/harness-evolver-proposer.md +0 -317
- package/agents/harness-evolver-testgen.md +0 -112
- package/examples/classifier/README.md +0 -25
- package/examples/classifier/config.json +0 -3
- package/examples/classifier/eval.py +0 -58
- package/examples/classifier/harness.py +0 -111
- package/examples/classifier/tasks/task_001.json +0 -1
- package/examples/classifier/tasks/task_002.json +0 -1
- package/examples/classifier/tasks/task_003.json +0 -1
- package/examples/classifier/tasks/task_004.json +0 -1
- package/examples/classifier/tasks/task_005.json +0 -1
- package/examples/classifier/tasks/task_006.json +0 -1
- package/examples/classifier/tasks/task_007.json +0 -1
- package/examples/classifier/tasks/task_008.json +0 -1
- package/examples/classifier/tasks/task_009.json +0 -1
- package/examples/classifier/tasks/task_010.json +0 -1
- package/skills/architect/SKILL.md +0 -93
- package/skills/compare/SKILL.md +0 -73
- package/skills/critic/SKILL.md +0 -67
- package/skills/diagnose/SKILL.md +0 -96
- package/skills/import-traces/SKILL.md +0 -102
- package/skills/init/SKILL.md +0 -253
- package/tools/__pycache__/detect_stack.cpython-313.pyc +0 -0
- package/tools/__pycache__/init.cpython-313.pyc +0 -0
- package/tools/__pycache__/seed_from_traces.cpython-313.pyc +0 -0
- package/tools/__pycache__/trace_logger.cpython-313.pyc +0 -0
- package/tools/eval_llm_judge.py +0 -233
- package/tools/eval_passthrough.py +0 -55
- package/tools/evaluate.py +0 -255
- package/tools/import_traces.py +0 -229
- package/tools/init.py +0 -531
- package/tools/llm_api.py +0 -125
- package/tools/state.py +0 -219
- package/tools/test_growth.py +0 -230
- package/tools/trace_logger.py +0 -42
package/tools/eval_llm_judge.py
DELETED
|
@@ -1,233 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""LLM-as-judge evaluation script for Harness Evolver.
|
|
3
|
-
|
|
4
|
-
Scores harness outputs using an LLM judge across multiple quality dimensions:
|
|
5
|
-
accuracy, completeness, relevance, no_hallucination.
|
|
6
|
-
|
|
7
|
-
CLI interface matches existing evals: --results-dir, --tasks-dir, --scores.
|
|
8
|
-
Stdlib-only. No external dependencies.
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
import argparse
|
|
12
|
-
import json
|
|
13
|
-
import os
|
|
14
|
-
import re
|
|
15
|
-
import sys
|
|
16
|
-
|
|
17
|
-
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
18
|
-
from llm_api import detect_provider, call_llm
|
|
19
|
-
|
|
20
|
-
# Relative weight of each judged quality dimension in the combined score.
WEIGHTS = dict(
    accuracy=0.4,
    completeness=0.2,
    relevance=0.2,
    no_hallucination=0.2,
)

# Dimension names in scoring order; mirrors the insertion order of WEIGHTS.
DIMENSIONS = list(WEIGHTS)
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def build_judge_prompt(task, result):
    """Assemble the plain-text prompt sent to the LLM judge.

    The prompt lists the task input, the harness output and, only when the
    task has an ``"expected"`` key, a reference answer. It ends with the 1-5
    scoring rubric and the JSON reply format the judge must use.

    Args:
        task: Mapping read via ``task.get("input", "")`` and ``task["expected"]``.
        result: Mapping read via ``result.get("output", "")``.

    Returns:
        The prompt as a single newline-joined string.
    """
    header = [
        "You are an expert evaluator. Assess the quality of the following output.",
        "",
        "QUESTION/INPUT:",
        str(task.get("input", "")),
        "",
        "OUTPUT TO EVALUATE:",
        str(result.get("output", "")),
    ]

    # Key presence (not truthiness) decides whether a reference is shown.
    if "expected" in task:
        reference = ["", "REFERENCE ANSWER:", str(task["expected"])]
    else:
        reference = []

    rubric = [
        "",
        "Score each dimension from 1 (worst) to 5 (best):",
        "- accuracy: Is the output factually correct and properly addresses the input?",
        "- completeness: Does it cover all relevant aspects?",
        "- relevance: Is it focused and on-topic?",
        "- no_hallucination: Does it avoid fabricating information not supported by context?",
        "",
        "Think step by step, then respond with ONLY this JSON:",
        '{"reasoning": "your analysis", "accuracy": N, "completeness": N, "relevance": N, "no_hallucination": N}',
    ]

    return "\n".join(header + reference + rubric)
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
def extract_json_scores(response):
    """Extract the scoring JSON object from an LLM judge response.

    Strategies are tried in order and the first hit wins:

    1. the whole (stripped) response parsed as JSON;
    2. the first JSON object inside a Markdown code fence;
    3. the first brace-free-bodied ``{...}`` span holding a numeric
       ``"accuracy"`` value;
    4. a scan that tries to decode a JSON value at every ``{`` in the text,
       which also handles objects whose string values contain braces.

    Only JSON *objects* with an ``"accuracy"`` key are accepted, so scalar or
    list replies (e.g. a bare ``3``) are rejected instead of raising.

    Args:
        response: Raw text returned by the judge model.

    Returns:
        The parsed dict, or ``None`` when no scoring object is found.
    """

    def _as_scores(text):
        # Parse ``text`` and keep it only if it is a dict with an accuracy key.
        try:
            data = json.loads(text)
        except (json.JSONDecodeError, ValueError):
            return None
        if isinstance(data, dict) and "accuracy" in data:
            return data
        return None

    # Try direct parse
    scores = _as_scores(response.strip())
    if scores is not None:
        return scores

    # Try extracting from markdown fences
    fence_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response, re.DOTALL)
    if fence_match:
        scores = _as_scores(fence_match.group(1))
        if scores is not None:
            return scores

    # Try regex extraction for JSON with accuracy key
    json_match = re.search(r'\{[^{}]*"accuracy"\s*:\s*\d[^{}]*\}', response)
    if json_match:
        scores = _as_scores(json_match.group(0))
        if scores is not None:
            return scores

    # Last resort: decode from each opening brace. The regex above cannot
    # match when the reasoning text itself contains '{' or '}'.
    decoder = json.JSONDecoder()
    start = response.find("{")
    while start != -1:
        try:
            data, _end = decoder.raw_decode(response, start)
        except ValueError:
            data = None
        if isinstance(data, dict) and "accuracy" in data:
            return data
        start = response.find("{", start + 1)

    return None
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
def normalize_score(raw_score):
    """Normalize a 1-5 score to the 0.0-1.0 range.

    The value is converted to an integer (fractions truncate, as ``int()``
    does), clamped to 1..5 and mapped linearly so 1 -> 0.0 and 5 -> 1.0.

    Judge models do not always return clean integers, so values that cannot
    be read as a number (``None``, ``"N/A"``, NaN, infinity) are treated as
    the worst score rather than raising. Numeric strings with a fractional
    part such as ``"4.5"`` are accepted.

    Args:
        raw_score: The judge's score; normally an int from 1 to 5.

    Returns:
        A float in ``[0.0, 1.0]``.
    """
    try:
        value = int(raw_score)
    except (TypeError, ValueError, OverflowError):
        # int() rejects strings like "4.5"; retry through float before
        # giving up on the value entirely.
        try:
            value = int(float(raw_score))
        except (TypeError, ValueError, OverflowError):
            return 0.0

    clamped = max(1, min(5, value))
    return (clamped - 1) / 4.0
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
def compute_combined_score(scores_dict):
    """Return the weighted sum of the normalized dimension scores.

    Args:
        scores_dict: Mapping of dimension name to normalized score. A
            dimension missing from the mapping contributes 0.0.

    Returns:
        The sum over ``DIMENSIONS`` of score times ``WEIGHTS[dimension]``.
    """
    weighted = 0.0
    # Accumulate left to right in DIMENSIONS order so float rounding is stable.
    for name in DIMENSIONS:
        contribution = scores_dict.get(name, 0.0) * WEIGHTS[name]
        weighted = weighted + contribution
    return weighted
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
def _failed_judgement(reasoning, error):
    """Build the zero-score record used whenever a task cannot be judged."""
    return {
        "score": 0.0,
        "accuracy": 1, "completeness": 1, "relevance": 1, "no_hallucination": 1,
        "reasoning": reasoning,
        "error": error,
    }


def evaluate_task(provider, api_key, model, task, result):
    """Evaluate a single task with the LLM judge.

    Builds the judge prompt, calls the model and converts its reply into a
    per-task score record. Every failure mode (the call raising, an
    unparseable reply, a reply with non-numeric scores) yields a zero-score
    record carrying an ``"error"`` key instead of propagating an exception,
    so one bad task does not abort the whole evaluation.

    Args:
        provider: Provider identifier passed through to ``call_llm``.
        api_key: API key passed through to ``call_llm``.
        model: Model name passed through to ``call_llm``.
        task: Task dict (see ``build_judge_prompt``).
        result: Harness result dict (see ``build_judge_prompt``).

    Returns:
        Dict with ``"score"`` (combined, rounded to 4 places), the raw value
        of each dimension, ``"reasoning"`` and, on failure, ``"error"``.
    """
    prompt = build_judge_prompt(task, result)

    try:
        response = call_llm(provider, api_key, model, prompt, max_tokens=2048)
    except Exception as e:  # boundary: any provider failure becomes a zero score
        return _failed_judgement(f"LLM call failed: {e}", str(e))

    parsed = extract_json_scores(response)
    if not isinstance(parsed, dict):
        return _failed_judgement(
            f"Failed to parse judge response: {response[:200]}", "parse_failed",
        )

    # Extract raw scores; a dimension the judge omitted counts as the worst.
    raw = {dim: parsed.get(dim, 1) for dim in DIMENSIONS}
    try:
        normalized = {dim: normalize_score(value) for dim, value in raw.items()}
    except (TypeError, ValueError, OverflowError) as e:
        # e.g. {"accuracy": "high"}: report it rather than crash the run.
        return _failed_judgement(
            f"Judge returned a non-numeric score: {e}", "invalid_score",
        )

    combined = compute_combined_score(normalized)

    return {
        "score": round(combined, 4),
        "accuracy": raw["accuracy"],
        "completeness": raw["completeness"],
        "relevance": raw["relevance"],
        "no_hallucination": raw["no_hallucination"],
        "reasoning": parsed.get("reasoning", ""),
    }
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
def main():
    """Score every task's harness output with the LLM judge.

    Reads each ``*.json`` task from ``--tasks-dir`` and the result file of
    the same name from ``--results-dir``, judges each pair, then writes the
    aggregate and per-task scores as JSON to ``--scores`` and prints a
    one-line summary. Exits with status 1 when the tasks directory holds no
    ``.json`` files.

    A result file that is missing, unreadable or not a JSON object is judged
    as an empty output instead of aborting the run.
    """
    parser = argparse.ArgumentParser(description="LLM-as-judge evaluation")
    parser.add_argument("--results-dir", required=True,
                        help="Directory with harness output JSON files")
    parser.add_argument("--tasks-dir", required=True,
                        help="Directory with task JSON files")
    parser.add_argument("--scores", required=True,
                        help="Output path for scores JSON")
    args = parser.parse_args()

    # Detect LLM provider
    provider, api_key, model = detect_provider()

    # Collect tasks
    task_files = sorted(f for f in os.listdir(args.tasks_dir) if f.endswith(".json"))
    if not task_files:
        print(f"FAIL: no .json task files in {args.tasks_dir}", file=sys.stderr)
        sys.exit(1)

    per_task = {}
    dimension_totals = {dim: 0.0 for dim in DIMENSIONS}
    total_combined = 0.0
    total_tasks = 0

    for task_file in task_files:
        # Load task
        task_path = os.path.join(args.tasks_dir, task_file)
        with open(task_path, encoding="utf-8") as f:
            task = json.load(f)
        task_id = task["id"]

        # Load result
        result_path = os.path.join(args.results_dir, task_file)
        if os.path.exists(result_path):
            try:
                with open(result_path, encoding="utf-8") as f:
                    result = json.load(f)
            except (OSError, ValueError) as e:
                result = {"id": task_id, "output": "", "error": f"unreadable output file: {e}"}
            else:
                if not isinstance(result, dict):
                    result = {"id": task_id, "output": "", "error": "output is not a JSON object"}
        else:
            result = {"id": task_id, "output": "", "error": "no output file"}

        # Evaluate
        task_scores = evaluate_task(provider, api_key, model, task, result)
        per_task[task_id] = task_scores

        # Accumulate
        total_combined += task_scores["score"]
        for dim in DIMENSIONS:
            dimension_totals[dim] += normalize_score(task_scores[dim])
        total_tasks += 1

    # Compute averages. task_files is non-empty here, so total_tasks >= 1.
    combined_score = round(total_combined / total_tasks, 4)
    avg_dimensions = {
        dim: round(dimension_totals[dim] / total_tasks, 4) for dim in DIMENSIONS
    }

    scores = {
        "combined_score": combined_score,
        "eval_type": "llm-judge",
        "judge_provider": provider,
        "judge_model": model,
        "dimensions": avg_dimensions,
        "weights": WEIGHTS,
        "total_tasks": total_tasks,
        "per_task": per_task,
    }

    # Write scores
    os.makedirs(os.path.dirname(os.path.abspath(args.scores)), exist_ok=True)
    with open(args.scores, "w", encoding="utf-8") as f:
        json.dump(scores, f, indent=2)

    print(f"LLM judge evaluation complete. combined_score: {combined_score} "
          f"({total_tasks} tasks, provider: {provider}/{model})")


if __name__ == "__main__":
    main()
|
|
@@ -1,55 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""Passthrough eval — collects outputs for judge subagent scoring.
|
|
3
|
-
|
|
4
|
-
When no custom eval.py exists, this is used as the default. It does NOT score
|
|
5
|
-
outputs — it collects them and marks them for the judge subagent to evaluate.
|
|
6
|
-
The evolve skill detects eval_type=pending-judge and spawns the judge agent.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
import argparse
|
|
10
|
-
import json
|
|
11
|
-
import os
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def main():
    """Collect task inputs and harness outputs for later judge scoring.

    For every ``*.json`` task in ``--tasks-dir`` this records the task input
    and the ``"output"`` field of the same-named file in ``--results-dir``
    (each truncated to 500 characters) with a placeholder score of ``-1``.
    The collection is written to ``--scores`` with
    ``eval_type`` set to ``"pending-judge"``.

    A result file that is missing, unreadable or not a JSON object is
    recorded as an empty output. The parent directory of ``--scores`` is
    created if needed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--results-dir", required=True)
    parser.add_argument("--tasks-dir", required=True)
    parser.add_argument("--scores", required=True)
    args = parser.parse_args()

    per_task = {}
    for fname in sorted(os.listdir(args.tasks_dir)):
        if not fname.endswith(".json"):
            continue
        with open(os.path.join(args.tasks_dir, fname), encoding="utf-8") as f:
            task = json.load(f)
        task_id = task["id"]

        result_path = os.path.join(args.results_dir, fname)
        output = ""
        if os.path.exists(result_path):
            try:
                with open(result_path, encoding="utf-8") as f:
                    result = json.load(f)
            except (OSError, ValueError):
                result = None  # corrupt output is collected as empty
            if isinstance(result, dict):
                output = str(result.get("output", ""))

        per_task[task_id] = {
            "score": -1,
            "input": str(task.get("input", ""))[:500],
            "output": output[:500],
        }

    scores = {
        "combined_score": -1,
        "eval_type": "pending-judge",
        "total_tasks": len(per_task),
        "per_task": per_task,
    }
    # Make sure the destination directory exists before writing.
    os.makedirs(os.path.dirname(os.path.abspath(args.scores)), exist_ok=True)
    with open(args.scores, "w", encoding="utf-8") as f:
        json.dump(scores, f, indent=2)

    print(f"Collected {len(per_task)} task outputs for judge scoring.")


if __name__ == "__main__":
    main()
|
package/tools/evaluate.py
DELETED
|
@@ -1,255 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""Evaluation orchestrator for Harness Evolver.
|
|
3
|
-
|
|
4
|
-
Commands:
|
|
5
|
-
validate --harness PATH [--config PATH] [--timeout SECONDS]
|
|
6
|
-
run --harness PATH --tasks-dir PATH --eval PATH --traces-dir PATH --scores PATH
|
|
7
|
-
[--config PATH] [--timeout SECONDS]
|
|
8
|
-
|
|
9
|
-
Runs harness per task, captures traces (stdout/stderr/timing), then calls user's eval script.
|
|
10
|
-
Stdlib-only. No external dependencies.
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
import argparse
|
|
14
|
-
import json
|
|
15
|
-
import os
|
|
16
|
-
import shutil
|
|
17
|
-
import subprocess
|
|
18
|
-
import sys
|
|
19
|
-
import tempfile
|
|
20
|
-
import time
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
def _resolve_python():
    """Pick the Python interpreter used to launch subprocesses.

    The interpreter running this script (``sys.executable``) is preferred
    over a hardcoded ``python3`` so that a harness living in a virtualenv is
    run with that environment's interpreter and site-packages rather than
    whatever ``python3`` resolves to on the system.

    Returns:
        ``sys.executable`` when it is set and names an existing file,
        otherwise the string ``"python3"``.
    """
    current = sys.executable
    usable = bool(current) and os.path.isfile(current)
    return current if usable else "python3"
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def _run_harness_on_task(harness, config, task_input_path, output_path, task_traces_dir, timeout, env=None):
    """Run the harness on a single task.

    The harness is launched as ``<python> harness --input ... --output ...``
    with ``--traces-dir <task_traces_dir>/extra`` added when a traces
    directory is given and ``--config`` added when the config file exists.

    Args:
        harness: Path to the harness script.
        config: Optional path to a config file.
        task_input_path: Path of the task input JSON handed to the harness.
        output_path: Path where the harness is expected to write its output.
        task_traces_dir: Per-task traces directory, or a falsy value for none.
        timeout: Subprocess timeout in seconds.
        env: Optional environment mapping for the subprocess.

    Returns:
        Tuple ``(success, elapsed_ms, stdout, stderr)``. On timeout or any
        other launch error ``success`` is False, ``stdout`` is empty and
        ``stderr`` carries the reason.
    """
    cmd = [_resolve_python(), harness, "--input", task_input_path, "--output", output_path]
    if task_traces_dir:
        extra_dir = os.path.join(task_traces_dir, "extra")
        os.makedirs(extra_dir, exist_ok=True)
        cmd.extend(["--traces-dir", extra_dir])
    if config and os.path.exists(config):
        cmd.extend(["--config", config])

    # perf_counter is monotonic, so system clock adjustments cannot skew
    # (or make negative) the measured duration.
    start = time.perf_counter()
    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout, env=env,
        )
        elapsed_ms = (time.perf_counter() - start) * 1000
        # Accept exit code 0 (success) or check if output file exists for non-zero exits.
        # LLM agents with C extensions (numpy, httpx) often segfault (exit 139) during
        # Python shutdown AFTER writing correct output.
        success = result.returncode == 0
        if not success and os.path.exists(output_path):
            try:
                with open(output_path) as f:
                    json.load(f)
                # Valid JSON output exists despite non-zero exit — treat as success
                success = True
            except (json.JSONDecodeError, OSError):
                pass
        return success, elapsed_ms, result.stdout, result.stderr
    except subprocess.TimeoutExpired:
        elapsed_ms = (time.perf_counter() - start) * 1000
        return False, elapsed_ms, "", f"TIMEOUT after {timeout}s"
    except Exception as e:  # boundary: launch failures are reported, not raised
        elapsed_ms = (time.perf_counter() - start) * 1000
        return False, elapsed_ms, "", str(e)
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
def cmd_validate(args):
    """Smoke-test a harness by running it once on a dummy task.

    Writes a dummy task to a temporary directory, runs the harness on it and
    checks that it produced a JSON object containing ``"id"`` and
    ``"output"``. Prints ``OK: ...`` on success; on any failure prints a
    ``FAIL: ...`` message to stderr and exits with status 1.

    Args:
        args: Parsed arguments with ``harness`` and optionally ``config``
            and ``timeout`` (seconds; falls back to 30 when unset or 0).
    """
    harness = args.harness
    config = getattr(args, "config", None)
    timeout = getattr(args, "timeout", 30) or 30

    if not os.path.exists(harness):
        print(f"FAIL: harness not found: {harness}", file=sys.stderr)
        sys.exit(1)

    with tempfile.TemporaryDirectory() as tmpdir:
        dummy_task = {"id": "validation", "input": "test input for validation", "metadata": {}}
        input_path = os.path.join(tmpdir, "input.json")
        output_path = os.path.join(tmpdir, "output.json")
        with open(input_path, "w") as f:
            json.dump(dummy_task, f)

        success, elapsed, _stdout, stderr = _run_harness_on_task(
            harness, config, input_path, output_path, None, timeout=timeout,
        )

        if not success:
            hint = ""
            if "TIMEOUT" in stderr:
                hint = (f"\nHint: validation timed out after {timeout}s. "
                        "For LLM-powered agents that make real API calls, "
                        "use --timeout to increase the limit: "
                        f"evaluate.py validate --harness {harness} --timeout 120")
            print(f"FAIL: harness exited with error.\nstderr: {stderr}{hint}", file=sys.stderr)
            sys.exit(1)

        if not os.path.exists(output_path):
            print("FAIL: harness did not create output file.", file=sys.stderr)
            sys.exit(1)

        try:
            with open(output_path) as f:
                output = json.load(f)
        except (json.JSONDecodeError, ValueError) as e:
            print(f"FAIL: output is not valid JSON: {e}", file=sys.stderr)
            sys.exit(1)

        # Valid JSON need not be an object; a bare number would otherwise
        # raise TypeError on the membership tests below.
        if not isinstance(output, dict) or "id" not in output or "output" not in output:
            print(f"FAIL: output missing 'id' or 'output' fields. Got: {output}", file=sys.stderr)
            sys.exit(1)

        print(f"OK: harness validated in {elapsed:.0f}ms. Output: {output}")
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
def cmd_run(args):
    """Run the harness on every task, record traces, then run the eval script.

    For each ``*.json`` task in ``args.tasks_dir`` the task (minus its
    ``"expected"`` field) is written to ``<traces_dir>/<task_id>/input.json``,
    the harness is run, and its output (or an error stub) is copied to
    ``<traces_dir>/<task_id>/output.json``. Timing and the combined
    stdout/stderr logs are written to ``traces_dir``. The eval script is then
    invoked with ``--results-dir``, ``--tasks-dir`` and ``--scores``.

    Harness outputs are staged in a temporary results directory that is
    removed once the eval script has finished. Exits with status 1 when
    there are no tasks, or when the eval script fails or times out.

    Args:
        args: Parsed arguments with ``harness``, ``tasks_dir``, ``eval``,
            ``traces_dir``, ``scores``, ``timeout`` and optionally ``config``.
    """
    harness = args.harness
    config = getattr(args, "config", None)
    tasks_dir = args.tasks_dir
    eval_script = getattr(args, "eval")
    traces_dir = args.traces_dir
    scores_path = args.scores
    timeout = args.timeout

    os.makedirs(traces_dir, exist_ok=True)

    task_files = sorted(f for f in os.listdir(tasks_dir) if f.endswith(".json"))
    if not task_files:
        print(f"FAIL: no .json task files in {tasks_dir}", file=sys.stderr)
        sys.exit(1)

    all_stdout = []
    all_stderr = []
    timing = {"per_task": {}}

    # LangSmith: setup auto-tracing env vars if configured
    langsmith_env = None
    # The project config is looked up two directory levels above traces_dir.
    project_root = os.path.dirname(os.path.dirname(traces_dir))
    project_config_path = os.path.join(project_root, "config.json")
    if os.path.exists(project_config_path):
        with open(project_config_path) as f:
            project_config = json.load(f)
        ls = project_config.get("eval", {}).get("langsmith", {})
        if ls.get("enabled"):
            api_key = os.environ.get(ls.get("api_key_env", "LANGSMITH_API_KEY"), "")
            if api_key:
                version = os.path.basename(os.path.dirname(traces_dir))
                ls_project = f"{ls.get('project_prefix', 'harness-evolver')}-{version}"
                langsmith_env = {
                    **os.environ,
                    "LANGCHAIN_TRACING_V2": "true",
                    "LANGCHAIN_API_KEY": api_key,
                    "LANGCHAIN_PROJECT": ls_project,
                }
                # Write the project name so the evolve skill knows where to find traces
                ls_project_file = os.path.join(project_root, "langsmith_project.txt")
                with open(ls_project_file, "w") as f:
                    f.write(ls_project)

    # The staging directory is only needed until the eval script has read
    # it; the context manager removes it even when we exit early.
    with tempfile.TemporaryDirectory() as results_dir:
        for task_file in task_files:
            task_path = os.path.join(tasks_dir, task_file)
            with open(task_path) as f:
                task = json.load(f)
            task_id = task["id"]

            # The harness must not see the expected answer.
            task_input = {k: v for k, v in task.items() if k != "expected"}

            task_traces_dir = os.path.join(traces_dir, task_id)
            os.makedirs(task_traces_dir, exist_ok=True)

            input_path = os.path.join(task_traces_dir, "input.json")
            with open(input_path, "w") as f:
                json.dump(task_input, f, indent=2)

            output_path = os.path.join(results_dir, task_file)

            _success, elapsed_ms, stdout, stderr = _run_harness_on_task(
                harness, config, input_path, output_path, task_traces_dir, timeout,
                env=langsmith_env,
            )

            if os.path.exists(output_path):
                shutil.copy2(output_path, os.path.join(task_traces_dir, "output.json"))
            else:
                with open(os.path.join(task_traces_dir, "output.json"), "w") as f:
                    json.dump({"id": task_id, "output": "", "error": "harness failed"}, f)

            timing["per_task"][task_id] = round(elapsed_ms, 1)
            all_stdout.append(f"--- {task_id} ---\n{stdout}")
            all_stderr.append(f"--- {task_id} ---\n{stderr}")

        timing["total_ms"] = round(sum(timing["per_task"].values()), 1)
        with open(os.path.join(traces_dir, "timing.json"), "w") as f:
            json.dump(timing, f, indent=2)
        with open(os.path.join(traces_dir, "stdout.log"), "w") as f:
            f.write("\n".join(all_stdout))
        with open(os.path.join(traces_dir, "stderr.log"), "w") as f:
            f.write("\n".join(all_stderr))

        eval_cmd = [
            _resolve_python(), eval_script,
            "--results-dir", results_dir,
            "--tasks-dir", tasks_dir,
            "--scores", scores_path,
        ]
        try:
            result = subprocess.run(eval_cmd, capture_output=True, text=True, timeout=120)
        except subprocess.TimeoutExpired:
            print("FAIL: eval script timed out after 120s.", file=sys.stderr)
            sys.exit(1)
        if result.returncode != 0:
            print(f"FAIL: eval script failed.\nstderr: {result.stderr}", file=sys.stderr)
            sys.exit(1)

    if os.path.exists(scores_path):
        with open(scores_path) as f:
            scores = json.load(f)
        print(f"Evaluation complete. combined_score: {scores.get('combined_score', 'N/A')}")
    else:
        print("WARNING: eval script did not produce scores file.", file=sys.stderr)
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
def main():
    """Parse the command line and dispatch to the chosen subcommand.

    Two subcommands are defined, ``validate`` and ``run``. When no
    subcommand is given the help text is printed and the process exits with
    status 1.
    """
    parser = argparse.ArgumentParser(description="Harness Evolver evaluation orchestrator")
    subcommands = parser.add_subparsers(dest="command")

    validate_parser = subcommands.add_parser("validate")
    validate_parser.add_argument("--harness", required=True)
    validate_parser.add_argument("--config", default=None)
    validate_parser.add_argument(
        "--timeout", type=int, default=30,
        help="Validation timeout in seconds (default: 30). "
             "Increase for LLM-powered agents that make real API calls.",
    )

    run_parser = subcommands.add_parser("run")
    run_parser.add_argument("--harness", required=True)
    run_parser.add_argument("--config", default=None)
    run_parser.add_argument("--tasks-dir", required=True)
    run_parser.add_argument("--eval", required=True)
    run_parser.add_argument("--traces-dir", required=True)
    run_parser.add_argument("--scores", required=True)
    run_parser.add_argument("--timeout", type=int, default=60)

    args = parser.parse_args()

    # Map each subcommand name to the function that implements it.
    handlers = {"validate": cmd_validate, "run": cmd_run}
    handler = handlers.get(args.command)
    if handler is None:
        parser.print_help()
        sys.exit(1)
    handler(args)


if __name__ == "__main__":
    main()
|