harness-evolver 2.9.1 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +62 -117
- package/agents/evolver-architect.md +53 -0
- package/agents/evolver-critic.md +44 -0
- package/agents/evolver-proposer.md +128 -0
- package/agents/evolver-testgen.md +67 -0
- package/bin/install.js +181 -171
- package/package.json +7 -7
- package/skills/deploy/SKILL.md +49 -56
- package/skills/evolve/SKILL.md +156 -687
- package/skills/setup/SKILL.md +182 -0
- package/skills/status/SKILL.md +23 -21
- package/tools/read_results.py +240 -0
- package/tools/run_eval.py +202 -0
- package/tools/seed_from_traces.py +36 -8
- package/tools/setup.py +393 -0
- package/tools/trace_insights.py +86 -14
- package/agents/harness-evolver-architect.md +0 -173
- package/agents/harness-evolver-critic.md +0 -132
- package/agents/harness-evolver-judge.md +0 -110
- package/agents/harness-evolver-proposer.md +0 -317
- package/agents/harness-evolver-testgen.md +0 -112
- package/examples/classifier/README.md +0 -25
- package/examples/classifier/config.json +0 -3
- package/examples/classifier/eval.py +0 -58
- package/examples/classifier/harness.py +0 -111
- package/examples/classifier/tasks/task_001.json +0 -1
- package/examples/classifier/tasks/task_002.json +0 -1
- package/examples/classifier/tasks/task_003.json +0 -1
- package/examples/classifier/tasks/task_004.json +0 -1
- package/examples/classifier/tasks/task_005.json +0 -1
- package/examples/classifier/tasks/task_006.json +0 -1
- package/examples/classifier/tasks/task_007.json +0 -1
- package/examples/classifier/tasks/task_008.json +0 -1
- package/examples/classifier/tasks/task_009.json +0 -1
- package/examples/classifier/tasks/task_010.json +0 -1
- package/skills/architect/SKILL.md +0 -93
- package/skills/compare/SKILL.md +0 -73
- package/skills/critic/SKILL.md +0 -67
- package/skills/diagnose/SKILL.md +0 -96
- package/skills/import-traces/SKILL.md +0 -102
- package/skills/init/SKILL.md +0 -293
- package/tools/__pycache__/detect_stack.cpython-313.pyc +0 -0
- package/tools/__pycache__/init.cpython-313.pyc +0 -0
- package/tools/__pycache__/seed_from_traces.cpython-313.pyc +0 -0
- package/tools/__pycache__/trace_logger.cpython-313.pyc +0 -0
- package/tools/eval_llm_judge.py +0 -233
- package/tools/eval_passthrough.py +0 -55
- package/tools/evaluate.py +0 -255
- package/tools/import_traces.py +0 -229
- package/tools/init.py +0 -531
- package/tools/llm_api.py +0 -125
- package/tools/state.py +0 -219
- package/tools/test_growth.py +0 -230
- package/tools/trace_logger.py +0 -42
package/tools/state.py
DELETED
|
@@ -1,219 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""State manager for Harness Evolver.
|
|
3
|
-
|
|
4
|
-
Commands:
|
|
5
|
-
init --base-dir DIR --baseline-score FLOAT
|
|
6
|
-
update --base-dir DIR --version VER --scores PATH --proposal PATH
|
|
7
|
-
show --base-dir DIR
|
|
8
|
-
|
|
9
|
-
Manages: summary.json (source of truth), STATE.md (human view), PROPOSER_HISTORY.md (log).
|
|
10
|
-
Stdlib-only. No external dependencies.
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
import argparse
|
|
14
|
-
import json
|
|
15
|
-
import os
|
|
16
|
-
import re
|
|
17
|
-
import sys
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
def _read_json(path):
    """Load and return the JSON document stored at ``path``.

    The file is decoded explicitly as UTF-8 (the interchange encoding for
    JSON) instead of the platform locale encoding, so the same file parses
    identically on every OS.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        return json.load(f)
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def _write_json(path, data):
    """Serialize ``data`` as 2-space-indented JSON, replacing the file at ``path``."""
    with open(path, mode="w") as sink:
        json.dump(data, sink, indent=2)
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def _read_text(path):
    """Return the full contents of the text file at ``path``."""
    with open(path) as source:
        contents = source.read()
    return contents
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def _write_text(path, text):
    """Replace the contents of the file at ``path`` with ``text``."""
    with open(path, mode="w") as sink:
        sink.write(text)
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
def _summary_path(base_dir):
    """Return the location of ``summary.json`` inside ``base_dir``."""
    filename = "summary.json"
    return os.path.join(base_dir, filename)
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
def _state_md_path(base_dir):
    """Return the location of ``STATE.md`` inside ``base_dir``."""
    filename = "STATE.md"
    return os.path.join(base_dir, filename)
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def _history_path(base_dir):
    """Return the location of ``PROPOSER_HISTORY.md`` inside ``base_dir``."""
    filename = "PROPOSER_HISTORY.md"
    return os.path.join(base_dir, filename)
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
def _detect_parent(proposal_text, current_best):
    """Extract the parent version named in a proposal.

    Searches ``proposal_text`` for the first "Based on vNNN" or
    "Based on baseline" phrase (leading "B" may be lower-case) and returns
    the version token. Falls back to ``current_best`` when no such phrase
    is present.
    """
    found = re.search(r"[Bb]ased on (v\d+|baseline)", proposal_text)
    return found.group(1) if found else current_best
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
def _render_state_md(summary):
    """Build the Markdown text of STATE.md from a summary dict.

    Reads ``summary["iterations"]``, ``summary["best"]``,
    ``summary["worst"]`` and ``summary["history"]``. The output has a header
    with iteration count, best, worst and (when history is non-empty) latest
    version, followed by a history table. The Delta column compares each
    entry with the entry immediately before it in the history list; the first
    entry and any entry named "baseline" show "-".
    """

    def _delta_cell(score, previous, version):
        # No comparison is possible for the first row or for the baseline.
        if previous is None or version == "baseline":
            return "-"
        change = score - previous
        # Changes within +/-0.01 are reported as stagnant.
        if change < -0.01:
            return f"{change:+.2f} REGRESSION"
        if change > 0.01:
            return f"{change:+.2f}"
        return f"{change:+.2f} (stagnant)"

    top = summary["best"]
    bottom = summary["worst"]
    out = [
        "# Harness Evolver Status",
        "",
        f"**Iterations:** {summary['iterations']}",
        f"**Best:** {top['version']} ({top['combined_score']:.2f})",
        f"**Worst:** {bottom['version']} ({bottom['combined_score']:.2f})",
    ]

    history = summary["history"]
    if history:
        newest = history[-1]
        out.append(f"**Latest:** {newest['version']} ({newest['combined_score']:.2f})")

    out.extend([
        "",
        "## History",
        "",
        "| Version | Score | Parent | Delta |",
        "|---------|-------|--------|-------|",
    ])

    previous = None
    for row in history:
        version = row["version"]
        score = row["combined_score"]
        parent = row["parent"] or "-"
        cell = _delta_cell(score, previous, version)
        out.append(f"| {version} | {score:.2f} | {parent} | {cell} |")
        previous = score

    return "\n".join(out) + "\n"
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
def cmd_init(args):
    """Handle the ``init`` command: create a fresh state directory.

    Creates ``args.base_dir`` if needed, then writes summary.json (with the
    baseline as the only history entry and as both best and worst), the
    rendered STATE.md, and a PROPOSER_HISTORY.md containing only its title.
    Existing files at those paths are overwritten.
    """
    root = args.base_dir
    baseline = args.baseline_score
    os.makedirs(root, exist_ok=True)

    initial = {
        "iterations": 0,
        "best": {"version": "baseline", "combined_score": baseline},
        "worst": {"version": "baseline", "combined_score": baseline},
        "history": [
            {"version": "baseline", "combined_score": baseline, "parent": None},
        ],
    }

    _write_json(_summary_path(root), initial)
    state_markdown = _render_state_md(initial)
    _write_text(_state_md_path(root), state_markdown)
    _write_text(_history_path(root), "# Proposer History\n")
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
def cmd_update(args):
    """Handle the ``update`` command: record a newly evaluated version.

    Reads the scores file (``args.scores``) and the optional proposal text
    (``args.proposal``), appends an entry to the summary history, recomputes
    the iteration count and the best/worst versions (baseline excluded),
    rewrites summary.json and STATE.md, and appends a section to
    PROPOSER_HISTORY.md. The section is tagged as a regression when the new
    score is more than 0.01 below the parent's score.
    """
    root = args.base_dir
    new_version = args.version
    score_data = _read_json(args.scores)
    proposal = _read_text(args.proposal) if args.proposal else ""

    summary = _read_json(_summary_path(root))
    new_score = score_data.get("combined_score", 0.0)
    # Parent comes from the proposal text; otherwise assume the current best.
    parent_version = _detect_parent(proposal, summary["best"]["version"])

    summary["history"].append({
        "version": new_version,
        "combined_score": new_score,
        "parent": parent_version,
    })
    # The baseline entry does not count as an iteration.
    summary["iterations"] = len(summary["history"]) - 1

    candidates = [item for item in summary["history"] if item["version"] != "baseline"]
    if candidates:
        def by_score(item):
            return item["combined_score"]

        strongest = max(candidates, key=by_score)
        weakest = min(candidates, key=by_score)
        summary["best"] = {
            "version": strongest["version"],
            "combined_score": strongest["combined_score"],
        }
        summary["worst"] = {
            "version": weakest["version"],
            "combined_score": weakest["combined_score"],
        }

    _write_json(_summary_path(root), summary)
    _write_text(_state_md_path(root), _render_state_md(summary))

    # Score of the first history entry whose version matches the parent.
    parent_score = next(
        (item["combined_score"] for item in summary["history"] if item["version"] == parent_version),
        None,
    )
    if parent_score is not None and new_score < parent_score - 0.01:
        regression_tag = " <- REGRESSION"
    else:
        regression_tag = ""

    # First non-blank proposal line that is not the "Based on ..." marker.
    stripped_lines = (raw.strip() for raw in proposal.strip().split("\n"))
    summary_line = next(
        (text for text in stripped_lines if text and not re.match(r"^[Bb]ased on", text)),
        "",
    )

    section = f"\n## {new_version} (score: {new_score:.2f}){regression_tag}\n{summary_line}\n"
    with open(_history_path(root), "a") as log:
        log.write(section)
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
def cmd_show(args):
    """Handle the ``show`command``-style report: print the state to stdout.

    Reads summary.json from ``args.base_dir`` and prints the iteration
    count, the best and worst versions, a blank line, and one line per
    history entry with a block-character bar whose length is the score
    times 30, truncated to an integer.
    """
    summary = _read_json(_summary_path(args.base_dir))
    top = summary["best"]
    bottom = summary["worst"]

    print(f"Harness Evolver — Iteration {summary['iterations']}")
    print(f"Best: {top['version']} score: {top['combined_score']:.2f}")
    print(f"Worst: {bottom['version']} score: {bottom['combined_score']:.2f}")
    print()
    for row in summary["history"]:
        label = row["version"]
        score = row["combined_score"]
        blocks = "\u2588" * int(score * 30)
        print(f" {label:>10}: {score:.2f} {blocks}")
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
def main():
    """Parse the command line and dispatch to the matching sub-command.

    Supported sub-commands are ``init``, ``update`` and ``show``. When no
    sub-command is given, the help text is printed and the process exits
    with status 1.
    """
    cli = argparse.ArgumentParser(description="Harness Evolver state manager")
    commands = cli.add_subparsers(dest="command")

    init_cli = commands.add_parser("init")
    init_cli.add_argument("--base-dir", required=True)
    init_cli.add_argument("--baseline-score", type=float, required=True)

    update_cli = commands.add_parser("update")
    update_cli.add_argument("--base-dir", required=True)
    update_cli.add_argument("--version", required=True)
    update_cli.add_argument("--scores", required=True)
    update_cli.add_argument("--proposal", default=None)

    show_cli = commands.add_parser("show")
    show_cli.add_argument("--base-dir", required=True)

    parsed = cli.parse_args()
    handlers = {
        "init": cmd_init,
        "update": cmd_update,
        "show": cmd_show,
    }
    handler = handlers.get(parsed.command)
    if handler is None:
        cli.print_help()
        sys.exit(1)
    else:
        handler(parsed)
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
if __name__ == "__main__":
|
|
219
|
-
main()
|
package/tools/test_growth.py
DELETED
|
@@ -1,230 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""Test Suite Growth for Harness Evolver.
|
|
3
|
-
|
|
4
|
-
Generates regression test tasks when previously-failing tasks are now passing.
|
|
5
|
-
Creates mechanical variations of fixed tasks to prevent future regressions.
|
|
6
|
-
|
|
7
|
-
Usage:
|
|
8
|
-
python3 test_growth.py \
|
|
9
|
-
--current-scores .harness-evolver/harnesses/v003/scores.json \
|
|
10
|
-
--previous-scores .harness-evolver/harnesses/v002/scores.json \
|
|
11
|
-
--tasks-dir .harness-evolver/eval/tasks/ \
|
|
12
|
-
--output-dir .harness-evolver/eval/tasks/ \
|
|
13
|
-
--max-total-tasks 60
|
|
14
|
-
|
|
15
|
-
Stdlib-only. No external dependencies.
|
|
16
|
-
"""
|
|
17
|
-
|
|
18
|
-
import argparse
|
|
19
|
-
import json
|
|
20
|
-
import os
|
|
21
|
-
import re
|
|
22
|
-
import sys
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
def load_json(path):
    """Load a JSON file, returning None when it is missing or invalid.

    Args:
        path: File path to read; a falsy value (None, "") yields None.

    Returns:
        The parsed JSON value, or None if the path is falsy, the file cannot
        be opened or read, its bytes are not valid UTF-8, or its content is
        not valid JSON.
    """
    if not path:
        return None
    try:
        # Opening directly (instead of checking os.path.exists first) avoids
        # a race between the check and the read; a missing file surfaces as
        # an OSError subclass and is handled below.
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
        # so an undecodable file is treated as invalid rather than crashing.
        return None
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def find_fixed_tasks(current_scores, previous_scores, fix_threshold_before=0.5, fix_threshold_after=0.8):
    """Find tasks whose score rose from below one threshold to above another.

    A task counts as fixed when it appears in both score sets, its previous
    score is below ``fix_threshold_before`` and its current score is above
    ``fix_threshold_after``. Tasks with no entry in the previous scores are
    skipped: they have no earlier result, so they cannot have been "fixed"
    (this also keeps newly added tasks from being reported as fixes).

    Args:
        current_scores: Dict with a "per_task" mapping of task id to a dict
            holding a "score" value.
        previous_scores: Same structure, for the earlier version.
        fix_threshold_before: Exclusive upper bound on the previous score.
        fix_threshold_after: Exclusive lower bound on the current score.

    Returns:
        A list of dicts with keys "task_id", "previous_score",
        "current_score" and "improvement", ordered by improvement, largest
        first.
    """
    current_per_task = current_scores.get("per_task", {})
    previous_per_task = previous_scores.get("per_task", {})

    fixed = []
    for tid, curr_data in current_per_task.items():
        if not isinstance(curr_data, dict):
            continue
        if tid not in previous_per_task:
            # No earlier measurement: not a fix, just a new task.
            continue
        curr_score = curr_data.get("score", 0)
        prev_data = previous_per_task[tid]
        # A malformed (non-dict) previous entry is treated as a score of 0.
        prev_score = prev_data.get("score", 0) if isinstance(prev_data, dict) else 0

        if prev_score < fix_threshold_before and curr_score > fix_threshold_after:
            fixed.append({
                "task_id": tid,
                "previous_score": prev_score,
                "current_score": curr_score,
                "improvement": curr_score - prev_score,
            })

    # Biggest fixes first; the sort is stable, so ties keep input order.
    fixed.sort(key=lambda item: item["improvement"], reverse=True)
    return fixed
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def count_existing_tasks(directory):
    """Return how many entries in ``directory`` have a ``.json`` suffix.

    Returns 0 when ``directory`` is not an existing directory.
    """
    if os.path.isdir(directory):
        json_names = [name for name in os.listdir(directory) if name.endswith(".json")]
        return len(json_names)
    return 0
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
def next_regression_id(output_dir):
    """Return the first regression task number above every existing one.

    Scans ``output_dir`` for files named exactly ``regression_<digits>.json``
    and returns the highest number found plus one (1 when there are none or
    the directory does not exist).

    Returning max + 1 rather than the lowest unused number matters because
    the caller allocates consecutive IDs by incrementing this value: starting
    inside a gap would let later increments land on, and overwrite, files
    that already exist.
    """
    if not os.path.isdir(output_dir):
        return 1
    pattern = re.compile(r"regression_(\d+)\.json")
    # fullmatch so that names such as "regression_001.json.bak" are ignored.
    matches = (pattern.fullmatch(fname) for fname in os.listdir(output_dir))
    used = [int(m.group(1)) for m in matches if m]
    return max(used, default=0) + 1
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
def generate_variations(original_input, task_id):
    """Generate 2-3 mechanical variations of an input string.

    Uses simple string transforms (no LLM needed):
    - "qualified": the input followed by a blank line and one qualifier
      sentence, chosen from a fixed list by a checksum of ``task_id``.
    - "reordered": for multi-sentence input, the first sentence moved to the
      end; or "rephrased": for single-sentence input, the text prefixed with
      "Given the following context, " and its first character lower-cased
      (text of one character or less is kept as is).
    - "minimal": only when the input starts with a known politeness prefix,
      the input with that prefix removed and re-capitalized.

    Args:
        original_input: The task's input text.
        task_id: Identifier of the source task; selects the qualifier.

    Returns:
        A list of ``(variation_type, text)`` tuples in the order above.
    """
    # Function-scope import keeps this block self-contained.
    import zlib

    variations = []
    text = original_input.strip()

    # Variation 1: add a qualifying clause.
    qualifiers = [
        "Please be specific and detailed in your response.",
        "Consider edge cases in your answer.",
        "Provide a concise but thorough response.",
        "Think step by step before answering.",
    ]
    # Built-in hash() of a str is salted per process, so it would pick a
    # different qualifier on each run. CRC32 gives the same choice for the
    # same task_id every time.
    qi = zlib.crc32(str(task_id).encode("utf-8")) % len(qualifiers)
    v1 = f"{text}\n\n{qualifiers[qi]}"
    variations.append(("qualified", v1))

    # Variation 2: reorder sentences if multiple exist.
    sentences = re.split(r"(?<=[.!?])\s+", text)
    if len(sentences) >= 2:
        # Rotate: the first sentence moves to the end.
        reordered = sentences[1:] + sentences[:1]
        v2 = " ".join(reordered)
        variations.append(("reordered", v2))
    else:
        # Single sentence: prepend a context phrase and lower-case the start.
        v2 = f"Given the following context, {text[0].lower()}{text[1:]}" if len(text) > 1 else text
        variations.append(("rephrased", v2))

    # Variation 3: minimal version with the first matching padding prefix
    # stripped.
    minimal = text
    for prefix in ["Please ", "Can you ", "Could you ", "I would like you to ", "I need you to "]:
        if minimal.startswith(prefix):
            minimal = minimal[len(prefix):]
            minimal = minimal[0].upper() + minimal[1:] if minimal else minimal
            break
    if minimal != text:
        variations.append(("minimal", minimal))

    return variations
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
def main():
    """Command-line entry point: grow the task suite with regression tasks.

    Loads the current and previous scores files, finds tasks that improved
    significantly, and for each one writes variations of the original task
    into the output directory as ``regression_NNN.json`` files, never
    exceeding ``--max-total-tasks`` JSON files in that directory. Prints a
    JSON summary followed by a human-readable line; prints a short notice
    and returns early when there is nothing to do.
    """
    cli = argparse.ArgumentParser(description="Generate regression test tasks from score improvements")
    cli.add_argument("--current-scores", required=True, help="Path to current version's scores.json")
    cli.add_argument("--previous-scores", required=True, help="Path to previous version's scores.json")
    cli.add_argument("--tasks-dir", required=True, help="Path to eval/tasks/ (to read originals)")
    cli.add_argument("--output-dir", required=True, help="Directory to write regression tasks")
    cli.add_argument("--max-total-tasks", type=int, default=60, help="Cap total tasks in output-dir (default 60)")
    opts = cli.parse_args()

    current_scores = load_json(opts.current_scores)
    previous_scores = load_json(opts.previous_scores)
    if not (current_scores and previous_scores):
        print("Missing scores files — skipping test growth")
        return

    # Tasks that went from failing to passing.
    improved = find_fixed_tasks(current_scores, previous_scores)
    if not improved:
        print("No tasks improved significantly — no regression tasks needed")
        return

    # How many more task files the suite may hold.
    already_present = count_existing_tasks(opts.output_dir)
    capacity_left = opts.max_total_tasks - already_present
    if capacity_left <= 0:
        print(f"Task suite already at capacity ({already_present}/{opts.max_total_tasks}) — skipping growth")
        return

    os.makedirs(opts.output_dir, exist_ok=True)
    next_number = next_regression_id(opts.output_dir)
    written = 0
    covered_ids = []

    for fix in improved:
        if written >= capacity_left:
            break

        source_id = fix["task_id"]
        source_task = load_json(os.path.join(opts.tasks_dir, f"{source_id}.json"))
        if not source_task:
            continue

        prompt = source_task.get("input", "")
        if not prompt:
            continue

        source_meta = source_task.get("metadata", {})

        for kind, variant in generate_variations(prompt, source_id):
            if written >= capacity_left:
                break

            new_id = f"regression_{next_number:03d}"
            record = {
                "id": new_id,
                "input": variant,
                "metadata": {
                    "difficulty": source_meta.get("difficulty", "medium"),
                    "category": source_meta.get("category", "unknown"),
                    "type": "regression",
                    "source": "regression",
                    "regression_for": source_id,
                    "variation": kind,
                    "previous_score": fix["previous_score"],
                    "fixed_at_score": fix["current_score"],
                },
            }

            # Carry over the expected answer when the original task has one.
            if "expected" in source_task:
                record["expected"] = source_task["expected"]

            destination = os.path.join(opts.output_dir, f"{new_id}.json")
            with open(destination, "w") as sink:
                json.dump(record, sink, indent=2)

            written += 1
            next_number += 1

        covered_ids.append(source_id)

    report = {
        "tasks_added": written,
        "fixed_tasks": covered_ids,
        "total_tasks_now": already_present + written,
        "max_total_tasks": opts.max_total_tasks,
    }
    print(json.dumps(report))
    print(f"Added {written} regression tasks to lock in improvements on: {', '.join(covered_ids)}")
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
if __name__ == "__main__":
|
|
230
|
-
main()
|
package/tools/trace_logger.py
DELETED
|
@@ -1,42 +0,0 @@
|
|
|
1
|
-
"""TraceLogger — optional helper for harnesses to write structured trace records.
|
|
2
|
-
|
|
3
|
-
Usage in a harness:
|
|
4
|
-
from trace_logger import TraceLogger
|
|
5
|
-
|
|
6
|
-
trace = TraceLogger(traces_dir)
|
|
7
|
-
trace.step("llm_call", {"prompt": p, "response": r, "model": "gpt-4"})
|
|
8
|
-
trace.step("tool_use", {"tool": "search", "query": q, "results": results})
|
|
9
|
-
trace.save()
|
|
10
|
-
|
|
11
|
-
Stdlib-only. No external dependencies.
|
|
12
|
-
"""
|
|
13
|
-
|
|
14
|
-
import json
|
|
15
|
-
import os
|
|
16
|
-
import time
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
class TraceLogger:
    """Collects named trace steps in memory and writes them out as JSON.

    When ``traces_dir`` is falsy the logger still records steps, but
    ``save()`` does nothing.
    """

    def __init__(self, traces_dir):
        """Remember the target directory, creating it when one is given."""
        self._steps = []
        self.traces_dir = traces_dir
        if not traces_dir:
            return
        os.makedirs(traces_dir, exist_ok=True)

    def step(self, name, data=None):
        """Record one step with the current time; ``data`` defaults to ``{}``."""
        record = {
            "name": name,
            "timestamp": time.time(),
            "data": {} if data is None else data,
        }
        self._steps.append(record)

    def save(self):
        """Write all recorded steps to ``trace.json`` inside ``traces_dir``."""
        if self.traces_dir:
            target = os.path.join(self.traces_dir, "trace.json")
            with open(target, "w") as sink:
                json.dump(self._steps, sink, indent=2)

    @property
    def steps(self):
        """A shallow copy of the recorded steps, in insertion order."""
        return self._steps[:]
|