harness-evolver 2.9.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/README.md +62 -117
  2. package/agents/evolver-architect.md +53 -0
  3. package/agents/evolver-critic.md +44 -0
  4. package/agents/evolver-proposer.md +128 -0
  5. package/agents/evolver-testgen.md +67 -0
  6. package/bin/install.js +181 -171
  7. package/package.json +7 -7
  8. package/skills/deploy/SKILL.md +49 -56
  9. package/skills/evolve/SKILL.md +180 -700
  10. package/skills/setup/SKILL.md +182 -0
  11. package/skills/status/SKILL.md +23 -21
  12. package/tools/read_results.py +240 -0
  13. package/tools/run_eval.py +202 -0
  14. package/tools/seed_from_traces.py +36 -8
  15. package/tools/setup.py +393 -0
  16. package/tools/trace_insights.py +86 -14
  17. package/agents/harness-evolver-architect.md +0 -173
  18. package/agents/harness-evolver-critic.md +0 -132
  19. package/agents/harness-evolver-judge.md +0 -110
  20. package/agents/harness-evolver-proposer.md +0 -317
  21. package/agents/harness-evolver-testgen.md +0 -112
  22. package/examples/classifier/README.md +0 -25
  23. package/examples/classifier/config.json +0 -3
  24. package/examples/classifier/eval.py +0 -58
  25. package/examples/classifier/harness.py +0 -111
  26. package/examples/classifier/tasks/task_001.json +0 -1
  27. package/examples/classifier/tasks/task_002.json +0 -1
  28. package/examples/classifier/tasks/task_003.json +0 -1
  29. package/examples/classifier/tasks/task_004.json +0 -1
  30. package/examples/classifier/tasks/task_005.json +0 -1
  31. package/examples/classifier/tasks/task_006.json +0 -1
  32. package/examples/classifier/tasks/task_007.json +0 -1
  33. package/examples/classifier/tasks/task_008.json +0 -1
  34. package/examples/classifier/tasks/task_009.json +0 -1
  35. package/examples/classifier/tasks/task_010.json +0 -1
  36. package/skills/architect/SKILL.md +0 -93
  37. package/skills/compare/SKILL.md +0 -73
  38. package/skills/critic/SKILL.md +0 -67
  39. package/skills/diagnose/SKILL.md +0 -96
  40. package/skills/import-traces/SKILL.md +0 -102
  41. package/skills/init/SKILL.md +0 -253
  42. package/tools/__pycache__/detect_stack.cpython-313.pyc +0 -0
  43. package/tools/__pycache__/init.cpython-313.pyc +0 -0
  44. package/tools/__pycache__/seed_from_traces.cpython-313.pyc +0 -0
  45. package/tools/__pycache__/trace_logger.cpython-313.pyc +0 -0
  46. package/tools/eval_llm_judge.py +0 -233
  47. package/tools/eval_passthrough.py +0 -55
  48. package/tools/evaluate.py +0 -255
  49. package/tools/import_traces.py +0 -229
  50. package/tools/init.py +0 -531
  51. package/tools/llm_api.py +0 -125
  52. package/tools/state.py +0 -219
  53. package/tools/test_growth.py +0 -230
  54. package/tools/trace_logger.py +0 -42
package/tools/state.py DELETED
@@ -1,219 +0,0 @@
1
- #!/usr/bin/env python3
2
- """State manager for Harness Evolver.
3
-
4
- Commands:
5
- init --base-dir DIR --baseline-score FLOAT
6
- update --base-dir DIR --version VER --scores PATH --proposal PATH
7
- show --base-dir DIR
8
-
9
- Manages: summary.json (source of truth), STATE.md (human view), PROPOSER_HISTORY.md (log).
10
- Stdlib-only. No external dependencies.
11
- """
12
-
13
- import argparse
14
- import json
15
- import os
16
- import re
17
- import sys
18
-
19
-
20
- def _read_json(path):
21
- with open(path) as f:
22
- return json.load(f)
23
-
24
-
25
- def _write_json(path, data):
26
- with open(path, "w") as f:
27
- json.dump(data, f, indent=2)
28
-
29
-
30
- def _read_text(path):
31
- with open(path) as f:
32
- return f.read()
33
-
34
-
35
- def _write_text(path, text):
36
- with open(path, "w") as f:
37
- f.write(text)
38
-
39
-
40
- def _summary_path(base_dir):
41
- return os.path.join(base_dir, "summary.json")
42
-
43
-
44
- def _state_md_path(base_dir):
45
- return os.path.join(base_dir, "STATE.md")
46
-
47
-
48
- def _history_path(base_dir):
49
- return os.path.join(base_dir, "PROPOSER_HISTORY.md")
50
-
51
-
52
- def _detect_parent(proposal_text, current_best):
53
- """Parse 'Based on vXXX' or 'Based on baseline' from proposal text."""
54
- match = re.search(r"[Bb]ased on (v\d+|baseline)", proposal_text)
55
- if match:
56
- return match.group(1)
57
- return current_best
58
-
59
-
60
- def _render_state_md(summary):
61
- """Generate STATE.md from summary.json data."""
62
- lines = ["# Harness Evolver Status", ""]
63
- best = summary["best"]
64
- worst = summary["worst"]
65
- lines.append(f"**Iterations:** {summary['iterations']}")
66
- lines.append(f"**Best:** {best['version']} ({best['combined_score']:.2f})")
67
- lines.append(f"**Worst:** {worst['version']} ({worst['combined_score']:.2f})")
68
- if summary["history"]:
69
- last = summary["history"][-1]
70
- lines.append(f"**Latest:** {last['version']} ({last['combined_score']:.2f})")
71
- lines.append("")
72
- lines.append("## History")
73
- lines.append("")
74
- lines.append("| Version | Score | Parent | Delta |")
75
- lines.append("|---------|-------|--------|-------|")
76
- prev_score = None
77
- for entry in summary["history"]:
78
- v = entry["version"]
79
- s = entry["combined_score"]
80
- p = entry["parent"] or "-"
81
- if prev_score is not None and v != "baseline":
82
- delta = s - prev_score
83
- if delta < -0.01:
84
- delta_str = f"{delta:+.2f} REGRESSION"
85
- elif delta > 0.01:
86
- delta_str = f"{delta:+.2f}"
87
- else:
88
- delta_str = f"{delta:+.2f} (stagnant)"
89
- else:
90
- delta_str = "-"
91
- lines.append(f"| {v} | {s:.2f} | {p} | {delta_str} |")
92
- prev_score = s
93
- return "\n".join(lines) + "\n"
94
-
95
-
96
- def cmd_init(args):
97
- base_dir = args.base_dir
98
- score = args.baseline_score
99
- os.makedirs(base_dir, exist_ok=True)
100
-
101
- summary = {
102
- "iterations": 0,
103
- "best": {"version": "baseline", "combined_score": score},
104
- "worst": {"version": "baseline", "combined_score": score},
105
- "history": [
106
- {"version": "baseline", "combined_score": score, "parent": None}
107
- ],
108
- }
109
- _write_json(_summary_path(base_dir), summary)
110
- _write_text(_state_md_path(base_dir), _render_state_md(summary))
111
- _write_text(_history_path(base_dir), "# Proposer History\n")
112
-
113
-
114
- def cmd_update(args):
115
- base_dir = args.base_dir
116
- version = args.version
117
- scores = _read_json(args.scores)
118
- proposal_text = _read_text(args.proposal) if args.proposal else ""
119
-
120
- summary = _read_json(_summary_path(base_dir))
121
- combined = scores.get("combined_score", 0.0)
122
- parent = _detect_parent(proposal_text, summary["best"]["version"])
123
-
124
- entry = {
125
- "version": version,
126
- "combined_score": combined,
127
- "parent": parent,
128
- }
129
- summary["history"].append(entry)
130
- summary["iterations"] = len(summary["history"]) - 1
131
-
132
- non_baseline = [h for h in summary["history"] if h["version"] != "baseline"]
133
- if non_baseline:
134
- best_entry = max(non_baseline, key=lambda h: h["combined_score"])
135
- worst_entry = min(non_baseline, key=lambda h: h["combined_score"])
136
- summary["best"] = {
137
- "version": best_entry["version"],
138
- "combined_score": best_entry["combined_score"],
139
- }
140
- summary["worst"] = {
141
- "version": worst_entry["version"],
142
- "combined_score": worst_entry["combined_score"],
143
- }
144
-
145
- _write_json(_summary_path(base_dir), summary)
146
- _write_text(_state_md_path(base_dir), _render_state_md(summary))
147
-
148
- parent_score = None
149
- for h in summary["history"]:
150
- if h["version"] == parent:
151
- parent_score = h["combined_score"]
152
- break
153
-
154
- is_regression = parent_score is not None and combined < parent_score - 0.01
155
- regression_tag = " <- REGRESSION" if is_regression else ""
156
-
157
- proposal_lines = proposal_text.strip().split("\n")
158
- summary_line = ""
159
- for line in proposal_lines:
160
- stripped = line.strip()
161
- if stripped and not re.match(r"^[Bb]ased on", stripped):
162
- summary_line = stripped
163
- break
164
-
165
- history_entry = f"\n## {version} (score: {combined:.2f}){regression_tag}\n{summary_line}\n"
166
- history_path = _history_path(base_dir)
167
- with open(history_path, "a") as f:
168
- f.write(history_entry)
169
-
170
-
171
- def cmd_show(args):
172
- base_dir = args.base_dir
173
- summary = _read_json(_summary_path(base_dir))
174
- best = summary["best"]
175
- worst = summary["worst"]
176
-
177
- print(f"Harness Evolver — Iteration {summary['iterations']}")
178
- print(f"Best: {best['version']} score: {best['combined_score']:.2f}")
179
- print(f"Worst: {worst['version']} score: {worst['combined_score']:.2f}")
180
- print()
181
- for entry in summary["history"]:
182
- v = entry["version"]
183
- s = entry["combined_score"]
184
- bar_len = int(s * 30)
185
- bar = "\u2588" * bar_len
186
- print(f" {v:>10}: {s:.2f} {bar}")
187
-
188
-
189
- def main():
190
- parser = argparse.ArgumentParser(description="Harness Evolver state manager")
191
- sub = parser.add_subparsers(dest="command")
192
-
193
- p_init = sub.add_parser("init")
194
- p_init.add_argument("--base-dir", required=True)
195
- p_init.add_argument("--baseline-score", type=float, required=True)
196
-
197
- p_update = sub.add_parser("update")
198
- p_update.add_argument("--base-dir", required=True)
199
- p_update.add_argument("--version", required=True)
200
- p_update.add_argument("--scores", required=True)
201
- p_update.add_argument("--proposal", default=None)
202
-
203
- p_show = sub.add_parser("show")
204
- p_show.add_argument("--base-dir", required=True)
205
-
206
- args = parser.parse_args()
207
- if args.command == "init":
208
- cmd_init(args)
209
- elif args.command == "update":
210
- cmd_update(args)
211
- elif args.command == "show":
212
- cmd_show(args)
213
- else:
214
- parser.print_help()
215
- sys.exit(1)
216
-
217
-
218
- if __name__ == "__main__":
219
- main()
@@ -1,230 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Test Suite Growth for Harness Evolver.
3
-
4
- Generates regression test tasks when previously-failing tasks are now passing.
5
- Creates mechanical variations of fixed tasks to prevent future regressions.
6
-
7
- Usage:
8
- python3 test_growth.py \
9
- --current-scores .harness-evolver/harnesses/v003/scores.json \
10
- --previous-scores .harness-evolver/harnesses/v002/scores.json \
11
- --tasks-dir .harness-evolver/eval/tasks/ \
12
- --output-dir .harness-evolver/eval/tasks/ \
13
- --max-total-tasks 60
14
-
15
- Stdlib-only. No external dependencies.
16
- """
17
-
18
- import argparse
19
- import json
20
- import os
21
- import re
22
- import sys
23
-
24
-
25
- def load_json(path):
26
- """Load JSON file, return None if missing or invalid."""
27
- if not path or not os.path.exists(path):
28
- return None
29
- try:
30
- with open(path) as f:
31
- return json.load(f)
32
- except (json.JSONDecodeError, OSError):
33
- return None
34
-
35
-
36
- def find_fixed_tasks(current_scores, previous_scores, fix_threshold_before=0.5, fix_threshold_after=0.8):
37
- """Find tasks that improved significantly: score < before_threshold → > after_threshold."""
38
- current_per_task = current_scores.get("per_task", {})
39
- previous_per_task = previous_scores.get("per_task", {})
40
-
41
- fixed = []
42
- for tid, curr_data in current_per_task.items():
43
- if not isinstance(curr_data, dict):
44
- continue
45
- curr_score = curr_data.get("score", 0)
46
- prev_data = previous_per_task.get(tid, {})
47
- prev_score = prev_data.get("score", 0) if isinstance(prev_data, dict) else 0
48
-
49
- if prev_score < fix_threshold_before and curr_score > fix_threshold_after:
50
- fixed.append({
51
- "task_id": tid,
52
- "previous_score": prev_score,
53
- "current_score": curr_score,
54
- "improvement": curr_score - prev_score,
55
- })
56
-
57
- # Sort by improvement (biggest fixes first)
58
- fixed.sort(key=lambda x: -x["improvement"])
59
- return fixed
60
-
61
-
62
- def count_existing_tasks(directory):
63
- """Count existing task JSON files in directory."""
64
- if not os.path.isdir(directory):
65
- return 0
66
- return sum(1 for f in os.listdir(directory) if f.endswith(".json"))
67
-
68
-
69
- def next_regression_id(output_dir):
70
- """Find the next available regression task ID."""
71
- existing = set()
72
- if os.path.isdir(output_dir):
73
- for fname in os.listdir(output_dir):
74
- m = re.match(r"regression_(\d+)\.json", fname)
75
- if m:
76
- existing.add(int(m.group(1)))
77
- n = 1
78
- while n in existing:
79
- n += 1
80
- return n
81
-
82
-
83
- def generate_variations(original_input, task_id):
84
- """Generate 2-3 mechanical variations of an input string.
85
-
86
- Uses simple string transforms — no LLM needed:
87
- - Rephrase by reordering
88
- - Add qualifying clause
89
- - Simplify to minimal form
90
- """
91
- variations = []
92
- text = original_input.strip()
93
-
94
- # Variation 1: Add a qualifying clause
95
- qualifiers = [
96
- "Please be specific and detailed in your response.",
97
- "Consider edge cases in your answer.",
98
- "Provide a concise but thorough response.",
99
- "Think step by step before answering.",
100
- ]
101
- # Pick qualifier based on hash of task_id for determinism
102
- qi = hash(task_id) % len(qualifiers)
103
- v1 = f"{text}\n\n{qualifiers[qi]}"
104
- variations.append(("qualified", v1))
105
-
106
- # Variation 2: Reorder sentences if multiple exist
107
- sentences = re.split(r"(?<=[.!?])\s+", text)
108
- if len(sentences) >= 2:
109
- # Swap first two sentences
110
- reordered = sentences[1:] + sentences[:1]
111
- v2 = " ".join(reordered)
112
- variations.append(("reordered", v2))
113
- else:
114
- # If single sentence, prepend "Given the context: "
115
- v2 = f"Given the following context, {text[0].lower()}{text[1:]}" if len(text) > 1 else text
116
- variations.append(("rephrased", v2))
117
-
118
- # Variation 3: Minimal version — strip to core question
119
- # Remove qualifiers, keep just the main ask
120
- minimal = text
121
- # Strip common padding phrases
122
- for prefix in ["Please ", "Can you ", "Could you ", "I would like you to ", "I need you to "]:
123
- if minimal.startswith(prefix):
124
- minimal = minimal[len(prefix):]
125
- minimal = minimal[0].upper() + minimal[1:] if minimal else minimal
126
- break
127
- if minimal != text:
128
- variations.append(("minimal", minimal))
129
-
130
- return variations
131
-
132
-
133
- def main():
134
- parser = argparse.ArgumentParser(description="Generate regression test tasks from score improvements")
135
- parser.add_argument("--current-scores", required=True, help="Path to current version's scores.json")
136
- parser.add_argument("--previous-scores", required=True, help="Path to previous version's scores.json")
137
- parser.add_argument("--tasks-dir", required=True, help="Path to eval/tasks/ (to read originals)")
138
- parser.add_argument("--output-dir", required=True, help="Directory to write regression tasks")
139
- parser.add_argument("--max-total-tasks", type=int, default=60, help="Cap total tasks in output-dir (default 60)")
140
- args = parser.parse_args()
141
-
142
- current = load_json(args.current_scores)
143
- previous = load_json(args.previous_scores)
144
-
145
- if not current or not previous:
146
- print("Missing scores files — skipping test growth")
147
- return
148
-
149
- # Find tasks that were fixed
150
- fixed = find_fixed_tasks(current, previous)
151
- if not fixed:
152
- print("No tasks improved significantly — no regression tasks needed")
153
- return
154
-
155
- # Check capacity
156
- existing_count = count_existing_tasks(args.output_dir)
157
- available_slots = args.max_total_tasks - existing_count
158
- if available_slots <= 0:
159
- print(f"Task suite already at capacity ({existing_count}/{args.max_total_tasks}) — skipping growth")
160
- return
161
-
162
- os.makedirs(args.output_dir, exist_ok=True)
163
- regression_id = next_regression_id(args.output_dir)
164
- tasks_added = 0
165
- fixed_ids = []
166
-
167
- for fix_info in fixed:
168
- if tasks_added >= available_slots:
169
- break
170
-
171
- tid = fix_info["task_id"]
172
- # Load original task
173
- task_path = os.path.join(args.tasks_dir, f"{tid}.json")
174
- original = load_json(task_path)
175
- if not original:
176
- continue
177
-
178
- original_input = original.get("input", "")
179
- if not original_input:
180
- continue
181
-
182
- original_meta = original.get("metadata", {})
183
- variations = generate_variations(original_input, tid)
184
-
185
- for var_type, var_input in variations:
186
- if tasks_added >= available_slots:
187
- break
188
-
189
- reg_id = f"regression_{regression_id:03d}"
190
- task = {
191
- "id": reg_id,
192
- "input": var_input,
193
- "metadata": {
194
- "difficulty": original_meta.get("difficulty", "medium"),
195
- "category": original_meta.get("category", "unknown"),
196
- "type": "regression",
197
- "source": "regression",
198
- "regression_for": tid,
199
- "variation": var_type,
200
- "previous_score": fix_info["previous_score"],
201
- "fixed_at_score": fix_info["current_score"],
202
- },
203
- }
204
-
205
- # Include expected if original had it
206
- if "expected" in original:
207
- task["expected"] = original["expected"]
208
-
209
- out_path = os.path.join(args.output_dir, f"{reg_id}.json")
210
- with open(out_path, "w") as f:
211
- json.dump(task, f, indent=2)
212
-
213
- tasks_added += 1
214
- regression_id += 1
215
-
216
- fixed_ids.append(tid)
217
-
218
- # Output summary
219
- summary = {
220
- "tasks_added": tasks_added,
221
- "fixed_tasks": fixed_ids,
222
- "total_tasks_now": existing_count + tasks_added,
223
- "max_total_tasks": args.max_total_tasks,
224
- }
225
- print(json.dumps(summary))
226
- print(f"Added {tasks_added} regression tasks to lock in improvements on: {', '.join(fixed_ids)}")
227
-
228
-
229
- if __name__ == "__main__":
230
- main()
@@ -1,42 +0,0 @@
1
- """TraceLogger — optional helper for harnesses to write structured trace records.
2
-
3
- Usage in a harness:
4
- from trace_logger import TraceLogger
5
-
6
- trace = TraceLogger(traces_dir)
7
- trace.step("llm_call", {"prompt": p, "response": r, "model": "gpt-4"})
8
- trace.step("tool_use", {"tool": "search", "query": q, "results": results})
9
- trace.save()
10
-
11
- Stdlib-only. No external dependencies.
12
- """
13
-
14
- import json
15
- import os
16
- import time
17
-
18
-
19
- class TraceLogger:
20
- def __init__(self, traces_dir):
21
- self.traces_dir = traces_dir
22
- self._steps = []
23
- if traces_dir:
24
- os.makedirs(traces_dir, exist_ok=True)
25
-
26
- def step(self, name, data=None):
27
- self._steps.append({
28
- "name": name,
29
- "timestamp": time.time(),
30
- "data": data if data is not None else {},
31
- })
32
-
33
- def save(self):
34
- if not self.traces_dir:
35
- return
36
- path = os.path.join(self.traces_dir, "trace.json")
37
- with open(path, "w") as f:
38
- json.dump(self._steps, f, indent=2)
39
-
40
- @property
41
- def steps(self):
42
- return list(self._steps)