harness-evolver 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,214 @@
1
+ #!/usr/bin/env python3
2
+ """Evaluation orchestrator for Harness Evolver.
3
+
4
+ Commands:
5
+ validate --harness PATH [--config PATH]
6
+ run --harness PATH --tasks-dir PATH --eval PATH --traces-dir PATH --scores PATH
7
+ [--config PATH] [--timeout SECONDS]
8
+
9
+ Runs harness per task, captures traces (stdout/stderr/timing), then calls user's eval script.
10
+ Stdlib-only. No external dependencies.
11
+ """
12
+
13
+ import argparse
14
+ import json
15
+ import os
16
+ import shutil
17
+ import subprocess
18
+ import sys
19
+ import tempfile
20
+ import time
21
+
22
+
23
+ def _run_harness_on_task(harness, config, task_input_path, output_path, task_traces_dir, timeout, env=None):
24
+ """Run the harness on a single task. Returns (success, elapsed_ms, stdout, stderr)."""
25
+ cmd = ["python3", harness, "--input", task_input_path, "--output", output_path]
26
+ if task_traces_dir:
27
+ extra_dir = os.path.join(task_traces_dir, "extra")
28
+ os.makedirs(extra_dir, exist_ok=True)
29
+ cmd.extend(["--traces-dir", extra_dir])
30
+ if config and os.path.exists(config):
31
+ cmd.extend(["--config", config])
32
+
33
+ start = time.time()
34
+ try:
35
+ result = subprocess.run(
36
+ cmd, capture_output=True, text=True, timeout=timeout, env=env,
37
+ )
38
+ elapsed_ms = (time.time() - start) * 1000
39
+ return result.returncode == 0, elapsed_ms, result.stdout, result.stderr
40
+ except subprocess.TimeoutExpired:
41
+ elapsed_ms = (time.time() - start) * 1000
42
+ return False, elapsed_ms, "", f"TIMEOUT after {timeout}s"
43
+ except Exception as e:
44
+ elapsed_ms = (time.time() - start) * 1000
45
+ return False, elapsed_ms, "", str(e)
46
+
47
+
48
def cmd_validate(args):
    """Smoke-test a harness: run it on a dummy task and check its output contract."""
    harness = args.harness
    config = getattr(args, "config", None)

    if not os.path.exists(harness):
        print(f"FAIL: harness not found: {harness}", file=sys.stderr)
        sys.exit(1)

    with tempfile.TemporaryDirectory() as tmpdir:
        input_path = os.path.join(tmpdir, "input.json")
        output_path = os.path.join(tmpdir, "output.json")
        dummy_task = {"id": "validation", "input": "test input for validation", "metadata": {}}
        with open(input_path, "w") as fh:
            json.dump(dummy_task, fh)

        success, elapsed, stdout, stderr = _run_harness_on_task(
            harness, config, input_path, output_path, None, timeout=30,
        )

        # Staged contract check: clean exit, file produced, valid JSON,
        # required fields present. Each failure exits immediately.
        if not success:
            print(f"FAIL: harness exited with error.\nstderr: {stderr}", file=sys.stderr)
            sys.exit(1)

        if not os.path.exists(output_path):
            print("FAIL: harness did not create output file.", file=sys.stderr)
            sys.exit(1)

        try:
            with open(output_path) as fh:
                output = json.load(fh)
        except (json.JSONDecodeError, ValueError) as e:
            print(f"FAIL: output is not valid JSON: {e}", file=sys.stderr)
            sys.exit(1)

        if "id" not in output or "output" not in output:
            print(f"FAIL: output missing 'id' or 'output' fields. Got: {output}", file=sys.stderr)
            sys.exit(1)

        print(f"OK: harness validated in {elapsed:.0f}ms. Output: {output}")
87
+
88
+
89
def cmd_run(args):
    """Run the harness on every task, capture traces, then invoke the eval script.

    Writes per-task traces (input.json / output.json), aggregate timing.json,
    and combined stdout/stderr logs under --traces-dir. The eval script is
    then expected to produce the --scores file. Exits non-zero when no tasks
    are found or when the eval script fails.
    """
    harness = args.harness
    config = getattr(args, "config", None)
    tasks_dir = args.tasks_dir
    eval_script = getattr(args, "eval")
    traces_dir = args.traces_dir
    scores_path = args.scores
    timeout = args.timeout

    os.makedirs(traces_dir, exist_ok=True)

    task_files = sorted(f for f in os.listdir(tasks_dir) if f.endswith(".json"))
    if not task_files:
        print(f"FAIL: no .json task files in {tasks_dir}", file=sys.stderr)
        sys.exit(1)

    all_stdout = []
    all_stderr = []
    timing = {"per_task": {}}

    # LangSmith: setup auto-tracing env vars if configured. The project
    # config is looked up two levels above traces_dir — assumes the layout
    # <base>/<version>/traces; TODO confirm against the caller.
    langsmith_env = None
    project_config_path = os.path.join(os.path.dirname(os.path.dirname(traces_dir)), "config.json")
    if os.path.exists(project_config_path):
        with open(project_config_path) as f:
            project_config = json.load(f)
        ls = project_config.get("eval", {}).get("langsmith", {})
        if ls.get("enabled"):
            api_key = os.environ.get(ls.get("api_key_env", "LANGSMITH_API_KEY"), "")
            if api_key:
                version = os.path.basename(os.path.dirname(traces_dir))
                langsmith_env = {
                    **os.environ,
                    "LANGCHAIN_TRACING_V2": "true",
                    "LANGCHAIN_API_KEY": api_key,
                    "LANGCHAIN_PROJECT": f"{ls.get('project_prefix', 'harness-evolver')}-{version}",
                }

    # TemporaryDirectory (instead of the original bare mkdtemp, which was
    # never removed) guarantees the scratch results dir is cleaned up even
    # when we sys.exit() on an eval failure.
    with tempfile.TemporaryDirectory() as results_dir:
        for task_file in task_files:
            task_path = os.path.join(tasks_dir, task_file)
            with open(task_path) as f:
                task = json.load(f)
            task_id = task["id"]

            # Never leak the expected answer to the harness under test.
            task_input = {k: v for k, v in task.items() if k != "expected"}

            task_traces_dir = os.path.join(traces_dir, task_id)
            os.makedirs(task_traces_dir, exist_ok=True)

            input_path = os.path.join(task_traces_dir, "input.json")
            with open(input_path, "w") as f:
                json.dump(task_input, f, indent=2)

            output_path = os.path.join(results_dir, task_file)

            success, elapsed_ms, stdout, stderr = _run_harness_on_task(
                harness, config, input_path, output_path, task_traces_dir, timeout,
                env=langsmith_env,
            )

            if os.path.exists(output_path):
                shutil.copy2(output_path, os.path.join(task_traces_dir, "output.json"))
            else:
                # Synthesize an empty output so the eval script sees every task.
                with open(os.path.join(task_traces_dir, "output.json"), "w") as f:
                    json.dump({"id": task_id, "output": "", "error": "harness failed"}, f)

            timing["per_task"][task_id] = round(elapsed_ms, 1)
            all_stdout.append(f"--- {task_id} ---\n{stdout}")
            all_stderr.append(f"--- {task_id} ---\n{stderr}")

        timing["total_ms"] = round(sum(timing["per_task"].values()), 1)
        with open(os.path.join(traces_dir, "timing.json"), "w") as f:
            json.dump(timing, f, indent=2)
        with open(os.path.join(traces_dir, "stdout.log"), "w") as f:
            f.write("\n".join(all_stdout))
        with open(os.path.join(traces_dir, "stderr.log"), "w") as f:
            f.write("\n".join(all_stderr))

        eval_cmd = [
            "python3", eval_script,
            "--results-dir", results_dir,
            "--tasks-dir", tasks_dir,
            "--scores", scores_path,
        ]
        try:
            result = subprocess.run(eval_cmd, capture_output=True, text=True, timeout=120)
        except subprocess.TimeoutExpired:
            # Previously an uncaught traceback; report like other failures.
            print("FAIL: eval script timed out after 120s.", file=sys.stderr)
            sys.exit(1)
        if result.returncode != 0:
            print(f"FAIL: eval script failed.\nstderr: {result.stderr}", file=sys.stderr)
            sys.exit(1)

    if os.path.exists(scores_path):
        # `with` closes the handle deterministically — json.load(open(...))
        # leaked the file descriptor in the original.
        with open(scores_path) as f:
            scores = json.load(f)
        print(f"Evaluation complete. combined_score: {scores.get('combined_score', 'N/A')}")
    else:
        print("WARNING: eval script did not produce scores file.", file=sys.stderr)
184
+
185
+
186
def main():
    """CLI entry point: parse arguments and dispatch to a subcommand."""
    parser = argparse.ArgumentParser(description="Harness Evolver evaluation orchestrator")
    sub = parser.add_subparsers(dest="command")

    p_val = sub.add_parser("validate")
    p_val.add_argument("--harness", required=True)
    p_val.add_argument("--config", default=None)

    # Declarative option table keeps the run-subcommand flags in one place.
    p_run = sub.add_parser("run")
    for flag, opts in (
        ("--harness", {"required": True}),
        ("--config", {"default": None}),
        ("--tasks-dir", {"required": True}),
        ("--eval", {"required": True}),
        ("--traces-dir", {"required": True}),
        ("--scores", {"required": True}),
        ("--timeout", {"type": int, "default": 60}),
    ):
        p_run.add_argument(flag, **opts)

    args = parser.parse_args()
    handlers = {"validate": cmd_validate, "run": cmd_run}
    handler = handlers.get(args.command)
    if handler is None:
        parser.print_help()
        sys.exit(1)
    handler(args)
211
+
212
+
213
+ if __name__ == "__main__":
214
+ main()
package/tools/init.py ADDED
@@ -0,0 +1,231 @@
1
+ #!/usr/bin/env python3
2
+ """Project initializer for Harness Evolver.
3
+
4
+ Usage:
5
+ init.py --harness PATH --eval PATH --tasks PATH --base-dir PATH
6
+ [--harness-config PATH] [--tools-dir PATH]
7
+
8
+ Creates the .harness-evolver/ directory structure, copies baseline files,
9
+ runs validation, evaluates the baseline, and initializes state.
10
+ Stdlib-only. No external dependencies.
11
+ """
12
+
13
+ import argparse
14
+ import json
15
+ import os
16
+ import shutil
17
+ import subprocess
18
+ import sys
19
+ import tempfile
20
+
21
+
22
+ def _detect_langsmith():
23
+ """Auto-detect LangSmith API key and return config section."""
24
+ if os.environ.get("LANGSMITH_API_KEY"):
25
+ return {
26
+ "enabled": True,
27
+ "api_key_env": "LANGSMITH_API_KEY",
28
+ "project_prefix": "harness-evolver",
29
+ }
30
+ return {"enabled": False}
31
+
32
+
33
+ def _check_langsmith_cli():
34
+ """Check if langsmith-cli is installed."""
35
+ try:
36
+ r = subprocess.run(["langsmith-cli", "self", "detect"],
37
+ capture_output=True, text=True, timeout=5)
38
+ return r.returncode == 0
39
+ except FileNotFoundError:
40
+ return False
41
+
42
+
43
+ def _detect_stack(harness_path):
44
+ """Detect technology stack from harness imports."""
45
+ detect_stack_py = os.path.join(os.path.dirname(__file__), "detect_stack.py")
46
+ if not os.path.exists(detect_stack_py):
47
+ return {}
48
+ try:
49
+ r = subprocess.run(
50
+ ["python3", detect_stack_py, harness_path],
51
+ capture_output=True, text=True, timeout=30,
52
+ )
53
+ if r.returncode == 0 and r.stdout.strip():
54
+ return json.loads(r.stdout)
55
+ except Exception:
56
+ pass
57
+ return {}
58
+
59
+
60
+ def _check_context7_available():
61
+ """Check if Context7 MCP is configured in Claude Code."""
62
+ settings_paths = [
63
+ os.path.expanduser("~/.claude/settings.json"),
64
+ os.path.expanduser("~/.claude.json"),
65
+ ]
66
+ for path in settings_paths:
67
+ if os.path.exists(path):
68
+ try:
69
+ with open(path) as f:
70
+ settings = json.load(f)
71
+ mcp = settings.get("mcpServers", {})
72
+ if "context7" in mcp or "Context7" in mcp:
73
+ return True
74
+ except (json.JSONDecodeError, KeyError):
75
+ pass
76
+ return False
77
+
78
+
79
def main():
    """Initialize a .harness-evolver/ project.

    Creates the directory layout, copies the baseline harness / eval script /
    tasks, writes config.json (including LangSmith and stack detection),
    validates and evaluates the baseline via evaluate.py, and seeds the
    state files via state.py. Exits non-zero on validation or state failure.
    """
    parser = argparse.ArgumentParser(description="Initialize Harness Evolver project")
    parser.add_argument("--harness", required=True, help="Path to harness script")
    parser.add_argument("--eval", required=True, help="Path to eval script")
    parser.add_argument("--tasks", required=True, help="Path to tasks directory")
    parser.add_argument("--base-dir", required=True, help="Path for .harness-evolver/")
    parser.add_argument("--harness-config", default=None, help="Path to harness config.json")
    parser.add_argument("--tools-dir", default=None, help="Path to tools directory")
    args = parser.parse_args()

    base = args.base_dir
    tools = args.tools_dir or os.path.dirname(__file__)

    evaluate_py = os.path.join(tools, "evaluate.py")
    state_py = os.path.join(tools, "state.py")

    # 1. Create directory structure
    for d in ["baseline", "eval/tasks", "harnesses"]:
        os.makedirs(os.path.join(base, d), exist_ok=True)

    # 2. Copy baseline harness (and its optional config)
    shutil.copy2(args.harness, os.path.join(base, "baseline", "harness.py"))
    if args.harness_config and os.path.exists(args.harness_config):
        shutil.copy2(args.harness_config, os.path.join(base, "baseline", "config.json"))

    # 3. Copy eval script and tasks (top-level files only)
    shutil.copy2(args.eval, os.path.join(base, "eval", "eval.py"))
    for name in os.listdir(args.tasks):
        src = os.path.join(args.tasks, name)
        if os.path.isfile(src):
            shutil.copy2(src, os.path.join(base, "eval", "tasks", name))

    # 4. Generate config.json. Stack detection runs first so the config is
    # complete before the single write — the original wrote the file twice.
    harness_name = os.path.basename(args.harness)
    eval_name = os.path.basename(args.eval)
    stack = _detect_stack(args.harness)
    config = {
        "version": "0.1.0",
        "harness": {
            "command": f"python3 {harness_name}",
            "args": ["--input", "{input}", "--output", "{output}",
                     "--traces-dir", "{traces_dir}", "--config", "{config}"],
            "timeout_per_task_sec": 60,
        },
        "eval": {
            "command": f"python3 {eval_name}",
            "args": ["--results-dir", "{results_dir}", "--tasks-dir", "{tasks_dir}",
                     "--scores", "{scores}"],
            "langsmith": _detect_langsmith(),
        },
        "evolution": {
            "max_iterations": 10,
            "candidates_per_iter": 1,
            "stagnation_limit": 3,
            "stagnation_threshold": 0.01,
            "target_score": None,
        },
        "paths": {
            "baseline": "baseline/",
            "eval_tasks": "eval/tasks/",
            "eval_script": "eval/eval.py",
            "harnesses": "harnesses/",
        },
        "stack": {
            "detected": stack,
            "documentation_hint": "use context7",
            "auto_detected": True,
        },
    }
    with open(os.path.join(base, "config.json"), "w") as f:
        json.dump(config, f, indent=2)

    ls_config = config["eval"].get("langsmith", {})
    if ls_config.get("enabled"):
        print(" LangSmith tracing enabled (LANGSMITH_API_KEY detected)")
        if _check_langsmith_cli():
            print(" langsmith-cli detected — proposer will use it for trace analysis")
        else:
            print(" Recommendation: install langsmith-cli for rich trace analysis:")
            print(" uv tool install langsmith-cli && langsmith-cli auth login")

    if stack:
        print("Stack detected:")
        for lib_info in stack.values():
            print(f" {lib_info['display']}")
        if not _check_context7_available():
            print("\nRecommendation: install Context7 MCP for up-to-date documentation:")
            print(" claude mcp add context7 -- npx -y @upstash/context7-mcp@latest")

    # 5. Validate baseline harness
    print("Validating baseline harness...")
    val_args = ["python3", evaluate_py, "validate",
                "--harness", os.path.join(base, "baseline", "harness.py")]
    config_path = os.path.join(base, "baseline", "config.json")
    if os.path.exists(config_path):
        val_args.extend(["--config", config_path])
    r = subprocess.run(val_args, capture_output=True, text=True)
    if r.returncode != 0:
        print(f"FAIL: baseline harness validation failed.\n{r.stderr}", file=sys.stderr)
        sys.exit(1)
    print(r.stdout.strip())

    # 6. Evaluate baseline. Traces go to a throwaway dir; only the score is kept.
    print("Evaluating baseline harness...")
    baseline_traces = tempfile.mkdtemp()
    baseline_scores = os.path.join(base, "baseline_scores.json")
    eval_args = [
        "python3", evaluate_py, "run",
        "--harness", os.path.join(base, "baseline", "harness.py"),
        "--tasks-dir", os.path.join(base, "eval", "tasks"),
        "--eval", os.path.join(base, "eval", "eval.py"),
        "--traces-dir", baseline_traces,
        "--scores", baseline_scores,
        "--timeout", "60",
    ]
    if os.path.exists(config_path):
        eval_args.extend(["--config", config_path])
    r = subprocess.run(eval_args, capture_output=True, text=True, timeout=300)
    baseline_score = 0.0
    if r.returncode != 0:
        print(f"WARNING: baseline evaluation failed. Using score 0.0.\n{r.stderr}", file=sys.stderr)
    elif not os.path.exists(baseline_scores):
        # evaluate.py can exit 0 without producing a scores file; the
        # original crashed here with FileNotFoundError in that case.
        print("WARNING: baseline evaluation failed. Using score 0.0.", file=sys.stderr)
    else:
        print(r.stdout.strip())
        with open(baseline_scores) as f:  # `with` — don't leak the handle
            scores = json.load(f)
        baseline_score = scores.get("combined_score", 0.0)

    if os.path.exists(baseline_scores):
        os.remove(baseline_scores)

    # 7. Initialize state with baseline score
    print(f"Baseline score: {baseline_score:.2f}")
    r = subprocess.run(
        ["python3", state_py, "init",
         "--base-dir", base,
         "--baseline-score", str(baseline_score)],
        capture_output=True, text=True,
    )
    if r.returncode != 0:
        print(f"FAIL: state init failed.\n{r.stderr}", file=sys.stderr)
        sys.exit(1)

    print(f"\nInitialized .harness-evolver/ at {base}")
    print(f"Baseline score: {baseline_score:.2f}")
    print("Run /harness-evolve to start the optimization loop.")
228
+
229
+
230
+ if __name__ == "__main__":
231
+ main()
package/tools/state.py ADDED
@@ -0,0 +1,219 @@
1
+ #!/usr/bin/env python3
2
+ """State manager for Harness Evolver.
3
+
4
+ Commands:
5
+ init --base-dir DIR --baseline-score FLOAT
6
+ update --base-dir DIR --version VER --scores PATH --proposal PATH
7
+ show --base-dir DIR
8
+
9
+ Manages: summary.json (source of truth), STATE.md (human view), PROPOSER_HISTORY.md (log).
10
+ Stdlib-only. No external dependencies.
11
+ """
12
+
13
+ import argparse
14
+ import json
15
+ import os
16
+ import re
17
+ import sys
18
+
19
+
20
+ def _read_json(path):
21
+ with open(path) as f:
22
+ return json.load(f)
23
+
24
+
25
+ def _write_json(path, data):
26
+ with open(path, "w") as f:
27
+ json.dump(data, f, indent=2)
28
+
29
+
30
+ def _read_text(path):
31
+ with open(path) as f:
32
+ return f.read()
33
+
34
+
35
+ def _write_text(path, text):
36
+ with open(path, "w") as f:
37
+ f.write(text)
38
+
39
+
40
+ def _summary_path(base_dir):
41
+ return os.path.join(base_dir, "summary.json")
42
+
43
+
44
+ def _state_md_path(base_dir):
45
+ return os.path.join(base_dir, "STATE.md")
46
+
47
+
48
+ def _history_path(base_dir):
49
+ return os.path.join(base_dir, "PROPOSER_HISTORY.md")
50
+
51
+
52
+ def _detect_parent(proposal_text, current_best):
53
+ """Parse 'Based on vXXX' or 'Based on baseline' from proposal text."""
54
+ match = re.search(r"[Bb]ased on (v\d+|baseline)", proposal_text)
55
+ if match:
56
+ return match.group(1)
57
+ return current_best
58
+
59
+
60
+ def _render_state_md(summary):
61
+ """Generate STATE.md from summary.json data."""
62
+ lines = ["# Harness Evolver Status", ""]
63
+ best = summary["best"]
64
+ worst = summary["worst"]
65
+ lines.append(f"**Iterations:** {summary['iterations']}")
66
+ lines.append(f"**Best:** {best['version']} ({best['combined_score']:.2f})")
67
+ lines.append(f"**Worst:** {worst['version']} ({worst['combined_score']:.2f})")
68
+ if summary["history"]:
69
+ last = summary["history"][-1]
70
+ lines.append(f"**Latest:** {last['version']} ({last['combined_score']:.2f})")
71
+ lines.append("")
72
+ lines.append("## History")
73
+ lines.append("")
74
+ lines.append("| Version | Score | Parent | Delta |")
75
+ lines.append("|---------|-------|--------|-------|")
76
+ prev_score = None
77
+ for entry in summary["history"]:
78
+ v = entry["version"]
79
+ s = entry["combined_score"]
80
+ p = entry["parent"] or "-"
81
+ if prev_score is not None and v != "baseline":
82
+ delta = s - prev_score
83
+ if delta < -0.01:
84
+ delta_str = f"{delta:+.2f} REGRESSION"
85
+ elif delta > 0.01:
86
+ delta_str = f"{delta:+.2f}"
87
+ else:
88
+ delta_str = f"{delta:+.2f} (stagnant)"
89
+ else:
90
+ delta_str = "-"
91
+ lines.append(f"| {v} | {s:.2f} | {p} | {delta_str} |")
92
+ prev_score = s
93
+ return "\n".join(lines) + "\n"
94
+
95
+
96
def cmd_init(args):
    """Create fresh state files (summary.json, STATE.md, history) seeded
    with the baseline score."""
    base_dir = args.base_dir
    score = args.baseline_score
    os.makedirs(base_dir, exist_ok=True)

    baseline_entry = {"version": "baseline", "combined_score": score, "parent": None}
    summary = {
        "iterations": 0,
        "best": {"version": "baseline", "combined_score": score},
        "worst": {"version": "baseline", "combined_score": score},
        "history": [baseline_entry],
    }
    _write_json(_summary_path(base_dir), summary)
    _write_text(_state_md_path(base_dir), _render_state_md(summary))
    _write_text(_history_path(base_dir), "# Proposer History\n")
112
+
113
+
114
def cmd_update(args):
    """Record a new harness version's score.

    Appends the version to summary.json's history, recomputes best/worst,
    regenerates STATE.md, and appends a one-line entry to PROPOSER_HISTORY.md.
    Write order matters: summary.json is the source of truth and is written
    before the derived STATE.md; the history log is appended last.
    """
    base_dir = args.base_dir
    version = args.version
    scores = _read_json(args.scores)
    # Proposal text is optional; without it, parent detection falls back
    # to the current best version inside _detect_parent.
    proposal_text = _read_text(args.proposal) if args.proposal else ""

    summary = _read_json(_summary_path(base_dir))
    combined = scores.get("combined_score", 0.0)
    parent = _detect_parent(proposal_text, summary["best"]["version"])

    entry = {
        "version": version,
        "combined_score": combined,
        "parent": parent,
    }
    summary["history"].append(entry)
    # History includes the baseline row, so iterations = entries - 1.
    summary["iterations"] = len(summary["history"]) - 1

    # Best/worst are tracked over evolved versions only; the baseline row
    # is excluded so it never shows up as "best" or "worst".
    non_baseline = [h for h in summary["history"] if h["version"] != "baseline"]
    if non_baseline:
        best_entry = max(non_baseline, key=lambda h: h["combined_score"])
        worst_entry = min(non_baseline, key=lambda h: h["combined_score"])
        summary["best"] = {
            "version": best_entry["version"],
            "combined_score": best_entry["combined_score"],
        }
        summary["worst"] = {
            "version": worst_entry["version"],
            "combined_score": worst_entry["combined_score"],
        }

    _write_json(_summary_path(base_dir), summary)
    _write_text(_state_md_path(base_dir), _render_state_md(summary))

    # Look up the parent's score (first match wins; scans the history that
    # now includes the freshly appended entry).
    parent_score = None
    for h in summary["history"]:
        if h["version"] == parent:
            parent_score = h["combined_score"]
            break

    # Regression = more than 0.01 below the PARENT's score (unlike STATE.md,
    # whose delta column compares against the previous table row).
    is_regression = parent_score is not None and combined < parent_score - 0.01
    regression_tag = " <- REGRESSION" if is_regression else ""

    # First non-empty line of the proposal that is not the "Based on ..."
    # declaration becomes the log's one-line summary.
    proposal_lines = proposal_text.strip().split("\n")
    summary_line = ""
    for line in proposal_lines:
        stripped = line.strip()
        if stripped and not re.match(r"^[Bb]ased on", stripped):
            summary_line = stripped
            break

    history_entry = f"\n## {version} (score: {combined:.2f}){regression_tag}\n{summary_line}\n"
    history_path = _history_path(base_dir)
    with open(history_path, "a") as f:
        f.write(history_entry)
169
+
170
+
171
def cmd_show(args):
    """Print a console summary of evolution progress with a score bar chart."""
    summary = _read_json(_summary_path(args.base_dir))
    best = summary["best"]
    worst = summary["worst"]

    print(f"Harness Evolver — Iteration {summary['iterations']}")
    print(f"Best: {best['version']} score: {best['combined_score']:.2f}")
    print(f"Worst: {worst['version']} score: {worst['combined_score']:.2f}")
    print()
    for entry in summary["history"]:
        version, score = entry["version"], entry["combined_score"]
        # A perfect score (1.0) renders as a 30-character full-block bar.
        bar = "\u2588" * int(score * 30)
        print(f" {version:>10}: {score:.2f} {bar}")
187
+
188
+
189
def main():
    """CLI entry point: parse arguments and dispatch to init/update/show."""
    parser = argparse.ArgumentParser(description="Harness Evolver state manager")
    sub = parser.add_subparsers(dest="command")

    p_init = sub.add_parser("init")
    p_init.add_argument("--base-dir", required=True)
    p_init.add_argument("--baseline-score", type=float, required=True)

    p_update = sub.add_parser("update")
    p_update.add_argument("--base-dir", required=True)
    p_update.add_argument("--version", required=True)
    p_update.add_argument("--scores", required=True)
    p_update.add_argument("--proposal", default=None)

    p_show = sub.add_parser("show")
    p_show.add_argument("--base-dir", required=True)

    args = parser.parse_args()
    dispatch = {"init": cmd_init, "update": cmd_update, "show": cmd_show}
    if args.command not in dispatch:
        parser.print_help()
        sys.exit(1)
    dispatch[args.command](args)
216
+
217
+
218
+ if __name__ == "__main__":
219
+ main()