harness-evolver 0.1.1 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -1,53 +1,198 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: harness-evolve-init
|
|
3
|
-
description: "Initialize harness evolution in the current project.
|
|
4
|
-
argument-hint: "
|
|
5
|
-
allowed-tools: [Read, Write, Bash, Glob]
|
|
3
|
+
description: "Initialize harness evolution in the current project. Scans the codebase, identifies the entry point, and helps create harness wrapper, eval script, and test cases if they don't exist."
|
|
4
|
+
argument-hint: "[directory]"
|
|
5
|
+
allowed-tools: [Read, Write, Edit, Bash, Glob, Grep, Agent]
|
|
6
6
|
---
|
|
7
7
|
|
|
8
8
|
# /harness-evolve-init
|
|
9
9
|
|
|
10
10
|
Initialize the Harness Evolver for this project.
|
|
11
11
|
|
|
12
|
-
##
|
|
12
|
+
## Usage
|
|
13
13
|
|
|
14
|
-
|
|
15
|
-
-
|
|
16
|
-
-
|
|
14
|
+
```
|
|
15
|
+
/harness-evolve-init # setup in current directory
|
|
16
|
+
/harness-evolve-init ./my-project # setup in a specific directory
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Your Job: A 3-Phase Setup Wizard
|
|
20
|
+
|
|
21
|
+
You are the intelligent layer. The init.py tool is dumb — it takes paths. Your job is to figure out what to pass it, creating files if needed.
|
|
17
22
|
|
|
18
|
-
|
|
23
|
+
### Phase 1: SCAN the project
|
|
19
24
|
|
|
20
|
-
|
|
25
|
+
Read the project structure to understand what exists:
|
|
21
26
|
|
|
22
27
|
```bash
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
--eval {eval} \
|
|
26
|
-
--tasks {tasks} \
|
|
27
|
-
--base-dir .harness-evolver \
|
|
28
|
-
--harness-config {config if provided, else omit} \
|
|
29
|
-
--tools-dir ~/.harness-evolver/tools
|
|
28
|
+
find . -maxdepth 3 -type f -name "*.py" | head -30
|
|
29
|
+
ls -la
|
|
30
30
|
```
|
|
31
31
|
|
|
32
|
-
|
|
32
|
+
Look for:
|
|
33
|
+
- **Entry point candidates:** `main.py`, `app.py`, `agent.py`, `graph.py`, `pipeline.py`, `bot.py`, `run.py`, or any file with an `if __name__ == "__main__":` block
|
|
34
|
+
- **Existing eval/test files:** `eval.py`, `test_*.py`, `score.py`, `judge.py`
|
|
35
|
+
- **Existing test data:** `tasks/`, `tests/`, `data/`, `examples/`, `fixtures/`, any dir with JSON/JSONL files
|
|
36
|
+
- **Config files:** `config.json`, `config.yaml`, `.env`
|
|
37
|
+
- **Framework clues:** imports of langchain, langgraph, openai, anthropic, crewai, etc.
|
|
33
38
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
-
|
|
37
|
-
|
|
39
|
+
Also run stack detection:
|
|
40
|
+
```bash
|
|
41
|
+
python3 ~/.harness-evolver/tools/detect_stack.py .
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Phase 2: CREATE what's missing
|
|
45
|
+
|
|
46
|
+
There are 3 artifacts needed. For each, check if it exists or needs to be created.
|
|
47
|
+
|
|
48
|
+
#### A. The Harness (`harness.py`)
|
|
49
|
+
|
|
50
|
+
**If `harness.py` already exists with the right interface** (`--input`, `--output`): use it directly.
|
|
51
|
+
|
|
52
|
+
**If the project has an entry point but NOT in our format:** Create a `harness.py` wrapper.
|
|
53
|
+
|
|
54
|
+
Read the entry point to understand its input/output format, then generate a wrapper:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
#!/usr/bin/env python3
|
|
58
|
+
"""Harness wrapper for [project name]. Generated by harness-evolve-init."""
|
|
59
|
+
|
|
60
|
+
import argparse
|
|
61
|
+
import json
|
|
62
|
+
import sys
|
|
63
|
+
|
|
64
|
+
# Import the actual project code
|
|
65
|
+
from [entry_module] import [main_function_or_class]
|
|
66
|
+
|
|
67
|
+
def main():
    """Run the wrapped project on one task file and write one result file.

    Command-line arguments:
        --input       Path to a task JSON file; its "id" and "input" keys are read.
        --output      Path where the result JSON ({"id", "output"}) is written.
        --traces-dir  Accepted for interface compatibility; unused by this template.
        --config      Optional path to a JSON file whose keys are passed to the
                      project call as keyword arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--traces-dir", default=None)
    parser.add_argument("--config", default=None)
    args = parser.parse_args()

    # Context managers make sure every file handle is closed (and the output
    # flushed) even when the project call below raises.
    with open(args.input) as f:
        task = json.load(f)

    config = {}
    if args.config:
        with open(args.config) as f:
            config = json.load(f)

    # Call the actual project code
    result = [call_the_project](task["input"], **config)

    with open(args.output, "w") as f:
        json.dump({"id": task["id"], "output": str(result)}, f)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Adapt this template based on what you learned from reading the entry point. Ask the user to confirm if you're unsure about the input/output mapping.
|
|
88
|
+
|
|
89
|
+
#### B. The Eval (`eval.py`)
|
|
90
|
+
|
|
91
|
+
**If `eval.py` exists:** use it.
|
|
92
|
+
|
|
93
|
+
**If not:** Ask the user what "correct" means for their use case, then generate one:
|
|
38
94
|
|
|
39
|
-
|
|
95
|
+
- **Classification/extraction:** exact match or fuzzy match
|
|
96
|
+
- **Chatbot/QA:** LLM-as-judge (requires API key) or keyword matching
|
|
97
|
+
- **Code generation:** execution-based (run the code, check output)
|
|
98
|
+
- **RAG:** relevance scoring
|
|
40
99
|
|
|
41
|
-
|
|
100
|
+
Start with the simplest eval that works. The evolver can iterate on the harness without a perfect eval — even a rough eval gives signal.
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
#!/usr/bin/env python3
|
|
104
|
+
"""Eval script for [project]. Generated by harness-evolve-init."""
|
|
105
|
+
|
|
106
|
+
import argparse
|
|
107
|
+
import json
|
|
108
|
+
import os
|
|
109
|
+
|
|
110
|
+
def main():
    """Score harness results against task expectations and write a scores file.

    Command-line arguments:
        --results-dir  Directory holding one result JSON per task, named like
                       the task file; its "output" key is read.
        --tasks-dir    Directory of task JSON files; "id" and "expected" are read.
        --scores       Path of the JSON scores file to write.

    A task with no result file scores 0.0 and still counts toward the total.
    The written file contains combined_score, accuracy, total_tasks, correct
    and a per_task breakdown.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--results-dir", required=True)
    parser.add_argument("--tasks-dir", required=True)
    parser.add_argument("--scores", required=True)
    args = parser.parse_args()

    correct, total = 0, 0
    per_task = {}

    # Sorted so the per_task breakdown has a stable order between runs.
    for fname in sorted(os.listdir(args.tasks_dir)):
        if not fname.endswith(".json"):
            continue
        with open(os.path.join(args.tasks_dir, fname)) as f:
            task = json.load(f)
        task_id = task["id"]
        total += 1

        result_path = os.path.join(args.results_dir, fname)
        if not os.path.exists(result_path):
            per_task[task_id] = {"score": 0.0, "error": "no output"}
            continue

        with open(result_path) as f:
            result = json.load(f)

        # ADAPT THIS: define what "correct" means for this project
        expected = task.get("expected", "")
        actual = result.get("output", "")
        # Coerce to str first: "expected" may be any JSON value (number, bool,
        # ...), and calling .lower() on a non-string would abort the whole eval.
        match = str(actual).lower().strip() == str(expected).lower().strip()

        per_task[task_id] = {"score": 1.0 if match else 0.0, "expected": expected, "actual": actual}
        correct += int(match)

    accuracy = correct / total if total > 0 else 0.0
    # Write through a context manager so the scores file is flushed and closed.
    with open(args.scores, "w") as f:
        json.dump({
            "combined_score": accuracy,
            "accuracy": accuracy,
            "total_tasks": total,
            "correct": correct,
            "per_task": per_task,
        }, f, indent=2)
|
|
153
|
+
|
|
154
|
+
if __name__ == "__main__":
|
|
155
|
+
main()
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
#### C. Test Cases (`tasks/`)
|
|
159
|
+
|
|
160
|
+
**If `tasks/` exists with JSON files:** use it.
|
|
161
|
+
|
|
162
|
+
**If the project has test data in another format:** Convert it to our format.
|
|
163
|
+
|
|
164
|
+
**If no test data exists:** Help create 5-10 test cases. Ask the user:
|
|
165
|
+
> "What are typical inputs to your system? And what are the expected outputs? Give me 3-5 examples and I'll create the task files."
|
|
166
|
+
|
|
167
|
+
Each task file is:
|
|
168
|
+
```json
|
|
169
|
+
{"id": "task_001", "input": "...", "expected": "...", "metadata": {}}
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### Phase 3: RUN init.py
|
|
173
|
+
|
|
174
|
+
Once all 3 artifacts exist, run:
|
|
42
175
|
|
|
43
176
|
```bash
|
|
44
177
|
python3 ~/.harness-evolver/tools/init.py \
|
|
45
|
-
--harness
|
|
46
|
-
--eval
|
|
47
|
-
--tasks
|
|
48
|
-
--
|
|
49
|
-
--langsmith-dataset {dataset_id}
|
|
178
|
+
--harness harness.py \
|
|
179
|
+
--eval eval.py \
|
|
180
|
+
--tasks tasks/ \
|
|
181
|
+
--tools-dir ~/.harness-evolver/tools
|
|
50
182
|
```
|
|
51
183
|
|
|
52
|
-
|
|
53
|
-
|
|
184
|
+
If a harness config exists, add `--harness-config config.json`.
|
|
185
|
+
|
|
186
|
+
If `~/.harness-evolver/tools/init.py` does not exist, check `.harness-evolver/tools/init.py`.
|
|
187
|
+
|
|
188
|
+
### After init completes, report:
|
|
189
|
+
|
|
190
|
+
- What was detected vs created
|
|
191
|
+
- Stack detected (libraries)
|
|
192
|
+
- Integrations available (LangSmith, Context7)
|
|
193
|
+
- Baseline score
|
|
194
|
+
- Next step: `/harness-evolve` to start the optimization loop
|
|
195
|
+
|
|
196
|
+
## Key Principle
|
|
197
|
+
|
|
198
|
+
**Don't ask the user to restructure their project.** You adapt to them. If they have a LangGraph graph in `src/graph.py`, you create a thin wrapper — you don't ask them to rename it to `harness.py`. The wrapper IS the harness.
|
|
Binary file
|
|
Binary file
|
package/tools/init.py
CHANGED
|
@@ -2,15 +2,17 @@
|
|
|
2
2
|
"""Project initializer for Harness Evolver.
|
|
3
3
|
|
|
4
4
|
Usage:
|
|
5
|
-
init.py
|
|
6
|
-
|
|
5
|
+
init.py [DIR] # auto-detect in DIR (or CWD)
|
|
6
|
+
init.py --harness PATH --eval PATH --tasks PATH # explicit paths
|
|
7
|
+
init.py --base-dir PATH [--harness-config PATH] # advanced options
|
|
7
8
|
|
|
8
|
-
|
|
9
|
-
|
|
9
|
+
Auto-detects harness.py, eval.py, tasks/ and config.json in the working directory.
|
|
10
|
+
Falls back to fuzzy matching (*harness*, *eval*, *score*, dirs with .json files).
|
|
10
11
|
Stdlib-only. No external dependencies.
|
|
11
12
|
"""
|
|
12
13
|
|
|
13
14
|
import argparse
|
|
15
|
+
import glob
|
|
14
16
|
import json
|
|
15
17
|
import os
|
|
16
18
|
import shutil
|
|
@@ -19,6 +21,74 @@ import sys
|
|
|
19
21
|
import tempfile
|
|
20
22
|
|
|
21
23
|
|
|
24
|
+
def _is_task_file(path):
    """Return True if *path* is a JSON object file with "id" and "input" keys."""
    try:
        with open(path, encoding="utf-8") as fh:
            data = json.load(fh)
    except (OSError, ValueError):
        # Unreadable, mis-encoded, or malformed JSON: simply not a task file.
        # (json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.)
        return False
    # Only a JSON object can be a task. Without this check a scalar would make
    # the `in` test raise TypeError, and a list like ["id", "input"] would be
    # mistaken for a task.
    return isinstance(data, dict) and "id" in data and "input" in data


def _sorted_listing(directory):
    """Return the sorted entry names of *directory*, or [] if it is unreadable."""
    try:
        return sorted(os.listdir(directory))
    except OSError:
        return []


def _auto_detect(search_dir):
    """Auto-detect harness, eval, tasks and config in a directory.

    Exact convention names (harness.py, eval.py, tasks/, config.json) are
    preferred. When one is missing, a fuzzy fallback is tried: a script is
    accepted only if exactly one *.py file matches the keywords, and a tasks
    directory is the first direct subdirectory (in sorted order) that holds a
    JSON object file with "id" and "input" keys.

    Args:
        search_dir: Directory to scan; only its direct children are examined.

    Returns:
        Tuple (harness_path, eval_path, tasks_path, config_path). Each element
        is a path under the absolute form of search_dir, or None when nothing
        suitable was found. This function never exits the process itself;
        reporting missing pieces is left to the caller.
    """
    search_dir = os.path.abspath(search_dir)

    def _if_exists(name, predicate):
        path = os.path.join(search_dir, name)
        return path if predicate(path) else None

    # 1. Exact convention names first
    harness = _if_exists("harness.py", os.path.isfile)
    eval_script = _if_exists("eval.py", os.path.isfile)
    tasks = _if_exists("tasks", os.path.isdir)
    config = _if_exists("config.json", os.path.isfile)

    py_files = sorted(glob.glob(os.path.join(search_dir, "*.py")))

    def _matching(keywords, exclude=None):
        return [f for f in py_files
                if f != exclude
                and any(k in os.path.basename(f).lower() for k in keywords)]

    # 2. Fuzzy fallback for harness (only when unambiguous)
    if not harness:
        candidates = _matching(["harness", "agent", "run"])
        if len(candidates) == 1:
            harness = candidates[0]

    # 3. Fuzzy fallback for eval (never reuse the file chosen as harness)
    if not eval_script:
        candidates = _matching(["eval", "score", "judge"], exclude=harness)
        if len(candidates) == 1:
            eval_script = candidates[0]

    # 4. Fuzzy fallback for tasks
    if not tasks:
        for entry in _sorted_listing(search_dir):
            candidate_dir = os.path.join(search_dir, entry)
            if not os.path.isdir(candidate_dir):
                continue
            json_paths = (os.path.join(candidate_dir, name)
                          for name in _sorted_listing(candidate_dir)
                          if name.endswith(".json"))
            if any(_is_task_file(p) for p in json_paths):
                tasks = candidate_dir
                break

    return harness, eval_script, tasks, config
|
|
90
|
+
|
|
91
|
+
|
|
22
92
|
def _detect_langsmith():
|
|
23
93
|
"""Auto-detect LangSmith API key and return config section."""
|
|
24
94
|
if os.environ.get("LANGSMITH_API_KEY"):
|
|
@@ -77,16 +147,59 @@ def _check_context7_available():
|
|
|
77
147
|
|
|
78
148
|
|
|
79
149
|
def main():
|
|
80
|
-
parser = argparse.ArgumentParser(
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
parser.add_argument("
|
|
150
|
+
parser = argparse.ArgumentParser(
|
|
151
|
+
description="Initialize Harness Evolver project",
|
|
152
|
+
usage="init.py [DIR] [--harness PATH] [--eval PATH] [--tasks PATH]",
|
|
153
|
+
)
|
|
154
|
+
parser.add_argument("dir", nargs="?", default=".",
|
|
155
|
+
help="Directory to scan (default: current directory)")
|
|
156
|
+
parser.add_argument("--harness", default=None, help="Path to harness script")
|
|
157
|
+
parser.add_argument("--eval", default=None, help="Path to eval script")
|
|
158
|
+
parser.add_argument("--tasks", default=None, help="Path to tasks directory")
|
|
159
|
+
parser.add_argument("--base-dir", default=None, help="Path for .harness-evolver/")
|
|
85
160
|
parser.add_argument("--harness-config", default=None, help="Path to harness config.json")
|
|
86
161
|
parser.add_argument("--tools-dir", default=None, help="Path to tools directory")
|
|
87
162
|
args = parser.parse_args()
|
|
88
163
|
|
|
89
|
-
|
|
164
|
+
# Auto-detect missing args
|
|
165
|
+
search_dir = os.path.abspath(args.dir)
|
|
166
|
+
if not args.harness or not args.eval or not args.tasks:
|
|
167
|
+
detected_harness, detected_eval, detected_tasks, detected_config = _auto_detect(search_dir)
|
|
168
|
+
if not args.harness:
|
|
169
|
+
args.harness = detected_harness
|
|
170
|
+
if not args.eval:
|
|
171
|
+
args.eval = detected_eval
|
|
172
|
+
if not args.tasks:
|
|
173
|
+
args.tasks = detected_tasks
|
|
174
|
+
if not args.harness_config and detected_config:
|
|
175
|
+
args.harness_config = detected_config
|
|
176
|
+
|
|
177
|
+
# Validate we have everything
|
|
178
|
+
missing = []
|
|
179
|
+
if not args.harness:
|
|
180
|
+
missing.append("harness (no harness.py or *harness*.py found)")
|
|
181
|
+
if not args.eval:
|
|
182
|
+
missing.append("eval (no eval.py or *eval*.py found)")
|
|
183
|
+
if not args.tasks:
|
|
184
|
+
missing.append("tasks (no tasks/ directory with JSON files found)")
|
|
185
|
+
if missing:
|
|
186
|
+
print("Could not auto-detect:", file=sys.stderr)
|
|
187
|
+
for m in missing:
|
|
188
|
+
print(f" - {m}", file=sys.stderr)
|
|
189
|
+
print(f"\nSearched in: {search_dir}", file=sys.stderr)
|
|
190
|
+
print("\nProvide explicitly:", file=sys.stderr)
|
|
191
|
+
print(" /harness-evolve-init --harness PATH --eval PATH --tasks PATH", file=sys.stderr)
|
|
192
|
+
sys.exit(1)
|
|
193
|
+
|
|
194
|
+
# Print what was detected
|
|
195
|
+
print(f"Harness: {os.path.relpath(args.harness, search_dir)}")
|
|
196
|
+
print(f"Eval: {os.path.relpath(args.eval, search_dir)}")
|
|
197
|
+
print(f"Tasks: {os.path.relpath(args.tasks, search_dir)}/")
|
|
198
|
+
if args.harness_config:
|
|
199
|
+
print(f"Config: {os.path.relpath(args.harness_config, search_dir)}")
|
|
200
|
+
print()
|
|
201
|
+
|
|
202
|
+
base = args.base_dir or os.path.join(search_dir, ".harness-evolver")
|
|
90
203
|
tools = args.tools_dir or os.path.dirname(__file__)
|
|
91
204
|
|
|
92
205
|
evaluate_py = os.path.join(tools, "evaluate.py")
|