harness-evolver 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,111 @@
1
+ #!/usr/bin/env python3
2
+ """Medical symptom classifier — deliberately naive, with room for improvement.
3
+
4
+ Mock mode (default): keyword matching, ~40% accuracy.
5
+ LLM mode: calls API, ~50-60% accuracy (no few-shot, no retry, no structured output).
6
+ """
7
+
8
+ import argparse
9
+ import json
10
+ import os
11
+ import sys
12
+
13
# Labels the classifier is allowed to emit. The LLM prompt lists them in this
# order, and the LLM reply is scanned for them in this order.
CATEGORIES = [
    "respiratory",
    "cardiac",
    "gastrointestinal",
    "neurological",
    "musculoskeletal",
    "dermatological",
]

# Mock-mode evidence table: category -> lowercase fragments looked up as plain
# substrings of the lowercased input. Fragments such as "dizz" or "vomit" are
# stems, so they also hit "dizzy"/"dizziness" and "vomiting".
KEYWORDS = {
    "respiratory": [
        "cough",
        "breath",
        "lung",
        "wheeze",
        "sputum",
    ],
    "cardiac": [
        "chest pain",
        "heart",
        "blood pressure",
        "palpitation",
    ],
    "gastrointestinal": [
        "nausea",
        "vomit",
        "abdominal",
        "diarrhea",
        "stomach",
    ],
    "neurological": [
        "headache",
        "dizz",
        "numb",
        "seizure",
        "confusion",
    ],
    "musculoskeletal": [
        "joint",
        "back pain",
        "muscle",
        "stiffness",
        "swelling",
    ],
    "dermatological": [
        "rash",
        "itch",
        "skin",
        "lesion",
        "bump",
    ],
}
26
+
27
+
28
def classify_mock(text):
    """Classify *text* by counting keyword hits per category.

    The input is lowercased and every fragment in ``KEYWORDS`` is tested as a
    plain substring. Each fragment counts at most once, however often it
    occurs in the text.

    Returns:
        The category with the most matching fragments. On a tie, the category
        that appears first in ``KEYWORDS`` wins. If nothing matches, the
        string ``"unknown"``.
    """
    haystack = text.lower()
    hits = {
        name: sum(fragment in haystack for fragment in fragments)
        for name, fragments in KEYWORDS.items()
    }
    # max() returns the first key that reaches the maximum, so dict order
    # decides ties.
    winner = max(hits, key=hits.__getitem__)
    return winner if hits[winner] != 0 else "unknown"
37
+
38
+
39
def classify_llm(text, config):
    """Classify *text* with one call to the Anthropic Messages API.

    Args:
        text: Free-form symptom description; appended verbatim to the prompt.
        config: Harness configuration dict. Two optional keys are read:
            ``"api_key"`` (falls back to the ``ANTHROPIC_API_KEY`` environment
            variable) and ``"model"`` (falls back to a built-in default).

    Returns:
        The first entry of ``CATEGORIES`` that occurs as a substring of the
        lowercased, stripped reply. If no category occurs in the reply, the
        lowercased reply itself is returned unchanged.

    Raises:
        urllib.error.URLError: Network or HTTP failure (``HTTPError`` is a
            subclass); there is no retry.
        KeyError, IndexError: The response JSON has no ``content[0]["text"]``.
    """
    import urllib.request

    # Use "or" rather than a .get() default: a key that is present but
    # None/empty in the config must not hide the environment variable (or the
    # default model).
    api_key = config.get("api_key") or os.environ.get("ANTHROPIC_API_KEY", "")
    model = config.get("model") or "claude-haiku-4-5-20251001"

    prompt = (
        "Classify the following medical symptom description into exactly one category.\n"
        f"Categories: {', '.join(CATEGORIES)}\n"
        "Reply with ONLY the category name, nothing else.\n\n"
        f"{text}"
    )

    body = json.dumps({
        "model": model,
        "max_tokens": 50,
        "messages": [{"role": "user", "content": prompt}],
    }).encode()

    # Supplying ``data`` makes urllib issue a POST.
    req = urllib.request.Request(
        "https://api.anthropic.com/v1/messages",
        data=body,
        headers={
            "Content-Type": "application/json",
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
        },
    )
    with urllib.request.urlopen(req, timeout=30) as resp:
        result = json.loads(resp.read())

    answer = result["content"][0]["text"].strip().lower()
    # Substring scan tolerates replies that wrap the label in extra words.
    for cat in CATEGORIES:
        if cat in answer:
            return cat
    return answer
75
+
76
+
77
def main():
    """Command-line entry point: classify one task file and write the result.

    Arguments (parsed from ``sys.argv``):
        --input       Path to a JSON task; its ``"id"`` and ``"input"`` keys
                      are read.
        --output      Path where ``{"id": ..., "output": ...}`` is written.
        --traces-dir  Optional directory; when given, ``trace.json`` is
                      written there (the directory is created if missing).
        --config      Optional JSON config path; ignored if the file does not
                      exist. ``"mock"`` (default ``True``) selects keyword
                      matching instead of the LLM call.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--traces-dir", default=None)
    parser.add_argument("--config", default=None)
    args = parser.parse_args()

    # Every file is opened in a ``with`` block so the handle is closed (and,
    # for writes, flushed) deterministically instead of whenever the garbage
    # collector gets to it.
    with open(args.input, encoding="utf-8") as f:
        task = json.load(f)

    config = {}
    if args.config and os.path.exists(args.config):
        with open(args.config, encoding="utf-8") as f:
            config = json.load(f)
    use_mock = config.get("mock", True)

    if use_mock:
        result = classify_mock(task["input"])
    else:
        result = classify_llm(task["input"], config)

    output = {"id": task["id"], "output": result}

    if args.traces_dir:
        os.makedirs(args.traces_dir, exist_ok=True)
        trace = {
            "mode": "mock" if use_mock else "llm",
            "input_text": task["input"],
            "output_category": result,
            # Never persist the secret alongside the trace.
            "config": {k: v for k, v in config.items() if k != "api_key"},
        }
        with open(os.path.join(args.traces_dir, "trace.json"), "w", encoding="utf-8") as f:
            json.dump([trace], f, indent=2)

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2)
108
+
109
+
110
+ if __name__ == "__main__":
111
+ main()
@@ -0,0 +1 @@
1
+ {"id": "task_001", "input": "The patient presents with persistent cough, fever of 38.5C, and shortness of breath", "expected": "respiratory", "metadata": {"difficulty": "easy"}}
@@ -0,0 +1 @@
1
+ {"id": "task_002", "input": "Patient reports severe chest pain radiating to left arm with elevated blood pressure", "expected": "cardiac", "metadata": {"difficulty": "easy"}}
@@ -0,0 +1 @@
1
+ {"id": "task_003", "input": "Recurring nausea, vomiting after meals, and sharp abdominal pain in lower right quadrant", "expected": "gastrointestinal", "metadata": {"difficulty": "easy"}}
@@ -0,0 +1 @@
1
+ {"id": "task_004", "input": "Patient complains of severe headache, dizziness, and intermittent numbness in left hand", "expected": "neurological", "metadata": {"difficulty": "easy"}}
@@ -0,0 +1 @@
1
+ {"id": "task_005", "input": "Chronic lower back pain with stiffness, worsening after prolonged sitting, mild joint swelling", "expected": "musculoskeletal", "metadata": {"difficulty": "easy"}}
@@ -0,0 +1 @@
1
+ {"id": "task_006", "input": "Red itchy rash spreading across torso with small raised bumps and occasional skin peeling", "expected": "dermatological", "metadata": {"difficulty": "easy"}}
@@ -0,0 +1 @@
1
+ {"id": "task_007", "input": "Patient has a mild cough and reports feeling dizzy with occasional heart palpitations", "expected": "cardiac", "metadata": {"difficulty": "hard"}}
@@ -0,0 +1 @@
1
+ {"id": "task_008", "input": "Fatigue and muscle weakness with tingling in extremities and difficulty concentrating", "expected": "neurological", "metadata": {"difficulty": "hard"}}
@@ -0,0 +1 @@
1
+ {"id": "task_009", "input": "Stomach cramps with alternating diarrhea and constipation, bloating after eating", "expected": "gastrointestinal", "metadata": {"difficulty": "medium"}}
@@ -0,0 +1 @@
1
+ {"id": "task_010", "input": "Joint pain in fingers and wrists with morning stiffness lasting over an hour and skin rash on knuckles", "expected": "musculoskeletal", "metadata": {"difficulty": "hard"}}
package/package.json ADDED
@@ -0,0 +1,29 @@
1
+ {
2
+ "name": "harness-evolver",
3
+ "version": "0.1.0",
4
+ "description": "Meta-Harness-style autonomous harness optimization for Claude Code",
5
+ "author": "Raphael Valdetaro Christi Cordeiro",
6
+ "license": "MIT",
7
+ "repository": {
8
+ "type": "git",
9
+ "url": "https://github.com/raphaelchristi/harness-evolver.git"
10
+ },
11
+ "keywords": [
12
+ "claude-code",
13
+ "harness",
14
+ "meta-harness",
15
+ "llm",
16
+ "optimization",
17
+ "agent"
18
+ ],
19
+ "bin": {
20
+ "harness-evolver": "bin/install.js"
21
+ },
22
+ "files": [
23
+ "bin/",
24
+ "skills/",
25
+ "agents/",
26
+ "tools/",
27
+ "examples/"
28
+ ]
29
+ }
@@ -0,0 +1,93 @@
1
+ ---
2
+ name: harness-evolve
3
+ description: "Run the harness evolution loop. Autonomously proposes, evaluates, and iterates on harness designs using full execution traces as feedback."
4
+ argument-hint: "[--iterations N] [--candidates-per-iter N]"
5
+ allowed-tools: [Read, Write, Edit, Bash, Glob, Grep, Agent]
6
+ ---
7
+
8
+ # /harness-evolve
9
+
10
+ Run the Meta-Harness optimization loop.
11
+
12
+ ## Arguments
13
+
14
+ - `--iterations N` (default: 10) — number of evolution iterations
15
+ - `--candidates-per-iter N` (default: 1) — number of candidate harnesses proposed per iteration
16
+
17
+ ## Prerequisites
18
+
19
+ Run `/harness-evolve-init` first. The `.harness-evolver/` directory must exist with a valid `summary.json`.
20
+
21
+ ## The Loop
22
+
23
+ For each iteration i from 1 to N:
24
+
25
+ ### 1. PROPOSE
26
+
27
+ Determine the next version number by reading `summary.json`:
28
+
29
+ ```bash
30
+ python3 -c "import json; s=json.load(open('.harness-evolver/summary.json')); print(f'v{s[\"iterations\"]+1:03d}')"
31
+ ```
32
+
33
+ Spawn the `harness-evolver-proposer` agent with this prompt:
34
+
35
+ > You are proposing iteration {i}. Create version {version_number} in `.harness-evolver/harnesses/{version_number}/`.
36
+ > Working directory contains `.harness-evolver/` with all prior candidates and traces.
37
+
38
+ The proposer agent will create:
39
+ - `.harness-evolver/harnesses/v{NNN}/harness.py`
40
+ - `.harness-evolver/harnesses/v{NNN}/config.json`
41
+ - `.harness-evolver/harnesses/v{NNN}/proposal.md`
42
+
43
+ ### 2. VALIDATE
44
+
45
+ ```bash
46
+ python3 ~/.harness-evolver/tools/evaluate.py validate \
47
+ --harness .harness-evolver/harnesses/v{NNN}/harness.py \
48
+ --config .harness-evolver/harnesses/v{NNN}/config.json
49
+ ```
50
+
51
+ If validation fails, ask the proposer to fix the harness and validate again (one retry only). If validation fails a second time, record a score of 0.0 for this version and continue.
52
+
53
+ ### 3. EVALUATE
54
+
55
+ ```bash
56
+ python3 ~/.harness-evolver/tools/evaluate.py run \
57
+ --harness .harness-evolver/harnesses/v{NNN}/harness.py \
58
+ --config .harness-evolver/harnesses/v{NNN}/config.json \
59
+ --tasks-dir .harness-evolver/eval/tasks/ \
60
+ --eval .harness-evolver/eval/eval.py \
61
+ --traces-dir .harness-evolver/harnesses/v{NNN}/traces/ \
62
+ --scores .harness-evolver/harnesses/v{NNN}/scores.json \
63
+ --timeout 60
64
+ ```
65
+
66
+ ### 4. UPDATE STATE
67
+
68
+ ```bash
69
+ python3 ~/.harness-evolver/tools/state.py update \
70
+ --base-dir .harness-evolver \
71
+ --version v{NNN} \
72
+ --scores .harness-evolver/harnesses/v{NNN}/scores.json \
73
+ --proposal .harness-evolver/harnesses/v{NNN}/proposal.md
74
+ ```
75
+
76
+ ### 5. REPORT
77
+
78
+ Read the updated `summary.json` and report:
79
+ - `Iteration {i}/{N}: v{NNN} scored {score} (best: v{best} at {best_score})`
80
+ - If regression (score < parent score): warn
81
+ - If new best: celebrate
82
+
83
+ ### Stop Conditions
84
+
85
+ - All N iterations completed
86
+ - **Stagnation**: 3 consecutive iterations without >1% improvement. Read `summary.json` history to check.
87
+ - **Target reached**: `config.json` defines a `target_score` and a harness has achieved it.
88
+
89
+ When stopping, report final summary: best version, score, number of iterations, improvement over baseline.
90
+
91
+ ## Tool Path Resolution
92
+
93
+ Check `.harness-evolver/tools/` first (local override), then `~/.harness-evolver/tools/` (global install).
@@ -0,0 +1,53 @@
1
+ ---
2
+ name: harness-evolve-init
3
+ description: "Initialize harness evolution in the current project. Sets up .harness-evolver/ with baseline harness, eval script, and tasks."
4
+ argument-hint: "--harness <path> --eval <path> --tasks <path>"
5
+ allowed-tools: [Read, Write, Bash, Glob]
6
+ ---
7
+
8
+ # /harness-evolve-init
9
+
10
+ Initialize the Harness Evolver for this project.
11
+
12
+ ## Arguments
13
+
14
+ - `--harness <path>` — path to the harness script (any executable, typically Python)
15
+ - `--eval <path>` — path to the evaluation script
16
+ - `--tasks <path>` — path to the tasks directory (JSON files with id, input, expected)
17
+
18
+ ## What To Do
19
+
20
+ Run the init tool:
21
+
22
+ ```bash
23
+ python3 ~/.harness-evolver/tools/init.py \
24
+ --harness {harness} \
25
+ --eval {eval} \
26
+ --tasks {tasks} \
27
+ --base-dir .harness-evolver \
28
+ --harness-config {config if provided, else omit} \
29
+ --tools-dir ~/.harness-evolver/tools
30
+ ```
31
+
32
+ If `~/.harness-evolver/tools/init.py` does not exist, check `.harness-evolver/tools/init.py` (local override).
33
+
34
+ After init completes, report:
35
+ - Baseline score
36
+ - Number of tasks
37
+ - Next step: run `/harness-evolve` to start the optimization loop
38
+
39
+ ## LangSmith Dataset (optional)
40
+
41
+ If the user provides `--langsmith-dataset <dataset_id>`:
42
+
43
+ ```bash
44
+ python3 ~/.harness-evolver/tools/init.py \
45
+ --harness {harness} \
46
+ --eval {eval} \
47
+ --tasks {tasks} \
48
+ --base-dir .harness-evolver \
49
+ --langsmith-dataset {dataset_id}
50
+ ```
51
+
52
+ This pulls examples from a LangSmith dataset to use as tasks.
53
+ Requires `LANGSMITH_API_KEY` in the environment.
@@ -0,0 +1,25 @@
1
+ ---
2
+ name: harness-evolve-status
3
+ description: "Show the current status of harness evolution: best score, iteration count, progress history."
4
+ allowed-tools: [Read, Bash]
5
+ ---
6
+
7
+ # /harness-evolve-status
8
+
9
+ Show the current evolution status.
10
+
11
+ ## What To Do
12
+
13
+ ```bash
14
+ python3 ~/.harness-evolver/tools/state.py show --base-dir .harness-evolver
15
+ ```
16
+
17
+ If `~/.harness-evolver/tools/state.py` does not exist, run the project-local copy instead:
18
+
19
+ ```bash
20
+ python3 .harness-evolver/tools/state.py show --base-dir .harness-evolver
21
+ ```
22
+
23
+ Also read and display the contents of `.harness-evolver/STATE.md` for the full status table.
24
+
25
+ If `.harness-evolver/` doesn't exist, tell the user to run `/harness-evolve-init` first.
@@ -0,0 +1,173 @@
1
+ #!/usr/bin/env python3
2
+ """Detect the technology stack of a harness by analyzing Python imports via AST.
3
+
4
+ Usage:
5
+ detect_stack.py <file_or_directory> [-o output.json]
6
+
7
+ Maps imports to known libraries and their Context7 IDs for documentation lookup.
8
+ Stdlib-only. No external dependencies.
9
+ """
10
+
11
+ import ast
12
+ import json
13
+ import os
14
+ import sys
15
+
16
# Registry of recognised libraries, keyed by a short library name. Each value
# holds the Context7 documentation ID, a human-readable display name, and the
# top-level module names whose import marks the library as in use.
# Rows are (key, context7_id, display, modules); row order is the dict order.
KNOWN_LIBRARIES = {
    key: {"context7_id": context7_id, "display": display, "modules": modules}
    for key, context7_id, display, modules in (
        (
            "langchain",
            "/langchain-ai/langchain",
            "LangChain",
            [
                "langchain",
                "langchain_core",
                "langchain_openai",
                "langchain_anthropic",
                "langchain_community",
            ],
        ),
        ("langgraph", "/langchain-ai/langgraph", "LangGraph", ["langgraph"]),
        ("llamaindex", "/run-llama/llama_index", "LlamaIndex", ["llama_index"]),
        ("openai", "/openai/openai-python", "OpenAI Python SDK", ["openai"]),
        (
            "anthropic",
            "/anthropics/anthropic-sdk-python",
            "Anthropic Python SDK",
            ["anthropic"],
        ),
        ("dspy", "/stanfordnlp/dspy", "DSPy", ["dspy"]),
        ("crewai", "/crewAIInc/crewAI", "CrewAI", ["crewai"]),
        ("autogen", "/microsoft/autogen", "AutoGen", ["autogen"]),
        ("chromadb", "/chroma-core/chroma", "ChromaDB", ["chromadb"]),
        (
            "pinecone",
            "/pinecone-io/pinecone-python-client",
            "Pinecone",
            ["pinecone"],
        ),
        ("qdrant", "/qdrant/qdrant", "Qdrant", ["qdrant_client"]),
        ("weaviate", "/weaviate/weaviate", "Weaviate", ["weaviate"]),
        ("fastapi", "/fastapi/fastapi", "FastAPI", ["fastapi"]),
        ("flask", "/pallets/flask", "Flask", ["flask"]),
        ("pydantic", "/pydantic/pydantic", "Pydantic", ["pydantic"]),
        ("pandas", "/pandas-dev/pandas", "Pandas", ["pandas"]),
        ("numpy", "/numpy/numpy", "NumPy", ["numpy"]),
    )
}
104
+
105
+
106
def detect_from_file(filepath):
    """Return the known libraries imported by a single Python file.

    The file is parsed with ``ast`` (never executed) and every ``import`` /
    ``from ... import`` statement anywhere in the tree is collected, including
    imports nested inside functions or conditionals. Only the top-level
    package name of each import is compared against ``KNOWN_LIBRARIES``.

    Args:
        filepath: Path to the Python source file.

    Returns:
        Dict mapping library key to ``{"context7_id", "display",
        "modules_found"}``, where ``modules_found`` is a sorted list. An empty
        dict is returned when the file cannot be parsed.

    Raises:
        OSError: The file cannot be opened or read.
    """
    # Read raw bytes: ast.parse then applies the source's own encoding rules
    # (BOM / PEP 263 coding cookie, UTF-8 otherwise) instead of the platform
    # locale, and undecodable input surfaces as SyntaxError rather than an
    # uncaught UnicodeDecodeError.
    with open(filepath, "rb") as f:
        source = f.read()

    try:
        tree = ast.parse(source)
    except (SyntaxError, ValueError):
        # ValueError covers sources that older interpreters reject for
        # containing null bytes.
        return {}

    imports = set()
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            imports.update(alias.name.split(".")[0] for alias in node.names)
        elif isinstance(node, ast.ImportFrom):
            # level > 0 is a relative import (``from .openai import x``): it
            # names a project-local module, not the third-party package.
            if node.module and node.level == 0:
                imports.add(node.module.split(".")[0])

    detected = {}
    for lib_key, lib_info in KNOWN_LIBRARIES.items():
        found = imports.intersection(lib_info["modules"])
        if found:
            detected[lib_key] = {
                "context7_id": lib_info["context7_id"],
                "display": lib_info["display"],
                "modules_found": sorted(found),
            }

    return detected
134
+
135
+
136
def detect_from_directory(directory):
    """Scan every ``.py`` file under *directory* and merge the detections.

    The tree is walked recursively with ``os.walk``. Each file is analysed
    with ``detect_from_file``; when the same library shows up in several
    files, the ``modules_found`` lists are unioned and kept sorted.

    Returns:
        Dict in the same shape as ``detect_from_file`` produces, covering the
        whole directory tree.
    """
    merged = {}
    for dirpath, _subdirs, filenames in os.walk(directory):
        for filename in filenames:
            if not filename.endswith(".py"):
                continue
            per_file = detect_from_file(os.path.join(dirpath, filename))
            for key, info in per_file.items():
                entry = merged.get(key)
                if entry is None:
                    # First sighting of this library: adopt the entry as is.
                    merged[key] = info
                    continue
                combined = set(entry["modules_found"]) | set(info["modules_found"])
                entry["modules_found"] = sorted(combined)
    return merged
152
+
153
+
154
if __name__ == "__main__":
    # CLI: analyse a single file or a whole directory and emit the detected
    # stack as indented JSON, either to the -o/--output path or to stdout.
    import argparse

    cli = argparse.ArgumentParser(description="Detect stack from Python files")
    cli.add_argument("path", help="File or directory to analyze")
    cli.add_argument("--output", "-o", help="Output JSON path")
    options = cli.parse_args()

    # Anything that is not a regular file is handed to the directory scanner.
    detector = detect_from_file if os.path.isfile(options.path) else detect_from_directory
    rendered = json.dumps(detector(options.path), indent=2)

    if not options.output:
        print(rendered)
    else:
        with open(options.output, "w") as out_file:
            out_file.write(rendered)