npm - harness-evolver - Versions diffs - 0.1.0 - Mend

harness-evolver 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

package/LICENSE +21 -0
package/README.md +252 -0
package/agents/harness-evolver-proposer.md +163 -0
package/bin/install.js +125 -0
package/examples/classifier/README.md +25 -0
package/examples/classifier/config.json +3 -0
package/examples/classifier/eval.py +58 -0
package/examples/classifier/harness.py +111 -0
package/examples/classifier/tasks/task_001.json +1 -0
package/examples/classifier/tasks/task_002.json +1 -0
package/examples/classifier/tasks/task_003.json +1 -0
package/examples/classifier/tasks/task_004.json +1 -0
package/examples/classifier/tasks/task_005.json +1 -0
package/examples/classifier/tasks/task_006.json +1 -0
package/examples/classifier/tasks/task_007.json +1 -0
package/examples/classifier/tasks/task_008.json +1 -0
package/examples/classifier/tasks/task_009.json +1 -0
package/examples/classifier/tasks/task_010.json +1 -0
package/package.json +29 -0
package/skills/harness-evolve/SKILL.md +93 -0
package/skills/harness-evolve-init/SKILL.md +53 -0
package/skills/harness-evolve-status/SKILL.md +25 -0
package/tools/__pycache__/detect_stack.cpython-313.pyc +0 -0
package/tools/__pycache__/langsmith_adapter.cpython-313.pyc +0 -0
package/tools/__pycache__/langsmith_api.cpython-313.pyc +0 -0
package/tools/__pycache__/trace_logger.cpython-313.pyc +0 -0
package/tools/detect_stack.py +173 -0
package/tools/evaluate.py +214 -0
package/tools/init.py +231 -0
package/tools/state.py +219 -0
package/tools/trace_logger.py +42 -0

package/examples/classifier/harness.py ADDED Viewed

@@ -0,0 +1,111 @@
+#!/usr/bin/env python3
+"""Medical symptom classifier — deliberately naive, with room for improvement.
+Mock mode (default): keyword matching, ~40% accuracy.
+LLM mode: calls API, ~50-60% accuracy (no few-shot, no retry, no structured output).
+"""
+import argparse
+import json
+import os
+import sys
+# Category labels: listed in the LLM prompt and searched for in the LLM reply.
+CATEGORIES = [
+    "respiratory", "cardiac", "gastrointestinal",
+    "neurological", "musculoskeletal", "dermatological",
+]
+# Per-category substrings used by the mock classifier. Matching is a plain
+# substring test on the lowercased input, so stems such as "dizz" also match
+# "dizzy" and "dizziness". Key order matters: on a tie the mock classifier
+# returns the category that appears first in this dict.
+KEYWORDS = {
+    "respiratory": ["cough", "breath", "lung", "wheeze", "sputum"],
+    "cardiac": ["chest pain", "heart", "blood pressure", "palpitation"],
+    "gastrointestinal": ["nausea", "vomit", "abdominal", "diarrhea", "stomach"],
+    "neurological": ["headache", "dizz", "numb", "seizure", "confusion"],
+    "musculoskeletal": ["joint", "back pain", "muscle", "stiffness", "swelling"],
+    "dermatological": ["rash", "itch", "skin", "lesion", "bump"],
+}
+def classify_mock(text):
+    text_lower = text.lower()
+    scores = {}
+    for category, words in KEYWORDS.items():
+        scores[category] = sum(1 for w in words if w in text_lower)
+    best = max(scores, key=scores.get)
+    if scores[best] == 0:
+        return "unknown"
+    return best
+def classify_llm(text, config):
+    import urllib.request
+    api_key = config.get("api_key", os.environ.get("ANTHROPIC_API_KEY", ""))
+    model = config.get("model", "claude-haiku-4-5-20251001")
+    prompt = (
+        f"Classify the following medical symptom description into exactly one category.\n"
+        f"Categories: {', '.join(CATEGORIES)}\n"
+        f"Reply with ONLY the category name, nothing else.\n\n"
+        f"{text}"
+    )
+    body = json.dumps({
+        "model": model,
+        "max_tokens": 50,
+        "messages": [{"role": "user", "content": prompt}],
+    }).encode()
+    req = urllib.request.Request(
+        "https://api.anthropic.com/v1/messages",
+        data=body,
+        headers={
+            "Content-Type": "application/json",
+            "x-api-key": api_key,
+            "anthropic-version": "2023-06-01",
+        },
+    )
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        result = json.loads(resp.read())
+    answer = result["content"][0]["text"].strip().lower()
+    for cat in CATEGORIES:
+        if cat in answer:
+            return cat
+    return answer
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", required=True)
+    parser.add_argument("--output", required=True)
+    parser.add_argument("--traces-dir", default=None)
+    parser.add_argument("--config", default=None)
+    args = parser.parse_args()
+    task = json.load(open(args.input))
+    config = json.load(open(args.config)) if args.config and os.path.exists(args.config) else {}
+    use_mock = config.get("mock", True)
+    if use_mock:
+        result = classify_mock(task["input"])
+    else:
+        result = classify_llm(task["input"], config)
+    output = {"id": task["id"], "output": result}
+    if args.traces_dir:
+        os.makedirs(args.traces_dir, exist_ok=True)
+        trace = {
+            "mode": "mock" if use_mock else "llm",
+            "input_text": task["input"],
+            "output_category": result,
+            "config": {k: v for k, v in config.items() if k != "api_key"},
+        }
+        with open(os.path.join(args.traces_dir, "trace.json"), "w") as f:
+            json.dump([trace], f, indent=2)
+    json.dump(output, open(args.output, "w"), indent=2)
+if __name__ == "__main__":
+    main()

package/examples/classifier/tasks/task_001.json ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"id": "task_001", "input": "The patient presents with persistent cough, fever of 38.5C, and shortness of breath", "expected": "respiratory", "metadata": {"difficulty": "easy"}}

package/examples/classifier/tasks/task_002.json ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"id": "task_002", "input": "Patient reports severe chest pain radiating to left arm with elevated blood pressure", "expected": "cardiac", "metadata": {"difficulty": "easy"}}

package/examples/classifier/tasks/task_003.json ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"id": "task_003", "input": "Recurring nausea, vomiting after meals, and sharp abdominal pain in lower right quadrant", "expected": "gastrointestinal", "metadata": {"difficulty": "easy"}}

package/examples/classifier/tasks/task_004.json ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"id": "task_004", "input": "Patient complains of severe headache, dizziness, and intermittent numbness in left hand", "expected": "neurological", "metadata": {"difficulty": "easy"}}

package/examples/classifier/tasks/task_005.json ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"id": "task_005", "input": "Chronic lower back pain with stiffness, worsening after prolonged sitting, mild joint swelling", "expected": "musculoskeletal", "metadata": {"difficulty": "easy"}}

package/examples/classifier/tasks/task_006.json ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"id": "task_006", "input": "Red itchy rash spreading across torso with small raised bumps and occasional skin peeling", "expected": "dermatological", "metadata": {"difficulty": "easy"}}

package/examples/classifier/tasks/task_007.json ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"id": "task_007", "input": "Patient has a mild cough and reports feeling dizzy with occasional heart palpitations", "expected": "cardiac", "metadata": {"difficulty": "hard"}}

package/examples/classifier/tasks/task_008.json ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"id": "task_008", "input": "Fatigue and muscle weakness with tingling in extremities and difficulty concentrating", "expected": "neurological", "metadata": {"difficulty": "hard"}}

package/examples/classifier/tasks/task_009.json ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"id": "task_009", "input": "Stomach cramps with alternating diarrhea and constipation, bloating after eating", "expected": "gastrointestinal", "metadata": {"difficulty": "medium"}}

package/examples/classifier/tasks/task_010.json ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"id": "task_010", "input": "Joint pain in fingers and wrists with morning stiffness lasting over an hour and skin rash on knuckles", "expected": "musculoskeletal", "metadata": {"difficulty": "hard"}}

package/package.json ADDED Viewed

@@ -0,0 +1,29 @@
+{
+  "name": "harness-evolver",
+  "version": "0.1.0",
+  "description": "Meta-Harness-style autonomous harness optimization for Claude Code",
+  "author": "Raphael Valdetaro Christi Cordeiro",
+  "license": "MIT",
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/raphaelchristi/harness-evolver.git"
+  },
+  "keywords": [
+    "claude-code",
+    "harness",
+    "meta-harness",
+    "llm",
+    "optimization",
+    "agent"
+  ],
+  "bin": {
+    "harness-evolver": "bin/install.js"
+  },
+  "files": [
+    "bin/",
+    "skills/",
+    "agents/",
+    "tools/",
+    "examples/"
+  ]
+}

package/skills/harness-evolve/SKILL.md ADDED Viewed

@@ -0,0 +1,93 @@
+---
+name: harness-evolve
+description: "Run the harness evolution loop. Autonomously proposes, evaluates, and iterates on harness designs using full execution traces as feedback."
+argument-hint: "[--iterations N] [--candidates-per-iter N]"
+allowed-tools: [Read, Write, Edit, Bash, Glob, Grep, Agent]
+---
+# /harness-evolve
+Run the Meta-Harness optimization loop.
+## Arguments
+- `--iterations N` (default: 10) — number of evolution iterations
+- `--candidates-per-iter N` (default: 1) — harnesses per iteration
+## Prerequisites
+Run `/harness-evolve-init` first. The `.harness-evolver/` directory must exist with a valid `summary.json`.
+## The Loop
+For each iteration i from 1 to N:
+### 1. PROPOSE
+Determine the next version number by reading `summary.json`:
+```bash
+python3 -c "import json; s=json.load(open('.harness-evolver/summary.json')); print(f'v{s[\"iterations\"]+1:03d}')"
+```
+Spawn the `harness-evolver-proposer` agent with this prompt:
+> You are proposing iteration {i}. Create version {version_number} in `.harness-evolver/harnesses/{version_number}/`.
+> Working directory contains `.harness-evolver/` with all prior candidates and traces.
+The proposer agent will create:
+- `.harness-evolver/harnesses/v{NNN}/harness.py`
+- `.harness-evolver/harnesses/v{NNN}/config.json`
+- `.harness-evolver/harnesses/v{NNN}/proposal.md`
+### 2. VALIDATE
+```bash
+python3 ~/.harness-evolver/tools/evaluate.py validate \
+    --harness .harness-evolver/harnesses/v{NNN}/harness.py \
+    --config .harness-evolver/harnesses/v{NNN}/config.json
+```
+If validation fails, ask the proposer to fix (1 retry). If it fails again, set score to 0.0 and continue.
+### 3. EVALUATE
+```bash
+python3 ~/.harness-evolver/tools/evaluate.py run \
+    --harness .harness-evolver/harnesses/v{NNN}/harness.py \
+    --config .harness-evolver/harnesses/v{NNN}/config.json \
+    --tasks-dir .harness-evolver/eval/tasks/ \
+    --eval .harness-evolver/eval/eval.py \
+    --traces-dir .harness-evolver/harnesses/v{NNN}/traces/ \
+    --scores .harness-evolver/harnesses/v{NNN}/scores.json \
+    --timeout 60
+```
+### 4. UPDATE STATE
+```bash
+python3 ~/.harness-evolver/tools/state.py update \
+    --base-dir .harness-evolver \
+    --version v{NNN} \
+    --scores .harness-evolver/harnesses/v{NNN}/scores.json \
+    --proposal .harness-evolver/harnesses/v{NNN}/proposal.md
+```
+### 5. REPORT
+Read the updated `summary.json` and report:
+- `Iteration {i}/{N}: v{NNN} scored {score} (best: v{best} at {best_score})`
+- If regression (score < parent score): warn
+- If new best: celebrate
+### Stop Conditions
+- All N iterations completed
+- **Stagnation**: 3 consecutive iterations without >1% improvement. Read `summary.json` history to check.
+- **Target reached**: `config.json` sets `target_score` and the score has reached it.
+When stopping, report final summary: best version, score, number of iterations, improvement over baseline.
+## Tool Path Resolution
+Check `.harness-evolver/tools/` first (local override), then `~/.harness-evolver/tools/` (global install).

package/skills/harness-evolve-init/SKILL.md ADDED Viewed

@@ -0,0 +1,53 @@
+---
+name: harness-evolve-init
+description: "Initialize harness evolution in the current project. Sets up .harness-evolver/ with baseline harness, eval script, and tasks."
+argument-hint: "--harness <path> --eval <path> --tasks <path>"
+allowed-tools: [Read, Write, Bash, Glob]
+---
+# /harness-evolve-init
+Initialize the Harness Evolver for this project.
+## Arguments
+- `--harness <path>` — path to the harness script (any executable, typically Python)
+- `--eval <path>` — path to the evaluation script
+- `--tasks <path>` — path to the tasks directory (JSON files with id, input, expected)
+## What To Do
+Run the init tool:
+```bash
+python3 ~/.harness-evolver/tools/init.py \
+    --harness {harness} \
+    --eval {eval} \
+    --tasks {tasks} \
+    --base-dir .harness-evolver \
+    --harness-config {config if provided, else omit} \
+    --tools-dir ~/.harness-evolver/tools
+```
+If `~/.harness-evolver/tools/init.py` does not exist, check `.harness-evolver/tools/init.py` (local override).
+After init completes, report:
+- Baseline score
+- Number of tasks
+- Next step: run `/harness-evolve` to start the optimization loop
+## LangSmith Dataset (optional)
+If the user provides `--langsmith-dataset <dataset_id>`:
+```bash
+python3 ~/.harness-evolver/tools/init.py \
+    --harness {harness} \
+    --eval {eval} \
+    --tasks {tasks} \
+    --base-dir .harness-evolver \
+    --langsmith-dataset {dataset_id}
+```
+This pulls examples from a LangSmith dataset to use as tasks.
+Requires `LANGSMITH_API_KEY` in the environment.

package/skills/harness-evolve-status/SKILL.md ADDED Viewed

@@ -0,0 +1,25 @@
+---
+name: harness-evolve-status
+description: "Show the current status of harness evolution: best score, iteration count, progress history."
+allowed-tools: [Read, Bash]
+---
+# /harness-evolve-status
+Show the current evolution status.
+## What To Do
+```bash
+python3 ~/.harness-evolver/tools/state.py show --base-dir .harness-evolver
+```
+If that script doesn't exist (no global install), try the project-local copy:
+```bash
+python3 .harness-evolver/tools/state.py show --base-dir .harness-evolver
+```
+Also read and display the contents of `.harness-evolver/STATE.md` for the full status table.
+If `.harness-evolver/` doesn't exist, tell the user to run `/harness-evolve-init` first.

package/tools/__pycache__/detect_stack.cpython-313.pyc ADDED Viewed

Binary file

package/tools/__pycache__/langsmith_adapter.cpython-313.pyc ADDED Viewed

Binary file

package/tools/__pycache__/langsmith_api.cpython-313.pyc ADDED Viewed

Binary file

package/tools/__pycache__/trace_logger.cpython-313.pyc ADDED Viewed

Binary file

package/tools/detect_stack.py ADDED Viewed

@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+"""Detect the technology stack of a harness by analyzing Python imports via AST.
+Usage:
+    detect_stack.py <file_or_directory> [-o output.json]
+Maps imports to known libraries and their Context7 IDs for documentation lookup.
+Stdlib-only. No external dependencies.
+"""
+import ast
+import json
+import os
+import sys
+# Registry of recognized libraries, keyed by a short library id. Each entry has:
+#   context7_id - identifier copied into the detection result (per the module
+#                 docstring, used for Context7 documentation lookup)
+#   display     - human-readable library name
+#   modules     - top-level import names that indicate the library is in use
+KNOWN_LIBRARIES = {
+    "langchain": {
+        "context7_id": "/langchain-ai/langchain",
+        "display": "LangChain",
+        "modules": ["langchain", "langchain_core", "langchain_openai",
+                     "langchain_anthropic", "langchain_community"],
+    },
+    "langgraph": {
+        "context7_id": "/langchain-ai/langgraph",
+        "display": "LangGraph",
+        "modules": ["langgraph"],
+    },
+    "llamaindex": {
+        "context7_id": "/run-llama/llama_index",
+        "display": "LlamaIndex",
+        "modules": ["llama_index"],
+    },
+    "openai": {
+        "context7_id": "/openai/openai-python",
+        "display": "OpenAI Python SDK",
+        "modules": ["openai"],
+    },
+    "anthropic": {
+        "context7_id": "/anthropics/anthropic-sdk-python",
+        "display": "Anthropic Python SDK",
+        "modules": ["anthropic"],
+    },
+    "dspy": {
+        "context7_id": "/stanfordnlp/dspy",
+        "display": "DSPy",
+        "modules": ["dspy"],
+    },
+    "crewai": {
+        "context7_id": "/crewAIInc/crewAI",
+        "display": "CrewAI",
+        "modules": ["crewai"],
+    },
+    "autogen": {
+        "context7_id": "/microsoft/autogen",
+        "display": "AutoGen",
+        "modules": ["autogen"],
+    },
+    "chromadb": {
+        "context7_id": "/chroma-core/chroma",
+        "display": "ChromaDB",
+        "modules": ["chromadb"],
+    },
+    "pinecone": {
+        "context7_id": "/pinecone-io/pinecone-python-client",
+        "display": "Pinecone",
+        "modules": ["pinecone"],
+    },
+    "qdrant": {
+        "context7_id": "/qdrant/qdrant",
+        "display": "Qdrant",
+        "modules": ["qdrant_client"],
+    },
+    "weaviate": {
+        "context7_id": "/weaviate/weaviate",
+        "display": "Weaviate",
+        "modules": ["weaviate"],
+    },
+    "fastapi": {
+        "context7_id": "/fastapi/fastapi",
+        "display": "FastAPI",
+        "modules": ["fastapi"],
+    },
+    "flask": {
+        "context7_id": "/pallets/flask",
+        "display": "Flask",
+        "modules": ["flask"],
+    },
+    "pydantic": {
+        "context7_id": "/pydantic/pydantic",
+        "display": "Pydantic",
+        "modules": ["pydantic"],
+    },
+    "pandas": {
+        "context7_id": "/pandas-dev/pandas",
+        "display": "Pandas",
+        "modules": ["pandas"],
+    },
+    "numpy": {
+        "context7_id": "/numpy/numpy",
+        "display": "NumPy",
+        "modules": ["numpy"],
+    },
+}
+def detect_from_file(filepath):
+    """Analyze imports of a Python file and return detected stack."""
+    with open(filepath) as f:
+        try:
+            tree = ast.parse(f.read())
+        except SyntaxError:
+            return {}
+    imports = set()
+    for node in ast.walk(tree):
+        if isinstance(node, ast.Import):
+            for alias in node.names:
+                imports.add(alias.name.split(".")[0])
+        elif isinstance(node, ast.ImportFrom):
+            if node.module:
+                imports.add(node.module.split(".")[0])
+    detected = {}
+    for lib_key, lib_info in KNOWN_LIBRARIES.items():
+        found = imports & set(lib_info["modules"])
+        if found:
+            detected[lib_key] = {
+                "context7_id": lib_info["context7_id"],
+                "display": lib_info["display"],
+                "modules_found": sorted(found),
+            }
+    return detected
+def detect_from_directory(directory):
+    """Analyze all .py files in a directory and consolidate the stack."""
+    all_detected = {}
+    for root, dirs, files in os.walk(directory):
+        for f in files:
+            if f.endswith(".py"):
+                filepath = os.path.join(root, f)
+                file_detected = detect_from_file(filepath)
+                for lib_key, lib_info in file_detected.items():
+                    if lib_key not in all_detected:
+                        all_detected[lib_key] = lib_info
+                    else:
+                        existing = set(all_detected[lib_key]["modules_found"])
+                        existing.update(lib_info["modules_found"])
+                        all_detected[lib_key]["modules_found"] = sorted(existing)
+    return all_detected
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="Detect stack from Python files")
+    parser.add_argument("path", help="File or directory to analyze")
+    parser.add_argument("--output", "-o", help="Output JSON path")
+    args = parser.parse_args()
+    if os.path.isfile(args.path):
+        result = detect_from_file(args.path)
+    else:
+        result = detect_from_directory(args.path)
+    output = json.dumps(result, indent=2)
+    if args.output:
+        with open(args.output, "w") as f:
+            f.write(output)
+    else:
+        print(output)