harness-evolver 2.9.1 → 3.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/README.md +62 -117
  2. package/agents/evolver-architect.md +53 -0
  3. package/agents/evolver-critic.md +44 -0
  4. package/agents/evolver-proposer.md +128 -0
  5. package/agents/evolver-testgen.md +67 -0
  6. package/bin/install.js +181 -171
  7. package/package.json +7 -7
  8. package/skills/deploy/SKILL.md +49 -56
  9. package/skills/evolve/SKILL.md +156 -687
  10. package/skills/setup/SKILL.md +182 -0
  11. package/skills/status/SKILL.md +23 -21
  12. package/tools/read_results.py +240 -0
  13. package/tools/run_eval.py +202 -0
  14. package/tools/seed_from_traces.py +36 -8
  15. package/tools/setup.py +393 -0
  16. package/tools/trace_insights.py +86 -14
  17. package/agents/harness-evolver-architect.md +0 -173
  18. package/agents/harness-evolver-critic.md +0 -132
  19. package/agents/harness-evolver-judge.md +0 -110
  20. package/agents/harness-evolver-proposer.md +0 -317
  21. package/agents/harness-evolver-testgen.md +0 -112
  22. package/examples/classifier/README.md +0 -25
  23. package/examples/classifier/config.json +0 -3
  24. package/examples/classifier/eval.py +0 -58
  25. package/examples/classifier/harness.py +0 -111
  26. package/examples/classifier/tasks/task_001.json +0 -1
  27. package/examples/classifier/tasks/task_002.json +0 -1
  28. package/examples/classifier/tasks/task_003.json +0 -1
  29. package/examples/classifier/tasks/task_004.json +0 -1
  30. package/examples/classifier/tasks/task_005.json +0 -1
  31. package/examples/classifier/tasks/task_006.json +0 -1
  32. package/examples/classifier/tasks/task_007.json +0 -1
  33. package/examples/classifier/tasks/task_008.json +0 -1
  34. package/examples/classifier/tasks/task_009.json +0 -1
  35. package/examples/classifier/tasks/task_010.json +0 -1
  36. package/skills/architect/SKILL.md +0 -93
  37. package/skills/compare/SKILL.md +0 -73
  38. package/skills/critic/SKILL.md +0 -67
  39. package/skills/diagnose/SKILL.md +0 -96
  40. package/skills/import-traces/SKILL.md +0 -102
  41. package/skills/init/SKILL.md +0 -293
  42. package/tools/__pycache__/detect_stack.cpython-313.pyc +0 -0
  43. package/tools/__pycache__/init.cpython-313.pyc +0 -0
  44. package/tools/__pycache__/seed_from_traces.cpython-313.pyc +0 -0
  45. package/tools/__pycache__/trace_logger.cpython-313.pyc +0 -0
  46. package/tools/eval_llm_judge.py +0 -233
  47. package/tools/eval_passthrough.py +0 -55
  48. package/tools/evaluate.py +0 -255
  49. package/tools/import_traces.py +0 -229
  50. package/tools/init.py +0 -531
  51. package/tools/llm_api.py +0 -125
  52. package/tools/state.py +0 -219
  53. package/tools/test_growth.py +0 -230
  54. package/tools/trace_logger.py +0 -42
package/tools/evaluate.py DELETED
@@ -1,255 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Evaluation orchestrator for Harness Evolver.
3
-
4
- Commands:
5
- validate --harness PATH [--config PATH] [--timeout SECONDS]
6
- run --harness PATH --tasks-dir PATH --eval PATH --traces-dir PATH --scores PATH
7
- [--config PATH] [--timeout SECONDS]
8
-
9
- Runs harness per task, captures traces (stdout/stderr/timing), then calls user's eval script.
10
- Stdlib-only. No external dependencies.
11
- """
12
-
13
- import argparse
14
- import json
15
- import os
16
- import shutil
17
- import subprocess
18
- import sys
19
- import tempfile
20
- import time
21
-
22
-
23
- def _resolve_python():
24
- """Resolve the Python interpreter to use for subprocesses.
25
-
26
- Prefers the current interpreter (sys.executable) over a hardcoded 'python3'.
27
- This is critical in monorepo setups where the harness may need a specific
28
- venv Python (e.g. Python 3.12) while the system 'python3' is a different
29
- version (e.g. 3.14) with incompatible site-packages.
30
- """
31
- exe = sys.executable
32
- if exe and os.path.isfile(exe):
33
- return exe
34
- return "python3"
35
-
36
-
37
- def _run_harness_on_task(harness, config, task_input_path, output_path, task_traces_dir, timeout, env=None):
38
- """Run the harness on a single task. Returns (success, elapsed_ms, stdout, stderr)."""
39
- cmd = [_resolve_python(), harness, "--input", task_input_path, "--output", output_path]
40
- if task_traces_dir:
41
- extra_dir = os.path.join(task_traces_dir, "extra")
42
- os.makedirs(extra_dir, exist_ok=True)
43
- cmd.extend(["--traces-dir", extra_dir])
44
- if config and os.path.exists(config):
45
- cmd.extend(["--config", config])
46
-
47
- start = time.time()
48
- try:
49
- result = subprocess.run(
50
- cmd, capture_output=True, text=True, timeout=timeout, env=env,
51
- )
52
- elapsed_ms = (time.time() - start) * 1000
53
- # Accept exit code 0 (success) or check if output file exists for non-zero exits.
54
- # LLM agents with C extensions (numpy, httpx) often segfault (exit 139) during
55
- # Python shutdown AFTER writing correct output.
56
- success = result.returncode == 0
57
- if not success and os.path.exists(output_path):
58
- try:
59
- with open(output_path) as f:
60
- json.load(f)
61
- # Valid JSON output exists despite non-zero exit — treat as success
62
- success = True
63
- except (json.JSONDecodeError, OSError):
64
- pass
65
- return success, elapsed_ms, result.stdout, result.stderr
66
- except subprocess.TimeoutExpired:
67
- elapsed_ms = (time.time() - start) * 1000
68
- return False, elapsed_ms, "", f"TIMEOUT after {timeout}s"
69
- except Exception as e:
70
- elapsed_ms = (time.time() - start) * 1000
71
- return False, elapsed_ms, "", str(e)
72
-
73
-
74
- def cmd_validate(args):
75
- harness = args.harness
76
- config = getattr(args, "config", None)
77
- timeout = getattr(args, "timeout", 30) or 30
78
-
79
- if not os.path.exists(harness):
80
- print(f"FAIL: harness not found: {harness}", file=sys.stderr)
81
- sys.exit(1)
82
-
83
- with tempfile.TemporaryDirectory() as tmpdir:
84
- dummy_task = {"id": "validation", "input": "test input for validation", "metadata": {}}
85
- input_path = os.path.join(tmpdir, "input.json")
86
- output_path = os.path.join(tmpdir, "output.json")
87
- with open(input_path, "w") as f:
88
- json.dump(dummy_task, f)
89
-
90
- success, elapsed, stdout, stderr = _run_harness_on_task(
91
- harness, config, input_path, output_path, None, timeout=timeout,
92
- )
93
-
94
- if not success:
95
- hint = ""
96
- if "TIMEOUT" in stderr:
97
- hint = (f"\nHint: validation timed out after {timeout}s. "
98
- "For LLM-powered agents that make real API calls, "
99
- "use --timeout to increase the limit: "
100
- f"evaluate.py validate --harness {harness} --timeout 120")
101
- print(f"FAIL: harness exited with error.\nstderr: {stderr}{hint}", file=sys.stderr)
102
- sys.exit(1)
103
-
104
- if not os.path.exists(output_path):
105
- print("FAIL: harness did not create output file.", file=sys.stderr)
106
- sys.exit(1)
107
-
108
- try:
109
- with open(output_path) as f:
110
- output = json.load(f)
111
- except (json.JSONDecodeError, ValueError) as e:
112
- print(f"FAIL: output is not valid JSON: {e}", file=sys.stderr)
113
- sys.exit(1)
114
-
115
- if "id" not in output or "output" not in output:
116
- print(f"FAIL: output missing 'id' or 'output' fields. Got: {output}", file=sys.stderr)
117
- sys.exit(1)
118
-
119
- print(f"OK: harness validated in {elapsed:.0f}ms. Output: {output}")
120
-
121
-
122
- def cmd_run(args):
123
- harness = args.harness
124
- config = getattr(args, "config", None)
125
- tasks_dir = args.tasks_dir
126
- eval_script = getattr(args, "eval")
127
- traces_dir = args.traces_dir
128
- scores_path = args.scores
129
- timeout = args.timeout
130
-
131
- os.makedirs(traces_dir, exist_ok=True)
132
-
133
- task_files = sorted(f for f in os.listdir(tasks_dir) if f.endswith(".json"))
134
- if not task_files:
135
- print(f"FAIL: no .json task files in {tasks_dir}", file=sys.stderr)
136
- sys.exit(1)
137
-
138
- all_stdout = []
139
- all_stderr = []
140
- timing = {"per_task": {}}
141
- results_dir = tempfile.mkdtemp()
142
-
143
- # LangSmith: setup auto-tracing env vars if configured
144
- langsmith_env = None
145
- project_config_path = os.path.join(os.path.dirname(os.path.dirname(traces_dir)), "config.json")
146
- if os.path.exists(project_config_path):
147
- with open(project_config_path) as f:
148
- project_config = json.load(f)
149
- ls = project_config.get("eval", {}).get("langsmith", {})
150
- if ls.get("enabled"):
151
- api_key = os.environ.get(ls.get("api_key_env", "LANGSMITH_API_KEY"), "")
152
- if api_key:
153
- version = os.path.basename(os.path.dirname(traces_dir))
154
- ls_project = f"{ls.get('project_prefix', 'harness-evolver')}-{version}"
155
- langsmith_env = {
156
- **os.environ,
157
- "LANGCHAIN_TRACING_V2": "true",
158
- "LANGCHAIN_API_KEY": api_key,
159
- "LANGCHAIN_PROJECT": ls_project,
160
- }
161
- # Write the project name so the evolve skill knows where to find traces
162
- ls_project_file = os.path.join(os.path.dirname(os.path.dirname(traces_dir)), "langsmith_project.txt")
163
- with open(ls_project_file, "w") as f:
164
- f.write(ls_project)
165
-
166
- for task_file in task_files:
167
- task_path = os.path.join(tasks_dir, task_file)
168
- with open(task_path) as f:
169
- task = json.load(f)
170
- task_id = task["id"]
171
-
172
- task_input = {k: v for k, v in task.items() if k != "expected"}
173
-
174
- task_traces_dir = os.path.join(traces_dir, task_id)
175
- os.makedirs(task_traces_dir, exist_ok=True)
176
-
177
- input_path = os.path.join(task_traces_dir, "input.json")
178
- with open(input_path, "w") as f:
179
- json.dump(task_input, f, indent=2)
180
-
181
- output_path = os.path.join(results_dir, task_file)
182
-
183
- success, elapsed_ms, stdout, stderr = _run_harness_on_task(
184
- harness, config, input_path, output_path, task_traces_dir, timeout,
185
- env=langsmith_env,
186
- )
187
-
188
- if os.path.exists(output_path):
189
- shutil.copy2(output_path, os.path.join(task_traces_dir, "output.json"))
190
- else:
191
- with open(os.path.join(task_traces_dir, "output.json"), "w") as f:
192
- json.dump({"id": task_id, "output": "", "error": "harness failed"}, f)
193
-
194
- timing["per_task"][task_id] = round(elapsed_ms, 1)
195
- all_stdout.append(f"--- {task_id} ---\n{stdout}")
196
- all_stderr.append(f"--- {task_id} ---\n{stderr}")
197
-
198
- timing["total_ms"] = round(sum(timing["per_task"].values()), 1)
199
- with open(os.path.join(traces_dir, "timing.json"), "w") as f:
200
- json.dump(timing, f, indent=2)
201
- with open(os.path.join(traces_dir, "stdout.log"), "w") as f:
202
- f.write("\n".join(all_stdout))
203
- with open(os.path.join(traces_dir, "stderr.log"), "w") as f:
204
- f.write("\n".join(all_stderr))
205
-
206
- eval_cmd = [
207
- _resolve_python(), eval_script,
208
- "--results-dir", results_dir,
209
- "--tasks-dir", tasks_dir,
210
- "--scores", scores_path,
211
- ]
212
- result = subprocess.run(eval_cmd, capture_output=True, text=True, timeout=120)
213
- if result.returncode != 0:
214
- print(f"FAIL: eval script failed.\nstderr: {result.stderr}", file=sys.stderr)
215
- sys.exit(1)
216
-
217
- if os.path.exists(scores_path):
218
- scores = json.load(open(scores_path))
219
- print(f"Evaluation complete. combined_score: {scores.get('combined_score', 'N/A')}")
220
- else:
221
- print("WARNING: eval script did not produce scores file.", file=sys.stderr)
222
-
223
-
224
- def main():
225
- parser = argparse.ArgumentParser(description="Harness Evolver evaluation orchestrator")
226
- sub = parser.add_subparsers(dest="command")
227
-
228
- p_val = sub.add_parser("validate")
229
- p_val.add_argument("--harness", required=True)
230
- p_val.add_argument("--config", default=None)
231
- p_val.add_argument("--timeout", type=int, default=30,
232
- help="Validation timeout in seconds (default: 30). "
233
- "Increase for LLM-powered agents that make real API calls.")
234
-
235
- p_run = sub.add_parser("run")
236
- p_run.add_argument("--harness", required=True)
237
- p_run.add_argument("--config", default=None)
238
- p_run.add_argument("--tasks-dir", required=True)
239
- p_run.add_argument("--eval", required=True)
240
- p_run.add_argument("--traces-dir", required=True)
241
- p_run.add_argument("--scores", required=True)
242
- p_run.add_argument("--timeout", type=int, default=60)
243
-
244
- args = parser.parse_args()
245
- if args.command == "validate":
246
- cmd_validate(args)
247
- elif args.command == "run":
248
- cmd_run(args)
249
- else:
250
- parser.print_help()
251
- sys.exit(1)
252
-
253
-
254
- if __name__ == "__main__":
255
- main()
package/tools/import_traces.py DELETED
@@ -1,229 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Import LangSmith Traces as Eval Tasks for Harness Evolver.
3
-
4
- Transforms LangSmith trace JSON (from langsmith-cli) into task JSON files
5
- for the evaluation set. Prioritizes traces with negative feedback.
6
-
7
- Usage:
8
- python3 import_traces.py \
9
- --traces-json /tmp/langsmith_traces.json \
10
- --output-dir .harness-evolver/eval/tasks/ \
11
- --prefix imported \
12
- [--max-tasks 30]
13
-
14
- Stdlib-only. No external dependencies.
15
- """
16
-
17
- import argparse
18
- import hashlib
19
- import json
20
- import os
21
- import re
22
- import sys
23
-
24
-
25
- def load_json(path):
26
- """Load JSON file, return None if missing or invalid."""
27
- if not path or not os.path.exists(path):
28
- return None
29
- try:
30
- with open(path) as f:
31
- return json.load(f)
32
- except (json.JSONDecodeError, OSError):
33
- return None
34
-
35
-
36
- def extract_input_from_trace(run):
37
- """Extract the user input from a LangSmith run's inputs field.
38
-
39
- Handles multiple LangChain serialization formats:
40
- - Direct {"input": "..."} field
41
- - {"messages": [[HumanMessage, ...]]} format
42
- - {"question": "..."} or {"query": "..."} fields
43
- """
44
- inputs = run.get("inputs", {})
45
- if not inputs:
46
- return None
47
-
48
- if isinstance(inputs, str):
49
- return inputs
50
-
51
- # Direct input field
52
- for key in ("input", "question", "query", "prompt", "text", "user_input"):
53
- if key in inputs and isinstance(inputs[key], str):
54
- return inputs[key]
55
-
56
- # LangChain messages format
57
- messages = inputs.get("messages") or inputs.get("input")
58
- if isinstance(messages, list):
59
- # Might be [[msg1, msg2]] (batched) or [msg1, msg2]
60
- if messages and isinstance(messages[0], list):
61
- messages = messages[0]
62
- for msg in messages:
63
- if isinstance(msg, dict):
64
- # {"type": "human", "content": "..."}
65
- if msg.get("type") in ("human", "HumanMessage") or msg.get("role") == "user":
66
- content = msg.get("content", "")
67
- if isinstance(content, str) and content:
68
- return content
69
- if isinstance(content, list):
70
- # Multi-modal: [{"type": "text", "text": "..."}]
71
- for part in content:
72
- if isinstance(part, dict) and part.get("type") == "text":
73
- return part.get("text", "")
74
- elif isinstance(msg, str) and msg:
75
- return msg
76
-
77
- # Fallback: stringify the whole inputs
78
- flat = json.dumps(inputs)
79
- if len(flat) > 20: # Only if there's meaningful content
80
- return flat[:2000]
81
-
82
- return None
83
-
84
-
85
- def extract_feedback(run):
86
- """Extract user feedback from a LangSmith run."""
87
- feedback = run.get("feedback_stats") or run.get("feedback") or {}
88
- if not feedback:
89
- return None
90
-
91
- # feedback_stats format: {"thumbs_up": N, "thumbs_down": N}
92
- if isinstance(feedback, dict):
93
- up = feedback.get("thumbs_up", 0) or feedback.get("positive", 0)
94
- down = feedback.get("thumbs_down", 0) or feedback.get("negative", 0)
95
- if down > 0:
96
- return "negative"
97
- if up > 0:
98
- return "positive"
99
- return None
100
-
101
-
102
- def infer_difficulty(text):
103
- """Infer difficulty from input characteristics."""
104
- if not text:
105
- return "medium"
106
- length = len(text)
107
- # Count question marks, clauses, etc.
108
- questions = text.count("?")
109
- sentences = len(re.split(r"[.!?]+", text))
110
-
111
- if length < 50 and questions <= 1:
112
- return "easy"
113
- if length > 500 or questions > 2 or sentences > 5:
114
- return "hard"
115
- return "medium"
116
-
117
-
118
- def short_id(run_id):
119
- """Create a short deterministic ID from a full run ID."""
120
- return hashlib.md5(str(run_id).encode()).hexdigest()[:8]
121
-
122
-
123
- def main():
124
- parser = argparse.ArgumentParser(description="Import LangSmith traces as eval tasks")
125
- parser.add_argument("--traces-json", required=True, help="Path to langsmith-cli JSON output")
126
- parser.add_argument("--output-dir", required=True, help="Directory to write task JSON files")
127
- parser.add_argument("--prefix", default="imported", help="Prefix for task IDs (default: imported)")
128
- parser.add_argument("--max-tasks", type=int, default=30, help="Max tasks to import (default: 30)")
129
- parser.add_argument("--prioritize-negative", action="store_true", default=True,
130
- help="Import negative-feedback traces first (default: true)")
131
- args = parser.parse_args()
132
-
133
- traces = load_json(args.traces_json)
134
- if not traces:
135
- print("No traces found or invalid JSON — nothing to import")
136
- return
137
-
138
- if isinstance(traces, dict):
139
- # Might be wrapped in {"runs": [...]}
140
- traces = traces.get("runs", traces.get("data", [traces]))
141
-
142
- if not isinstance(traces, list):
143
- print("Unexpected traces format — expected a JSON array")
144
- return
145
-
146
- # Sort: negative feedback first, then errors, then the rest
147
- if args.prioritize_negative:
148
- def priority(run):
149
- fb = extract_feedback(run)
150
- has_error = bool(run.get("error"))
151
- if fb == "negative":
152
- return 0
153
- if has_error:
154
- return 1
155
- return 2
156
- traces.sort(key=priority)
157
-
158
- os.makedirs(args.output_dir, exist_ok=True)
159
-
160
- # Check for existing imported tasks to avoid duplicates
161
- existing_run_ids = set()
162
- for fname in os.listdir(args.output_dir):
163
- if fname.endswith(".json"):
164
- task = load_json(os.path.join(args.output_dir, fname))
165
- if task and task.get("metadata", {}).get("langsmith_run_id"):
166
- existing_run_ids.add(task["metadata"]["langsmith_run_id"])
167
-
168
- imported = 0
169
- skipped_no_input = 0
170
- skipped_duplicate = 0
171
- negative_count = 0
172
-
173
- for run in traces:
174
- if imported >= args.max_tasks:
175
- break
176
-
177
- run_id = str(run.get("id", ""))
178
- if run_id in existing_run_ids:
179
- skipped_duplicate += 1
180
- continue
181
-
182
- user_input = extract_input_from_trace(run)
183
- if not user_input or len(user_input.strip()) < 5:
184
- skipped_no_input += 1
185
- continue
186
-
187
- feedback = extract_feedback(run)
188
- has_error = bool(run.get("error"))
189
- task_id = f"{args.prefix}_{short_id(run_id)}"
190
-
191
- task = {
192
- "id": task_id,
193
- "input": user_input.strip(),
194
- "metadata": {
195
- "difficulty": infer_difficulty(user_input),
196
- "category": run.get("name", "unknown"),
197
- "type": "production",
198
- "source": "imported",
199
- "langsmith_run_id": run_id,
200
- "had_error": has_error,
201
- "user_feedback": feedback,
202
- },
203
- }
204
-
205
- out_path = os.path.join(args.output_dir, f"{task_id}.json")
206
- with open(out_path, "w") as f:
207
- json.dump(task, f, indent=2)
208
-
209
- imported += 1
210
- if feedback == "negative":
211
- negative_count += 1
212
-
213
- summary = {
214
- "imported": imported,
215
- "negative_feedback": negative_count,
216
- "skipped_no_input": skipped_no_input,
217
- "skipped_duplicate": skipped_duplicate,
218
- "total_traces": len(traces),
219
- }
220
- print(json.dumps(summary))
221
- print(f"Imported {imported} production traces as tasks ({negative_count} with negative feedback)")
222
- if skipped_duplicate:
223
- print(f" Skipped {skipped_duplicate} already-imported traces")
224
- if skipped_no_input:
225
- print(f" Skipped {skipped_no_input} traces with no extractable input")
226
-
227
-
228
- if __name__ == "__main__":
229
- main()