npm - harness-evolver - Versions diffs - 2.6.1 → 2.7.0 - Mend

harness-evolver 2.6.1 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/package.json +1 -1
package/skills/init/SKILL.md +22 -0
package/tools/analyze_architecture.py +56 -2
package/tools/evaluate.py +29 -5
package/tools/init.py +44 -16

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "harness-evolver",
-  "version": "2.6.1",
+  "version": "2.7.0",
   "description": "Meta-Harness-style autonomous harness optimization for Claude Code",
   "author": "Raphael Valdetaro",
   "license": "MIT",

package/skills/init/SKILL.md CHANGED Viewed

@@ -103,6 +103,25 @@ python3 $TOOLS/init.py [directory] \
 Add `--harness-config config.json` if a config exists.
+For **LLM-powered agents** that make real API calls (LangGraph, CrewAI, etc.) and take
+more than 30 seconds per invocation, increase the validation timeout:
+```bash
+python3 $TOOLS/init.py [directory] \
+    --harness harness.py --eval eval.py --tasks tasks/ \
+    --tools-dir $TOOLS \
+    --validation-timeout 120
+```
+If validation keeps timing out but you've verified the harness works manually, skip it:
+```bash
+python3 $TOOLS/init.py [directory] \
+    --harness harness.py --eval eval.py --tasks tasks/ \
+    --tools-dir $TOOLS \
+    --skip-validation
+```
 ## After Init — Report
 - What was detected vs created
@@ -132,3 +151,6 @@ This is advisory only — do not spawn the architect agent.
 - The `expected` field is never shown to the harness — only the eval script sees it.
 - If `.harness-evolver/` already exists, warn before overwriting.
 - If no Python files exist in CWD, the user is probably in the wrong directory.
+- **Monorepo / venv mismatch**: In monorepos with dedicated venvs per app, the system `python3` may differ from the project's Python version. The harness wrapper should re-exec with the correct venv Python. The tools now use `sys.executable` instead of hardcoded `python3`.
+- **Stale site-packages**: If the project uses editable installs (`pip install -e .`), packages in `site-packages/` may have stale copies of data files (e.g. registry YAMLs). Run `uv pip install -e . --force-reinstall --no-deps` to sync.
+- **Validation timeout**: LLM agents making real API calls typically take 15-60s per invocation. Use `--validation-timeout 120` or `--skip-validation` to handle this.

package/tools/analyze_architecture.py CHANGED Viewed

@@ -472,12 +472,60 @@ def analyze_scores(summary_path):
 # --- Main ---
+def analyze_multiple(file_paths):
+    """Analyze multiple Python files and merge their signals.
+    Useful in monorepo setups where the harness is a thin wrapper that
+    delegates to the actual agent code. Pass the harness AND the main
+    agent source files for a comprehensive topology classification.
+    """
+    merged = {
+        "llm_call_count": 0,
+        "has_loop_around_llm": False,
+        "has_tool_definitions": False,
+        "has_retrieval": False,
+        "has_graph_framework": False,
+        "has_parallel_execution": False,
+        "has_error_handling": False,
+        "code_lines": 0,
+        "function_count": 0,
+        "class_count": 0,
+        "files_analyzed": [],
+    }
+    for path in file_paths:
+        if not os.path.isfile(path):
+            continue
+        try:
+            signals = analyze_code(path)
+        except Exception:
+            continue
+        merged["llm_call_count"] += signals.get("llm_call_count", 0)
+        merged["code_lines"] += signals.get("code_lines", 0)
+        merged["function_count"] += signals.get("function_count", 0)
+        merged["class_count"] += signals.get("class_count", 0)
+        merged["files_analyzed"].append(os.path.basename(path))
+        for bool_key in ["has_loop_around_llm", "has_tool_definitions", "has_retrieval",
+                         "has_graph_framework", "has_parallel_execution", "has_error_handling"]:
+            if signals.get(bool_key):
+                merged[bool_key] = True
+    merged["estimated_topology"] = _estimate_topology(merged)
+    return merged
 def main():
     parser = argparse.ArgumentParser(
         description="Analyze harness architecture and produce signals for the architect agent",
-        usage="analyze_architecture.py --harness PATH [--traces-dir PATH] [--summary PATH] [-o output.json]",
+        usage="analyze_architecture.py --harness PATH [--source-files PATH ...] "
+              "[--traces-dir PATH] [--summary PATH] [-o output.json]",
     )
     parser.add_argument("--harness", required=True, help="Path to harness Python file")
+    parser.add_argument("--source-files", nargs="*", default=None,
+                        help="Additional source files to analyze (e.g. the actual agent code). "
+                             "Useful when the harness is a thin wrapper around a larger system.")
     parser.add_argument("--traces-dir", default=None, help="Path to traces directory")
     parser.add_argument("--summary", default=None, help="Path to summary.json")
     parser.add_argument("-o", "--output", default=None, help="Output JSON path")
@@ -487,8 +535,14 @@ def main():
         print(json.dumps({"error": f"Harness file not found: {args.harness}"}))
         sys.exit(1)
+    if args.source_files:
+        all_files = [args.harness] + [f for f in args.source_files if os.path.isfile(f)]
+        code_signals = analyze_multiple(all_files)
+    else:
+        code_signals = analyze_code(args.harness)
     result = {
-        "code_signals": analyze_code(args.harness),
+        "code_signals": code_signals,
         "trace_signals": None,
         "score_signals": None,
     }

package/tools/evaluate.py CHANGED Viewed

@@ -2,7 +2,7 @@
 """Evaluation orchestrator for Harness Evolver.
 Commands:
-    validate --harness PATH [--config PATH]
+    validate --harness PATH [--config PATH] [--timeout SECONDS]
     run      --harness PATH --tasks-dir PATH --eval PATH --traces-dir PATH --scores PATH
              [--config PATH] [--timeout SECONDS]
@@ -20,9 +20,23 @@ import tempfile
 import time
+def _resolve_python():
+    """Resolve the Python interpreter to use for subprocesses.
+    Prefers the current interpreter (sys.executable) over a hardcoded 'python3'.
+    This is critical in monorepo setups where the harness may need a specific
+    venv Python (e.g. Python 3.12) while the system 'python3' is a different
+    version (e.g. 3.14) with incompatible site-packages.
+    """
+    exe = sys.executable
+    if exe and os.path.isfile(exe):
+        return exe
+    return "python3"
 def _run_harness_on_task(harness, config, task_input_path, output_path, task_traces_dir, timeout, env=None):
     """Run the harness on a single task. Returns (success, elapsed_ms, stdout, stderr)."""
-    cmd = ["python3", harness, "--input", task_input_path, "--output", output_path]
+    cmd = [_resolve_python(), harness, "--input", task_input_path, "--output", output_path]
     if task_traces_dir:
         extra_dir = os.path.join(task_traces_dir, "extra")
         os.makedirs(extra_dir, exist_ok=True)
@@ -48,6 +62,7 @@ def _run_harness_on_task(harness, config, task_input_path, output_path, task_tra
 def cmd_validate(args):
     harness = args.harness
     config = getattr(args, "config", None)
+    timeout = getattr(args, "timeout", 30) or 30
     if not os.path.exists(harness):
         print(f"FAIL: harness not found: {harness}", file=sys.stderr)
@@ -61,11 +76,17 @@ def cmd_validate(args):
             json.dump(dummy_task, f)
         success, elapsed, stdout, stderr = _run_harness_on_task(
-            harness, config, input_path, output_path, None, timeout=30,
+            harness, config, input_path, output_path, None, timeout=timeout,
         )
         if not success:
-            print(f"FAIL: harness exited with error.\nstderr: {stderr}", file=sys.stderr)
+            hint = ""
+            if "TIMEOUT" in stderr:
+                hint = (f"\nHint: validation timed out after {timeout}s. "
+                        "For LLM-powered agents that make real API calls, "
+                        "use --timeout to increase the limit: "
+                        f"evaluate.py validate --harness {harness} --timeout 120")
+            print(f"FAIL: harness exited with error.\nstderr: {stderr}{hint}", file=sys.stderr)
             sys.exit(1)
         if not os.path.exists(output_path):
@@ -171,7 +192,7 @@ def cmd_run(args):
         f.write("\n".join(all_stderr))
     eval_cmd = [
-        "python3", eval_script,
+        _resolve_python(), eval_script,
         "--results-dir", results_dir,
         "--tasks-dir", tasks_dir,
         "--scores", scores_path,
@@ -195,6 +216,9 @@ def main():
     p_val = sub.add_parser("validate")
     p_val.add_argument("--harness", required=True)
     p_val.add_argument("--config", default=None)
+    p_val.add_argument("--timeout", type=int, default=30,
+                       help="Validation timeout in seconds (default: 30). "
+                            "Increase for LLM-powered agents that make real API calls.")
     p_run = sub.add_parser("run")
     p_run.add_argument("--harness", required=True)

package/tools/init.py CHANGED Viewed

@@ -134,6 +134,19 @@ def _check_langsmith_cli():
         return False
+def _resolve_python():
+    """Resolve the Python interpreter for subprocesses.
+    Uses the current interpreter (sys.executable) instead of hardcoded 'python3'.
+    This prevents version mismatches in monorepo setups where the harness may
+    need a specific venv Python different from the system python3.
+    """
+    exe = sys.executable
+    if exe and os.path.isfile(exe):
+        return exe
+    return "python3"
 def _detect_stack(harness_path):
     """Detect technology stack from harness imports."""
     detect_stack_py = os.path.join(os.path.dirname(__file__), "detect_stack.py")
@@ -141,7 +154,7 @@ def _detect_stack(harness_path):
         return {}
     try:
         r = subprocess.run(
-            ["python3", detect_stack_py, harness_path],
+            [_resolve_python(), detect_stack_py, harness_path],
             capture_output=True, text=True, timeout=30,
         )
         if r.returncode == 0 and r.stdout.strip():
@@ -183,6 +196,12 @@ def main():
     parser.add_argument("--base-dir", default=None, help="Path for .harness-evolver/")
     parser.add_argument("--harness-config", default=None, help="Path to harness config.json")
     parser.add_argument("--tools-dir", default=None, help="Path to tools directory")
+    parser.add_argument("--validation-timeout", type=int, default=30,
+                        help="Timeout for harness validation in seconds (default: 30). "
+                             "Increase for LLM-powered agents that make real API calls.")
+    parser.add_argument("--skip-validation", action="store_true",
+                        help="Skip harness validation step. Use when you know the harness "
+                             "works but validation times out (e.g. real LLM agent calls).")
     args = parser.parse_args()
     # Auto-detect missing args
@@ -309,7 +328,7 @@ def main():
         if os.path.exists(detect_stack_py):
             try:
                 r = subprocess.run(
-                    ["python3", detect_stack_py, harness_dir],
+                    [_resolve_python(), detect_stack_py, harness_dir],
                     capture_output=True, text=True, timeout=30,
                 )
                 if r.returncode == 0 and r.stdout.strip():
@@ -338,7 +357,7 @@ def main():
     if os.path.exists(analyze_py):
         try:
             r = subprocess.run(
-                ["python3", analyze_py, "--harness", args.harness],
+                [_resolve_python(), analyze_py, "--harness", args.harness],
                 capture_output=True, text=True, timeout=30,
             )
             if r.returncode == 0 and r.stdout.strip():
@@ -357,30 +376,39 @@ def main():
             pass
     # 5. Validate baseline harness
-    print("Validating baseline harness...")
-    val_args = ["python3", evaluate_py, "validate",
-                "--harness", os.path.join(base, "baseline", "harness.py")]
     config_path = os.path.join(base, "baseline", "config.json")
-    if os.path.exists(config_path):
-        val_args.extend(["--config", config_path])
-    r = subprocess.run(val_args, capture_output=True, text=True)
-    if r.returncode != 0:
-        print(f"FAIL: baseline harness validation failed.\n{r.stderr}", file=sys.stderr)
-        sys.exit(1)
-    print(r.stdout.strip())
+    if args.skip_validation:
+        print("Skipping baseline validation (--skip-validation).")
+    else:
+        print(f"Validating baseline harness (timeout: {args.validation_timeout}s)...")
+        val_args = [_resolve_python(), evaluate_py, "validate",
+                    "--harness", os.path.join(base, "baseline", "harness.py"),
+                    "--timeout", str(args.validation_timeout)]
+        if os.path.exists(config_path):
+            val_args.extend(["--config", config_path])
+        r = subprocess.run(val_args, capture_output=True, text=True)
+        if r.returncode != 0:
+            hint = ""
+            if "TIMEOUT" in r.stderr:
+                hint = (f"\n\nHint: The harness timed out after {args.validation_timeout}s. "
+                        "This is common for LLM-powered agents that make real API calls.\n"
+                        "Try: --validation-timeout 120  (or --skip-validation to bypass)")
+            print(f"FAIL: baseline harness validation failed.\n{r.stderr}{hint}", file=sys.stderr)
+            sys.exit(1)
+        print(r.stdout.strip())
     # 6. Evaluate baseline
     print("Evaluating baseline harness...")
     baseline_traces = tempfile.mkdtemp()
     baseline_scores = os.path.join(base, "baseline_scores.json")
     eval_args = [
-        "python3", evaluate_py, "run",
+        _resolve_python(), evaluate_py, "run",
         "--harness", os.path.join(base, "baseline", "harness.py"),
         "--tasks-dir", os.path.join(base, "eval", "tasks"),
         "--eval", os.path.join(base, "eval", "eval.py"),
         "--traces-dir", baseline_traces,
         "--scores", baseline_scores,
-        "--timeout", "60",
+        "--timeout", str(max(args.validation_timeout, 60)),
     ]
     if os.path.exists(config_path):
         eval_args.extend(["--config", config_path])
@@ -399,7 +427,7 @@ def main():
     # 7. Initialize state with baseline score
     print(f"Baseline score: {baseline_score:.2f}")
     r = subprocess.run(
-        ["python3", state_py, "init",
+        [_resolve_python(), state_py, "init",
          "--base-dir", base,
          "--baseline-score", str(baseline_score)],
         capture_output=True, text=True,