npm - harness-evolver - Versions diffs - 2.8.0 → 2.8.1 - Mend

harness-evolver 2.8.0 → 2.8.1

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (5)

package/package.json +1 -1
package/tools/__pycache__/init.cpython-313.pyc +0 -0
package/tools/__pycache__/seed_from_traces.cpython-313.pyc +0 -0
package/tools/evaluate.py +13 -1
package/tools/init.py +31 -8

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "harness-evolver",
-  "version": "2.8.0",
+  "version": "2.8.1",
   "description": "Meta-Harness-style autonomous harness optimization for Claude Code",
   "author": "Raphael Valdetaro",
   "license": "MIT",

package/tools/__pycache__/init.cpython-313.pyc CHANGED Viewed

Binary file

package/tools/__pycache__/seed_from_traces.cpython-313.pyc CHANGED Viewed

Binary file

package/tools/evaluate.py CHANGED Viewed

@@ -50,7 +50,19 @@ def _run_harness_on_task(harness, config, task_input_path, output_path, task_tra
             cmd, capture_output=True, text=True, timeout=timeout, env=env,
         )
         elapsed_ms = (time.time() - start) * 1000
-        return result.returncode == 0, elapsed_ms, result.stdout, result.stderr
+        # Accept exit code 0 (success) or check if output file exists for non-zero exits.
+        # LLM agents with C extensions (numpy, httpx) often segfault (exit 139) during
+        # Python shutdown AFTER writing correct output.
+        success = result.returncode == 0
+        if not success and os.path.exists(output_path):
+            try:
+                with open(output_path) as f:
+                    json.load(f)
+                # Valid JSON output exists despite non-zero exit — treat as success
+                success = True
+            except (json.JSONDecodeError, OSError):
+                pass
+        return success, elapsed_ms, result.stdout, result.stderr
     except subprocess.TimeoutExpired:
         elapsed_ms = (time.time() - start) * 1000
         return False, elapsed_ms, "", f"TIMEOUT after {timeout}s"

package/tools/init.py CHANGED Viewed

@@ -391,11 +391,21 @@ def main():
             print("  claude mcp add context7 -- npx -y @upstash/context7-mcp@latest")
     # Architecture analysis (quick, advisory)
+    # Auto-detect additional source files by scanning for .py files near the harness
     analyze_py = os.path.join(tools, "analyze_architecture.py")
     if os.path.exists(analyze_py):
         try:
+            harness_dir = os.path.dirname(os.path.abspath(args.harness))
+            source_files = []
+            for fname in os.listdir(harness_dir):
+                fpath = os.path.join(harness_dir, fname)
+                if fname.endswith(".py") and os.path.isfile(fpath) and fpath != os.path.abspath(args.harness):
+                    source_files.append(fpath)
+            arch_cmd = [_resolve_python(), analyze_py, "--harness", args.harness]
+            if source_files:
+                arch_cmd.extend(["--source-files"] + source_files[:10])
             r = subprocess.run(
-                [_resolve_python(), analyze_py, "--harness", args.harness],
+                arch_cmd,
                 capture_output=True, text=True, timeout=30,
             )
             if r.returncode == 0 and r.stdout.strip():
@@ -461,7 +471,10 @@ def main():
         print(r.stdout.strip())
     # 6. Evaluate baseline
-    print("Evaluating baseline harness...")
+    num_tasks = len([f for f in os.listdir(os.path.join(base, "eval", "tasks")) if f.endswith(".json")])
+    per_task_timeout = max(args.validation_timeout, 60)
+    eval_timeout = max(num_tasks * per_task_timeout + 60, 300)
+    print(f"Evaluating baseline harness ({num_tasks} tasks, timeout: {eval_timeout}s)...")
     baseline_traces = tempfile.mkdtemp()
     baseline_scores = os.path.join(base, "baseline_scores.json")
     eval_args = [
@@ -471,18 +484,28 @@ def main():
         "--eval", os.path.join(base, "eval", "eval.py"),
         "--traces-dir", baseline_traces,
         "--scores", baseline_scores,
-        "--timeout", str(max(args.validation_timeout, 60)),
+        "--timeout", str(per_task_timeout),
     ]
     if os.path.exists(config_path):
         eval_args.extend(["--config", config_path])
-    r = subprocess.run(eval_args, capture_output=True, text=True, timeout=300)
-    if r.returncode != 0:
+    try:
+        r = subprocess.run(eval_args, capture_output=True, text=True, timeout=eval_timeout)
+    except subprocess.TimeoutExpired:
+        print(f"WARNING: baseline evaluation timed out after {eval_timeout}s "
+              f"({num_tasks} tasks at {per_task_timeout}s/task). "
+              f"Using score 0.0. Run evaluation separately with more time.",
+              file=sys.stderr)
+        r = None
+    if r is not None and r.returncode != 0:
         print(f"WARNING: baseline evaluation failed. Using score 0.0.\n{r.stderr}", file=sys.stderr)
-        baseline_score = 0.0
-    else:
+    if r is not None and r.returncode == 0:
         print(r.stdout.strip())
-        scores = json.load(open(baseline_scores))
+    if r is not None and r.returncode == 0 and os.path.exists(baseline_scores):
+        with open(baseline_scores) as f:
+            scores = json.load(f)
         baseline_score = scores.get("combined_score", 0.0)
+    else:
+        baseline_score = 0.0
     if os.path.exists(baseline_scores):
         os.remove(baseline_scores)