harness-evolver 2.8.0 → 2.8.1

This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "harness-evolver",
3
- "version": "2.8.0",
3
+ "version": "2.8.1",
4
4
  "description": "Meta-Harness-style autonomous harness optimization for Claude Code",
5
5
  "author": "Raphael Valdetaro",
6
6
  "license": "MIT",
package/tools/evaluate.py CHANGED
@@ -50,7 +50,19 @@ def _run_harness_on_task(harness, config, task_input_path, output_path, task_tra
50
50
  cmd, capture_output=True, text=True, timeout=timeout, env=env,
51
51
  )
52
52
  elapsed_ms = (time.time() - start) * 1000
53
- return result.returncode == 0, elapsed_ms, result.stdout, result.stderr
53
+ # Accept exit code 0 (success) or check if output file exists for non-zero exits.
54
+ # LLM agents with C extensions (numpy, httpx) often segfault (exit 139) during
55
+ # Python shutdown AFTER writing correct output.
56
+ success = result.returncode == 0
57
+ if not success and os.path.exists(output_path):
58
+ try:
59
+ with open(output_path) as f:
60
+ json.load(f)
61
+ # Valid JSON output exists despite non-zero exit — treat as success
62
+ success = True
63
+ except (json.JSONDecodeError, OSError):
64
+ pass
65
+ return success, elapsed_ms, result.stdout, result.stderr
54
66
  except subprocess.TimeoutExpired:
55
67
  elapsed_ms = (time.time() - start) * 1000
56
68
  return False, elapsed_ms, "", f"TIMEOUT after {timeout}s"
package/tools/init.py CHANGED
@@ -391,11 +391,21 @@ def main():
391
391
  print(" claude mcp add context7 -- npx -y @upstash/context7-mcp@latest")
392
392
 
393
393
  # Architecture analysis (quick, advisory)
394
+ # Auto-detect additional source files by scanning for .py files near the harness
394
395
  analyze_py = os.path.join(tools, "analyze_architecture.py")
395
396
  if os.path.exists(analyze_py):
396
397
  try:
398
+ harness_dir = os.path.dirname(os.path.abspath(args.harness))
399
+ source_files = []
400
+ for fname in os.listdir(harness_dir):
401
+ fpath = os.path.join(harness_dir, fname)
402
+ if fname.endswith(".py") and os.path.isfile(fpath) and fpath != os.path.abspath(args.harness):
403
+ source_files.append(fpath)
404
+ arch_cmd = [_resolve_python(), analyze_py, "--harness", args.harness]
405
+ if source_files:
406
+ arch_cmd.extend(["--source-files"] + source_files[:10])
397
407
  r = subprocess.run(
398
- [_resolve_python(), analyze_py, "--harness", args.harness],
408
+ arch_cmd,
399
409
  capture_output=True, text=True, timeout=30,
400
410
  )
401
411
  if r.returncode == 0 and r.stdout.strip():
@@ -461,7 +471,10 @@ def main():
461
471
  print(r.stdout.strip())
462
472
 
463
473
  # 6. Evaluate baseline
464
- print("Evaluating baseline harness...")
474
+ num_tasks = len([f for f in os.listdir(os.path.join(base, "eval", "tasks")) if f.endswith(".json")])
475
+ per_task_timeout = max(args.validation_timeout, 60)
476
+ eval_timeout = max(num_tasks * per_task_timeout + 60, 300)
477
+ print(f"Evaluating baseline harness ({num_tasks} tasks, timeout: {eval_timeout}s)...")
465
478
  baseline_traces = tempfile.mkdtemp()
466
479
  baseline_scores = os.path.join(base, "baseline_scores.json")
467
480
  eval_args = [
@@ -471,18 +484,28 @@ def main():
471
484
  "--eval", os.path.join(base, "eval", "eval.py"),
472
485
  "--traces-dir", baseline_traces,
473
486
  "--scores", baseline_scores,
474
- "--timeout", str(max(args.validation_timeout, 60)),
487
+ "--timeout", str(per_task_timeout),
475
488
  ]
476
489
  if os.path.exists(config_path):
477
490
  eval_args.extend(["--config", config_path])
478
- r = subprocess.run(eval_args, capture_output=True, text=True, timeout=300)
479
- if r.returncode != 0:
491
+ try:
492
+ r = subprocess.run(eval_args, capture_output=True, text=True, timeout=eval_timeout)
493
+ except subprocess.TimeoutExpired:
494
+ print(f"WARNING: baseline evaluation timed out after {eval_timeout}s "
495
+ f"({num_tasks} tasks at {per_task_timeout}s/task). "
496
+ f"Using score 0.0. Run evaluation separately with more time.",
497
+ file=sys.stderr)
498
+ r = None
499
+ if r is not None and r.returncode != 0:
480
500
  print(f"WARNING: baseline evaluation failed. Using score 0.0.\n{r.stderr}", file=sys.stderr)
481
- baseline_score = 0.0
482
- else:
501
+ if r is not None and r.returncode == 0:
483
502
  print(r.stdout.strip())
484
- scores = json.load(open(baseline_scores))
503
+ if r is not None and r.returncode == 0 and os.path.exists(baseline_scores):
504
+ with open(baseline_scores) as f:
505
+ scores = json.load(f)
485
506
  baseline_score = scores.get("combined_score", 0.0)
507
+ else:
508
+ baseline_score = 0.0
486
509
 
487
510
  if os.path.exists(baseline_scores):
488
511
  os.remove(baseline_scores)