harness-evolver 2.8.0 → 2.8.1
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
Binary file
|
|
Binary file
|
package/tools/evaluate.py
CHANGED
|
@@ -50,7 +50,19 @@ def _run_harness_on_task(harness, config, task_input_path, output_path, task_tra
|
|
|
50
50
|
cmd, capture_output=True, text=True, timeout=timeout, env=env,
|
|
51
51
|
)
|
|
52
52
|
elapsed_ms = (time.time() - start) * 1000
|
|
53
|
-
|
|
53
|
+
# Accept exit code 0 (success) or check if output file exists for non-zero exits.
|
|
54
|
+
# LLM agents with C extensions (numpy, httpx) often segfault (exit 139) during
|
|
55
|
+
# Python shutdown AFTER writing correct output.
|
|
56
|
+
success = result.returncode == 0
|
|
57
|
+
if not success and os.path.exists(output_path):
|
|
58
|
+
try:
|
|
59
|
+
with open(output_path) as f:
|
|
60
|
+
json.load(f)
|
|
61
|
+
# Valid JSON output exists despite non-zero exit — treat as success
|
|
62
|
+
success = True
|
|
63
|
+
except (json.JSONDecodeError, OSError):
|
|
64
|
+
pass
|
|
65
|
+
return success, elapsed_ms, result.stdout, result.stderr
|
|
54
66
|
except subprocess.TimeoutExpired:
|
|
55
67
|
elapsed_ms = (time.time() - start) * 1000
|
|
56
68
|
return False, elapsed_ms, "", f"TIMEOUT after {timeout}s"
|
package/tools/init.py
CHANGED
|
@@ -391,11 +391,21 @@ def main():
|
|
|
391
391
|
print(" claude mcp add context7 -- npx -y @upstash/context7-mcp@latest")
|
|
392
392
|
|
|
393
393
|
# Architecture analysis (quick, advisory)
|
|
394
|
+
# Auto-detect additional source files by scanning for .py files near the harness
|
|
394
395
|
analyze_py = os.path.join(tools, "analyze_architecture.py")
|
|
395
396
|
if os.path.exists(analyze_py):
|
|
396
397
|
try:
|
|
398
|
+
harness_dir = os.path.dirname(os.path.abspath(args.harness))
|
|
399
|
+
source_files = []
|
|
400
|
+
for fname in os.listdir(harness_dir):
|
|
401
|
+
fpath = os.path.join(harness_dir, fname)
|
|
402
|
+
if fname.endswith(".py") and os.path.isfile(fpath) and fpath != os.path.abspath(args.harness):
|
|
403
|
+
source_files.append(fpath)
|
|
404
|
+
arch_cmd = [_resolve_python(), analyze_py, "--harness", args.harness]
|
|
405
|
+
if source_files:
|
|
406
|
+
arch_cmd.extend(["--source-files"] + source_files[:10])
|
|
397
407
|
r = subprocess.run(
|
|
398
|
-
|
|
408
|
+
arch_cmd,
|
|
399
409
|
capture_output=True, text=True, timeout=30,
|
|
400
410
|
)
|
|
401
411
|
if r.returncode == 0 and r.stdout.strip():
|
|
@@ -461,7 +471,10 @@ def main():
|
|
|
461
471
|
print(r.stdout.strip())
|
|
462
472
|
|
|
463
473
|
# 6. Evaluate baseline
|
|
464
|
-
|
|
474
|
+
num_tasks = len([f for f in os.listdir(os.path.join(base, "eval", "tasks")) if f.endswith(".json")])
|
|
475
|
+
per_task_timeout = max(args.validation_timeout, 60)
|
|
476
|
+
eval_timeout = max(num_tasks * per_task_timeout + 60, 300)
|
|
477
|
+
print(f"Evaluating baseline harness ({num_tasks} tasks, timeout: {eval_timeout}s)...")
|
|
465
478
|
baseline_traces = tempfile.mkdtemp()
|
|
466
479
|
baseline_scores = os.path.join(base, "baseline_scores.json")
|
|
467
480
|
eval_args = [
|
|
@@ -471,18 +484,28 @@ def main():
|
|
|
471
484
|
"--eval", os.path.join(base, "eval", "eval.py"),
|
|
472
485
|
"--traces-dir", baseline_traces,
|
|
473
486
|
"--scores", baseline_scores,
|
|
474
|
-
"--timeout", str(
|
|
487
|
+
"--timeout", str(per_task_timeout),
|
|
475
488
|
]
|
|
476
489
|
if os.path.exists(config_path):
|
|
477
490
|
eval_args.extend(["--config", config_path])
|
|
478
|
-
|
|
479
|
-
|
|
491
|
+
try:
|
|
492
|
+
r = subprocess.run(eval_args, capture_output=True, text=True, timeout=eval_timeout)
|
|
493
|
+
except subprocess.TimeoutExpired:
|
|
494
|
+
print(f"WARNING: baseline evaluation timed out after {eval_timeout}s "
|
|
495
|
+
f"({num_tasks} tasks at {per_task_timeout}s/task). "
|
|
496
|
+
f"Using score 0.0. Run evaluation separately with more time.",
|
|
497
|
+
file=sys.stderr)
|
|
498
|
+
r = None
|
|
499
|
+
if r is not None and r.returncode != 0:
|
|
480
500
|
print(f"WARNING: baseline evaluation failed. Using score 0.0.\n{r.stderr}", file=sys.stderr)
|
|
481
|
-
|
|
482
|
-
else:
|
|
501
|
+
if r is not None and r.returncode == 0:
|
|
483
502
|
print(r.stdout.strip())
|
|
484
|
-
|
|
503
|
+
if r is not None and r.returncode == 0 and os.path.exists(baseline_scores):
|
|
504
|
+
with open(baseline_scores) as f:
|
|
505
|
+
scores = json.load(f)
|
|
485
506
|
baseline_score = scores.get("combined_score", 0.0)
|
|
507
|
+
else:
|
|
508
|
+
baseline_score = 0.0
|
|
486
509
|
|
|
487
510
|
if os.path.exists(baseline_scores):
|
|
488
511
|
os.remove(baseline_scores)
|