harness-evolver 2.6.1 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/skills/init/SKILL.md +22 -0
- package/tools/analyze_architecture.py +56 -2
- package/tools/evaluate.py +29 -5
- package/tools/init.py +44 -16
package/package.json
CHANGED
package/skills/init/SKILL.md
CHANGED
|
@@ -103,6 +103,25 @@ python3 $TOOLS/init.py [directory] \
|
|
|
103
103
|
|
|
104
104
|
Add `--harness-config config.json` if a config exists.
|
|
105
105
|
|
|
106
|
+
For **LLM-powered agents** that make real API calls (LangGraph, CrewAI, etc.) and take
|
|
107
|
+
more than 30 seconds per invocation, increase the validation timeout:
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
python3 $TOOLS/init.py [directory] \
|
|
111
|
+
--harness harness.py --eval eval.py --tasks tasks/ \
|
|
112
|
+
--tools-dir $TOOLS \
|
|
113
|
+
--validation-timeout 120
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
If validation keeps timing out but you've verified the harness works manually, skip it:
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
python3 $TOOLS/init.py [directory] \
|
|
120
|
+
--harness harness.py --eval eval.py --tasks tasks/ \
|
|
121
|
+
--tools-dir $TOOLS \
|
|
122
|
+
--skip-validation
|
|
123
|
+
```
|
|
124
|
+
|
|
106
125
|
## After Init — Report
|
|
107
126
|
|
|
108
127
|
- What was detected vs created
|
|
@@ -132,3 +151,6 @@ This is advisory only — do not spawn the architect agent.
|
|
|
132
151
|
- The `expected` field is never shown to the harness — only the eval script sees it.
|
|
133
152
|
- If `.harness-evolver/` already exists, warn before overwriting.
|
|
134
153
|
- If no Python files exist in CWD, the user is probably in the wrong directory.
|
|
154
|
+
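- **Monorepo / venv mismatch**: In monorepos with dedicated venvs per app, the system `python3` may differ from the project's Python version. The tools now launch subprocesses with `sys.executable` instead of a hardcoded `python3`, so they inherit whichever interpreter started them — run the tools with the project's venv Python. If that is not possible, the harness wrapper should re-exec itself with the correct venv Python.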
|
|
155
|
+
- **Stale site-packages**: If the project uses editable installs (`pip install -e .`), packages in `site-packages/` may have stale copies of data files (e.g. registry YAMLs). Run `uv pip install -e . --force-reinstall --no-deps` to sync.
|
|
156
|
+
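- **Validation timeout**: LLM agents making real API calls typically take 15-60s per invocation, which can exceed the default 30-second validation timeout. Pass `--validation-timeout 120` to `init.py` (or `--timeout 120` to `evaluate.py validate`) to raise the limit, or `--skip-validation` to bypass the check once you have verified the harness manually.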
|
|
@@ -472,12 +472,60 @@ def analyze_scores(summary_path):
|
|
|
472
472
|
|
|
473
473
|
# --- Main ---
|
|
474
474
|
|
|
475
|
+
def analyze_multiple(file_paths):
    """Analyze multiple Python files and merge their signals.

    Useful in monorepo setups where the harness is a thin wrapper that
    delegates to the actual agent code. Pass the harness AND the main
    agent source files for a comprehensive topology classification.

    Args:
        file_paths: Iterable of paths to Python source files. Entries that
            are not regular files, that resolve to a file already analyzed,
            or whose analysis raises are skipped.

    Returns:
        A dict holding the summed counters (``llm_call_count``,
        ``code_lines``, ``function_count``, ``class_count``), the OR-ed
        boolean ``has_*`` flags, ``files_analyzed`` (base names of the files
        that contributed) and ``estimated_topology`` as computed by
        ``_estimate_topology`` from the merged signals.
    """
    counter_keys = ("llm_call_count", "code_lines", "function_count", "class_count")
    flag_keys = (
        "has_loop_around_llm", "has_tool_definitions", "has_retrieval",
        "has_graph_framework", "has_parallel_execution", "has_error_handling",
    )
    merged = {
        "llm_call_count": 0,
        "has_loop_around_llm": False,
        "has_tool_definitions": False,
        "has_retrieval": False,
        "has_graph_framework": False,
        "has_parallel_execution": False,
        "has_error_handling": False,
        "code_lines": 0,
        "function_count": 0,
        "class_count": 0,
        "files_analyzed": [],
    }

    seen = set()
    for path in file_paths:
        if not os.path.isfile(path):
            continue

        # The same file can be passed more than once (e.g. the harness is
        # repeated in the extra source files, or reached through a symlink).
        # Count it once so the summed counters are not inflated.
        real_path = os.path.realpath(path)
        if real_path in seen:
            continue
        seen.add(real_path)

        try:
            signals = analyze_code(path)
        except Exception as exc:
            # Best-effort merge: one unanalyzable file must not abort the
            # run, but say so on stderr (stdout is reserved for JSON output).
            print(f"warning: skipping {path}: {exc}", file=sys.stderr)
            continue

        for key in counter_keys:
            merged[key] += signals.get(key, 0)
        for key in flag_keys:
            if signals.get(key):
                merged[key] = True
        merged["files_analyzed"].append(os.path.basename(path))

    merged["estimated_topology"] = _estimate_topology(merged)
    return merged
|
|
517
|
+
|
|
518
|
+
|
|
475
519
|
def main():
|
|
476
520
|
parser = argparse.ArgumentParser(
|
|
477
521
|
description="Analyze harness architecture and produce signals for the architect agent",
|
|
478
|
-
usage="analyze_architecture.py --harness PATH [--
|
|
522
|
+
usage="analyze_architecture.py --harness PATH [--source-files PATH ...] "
|
|
523
|
+
"[--traces-dir PATH] [--summary PATH] [-o output.json]",
|
|
479
524
|
)
|
|
480
525
|
parser.add_argument("--harness", required=True, help="Path to harness Python file")
|
|
526
|
+
parser.add_argument("--source-files", nargs="*", default=None,
|
|
527
|
+
help="Additional source files to analyze (e.g. the actual agent code). "
|
|
528
|
+
"Useful when the harness is a thin wrapper around a larger system.")
|
|
481
529
|
parser.add_argument("--traces-dir", default=None, help="Path to traces directory")
|
|
482
530
|
parser.add_argument("--summary", default=None, help="Path to summary.json")
|
|
483
531
|
parser.add_argument("-o", "--output", default=None, help="Output JSON path")
|
|
@@ -487,8 +535,14 @@ def main():
|
|
|
487
535
|
print(json.dumps({"error": f"Harness file not found: {args.harness}"}))
|
|
488
536
|
sys.exit(1)
|
|
489
537
|
|
|
538
|
+
if args.source_files:
|
|
539
|
+
all_files = [args.harness] + [f for f in args.source_files if os.path.isfile(f)]
|
|
540
|
+
code_signals = analyze_multiple(all_files)
|
|
541
|
+
else:
|
|
542
|
+
code_signals = analyze_code(args.harness)
|
|
543
|
+
|
|
490
544
|
result = {
|
|
491
|
-
"code_signals":
|
|
545
|
+
"code_signals": code_signals,
|
|
492
546
|
"trace_signals": None,
|
|
493
547
|
"score_signals": None,
|
|
494
548
|
}
|
package/tools/evaluate.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"""Evaluation orchestrator for Harness Evolver.
|
|
3
3
|
|
|
4
4
|
Commands:
|
|
5
|
-
validate --harness PATH [--config PATH]
|
|
5
|
+
validate --harness PATH [--config PATH] [--timeout SECONDS]
|
|
6
6
|
run --harness PATH --tasks-dir PATH --eval PATH --traces-dir PATH --scores PATH
|
|
7
7
|
[--config PATH] [--timeout SECONDS]
|
|
8
8
|
|
|
@@ -20,9 +20,23 @@ import tempfile
|
|
|
20
20
|
import time
|
|
21
21
|
|
|
22
22
|
|
|
23
|
+
def _resolve_python():
    """Resolve the Python interpreter to use for subprocesses.

    Prefers the current interpreter (sys.executable) over a hardcoded 'python3'.
    This is critical in monorepo setups where the harness may need a specific
    venv Python (e.g. Python 3.12) while the system 'python3' is a different
    version (e.g. 3.14) with incompatible site-packages.

    Returns:
        Path of the interpreter to put at the front of a subprocess argv:
        ``sys.executable`` when it names an existing file, otherwise the
        first of ``python3`` / ``python`` found on PATH, otherwise the bare
        string ``"python3"``.
    """
    import shutil  # local import keeps this helper self-contained

    exe = sys.executable
    if exe and os.path.isfile(exe):
        return exe
    # sys.executable can be empty or None (e.g. embedded interpreters). Probe
    # PATH instead of assuming a 'python3' command exists; some installs only
    # expose 'python'.
    for candidate in ("python3", "python"):
        found = shutil.which(candidate)
        if found:
            return found
    return "python3"
|
|
35
|
+
|
|
36
|
+
|
|
23
37
|
def _run_harness_on_task(harness, config, task_input_path, output_path, task_traces_dir, timeout, env=None):
|
|
24
38
|
"""Run the harness on a single task. Returns (success, elapsed_ms, stdout, stderr)."""
|
|
25
|
-
cmd = [
|
|
39
|
+
cmd = [_resolve_python(), harness, "--input", task_input_path, "--output", output_path]
|
|
26
40
|
if task_traces_dir:
|
|
27
41
|
extra_dir = os.path.join(task_traces_dir, "extra")
|
|
28
42
|
os.makedirs(extra_dir, exist_ok=True)
|
|
@@ -48,6 +62,7 @@ def _run_harness_on_task(harness, config, task_input_path, output_path, task_tra
|
|
|
48
62
|
def cmd_validate(args):
|
|
49
63
|
harness = args.harness
|
|
50
64
|
config = getattr(args, "config", None)
|
|
65
|
+
timeout = getattr(args, "timeout", 30) or 30
|
|
51
66
|
|
|
52
67
|
if not os.path.exists(harness):
|
|
53
68
|
print(f"FAIL: harness not found: {harness}", file=sys.stderr)
|
|
@@ -61,11 +76,17 @@ def cmd_validate(args):
|
|
|
61
76
|
json.dump(dummy_task, f)
|
|
62
77
|
|
|
63
78
|
success, elapsed, stdout, stderr = _run_harness_on_task(
|
|
64
|
-
harness, config, input_path, output_path, None, timeout=
|
|
79
|
+
harness, config, input_path, output_path, None, timeout=timeout,
|
|
65
80
|
)
|
|
66
81
|
|
|
67
82
|
if not success:
|
|
68
|
-
|
|
83
|
+
hint = ""
|
|
84
|
+
if "TIMEOUT" in stderr:
|
|
85
|
+
hint = (f"\nHint: validation timed out after {timeout}s. "
|
|
86
|
+
"For LLM-powered agents that make real API calls, "
|
|
87
|
+
"use --timeout to increase the limit: "
|
|
88
|
+
f"evaluate.py validate --harness {harness} --timeout 120")
|
|
89
|
+
print(f"FAIL: harness exited with error.\nstderr: {stderr}{hint}", file=sys.stderr)
|
|
69
90
|
sys.exit(1)
|
|
70
91
|
|
|
71
92
|
if not os.path.exists(output_path):
|
|
@@ -171,7 +192,7 @@ def cmd_run(args):
|
|
|
171
192
|
f.write("\n".join(all_stderr))
|
|
172
193
|
|
|
173
194
|
eval_cmd = [
|
|
174
|
-
|
|
195
|
+
_resolve_python(), eval_script,
|
|
175
196
|
"--results-dir", results_dir,
|
|
176
197
|
"--tasks-dir", tasks_dir,
|
|
177
198
|
"--scores", scores_path,
|
|
@@ -195,6 +216,9 @@ def main():
|
|
|
195
216
|
p_val = sub.add_parser("validate")
|
|
196
217
|
p_val.add_argument("--harness", required=True)
|
|
197
218
|
p_val.add_argument("--config", default=None)
|
|
219
|
+
p_val.add_argument("--timeout", type=int, default=30,
|
|
220
|
+
help="Validation timeout in seconds (default: 30). "
|
|
221
|
+
"Increase for LLM-powered agents that make real API calls.")
|
|
198
222
|
|
|
199
223
|
p_run = sub.add_parser("run")
|
|
200
224
|
p_run.add_argument("--harness", required=True)
|
package/tools/init.py
CHANGED
|
@@ -134,6 +134,19 @@ def _check_langsmith_cli():
|
|
|
134
134
|
return False
|
|
135
135
|
|
|
136
136
|
|
|
137
|
+
def _resolve_python():
    """Resolve the Python interpreter for subprocesses.

    Uses the current interpreter (sys.executable) instead of hardcoded 'python3'.
    This prevents version mismatches in monorepo setups where the harness may
    need a specific venv Python different from the system python3.

    Returns:
        Path of the interpreter to put at the front of a subprocess argv:
        ``sys.executable`` when it names an existing file, otherwise the
        first of ``python3`` / ``python`` found on PATH, otherwise the bare
        string ``"python3"``.
    """
    import shutil  # local import keeps this helper self-contained

    exe = sys.executable
    if exe and os.path.isfile(exe):
        return exe
    # sys.executable can be empty or None (e.g. embedded interpreters). Probe
    # PATH instead of assuming a 'python3' command exists; some installs only
    # expose 'python'.
    for candidate in ("python3", "python"):
        found = shutil.which(candidate)
        if found:
            return found
    return "python3"
|
|
148
|
+
|
|
149
|
+
|
|
137
150
|
def _detect_stack(harness_path):
|
|
138
151
|
"""Detect technology stack from harness imports."""
|
|
139
152
|
detect_stack_py = os.path.join(os.path.dirname(__file__), "detect_stack.py")
|
|
@@ -141,7 +154,7 @@ def _detect_stack(harness_path):
|
|
|
141
154
|
return {}
|
|
142
155
|
try:
|
|
143
156
|
r = subprocess.run(
|
|
144
|
-
[
|
|
157
|
+
[_resolve_python(), detect_stack_py, harness_path],
|
|
145
158
|
capture_output=True, text=True, timeout=30,
|
|
146
159
|
)
|
|
147
160
|
if r.returncode == 0 and r.stdout.strip():
|
|
@@ -183,6 +196,12 @@ def main():
|
|
|
183
196
|
parser.add_argument("--base-dir", default=None, help="Path for .harness-evolver/")
|
|
184
197
|
parser.add_argument("--harness-config", default=None, help="Path to harness config.json")
|
|
185
198
|
parser.add_argument("--tools-dir", default=None, help="Path to tools directory")
|
|
199
|
+
parser.add_argument("--validation-timeout", type=int, default=30,
|
|
200
|
+
help="Timeout for harness validation in seconds (default: 30). "
|
|
201
|
+
"Increase for LLM-powered agents that make real API calls.")
|
|
202
|
+
parser.add_argument("--skip-validation", action="store_true",
|
|
203
|
+
help="Skip harness validation step. Use when you know the harness "
|
|
204
|
+
"works but validation times out (e.g. real LLM agent calls).")
|
|
186
205
|
args = parser.parse_args()
|
|
187
206
|
|
|
188
207
|
# Auto-detect missing args
|
|
@@ -309,7 +328,7 @@ def main():
|
|
|
309
328
|
if os.path.exists(detect_stack_py):
|
|
310
329
|
try:
|
|
311
330
|
r = subprocess.run(
|
|
312
|
-
[
|
|
331
|
+
[_resolve_python(), detect_stack_py, harness_dir],
|
|
313
332
|
capture_output=True, text=True, timeout=30,
|
|
314
333
|
)
|
|
315
334
|
if r.returncode == 0 and r.stdout.strip():
|
|
@@ -338,7 +357,7 @@ def main():
|
|
|
338
357
|
if os.path.exists(analyze_py):
|
|
339
358
|
try:
|
|
340
359
|
r = subprocess.run(
|
|
341
|
-
[
|
|
360
|
+
[_resolve_python(), analyze_py, "--harness", args.harness],
|
|
342
361
|
capture_output=True, text=True, timeout=30,
|
|
343
362
|
)
|
|
344
363
|
if r.returncode == 0 and r.stdout.strip():
|
|
@@ -357,30 +376,39 @@ def main():
|
|
|
357
376
|
pass
|
|
358
377
|
|
|
359
378
|
# 5. Validate baseline harness
|
|
360
|
-
print("Validating baseline harness...")
|
|
361
|
-
val_args = ["python3", evaluate_py, "validate",
|
|
362
|
-
"--harness", os.path.join(base, "baseline", "harness.py")]
|
|
363
379
|
config_path = os.path.join(base, "baseline", "config.json")
|
|
364
|
-
if
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
380
|
+
if args.skip_validation:
|
|
381
|
+
print("Skipping baseline validation (--skip-validation).")
|
|
382
|
+
else:
|
|
383
|
+
print(f"Validating baseline harness (timeout: {args.validation_timeout}s)...")
|
|
384
|
+
val_args = [_resolve_python(), evaluate_py, "validate",
|
|
385
|
+
"--harness", os.path.join(base, "baseline", "harness.py"),
|
|
386
|
+
"--timeout", str(args.validation_timeout)]
|
|
387
|
+
if os.path.exists(config_path):
|
|
388
|
+
val_args.extend(["--config", config_path])
|
|
389
|
+
r = subprocess.run(val_args, capture_output=True, text=True)
|
|
390
|
+
if r.returncode != 0:
|
|
391
|
+
hint = ""
|
|
392
|
+
if "TIMEOUT" in r.stderr:
|
|
393
|
+
hint = (f"\n\nHint: The harness timed out after {args.validation_timeout}s. "
|
|
394
|
+
"This is common for LLM-powered agents that make real API calls.\n"
|
|
395
|
+
"Try: --validation-timeout 120 (or --skip-validation to bypass)")
|
|
396
|
+
print(f"FAIL: baseline harness validation failed.\n{r.stderr}{hint}", file=sys.stderr)
|
|
397
|
+
sys.exit(1)
|
|
398
|
+
print(r.stdout.strip())
|
|
371
399
|
|
|
372
400
|
# 6. Evaluate baseline
|
|
373
401
|
print("Evaluating baseline harness...")
|
|
374
402
|
baseline_traces = tempfile.mkdtemp()
|
|
375
403
|
baseline_scores = os.path.join(base, "baseline_scores.json")
|
|
376
404
|
eval_args = [
|
|
377
|
-
|
|
405
|
+
_resolve_python(), evaluate_py, "run",
|
|
378
406
|
"--harness", os.path.join(base, "baseline", "harness.py"),
|
|
379
407
|
"--tasks-dir", os.path.join(base, "eval", "tasks"),
|
|
380
408
|
"--eval", os.path.join(base, "eval", "eval.py"),
|
|
381
409
|
"--traces-dir", baseline_traces,
|
|
382
410
|
"--scores", baseline_scores,
|
|
383
|
-
"--timeout",
|
|
411
|
+
"--timeout", str(max(args.validation_timeout, 60)),
|
|
384
412
|
]
|
|
385
413
|
if os.path.exists(config_path):
|
|
386
414
|
eval_args.extend(["--config", config_path])
|
|
@@ -399,7 +427,7 @@ def main():
|
|
|
399
427
|
# 7. Initialize state with baseline score
|
|
400
428
|
print(f"Baseline score: {baseline_score:.2f}")
|
|
401
429
|
r = subprocess.run(
|
|
402
|
-
[
|
|
430
|
+
[_resolve_python(), state_py, "init",
|
|
403
431
|
"--base-dir", base,
|
|
404
432
|
"--baseline-score", str(baseline_score)],
|
|
405
433
|
capture_output=True, text=True,
|