harness-evolver 2.6.0 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/install.js CHANGED
@@ -231,20 +231,48 @@ async function main() {
231
231
 
232
232
  // LangSmith CLI
233
233
  const hasLangsmithCli = checkCommand("langsmith-cli --version");
234
- if (hasLangsmithCli) {
235
- console.log(` ${GREEN}✓${RESET} langsmith-cli already installed`);
234
+ const langsmithCredsDir = process.platform === "darwin"
235
+ ? path.join(HOME, "Library", "Application Support", "langsmith-cli")
236
+ : path.join(HOME, ".config", "langsmith-cli");
237
+ const langsmithCredsFile = path.join(langsmithCredsDir, "credentials");
238
+ const hasLangsmithCreds = fs.existsSync(langsmithCredsFile);
239
+
240
+ if (hasLangsmithCli && hasLangsmithCreds) {
241
+ console.log(` ${GREEN}✓${RESET} langsmith-cli installed and authenticated`);
236
242
  } else {
237
- console.log(` ${BOLD}LangSmith CLI${RESET} — rich trace analysis (error rates, latency, token usage)`);
238
- console.log(` ${DIM}uv tool install langsmith-cli && langsmith-cli auth login${RESET}`);
239
- const lsAnswer = await ask(rl, `\n ${YELLOW}Install langsmith-cli? [y/N]:${RESET} `);
240
- if (lsAnswer.trim().toLowerCase() === "y") {
241
- console.log(`\n Installing langsmith-cli...`);
242
- try {
243
- execSync("uv tool install langsmith-cli", { stdio: "inherit" });
244
- console.log(`\n ${GREEN}✓${RESET} langsmith-cli installed`);
245
- console.log(` ${YELLOW}Run ${BOLD}langsmith-cli auth login${RESET}${YELLOW} to authenticate with your LangSmith API key.${RESET}\n`);
246
- } catch {
247
- console.log(`\n ${RED}Failed.${RESET} Install manually: uv tool install langsmith-cli\n`);
243
+ if (!hasLangsmithCli) {
244
+ console.log(` ${BOLD}LangSmith CLI${RESET} rich trace analysis (error rates, latency, token usage)`);
245
+ const lsAnswer = await ask(rl, `\n ${YELLOW}Install langsmith-cli? [y/N]:${RESET} `);
246
+ if (lsAnswer.trim().toLowerCase() === "y") {
247
+ console.log(`\n Installing langsmith-cli...`);
248
+ try {
249
+ execSync("uv tool install langsmith-cli", { stdio: "inherit" });
250
+ console.log(`\n ${GREEN}✓${RESET} langsmith-cli installed`);
251
+ } catch {
252
+ console.log(`\n ${RED}Failed.${RESET} Install manually: uv tool install langsmith-cli\n`);
253
+ }
254
+ }
255
+ } else {
256
+ console.log(` ${GREEN}✓${RESET} langsmith-cli already installed`);
257
+ }
258
+
259
+ // Auth — ask for API key inline if not already configured
260
+ if (!hasLangsmithCreds) {
261
+ console.log(`\n ${BOLD}LangSmith API Key${RESET} — get yours at ${DIM}https://smith.langchain.com/settings${RESET}`);
262
+ const apiKey = await ask(rl, ` ${YELLOW}Paste your LangSmith API key (or Enter to skip):${RESET} `);
263
+ const key = apiKey.trim();
264
+ if (key && key.startsWith("lsv2_")) {
265
+ try {
266
+ fs.mkdirSync(langsmithCredsDir, { recursive: true });
267
+ fs.writeFileSync(langsmithCredsFile, `LANGSMITH_API_KEY=${key}\n`);
268
+ console.log(` ${GREEN}✓${RESET} LangSmith API key saved`);
269
+ } catch {
270
+ console.log(` ${RED}Failed to save credentials.${RESET} Set LANGSMITH_API_KEY in your shell instead.`);
271
+ }
272
+ } else if (key) {
273
+ console.log(` ${YELLOW}Doesn't look like a LangSmith key (should start with lsv2_). Skipped.${RESET}`);
274
+ } else {
275
+ console.log(` ${DIM}Skipped. Set LANGSMITH_API_KEY later or run: langsmith-cli auth login${RESET}`);
248
276
  }
249
277
  }
250
278
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "harness-evolver",
3
- "version": "2.6.0",
3
+ "version": "2.7.0",
4
4
  "description": "Meta-Harness-style autonomous harness optimization for Claude Code",
5
5
  "author": "Raphael Valdetaro",
6
6
  "license": "MIT",
@@ -103,6 +103,25 @@ python3 $TOOLS/init.py [directory] \
103
103
 
104
104
  Add `--harness-config config.json` if a config exists.
105
105
 
106
+ For **LLM-powered agents** that make real API calls (LangGraph, CrewAI, etc.) and take
107
+ more than 30 seconds per invocation, increase the validation timeout:
108
+
109
+ ```bash
110
+ python3 $TOOLS/init.py [directory] \
111
+ --harness harness.py --eval eval.py --tasks tasks/ \
112
+ --tools-dir $TOOLS \
113
+ --validation-timeout 120
114
+ ```
115
+
116
+ If validation keeps timing out but you have manually verified that the harness works, skip the validation step:
117
+
118
+ ```bash
119
+ python3 $TOOLS/init.py [directory] \
120
+ --harness harness.py --eval eval.py --tasks tasks/ \
121
+ --tools-dir $TOOLS \
122
+ --skip-validation
123
+ ```
124
+
106
125
  ## After Init — Report
107
126
 
108
127
  - What was detected vs created
@@ -132,3 +151,6 @@ This is advisory only — do not spawn the architect agent.
132
151
  - The `expected` field is never shown to the harness — only the eval script sees it.
133
152
  - If `.harness-evolver/` already exists, warn before overwriting.
134
153
  - If no Python files exist in CWD, the user is probably in the wrong directory.
154
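+ - **Monorepo / venv mismatch**: In monorepos with a dedicated venv per app, the system `python3` may be a different version from the project's Python. The harness wrapper should re-exec itself with the correct venv Python. The tools now launch their subprocesses with `sys.executable` (the interpreter that is running the tool) instead of a hardcoded `python3`, falling back to `python3` only when that path is unavailable.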
155
+ - **Stale site-packages**: If the project uses editable installs (`pip install -e .`), packages in `site-packages/` may have stale copies of data files (e.g. registry YAMLs). Run `uv pip install -e . --force-reinstall --no-deps` to sync.
156
+ - **Validation timeout**: LLM agents making real API calls typically take 15–60s per invocation, which can exceed the default 30s validation timeout. Use `--validation-timeout 120` to raise the limit, or `--skip-validation` to bypass validation entirely.
@@ -472,12 +472,60 @@ def analyze_scores(summary_path):
472
472
 
473
473
  # --- Main ---
474
474
 
475
+ def analyze_multiple(file_paths):
476
+ """Analyze multiple Python files and merge their signals.
477
+
478
+ Useful in monorepo setups where the harness is a thin wrapper that
479
+ delegates to the actual agent code. Pass the harness AND the main
480
+ agent source files for a comprehensive topology classification.
481
+ """
482
+ merged = {
483
+ "llm_call_count": 0,
484
+ "has_loop_around_llm": False,
485
+ "has_tool_definitions": False,
486
+ "has_retrieval": False,
487
+ "has_graph_framework": False,
488
+ "has_parallel_execution": False,
489
+ "has_error_handling": False,
490
+ "code_lines": 0,
491
+ "function_count": 0,
492
+ "class_count": 0,
493
+ "files_analyzed": [],
494
+ }
495
+
496
+ for path in file_paths:
497
+ if not os.path.isfile(path):
498
+ continue
499
+ try:
500
+ signals = analyze_code(path)
501
+ except Exception:
502
+ continue
503
+
504
+ merged["llm_call_count"] += signals.get("llm_call_count", 0)
505
+ merged["code_lines"] += signals.get("code_lines", 0)
506
+ merged["function_count"] += signals.get("function_count", 0)
507
+ merged["class_count"] += signals.get("class_count", 0)
508
+ merged["files_analyzed"].append(os.path.basename(path))
509
+
510
+ for bool_key in ["has_loop_around_llm", "has_tool_definitions", "has_retrieval",
511
+ "has_graph_framework", "has_parallel_execution", "has_error_handling"]:
512
+ if signals.get(bool_key):
513
+ merged[bool_key] = True
514
+
515
+ merged["estimated_topology"] = _estimate_topology(merged)
516
+ return merged
517
+
518
+
475
519
  def main():
476
520
  parser = argparse.ArgumentParser(
477
521
  description="Analyze harness architecture and produce signals for the architect agent",
478
- usage="analyze_architecture.py --harness PATH [--traces-dir PATH] [--summary PATH] [-o output.json]",
522
+ usage="analyze_architecture.py --harness PATH [--source-files PATH ...] "
523
+ "[--traces-dir PATH] [--summary PATH] [-o output.json]",
479
524
  )
480
525
  parser.add_argument("--harness", required=True, help="Path to harness Python file")
526
+ parser.add_argument("--source-files", nargs="*", default=None,
527
+ help="Additional source files to analyze (e.g. the actual agent code). "
528
+ "Useful when the harness is a thin wrapper around a larger system.")
481
529
  parser.add_argument("--traces-dir", default=None, help="Path to traces directory")
482
530
  parser.add_argument("--summary", default=None, help="Path to summary.json")
483
531
  parser.add_argument("-o", "--output", default=None, help="Output JSON path")
@@ -487,8 +535,14 @@ def main():
487
535
  print(json.dumps({"error": f"Harness file not found: {args.harness}"}))
488
536
  sys.exit(1)
489
537
 
538
+ if args.source_files:
539
+ all_files = [args.harness] + [f for f in args.source_files if os.path.isfile(f)]
540
+ code_signals = analyze_multiple(all_files)
541
+ else:
542
+ code_signals = analyze_code(args.harness)
543
+
490
544
  result = {
491
- "code_signals": analyze_code(args.harness),
545
+ "code_signals": code_signals,
492
546
  "trace_signals": None,
493
547
  "score_signals": None,
494
548
  }
package/tools/evaluate.py CHANGED
@@ -2,7 +2,7 @@
2
2
  """Evaluation orchestrator for Harness Evolver.
3
3
 
4
4
  Commands:
5
- validate --harness PATH [--config PATH]
5
+ validate --harness PATH [--config PATH] [--timeout SECONDS]
6
6
  run --harness PATH --tasks-dir PATH --eval PATH --traces-dir PATH --scores PATH
7
7
  [--config PATH] [--timeout SECONDS]
8
8
 
@@ -20,9 +20,23 @@ import tempfile
20
20
  import time
21
21
 
22
22
 
23
+ def _resolve_python():
24
+ """Resolve the Python interpreter to use for subprocesses.
25
+
26
+ Prefers the current interpreter (sys.executable) over a hardcoded 'python3'.
27
+ This is critical in monorepo setups where the harness may need a specific
28
+ venv Python (e.g. Python 3.12) while the system 'python3' is a different
29
+ version (e.g. 3.14) with incompatible site-packages.
30
+ """
31
+ exe = sys.executable
32
+ if exe and os.path.isfile(exe):
33
+ return exe
34
+ return "python3"
35
+
36
+
23
37
  def _run_harness_on_task(harness, config, task_input_path, output_path, task_traces_dir, timeout, env=None):
24
38
  """Run the harness on a single task. Returns (success, elapsed_ms, stdout, stderr)."""
25
- cmd = ["python3", harness, "--input", task_input_path, "--output", output_path]
39
+ cmd = [_resolve_python(), harness, "--input", task_input_path, "--output", output_path]
26
40
  if task_traces_dir:
27
41
  extra_dir = os.path.join(task_traces_dir, "extra")
28
42
  os.makedirs(extra_dir, exist_ok=True)
@@ -48,6 +62,7 @@ def _run_harness_on_task(harness, config, task_input_path, output_path, task_tra
48
62
  def cmd_validate(args):
49
63
  harness = args.harness
50
64
  config = getattr(args, "config", None)
65
+ timeout = getattr(args, "timeout", 30) or 30
51
66
 
52
67
  if not os.path.exists(harness):
53
68
  print(f"FAIL: harness not found: {harness}", file=sys.stderr)
@@ -61,11 +76,17 @@ def cmd_validate(args):
61
76
  json.dump(dummy_task, f)
62
77
 
63
78
  success, elapsed, stdout, stderr = _run_harness_on_task(
64
- harness, config, input_path, output_path, None, timeout=30,
79
+ harness, config, input_path, output_path, None, timeout=timeout,
65
80
  )
66
81
 
67
82
  if not success:
68
- print(f"FAIL: harness exited with error.\nstderr: {stderr}", file=sys.stderr)
83
+ hint = ""
84
+ if "TIMEOUT" in stderr:
85
+ hint = (f"\nHint: validation timed out after {timeout}s. "
86
+ "For LLM-powered agents that make real API calls, "
87
+ "use --timeout to increase the limit: "
88
+ f"evaluate.py validate --harness {harness} --timeout 120")
89
+ print(f"FAIL: harness exited with error.\nstderr: {stderr}{hint}", file=sys.stderr)
69
90
  sys.exit(1)
70
91
 
71
92
  if not os.path.exists(output_path):
@@ -171,7 +192,7 @@ def cmd_run(args):
171
192
  f.write("\n".join(all_stderr))
172
193
 
173
194
  eval_cmd = [
174
- "python3", eval_script,
195
+ _resolve_python(), eval_script,
175
196
  "--results-dir", results_dir,
176
197
  "--tasks-dir", tasks_dir,
177
198
  "--scores", scores_path,
@@ -195,6 +216,9 @@ def main():
195
216
  p_val = sub.add_parser("validate")
196
217
  p_val.add_argument("--harness", required=True)
197
218
  p_val.add_argument("--config", default=None)
219
+ p_val.add_argument("--timeout", type=int, default=30,
220
+ help="Validation timeout in seconds (default: 30). "
221
+ "Increase for LLM-powered agents that make real API calls.")
198
222
 
199
223
  p_run = sub.add_parser("run")
200
224
  p_run.add_argument("--harness", required=True)
package/tools/init.py CHANGED
@@ -134,6 +134,19 @@ def _check_langsmith_cli():
134
134
  return False
135
135
 
136
136
 
137
+ def _resolve_python():
138
+ """Resolve the Python interpreter for subprocesses.
139
+
140
+ Uses the current interpreter (sys.executable) instead of hardcoded 'python3'.
141
+ This prevents version mismatches in monorepo setups where the harness may
142
+ need a specific venv Python different from the system python3.
143
+ """
144
+ exe = sys.executable
145
+ if exe and os.path.isfile(exe):
146
+ return exe
147
+ return "python3"
148
+
149
+
137
150
  def _detect_stack(harness_path):
138
151
  """Detect technology stack from harness imports."""
139
152
  detect_stack_py = os.path.join(os.path.dirname(__file__), "detect_stack.py")
@@ -141,7 +154,7 @@ def _detect_stack(harness_path):
141
154
  return {}
142
155
  try:
143
156
  r = subprocess.run(
144
- ["python3", detect_stack_py, harness_path],
157
+ [_resolve_python(), detect_stack_py, harness_path],
145
158
  capture_output=True, text=True, timeout=30,
146
159
  )
147
160
  if r.returncode == 0 and r.stdout.strip():
@@ -183,6 +196,12 @@ def main():
183
196
  parser.add_argument("--base-dir", default=None, help="Path for .harness-evolver/")
184
197
  parser.add_argument("--harness-config", default=None, help="Path to harness config.json")
185
198
  parser.add_argument("--tools-dir", default=None, help="Path to tools directory")
199
+ parser.add_argument("--validation-timeout", type=int, default=30,
200
+ help="Timeout for harness validation in seconds (default: 30). "
201
+ "Increase for LLM-powered agents that make real API calls.")
202
+ parser.add_argument("--skip-validation", action="store_true",
203
+ help="Skip harness validation step. Use when you know the harness "
204
+ "works but validation times out (e.g. real LLM agent calls).")
186
205
  args = parser.parse_args()
187
206
 
188
207
  # Auto-detect missing args
@@ -309,7 +328,7 @@ def main():
309
328
  if os.path.exists(detect_stack_py):
310
329
  try:
311
330
  r = subprocess.run(
312
- ["python3", detect_stack_py, harness_dir],
331
+ [_resolve_python(), detect_stack_py, harness_dir],
313
332
  capture_output=True, text=True, timeout=30,
314
333
  )
315
334
  if r.returncode == 0 and r.stdout.strip():
@@ -338,7 +357,7 @@ def main():
338
357
  if os.path.exists(analyze_py):
339
358
  try:
340
359
  r = subprocess.run(
341
- ["python3", analyze_py, "--harness", args.harness],
360
+ [_resolve_python(), analyze_py, "--harness", args.harness],
342
361
  capture_output=True, text=True, timeout=30,
343
362
  )
344
363
  if r.returncode == 0 and r.stdout.strip():
@@ -357,30 +376,39 @@ def main():
357
376
  pass
358
377
 
359
378
  # 5. Validate baseline harness
360
- print("Validating baseline harness...")
361
- val_args = ["python3", evaluate_py, "validate",
362
- "--harness", os.path.join(base, "baseline", "harness.py")]
363
379
  config_path = os.path.join(base, "baseline", "config.json")
364
- if os.path.exists(config_path):
365
- val_args.extend(["--config", config_path])
366
- r = subprocess.run(val_args, capture_output=True, text=True)
367
- if r.returncode != 0:
368
- print(f"FAIL: baseline harness validation failed.\n{r.stderr}", file=sys.stderr)
369
- sys.exit(1)
370
- print(r.stdout.strip())
380
+ if args.skip_validation:
381
+ print("Skipping baseline validation (--skip-validation).")
382
+ else:
383
+ print(f"Validating baseline harness (timeout: {args.validation_timeout}s)...")
384
+ val_args = [_resolve_python(), evaluate_py, "validate",
385
+ "--harness", os.path.join(base, "baseline", "harness.py"),
386
+ "--timeout", str(args.validation_timeout)]
387
+ if os.path.exists(config_path):
388
+ val_args.extend(["--config", config_path])
389
+ r = subprocess.run(val_args, capture_output=True, text=True)
390
+ if r.returncode != 0:
391
+ hint = ""
392
+ if "TIMEOUT" in r.stderr:
393
+ hint = (f"\n\nHint: The harness timed out after {args.validation_timeout}s. "
394
+ "This is common for LLM-powered agents that make real API calls.\n"
395
+ "Try: --validation-timeout 120 (or --skip-validation to bypass)")
396
+ print(f"FAIL: baseline harness validation failed.\n{r.stderr}{hint}", file=sys.stderr)
397
+ sys.exit(1)
398
+ print(r.stdout.strip())
371
399
 
372
400
  # 6. Evaluate baseline
373
401
  print("Evaluating baseline harness...")
374
402
  baseline_traces = tempfile.mkdtemp()
375
403
  baseline_scores = os.path.join(base, "baseline_scores.json")
376
404
  eval_args = [
377
- "python3", evaluate_py, "run",
405
+ _resolve_python(), evaluate_py, "run",
378
406
  "--harness", os.path.join(base, "baseline", "harness.py"),
379
407
  "--tasks-dir", os.path.join(base, "eval", "tasks"),
380
408
  "--eval", os.path.join(base, "eval", "eval.py"),
381
409
  "--traces-dir", baseline_traces,
382
410
  "--scores", baseline_scores,
383
- "--timeout", "60",
411
+ "--timeout", str(max(args.validation_timeout, 60)),
384
412
  ]
385
413
  if os.path.exists(config_path):
386
414
  eval_args.extend(["--config", config_path])
@@ -399,7 +427,7 @@ def main():
399
427
  # 7. Initialize state with baseline score
400
428
  print(f"Baseline score: {baseline_score:.2f}")
401
429
  r = subprocess.run(
402
- ["python3", state_py, "init",
430
+ [_resolve_python(), state_py, "init",
403
431
  "--base-dir", base,
404
432
  "--baseline-score", str(baseline_score)],
405
433
  capture_output=True, text=True,