harness-evolver 4.2.3 → 4.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/package.json +1 -1
- package/skills/setup/SKILL.md +14 -0
- package/tools/setup.py +116 -88
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "harness-evolver",
|
|
3
3
|
"description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
|
|
4
|
-
"version": "4.2.3",
|
|
4
|
+
"version": "4.2.5",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Raphael Valdetaro"
|
|
7
7
|
},
|
package/package.json
CHANGED
package/skills/setup/SKILL.md
CHANGED
|
@@ -61,7 +61,21 @@ Look for:
|
|
|
61
61
|
|
|
62
62
|
To identify the **framework**, read the entry point file and its immediate imports. The proposer agents will use Context7 MCP for detailed documentation lookup — you don't need to detect every library, just identify the main framework (LangGraph, CrewAI, OpenAI Agents SDK, etc.) from the imports you see.
|
|
63
63
|
|
|
64
|
+
**Detect virtual environments** — check for venvs in the project or parent directories:
|
|
65
|
+
```bash
|
|
66
|
+
# Check common venv locations
|
|
67
|
+
for venv_dir in .venv venv ../.venv ../venv; do
|
|
68
|
+
if [ -f "$venv_dir/bin/python" ]; then
|
|
69
|
+
echo "VENV_FOUND: $venv_dir/bin/python"
|
|
70
|
+
break
|
|
71
|
+
fi
|
|
72
|
+
done
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
If a venv is found, **use it for the entry point** instead of bare `python`. The agent's dependencies are likely installed there, not in the system Python. For example: `../.venv/bin/python agent.py {input}` instead of `python agent.py {input}`.
|
|
76
|
+
|
|
64
77
|
Identify the **run command** — how to execute the agent. Use `{input}` as a placeholder for the JSON file path:
|
|
78
|
+
- `.venv/bin/python main.py {input}` — if venv detected (preferred)
|
|
65
79
|
- `python main.py {input}` — agent reads JSON file from positional arg
|
|
66
80
|
- `python main.py --input {input}` — agent reads JSON file from `--input` flag
|
|
67
81
|
- `python main.py --query {input_json}` — agent receives inline JSON string
|
package/tools/setup.py
CHANGED
|
@@ -462,101 +462,129 @@ def main():
|
|
|
462
462
|
else:
|
|
463
463
|
print(f"Dataset: '{dataset_name}'")
|
|
464
464
|
|
|
465
|
-
# Create dataset
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
dataset
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
465
|
+
# Create dataset — wrapped in try/except to clean up orphaned datasets on failure
|
|
466
|
+
dataset = None
|
|
467
|
+
try:
|
|
468
|
+
print(f"Creating dataset '{dataset_name}'...")
|
|
469
|
+
if args.dataset_from_file:
|
|
470
|
+
dataset, count = create_dataset_from_file(client, dataset_name, args.dataset_from_file)
|
|
471
|
+
print(f" Created from file: {count} examples")
|
|
472
|
+
elif args.dataset_from_langsmith:
|
|
473
|
+
dataset, count = create_dataset_from_langsmith(
|
|
474
|
+
client, dataset_name, args.dataset_from_langsmith,
|
|
475
|
+
)
|
|
476
|
+
if not dataset:
|
|
477
|
+
print(" No traces found in source project. Creating empty dataset.")
|
|
478
|
+
dataset = create_empty_dataset(client, dataset_name)
|
|
479
|
+
count = 0
|
|
480
|
+
else:
|
|
481
|
+
print(f" Created from LangSmith traces: {count} examples")
|
|
482
|
+
else:
|
|
476
483
|
dataset = create_empty_dataset(client, dataset_name)
|
|
477
484
|
count = 0
|
|
485
|
+
print(" Created empty dataset (testgen will populate)")
|
|
486
|
+
|
|
487
|
+
# Configure evaluators
|
|
488
|
+
print(f"Configuring evaluators for goals: {goals}")
|
|
489
|
+
evaluators, evaluator_keys = get_evaluators(goals, args.evaluators)
|
|
490
|
+
print(f" Active evaluators: {evaluator_keys}")
|
|
491
|
+
llm_evaluators = [k for k in evaluator_keys if k in ("correctness", "conciseness")]
|
|
492
|
+
if llm_evaluators:
|
|
493
|
+
print(f" LLM evaluators (agent-based): {llm_evaluators}")
|
|
494
|
+
|
|
495
|
+
# Run baseline (code-based evaluators only; LLM scoring done by evaluator agent)
|
|
496
|
+
baseline_experiment = None
|
|
497
|
+
baseline_score = 0.0
|
|
498
|
+
if not args.skip_baseline and count > 0:
|
|
499
|
+
print(f"Running baseline target ({count} examples)...")
|
|
500
|
+
try:
|
|
501
|
+
baseline_experiment, baseline_score = run_baseline(
|
|
502
|
+
client, dataset_name, args.entry_point, evaluators,
|
|
503
|
+
)
|
|
504
|
+
print(f" Baseline has_output score: {baseline_score:.3f}")
|
|
505
|
+
print(f" Experiment: {baseline_experiment}")
|
|
506
|
+
if llm_evaluators:
|
|
507
|
+
print(f" Note: LLM scoring pending — evaluator agent will run during /evolver:evolve")
|
|
508
|
+
except Exception as e:
|
|
509
|
+
print(f" Baseline evaluation failed: {e}", file=sys.stderr)
|
|
510
|
+
print(" Continuing with score 0.0")
|
|
511
|
+
elif count == 0:
|
|
512
|
+
print("Skipping baseline (no examples in dataset yet)")
|
|
478
513
|
else:
|
|
479
|
-
print(
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
print(f"Running baseline target ({count} examples)...")
|
|
514
|
+
print("Skipping baseline (--skip-baseline)")
|
|
515
|
+
|
|
516
|
+
# Resolve Python interpreter in entry_point to absolute path
|
|
517
|
+
# This ensures the entry point works in worktrees where venvs don't exist
|
|
518
|
+
entry_point = args.entry_point
|
|
519
|
+
parts = entry_point.split()
|
|
520
|
+
if parts:
|
|
521
|
+
python_path = parts[0]
|
|
522
|
+
# Resolve relative Python paths (e.g., ../.venv/bin/python, .venv/bin/python)
|
|
523
|
+
if "/" in python_path and not os.path.isabs(python_path):
|
|
524
|
+
abs_python = os.path.abspath(python_path)
|
|
525
|
+
if os.path.exists(abs_python):
|
|
526
|
+
parts[0] = abs_python
|
|
527
|
+
entry_point = " ".join(parts)
|
|
528
|
+
print(f" Resolved Python path: {abs_python}")
|
|
529
|
+
|
|
530
|
+
# Compute project_dir relative to git root (for worktree path resolution)
|
|
531
|
+
project_dir = ""
|
|
498
532
|
try:
|
|
499
|
-
|
|
500
|
-
|
|
533
|
+
git_prefix = subprocess.run(
|
|
534
|
+
["git", "rev-parse", "--show-prefix"],
|
|
535
|
+
capture_output=True, text=True, timeout=5,
|
|
501
536
|
)
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
except Exception as e:
|
|
507
|
-
print(f" Baseline evaluation failed: {e}", file=sys.stderr)
|
|
508
|
-
print(" Continuing with score 0.0")
|
|
509
|
-
elif count == 0:
|
|
510
|
-
print("Skipping baseline (no examples in dataset yet)")
|
|
511
|
-
else:
|
|
512
|
-
print("Skipping baseline (--skip-baseline)")
|
|
513
|
-
|
|
514
|
-
# Compute project_dir relative to git root (for worktree path resolution)
|
|
515
|
-
project_dir = ""
|
|
516
|
-
try:
|
|
517
|
-
git_prefix = subprocess.run(
|
|
518
|
-
["git", "rev-parse", "--show-prefix"],
|
|
519
|
-
capture_output=True, text=True, timeout=5,
|
|
520
|
-
)
|
|
521
|
-
if git_prefix.returncode == 0:
|
|
522
|
-
project_dir = git_prefix.stdout.strip().rstrip("/")
|
|
523
|
-
except Exception:
|
|
524
|
-
pass
|
|
525
|
-
|
|
526
|
-
# Write config
|
|
527
|
-
config = {
|
|
528
|
-
"version": "3.0.0",
|
|
529
|
-
"project": project_name,
|
|
530
|
-
"dataset": dataset_name,
|
|
531
|
-
"dataset_id": str(dataset.id) if dataset else None,
|
|
532
|
-
"project_dir": project_dir,
|
|
533
|
-
"entry_point": args.entry_point,
|
|
534
|
-
"evaluators": evaluator_keys,
|
|
535
|
-
"optimization_goals": goals,
|
|
536
|
-
"production_project": args.production_project,
|
|
537
|
-
"baseline_experiment": baseline_experiment,
|
|
538
|
-
"best_experiment": baseline_experiment,
|
|
539
|
-
"best_score": baseline_score,
|
|
540
|
-
"iterations": 0,
|
|
541
|
-
"framework": args.framework,
|
|
542
|
-
"created_at": datetime.now(timezone.utc).isoformat(),
|
|
543
|
-
"history": [{
|
|
544
|
-
"version": "baseline",
|
|
545
|
-
"experiment": baseline_experiment,
|
|
546
|
-
"score": baseline_score,
|
|
547
|
-
}] if baseline_experiment else [],
|
|
548
|
-
}
|
|
537
|
+
if git_prefix.returncode == 0:
|
|
538
|
+
project_dir = git_prefix.stdout.strip().rstrip("/")
|
|
539
|
+
except Exception:
|
|
540
|
+
pass
|
|
549
541
|
|
|
550
|
-
|
|
551
|
-
|
|
542
|
+
# Write config
|
|
543
|
+
config = {
|
|
544
|
+
"version": "3.0.0",
|
|
545
|
+
"project": project_name,
|
|
546
|
+
"dataset": dataset_name,
|
|
547
|
+
"dataset_id": str(dataset.id) if dataset else None,
|
|
548
|
+
"project_dir": project_dir,
|
|
549
|
+
"entry_point": entry_point,
|
|
550
|
+
"evaluators": evaluator_keys,
|
|
551
|
+
"optimization_goals": goals,
|
|
552
|
+
"production_project": args.production_project,
|
|
553
|
+
"baseline_experiment": baseline_experiment,
|
|
554
|
+
"best_experiment": baseline_experiment,
|
|
555
|
+
"best_score": baseline_score,
|
|
556
|
+
"iterations": 0,
|
|
557
|
+
"framework": args.framework,
|
|
558
|
+
"created_at": datetime.now(timezone.utc).isoformat(),
|
|
559
|
+
"history": [{
|
|
560
|
+
"version": "baseline",
|
|
561
|
+
"experiment": baseline_experiment,
|
|
562
|
+
"score": baseline_score,
|
|
563
|
+
}] if baseline_experiment else [],
|
|
564
|
+
}
|
|
565
|
+
|
|
566
|
+
with open(args.output, "w") as f:
|
|
567
|
+
json.dump(config, f, indent=2)
|
|
568
|
+
|
|
569
|
+
print(f"\nSetup complete. Config saved to {args.output}")
|
|
570
|
+
print(f" Project: {project_name}")
|
|
571
|
+
print(f" Dataset: {dataset_name} ({count} examples)")
|
|
572
|
+
print(f" Evaluators: {evaluator_keys}")
|
|
573
|
+
if baseline_experiment:
|
|
574
|
+
print(f" Baseline: {baseline_score:.3f}")
|
|
575
|
+
print(f"\nNext: run /evolver:evolve")
|
|
552
576
|
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
577
|
+
except Exception as e:
|
|
578
|
+
# Cleanup orphaned dataset if setup fails after dataset creation
|
|
579
|
+
if dataset:
|
|
580
|
+
print(f"Setup failed: {e}", file=sys.stderr)
|
|
581
|
+
print(f"Cleaning up orphaned dataset '{dataset_name}'...", file=sys.stderr)
|
|
582
|
+
try:
|
|
583
|
+
client.delete_dataset(dataset_id=dataset.id)
|
|
584
|
+
print(" Dataset deleted.", file=sys.stderr)
|
|
585
|
+
except Exception:
|
|
586
|
+
print(f" WARNING: Could not delete dataset. Clean up manually in LangSmith.", file=sys.stderr)
|
|
587
|
+
raise
|
|
560
588
|
|
|
561
589
|
|
|
562
590
|
if __name__ == "__main__":
|