harness-evolver 4.2.4 → 4.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "harness-evolver",
3
3
  "description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
4
- "version": "4.2.4",
4
+ "version": "4.2.5",
5
5
  "author": {
6
6
  "name": "Raphael Valdetaro"
7
7
  },
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "harness-evolver",
3
- "version": "4.2.4",
3
+ "version": "4.2.5",
4
4
  "description": "LangSmith-native autonomous agent optimization for Claude Code",
5
5
  "author": "Raphael Valdetaro",
6
6
  "license": "MIT",
package/tools/setup.py CHANGED
@@ -462,101 +462,129 @@ def main():
462
462
  else:
463
463
  print(f"Dataset: '{dataset_name}'")
464
464
 
465
- # Create dataset
466
- print(f"Creating dataset '{dataset_name}'...")
467
- if args.dataset_from_file:
468
- dataset, count = create_dataset_from_file(client, dataset_name, args.dataset_from_file)
469
- print(f" Created from file: {count} examples")
470
- elif args.dataset_from_langsmith:
471
- dataset, count = create_dataset_from_langsmith(
472
- client, dataset_name, args.dataset_from_langsmith,
473
- )
474
- if not dataset:
475
- print(" No traces found in source project. Creating empty dataset.")
465
+ # Create dataset — wrapped in try/except to clean up orphaned datasets on failure
466
+ dataset = None
467
+ try:
468
+ print(f"Creating dataset '{dataset_name}'...")
469
+ if args.dataset_from_file:
470
+ dataset, count = create_dataset_from_file(client, dataset_name, args.dataset_from_file)
471
+ print(f" Created from file: {count} examples")
472
+ elif args.dataset_from_langsmith:
473
+ dataset, count = create_dataset_from_langsmith(
474
+ client, dataset_name, args.dataset_from_langsmith,
475
+ )
476
+ if not dataset:
477
+ print(" No traces found in source project. Creating empty dataset.")
478
+ dataset = create_empty_dataset(client, dataset_name)
479
+ count = 0
480
+ else:
481
+ print(f" Created from LangSmith traces: {count} examples")
482
+ else:
476
483
  dataset = create_empty_dataset(client, dataset_name)
477
484
  count = 0
485
+ print(" Created empty dataset (testgen will populate)")
486
+
487
+ # Configure evaluators
488
+ print(f"Configuring evaluators for goals: {goals}")
489
+ evaluators, evaluator_keys = get_evaluators(goals, args.evaluators)
490
+ print(f" Active evaluators: {evaluator_keys}")
491
+ llm_evaluators = [k for k in evaluator_keys if k in ("correctness", "conciseness")]
492
+ if llm_evaluators:
493
+ print(f" LLM evaluators (agent-based): {llm_evaluators}")
494
+
495
+ # Run baseline (code-based evaluators only; LLM scoring done by evaluator agent)
496
+ baseline_experiment = None
497
+ baseline_score = 0.0
498
+ if not args.skip_baseline and count > 0:
499
+ print(f"Running baseline target ({count} examples)...")
500
+ try:
501
+ baseline_experiment, baseline_score = run_baseline(
502
+ client, dataset_name, args.entry_point, evaluators,
503
+ )
504
+ print(f" Baseline has_output score: {baseline_score:.3f}")
505
+ print(f" Experiment: {baseline_experiment}")
506
+ if llm_evaluators:
507
+ print(f" Note: LLM scoring pending — evaluator agent will run during /evolver:evolve")
508
+ except Exception as e:
509
+ print(f" Baseline evaluation failed: {e}", file=sys.stderr)
510
+ print(" Continuing with score 0.0")
511
+ elif count == 0:
512
+ print("Skipping baseline (no examples in dataset yet)")
478
513
  else:
479
- print(f" Created from LangSmith traces: {count} examples")
480
- else:
481
- dataset = create_empty_dataset(client, dataset_name)
482
- count = 0
483
- print(" Created empty dataset (testgen will populate)")
484
-
485
- # Configure evaluators
486
- print(f"Configuring evaluators for goals: {goals}")
487
- evaluators, evaluator_keys = get_evaluators(goals, args.evaluators)
488
- print(f" Active evaluators: {evaluator_keys}")
489
- llm_evaluators = [k for k in evaluator_keys if k in ("correctness", "conciseness")]
490
- if llm_evaluators:
491
- print(f" LLM evaluators (agent-based): {llm_evaluators}")
492
-
493
- # Run baseline (code-based evaluators only; LLM scoring done by evaluator agent)
494
- baseline_experiment = None
495
- baseline_score = 0.0
496
- if not args.skip_baseline and count > 0:
497
- print(f"Running baseline target ({count} examples)...")
514
+ print("Skipping baseline (--skip-baseline)")
515
+
516
+ # Resolve Python interpreter in entry_point to absolute path
517
+ # This ensures the entry point works in worktrees where venvs don't exist
518
+ entry_point = args.entry_point
519
+ parts = entry_point.split()
520
+ if parts:
521
+ python_path = parts[0]
522
+ # Resolve relative Python paths (e.g., ../.venv/bin/python, .venv/bin/python)
523
+ if "/" in python_path and not os.path.isabs(python_path):
524
+ abs_python = os.path.abspath(python_path)
525
+ if os.path.exists(abs_python):
526
+ parts[0] = abs_python
527
+ entry_point = " ".join(parts)
528
+ print(f" Resolved Python path: {abs_python}")
529
+
530
+ # Compute project_dir relative to git root (for worktree path resolution)
531
+ project_dir = ""
498
532
  try:
499
- baseline_experiment, baseline_score = run_baseline(
500
- client, dataset_name, args.entry_point, evaluators,
533
+ git_prefix = subprocess.run(
534
+ ["git", "rev-parse", "--show-prefix"],
535
+ capture_output=True, text=True, timeout=5,
501
536
  )
502
- print(f" Baseline has_output score: {baseline_score:.3f}")
503
- print(f" Experiment: {baseline_experiment}")
504
- if llm_evaluators:
505
- print(f" Note: LLM scoring pending — evaluator agent will run during /evolver:evolve")
506
- except Exception as e:
507
- print(f" Baseline evaluation failed: {e}", file=sys.stderr)
508
- print(" Continuing with score 0.0")
509
- elif count == 0:
510
- print("Skipping baseline (no examples in dataset yet)")
511
- else:
512
- print("Skipping baseline (--skip-baseline)")
513
-
514
- # Compute project_dir relative to git root (for worktree path resolution)
515
- project_dir = ""
516
- try:
517
- git_prefix = subprocess.run(
518
- ["git", "rev-parse", "--show-prefix"],
519
- capture_output=True, text=True, timeout=5,
520
- )
521
- if git_prefix.returncode == 0:
522
- project_dir = git_prefix.stdout.strip().rstrip("/")
523
- except Exception:
524
- pass
525
-
526
- # Write config
527
- config = {
528
- "version": "3.0.0",
529
- "project": project_name,
530
- "dataset": dataset_name,
531
- "dataset_id": str(dataset.id) if dataset else None,
532
- "project_dir": project_dir,
533
- "entry_point": args.entry_point,
534
- "evaluators": evaluator_keys,
535
- "optimization_goals": goals,
536
- "production_project": args.production_project,
537
- "baseline_experiment": baseline_experiment,
538
- "best_experiment": baseline_experiment,
539
- "best_score": baseline_score,
540
- "iterations": 0,
541
- "framework": args.framework,
542
- "created_at": datetime.now(timezone.utc).isoformat(),
543
- "history": [{
544
- "version": "baseline",
545
- "experiment": baseline_experiment,
546
- "score": baseline_score,
547
- }] if baseline_experiment else [],
548
- }
537
+ if git_prefix.returncode == 0:
538
+ project_dir = git_prefix.stdout.strip().rstrip("/")
539
+ except Exception:
540
+ pass
549
541
 
550
- with open(args.output, "w") as f:
551
- json.dump(config, f, indent=2)
542
+ # Write config
543
+ config = {
544
+ "version": "3.0.0",
545
+ "project": project_name,
546
+ "dataset": dataset_name,
547
+ "dataset_id": str(dataset.id) if dataset else None,
548
+ "project_dir": project_dir,
549
+ "entry_point": entry_point,
550
+ "evaluators": evaluator_keys,
551
+ "optimization_goals": goals,
552
+ "production_project": args.production_project,
553
+ "baseline_experiment": baseline_experiment,
554
+ "best_experiment": baseline_experiment,
555
+ "best_score": baseline_score,
556
+ "iterations": 0,
557
+ "framework": args.framework,
558
+ "created_at": datetime.now(timezone.utc).isoformat(),
559
+ "history": [{
560
+ "version": "baseline",
561
+ "experiment": baseline_experiment,
562
+ "score": baseline_score,
563
+ }] if baseline_experiment else [],
564
+ }
565
+
566
+ with open(args.output, "w") as f:
567
+ json.dump(config, f, indent=2)
568
+
569
+ print(f"\nSetup complete. Config saved to {args.output}")
570
+ print(f" Project: {project_name}")
571
+ print(f" Dataset: {dataset_name} ({count} examples)")
572
+ print(f" Evaluators: {evaluator_keys}")
573
+ if baseline_experiment:
574
+ print(f" Baseline: {baseline_score:.3f}")
575
+ print(f"\nNext: run /evolver:evolve")
552
576
 
553
- print(f"\nSetup complete. Config saved to {args.output}")
554
- print(f" Project: {project_name}")
555
- print(f" Dataset: {dataset_name} ({count} examples)")
556
- print(f" Evaluators: {evaluator_keys}")
557
- if baseline_experiment:
558
- print(f" Baseline: {baseline_score:.3f}")
559
- print(f"\nNext: run /evolver:evolve")
577
+ except Exception as e:
578
+ # Cleanup orphaned dataset if setup fails after dataset creation
579
+ if dataset:
580
+ print(f"Setup failed: {e}", file=sys.stderr)
581
+ print(f"Cleaning up orphaned dataset '{dataset_name}'...", file=sys.stderr)
582
+ try:
583
+ client.delete_dataset(dataset_id=dataset.id)
584
+ print(" Dataset deleted.", file=sys.stderr)
585
+ except Exception:
586
+ print(f" WARNING: Could not delete dataset. Clean up manually in LangSmith.", file=sys.stderr)
587
+ raise
560
588
 
561
589
 
562
590
  if __name__ == "__main__":