npm - harness-evolver - Versions diffs - 4.2.1 → 4.2.3 - Mend

harness-evolver 4.2.1 → 4.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

package/.claude-plugin/plugin.json +1 -1
package/package.json +1 -1
package/skills/evolve/SKILL.md +26 -3
package/tools/adversarial_inject.py +1 -1
package/tools/dataset_health.py +2 -2
package/tools/read_results.py +1 -1
package/tools/regression_tracker.py +1 -1
package/tools/setup.py +13 -0
package/tools/synthesize_strategy.py +39 -0
package/tools/trace_insights.py +1 -1

package/.claude-plugin/plugin.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "harness-evolver",
   "description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
-  "version": "4.2.1",
+  "version": "4.2.3",
   "author": {
     "name": "Raphael Valdetaro"
   },

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "harness-evolver",
-  "version": "4.2.1",
+  "version": "4.2.3",
   "description": "LangSmith-native autonomous agent optimization for Claude Code",
   "author": "Raphael Valdetaro",
   "license": "MIT",

package/skills/evolve/SKILL.md CHANGED Viewed

@@ -243,6 +243,21 @@ print(f'Retired {retired} dead examples')
 After corrections, log what was done. Do NOT re-run health check (corrections may need an experiment cycle to show effect).
+### 0.8. Resolve Project Directory
+If the project is in a subdirectory of the git repo (e.g., `playground/react-agent/`), worktrees replicate the full repo structure. Read `project_dir` from `.evolver.json` to resolve paths correctly:
+```bash
+PROJECT_DIR=$(python3 -c "import json; print(json.load(open('.evolver.json')).get('project_dir', ''))")
+```
+If `PROJECT_DIR` is non-empty, all worktree paths must include it:
+- Config in worktree: `{worktree_path}/{PROJECT_DIR}/.evolver.json`
+- CWD in worktree: `{worktree_path}/{PROJECT_DIR}`
+- proposal.md in worktree: `{worktree_path}/{PROJECT_DIR}/proposal.md`
+If `PROJECT_DIR` is empty (project at git root), paths are unchanged: `{worktree_path}/.evolver.json`, etc.
 For each iteration:
 ### 1. Get Next Version
@@ -402,7 +417,11 @@ After all proposers complete, check which ones committed and which abstained:
 ```bash
 for WORKTREE in {worktree_paths}; do
-    if [ -f "$WORKTREE/proposal.md" ] && grep -q "## ABSTAIN" "$WORKTREE/proposal.md" 2>/dev/null; then
+    # Resolve project path within worktree
+    WT_PROJECT="$WORKTREE"
+    [ -n "$PROJECT_DIR" ] && WT_PROJECT="$WORKTREE/$PROJECT_DIR"
+    if [ -f "$WT_PROJECT/proposal.md" ] && grep -q "## ABSTAIN" "$WT_PROJECT/proposal.md" 2>/dev/null; then
         echo "Proposer in $WORKTREE abstained — skipping evaluation"
     elif [ $(cd "$WORKTREE" && git log --oneline -1 --since="10 minutes ago" 2>/dev/null | wc -l) -eq 0 ]; then
         echo "Proposer in $WORKTREE made no commits — skipping"
@@ -417,9 +436,13 @@ Only run evaluation (Step 3) for proposers that committed changes (not abstained
 For each worktree that has changes (proposer committed something):
 ```bash
+# If PROJECT_DIR is set, resolve paths into the worktree subdirectory
+WORKTREE_PROJECT="{worktree_path}"
+[ -n "$PROJECT_DIR" ] && WORKTREE_PROJECT="{worktree_path}/{PROJECT_DIR}"
 $EVOLVER_PY $TOOLS/run_eval.py \
-    --config .evolver.json \
-    --worktree-path {worktree_path} \
+    --config "$WORKTREE_PROJECT/.evolver.json" \
+    --worktree-path "$WORKTREE_PROJECT" \
     --experiment-prefix v{NNN}-{lens_id} \
     --timeout 120
 ```

package/tools/adversarial_inject.py CHANGED Viewed

@@ -59,7 +59,7 @@ def detect_memorization(client, experiment_name, dataset_name):
     """Check if agent outputs are suspiciously similar to reference outputs."""
     suspicious = []
     try:
-        runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=200))
+        runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=100))
         examples = {str(e.id): e for e in client.list_examples(dataset_name=dataset_name, limit=500)}
         for run in runs:

package/tools/dataset_health.py CHANGED Viewed

@@ -68,7 +68,7 @@ def check_difficulty(client, config):
         return None
     try:
-        runs = list(client.list_runs(project_name=best_exp, is_root=True, limit=200))
+        runs = list(client.list_runs(project_name=best_exp, is_root=True, limit=100))
         if not runs:
             return None
@@ -129,7 +129,7 @@ def check_dead_examples(client, config):
     for exp_name in recent_exps:
         try:
-            runs = list(client.list_runs(project_name=exp_name, is_root=True, limit=200))
+            runs = list(client.list_runs(project_name=exp_name, is_root=True, limit=100))
             all_run_ids = [run.id for run in runs]
             if not all_run_ids:
                 continue

package/tools/read_results.py CHANGED Viewed

@@ -68,7 +68,7 @@ def read_experiment(client, experiment_name):
         runs = list(client.list_runs(
             project_name=experiment_name,
             is_root=True,
-            limit=200,
+            limit=100,
         ))
         if not runs:

package/tools/regression_tracker.py CHANGED Viewed

@@ -60,7 +60,7 @@ def get_per_example_scores(client, experiment_name):
     """Get per-example scores from an experiment."""
     scores = {}
     try:
-        runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=200))
+        runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=100))
         all_run_ids = [run.id for run in runs]
         all_feedbacks = list(client.list_feedback(run_ids=all_run_ids))
         fb_map = {}

package/tools/setup.py CHANGED Viewed

@@ -511,12 +511,25 @@ def main():
     else:
         print("Skipping baseline (--skip-baseline)")
+    # Compute project_dir relative to git root (for worktree path resolution)
+    project_dir = ""
+    try:
+        git_prefix = subprocess.run(
+            ["git", "rev-parse", "--show-prefix"],
+            capture_output=True, text=True, timeout=5,
+        )
+        if git_prefix.returncode == 0:
+            project_dir = git_prefix.stdout.strip().rstrip("/")
+    except Exception:
+        pass
     # Write config
     config = {
         "version": "3.0.0",
         "project": project_name,
         "dataset": dataset_name,
         "dataset_id": str(dataset.id) if dataset else None,
+        "project_dir": project_dir,
         "entry_point": args.entry_point,
         "evaluators": evaluator_keys,
         "optimization_goals": goals,

package/tools/synthesize_strategy.py CHANGED Viewed

@@ -216,6 +216,45 @@ def generate_lenses(strategy, config, insights, results, memory, production, max
                 })
                 break  # at most 1 persistent failure lens
+    # Uniform failure lens — when there are failing examples but no cluster lenses were generated
+    # (e.g., all examples fail with same error like "python: not found")
+    failing_examples = strategy.get("failing_examples", [])
+    if failing_examples and not any(l["source"] == "failure_cluster" for l in lenses):
+        # Check if all errors are the same
+        errors = [ex.get("error", "") for ex in failing_examples if ex.get("error")]
+        common_error = errors[0] if errors and len(set(errors)) == 1 else None
+        if common_error:
+            lens_id += 1
+            lenses.append({
+                "id": lens_id,
+                "question": f"All {len(failing_examples)} examples fail with the same error: \"{common_error[:150]}\". Is this a code bug, configuration issue, or environment problem? What's the fix?",
+                "source": "uniform_failure",
+                "severity": "critical",
+                "context": {"error": common_error[:300], "count": len(failing_examples)},
+            })
+        else:
+            # Diverse errors but no clusters — create a general failure lens
+            lens_id += 1
+            lenses.append({
+                "id": lens_id,
+                "question": f"{len(failing_examples)} examples are failing with various errors. What are the root causes and what changes would fix the most failures?",
+                "source": "failure_analysis",
+                "severity": "high",
+                "context": {"count": len(failing_examples)},
+            })
+    # Input diversity lens — when we have failing examples, suggest investigating by input type
+    if failing_examples and len(failing_examples) >= 5 and len(lenses) < max_lenses - 1:
+        previews = [ex.get("input_preview", "")[:50] for ex in failing_examples[:5]]
+        lens_id += 1
+        lenses.append({
+            "id": lens_id,
+            "question": f"The agent fails on diverse inputs like: {'; '.join(previews[:3])}. Are there different failure modes for different input types?",
+            "source": "input_diversity",
+            "severity": "medium",
+            "context": {"sample_inputs": previews},
+        })
     # Open lens (always included)
     lens_id += 1
     lenses.append({

package/tools/trace_insights.py CHANGED Viewed

@@ -332,7 +332,7 @@ def fetch_scores_from_experiment(experiment_name):
         runs = list(client.list_runs(
             project_name=experiment_name,
             is_root=True,
-            limit=200,
+            limit=100,
         ))
         all_run_ids = [run.id for run in runs]