harness-evolver 4.5.0 → 4.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/package.json +1 -1
- package/skills/evolve/SKILL.md +69 -2
- package/tools/run_eval.py +41 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "harness-evolver",
|
|
3
3
|
"description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
|
|
4
|
-
"version": "4.5.0",
|
|
4
|
+
"version": "4.5.1",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Raphael Valdetaro"
|
|
7
7
|
},
|
package/package.json
CHANGED
package/skills/evolve/SKILL.md
CHANGED
|
@@ -133,6 +133,61 @@ If critical issues found, ask user whether to continue or fix first via AskUserQ
|
|
|
133
133
|
|
|
134
134
|
Invoke `/evolver:health` to check and auto-correct dataset issues. If health_report.json shows critical issues that couldn't be auto-corrected, ask user whether to proceed via AskUserQuestion.
|
|
135
135
|
|
|
136
|
+
### 0.7. Ensure Baseline Has LLM-Judge Scores
|
|
137
|
+
|
|
138
|
+
The baseline experiment (from setup) only runs code-based evaluators (has_output, token_efficiency). Without LLM-judge scores, the baseline score is inflated — any agent that produces text gets 1.0, making gate checks stop evolution prematurely.
|
|
139
|
+
|
|
140
|
+
Check if LLM evaluators are configured and the baseline needs scoring:
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
LLM_EVALS=$(python3 -c "import json; c=json.load(open('.evolver.json')); llm=[k for k in c['evaluators'] if k in ('correctness','conciseness')]; print(','.join(llm) if llm else '')")
|
|
144
|
+
BASELINE=$(python3 -c "import json; print(json.load(open('.evolver.json')).get('baseline_experiment', ''))")
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
If `LLM_EVALS` is non-empty and `BASELINE` exists, check if LLM scores already exist:
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
HAS_LLM_SCORES=$($EVOLVER_PY $TOOLS/read_results.py --experiment "$BASELINE" --config .evolver.json 2>/dev/null | python3 -c "
|
|
151
|
+
import sys, json
|
|
152
|
+
try:
|
|
153
|
+
r = json.load(sys.stdin)
|
|
154
|
+
scored_keys = set()
|
|
155
|
+
for ex in r.get('per_example', {}).values():
|
|
156
|
+
scored_keys.update(ex.get('scores', {}).keys())
|
|
157
|
+
llm_keys = set('correctness,conciseness'.split(','))
|
|
158
|
+
configured = set(k for k in llm_keys if k in '$LLM_EVALS'.split(','))
|
|
159
|
+
print('yes' if configured.issubset(scored_keys) else 'no')
|
|
160
|
+
except: print('no')
|
|
161
|
+
")
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
If `HAS_LLM_SCORES` is "no", trigger the evaluator agent on the baseline:
|
|
165
|
+
|
|
166
|
+
```
|
|
167
|
+
Agent(
|
|
168
|
+
subagent_type: "evolver-evaluator",
|
|
169
|
+
description: "Score baseline with LLM-judge",
|
|
170
|
+
prompt: "Experiments to evaluate: {baseline_experiment}. Evaluators: {llm_evaluator_list}. Framework: {framework}. Entry point: {entry_point}. Dataset: {dataset_name}. NOTE: This is the baseline — score it fairly so evolution has a meaningful starting point. Some examples have expected_behavior rubrics in their metadata — fetch example metadata and use rubrics for scoring when available."
|
|
171
|
+
)
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
After the evaluator completes, re-read the baseline score and update `.evolver.json`:
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
$EVOLVER_PY $TOOLS/read_results.py --experiment "$BASELINE" --config .evolver.json --output best_results.json 2>/dev/null
|
|
178
|
+
python3 -c "
|
|
179
|
+
import json
|
|
180
|
+
br = json.load(open('best_results.json'))
|
|
181
|
+
c = json.load(open('.evolver.json'))
|
|
182
|
+
new_score = br.get('combined_score', c['best_score'])
|
|
183
|
+
c['best_score'] = new_score
|
|
184
|
+
if c.get('history'):
|
|
185
|
+
c['history'][0]['score'] = new_score
|
|
186
|
+
json.dump(c, open('.evolver.json', 'w'), indent=2)
|
|
187
|
+
print(f'Baseline re-scored with LLM-judge: {new_score:.3f}')
|
|
188
|
+
"
|
|
189
|
+
```
|
|
190
|
+
|
|
136
191
|
### 0.8. Resolve Project Directory
|
|
137
192
|
|
|
138
193
|
If the project is in a subdirectory of the git repo (e.g., `playground/react-agent/`), worktrees replicate the full repo structure. Read `project_dir` from `.evolver.json` to resolve paths correctly:
|
|
@@ -340,10 +395,22 @@ Only run evaluation (Step 3) for proposers that committed changes (not abstained
|
|
|
340
395
|
|
|
341
396
|
### 3. Run Target for Each Candidate (Parallel)
|
|
342
397
|
|
|
343
|
-
|
|
398
|
+
First, copy config files into each worktree (untracked files aren't replicated by git — this was the #1 bug in all real-world runs):
|
|
399
|
+
|
|
400
|
+
```bash
|
|
401
|
+
for WORKTREE in {worktree_paths_with_commits}; do
|
|
402
|
+
WORKTREE_PROJECT="$WORKTREE"
|
|
403
|
+
[ -n "$PROJECT_DIR" ] && WORKTREE_PROJECT="$WORKTREE/$PROJECT_DIR"
|
|
404
|
+
|
|
405
|
+
# Copy untracked config files needed by run_eval.py and the agent
|
|
406
|
+
cp .evolver.json "$WORKTREE_PROJECT/.evolver.json" 2>/dev/null
|
|
407
|
+
[ -f .env ] && cp .env "$WORKTREE_PROJECT/.env" 2>/dev/null
|
|
408
|
+
done
|
|
409
|
+
```
|
|
410
|
+
|
|
411
|
+
Then run evaluations for ALL candidates simultaneously:
|
|
344
412
|
|
|
345
413
|
```bash
|
|
346
|
-
# Launch all evaluations in parallel
|
|
347
414
|
for WORKTREE in {worktree_paths_with_commits}; do
|
|
348
415
|
WORKTREE_PROJECT="$WORKTREE"
|
|
349
416
|
[ -n "$PROJECT_DIR" ] && WORKTREE_PROJECT="$WORKTREE/$PROJECT_DIR"
|
package/tools/run_eval.py
CHANGED
|
@@ -72,7 +72,20 @@ def make_target(entry_point, cwd):
|
|
|
72
72
|
|
|
73
73
|
try:
|
|
74
74
|
cmd = entry_point
|
|
75
|
-
|
|
75
|
+
|
|
76
|
+
# {input_text}: extract plain text from inputs dict (for agents expecting --query "text")
|
|
77
|
+
if "{input_text}" in cmd:
|
|
78
|
+
import shlex
|
|
79
|
+
text = ""
|
|
80
|
+
for key in ("input", "question", "query", "prompt", "text", "user_input"):
|
|
81
|
+
if key in inputs and isinstance(inputs[key], str):
|
|
82
|
+
text = inputs[key]
|
|
83
|
+
break
|
|
84
|
+
if not text and inputs:
|
|
85
|
+
first_val = next(iter(inputs.values()), "")
|
|
86
|
+
text = str(first_val) if not isinstance(first_val, str) else first_val
|
|
87
|
+
cmd = cmd.replace("{input_text}", shlex.quote(text))
|
|
88
|
+
elif "{input}" in cmd:
|
|
76
89
|
# Placeholder: replace with path to JSON file
|
|
77
90
|
cmd = cmd.replace("{input}", input_path)
|
|
78
91
|
elif "{input_json}" in cmd:
|
|
@@ -167,6 +180,7 @@ def main():
|
|
|
167
180
|
parser.add_argument("--experiment-prefix", required=True, help="Experiment name prefix (e.g. v001a)")
|
|
168
181
|
parser.add_argument("--timeout", type=int, default=120, help="Per-task timeout in seconds")
|
|
169
182
|
parser.add_argument("--concurrency", type=int, default=None, help="Max concurrent evaluations (default: from config or 1)")
|
|
183
|
+
parser.add_argument("--no-canary", action="store_true", help="Skip canary preflight check")
|
|
170
184
|
args = parser.parse_args()
|
|
171
185
|
|
|
172
186
|
with open(args.config) as f:
|
|
@@ -187,6 +201,32 @@ def main():
|
|
|
187
201
|
llm_evaluators = [k for k in config["evaluators"] if k in ("correctness", "conciseness")]
|
|
188
202
|
code_evaluators = [k for k in config["evaluators"] if k not in ("correctness", "conciseness")]
|
|
189
203
|
|
|
204
|
+
# Canary run: verify agent works before burning through full dataset
|
|
205
|
+
if not args.no_canary:
|
|
206
|
+
print(" Canary: running 1 example preflight...", file=sys.stderr)
|
|
207
|
+
try:
|
|
208
|
+
canary_examples = list(client.list_examples(dataset_name=config["dataset"], limit=1))
|
|
209
|
+
if canary_examples:
|
|
210
|
+
canary_result = target(canary_examples[0].inputs)
|
|
211
|
+
canary_output = canary_result.get("output", "")
|
|
212
|
+
canary_error = canary_result.get("error", "")
|
|
213
|
+
if not canary_output and canary_error:
|
|
214
|
+
print(f" CANARY FAILED: Agent produced no output.", file=sys.stderr)
|
|
215
|
+
print(f" Error: {canary_error}", file=sys.stderr)
|
|
216
|
+
print(f" Fix the agent before running full evaluation.", file=sys.stderr)
|
|
217
|
+
output = {
|
|
218
|
+
"experiment": None,
|
|
219
|
+
"prefix": args.experiment_prefix,
|
|
220
|
+
"combined_score": 0.0,
|
|
221
|
+
"error": f"Canary failed: {canary_error[:200]}",
|
|
222
|
+
}
|
|
223
|
+
print(json.dumps(output))
|
|
224
|
+
sys.exit(2)
|
|
225
|
+
else:
|
|
226
|
+
print(f" Canary passed: got output ({len(str(canary_output))} chars)", file=sys.stderr)
|
|
227
|
+
except Exception as e:
|
|
228
|
+
print(f" Canary check failed: {e} (proceeding anyway)", file=sys.stderr)
|
|
229
|
+
|
|
190
230
|
print(f"Running evaluation: {args.experiment_prefix}")
|
|
191
231
|
print(f" Dataset: {config['dataset']}")
|
|
192
232
|
print(f" Worktree: {args.worktree_path}")
|