harness-evolver 4.5.0 → 4.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "harness-evolver",
3
3
  "description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
4
- "version": "4.5.0",
4
+ "version": "4.5.1",
5
5
  "author": {
6
6
  "name": "Raphael Valdetaro"
7
7
  },
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "harness-evolver",
3
- "version": "4.5.0",
3
+ "version": "4.5.1",
4
4
  "description": "LangSmith-native autonomous agent optimization for Claude Code",
5
5
  "author": "Raphael Valdetaro",
6
6
  "license": "MIT",
@@ -133,6 +133,61 @@ If critical issues found, ask user whether to continue or fix first via AskUserQ
133
133
 
134
134
  Invoke `/evolver:health` to check and auto-correct dataset issues. If health_report.json shows critical issues that couldn't be auto-corrected, ask user whether to proceed via AskUserQuestion.
135
135
 
136
+ ### 0.7. Ensure Baseline Has LLM-Judge Scores
137
+
138
+ The baseline experiment (from setup) only runs code-based evaluators (has_output, token_efficiency). Without LLM-judge scores, the baseline score is inflated — any agent that produces text gets 1.0, making gate checks stop evolution prematurely.
139
+
140
+ Check if LLM evaluators are configured and the baseline needs scoring:
141
+
142
+ ```bash
143
+ LLM_EVALS=$(python3 -c "import json; c=json.load(open('.evolver.json')); llm=[k for k in c['evaluators'] if k in ('correctness','conciseness')]; print(','.join(llm) if llm else '')")
144
+ BASELINE=$(python3 -c "import json; print(json.load(open('.evolver.json')).get('baseline_experiment', ''))")
145
+ ```
146
+
147
+ If `LLM_EVALS` is non-empty and `BASELINE` exists, check if LLM scores already exist:
148
+
149
+ ```bash
150
+ HAS_LLM_SCORES=$($EVOLVER_PY $TOOLS/read_results.py --experiment "$BASELINE" --config .evolver.json 2>/dev/null | python3 -c "
151
+ import sys, json
152
+ try:
153
+ r = json.load(sys.stdin)
154
+ scored_keys = set()
155
+ for ex in r.get('per_example', {}).values():
156
+ scored_keys.update(ex.get('scores', {}).keys())
157
+ llm_keys = set('correctness,conciseness'.split(','))
158
+ configured = set(k for k in llm_keys if k in '$LLM_EVALS'.split(','))
159
+ print('yes' if configured.issubset(scored_keys) else 'no')
160
+ except: print('no')
161
+ ")
162
+ ```
163
+
164
+ If `HAS_LLM_SCORES` is "no", trigger the evaluator agent on the baseline:
165
+
166
+ ```
167
+ Agent(
168
+ subagent_type: "evolver-evaluator",
169
+ description: "Score baseline with LLM-judge",
170
+ prompt: "Experiments to evaluate: {baseline_experiment}. Evaluators: {llm_evaluator_list}. Framework: {framework}. Entry point: {entry_point}. Dataset: {dataset_name}. NOTE: This is the baseline — score it fairly so evolution has a meaningful starting point. Some examples have expected_behavior rubrics in their metadata — fetch example metadata and use rubrics for scoring when available."
171
+ )
172
+ ```
173
+
174
+ After the evaluator completes, re-read the baseline score and update `.evolver.json`:
175
+
176
+ ```bash
177
+ $EVOLVER_PY $TOOLS/read_results.py --experiment "$BASELINE" --config .evolver.json --output best_results.json 2>/dev/null
178
+ python3 -c "
179
+ import json
180
+ br = json.load(open('best_results.json'))
181
+ c = json.load(open('.evolver.json'))
182
+ new_score = br.get('combined_score', c['best_score'])
183
+ c['best_score'] = new_score
184
+ if c.get('history'):
185
+ c['history'][0]['score'] = new_score
186
+ json.dump(c, open('.evolver.json', 'w'), indent=2)
187
+ print(f'Baseline re-scored with LLM-judge: {new_score:.3f}')
188
+ "
189
+ ```
190
+
136
191
  ### 0.8. Resolve Project Directory
137
192
 
138
193
  If the project is in a subdirectory of the git repo (e.g., `playground/react-agent/`), worktrees replicate the full repo structure. Read `project_dir` from `.evolver.json` to resolve paths correctly:
@@ -340,10 +395,22 @@ Only run evaluation (Step 3) for proposers that committed changes (not abstained
340
395
 
341
396
  ### 3. Run Target for Each Candidate (Parallel)
342
397
 
343
- Run evaluations for ALL candidates simultaneously — they're independent:
398
+ First, copy config files into each worktree (untracked files aren't replicated by git — this was the #1 bug in all real-world runs):
399
+
400
+ ```bash
401
+ for WORKTREE in {worktree_paths_with_commits}; do
402
+ WORKTREE_PROJECT="$WORKTREE"
403
+ [ -n "$PROJECT_DIR" ] && WORKTREE_PROJECT="$WORKTREE/$PROJECT_DIR"
404
+
405
+ # Copy untracked config files needed by run_eval.py and the agent
406
+ cp .evolver.json "$WORKTREE_PROJECT/.evolver.json" 2>/dev/null
407
+ [ -f .env ] && cp .env "$WORKTREE_PROJECT/.env" 2>/dev/null
408
+ done
409
+ ```
410
+
411
+ Then run evaluations for ALL candidates simultaneously:
344
412
 
345
413
  ```bash
346
- # Launch all evaluations in parallel
347
414
  for WORKTREE in {worktree_paths_with_commits}; do
348
415
  WORKTREE_PROJECT="$WORKTREE"
349
416
  [ -n "$PROJECT_DIR" ] && WORKTREE_PROJECT="$WORKTREE/$PROJECT_DIR"
package/tools/run_eval.py CHANGED
@@ -72,7 +72,20 @@ def make_target(entry_point, cwd):
72
72
 
73
73
  try:
74
74
  cmd = entry_point
75
- if "{input}" in cmd:
75
+
76
+ # {input_text}: extract plain text from inputs dict (for agents expecting --query "text")
77
+ if "{input_text}" in cmd:
78
+ import shlex
79
+ text = ""
80
+ for key in ("input", "question", "query", "prompt", "text", "user_input"):
81
+ if key in inputs and isinstance(inputs[key], str):
82
+ text = inputs[key]
83
+ break
84
+ if not text and inputs:
85
+ first_val = next(iter(inputs.values()), "")
86
+ text = str(first_val) if not isinstance(first_val, str) else first_val
87
+ cmd = cmd.replace("{input_text}", shlex.quote(text))
88
+ elif "{input}" in cmd:
76
89
  # Placeholder: replace with path to JSON file
77
90
  cmd = cmd.replace("{input}", input_path)
78
91
  elif "{input_json}" in cmd:
@@ -167,6 +180,7 @@ def main():
167
180
  parser.add_argument("--experiment-prefix", required=True, help="Experiment name prefix (e.g. v001a)")
168
181
  parser.add_argument("--timeout", type=int, default=120, help="Per-task timeout in seconds")
169
182
  parser.add_argument("--concurrency", type=int, default=None, help="Max concurrent evaluations (default: from config or 1)")
183
+ parser.add_argument("--no-canary", action="store_true", help="Skip canary preflight check")
170
184
  args = parser.parse_args()
171
185
 
172
186
  with open(args.config) as f:
@@ -187,6 +201,32 @@ def main():
187
201
  llm_evaluators = [k for k in config["evaluators"] if k in ("correctness", "conciseness")]
188
202
  code_evaluators = [k for k in config["evaluators"] if k not in ("correctness", "conciseness")]
189
203
 
204
+ # Canary run: verify agent works before burning through full dataset
205
+ if not args.no_canary:
206
+ print(" Canary: running 1 example preflight...", file=sys.stderr)
207
+ try:
208
+ canary_examples = list(client.list_examples(dataset_name=config["dataset"], limit=1))
209
+ if canary_examples:
210
+ canary_result = target(canary_examples[0].inputs)
211
+ canary_output = canary_result.get("output", "")
212
+ canary_error = canary_result.get("error", "")
213
+ if not canary_output and canary_error:
214
+ print(f" CANARY FAILED: Agent produced no output.", file=sys.stderr)
215
+ print(f" Error: {canary_error}", file=sys.stderr)
216
+ print(f" Fix the agent before running full evaluation.", file=sys.stderr)
217
+ output = {
218
+ "experiment": None,
219
+ "prefix": args.experiment_prefix,
220
+ "combined_score": 0.0,
221
+ "error": f"Canary failed: {canary_error[:200]}",
222
+ }
223
+ print(json.dumps(output))
224
+ sys.exit(2)
225
+ else:
226
+ print(f" Canary passed: got output ({len(str(canary_output))} chars)", file=sys.stderr)
227
+ except Exception as e:
228
+ print(f" Canary check failed: {e} (proceeding anyway)", file=sys.stderr)
229
+
190
230
  print(f"Running evaluation: {args.experiment_prefix}")
191
231
  print(f" Dataset: {config['dataset']}")
192
232
  print(f" Worktree: {args.worktree_path}")