harness-evolver 4.0.3 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -94,24 +94,16 @@ def consolidate(orientation, signals, existing_memory=None):
94
94
  """Phase 3: Merge signals into consolidated memory."""
95
95
  insights = []
96
96
 
97
- # Strategy effectiveness
97
+ # Winning approach tracking (from comparison data)
98
98
  winning = signals.get("winning_strategies", [])
99
- strategy_map = {"a": "exploit", "b": "explore", "c": "crossover", "d": "failure-targeted-1", "e": "failure-targeted-2"}
100
- win_counts = {}
101
- for w in winning:
102
- exp = w.get("experiment", "")
103
- if exp:
104
- suffix = exp[-1]
105
- name = strategy_map.get(suffix, suffix)
106
- win_counts[name] = win_counts.get(name, 0) + 1
107
-
108
- if win_counts:
109
- best_strategy = max(win_counts, key=win_counts.get)
99
+ if winning:
100
+ win_count = len(winning)
101
+ best_score = max(w.get("score", 0) for w in winning)
110
102
  insights.append({
111
103
  "type": "strategy_effectiveness",
112
- "insight": f"Most winning strategy: {best_strategy} ({win_counts[best_strategy]} wins)",
113
- "recurrence": win_counts[best_strategy],
114
- "data": win_counts,
104
+ "insight": f"Best candidate score: {best_score:.3f} across {win_count} iterations",
105
+ "recurrence": win_count,
106
+ "data": {"win_count": win_count, "best_score": best_score},
115
107
  })
116
108
 
117
109
  # Recurring failures (only promote if seen 2+ times)
@@ -0,0 +1,385 @@
1
+ #!/usr/bin/env python3
2
+ """Dataset health diagnostic for Harness Evolver.
3
+
4
+ Analyzes eval dataset quality: size adequacy, difficulty distribution,
5
+ dead examples, production coverage, and split configuration.
6
+ Outputs health_report.json with issues and recommended corrections.
7
+
8
+ Usage:
9
+ python3 dataset_health.py --config .evolver.json --output health_report.json
10
+ """
11
+
12
+ import argparse
13
+ import json
14
+ import os
15
+ import sys
16
+ from datetime import datetime, timezone
17
+
18
+
19
def ensure_langsmith_api_key():
    """Ensure LANGSMITH_API_KEY is set, loading it from langsmith-cli credentials if needed.

    Returns:
        True if a key is present in the environment after this call, False otherwise.
    """
    if os.environ.get("LANGSMITH_API_KEY"):
        return True
    import platform
    # langsmith-cli stores credentials in the platform-specific config directory.
    if platform.system() == "Darwin":
        creds_path = os.path.expanduser("~/Library/Application Support/langsmith-cli/credentials")
    else:
        creds_path = os.path.expanduser("~/.config/langsmith-cli/credentials")
    if os.path.exists(creds_path):
        try:
            with open(creds_path) as f:
                for line in f:
                    stripped = line.strip()
                    # Expect INI-style "api_key = ..." lines. Guard on "=" being
                    # present: a bare "api_key" line previously raised an
                    # uncaught IndexError from split("=", 1)[1].
                    if stripped.startswith("api_key") and "=" in stripped:
                        key = stripped.split("=", 1)[1].strip().strip("'\"")
                        if key:
                            os.environ["LANGSMITH_API_KEY"] = key
                            return True
        except OSError:
            # Best-effort: an unreadable credentials file means "no key found".
            pass
    return False
40
+
41
+
42
def load_json_safe(path):
    """Best-effort JSON loader: parsed object on success, None on any failure."""
    if not path:
        return None
    if not os.path.exists(path):
        return None
    try:
        with open(path) as fh:
            return json.load(fh)
    except (OSError, json.JSONDecodeError):
        # Missing, unreadable, or malformed files all degrade to None.
        return None
51
+
52
+
53
def check_size(examples, evaluators):
    """Report whether the dataset has enough examples for its evaluator count.

    The floor is 20 examples, or 10 per evaluator, whichever is larger.
    """
    count = len(examples)
    floor = max(20, 10 * len(evaluators))
    return {
        "example_count": count,
        "min_recommended": floor,
        "adequate": count >= floor,
    }
62
+
63
+
64
def check_difficulty(client, config):
    """Classify per-example difficulty from the best experiment's feedback scores.

    Returns a dict with easy/medium/hard counts, an optional skew label, and a
    per-example difficulty map — or None when no best experiment is configured,
    no scored runs exist, or any LangSmith call fails (best-effort diagnostic).
    """
    experiment = config.get("best_experiment")
    if not experiment:
        return None

    try:
        runs = list(client.list_runs(project_name=experiment, is_root=True, limit=200))
        if not runs:
            return None

        # One batched feedback fetch keyed by run id (avoids per-run calls).
        feedback_by_run = {}
        for fb in client.list_feedback(run_ids=[r.id for r in runs]):
            feedback_by_run.setdefault(str(fb.run_id), []).append(fb)

        per_run_avgs = []
        difficulties = {}
        for run in runs:
            numeric = [fb.score for fb in feedback_by_run.get(str(run.id), []) if fb.score is not None]
            if not numeric:
                continue
            avg = sum(numeric) / len(numeric)
            per_run_avgs.append(avg)
            eid = str(run.reference_example_id or run.id)
            # Bucket thresholds: >0.9 easy, 0.5–0.9 medium, <0.5 hard.
            if avg > 0.9:
                difficulties[eid] = "easy"
            elif avg >= 0.5:
                difficulties[eid] = "medium"
            else:
                difficulties[eid] = "hard"

        if not per_run_avgs:
            return None

        total = len(per_run_avgs)
        easy = sum(1 for s in per_run_avgs if s > 0.9)
        hard = sum(1 for s in per_run_avgs if s < 0.5)
        # Buckets partition the scores, so medium is what remains.
        medium = total - easy - hard

        skew = None
        if easy / total > 0.6:
            skew = "easy_heavy"
        elif hard / total > 0.6:
            skew = "hard_heavy"

        return {
            "easy": easy,
            "medium": medium,
            "hard": hard,
            "skew": skew,
            "example_difficulties": difficulties,
        }
    except Exception:
        # Diagnostics must never crash the pipeline; degrade to "unknown".
        return None
119
+
120
+
121
def check_dead_examples(client, config):
    """Find examples that scored >=0.9 in every recent experiment.

    Such examples no longer discriminate between candidates. Looks at the last
    three experiments in config history; requires at least two experiments.
    """
    history = config.get("history", [])
    if len(history) < 2:
        # Cannot call an example "dead" from a single data point.
        return {"count": 0, "ids": []}

    example_scores = {}
    for entry in history[-3:]:
        exp_name = entry["experiment"]
        try:
            runs = list(client.list_runs(project_name=exp_name, is_root=True, limit=200))
            run_ids = [r.id for r in runs]
            if not run_ids:
                continue
            # Batch feedback lookup, keyed by run id.
            fb_map = {}
            for fb in client.list_feedback(run_ids=run_ids):
                fb_map.setdefault(str(fb.run_id), []).append(fb)
            for run in runs:
                numeric = [fb.score for fb in fb_map.get(str(run.id), []) if fb.score is not None]
                if numeric:
                    eid = str(run.reference_example_id or run.id)
                    example_scores.setdefault(eid, []).append(sum(numeric) / len(numeric))
        except Exception:
            # Skip experiments we cannot read; partial data is still useful.
            continue

    dead_ids = [
        eid
        for eid, avgs in example_scores.items()
        if len(avgs) >= 2 and all(s >= 0.9 for s in avgs)
    ]
    return {"count": len(dead_ids), "ids": dead_ids}
159
+
160
+
161
def check_coverage(examples, production):
    """Compare dataset example categories against production traffic categories.

    Returns None when there is no production data or no production categories;
    otherwise a dict with both category sets, the missing ones, and a coverage
    percentage.
    """
    if not production:
        return None

    prod_cats = set()
    for entry in production.get("categories", []):
        # Accept both bare strings and {"category": ...} dicts.
        if isinstance(entry, str):
            prod_cats.add(entry)
        elif isinstance(entry, dict) and "category" in entry:
            prod_cats.add(entry["category"])
    if not prod_cats:
        return None

    dataset_cats = set()
    for ex in examples:
        meta = getattr(ex, "metadata", None) or {}
        if "category" in meta:
            dataset_cats.add(meta["category"])

    missing = prod_cats - dataset_cats
    # missing is a subset of prod_cats, and prod_cats is non-empty here.
    pct = int(100 * (len(prod_cats) - len(missing)) / len(prod_cats))

    return {
        "production": sorted(prod_cats),
        "dataset": sorted(dataset_cats),
        "missing": sorted(missing),
        "pct": pct,
    }
193
+
194
+
195
def check_splits(client, dataset_name):
    """Probe whether the dataset has 'train' and 'held_out' splits defined."""
    result = {"has_train": False, "has_held_out": False}
    for key, split_name in (("has_train", "train"), ("has_held_out", "held_out")):
        try:
            found = list(client.list_examples(dataset_name=dataset_name, splits=[split_name], limit=1))
            result[key] = bool(found)
        except Exception:
            # A failed lookup is treated as "split absent".
            pass
    return result
210
+
211
+
212
def compute_health_score(size_info, difficulty, dead, coverage, splits):
    """Score overall dataset health 0-10 by accumulating per-issue penalties."""
    penalty = 0

    if not size_info.get("adequate"):
        penalty += 3  # undersized dataset is the most severe problem

    if difficulty and difficulty.get("skew"):
        penalty += 2  # easy- or hard-heavy skew reduces discriminative power

    if dead and dead.get("count", 0) > 0:
        total = size_info.get("example_count", 1)
        # Only penalize when more than 20% of examples are dead.
        if dead["count"] / max(total, 1) > 0.2:
            penalty += 1

    if coverage and coverage.get("pct", 100) < 75:
        penalty += 2  # missing production categories

    if splits and not splits.get("has_train"):
        penalty += 2  # no train/held-out split -> overfit risk

    return max(0, 10 - penalty)
234
+
235
+
236
def build_issues_and_corrections(size_info, difficulty, dead, coverage, splits):
    """Translate check results into human-readable issues and actionable corrections.

    Returns:
        (issues, corrections): parallel lists — each reported issue contributes
        exactly one correction entry describing the automated remedy.
    """
    issues = []
    corrections = []

    def report(issue_type, severity, message, correction):
        # Every issue has a paired correction, so record both together.
        issues.append({"type": issue_type, "severity": severity, "message": message})
        corrections.append(correction)

    if not size_info.get("adequate"):
        shortfall = size_info["min_recommended"] - size_info["example_count"]
        report(
            "size_inadequate",
            "high",
            f"Only {size_info['example_count']} examples (recommended: {size_info['min_recommended']}+)",
            {"action": "generate_more", "count": shortfall},
        )

    if difficulty and difficulty.get("skew") == "easy_heavy":
        bucket_total = max(difficulty["easy"] + difficulty["medium"] + difficulty["hard"], 1)
        easy_pct = int(100 * difficulty["easy"] / bucket_total)
        report(
            "difficulty_skew",
            "high",
            f"{easy_pct}% easy examples — low discriminative power",
            {"action": "generate_hard", "count": max(5, difficulty["easy"] // 3)},
        )

    if dead and dead.get("count", 0) > 0:
        total = size_info.get("example_count", 1)
        dead_pct = int(100 * dead["count"] / max(total, 1))
        # Only surface dead examples once they exceed 10% of the dataset.
        if dead_pct > 10:
            report(
                "dead_examples",
                "medium",
                f"{dead['count']} dead examples ({dead_pct}%) — scored >=0.9 in all recent experiments",
                {"action": "retire_dead", "ids": dead["ids"]},
            )

    if coverage and coverage.get("missing"):
        report(
            "coverage_gap",
            "high",
            f"Missing categories: {', '.join(coverage['missing'])} ({coverage['pct']}% coverage)",
            {"action": "fill_coverage", "categories": coverage["missing"]},
        )

    if splits and not splits.get("has_train"):
        report(
            "no_splits",
            "medium",
            "No train/held-out split — proposer overfit risk",
            {"action": "create_splits", "train_pct": 70},
        )

    return issues, corrections
301
+
302
+
303
def main():
    """Run all dataset health checks and emit health_report.json plus a console summary.

    Flow: parse CLI args -> load config and optional production seed ->
    fetch examples from LangSmith -> run the five checks -> tag per-example
    difficulty metadata -> compute score -> write report -> print summary.
    """
    parser = argparse.ArgumentParser(description="Dataset health diagnostic")
    parser.add_argument("--config", default=".evolver.json")
    parser.add_argument("--production-seed", default="production_seed.json")
    parser.add_argument("--output", default="health_report.json")
    args = parser.parse_args()

    # Best-effort: falls back to langsmith-cli credentials if env is unset.
    ensure_langsmith_api_key()

    with open(args.config) as f:
        config = json.load(f)

    # Production seed is optional; None disables the coverage check.
    production = load_json_safe(args.production_seed)

    # Imported here so the module can be loaded without langsmith installed.
    from langsmith import Client
    client = Client()

    dataset_name = config["dataset"]

    # Get all examples
    examples = list(client.list_examples(dataset_name=dataset_name, limit=500))

    # Run checks
    evaluators = config.get("evaluators", ["correctness"])
    size_info = check_size(examples, evaluators)
    difficulty = check_difficulty(client, config)
    dead = check_dead_examples(client, config)
    coverage = check_coverage(examples, production)
    splits = check_splits(client, dataset_name)

    # Tag difficulty metadata on examples if we computed it
    if difficulty and difficulty.get("example_difficulties"):
        for ex in examples:
            eid = str(ex.id)
            diff = difficulty["example_difficulties"].get(eid)
            if diff:
                meta = dict(getattr(ex, "metadata", None) or {})
                # Only push an update when the tag actually changed.
                if meta.get("difficulty") != diff:
                    meta["difficulty"] = diff
                    try:
                        client.update_example(ex.id, metadata=meta)
                    except Exception:
                        # Tagging is best-effort; a failed update is not fatal.
                        pass

    # Compute health score and build report
    health_score = compute_health_score(size_info, difficulty, dead, coverage, splits)
    issues, corrections = build_issues_and_corrections(size_info, difficulty, dead, coverage, splits)

    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "health_score": health_score,
        "example_count": size_info["example_count"],
        "min_recommended": size_info["min_recommended"],
        # Drop the bulky per-example map from the report; keep summary keys only.
        "difficulty": {k: v for k, v in (difficulty or {}).items() if k != "example_difficulties"} or None,
        "dead_examples": dead,
        "coverage": coverage,
        "splits": splits,
        "issues": issues,
        "corrections": corrections,
    }

    with open(args.output, "w") as f:
        json.dump(report, f, indent=2)

    # Print human-readable summary
    print(f"Dataset Health: {health_score}/10")
    print(f"Examples: {size_info['example_count']} (min recommended: {size_info['min_recommended']})")
    if difficulty:
        print(f"Difficulty: {difficulty.get('easy', 0)} easy, {difficulty.get('medium', 0)} medium, {difficulty.get('hard', 0)} hard")
    if dead and dead["count"] > 0:
        print(f"Dead examples: {dead['count']}")
    if coverage:
        print(f"Coverage: {coverage['pct']}% ({len(coverage.get('missing', []))} categories missing)")
    if splits:
        print(f"Splits: train={'yes' if splits['has_train'] else 'no'}, held_out={'yes' if splits['has_held_out'] else 'no'}")
    if issues:
        print(f"\nIssues ({len(issues)}):")
        for issue in issues:
            print(f"  [{issue['severity']}] {issue['message']}")


if __name__ == "__main__":
    main()
@@ -79,6 +79,13 @@ def read_experiment(client, experiment_name):
79
79
  total_latency_ms = 0
80
80
  errors = 0
81
81
 
82
+ # Batch-fetch all feedback in one API call instead of N+1
83
+ all_run_ids = [run.id for run in runs]
84
+ all_feedbacks = list(client.list_feedback(run_ids=all_run_ids))
85
+ fb_map = {}
86
+ for fb in all_feedbacks:
87
+ fb_map.setdefault(str(fb.run_id), []).append(fb)
88
+
82
89
  for run in runs:
83
90
  example_id = str(run.reference_example_id or run.id)
84
91
  tokens = run.total_tokens or 0
@@ -93,8 +100,8 @@ def read_experiment(client, experiment_name):
93
100
  if has_error:
94
101
  errors += 1
95
102
 
96
- # Read feedback/scores
97
- feedbacks = list(client.list_feedback(run_ids=[run.id]))
103
+ # Read feedback/scores from pre-fetched batch
104
+ feedbacks = fb_map.get(str(run.id), [])
98
105
  scores = {}
99
106
  for fb in feedbacks:
100
107
  if fb.score is not None:
@@ -220,6 +227,7 @@ def main():
220
227
  parser.add_argument("--config", default=".evolver.json", help="Path to .evolver.json")
221
228
  parser.add_argument("--output", default=None, help="Output JSON path")
222
229
  parser.add_argument("--format", default="json", choices=["json", "markdown"], help="Output format")
230
+ parser.add_argument("--split", default=None, help="Filter by dataset split (e.g., 'train')")
223
231
  args = parser.parse_args()
224
232
  ensure_langsmith_api_key()
225
233
 
@@ -233,6 +241,17 @@ def main():
233
241
  print(f"No results found for experiment: {args.experiment}", file=sys.stderr)
234
242
  sys.exit(1)
235
243
 
244
+ if args.split and result and "per_example" in result:
245
+ with open(args.config) as f:
246
+ cfg = json.load(f)
247
+ split_example_ids = set()
248
+ for ex in client.list_examples(dataset_name=cfg["dataset"], splits=[args.split]):
249
+ split_example_ids.add(str(ex.id))
250
+ result["per_example"] = {k: v for k, v in result["per_example"].items() if k in split_example_ids}
251
+ all_scores = [v["score"] for v in result["per_example"].values()]
252
+ result["combined_score"] = sum(all_scores) / len(all_scores) if all_scores else 0.0
253
+ result["num_examples"] = len(result["per_example"])
254
+
236
255
  if args.format == "markdown":
237
256
  output = format_markdown(result)
238
257
  else:
@@ -17,6 +17,7 @@ import argparse
17
17
  import json
18
18
  import os
19
19
  import platform
20
+ import random
20
21
  import sys
21
22
 
22
23
 
@@ -60,9 +61,14 @@ def get_per_example_scores(client, experiment_name):
60
61
  scores = {}
61
62
  try:
62
63
  runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=200))
64
+ all_run_ids = [run.id for run in runs]
65
+ all_feedbacks = list(client.list_feedback(run_ids=all_run_ids))
66
+ fb_map = {}
67
+ for fb in all_feedbacks:
68
+ fb_map.setdefault(str(fb.run_id), []).append(fb)
63
69
  for run in runs:
64
70
  example_id = str(run.reference_example_id or run.id)
65
- feedbacks = list(client.list_feedback(run_ids=[run.id]))
71
+ feedbacks = fb_map.get(str(run.id), [])
66
72
  fb_scores = {}
67
73
  for fb in feedbacks:
68
74
  if fb.score is not None:
@@ -107,16 +113,23 @@ def find_transitions(prev_scores, curr_scores, fail_threshold=0.5, pass_threshol
107
113
  return transitions, regressions
108
114
 
109
115
 
110
- def add_regression_guards(client, dataset_id, transitions, max_guards=5):
116
+ def add_regression_guards(client, dataset_id, transitions, max_guards=5, config=None):
111
117
  """Add regression guard examples to the dataset."""
118
+ config = config or {}
112
119
  added = 0
113
120
  for t in transitions[:max_guards]:
114
121
  try:
115
122
  input_data = json.loads(t["input"]) if t["input"].startswith("{") else {"input": t["input"]}
123
+ split = "train" if random.random() < 0.7 else "held_out"
116
124
  client.create_example(
117
125
  inputs=input_data,
118
126
  dataset_id=dataset_id,
119
- metadata={"source": "regression_guard", "original_example_id": t["example_id"]},
127
+ metadata={
128
+ "source": "regression_guard",
129
+ "original_example_id": t["example_id"],
130
+ "added_at_iteration": config.get("iterations", 0),
131
+ },
132
+ split=split,
120
133
  )
121
134
  added += 1
122
135
  except Exception as e:
@@ -148,7 +161,7 @@ def main():
148
161
 
149
162
  added = 0
150
163
  if args.add_guards and transitions:
151
- added = add_regression_guards(client, config["dataset_id"], transitions, args.max_guards)
164
+ added = add_regression_guards(client, config["dataset_id"], transitions, args.max_guards, config=config)
152
165
 
153
166
  result = {
154
167
  "previous": args.previous_experiment,
package/tools/setup.py CHANGED
@@ -87,6 +87,19 @@ def check_dependencies():
87
87
  return missing
88
88
 
89
89
 
90
def assign_splits(client, dataset_id, train_pct=70):
    """Randomly partition all examples of a dataset into train/held_out splits.

    Returns:
        (train_count, held_out_count) after updating every example.
    """
    import random
    examples = list(client.list_examples(dataset_id=dataset_id))
    # Shuffle first so the split is random rather than insertion-ordered.
    random.shuffle(examples)
    cut = int(len(examples) * train_pct / 100)
    train_part, held_part = examples[:cut], examples[cut:]
    for ex in train_part:
        client.update_example(ex.id, split="train")
    for ex in held_part:
        client.update_example(ex.id, split="held_out")
    return len(train_part), len(held_part)
101
+
102
+
90
103
  def resolve_dataset_name(client, base_name):
91
104
  """Find an available dataset name by auto-incrementing the version suffix.
92
105
 
@@ -148,10 +161,17 @@ def create_dataset_from_file(client, dataset_name, file_path):
148
161
  if "metadata" in item:
149
162
  ex["metadata"] = item["metadata"]
150
163
 
164
+ if "metadata" not in ex:
165
+ ex["metadata"] = {}
166
+ ex["metadata"].setdefault("source", "file")
167
+ ex["metadata"].setdefault("added_at_iteration", 0)
168
+
151
169
  examples.append(ex)
152
170
 
153
171
  if examples:
154
172
  client.create_examples(dataset_id=dataset.id, examples=examples)
173
+ train_n, held_n = assign_splits(client, dataset.id)
174
+ print(f"Assigned splits: {train_n} train, {held_n} held_out", file=sys.stderr)
155
175
 
156
176
  return dataset, len(examples)
157
177
 
@@ -178,10 +198,13 @@ def create_dataset_from_langsmith(client, dataset_name, source_project, limit=10
178
198
  ex = {"inputs": run.inputs}
179
199
  if run.outputs:
180
200
  ex["outputs"] = run.outputs
201
+ ex["metadata"] = {"source": "production", "added_at_iteration": 0}
181
202
  examples.append(ex)
182
203
 
183
204
  if examples:
184
205
  client.create_examples(dataset_id=dataset.id, examples=examples)
206
+ train_n, held_n = assign_splits(client, dataset.id)
207
+ print(f"Assigned splits: {train_n} train, {held_n} held_out", file=sys.stderr)
185
208
 
186
209
  return dataset, len(examples)
187
210