harness-evolver 4.1.0 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "harness-evolver",
3
3
  "description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
4
- "version": "4.1.0",
4
+ "version": "4.2.0",
5
5
  "author": {
6
6
  "name": "Raphael Valdetaro"
7
7
  },
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "harness-evolver",
3
- "version": "4.1.0",
3
+ "version": "4.2.0",
4
4
  "description": "LangSmith-native autonomous agent optimization for Claude Code",
5
5
  "author": "Raphael Valdetaro",
6
6
  "license": "MIT",
@@ -127,6 +127,122 @@ If critical issues found, ask user whether to continue or fix first via AskUserQ
127
127
  - "Fix and retry" — attempt auto-fix with `--fix` flag
128
128
  - "Abort" — stop the evolution loop
129
129
 
130
+ ### 0.6. Dataset Health Check
131
+
132
+ Run the dataset health diagnostic:
133
+
134
+ ```bash
135
+ $EVOLVER_PY $TOOLS/dataset_health.py \
136
+ --config .evolver.json \
137
+ --production-seed production_seed.json \
138
+ --output health_report.json 2>/dev/null
139
+ ```
140
+
141
+ Read `health_report.json`. Print summary:
142
+ ```bash
143
+ python3 -c "
144
+ import json, os
145
+ if os.path.exists('health_report.json'):
146
+ r = json.load(open('health_report.json'))
147
+ print(f'Dataset Health: {r[\"health_score\"]}/10 ({r[\"example_count\"]} examples)')
148
+ for issue in r.get('issues', []):
149
+ print(f' [{issue[\"severity\"]}] {issue[\"message\"]}')
150
+ "
151
+ ```
152
+
153
+ ### 0.7. Auto-Correct Dataset Issues
154
+
155
+ If `health_report.json` has corrections, apply them automatically:
156
+
157
+ ```bash
158
+ CORRECTIONS=$(python3 -c "
159
+ import json, os
160
+ if os.path.exists('health_report.json'):
161
+ r = json.load(open('health_report.json'))
162
+ for c in r.get('corrections', []):
163
+ print(c['action'])
164
+ " 2>/dev/null)
165
+ ```
166
+
167
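+ For each correction action listed in `$CORRECTIONS` (read that action's parameters — `count`, `categories`, or `ids` — from the matching entry under `corrections` in `health_report.json`; log and skip any action that has no handler below):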
168
+
169
+ **If `create_splits`**: Run inline Python to assign 70/30 splits:
170
+ ```bash
171
+ $EVOLVER_PY -c "
172
+ from langsmith import Client
173
+ import json, random
174
+ client = Client()
175
+ config = json.load(open('.evolver.json'))
176
+ examples = list(client.list_examples(dataset_name=config['dataset']))
177
+ random.shuffle(examples)
178
+ sp = int(len(examples) * 0.7)
179
+ for ex in examples[:sp]:
180
+ client.update_example(ex.id, split='train')
181
+ for ex in examples[sp:]:
182
+ client.update_example(ex.id, split='held_out')
183
+ print(f'Assigned splits: {sp} train, {len(examples)-sp} held_out')
184
+ "
185
+ ```
186
+
187
+ **If `generate_hard`**: Spawn testgen agent with hard-mode instruction:
188
+ ```
189
+ Agent(
190
+ subagent_type: "evolver-testgen",
191
+ description: "Generate hard examples to rebalance dataset",
192
+ prompt: |
193
+ <objective>
194
+ The dataset is skewed toward easy examples. Generate {count} HARD examples
195
+ that the current agent is likely to fail on.
196
+ Focus on: edge cases, adversarial inputs, complex multi-step queries,
197
+ ambiguous questions, and inputs that require deep reasoning.
198
+ </objective>
199
+ <files_to_read>
200
+ - .evolver.json
201
+ - strategy.md (if exists)
202
+ - production_seed.json (if exists)
203
+ </files_to_read>
204
+ )
205
+ ```
206
+
207
+ **If `fill_coverage`**: Spawn testgen agent with coverage-fill instruction:
208
+ ```
209
+ Agent(
210
+ subagent_type: "evolver-testgen",
211
+ description: "Generate examples for missing categories",
212
+ prompt: |
213
+ <objective>
214
+ The dataset is missing these production categories: {categories}.
215
+ Generate 5 examples per missing category.
216
+ Use production_seed.json for real-world patterns in these categories.
217
+ </objective>
218
+ <files_to_read>
219
+ - .evolver.json
220
+ - production_seed.json (if exists)
221
+ </files_to_read>
222
+ )
223
+ ```
224
+
225
+ **If `retire_dead`**: Move dead examples to retired split:
226
+ ```bash
227
+ $EVOLVER_PY -c "
228
+ from langsmith import Client
229
+ import json
230
+ client = Client()
231
+ report = json.load(open('health_report.json'))
232
+ dead_ids = report.get('dead_examples', {}).get('ids', [])
233
+ config = json.load(open('.evolver.json'))
234
+ examples = {str(e.id): e for e in client.list_examples(dataset_name=config['dataset'])}
235
+ retired = 0
236
+ for eid in dead_ids:
237
+ if eid in examples:
238
+ client.update_example(examples[eid].id, split='retired')
239
+ retired += 1
240
+ print(f'Retired {retired} dead examples')
241
+ "
242
+ ```
243
+
244
+ After corrections, log what was done. Do NOT re-run health check (corrections may need an experiment cycle to show effect).
245
+
130
246
  For each iteration:
131
247
 
132
248
  ### 1. Get Next Version
@@ -170,6 +286,7 @@ if [ -n "$BEST" ]; then
170
286
  $EVOLVER_PY $TOOLS/read_results.py \
171
287
  --experiment "$BEST" \
172
288
  --config .evolver.json \
289
+ --split train \
173
290
  --output best_results.json 2>/dev/null
174
291
  fi
175
292
  ```
@@ -145,15 +145,20 @@ def generate_adversarial_inputs(client, dataset_name, num_inputs=5):
145
145
  return adversarial
146
146
 
147
147
 
148
- def inject_adversarial(client, dataset_id, adversarial_inputs):
148
+ def inject_adversarial(client, dataset_id, adversarial_inputs, config=None):
149
149
  """Add adversarial examples to dataset."""
150
+ config = config or {}
150
151
  added = 0
151
152
  for adv in adversarial_inputs:
152
153
  try:
154
+ split = "train" if random.random() < 0.7 else "held_out"
155
+ metadata = dict(adv["metadata"])
156
+ metadata["added_at_iteration"] = config.get("iterations", 0)
153
157
  client.create_example(
154
158
  inputs=adv["inputs"],
155
159
  dataset_id=dataset_id,
156
- metadata=adv["metadata"],
160
+ metadata=metadata,
161
+ split=split,
157
162
  )
158
163
  added += 1
159
164
  except Exception as e:
@@ -182,7 +187,7 @@ def main():
182
187
 
183
188
  injected = 0
184
189
  if args.inject and adversarial:
185
- injected = inject_adversarial(client, config["dataset_id"], adversarial)
190
+ injected = inject_adversarial(client, config["dataset_id"], adversarial, config=config)
186
191
 
187
192
  result = {
188
193
  "memorization_suspects": len(suspicious),
@@ -0,0 +1,385 @@
1
+ #!/usr/bin/env python3
2
+ """Dataset health diagnostic for Harness Evolver.
3
+
4
+ Analyzes eval dataset quality: size adequacy, difficulty distribution,
5
+ dead examples, production coverage, and split configuration.
6
+ Outputs health_report.json with issues and recommended corrections.
7
+
8
+ Usage:
9
+ python3 dataset_health.py --config .evolver.json --output health_report.json
10
+ """
11
+
12
+ import argparse
13
+ import json
14
+ import os
15
+ import sys
16
+ from datetime import datetime, timezone
17
+
18
+
19
+ def ensure_langsmith_api_key():
20
+ """Load API key from langsmith-cli credentials if not in env."""
21
+ if os.environ.get("LANGSMITH_API_KEY"):
22
+ return True
23
+ import platform
24
+ if platform.system() == "Darwin":
25
+ creds_path = os.path.expanduser("~/Library/Application Support/langsmith-cli/credentials")
26
+ else:
27
+ creds_path = os.path.expanduser("~/.config/langsmith-cli/credentials")
28
+ if os.path.exists(creds_path):
29
+ try:
30
+ with open(creds_path) as f:
31
+ for line in f:
32
+ if line.strip().startswith("api_key"):
33
+ key = line.split("=", 1)[1].strip().strip("'\"")
34
+ if key:
35
+ os.environ["LANGSMITH_API_KEY"] = key
36
+ return True
37
+ except OSError:
38
+ pass
39
+ return False
40
+
41
+
42
def load_json_safe(path):
    """Load a JSON file, returning None when it is missing or invalid.

    Args:
        path: Path of the file to read; a falsy value yields None.

    Returns:
        The decoded JSON value, or None if the path is empty, the file cannot
        be opened, its bytes are not valid UTF-8, or it is not valid JSON.
    """
    if not path:
        return None
    try:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return None
51
+
52
+
53
def check_size(examples, evaluators):
    """Check whether the dataset has enough examples.

    The recommended minimum is 10 examples per evaluator, and never fewer
    than 20.

    Args:
        examples: Sized collection of dataset examples.
        evaluators: Collection of evaluator names; None is treated as empty.

    Returns:
        Dict with ``example_count``, ``min_recommended`` and ``adequate``.
    """
    count = len(examples)
    # A config with "evaluators": null must not crash the size check.
    min_recommended = max(20, 10 * len(evaluators or ()))
    return {
        "example_count": count,
        "min_recommended": min_recommended,
        "adequate": count >= min_recommended,
    }
62
+
63
+
64
def _difficulty_bucket(avg_score):
    """Map an average score to "easy" (> 0.9), "medium" (0.5 to 0.9) or "hard" (< 0.5)."""
    if avg_score > 0.9:
        return "easy"
    if avg_score >= 0.5:
        return "medium"
    return "hard"


def check_difficulty(client, config):
    """Check the difficulty distribution using the best experiment's scores.

    Each root run of ``config["best_experiment"]`` (at most 200 are fetched)
    is scored by the mean of its non-null feedback scores and placed in a
    difficulty bucket. Runs without any scored feedback are ignored.

    Args:
        client: Object exposing ``list_runs`` and ``list_feedback``.
        config: Evolver config dict; ``best_experiment`` names the project.

    Returns:
        Dict with ``easy``/``medium``/``hard`` run counts, ``skew``
        ("easy_heavy", "hard_heavy" or None; a bucket must exceed 60% of the
        scored runs) and ``example_difficulties`` mapping example id to
        bucket. None when there is no best experiment, no scored run, or the
        lookup fails.
    """
    best_exp = config.get("best_experiment")
    if not best_exp:
        return None

    try:
        runs = list(client.list_runs(project_name=best_exp, is_root=True, limit=200))
        if not runs:
            return None

        # One batched feedback request for all runs, grouped by run id.
        scores_by_run = {}
        for fb in client.list_feedback(run_ids=[run.id for run in runs]):
            if fb.score is not None:
                scores_by_run.setdefault(str(fb.run_id), []).append(fb.score)

        counts = {"easy": 0, "medium": 0, "hard": 0}
        example_difficulties = {}
        for run in runs:
            run_scores = scores_by_run.get(str(run.id))
            if not run_scores:
                continue
            bucket = _difficulty_bucket(sum(run_scores) / len(run_scores))
            counts[bucket] += 1
            # Runs without a reference example are keyed by their own id.
            example_difficulties[str(run.reference_example_id or run.id)] = bucket

        total = sum(counts.values())
        if not total:
            return None

        skew = None
        if counts["easy"] / total > 0.6:
            skew = "easy_heavy"
        elif counts["hard"] / total > 0.6:
            skew = "hard_heavy"

        return {
            "easy": counts["easy"],
            "medium": counts["medium"],
            "hard": counts["hard"],
            "skew": skew,
            "example_difficulties": example_difficulties,
        }
    except Exception as exc:
        # Best-effort check: report why it was skipped instead of hiding it.
        print(f"check_difficulty: skipped ({exc})", file=sys.stderr)
        return None
119
+
120
+
121
def check_dead_examples(client, config):
    """Find examples that scored >= 0.9 in every recent experiment.

    The last three entries of ``config["history"]`` are inspected. For each
    experiment every example contributes exactly one value: the lowest
    per-run average feedback score among its runs in that experiment. An
    example is "dead" when it has values from at least two experiments and
    all of them are >= 0.9. Repeated runs of one example inside a single
    experiment therefore cannot make it look dead on their own.

    Args:
        client: Object exposing ``list_runs`` and ``list_feedback``.
        config: Evolver config dict; ``history`` entries name experiments
            under the ``experiment`` key.

    Returns:
        Dict ``{"count": int, "ids": [example_id, ...]}``.
    """
    history = config.get("history") or []
    if len(history) < 2:
        return {"count": 0, "ids": []}

    # Entries without an experiment name are skipped rather than crashing.
    recent_exps = [h.get("experiment") for h in history[-3:] if isinstance(h, dict)]
    example_scores = {}

    for exp_name in recent_exps:
        if not exp_name:
            continue
        worst_in_exp = {}
        try:
            runs = list(client.list_runs(project_name=exp_name, is_root=True, limit=200))
            if not runs:
                continue
            scores_by_run = {}
            for fb in client.list_feedback(run_ids=[run.id for run in runs]):
                if fb.score is not None:
                    scores_by_run.setdefault(str(fb.run_id), []).append(fb.score)

            for run in runs:
                run_scores = scores_by_run.get(str(run.id))
                if not run_scores:
                    continue
                avg = sum(run_scores) / len(run_scores)
                eid = str(run.reference_example_id or run.id)
                worst_in_exp[eid] = min(avg, worst_in_exp.get(eid, avg))
        except Exception as exc:
            # Best-effort: an unreadable experiment is ignored, but not silently.
            print(f"check_dead_examples: skipped {exp_name!r} ({exc})", file=sys.stderr)
            continue

        for eid, worst in worst_in_exp.items():
            example_scores.setdefault(eid, []).append(worst)

    dead_ids = [
        eid
        for eid, exp_scores in example_scores.items()
        if len(exp_scores) >= 2 and all(s >= 0.9 for s in exp_scores)
    ]

    return {"count": len(dead_ids), "ids": dead_ids}
159
+
160
+
161
def check_coverage(examples, production):
    """Compare dataset categories against production traffic categories.

    Production categories come from ``production["categories"]``, whose
    items may be plain strings or dicts carrying a ``category`` key; other
    items are ignored. Dataset categories come from each example's
    ``metadata["category"]``.

    Args:
        examples: Iterable of objects with an optional ``metadata`` mapping.
        production: Parsed production seed; anything other than a non-empty
            dict yields None.

    Returns:
        Dict with sorted ``production``, ``dataset`` and ``missing`` category
        lists plus ``pct`` (integer percentage of production categories that
        appear in the dataset), or None when no production categories are
        available.
    """
    # The seed file is user-supplied JSON; a list or scalar has no categories.
    if not production or not isinstance(production, dict):
        return None

    prod_categories = set()
    for cat in production.get("categories") or []:
        if isinstance(cat, str):
            prod_categories.add(cat)
        elif isinstance(cat, dict) and "category" in cat:
            prod_categories.add(cat["category"])

    if not prod_categories:
        return None

    dataset_categories = set()
    for ex in examples:
        meta = getattr(ex, "metadata", None) or {}
        if "category" in meta:
            dataset_categories.add(meta["category"])

    missing = prod_categories - dataset_categories
    covered = len(prod_categories) - len(missing)
    coverage_pct = int(100 * covered / len(prod_categories))

    return {
        "production": sorted(prod_categories),
        "dataset": sorted(dataset_categories),
        "missing": sorted(missing),
        "pct": coverage_pct,
    }
193
+
194
+
195
def check_splits(client, dataset_name):
    """Check whether the dataset has "train" and "held_out" examples.

    Each split is probed by requesting at most one example from it. A failed
    lookup is reported on stderr and counted as "split not present", so the
    return value alone cannot distinguish an API error from a missing split.

    Args:
        client: Object exposing ``list_examples``.
        dataset_name: Name of the dataset to probe.

    Returns:
        Dict ``{"has_train": bool, "has_held_out": bool}``.
    """
    present = {}
    for split_name in ("train", "held_out"):
        try:
            probe = list(client.list_examples(dataset_name=dataset_name, splits=[split_name], limit=1))
            present[split_name] = len(probe) > 0
        except Exception as exc:
            print(f"check_splits: could not query split {split_name!r} ({exc})", file=sys.stderr)
            present[split_name] = False
    return {"has_train": present["train"], "has_held_out": present["held_out"]}
210
+
211
+
212
def compute_health_score(size_info, difficulty, dead, coverage, splits):
    """Return the overall dataset health score, an integer from 0 to 10.

    Starts from 10 and subtracts a penalty per detected problem:
    3 for inadequate size, 2 for any difficulty skew, 1 when more than 20%
    of the examples are dead, 2 when coverage is below 75%, and 2 when there
    is no train split.
    """
    penalty = 0

    # Dataset too small.
    if not size_info.get("adequate"):
        penalty += 3

    # Difficulty distribution dominated by one bucket.
    if difficulty and difficulty.get("skew"):
        penalty += 2

    # Large share of examples that no longer discriminate.
    if dead and dead.get("count", 0) > 0:
        example_total = max(size_info.get("example_count", 1), 1)
        dead_ratio = dead["count"] / example_total
        if dead_ratio > 0.2:
            penalty += 1

    # Production categories poorly represented.
    if coverage and coverage.get("pct", 100) < 75:
        penalty += 2

    # No train split configured.
    if splits and not splits.get("has_train"):
        penalty += 2

    return max(0, 10 - penalty)
234
+
235
+
236
def build_issues_and_corrections(size_info, difficulty, dead, coverage, splits):
    """Translate check results into issue and correction lists.

    Args:
        size_info: Result of the size check (``example_count``,
            ``min_recommended``, ``adequate``).
        difficulty: Result of the difficulty check, or None.
        dead: Result of the dead-example check (``count``, ``ids``), or None.
        coverage: Result of the coverage check (``missing``, ``pct``), or None.
        splits: Result of the split check (``has_train``), or None.

    Returns:
        Tuple ``(issues, corrections)``. Each issue is a dict with ``type``,
        ``severity`` and ``message``; each correction is a dict with an
        ``action`` plus action-specific parameters.
    """
    issues = []
    corrections = []

    if not size_info.get("adequate"):
        issues.append({
            "type": "size_inadequate",
            "severity": "high",
            "message": f"Only {size_info['example_count']} examples (recommended: {size_info['min_recommended']}+)",
        })
        corrections.append({
            "action": "generate_more",
            "count": size_info["min_recommended"] - size_info["example_count"],
        })

    skew = difficulty.get("skew") if difficulty else None
    if skew:
        scored_total = max(difficulty["easy"] + difficulty["medium"] + difficulty["hard"], 1)
    if skew == "easy_heavy":
        easy_pct = int(100 * difficulty["easy"] / scored_total)
        issues.append({
            "type": "difficulty_skew",
            "severity": "high",
            "message": f"{easy_pct}% easy examples — low discriminative power",
        })
        corrections.append({
            "action": "generate_hard",
            "count": max(5, difficulty["easy"] // 3),
        })
    elif skew == "hard_heavy":
        # The health score is reduced for any skew, so a hard-heavy dataset
        # must be listed too; otherwise the lower score is unexplained.
        hard_pct = int(100 * difficulty["hard"] / scored_total)
        issues.append({
            "type": "difficulty_skew",
            "severity": "medium",
            "message": f"{hard_pct}% hard examples — dataset is skewed toward hard cases",
        })

    if dead and dead.get("count", 0) > 0:
        total = size_info.get("example_count", 1)
        dead_pct = int(100 * dead["count"] / max(total, 1))
        if dead_pct > 10:
            issues.append({
                "type": "dead_examples",
                "severity": "medium",
                "message": f"{dead['count']} dead examples ({dead_pct}%) — scored >=0.9 in all recent experiments",
            })
            corrections.append({
                "action": "retire_dead",
                "ids": dead["ids"],
            })

    if coverage and coverage.get("missing"):
        # str() keeps the message buildable if a category is not a string.
        missing_names = ", ".join(str(c) for c in coverage["missing"])
        issues.append({
            "type": "coverage_gap",
            "severity": "high",
            "message": f"Missing categories: {missing_names} ({coverage['pct']}% coverage)",
        })
        corrections.append({
            "action": "fill_coverage",
            "categories": coverage["missing"],
        })

    if splits and not splits.get("has_train"):
        issues.append({
            "type": "no_splits",
            "severity": "medium",
            "message": "No train/held-out split — proposer overfit risk",
        })
        corrections.append({
            "action": "create_splits",
            "train_pct": 70,
        })

    return issues, corrections
301
+
302
+
303
def main():
    """Run every dataset health check and write the report.

    Reads the evolver config (``--config``) and the optional production seed
    (``--production-seed``), fetches up to 500 examples of the configured
    dataset, runs the size, difficulty, dead-example, coverage and split
    checks, tags examples with their computed difficulty, writes the JSON
    report to ``--output`` and prints a human-readable summary to stdout.
    """
    parser = argparse.ArgumentParser(description="Dataset health diagnostic")
    parser.add_argument("--config", default=".evolver.json")
    parser.add_argument("--production-seed", default="production_seed.json")
    parser.add_argument("--output", default="health_report.json")
    args = parser.parse_args()

    ensure_langsmith_api_key()

    with open(args.config, encoding="utf-8") as f:
        config = json.load(f)

    production = load_json_safe(args.production_seed)

    from langsmith import Client
    client = Client()

    dataset_name = config["dataset"]

    # Get all examples (capped at 500; larger datasets are analyzed partially)
    examples = list(client.list_examples(dataset_name=dataset_name, limit=500))

    # Run checks
    evaluators = config.get("evaluators", ["correctness"])
    size_info = check_size(examples, evaluators)
    difficulty = check_difficulty(client, config)
    dead = check_dead_examples(client, config)
    coverage = check_coverage(examples, production)
    splits = check_splits(client, dataset_name)

    # Tag difficulty metadata on examples if we computed it
    tag_failures = 0
    if difficulty and difficulty.get("example_difficulties"):
        for ex in examples:
            diff = difficulty["example_difficulties"].get(str(ex.id))
            if not diff:
                continue
            meta = dict(getattr(ex, "metadata", None) or {})
            if meta.get("difficulty") == diff:
                continue  # already tagged; avoid a redundant API call
            meta["difficulty"] = diff
            try:
                client.update_example(ex.id, metadata=meta)
            except Exception:
                # Tagging is best-effort, but failures are counted and reported.
                tag_failures += 1
    if tag_failures:
        print(f"Warning: could not tag difficulty on {tag_failures} examples", file=sys.stderr)

    # Compute health score and build report
    health_score = compute_health_score(size_info, difficulty, dead, coverage, splits)
    issues, corrections = build_issues_and_corrections(size_info, difficulty, dead, coverage, splits)

    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "health_score": health_score,
        "example_count": size_info["example_count"],
        "min_recommended": size_info["min_recommended"],
        # The per-example map is used for tagging only and is left out of the report.
        "difficulty": {k: v for k, v in (difficulty or {}).items() if k != "example_difficulties"} or None,
        "dead_examples": dead,
        "coverage": coverage,
        "splits": splits,
        "issues": issues,
        "corrections": corrections,
    }

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)

    # Print human-readable summary
    print(f"Dataset Health: {health_score}/10")
    print(f"Examples: {size_info['example_count']} (min recommended: {size_info['min_recommended']})")
    if difficulty:
        print(f"Difficulty: {difficulty.get('easy', 0)} easy, {difficulty.get('medium', 0)} medium, {difficulty.get('hard', 0)} hard")
    if dead and dead["count"] > 0:
        print(f"Dead examples: {dead['count']}")
    if coverage:
        print(f"Coverage: {coverage['pct']}% ({len(coverage.get('missing', []))} categories missing)")
    if splits:
        print(f"Splits: train={'yes' if splits['has_train'] else 'no'}, held_out={'yes' if splits['has_held_out'] else 'no'}")
    if issues:
        print(f"\nIssues ({len(issues)}):")
        for issue in issues:
            print(f"  [{issue['severity']}] {issue['message']}")
382
+
383
+
384
+ if __name__ == "__main__":
385
+ main()
@@ -79,6 +79,13 @@ def read_experiment(client, experiment_name):
79
79
  total_latency_ms = 0
80
80
  errors = 0
81
81
 
82
+ # Batch-fetch all feedback in one API call instead of N+1
83
+ all_run_ids = [run.id for run in runs]
84
+ all_feedbacks = list(client.list_feedback(run_ids=all_run_ids))
85
+ fb_map = {}
86
+ for fb in all_feedbacks:
87
+ fb_map.setdefault(str(fb.run_id), []).append(fb)
88
+
82
89
  for run in runs:
83
90
  example_id = str(run.reference_example_id or run.id)
84
91
  tokens = run.total_tokens or 0
@@ -93,8 +100,8 @@ def read_experiment(client, experiment_name):
93
100
  if has_error:
94
101
  errors += 1
95
102
 
96
- # Read feedback/scores
97
- feedbacks = list(client.list_feedback(run_ids=[run.id]))
103
+ # Read feedback/scores from pre-fetched batch
104
+ feedbacks = fb_map.get(str(run.id), [])
98
105
  scores = {}
99
106
  for fb in feedbacks:
100
107
  if fb.score is not None:
@@ -220,6 +227,7 @@ def main():
220
227
  parser.add_argument("--config", default=".evolver.json", help="Path to .evolver.json")
221
228
  parser.add_argument("--output", default=None, help="Output JSON path")
222
229
  parser.add_argument("--format", default="json", choices=["json", "markdown"], help="Output format")
230
+ parser.add_argument("--split", default=None, help="Filter by dataset split (e.g., 'train')")
223
231
  args = parser.parse_args()
224
232
  ensure_langsmith_api_key()
225
233
 
@@ -233,6 +241,17 @@ def main():
233
241
  print(f"No results found for experiment: {args.experiment}", file=sys.stderr)
234
242
  sys.exit(1)
235
243
 
244
+ if args.split and result and "per_example" in result:
245
+ with open(args.config) as f:
246
+ cfg = json.load(f)
247
+ split_example_ids = set()
248
+ for ex in client.list_examples(dataset_name=cfg["dataset"], splits=[args.split]):
249
+ split_example_ids.add(str(ex.id))
250
+ result["per_example"] = {k: v for k, v in result["per_example"].items() if k in split_example_ids}
251
+ all_scores = [v["score"] for v in result["per_example"].values()]
252
+ result["combined_score"] = sum(all_scores) / len(all_scores) if all_scores else 0.0
253
+ result["num_examples"] = len(result["per_example"])
254
+
236
255
  if args.format == "markdown":
237
256
  output = format_markdown(result)
238
257
  else:
@@ -17,6 +17,7 @@ import argparse
17
17
  import json
18
18
  import os
19
19
  import platform
20
+ import random
20
21
  import sys
21
22
 
22
23
 
@@ -60,9 +61,14 @@ def get_per_example_scores(client, experiment_name):
60
61
  scores = {}
61
62
  try:
62
63
  runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=200))
64
+ all_run_ids = [run.id for run in runs]
65
+ all_feedbacks = list(client.list_feedback(run_ids=all_run_ids))
66
+ fb_map = {}
67
+ for fb in all_feedbacks:
68
+ fb_map.setdefault(str(fb.run_id), []).append(fb)
63
69
  for run in runs:
64
70
  example_id = str(run.reference_example_id or run.id)
65
- feedbacks = list(client.list_feedback(run_ids=[run.id]))
71
+ feedbacks = fb_map.get(str(run.id), [])
66
72
  fb_scores = {}
67
73
  for fb in feedbacks:
68
74
  if fb.score is not None:
@@ -107,16 +113,23 @@ def find_transitions(prev_scores, curr_scores, fail_threshold=0.5, pass_threshol
107
113
  return transitions, regressions
108
114
 
109
115
 
110
- def add_regression_guards(client, dataset_id, transitions, max_guards=5):
116
+ def add_regression_guards(client, dataset_id, transitions, max_guards=5, config=None):
111
117
  """Add regression guard examples to the dataset."""
118
+ config = config or {}
112
119
  added = 0
113
120
  for t in transitions[:max_guards]:
114
121
  try:
115
122
  input_data = json.loads(t["input"]) if t["input"].startswith("{") else {"input": t["input"]}
123
+ split = "train" if random.random() < 0.7 else "held_out"
116
124
  client.create_example(
117
125
  inputs=input_data,
118
126
  dataset_id=dataset_id,
119
- metadata={"source": "regression_guard", "original_example_id": t["example_id"]},
127
+ metadata={
128
+ "source": "regression_guard",
129
+ "original_example_id": t["example_id"],
130
+ "added_at_iteration": config.get("iterations", 0),
131
+ },
132
+ split=split,
120
133
  )
121
134
  added += 1
122
135
  except Exception as e:
@@ -148,7 +161,7 @@ def main():
148
161
 
149
162
  added = 0
150
163
  if args.add_guards and transitions:
151
- added = add_regression_guards(client, config["dataset_id"], transitions, args.max_guards)
164
+ added = add_regression_guards(client, config["dataset_id"], transitions, args.max_guards, config=config)
152
165
 
153
166
  result = {
154
167
  "previous": args.previous_experiment,
package/tools/setup.py CHANGED
@@ -87,6 +87,19 @@ def check_dependencies():
87
87
  return missing
88
88
 
89
89
 
90
def assign_splits(client, dataset_id, train_pct=70):
    """Randomly assign every example in a dataset to "train" or "held_out".

    The examples are shuffled; the first ``train_pct`` percent (rounded down)
    become "train" and the rest "held_out". When the dataset is non-empty and
    ``train_pct`` is positive, at least one example is placed in "train" so a
    very small dataset does not end up with an empty train split.

    Args:
        client: Object exposing ``list_examples`` and ``update_example``.
        dataset_id: Identifier of the dataset whose examples are split.
        train_pct: Percentage of examples to put in "train"; values outside
            0-100 are clamped by the size of the dataset.

    Returns:
        Tuple ``(train_count, held_out_count)``.
    """
    import random

    examples = list(client.list_examples(dataset_id=dataset_id))
    random.shuffle(examples)

    split_point = int(len(examples) * train_pct / 100)
    # Clamp so a negative percentage cannot slice from the end of the list.
    split_point = max(0, min(split_point, len(examples)))
    if examples and train_pct > 0 and split_point == 0:
        split_point = 1

    train_examples = examples[:split_point]
    held_out_examples = examples[split_point:]
    for ex in train_examples:
        client.update_example(ex.id, split="train")
    for ex in held_out_examples:
        client.update_example(ex.id, split="held_out")
    return len(train_examples), len(held_out_examples)
101
+
102
+
90
103
  def resolve_dataset_name(client, base_name):
91
104
  """Find an available dataset name by auto-incrementing the version suffix.
92
105
 
@@ -148,10 +161,17 @@ def create_dataset_from_file(client, dataset_name, file_path):
148
161
  if "metadata" in item:
149
162
  ex["metadata"] = item["metadata"]
150
163
 
164
+ if "metadata" not in ex:
165
+ ex["metadata"] = {}
166
+ ex["metadata"].setdefault("source", "file")
167
+ ex["metadata"].setdefault("added_at_iteration", 0)
168
+
151
169
  examples.append(ex)
152
170
 
153
171
  if examples:
154
172
  client.create_examples(dataset_id=dataset.id, examples=examples)
173
+ train_n, held_n = assign_splits(client, dataset.id)
174
+ print(f"Assigned splits: {train_n} train, {held_n} held_out", file=sys.stderr)
155
175
 
156
176
  return dataset, len(examples)
157
177
 
@@ -178,10 +198,13 @@ def create_dataset_from_langsmith(client, dataset_name, source_project, limit=10
178
198
  ex = {"inputs": run.inputs}
179
199
  if run.outputs:
180
200
  ex["outputs"] = run.outputs
201
+ ex["metadata"] = {"source": "production", "added_at_iteration": 0}
181
202
  examples.append(ex)
182
203
 
183
204
  if examples:
184
205
  client.create_examples(dataset_id=dataset.id, examples=examples)
206
+ train_n, held_n = assign_splits(client, dataset.id)
207
+ print(f"Assigned splits: {train_n} train, {held_n} held_out", file=sys.stderr)
185
208
 
186
209
  return dataset, len(examples)
187
210
 
@@ -335,10 +335,16 @@ def fetch_scores_from_experiment(experiment_name):
335
335
  limit=200,
336
336
  ))
337
337
 
338
+ all_run_ids = [run.id for run in runs]
339
+ all_feedbacks = list(client.list_feedback(run_ids=all_run_ids))
340
+ fb_map = {}
341
+ for fb in all_feedbacks:
342
+ fb_map.setdefault(str(fb.run_id), []).append(fb)
343
+
338
344
  per_task = {}
339
345
  for run in runs:
340
346
  example_id = str(run.reference_example_id or run.id)
341
- feedbacks = list(client.list_feedback(run_ids=[run.id]))
347
+ feedbacks = fb_map.get(str(run.id), [])
342
348
  scores = [fb.score for fb in feedbacks if fb.score is not None]
343
349
  avg_score = sum(scores) / len(scores) if scores else 0.0
344
350
  per_task[example_id] = {"score": avg_score}