harness-evolver 4.1.0 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/package.json +1 -1
- package/skills/evolve/SKILL.md +117 -0
- package/tools/__pycache__/adversarial_inject.cpython-313.pyc +0 -0
- package/tools/__pycache__/regression_tracker.cpython-313.pyc +0 -0
- package/tools/__pycache__/setup.cpython-313.pyc +0 -0
- package/tools/adversarial_inject.py +8 -3
- package/tools/dataset_health.py +385 -0
- package/tools/read_results.py +21 -2
- package/tools/regression_tracker.py +17 -4
- package/tools/setup.py +94 -22
- package/tools/trace_insights.py +7 -1
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "harness-evolver",
|
|
3
3
|
"description": "LangSmith-native autonomous agent optimization — evolves LLM agent code using multi-agent proposers, LangSmith experiments, and git worktrees",
|
|
4
|
-
"version": "4.1
|
|
4
|
+
"version": "4.2.1",
|
|
5
5
|
"author": {
|
|
6
6
|
"name": "Raphael Valdetaro"
|
|
7
7
|
},
|
package/package.json
CHANGED
package/skills/evolve/SKILL.md
CHANGED
|
@@ -127,6 +127,122 @@ If critical issues found, ask user whether to continue or fix first via AskUserQ
|
|
|
127
127
|
- "Fix and retry" — attempt auto-fix with `--fix` flag
|
|
128
128
|
- "Abort" — stop the evolution loop
|
|
129
129
|
|
|
130
|
+
### 0.6. Dataset Health Check
|
|
131
|
+
|
|
132
|
+
Run the dataset health diagnostic:
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
$EVOLVER_PY $TOOLS/dataset_health.py \
|
|
136
|
+
--config .evolver.json \
|
|
137
|
+
--production-seed production_seed.json \
|
|
138
|
+
--output health_report.json 2>/dev/null
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
Read `health_report.json`. Print summary:
|
|
142
|
+
```bash
|
|
143
|
+
python3 -c "
|
|
144
|
+
import json, os
|
|
145
|
+
if os.path.exists('health_report.json'):
|
|
146
|
+
r = json.load(open('health_report.json'))
|
|
147
|
+
print(f'Dataset Health: {r[\"health_score\"]}/10 ({r[\"example_count\"]} examples)')
|
|
148
|
+
for issue in r.get('issues', []):
|
|
149
|
+
print(f' [{issue[\"severity\"]}] {issue[\"message\"]}')
|
|
150
|
+
"
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
### 0.7. Auto-Correct Dataset Issues
|
|
154
|
+
|
|
155
|
+
If `health_report.json` has corrections, apply them automatically:
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
CORRECTIONS=$(python3 -c "
|
|
159
|
+
import json, os
|
|
160
|
+
if os.path.exists('health_report.json'):
|
|
161
|
+
r = json.load(open('health_report.json'))
|
|
162
|
+
for c in r.get('corrections', []):
|
|
163
|
+
print(c['action'])
|
|
164
|
+
" 2>/dev/null)
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
For each correction:
|
|
168
|
+
|
|
169
|
+
**If `create_splits`**: Run inline Python to assign 70/30 splits:
|
|
170
|
+
```bash
|
|
171
|
+
$EVOLVER_PY -c "
|
|
172
|
+
from langsmith import Client
|
|
173
|
+
import json, random
|
|
174
|
+
client = Client()
|
|
175
|
+
config = json.load(open('.evolver.json'))
|
|
176
|
+
examples = list(client.list_examples(dataset_name=config['dataset']))
|
|
177
|
+
random.shuffle(examples)
|
|
178
|
+
sp = int(len(examples) * 0.7)
|
|
179
|
+
for ex in examples[:sp]:
|
|
180
|
+
client.update_example(ex.id, split='train')
|
|
181
|
+
for ex in examples[sp:]:
|
|
182
|
+
client.update_example(ex.id, split='held_out')
|
|
183
|
+
print(f'Assigned splits: {sp} train, {len(examples)-sp} held_out')
|
|
184
|
+
"
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
**If `generate_hard`**: Spawn testgen agent with hard-mode instruction:
|
|
188
|
+
```
|
|
189
|
+
Agent(
|
|
190
|
+
subagent_type: "evolver-testgen",
|
|
191
|
+
description: "Generate hard examples to rebalance dataset",
|
|
192
|
+
prompt: |
|
|
193
|
+
<objective>
|
|
194
|
+
The dataset is skewed toward easy examples. Generate {count} HARD examples
|
|
195
|
+
that the current agent is likely to fail on.
|
|
196
|
+
Focus on: edge cases, adversarial inputs, complex multi-step queries,
|
|
197
|
+
ambiguous questions, and inputs that require deep reasoning.
|
|
198
|
+
</objective>
|
|
199
|
+
<files_to_read>
|
|
200
|
+
- .evolver.json
|
|
201
|
+
- strategy.md (if exists)
|
|
202
|
+
- production_seed.json (if exists)
|
|
203
|
+
</files_to_read>
|
|
204
|
+
)
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
**If `fill_coverage`**: Spawn testgen agent with coverage-fill instruction:
|
|
208
|
+
```
|
|
209
|
+
Agent(
|
|
210
|
+
subagent_type: "evolver-testgen",
|
|
211
|
+
description: "Generate examples for missing categories",
|
|
212
|
+
prompt: |
|
|
213
|
+
<objective>
|
|
214
|
+
The dataset is missing these production categories: {categories}.
|
|
215
|
+
Generate 5 examples per missing category.
|
|
216
|
+
Use production_seed.json for real-world patterns in these categories.
|
|
217
|
+
</objective>
|
|
218
|
+
<files_to_read>
|
|
219
|
+
- .evolver.json
|
|
220
|
+
- production_seed.json (if exists)
|
|
221
|
+
</files_to_read>
|
|
222
|
+
)
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
**If `retire_dead`**: Move dead examples to retired split:
|
|
226
|
+
```bash
|
|
227
|
+
$EVOLVER_PY -c "
|
|
228
|
+
from langsmith import Client
|
|
229
|
+
import json
|
|
230
|
+
client = Client()
|
|
231
|
+
report = json.load(open('health_report.json'))
|
|
232
|
+
dead_ids = report.get('dead_examples', {}).get('ids', [])
|
|
233
|
+
config = json.load(open('.evolver.json'))
|
|
234
|
+
examples = {str(e.id): e for e in client.list_examples(dataset_name=config['dataset'])}
|
|
235
|
+
retired = 0
|
|
236
|
+
for eid in dead_ids:
|
|
237
|
+
if eid in examples:
|
|
238
|
+
client.update_example(examples[eid].id, split='retired')
|
|
239
|
+
retired += 1
|
|
240
|
+
print(f'Retired {retired} dead examples')
|
|
241
|
+
"
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
After corrections, log what was done. Do NOT re-run health check (corrections may need an experiment cycle to show effect).
|
|
245
|
+
|
|
130
246
|
For each iteration:
|
|
131
247
|
|
|
132
248
|
### 1. Get Next Version
|
|
@@ -170,6 +286,7 @@ if [ -n "$BEST" ]; then
|
|
|
170
286
|
$EVOLVER_PY $TOOLS/read_results.py \
|
|
171
287
|
--experiment "$BEST" \
|
|
172
288
|
--config .evolver.json \
|
|
289
|
+
--split train \
|
|
173
290
|
--output best_results.json 2>/dev/null
|
|
174
291
|
fi
|
|
175
292
|
```
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -145,15 +145,20 @@ def generate_adversarial_inputs(client, dataset_name, num_inputs=5):
|
|
|
145
145
|
return adversarial
|
|
146
146
|
|
|
147
147
|
|
|
148
|
-
def inject_adversarial(client, dataset_id, adversarial_inputs):
|
|
148
|
+
def inject_adversarial(client, dataset_id, adversarial_inputs, config=None):
|
|
149
149
|
"""Add adversarial examples to dataset."""
|
|
150
|
+
config = config or {}
|
|
150
151
|
added = 0
|
|
151
152
|
for adv in adversarial_inputs:
|
|
152
153
|
try:
|
|
154
|
+
split = "train" if random.random() < 0.7 else "held_out"
|
|
155
|
+
metadata = dict(adv["metadata"])
|
|
156
|
+
metadata["added_at_iteration"] = config.get("iterations", 0)
|
|
153
157
|
client.create_example(
|
|
154
158
|
inputs=adv["inputs"],
|
|
155
159
|
dataset_id=dataset_id,
|
|
156
|
-
metadata=
|
|
160
|
+
metadata=metadata,
|
|
161
|
+
split=split,
|
|
157
162
|
)
|
|
158
163
|
added += 1
|
|
159
164
|
except Exception as e:
|
|
@@ -182,7 +187,7 @@ def main():
|
|
|
182
187
|
|
|
183
188
|
injected = 0
|
|
184
189
|
if args.inject and adversarial:
|
|
185
|
-
injected = inject_adversarial(client, config["dataset_id"], adversarial)
|
|
190
|
+
injected = inject_adversarial(client, config["dataset_id"], adversarial, config=config)
|
|
186
191
|
|
|
187
192
|
result = {
|
|
188
193
|
"memorization_suspects": len(suspicious),
|
|
@@ -0,0 +1,385 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Dataset health diagnostic for Harness Evolver.
|
|
3
|
+
|
|
4
|
+
Analyzes eval dataset quality: size adequacy, difficulty distribution,
|
|
5
|
+
dead examples, production coverage, and split configuration.
|
|
6
|
+
Outputs health_report.json with issues and recommended corrections.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python3 dataset_health.py --config .evolver.json --output health_report.json
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import json
|
|
14
|
+
import os
|
|
15
|
+
import sys
|
|
16
|
+
from datetime import datetime, timezone
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def ensure_langsmith_api_key():
    """Make sure ``LANGSMITH_API_KEY`` is present in the environment.

    If the variable is already set nothing is read from disk. Otherwise the
    langsmith-cli credentials file is scanned for the first line that starts
    with ``api_key`` and contains ``=``; the value after ``=`` (with
    surrounding whitespace and quotes removed) is exported to ``os.environ``.

    Returns:
        bool: True when the variable was already set or was loaded from the
        credentials file, False otherwise.
    """
    if os.environ.get("LANGSMITH_API_KEY"):
        return True

    import platform

    if platform.system() == "Darwin":
        creds_path = os.path.expanduser("~/Library/Application Support/langsmith-cli/credentials")
    else:
        creds_path = os.path.expanduser("~/.config/langsmith-cli/credentials")

    try:
        with open(creds_path, encoding="utf-8") as f:
            for line in f:
                if not line.strip().startswith("api_key"):
                    continue
                # A line without "=" carries no value; skip it instead of
                # crashing on a missing split element.
                _, sep, value = line.partition("=")
                if not sep:
                    continue
                key = value.strip().strip("'\"")
                if key:
                    os.environ["LANGSMITH_API_KEY"] = key
                    return True
    except (OSError, UnicodeDecodeError):
        # Missing/unreadable/undecodable credentials file: treat as "no key".
        pass
    return False
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def load_json_safe(path):
    """Load a JSON file, returning None when it cannot be used.

    Args:
        path: Path of the file to read. Falsy values (``None``, ``""``)
            return None without touching the filesystem.

    Returns:
        The decoded JSON value, or None when the path is falsy, the file is
        missing or unreadable, its bytes are not valid UTF-8, or its content
        is not valid JSON.
    """
    if not path:
        return None
    try:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        return None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def check_size(examples, evaluators):
    """Check whether the dataset has enough examples.

    The recommended minimum is 10 examples per evaluator, with a floor of 20.

    Args:
        examples: Sized collection of dataset examples.
        evaluators: Sized collection of evaluator names; ``None`` is treated
            as "no evaluators" so a null config value does not crash.

    Returns:
        dict: ``example_count``, ``min_recommended`` and the boolean
        ``adequate`` (count >= min_recommended).
    """
    count = len(examples)
    min_recommended = max(20, 10 * len(evaluators or ()))
    return {
        "example_count": count,
        "min_recommended": min_recommended,
        "adequate": count >= min_recommended,
    }
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _difficulty_bucket(avg):
    """Map an average score to "easy" (> 0.9), "medium" (0.5 to 0.9) or "hard" (< 0.5)."""
    if avg > 0.9:
        return "easy"
    if avg >= 0.5:
        return "medium"
    return "hard"


def check_difficulty(client, config):
    """Check difficulty distribution from best experiment scores.

    Reads up to 200 root runs of ``config["best_experiment"]``, fetches their
    feedback in a single ``list_feedback`` call, averages the non-null
    feedback scores per run and buckets each average as easy/medium/hard.

    Args:
        client: Object exposing ``list_runs`` and ``list_feedback`` the way
            this function calls them (presumably a LangSmith client).
        config: Mapping that may hold ``best_experiment``.

    Returns:
        dict | None: ``easy``/``medium``/``hard`` run counts, ``skew``
        (``"easy_heavy"`` or ``"hard_heavy"`` when one bucket exceeds 60% of
        the scored runs, else None) and ``example_difficulties`` mapping an
        example id (or the run id when the run has no reference example) to
        its bucket. None when there is no best experiment, no runs, no scored
        runs, or the lookup fails.
    """
    best_exp = config.get("best_experiment")
    if not best_exp:
        return None

    try:
        runs = list(client.list_runs(project_name=best_exp, is_root=True, limit=200))
        if not runs:
            return None

        # One batched feedback request for all runs, grouped by run id.
        fb_map = {}
        for fb in client.list_feedback(run_ids=[run.id for run in runs]):
            fb_map.setdefault(str(fb.run_id), []).append(fb)

        counts = {"easy": 0, "medium": 0, "hard": 0}
        example_difficulties = {}
        for run in runs:
            run_scores = [fb.score for fb in fb_map.get(str(run.id), []) if fb.score is not None]
            if not run_scores:
                continue
            bucket = _difficulty_bucket(sum(run_scores) / len(run_scores))
            counts[bucket] += 1
            example_difficulties[str(run.reference_example_id or run.id)] = bucket

        total = sum(counts.values())
        if not total:
            return None

        skew = None
        if counts["easy"] / total > 0.6:
            skew = "easy_heavy"
        elif counts["hard"] / total > 0.6:
            skew = "hard_heavy"

        return {
            "easy": counts["easy"],
            "medium": counts["medium"],
            "hard": counts["hard"],
            "skew": skew,
            "example_difficulties": example_difficulties,
        }
    except Exception as exc:
        # Best-effort diagnostic: report the failure but do not abort the run.
        print(f"Warning: difficulty check failed for {best_exp}: {exc}", file=sys.stderr)
        return None
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def check_dead_examples(client, config):
    """Find examples that scored >=0.9 across all recent experiments.

    Looks at the last three entries of ``config["history"]``. For each
    experiment it reads up to 200 root runs, fetches their feedback in one
    batched call and records the average non-null feedback score per run
    under the run's reference example id (or the run id when there is none).
    An example is "dead" when it was scored in at least two distinct
    experiments and every recorded average is >= 0.9.

    Args:
        client: Object exposing ``list_runs`` and ``list_feedback`` the way
            this function calls them (presumably a LangSmith client).
        config: Mapping that may hold ``history``, a list of dicts with an
            ``experiment`` name. Entries without a name are skipped.

    Returns:
        dict: ``{"count": int, "ids": list[str]}``. Empty when fewer than two
        history entries exist. Experiments whose lookup fails are skipped
        with a warning on stderr.
    """
    history = config.get("history") or []
    if len(history) < 2:
        return {"count": 0, "ids": []}

    # Tolerate malformed history entries rather than aborting the whole report.
    recent_exps = [h.get("experiment") for h in history[-3:] if isinstance(h, dict)]
    example_scores = {}
    example_experiments = {}

    for exp_name in recent_exps:
        if not exp_name:
            continue
        try:
            runs = list(client.list_runs(project_name=exp_name, is_root=True, limit=200))
            if not runs:
                continue

            fb_map = {}
            for fb in client.list_feedback(run_ids=[run.id for run in runs]):
                fb_map.setdefault(str(fb.run_id), []).append(fb)

            for run in runs:
                eid = str(run.reference_example_id or run.id)
                run_scores = [fb.score for fb in fb_map.get(str(run.id), []) if fb.score is not None]
                if run_scores:
                    example_scores.setdefault(eid, []).append(sum(run_scores) / len(run_scores))
                    # Track experiments separately so repeated runs inside a
                    # single experiment do not count as "seen twice".
                    example_experiments.setdefault(eid, set()).add(exp_name)
        except Exception as exc:
            print(f"Warning: could not read experiment {exp_name}: {exc}", file=sys.stderr)
            continue

    dead_ids = [
        eid
        for eid, exp_scores in example_scores.items()
        if len(example_experiments[eid]) >= 2 and all(s >= 0.9 for s in exp_scores)
    ]

    return {"count": len(dead_ids), "ids": dead_ids}
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def check_coverage(examples, production):
    """Compare dataset categories vs production traffic.

    Args:
        examples: Iterable of objects whose optional ``metadata`` mapping may
            hold a ``category`` entry.
        production: Parsed production seed. Only a dict is usable; its
            ``categories`` list may contain plain strings or dicts with a
            ``category`` key. Other entries are ignored.

    Returns:
        dict | None: sorted ``production``, ``dataset`` and ``missing``
        category lists plus ``pct``, the integer percentage of production
        categories present in the dataset. None when the seed is missing, is
        not a dict, or names no categories.
    """
    # The seed comes from arbitrary JSON; anything but a dict has no
    # "categories" field to read.
    if not isinstance(production, dict):
        return None

    prod_categories = set()
    for cat in production.get("categories") or []:
        if isinstance(cat, str):
            prod_categories.add(cat)
        elif isinstance(cat, dict) and "category" in cat:
            prod_categories.add(cat["category"])

    if not prod_categories:
        return None

    dataset_categories = set()
    for ex in examples:
        meta = getattr(ex, "metadata", None) or {}
        if "category" in meta:
            dataset_categories.add(meta["category"])

    missing = prod_categories - dataset_categories
    covered = len(prod_categories) - len(missing)
    coverage_pct = int(100 * covered / len(prod_categories))

    return {
        "production": sorted(prod_categories),
        "dataset": sorted(dataset_categories),
        "missing": sorted(missing),
        "pct": coverage_pct,
    }
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def check_splits(client, dataset_name):
    """Check if train/held_out splits exist.

    Each split is probed by requesting at most one example from it.

    Args:
        client: Object exposing ``list_examples(dataset_name=..., splits=...,
            limit=...)`` (presumably a LangSmith client).
        dataset_name: Name of the dataset to inspect.

    Returns:
        dict: ``{"has_train": bool, "has_held_out": bool}``. A split whose
        lookup raises is reported as absent, with a warning on stderr so the
        failure is distinguishable from a genuinely empty split.
    """

    def _has_examples(split):
        """Return True when at least one example is listed for ``split``."""
        try:
            found = list(client.list_examples(dataset_name=dataset_name, splits=[split], limit=1))
        except Exception as exc:
            print(f"Warning: could not check '{split}' split: {exc}", file=sys.stderr)
            return False
        return len(found) > 0

    return {
        "has_train": _has_examples("train"),
        "has_held_out": _has_examples("held_out"),
    }
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def compute_health_score(size_info, difficulty, dead, coverage, splits):
    """Return the dataset health score, an integer from 0 to 10.

    Starts from 10 and subtracts a fixed penalty per detected problem:
    3 for an inadequate size, 2 for any difficulty skew, 1 when dead examples
    exceed 20% of the example count, 2 when production coverage is below 75%,
    and 2 when no train split exists. The result never drops below 0.
    """
    penalty = 0

    size_ok = size_info.get("adequate")
    if not size_ok:
        penalty += 3

    skewed = bool(difficulty) and bool(difficulty.get("skew"))
    if skewed:
        penalty += 2

    if dead and dead.get("count", 0) > 0:
        example_total = max(size_info.get("example_count", 1), 1)
        dead_share = dead["count"] / example_total
        if dead_share > 0.2:
            penalty += 1

    if coverage:
        if coverage.get("pct", 100) < 75:
            penalty += 2

    if splits:
        if not splits.get("has_train"):
            penalty += 2

    remaining = 10 - penalty
    return remaining if remaining > 0 else 0
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def build_issues_and_corrections(size_info, difficulty, dead, coverage, splits):
    """Build issues and corrections lists.

    Args:
        size_info: Dict with ``example_count``, ``min_recommended`` and
            ``adequate``.
        difficulty: Dict with ``easy``/``medium``/``hard`` counts and
            ``skew``, or None.
        dead: Dict with ``count`` and ``ids``, or None.
        coverage: Dict with ``missing`` and ``pct``, or None.
        splits: Dict with ``has_train``, or None.

    Returns:
        tuple[list[dict], list[dict]]: ``(issues, corrections)``. Each issue
        has ``type``, ``severity`` and ``message``; each correction has an
        ``action`` plus action-specific fields. A ``hard_heavy`` skew yields
        an issue but no correction.
    """
    issues = []
    corrections = []

    if not size_info.get("adequate"):
        issues.append({
            "type": "size_inadequate",
            "severity": "high",
            "message": f"Only {size_info['example_count']} examples (recommended: {size_info['min_recommended']}+)",
        })
        corrections.append({
            "action": "generate_more",
            "count": size_info["min_recommended"] - size_info["example_count"],
        })

    skew = difficulty.get("skew") if difficulty else None
    if skew in ("easy_heavy", "hard_heavy"):
        scored = max(difficulty["easy"] + difficulty["medium"] + difficulty["hard"], 1)
        if skew == "easy_heavy":
            easy_pct = int(100 * difficulty["easy"] / scored)
            issues.append({
                "type": "difficulty_skew",
                "severity": "high",
                "message": f"{easy_pct}% easy examples — low discriminative power",
            })
            corrections.append({
                "action": "generate_hard",
                "count": max(5, difficulty["easy"] // 3),
            })
        else:
            # Any skew lowers the health score, so surface the hard-heavy
            # case too; otherwise the penalty shows up with no explanation.
            hard_pct = int(100 * difficulty["hard"] / scored)
            issues.append({
                "type": "difficulty_skew",
                "severity": "medium",
                "message": f"{hard_pct}% hard examples — dataset skewed toward failures",
            })

    if dead and dead.get("count", 0) > 0:
        total = size_info.get("example_count", 1)
        dead_pct = int(100 * dead["count"] / max(total, 1))
        if dead_pct > 10:
            issues.append({
                "type": "dead_examples",
                "severity": "medium",
                "message": f"{dead['count']} dead examples ({dead_pct}%) — scored >=0.9 in all recent experiments",
            })
            corrections.append({
                "action": "retire_dead",
                "ids": dead["ids"],
            })

    if coverage and coverage.get("missing"):
        issues.append({
            "type": "coverage_gap",
            "severity": "high",
            "message": f"Missing categories: {', '.join(coverage['missing'])} ({coverage['pct']}% coverage)",
        })
        corrections.append({
            "action": "fill_coverage",
            "categories": coverage["missing"],
        })

    if splits and not splits.get("has_train"):
        issues.append({
            "type": "no_splits",
            "severity": "medium",
            "message": "No train/held-out split — proposer overfit risk",
        })
        corrections.append({
            "action": "create_splits",
            "train_pct": 70,
        })

    return issues, corrections
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def main():
    """Run every dataset check, write the JSON report and print a summary.

    Command-line options: ``--config`` (default ``.evolver.json``),
    ``--production-seed`` (default ``production_seed.json``) and ``--output``
    (default ``health_report.json``).

    Side effects: may set ``LANGSMITH_API_KEY`` in the environment, updates
    the ``difficulty`` metadata of examples whose computed bucket differs
    from the stored one, writes the report file and prints to stdout.
    """
    parser = argparse.ArgumentParser(description="Dataset health diagnostic")
    parser.add_argument("--config", default=".evolver.json")
    parser.add_argument("--production-seed", default="production_seed.json")
    parser.add_argument("--output", default="health_report.json")
    args = parser.parse_args()

    ensure_langsmith_api_key()

    with open(args.config, encoding="utf-8") as f:
        config = json.load(f)

    production = load_json_safe(args.production_seed)

    # Imported here rather than at module level, as in the original.
    from langsmith import Client
    client = Client()

    dataset_name = config["dataset"]

    # Get all examples
    # NOTE(review): limit=500 caps what the size/coverage checks can see —
    # confirm that larger datasets are not expected.
    examples = list(client.list_examples(dataset_name=dataset_name, limit=500))

    # Run checks
    evaluators = config.get("evaluators", ["correctness"])
    size_info = check_size(examples, evaluators)
    difficulty = check_difficulty(client, config)
    dead = check_dead_examples(client, config)
    coverage = check_coverage(examples, production)
    splits = check_splits(client, dataset_name)

    # Tag difficulty metadata on examples if we computed it
    if difficulty and difficulty.get("example_difficulties"):
        failed_updates = 0
        for ex in examples:
            diff = difficulty["example_difficulties"].get(str(ex.id))
            if not diff:
                continue
            meta = dict(getattr(ex, "metadata", None) or {})
            if meta.get("difficulty") == diff:
                continue
            meta["difficulty"] = diff
            try:
                client.update_example(ex.id, metadata=meta)
            except Exception:
                # Tagging is best-effort; count failures instead of hiding them.
                failed_updates += 1
        if failed_updates:
            print(
                f"Warning: could not update difficulty metadata on {failed_updates} example(s)",
                file=sys.stderr,
            )

    # Compute health score and build report
    health_score = compute_health_score(size_info, difficulty, dead, coverage, splits)
    issues, corrections = build_issues_and_corrections(size_info, difficulty, dead, coverage, splits)

    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "health_score": health_score,
        "example_count": size_info["example_count"],
        "min_recommended": size_info["min_recommended"],
        # The per-example map is only used for tagging above; keep it out of the report.
        "difficulty": {k: v for k, v in (difficulty or {}).items() if k != "example_difficulties"} or None,
        "dead_examples": dead,
        "coverage": coverage,
        "splits": splits,
        "issues": issues,
        "corrections": corrections,
    }

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)

    # Print human-readable summary
    print(f"Dataset Health: {health_score}/10")
    print(f"Examples: {size_info['example_count']} (min recommended: {size_info['min_recommended']})")
    if difficulty:
        print(f"Difficulty: {difficulty.get('easy', 0)} easy, {difficulty.get('medium', 0)} medium, {difficulty.get('hard', 0)} hard")
    if dead and dead["count"] > 0:
        print(f"Dead examples: {dead['count']}")
    if coverage:
        print(f"Coverage: {coverage['pct']}% ({len(coverage.get('missing', []))} categories missing)")
    if splits:
        print(f"Splits: train={'yes' if splits['has_train'] else 'no'}, held_out={'yes' if splits['has_held_out'] else 'no'}")
    if issues:
        print(f"\nIssues ({len(issues)}):")
        for issue in issues:
            print(f" [{issue['severity']}] {issue['message']}")


if __name__ == "__main__":
    main()
|
package/tools/read_results.py
CHANGED
|
@@ -79,6 +79,13 @@ def read_experiment(client, experiment_name):
|
|
|
79
79
|
total_latency_ms = 0
|
|
80
80
|
errors = 0
|
|
81
81
|
|
|
82
|
+
# Batch-fetch all feedback in one API call instead of N+1
|
|
83
|
+
all_run_ids = [run.id for run in runs]
|
|
84
|
+
all_feedbacks = list(client.list_feedback(run_ids=all_run_ids))
|
|
85
|
+
fb_map = {}
|
|
86
|
+
for fb in all_feedbacks:
|
|
87
|
+
fb_map.setdefault(str(fb.run_id), []).append(fb)
|
|
88
|
+
|
|
82
89
|
for run in runs:
|
|
83
90
|
example_id = str(run.reference_example_id or run.id)
|
|
84
91
|
tokens = run.total_tokens or 0
|
|
@@ -93,8 +100,8 @@ def read_experiment(client, experiment_name):
|
|
|
93
100
|
if has_error:
|
|
94
101
|
errors += 1
|
|
95
102
|
|
|
96
|
-
# Read feedback/scores
|
|
97
|
-
feedbacks =
|
|
103
|
+
# Read feedback/scores from pre-fetched batch
|
|
104
|
+
feedbacks = fb_map.get(str(run.id), [])
|
|
98
105
|
scores = {}
|
|
99
106
|
for fb in feedbacks:
|
|
100
107
|
if fb.score is not None:
|
|
@@ -220,6 +227,7 @@ def main():
|
|
|
220
227
|
parser.add_argument("--config", default=".evolver.json", help="Path to .evolver.json")
|
|
221
228
|
parser.add_argument("--output", default=None, help="Output JSON path")
|
|
222
229
|
parser.add_argument("--format", default="json", choices=["json", "markdown"], help="Output format")
|
|
230
|
+
parser.add_argument("--split", default=None, help="Filter by dataset split (e.g., 'train')")
|
|
223
231
|
args = parser.parse_args()
|
|
224
232
|
ensure_langsmith_api_key()
|
|
225
233
|
|
|
@@ -233,6 +241,17 @@ def main():
|
|
|
233
241
|
print(f"No results found for experiment: {args.experiment}", file=sys.stderr)
|
|
234
242
|
sys.exit(1)
|
|
235
243
|
|
|
244
|
+
if args.split and result and "per_example" in result:
|
|
245
|
+
with open(args.config) as f:
|
|
246
|
+
cfg = json.load(f)
|
|
247
|
+
split_example_ids = set()
|
|
248
|
+
for ex in client.list_examples(dataset_name=cfg["dataset"], splits=[args.split]):
|
|
249
|
+
split_example_ids.add(str(ex.id))
|
|
250
|
+
result["per_example"] = {k: v for k, v in result["per_example"].items() if k in split_example_ids}
|
|
251
|
+
all_scores = [v["score"] for v in result["per_example"].values()]
|
|
252
|
+
result["combined_score"] = sum(all_scores) / len(all_scores) if all_scores else 0.0
|
|
253
|
+
result["num_examples"] = len(result["per_example"])
|
|
254
|
+
|
|
236
255
|
if args.format == "markdown":
|
|
237
256
|
output = format_markdown(result)
|
|
238
257
|
else:
|
|
@@ -17,6 +17,7 @@ import argparse
|
|
|
17
17
|
import json
|
|
18
18
|
import os
|
|
19
19
|
import platform
|
|
20
|
+
import random
|
|
20
21
|
import sys
|
|
21
22
|
|
|
22
23
|
|
|
@@ -60,9 +61,14 @@ def get_per_example_scores(client, experiment_name):
|
|
|
60
61
|
scores = {}
|
|
61
62
|
try:
|
|
62
63
|
runs = list(client.list_runs(project_name=experiment_name, is_root=True, limit=200))
|
|
64
|
+
all_run_ids = [run.id for run in runs]
|
|
65
|
+
all_feedbacks = list(client.list_feedback(run_ids=all_run_ids))
|
|
66
|
+
fb_map = {}
|
|
67
|
+
for fb in all_feedbacks:
|
|
68
|
+
fb_map.setdefault(str(fb.run_id), []).append(fb)
|
|
63
69
|
for run in runs:
|
|
64
70
|
example_id = str(run.reference_example_id or run.id)
|
|
65
|
-
feedbacks =
|
|
71
|
+
feedbacks = fb_map.get(str(run.id), [])
|
|
66
72
|
fb_scores = {}
|
|
67
73
|
for fb in feedbacks:
|
|
68
74
|
if fb.score is not None:
|
|
@@ -107,16 +113,23 @@ def find_transitions(prev_scores, curr_scores, fail_threshold=0.5, pass_threshol
|
|
|
107
113
|
return transitions, regressions
|
|
108
114
|
|
|
109
115
|
|
|
110
|
-
def add_regression_guards(client, dataset_id, transitions, max_guards=5):
|
|
116
|
+
def add_regression_guards(client, dataset_id, transitions, max_guards=5, config=None):
|
|
111
117
|
"""Add regression guard examples to the dataset."""
|
|
118
|
+
config = config or {}
|
|
112
119
|
added = 0
|
|
113
120
|
for t in transitions[:max_guards]:
|
|
114
121
|
try:
|
|
115
122
|
input_data = json.loads(t["input"]) if t["input"].startswith("{") else {"input": t["input"]}
|
|
123
|
+
split = "train" if random.random() < 0.7 else "held_out"
|
|
116
124
|
client.create_example(
|
|
117
125
|
inputs=input_data,
|
|
118
126
|
dataset_id=dataset_id,
|
|
119
|
-
metadata={
|
|
127
|
+
metadata={
|
|
128
|
+
"source": "regression_guard",
|
|
129
|
+
"original_example_id": t["example_id"],
|
|
130
|
+
"added_at_iteration": config.get("iterations", 0),
|
|
131
|
+
},
|
|
132
|
+
split=split,
|
|
120
133
|
)
|
|
121
134
|
added += 1
|
|
122
135
|
except Exception as e:
|
|
@@ -148,7 +161,7 @@ def main():
|
|
|
148
161
|
|
|
149
162
|
added = 0
|
|
150
163
|
if args.add_guards and transitions:
|
|
151
|
-
added = add_regression_guards(client, config["dataset_id"], transitions, args.max_guards)
|
|
164
|
+
added = add_regression_guards(client, config["dataset_id"], transitions, args.max_guards, config=config)
|
|
152
165
|
|
|
153
166
|
result = {
|
|
154
167
|
"previous": args.previous_experiment,
|
package/tools/setup.py
CHANGED
|
@@ -32,13 +32,19 @@ import tempfile
|
|
|
32
32
|
from datetime import datetime, timezone
|
|
33
33
|
|
|
34
34
|
|
|
35
|
+
# Track where the API key was loaded from
|
|
36
|
+
key_source = None
|
|
37
|
+
|
|
38
|
+
|
|
35
39
|
def ensure_langsmith_api_key():
|
|
36
40
|
"""Load LANGSMITH_API_KEY from credentials file if not in env.
|
|
37
41
|
|
|
38
42
|
The installer saves the key to the langsmith-cli credentials file,
|
|
39
43
|
but the SDK only reads the env var. This bridges the gap.
|
|
40
44
|
"""
|
|
45
|
+
global key_source
|
|
41
46
|
if os.environ.get("LANGSMITH_API_KEY"):
|
|
47
|
+
key_source = "environment"
|
|
42
48
|
return True
|
|
43
49
|
|
|
44
50
|
# Platform-specific credentials path (matches langsmith-cli)
|
|
@@ -56,6 +62,7 @@ def ensure_langsmith_api_key():
|
|
|
56
62
|
key = line.split("=", 1)[1].strip()
|
|
57
63
|
if key:
|
|
58
64
|
os.environ["LANGSMITH_API_KEY"] = key
|
|
65
|
+
key_source = "credentials file"
|
|
59
66
|
return True
|
|
60
67
|
except OSError:
|
|
61
68
|
pass
|
|
@@ -70,6 +77,7 @@ def ensure_langsmith_api_key():
|
|
|
70
77
|
key = line.split("=", 1)[1].strip().strip("'\"")
|
|
71
78
|
if key:
|
|
72
79
|
os.environ["LANGSMITH_API_KEY"] = key
|
|
80
|
+
key_source = ".env file"
|
|
73
81
|
return True
|
|
74
82
|
except OSError:
|
|
75
83
|
pass
|
|
@@ -87,6 +95,19 @@ def check_dependencies():
|
|
|
87
95
|
return missing
|
|
88
96
|
|
|
89
97
|
|
|
98
|
+
def assign_splits(client, dataset_id, train_pct=70, seed=None):
    """Assign train/held_out splits to all examples in a dataset.

    Examples are shuffled, the first ``train_pct`` percent (rounded down) are
    marked ``"train"`` and the rest ``"held_out"``, one ``update_example``
    call per example.

    Args:
        client: Object exposing ``list_examples(dataset_id=...)`` and
            ``update_example(id, split=...)`` (presumably a LangSmith client).
        dataset_id: Identifier of the dataset whose examples are split.
        train_pct: Percentage of examples for the train split. Values outside
            0-100 are clamped so the two slices can never overlap.
        seed: Optional seed for a reproducible shuffle. When None the
            module-level ``random`` generator is used.

    Returns:
        tuple[int, int]: ``(train_count, held_out_count)``.
    """
    import random

    examples = list(client.list_examples(dataset_id=dataset_id))
    if seed is None:
        random.shuffle(examples)
    else:
        random.Random(seed).shuffle(examples)

    train_pct = min(100, max(0, train_pct))
    split_point = int(len(examples) * train_pct / 100)
    # Rounding down would leave a tiny dataset (e.g. a single example) with
    # no train examples at all; keep at least one when a train share was asked for.
    if examples and train_pct > 0:
        split_point = max(1, split_point)

    train_examples = examples[:split_point]
    held_out_examples = examples[split_point:]
    for ex in train_examples:
        client.update_example(ex.id, split="train")
    for ex in held_out_examples:
        client.update_example(ex.id, split="held_out")
    return len(train_examples), len(held_out_examples)
|
|
109
|
+
|
|
110
|
+
|
|
90
111
|
def resolve_dataset_name(client, base_name):
|
|
91
112
|
"""Find an available dataset name by auto-incrementing the version suffix.
|
|
92
113
|
|
|
@@ -110,6 +131,21 @@ def resolve_dataset_name(client, base_name):
|
|
|
110
131
|
return f"{base_name}-eval-{ts}", 0
|
|
111
132
|
|
|
112
133
|
|
|
134
|
+
def create_dataset_with_retry(client, dataset_name, description, max_retries=3):
    """Create dataset with retry for transient errors.

    An error is treated as transient when its text contains ``403`` or
    ``500``. Between attempts the function sleeps ``2 ** attempt + 0.5``
    seconds and prints a notice to stderr.

    Args:
        client: Object exposing ``create_dataset(dataset_name=...,
            description=...)`` (presumably a LangSmith client).
        dataset_name: Name of the dataset to create.
        description: Description stored with the dataset.
        max_retries: Maximum number of attempts. Values below 1 still make
            one attempt, so the function never returns None without trying.

    Returns:
        Whatever ``client.create_dataset`` returns.

    Raises:
        Exception: The error from ``create_dataset`` when it is not transient
            or the last attempt has failed.
    """
    import time

    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            return client.create_dataset(dataset_name=dataset_name, description=description)
        except Exception as e:
            transient = "403" in str(e) or "500" in str(e)
            if attempt + 1 >= attempts or not transient:
                raise
            wait = 2 ** attempt + 0.5
            # One decimal so the logged delay matches the real one (1.5s, 2.5s, ...).
            print(f" Transient error creating dataset (attempt {attempt + 1}/{attempts}), retrying in {wait:.1f}s...", file=sys.stderr)
            time.sleep(wait)
|
|
147
|
+
|
|
148
|
+
|
|
113
149
|
def create_dataset_from_file(client, dataset_name, file_path):
|
|
114
150
|
"""Create a LangSmith dataset from a JSON file of inputs."""
|
|
115
151
|
with open(file_path) as f:
|
|
@@ -118,8 +154,8 @@ def create_dataset_from_file(client, dataset_name, file_path):
|
|
|
118
154
|
if isinstance(data, dict):
|
|
119
155
|
data = data.get("examples", data.get("tasks", [data]))
|
|
120
156
|
|
|
121
|
-
dataset =
|
|
122
|
-
dataset_name
|
|
157
|
+
dataset = create_dataset_with_retry(
|
|
158
|
+
client, dataset_name,
|
|
123
159
|
description=f"Evaluation dataset created from {os.path.basename(file_path)}",
|
|
124
160
|
)
|
|
125
161
|
|
|
@@ -148,10 +184,17 @@ def create_dataset_from_file(client, dataset_name, file_path):
|
|
|
148
184
|
if "metadata" in item:
|
|
149
185
|
ex["metadata"] = item["metadata"]
|
|
150
186
|
|
|
187
|
+
if "metadata" not in ex:
|
|
188
|
+
ex["metadata"] = {}
|
|
189
|
+
ex["metadata"].setdefault("source", "file")
|
|
190
|
+
ex["metadata"].setdefault("added_at_iteration", 0)
|
|
191
|
+
|
|
151
192
|
examples.append(ex)
|
|
152
193
|
|
|
153
194
|
if examples:
|
|
154
195
|
client.create_examples(dataset_id=dataset.id, examples=examples)
|
|
196
|
+
train_n, held_n = assign_splits(client, dataset.id)
|
|
197
|
+
print(f"Assigned splits: {train_n} train, {held_n} held_out", file=sys.stderr)
|
|
155
198
|
|
|
156
199
|
return dataset, len(examples)
|
|
157
200
|
|
|
@@ -167,8 +210,8 @@ def create_dataset_from_langsmith(client, dataset_name, source_project, limit=10
|
|
|
167
210
|
if not runs:
|
|
168
211
|
return None, 0
|
|
169
212
|
|
|
170
|
-
dataset =
|
|
171
|
-
dataset_name
|
|
213
|
+
dataset = create_dataset_with_retry(
|
|
214
|
+
client, dataset_name,
|
|
172
215
|
description=f"Evaluation dataset from production traces ({source_project})",
|
|
173
216
|
)
|
|
174
217
|
|
|
@@ -178,18 +221,21 @@ def create_dataset_from_langsmith(client, dataset_name, source_project, limit=10
|
|
|
178
221
|
ex = {"inputs": run.inputs}
|
|
179
222
|
if run.outputs:
|
|
180
223
|
ex["outputs"] = run.outputs
|
|
224
|
+
ex["metadata"] = {"source": "production", "added_at_iteration": 0}
|
|
181
225
|
examples.append(ex)
|
|
182
226
|
|
|
183
227
|
if examples:
|
|
184
228
|
client.create_examples(dataset_id=dataset.id, examples=examples)
|
|
229
|
+
train_n, held_n = assign_splits(client, dataset.id)
|
|
230
|
+
print(f"Assigned splits: {train_n} train, {held_n} held_out", file=sys.stderr)
|
|
185
231
|
|
|
186
232
|
return dataset, len(examples)
|
|
187
233
|
|
|
188
234
|
|
|
189
235
|
def create_empty_dataset(client, dataset_name):
    """Return a new dataset with no examples (the testgen agent populates it later)."""
    placeholder_description = "Evaluation dataset (pending test generation)"
    return create_dataset_with_retry(
        client,
        dataset_name,
        description=placeholder_description,
    )
|
|
@@ -316,22 +362,30 @@ def run_baseline(client, dataset_name, entry_point, evaluators):
|
|
|
316
362
|
)
|
|
317
363
|
|
|
318
364
|
experiment_name = results.experiment_name
|
|
319
|
-
|
|
365
|
+
|
|
366
|
+
# Try to extract scores — this can fail with different SDK versions
|
|
367
|
+
mean_score = 0.0
|
|
320
368
|
try:
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
369
|
+
scores = []
|
|
370
|
+
for result in results:
|
|
371
|
+
# Handle both object and dict result formats
|
|
372
|
+
if hasattr(result, 'evaluation_results'):
|
|
373
|
+
eval_results = result.evaluation_results
|
|
374
|
+
elif isinstance(result, dict):
|
|
375
|
+
eval_results = result.get("evaluation_results", {})
|
|
376
|
+
else:
|
|
377
|
+
continue
|
|
325
378
|
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
if er.get("score") is not None:
|
|
332
|
-
scores.append(er["score"])
|
|
379
|
+
results_list = eval_results.get("results", []) if isinstance(eval_results, dict) else []
|
|
380
|
+
for er in results_list:
|
|
381
|
+
score = er.get("score") if isinstance(er, dict) else getattr(er, "score", None)
|
|
382
|
+
if score is not None:
|
|
383
|
+
scores.append(score)
|
|
333
384
|
|
|
334
|
-
|
|
385
|
+
mean_score = sum(scores) / len(scores) if scores else 0.0
|
|
386
|
+
except Exception as e:
|
|
387
|
+
print(f" Warning: Could not extract baseline scores: {e}", file=sys.stderr)
|
|
388
|
+
print(f" Baseline experiment '{experiment_name}' was created — scores will be computed during /evolve", file=sys.stderr)
|
|
335
389
|
|
|
336
390
|
return experiment_name, mean_score
|
|
337
391
|
|
|
@@ -370,10 +424,28 @@ def main():
|
|
|
370
424
|
# Verify connection
|
|
371
425
|
try:
|
|
372
426
|
client.list_datasets(limit=1)
|
|
373
|
-
print("LangSmith connection verified.")
|
|
427
|
+
print(f"LangSmith connection verified (key from {key_source}).")
|
|
428
|
+
except Exception as e:
|
|
429
|
+
if key_source in ("credentials file", ".env file"):
|
|
430
|
+
print(f"ERROR: API key loaded from {key_source} is invalid or lacks permissions.", file=sys.stderr)
|
|
431
|
+
print(f"The key was loaded from the {key_source} but LangSmith rejected it.", file=sys.stderr)
|
|
432
|
+
print(f"Fix: export LANGSMITH_API_KEY=lsv2_pt_... (with a valid key)", file=sys.stderr)
|
|
433
|
+
else:
|
|
434
|
+
print(f"Failed to connect to LangSmith: {e}", file=sys.stderr)
|
|
435
|
+
sys.exit(1)
|
|
436
|
+
|
|
437
|
+
# Verify write permissions
|
|
438
|
+
try:
|
|
439
|
+
test_ds = client.create_dataset(
|
|
440
|
+
dataset_name="_evolver-permission-check",
|
|
441
|
+
description="Temporary — verifying write permissions",
|
|
442
|
+
)
|
|
443
|
+
client.delete_dataset(dataset_id=test_ds.id)
|
|
444
|
+
print("Write permissions verified.")
|
|
374
445
|
except Exception as e:
|
|
375
|
-
print(f"
|
|
376
|
-
print("
|
|
446
|
+
print(f"ERROR: API key can read but cannot write to LangSmith.", file=sys.stderr)
|
|
447
|
+
print(f"The key needs 'Editor' role or higher to create datasets.", file=sys.stderr)
|
|
448
|
+
print(f"Details: {e}", file=sys.stderr)
|
|
377
449
|
sys.exit(1)
|
|
378
450
|
|
|
379
451
|
project_name = f"evolver-{args.project_name}"
|
package/tools/trace_insights.py
CHANGED
|
@@ -335,10 +335,16 @@ def fetch_scores_from_experiment(experiment_name):
|
|
|
335
335
|
limit=200,
|
|
336
336
|
))
|
|
337
337
|
|
|
338
|
+
all_run_ids = [run.id for run in runs]
|
|
339
|
+
all_feedbacks = list(client.list_feedback(run_ids=all_run_ids))
|
|
340
|
+
fb_map = {}
|
|
341
|
+
for fb in all_feedbacks:
|
|
342
|
+
fb_map.setdefault(str(fb.run_id), []).append(fb)
|
|
343
|
+
|
|
338
344
|
per_task = {}
|
|
339
345
|
for run in runs:
|
|
340
346
|
example_id = str(run.reference_example_id or run.id)
|
|
341
|
-
feedbacks =
|
|
347
|
+
feedbacks = fb_map.get(str(run.id), [])
|
|
342
348
|
scores = [fb.score for fb in feedbacks if fb.score is not None]
|
|
343
349
|
avg_score = sum(scores) / len(scores) if scores else 0.0
|
|
344
350
|
per_task[example_id] = {"score": avg_score}
|