latch-eval-tools 0.1.0 (latch_eval_tools-0.1.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- latch_eval_tools/__init__.py +64 -0
- latch_eval_tools/answer_extraction.py +35 -0
- latch_eval_tools/cli/__init__.py +0 -0
- latch_eval_tools/cli/eval_lint.py +185 -0
- latch_eval_tools/eval_server.py +570 -0
- latch_eval_tools/faas_utils.py +13 -0
- latch_eval_tools/graders/__init__.py +40 -0
- latch_eval_tools/graders/base.py +29 -0
- latch_eval_tools/graders/distribution.py +102 -0
- latch_eval_tools/graders/label_set.py +75 -0
- latch_eval_tools/graders/marker_gene.py +317 -0
- latch_eval_tools/graders/multiple_choice.py +38 -0
- latch_eval_tools/graders/numeric.py +137 -0
- latch_eval_tools/graders/spatial.py +93 -0
- latch_eval_tools/harness/__init__.py +27 -0
- latch_eval_tools/harness/claudecode.py +212 -0
- latch_eval_tools/harness/minisweagent.py +265 -0
- latch_eval_tools/harness/plotsagent.py +156 -0
- latch_eval_tools/harness/runner.py +191 -0
- latch_eval_tools/harness/utils.py +191 -0
- latch_eval_tools/headless_eval_server.py +727 -0
- latch_eval_tools/linter/__init__.py +25 -0
- latch_eval_tools/linter/explanations.py +331 -0
- latch_eval_tools/linter/runner.py +146 -0
- latch_eval_tools/linter/schema.py +126 -0
- latch_eval_tools/linter/validators.py +595 -0
- latch_eval_tools/types.py +30 -0
- latch_eval_tools/wrapper_entrypoint.py +316 -0
- latch_eval_tools-0.1.0.dist-info/METADATA +118 -0
- latch_eval_tools-0.1.0.dist-info/RECORD +33 -0
- latch_eval_tools-0.1.0.dist-info/WHEEL +4 -0
- latch_eval_tools-0.1.0.dist-info/entry_points.txt +2 -0
- latch_eval_tools-0.1.0.dist-info/licenses/LICENSE +1 -0
latch_eval_tools/linter/validators.py
@@ -0,0 +1,595 @@
import re

from .schema import (
    VALID_TASKS,
    VALID_KITS,
    VALID_TIME_HORIZONS,
    VALID_EVAL_TYPES,
    VALID_GRADER_TYPES,
    VALID_TOLERANCE_TYPES,
    GRADER_CONFIGS,
    DATA_NODE_PATTERN,
    ALLOWED_TOP_LEVEL_FIELDS,
    ALLOWED_METADATA_FIELDS,
    ALLOWED_GRADER_FIELDS,
    MULTIPLE_CHOICE_PLACEHOLDER,
    LintIssue,
)


def validate_required_fields(data: dict) -> list[LintIssue]:
    issues = []

    if "id" not in data:
        issues.append(LintIssue("error", "E001", "Missing required field: id"))
    elif not isinstance(data["id"], str) or not data["id"].strip():
        issues.append(LintIssue("error", "E002", "Field 'id' must be a non-empty string"))

    if "task" not in data:
        issues.append(LintIssue("error", "E003", "Missing required field: task"))
    elif not isinstance(data["task"], str) or not data["task"].strip():
        issues.append(LintIssue("error", "E004", "Field 'task' must be a non-empty string"))

    if "metadata" not in data:
        issues.append(LintIssue("error", "E005", "Missing required field: metadata"))
    elif not isinstance(data["metadata"], dict):
        issues.append(LintIssue("error", "E006", "Field 'metadata' must be an object"))

    return issues


def validate_metadata(data: dict) -> list[LintIssue]:
    issues = []
    metadata = data.get("metadata")

    if not isinstance(metadata, dict):
        return issues

    if "task" not in metadata:
        issues.append(LintIssue("error", "E010", "Missing required field: metadata.task"))
    elif metadata["task"] not in VALID_TASKS:
        issues.append(LintIssue(
            "error", "E011",
            f"Invalid metadata.task: '{metadata['task']}'. Must be one of: {VALID_TASKS}"
        ))

    if "kit" not in metadata:
        issues.append(LintIssue("error", "E012", "Missing required field: metadata.kit"))
    elif metadata["kit"] not in VALID_KITS:
        issues.append(LintIssue(
            "error", "E013",
            f"Invalid metadata.kit: '{metadata['kit']}'. Must be one of: {VALID_KITS}"
        ))

    if "time_horizon" not in metadata:
        issues.append(LintIssue("error", "E014", "Missing required field: metadata.time_horizon"))
    elif metadata["time_horizon"] not in VALID_TIME_HORIZONS:
        issues.append(LintIssue(
            "error", "E015",
            f"Invalid metadata.time_horizon: '{metadata['time_horizon']}'. Must be one of: {VALID_TIME_HORIZONS}"
        ))

    if "eval_type" not in metadata:
        issues.append(LintIssue(
            "warning", "W001",
            f"Missing metadata.eval_type. Consider adding one of: {VALID_EVAL_TYPES}"
        ))
    elif metadata["eval_type"] not in VALID_EVAL_TYPES:
        issues.append(LintIssue(
            "error", "E016",
            f"Invalid metadata.eval_type: '{metadata['eval_type']}'. Must be one of: {VALID_EVAL_TYPES}"
        ))

    return issues


def validate_data_node(data: dict) -> list[LintIssue]:
    issues = []
    data_node = data.get("data_node")

    if data_node is None:
        return issues

    def check_node(node: str, location: str) -> list[LintIssue]:
        if not isinstance(node, str):
            return [LintIssue("error", "E020", f"data_node must be string, got {type(node).__name__}", location)]
        if not DATA_NODE_PATTERN.match(node):
            return [LintIssue(
                "error", "E021",
                f"Invalid data_node format: '{node}'. Expected: latch://<id>.(account|node)/<path>",
                location
            )]
        return []

    if isinstance(data_node, str):
        issues.extend(check_node(data_node, "data_node"))
    elif isinstance(data_node, list):
        for i, node in enumerate(data_node):
            issues.extend(check_node(node, f"data_node[{i}]"))
    else:
        issues.append(LintIssue(
            "error", "E022",
            f"data_node must be string or list, got {type(data_node).__name__}"
        ))

    return issues


def validate_task_answer_format(data: dict) -> list[LintIssue]:
    issues = []
    task = data.get("task", "")
    grader_type = data.get("grader", {}).get("type")

    if "<EVAL_ANSWER>" not in task:
        issues.append(LintIssue(
            "warning", "W010",
            "Task description does not contain <EVAL_ANSWER> format specification"
        ))
    elif "</EVAL_ANSWER>" not in task:
        issues.append(LintIssue(
            "warning", "W011",
            "Task description has <EVAL_ANSWER> but missing closing </EVAL_ANSWER> tag"
        ))
    else:
        task_lower = task.lower()
        has_return_exactly = "return exactly" in task_lower or "respond exactly" in task_lower
        if not has_return_exactly:
            issues.append(LintIssue(
                "warning", "W012",
                "Task has <EVAL_ANSWER> but missing 'Return EXACTLY:' instruction before it"
            ))

    if grader_type == "multiple_choice":
        answer_pattern = re.search(r'"answer"\s*:\s*"([^"]*)"', task)
        if answer_pattern:
            placeholder = answer_pattern.group(1)
            if placeholder != MULTIPLE_CHOICE_PLACEHOLDER:
                issues.append(LintIssue(
                    "warning", "W013",
                    f"Multiple choice answer placeholder should be '{MULTIPLE_CHOICE_PLACEHOLDER}', "
                    f"found '{placeholder}'",
                    "task"
                ))

    return issues


def validate_grader(data: dict) -> list[LintIssue]:
    issues = []
    grader = data.get("grader")

    if grader is None:
        return issues

    if not isinstance(grader, dict):
        issues.append(LintIssue("error", "E030", f"grader must be object, got {type(grader).__name__}"))
        return issues

    grader_type = grader.get("type")
    if grader_type is None:
        issues.append(LintIssue("error", "E031", "Missing required field: grader.type"))
        return issues

    if grader_type not in VALID_GRADER_TYPES:
        issues.append(LintIssue(
            "error", "E032",
            f"Invalid grader.type: '{grader_type}'. Must be one of: {VALID_GRADER_TYPES}"
        ))
        return issues

    config = grader.get("config")
    if config is None:
        issues.append(LintIssue("error", "E033", "Missing required field: grader.config"))
        return issues

    if not isinstance(config, dict):
        issues.append(LintIssue("error", "E034", f"grader.config must be object, got {type(config).__name__}"))
        return issues

    grader_spec = GRADER_CONFIGS.get(grader_type, {})

    for req_field in grader_spec.get("required", []):
        if req_field not in config:
            if grader_type == "marker_gene_precision_recall" and req_field == "answer_field":
                issues.append(LintIssue(
                    "error", "E037",
                    f"Missing 'answer_field' - specify which JSON field contains the gene list",
                    f"grader.config.{req_field}"
                ))
            else:
                issues.append(LintIssue(
                    "error", "E035",
                    f"Missing required config field for {grader_type}: {req_field}",
                    f"grader.config.{req_field}"
                ))

    for req_any_group in grader_spec.get("required_any", []):
        if not any(f in config for f in req_any_group):
            issues.append(LintIssue(
                "error", "E036",
                f"Missing required config field for {grader_type}: one of {req_any_group}",
                "grader.config"
            ))

    issues.extend(_validate_tolerances(config))
    issues.extend(_validate_unrecognized_config_fields(grader_type, config))
    issues.extend(_validate_config_types(grader_type, config))
    issues.extend(_validate_config_semantics(grader_type, config))
    issues.extend(_validate_config_edge_cases(grader_type, config))

    return issues


def _validate_unrecognized_config_fields(grader_type: str, config: dict) -> list[LintIssue]:
    issues = []
    grader_spec = GRADER_CONFIGS.get(grader_type, {})
    recognized = grader_spec.get("recognized", set())

    if not recognized:
        return issues

    for field in config.keys():
        if field not in recognized:
            issues.append(LintIssue(
                "warning", "W030",
                f"Config field '{field}' is not recognized by {grader_type} grader and will be ignored",
                f"grader.config.{field}"
            ))

    return issues


def _validate_config_types(grader_type: str, config: dict) -> list[LintIssue]:
    issues = []

    if grader_type in ("numeric_tolerance", "distribution_comparison"):
        ground_truth = config.get("ground_truth")
        if ground_truth is not None and not isinstance(ground_truth, dict):
            issues.append(LintIssue(
                "error", "E060",
                f"ground_truth must be object, got {type(ground_truth).__name__}",
                "grader.config.ground_truth"
            ))

    if grader_type in ("label_set_jaccard", "jaccard_label_set", "marker_gene_precision_recall"):
        ground_truth_labels = config.get("ground_truth_labels")
        if ground_truth_labels is not None and not isinstance(ground_truth_labels, list):
            issues.append(LintIssue(
                "error", "E062",
                f"ground_truth_labels must be list, got {type(ground_truth_labels).__name__}",
                "grader.config.ground_truth_labels"
            ))

    if grader_type in ("label_set_jaccard", "jaccard_label_set", "spatial_adjacency",
                       "marker_gene_separation", "marker_gene_precision_recall"):
        scoring = config.get("scoring")
        if scoring is not None and not isinstance(scoring, dict):
            issues.append(LintIssue(
                "error", "E065",
                f"scoring must be object, got {type(scoring).__name__}",
                "grader.config.scoring"
            ))

    return issues


def _validate_config_semantics(grader_type: str, config: dict) -> list[LintIssue]:
    issues = []

    if grader_type == "numeric_tolerance":
        ground_truth = config.get("ground_truth", {})
        tolerances = config.get("tolerances", {})
        if isinstance(ground_truth, dict) and isinstance(tolerances, dict):
            for field_name in ground_truth.keys():
                if field_name not in tolerances:
                    issues.append(LintIssue(
                        "warning", "W070",
                        f"ground_truth field '{field_name}' has no tolerance specified (defaults to 0)",
                        f"grader.config.ground_truth.{field_name}"
                    ))

    issues.extend(_validate_tolerance_values(config))
    issues.extend(_validate_threshold_ranges(grader_type, config))

    return issues


def _validate_tolerance_values(config: dict) -> list[LintIssue]:
    issues = []
    tolerances = config.get("tolerances", {})

    if not isinstance(tolerances, dict):
        return issues

    for field_name, tol_config in tolerances.items():
        if not isinstance(tol_config, dict):
            continue

        value = tol_config.get("value")
        if isinstance(value, (int, float)) and value < 0:
            issues.append(LintIssue(
                "error", "E080",
                f"Tolerance value must be non-negative, got {value}",
                f"grader.config.tolerances.{field_name}.value"
            ))

        lower = tol_config.get("lower")
        if isinstance(lower, (int, float)) and lower < 0:
            issues.append(LintIssue(
                "error", "E080",
                f"Tolerance lower bound must be non-negative, got {lower}",
                f"grader.config.tolerances.{field_name}.lower"
            ))

        upper = tol_config.get("upper")
        if isinstance(upper, (int, float)) and upper < 0:
            issues.append(LintIssue(
                "error", "E080",
                f"Tolerance upper bound must be non-negative, got {upper}",
                f"grader.config.tolerances.{field_name}.upper"
            ))

    return issues


def _validate_threshold_ranges(grader_type: str, config: dict) -> list[LintIssue]:
    issues = []
    scoring = config.get("scoring", {})

    if not isinstance(scoring, dict):
        return issues

    if grader_type in ("label_set_jaccard", "jaccard_label_set"):
        pass_threshold = scoring.get("pass_threshold")
        if isinstance(pass_threshold, (int, float)):
            if pass_threshold < 0 or pass_threshold > 1:
                issues.append(LintIssue(
                    "error", "E081",
                    f"Jaccard pass_threshold must be in [0, 1], got {pass_threshold}",
                    "grader.config.scoring.pass_threshold"
                ))

    if grader_type == "marker_gene_precision_recall":
        pass_thresholds = scoring.get("pass_thresholds", {})
        if isinstance(pass_thresholds, dict):
            for key in ("precision_at_k", "recall_at_k"):
                val = pass_thresholds.get(key)
                if isinstance(val, (int, float)) and (val < 0 or val > 1):
                    issues.append(LintIssue(
                        "error", "E082",
                        f"Precision/recall threshold must be in [0, 1], got {val}",
                        f"grader.config.scoring.pass_thresholds.{key}"
                    ))

    return issues


def _validate_config_edge_cases(grader_type: str, config: dict) -> list[LintIssue]:
    issues = []

    if grader_type == "numeric_tolerance":
        has_tolerance = "tolerance" in config
        has_tolerances = "tolerances" in config
        if has_tolerance and has_tolerances:
            issues.append(LintIssue(
                "warning", "W085",
                "Both 'tolerance' and 'tolerances' present; 'tolerances' will be used",
                "grader.config"
            ))

    if grader_type == "marker_gene_precision_recall":
        has_canonical = "canonical_markers" in config
        has_ground_truth_labels = "ground_truth_labels" in config
        if not has_canonical and has_ground_truth_labels:
            issues.append(LintIssue(
                "warning", "W086",
                "Using 'ground_truth_labels' as fallback for 'canonical_markers'",
                "grader.config"
            ))

    if grader_type == "distribution_comparison":
        ground_truth = config.get("ground_truth", {})
        if isinstance(ground_truth, dict):
            distribution = ground_truth.get("cell_type_distribution", ground_truth)
            if isinstance(distribution, dict):
                percentages = [v for v in distribution.values() if isinstance(v, (int, float))]
                if percentages:
                    total = sum(percentages)
                    if abs(total - 100) > 5:
                        issues.append(LintIssue(
                            "warning", "W080",
                            f"Distribution percentages sum to {total}, expected ~100%",
                            "grader.config.ground_truth"
                        ))

    return issues


def _validate_tolerances(config: dict) -> list[LintIssue]:
    issues = []
    tolerances = config.get("tolerances")

    if tolerances is None:
        return issues

    if not isinstance(tolerances, dict):
        issues.append(LintIssue(
            "error", "E040",
            f"tolerances must be object, got {type(tolerances).__name__}",
            "grader.config.tolerances"
        ))
        return issues

    for field_name, tol_config in tolerances.items():
        if not isinstance(tol_config, dict):
            issues.append(LintIssue(
                "error", "E041",
                f"tolerance config must be object, got {type(tol_config).__name__}",
                f"grader.config.tolerances.{field_name}"
            ))
            continue

        tol_type = tol_config.get("type")
        if tol_type is None:
            issues.append(LintIssue(
                "error", "E042",
                "Missing tolerance type",
                f"grader.config.tolerances.{field_name}.type"
            ))
        elif tol_type not in VALID_TOLERANCE_TYPES:
            issues.append(LintIssue(
                "error", "E043",
                f"Invalid tolerance type: '{tol_type}'. Must be one of: {VALID_TOLERANCE_TYPES}",
                f"grader.config.tolerances.{field_name}.type"
            ))

        has_value = "value" in tol_config
        has_lower = "lower" in tol_config
        has_upper = "upper" in tol_config

        if not has_value and not has_lower and not has_upper:
            issues.append(LintIssue(
                "error", "E044",
                "Missing tolerance: need 'value' or 'lower'/'upper' for asymmetric",
                f"grader.config.tolerances.{field_name}"
            ))
        elif has_value:
            tol_value = tol_config["value"]
            if not isinstance(tol_value, (int, float)):
                issues.append(LintIssue(
                    "error", "E045",
                    f"Tolerance value must be numeric, got {type(tol_value).__name__}",
                    f"grader.config.tolerances.{field_name}.value"
                ))
        if has_lower and not isinstance(tol_config["lower"], (int, float)):
            issues.append(LintIssue(
                "error", "E046",
                f"Tolerance lower must be numeric, got {type(tol_config['lower']).__name__}",
                f"grader.config.tolerances.{field_name}.lower"
            ))
        if has_upper and not isinstance(tol_config["upper"], (int, float)):
            issues.append(LintIssue(
                "error", "E047",
                f"Tolerance upper must be numeric, got {type(tol_config['upper']).__name__}",
                f"grader.config.tolerances.{field_name}.upper"
            ))

    return issues


def validate_answer_fields_match(data: dict) -> list[LintIssue]:
    issues = []
    task = data.get("task", "")
    grader = data.get("grader", {})
    grader_type = grader.get("type")
    config = grader.get("config", {})

    if not grader_type or grader_type not in GRADER_CONFIGS:
        return issues

    grader_spec = GRADER_CONFIGS.get(grader_type, {})
    expected_fields = _get_expected_answer_fields(grader_spec, config)

    if not expected_fields:
        return issues

    task_fields = _extract_answer_fields_from_task(task)

    if not task_fields:
        return issues

    missing_in_task = set(expected_fields) - task_fields
    extra_in_task = task_fields - set(expected_fields)

    optional_fields = set(grader_spec.get("answer_fields_optional", []))
    missing_in_task -= optional_fields

    for field in missing_in_task:
        issues.append(LintIssue(
            "error", "E050",
            f"Grader expects answer field '{field}' but task <EVAL_ANSWER> does not include it",
            "task"
        ))

    for field in extra_in_task:
        issues.append(LintIssue(
            "warning", "W031",
            f"Task <EVAL_ANSWER> has field '{field}' not expected by {grader_type} grader",
            "task"
        ))

    return issues


def _get_expected_answer_fields(grader_spec: dict, config: dict) -> list[str]:
    if "answer_fields" in grader_spec:
        return grader_spec["answer_fields"]

    if "answer_fields_from" in grader_spec:
        source_field = grader_spec["answer_fields_from"]
        source_data = config.get(source_field, {})
        if isinstance(source_data, dict):
            return list(source_data.keys())

    if "answer_field_from_config" in grader_spec:
        config_key = grader_spec["answer_field_from_config"]
        default = grader_spec.get("answer_field_default", "value")
        field_name = config.get(config_key, default)
        return [field_name]

    return []


def _extract_answer_fields_from_task(task: str) -> set[str]:
    match = re.search(r"<EVAL_ANSWER>\s*(\{[^}]+\})\s*</EVAL_ANSWER>", task, re.DOTALL)
    if not match:
        return set()

    json_template = match.group(1)
    field_matches = re.findall(r'"([^"]+)"\s*:', json_template)
    return set(field_matches)


def validate_unknown_fields(data: dict) -> list[LintIssue]:
    issues = []

    for field in data.keys():
        if field not in ALLOWED_TOP_LEVEL_FIELDS:
            issues.append(LintIssue(
                "warning", "W020",
                f"Unknown top-level field: '{field}'",
                field
            ))

    metadata = data.get("metadata")
    if isinstance(metadata, dict):
        for field in metadata.keys():
            if field not in ALLOWED_METADATA_FIELDS:
                issues.append(LintIssue(
                    "warning", "W021",
                    f"Unknown metadata field: '{field}'",
                    f"metadata.{field}"
                ))

    grader = data.get("grader")
    if isinstance(grader, dict):
        for field in grader.keys():
            if field not in ALLOWED_GRADER_FIELDS:
                issues.append(LintIssue(
                    "warning", "W022",
                    f"Unknown grader field: '{field}'",
                    f"grader.{field}"
                ))

    return issues


ALL_VALIDATORS = [
    validate_required_fields,
    validate_metadata,
    validate_data_node,
    validate_task_answer_format,
    validate_grader,
    validate_answer_fields_match,
    validate_unknown_fields,
]
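Each validator takes the raw eval-definition dict and returns a list of LintIssue records, so a driver only needs to concatenate the outputs of ALL_VALIDATORS. The package's actual entry point is the linter runner and CLI (latch_eval_tools/linter/runner.py and latch_eval_tools/cli/eval_lint.py, not shown in this diff); the sketch below is illustrative only, and the file name and driver function are hypothetical.

import json

from latch_eval_tools.linter.validators import ALL_VALIDATORS


def lint_eval_definition(path: str) -> list:
    # Load the eval definition: a JSON object with id, task, metadata,
    # data_node, and grader fields, as checked by the validators above.
    with open(path) as f:
        data = json.load(f)

    # Each validator takes the dict and returns a list of LintIssue objects.
    issues = []
    for validator in ALL_VALIDATORS:
        issues.extend(validator(data))
    return issues


if __name__ == "__main__":
    # "example_eval.json" is a hypothetical input file.
    for issue in lint_eval_definition("example_eval.json"):
        print(issue)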
latch_eval_tools/types.py
@@ -0,0 +1,30 @@
from pydantic import BaseModel, Field


class Eval(BaseModel):
    id: str
    task: str
    data_node: str | list[str] | None = None
    grader: dict | None = None
    timeout: int | None = None
    download_timeout: int | None = None
    agent_timeout: int | None = None
    notes: str | None = None


# Backward compatibility alias for scbench/spatialbench
TestCase = Eval


class EvalResult(BaseModel):
    eval_id: str
    conversation_history: list[dict] = Field(default_factory=list)
    trajectory: list[dict] = Field(default_factory=list)
    notebook_state: dict = Field(default_factory=dict)
    duration_ms: float = 0.0
    grader_result: dict | None = None
    agent_answer: dict | None = None


# Backward compatibility alias for scbench/spatialbench
TestResult = EvalResult