latch-eval-tools 0.1.0 (latch_eval_tools-0.1.0-py3-none-any.whl)

This diff shows the content of a publicly available package version as released to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
Files changed (33)
  1. latch_eval_tools/__init__.py +64 -0
  2. latch_eval_tools/answer_extraction.py +35 -0
  3. latch_eval_tools/cli/__init__.py +0 -0
  4. latch_eval_tools/cli/eval_lint.py +185 -0
  5. latch_eval_tools/eval_server.py +570 -0
  6. latch_eval_tools/faas_utils.py +13 -0
  7. latch_eval_tools/graders/__init__.py +40 -0
  8. latch_eval_tools/graders/base.py +29 -0
  9. latch_eval_tools/graders/distribution.py +102 -0
  10. latch_eval_tools/graders/label_set.py +75 -0
  11. latch_eval_tools/graders/marker_gene.py +317 -0
  12. latch_eval_tools/graders/multiple_choice.py +38 -0
  13. latch_eval_tools/graders/numeric.py +137 -0
  14. latch_eval_tools/graders/spatial.py +93 -0
  15. latch_eval_tools/harness/__init__.py +27 -0
  16. latch_eval_tools/harness/claudecode.py +212 -0
  17. latch_eval_tools/harness/minisweagent.py +265 -0
  18. latch_eval_tools/harness/plotsagent.py +156 -0
  19. latch_eval_tools/harness/runner.py +191 -0
  20. latch_eval_tools/harness/utils.py +191 -0
  21. latch_eval_tools/headless_eval_server.py +727 -0
  22. latch_eval_tools/linter/__init__.py +25 -0
  23. latch_eval_tools/linter/explanations.py +331 -0
  24. latch_eval_tools/linter/runner.py +146 -0
  25. latch_eval_tools/linter/schema.py +126 -0
  26. latch_eval_tools/linter/validators.py +595 -0
  27. latch_eval_tools/types.py +30 -0
  28. latch_eval_tools/wrapper_entrypoint.py +316 -0
  29. latch_eval_tools-0.1.0.dist-info/METADATA +118 -0
  30. latch_eval_tools-0.1.0.dist-info/RECORD +33 -0
  31. latch_eval_tools-0.1.0.dist-info/WHEEL +4 -0
  32. latch_eval_tools-0.1.0.dist-info/entry_points.txt +2 -0
  33. latch_eval_tools-0.1.0.dist-info/licenses/LICENSE +1 -0
latch_eval_tools/linter/validators.py
@@ -0,0 +1,595 @@
+import re
+
+from .schema import (
+    VALID_TASKS,
+    VALID_KITS,
+    VALID_TIME_HORIZONS,
+    VALID_EVAL_TYPES,
+    VALID_GRADER_TYPES,
+    VALID_TOLERANCE_TYPES,
+    GRADER_CONFIGS,
+    DATA_NODE_PATTERN,
+    ALLOWED_TOP_LEVEL_FIELDS,
+    ALLOWED_METADATA_FIELDS,
+    ALLOWED_GRADER_FIELDS,
+    MULTIPLE_CHOICE_PLACEHOLDER,
+    LintIssue,
+)
+
+
+def validate_required_fields(data: dict) -> list[LintIssue]:
+    issues = []
+
+    if "id" not in data:
+        issues.append(LintIssue("error", "E001", "Missing required field: id"))
+    elif not isinstance(data["id"], str) or not data["id"].strip():
+        issues.append(LintIssue("error", "E002", "Field 'id' must be a non-empty string"))
+
+    if "task" not in data:
+        issues.append(LintIssue("error", "E003", "Missing required field: task"))
+    elif not isinstance(data["task"], str) or not data["task"].strip():
+        issues.append(LintIssue("error", "E004", "Field 'task' must be a non-empty string"))
+
+    if "metadata" not in data:
+        issues.append(LintIssue("error", "E005", "Missing required field: metadata"))
+    elif not isinstance(data["metadata"], dict):
+        issues.append(LintIssue("error", "E006", "Field 'metadata' must be an object"))
+
+    return issues
+
+
+def validate_metadata(data: dict) -> list[LintIssue]:
+    issues = []
+    metadata = data.get("metadata")
+
+    if not isinstance(metadata, dict):
+        return issues
+
+    if "task" not in metadata:
+        issues.append(LintIssue("error", "E010", "Missing required field: metadata.task"))
+    elif metadata["task"] not in VALID_TASKS:
+        issues.append(LintIssue(
+            "error", "E011",
+            f"Invalid metadata.task: '{metadata['task']}'. Must be one of: {VALID_TASKS}"
+        ))
+
+    if "kit" not in metadata:
+        issues.append(LintIssue("error", "E012", "Missing required field: metadata.kit"))
+    elif metadata["kit"] not in VALID_KITS:
+        issues.append(LintIssue(
+            "error", "E013",
+            f"Invalid metadata.kit: '{metadata['kit']}'. Must be one of: {VALID_KITS}"
+        ))
+
+    if "time_horizon" not in metadata:
+        issues.append(LintIssue("error", "E014", "Missing required field: metadata.time_horizon"))
+    elif metadata["time_horizon"] not in VALID_TIME_HORIZONS:
+        issues.append(LintIssue(
+            "error", "E015",
+            f"Invalid metadata.time_horizon: '{metadata['time_horizon']}'. Must be one of: {VALID_TIME_HORIZONS}"
+        ))
+
+    if "eval_type" not in metadata:
+        issues.append(LintIssue(
+            "warning", "W001",
+            f"Missing metadata.eval_type. Consider adding one of: {VALID_EVAL_TYPES}"
+        ))
+    elif metadata["eval_type"] not in VALID_EVAL_TYPES:
+        issues.append(LintIssue(
+            "error", "E016",
+            f"Invalid metadata.eval_type: '{metadata['eval_type']}'. Must be one of: {VALID_EVAL_TYPES}"
+        ))
+
+    return issues
+
+
+def validate_data_node(data: dict) -> list[LintIssue]:
+    issues = []
+    data_node = data.get("data_node")
+
+    if data_node is None:
+        return issues
+
+    def check_node(node: str, location: str) -> list[LintIssue]:
+        if not isinstance(node, str):
+            return [LintIssue("error", "E020", f"data_node must be string, got {type(node).__name__}", location)]
+        if not DATA_NODE_PATTERN.match(node):
+            return [LintIssue(
+                "error", "E021",
+                f"Invalid data_node format: '{node}'. Expected: latch://<id>.(account|node)/<path>",
+                location
+            )]
+        return []
+
+    if isinstance(data_node, str):
+        issues.extend(check_node(data_node, "data_node"))
+    elif isinstance(data_node, list):
+        for i, node in enumerate(data_node):
+            issues.extend(check_node(node, f"data_node[{i}]"))
+    else:
+        issues.append(LintIssue(
+            "error", "E022",
+            f"data_node must be string or list, got {type(data_node).__name__}"
+        ))
+
+    return issues
+
+
+def validate_task_answer_format(data: dict) -> list[LintIssue]:
+    issues = []
+    task = data.get("task", "")
+    grader_type = data.get("grader", {}).get("type")
+
+    if "<EVAL_ANSWER>" not in task:
+        issues.append(LintIssue(
+            "warning", "W010",
+            "Task description does not contain <EVAL_ANSWER> format specification"
+        ))
+    elif "</EVAL_ANSWER>" not in task:
+        issues.append(LintIssue(
+            "warning", "W011",
+            "Task description has <EVAL_ANSWER> but missing closing </EVAL_ANSWER> tag"
+        ))
+    else:
+        task_lower = task.lower()
+        has_return_exactly = "return exactly" in task_lower or "respond exactly" in task_lower
+        if not has_return_exactly:
+            issues.append(LintIssue(
+                "warning", "W012",
+                "Task has <EVAL_ANSWER> but missing 'Return EXACTLY:' instruction before it"
+            ))
+
+    if grader_type == "multiple_choice":
+        answer_pattern = re.search(r'"answer"\s*:\s*"([^"]*)"', task)
+        if answer_pattern:
+            placeholder = answer_pattern.group(1)
+            if placeholder != MULTIPLE_CHOICE_PLACEHOLDER:
+                issues.append(LintIssue(
+                    "warning", "W013",
+                    f"Multiple choice answer placeholder should be '{MULTIPLE_CHOICE_PLACEHOLDER}', "
+                    f"found '{placeholder}'",
+                    "task"
+                ))
+
+    return issues
+
+
+def validate_grader(data: dict) -> list[LintIssue]:
+    issues = []
+    grader = data.get("grader")
+
+    if grader is None:
+        return issues
+
+    if not isinstance(grader, dict):
+        issues.append(LintIssue("error", "E030", f"grader must be object, got {type(grader).__name__}"))
+        return issues
+
+    grader_type = grader.get("type")
+    if grader_type is None:
+        issues.append(LintIssue("error", "E031", "Missing required field: grader.type"))
+        return issues
+
+    if grader_type not in VALID_GRADER_TYPES:
+        issues.append(LintIssue(
+            "error", "E032",
+            f"Invalid grader.type: '{grader_type}'. Must be one of: {VALID_GRADER_TYPES}"
+        ))
+        return issues
+
+    config = grader.get("config")
+    if config is None:
+        issues.append(LintIssue("error", "E033", "Missing required field: grader.config"))
+        return issues
+
+    if not isinstance(config, dict):
+        issues.append(LintIssue("error", "E034", f"grader.config must be object, got {type(config).__name__}"))
+        return issues
+
+    grader_spec = GRADER_CONFIGS.get(grader_type, {})
+
+    for req_field in grader_spec.get("required", []):
+        if req_field not in config:
+            if grader_type == "marker_gene_precision_recall" and req_field == "answer_field":
+                issues.append(LintIssue(
+                    "error", "E037",
+                    f"Missing 'answer_field' - specify which JSON field contains the gene list",
+                    f"grader.config.{req_field}"
+                ))
+            else:
+                issues.append(LintIssue(
+                    "error", "E035",
+                    f"Missing required config field for {grader_type}: {req_field}",
+                    f"grader.config.{req_field}"
+                ))
+
+    for req_any_group in grader_spec.get("required_any", []):
+        if not any(f in config for f in req_any_group):
+            issues.append(LintIssue(
+                "error", "E036",
+                f"Missing required config field for {grader_type}: one of {req_any_group}",
+                "grader.config"
+            ))
+
+    issues.extend(_validate_tolerances(config))
+    issues.extend(_validate_unrecognized_config_fields(grader_type, config))
+    issues.extend(_validate_config_types(grader_type, config))
+    issues.extend(_validate_config_semantics(grader_type, config))
+    issues.extend(_validate_config_edge_cases(grader_type, config))
+
+    return issues
+
+
+def _validate_unrecognized_config_fields(grader_type: str, config: dict) -> list[LintIssue]:
+    issues = []
+    grader_spec = GRADER_CONFIGS.get(grader_type, {})
+    recognized = grader_spec.get("recognized", set())
+
+    if not recognized:
+        return issues
+
+    for field in config.keys():
+        if field not in recognized:
+            issues.append(LintIssue(
+                "warning", "W030",
+                f"Config field '{field}' is not recognized by {grader_type} grader and will be ignored",
+                f"grader.config.{field}"
+            ))
+
+    return issues
+
+
+def _validate_config_types(grader_type: str, config: dict) -> list[LintIssue]:
+    issues = []
+
+    if grader_type in ("numeric_tolerance", "distribution_comparison"):
+        ground_truth = config.get("ground_truth")
+        if ground_truth is not None and not isinstance(ground_truth, dict):
+            issues.append(LintIssue(
+                "error", "E060",
+                f"ground_truth must be object, got {type(ground_truth).__name__}",
+                "grader.config.ground_truth"
+            ))
+
+    if grader_type in ("label_set_jaccard", "jaccard_label_set", "marker_gene_precision_recall"):
+        ground_truth_labels = config.get("ground_truth_labels")
+        if ground_truth_labels is not None and not isinstance(ground_truth_labels, list):
+            issues.append(LintIssue(
+                "error", "E062",
+                f"ground_truth_labels must be list, got {type(ground_truth_labels).__name__}",
+                "grader.config.ground_truth_labels"
+            ))
+
+    if grader_type in ("label_set_jaccard", "jaccard_label_set", "spatial_adjacency",
+                       "marker_gene_separation", "marker_gene_precision_recall"):
+        scoring = config.get("scoring")
+        if scoring is not None and not isinstance(scoring, dict):
+            issues.append(LintIssue(
+                "error", "E065",
+                f"scoring must be object, got {type(scoring).__name__}",
+                "grader.config.scoring"
+            ))
+
+    return issues
+
+
+def _validate_config_semantics(grader_type: str, config: dict) -> list[LintIssue]:
+    issues = []
+
+    if grader_type == "numeric_tolerance":
+        ground_truth = config.get("ground_truth", {})
+        tolerances = config.get("tolerances", {})
+        if isinstance(ground_truth, dict) and isinstance(tolerances, dict):
+            for field_name in ground_truth.keys():
+                if field_name not in tolerances:
+                    issues.append(LintIssue(
+                        "warning", "W070",
+                        f"ground_truth field '{field_name}' has no tolerance specified (defaults to 0)",
+                        f"grader.config.ground_truth.{field_name}"
+                    ))
+
+    issues.extend(_validate_tolerance_values(config))
+    issues.extend(_validate_threshold_ranges(grader_type, config))
+
+    return issues
+
+
+def _validate_tolerance_values(config: dict) -> list[LintIssue]:
+    issues = []
+    tolerances = config.get("tolerances", {})
+
+    if not isinstance(tolerances, dict):
+        return issues
+
+    for field_name, tol_config in tolerances.items():
+        if not isinstance(tol_config, dict):
+            continue
+
+        value = tol_config.get("value")
+        if isinstance(value, (int, float)) and value < 0:
+            issues.append(LintIssue(
+                "error", "E080",
+                f"Tolerance value must be non-negative, got {value}",
+                f"grader.config.tolerances.{field_name}.value"
+            ))
+
+        lower = tol_config.get("lower")
+        if isinstance(lower, (int, float)) and lower < 0:
+            issues.append(LintIssue(
+                "error", "E080",
+                f"Tolerance lower bound must be non-negative, got {lower}",
+                f"grader.config.tolerances.{field_name}.lower"
+            ))
+
+        upper = tol_config.get("upper")
+        if isinstance(upper, (int, float)) and upper < 0:
+            issues.append(LintIssue(
+                "error", "E080",
+                f"Tolerance upper bound must be non-negative, got {upper}",
+                f"grader.config.tolerances.{field_name}.upper"
+            ))
+
+    return issues
+
+
+def _validate_threshold_ranges(grader_type: str, config: dict) -> list[LintIssue]:
+    issues = []
+    scoring = config.get("scoring", {})
+
+    if not isinstance(scoring, dict):
+        return issues
+
+    if grader_type in ("label_set_jaccard", "jaccard_label_set"):
+        pass_threshold = scoring.get("pass_threshold")
+        if isinstance(pass_threshold, (int, float)):
+            if pass_threshold < 0 or pass_threshold > 1:
+                issues.append(LintIssue(
+                    "error", "E081",
+                    f"Jaccard pass_threshold must be in [0, 1], got {pass_threshold}",
+                    "grader.config.scoring.pass_threshold"
+                ))
+
+    if grader_type == "marker_gene_precision_recall":
+        pass_thresholds = scoring.get("pass_thresholds", {})
+        if isinstance(pass_thresholds, dict):
+            for key in ("precision_at_k", "recall_at_k"):
+                val = pass_thresholds.get(key)
+                if isinstance(val, (int, float)) and (val < 0 or val > 1):
+                    issues.append(LintIssue(
+                        "error", "E082",
+                        f"Precision/recall threshold must be in [0, 1], got {val}",
+                        f"grader.config.scoring.pass_thresholds.{key}"
+                    ))
+
+    return issues
+
+
+def _validate_config_edge_cases(grader_type: str, config: dict) -> list[LintIssue]:
+    issues = []
+
+    if grader_type == "numeric_tolerance":
+        has_tolerance = "tolerance" in config
+        has_tolerances = "tolerances" in config
+        if has_tolerance and has_tolerances:
+            issues.append(LintIssue(
+                "warning", "W085",
+                "Both 'tolerance' and 'tolerances' present; 'tolerances' will be used",
+                "grader.config"
+            ))

+    if grader_type == "marker_gene_precision_recall":
+        has_canonical = "canonical_markers" in config
+        has_ground_truth_labels = "ground_truth_labels" in config
+        if not has_canonical and has_ground_truth_labels:
+            issues.append(LintIssue(
+                "warning", "W086",
+                "Using 'ground_truth_labels' as fallback for 'canonical_markers'",
+                "grader.config"
+            ))
+
+    if grader_type == "distribution_comparison":
+        ground_truth = config.get("ground_truth", {})
+        if isinstance(ground_truth, dict):
+            distribution = ground_truth.get("cell_type_distribution", ground_truth)
+            if isinstance(distribution, dict):
+                percentages = [v for v in distribution.values() if isinstance(v, (int, float))]
+                if percentages:
+                    total = sum(percentages)
+                    if abs(total - 100) > 5:
+                        issues.append(LintIssue(
+                            "warning", "W080",
+                            f"Distribution percentages sum to {total}, expected ~100%",
+                            "grader.config.ground_truth"
+                        ))
+
+    return issues
+
+
+def _validate_tolerances(config: dict) -> list[LintIssue]:
+    issues = []
+    tolerances = config.get("tolerances")
+
+    if tolerances is None:
+        return issues
+
+    if not isinstance(tolerances, dict):
+        issues.append(LintIssue(
+            "error", "E040",
+            f"tolerances must be object, got {type(tolerances).__name__}",
+            "grader.config.tolerances"
+        ))
+        return issues
+
+    for field_name, tol_config in tolerances.items():
+        if not isinstance(tol_config, dict):
+            issues.append(LintIssue(
+                "error", "E041",
+                f"tolerance config must be object, got {type(tol_config).__name__}",
+                f"grader.config.tolerances.{field_name}"
+            ))
+            continue
+
+        tol_type = tol_config.get("type")
+        if tol_type is None:
+            issues.append(LintIssue(
+                "error", "E042",
+                "Missing tolerance type",
+                f"grader.config.tolerances.{field_name}.type"
+            ))
+        elif tol_type not in VALID_TOLERANCE_TYPES:
+            issues.append(LintIssue(
+                "error", "E043",
+                f"Invalid tolerance type: '{tol_type}'. Must be one of: {VALID_TOLERANCE_TYPES}",
+                f"grader.config.tolerances.{field_name}.type"
+            ))
+
+        has_value = "value" in tol_config
+        has_lower = "lower" in tol_config
+        has_upper = "upper" in tol_config
+
+        if not has_value and not has_lower and not has_upper:
+            issues.append(LintIssue(
+                "error", "E044",
+                "Missing tolerance: need 'value' or 'lower'/'upper' for asymmetric",
+                f"grader.config.tolerances.{field_name}"
+            ))
+        elif has_value:
+            tol_value = tol_config["value"]
+            if not isinstance(tol_value, (int, float)):
+                issues.append(LintIssue(
+                    "error", "E045",
+                    f"Tolerance value must be numeric, got {type(tol_value).__name__}",
+                    f"grader.config.tolerances.{field_name}.value"
+                ))
+        if has_lower and not isinstance(tol_config["lower"], (int, float)):
+            issues.append(LintIssue(
+                "error", "E046",
+                f"Tolerance lower must be numeric, got {type(tol_config['lower']).__name__}",
+                f"grader.config.tolerances.{field_name}.lower"
+            ))
+        if has_upper and not isinstance(tol_config["upper"], (int, float)):
+            issues.append(LintIssue(
+                "error", "E047",
+                f"Tolerance upper must be numeric, got {type(tol_config['upper']).__name__}",
+                f"grader.config.tolerances.{field_name}.upper"
+            ))
+
+    return issues
+
+
+def validate_answer_fields_match(data: dict) -> list[LintIssue]:
+    issues = []
+    task = data.get("task", "")
+    grader = data.get("grader", {})
+    grader_type = grader.get("type")
+    config = grader.get("config", {})
+
+    if not grader_type or grader_type not in GRADER_CONFIGS:
+        return issues
+
+    grader_spec = GRADER_CONFIGS.get(grader_type, {})
+    expected_fields = _get_expected_answer_fields(grader_spec, config)
+
+    if not expected_fields:
+        return issues
+
+    task_fields = _extract_answer_fields_from_task(task)
+
+    if not task_fields:
+        return issues
+
+    missing_in_task = set(expected_fields) - task_fields
+    extra_in_task = task_fields - set(expected_fields)
+
+    optional_fields = set(grader_spec.get("answer_fields_optional", []))
+    missing_in_task -= optional_fields
+
+    for field in missing_in_task:
+        issues.append(LintIssue(
+            "error", "E050",
+            f"Grader expects answer field '{field}' but task <EVAL_ANSWER> does not include it",
+            "task"
+        ))
+
+    for field in extra_in_task:
+        issues.append(LintIssue(
+            "warning", "W031",
+            f"Task <EVAL_ANSWER> has field '{field}' not expected by {grader_type} grader",
+            "task"
+        ))
+
+    return issues
+
+
+def _get_expected_answer_fields(grader_spec: dict, config: dict) -> list[str]:
+    if "answer_fields" in grader_spec:
+        return grader_spec["answer_fields"]
+
+    if "answer_fields_from" in grader_spec:
+        source_field = grader_spec["answer_fields_from"]
+        source_data = config.get(source_field, {})
+        if isinstance(source_data, dict):
+            return list(source_data.keys())
+
+    if "answer_field_from_config" in grader_spec:
+        config_key = grader_spec["answer_field_from_config"]
+        default = grader_spec.get("answer_field_default", "value")
+        field_name = config.get(config_key, default)
+        return [field_name]
+
+    return []
+
+
+def _extract_answer_fields_from_task(task: str) -> set[str]:
+    match = re.search(r"<EVAL_ANSWER>\s*(\{[^}]+\})\s*</EVAL_ANSWER>", task, re.DOTALL)
+    if not match:
+        return set()
+
+    json_template = match.group(1)
+    field_matches = re.findall(r'"([^"]+)"\s*:', json_template)
+    return set(field_matches)
+
+
+def validate_unknown_fields(data: dict) -> list[LintIssue]:
+    issues = []
+
+    for field in data.keys():
+        if field not in ALLOWED_TOP_LEVEL_FIELDS:
+            issues.append(LintIssue(
+                "warning", "W020",
+                f"Unknown top-level field: '{field}'",
+                field
+            ))
+
+    metadata = data.get("metadata")
+    if isinstance(metadata, dict):
+        for field in metadata.keys():
+            if field not in ALLOWED_METADATA_FIELDS:
+                issues.append(LintIssue(
+                    "warning", "W021",
+                    f"Unknown metadata field: '{field}'",
+                    f"metadata.{field}"
+                ))
+
+    grader = data.get("grader")
+    if isinstance(grader, dict):
+        for field in grader.keys():
+            if field not in ALLOWED_GRADER_FIELDS:
+                issues.append(LintIssue(
+                    "warning", "W022",
+                    f"Unknown grader field: '{field}'",
+                    f"grader.{field}"
+                ))
+
+    return issues
+
+
+ALL_VALIDATORS = [
+    validate_required_fields,
+    validate_metadata,
+    validate_data_node,
+    validate_task_answer_format,
+    validate_grader,
+    validate_answer_fields_match,
+    validate_unknown_fields,
+]
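For orientation, and not part of the package diff above: a minimal sketch of how the validators collected in ALL_VALIDATORS could be chained over a single eval definition. Every validator takes the raw eval dict and returns a list of LintIssue records, so they compose by concatenation. The payload below is illustrative only; the accepted values for metadata.task, metadata.kit, metadata.time_horizon, grader.type, and tolerance type are defined in latch_eval_tools/linter/schema.py, which this excerpt does not show, so the concrete strings used here are assumptions and may not pass the real schema.

# Illustrative sketch only -- not shipped with latch_eval_tools.
# Field values below are guesses; the authoritative vocabularies live in
# latch_eval_tools/linter/schema.py (not shown in this diff).
from latch_eval_tools.linter.validators import ALL_VALIDATORS

candidate_eval = {
    "id": "example-qc-cell-count",
    "task": (
        "Load the dataset and report how many cells pass QC. Return EXACTLY:\n"
        '<EVAL_ANSWER>{"n_cells": <int>}</EVAL_ANSWER>'
    ),
    "data_node": "latch://123.account/example/sample.h5ad",  # hypothetical node path
    "metadata": {"task": "annotation", "kit": "10x_flex", "time_horizon": "short"},
    "grader": {
        "type": "numeric_tolerance",
        "config": {
            "ground_truth": {"n_cells": 4213},
            "tolerances": {"n_cells": {"type": "absolute", "value": 50}},
        },
    },
}

# Run every validator and collect all issues; an empty list means the eval lints clean.
issues = [issue for validator in ALL_VALIDATORS for issue in validator(candidate_eval)]
for issue in issues:
    print(issue)  # LintIssue fields (severity, code, message, location) come from schema.py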
latch_eval_tools/types.py
@@ -0,0 +1,30 @@
+from pydantic import BaseModel, Field
+
+
+class Eval(BaseModel):
+    id: str
+    task: str
+    data_node: str | list[str] | None = None
+    grader: dict | None = None
+    timeout: int | None = None
+    download_timeout: int | None = None
+    agent_timeout: int | None = None
+    notes: str | None = None
+
+
+# Backward compatibility alias for scbench/spatialbench
+TestCase = Eval
+
+
+class EvalResult(BaseModel):
+    eval_id: str
+    conversation_history: list[dict] = Field(default_factory=list)
+    trajectory: list[dict] = Field(default_factory=list)
+    notebook_state: dict = Field(default_factory=dict)
+    duration_ms: float = 0.0
+    grader_result: dict | None = None
+    agent_answer: dict | None = None
+
+
+# Backward compatibility alias for scbench/spatialbench
+TestResult = EvalResult
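A short usage sketch for the models above, again not part of the package: the field values are made up and the data_node path is hypothetical. TestCase and TestResult remain importable as plain aliases, so older scbench/spatialbench callers that import those names keep working unchanged.

# Illustrative sketch only -- constructs the pydantic models defined in
# latch_eval_tools/types.py with made-up values.
from latch_eval_tools.types import Eval, EvalResult, TestCase

ev = Eval(
    id="example-qc-cell-count",
    task="Load the dataset and report how many cells pass QC.",
    data_node="latch://123.account/example/sample.h5ad",  # hypothetical node path
    timeout=3600,
)
assert isinstance(ev, TestCase)  # TestCase is the same class as Eval

result = EvalResult(
    eval_id=ev.id,
    duration_ms=1523.4,
    agent_answer={"n_cells": 4213},
)
print(result.eval_id, result.grader_result)  # grader_result defaults to None until graded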