latch_eval_tools-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. latch_eval_tools/__init__.py +64 -0
  2. latch_eval_tools/answer_extraction.py +35 -0
  3. latch_eval_tools/cli/__init__.py +0 -0
  4. latch_eval_tools/cli/eval_lint.py +185 -0
  5. latch_eval_tools/eval_server.py +570 -0
  6. latch_eval_tools/faas_utils.py +13 -0
  7. latch_eval_tools/graders/__init__.py +40 -0
  8. latch_eval_tools/graders/base.py +29 -0
  9. latch_eval_tools/graders/distribution.py +102 -0
  10. latch_eval_tools/graders/label_set.py +75 -0
  11. latch_eval_tools/graders/marker_gene.py +317 -0
  12. latch_eval_tools/graders/multiple_choice.py +38 -0
  13. latch_eval_tools/graders/numeric.py +137 -0
  14. latch_eval_tools/graders/spatial.py +93 -0
  15. latch_eval_tools/harness/__init__.py +27 -0
  16. latch_eval_tools/harness/claudecode.py +212 -0
  17. latch_eval_tools/harness/minisweagent.py +265 -0
  18. latch_eval_tools/harness/plotsagent.py +156 -0
  19. latch_eval_tools/harness/runner.py +191 -0
  20. latch_eval_tools/harness/utils.py +191 -0
  21. latch_eval_tools/headless_eval_server.py +727 -0
  22. latch_eval_tools/linter/__init__.py +25 -0
  23. latch_eval_tools/linter/explanations.py +331 -0
  24. latch_eval_tools/linter/runner.py +146 -0
  25. latch_eval_tools/linter/schema.py +126 -0
  26. latch_eval_tools/linter/validators.py +595 -0
  27. latch_eval_tools/types.py +30 -0
  28. latch_eval_tools/wrapper_entrypoint.py +316 -0
  29. latch_eval_tools-0.1.0.dist-info/METADATA +118 -0
  30. latch_eval_tools-0.1.0.dist-info/RECORD +33 -0
  31. latch_eval_tools-0.1.0.dist-info/WHEEL +4 -0
  32. latch_eval_tools-0.1.0.dist-info/entry_points.txt +2 -0
  33. latch_eval_tools-0.1.0.dist-info/licenses/LICENSE +1 -0
latch_eval_tools/linter/__init__.py
@@ -0,0 +1,25 @@
+ from .runner import lint_eval, lint_directory, format_results, LintResult
+ from .schema import (
+     VALID_TASKS,
+     VALID_KITS,
+     VALID_TIME_HORIZONS,
+     VALID_EVAL_TYPES,
+     GRADER_CONFIGS,
+     LintIssue,
+ )
+ from .explanations import get_explanation, ErrorExplanation
+
+ __all__ = [
+     "lint_eval",
+     "lint_directory",
+     "format_results",
+     "LintResult",
+     "LintIssue",
+     "VALID_TASKS",
+     "VALID_KITS",
+     "VALID_TIME_HORIZONS",
+     "VALID_EVAL_TYPES",
+     "GRADER_CONFIGS",
+     "get_explanation",
+     "ErrorExplanation",
+ ]
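Taken together, this `__init__.py` re-exports the lint entry points and schema constants as the package's public surface. A minimal usage sketch under that assumption (the file path is hypothetical; only the names re-exported above are used):

    from latch_eval_tools.linter import lint_eval, format_results

    result = lint_eval("evals/my_eval.json")  # hypothetical path
    print(format_results([result], format="console"))
    print("passed" if result.passed else "failed")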
latch_eval_tools/linter/explanations.py
@@ -0,0 +1,331 @@
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class ErrorExplanation:
+     code: str
+     title: str
+     explanation: str
+     example_before: str
+     example_after: str
+     doc_link: str | None = None
+
+
+ EXPLANATIONS: dict[str, ErrorExplanation] = {
+     "E000": ErrorExplanation(
+         code="E000",
+         title="File not found",
+         explanation="The specified file does not exist at the given path.",
+         example_before="evals/missing_file.json",
+         example_after="evals/my_eval.json # Use correct path",
+         doc_link=None,
+     ),
+     "E001": ErrorExplanation(
+         code="E001",
+         title="Invalid JSON / Missing 'id' field",
+         explanation="The file contains malformed JSON or is missing the required 'id' field. Every eval must have a unique identifier.",
+         example_before='{ "task": "..." }',
+         example_after='{ "id": "my_eval_001", "task": "..." }',
+         doc_link=None,
+     ),
+     "E002": ErrorExplanation(
+         code="E002",
+         title="Invalid root type / Invalid 'id' field",
+         explanation="The root must be a JSON object, or the 'id' field must be a non-empty string.",
+         example_before='{ "id": "" }',
+         example_after='{ "id": "clustering_exp_01" }',
+         doc_link=None,
+     ),
+     "E003": ErrorExplanation(
+         code="E003",
+         title="Missing 'task' field",
+         explanation="Every eval must have a 'task' field containing the prompt/question for the agent.",
+         example_before='{ "id": "eval_01" }',
+         example_after='{ "id": "eval_01", "task": "Perform clustering on the provided dataset..." }',
+         doc_link=None,
+     ),
+     "E004": ErrorExplanation(
+         code="E004",
+         title="Invalid 'task' field",
+         explanation="The 'task' field must be a non-empty string describing what the agent should do.",
+         example_before='{ "task": "" }',
+         example_after='{ "task": "Calculate the number of clusters in the dataset..." }',
+         doc_link=None,
+     ),
+     "E005": ErrorExplanation(
+         code="E005",
+         title="Missing 'metadata' field",
+         explanation="Every eval must have a 'metadata' object containing category, kit, time_horizon, etc.",
+         example_before='{ "id": "eval_01", "task": "..." }',
+         example_after='{ "id": "eval_01", "task": "...", "metadata": { "task": "clustering", "kit": "xenium", "time_horizon": "small" } }',
+         doc_link=None,
+     ),
+     "E006": ErrorExplanation(
+         code="E006",
+         title="Invalid 'metadata' field",
+         explanation="The 'metadata' field must be a JSON object, not a string or array.",
+         example_before='"metadata": "clustering"',
+         example_after='"metadata": { "task": "clustering" }',
+         doc_link=None,
+     ),
+     "E010": ErrorExplanation(
+         code="E010",
+         title="Missing 'metadata.task'",
+         explanation="The metadata must specify a task category (e.g., 'clustering', 'normalization').",
+         example_before='"metadata": { "kit": "xenium" }',
+         example_after='"metadata": { "task": "clustering", "kit": "xenium" }',
+         doc_link=None,
+     ),
+     "E011": ErrorExplanation(
+         code="E011",
+         title="Invalid 'metadata.task'",
+         explanation="The task category must be one of: qc, normalization, dimensionality_reduction, clustering, cell_typing, differential_expression, spatial_analysis.",
+         example_before='"task": "cluster_analysis"',
+         example_after='"task": "clustering"',
+         doc_link=None,
+     ),
+     "E012": ErrorExplanation(
+         code="E012",
+         title="Missing 'metadata.kit'",
+         explanation="The metadata must specify which spatial platform kit was used.",
+         example_before='"metadata": { "task": "clustering" }',
+         example_after='"metadata": { "task": "clustering", "kit": "xenium" }',
+         doc_link=None,
+     ),
+     "E013": ErrorExplanation(
+         code="E013",
+         title="Invalid 'metadata.kit'",
+         explanation="The kit must be one of: xenium, visium, merfish, vizgen, cosmx, seeker, takara, atlasxomics, curio.",
+         example_before='"kit": "10x"',
+         example_after='"kit": "xenium"',
+         doc_link=None,
+     ),
+     "E014": ErrorExplanation(
+         code="E014",
+         title="Missing 'metadata.time_horizon'",
+         explanation="The metadata must specify the expected time horizon for the task.",
+         example_before='"metadata": { "task": "clustering", "kit": "xenium" }',
+         example_after='"metadata": { "task": "clustering", "kit": "xenium", "time_horizon": "small" }',
+         doc_link=None,
+     ),
+     "E015": ErrorExplanation(
+         code="E015",
+         title="Invalid 'metadata.time_horizon'",
+         explanation="The time horizon must be one of: small, medium, large.",
+         example_before='"time_horizon": "quick"',
+         example_after='"time_horizon": "small"',
+         doc_link=None,
+     ),
+     "E016": ErrorExplanation(
+         code="E016",
+         title="Invalid 'metadata.eval_type'",
+         explanation="The eval_type must be one of: scientific, procedural, observational. Note: 'benchmark' is NOT valid.",
+         example_before='"eval_type": "benchmark"',
+         example_after='"eval_type": "observational"',
+         doc_link=None,
+     ),
+     "E020": ErrorExplanation(
+         code="E020",
+         title="Invalid data_node type",
+         explanation="The data_node field must be a string (Latch URI).",
+         example_before='"data_node": 12345',
+         example_after='"data_node": "latch://40248.account/path/to/data"',
+         doc_link=None,
+     ),
+     "E021": ErrorExplanation(
+         code="E021",
+         title="Invalid data_node format",
+         explanation="The data_node must be a valid Latch URI: latch://<id>.(account|node)/<path>",
+         example_before='"data_node": "s3://bucket/data"',
+         example_after='"data_node": "latch://40248.account/spatialbench/data/GSE123"',
+         doc_link=None,
+     ),
+     "E022": ErrorExplanation(
+         code="E022",
+         title="Invalid data_node type",
+         explanation="The data_node must be a string or array of strings, not an object.",
+         example_before='"data_node": { "path": "..." }',
+         example_after='"data_node": "latch://40248.account/path/to/data"',
+         doc_link=None,
+     ),
+     "E030": ErrorExplanation(
+         code="E030",
+         title="Invalid grader type",
+         explanation="The grader field must be a JSON object.",
+         example_before='"grader": "numeric_tolerance"',
+         example_after='"grader": { "type": "numeric_tolerance", "config": { ... } }',
+         doc_link=None,
+     ),
+     "E031": ErrorExplanation(
+         code="E031",
+         title="Missing 'grader.type'",
+         explanation="The grader must specify a type (e.g., 'numeric_tolerance', 'multiple_choice').",
+         example_before='"grader": { "config": { ... } }',
+         example_after='"grader": { "type": "numeric_tolerance", "config": { ... } }',
+         doc_link=None,
+     ),
+     "E032": ErrorExplanation(
+         code="E032",
+         title="Invalid 'grader.type'",
+         explanation="The grader type must be one of: numeric_tolerance, multiple_choice, distribution_comparison, marker_gene_precision_recall, label_set_jaccard, jaccard_label_set, marker_gene_separation, spatial_adjacency.",
+         example_before='"type": "exact_match"',
+         example_after='"type": "numeric_tolerance"',
+         doc_link=None,
+     ),
+     "E033": ErrorExplanation(
+         code="E033",
+         title="Missing 'grader.config'",
+         explanation="The grader must have a config object with grader-specific settings.",
+         example_before='"grader": { "type": "numeric_tolerance" }',
+         example_after='"grader": { "type": "numeric_tolerance", "config": { "ground_truth": { "n_clusters": 5 }, "tolerances": { ... } } }',
+         doc_link=None,
+     ),
+     "E034": ErrorExplanation(
+         code="E034",
+         title="Invalid 'grader.config'",
+         explanation="The grader config must be a JSON object.",
+         example_before='"config": "default"',
+         example_after='"config": { "ground_truth": { ... } }',
+         doc_link=None,
+     ),
+     "E035": ErrorExplanation(
+         code="E035",
+         title="Missing required config field",
+         explanation="The grader config is missing a required field for this grader type.",
+         example_before='"config": { "ground_truth": { "n_clusters": 5 } }',
+         example_after='"config": { "ground_truth": { "n_clusters": 5 }, "tolerances": { "n_clusters": { "type": "absolute", "value": 1 } } }',
+         doc_link=None,
+     ),
+     "E036": ErrorExplanation(
+         code="E036",
+         title="Missing required config field (one of)",
+         explanation="The grader config must have at least one of the specified fields.",
+         example_before='"config": { }',
+         example_after='"config": { "ground_truth_labels": ["A", "B", "C"] }',
+         doc_link=None,
+     ),
+     "E037": ErrorExplanation(
+         code="E037",
+         title="Missing 'answer_field' in marker_gene_precision_recall",
+         explanation="The marker_gene_precision_recall grader requires an 'answer_field' specifying which JSON field in the agent's response contains the gene list.",
+         example_before='"config": { "canonical_markers": ["Epcam"], "scoring": { ... } }',
+         example_after='"config": { "canonical_markers": ["Epcam"], "answer_field": "housekeeping_genes", "scoring": { ... } }',
+         doc_link=None,
+     ),
+     "E040": ErrorExplanation(
+         code="E040",
+         title="Invalid tolerances type",
+         explanation="The tolerances field must be a JSON object mapping field names to tolerance configs.",
+         example_before='"tolerances": 0.1',
+         example_after='"tolerances": { "n_clusters": { "type": "absolute", "value": 1 } }',
+         doc_link=None,
+     ),
+     "E041": ErrorExplanation(
+         code="E041",
+         title="Invalid tolerance config",
+         explanation="Each tolerance config must be a JSON object with 'type' and 'value'.",
+         example_before='"n_clusters": 1',
+         example_after='"n_clusters": { "type": "absolute", "value": 1 }',
+         doc_link=None,
+     ),
+     "E042": ErrorExplanation(
+         code="E042",
+         title="Missing tolerance type",
+         explanation="Each tolerance config must specify a type.",
+         example_before='"n_clusters": { "value": 1 }',
+         example_after='"n_clusters": { "type": "absolute", "value": 1 }',
+         doc_link=None,
+     ),
+     "E043": ErrorExplanation(
+         code="E043",
+         title="Invalid tolerance type",
+         explanation="The tolerance type must be one of: absolute, relative, min, max. Note: 'percentage' is NOT valid.",
+         example_before='"type": "percentage"',
+         example_after='"type": "relative"',
+         doc_link=None,
+     ),
+     "E044": ErrorExplanation(
+         code="E044",
+         title="Missing tolerance value",
+         explanation="Each tolerance config must specify a numeric value.",
+         example_before='"n_clusters": { "type": "absolute" }',
+         example_after='"n_clusters": { "type": "absolute", "value": 1 }',
+         doc_link=None,
+     ),
+     "E045": ErrorExplanation(
+         code="E045",
+         title="Invalid tolerance value",
+         explanation="The tolerance value must be a number (int or float).",
+         example_before='"value": "one"',
+         example_after='"value": 1',
+         doc_link=None,
+     ),
+     "W000": ErrorExplanation(
+         code="W000",
+         title="Non-JSON file extension",
+         explanation="The file does not have a .json extension. While it may still be valid JSON, consider renaming for clarity.",
+         example_before="my_eval.txt",
+         example_after="my_eval.json",
+         doc_link=None,
+     ),
+     "W001": ErrorExplanation(
+         code="W001",
+         title="Missing 'metadata.eval_type'",
+         explanation="Consider adding an eval_type to classify this eval. Valid types: scientific, procedural, observational.",
+         example_before='"metadata": { "task": "clustering" }',
+         example_after='"metadata": { "task": "clustering", "eval_type": "observational" }',
+         doc_link=None,
+     ),
+     "W010": ErrorExplanation(
+         code="W010",
+         title="Missing <EVAL_ANSWER> block",
+         explanation="The task description should include an <EVAL_ANSWER> block to specify the expected output format for the agent.",
+         example_before='"task": "Count the clusters in the dataset."',
+         example_after='"task": "Count the clusters in the dataset.\\n\\n<EVAL_ANSWER>\\n{\\\"n_clusters\\\": <integer>}\\n</EVAL_ANSWER>"',
+         doc_link=None,
+     ),
+     "W011": ErrorExplanation(
+         code="W011",
+         title="Missing </EVAL_ANSWER> closing tag",
+         explanation="The task has an <EVAL_ANSWER> tag but is missing the closing </EVAL_ANSWER> tag.",
+         example_before='"task": "...\\n<EVAL_ANSWER>\\n..."',
+         example_after='"task": "...\\n<EVAL_ANSWER>\\n...\\n</EVAL_ANSWER>"',
+         doc_link=None,
+     ),
+     "W012": ErrorExplanation(
+         code="W012",
+         title="Missing 'Return EXACTLY:' instruction",
+         explanation="Tasks with <EVAL_ANSWER> blocks should include 'Return EXACTLY:' before the block to clearly indicate the agent must output the exact format shown, including the tags.",
+         example_before='"task": "Count clusters.\\n\\n<EVAL_ANSWER>\\n{\\\"n_clusters\\\": <int>}\\n</EVAL_ANSWER>"',
+         example_after='"task": "Count clusters.\\n\\nReturn EXACTLY:\\n\\n<EVAL_ANSWER>\\n{\\\"n_clusters\\\": <int>}\\n</EVAL_ANSWER>"',
+         doc_link=None,
+     ),
+ }
+
+
+ def get_explanation(code: str) -> ErrorExplanation | None:
+     return EXPLANATIONS.get(code)
+
+
+ def format_rich_error(code: str, message: str, location: str = "") -> str:
+     explanation = get_explanation(code)
+     if not explanation:
+         loc_str = f" at {location}" if location else ""
+         return f"{code}: {message}{loc_str}"
+
+     lines = [
+         f"{code}: {explanation.title}",
+         f" {message}",
+         "",
+         f" How to fix:",
+         f" Before: {explanation.example_before}",
+         f" After: {explanation.example_after}",
+     ]
+
+     if explanation.doc_link:
+         lines.append(f" Docs: {explanation.doc_link}")
+
+     if location:
+         lines.append(f" Location: {location}")
+
+     return "\n".join(lines)
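The explanations table maps each lint code to a before/after fix example, and `format_rich_error` renders that into a multi-line message, falling back to a single `CODE: message at location` line for unknown codes. A usage sketch (the message text and the unknown code are invented for illustration):

    from latch_eval_tools.linter.explanations import format_rich_error

    # Known code: title plus the Before/After fix example from EXPLANATIONS
    print(format_rich_error("E013", "kit '10x' is not a valid kit", location="metadata.kit"))

    # Code not in EXPLANATIONS: falls back to the compact one-line form
    print(format_rich_error("E999", "something unexpected"))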
latch_eval_tools/linter/runner.py
@@ -0,0 +1,146 @@
+ import json
+ from pathlib import Path
+
+ from .schema import LintResult, LintIssue
+ from .validators import ALL_VALIDATORS
+
+
+ def lint_eval(path: str | Path) -> LintResult:
+     path = Path(path)
+     result = LintResult(file_path=str(path))
+
+     if not path.exists():
+         result.issues.append(LintIssue("error", "E000", f"File not found: {path}"))
+         return result
+
+     if not path.suffix == ".json":
+         result.issues.append(LintIssue("warning", "W000", f"File does not have .json extension: {path}"))
+
+     try:
+         with open(path) as f:
+             data = json.load(f)
+     except json.JSONDecodeError as e:
+         result.issues.append(LintIssue("error", "E001", f"Invalid JSON: {e}"))
+         return result
+
+     if not isinstance(data, dict):
+         result.issues.append(LintIssue("error", "E002", f"Root must be object, got {type(data).__name__}"))
+         return result
+
+     for validator in ALL_VALIDATORS:
+         result.issues.extend(validator(data))
+
+     return result
+
+
+ def lint_directory(path: str | Path, pattern: str = "**/*.json") -> list[LintResult]:
+     path = Path(path)
+     results = []
+
+     if not path.exists():
+         return [LintResult(
+             file_path=str(path),
+             issues=[LintIssue("error", "E000", f"Directory not found: {path}")]
+         )]
+
+     if not path.is_dir():
+         return [lint_eval(path)]
+
+     for json_file in sorted(path.glob(pattern)):
+         if json_file.name.startswith("."):
+             continue
+         results.append(lint_eval(json_file))
+
+     return results
+
+
+ def format_results(results: list[LintResult], format: str = "console") -> str:
+     if format == "console":
+         return _format_console(results)
+     elif format == "json":
+         return _format_json(results)
+     elif format == "markdown":
+         return _format_markdown(results)
+     else:
+         raise ValueError(f"Unknown format: {format}")
+
+
+ def _format_console(results: list[LintResult]) -> str:
+     lines = []
+     total_errors = 0
+     total_warnings = 0
+
+     for result in results:
+         if not result.issues:
+             continue
+
+         lines.append(f"\n{result.file_path}")
+         for issue in result.issues:
+             prefix = " ✗" if issue.level == "error" else " ⚠"
+             lines.append(f"{prefix} {issue}")
+
+         total_errors += result.error_count
+         total_warnings += result.warning_count
+
+     lines.append(f"\n{'='*50}")
+     lines.append(f"Files checked: {len(results)}")
+     lines.append(f"Files with issues: {sum(1 for r in results if r.issues)}")
+     lines.append(f"Errors: {total_errors}, Warnings: {total_warnings}")
+
+     passed = sum(1 for r in results if r.passed)
+     lines.append(f"Passed: {passed}/{len(results)}")
+
+     return "\n".join(lines)
+
+
+ def _format_json(results: list[LintResult]) -> str:
+     output = {
+         "summary": {
+             "files_checked": len(results),
+             "files_with_issues": sum(1 for r in results if r.issues),
+             "total_errors": sum(r.error_count for r in results),
+             "total_warnings": sum(r.warning_count for r in results),
+             "passed": sum(1 for r in results if r.passed),
+         },
+         "results": [
+             {
+                 "file": r.file_path,
+                 "passed": r.passed,
+                 "issues": [
+                     {"level": i.level, "code": i.code, "message": i.message, "location": i.location}
+                     for i in r.issues
+                 ]
+             }
+             for r in results
+         ]
+     }
+     return json.dumps(output, indent=2)
+
+
+ def _format_markdown(results: list[LintResult]) -> str:
+     lines = ["# Lint Results\n"]
+
+     total_errors = sum(r.error_count for r in results)
+     total_warnings = sum(r.warning_count for r in results)
+     passed = sum(1 for r in results if r.passed)
+
+     lines.append(f"**Files checked:** {len(results)}")
+     lines.append(f"**Passed:** {passed}/{len(results)}")
+     lines.append(f"**Errors:** {total_errors}, **Warnings:** {total_warnings}\n")
+
+     files_with_issues = [r for r in results if r.issues]
+     if not files_with_issues:
+         lines.append("All files passed validation.")
+         return "\n".join(lines)
+
+     lines.append("## Issues\n")
+     for result in files_with_issues:
+         lines.append(f"### `{result.file_path}`\n")
+         lines.append("| Level | Code | Message | Location |")
+         lines.append("|-------|------|---------|----------|")
+         for issue in result.issues:
+             loc = issue.location or "-"
+             lines.append(f"| {issue.level} | {issue.code} | {issue.message} | {loc} |")
+         lines.append("")
+
+     return "\n".join(lines)
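`lint_directory` globs `**/*.json` under a directory (skipping dotfiles), runs `lint_eval` on each file, and `format_results` serializes the results as console text, JSON, or Markdown. A CI-style invocation sketch (the directory name is hypothetical):

    import sys
    from latch_eval_tools.linter import lint_directory, format_results

    results = lint_directory("evals/")             # hypothetical directory
    print(format_results(results, format="json"))  # or "console" / "markdown"
    sys.exit(0 if all(r.passed for r in results) else 1)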
latch_eval_tools/linter/schema.py
@@ -0,0 +1,126 @@
+ import re
+ from dataclasses import dataclass, field
+
+ VALID_TASKS = [
+     "qc",
+     "normalization",
+     "dimensionality_reduction",
+     "clustering",
+     "cell_typing",
+     "differential_expression",
+     "spatial_analysis",
+ ]
+
+ VALID_KITS = [
+     "xenium",
+     "visium",
+     "merfish",
+     "vizgen",
+     "cosmx",
+     "seeker",
+     "takara",
+     "atlasxomics",
+     "curio",
+ ]
+
+ VALID_TIME_HORIZONS = ["small"]
+
+ VALID_EVAL_TYPES = ["scientific", "procedural", "observational"]
+
+ VALID_TOLERANCE_TYPES = ["absolute", "relative", "min", "max"]
+
+ DATA_NODE_PATTERN = re.compile(r"^latch://\d+\.(account|node)(/.*)?$")
+
+ MULTIPLE_CHOICE_PLACEHOLDER = "<letter>"
+ NUMERIC_PLACEHOLDER = "<number>"
+
+ GRADER_CONFIGS: dict[str, dict] = {
+     "numeric_tolerance": {
+         "required": ["ground_truth", "tolerances"],
+         "recognized": {"ground_truth", "tolerances", "tolerance"},
+         "answer_fields_from": "ground_truth",
+     },
+     "multiple_choice": {
+         "required_any": [["correct_answer", "correct_answers"]],
+         "recognized": {"correct_answer", "correct_answers"},
+         "answer_fields": ["answer"],
+     },
+     "distribution_comparison": {
+         "required": ["ground_truth", "tolerances"],
+         "recognized": {"ground_truth", "tolerances"},
+         "answer_fields": ["cell_type_distribution"],
+         "answer_fields_optional": ["total_cells"],
+     },
+     "marker_gene_precision_recall": {
+         "required": ["canonical_markers", "scoring", "answer_field"],
+         "recognized": {"canonical_markers", "ground_truth_labels", "scoring", "answer_field"},
+         "answer_field_from_config": "answer_field",
+         "answer_field_default": "top_marker_genes",
+     },
+     "label_set_jaccard": {
+         "required": ["ground_truth_labels"],
+         "recognized": {"ground_truth_labels", "scoring", "answer_field"},
+         "answer_field_from_config": "answer_field",
+         "answer_field_default": "cell_types_predicted",
+     },
+     "jaccard_label_set": {
+         "required": ["ground_truth_labels"],
+         "recognized": {"ground_truth_labels", "scoring", "answer_field"},
+         "answer_field_from_config": "answer_field",
+         "answer_field_default": "cell_types_predicted",
+     },
+     "marker_gene_separation": {
+         "required": ["scoring"],
+         "recognized": {"scoring"},
+         "answer_fields": ["per_gene_stats", "mean_auroc"],
+     },
+     "spatial_adjacency": {
+         "required": ["scoring"],
+         "recognized": {"scoring"},
+         "answer_fields": [
+             "median_ic_to_pc_um",
+             "p90_ic_to_pc_um",
+             "pct_ic_within_15um",
+             "pct_ic_mixed_within_55um",
+             "adjacency_pass",
+         ],
+     },
+ }
+
+ VALID_GRADER_TYPES = list(GRADER_CONFIGS.keys())
+
+ ALLOWED_TOP_LEVEL_FIELDS = {"id", "task", "data_node", "grader", "notes", "metadata"}
+
+ ALLOWED_METADATA_FIELDS = {"task", "kit", "time_horizon", "eval_type", "timeout_s"}
+
+ ALLOWED_GRADER_FIELDS = {"type", "config"}
+
+
+ @dataclass
+ class LintIssue:
+     level: str  # "error", "warning", "info"
+     code: str
+     message: str
+     location: str = ""
+
+     def __str__(self) -> str:
+         loc = f" at {self.location}" if self.location else ""
+         return f"[{self.level.upper()}] {self.code}: {self.message}{loc}"
+
+
+ @dataclass
+ class LintResult:
+     file_path: str
+     issues: list[LintIssue] = field(default_factory=list)
+
+     @property
+     def passed(self) -> bool:
+         return not any(i.level == "error" for i in self.issues)
+
+     @property
+     def error_count(self) -> int:
+         return sum(1 for i in self.issues if i.level == "error")
+
+     @property
+     def warning_count(self) -> int:
+         return sum(1 for i in self.issues if i.level == "warning")
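Reading the schema constants and `GRADER_CONFIGS` together, a numeric_tolerance eval needs an `id`, a non-empty `task`, metadata with a valid `task`/`kit`/`time_horizon`, a `data_node` matching `DATA_NODE_PATTERN`, and a grader config with `ground_truth` plus `tolerances`. A sketch of a document that should satisfy those checks, assuming the validators in validators.py enforce what the constants and explanations above describe (all values are illustrative placeholders; the data_node is reused from the E021 example):

    import json
    import tempfile
    from pathlib import Path

    from latch_eval_tools.linter import lint_eval

    # Illustrative placeholder eval, not a real dataset or benchmark entry.
    eval_doc = {
        "id": "clustering_demo_001",
        "task": (
            "Cluster the dataset and report the number of clusters.\n\n"
            "Return EXACTLY:\n\n<EVAL_ANSWER>\n{\"n_clusters\": <number>}\n</EVAL_ANSWER>"
        ),
        "data_node": "latch://40248.account/spatialbench/data/GSE123",
        "metadata": {"task": "clustering", "kit": "xenium", "time_horizon": "small", "eval_type": "observational"},
        "grader": {
            "type": "numeric_tolerance",
            "config": {
                "ground_truth": {"n_clusters": 5},
                "tolerances": {"n_clusters": {"type": "absolute", "value": 1}},
            },
        },
    }

    path = Path(tempfile.mkdtemp()) / "clustering_demo_001.json"
    path.write_text(json.dumps(eval_doc, indent=2))
    print(lint_eval(path).passed)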