latch_eval_tools-0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. latch_eval_tools-0.1.0/.gitignore +6 -0
  2. latch_eval_tools-0.1.0/Justfile +7 -0
  3. latch_eval_tools-0.1.0/LICENSE +1 -0
  4. latch_eval_tools-0.1.0/PKG-INFO +118 -0
  5. latch_eval_tools-0.1.0/README.md +96 -0
  6. latch_eval_tools-0.1.0/pyproject.toml +35 -0
  7. latch_eval_tools-0.1.0/src/latch_eval_tools/__init__.py +64 -0
  8. latch_eval_tools-0.1.0/src/latch_eval_tools/answer_extraction.py +35 -0
  9. latch_eval_tools-0.1.0/src/latch_eval_tools/cli/__init__.py +0 -0
  10. latch_eval_tools-0.1.0/src/latch_eval_tools/cli/eval_lint.py +185 -0
  11. latch_eval_tools-0.1.0/src/latch_eval_tools/eval_server.py +570 -0
  12. latch_eval_tools-0.1.0/src/latch_eval_tools/faas_utils.py +13 -0
  13. latch_eval_tools-0.1.0/src/latch_eval_tools/graders/__init__.py +40 -0
  14. latch_eval_tools-0.1.0/src/latch_eval_tools/graders/base.py +29 -0
  15. latch_eval_tools-0.1.0/src/latch_eval_tools/graders/distribution.py +102 -0
  16. latch_eval_tools-0.1.0/src/latch_eval_tools/graders/label_set.py +75 -0
  17. latch_eval_tools-0.1.0/src/latch_eval_tools/graders/marker_gene.py +317 -0
  18. latch_eval_tools-0.1.0/src/latch_eval_tools/graders/multiple_choice.py +38 -0
  19. latch_eval_tools-0.1.0/src/latch_eval_tools/graders/numeric.py +137 -0
  20. latch_eval_tools-0.1.0/src/latch_eval_tools/graders/spatial.py +93 -0
  21. latch_eval_tools-0.1.0/src/latch_eval_tools/harness/__init__.py +27 -0
  22. latch_eval_tools-0.1.0/src/latch_eval_tools/harness/claudecode.py +212 -0
  23. latch_eval_tools-0.1.0/src/latch_eval_tools/harness/minisweagent.py +265 -0
  24. latch_eval_tools-0.1.0/src/latch_eval_tools/harness/plotsagent.py +156 -0
  25. latch_eval_tools-0.1.0/src/latch_eval_tools/harness/runner.py +191 -0
  26. latch_eval_tools-0.1.0/src/latch_eval_tools/harness/utils.py +191 -0
  27. latch_eval_tools-0.1.0/src/latch_eval_tools/headless_eval_server.py +727 -0
  28. latch_eval_tools-0.1.0/src/latch_eval_tools/linter/__init__.py +25 -0
  29. latch_eval_tools-0.1.0/src/latch_eval_tools/linter/explanations.py +331 -0
  30. latch_eval_tools-0.1.0/src/latch_eval_tools/linter/runner.py +146 -0
  31. latch_eval_tools-0.1.0/src/latch_eval_tools/linter/schema.py +126 -0
  32. latch_eval_tools-0.1.0/src/latch_eval_tools/linter/validators.py +595 -0
  33. latch_eval_tools-0.1.0/src/latch_eval_tools/types.py +30 -0
  34. latch_eval_tools-0.1.0/src/latch_eval_tools/wrapper_entrypoint.py +316 -0
@@ -0,0 +1,6 @@
+ .DS_Store
+ __pycache__
+ .ruff_cache
+ !**/.gitkeep
+ /credentials
+ /env
@@ -0,0 +1,7 @@
+ build:
+     rm -rf dist
+     uv build
+
+ publish:
+     uv publish --token $(<credentials/pypi_token)
+     rm -rf dist
@@ -0,0 +1 @@
+ © LatchBio LLC. All rights reserved.
@@ -0,0 +1,118 @@
+ Metadata-Version: 2.4
+ Name: latch-eval-tools
+ Version: 0.1.0
+ Summary: Shared eval tools for single-cell bench, spatial bench, and future biology benchmarks.
+ License-File: LICENSE
+ Requires-Python: >=3.10
+ Requires-Dist: aiohttp>=3.0.0
+ Requires-Dist: anthropic>=0.72.0
+ Requires-Dist: latch-config>=0.1.0
+ Requires-Dist: latch>=2.0.0
+ Requires-Dist: matplotlib>=3.0.0
+ Requires-Dist: mini-swe-agent
+ Requires-Dist: numpy>=1.24.0
+ Requires-Dist: openai>=1.0.0
+ Requires-Dist: orjson>=3.0.0
+ Requires-Dist: pydantic>=2.0.0
+ Requires-Dist: scikit-learn>=1.3.0
+ Requires-Dist: scipy>=1.10.0
+ Requires-Dist: statsmodels>=0.14.0
+ Requires-Dist: websockets>=12.0
+ Description-Content-Type: text/markdown
+
+ # latch-eval-tools
+
+ Shared eval tools for single-cell bench, spatial bench, and future biology benchmarks.
+
+ ## Installation
+
+ ```bash
+ pip install latch-eval-tools
+ ```
+
+ ## Components
+
+ ### Types
+
+ ```python
+ from latch_eval_tools import Eval, EvalResult
+
+ eval_case = Eval(
+     id="test_001",
+     task="Count cells in the dataset",
+     data_node="latch:///data/sample.h5ad",
+     grader={"type": "numeric_tolerance", "config": {...}}
+ )
+ ```
+
+ ### Graders
+
+ Available graders: `numeric_tolerance`, `label_set_jaccard`, `distribution_comparison`, `marker_gene_precision_recall`, `marker_gene_separation`, `spatial_adjacency`, `multiple_choice`
+
+ ```python
+ from latch_eval_tools.graders import get_grader, NumericToleranceGrader
+
+ grader = get_grader("numeric_tolerance")
+ result = grader.evaluate(
+     agent_answer={"n_cells": 1523},
+     config={
+         "ground_truth": {"n_cells": 1500},
+         "tolerances": {"n_cells": {"type": "relative", "value": 0.05}}
+     }
+ )
+ print(result.passed)
+ print(result.reasoning)
+ ```
+
+ ### Harness
+
+ Run evaluations with different agents:
+
+ ```python
+ from latch_eval_tools.harness import EvalRunner, run_minisweagent_task
+
+ runner = EvalRunner("evals/count_cells.json", cache_name=".scbench")
+ result = runner.run(agent_function=lambda task, work_dir:
+     run_minisweagent_task(task, work_dir, model_name="anthropic/claude-sonnet-4")
+ )
+
+ def my_agent(task_prompt: str, work_dir: Path) -> dict:
+     return {"answer": json.loads((work_dir / "eval_answer.json").read_text())}
+
+ runner.run(agent_function=my_agent)
+ ```
+
+ Built-in agents: `run_minisweagent_task`, `run_claudecode_task`, `run_plotsagent_task`
+
+ ### Linter
+
+ Validate eval JSON files:
+
+ ```bash
+ eval-lint evals/my_dataset/
+ eval-lint evals/ --format json
+ ```
+
+ ```python
+ from latch_eval_tools.linter import lint_eval, lint_directory
+
+ result = lint_eval("evals/test.json")
+ print(result.passed, result.issues)
+ ```
+
+ ## Eval JSON Schema
+
+ ```json
+ {
+   "id": "unique_test_id",
+   "task": "Task description for the agent",
+   "data_node": "latch:///path/to/data.h5ad",
+   "grader": {
+     "type": "numeric_tolerance",
+     "config": {
+       "ground_truth": {"field": 42},
+       "tolerances": {"field": {"type": "absolute", "value": 1}}
+     }
+   }
+ }
+ ```
@@ -0,0 +1,96 @@
+ # latch-eval-tools
+
+ Shared eval tools for single-cell bench, spatial bench, and future biology benchmarks.
+
+ ## Installation
+
+ ```bash
+ pip install latch-eval-tools
+ ```
+
+ ## Components
+
+ ### Types
+
+ ```python
+ from latch_eval_tools import Eval, EvalResult
+
+ eval_case = Eval(
+     id="test_001",
+     task="Count cells in the dataset",
+     data_node="latch:///data/sample.h5ad",
+     grader={"type": "numeric_tolerance", "config": {...}}
+ )
+ ```
+
+ ### Graders
+
+ Available graders: `numeric_tolerance`, `label_set_jaccard`, `distribution_comparison`, `marker_gene_precision_recall`, `marker_gene_separation`, `spatial_adjacency`, `multiple_choice`
+
+ ```python
+ from latch_eval_tools.graders import get_grader, NumericToleranceGrader
+
+ grader = get_grader("numeric_tolerance")
+ result = grader.evaluate(
+     agent_answer={"n_cells": 1523},
+     config={
+         "ground_truth": {"n_cells": 1500},
+         "tolerances": {"n_cells": {"type": "relative", "value": 0.05}}
+     }
+ )
+ print(result.passed)
+ print(result.reasoning)
+ ```
+
+ ### Harness
+
+ Run evaluations with different agents:
+
+ ```python
+ from latch_eval_tools.harness import EvalRunner, run_minisweagent_task
+
+ runner = EvalRunner("evals/count_cells.json", cache_name=".scbench")
+ result = runner.run(agent_function=lambda task, work_dir:
+     run_minisweagent_task(task, work_dir, model_name="anthropic/claude-sonnet-4")
+ )
+
+ def my_agent(task_prompt: str, work_dir: Path) -> dict:
+     return {"answer": json.loads((work_dir / "eval_answer.json").read_text())}
+
+ runner.run(agent_function=my_agent)
+ ```
+
+ Built-in agents: `run_minisweagent_task`, `run_claudecode_task`, `run_plotsagent_task`
+
+ ### Linter
+
+ Validate eval JSON files:
+
+ ```bash
+ eval-lint evals/my_dataset/
+ eval-lint evals/ --format json
+ ```
+
+ ```python
+ from latch_eval_tools.linter import lint_eval, lint_directory
+
+ result = lint_eval("evals/test.json")
+ print(result.passed, result.issues)
+ ```
+
+ ## Eval JSON Schema
+
+ ```json
+ {
+   "id": "unique_test_id",
+   "task": "Task description for the agent",
+   "data_node": "latch:///path/to/data.h5ad",
+   "grader": {
+     "type": "numeric_tolerance",
+     "config": {
+       "ground_truth": {"field": 42},
+       "tolerances": {"field": {"type": "absolute", "value": 1}}
+     }
+   }
+ }
+ ```
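
Putting the schema and the grader interface above together, a minimal end-to-end sketch might look like the following. It assumes `get_grader` accepts the `grader.type` string stored in the eval JSON and that the returned grader exposes the `evaluate(agent_answer=..., config=...)` interface shown in the README; the file path and the agent answer are hypothetical stand-ins.

```python
import json
from pathlib import Path

from latch_eval_tools.graders import get_grader

# Hypothetical eval file following the schema above
eval_case = json.loads(Path("evals/count_cells.json").read_text())

# Look up the grader named in the eval and reuse its stored config
grader = get_grader(eval_case["grader"]["type"])
result = grader.evaluate(
    agent_answer={"n_cells": 1523},  # stand-in for the agent's output
    config=eval_case["grader"]["config"],
)
print(result.passed, result.reasoning)
```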
@@ -0,0 +1,35 @@
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [project]
+ name = "latch-eval-tools"
+ version = "0.1.0"
+ description = "Shared eval tools for single-cell bench, spatial bench, and future biology benchmarks."
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "pydantic>=2.0.0",
+     "websockets>=12.0",
+     "aiohttp>=3.0.0",
+     "orjson>=3.0.0",
+     "numpy>=1.24.0",
+     "scipy>=1.10.0",
+     "scikit-learn>=1.3.0",
+     "statsmodels>=0.14.0",
+     "matplotlib>=3.0.0",
+     "openai>=1.0.0",
+     "anthropic>=0.72.0",
+     "latch>=2.0.0",
+     "latch_config>=0.1.0",
+     "mini-swe-agent",
+ ]
+
+ [project.scripts]
+ eval-lint = "latch_eval_tools.cli.eval_lint:main"
+
+ [tool.hatch.metadata]
+ allow-direct-references = true
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/latch_eval_tools"]
@@ -0,0 +1,64 @@
+ from latch_eval_tools.types import Eval, EvalResult, TestCase, TestResult
+ from latch_eval_tools.linter import lint_eval, lint_directory, LintResult
+ from latch_eval_tools.harness import (
+     EvalRunner,
+     run_minisweagent_task,
+     run_claudecode_task,
+     run_plotsagent_task,
+     download_single_dataset,
+     download_data,
+     batch_download_datasets,
+     setup_workspace,
+     cleanup_workspace,
+ )
+ from latch_eval_tools.graders import (
+     BinaryGrader,
+     GraderResult,
+     get_nested_value,
+     NumericToleranceGrader,
+     MarkerGenePrecisionRecallGrader,
+     MarkerGeneSeparationGrader,
+     LabelSetJaccardGrader,
+     DistributionComparisonGrader,
+     SpatialAdjacencyGrader,
+     MultipleChoiceGrader,
+     GRADER_REGISTRY,
+     get_grader,
+ )
+
+ __all__ = [
+     # Types
+     "Eval",
+     "EvalResult",
+     "TestCase",  # Backward compatibility alias
+     "TestResult",  # Backward compatibility alias
+     # Linter
+     "lint_eval",
+     "lint_directory",
+     "LintResult",
+     # Harness
+     "EvalRunner",
+     "run_minisweagent_task",
+     "run_claudecode_task",
+     "run_plotsagent_task",
+     "download_single_dataset",
+     "download_data",
+     "batch_download_datasets",
+     "setup_workspace",
+     "cleanup_workspace",
+     # Graders
+     "BinaryGrader",
+     "GraderResult",
+     "get_nested_value",
+     "NumericToleranceGrader",
+     "MarkerGenePrecisionRecallGrader",
+     "MarkerGeneSeparationGrader",
+     "LabelSetJaccardGrader",
+     "DistributionComparisonGrader",
+     "SpatialAdjacencyGrader",
+     "MultipleChoiceGrader",
+     "GRADER_REGISTRY",
+     "get_grader",
+ ]
+
+ __version__ = "0.1.0"
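
Because `__init__.py` re-exports the grader, harness, and linter symbols, downstream code can import everything from the package root. A small sketch, assuming `GRADER_REGISTRY` is a dict-like mapping of grader type names to grader classes (its exact structure is not shown in this diff):

```python
from latch_eval_tools import GRADER_REGISTRY, get_grader, lint_eval

# List the registered grader type names (assumes a dict-like name -> class registry)
print(sorted(GRADER_REGISTRY))

# The same names are what get_grader() resolves
grader = get_grader("multiple_choice")

# Linting is also re-exported at the package root
print(lint_eval("evals/test.json").passed)
```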
@@ -0,0 +1,35 @@
+ import json
+ import re
+
+
+ def extract_answer_from_conversation(conversation: list[dict]) -> dict | None:
+     """Extract the JSON answer from a conversation history.
+
+     Looks for submit_response tool calls with EVAL_ANSWER tags in the summary.
+
+     Args:
+         conversation: List of message dicts from agent conversation
+
+     Returns:
+         Parsed JSON answer dict, or None if not found
+     """
+     for msg in reversed(conversation):
+         if msg.get("type") != "anthropic_message" or msg.get("role") != "assistant":
+             continue
+
+         content = msg.get("content", [])
+         for block in content:
+             if isinstance(block, dict) and block.get("type") == "tool_use":
+                 if block.get("name") == "submit_response":
+                     tool_input = block.get("input", {})
+                     summary = tool_input.get("summary", "")
+
+                     match = re.search(r'<EVAL_ANSWER>(.*?)</EVAL_ANSWER>', summary, re.DOTALL)
+                     if match:
+                         json_str = match.group(1).strip()
+                         try:
+                             return json.loads(json_str)
+                         except json.JSONDecodeError as e:
+                             print(f"[grader] Failed to parse JSON from EVAL_ANSWER tags: {e}")
+                             return None
+     return None
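
For reference, a minimal conversation that `extract_answer_from_conversation` would accept: a single assistant `anthropic_message` whose `submit_response` tool call wraps the answer in `<EVAL_ANSWER>` tags. The message content below is illustrative, not a real transcript.

```python
from latch_eval_tools.answer_extraction import extract_answer_from_conversation

conversation = [
    {
        "type": "anthropic_message",
        "role": "assistant",
        "content": [
            {
                "type": "tool_use",
                "name": "submit_response",
                "input": {
                    "summary": 'Done. <EVAL_ANSWER>{"n_cells": 1523}</EVAL_ANSWER>',
                },
            }
        ],
    }
]

# The last matching submit_response call wins; malformed JSON returns None
assert extract_answer_from_conversation(conversation) == {"n_cells": 1523}
```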
@@ -0,0 +1,185 @@
+ #!/usr/bin/env python3
+ import argparse
+ import json
+ import sys
+ from pathlib import Path
+
+ from latch_eval_tools.linter import lint_eval, lint_directory, LintResult
+ from latch_eval_tools.linter.explanations import get_explanation
+
+
+ def format_console_rich(results: list[LintResult]) -> str:
+     lines = []
+     total_errors = 0
+     total_warnings = 0
+
+     for result in results:
+         lines.append(f"\nChecking: {result.file_path}")
+         lines.append("─" * 50)
+
+         if not result.issues:
+             lines.append("✓ All checks passed")
+             continue
+
+         for issue in result.issues:
+             prefix = "✗" if issue.level == "error" else "⚠"
+             explanation = get_explanation(issue.code)
+
+             lines.append(f"\n{prefix} {issue.code}: {issue.message}")
+
+             if explanation:
+                 lines.append(f"")
+                 lines.append(f"  Fix: {explanation.example_before} → {explanation.example_after}")
+                 if explanation.doc_link:
+                     lines.append(f"  Docs: {explanation.doc_link}")
+
+             if issue.location:
+                 lines.append(f"  Location: {issue.location}")
+
+         total_errors += result.error_count
+         total_warnings += result.warning_count
+
+     lines.append("")
+     lines.append("─" * 50)
+     lines.append(f"Result: {total_errors} error(s), {total_warnings} warning(s)")
+     lines.append(f"Files: {sum(1 for r in results if r.passed)}/{len(results)} passed")
+
+     return "\n".join(lines)
+
+
+ def format_json_output(results: list[LintResult]) -> str:
+     output = {
+         "summary": {
+             "files_checked": len(results),
+             "files_passed": sum(1 for r in results if r.passed),
+             "total_errors": sum(r.error_count for r in results),
+             "total_warnings": sum(r.warning_count for r in results),
+         },
+         "results": [],
+     }
+
+     for result in results:
+         result_entry = {
+             "file": result.file_path,
+             "passed": result.passed,
+             "errors": result.error_count,
+             "warnings": result.warning_count,
+             "issues": [],
+         }
+
+         for issue in result.issues:
+             issue_entry: dict = {
+                 "level": issue.level,
+                 "code": issue.code,
+                 "message": issue.message,
+             }
+             if issue.location:
+                 issue_entry["location"] = issue.location
+
+             explanation = get_explanation(issue.code)
+             if explanation:
+                 issue_entry["fix"] = {
+                     "before": explanation.example_before,
+                     "after": explanation.example_after,
+                 }
+                 if explanation.doc_link:
+                     issue_entry["docs"] = explanation.doc_link
+
+             result_entry["issues"].append(issue_entry)
+
+         output["results"].append(result_entry)
+
+     return json.dumps(output, indent=2)
+
+
+ VALID_CATEGORIES = ["qc", "normalization", "dimensionality_reduction", "clustering", "cell_typing", "differential_expression", "spatial_analysis"]
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         prog="eval-lint",
+         description="Validate eval JSON files locally (no credentials required)",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   eval-lint path/to/eval.json      # Lint single file
+   eval-lint evals/my_dataset/      # Lint directory
+   eval-lint evals/ --category qc   # Lint only QC evals
+   eval-lint evals/ --format json   # JSON output for CI/CD
+   eval-lint evals/ -q              # Quiet mode (exit code only)
+
+ Exit codes:
+   0  All files passed validation
+   1  One or more files have errors
+ """,
+     )
+     parser.add_argument(
+         "path",
+         type=Path,
+         help="Path to eval JSON file or directory containing eval files",
+     )
+     parser.add_argument(
+         "--category", "-c",
+         choices=VALID_CATEGORIES,
+         help="Only lint evals with this metadata.task category",
+     )
+     parser.add_argument(
+         "--format", "-f",
+         choices=["console", "json"],
+         default="console",
+         help="Output format (default: console)",
+     )
+     parser.add_argument(
+         "--quiet", "-q",
+         action="store_true",
+         help="Quiet mode: only show summary and exit code",
+     )
+     parser.add_argument(
+         "--pattern",
+         default="**/*.json",
+         help="Glob pattern for finding files in directory (default: **/*.json)",
+     )
+
+     args = parser.parse_args()
+
+     if not args.path.exists():
+         print(f"Error: Path not found: {args.path}", file=sys.stderr)
+         sys.exit(1)
+
+     if args.path.is_file():
+         results = [lint_eval(args.path)]
+     else:
+         results = lint_directory(args.path, args.pattern)
+
+     if args.category:
+         filtered_results = []
+         for result in results:
+             try:
+                 with open(result.file_path) as f:
+                     eval_data = json.load(f)
+                 if eval_data.get("metadata", {}).get("task") == args.category:
+                     filtered_results.append(result)
+             except (json.JSONDecodeError, IOError):
+                 filtered_results.append(result)
+         results = filtered_results
+
+     if not results:
+         print("No eval files found", file=sys.stderr)
+         sys.exit(1)
+
+     total_errors = sum(r.error_count for r in results)
+     all_passed = all(r.passed for r in results)
+
+     if args.quiet:
+         passed = sum(1 for r in results if r.passed)
+         print(f"{passed}/{len(results)} files passed, {total_errors} error(s)")
+     elif args.format == "json":
+         print(format_json_output(results))
+     else:
+         print(format_console_rich(results))
+
+     sys.exit(0 if all_passed else 1)
+
+
+ if __name__ == "__main__":
+     main()
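
As a rough sketch of how the `--format json` output could be consumed in CI: the keys mirror `format_json_output` above, the `evals/` path is hypothetical, and `eval-lint` is assumed to be on `PATH` (it exits non-zero when any file has errors, but the JSON report is still printed).

```python
import json
import subprocess

# Run the linter and capture its machine-readable report
proc = subprocess.run(
    ["eval-lint", "evals/", "--format", "json"],
    capture_output=True,
    text=True,
)
report = json.loads(proc.stdout)

summary = report["summary"]
print(f'{summary["files_passed"]}/{summary["files_checked"]} files passed')

# Surface each issue with its suggested fix, when one is available
for entry in report["results"]:
    for issue in entry["issues"]:
        fix = issue.get("fix")
        hint = f' (fix: {fix["before"]} -> {fix["after"]})' if fix else ""
        print(f'{entry["file"]}: [{issue["level"]}] {issue["code"]}: {issue["message"]}{hint}')
```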