latch-eval-tools 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. latch_eval_tools/__init__.py +64 -0
  2. latch_eval_tools/answer_extraction.py +35 -0
  3. latch_eval_tools/cli/__init__.py +0 -0
  4. latch_eval_tools/cli/eval_lint.py +185 -0
  5. latch_eval_tools/eval_server.py +570 -0
  6. latch_eval_tools/faas_utils.py +13 -0
  7. latch_eval_tools/graders/__init__.py +40 -0
  8. latch_eval_tools/graders/base.py +29 -0
  9. latch_eval_tools/graders/distribution.py +102 -0
  10. latch_eval_tools/graders/label_set.py +75 -0
  11. latch_eval_tools/graders/marker_gene.py +317 -0
  12. latch_eval_tools/graders/multiple_choice.py +38 -0
  13. latch_eval_tools/graders/numeric.py +137 -0
  14. latch_eval_tools/graders/spatial.py +93 -0
  15. latch_eval_tools/harness/__init__.py +27 -0
  16. latch_eval_tools/harness/claudecode.py +212 -0
  17. latch_eval_tools/harness/minisweagent.py +265 -0
  18. latch_eval_tools/harness/plotsagent.py +156 -0
  19. latch_eval_tools/harness/runner.py +191 -0
  20. latch_eval_tools/harness/utils.py +191 -0
  21. latch_eval_tools/headless_eval_server.py +727 -0
  22. latch_eval_tools/linter/__init__.py +25 -0
  23. latch_eval_tools/linter/explanations.py +331 -0
  24. latch_eval_tools/linter/runner.py +146 -0
  25. latch_eval_tools/linter/schema.py +126 -0
  26. latch_eval_tools/linter/validators.py +595 -0
  27. latch_eval_tools/types.py +30 -0
  28. latch_eval_tools/wrapper_entrypoint.py +316 -0
  29. latch_eval_tools-0.1.0.dist-info/METADATA +118 -0
  30. latch_eval_tools-0.1.0.dist-info/RECORD +33 -0
  31. latch_eval_tools-0.1.0.dist-info/WHEEL +4 -0
  32. latch_eval_tools-0.1.0.dist-info/entry_points.txt +2 -0
  33. latch_eval_tools-0.1.0.dist-info/licenses/LICENSE +1 -0
latch_eval_tools/__init__.py
@@ -0,0 +1,64 @@
+ from latch_eval_tools.types import Eval, EvalResult, TestCase, TestResult
+ from latch_eval_tools.linter import lint_eval, lint_directory, LintResult
+ from latch_eval_tools.harness import (
+     EvalRunner,
+     run_minisweagent_task,
+     run_claudecode_task,
+     run_plotsagent_task,
+     download_single_dataset,
+     download_data,
+     batch_download_datasets,
+     setup_workspace,
+     cleanup_workspace,
+ )
+ from latch_eval_tools.graders import (
+     BinaryGrader,
+     GraderResult,
+     get_nested_value,
+     NumericToleranceGrader,
+     MarkerGenePrecisionRecallGrader,
+     MarkerGeneSeparationGrader,
+     LabelSetJaccardGrader,
+     DistributionComparisonGrader,
+     SpatialAdjacencyGrader,
+     MultipleChoiceGrader,
+     GRADER_REGISTRY,
+     get_grader,
+ )
+
+ __all__ = [
+     # Types
+     "Eval",
+     "EvalResult",
+     "TestCase",  # Backward compatibility alias
+     "TestResult",  # Backward compatibility alias
+     # Linter
+     "lint_eval",
+     "lint_directory",
+     "LintResult",
+     # Harness
+     "EvalRunner",
+     "run_minisweagent_task",
+     "run_claudecode_task",
+     "run_plotsagent_task",
+     "download_single_dataset",
+     "download_data",
+     "batch_download_datasets",
+     "setup_workspace",
+     "cleanup_workspace",
+     # Graders
+     "BinaryGrader",
+     "GraderResult",
+     "get_nested_value",
+     "NumericToleranceGrader",
+     "MarkerGenePrecisionRecallGrader",
+     "MarkerGeneSeparationGrader",
+     "LabelSetJaccardGrader",
+     "DistributionComparisonGrader",
+     "SpatialAdjacencyGrader",
+     "MultipleChoiceGrader",
+     "GRADER_REGISTRY",
+     "get_grader",
+ ]
+
+ __version__ = "0.1.0"
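The `__init__.py` above re-exports the linter, harness, and grader APIs at the package root. Below is a minimal sketch of driving the linter through these top-level imports; the eval file path is hypothetical, and the `LintResult` fields used (`passed`, `issues`, `error_count`, with `level`/`code`/`message` on each issue) are the ones exercised by `cli/eval_lint.py` further down.

from pathlib import Path

from latch_eval_tools import lint_eval

# Hypothetical eval file; lint_eval accepts a Path (see cli/eval_lint.py).
result = lint_eval(Path("evals/example_eval.json"))
if not result.passed:
    for issue in result.issues:
        # Each issue carries a level, a code, and a message.
        print(f"{issue.level}: {issue.code}: {issue.message}")
print(f"{result.error_count} error(s) found")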
latch_eval_tools/answer_extraction.py
@@ -0,0 +1,35 @@
+ import json
+ import re
+
+
+ def extract_answer_from_conversation(conversation: list[dict]) -> dict | None:
+     """Extract the JSON answer from a conversation history.
+
+     Looks for submit_response tool calls with EVAL_ANSWER tags in the summary.
+
+     Args:
+         conversation: List of message dicts from agent conversation
+
+     Returns:
+         Parsed JSON answer dict, or None if not found
+     """
+     for msg in reversed(conversation):
+         if msg.get("type") != "anthropic_message" or msg.get("role") != "assistant":
+             continue
+
+         content = msg.get("content", [])
+         for block in content:
+             if isinstance(block, dict) and block.get("type") == "tool_use":
+                 if block.get("name") == "submit_response":
+                     tool_input = block.get("input", {})
+                     summary = tool_input.get("summary", "")
+
+                     match = re.search(r'<EVAL_ANSWER>(.*?)</EVAL_ANSWER>', summary, re.DOTALL)
+                     if match:
+                         json_str = match.group(1).strip()
+                         try:
+                             return json.loads(json_str)
+                         except json.JSONDecodeError as e:
+                             print(f"[grader] Failed to parse JSON from EVAL_ANSWER tags: {e}")
+                             return None
+     return None
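`extract_answer_from_conversation` walks the conversation in reverse, finds the most recent assistant message carrying a `submit_response` tool call, and parses the JSON wrapped in `<EVAL_ANSWER>` tags inside its `summary`. A minimal sketch with a synthetic conversation follows; the answer payload is hypothetical, but the message shape mirrors exactly the keys the function reads.

from latch_eval_tools.answer_extraction import extract_answer_from_conversation

conversation = [
    {
        "type": "anthropic_message",
        "role": "assistant",
        "content": [
            {
                "type": "tool_use",
                "name": "submit_response",
                "input": {
                    # Hypothetical answer payload wrapped in EVAL_ANSWER tags.
                    "summary": 'Finished. <EVAL_ANSWER>{"n_clusters": 12}</EVAL_ANSWER>'
                },
            }
        ],
    }
]

assert extract_answer_from_conversation(conversation) == {"n_clusters": 12}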
latch_eval_tools/cli/__init__.py
File without changes
latch_eval_tools/cli/eval_lint.py
@@ -0,0 +1,185 @@
+ #!/usr/bin/env python3
+ import argparse
+ import json
+ import sys
+ from pathlib import Path
+
+ from latch_eval_tools.linter import lint_eval, lint_directory, LintResult
+ from latch_eval_tools.linter.explanations import get_explanation
+
+
+ def format_console_rich(results: list[LintResult]) -> str:
+     lines = []
+     total_errors = 0
+     total_warnings = 0
+
+     for result in results:
+         lines.append(f"\nChecking: {result.file_path}")
+         lines.append("─" * 50)
+
+         if not result.issues:
+             lines.append("✓ All checks passed")
+             continue
+
+         for issue in result.issues:
+             prefix = "✗" if issue.level == "error" else "⚠"
+             explanation = get_explanation(issue.code)
+
+             lines.append(f"\n{prefix} {issue.code}: {issue.message}")
+
+             if explanation:
+                 lines.append(f"")
+                 lines.append(f"  Fix: {explanation.example_before} → {explanation.example_after}")
+                 if explanation.doc_link:
+                     lines.append(f"  Docs: {explanation.doc_link}")
+
+             if issue.location:
+                 lines.append(f"  Location: {issue.location}")
+
+         total_errors += result.error_count
+         total_warnings += result.warning_count
+
+     lines.append("")
+     lines.append("─" * 50)
+     lines.append(f"Result: {total_errors} error(s), {total_warnings} warning(s)")
+     lines.append(f"Files: {sum(1 for r in results if r.passed)}/{len(results)} passed")
+
+     return "\n".join(lines)
+
+
+ def format_json_output(results: list[LintResult]) -> str:
+     output = {
+         "summary": {
+             "files_checked": len(results),
+             "files_passed": sum(1 for r in results if r.passed),
+             "total_errors": sum(r.error_count for r in results),
+             "total_warnings": sum(r.warning_count for r in results),
+         },
+         "results": [],
+     }
+
+     for result in results:
+         result_entry = {
+             "file": result.file_path,
+             "passed": result.passed,
+             "errors": result.error_count,
+             "warnings": result.warning_count,
+             "issues": [],
+         }
+
+         for issue in result.issues:
+             issue_entry: dict = {
+                 "level": issue.level,
+                 "code": issue.code,
+                 "message": issue.message,
+             }
+             if issue.location:
+                 issue_entry["location"] = issue.location
+
+             explanation = get_explanation(issue.code)
+             if explanation:
+                 issue_entry["fix"] = {
+                     "before": explanation.example_before,
+                     "after": explanation.example_after,
+                 }
+                 if explanation.doc_link:
+                     issue_entry["docs"] = explanation.doc_link
+
+             result_entry["issues"].append(issue_entry)
+
+         output["results"].append(result_entry)
+
+     return json.dumps(output, indent=2)
+
+
+ VALID_CATEGORIES = ["qc", "normalization", "dimensionality_reduction", "clustering", "cell_typing", "differential_expression", "spatial_analysis"]
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         prog="eval-lint",
+         description="Validate eval JSON files locally (no credentials required)",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   eval-lint path/to/eval.json       # Lint single file
+   eval-lint evals/my_dataset/       # Lint directory
+   eval-lint evals/ --category qc    # Lint only QC evals
+   eval-lint evals/ --format json    # JSON output for CI/CD
+   eval-lint evals/ -q               # Quiet mode (exit code only)
+
+ Exit codes:
+   0  All files passed validation
+   1  One or more files have errors
+ """,
+     )
+     parser.add_argument(
+         "path",
+         type=Path,
+         help="Path to eval JSON file or directory containing eval files",
+     )
+     parser.add_argument(
+         "--category", "-c",
+         choices=VALID_CATEGORIES,
+         help="Only lint evals with this metadata.task category",
+     )
+     parser.add_argument(
+         "--format", "-f",
+         choices=["console", "json"],
+         default="console",
+         help="Output format (default: console)",
+     )
+     parser.add_argument(
+         "--quiet", "-q",
+         action="store_true",
+         help="Quiet mode: only show summary and exit code",
+     )
+     parser.add_argument(
+         "--pattern",
+         default="**/*.json",
+         help="Glob pattern for finding files in directory (default: **/*.json)",
+     )
+
+     args = parser.parse_args()
+
+     if not args.path.exists():
+         print(f"Error: Path not found: {args.path}", file=sys.stderr)
+         sys.exit(1)
+
+     if args.path.is_file():
+         results = [lint_eval(args.path)]
+     else:
+         results = lint_directory(args.path, args.pattern)
+
+     if args.category:
+         filtered_results = []
+         for result in results:
+             try:
+                 with open(result.file_path) as f:
+                     eval_data = json.load(f)
+                 if eval_data.get("metadata", {}).get("task") == args.category:
+                     filtered_results.append(result)
+             except (json.JSONDecodeError, IOError):
+                 filtered_results.append(result)
+         results = filtered_results
+
+     if not results:
+         print("No eval files found", file=sys.stderr)
+         sys.exit(1)
+
+     total_errors = sum(r.error_count for r in results)
+     all_passed = all(r.passed for r in results)
+
+     if args.quiet:
+         passed = sum(1 for r in results if r.passed)
+         print(f"{passed}/{len(results)} files passed, {total_errors} error(s)")
+     elif args.format == "json":
+         print(format_json_output(results))
+     else:
+         print(format_console_rich(results))
+
+     sys.exit(0 if all_passed else 1)
+
+
+ if __name__ == "__main__":
+     main()
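The CLI writes its report to stdout and exits non-zero when any file has errors, so the `--format json` output can be consumed directly in CI. A minimal sketch, assuming a hypothetical `evals/` directory; the keys read here (`summary`, `results`, `issues`) are the ones produced by `format_json_output` above.

import json
import subprocess

# eval-lint exits 1 when errors are found, so skip check=True and parse stdout either way.
proc = subprocess.run(
    ["eval-lint", "evals/", "--format", "json"],  # hypothetical evals/ directory
    capture_output=True,
    text=True,
)
report = json.loads(proc.stdout)
summary = report["summary"]
print(f"{summary['files_passed']}/{summary['files_checked']} files passed")
for entry in report["results"]:
    for issue in entry["issues"]:
        print(f"{entry['file']}: [{issue['level']}] {issue['code']}: {issue['message']}")
raise SystemExit(proc.returncode)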