latch-eval-tools 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- latch_eval_tools/__init__.py +64 -0
- latch_eval_tools/answer_extraction.py +35 -0
- latch_eval_tools/cli/__init__.py +0 -0
- latch_eval_tools/cli/eval_lint.py +185 -0
- latch_eval_tools/eval_server.py +570 -0
- latch_eval_tools/faas_utils.py +13 -0
- latch_eval_tools/graders/__init__.py +40 -0
- latch_eval_tools/graders/base.py +29 -0
- latch_eval_tools/graders/distribution.py +102 -0
- latch_eval_tools/graders/label_set.py +75 -0
- latch_eval_tools/graders/marker_gene.py +317 -0
- latch_eval_tools/graders/multiple_choice.py +38 -0
- latch_eval_tools/graders/numeric.py +137 -0
- latch_eval_tools/graders/spatial.py +93 -0
- latch_eval_tools/harness/__init__.py +27 -0
- latch_eval_tools/harness/claudecode.py +212 -0
- latch_eval_tools/harness/minisweagent.py +265 -0
- latch_eval_tools/harness/plotsagent.py +156 -0
- latch_eval_tools/harness/runner.py +191 -0
- latch_eval_tools/harness/utils.py +191 -0
- latch_eval_tools/headless_eval_server.py +727 -0
- latch_eval_tools/linter/__init__.py +25 -0
- latch_eval_tools/linter/explanations.py +331 -0
- latch_eval_tools/linter/runner.py +146 -0
- latch_eval_tools/linter/schema.py +126 -0
- latch_eval_tools/linter/validators.py +595 -0
- latch_eval_tools/types.py +30 -0
- latch_eval_tools/wrapper_entrypoint.py +316 -0
- latch_eval_tools-0.1.0.dist-info/METADATA +118 -0
- latch_eval_tools-0.1.0.dist-info/RECORD +33 -0
- latch_eval_tools-0.1.0.dist-info/WHEEL +4 -0
- latch_eval_tools-0.1.0.dist-info/entry_points.txt +2 -0
- latch_eval_tools-0.1.0.dist-info/licenses/LICENSE +1 -0
latch_eval_tools/__init__.py

@@ -0,0 +1,64 @@
+from latch_eval_tools.types import Eval, EvalResult, TestCase, TestResult
+from latch_eval_tools.linter import lint_eval, lint_directory, LintResult
+from latch_eval_tools.harness import (
+    EvalRunner,
+    run_minisweagent_task,
+    run_claudecode_task,
+    run_plotsagent_task,
+    download_single_dataset,
+    download_data,
+    batch_download_datasets,
+    setup_workspace,
+    cleanup_workspace,
+)
+from latch_eval_tools.graders import (
+    BinaryGrader,
+    GraderResult,
+    get_nested_value,
+    NumericToleranceGrader,
+    MarkerGenePrecisionRecallGrader,
+    MarkerGeneSeparationGrader,
+    LabelSetJaccardGrader,
+    DistributionComparisonGrader,
+    SpatialAdjacencyGrader,
+    MultipleChoiceGrader,
+    GRADER_REGISTRY,
+    get_grader,
+)
+
+__all__ = [
+    # Types
+    "Eval",
+    "EvalResult",
+    "TestCase",  # Backward compatibility alias
+    "TestResult",  # Backward compatibility alias
+    # Linter
+    "lint_eval",
+    "lint_directory",
+    "LintResult",
+    # Harness
+    "EvalRunner",
+    "run_minisweagent_task",
+    "run_claudecode_task",
+    "run_plotsagent_task",
+    "download_single_dataset",
+    "download_data",
+    "batch_download_datasets",
+    "setup_workspace",
+    "cleanup_workspace",
+    # Graders
+    "BinaryGrader",
+    "GraderResult",
+    "get_nested_value",
+    "NumericToleranceGrader",
+    "MarkerGenePrecisionRecallGrader",
+    "MarkerGeneSeparationGrader",
+    "LabelSetJaccardGrader",
+    "DistributionComparisonGrader",
+    "SpatialAdjacencyGrader",
+    "MultipleChoiceGrader",
+    "GRADER_REGISTRY",
+    "get_grader",
+]
+
+__version__ = "0.1.0"
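
A minimal usage sketch of the API re-exported here (illustrative only, not part of the package; the idea that GRADER_REGISTRY maps string names to grader classes, the get_grader(name) call pattern, and the "numeric_tolerance" key are all guesses from the exported names):

# Illustrative sketch; registry keys and the get_grader signature are assumptions.
from latch_eval_tools import GRADER_REGISTRY, get_grader, __version__

print(__version__)              # "0.1.0"
print(sorted(GRADER_REGISTRY))  # names of the registered graders (assumed dict keys)

grader = get_grader("numeric_tolerance")  # hypothetical registry key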
latch_eval_tools/answer_extraction.py

@@ -0,0 +1,35 @@
+import json
+import re
+
+
+def extract_answer_from_conversation(conversation: list[dict]) -> dict | None:
+    """Extract the JSON answer from a conversation history.
+
+    Looks for submit_response tool calls with EVAL_ANSWER tags in the summary.
+
+    Args:
+        conversation: List of message dicts from agent conversation
+
+    Returns:
+        Parsed JSON answer dict, or None if not found
+    """
+    for msg in reversed(conversation):
+        if msg.get("type") != "anthropic_message" or msg.get("role") != "assistant":
+            continue
+
+        content = msg.get("content", [])
+        for block in content:
+            if isinstance(block, dict) and block.get("type") == "tool_use":
+                if block.get("name") == "submit_response":
+                    tool_input = block.get("input", {})
+                    summary = tool_input.get("summary", "")
+
+                    match = re.search(r'<EVAL_ANSWER>(.*?)</EVAL_ANSWER>', summary, re.DOTALL)
+                    if match:
+                        json_str = match.group(1).strip()
+                        try:
+                            return json.loads(json_str)
+                        except json.JSONDecodeError as e:
+                            print(f"[grader] Failed to parse JSON from EVAL_ANSWER tags: {e}")
+                            return None
+    return None
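
A constructed example of the message shape this function walks (the conversation below is made up for illustration; only the structure mirrors what the code checks for):

# Made-up conversation; structure follows the checks in extract_answer_from_conversation.
from latch_eval_tools.answer_extraction import extract_answer_from_conversation

conversation = [
    {
        "type": "anthropic_message",
        "role": "assistant",
        "content": [
            {
                "type": "tool_use",
                "name": "submit_response",
                "input": {"summary": 'Done. <EVAL_ANSWER>{"n_clusters": 8}</EVAL_ANSWER>'},
            }
        ],
    }
]

assert extract_answer_from_conversation(conversation) == {"n_clusters": 8}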
latch_eval_tools/cli/__init__.py

File without changes
latch_eval_tools/cli/eval_lint.py

@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+import argparse
+import json
+import sys
+from pathlib import Path
+
+from latch_eval_tools.linter import lint_eval, lint_directory, LintResult
+from latch_eval_tools.linter.explanations import get_explanation
+
+
+def format_console_rich(results: list[LintResult]) -> str:
+    lines = []
+    total_errors = 0
+    total_warnings = 0
+
+    for result in results:
+        lines.append(f"\nChecking: {result.file_path}")
+        lines.append("─" * 50)
+
+        if not result.issues:
+            lines.append("✓ All checks passed")
+            continue
+
+        for issue in result.issues:
+            prefix = "✗" if issue.level == "error" else "⚠"
+            explanation = get_explanation(issue.code)
+
+            lines.append(f"\n{prefix} {issue.code}: {issue.message}")
+
+            if explanation:
+                lines.append(f"")
+                lines.append(f"  Fix: {explanation.example_before} → {explanation.example_after}")
+                if explanation.doc_link:
+                    lines.append(f"  Docs: {explanation.doc_link}")
+
+            if issue.location:
+                lines.append(f"  Location: {issue.location}")
+
+        total_errors += result.error_count
+        total_warnings += result.warning_count
+
+    lines.append("")
+    lines.append("─" * 50)
+    lines.append(f"Result: {total_errors} error(s), {total_warnings} warning(s)")
+    lines.append(f"Files: {sum(1 for r in results if r.passed)}/{len(results)} passed")
+
+    return "\n".join(lines)
+
+
+def format_json_output(results: list[LintResult]) -> str:
+    output = {
+        "summary": {
+            "files_checked": len(results),
+            "files_passed": sum(1 for r in results if r.passed),
+            "total_errors": sum(r.error_count for r in results),
+            "total_warnings": sum(r.warning_count for r in results),
+        },
+        "results": [],
+    }
+
+    for result in results:
+        result_entry = {
+            "file": result.file_path,
+            "passed": result.passed,
+            "errors": result.error_count,
+            "warnings": result.warning_count,
+            "issues": [],
+        }
+
+        for issue in result.issues:
+            issue_entry: dict = {
+                "level": issue.level,
+                "code": issue.code,
+                "message": issue.message,
+            }
+            if issue.location:
+                issue_entry["location"] = issue.location
+
+            explanation = get_explanation(issue.code)
+            if explanation:
+                issue_entry["fix"] = {
+                    "before": explanation.example_before,
+                    "after": explanation.example_after,
+                }
+                if explanation.doc_link:
+                    issue_entry["docs"] = explanation.doc_link
+
+            result_entry["issues"].append(issue_entry)
+
+        output["results"].append(result_entry)
+
+    return json.dumps(output, indent=2)
+
+
+VALID_CATEGORIES = ["qc", "normalization", "dimensionality_reduction", "clustering", "cell_typing", "differential_expression", "spatial_analysis"]
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        prog="eval-lint",
+        description="Validate eval JSON files locally (no credentials required)",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  eval-lint path/to/eval.json       # Lint single file
+  eval-lint evals/my_dataset/       # Lint directory
+  eval-lint evals/ --category qc    # Lint only QC evals
+  eval-lint evals/ --format json    # JSON output for CI/CD
+  eval-lint evals/ -q               # Quiet mode (exit code only)
+
+Exit codes:
+  0  All files passed validation
+  1  One or more files have errors
+""",
+    )
+    parser.add_argument(
+        "path",
+        type=Path,
+        help="Path to eval JSON file or directory containing eval files",
+    )
+    parser.add_argument(
+        "--category", "-c",
+        choices=VALID_CATEGORIES,
+        help="Only lint evals with this metadata.task category",
+    )
+    parser.add_argument(
+        "--format", "-f",
+        choices=["console", "json"],
+        default="console",
+        help="Output format (default: console)",
+    )
+    parser.add_argument(
+        "--quiet", "-q",
+        action="store_true",
+        help="Quiet mode: only show summary and exit code",
+    )
+    parser.add_argument(
+        "--pattern",
+        default="**/*.json",
+        help="Glob pattern for finding files in directory (default: **/*.json)",
+    )
+
+    args = parser.parse_args()
+
+    if not args.path.exists():
+        print(f"Error: Path not found: {args.path}", file=sys.stderr)
+        sys.exit(1)
+
+    if args.path.is_file():
+        results = [lint_eval(args.path)]
+    else:
+        results = lint_directory(args.path, args.pattern)
+
+    if args.category:
+        filtered_results = []
+        for result in results:
+            try:
+                with open(result.file_path) as f:
+                    eval_data = json.load(f)
+                    if eval_data.get("metadata", {}).get("task") == args.category:
+                        filtered_results.append(result)
+            except (json.JSONDecodeError, IOError):
+                filtered_results.append(result)
+        results = filtered_results
+
+    if not results:
+        print("No eval files found", file=sys.stderr)
+        sys.exit(1)
+
+    total_errors = sum(r.error_count for r in results)
+    all_passed = all(r.passed for r in results)
+
+    if args.quiet:
+        passed = sum(1 for r in results if r.passed)
+        print(f"{passed}/{len(results)} files passed, {total_errors} error(s)")
+    elif args.format == "json":
+        print(format_json_output(results))
+    else:
+        print(format_console_rich(results))
+
+    sys.exit(0 if all_passed else 1)
+
+
+if __name__ == "__main__":
+    main()
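
The linter can also be driven programmatically with the same helpers the CLI wraps; a short sketch, assuming an evals/ directory exists (the path is illustrative):

# Sketch of programmatic use; "evals/" is an example path, not one shipped with the package.
from pathlib import Path

from latch_eval_tools.cli.eval_lint import format_json_output
from latch_eval_tools.linter import lint_directory

results = lint_directory(Path("evals/"), "**/*.json")  # same call main() makes for a directory
print(format_json_output(results))                     # same report as `eval-lint --format json`

exit_code = 0 if all(r.passed for r in results) else 1  # mirrors the CLI's exit-code logic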