latch-eval-tools 0.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. latch_eval_tools/__init__.py +64 -0
  2. latch_eval_tools/answer_extraction.py +35 -0
  3. latch_eval_tools/cli/__init__.py +0 -0
  4. latch_eval_tools/cli/eval_lint.py +185 -0
  5. latch_eval_tools/eval_server.py +570 -0
  6. latch_eval_tools/faas_utils.py +13 -0
  7. latch_eval_tools/graders/__init__.py +40 -0
  8. latch_eval_tools/graders/base.py +29 -0
  9. latch_eval_tools/graders/distribution.py +102 -0
  10. latch_eval_tools/graders/label_set.py +75 -0
  11. latch_eval_tools/graders/marker_gene.py +317 -0
  12. latch_eval_tools/graders/multiple_choice.py +38 -0
  13. latch_eval_tools/graders/numeric.py +137 -0
  14. latch_eval_tools/graders/spatial.py +93 -0
  15. latch_eval_tools/harness/__init__.py +27 -0
  16. latch_eval_tools/harness/claudecode.py +212 -0
  17. latch_eval_tools/harness/minisweagent.py +265 -0
  18. latch_eval_tools/harness/plotsagent.py +156 -0
  19. latch_eval_tools/harness/runner.py +191 -0
  20. latch_eval_tools/harness/utils.py +191 -0
  21. latch_eval_tools/headless_eval_server.py +727 -0
  22. latch_eval_tools/linter/__init__.py +25 -0
  23. latch_eval_tools/linter/explanations.py +331 -0
  24. latch_eval_tools/linter/runner.py +146 -0
  25. latch_eval_tools/linter/schema.py +126 -0
  26. latch_eval_tools/linter/validators.py +595 -0
  27. latch_eval_tools/types.py +30 -0
  28. latch_eval_tools/wrapper_entrypoint.py +316 -0
  29. latch_eval_tools-0.1.0.dist-info/METADATA +118 -0
  30. latch_eval_tools-0.1.0.dist-info/RECORD +33 -0
  31. latch_eval_tools-0.1.0.dist-info/WHEEL +4 -0
  32. latch_eval_tools-0.1.0.dist-info/entry_points.txt +2 -0
  33. latch_eval_tools-0.1.0.dist-info/licenses/LICENSE +1 -0
latch_eval_tools/harness/plotsagent.py
@@ -0,0 +1,156 @@
+ import json
+ import os
+ import subprocess
+ import time
+ from pathlib import Path
+
+ EVAL_TIMEOUT = 600
+
+
+ def run_plotsagent_task(
+     task_prompt: str,
+     work_dir: Path,
+     model_name: str | None = None,
+     eval_timeout: int = EVAL_TIMEOUT,
+ ) -> dict:
+     """Run PlotsAgent on a task.
+
+     Args:
+         task_prompt: Task description for the agent
+         work_dir: Working directory for the agent
+         model_name: Optional model name (e.g., "anthropic/claude-sonnet-4")
+         eval_timeout: Timeout for entire evaluation (seconds)
+
+     Returns:
+         dict with keys "answer" (parsed JSON or None) and "metadata"
+     """
+     agent_log_file = work_dir / "agent_output.log"
+     if agent_log_file.exists():
+         agent_log_file.unlink()
+
+     eval_config = {
+         "id": work_dir.name,
+         "task": task_prompt,
+         "data_node": None,
+         "grader": None,
+     }
+
+     eval_file = work_dir / "eval_config.json"
+     eval_file.write_text(json.dumps(eval_config, indent=2))
+
+     output_file = work_dir / "eval_output.json"
+
+     faas_python = Path(os.environ.get("PLOTS_FAAS_PYTHON", "/root/plots-faas-venv/bin/python"))
+
+     cmd = [
+         str(faas_python),
+         "-m", "latch_eval_tools.eval_server",
+         "--headless",
+         "--eval", str(eval_file),
+         "-o", str(output_file),
+     ]
+
+     env = {
+         **os.environ,
+         "LATCH_PLOTS_FAAS_PATH": os.environ.get("LATCH_PLOTS_FAAS_PATH", "/root/latch-plots-faas"),
+     }
+
+     start_time = time.time()
+     timed_out = False
+
+     try:
+         with open(agent_log_file, "w") as log_f:
+             subprocess.run(
+                 cmd,
+                 env=env,
+                 stdout=log_f,
+                 stderr=subprocess.STDOUT,
+                 timeout=eval_timeout,
+             )
+     except subprocess.TimeoutExpired:
+         timed_out = True
+         with open(agent_log_file, "a") as log_f:
+             log_f.write(f"\n\nAgent timed out after {eval_timeout} seconds")
+
+     duration = time.time() - start_time
+     print(f"Agent output saved to: {agent_log_file}")
+
+     eval_id = work_dir.name
+     workspace_dir = output_file.parent / "workspaces" / eval_id
+
+     trajectory = []
+     if workspace_dir.exists():
+         trajectory_src = workspace_dir / "trajectory.json"
+         if trajectory_src.exists():
+             try:
+                 trajectory = json.loads(trajectory_src.read_text())
+                 trajectory_dst = work_dir / "trajectory.json"
+                 trajectory_dst.write_text(json.dumps(trajectory, indent=2))
+                 print(f"Trajectory saved to: {trajectory_dst}")
+             except json.JSONDecodeError:
+                 pass
+
+     agent_answer = None
+     error_details = None
+
+     if output_file.exists():
+         try:
+             results = json.loads(output_file.read_text())
+             evals = results.get("evals", [])
+             if evals:
+                 eval_entry = evals[0]
+                 agent_answer = eval_entry.get("agent_answer")
+                 if agent_answer is not None:
+                     eval_answer_file = work_dir / "eval_answer.json"
+                     eval_answer_file.write_text(json.dumps(agent_answer, indent=2))
+         except json.JSONDecodeError as e:
+             error_details = {"error": f"Failed to parse output: {e}"}
+
+     if agent_answer is None:
+         eval_answer_file = work_dir / "eval_answer.json"
+         if eval_answer_file.exists():
+             try:
+                 agent_answer = json.loads(eval_answer_file.read_text())
+             except json.JSONDecodeError:
+                 pass
+
+     if agent_answer is None:
+         if workspace_dir.exists():
+             ws_eval_answer = workspace_dir / "eval_answer.json"
+             if ws_eval_answer.exists():
+                 try:
+                     agent_answer = json.loads(ws_eval_answer.read_text())
+                     eval_answer_file = work_dir / "eval_answer.json"
+                     eval_answer_file.write_text(json.dumps(agent_answer, indent=2))
+                 except json.JSONDecodeError:
+                     pass
+
+     if agent_answer is None and not error_details:
+         log_tail = ""
+         if agent_log_file.exists():
+             log_content = agent_log_file.read_text()
+             log_tail = log_content[-1000:]
+
+         error_msg = "Agent timed out" if timed_out else "Agent did not produce an answer"
+         error_details = {
+             "error": error_msg,
+             "timed_out": timed_out,
+             "log_tail": log_tail,
+         }
+         print(f"\nWarning: {error_msg}")
+
+     n_steps = len([t for t in trajectory if t.get("type") == "assistant"])
+
+     metadata = {
+         "duration_s": round(duration, 2),
+         "model": model_name or "anthropic/claude-sonnet-4",
+         "n_steps": n_steps,
+         "n_messages": len(trajectory),
+     }
+     if timed_out:
+         metadata["timed_out"] = True
+         metadata["eval_timeout_seconds"] = eval_timeout
+     if error_details:
+         metadata["error_details"] = error_details
+
+     return {"answer": agent_answer, "metadata": metadata}
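Usage note (not part of the package): a minimal calling sketch for run_plotsagent_task. It assumes this hunk is latch_eval_tools/harness/plotsagent.py as the file list suggests, that the interpreter referenced by PLOTS_FAAS_PYTHON and the latch_eval_tools.eval_server module exist on the host, and that the paths and task text are placeholders.

    from pathlib import Path

    from latch_eval_tools.harness.plotsagent import run_plotsagent_task  # assumed module path

    work_dir = Path("/tmp/plotsagent-demo")   # illustrative working directory
    work_dir.mkdir(parents=True, exist_ok=True)

    result = run_plotsagent_task(
        task_prompt="Cluster the cells in data.h5ad and write eval_answer.json",
        work_dir=work_dir,
        eval_timeout=300,                     # tighter than the 600 s default
    )
    print(result["metadata"])                 # duration_s, model, n_steps, n_messages
    print(result["answer"])                   # parsed eval_answer.json, or None on failure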
latch_eval_tools/harness/runner.py
@@ -0,0 +1,191 @@
+ import json
+ from pathlib import Path
+
+ from latch_eval_tools.types import TestCase
+ from latch_eval_tools.graders import GRADER_REGISTRY, GraderResult
+ from latch_eval_tools.harness.utils import download_data, setup_workspace, cleanup_workspace
+
+
+ class EvalRunner:
+     """Main evaluation runner for executing benchmarks with various agents."""
+
+     def __init__(
+         self,
+         eval_path: str | Path,
+         keep_workspace: bool = False,
+         run_id: str | None = None,
+         cache_name: str = ".eval_cache",
+         workspace_name: str = ".eval_workspace",
+         benchmark_name: str = "Eval"
+     ):
+         """Initialize evaluation runner.
+
+         Args:
+             eval_path: Path to eval JSON file
+             keep_workspace: Whether to preserve workspace after completion
+             run_id: Optional run ID for organizing multiple runs
+             cache_name: Name of cache directory (e.g., .scbench, .spatialbench)
+             workspace_name: Name of workspace directory
+             benchmark_name: Display name for benchmark (e.g., "SCBench", "SpatialBench")
+         """
+         self.eval_path = Path(eval_path)
+         self.keep_workspace = keep_workspace
+         self.run_id = run_id
+         self.cache_name = cache_name
+         self.workspace_name = workspace_name
+         self.benchmark_name = benchmark_name
+
+         if not self.eval_path.exists():
+             raise FileNotFoundError(f"Eval file not found: {self.eval_path}")
+
+         eval_data = json.loads(self.eval_path.read_text())
+         self.test_case = TestCase(**eval_data)
+
+     def run(self, agent_function=None):
+         """Run evaluation with specified agent function.
+
+         Args:
+             agent_function: Callable that takes (task_prompt: str, work_dir: Path)
+                 and returns dict with keys "answer" and optionally "metadata"
+
+         Returns:
+             dict with test results including test_id, agent_answer, grader_result, passed
+         """
+         print("=" * 80)
+         print(f"Running {self.benchmark_name} evaluation: {self.test_case.id}")
+         print("=" * 80)
+
+         print("\nTask:")
+         print("-" * 80)
+         print(self.test_case.task)
+         print("-" * 80)
+
+         work_dir = setup_workspace(self.test_case.id, self.run_id, self.workspace_name)
+         print(f"\nWorking directory: {work_dir}")
+
+         print("\n" + "=" * 80)
+         print("Staging data files...")
+         print("=" * 80)
+
+         contextual_data = download_data(self.test_case.data_node, work_dir, self.cache_name)
+
+         data_context = ""
+         if contextual_data:
+             data_context = f"\n\nHere is the context of the selected nodes the user would like to use: <ContextualNodeData>{json.dumps(contextual_data)}</ContextualNodeData>"
+
+         task_prompt = f"""{self.test_case.task}
+
+ IMPORTANT: When you have completed this task:
+ 1. Write your final answer as a JSON object to a file named `eval_answer.json`
+ 2. The file should contain ONLY the JSON object with the required fields
+ 3. After writing the file, you have completed the task
+
+ Example eval_answer.json:
+ {{
+ "field1": value1,
+ "field2": value2
+ }}
+ {data_context}"""
+
+         print("\n" + "=" * 80)
+         print("Running agent on task...")
+         print("=" * 80)
+
+         agent_answer = None
+         agent_metadata = {}
+
+         if agent_function is None:
+             print("\nNo agent function provided. To run this eval, pass an agent_function that:")
+             print(" 1. Takes (task_prompt: str, work_dir: Path) as arguments")
+             print(" 2. Returns the parsed agent answer dict")
+             print(f"\nExample:")
+             print(f" def my_agent(task, work_dir):")
+             print(f" # Run your agent")
+             print(f" # Agent should write eval_answer.json to work_dir")
+             print(f" answer_file = work_dir / 'eval_answer.json'")
+             print(f" return json.loads(answer_file.read_text())")
+             print(f"\n runner = EvalRunner(eval_path)")
+             print(f" runner.run(agent_function=my_agent)")
+         else:
+             try:
+                 result = agent_function(task_prompt, work_dir)
+
+                 if isinstance(result, dict) and "answer" in result:
+                     agent_answer = result["answer"]
+                     agent_metadata = result.get("metadata", {})
+                 else:
+                     agent_answer = result
+
+                 print("\nAgent completed successfully")
+             except Exception as e:
+                 print(f"\nAgent error: {e}")
+                 import traceback
+                 traceback.print_exc()
+
+         eval_answer_path = work_dir / "eval_answer.json"
+         if agent_answer is None and eval_answer_path.exists():
+             try:
+                 agent_answer = json.loads(eval_answer_path.read_text())
+                 print(f"Loaded agent answer from eval_answer.json")
+             except json.JSONDecodeError as e:
+                 print(f"Warning: Failed to parse eval_answer.json: {e}")
+
+         grader_result = None
+         if self.test_case.grader and agent_answer is not None:
+             print("\n" + "=" * 80)
+             print("Running grader...")
+             print("=" * 80)
+
+             grader_type = self.test_case.grader.get("type")
+             grader_config = self.test_case.grader.get("config", {})
+
+             if grader_type in GRADER_REGISTRY:
+                 grader_cls = GRADER_REGISTRY[grader_type]
+                 grader = grader_cls()
+                 try:
+                     grader_result = grader.evaluate_answer(agent_answer, grader_config)
+                 except Exception as e:
+                     import traceback
+                     grader_result = GraderResult(
+                         passed=False,
+                         metrics={"grader_error": str(e)},
+                         reasoning=f"Grader failed due to malformed agent output: {e}\n\n{traceback.format_exc()}",
+                         agent_answer=agent_answer
+                     )
+
+                 print(f"\n{'✓ EVAL PASSED' if grader_result.passed else '✗ EVAL FAILED'}")
+                 print("\nGrader reasoning:")
+                 print("-" * 80)
+                 print(grader_result.reasoning)
+                 print("-" * 80)
+
+                 if grader_result.metrics:
+                     print("\nMetrics:")
+                     for key, value in grader_result.metrics.items():
+                         if isinstance(value, (list, dict)):
+                             continue
+                         print(f" {key}: {value}")
+             else:
+                 print(f"\nWarning: Unknown grader type '{grader_type}'")
+
+         print("\n" + "=" * 80)
+         print("Cleanup...")
+         print("=" * 80)
+
+         cleanup_workspace(work_dir, keep=self.keep_workspace)
+
+         if self.keep_workspace:
+             print(f"\nTo inspect results:")
+             print(f" cd {work_dir}")
+
+         result_dict = {
+             "test_id": self.test_case.id,
+             "agent_answer": agent_answer,
+             "grader_result": grader_result,
+             "passed": grader_result.passed if grader_result else None,
+         }
+
+         if agent_metadata:
+             result_dict["metadata"] = agent_metadata
+
+         return result_dict
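Usage note (not part of the package): a minimal sketch of plugging a custom agent function into EvalRunner, mirroring the contract the runner prints when no agent is supplied. The module path, eval JSON path, and the agent body are assumptions; a real agent would act on the task prompt rather than write canned fields.

    import json
    from pathlib import Path

    from latch_eval_tools.harness.runner import EvalRunner  # assumed module path

    def my_agent(task_prompt: str, work_dir: Path) -> dict:
        # A real agent would solve task_prompt and write eval_answer.json into work_dir.
        answer_file = work_dir / "eval_answer.json"
        answer_file.write_text(json.dumps({"field1": 1, "field2": 2}))
        return {"answer": json.loads(answer_file.read_text()), "metadata": {"n_steps": 1}}

    runner = EvalRunner("evals/example_eval.json", keep_workspace=True)  # placeholder eval file
    result = runner.run(agent_function=my_agent)
    print(result["passed"], result["test_id"])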
latch_eval_tools/harness/utils.py
@@ -0,0 +1,191 @@
+ import hashlib
+ import json
+ import os
+ import subprocess
+ from pathlib import Path
+
+
+ def get_project_root():
+     """Find project root by looking for pyproject.toml."""
+     current = Path(__file__).resolve()
+     while current != current.parent:
+         if (current / "pyproject.toml").exists():
+             return current
+         current = current.parent
+     return Path.cwd()
+
+
+ def get_cache_dir(cache_name: str = ".eval_cache"):
+     """Get cache directory for datasets.
+
+     Args:
+         cache_name: Name of cache directory (default: .eval_cache)
+             Can be customized per benchmark (e.g., .scbench, .spatialbench)
+     """
+     project_root = get_project_root()
+     cache_dir = project_root / cache_name / "cache"
+     cache_dir.mkdir(parents=True, exist_ok=True)
+     return cache_dir
+
+
+ def get_cache_manifest(cache_name: str = ".eval_cache"):
+     """Load cache manifest tracking downloaded datasets."""
+     cache_dir = get_cache_dir(cache_name)
+     manifest_file = cache_dir / "manifest.json"
+
+     if manifest_file.exists():
+         return json.loads(manifest_file.read_text())
+     return {}
+
+
+ def save_cache_manifest(manifest: dict, cache_name: str = ".eval_cache"):
+     """Save cache manifest."""
+     cache_dir = get_cache_dir(cache_name)
+     manifest_file = cache_dir / "manifest.json"
+     manifest_file.write_text(json.dumps(manifest, indent=2))
+
+
+ def get_cache_key(uri: str) -> str:
+     """Generate cache key from URI."""
+     uri_hash = hashlib.sha256(uri.encode()).hexdigest()[:16]
+     filename = Path(uri).name
+     return f"{uri_hash}__{filename}"
+
+
+ def download_single_dataset(uri: str, show_progress: bool = True, cache_name: str = ".eval_cache") -> Path:
+     """Download a single dataset with caching.
+
+     Args:
+         uri: URI of dataset to download (e.g., latch://...)
+         show_progress: Whether to print progress messages
+         cache_name: Name of cache directory
+
+     Returns:
+         Path to cached file
+     """
+     cache_dir = get_cache_dir(cache_name)
+     manifest = get_cache_manifest(cache_name)
+
+     if uri in manifest:
+         cached_file = cache_dir / manifest[uri]
+         if cached_file.exists():
+             if show_progress:
+                 print(f"Using cached: {Path(uri).name}")
+             return cached_file
+
+     cache_key = get_cache_key(uri)
+     cached_file = cache_dir / cache_key
+
+     if show_progress:
+         print(f"Downloading: {uri}")
+     subprocess.run(
+         ["latch", "cp", uri, str(cached_file)],
+         check=True,
+         capture_output=True
+     )
+     if show_progress:
+         print(f"Cached as: {cache_key}")
+
+     manifest[uri] = cache_key
+     save_cache_manifest(manifest, cache_name)
+
+     return cached_file
+
+
+ def download_data(data_node: str | list[str], work_dir: Path, cache_name: str = ".eval_cache") -> list[dict]:
+     """Download and symlink data files to workspace.
+
+     Args:
+         data_node: Single URI or list of URIs to download
+         work_dir: Working directory to create symlinks in
+         cache_name: Name of cache directory
+
+     Returns:
+         List of contextual data dicts with file info
+     """
+     data_nodes = data_node if isinstance(data_node, list) else ([data_node] if data_node else [])
+
+     contextual_data = []
+
+     for node in data_nodes:
+         cached_file = download_single_dataset(node, cache_name=cache_name)
+         data_filename = Path(node).name
+
+         target_file = work_dir / data_filename
+         if target_file.exists():
+             target_file.unlink()
+         os.symlink(cached_file, target_file)
+         print(f"Linked: {data_filename} -> workspace")
+
+         contextual_data.append({
+             "type": "File",
+             "path": node,
+             "local_path": data_filename,
+             "id": node.replace("latch:///", "").replace(".csv", "").replace(".h5ad", ""),
+         })
+
+     return contextual_data
+
+
+ def batch_download_datasets(uris: list[str], show_progress: bool = True, cache_name: str = ".eval_cache"):
+     """Download multiple datasets in batch.
+
+     Args:
+         uris: List of URIs to download
+         show_progress: Whether to print progress messages
+         cache_name: Name of cache directory
+     """
+     if show_progress and uris:
+         print(f"Preparing to download {len(uris)} unique dataset(s)...")
+         print("=" * 80)
+
+     for i, uri in enumerate(uris, 1):
+         if show_progress:
+             print(f"[{i}/{len(uris)}] ", end="")
+         download_single_dataset(uri, show_progress=show_progress, cache_name=cache_name)
+
+     if show_progress and uris:
+         print("=" * 80)
+         print(f"Downloaded/verified {len(uris)} dataset(s)")
+         print()
+
+
+ def setup_workspace(eval_id: str, run_id: str | None = None, workspace_name: str = ".eval_workspace") -> Path:
+     """Setup workspace directory for evaluation.
+
+     Args:
+         eval_id: ID of evaluation
+         run_id: Optional run ID for organizing multiple runs
+         workspace_name: Name of workspace directory (default: .eval_workspace)
+             Can be customized per benchmark (e.g., .scbench, .spatialbench)
+
+     Returns:
+         Path to workspace directory
+     """
+     project_root = get_project_root()
+     if run_id:
+         work_dir = project_root / workspace_name / "workspace" / run_id / eval_id
+     else:
+         work_dir = project_root / workspace_name / "workspace" / eval_id
+
+     if work_dir.exists():
+         import shutil
+         shutil.rmtree(work_dir)
+     work_dir.mkdir(parents=True)
+
+     return work_dir
+
+
+ def cleanup_workspace(work_dir: Path, keep: bool = False):
+     """Clean up workspace directory.
+
+     Args:
+         work_dir: Workspace directory to clean up
+         keep: Whether to keep the workspace
+     """
+     if keep:
+         print(f"Workspace preserved at: {work_dir}")
+     else:
+         import shutil
+         shutil.rmtree(work_dir)
+         print(f"Workspace deleted: {work_dir}")
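Usage note (not part of the package): a rough end-to-end sketch of the caching and workspace helpers above. The URI, eval ID, run ID, and benchmark names are placeholders, and it assumes the `latch` CLI is installed and able to copy the given latch:// URI.

    from latch_eval_tools.harness.utils import (
        batch_download_datasets,
        cleanup_workspace,
        download_data,
        setup_workspace,
    )

    uris = ["latch:///demo/pbmc.h5ad"]                       # placeholder URI
    batch_download_datasets(uris, cache_name=".scbench")     # warm the shared cache once per run

    work_dir = setup_workspace("demo_eval", run_id="run_001", workspace_name=".scbench")
    context = download_data(uris, work_dir, cache_name=".scbench")  # symlinks cached files into work_dir
    print(context[0]["local_path"])                          # "pbmc.h5ad"

    cleanup_workspace(work_dir, keep=False)                  # delete the workspace when done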