latch-eval-tools 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- latch_eval_tools/__init__.py +64 -0
- latch_eval_tools/answer_extraction.py +35 -0
- latch_eval_tools/cli/__init__.py +0 -0
- latch_eval_tools/cli/eval_lint.py +185 -0
- latch_eval_tools/eval_server.py +570 -0
- latch_eval_tools/faas_utils.py +13 -0
- latch_eval_tools/graders/__init__.py +40 -0
- latch_eval_tools/graders/base.py +29 -0
- latch_eval_tools/graders/distribution.py +102 -0
- latch_eval_tools/graders/label_set.py +75 -0
- latch_eval_tools/graders/marker_gene.py +317 -0
- latch_eval_tools/graders/multiple_choice.py +38 -0
- latch_eval_tools/graders/numeric.py +137 -0
- latch_eval_tools/graders/spatial.py +93 -0
- latch_eval_tools/harness/__init__.py +27 -0
- latch_eval_tools/harness/claudecode.py +212 -0
- latch_eval_tools/harness/minisweagent.py +265 -0
- latch_eval_tools/harness/plotsagent.py +156 -0
- latch_eval_tools/harness/runner.py +191 -0
- latch_eval_tools/harness/utils.py +191 -0
- latch_eval_tools/headless_eval_server.py +727 -0
- latch_eval_tools/linter/__init__.py +25 -0
- latch_eval_tools/linter/explanations.py +331 -0
- latch_eval_tools/linter/runner.py +146 -0
- latch_eval_tools/linter/schema.py +126 -0
- latch_eval_tools/linter/validators.py +595 -0
- latch_eval_tools/types.py +30 -0
- latch_eval_tools/wrapper_entrypoint.py +316 -0
- latch_eval_tools-0.1.0.dist-info/METADATA +118 -0
- latch_eval_tools-0.1.0.dist-info/RECORD +33 -0
- latch_eval_tools-0.1.0.dist-info/WHEEL +4 -0
- latch_eval_tools-0.1.0.dist-info/entry_points.txt +2 -0
- latch_eval_tools-0.1.0.dist-info/licenses/LICENSE +1 -0
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import os
|
|
5
|
+
import signal
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def setup_environment(sandbox_dir: Path, notebook_id: str):
|
|
11
|
+
sandbox_dir.mkdir(parents=True, exist_ok=True)
|
|
12
|
+
|
|
13
|
+
root_dir = sandbox_dir / "root"
|
|
14
|
+
root_dir.mkdir(exist_ok=True)
|
|
15
|
+
|
|
16
|
+
latch_dir = root_dir / ".latch"
|
|
17
|
+
latch_dir.mkdir(exist_ok=True)
|
|
18
|
+
|
|
19
|
+
user_latch_token = Path.home() / ".latch" / "token"
|
|
20
|
+
if not user_latch_token.exists():
|
|
21
|
+
raise RuntimeError("Latch token required at ~/.latch/token")
|
|
22
|
+
token = user_latch_token.read_text().strip()
|
|
23
|
+
|
|
24
|
+
(latch_dir / "token").write_text(token)
|
|
25
|
+
(latch_dir / "id").write_text("99999")
|
|
26
|
+
(latch_dir / "notebook-id").write_text(notebook_id)
|
|
27
|
+
(latch_dir / "session-id").write_text("local-eval-session")
|
|
28
|
+
(latch_dir / "nucleus-url").write_text("https://nucleus.latch.bio")
|
|
29
|
+
|
|
30
|
+
os.environ.update({
|
|
31
|
+
"DD_VERSION": "local-dev",
|
|
32
|
+
"DD_SERVICE": "latch-plots-eval",
|
|
33
|
+
"DD_ENV": "local",
|
|
34
|
+
"DD_AGENT_HOST": "localhost",
|
|
35
|
+
"DD_TRACE_ENABLED": "false",
|
|
36
|
+
"DD_PROFILING_ENABLED": "false",
|
|
37
|
+
"DD_RUNTIME_METRICS_ENABLED": "false",
|
|
38
|
+
"OTEL_SDK_DISABLED": "true",
|
|
39
|
+
"auth_jwks_url": "https://example.com/jwks",
|
|
40
|
+
"auth_issuer": "local-dev",
|
|
41
|
+
"auth_audience": "local-dev",
|
|
42
|
+
"auth_self_signed_jwk": "{}",
|
|
43
|
+
"auto_reload": "false",
|
|
44
|
+
"logging_mode": "console",
|
|
45
|
+
"domain": "latch.bio",
|
|
46
|
+
"AGENT_DEBUG": "1",
|
|
47
|
+
"LATCH_SANDBOX_ROOT": str(latch_dir),
|
|
48
|
+
"LD_LIBRARY_PATH": f"/root/miniconda3/envs/plots-faas/lib:{os.environ.get('LD_LIBRARY_PATH', '')}",
|
|
49
|
+
})
|
|
50
|
+
|
|
51
|
+
import pathlib
|
|
52
|
+
original_path_new = pathlib.Path.__new__
|
|
53
|
+
|
|
54
|
+
def patched_path_new(cls, *args, **kwargs):
|
|
55
|
+
if args and str(args[0]) == "/root/.latch":
|
|
56
|
+
return original_path_new(cls, str(latch_dir), *args[1:], **kwargs)
|
|
57
|
+
return original_path_new(cls, *args, **kwargs)
|
|
58
|
+
|
|
59
|
+
pathlib.Path.__new__ = patched_path_new
|
|
60
|
+
|
|
61
|
+
venv_bin = str(Path(sys.executable).parent)
|
|
62
|
+
os.environ["PATH"] = f"{venv_bin}:{os.environ.get('PATH', '')}"
|
|
63
|
+
|
|
64
|
+
print(f"[wrapper] Using sandbox: {sandbox_dir}", flush=True)
|
|
65
|
+
print(f"[wrapper] Latch dir: {latch_dir}", flush=True)
|
|
66
|
+
print(f"[wrapper] Added {venv_bin} to PATH", flush=True)
|
|
67
|
+
return latch_dir
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
async def mock_add_pod_event(*, auth, event_type):
    """Log a pod event instead of reporting it anywhere.

    Both arguments are keyword-only. ``auth`` is accepted and ignored; only
    ``event_type`` appears in the log line.
    """
    message = f"[wrapper] Pod event (mocked): {event_type}"
    print(message, flush=True)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
async def run_server(latch_dir: Path, port: int, notebook_id: str):
    """Serve the plots runtime locally with its pod-specific pieces patched out.

    Before serving, this function replaces several attributes of the imported
    runtime modules:

    * ``entrypoint.add_pod_event`` -> ``mock_add_pod_event``
    * ``entrypoint.plots_ctx_manager.session_owner`` -> ``"eval-harness"``
    * ``HeadlessBrowser.screenshot`` / ``HeadlessBrowser.start`` -> local
      variants that write screenshots under ``latch_dir`` and answer selected
      GraphQL requests with a fake running pod
    * ``entrypoint.start_agent_proc`` -> a variant that launches ``agent.py``
      with the current interpreter and echoes its output

    It then serves the ASGI app on ``127.0.0.1:<port>`` until SIGINT/SIGTERM.

    Args:
        latch_dir: Sandbox ``.latch`` directory; receives screenshots and is
            exported to the agent as ``LATCH_SANDBOX_ROOT``.
        port: TCP port to bind on 127.0.0.1.
        notebook_id: Notebook whose console URL the headless browser opens.
    """
    import re

    from hypercorn.asyncio import serve
    from hypercorn.config import Config as HypercornConfig
    from latch_asgi.server import LatchASGIServer

    from runtime.mount.endpoints import http_routes, websocket_routes
    from runtime.mount.entrypoint import shutdown
    from runtime.mount import entrypoint, headless_browser

    print(f"[wrapper] Using real notebook {notebook_id}", flush=True)

    entrypoint.add_pod_event = mock_add_pod_event

    entrypoint.plots_ctx_manager.session_owner = "eval-harness"
    original_screenshot = headless_browser.HeadlessBrowser.screenshot

    # Compiled once; extracts the operation name from a GraphQL request body.
    gql_operation_re = re.compile(r'(query|mutation)\s+(\w+)')

    # Static payload returned for every intercepted pod query.
    fake_pod = {
        "__typename": "PodInfo",
        "id": "99999",
        "status": "RUNNING",
        "jupyterToken": "eval-token",
        "cpuMillicores": "4000",
        "memoryBytes": "8589934592",
        "gpus": "0",
        "gpuType": None,
        "storageGigs": "50",
        "usedStorageGigs": "0",
        "internalIpAddress": "127.0.0.1",
        "autoShutoffDelay": {
            "__typename": "Interval",
            "days": 0, "months": 0, "hours": 1,
            "minutes": 0, "seconds": 0, "years": 0
        },
        "deployment": {
            "__typename": "PodDeployment",
            "id": "99999",
            "targetRegion": "us-west-2",
            "targetDomain": "us-west-2"
        }
    }

    async def patched_screenshot(self, path: str):
        # Screenshots aimed at /var/log/ are redirected into the sandbox,
        # keeping only the file name.
        if path.startswith("/var/log/"):
            path = str(latch_dir / path.split("/")[-1])
        return await original_screenshot(self, path)

    async def patched_start(self, notebook_url, local_storage, *, timeout_ms=30000):
        from playwright.async_api import async_playwright
        import json as json_mod
        from collections.abc import Mapping

        # The caller-supplied URL is deliberately ignored in favour of the
        # console URL of the notebook under evaluation.
        notebook_url = f"https://console.latch.bio/plots/{notebook_id}"
        print(f"[wrapper] Patched notebook URL to: {notebook_url}", flush=True)

        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch(headless=True)
        self.page = await self.browser.new_page(viewport={"width": 1280, "height": 800})

        storage = dict(local_storage) if isinstance(local_storage, Mapping) else local_storage
        serialized = json_mod.dumps(storage)
        # Double-encode: the inner dumps yields the JSON document, the outer
        # one turns it into a JavaScript string literal for JSON.parse.
        await self.page.add_init_script(
            f"""
            const entries = JSON.parse({json_mod.dumps(serialized)});
            for (const [k, v] of Object.entries(entries)) {{
                localStorage.setItem(k, v);
            }}
            """
        )

        def should_log_browser_msg(msg):
            # Drop network-failure noise from the browser console.
            text = msg.text
            if "ERR_FAILED" in text or "[Network error]" in text:
                return False
            if "Failed to load resource" in text:
                return False
            return True

        self.page.on("console", lambda msg: print(f"[browser] {msg.type}: {msg.text}", flush=True) if should_log_browser_msg(msg) else None)
        self.page.on("pageerror", lambda err: print(f"[browser-error] Page error: {err}", flush=True))

        async def handle_api_calls(route):
            url = route.request.url

            if "graphql" in url or "vacuole" in url:
                try:
                    post_data = route.request.post_data
                    if post_data:
                        query_match = gql_operation_re.search(post_data)
                        query_name = query_match.group(2) if query_match else "unknown"
                        print(f"[browser-gql] GraphQL request: {query_name}", flush=True)

                        pod_list_queries = ["GetPodStatus", "GetPodInfoFromNotebookId"]
                        if query_name in pod_list_queries:
                            print(f"[browser-gql] Intercepting {query_name} (podInfos), returning fake RUNNING pod", flush=True)
                            fake_pod_response = json_mod.dumps({
                                "data": {
                                    "podInfos": {
                                        "__typename": "PodInfosConnection",
                                        "nodes": [fake_pod]
                                    }
                                }
                            })
                            await route.fulfill(status=200, content_type="application/json", body=fake_pod_response)
                            return

                        if query_name == "PodInfoByPodId":
                            print(f"[browser-gql] Intercepting {query_name} (podInfo), returning fake RUNNING pod", flush=True)
                            fake_pod_response = json_mod.dumps({
                                "data": {
                                    "podInfo": fake_pod
                                }
                            })
                            await route.fulfill(status=200, content_type="application/json", body=fake_pod_response)
                            return
                except Exception as e:
                    # Best effort: on any inspection failure fall through and
                    # let the request reach the network unchanged.
                    print(f"[browser-gql] Error checking GQL query: {e}", flush=True)
                await route.continue_()
            elif "nucleus" in url:
                print(f"[browser-api] Blocking nucleus API call: {url}", flush=True)
                await route.fulfill(status=200, content_type="application/json", body='{"data":{}}')
            else:
                await route.continue_()

        await self.page.route("**/*", handle_api_calls)

        print(f"[wrapper] Headless browser navigating to: {notebook_url}", flush=True)
        await self.page.goto(notebook_url, wait_until="load")

        try:
            await self.page.wait_for_selector("[data-plot-ready='true']", timeout=timeout_ms)
        except Exception:
            # Capture what the page looked like before propagating the failure.
            await self.screenshot(str(latch_dir / "headless_browser_no_selector.png"))
            raise

        await self.screenshot(str(latch_dir / "headless_browser_ready.png"))
        print("[wrapper] Headless browser page ready", flush=True)

    headless_browser.HeadlessBrowser.screenshot = patched_screenshot
    headless_browser.HeadlessBrowser.start = patched_start

    # Strong references to the output pumps: the event loop keeps only weak
    # references to tasks, so an unreferenced task may be collected mid-run.
    output_tasks = set()

    async def patched_start_agent_proc():
        conn_a = entrypoint.a_proc.conn_a = await entrypoint.SocketIo.from_socket(entrypoint.sock_a)

        entrypoint.async_tasks.append(
            asyncio.create_task(entrypoint.handle_agent_messages(conn_a))
        )

        print("[wrapper] Starting agent subprocess", flush=True)
        entrypoint.a_proc.proc = await asyncio.create_subprocess_exec(
            sys.executable,
            "-u",
            (entrypoint.dir_p / "agent.py"),
            str(entrypoint.sock_agent_fd),
            pass_fds=[entrypoint.sock_agent_fd],
            stdin=asyncio.subprocess.DEVNULL,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env={
                **os.environ,
                "LATCH_SANDBOX_ROOT": str(latch_dir),
                "PYTHONUNBUFFERED": "1",
            },
            preexec_fn=lambda: os.nice(5),
        )

        async def stream_output(stream, prefix=""):
            while True:
                line = await stream.readline()
                if not line:
                    break
                # errors="replace": undecodable bytes must not kill the pump,
                # otherwise the pipe fills up and the agent blocks on write.
                print(f"[agent] {prefix}{line.decode(errors='replace').rstrip()}", flush=True)

        for stream, prefix in (
            (entrypoint.a_proc.proc.stdout, ""),
            (entrypoint.a_proc.proc.stderr, "[stderr] "),
        ):
            pump = asyncio.create_task(stream_output(stream, prefix))
            output_tasks.add(pump)
            pump.add_done_callback(output_tasks.discard)
        print(f"[wrapper] Agent subprocess started, PID: {entrypoint.a_proc.proc.pid}", flush=True)

    entrypoint.start_agent_proc = patched_start_agent_proc

    latch_server = LatchASGIServer(
        http_routes=http_routes,
        websocket_routes=websocket_routes,
        startup_tasks=[entrypoint.start_kernel_proc(), patched_start_agent_proc()],
        shutdown_tasks=[shutdown()],
    )

    cfg = HypercornConfig()
    cfg.bind = [f"127.0.0.1:{port}"]
    cfg.graceful_timeout = 0.1

    print(f"\n[wrapper] Server starting on port {port}", flush=True)
    print(f"[wrapper] WebSocket: ws://127.0.0.1:{port}/agent", flush=True)
    print(f"[wrapper] HTTP: http://127.0.0.1:{port}/readyz", flush=True)

    shutdown_event = asyncio.Event()

    async def await_shutdown():
        await shutdown_event.wait()

    def shutdown_signal(*args):
        print("\n[wrapper] Shutting down...", flush=True)
        shutdown_event.set()

    # Handlers that touch the event loop should be registered through the loop
    # itself so the loop is woken up when the signal arrives.
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGINT, signal.SIGTERM):
        try:
            loop.add_signal_handler(sig, shutdown_signal)
        except NotImplementedError:
            # Event loops without signal support: fall back to the plain handler.
            signal.signal(sig, shutdown_signal)

    try:
        await serve(latch_server.raw_app, cfg, shutdown_trigger=await_shutdown)
    finally:
        # NOTE(review): shutdown() is also registered in shutdown_tasks above;
        # presumably it tolerates being run twice -- confirm.
        await shutdown()
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def main():
    """Command-line entry point: parse options, prepare the sandbox, run the server.

    Options: ``--sandbox-dir`` (required), ``--port`` (int, default 5000) and
    ``--notebook-id`` (required). The faas checkout is taken from the
    ``LATCH_PLOTS_FAAS_PATH`` environment variable, defaulting to
    ``/root/latch-plots-faas``.
    """
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument("--sandbox-dir", required=True, help="Sandbox directory path")
    cli.add_argument("--port", type=int, default=5000, help="Server port")
    cli.add_argument("--notebook-id", required=True, help="Notebook ID to use")
    opts = cli.parse_args()

    # Get the faas directory from environment or default location
    faas_root = Path(os.environ.get("LATCH_PLOTS_FAAS_PATH", "/root/latch-plots-faas"))
    mount_root = faas_root / "runtime" / "mount"

    # Front of sys.path, highest priority first: python_lib, mount dir, faas root.
    sys.path[:0] = [
        str(mount_root / "python_lib"),
        str(mount_root),
        str(faas_root),
    ]

    latch_dir = setup_environment(Path(opts.sandbox_dir), opts.notebook_id)

    # Imported only after setup_environment() has populated os.environ.
    from latch_o11y.o11y import setup as setup_o11y
    setup_o11y()

    try:
        asyncio.run(run_server(latch_dir, opts.port, opts.notebook_id))
    except KeyboardInterrupt:
        print("\n[wrapper] Stopped", flush=True)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: latch-eval-tools
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Shared eval tools for single-cell bench, spatial bench, and future biology benchmarks.
|
|
5
|
+
License-File: LICENSE
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Requires-Dist: aiohttp>=3.0.0
|
|
8
|
+
Requires-Dist: anthropic>=0.72.0
|
|
9
|
+
Requires-Dist: latch-config>=0.1.0
|
|
10
|
+
Requires-Dist: latch>=2.0.0
|
|
11
|
+
Requires-Dist: matplotlib>=3.0.0
|
|
12
|
+
Requires-Dist: mini-swe-agent
|
|
13
|
+
Requires-Dist: numpy>=1.24.0
|
|
14
|
+
Requires-Dist: openai>=1.0.0
|
|
15
|
+
Requires-Dist: orjson>=3.0.0
|
|
16
|
+
Requires-Dist: pydantic>=2.0.0
|
|
17
|
+
Requires-Dist: scikit-learn>=1.3.0
|
|
18
|
+
Requires-Dist: scipy>=1.10.0
|
|
19
|
+
Requires-Dist: statsmodels>=0.14.0
|
|
20
|
+
Requires-Dist: websockets>=12.0
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# latch-eval-tools
|
|
24
|
+
|
|
25
|
+
Shared eval tools for single-cell bench, spatial bench, and future biology benchmarks.
|
|
26
|
+
|
|
27
|
+
## Installation
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install latch-eval-tools
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Components
|
|
34
|
+
|
|
35
|
+
### Types
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from latch_eval_tools import Eval, EvalResult
|
|
39
|
+
|
|
40
|
+
eval_case = Eval(
|
|
41
|
+
id="test_001",
|
|
42
|
+
task="Count cells in the dataset",
|
|
43
|
+
data_node="latch:///data/sample.h5ad",
|
|
44
|
+
grader={"type": "numeric_tolerance", "config": {...}}
|
|
45
|
+
)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Graders
|
|
49
|
+
|
|
50
|
+
Available graders: `numeric_tolerance`, `label_set_jaccard`, `distribution_comparison`, `marker_gene_precision_recall`, `marker_gene_separation`, `spatial_adjacency`, `multiple_choice`
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
from latch_eval_tools.graders import get_grader, NumericToleranceGrader
|
|
54
|
+
|
|
55
|
+
grader = get_grader("numeric_tolerance")
|
|
56
|
+
result = grader.evaluate(
|
|
57
|
+
agent_answer={"n_cells": 1523},
|
|
58
|
+
config={
|
|
59
|
+
"ground_truth": {"n_cells": 1500},
|
|
60
|
+
"tolerances": {"n_cells": {"type": "relative", "value": 0.05}}
|
|
61
|
+
}
|
|
62
|
+
)
|
|
63
|
+
print(result.passed)
|
|
64
|
+
print(result.reasoning)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Harness
|
|
68
|
+
|
|
69
|
+
Run evaluations with different agents:
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from latch_eval_tools.harness import EvalRunner, run_minisweagent_task
|
|
73
|
+
|
|
74
|
+
runner = EvalRunner("evals/count_cells.json", cache_name=".scbench")
|
|
75
|
+
result = runner.run(agent_function=lambda task, work_dir:
|
|
76
|
+
run_minisweagent_task(task, work_dir, model_name="anthropic/claude-sonnet-4")
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
def my_agent(task_prompt: str, work_dir: Path) -> dict:
|
|
80
|
+
return {"answer": json.loads((work_dir / "eval_answer.json").read_text())}
|
|
81
|
+
|
|
82
|
+
runner.run(agent_function=my_agent)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Built-in agents: `run_minisweagent_task`, `run_claudecode_task`, `run_plotsagent_task`
|
|
86
|
+
|
|
87
|
+
### Linter
|
|
88
|
+
|
|
89
|
+
Validate eval JSON files:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
eval-lint evals/my_dataset/
|
|
93
|
+
eval-lint evals/ --format json
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
from latch_eval_tools.linter import lint_eval, lint_directory
|
|
98
|
+
|
|
99
|
+
result = lint_eval("evals/test.json")
|
|
100
|
+
print(result.passed, result.issues)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
## Eval JSON Schema
|
|
104
|
+
|
|
105
|
+
```json
|
|
106
|
+
{
|
|
107
|
+
"id": "unique_test_id",
|
|
108
|
+
"task": "Task description for the agent",
|
|
109
|
+
"data_node": "latch:///path/to/data.h5ad",
|
|
110
|
+
"grader": {
|
|
111
|
+
"type": "numeric_tolerance",
|
|
112
|
+
"config": {
|
|
113
|
+
"ground_truth": {"field": 42},
|
|
114
|
+
"tolerances": {"field": {"type": "absolute", "value": 1}}
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
```
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
latch_eval_tools/__init__.py,sha256=BykCd2JcFvB5Qb0voGuxQAkzE7xE_PxPnBdhHxgwetw,1590
|
|
2
|
+
latch_eval_tools/answer_extraction.py,sha256=uApcsxAEPZX5o8DgQohM5AK7P0s6vnfgLKndVHDmiG0,1360
|
|
3
|
+
latch_eval_tools/eval_server.py,sha256=iD0gRw0LeB-V3c04EMITti1sVoEwAv9xKcnCdy7WP7Y,21836
|
|
4
|
+
latch_eval_tools/faas_utils.py,sha256=aJoyNuP5GKMioBmP0sreNgnQvXVZma2Y1c0COaCwlBw,561
|
|
5
|
+
latch_eval_tools/headless_eval_server.py,sha256=UjkL7_ZzJzFprlbzi73uBvkKadXq7YkNYCq41xHeods,27461
|
|
6
|
+
latch_eval_tools/types.py,sha256=IKYA6aHnAAKXrPEFVpWY4Q0vo02tn60MJSNXxmU3V38,789
|
|
7
|
+
latch_eval_tools/wrapper_entrypoint.py,sha256=qWIjSFOP5m8FnAnlZhwSPZu7p0NLo3j-RJosX3JohdQ,12712
|
|
8
|
+
latch_eval_tools/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
9
|
+
latch_eval_tools/cli/eval_lint.py,sha256=G7R85-Gw_so2stk02zb4GpBMML3G4j_w2tLGTx_Nc74,5813
|
|
10
|
+
latch_eval_tools/graders/__init__.py,sha256=lA9NbxB4Dv990zqtSLs8j27q544HKha9OWiceWFowvc,1427
|
|
11
|
+
latch_eval_tools/graders/base.py,sha256=wRN47ilf0iXSE6cLF5gpQ91EaDHO6wepkCsgJ6vBkpc,770
|
|
12
|
+
latch_eval_tools/graders/distribution.py,sha256=7u9pwPC45ClGTmEvnYPGe3pTNrgkCFUeP412v3-icps,4266
|
|
13
|
+
latch_eval_tools/graders/label_set.py,sha256=CXhLZ4qdOvRMYgVxh3x0m45r7IyXo8nJfrbiNuEfm0Y,2767
|
|
14
|
+
latch_eval_tools/graders/marker_gene.py,sha256=ysFPXx_xZ9rhusvPgcVIpcA7-kU_wSTFR9r9dSZjHi0,13486
|
|
15
|
+
latch_eval_tools/graders/multiple_choice.py,sha256=HTyYWQGFv7ATmTNJMhUnisiuNb1F7VVx4nU-pJ0KV0o,1412
|
|
16
|
+
latch_eval_tools/graders/numeric.py,sha256=Q19GtyYtatKRdPX9DlWJ8CKSQHUhJ6P6k7hnhZnKE04,6826
|
|
17
|
+
latch_eval_tools/graders/spatial.py,sha256=8dNRsCVazCI98mHwhLHz0T-kyvLL4dTrsayu2DqjMdU,4258
|
|
18
|
+
latch_eval_tools/harness/__init__.py,sha256=FyDDHRdEOUo845NIGqzF1CbOzaZfilsId8LxhtgEWHQ,764
|
|
19
|
+
latch_eval_tools/harness/claudecode.py,sha256=gbbPtie4TALwkQ7zas5rPNtkL-SjA4yDVeDfoX1hWuY,7023
|
|
20
|
+
latch_eval_tools/harness/minisweagent.py,sha256=SZwbnigvGOWU4tIgijJSyOr4urTe8n1Dgj6ViuL2uM4,9197
|
|
21
|
+
latch_eval_tools/harness/plotsagent.py,sha256=Qw_q-6WzAsGo1RzKBzD0lzsXZeyGRGwfZ0AU5s_e00g,5017
|
|
22
|
+
latch_eval_tools/harness/runner.py,sha256=f042LLhEAN4rvltPIhjSn588Z2AkP5tTk0pKDH69i2M,7264
|
|
23
|
+
latch_eval_tools/harness/utils.py,sha256=__fy4_eJqEzU7cMqd8fcztwfxKH0jvWCYyUCyHvdhdQ,5880
|
|
24
|
+
latch_eval_tools/linter/__init__.py,sha256=fKRgnVvL_eJduYFagRJrzE4RI5kI9G6VsiyI0dRnc3k,541
|
|
25
|
+
latch_eval_tools/linter/explanations.py,sha256=Uzg-IH9YjZl__X7nF7lDc-4FSv619CE8LyXo8PkO0tk,14308
|
|
26
|
+
latch_eval_tools/linter/runner.py,sha256=zii4EbfsdWqdynzCxn1GLX0oC7rUgnrq0Vwsc-13YNk,4619
|
|
27
|
+
latch_eval_tools/linter/schema.py,sha256=Acqs4VSwhOP4G85nUCghPquGMGQ413In7kAWaQ1BtNE,3688
|
|
28
|
+
latch_eval_tools/linter/validators.py,sha256=iWcWBrcOmjv8abJEy0ch6Di2LbT5L8TVTe2cl88mQoU,21488
|
|
29
|
+
latch_eval_tools-0.1.0.dist-info/METADATA,sha256=qKASKwb8G3-KSMKJHemNt_VbhF6LsoPU4wOUXhZjCXY,2887
|
|
30
|
+
latch_eval_tools-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
31
|
+
latch_eval_tools-0.1.0.dist-info/entry_points.txt,sha256=pxTKTp2rBe7xq_j1KSVZE6ULxDZv_BsbFjAGHEw7Tlo,66
|
|
32
|
+
latch_eval_tools-0.1.0.dist-info/licenses/LICENSE,sha256=zepfPTPE3p7B8XbkQVZ-MeZwB9vmSoXQhMUvHNnlvQo,39
|
|
33
|
+
latch_eval_tools-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
© LatchBio LLC. All rights reserved.
|