evalpilot 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_agent/__init__.py +1 -0
- eval_agent/adapters/__init__.py +39 -0
- eval_agent/adapters/cli.py +79 -0
- eval_agent/adapters/function.py +61 -0
- eval_agent/adapters/http.py +78 -0
- eval_agent/adapters/webui.py +197 -0
- eval_agent/aggregate.py +57 -0
- eval_agent/cli.py +106 -0
- eval_agent/examples/__init__.py +0 -0
- eval_agent/examples/echo.py +11 -0
- eval_agent/loader.py +35 -0
- eval_agent/models.py +79 -0
- eval_agent/reporter.py +70 -0
- eval_agent/runner.py +37 -0
- eval_agent/sandbox.py +43 -0
- eval_agent/scorers/__init__.py +34 -0
- eval_agent/scorers/checkpoint.py +66 -0
- eval_agent/scorers/efficiency.py +19 -0
- eval_agent/scorers/llm_judge.py +57 -0
- eval_agent/scorers/trajectory.py +50 -0
- eval_agent/templates/index.html.j2 +73 -0
- eval_agent/templates/report.html.j2 +68 -0
- eval_agent/wizard.py +132 -0
- evalpilot-0.1.0.dist-info/METADATA +144 -0
- evalpilot-0.1.0.dist-info/RECORD +29 -0
- evalpilot-0.1.0.dist-info/WHEEL +5 -0
- evalpilot-0.1.0.dist-info/entry_points.txt +4 -0
- evalpilot-0.1.0.dist-info/licenses/LICENSE +21 -0
- evalpilot-0.1.0.dist-info/top_level.txt +1 -0
eval_agent/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Version string of the eval_agent package.
__version__ = "0.1.0"
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from eval_agent.models import Task, Trajectory
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class AgentAdapter(ABC):
    """Common contract for anything that can drive an agent under test.

    Callers interact with an adapter exclusively through :meth:`run`; the
    transport used to reach the agent (subprocess, HTTP, in-process function)
    is a private concern of each subclass.
    """

    # Label identifying the adapter; concrete adapters override it.
    name: str = "agent"

    @abstractmethod
    def run(self, task: Task, workspace: Path) -> Trajectory:
        """Run the agent on ``task`` using ``workspace`` and return its trajectory."""
        ...
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def build_adapter(config: dict) -> AgentAdapter:
    """Instantiate the adapter selected by ``config["type"]``.

    Supported types are "function", "cli", "http" and "webui". The module of
    the concrete adapter is imported only after its type has been selected,
    and the instance is created through that class's ``from_config``.

    Raises:
        KeyError: ``config`` has no "type" entry.
        ValueError: the type is not one of the supported values.
    """
    kind = config["type"]
    if kind == "function":
        from eval_agent.adapters.function import FunctionAgentAdapter as adapter_cls
    elif kind == "cli":
        from eval_agent.adapters.cli import CLIAgentAdapter as adapter_cls
    elif kind == "http":
        from eval_agent.adapters.http import HTTPAgentAdapter as adapter_cls
    elif kind == "webui":
        from eval_agent.adapters.webui import WebUIAgentAdapter as adapter_cls
    else:
        raise ValueError(f"unknown adapter type: {kind}")
    return adapter_cls.from_config(config)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import json
|
|
3
|
+
import shlex
|
|
4
|
+
import subprocess
|
|
5
|
+
import time
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from eval_agent.adapters import AgentAdapter
|
|
8
|
+
from eval_agent.models import Task, Trajectory, Step, Usage
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class CLIAgentAdapter(AgentAdapter):
    """Drives a command-line agent that emits a JSONL stream on stdout.

    command_template may contain {prompt}; it is substituted (shell-quoted)
    with the task prompt. The process runs with cwd=workspace.

    Each stdout line is parsed as one JSON object and dispatched on its
    "type" field:

    * "tool_use" -> a "tool_call" step (fields "name" and "input")
    * "text"     -> a "message" step (field "text")
    * "result"   -> final output ("output") and token count
      ("usage.total_tokens")

    Blank lines, lines that are not valid JSON, JSON values that are not
    objects, and objects of any other type are skipped.
    """

    name = "cli-agent"

    def __init__(self, command_template: str, name: str = "cli-agent"):
        self.command_template = command_template
        self.name = name

    @classmethod
    def from_config(cls, config: dict) -> "CLIAgentAdapter":
        """Build an adapter from a config dict.

        ``config["command"]`` is required; ``config["name"]`` is optional.
        """
        return cls(command_template=config["command"],
                   name=config.get("name", "cli-agent"))

    def run(self, task: Task, workspace: Path) -> Trajectory:
        """Run the command for ``task`` and convert its stdout into a Trajectory.

        The returned status is "timeout" when the process exceeds
        ``task.budget.timeout_seconds``, "success" when it exits with code 0,
        and "crash" otherwise (``error`` then holds the last 2000 characters
        of stderr).
        """
        # The template is a complete shell command line, hence shell=True.
        # The prompt is the only interpolated value and is escaped with
        # shlex.quote before substitution.
        cmd = self.command_template.format(prompt=shlex.quote(task.prompt))
        start = time.monotonic()
        try:
            proc = subprocess.run(
                cmd, shell=True, cwd=str(workspace),
                capture_output=True, text=True,
                timeout=task.budget.timeout_seconds,
            )
        except subprocess.TimeoutExpired:
            # Output produced before the timeout is not parsed.
            return Trajectory(
                steps=[],
                usage=Usage(wall_seconds=time.monotonic() - start,
                            num_steps=0),
                status="timeout",
                error="process exceeded timeout",
            )

        steps, final_output, total_tokens = self._parse_events(proc.stdout)
        status = "success" if proc.returncode == 0 else "crash"
        return Trajectory(
            steps=steps,
            final_output=final_output,
            usage=Usage(total_tokens=total_tokens,
                        wall_seconds=time.monotonic() - start,
                        num_steps=len(steps)),
            status=status,
            error=None if status == "success" else proc.stderr[-2000:],
        )

    @staticmethod
    def _parse_events(stdout: str) -> tuple[list[Step], str, int]:
        """Turn the JSONL stream into (steps, final_output, total_tokens)."""
        steps: list[Step] = []
        final_output = ""
        total_tokens = 0
        for line in stdout.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            # json.loads also accepts scalars and arrays (e.g. a progress
            # counter printed as `42`); only objects carry events.
            if not isinstance(obj, dict):
                continue
            event_type = obj.get("type")
            if event_type == "tool_use":
                steps.append(Step(index=len(steps), kind="tool_call",
                                  name=obj.get("name"),
                                  payload=obj.get("input", {})))
            elif event_type == "text":
                steps.append(Step(index=len(steps), kind="message",
                                  payload={"text": obj.get("text", "")}))
            elif event_type == "result":
                final_output = obj.get("output", "")
                # "usage" may be absent or null; either way report 0 tokens.
                usage = obj.get("usage")
                total_tokens = (usage.get("total_tokens", 0)
                                if isinstance(usage, dict) else 0)
        return steps, final_output, total_tokens
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import importlib
|
|
3
|
+
import time
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Callable, Optional
|
|
6
|
+
from eval_agent.adapters import AgentAdapter
|
|
7
|
+
from eval_agent.models import Task, Trajectory, Step, Usage
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FunctionAgentAdapter(AgentAdapter):
    """Drives an in-process Python callable.

    The callable signature is fn(task, workspace, record) where `record`
    is record(kind, name, payload) and it may return a dict with optional
    keys: final_output (str), tokens (int), retries (int).
    """

    name = "function-agent"

    def __init__(self, fn: Callable, name: Optional[str] = None):
        self.fn = fn
        # A falsy name keeps the class-level default.
        if name:
            self.name = name

    @classmethod
    def from_config(cls, config: dict) -> "FunctionAgentAdapter":
        """Build an adapter from ``config["entrypoint"]`` ("module.path:callable").

        ``config["name"]`` is optional.

        Raises:
            KeyError: ``config`` has no "entrypoint" entry.
            ValueError: the entrypoint is not of the form
                "module.path:callable".
        """
        entrypoint = config["entrypoint"]
        module_path, sep, attr = entrypoint.partition(":")
        # Exactly one colon with a non-empty name on each side is required;
        # anything else would otherwise fail with an unrelated-looking error.
        if not sep or not module_path or not attr or ":" in attr:
            raise ValueError(
                "entrypoint must look like 'module.path:callable', "
                f"got {entrypoint!r}"
            )
        fn = getattr(importlib.import_module(module_path), attr)
        return cls(fn=fn, name=config.get("name", "function-agent"))

    def run(self, task: Task, workspace: Path) -> Trajectory:
        """Call the wrapped function and collect the steps it records.

        The function receives the task, the workspace path as a string, and
        a ``record`` callback. Any exception it raises (or that occurs while
        reading its return value) yields a "crash" trajectory that still
        contains the steps recorded up to that point.
        """
        steps: list[Step] = []

        def record(kind: str, name=None, payload=None):
            steps.append(Step(index=len(steps), kind=kind, name=name,
                              payload=payload or {}))

        start = time.monotonic()
        try:
            # A falsy return value (e.g. None) means "no extra info".
            ret = self.fn(task, str(workspace), record) or {}
            elapsed = time.monotonic() - start
            return Trajectory(
                steps=steps,
                final_output=ret.get("final_output", ""),
                usage=Usage(
                    total_tokens=ret.get("tokens", 0),
                    wall_seconds=elapsed,
                    num_steps=len(steps),
                    num_retries=ret.get("retries", 0),
                ),
                status="success",
            )
        except Exception as e:
            elapsed = time.monotonic() - start
            return Trajectory(
                steps=steps,
                usage=Usage(wall_seconds=elapsed, num_steps=len(steps)),
                status="crash",
                error=f"{type(e).__name__}: {e}",
            )
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import json as jsonlib
|
|
3
|
+
import time
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Callable, Optional
|
|
6
|
+
from eval_agent.adapters import AgentAdapter
|
|
7
|
+
from eval_agent.models import Task, Trajectory, Step, Usage
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _default_post(url: str, json_body: dict, timeout: Optional[float]):
|
|
11
|
+
import urllib.request
|
|
12
|
+
data = jsonlib.dumps(json_body).encode()
|
|
13
|
+
req = urllib.request.Request(url, data=data,
|
|
14
|
+
headers={"Content-Type": "application/json"})
|
|
15
|
+
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
|
16
|
+
return jsonlib.loads(resp.read().decode())
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class HTTPAgentAdapter(AgentAdapter):
    """Drives an OpenAI-style chat endpoint.

    `post(url, json_body, timeout) -> dict` is injectable for testing.
    """

    name = "http-agent"

    def __init__(self, url: str, post: Callable = _default_post,
                 name: str = "http-agent"):
        self.url = url
        self.post = post
        self.name = name

    @classmethod
    def from_config(cls, config: dict) -> "HTTPAgentAdapter":
        """Build an adapter from a config dict.

        ``config["url"]`` is required; ``config["name"]`` is optional.
        """
        return cls(url=config["url"], name=config.get("name", "http-agent"))

    def run(self, task: Task, workspace: Path) -> Trajectory:
        """Send the task prompt as a single user message and parse the reply.

        The first choice's message becomes one "message" step (when it has
        text) followed by one "tool_call" step per tool call. Any failure
        while posting or while reading the response yields a "crash"
        trajectory; ``workspace`` is not used by this adapter.
        """
        body = {"messages": [{"role": "user", "content": task.prompt}]}
        start = time.monotonic()
        try:
            resp = self.post(self.url, body, task.budget.timeout_seconds)
            message = resp["choices"][0]["message"]
            steps: list[Step] = []
            text = message.get("content") or ""
            if text:
                steps.append(Step(index=len(steps), kind="message",
                                  payload={"text": text}))
            for tc in message.get("tool_calls") or []:
                # "function" may be present but null.
                fn = tc.get("function") or {}
                steps.append(Step(
                    index=len(steps), kind="tool_call",
                    name=fn.get("name"),
                    payload=self._decode_arguments(fn.get("arguments", "{}")),
                ))
            # "usage" may be absent or null; read it inside the guarded
            # section so a malformed value cannot escape as an exception.
            usage = resp.get("usage")
            total_tokens = (usage.get("total_tokens", 0)
                            if isinstance(usage, dict) else 0)
        except Exception as e:
            return Trajectory(
                usage=Usage(wall_seconds=time.monotonic() - start),
                status="crash",
                error=f"{type(e).__name__}: {e}",
            )

        return Trajectory(
            steps=steps,
            final_output=text,
            usage=Usage(total_tokens=total_tokens,
                        wall_seconds=time.monotonic() - start,
                        num_steps=len(steps)),
            status="success",
        )

    @staticmethod
    def _decode_arguments(raw):
        """Decode a tool call's "arguments" field into a step payload.

        A dict is used as-is, a JSON string is parsed, and anything that
        cannot be parsed (including None) is preserved under the "raw" key.
        """
        if isinstance(raw, dict):
            return raw
        try:
            return jsonlib.loads(raw)
        except (TypeError, jsonlib.JSONDecodeError):
            return {"raw": raw}
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import time
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Callable, Optional
|
|
5
|
+
from eval_agent.adapters import AgentAdapter
|
|
6
|
+
from eval_agent.models import Task, Trajectory, Step, Usage
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class WebUIAgentAdapter(AgentAdapter):
    """Adapter for agents that are reachable through a web page.

    A run opens the page, enters the task prompt into ``input_selector``,
    submits it, then polls ``output_selector`` until the reply has settled.
    Every page operation is delegated to a ``BrowserDriver`` obtained from
    ``driver_factory``, and time is read through the injectable ``clock`` /
    ``sleep`` callables, so the adapter can be exercised offline with fakes.
    """

    name = "webui-agent"

    def __init__(
        self,
        url: str,
        input_selector: str,
        output_selector: str,
        *,
        submit: str = "enter",
        storage_state: Optional[str] = None,
        headless: bool = True,
        stable_ms: int = 1500,
        done_selector: Optional[str] = None,
        done_timeout: Optional[float] = None,
        poll_interval: float = 0.25,
        name: str = "webui-agent",
        driver_factory: Optional[Callable[[], "BrowserDriver"]] = None,
        clock: Callable[[], float] = time.monotonic,
        sleep: Callable[[float], None] = time.sleep,
    ):
        def _default_driver() -> "BrowserDriver":
            # Looked up at call time: PlaywrightDriver is defined further
            # down in this module.
            return PlaywrightDriver()

        # Page location and selectors.
        self.url = url
        self.input_selector = input_selector
        self.output_selector = output_selector
        # "enter" presses Enter in the input; any other value is clicked
        # as a selector.
        self.submit = submit
        # Browser session options, forwarded to the driver's open().
        self.storage_state = storage_state
        self.headless = headless
        # Completion detection.
        self.stable_ms = stable_ms
        self.done_selector = done_selector
        self.done_timeout = done_timeout
        self.poll_interval = poll_interval
        self.name = name
        # Injectable collaborators.
        self.driver_factory = driver_factory or _default_driver
        self.clock = clock
        self.sleep = sleep

    @classmethod
    def from_config(cls, config: dict) -> "WebUIAgentAdapter":
        """Build an adapter from a config dict.

        Required keys: "url", "input_selector", "output_selector". The
        optional "done" mapping supplies "stable_ms", "selector" and
        "timeout_seconds".
        """
        done_cfg = config.get("done") or {}
        return cls(
            url=config["url"],
            input_selector=config["input_selector"],
            output_selector=config["output_selector"],
            submit=config.get("submit", "enter"),
            storage_state=config.get("storage_state"),
            headless=config.get("headless", True),
            stable_ms=done_cfg.get("stable_ms", 1500),
            done_selector=done_cfg.get("selector"),
            done_timeout=done_cfg.get("timeout_seconds"),
            name=config.get("name", "webui-agent"),
        )

    def run(self, task: Task, workspace: Path) -> Trajectory:
        """Submit the prompt through the page and return the scraped reply.

        Status is "success" when the reply settles, "timeout" when it does
        not settle in time (``final_output`` then holds the last text seen),
        and "crash" for any other exception raised while driving the page.
        The driver is always closed; errors from closing are ignored.
        """
        driver = self.driver_factory()
        started = self.clock()
        # First truthy value wins: adapter override, task budget, 120 s.
        timeout = self.done_timeout or task.budget.timeout_seconds or 120.0
        try:
            try:
                self._send_prompt(driver, task, timeout)
                try:
                    reply = self._wait_for_done(driver, timeout)
                except TimeoutError as exc:
                    return Trajectory(
                        steps=[self._prompt_step(task)],
                        final_output=exc.args[0] if exc.args else "",
                        usage=Usage(wall_seconds=self.clock() - started,
                                    num_steps=1),
                        status="timeout",
                        error=f"agent reply did not stabilize within {timeout}s",
                    )
            except Exception as exc:
                return Trajectory(
                    usage=Usage(wall_seconds=self.clock() - started),
                    status="crash",
                    error=f"{type(exc).__name__}: {exc}",
                )
        finally:
            # Best-effort cleanup: a failing close must not mask the result.
            try:
                driver.close()
            except Exception:
                pass
        return Trajectory(
            steps=[self._prompt_step(task)],
            final_output=reply,
            usage=Usage(wall_seconds=self.clock() - started, num_steps=1),
            status="success",
        )

    def _send_prompt(self, driver, task: Task, timeout: float) -> None:
        """Open the page, type the prompt into the input and submit it."""
        driver.open(self.url, self.storage_state, self.headless)
        driver.wait_for(self.input_selector, timeout)
        driver.fill(self.input_selector, task.prompt)
        if self.submit != "enter":
            driver.click(self.submit)
        else:
            driver.press_enter(self.input_selector)

    @staticmethod
    def _prompt_step(task: Task) -> Step:
        """The single step recorded for a run: the prompt that was sent."""
        return Step(index=0, kind="message", payload={"text": task.prompt})

    def _wait_for_done(self, driver, timeout: float) -> str:
        """Poll output_selector until the reply is complete and return it.

        The reply counts as complete when its text is non-empty, has not
        changed for stable_ms, and (if done_selector is configured) that
        selector is present. Raises TimeoutError(last_text) once ``timeout``
        seconds have elapsed without that happening.
        """
        began = self.clock()
        latest = ""
        changed_at = began
        while self.clock() - began < timeout:
            current = driver.read_text(self.output_selector)
            now = self.clock()
            if current != latest:
                latest, changed_at = current, now
            if self.done_selector is None:
                signalled = True
            else:
                signalled = driver.is_present(self.done_selector)
            quiet_ms = (now - changed_at) * 1000
            if latest and signalled and quiet_ms >= self.stable_ms:
                return latest
            self.sleep(self.poll_interval)
        raise TimeoutError(latest)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class BrowserDriver:
    """Interface of the primitive page operations used by the WebUI adapter.

    Contains no evaluation logic; every method here raises
    NotImplementedError and is meant to be overridden.
    """

    def open(self, url, storage_state, headless):
        raise NotImplementedError

    def wait_for(self, selector, timeout):
        raise NotImplementedError

    def fill(self, selector, text):
        raise NotImplementedError

    def press_enter(self, selector):
        raise NotImplementedError

    def click(self, selector):
        raise NotImplementedError

    def read_text(self, selector) -> str:
        raise NotImplementedError

    def is_present(self, selector) -> bool:
        raise NotImplementedError

    def close(self):
        raise NotImplementedError
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class PlaywrightDriver(BrowserDriver):
    """Real driver. Imports playwright lazily so the module loads without it."""

    def __init__(self):
        # All handles stay None until open() succeeds in creating them.
        self._pw = None
        self._browser = None
        self._context = None
        self._page = None

    def open(self, url, storage_state, headless):
        """Start Playwright, launch Chromium and navigate a new page to ``url``.

        ``storage_state`` is passed to the browser context only when truthy.

        Raises:
            RuntimeError: Playwright is not installed.
        """
        try:
            from playwright.sync_api import sync_playwright
        except ImportError as e:
            raise RuntimeError(
                "WebUI adapter needs Playwright. Install with "
                "'pip install eval-agent[webui]' then run "
                "'playwright install chromium'."
            ) from e
        # Each handle is stored as soon as it exists so close() can release
        # whatever was acquired if a later step fails.
        self._pw = sync_playwright().start()
        self._browser = self._pw.chromium.launch(headless=headless)
        ctx_kwargs = {"storage_state": storage_state} if storage_state else {}
        self._context = self._browser.new_context(**ctx_kwargs)
        self._page = self._context.new_page()
        self._page.goto(url)

    def wait_for(self, selector, timeout):
        """Wait for ``selector``; ``timeout`` is in seconds (converted to ms)."""
        self._page.wait_for_selector(selector, timeout=timeout * 1000)

    def fill(self, selector, text):
        self._page.fill(selector, text)

    def press_enter(self, selector):
        self._page.press(selector, "Enter")

    def click(self, selector):
        self._page.click(selector)

    def read_text(self, selector) -> str:
        """Return the inner text of ``selector``, or "" when it is absent."""
        el = self._page.query_selector(selector)
        return el.inner_text() if el else ""

    def is_present(self, selector) -> bool:
        return self._page.query_selector(selector) is not None

    def close(self):
        """Release context, browser and Playwright, in that order.

        Every release step runs even if an earlier one raises, so a failing
        context close cannot leave the browser process or the Playwright
        driver running. The handles are cleared first, which makes repeated
        calls harmless.
        """
        context, browser, pw = self._context, self._browser, self._pw
        self._page = self._context = self._browser = self._pw = None
        try:
            if context:
                context.close()
        finally:
            try:
                if browser:
                    browser.close()
            finally:
                if pw:
                    pw.stop()
|
eval_agent/aggregate.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Any
|
|
3
|
+
from eval_agent.models import Score, TaskResult
|
|
4
|
+
|
|
5
|
+
# Fallback weight per scorer name, used by _weight_for when neither the task
# nor the suite supplies a weight for that scorer.
DEFAULT_WEIGHTS: dict[str, float] = {
    "checkpoint": 3,
    "llm_judge": 2,
    "trajectory": 1,
    # aggregate_task skips non-positive weights, so efficiency does not
    # contribute to the aggregate score unless a task or suite overrides it.
    "efficiency": 0,
}
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _weight_for(name: str, task_weights: dict, suite_weights: dict) -> float:
|
|
14
|
+
if name in task_weights:
|
|
15
|
+
return task_weights[name]
|
|
16
|
+
if name in suite_weights:
|
|
17
|
+
return suite_weights[name]
|
|
18
|
+
return DEFAULT_WEIGHTS.get(name, 1.0)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def aggregate_task(scores: list[Score], task_weights: dict,
                   suite_weights: dict) -> float:
    """Return the weighted mean of the available scores for one task.

    Scores that are not available, or whose resolved weight is zero or
    negative, are left out. The result is 0.0 when nothing contributes.
    """
    weighted_total = 0.0
    weight_total = 0.0
    for score in (s for s in scores if s.available):
        weight = _weight_for(score.scorer_name, task_weights, suite_weights)
        if weight <= 0:
            continue
        weighted_total += score.value * weight
        weight_total += weight
    if not weight_total:
        return 0.0
    return weighted_total / weight_total
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def apply_weights(scores: list[Score], task_weights: dict,
                  suite_weights: dict) -> list[Score]:
    """Fill in each score's effective weight (for reporting).

    The ``weight`` attribute is set on the given score objects themselves;
    the returned list is new but holds those same objects, in order.
    """
    def _stamp(score):
        score.weight = _weight_for(score.scorer_name, task_weights,
                                   suite_weights)
        return score

    return [_stamp(score) for score in scores]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def aggregate_run(task_results: list[TaskResult]) -> dict[str, Any]:
    """Summarize a run: task count, mean aggregate score and success count.

    A task counts as successful when its trajectory status is "success".
    An empty run yields a mean score of 0.0 and zero successes.
    """
    count = len(task_results)
    summary: dict[str, Any] = {
        "num_tasks": count,
        "mean_score": 0.0,
        "num_success": 0,
    }
    if count:
        summary["mean_score"] = (
            sum(tr.aggregate_score for tr in task_results) / count
        )
        summary["num_success"] = len(
            [tr for tr in task_results if tr.trajectory.status == "success"]
        )
    return summary
|
eval_agent/cli.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import click
|
|
7
|
+
from eval_agent.loader import load_suite, load_agent_config
|
|
8
|
+
from eval_agent.adapters import build_adapter
|
|
9
|
+
from eval_agent.scorers import build_scorers
|
|
10
|
+
from eval_agent.runner import Runner
|
|
11
|
+
from eval_agent.reporter import write_json, write_html, write_index
|
|
12
|
+
from eval_agent.models import RunResult
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _stdin_isatty() -> bool:
|
|
16
|
+
return sys.stdin.isatty()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Root command group; the subcommands in this module register themselves on
# it via ``@cli.command()``.
@click.group()
def cli():
    """eval-agent: evaluate agents against task suites."""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@cli.command()
@click.option("--suite", "suite_path", default=None)
@click.option("--agent", "agent_path", default=None)
@click.option("--out", "out_dir", default="runs")
@click.option("--run-id", "run_id", default=None)
@click.option("--no-llm-judge", is_flag=True, default=False,
              help="Disable the LLM judge scorer (avoids API calls).")
def run(suite_path, agent_path, out_dir, run_id, no_llm_judge):
    """Run a suite against an agent; write JSON + HTML.

    With both --suite and --agent, runs non-interactively. If either is missing
    and stdin is a TTY, launches an interactive wizard to build the agent and
    tasks. Missing args without a TTY is an error (never hangs)."""
    have_files = bool(suite_path and agent_path)

    # Without both files and without a terminal there is nobody to ask.
    if not have_files and not _stdin_isatty():
        raise click.UsageError(
            "Missing option '--suite' and/or '--agent' (non-interactive stdin).")

    wiz_result = None
    if have_files:
        suite = load_suite(suite_path)
        agent_cfg = load_agent_config(agent_path)
        judge_enabled = not no_llm_judge
    else:
        from eval_agent import wizard
        wiz_result = wizard.run_wizard()
        suite = wiz_result.suite
        agent_cfg = wiz_result.agent_config
        # The judge is only useful when the wizard produced a rubric.
        judge_enabled = wiz_result.uses_rubric and not no_llm_judge
        if wiz_result.uses_rubric and not os.environ.get("ANTHROPIC_API_KEY"):
            click.echo("warning: 有任务用了 rubric 但未设置 ANTHROPIC_API_KEY,"
                       "LLM judge 将无法评分。")

    adapter = build_adapter(agent_cfg)
    scorers = build_scorers({"llm_judge": {"enabled": judge_enabled}})

    # Default run id is derived from the suite and adapter names.
    if run_id is None:
        run_id = f"{suite.name}-{adapter.name}"

    result = Runner(adapter=adapter, scorers=scorers).run(suite, run_id=run_id)

    run_out = str(Path(out_dir) / run_id)
    write_json(result, run_out)
    write_html([result], run_out)
    write_index(out_dir)
    mean_score = result.stats.get('mean_score', 0.0)
    click.echo(f"Run complete: {run_out} mean_score="
               f"{mean_score:.3f}")

    # Only wizard-built configurations can be saved for reuse.
    if wiz_result is None:
        return
    if click.confirm("保存这次配置以便下次复用?", default=False):
        agent_path_out, suite_path_out = wizard.save_wizard_output(wiz_result)
        click.echo(f"已写入 {agent_path_out} 和 {suite_path_out}")
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@cli.command()
@click.option("--out", "out_dir", default="runs")
def index(out_dir):
    """Write an index.html overview of all runs under --out."""
    written = write_index(out_dir)
    click.echo(f"Index written: {written}")
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
@cli.command()
@click.option("--run", "run_dir", required=True)
def report(run_dir):
    """Re-render report.html from an existing result.json."""
    run_path = Path(run_dir)
    payload = json.loads((run_path / "result.json").read_text())
    write_html([RunResult.model_validate(payload)], run_dir)
    click.echo(f"Report written: {run_path / 'report.html'}")
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@cli.command()
@click.argument("run_dirs", nargs=-1, required=True)
@click.option("--out", "out_dir", default="runs/compare")
def compare(run_dirs, out_dir):
    """Render a combined leaderboard across multiple runs."""
    def _load(run_dir):
        # Each run directory is expected to hold a result.json.
        raw = (Path(run_dir) / "result.json").read_text()
        return RunResult.model_validate(json.loads(raw))

    write_html([_load(run_dir) for run_dir in run_dirs], out_dir)
    click.echo(f"Comparison written: {Path(out_dir) / 'report.html'}")
|
|
File without changes
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""A trivial example agent for smoke-testing the framework end-to-end.
|
|
2
|
+
It writes the prompt into output.txt and reports done."""
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def run(task, workspace, record):
    """Example agent: copy the prompt into ``output.txt`` and report "done".

    Records one "write_file" tool call and one "done" message through
    ``record``, and returns the final output together with a token count
    equal to the prompt length.
    """
    prompt = task.prompt
    record(kind="tool_call", name="write_file", payload={"path": "output.txt"})
    (Path(workspace) / "output.txt").write_text(prompt)
    record(kind="message", payload={"text": "done"})
    return {"final_output": "done", "tokens": len(prompt)}
|
eval_agent/loader.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import yaml
|
|
3
|
+
from eval_agent.models import TaskSuite, Task, Checkpoint, Budget
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _parse_checkpoint(raw: dict) -> Checkpoint:
    """Convert one raw checkpoint mapping into a Checkpoint.

    The "type" entry selects the checkpoint type; every other entry is
    passed along as ``args``. The input mapping is not modified.

    Raises:
        KeyError: the mapping has no "type" entry.
    """
    fields = dict(raw)
    kind = fields["type"]
    del fields["type"]
    return Checkpoint(type=kind, args=fields)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _parse_task(raw: dict) -> Task:
    """Convert one raw task mapping into a Task.

    "checkpoints" entries are parsed with _parse_checkpoint and a "budget"
    mapping is turned into a Budget; all other entries are passed to Task
    unchanged. The input mapping is not modified.

    A key written with no value in YAML (``checkpoints:`` / ``budget:``)
    loads as None; it is treated the same as leaving the key out.
    """
    fields = dict(raw)
    fields["checkpoints"] = [_parse_checkpoint(c)
                             for c in fields.get("checkpoints") or []]
    if "budget" in fields:
        if fields["budget"] is None:
            # Fall back to Task's own default, exactly as for a missing key.
            del fields["budget"]
        else:
            fields["budget"] = Budget(**fields["budget"])
    return Task(**fields)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def load_suite(path: str) -> TaskSuite:
    """Load a task suite from the YAML file at ``path``.

    The top level must be a mapping with a required "name" and optional
    "tasks", "default_weights" and "meta" entries; an optional entry that
    is missing or null is treated as empty.

    Raises:
        ValueError: the file is empty or its top level is not a mapping.
        KeyError: the mapping has no "name" entry.
    """
    # YAML files are UTF-8; do not depend on the platform's locale encoding.
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    if not isinstance(data, dict):
        raise ValueError(
            f"suite file {path!r} must contain a YAML mapping at the top level")
    tasks = [_parse_task(t) for t in data.get("tasks") or []]
    return TaskSuite(
        name=data["name"],
        tasks=tasks,
        default_weights=data.get("default_weights") or {},
        meta=data.get("meta") or {},
    )
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def load_agent_config(path: str) -> dict:
    """Load an agent configuration mapping from the YAML file at ``path``.

    Raises:
        ValueError: the file is empty or its top level is not a mapping.
    """
    # YAML files are UTF-8; do not depend on the platform's locale encoding.
    with open(path, encoding="utf-8") as f:
        config = yaml.safe_load(f)
    # Fail here with a clear message instead of later, when the adapter
    # factory tries to index a None or a list.
    if not isinstance(config, dict):
        raise ValueError(
            f"agent config {path!r} must contain a YAML mapping at the top level")
    return config
|