evalpilot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
eval_agent/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,39 @@
1
+ from __future__ import annotations
2
+ from abc import ABC, abstractmethod
3
+ from pathlib import Path
4
+ from eval_agent.models import Task, Trajectory
5
+
6
+
7
class AgentAdapter(ABC):
    """Contract between the framework and one agent under test.

    The framework's only entry point is :meth:`run`. Whether a subclass
    shells out, issues an HTTP request, or calls a function in-process is
    an implementation detail hidden behind that method.
    """

    # Label for this adapter; concrete subclasses override it.
    name: str = "agent"

    @abstractmethod
    def run(self, task: Task, workspace: Path) -> Trajectory:
        """Drive the agent for ``task`` and return what it did.

        ``workspace`` is the directory handed to the adapter for this task.
        Concrete adapters must override this method.
        """
19
+
20
+
21
def build_adapter(config: dict) -> AgentAdapter:
    """Instantiate the adapter selected by ``config["type"]``.

    Recognised types are "cli", "http", "function" and "webui". The
    implementation module is imported only once its type has been
    requested, and the whole ``config`` dict is then passed to that
    class's ``from_config``.

    Raises:
        KeyError: ``config`` has no "type" entry.
        ValueError: the type is not one of the recognised values.
    """
    kind = config["type"]
    if kind == "cli":
        from eval_agent.adapters.cli import CLIAgentAdapter as adapter_cls
    elif kind == "http":
        from eval_agent.adapters.http import HTTPAgentAdapter as adapter_cls
    elif kind == "function":
        from eval_agent.adapters.function import (
            FunctionAgentAdapter as adapter_cls,
        )
    elif kind == "webui":
        from eval_agent.adapters.webui import WebUIAgentAdapter as adapter_cls
    else:
        raise ValueError(f"unknown adapter type: {kind}")
    return adapter_cls.from_config(config)
@@ -0,0 +1,79 @@
1
+ from __future__ import annotations
2
+ import json
3
+ import shlex
4
+ import subprocess
5
+ import time
6
+ from pathlib import Path
7
+ from eval_agent.adapters import AgentAdapter
8
+ from eval_agent.models import Task, Trajectory, Step, Usage
9
+
10
+
11
class CLIAgentAdapter(AgentAdapter):
    """Drives a command-line agent that emits a JSONL stream on stdout.

    ``command_template`` may contain ``{prompt}``; it is substituted
    (shell-quoted) with the task prompt. The command is run through the
    shell with ``cwd=workspace``.

    Stdout is read line by line. Lines that are blank, are not valid JSON,
    or are valid JSON but not an object are skipped. Recognised objects:

    * ``{"type": "tool_use", "name": ..., "input": ...}`` -> tool_call step
    * ``{"type": "text", "text": ...}`` -> message step
    * ``{"type": "result", "output": ..., "usage": {"total_tokens": ...}}``
      -> final output and token count
    """

    name = "cli-agent"

    def __init__(self, command_template: str, name: str = "cli-agent"):
        self.command_template = command_template
        self.name = name

    @classmethod
    def from_config(cls, config: dict) -> "CLIAgentAdapter":
        """Build from a config dict: "command" (required), "name" (optional)."""
        return cls(command_template=config["command"],
                   name=config.get("name", "cli-agent"))

    def run(self, task: Task, workspace: Path) -> Trajectory:
        """Run the command once and convert its stdout into a Trajectory.

        Returns a trajectory with status "timeout" when the process exceeds
        ``task.budget.timeout_seconds``, "crash" when it exits non-zero
        (the last 2000 characters of stderr become ``error``), and
        "success" otherwise.
        """
        # NOTE: shlex.quote produces POSIX-shell quoting.
        cmd = self.command_template.format(prompt=shlex.quote(task.prompt))
        steps: list[Step] = []
        final_output = ""
        total_tokens = 0
        start = time.monotonic()
        try:
            proc = subprocess.run(
                cmd, shell=True, cwd=str(workspace),
                capture_output=True, text=True,
                timeout=task.budget.timeout_seconds,
            )
        except subprocess.TimeoutExpired:
            return Trajectory(
                steps=steps,
                usage=Usage(wall_seconds=time.monotonic() - start,
                            num_steps=len(steps)),
                status="timeout",
                error="process exceeded timeout",
            )

        for line in proc.stdout.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A bare number, string or list is valid JSON too; such lines
            # are ordinary log noise, not events, and must not crash the run.
            if not isinstance(obj, dict):
                continue
            t = obj.get("type")
            if t == "tool_use":
                steps.append(Step(index=len(steps), kind="tool_call",
                                  name=obj.get("name"),
                                  payload=obj.get("input", {})))
            elif t == "text":
                steps.append(Step(index=len(steps), kind="message",
                                  payload={"text": obj.get("text", "")}))
            elif t == "result":
                final_output = obj.get("output", "")
                # "usage" may be absent, null, or not an object.
                usage = obj.get("usage")
                if isinstance(usage, dict):
                    total_tokens = usage.get("total_tokens", 0)
                else:
                    total_tokens = 0

        status = "success" if proc.returncode == 0 else "crash"
        return Trajectory(
            steps=steps,
            final_output=final_output,
            usage=Usage(total_tokens=total_tokens,
                        wall_seconds=time.monotonic() - start,
                        num_steps=len(steps)),
            status=status,
            error=None if status == "success" else proc.stderr[-2000:],
        )
@@ -0,0 +1,61 @@
1
+ from __future__ import annotations
2
+ import importlib
3
+ import time
4
+ from pathlib import Path
5
+ from typing import Callable, Optional
6
+ from eval_agent.adapters import AgentAdapter
7
+ from eval_agent.models import Task, Trajectory, Step, Usage
8
+
9
+
10
class FunctionAgentAdapter(AgentAdapter):
    """Adapter around a Python callable that runs in this process.

    The callable is invoked as ``fn(task, workspace, record)``. ``record``
    has the signature ``record(kind, name, payload)`` and appends one step
    per call. The callable may return a dict with any of the optional keys
    ``final_output`` (str), ``tokens`` (int) and ``retries`` (int); a falsy
    return value is treated as an empty dict.
    """

    name = "function-agent"

    def __init__(self, fn: Callable, name: Optional[str] = None):
        self.fn = fn
        # A falsy name keeps the class-level default.
        if name:
            self.name = name

    @classmethod
    def from_config(cls, config: dict) -> "FunctionAgentAdapter":
        """Resolve ``config["entrypoint"]`` ("module.path:callable")."""
        module_name, attr_name = config["entrypoint"].split(":")
        module = importlib.import_module(module_name)
        target = getattr(module, attr_name)
        return cls(fn=target, name=config.get("name", "function-agent"))

    def run(self, task: Task, workspace: Path) -> Trajectory:
        """Call the wrapped function and package its result.

        Any exception raised while calling the function or building the
        success trajectory yields a "crash" trajectory that keeps the steps
        recorded so far.
        """
        recorded: list[Step] = []

        def record(kind: str, name=None, payload=None):
            entry = Step(index=len(recorded), kind=kind, name=name,
                         payload=payload or {})
            recorded.append(entry)

        began = time.monotonic()
        try:
            result = self.fn(task, str(workspace), record) or {}
            duration = time.monotonic() - began
            return Trajectory(
                steps=recorded,
                final_output=result.get("final_output", ""),
                usage=Usage(
                    total_tokens=result.get("tokens", 0),
                    wall_seconds=duration,
                    num_steps=len(recorded),
                    num_retries=result.get("retries", 0),
                ),
                status="success",
            )
        except Exception as exc:
            duration = time.monotonic() - began
            failure_usage = Usage(wall_seconds=duration,
                                  num_steps=len(recorded))
            return Trajectory(
                steps=recorded,
                usage=failure_usage,
                status="crash",
                error=f"{type(exc).__name__}: {exc}",
            )
@@ -0,0 +1,78 @@
1
+ from __future__ import annotations
2
+ import json as jsonlib
3
+ import time
4
+ from pathlib import Path
5
+ from typing import Callable, Optional
6
+ from eval_agent.adapters import AgentAdapter
7
+ from eval_agent.models import Task, Trajectory, Step, Usage
8
+
9
+
10
+ def _default_post(url: str, json_body: dict, timeout: Optional[float]):
11
+ import urllib.request
12
+ data = jsonlib.dumps(json_body).encode()
13
+ req = urllib.request.Request(url, data=data,
14
+ headers={"Content-Type": "application/json"})
15
+ with urllib.request.urlopen(req, timeout=timeout) as resp:
16
+ return jsonlib.loads(resp.read().decode())
17
+
18
+
19
class HTTPAgentAdapter(AgentAdapter):
    """Drives an OpenAI-style chat endpoint.

    `post(url, json_body, timeout) -> dict` is injectable for testing.

    The prompt is sent as a single user message. The first choice's message
    becomes the trajectory: its text content (if any) is one "message" step
    and each tool call is one "tool_call" step.
    """

    name = "http-agent"

    def __init__(self, url: str, post: Callable = _default_post,
                 name: str = "http-agent"):
        self.url = url
        self.post = post
        self.name = name

    @classmethod
    def from_config(cls, config: dict) -> "HTTPAgentAdapter":
        """Build from a config dict: "url" (required), "name" (optional)."""
        return cls(url=config["url"], name=config.get("name", "http-agent"))

    @staticmethod
    def _tool_call_step(index: int, tool_call: dict) -> Step:
        """Convert one entry of ``message["tool_calls"]`` into a Step."""
        fn = tool_call.get("function") or {}  # tolerate "function": null
        raw_args = fn.get("arguments", "{}")
        if isinstance(raw_args, dict):
            # Already decoded by the server; use as-is.
            args = raw_args
        else:
            try:
                args = jsonlib.loads(raw_args)
            except (TypeError, ValueError):
                # Not a JSON string (malformed text, null, ...): keep the
                # raw value instead of failing the whole run.
                args = {"raw": raw_args}
        return Step(index=index, kind="tool_call",
                    name=fn.get("name"), payload=args)

    @staticmethod
    def _total_tokens(resp: dict) -> int:
        """Read ``usage.total_tokens``; 0 when usage is missing or null."""
        usage = resp.get("usage")
        if not isinstance(usage, dict):
            return 0
        return usage.get("total_tokens", 0) or 0

    def run(self, task: Task, workspace: Path) -> Trajectory:
        """Send the prompt and translate the reply into a Trajectory.

        Any exception from the request or from interpreting the response
        yields a "crash" trajectory whose ``error`` names the exception.
        """
        body = {"messages": [{"role": "user", "content": task.prompt}]}
        start = time.monotonic()
        try:
            resp = self.post(self.url, body, task.budget.timeout_seconds)
            message = resp["choices"][0]["message"]
            steps: list[Step] = []
            text = message.get("content") or ""
            if text:
                steps.append(Step(index=len(steps), kind="message",
                                  payload={"text": text}))
            for tc in message.get("tool_calls", []) or []:
                steps.append(self._tool_call_step(len(steps), tc))
            # Read inside the guarded region so a malformed "usage" field
            # can never raise out of run().
            total_tokens = self._total_tokens(resp)
        except Exception as e:
            return Trajectory(
                usage=Usage(wall_seconds=time.monotonic() - start),
                status="crash",
                error=f"{type(e).__name__}: {e}",
            )

        return Trajectory(
            steps=steps,
            final_output=text,
            usage=Usage(total_tokens=total_tokens,
                        wall_seconds=time.monotonic() - start,
                        num_steps=len(steps)),
            status="success",
        )
@@ -0,0 +1,197 @@
1
+ from __future__ import annotations
2
+ import time
3
+ from pathlib import Path
4
+ from typing import Callable, Optional
5
+ from eval_agent.adapters import AgentAdapter
6
+ from eval_agent.models import Task, Trajectory, Step, Usage
7
+
8
+
9
class WebUIAgentAdapter(AgentAdapter):
    """Adapter for an agent that lives behind a web page.

    The flow is: open the page, type the prompt, submit, then poll the
    output element until its text settles. Every page operation goes
    through a driver produced by ``driver_factory``, and ``clock`` /
    ``sleep`` are injectable, so the polling loop can be exercised
    offline and deterministically.
    """

    name = "webui-agent"

    def __init__(
        self,
        url: str,
        input_selector: str,
        output_selector: str,
        *,
        submit: str = "enter",
        storage_state: Optional[str] = None,
        headless: bool = True,
        stable_ms: int = 1500,
        done_selector: Optional[str] = None,
        done_timeout: Optional[float] = None,
        poll_interval: float = 0.25,
        name: str = "webui-agent",
        driver_factory: Optional[Callable[[], "BrowserDriver"]] = None,
        clock: Callable[[], float] = time.monotonic,
        sleep: Callable[[float], None] = time.sleep,
    ):
        # Page location and the two elements the adapter interacts with.
        self.url = url
        self.input_selector = input_selector
        self.output_selector = output_selector
        # "enter" presses Enter in the input; any other value is treated
        # as the selector of an element to click.
        self.submit = submit
        self.storage_state = storage_state
        self.headless = headless
        # Completion criteria used by _wait_for_done.
        self.stable_ms = stable_ms
        self.done_selector = done_selector
        self.done_timeout = done_timeout
        self.poll_interval = poll_interval
        self.name = name
        # The lambda defers the PlaywrightDriver lookup until a run starts.
        self.driver_factory = driver_factory or (lambda: PlaywrightDriver())
        self.clock = clock
        self.sleep = sleep

    @classmethod
    def from_config(cls, config: dict) -> "WebUIAgentAdapter":
        """Build from a config dict; completion settings live under "done"."""
        done_cfg = config.get("done") or {}
        return cls(
            url=config["url"],
            input_selector=config["input_selector"],
            output_selector=config["output_selector"],
            submit=config.get("submit", "enter"),
            storage_state=config.get("storage_state"),
            headless=config.get("headless", True),
            stable_ms=done_cfg.get("stable_ms", 1500),
            done_selector=done_cfg.get("selector"),
            done_timeout=done_cfg.get("timeout_seconds"),
            name=config.get("name", "webui-agent"),
        )

    def _submit_prompt(self, driver, task: Task, timeout: float) -> None:
        """Open the page, enter the task prompt and submit it."""
        driver.open(self.url, self.storage_state, self.headless)
        driver.wait_for(self.input_selector, timeout)
        driver.fill(self.input_selector, task.prompt)
        if self.submit == "enter":
            driver.press_enter(self.input_selector)
        else:
            driver.click(self.submit)

    def run(self, task: Task, workspace: Path) -> Trajectory:
        """Submit the prompt through the page and scrape the reply.

        Returns status "timeout" (carrying whatever text was last seen) if
        the reply never settles, "crash" for any other exception, and
        "success" otherwise. The driver is always closed; errors raised
        while closing are ignored.
        """
        driver = self.driver_factory()
        began = self.clock()
        timeout = self.done_timeout or task.budget.timeout_seconds or 120.0
        try:
            try:
                self._submit_prompt(driver, task, timeout)
                try:
                    reply = self._wait_for_done(driver, timeout)
                except TimeoutError as exc:
                    # _wait_for_done passes the last observed text as the
                    # exception's first argument.
                    leftover = exc.args[0] if exc.args else ""
                    return Trajectory(
                        steps=[Step(index=0, kind="message",
                                    payload={"text": task.prompt})],
                        final_output=leftover,
                        usage=Usage(wall_seconds=self.clock() - began,
                                    num_steps=1),
                        status="timeout",
                        error=f"agent reply did not stabilize within {timeout}s",
                    )
            except Exception as exc:
                return Trajectory(
                    usage=Usage(wall_seconds=self.clock() - began),
                    status="crash",
                    error=f"{type(exc).__name__}: {exc}",
                )
        finally:
            try:
                driver.close()
            except Exception:
                pass
        # Built after the driver is closed, so wall time includes teardown.
        return Trajectory(
            steps=[Step(index=0, kind="message", payload={"text": task.prompt})],
            final_output=reply,
            usage=Usage(wall_seconds=self.clock() - began, num_steps=1),
            status="success",
        )

    def _wait_for_done(self, driver, timeout: float) -> str:
        """Poll the output element until the reply is considered finished.

        Finished means: the text is non-empty, has not changed for at least
        ``stable_ms`` milliseconds, and (when ``done_selector`` is set) that
        selector is present. Raises ``TimeoutError(last_text)`` when
        ``timeout`` seconds elapse first.
        """
        began = self.clock()
        seen = ""
        changed_at = began
        while self.clock() - began < timeout:
            current = driver.read_text(self.output_selector)
            now = self.clock()
            if current != seen:
                seen = current
                changed_at = now
            if self.done_selector is None:
                finished_marker = True
            else:
                finished_marker = driver.is_present(self.done_selector)
            quiet_ms = (now - changed_at) * 1000
            if seen and finished_marker and quiet_ms >= self.stable_ms:
                return seen
            self.sleep(self.poll_interval)
        raise TimeoutError(seen)
132
+
133
+
134
class BrowserDriver:
    """Interface of primitive page operations used by the WebUI adapter.

    Every method here raises ``NotImplementedError``; concrete drivers
    override them. No evaluation logic belongs in a driver.
    """

    def open(self, url, storage_state, headless):
        """Open ``url`` with the given storage state / headless setting."""
        raise NotImplementedError

    def wait_for(self, selector, timeout):
        """Wait for ``selector``, bounded by ``timeout``."""
        raise NotImplementedError

    def fill(self, selector, text):
        """Put ``text`` into the element matching ``selector``."""
        raise NotImplementedError

    def press_enter(self, selector):
        """Press Enter on the element matching ``selector``."""
        raise NotImplementedError

    def click(self, selector):
        """Click the element matching ``selector``."""
        raise NotImplementedError

    def read_text(self, selector) -> str:
        """Return the text of the element matching ``selector``."""
        raise NotImplementedError

    def is_present(self, selector) -> bool:
        """Report whether an element matching ``selector`` exists."""
        raise NotImplementedError

    def close(self):
        """Release whatever the driver holds."""
        raise NotImplementedError
145
+
146
+
147
class PlaywrightDriver(BrowserDriver):
    """Real driver. Imports playwright lazily so the module loads without it."""

    def __init__(self):
        # Populated by open(); all None until then and again after close().
        self._pw = None
        self._browser = None
        self._context = None
        self._page = None

    def open(self, url, storage_state, headless):
        """Start Playwright, launch Chromium and navigate to ``url``.

        ``storage_state`` is forwarded to the new browser context only when
        truthy. Raises RuntimeError with install instructions if Playwright
        is not importable.
        """
        try:
            from playwright.sync_api import sync_playwright
        except ImportError as e:
            raise RuntimeError(
                "WebUI adapter needs Playwright. Install with "
                "'pip install eval-agent[webui]' then run "
                "'playwright install chromium'."
            ) from e
        self._pw = sync_playwright().start()
        self._browser = self._pw.chromium.launch(headless=headless)
        ctx_kwargs = {"storage_state": storage_state} if storage_state else {}
        self._context = self._browser.new_context(**ctx_kwargs)
        self._page = self._context.new_page()
        self._page.goto(url)

    def wait_for(self, selector, timeout):
        """Wait for ``selector``; ``timeout`` is in seconds."""
        # Playwright takes milliseconds.
        self._page.wait_for_selector(selector, timeout=timeout * 1000)

    def fill(self, selector, text):
        self._page.fill(selector, text)

    def press_enter(self, selector):
        self._page.press(selector, "Enter")

    def click(self, selector):
        self._page.click(selector)

    def read_text(self, selector) -> str:
        """Return the element's inner text, or "" if nothing matches."""
        el = self._page.query_selector(selector)
        return el.inner_text() if el else ""

    def is_present(self, selector) -> bool:
        return self._page.query_selector(selector) is not None

    def close(self):
        """Tear down context, browser and Playwright, in that order.

        Each layer is released even if closing an earlier one raises, so a
        failing context close cannot leak the browser or the Playwright
        driver. The handles are cleared afterwards, which makes a repeated
        close() a no-op. An exception from a close call still propagates.
        """
        try:
            if self._context:
                self._context.close()
        finally:
            try:
                if self._browser:
                    self._browser.close()
            finally:
                try:
                    if self._pw:
                        self._pw.stop()
                finally:
                    self._page = None
                    self._context = None
                    self._browser = None
                    self._pw = None
@@ -0,0 +1,57 @@
1
+ from __future__ import annotations
2
+ from typing import Any
3
+ from eval_agent.models import Score, TaskResult
4
+
5
+ DEFAULT_WEIGHTS: dict[str, float] = {
6
+ "checkpoint": 3,
7
+ "llm_judge": 2,
8
+ "trajectory": 1,
9
+ "efficiency": 0,
10
+ }
11
+
12
+
13
+ def _weight_for(name: str, task_weights: dict, suite_weights: dict) -> float:
14
+ if name in task_weights:
15
+ return task_weights[name]
16
+ if name in suite_weights:
17
+ return suite_weights[name]
18
+ return DEFAULT_WEIGHTS.get(name, 1.0)
19
+
20
+
21
def aggregate_task(scores: list[Score], task_weights: dict,
                   suite_weights: dict) -> float:
    """Weighted mean of a task's scores.

    Scores flagged as unavailable, and scores whose effective weight is
    zero or negative, do not participate. Returns 0.0 when nothing
    participates.
    """
    weighted_sum = 0.0
    weight_total = 0.0
    usable = (s for s in scores if s.available)
    for score in usable:
        weight = _weight_for(score.scorer_name, task_weights, suite_weights)
        if weight <= 0:
            continue
        weighted_sum += score.value * weight
        weight_total += weight
    if not weight_total:
        return 0.0
    return weighted_sum / weight_total
34
+
35
+
36
def apply_weights(scores: list[Score], task_weights: dict,
                  suite_weights: dict) -> list[Score]:
    """Stamp each score with its effective weight (for reporting).

    The score objects are updated in place; the returned list is new but
    holds those same objects, in the same order.
    """

    def _stamp(score: Score) -> Score:
        score.weight = _weight_for(score.scorer_name, task_weights,
                                   suite_weights)
        return score

    return [_stamp(score) for score in scores]
44
+
45
+
46
def aggregate_run(task_results: list[TaskResult]) -> dict[str, Any]:
    """Summarise a run: task count, mean aggregate score, success count.

    A task counts as successful when its trajectory status is "success".
    An empty run yields zeros.
    """
    total = len(task_results)
    if not total:
        return {"num_tasks": 0, "mean_score": 0.0, "num_success": 0}
    score_sum = sum(result.aggregate_score for result in task_results)
    succeeded = [result for result in task_results
                 if result.trajectory.status == "success"]
    summary = {
        "num_tasks": total,
        "mean_score": score_sum / total,
        "num_success": len(succeeded),
    }
    return summary
eval_agent/cli.py ADDED
@@ -0,0 +1,106 @@
1
+ from __future__ import annotations
2
+ import json
3
+ import os
4
+ import sys
5
+ from pathlib import Path
6
+ import click
7
+ from eval_agent.loader import load_suite, load_agent_config
8
+ from eval_agent.adapters import build_adapter
9
+ from eval_agent.scorers import build_scorers
10
+ from eval_agent.runner import Runner
11
+ from eval_agent.reporter import write_json, write_html, write_index
12
+ from eval_agent.models import RunResult
13
+
14
+
15
+ def _stdin_isatty() -> bool:
16
+ return sys.stdin.isatty()
17
+
18
+
19
@click.group()
def cli() -> None:
    """eval-agent: evaluate agents against task suites."""
    # Intentionally empty: this group only hosts the subcommands that are
    # registered on it with @cli.command().
22
+
23
+
24
@cli.command()
@click.option("--suite", "suite_path", default=None)
@click.option("--agent", "agent_path", default=None)
@click.option("--out", "out_dir", default="runs")
@click.option("--run-id", "run_id", default=None)
@click.option("--no-llm-judge", is_flag=True, default=False,
              help="Disable the LLM judge scorer (avoids API calls).")
def run(suite_path, agent_path, out_dir, run_id, no_llm_judge):
    """Run a suite against an agent; write JSON + HTML.

    With both --suite and --agent, runs non-interactively. If either is missing
    and stdin is a TTY, launches an interactive wizard to build the agent and
    tasks. Missing args without a TTY is an error (never hangs)."""
    # wiz_result stays None on the non-interactive path; it doubles as the
    # "offer to save the configuration" flag at the end.
    wiz_result = None
    if suite_path and agent_path:
        suite = load_suite(suite_path)
        agent_cfg = load_agent_config(agent_path)
        judge_enabled = not no_llm_judge
    elif _stdin_isatty():
        from eval_agent import wizard
        wiz_result = wizard.run_wizard()
        suite = wiz_result.suite
        agent_cfg = wiz_result.agent_config
        judge_enabled = wiz_result.uses_rubric and not no_llm_judge
        # Warn only when the judge will actually run: with --no-llm-judge
        # the missing key is irrelevant and the warning would be noise.
        if judge_enabled and not os.environ.get("ANTHROPIC_API_KEY"):
            click.echo("warning: 有任务用了 rubric 但未设置 ANTHROPIC_API_KEY,"
                       "LLM judge 将无法评分。")
    else:
        raise click.UsageError(
            "Missing option '--suite' and/or '--agent' (non-interactive stdin).")

    adapter = build_adapter(agent_cfg)
    scorers = build_scorers({"llm_judge": {"enabled": judge_enabled}})

    # Default run id is derived from the suite and adapter names.
    if run_id is None:
        run_id = f"{suite.name}-{adapter.name}"

    runner = Runner(adapter=adapter, scorers=scorers)
    result = runner.run(suite, run_id=run_id)

    # Per-run artifacts go under <out>/<run_id>; the index covers <out>.
    run_out = str(Path(out_dir) / run_id)
    write_json(result, run_out)
    write_html([result], run_out)
    write_index(out_dir)
    click.echo(f"Run complete: {run_out} mean_score="
               f"{result.stats.get('mean_score', 0.0):.3f}")

    if wiz_result is not None and click.confirm("保存这次配置以便下次复用?",
                                                default=False):
        from eval_agent import wizard
        agent_path_out, suite_path_out = wizard.save_wizard_output(wiz_result)
        click.echo(f"已写入 {agent_path_out} 和 {suite_path_out}")
76
+
77
+
78
@cli.command()
@click.option("--out", "out_dir", default="runs")
def index(out_dir):
    """Write an index.html overview of all runs under --out."""
    # write_index returns the value echoed below as the written location.
    index_path = write_index(out_dir)
    click.echo(f"Index written: {index_path}")
84
+
85
+
86
@cli.command()
@click.option("--run", "run_dir", required=True)
def report(run_dir):
    """Re-render report.html from an existing result.json."""
    run_path = Path(run_dir)
    # Load the stored result and validate it back into a RunResult.
    raw = (run_path / "result.json").read_text()
    rr = RunResult.model_validate(json.loads(raw))
    write_html([rr], run_dir)
    click.echo(f"Report written: {run_path / 'report.html'}")
94
+
95
+
96
@cli.command()
@click.argument("run_dirs", nargs=-1, required=True)
@click.option("--out", "out_dir", default="runs/compare")
def compare(run_dirs, out_dir):
    """Render a combined leaderboard across multiple runs."""
    # One RunResult per directory, kept in command-line order.
    runs = [
        RunResult.model_validate(
            json.loads((Path(d) / "result.json").read_text())
        )
        for d in run_dirs
    ]
    write_html(runs, out_dir)
    click.echo(f"Comparison written: {Path(out_dir) / 'report.html'}")
File without changes
@@ -0,0 +1,11 @@
1
+ """A trivial example agent for smoke-testing the framework end-to-end.
2
+ It writes the prompt into output.txt and reports done."""
3
+ from __future__ import annotations
4
+ from pathlib import Path
5
+
6
+
7
def run(task, workspace, record):
    """Write the task prompt to ``output.txt`` in ``workspace`` and finish.

    Records one "tool_call" step for the file write and one "message" step,
    then returns the final output together with a token count equal to the
    prompt length in characters.
    """
    record(kind="tool_call", name="write_file", payload={"path": "output.txt"})
    # Explicit UTF-8 so non-ASCII prompts are written identically on every
    # platform instead of depending on the locale's default encoding.
    Path(workspace, "output.txt").write_text(task.prompt, encoding="utf-8")
    record(kind="message", payload={"text": "done"})
    return {"final_output": "done", "tokens": len(task.prompt)}
eval_agent/loader.py ADDED
@@ -0,0 +1,35 @@
1
+ from __future__ import annotations
2
+ import yaml
3
+ from eval_agent.models import TaskSuite, Task, Checkpoint, Budget
4
+
5
+
6
def _parse_checkpoint(raw: dict) -> Checkpoint:
    """Turn one raw checkpoint mapping into a Checkpoint.

    The "type" entry selects the checkpoint type; every remaining entry is
    passed along as ``args``. The input mapping is copied, not mutated.
    """
    args = dict(raw)
    return Checkpoint(type=args.pop("type"), args=args)
10
+
11
+
12
def _parse_task(raw: dict) -> Task:
    """Build a Task from one raw task mapping of a suite file.

    "checkpoints" entries are converted with ``_parse_checkpoint`` and a
    "budget" mapping is converted into a ``Budget``. A key that is present
    but empty in YAML (``checkpoints:`` / ``budget:`` with no value, i.e.
    None) is treated like an omitted key instead of raising TypeError.
    All other entries are passed to ``Task`` unchanged.
    """
    fields = dict(raw)  # copy so the caller's parsed YAML is not mutated
    fields["checkpoints"] = [_parse_checkpoint(c)
                             for c in fields.get("checkpoints") or []]
    budget = fields.get("budget")
    if budget is None:
        # Absent or null: leave the budget out entirely.
        fields.pop("budget", None)
    else:
        fields["budget"] = Budget(**budget)
    return Task(**fields)
19
+
20
+
21
def load_suite(path: str) -> TaskSuite:
    """Load a task suite from the YAML file at ``path``.

    The file is read as UTF-8 regardless of the platform's locale. The top
    level must be a mapping with a "name"; "tasks", "default_weights" and
    "meta" are optional, and a null value for any of them counts as empty.

    Raises:
        ValueError: the file is empty or its top level is not a mapping.
        KeyError: the mapping has no "name".
    """
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)
    if not isinstance(data, dict):
        # An empty file parses to None; fail with a message naming the file.
        raise ValueError(
            f"suite file {path!r} must contain a YAML mapping at the top level")
    tasks = [_parse_task(t) for t in data.get("tasks") or []]
    return TaskSuite(
        name=data["name"],
        tasks=tasks,
        default_weights=data.get("default_weights") or {},
        meta=data.get("meta") or {},
    )
31
+
32
+
33
def load_agent_config(path: str) -> dict:
    """Load an agent configuration mapping from the YAML file at ``path``.

    The file is read as UTF-8 regardless of the platform's locale.

    Raises:
        ValueError: the file is empty or its top level is not a mapping,
            so the declared ``dict`` return type could not be honoured.
    """
    with open(path, encoding="utf-8") as f:
        config = yaml.safe_load(f)
    if not isinstance(config, dict):
        raise ValueError(
            f"agent config {path!r} must contain a YAML mapping at the top level")
    return config