evalpilot 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. evalpilot-0.1.0/LICENSE +21 -0
  2. evalpilot-0.1.0/PKG-INFO +144 -0
  3. evalpilot-0.1.0/README.md +114 -0
  4. evalpilot-0.1.0/eval_agent/__init__.py +1 -0
  5. evalpilot-0.1.0/eval_agent/adapters/__init__.py +39 -0
  6. evalpilot-0.1.0/eval_agent/adapters/cli.py +79 -0
  7. evalpilot-0.1.0/eval_agent/adapters/function.py +61 -0
  8. evalpilot-0.1.0/eval_agent/adapters/http.py +78 -0
  9. evalpilot-0.1.0/eval_agent/adapters/webui.py +197 -0
  10. evalpilot-0.1.0/eval_agent/aggregate.py +57 -0
  11. evalpilot-0.1.0/eval_agent/cli.py +106 -0
  12. evalpilot-0.1.0/eval_agent/examples/__init__.py +0 -0
  13. evalpilot-0.1.0/eval_agent/examples/echo.py +11 -0
  14. evalpilot-0.1.0/eval_agent/loader.py +35 -0
  15. evalpilot-0.1.0/eval_agent/models.py +79 -0
  16. evalpilot-0.1.0/eval_agent/reporter.py +70 -0
  17. evalpilot-0.1.0/eval_agent/runner.py +37 -0
  18. evalpilot-0.1.0/eval_agent/sandbox.py +43 -0
  19. evalpilot-0.1.0/eval_agent/scorers/__init__.py +34 -0
  20. evalpilot-0.1.0/eval_agent/scorers/checkpoint.py +66 -0
  21. evalpilot-0.1.0/eval_agent/scorers/efficiency.py +19 -0
  22. evalpilot-0.1.0/eval_agent/scorers/llm_judge.py +57 -0
  23. evalpilot-0.1.0/eval_agent/scorers/trajectory.py +50 -0
  24. evalpilot-0.1.0/eval_agent/templates/index.html.j2 +73 -0
  25. evalpilot-0.1.0/eval_agent/templates/report.html.j2 +68 -0
  26. evalpilot-0.1.0/eval_agent/wizard.py +132 -0
  27. evalpilot-0.1.0/evalpilot.egg-info/PKG-INFO +144 -0
  28. evalpilot-0.1.0/evalpilot.egg-info/SOURCES.txt +49 -0
  29. evalpilot-0.1.0/evalpilot.egg-info/dependency_links.txt +1 -0
  30. evalpilot-0.1.0/evalpilot.egg-info/entry_points.txt +4 -0
  31. evalpilot-0.1.0/evalpilot.egg-info/requires.txt +11 -0
  32. evalpilot-0.1.0/evalpilot.egg-info/top_level.txt +1 -0
  33. evalpilot-0.1.0/pyproject.toml +46 -0
  34. evalpilot-0.1.0/setup.cfg +4 -0
  35. evalpilot-0.1.0/tests/test_adapter_cli.py +47 -0
  36. evalpilot-0.1.0/tests/test_adapter_function.py +34 -0
  37. evalpilot-0.1.0/tests/test_adapter_http.py +48 -0
  38. evalpilot-0.1.0/tests/test_adapter_webui.py +210 -0
  39. evalpilot-0.1.0/tests/test_aggregate.py +44 -0
  40. evalpilot-0.1.0/tests/test_cli.py +159 -0
  41. evalpilot-0.1.0/tests/test_index.py +45 -0
  42. evalpilot-0.1.0/tests/test_loader.py +37 -0
  43. evalpilot-0.1.0/tests/test_models.py +31 -0
  44. evalpilot-0.1.0/tests/test_reporter.py +43 -0
  45. evalpilot-0.1.0/tests/test_runner.py +58 -0
  46. evalpilot-0.1.0/tests/test_sandbox.py +25 -0
  47. evalpilot-0.1.0/tests/test_scorer_checkpoint.py +103 -0
  48. evalpilot-0.1.0/tests/test_scorer_efficiency.py +29 -0
  49. evalpilot-0.1.0/tests/test_scorer_llm_judge.py +36 -0
  50. evalpilot-0.1.0/tests/test_scorer_trajectory.py +44 -0
  51. evalpilot-0.1.0/tests/test_wizard.py +122 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 ailearneryang
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,144 @@
1
+ Metadata-Version: 2.4
2
+ Name: evalpilot
3
+ Version: 0.1.0
4
+ Summary: A general framework for online-driving, recording, and scoring AI agents against task suites.
5
+ Author-email: ailearneryang <ailearneryang@gmail.com>
6
+ License: MIT
7
+ Keywords: agent,evaluation,eval,llm,benchmark,testing,cli
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Environment :: Console
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Software Development :: Testing
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.11
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: pydantic>=2.6
21
+ Requires-Dist: PyYAML>=6.0
22
+ Requires-Dist: jinja2>=3.1
23
+ Requires-Dist: click>=8.1
24
+ Requires-Dist: anthropic>=0.40
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=8.0; extra == "dev"
27
+ Provides-Extra: webui
28
+ Requires-Dist: playwright>=1.40; extra == "webui"
29
+ Dynamic: license-file
30
+
31
+ # eval-agent
32
+
33
+ A general framework for evaluating agents. It online-drives any agent through a
34
+ task suite, records its trajectory, scores it with four pluggable scorers
35
+ (objective checkpoints, LLM-as-judge, trajectory/process, efficiency), and emits
36
+ a JSON + static HTML report.
37
+
38
+ ## Install
39
+
40
+ Published on PyPI as `evalpilot` (the CLI is available as `eval-agent`,
41
+ `evalagent`, or `evalpilot` — all the same command).
42
+
43
+ Run with no install (recommended):
44
+
45
+ uvx evalpilot run
46
+
47
+ Install as a tool:
48
+
49
+ uv tool install evalpilot # or: pipx install evalpilot
50
+
51
+ For browser-UI (`webui`) agents, install the extra and the browser once:
52
+
53
+ uv tool install "evalpilot[webui]"
54
+ playwright install chromium
55
+
56
+ From source (development):
57
+
58
+ python -m venv .venv
59
+ .venv/bin/pip install -e ".[dev]"
60
+
61
+ ## Quick start
62
+
63
+ .venv/bin/eval-agent run --suite suites/example.yaml --agent configs/function-echo.yaml --run-id demo --no-llm-judge
64
+ open runs/demo/report.html
65
+
66
+ Or run it interactively — omit `--suite`/`--agent` and a wizard walks you through
67
+ defining the agent and tasks (no YAML needed):
68
+
69
+ eval-agent run
70
+
71
+ Browse all runs at once:
72
+
73
+ open runs/index.html
74
+
75
+ ## Concepts
76
+
77
+ - **Task / TaskSuite** — declarative units of evaluation (prompt, setup,
78
+ checkpoints, rubric, budget).
79
+ - **AgentAdapter** — narrow interface that drives the agent under test. Built-in:
80
+ `function`, `cli`, `http`, `webui`. Add a new agent by writing one subclass.
81
+ - **Trajectory** — the recorded run (steps, output, usage, final state). Sole
82
+ data source for all scorers.
83
+ - **Scorers** — checkpoint, llm_judge, trajectory, efficiency. Read-only,
84
+ pluggable, weighted. Checkpoints include file/command checks plus
85
+ `output_contains` / `output_matches` for grading text answers objectively.
86
+ - **Runner / Reporter** — orchestrate and report.
87
+
88
+ ## Adding an agent
89
+
90
+ Write a YAML config selecting an adapter type. For a CLI agent:
91
+
92
+ type: cli
93
+ name: my-agent
94
+ command: my-agent --prompt {prompt} --json-stream
95
+
96
+ For an in-process Python callable:
97
+
98
+ type: function
99
+ name: my-agent
100
+ entrypoint: my_module:run
101
+
102
+ For an OpenAI-style HTTP endpoint:
103
+
104
+ type: http
105
+ name: my-agent
106
+ url: http://localhost:8000/v1/chat/completions
107
+
108
+ For a browser-based (web UI) agent:
109
+
110
+ type: webui
111
+ name: my-chat-agent
112
+ url: https://my-agent.example.com/chat
113
+ input_selector: "textarea"
114
+ output_selector: ".message.assistant:last-child"
115
+
116
+ The `webui` adapter needs Playwright (an optional extra):
117
+
118
+ pip install -e ".[webui]"
119
+ playwright install chromium
120
+
121
+ It types the prompt into `input_selector`, waits until the reply text in
122
+ `output_selector` stops changing (default 1.5s), then scrapes it. Reuse a
123
+ logged-in session with `storage_state: ./auth.json`.
124
+
125
+ ## Scoring weights
126
+
127
+ Defaults: checkpoint=3, llm_judge=2, trajectory=1, efficiency=0. Override per
128
+ suite (`default_weights`) or per task (`weights`). Each scorer's sub-score and
129
+ its reason are always preserved in the report, not just the aggregate.
130
+
131
+ ## Commands
132
+
133
+ eval-agent run --suite SUITE.yaml --agent AGENT.yaml [--run-id ID] [--out DIR] [--no-llm-judge]
134
+ eval-agent run # interactive wizard (builds agent + tasks)
135
+ eval-agent report --run runs/ID # re-render report.html from result.json
136
+ eval-agent compare runs/a runs/b # combined leaderboard across runs
137
+ eval-agent index [--out runs] # overview page listing all runs -> runs/index.html
138
+
139
+ ## Testing
140
+
141
+ .venv/bin/python -m pytest
142
+
143
+ All scorer/adapter tests inject fakes (fake CLI scripts, fake HTTP post, fake
144
+ LLM completion), so the suite runs fully offline with no network or API key.
@@ -0,0 +1,114 @@
1
+ # eval-agent
2
+
3
+ A general framework for evaluating agents. It online-drives any agent through a
4
+ task suite, records its trajectory, scores it with four pluggable scorers
5
+ (objective checkpoints, LLM-as-judge, trajectory/process, efficiency), and emits
6
+ a JSON + static HTML report.
7
+
8
+ ## Install
9
+
10
+ Published on PyPI as `evalpilot` (the CLI is available as `eval-agent`,
11
+ `evalagent`, or `evalpilot` — all the same command).
12
+
13
+ Run with no install (recommended):
14
+
15
+ uvx evalpilot run
16
+
17
+ Install as a tool:
18
+
19
+ uv tool install evalpilot # or: pipx install evalpilot
20
+
21
+ For browser-UI (`webui`) agents, install the extra and the browser once:
22
+
23
+ uv tool install "evalpilot[webui]"
24
+ playwright install chromium
25
+
26
+ From source (development):
27
+
28
+ python -m venv .venv
29
+ .venv/bin/pip install -e ".[dev]"
30
+
31
+ ## Quick start
32
+
33
+ .venv/bin/eval-agent run --suite suites/example.yaml --agent configs/function-echo.yaml --run-id demo --no-llm-judge
34
+ open runs/demo/report.html
35
+
36
+ Or run it interactively — omit `--suite`/`--agent` and a wizard walks you through
37
+ defining the agent and tasks (no YAML needed):
38
+
39
+ eval-agent run
40
+
41
+ Browse all runs at once:
42
+
43
+ open runs/index.html
44
+
45
+ ## Concepts
46
+
47
+ - **Task / TaskSuite** — declarative units of evaluation (prompt, setup,
48
+ checkpoints, rubric, budget).
49
+ - **AgentAdapter** — narrow interface that drives the agent under test. Built-in:
50
+ `function`, `cli`, `http`, `webui`. Add a new agent by writing one subclass.
51
+ - **Trajectory** — the recorded run (steps, output, usage, final state). Sole
52
+ data source for all scorers.
53
+ - **Scorers** — checkpoint, llm_judge, trajectory, efficiency. Read-only,
54
+ pluggable, weighted. Checkpoints include file/command checks plus
55
+ `output_contains` / `output_matches` for grading text answers objectively.
56
+ - **Runner / Reporter** — orchestrate and report.
57
+
58
+ ## Adding an agent
59
+
60
+ Write a YAML config selecting an adapter type. For a CLI agent:
61
+
62
+ type: cli
63
+ name: my-agent
64
+ command: my-agent --prompt {prompt} --json-stream
65
+
66
+ For an in-process Python callable:
67
+
68
+ type: function
69
+ name: my-agent
70
+ entrypoint: my_module:run
71
+
72
+ For an OpenAI-style HTTP endpoint:
73
+
74
+ type: http
75
+ name: my-agent
76
+ url: http://localhost:8000/v1/chat/completions
77
+
78
+ For a browser-based (web UI) agent:
79
+
80
+ type: webui
81
+ name: my-chat-agent
82
+ url: https://my-agent.example.com/chat
83
+ input_selector: "textarea"
84
+ output_selector: ".message.assistant:last-child"
85
+
86
+ The `webui` adapter needs Playwright (an optional extra):
87
+
88
+ pip install -e ".[webui]"
89
+ playwright install chromium
90
+
91
+ It types the prompt into `input_selector`, waits until the reply text in
92
+ `output_selector` stops changing (default 1.5s), then scrapes it. Reuse a
93
+ logged-in session with `storage_state: ./auth.json`.
94
+
95
+ ## Scoring weights
96
+
97
+ Defaults: checkpoint=3, llm_judge=2, trajectory=1, efficiency=0. Override per
98
+ suite (`default_weights`) or per task (`weights`). Each scorer's sub-score and
99
+ its reason are always preserved in the report, not just the aggregate.
100
+
101
+ ## Commands
102
+
103
+ eval-agent run --suite SUITE.yaml --agent AGENT.yaml [--run-id ID] [--out DIR] [--no-llm-judge]
104
+ eval-agent run # interactive wizard (builds agent + tasks)
105
+ eval-agent report --run runs/ID # re-render report.html from result.json
106
+ eval-agent compare runs/a runs/b # combined leaderboard across runs
107
+ eval-agent index [--out runs] # overview page listing all runs -> runs/index.html
108
+
109
+ ## Testing
110
+
111
+ .venv/bin/python -m pytest
112
+
113
+ All scorer/adapter tests inject fakes (fake CLI scripts, fake HTTP post, fake
114
+ LLM completion), so the suite runs fully offline with no network or API key.
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
@@ -0,0 +1,39 @@
1
+ from __future__ import annotations
2
+ from abc import ABC, abstractmethod
3
+ from pathlib import Path
4
+ from eval_agent.models import Task, Trajectory
5
+
6
+
7
class AgentAdapter(ABC):
    """Abstract base for everything that can drive an agent under test.

    The only method the framework invokes is :meth:`run`. Whether a concrete
    adapter talks to a subprocess, an HTTP endpoint, or an in-process
    callable is an implementation detail hidden behind that single call.
    """

    # Default display name; concrete adapters override it per class or
    # per instance.
    name: str = "agent"

    @abstractmethod
    def run(self, task: Task, workspace: Path) -> Trajectory:
        """Execute ``task`` inside ``workspace`` and return the recorded run.

        Subclasses must override this; the base implementation does nothing.
        """
19
+
20
+
21
+ def build_adapter(config: dict) -> AgentAdapter:
22
+ """Factory: pick an adapter implementation from a config dict.
23
+
24
+ config["type"] in {"function", "cli", "http", "webui"}.
25
+ """
26
+ atype = config["type"]
27
+ if atype == "cli":
28
+ from eval_agent.adapters.cli import CLIAgentAdapter
29
+ return CLIAgentAdapter.from_config(config)
30
+ if atype == "http":
31
+ from eval_agent.adapters.http import HTTPAgentAdapter
32
+ return HTTPAgentAdapter.from_config(config)
33
+ if atype == "function":
34
+ from eval_agent.adapters.function import FunctionAgentAdapter
35
+ return FunctionAgentAdapter.from_config(config)
36
+ if atype == "webui":
37
+ from eval_agent.adapters.webui import WebUIAgentAdapter
38
+ return WebUIAgentAdapter.from_config(config)
39
+ raise ValueError(f"unknown adapter type: {atype}")
@@ -0,0 +1,79 @@
1
+ from __future__ import annotations
2
+ import json
3
+ import shlex
4
+ import subprocess
5
+ import time
6
+ from pathlib import Path
7
+ from eval_agent.adapters import AgentAdapter
8
+ from eval_agent.models import Task, Trajectory, Step, Usage
9
+
10
+
11
class CLIAgentAdapter(AgentAdapter):
    """Drive a command-line agent that emits a JSONL event stream on stdout.

    ``command_template`` is expanded with ``str.format``: the ``{prompt}``
    placeholder receives the shell-quoted task prompt, so any other literal
    brace in the template has to be doubled (``{{`` / ``}}``). The resulting
    command line is executed through the shell with ``cwd`` set to the task
    workspace.

    Recognised stdout events (one JSON object per line):

    * ``{"type": "tool_use", "name": ..., "input": ...}`` -> ``tool_call`` step
    * ``{"type": "text", "text": ...}`` -> ``message`` step
    * ``{"type": "result", "output": ..., "usage": {"total_tokens": ...}}``
      -> final output and token count

    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are ignored so that ordinary log output on stdout does not abort
    the run.
    """

    name = "cli-agent"

    def __init__(self, command_template: str, name: str = "cli-agent"):
        self.command_template = command_template
        self.name = name

    @classmethod
    def from_config(cls, config: dict) -> "CLIAgentAdapter":
        """Build the adapter from a config dict.

        Reads the required ``"command"`` key and the optional ``"name"`` key
        (default ``"cli-agent"``).
        """
        return cls(command_template=config["command"],
                   name=config.get("name", "cli-agent"))

    @staticmethod
    def _parse_stream(stdout: str) -> tuple:
        """Turn the captured stdout into ``(steps, final_output, total_tokens)``."""
        steps: list[Step] = []
        final_output = ""
        total_tokens = 0
        for raw_line in stdout.splitlines():
            line = raw_line.strip()
            if not line:
                continue
            try:
                event = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A bare number, string or array is valid JSON but not an event;
            # calling .get() on it would raise AttributeError.
            if not isinstance(event, dict):
                continue
            kind = event.get("type")
            if kind == "tool_use":
                steps.append(Step(index=len(steps), kind="tool_call",
                                  name=event.get("name"),
                                  payload=event.get("input", {})))
            elif kind == "text":
                steps.append(Step(index=len(steps), kind="message",
                                  payload={"text": event.get("text", "")}))
            elif kind == "result":
                final_output = event.get("output", "")
                # "usage" may be absent, null, or malformed; treat all of
                # those as "no token count reported".
                usage = event.get("usage")
                reported = usage.get("total_tokens") if isinstance(usage, dict) else None
                total_tokens = reported or 0
        return steps, final_output, total_tokens

    def run(self, task: Task, workspace: Path) -> Trajectory:
        """Run the command for ``task`` and convert its output to a Trajectory.

        Status is ``"timeout"`` when the process exceeds
        ``task.budget.timeout_seconds``, ``"success"`` on exit code 0 and
        ``"crash"`` otherwise (with the last 2000 characters of stderr as
        the error).
        """
        cmd = self.command_template.format(prompt=shlex.quote(task.prompt))
        start = time.monotonic()
        try:
            proc = subprocess.run(
                cmd, shell=True, cwd=str(workspace),
                capture_output=True, text=True,
                timeout=task.budget.timeout_seconds,
            )
        except subprocess.TimeoutExpired:
            return Trajectory(
                steps=[],
                usage=Usage(wall_seconds=time.monotonic() - start,
                            num_steps=0),
                status="timeout",
                error="process exceeded timeout",
            )

        steps, final_output, total_tokens = self._parse_stream(proc.stdout)

        status = "success" if proc.returncode == 0 else "crash"
        return Trajectory(
            steps=steps,
            final_output=final_output,
            usage=Usage(total_tokens=total_tokens,
                        wall_seconds=time.monotonic() - start,
                        num_steps=len(steps)),
            status=status,
            error=None if status == "success" else proc.stderr[-2000:],
        )
@@ -0,0 +1,61 @@
1
+ from __future__ import annotations
2
+ import importlib
3
+ import time
4
+ from pathlib import Path
5
+ from typing import Callable, Optional
6
+ from eval_agent.adapters import AgentAdapter
7
+ from eval_agent.models import Task, Trajectory, Step, Usage
8
+
9
+
10
class FunctionAgentAdapter(AgentAdapter):
    """Adapter that calls a Python callable in the current process.

    The callable is invoked as ``fn(task, workspace, record)``. ``workspace``
    is passed as a string and ``record(kind, name, payload)`` appends one
    step to the trajectory. The callable may return a dict with any of the
    optional keys ``final_output`` (str), ``tokens`` (int) and ``retries``
    (int); a falsy return value is treated as an empty dict.
    """

    name = "function-agent"

    def __init__(self, fn: Callable, name: Optional[str] = None):
        self.fn = fn
        # Only shadow the class-level default when a non-empty name is given.
        if name:
            self.name = name

    @classmethod
    def from_config(cls, config: dict) -> "FunctionAgentAdapter":
        """Resolve ``config["entrypoint"]`` (``"module.path:callable"``)."""
        module_path, attr = config["entrypoint"].split(":")
        module = importlib.import_module(module_path)
        target = getattr(module, attr)
        return cls(fn=target, name=config.get("name", "function-agent"))

    def run(self, task: Task, workspace: Path) -> Trajectory:
        """Call the wrapped function and package what it recorded.

        Any exception raised by the callable, or while assembling the
        result, yields a ``"crash"`` trajectory that keeps the steps
        recorded so far.
        """
        recorded: list[Step] = []

        def record(kind: str, name=None, payload=None):
            step = Step(index=len(recorded), kind=kind, name=name,
                        payload=payload or {})
            recorded.append(step)

        began = time.monotonic()
        try:
            result = self.fn(task, str(workspace), record) or {}
            duration = time.monotonic() - began
            output = result.get("final_output", "")
            usage = Usage(
                total_tokens=result.get("tokens", 0),
                wall_seconds=duration,
                num_steps=len(recorded),
                num_retries=result.get("retries", 0),
            )
            return Trajectory(steps=recorded, final_output=output,
                              usage=usage, status="success")
        except Exception as e:
            duration = time.monotonic() - began
            crash_usage = Usage(wall_seconds=duration, num_steps=len(recorded))
            return Trajectory(
                steps=recorded,
                usage=crash_usage,
                status="crash",
                error=f"{type(e).__name__}: {e}",
            )
@@ -0,0 +1,78 @@
1
+ from __future__ import annotations
2
+ import json as jsonlib
3
+ import time
4
+ from pathlib import Path
5
+ from typing import Callable, Optional
6
+ from eval_agent.adapters import AgentAdapter
7
+ from eval_agent.models import Task, Trajectory, Step, Usage
8
+
9
+
10
def _default_post(url: str, json_body: dict, timeout: Optional[float]):
    """Send ``json_body`` to ``url`` as JSON and return the decoded JSON reply.

    Uses ``urllib.request`` from the standard library. Supplying a request
    body makes urllib issue a POST. ``timeout`` is forwarded to ``urlopen``
    unchanged.
    """
    import urllib.request

    payload = jsonlib.dumps(json_body).encode()
    request = urllib.request.Request(
        url,
        data=payload,
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
    return jsonlib.loads(raw.decode())
17
+
18
+
19
class HTTPAgentAdapter(AgentAdapter):
    """Drive an OpenAI-style chat endpoint.

    The task prompt is sent as a single user message. The first choice of
    the response is converted into steps: its text content (if any) becomes
    a ``message`` step and each entry of ``tool_calls`` becomes a
    ``tool_call`` step.

    ``post(url, json_body, timeout) -> dict`` is injectable for testing.
    """

    name = "http-agent"

    def __init__(self, url: str, post: Callable = _default_post,
                 name: str = "http-agent"):
        self.url = url
        self.post = post
        self.name = name

    @classmethod
    def from_config(cls, config: dict) -> "HTTPAgentAdapter":
        """Build the adapter from a config dict.

        Reads the required ``"url"`` key and the optional ``"name"`` key
        (default ``"http-agent"``).
        """
        return cls(url=config["url"], name=config.get("name", "http-agent"))

    @staticmethod
    def _decode_arguments(arguments):
        """Normalise a tool call's ``arguments`` field into a step payload.

        A JSON string is decoded; undecodable text is kept under ``"raw"``.
        ``None`` becomes an empty dict and an already-decoded object is
        passed through. Anything else is preserved under ``"raw"``.
        """
        if arguments is None:
            return {}
        if isinstance(arguments, dict):
            return arguments
        if isinstance(arguments, (str, bytes, bytearray)):
            try:
                return jsonlib.loads(arguments)
            except ValueError:  # JSONDecodeError and UnicodeDecodeError
                return {"raw": arguments}
        return {"raw": arguments}

    @staticmethod
    def _crashed(start: float, exc: Exception) -> Trajectory:
        """Build the ``"crash"`` trajectory for a failed request or parse."""
        return Trajectory(
            usage=Usage(wall_seconds=time.monotonic() - start),
            status="crash",
            error=f"{type(exc).__name__}: {exc}",
        )

    def run(self, task: Task, workspace: Path) -> Trajectory:
        """POST the task prompt and convert the response into a Trajectory.

        Returns a ``"crash"`` trajectory when the request raises or the
        response does not have the expected ``choices[0].message`` shape;
        otherwise a ``"success"`` trajectory. ``workspace`` is not used by
        this adapter.
        """
        body = {"messages": [{"role": "user", "content": task.prompt}]}
        start = time.monotonic()
        try:
            resp = self.post(self.url, body, task.budget.timeout_seconds)
        except Exception as e:
            return self._crashed(start, e)

        try:
            message = resp["choices"][0]["message"]
            steps: list[Step] = []
            text = message.get("content") or ""
            if text:
                steps.append(Step(index=len(steps), kind="message",
                                  payload={"text": text}))
            for tc in message.get("tool_calls") or []:
                fn = tc.get("function") or {}
                steps.append(Step(index=len(steps), kind="tool_call",
                                  name=fn.get("name"),
                                  payload=self._decode_arguments(
                                      fn.get("arguments", "{}"))))
            # "usage" may be absent or null; read it inside the try so a
            # malformed value cannot escape as an unhandled exception.
            usage_info = resp.get("usage")
            reported = (usage_info.get("total_tokens")
                        if isinstance(usage_info, dict) else None)
            total_tokens = reported or 0
        except Exception as e:
            return self._crashed(start, e)

        return Trajectory(
            steps=steps,
            final_output=text,
            usage=Usage(total_tokens=total_tokens,
                        wall_seconds=time.monotonic() - start,
                        num_steps=len(steps)),
            status="success",
        )