evalpilot 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- eval_agent/__init__.py +1 -0
- eval_agent/adapters/__init__.py +39 -0
- eval_agent/adapters/cli.py +79 -0
- eval_agent/adapters/function.py +61 -0
- eval_agent/adapters/http.py +78 -0
- eval_agent/adapters/webui.py +197 -0
- eval_agent/aggregate.py +57 -0
- eval_agent/cli.py +106 -0
- eval_agent/examples/__init__.py +0 -0
- eval_agent/examples/echo.py +11 -0
- eval_agent/loader.py +35 -0
- eval_agent/models.py +79 -0
- eval_agent/reporter.py +70 -0
- eval_agent/runner.py +37 -0
- eval_agent/sandbox.py +43 -0
- eval_agent/scorers/__init__.py +34 -0
- eval_agent/scorers/checkpoint.py +66 -0
- eval_agent/scorers/efficiency.py +19 -0
- eval_agent/scorers/llm_judge.py +57 -0
- eval_agent/scorers/trajectory.py +50 -0
- eval_agent/templates/index.html.j2 +73 -0
- eval_agent/templates/report.html.j2 +68 -0
- eval_agent/wizard.py +132 -0
- evalpilot-0.1.0.dist-info/METADATA +144 -0
- evalpilot-0.1.0.dist-info/RECORD +29 -0
- evalpilot-0.1.0.dist-info/WHEEL +5 -0
- evalpilot-0.1.0.dist-info/entry_points.txt +4 -0
- evalpilot-0.1.0.dist-info/licenses/LICENSE +21 -0
- evalpilot-0.1.0.dist-info/top_level.txt +1 -0
eval_agent/models.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from typing import Any, Literal, Optional
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Budget(BaseModel):
    """Optional resource limits declared for a task.

    Every limit defaults to ``None``, i.e. no limit was specified.
    """

    # Token allowance for the run.
    max_tokens: Optional[int] = Field(default=None)
    # Allowed number of trajectory steps.
    max_steps: Optional[int] = Field(default=None)
    # Time allowance in seconds.
    timeout_seconds: Optional[float] = Field(default=None)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Checkpoint(BaseModel):
    """One check a task declares, identified by ``type`` with free-form ``args``."""

    # Name of the check to run.
    type: str
    # Arguments for that check; a fresh empty dict per instance when omitted.
    args: dict[str, Any] = Field(default_factory=lambda: dict())
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Task(BaseModel):
    """A single evaluation task: the prompt plus everything needed to grade it.

    Only ``id`` and ``prompt`` are required; every other field has an empty
    default.
    """

    # Identifier of the task.
    id: str
    # Prompt text for the task.
    prompt: str
    # Workspace setup description.
    setup: dict[str, Any] = Field(default_factory=dict)
    # Checks declared for this task.
    checkpoints: list[Checkpoint] = Field(default_factory=list)
    # Free-text grading rubric, if any.
    rubric: Optional[str] = Field(default=None)
    # Resource limits; defaults to a Budget with no limits set.
    budget: Budget = Field(default_factory=Budget)
    # Per-task weights (string key -> float).
    weights: dict[str, float] = Field(default_factory=dict)
    # Free-form labels.
    tags: list[str] = Field(default_factory=list)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class TaskSuite(BaseModel):
    """A named collection of tasks with suite-level default weights."""

    # Name of the suite.
    name: str
    # Tasks in the suite (required).
    tasks: list[Task]
    # Suite-wide default weights (string key -> float).
    default_weights: dict[str, float] = Field(default_factory=dict)
    # Free-form metadata.
    meta: dict[str, Any] = Field(default_factory=dict)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Step(BaseModel):
    """One recorded event in an agent trajectory."""

    # Position of the step.
    index: int
    # Event category; restricted to these three literals.
    kind: Literal["tool_call", "message", "result"]
    # Optional name attached to the step.
    name: Optional[str] = Field(default=None)
    # Arbitrary data attached to the step.
    payload: dict[str, Any] = Field(default_factory=dict)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class Usage(BaseModel):
    """Resource counters for one agent run; every counter starts at zero."""

    total_tokens: int = Field(default=0)
    wall_seconds: float = Field(default=0.0)
    num_steps: int = Field(default=0)
    num_retries: int = Field(default=0)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class Trajectory(BaseModel):
    """Everything recorded about one agent run on one task."""

    # Recorded steps; empty by default.
    steps: list[Step] = Field(default_factory=list)
    # Final text output of the run.
    final_output: str = Field(default="")
    # Resource counters for the run.
    usage: Usage = Field(default_factory=Usage)
    # Path to the workspace snapshot.
    final_state: Optional[str] = Field(default=None)
    # Outcome of the run; restricted to these three literals.
    status: Literal["success", "timeout", "crash"] = Field(default="success")
    # Error description, if any.
    error: Optional[str] = Field(default=None)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class Score(BaseModel):
    """The verdict of a single scorer for a single task."""

    # Name of the scorer that produced this score.
    scorer_name: str
    # Numeric score.
    value: float
    # Weight of this score; defaults to 1.0.
    weight: float = Field(default=1.0)
    # Human-readable explanation of the value.
    reason: str = Field(default="")
    # Availability flag; defaults to True.
    available: bool = Field(default=True)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class TaskResult(BaseModel):
    """Outcome of one task: the trajectory, the per-scorer scores, and the total."""

    task_id: str
    agent_name: str
    trajectory: Trajectory
    # Individual scorer results; empty by default.
    scores: list[Score] = Field(default_factory=list)
    # Combined score for the task.
    aggregate_score: float = Field(default=0.0)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class RunResult(BaseModel):
    """Outcome of running a whole suite once."""

    run_id: str
    suite_name: str
    # One entry per task; empty by default.
    task_results: list[TaskResult] = Field(default_factory=list)
    # Run-level statistics.
    stats: dict[str, Any] = Field(default_factory=dict)
    # Free-form metadata.
    meta: dict[str, Any] = Field(default_factory=dict)
|
eval_agent/reporter.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import json
|
|
3
|
+
import time
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from jinja2 import Environment, PackageLoader, select_autoescape
|
|
6
|
+
from eval_agent.models import RunResult
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def write_json(run_result: RunResult, out_dir: str) -> str:
    """Serialize ``run_result`` to ``<out_dir>/result.json``.

    ``out_dir`` is created (with parents) when missing. The file is written
    as UTF-8 explicitly: the serialized run can contain non-ASCII text (agent
    output, judge reasons), and relying on the platform locale encoding
    would corrupt it or raise on non-UTF-8 systems.

    Returns:
        The path of the written file, as a string.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    path = out / "result.json"
    path.write_text(run_result.model_dump_json(indent=2), encoding="utf-8")
    return str(path)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def write_html(run_results: list[RunResult], out_dir: str) -> str:
    """Render ``report.html.j2`` for ``run_results`` into ``<out_dir>/report.html``.

    ``out_dir`` is created (with parents) when missing.

    Returns:
        The path of the written file, as a string.
    """
    env = Environment(
        loader=PackageLoader("eval_agent", "templates"),
        # select_autoescape matches on the template-name suffix. The templates
        # are named "*.html.j2", which does not end in ".html", so "j2" must
        # be listed or escaping is silently disabled and scorer reasons /
        # step payloads would be emitted as raw HTML.
        autoescape=select_autoescape(["html", "j2"]),
    )
    template = env.get_template("report.html.j2")
    html = template.render(runs=run_results)
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    path = out / "report.html"
    # The template declares <meta charset="utf-8">; write matching bytes
    # instead of the platform locale encoding.
    path.write_text(html, encoding="utf-8")
    return str(path)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _scan_runs(out_dir: str) -> list[dict]:
|
|
32
|
+
"""Read each <out_dir>/*/result.json into a row dict, newest first.
|
|
33
|
+
Unreadable/malformed result files are skipped."""
|
|
34
|
+
base = Path(out_dir)
|
|
35
|
+
rows: list[dict] = []
|
|
36
|
+
for result_path in sorted(base.glob("*/result.json")):
|
|
37
|
+
try:
|
|
38
|
+
data = json.loads(result_path.read_text())
|
|
39
|
+
except Exception:
|
|
40
|
+
continue
|
|
41
|
+
stats = data.get("stats", {})
|
|
42
|
+
mtime = result_path.stat().st_mtime
|
|
43
|
+
rows.append({
|
|
44
|
+
"run_id": data.get("run_id", result_path.parent.name),
|
|
45
|
+
"agent": data.get("meta", {}).get("agent", ""),
|
|
46
|
+
"suite": data.get("suite_name", ""),
|
|
47
|
+
"mean_score": stats.get("mean_score", 0.0),
|
|
48
|
+
"num_tasks": stats.get("num_tasks", 0),
|
|
49
|
+
"num_success": stats.get("num_success", 0),
|
|
50
|
+
"mtime": mtime,
|
|
51
|
+
"time": time.strftime("%m-%d %H:%M", time.localtime(mtime)),
|
|
52
|
+
})
|
|
53
|
+
rows.sort(key=lambda r: r["mtime"], reverse=True)
|
|
54
|
+
return rows
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def write_index(out_dir: str) -> str:
    """Render ``<out_dir>/index.html`` listing all runs found under ``out_dir``.

    ``out_dir`` is created (with parents) when missing.

    Returns:
        The path of the written file, as a string.
    """
    rows = _scan_runs(out_dir)
    env = Environment(
        loader=PackageLoader("eval_agent", "templates"),
        # select_autoescape matches on the template-name suffix. The templates
        # are named "*.html.j2", which does not end in ".html", so "j2" must
        # be listed or escaping is silently disabled and values read from
        # result files would be emitted as raw HTML.
        autoescape=select_autoescape(["html", "j2"]),
    )
    template = env.get_template("index.html.j2")
    html = template.render(rows=rows)
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    path = out / "index.html"
    # The template declares <meta charset="utf-8"> and contains non-ASCII
    # text; write matching bytes instead of the platform locale encoding.
    path.write_text(html, encoding="utf-8")
    return str(path)
|
eval_agent/runner.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import shutil
|
|
3
|
+
from eval_agent.adapters import AgentAdapter
|
|
4
|
+
from eval_agent.scorers import Scorer
|
|
5
|
+
from eval_agent.sandbox import Sandbox
|
|
6
|
+
from eval_agent.aggregate import aggregate_task, aggregate_run, apply_weights
|
|
7
|
+
from eval_agent.models import TaskSuite, Task, TaskResult, RunResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class Runner:
    """Runs every task of a suite through one adapter and grades the outcome."""

    def __init__(self, adapter: AgentAdapter, scorers: list[Scorer]):
        """Remember the adapter to execute and the scorers to grade with."""
        self.adapter = adapter
        self.scorers = scorers

    def run(self, suite: TaskSuite, run_id: str) -> RunResult:
        """Run all tasks of ``suite`` in order and bundle them into a RunResult."""
        task_results = []
        for task in suite.tasks:
            task_results.append(self._run_task(task, suite))
        run_stats = aggregate_run(task_results)
        return RunResult(
            run_id=run_id,
            suite_name=suite.name,
            task_results=task_results,
            stats=run_stats,
            meta={"agent": self.adapter.name},
        )

    def _run_task(self, task: Task, suite: TaskSuite) -> TaskResult:
        """Execute one task in a fresh sandbox, then score its trajectory.

        The workspace is snapshotted before the sandbox is torn down so the
        scorers can still inspect it. The snapshot directory is removed once
        scoring finishes, whether or not scoring raised.
        """
        with Sandbox.create(task.setup) as workspace:
            trajectory = self.adapter.run(task, workspace.path)
            trajectory.final_state = workspace.snapshot()

        snapshot_dir = trajectory.final_state
        try:
            raw_scores = [scorer.score(task, trajectory) for scorer in self.scorers]
            weighted = apply_weights(raw_scores, task.weights, suite.default_weights)
            total = aggregate_task(weighted, task.weights, suite.default_weights)
            return TaskResult(
                task_id=task.id,
                agent_name=self.adapter.name,
                trajectory=trajectory,
                scores=weighted,
                aggregate_score=total,
            )
        finally:
            if snapshot_dir:
                shutil.rmtree(snapshot_dir, ignore_errors=True)
|
eval_agent/sandbox.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import shutil
|
|
3
|
+
import tempfile
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Sandbox:
    """Isolated per-task workspace backed by a temporary directory.

    Use as a context manager: the workspace directory is removed on exit.
    Snapshots taken with :meth:`snapshot` live in a separate temp dir and are
    NOT removed on exit; the caller owns them.

    setup schema:
        {"files": {"<relpath>": "<content>", ...}}
    """

    def __init__(self, path: Path):
        self.path = path
        self._snapshot_dir: Path | None = None

    @classmethod
    def create(cls, setup: dict[str, Any]) -> "Sandbox":
        """Create a fresh workspace and populate it from ``setup``.

        Raises:
            ValueError: if a setup file path would land outside the workspace.
        """
        path = Path(tempfile.mkdtemp(prefix="evalagent_ws_"))
        sb = cls(path)
        try:
            sb._apply_setup(setup or {})
        except BaseException:
            # __exit__ never runs for a sandbox that failed to build, so
            # remove the directory here rather than leaking it.
            shutil.rmtree(path, ignore_errors=True)
            raise
        return sb

    def _apply_setup(self, setup: dict[str, Any]) -> None:
        """Write every ``setup["files"]`` entry below the workspace root."""
        files = setup.get("files") or {}
        root = self.path.resolve()
        for relpath, content in files.items():
            target = (root / relpath).resolve()
            # Absolute paths and ".." segments would otherwise write outside
            # the workspace, defeating the isolation.
            if root not in target.parents:
                raise ValueError(
                    f"setup file path escapes the sandbox: {relpath!r}")
            target.parent.mkdir(parents=True, exist_ok=True)
            # Explicit UTF-8 so the content does not depend on the locale.
            target.write_text(content, encoding="utf-8")

    def snapshot(self) -> str:
        """Copy current workspace to a persistent temp dir; return its path."""
        self._snapshot_dir = Path(tempfile.mkdtemp(prefix="evalagent_snap_"))
        # mkdtemp already created the target, hence dirs_exist_ok.
        shutil.copytree(self.path, self._snapshot_dir, dirs_exist_ok=True)
        return str(self._snapshot_dir)

    def __enter__(self) -> "Sandbox":
        return self

    def __exit__(self, *exc) -> None:
        # Best-effort cleanup; returns None so exceptions still propagate.
        shutil.rmtree(self.path, ignore_errors=True)
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from abc import ABC, abstractmethod
|
|
3
|
+
from eval_agent.models import Task, Trajectory, Score
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class Scorer(ABC):
    """Base class for read-only graders.

    A scorer reads a Trajectory and returns a Score. It never mutates what it
    is given.
    """

    # Default scorer name; concrete scorers override it.
    name: str = "scorer"

    @abstractmethod
    def score(self, task: Task, trajectory: Trajectory) -> Score:
        """Grade ``trajectory`` for ``task`` and return the resulting Score."""
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def build_scorers(config: dict | None = None) -> list[Scorer]:
    """Factory: instantiate the default set of scorers.

    The checkpoint, trajectory and efficiency scorers are always built. The
    LLM judge is appended unless ``config["llm_judge"]["enabled"]`` is falsy;
    ``config["llm_judge"]["kwargs"]`` is forwarded to its constructor.

    A missing or null ``config``, ``llm_judge`` section or ``kwargs`` entry is
    treated as empty, so a config with ``llm_judge:`` left blank does not
    crash.

    Returns:
        The scorers in the order checkpoint, trajectory, efficiency, and
        llm_judge when enabled.
    """
    config = config or {}
    # Imported here, not at module top: the scorer modules import Scorer from
    # this module, so a top-level import would be circular.
    from eval_agent.scorers.checkpoint import CheckpointScorer
    from eval_agent.scorers.trajectory import TrajectoryScorer
    from eval_agent.scorers.efficiency import EfficiencyScorer

    scorers: list[Scorer] = [
        CheckpointScorer(),
        TrajectoryScorer(),
        EfficiencyScorer(),
    ]
    judge_config = config.get("llm_judge") or {}
    if judge_config.get("enabled", True):
        from eval_agent.scorers.llm_judge import LLMJudgeScorer
        scorers.append(LLMJudgeScorer(**(judge_config.get("kwargs") or {})))
    return scorers
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import json
|
|
3
|
+
import re
|
|
4
|
+
import subprocess
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from eval_agent.scorers import Scorer
|
|
7
|
+
from eval_agent.models import Task, Trajectory, Score, Checkpoint
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class CheckpointScorer(Scorer):
    """Scores a task by the fraction of its declared checkpoints that pass.

    Supported checkpoint types and the ``args`` they read:
        file_exists      path
        file_contains    path, pattern (plain substring)
        command_passes   cmd, optional timeout (seconds)
        json_equals      path, field (dotted key path), value
        output_contains  text, optional case_sensitive (default False)
        output_matches   pattern (regular expression)
        custom           script, optional timeout (seconds)

    File paths are resolved against the trajectory's ``final_state``
    directory. Unknown types and any error while checking count as a failed
    checkpoint.
    """

    name = "checkpoint"

    def score(self, task: Task, trajectory: Trajectory) -> Score:
        """Return passed/total as the score, or an unavailable Score if the
        task declares no checkpoints."""
        if not task.checkpoints:
            return Score(scorer_name=self.name, value=0.0, reason="no checkpoints",
                         available=False)
        root = Path(trajectory.final_state) if trajectory.final_state else None
        passed = 0
        details = []
        for cp in task.checkpoints:
            ok = self._check(cp, root, trajectory)
            passed += int(ok)
            details.append(f"{cp.type}={'ok' if ok else 'fail'}")
        value = passed / len(task.checkpoints)
        return Score(scorer_name=self.name, value=value,
                     reason=f"{passed}/{len(task.checkpoints)} passed | " +
                     ", ".join(details))

    def _check(self, cp: Checkpoint, root: Path | None,
               trajectory: Trajectory) -> bool:
        """Evaluate one checkpoint; never raises, failures map to False."""
        try:
            a = cp.args
            if cp.type == "file_exists":
                return root is not None and (root / a["path"]).exists()
            if cp.type == "file_contains":
                p = root / a["path"] if root else None
                # Read as UTF-8 so the result does not depend on the locale.
                return bool(p and p.exists()
                            and a["pattern"] in p.read_text(encoding="utf-8"))
            if cp.type == "command_passes":
                return self._shell_succeeds(a["cmd"], root, a.get("timeout"))
            if cp.type == "json_equals":
                p = root / a["path"] if root else None
                if not (p and p.exists()):
                    return False
                data = json.loads(p.read_text(encoding="utf-8"))
                # Walk the dotted path; a missing or non-dict hop yields None.
                for key in a["field"].split("."):
                    data = data.get(key) if isinstance(data, dict) else None
                return data == a["value"]
            if cp.type == "output_contains":
                out = trajectory.final_output or ""
                text = a["text"]
                if a.get("case_sensitive", False):
                    return text in out
                return text.lower() in out.lower()
            if cp.type == "output_matches":
                return re.search(a["pattern"], trajectory.final_output or "") is not None
            if cp.type == "custom":
                return self._shell_succeeds(a["script"], root, a.get("timeout"))
            return False
        except Exception:
            # Deliberate best-effort: a missing arg, unreadable file, bad
            # regex or timed-out command is a failed checkpoint.
            return False

    @staticmethod
    def _shell_succeeds(command: str, root: Path | None,
                        timeout: float | None = None) -> bool:
        """Run ``command`` through the shell; True when it exits with 0.

        With ``timeout`` set, a command that runs longer raises
        ``subprocess.TimeoutExpired``, which ``_check`` reports as a failure.
        Without it the call blocks until the command finishes.
        """
        # NOTE(review): shell=True executes the task author's string verbatim,
        # so task suites must come from a trusted source. When there is no
        # snapshot (root is None) the command runs in the current directory.
        result = subprocess.run(command, shell=True,
                                cwd=str(root) if root else None,
                                capture_output=True, timeout=timeout)
        return result.returncode == 0
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
from eval_agent.scorers import Scorer
|
|
3
|
+
from eval_agent.models import Task, Trajectory, Score
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class EfficiencyScorer(Scorer):
    """Reports token efficiency relative to the task's token budget.

    The score it returns always carries weight 0.0.
    """

    name = "efficiency"

    def score(self, task: Task, trajectory: Trajectory) -> Score:
        """Return the unused fraction of the token budget, clamped to [0, 1].

        Without a (non-zero) ``max_tokens`` budget the score is marked
        unavailable. The reason always lists the raw usage counters.
        """
        usage, budget = trajectory.usage, task.budget
        summary = (f"tokens={usage.total_tokens}, steps={usage.num_steps}, "
                   f"wall={usage.wall_seconds:.2f}s, retries={usage.num_retries}")

        if not budget.max_tokens:
            return Score(scorer_name=self.name, value=0.0, weight=0.0,
                         reason=summary + " (no token budget set)",
                         available=False)

        unused_fraction = 1 - usage.total_tokens / budget.max_tokens
        clamped = max(0.0, min(1.0, unused_fraction))
        return Score(scorer_name=self.name, value=clamped, weight=0.0,
                     reason=summary, available=True)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import json
|
|
3
|
+
from typing import Callable, Optional
|
|
4
|
+
from eval_agent.scorers import Scorer
|
|
5
|
+
from eval_agent.models import Task, Trajectory, Score
|
|
6
|
+
|
|
7
|
+
_SYSTEM = (
|
|
8
|
+
"You are a strict evaluation judge. Given a rubric and an agent's output, "
|
|
9
|
+
"return ONLY a JSON object: {\"score\": <float 0-1>, \"reasons\": [<str>, ...]}. "
|
|
10
|
+
"Anchor the score to the rubric's stated criteria."
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _anthropic_complete(model: str) -> Callable[[str, str], str]:
|
|
15
|
+
def complete(system: str, user: str) -> str:
|
|
16
|
+
import anthropic
|
|
17
|
+
client = anthropic.Anthropic()
|
|
18
|
+
resp = client.messages.create(
|
|
19
|
+
model=model, max_tokens=1024, system=system,
|
|
20
|
+
messages=[{"role": "user", "content": user}],
|
|
21
|
+
)
|
|
22
|
+
return resp.content[0].text
|
|
23
|
+
return complete
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class LLMJudgeScorer(Scorer):
    """Grades the agent's final output against the task rubric with an LLM.

    The judge is asked for ``{"score": <0-1>, "reasons": [...]}``. The reply
    is parsed leniently and the score is clamped to [0, 1].
    """

    name = "llm_judge"

    def __init__(self, complete: Optional[Callable[[str, str], str]] = None,
                 model: str = "claude-opus-4-8"):
        """Use ``complete(system, user) -> str`` as the judge backend.

        When ``complete`` is omitted, an Anthropic-backed callable for
        ``model`` is used instead.
        """
        self.complete = complete or _anthropic_complete(model)

    def score(self, task: Task, trajectory: Trajectory) -> Score:
        """Ask the judge to grade ``trajectory.final_output`` against the rubric.

        Returns an unavailable Score when the task has no rubric, and a 0.0
        Score with an explanatory reason when the reply cannot be parsed.
        Errors raised by the ``complete`` callable itself propagate.
        """
        if not task.rubric:
            return Score(scorer_name=self.name, value=0.0,
                         reason="no rubric provided", available=False)
        user = (f"RUBRIC:\n{task.rubric}\n\n"
                f"AGENT OUTPUT:\n{trajectory.final_output}\n\n"
                f"Return the JSON now.")
        raw = self.complete(_SYSTEM, user)
        try:
            data = json.loads(self._extract_json(raw))
            value = float(data["score"])
            # json.loads accepts the literal NaN; NaN != NaN detects it.
            if value != value:
                raise ValueError("judge returned NaN as score")
            # The judge is told to stay within 0-1 but may not comply.
            value = max(0.0, min(1.0, value))
            reasons = data.get("reasons") or []
            if isinstance(reasons, str):
                # A bare string would otherwise be joined character by character.
                reasons = [reasons]
            return Score(scorer_name=self.name, value=value,
                         reason="; ".join(str(r) for r in reasons))
        except (json.JSONDecodeError, KeyError, ValueError, TypeError):
            return Score(scorer_name=self.name, value=0.0,
                         reason=f"could not parse judge output: {raw[:200]}")

    @staticmethod
    def _extract_json(text: str) -> str:
        """Return the span from the first "{" to the last "}" of ``text``.

        Strips prose or code fences around the JSON object. Returns ``text``
        unchanged when no such span exists.
        """
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end < start:
            return text
        return text[start:end + 1]
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
import json
|
|
3
|
+
from eval_agent.scorers import Scorer
|
|
4
|
+
from eval_agent.models import Task, Trajectory, Score
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class TrajectoryScorer(Scorer):
    """Rule-based process quality. Degrades to unavailable for black-box
    agents that expose no steps.

    Starts at 1.0 and subtracts:
        0.4 when the same action occurs 3 or more times in a row,
        0.3 when the step count exceeds ``task.budget.max_steps``.
    """

    name = "trajectory"

    def score(self, task: Task, trajectory: Trajectory) -> Score:
        """Return the penalized process score, floored at 0.0."""
        steps = trajectory.steps
        if not steps:
            return Score(scorer_name=self.name, value=0.0,
                         reason="no steps recorded (black-box agent)",
                         available=False)

        penalties = []
        value = 1.0

        repeats = self._max_consecutive_repeats(trajectory)
        if repeats >= 3:
            value -= 0.4
            penalties.append(f"repeated identical action x{repeats}")

        if task.budget.max_steps and len(steps) > task.budget.max_steps:
            value -= 0.3
            penalties.append(
                f"exceeded max_steps ({len(steps)}>{task.budget.max_steps})")

        value = max(0.0, value)
        reason = "clean run" if not penalties else "; ".join(penalties)
        return Score(scorer_name=self.name, value=value, reason=reason)

    @staticmethod
    def _step_key(step) -> tuple:
        """Identity of a step for repeat detection: (name, canonical payload).

        Payloads may hold values that are not JSON-serializable. Those are
        rendered with ``repr`` instead of aborting the scorer. If
        serialization still fails (for example a circular reference), the
        payload's ``repr`` is used as a whole.
        """
        try:
            payload = json.dumps(step.payload, sort_keys=True, default=repr)
        except (TypeError, ValueError):
            payload = repr(step.payload)
        return (step.name, payload)

    @staticmethod
    def _max_consecutive_repeats(trajectory: Trajectory) -> int:
        """Length of the longest run of identical consecutive steps (min 1)."""
        best = 1
        run = 1
        prev = None
        for s in trajectory.steps:
            key = TrajectoryScorer._step_key(s)
            if key == prev:
                run += 1
                best = max(best, run)
            else:
                run = 1
            prev = key
        return best
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
<!doctype html>
{# Run index: one row per run, newest first as supplied in `rows`.
   Clicking a row opens that run's report; clicking a header sorts the table.
   Values are escaped explicitly with |e so the page stays safe even when the
   rendering environment does not enable autoescape for this file name. #}
<html>
<head>
<meta charset="utf-8">
<title>eval-agent runs</title>
<style>
body { font-family: system-ui, sans-serif; margin: 2rem; }
table { border-collapse: collapse; width: 100%; }
th, td { border: 1px solid #ccc; padding: 6px 10px; text-align: left; }
th { background: #f3f3f3; cursor: pointer; user-select: none; }
tr.run-row:hover { background: #f9f9f9; cursor: pointer; }
.empty { color: #777; }
</style>
</head>
<body>
<h1>eval-agent runs</h1>
{% if rows %}
<table id="runs">
  <thead>
    <tr>
      <th data-type="text">run-id</th>
      <th data-type="text">agent</th>
      <th data-type="text">suite</th>
      <th data-type="num">score</th>
      <th data-type="num">tasks</th>
      <th data-type="num">ok</th>
      <th data-type="text">time</th>
    </tr>
  </thead>
  <tbody>
    {% for r in rows %}
    {# urlencode: a run id containing spaces, "#", "?" etc. must still form a valid link. #}
    <tr class="run-row" data-href="{{ r.run_id|urlencode|e }}/report.html">
      <td>{{ r.run_id|e }}</td>
      <td>{{ r.agent|e }}</td>
      <td>{{ r.suite|e }}</td>
      <td>{{ "%.3f"|format(r.mean_score) }}</td>
      <td>{{ r.num_tasks|e }}</td>
      <td>{{ r.num_success|e }}</td>
      <td>{{ r.time|e }}</td>
    </tr>
    {% endfor %}
  </tbody>
</table>
<script>
// Row click: navigate to the run's report.
document.querySelectorAll("tr.run-row").forEach(function (row) {
  row.addEventListener("click", function () {
    window.location.href = row.dataset.href;
  });
});
// Header click: sort by that column, toggling ascending/descending.
document.querySelectorAll("#runs thead th").forEach(function (th, idx) {
  th.addEventListener("click", function () {
    var tbody = document.querySelector("#runs tbody");
    var rows = Array.prototype.slice.call(tbody.querySelectorAll("tr"));
    var numeric = th.dataset.type === "num";
    var asc = th.dataset.asc !== "true";
    th.dataset.asc = asc;
    rows.sort(function (a, b) {
      var x = a.children[idx].textContent.trim();
      var y = b.children[idx].textContent.trim();
      if (numeric) { x = parseFloat(x); y = parseFloat(y); }
      if (x < y) return asc ? -1 : 1;
      if (x > y) return asc ? 1 : -1;
      return 0;
    });
    // Re-appending moves each existing row into its sorted position.
    rows.forEach(function (r) { tbody.appendChild(r); });
  });
});
</script>
{% else %}
<p class="empty">暂无评测。先跑一次：<code>eval-agent run</code></p>
{% endif %}
</body>
</html>
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
<!doctype html>
{# Run report: a leaderboard over all `runs`, then per-task score tables and
   collapsible trajectories for each run.
   Values are escaped explicitly with |e so scorer reasons, step payloads and
   names cannot inject markup even when the rendering environment does not
   enable autoescape for this file name. #}
<html>
<head>
<meta charset="utf-8">
<title>Eval-Agent Report</title>
<style>
body { font-family: system-ui, sans-serif; margin: 2rem; }
table { border-collapse: collapse; margin-bottom: 2rem; }
th, td { border: 1px solid #ccc; padding: 6px 10px; text-align: left; }
th { background: #f3f3f3; }
.task { margin: 1rem 0; padding: 1rem; border: 1px solid #ddd; }
.reason { color: #555; font-size: 0.9em; }
.status-crash, .status-timeout { color: #b00; }
</style>
</head>
<body>
<h1>Eval-Agent Report</h1>

<h2>Leaderboard</h2>
<table>
  <tr><th>Agent</th><th>Suite</th><th>Tasks</th><th>Success</th><th>Mean score</th></tr>
  {% for rr in runs %}
  {# Label fallback chain: meta.agent, then the first task's agent name, then the run id. #}
  {% set agent_label = rr.meta.get("agent") or (rr.task_results[0].agent_name if rr.task_results else rr.run_id) %}
  <tr>
    <td>{{ agent_label|e }}</td>
    <td>{{ rr.suite_name|e }}</td>
    <td>{{ rr.stats.get("num_tasks", 0)|e }}</td>
    <td>{{ rr.stats.get("num_success", 0)|e }}</td>
    <td>{{ "%.3f"|format(rr.stats.get("mean_score", 0.0)) }}</td>
  </tr>
  {% endfor %}
</table>

{% for rr in runs %}
{% set agent_label = rr.meta.get("agent") or (rr.task_results[0].agent_name if rr.task_results else rr.run_id) %}
<h2>{{ agent_label|e }} — task details</h2>
{% for tr in rr.task_results %}
<div class="task">
  <h3>{{ tr.task_id|e }}
    <span class="status-{{ tr.trajectory.status|e }}">[{{ tr.trajectory.status|e }}]</span>
    — score {{ "%.3f"|format(tr.aggregate_score) }}
  </h3>
  <table>
    <tr><th>Scorer</th><th>Value</th><th>Weight</th><th>Reason</th></tr>
    {% for s in tr.scores %}
    <tr>
      <td>{{ s.scorer_name|e }}{% if not s.available %} (n/a){% endif %}</td>
      <td>{{ "%.3f"|format(s.value) }}</td>
      <td>{{ s.weight|e }}</td>
      <td class="reason">{{ s.reason|e }}</td>
    </tr>
    {% endfor %}
  </table>
  <details>
    <summary>Trajectory ({{ tr.trajectory.steps|length }} steps,
      {{ tr.trajectory.usage.total_tokens|e }} tokens)</summary>
    <ol>
      {% for step in tr.trajectory.steps %}
      <li>{{ step.kind|e }}{% if step.name %}: {{ step.name|e }}{% endif %}
        — {{ step.payload|e }}</li>
      {% endfor %}
    </ol>
  </details>
</div>
{% endfor %}
{% endfor %}
</body>
</html>
|