evalpilot 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
eval_agent/models.py ADDED
@@ -0,0 +1,79 @@
1
+ from __future__ import annotations
2
+ from typing import Any, Literal, Optional
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class Budget(BaseModel):
    """Optional per-task resource limits.

    Every limit defaults to ``None``, meaning "not set".
    """

    # Token ceiling for the task.
    max_tokens: Optional[int] = Field(default=None)
    # Step-count ceiling for the task.
    max_steps: Optional[int] = Field(default=None)
    # Time limit, in seconds.
    timeout_seconds: Optional[float] = Field(default=None)
10
+
11
+
12
class Checkpoint(BaseModel):
    """A single check to evaluate for a task, selected by ``type``."""

    # Name of the check to run.
    type: str
    # Check-specific parameters; an empty dict when omitted.
    args: dict[str, Any] = Field(default_factory=dict)
15
+
16
+
17
class Task(BaseModel):
    """One evaluation task: the prompt plus everything needed to grade it."""

    # Unique task identifier.
    id: str
    # Instruction given to the agent.
    prompt: str
    # Workspace setup description; empty when omitted.
    setup: dict[str, Any] = Field(default_factory=dict)
    # Checks to evaluate after the agent finishes.
    checkpoints: list[Checkpoint] = Field(default_factory=list)
    # Free-text grading rubric, if any.
    rubric: Optional[str] = Field(default=None)
    # Resource limits; defaults to a Budget with nothing set.
    budget: Budget = Field(default_factory=Budget)
    # Per-task weights (string key -> float).
    weights: dict[str, float] = Field(default_factory=dict)
    # Free-form labels.
    tags: list[str] = Field(default_factory=list)
26
+
27
+
28
class TaskSuite(BaseModel):
    """A named collection of tasks with suite-wide default weights."""

    # Suite name.
    name: str
    # Tasks in the suite (required).
    tasks: list[Task]
    # Suite-level weights (string key -> float).
    default_weights: dict[str, float] = Field(default_factory=dict)
    # Arbitrary extra metadata.
    meta: dict[str, Any] = Field(default_factory=dict)
33
+
34
+
35
class Step(BaseModel):
    """One recorded event in an agent trajectory."""

    # Position of the step within the trajectory.
    index: int
    # Event category; restricted to these three literals.
    kind: Literal["tool_call", "message", "result"]
    # Optional name associated with the step.
    name: Optional[str] = Field(default=None)
    # Arbitrary step data; empty when omitted.
    payload: dict[str, Any] = Field(default_factory=dict)
40
+
41
+
42
class Usage(BaseModel):
    """Resource-consumption counters for one run; all default to zero."""

    # Tokens consumed.
    total_tokens: int = Field(default=0)
    # Elapsed wall-clock time in seconds.
    wall_seconds: float = Field(default=0.0)
    # Number of steps taken.
    num_steps: int = Field(default=0)
    # Number of retries.
    num_retries: int = Field(default=0)
47
+
48
+
49
class Trajectory(BaseModel):
    """Everything recorded about one agent run on one task."""

    # Recorded steps; may be empty.
    steps: list[Step] = Field(default_factory=list)
    # The agent's final textual output.
    final_output: str = Field(default="")
    # Resource counters for the run.
    usage: Usage = Field(default_factory=Usage)
    # Path to workspace snapshot.
    final_state: Optional[str] = Field(default=None)
    # Outcome of the run; defaults to "success".
    status: Literal["success", "timeout", "crash"] = Field(default="success")
    # Error description, if any.
    error: Optional[str] = Field(default=None)
56
+
57
+
58
class Score(BaseModel):
    """The verdict of a single scorer on a single trajectory."""

    # Name of the scorer that produced this score.
    scorer_name: str
    # Numeric score.
    value: float
    # Weight attached to this score; defaults to 1.0.
    weight: float = Field(default=1.0)
    # Human-readable explanation.
    reason: str = Field(default="")
    # False when the scorer had nothing to grade.
    available: bool = Field(default=True)
64
+
65
+
66
class TaskResult(BaseModel):
    """The outcome of running one agent on one task."""

    # Identifier of the task that was run.
    task_id: str
    # Name of the agent that ran it.
    agent_name: str
    # Full record of the run.
    trajectory: Trajectory
    # Individual scorer verdicts.
    scores: list[Score] = Field(default_factory=list)
    # Combined score for the task; defaults to 0.0.
    aggregate_score: float = Field(default=0.0)
72
+
73
+
74
class RunResult(BaseModel):
    """The outcome of running a whole suite once."""

    # Identifier of this run.
    run_id: str
    # Name of the suite that was run.
    suite_name: str
    # One entry per executed task.
    task_results: list[TaskResult] = Field(default_factory=list)
    # Run-level summary statistics.
    stats: dict[str, Any] = Field(default_factory=dict)
    # Arbitrary extra metadata.
    meta: dict[str, Any] = Field(default_factory=dict)
eval_agent/reporter.py ADDED
@@ -0,0 +1,70 @@
1
+ from __future__ import annotations
2
+ import json
3
+ import time
4
+ from pathlib import Path
5
+ from jinja2 import Environment, PackageLoader, select_autoescape
6
+ from eval_agent.models import RunResult
7
+
8
+
9
def write_json(run_result: RunResult, out_dir: str) -> str:
    """Write *run_result* as JSON to ``<out_dir>/result.json``.

    Args:
        run_result: The run to serialize (via ``model_dump_json``).
        out_dir: Destination directory; created, with parents, if missing.

    Returns:
        The path of the written file, as a string.
    """
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    path = out / "result.json"
    # Write UTF-8 explicitly: the platform default encoding may be unable to
    # represent non-ASCII text in the dump, and JSON readers expect UTF-8.
    path.write_text(run_result.model_dump_json(indent=2), encoding="utf-8")
    return str(path)
15
+
16
+
17
def write_html(run_results: list[RunResult], out_dir: str) -> str:
    """Render the ``report.html.j2`` template to ``<out_dir>/report.html``.

    Args:
        run_results: Runs passed to the template as ``runs``.
        out_dir: Destination directory; created, with parents, if missing.

    Returns:
        The path of the written file, as a string.
    """
    env = Environment(
        loader=PackageLoader("eval_agent", "templates"),
        # select_autoescape matches on the template filename suffix. The
        # template is named "report.html.j2", which does not end in ".html",
        # so listing only "html" leaves escaping off. List the real suffix.
        autoescape=select_autoescape(["html", "html.j2"]),
    )
    template = env.get_template("report.html.j2")
    html = template.render(runs=run_results)
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    path = out / "report.html"
    # The page declares <meta charset="utf-8">, so write it as UTF-8.
    path.write_text(html, encoding="utf-8")
    return str(path)
29
+
30
+
31
+ def _scan_runs(out_dir: str) -> list[dict]:
32
+ """Read each <out_dir>/*/result.json into a row dict, newest first.
33
+ Unreadable/malformed result files are skipped."""
34
+ base = Path(out_dir)
35
+ rows: list[dict] = []
36
+ for result_path in sorted(base.glob("*/result.json")):
37
+ try:
38
+ data = json.loads(result_path.read_text())
39
+ except Exception:
40
+ continue
41
+ stats = data.get("stats", {})
42
+ mtime = result_path.stat().st_mtime
43
+ rows.append({
44
+ "run_id": data.get("run_id", result_path.parent.name),
45
+ "agent": data.get("meta", {}).get("agent", ""),
46
+ "suite": data.get("suite_name", ""),
47
+ "mean_score": stats.get("mean_score", 0.0),
48
+ "num_tasks": stats.get("num_tasks", 0),
49
+ "num_success": stats.get("num_success", 0),
50
+ "mtime": mtime,
51
+ "time": time.strftime("%m-%d %H:%M", time.localtime(mtime)),
52
+ })
53
+ rows.sort(key=lambda r: r["mtime"], reverse=True)
54
+ return rows
55
+
56
+
57
def write_index(out_dir: str) -> str:
    """Render ``<out_dir>/index.html`` listing all runs found under out_dir.

    Args:
        out_dir: Directory scanned for runs and written to; created, with
            parents, if missing.

    Returns:
        The path of the written file, as a string.
    """
    rows = _scan_runs(out_dir)
    env = Environment(
        loader=PackageLoader("eval_agent", "templates"),
        # select_autoescape matches on the template filename suffix. The
        # template is named "index.html.j2", which does not end in ".html",
        # so listing only "html" leaves escaping off. List the real suffix.
        autoescape=select_autoescape(["html", "html.j2"]),
    )
    template = env.get_template("index.html.j2")
    html = template.render(rows=rows)
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    path = out / "index.html"
    # The template contains non-ASCII text and declares charset utf-8, so the
    # locale default encoding could fail or produce a mis-encoded page.
    path.write_text(html, encoding="utf-8")
    return str(path)
eval_agent/runner.py ADDED
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+ import shutil
3
+ from eval_agent.adapters import AgentAdapter
4
+ from eval_agent.scorers import Scorer
5
+ from eval_agent.sandbox import Sandbox
6
+ from eval_agent.aggregate import aggregate_task, aggregate_run, apply_weights
7
+ from eval_agent.models import TaskSuite, Task, TaskResult, RunResult
8
+
9
+
10
class Runner:
    """Runs every task of a suite through one agent adapter and scores it."""

    def __init__(self, adapter: AgentAdapter, scorers: list[Scorer]):
        """Store the adapter to drive and the scorers applied to each task."""
        self.adapter = adapter
        self.scorers = scorers

    def run(self, suite: TaskSuite, run_id: str) -> RunResult:
        """Run all tasks in *suite*, in order, and bundle them into a RunResult."""
        task_results = []
        for task in suite.tasks:
            task_results.append(self._run_task(task, suite))
        summary = aggregate_run(task_results)
        return RunResult(
            run_id=run_id,
            suite_name=suite.name,
            task_results=task_results,
            stats=summary,
            meta={"agent": self.adapter.name},
        )

    def _run_task(self, task: Task, suite: TaskSuite) -> TaskResult:
        """Run one task in a fresh sandbox, then score the trajectory.

        The workspace is snapshotted before the sandbox exits. The snapshot
        directory is removed once scoring finishes, whether or not it raised.
        """
        with Sandbox.create(task.setup) as workspace:
            trajectory = self.adapter.run(task, workspace.path)
            trajectory.final_state = workspace.snapshot()

        snapshot_dir = trajectory.final_state
        try:
            raw_scores = []
            for scorer in self.scorers:
                raw_scores.append(scorer.score(task, trajectory))
            weighted = apply_weights(raw_scores, task.weights,
                                     suite.default_weights)
            combined = aggregate_task(weighted, task.weights,
                                      suite.default_weights)
            return TaskResult(
                task_id=task.id,
                agent_name=self.adapter.name,
                trajectory=trajectory,
                scores=weighted,
                aggregate_score=combined,
            )
        finally:
            if snapshot_dir:
                shutil.rmtree(snapshot_dir, ignore_errors=True)
eval_agent/sandbox.py ADDED
@@ -0,0 +1,43 @@
1
+ from __future__ import annotations
2
+ import shutil
3
+ import tempfile
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+
8
class Sandbox:
    """Isolated per-task workspace. Use as a context manager.

    setup schema:
        {"files": {"<relpath>": "<content>", ...}}

    Leaving the ``with`` block deletes the workspace directory. Directories
    returned by :meth:`snapshot` are not deleted by this class.
    """

    def __init__(self, path: Path):
        self.path = path
        self._snapshot_dir: Path | None = None

    @classmethod
    def create(cls, setup: dict[str, Any]) -> "Sandbox":
        """Create a fresh temp workspace and populate it from *setup*.

        Raises:
            ValueError: If a setup file path would land outside the workspace.
            OSError: If a file cannot be written.
        """
        path = Path(tempfile.mkdtemp(prefix="evalagent_ws_"))
        sb = cls(path)
        try:
            sb._apply_setup(setup or {})
        except Exception:
            # The caller never gets a Sandbox to __exit__, so clean up here.
            shutil.rmtree(path, ignore_errors=True)
            raise
        return sb

    def _apply_setup(self, setup: dict[str, Any]) -> None:
        """Write each ``setup["files"]`` entry beneath the workspace root."""
        files = setup.get("files") or {}
        root = self.path.resolve()
        for relpath, content in files.items():
            target = (root / relpath).resolve()
            # An absolute path or "../" segments would write outside the
            # workspace; refuse rather than touch unrelated files.
            try:
                target.relative_to(root)
            except ValueError:
                raise ValueError(
                    f"setup file path escapes the sandbox: {relpath!r}"
                ) from None
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(content, encoding="utf-8")

    def snapshot(self) -> str:
        """Copy current workspace to a persistent temp dir; return its path."""
        snap = Path(tempfile.mkdtemp(prefix="evalagent_snap_"))
        try:
            shutil.copytree(self.path, snap, dirs_exist_ok=True)
        except Exception:
            # Do not leave a half-copied snapshot behind.
            shutil.rmtree(snap, ignore_errors=True)
            raise
        self._snapshot_dir = snap
        return str(snap)

    def __enter__(self) -> "Sandbox":
        return self

    def __exit__(self, *exc) -> None:
        shutil.rmtree(self.path, ignore_errors=True)
@@ -0,0 +1,34 @@
1
+ from __future__ import annotations
2
+ from abc import ABC, abstractmethod
3
+ from eval_agent.models import Task, Trajectory, Score
4
+
5
+
6
class Scorer(ABC):
    """Base class for graders.

    A scorer only reads: it inspects a Trajectory and returns a Score,
    and must never mutate what it is given.
    """

    # Identifier reported in scores; subclasses override it.
    name: str = "scorer"

    @abstractmethod
    def score(self, task: Task, trajectory: Trajectory) -> Score:
        """Grade *trajectory* against *task* and return the resulting Score."""
14
+
15
+
16
def build_scorers(config: dict | None = None) -> list[Scorer]:
    """Factory: instantiate the default set of scorers.

    config may carry per-scorer options (e.g. llm_judge model). For the
    default build we instantiate all four.

    Recognized config keys:
        ``llm_judge.enabled``: set to a falsy value to leave out the LLM
            judge (default: enabled).
        ``llm_judge.kwargs``: keyword arguments forwarded to
            ``LLMJudgeScorer``.

    An ``llm_judge`` or ``kwargs`` entry that is present but empty or null is
    treated the same as a missing one.
    """
    config = config or {}
    # "or {}" rather than a .get default: a key explicitly set to null must
    # not raise when it is dereferenced below.
    judge_cfg = config.get("llm_judge") or {}

    # Imported here, not at module top: these modules import this one.
    from eval_agent.scorers.checkpoint import CheckpointScorer
    from eval_agent.scorers.trajectory import TrajectoryScorer
    from eval_agent.scorers.efficiency import EfficiencyScorer
    scorers: list[Scorer] = [
        CheckpointScorer(),
        TrajectoryScorer(),
        EfficiencyScorer(),
    ]
    if judge_cfg.get("enabled", True):
        from eval_agent.scorers.llm_judge import LLMJudgeScorer
        scorers.append(LLMJudgeScorer(**(judge_cfg.get("kwargs") or {})))
    return scorers
@@ -0,0 +1,66 @@
1
+ from __future__ import annotations
2
+ import json
3
+ import re
4
+ import subprocess
5
+ from pathlib import Path
6
+ from eval_agent.scorers import Scorer
7
+ from eval_agent.models import Task, Trajectory, Score, Checkpoint
8
+
9
+
10
class CheckpointScorer(Scorer):
    """Scores a task as the fraction of its checkpoints that pass.

    Supported checkpoint types and the ``args`` each one reads:
        file_exists:     path
        file_contains:   path, pattern (plain substring)
        command_passes:  cmd, optional timeout (seconds)
        json_equals:     path, field (dotted key path), value
        output_contains: text, optional case_sensitive (default False)
        output_matches:  pattern (regular expression)
        custom:          script, optional timeout (seconds)

    Any other type, and any check that raises, counts as failed.
    """

    name = "checkpoint"

    def score(self, task: Task, trajectory: Trajectory) -> Score:
        """Return passed/total as the score, or unavailable if no checkpoints."""
        if not task.checkpoints:
            return Score(scorer_name=self.name, value=0.0, reason="no checkpoints",
                         available=False)
        root = Path(trajectory.final_state) if trajectory.final_state else None
        passed = 0
        details = []
        for cp in task.checkpoints:
            ok = self._check(cp, root, trajectory)
            passed += int(ok)
            details.append(f"{cp.type}={'ok' if ok else 'fail'}")
        value = passed / len(task.checkpoints)
        return Score(scorer_name=self.name, value=value,
                     reason=f"{passed}/{len(task.checkpoints)} passed | " +
                     ", ".join(details))

    def _check(self, cp: Checkpoint, root: Path | None,
               trajectory: Trajectory) -> bool:
        """Evaluate one checkpoint; never raises.

        ``root`` is the workspace snapshot directory, or None when the
        trajectory has none (file checks then fail).
        """
        try:
            a = cp.args
            if cp.type == "file_exists":
                return root is not None and (root / a["path"]).exists()
            if cp.type == "file_contains":
                p = root / a["path"] if root else None
                return bool(p and p.exists()
                            and a["pattern"] in p.read_text(encoding="utf-8"))
            if cp.type == "command_passes":
                return self._shell_succeeds(a["cmd"], root, a.get("timeout"))
            if cp.type == "json_equals":
                p = root / a["path"] if root else None
                if not (p and p.exists()):
                    return False
                data = json.loads(p.read_text(encoding="utf-8"))
                # Walk the dotted path; a missing or non-dict level gives None.
                for key in a["field"].split("."):
                    data = data.get(key) if isinstance(data, dict) else None
                return data == a["value"]
            if cp.type == "output_contains":
                out = trajectory.final_output or ""
                text = a["text"]
                if a.get("case_sensitive", False):
                    return text in out
                return text.lower() in out.lower()
            if cp.type == "output_matches":
                return re.search(a["pattern"], trajectory.final_output or "") is not None
            if cp.type == "custom":
                return self._shell_succeeds(a["script"], root, a.get("timeout"))
            return False
        except Exception:
            # Deliberate best effort: a check that cannot be evaluated
            # (missing arg, unreadable file, bad regex, timeout) has failed.
            return False

    @staticmethod
    def _shell_succeeds(command: str, root: Path | None,
                        timeout: float | None = None) -> bool:
        """Run *command* through the shell in *root*; True on exit status 0.

        The command string comes from the task definition and is run with
        ``shell=True``; task suites must therefore be trusted. With
        *timeout* set, an overrunning command raises
        ``subprocess.TimeoutExpired``, which ``_check`` treats as a failure.
        Without it the command may run indefinitely.
        """
        cwd = str(root) if root else None
        r = subprocess.run(command, shell=True, cwd=cwd,
                           capture_output=True, timeout=timeout)
        return r.returncode == 0
@@ -0,0 +1,19 @@
1
+ from __future__ import annotations
2
+ from eval_agent.scorers import Scorer
3
+ from eval_agent.models import Task, Trajectory, Score
4
+
5
+
6
class EfficiencyScorer(Scorer):
    """Reports resource usage and scores token use against the token budget.

    The score always carries weight 0.0. It is marked unavailable when the
    task has no (or a zero) ``max_tokens`` budget.
    """

    name = "efficiency"

    def score(self, task: Task, trajectory: Trajectory) -> Score:
        """Return 1 - tokens/budget clamped to [0, 1], with a usage summary."""
        usage = trajectory.usage
        token_budget = task.budget.max_tokens
        summary = (f"tokens={usage.total_tokens}, steps={usage.num_steps}, "
                   f"wall={usage.wall_seconds:.2f}s, retries={usage.num_retries}")

        if not token_budget:
            return Score(
                scorer_name=self.name,
                value=0.0,
                weight=0.0,
                reason=summary + " (no token budget set)",
                available=False,
            )

        headroom = 1 - usage.total_tokens / token_budget
        clamped = max(0.0, min(1.0, headroom))
        return Score(
            scorer_name=self.name,
            value=clamped,
            weight=0.0,
            reason=summary,
            available=True,
        )
@@ -0,0 +1,57 @@
1
+ from __future__ import annotations
2
+ import json
3
+ from typing import Callable, Optional
4
+ from eval_agent.scorers import Scorer
5
+ from eval_agent.models import Task, Trajectory, Score
6
+
7
+ _SYSTEM = (
8
+ "You are a strict evaluation judge. Given a rubric and an agent's output, "
9
+ "return ONLY a JSON object: {\"score\": <float 0-1>, \"reasons\": [<str>, ...]}. "
10
+ "Anchor the score to the rubric's stated criteria."
11
+ )
12
+
13
+
14
+ def _anthropic_complete(model: str) -> Callable[[str, str], str]:
15
+ def complete(system: str, user: str) -> str:
16
+ import anthropic
17
+ client = anthropic.Anthropic()
18
+ resp = client.messages.create(
19
+ model=model, max_tokens=1024, system=system,
20
+ messages=[{"role": "user", "content": user}],
21
+ )
22
+ return resp.content[0].text
23
+ return complete
24
+
25
+
26
class LLMJudgeScorer(Scorer):
    """Grades the agent's final output against the task rubric with an LLM.

    Unavailable when the task has no rubric. A judge reply that cannot be
    parsed gives a score of 0.0 whose reason quotes the start of the reply.
    """

    name = "llm_judge"

    def __init__(self, complete: Optional[Callable[[str, str], str]] = None,
                 model: str = "claude-opus-4-8"):
        """Use *complete* as the ``(system, user) -> text`` backend.

        When *complete* is omitted, an Anthropic-backed one is built for
        *model*.
        """
        self.complete = complete or _anthropic_complete(model)

    def score(self, task: Task, trajectory: Trajectory) -> Score:
        """Ask the judge for a verdict and convert it into a Score.

        The judge's score is clamped to [0, 1]. NaN or infinite scores are
        treated as unparseable. ``reasons`` may be a list or a single string;
        non-string items are converted with ``str``.
        """
        import math

        if not task.rubric:
            return Score(scorer_name=self.name, value=0.0,
                         reason="no rubric provided", available=False)
        user = (f"RUBRIC:\n{task.rubric}\n\n"
                f"AGENT OUTPUT:\n{trajectory.final_output}\n\n"
                f"Return the JSON now.")
        raw = self.complete(_SYSTEM, user)
        try:
            data = json.loads(self._extract_json(raw))
            value = float(data["score"])
            if not math.isfinite(value):
                raise ValueError("judge score is not a finite number")
            # The prompt asks for 0-1, but nothing forces the model to comply
            # (e.g. it may answer on a 0-100 scale); keep scores in range.
            value = max(0.0, min(1.0, value))
            reasons = data.get("reasons") or []
            if isinstance(reasons, str):
                # A bare string would otherwise be joined character by character.
                reasons = [reasons]
            return Score(scorer_name=self.name, value=value,
                         reason="; ".join(str(r) for r in reasons))
        except (json.JSONDecodeError, KeyError, ValueError, TypeError):
            return Score(scorer_name=self.name, value=0.0,
                         reason=f"could not parse judge output: {raw[:200]}")

    @staticmethod
    def _extract_json(text: str) -> str:
        """Return the span from the first ``{`` to the last ``}`` in *text*.

        Returns *text* unchanged when either brace is missing.
        """
        start = text.find("{")
        end = text.rfind("}")
        if start == -1 or end == -1:
            return text
        return text[start:end + 1]
@@ -0,0 +1,50 @@
1
+ from __future__ import annotations
2
+ import json
3
+ from eval_agent.scorers import Scorer
4
+ from eval_agent.models import Task, Trajectory, Score
5
+
6
+
7
class TrajectoryScorer(Scorer):
    """Rule-based process quality. Degrades to unavailable for black-box
    agents that expose no steps.

    Starts from 1.0 and subtracts:
        0.4 if the same action occurs three or more times in a row;
        0.3 if the number of steps exceeds ``task.budget.max_steps``.
    The result is floored at 0.0.
    """

    name = "trajectory"

    def score(self, task: Task, trajectory: Trajectory) -> Score:
        """Apply the penalty rules and list the triggered ones as the reason."""
        steps = trajectory.steps
        if not steps:
            return Score(scorer_name=self.name, value=0.0,
                         reason="no steps recorded (black-box agent)",
                         available=False)

        penalties = []
        value = 1.0

        repeats = self._max_consecutive_repeats(trajectory)
        if repeats >= 3:
            value -= 0.4
            penalties.append(f"repeated identical action x{repeats}")

        if task.budget.max_steps and len(steps) > task.budget.max_steps:
            value -= 0.3
            penalties.append(
                f"exceeded max_steps ({len(steps)}>{task.budget.max_steps})")

        value = max(0.0, value)
        reason = "clean run" if not penalties else "; ".join(penalties)
        return Score(scorer_name=self.name, value=value, reason=reason)

    @staticmethod
    def _max_consecutive_repeats(trajectory: Trajectory) -> int:
        """Return the longest run of consecutive steps with equal name and payload.

        Returns 1 when there are no repeats (including for an empty trajectory).
        """
        best = 1
        run = 1
        prev = None
        for s in trajectory.steps:
            # The dump is only a comparison key. default=repr stops a payload
            # holding a non-JSON value from raising TypeError and aborting
            # scoring.
            key = (s.name, json.dumps(s.payload, sort_keys=True, default=repr))
            if key == prev:
                run += 1
                best = max(best, run)
            else:
                run = 1
            prev = key
        return best
@@ -0,0 +1,73 @@
1
<!doctype html>
<html>
<head>
  <meta charset="utf-8">
  <title>eval-agent runs</title>
  <style>
    body { font-family: system-ui, sans-serif; margin: 2rem; }
    table { border-collapse: collapse; width: 100%; }
    th, td { border: 1px solid #ccc; padding: 6px 10px; text-align: left; }
    th { background: #f3f3f3; cursor: pointer; user-select: none; }
    tr.run-row:hover { background: #f9f9f9; cursor: pointer; }
    .empty { color: #777; }
  </style>
</head>
<body>
  <h1>eval-agent runs</h1>
  {# Index of runs. Expects `rows`: a list of dicts with run_id, agent, suite,
     mean_score, num_tasks, num_success and time.
     Escaping is forced on here because row values come from result files on
     disk, and the environment's suffix-based autoescape selection may not
     match a ".j2" template name. #}
  {% autoescape true %}
  {% if rows %}
  <table id="runs">
    <thead>
      <tr>
        <th data-type="text">run-id</th>
        <th data-type="text">agent</th>
        <th data-type="text">suite</th>
        <th data-type="num">score</th>
        <th data-type="num">tasks</th>
        <th data-type="num">ok</th>
        <th data-type="text">time</th>
      </tr>
    </thead>
    <tbody>
      {% for r in rows %}
      {# urlencode keeps the link valid for run ids containing spaces, "#", "?" etc. #}
      <tr class="run-row" data-href="{{ r.run_id|urlencode }}/report.html">
        <td>{{ r.run_id }}</td>
        <td>{{ r.agent }}</td>
        <td>{{ r.suite }}</td>
        <td>{{ "%.3f"|format(r.mean_score) }}</td>
        <td>{{ r.num_tasks }}</td>
        <td>{{ r.num_success }}</td>
        <td>{{ r.time }}</td>
      </tr>
      {% endfor %}
    </tbody>
  </table>
  <script>
    // Clicking a row opens that run's report.
    document.querySelectorAll("tr.run-row").forEach(function (row) {
      row.addEventListener("click", function () {
        window.location.href = row.dataset.href;
      });
    });
    // Clicking a header sorts by that column, toggling direction each time.
    document.querySelectorAll("#runs thead th").forEach(function (th, idx) {
      th.addEventListener("click", function () {
        var tbody = document.querySelector("#runs tbody");
        var rows = Array.prototype.slice.call(tbody.querySelectorAll("tr"));
        var numeric = th.dataset.type === "num";
        var asc = th.dataset.asc !== "true";
        th.dataset.asc = asc;
        rows.sort(function (a, b) {
          var x = a.children[idx].textContent.trim();
          var y = b.children[idx].textContent.trim();
          if (numeric) { x = parseFloat(x); y = parseFloat(y); }
          if (x < y) return asc ? -1 : 1;
          if (x > y) return asc ? 1 : -1;
          return 0;
        });
        rows.forEach(function (r) { tbody.appendChild(r); });
      });
    });
  </script>
  {% else %}
  <p class="empty">暂无评测。先跑一次:<code>eval-agent run</code></p>
  {% endif %}
  {% endautoescape %}
</body>
</html>
@@ -0,0 +1,68 @@
1
<!doctype html>
<html>
<head>
  <meta charset="utf-8">
  <title>Eval-Agent Report</title>
  <style>
    body { font-family: system-ui, sans-serif; margin: 2rem; }
    table { border-collapse: collapse; margin-bottom: 2rem; }
    th, td { border: 1px solid #ccc; padding: 6px 10px; text-align: left; }
    th { background: #f3f3f3; }
    .task { margin: 1rem 0; padding: 1rem; border: 1px solid #ddd; }
    .reason { color: #555; font-size: 0.9em; }
    .status-crash, .status-timeout { color: #b00; }
  </style>
</head>
<body>
  <h1>Eval-Agent Report</h1>
  {# Report for one or more runs. Expects `runs`: a list of run results, each
     with suite_name, stats, meta, run_id and task_results.
     Escaping is forced on here because scorer reasons and step payloads carry
     agent- and model-generated text, and the environment's suffix-based
     autoescape selection may not match a ".j2" template name. #}
  {% autoescape true %}

  <h2>Leaderboard</h2>
  <table>
    <tr><th>Agent</th><th>Suite</th><th>Tasks</th><th>Success</th><th>Mean score</th></tr>
    {% for rr in runs %}
    {# Label preference: meta.agent, then the first task's agent_name, then run_id. #}
    {% set agent_label = rr.meta.get("agent") or (rr.task_results[0].agent_name if rr.task_results else rr.run_id) %}
    <tr>
      <td>{{ agent_label }}</td>
      <td>{{ rr.suite_name }}</td>
      <td>{{ rr.stats.get("num_tasks", 0) }}</td>
      <td>{{ rr.stats.get("num_success", 0) }}</td>
      <td>{{ "%.3f"|format(rr.stats.get("mean_score", 0.0)) }}</td>
    </tr>
    {% endfor %}
  </table>

  {% for rr in runs %}
  {% set agent_label = rr.meta.get("agent") or (rr.task_results[0].agent_name if rr.task_results else rr.run_id) %}
  <h2>{{ agent_label }} — task details</h2>
  {% for tr in rr.task_results %}
  <div class="task">
    <h3>{{ tr.task_id }}
      <span class="status-{{ tr.trajectory.status }}">[{{ tr.trajectory.status }}]</span>
      — score {{ "%.3f"|format(tr.aggregate_score) }}
    </h3>
    <table>
      <tr><th>Scorer</th><th>Value</th><th>Weight</th><th>Reason</th></tr>
      {% for s in tr.scores %}
      <tr>
        <td>{{ s.scorer_name }}{% if not s.available %} (n/a){% endif %}</td>
        <td>{{ "%.3f"|format(s.value) }}</td>
        <td>{{ s.weight }}</td>
        <td class="reason">{{ s.reason }}</td>
      </tr>
      {% endfor %}
    </table>
    <details>
      <summary>Trajectory ({{ tr.trajectory.steps|length }} steps,
        {{ tr.trajectory.usage.total_tokens }} tokens)</summary>
      <ol>
        {% for step in tr.trajectory.steps %}
        <li>{{ step.kind }}{% if step.name %}: {{ step.name }}{% endif %}
          — {{ step.payload }}</li>
        {% endfor %}
      </ol>
    </details>
  </div>
  {% endfor %}
  {% endfor %}
  {% endautoescape %}
</body>
</html>