brooder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
brooder/recorder.py ADDED
@@ -0,0 +1,342 @@
1
+ """The SDK and the run-lifecycle core.
2
+
3
+ Public surface: the ``@record`` decorator and ``tool_call`` / ``turn`` capture. The CLI drives
4
+ behavior through env vars set before it executes your script:
5
+
6
+ BROODER_MODE "record" | "run" | "off"
7
+ BROODER_MODEL optional label stored on the run (used by `migrate`)
8
+ BROODER_RUN_INDEX which repetition this is (used for flakiness detection)
9
+
10
+ - record : every decorated call is saved as a golden baseline and collected in SESSION.
11
+ - run : every decorated call is saved as a run and collected in SESSION (diffing happens later).
12
+ - off : passthrough, no side effects (normal execution of your app).
13
+
14
+ Each call is captured as an ordered **trajectory** of steps (TURN / TOOL / FINAL). Diffing is
15
+ intentionally *not* done here — see ``analysis`` — so multiple runs of the same case can be grouped
16
+ to detect flakiness.
17
+
18
+ **Run lifecycle.** A :class:`RunHandle` owns one run's capture: opening it, appending steps,
19
+ finalizing and saving. ``@record`` uses a handle bound to the calling context (so nested
20
+ :func:`tool_call` / :func:`turn` — including provider auto-capture — attach to it). Framework
21
+ adapters instead call :func:`open_run` with the framework's own id (a trace or session), append
22
+ steps as events arrive — possibly from other threads — and :meth:`RunHandle.finish` when the run
23
+ ends; :func:`get_run` looks a handle back up by that id. Both paths feed the same ``Step`` model.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import functools
29
+ import inspect
30
+ import os
31
+ import threading
32
+ from collections.abc import Callable
33
+ from contextvars import ContextVar
34
+ from typing import Any, Optional
35
+
36
+ from . import storage
37
+ from .errors import RunawayError
38
+ from .log import get_logger
39
+ from .models import Run, Step, StepKind, make_case_id
40
+
41
# Module-wide logger, used for the debug traces emitted while capturing and saving runs.
_log = get_logger()

# The handle currently being recorded via @record (so nested steps attach to it).
# Bound by the ``@record`` wrappers for the duration of the agent call and read by the
# module-level ``tool_call`` / ``turn`` helpers; ``None`` means "no recorded agent is active".
_current: ContextVar[Optional[RunHandle]] = ContextVar("brooder_current_run", default=None)

# Runs captured during the most recent CLI invocation.
# Appended to by ``RunHandle.finish`` in "record" / "run" mode; emptied by ``reset_session``.
SESSION: list[Run] = []

# Adapter-owned runs keyed by an external id (trace_id / session_id), for out-of-band capture.
# Entries are added by ``open_run`` and removed by ``RunHandle.finish`` / ``reset_session``.
_registry: dict[str, RunHandle] = {}
# Guards every read and write of ``_registry``.
_registry_lock = threading.Lock()
52
+
53
+
54
def _mode() -> str:
    """Return the capture mode read from ``BROODER_MODE``, defaulting to ``"off"`` when unset."""
    return os.getenv("BROODER_MODE", "off")
56
+
57
+
58
def _model() -> Optional[str]:
    """Return the model label read from ``BROODER_MODEL``, or ``None`` when it is not set."""
    return os.getenv("BROODER_MODEL")
60
+
61
+
62
def _max_steps() -> int:
    """The ``trajectory.max_steps`` guardrail (0 = unlimited), passed in by the CLI.

    Reads ``BROODER_MAX_STEPS``. A missing, empty or non-integer value means "unlimited".
    A negative value is clamped to 0 as well: it would otherwise be truthy and always
    ``<= len(trajectory)``, flagging every run as a runaway on its very first step.

    Returns:
        The step cap as a non-negative integer; ``0`` disables the guardrail.
    """
    try:
        cap = int(os.environ.get("BROODER_MAX_STEPS", "0"))
    except ValueError:
        return 0
    return max(cap, 0)
68
+
69
+
70
def reset_session() -> None:
    """Start a CLI invocation from a clean slate.

    Empties :data:`SESSION` in place (the list object itself is kept, so existing references
    stay valid) and drops every adapter run still registered under an external id.
    """
    del SESSION[:]
    with _registry_lock:
        _registry.clear()
75
+
76
+
77
class RunHandle:
    """A handle to one run being captured — the single run-lifecycle implementation.

    Append steps with :meth:`turn` / :meth:`tool_call` / :meth:`add_step`, mark the terminal answer
    with :meth:`final`, and persist with :meth:`finish`. Instances are thread-safe so a framework
    adapter can receive events on export threads: every mutation of the wrapped run happens
    under the handle's own lock.

    ``@record`` opens a handle with ``raise_on_runaway=True`` so exceeding ``max_steps`` raises
    :class:`~brooder.errors.RunawayError` (aborting the Python loop). Adapters leave it ``False``:
    they usually can't cancel the framework's loop, so a runaway is *flagged* (via
    ``run.meta["runaway"]``) and further steps are dropped, rather than raised into the framework.
    """

    __slots__ = ("_external_id", "_finished", "_lock", "_raise_on_runaway", "run")

    def __init__(
        self,
        run: Run,
        *,
        external_id: Optional[str] = None,
        raise_on_runaway: bool = False,
    ) -> None:
        """Wrap ``run`` for capture; see :func:`open_run` for the arguments."""
        self.run = run
        self._external_id = external_id
        self._raise_on_runaway = raise_on_runaway
        self._finished = False
        # Non-reentrant: methods that take it must not call each other while holding it.
        self._lock = threading.Lock()

    def _append(self, step: Step) -> None:
        """Append a step, enforcing the ``max_steps`` guardrail (raise or flag per the mode).

        Raises:
            RunawayError: If the cap is reached and the handle was opened with
                ``raise_on_runaway=True``. The run is flagged as a runaway first.
        """
        with self._lock:
            cap = _max_steps()
            if cap and len(self.run.trajectory) >= cap:
                self.run.meta["runaway"] = True
                if self._raise_on_runaway:
                    raise RunawayError(cap)
                _log.debug(
                    "run %s/%s hit max_steps=%d; dropping further steps",
                    self.run.agent,
                    self.run.case_id,
                    cap,
                )
                return
            self.run.trajectory.append(step)

    def turn(self, meta: Optional[dict[str, Any]] = None) -> None:
        """Append a model-turn step (a decision point)."""
        self._append(Step(kind=StepKind.TURN, name="turn", args=meta or {}))

    def tool_call(
        self, name: str, args: Optional[dict[str, Any]] = None, result: Any = None
    ) -> None:
        """Append a tool-call step; ``result`` is the step's observation."""
        self._append(Step(kind=StepKind.TOOL, name=name, args=args or {}, result=result))

    def add_step(self, step: Step) -> None:
        """Append a pre-built :class:`~brooder.models.Step` (escape hatch for adapters)."""
        self._append(step)

    def set_inputs(self, inputs: Any) -> None:
        """Set the run inputs after opening and recompute its case id.

        Framework adapters often learn the agent's input mid-run (e.g. from a root span). The case
        id is derived from the inputs so a run can be matched to its baseline, so it is recomputed.
        """
        with self._lock:
            self.run.inputs = inputs
            self.run.case_id = make_case_id(self.run.agent, inputs)

    def final(self, output: Any) -> None:
        """Record the terminal answer: set the run output and append a ``FINAL`` step.

        The step is appended directly rather than through :meth:`_append`, so the
        ``max_steps`` cap does not apply to it.
        """
        with self._lock:
            self.run.output = output
            self.run.trajectory.append(Step(kind=StepKind.FINAL, name="final", result=output))

    def mark_runaway(self) -> None:
        """Flag the run as a guardrail hit (exceeded ``max_steps``)."""
        # Taken under the lock like every other mutation of the run, so the flag cannot
        # interleave with a step being appended from another thread.
        with self._lock:
            self.run.meta["runaway"] = True

    def finish(self, output: Any = None) -> Run:
        """Finalize and persist the run. Idempotent — extra calls are no-ops.

        Appends a ``FINAL`` step from ``output`` unless the run is a runaway or gave up (returned
        no answer). Then saves per the current mode (``record`` → baseline, ``run`` → run) and
        appends to :data:`SESSION`; ``off`` persists nothing. Unregisters any ``external_id`` —
        also when saving raises, so a failed save cannot leave a finished handle in the registry.

        Args:
            output: The agent's final output, or ``None`` if it gave up / was aborted.

        Returns:
            The finalized :class:`~brooder.models.Run`.
        """
        with self._lock:
            if self._finished:
                return self.run
            self._finished = True

        # The lock is released before this point: ``final`` acquires it again and it is
        # not reentrant.
        try:
            if self.run.runaway:
                _log.debug(
                    "run %s/%s aborted: exceeded max_steps", self.run.agent, self.run.case_id
                )
            elif output is not None:
                self.final(output)
            else:
                _log.debug(
                    "run %s/%s gave up: finished without a final answer",
                    self.run.agent,
                    self.run.case_id,
                )

            mode = _mode()
            if mode == "record":
                storage.save_baseline(self.run)
                SESSION.append(self.run)
                _log.debug("recorded baseline %s/%s", self.run.agent, self.run.case_id)
            elif mode == "run":
                storage.save_run(self.run)
                SESSION.append(self.run)
                _log.debug("captured run %s/%s", self.run.agent, self.run.case_id)
            # mode == "off": passthrough, no capture
        finally:
            # The handle is already marked finished, so it must not stay discoverable via
            # ``get_run`` — not even when persisting the run failed.
            if self._external_id is not None:
                with _registry_lock:
                    _registry.pop(self._external_id, None)
        return self.run
201
+
202
+
203
def open_run(
    agent: str,
    *,
    inputs: Any = None,
    model: Optional[str] = None,
    external_id: Optional[str] = None,
    raise_on_runaway: bool = False,
) -> RunHandle:
    """Start capturing a run whose lifecycle is driven by a framework adapter.

    :func:`record` ties a run to a single Python call; this entry point instead lets an adapter
    open a run for a framework-side unit of work (a trace or a session) and feed it steps as
    events show up — possibly out of order and from other threads. When ``external_id`` is
    given the handle is registered under it so :func:`get_run` can find it again; the adapter
    calls :meth:`RunHandle.finish` once the run is over.

    Args:
        agent: Logical agent name (groups baselines).
        inputs: The run inputs, from which the case id is derived. May be supplied later via
            :meth:`RunHandle.set_inputs` if not known at open time.
        model: Model label; when falsy, the CLI-provided ``BROODER_MODEL`` is used.
        external_id: A stable id (trace/session) to register the handle under for later lookup.
        raise_on_runaway: If ``True``, exceeding ``max_steps`` raises ``RunawayError`` (this is
            what ``@record`` uses). Adapters keep the default so a runaway is only flagged and
            nothing is raised into the framework's loop.

    Returns:
        A :class:`RunHandle` wrapping the freshly created run.
    """
    case_id = make_case_id(agent, inputs)
    label = model or _model()
    captured = Run(agent=agent, case_id=case_id, inputs=inputs, model=label)
    handle = RunHandle(captured, external_id=external_id, raise_on_runaway=raise_on_runaway)
    if external_id is None:
        return handle
    with _registry_lock:
        _registry[external_id] = handle
    return handle
242
+
243
+
244
def get_run(external_id: str) -> Optional[RunHandle]:
    """Look up the open run registered under ``external_id``; ``None`` when nothing matches."""
    with _registry_lock:
        handle = _registry.get(external_id)
    return handle
248
+
249
+
250
def tool_call(name: str, args: Optional[dict[str, Any]] = None, result: Any = None) -> None:
    """Add a tool-call step to the trajectory of the run currently being recorded.

    Intended to be called from inside a recorded agent, once per tool the agent invokes. When
    no recorded agent is active in the calling context nothing happens, so the call can stay
    in production code.

    Args:
        name: The tool/function name.
        args: The arguments passed to the tool.
        result: The value the tool returned (the step's observation).
    """
    active = _current.get()
    if active is None:
        return
    active.tool_call(name, args, result)
264
+
265
+
266
def turn(meta: Optional[dict[str, Any]] = None) -> None:
    """Add a model-turn step (a decision point) to the run currently being recorded.

    Provider auto-capture normally emits these; call it by hand to mark a turn in a
    hand-instrumented loop. Does nothing when no recorded agent is active.

    Args:
        meta: Optional metadata to attach to the turn (e.g. the provider).
    """
    active = _current.get()
    if active is None:
        return
    active.turn(meta)
278
+
279
+
280
def _open_recorded(name: str, args: tuple[Any, ...], kwargs: dict[str, Any]) -> RunHandle:
    """Open the run backing one ``@record`` call.

    The call's positional and keyword arguments become the run inputs (and so determine the
    case id). The handle raises on runaway so the guardrail can abort the agent's own loop.
    """
    call_inputs = {"args": list(args), "kwargs": dict(kwargs)}
    return open_run(name, inputs=call_inputs, raise_on_runaway=True)
285
+
286
+
287
def record(name: str) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Decorate an agent entrypoint so Brooder can record and replay it.

    Works on both plain and ``async def`` agents: an async agent is wrapped with a coroutine so the
    active-run :class:`~contextvars.ContextVar` stays bound across the agent's ``await`` points (and
    into any child tasks), letting nested :func:`tool_call` / provider auto-capture attach.

    The mode is read on every call. In ``off`` mode the wrapper is a true passthrough: it calls
    the agent directly and returns its result, without opening a run, deriving a case id from
    the arguments, binding the context variable or intercepting ``RunawayError``. In any other
    mode the call is captured; a ``RunawayError`` raised by the guardrail is absorbed (the run is
    saved as a runaway and the wrapper returns ``None``), while any other exception propagates
    and the run is not saved.

    Args:
        name: The logical agent name used to group its baselines.

    Returns:
        A decorator that wraps the agent function; the wrapper captures each call as a
        :class:`~brooder.models.Run` (its full trajectory) when Brooder is active, and is a
        passthrough otherwise.
    """

    def decorator(fn: Callable[..., Any]) -> Callable[..., Any]:
        if inspect.iscoroutinefunction(fn):

            @functools.wraps(fn)
            async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
                if _mode() == "off":
                    # Normal execution of the app: no capture, no bookkeeping.
                    return await fn(*args, **kwargs)

                handle = _open_recorded(name, args, kwargs)
                token = _current.set(handle)
                output = None
                try:
                    output = await fn(*args, **kwargs)
                except RunawayError:
                    # Guardrail hit: record the run as a runaway; no FINAL answer was produced.
                    handle.mark_runaway()
                finally:
                    # Always unbind, so later steps cannot attach to this run.
                    _current.reset(token)

                handle.finish(output)
                return output

            return async_wrapper

        @functools.wraps(fn)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            if _mode() == "off":
                # Normal execution of the app: no capture, no bookkeeping.
                return fn(*args, **kwargs)

            handle = _open_recorded(name, args, kwargs)
            token = _current.set(handle)
            output = None
            try:
                output = fn(*args, **kwargs)
            except RunawayError:
                # Guardrail hit: record the run as a runaway; no FINAL answer was produced.
                handle.mark_runaway()
            finally:
                # Always unbind, so later steps cannot attach to this run.
                _current.reset(token)

            handle.finish(output)
            return output

        return wrapper

    return decorator
brooder/report.py ADDED
@@ -0,0 +1,261 @@
1
+ """Rendering of diffs — terminal (rich), and machine-readable JSON / Markdown.
2
+
3
+ The terminal renderers (`print_summary` / `print_diff_detail`) are for humans; `diffs_to_summary` /
4
+ `render_json` / `render_markdown` produce the machine-readable output consumed by CI (`--json`), the
5
+ OTLP metric emitter, and the GitHub Action's PR comment (`--format markdown`).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ from typing import Any, Optional
12
+
13
+ from rich.console import Console
14
+ from rich.panel import Panel
15
+ from rich.table import Table
16
+
17
+ from .models import Diff, Verdict
18
+
19
# Shared rich console that the terminal renderers in this module print to.
console = Console()

# Bump when the shape of `diffs_to_summary` changes, so machine consumers can detect it.
SUMMARY_SCHEMA_VERSION = 1

# Verdict -> (display label, rich style). The style colors the label in the terminal output
# and is also used as the border style of the detail panel; the Markdown renderer only uses
# the label.
_VERDICT: dict[Verdict, tuple[str, str]] = {
    Verdict.PASS: ("PASS", "green"),
    Verdict.REGRESSED: ("REGRESSED", "red"),
    Verdict.NEW: ("NEW", "yellow"),
    Verdict.FLAKY: ("FLAKY", "magenta"),
}
30
+
31
+
32
def _fmt_step(summary: Optional[dict[str, Any]]) -> str:
    """Format a step summary (``{kind, name, args}``) compactly, e.g. ``TOOL search(q=x)``.

    A missing or empty summary renders as an em dash. The kind is upper-cased and at most the
    first two arguments are shown; the parentheses are omitted when there are no arguments.
    """
    if not summary:
        return "—"
    label = f"{str(summary.get('kind', '')).upper()} {summary.get('name', '')}"
    shown = list((summary.get("args") or {}).items())[:2]
    rendered = ", ".join(f"{key}={value}" for key, value in shown)
    if not rendered:
        return label
    return f"{label}({rendered})"
41
+
42
+
43
def _trajectory_headline(diff: Diff) -> Optional[str]:
    """Build the single "path diverged at step N" line for a diff.

    Only the first change whose path has the per-step form ``trajectory[N]`` is considered;
    ``None`` is returned when the diff has no such change.
    """
    prefix = "trajectory["
    hit = next(
        (c for c in diff.changes if c.path.startswith(prefix) and c.path.endswith("]")),
        None,
    )
    if hit is None:
        return None

    # Whatever sits between the brackets is the step index.
    index = hit.path[len(prefix) : -1]
    lead = f"path diverged at step {index}: "
    if hit.kind == "removed":
        return lead + f"was {_fmt_step(hit.before)}, now dropped"
    if hit.kind == "added":
        return lead + f"nothing before, now {_fmt_step(hit.after)}"
    return lead + f"was {_fmt_step(hit.before)}, now {_fmt_step(hit.after)}"
59
+
60
+
61
def _guardrail_line(diff: Diff) -> Optional[str]:
    """Return the guardrail note (runaway / gave-up) for a diff, if the current run hit one.

    The changes are scanned in order and the first guardrail change with a truthy ``after``
    value decides the note; ``None`` is returned when there is none.
    """
    notes = {
        "trajectory.runaway": "runaway: the agent exceeded max_steps and was aborted",
        "trajectory.gave_up": "gave up: the agent returned without a final answer",
    }
    for entry in diff.changes:
        if entry.path in notes and entry.after:
            return notes[entry.path]
    return None
69
+
70
+
71
def _trajectory_signal(diff: Diff) -> str:
    """Pick the one trajectory signal worth showing in the summary table.

    In decreasing priority: a guardrail terminal (runaway / gave-up) that is now set, a
    turn-count change, a step-count change, any per-step path change. An em dash is returned
    when none of them applies.
    """
    entries = diff.changes

    terminal = {"trajectory.runaway": "runaway!", "trajectory.gave_up": "gave up"}
    for entry in entries:
        if entry.path in terminal and entry.after:
            return terminal[entry.path]

    # Turn count outranks step count, hence the fixed order.
    for counter in ("turns", "steps"):
        hit = next((e for e in entries if e.path == f"trajectory.{counter}"), None)
        if hit is not None:
            return f"{counter} {hit.before}→{hit.after}"

    for entry in entries:
        if entry.path.startswith("trajectory["):
            return "path changed"
    return "—"
90
+
91
+
92
def print_summary(diffs: list[Diff]) -> None:
    """Render the results table on the console, one row per case.

    Args:
        diffs: The per-case diffs to summarize.
    """
    table = Table(title="Brooder results", title_style="bold")
    columns = (
        ("Agent", {}),
        ("Case", {}),
        ("Verdict", {}),
        ("Changes", {"justify": "right"}),
        ("Trajectory", {}),
        ("Stability", {"justify": "right"}),
    )
    for heading, options in columns:
        table.add_column(heading, **options)

    for diff in diffs:
        label, style = _VERDICT[diff.verdict]
        row = (
            diff.agent,
            diff.case_id,
            f"[{style}]{label}[/]",
            str(len(diff.changes)),
            _trajectory_signal(diff),
            str(diff.stability),
        )
        table.add_row(*row)
    console.print(table)
116
+
117
+
118
def print_diff_detail(diff: Diff) -> None:
    """Print a detailed, color-coded panel of the changes in a single diff.

    The panel opens with the guardrail note and the "path diverged" headline when there are
    any, followed by one entry per change (``-`` removed, ``+`` added, ``~`` modified with its
    before/after values).

    All text that comes from the diff (agent, case id, paths, before/after values, headline) is
    escaped before being embedded in rich markup. Captured values are arbitrary: bracketed
    text such as ``[note]`` would otherwise be consumed as a style tag, and an unmatched
    closing tag such as ``[/x]`` would make rich raise ``MarkupError``.

    Args:
        diff: The diff to render.
    """
    # Function-scope import: only this renderer needs it.
    from rich.markup import escape

    label, style = _VERDICT[diff.verdict]
    title = f"{escape(str(diff.agent))} · {escape(str(diff.case_id))} · [{style}]{label}[/]"
    if not diff.changes:
        console.print(Panel("No behavioral changes.", title=title, border_style=style))
        return

    lines: list[str] = []
    guardrail = _guardrail_line(diff)
    headline = _trajectory_headline(diff)
    if guardrail:
        lines.append(f"[bold red]⚠ {guardrail}[/]")
    if headline:
        # The headline embeds step names and arguments, i.e. captured data.
        lines.append(f"[bold]{escape(headline)}[/]")
    if lines:
        # Blank separator between the highlights and the per-change listing.
        lines.append("")
    for change in diff.changes:
        path = escape(str(change.path))
        if change.kind == "removed":
            lines.append(f"[red]- {path}[/] {escape(str(change.before))}")
        elif change.kind == "added":
            lines.append(f"[green]+ {path}[/] {escape(str(change.after))}")
        else:
            lines.append(f"[yellow]~ {path}[/]")
            lines.append(f"    before: {escape(str(change.before))}")
            lines.append(f"    after:  {escape(str(change.after))}")
    console.print(Panel("\n".join(lines), title=title, border_style=style))
149
+
150
+
151
+ # --- machine-readable output ------------------------------------------------
152
+
153
+
154
def diffs_to_summary(diffs: list[Diff]) -> dict[str, Any]:
    """Turn per-case diffs into a plain, JSON-safe summary dict.

    This is the one source of machine-readable output: `render_json`, the OTLP metric emitter
    and the GitHub Action's Markdown comment are all derived from it. Only the
    :class:`~brooder.models.Diff` objects are consulted (verdict, stability, changes);
    cost/latency are absent because the run model does not capture them yet.

    Args:
        diffs: The per-case diffs (one per case), as returned by ``analysis.analyze``.

    Returns:
        ``{schema_version, summary: {...counts, mean_stability}, cases: [...]}``.
    """
    total = len(diffs)
    tally = dict.fromkeys((verdict.value for verdict in Verdict), 0)
    stability_sum = 0
    # "regressions" is what gates CI: anything that is not PASS/NEW (regressed + flaky).
    failing = 0
    cases = []
    for diff in diffs:
        verdict = diff.verdict.value
        tally[verdict] += 1
        stability_sum += diff.stability
        passed = diff.ok
        if not passed:
            failing += 1
        changes = [
            {"path": c.path, "kind": c.kind, "before": c.before, "after": c.after}
            for c in diff.changes
        ]
        cases.append(
            {
                "agent": diff.agent,
                "case_id": diff.case_id,
                "verdict": verdict,
                "ok": passed,
                "stability": diff.stability,
                "change_count": len(diff.changes),
                "changes": changes,
            }
        )

    # An empty result set counts as fully stable.
    mean_stability = round(stability_sum / total) if total else 100
    overview = {
        "total": total,
        "passed": tally[Verdict.PASS.value],
        "regressed": tally[Verdict.REGRESSED.value],
        "new": tally[Verdict.NEW.value],
        "flaky": tally[Verdict.FLAKY.value],
        "regressions": failing,
        "mean_stability": mean_stability,
    }
    return {"schema_version": SUMMARY_SCHEMA_VERSION, "summary": overview, "cases": cases}
204
+
205
+
206
def render_json(diffs: list[Diff]) -> str:
    """Serialize the :func:`diffs_to_summary` payload as indented JSON with sorted keys.

    Values JSON cannot encode natively are written as their ``str()`` form.
    """
    payload = diffs_to_summary(diffs)
    return json.dumps(payload, indent=2, sort_keys=True, default=str)
209
+
210
+
211
def render_markdown(diffs: list[Diff]) -> str:
    """Build the Markdown report posted as a PR comment.

    The report is a headline, a table with one row per case and — when some case is not ok —
    a collapsible details block listing what changed in each of those cases. It relies on the
    same trajectory-signal helpers as the terminal renderer so both views agree.
    """
    stats = diffs_to_summary(diffs)["summary"]
    if stats["regressions"] == 0:
        headline = "✅ no behavioral regressions"
    else:
        headline = f"❌ {stats['regressions']} behavioral regression(s)"

    overview = (
        f"**{headline}** — {stats['total']} case(s): "
        f"{stats['passed']} passed · {stats['regressed']} regressed · "
        f"{stats['flaky']} flaky · {stats['new']} new · "
        f"mean stability {stats['mean_stability']}"
    )
    out = [
        "## Brooder results",
        "",
        overview,
        "",
        "| Agent | Case | Verdict | Changes | Trajectory | Stability |",
        "| --- | --- | --- | ---: | --- | ---: |",
    ]
    out.extend(
        f"| {diff.agent} | `{diff.case_id}` | {_VERDICT[diff.verdict][0]} | {len(diff.changes)} "
        f"| {_trajectory_signal(diff)} | {diff.stability} |"
        for diff in diffs
    )

    failing = [d for d in diffs if not d.ok]
    if failing:
        out.extend(["", "<details><summary>What changed</summary>", ""])
        for diff in failing:
            out.append(f"#### {diff.agent} · {diff.case_id} — {_VERDICT[diff.verdict][0]}")
            guardrail = _guardrail_line(diff)
            trajectory = _trajectory_headline(diff)
            if guardrail:
                out.append(f"- ⚠ {guardrail}")
            if trajectory:
                out.append(f"- {trajectory}")
            for change in diff.changes:
                if change.kind == "removed":
                    bullet = f"- `- {change.path}` — {change.before}"
                elif change.kind == "added":
                    bullet = f"- `+ {change.path}` — {change.after}"
                else:
                    bullet = f"- `~ {change.path}` — {change.before} → {change.after}"
                out.append(bullet)
            # Blank line between one case's section and the next.
            out.append("")
        out.append("</details>")
    # Exactly one trailing newline, whatever the last section left behind.
    return "\n".join(out).rstrip() + "\n"