brooder 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- brooder/__init__.py +31 -0
- brooder/analysis.py +79 -0
- brooder/cli.py +281 -0
- brooder/config.py +88 -0
- brooder/diffing.py +217 -0
- brooder/errors.py +31 -0
- brooder/integrations/__init__.py +75 -0
- brooder/integrations/anthropic.py +46 -0
- brooder/integrations/base.py +170 -0
- brooder/integrations/bedrock.py +49 -0
- brooder/integrations/claude_agent.py +164 -0
- brooder/integrations/google.py +61 -0
- brooder/integrations/langchain.py +321 -0
- brooder/integrations/openai.py +43 -0
- brooder/integrations/openai_agents.py +208 -0
- brooder/integrations/otel.py +216 -0
- brooder/judges.py +109 -0
- brooder/log.py +33 -0
- brooder/metrics.py +116 -0
- brooder/models.py +148 -0
- brooder/py.typed +1 -0
- brooder/recorder.py +342 -0
- brooder/report.py +261 -0
- brooder/storage.py +150 -0
- brooder-0.1.0.dist-info/METADATA +338 -0
- brooder-0.1.0.dist-info/RECORD +30 -0
- brooder-0.1.0.dist-info/WHEEL +4 -0
- brooder-0.1.0.dist-info/entry_points.txt +2 -0
- brooder-0.1.0.dist-info/licenses/LICENSE +201 -0
- brooder-0.1.0.dist-info/licenses/NOTICE +7 -0
brooder/recorder.py
ADDED
|
@@ -0,0 +1,342 @@
|
|
|
1
|
+
"""The SDK and the run-lifecycle core.
|
|
2
|
+
|
|
3
|
+
Public surface: the ``@record`` decorator and ``tool_call`` / ``turn`` capture. The CLI drives
|
|
4
|
+
behavior through env vars set before it executes your script:
|
|
5
|
+
|
|
6
|
+
BROODER_MODE "record" | "run" | "off"
|
|
7
|
+
BROODER_MODEL optional label stored on the run (used by `migrate`)
|
|
8
|
+
BROODER_RUN_INDEX which repetition this is (used for flakiness detection)
|
|
9
|
+
|
|
10
|
+
- record : every decorated call is saved as a golden baseline and collected in SESSION.
|
|
11
|
+
- run : every decorated call is saved as a run and collected in SESSION (diffing happens later).
|
|
12
|
+
- off : passthrough, no side effects (normal execution of your app).
|
|
13
|
+
|
|
14
|
+
Each call is captured as an ordered **trajectory** of steps (TURN / TOOL / FINAL). Diffing is
|
|
15
|
+
intentionally *not* done here — see ``analysis`` — so multiple runs of the same case can be grouped
|
|
16
|
+
to detect flakiness.
|
|
17
|
+
|
|
18
|
+
**Run lifecycle.** A :class:`RunHandle` owns one run's capture: opening it, appending steps,
|
|
19
|
+
finalizing and saving. ``@record`` uses a handle bound to the calling context (so nested
|
|
20
|
+
:func:`tool_call` / :func:`turn` — including provider auto-capture — attach to it). Framework
|
|
21
|
+
adapters instead call :func:`open_run` with the framework's own id (a trace or session), append
|
|
22
|
+
steps as events arrive — possibly from other threads — and :meth:`RunHandle.finish` when the run
|
|
23
|
+
ends; :func:`get_run` looks a handle back up by that id. Both paths feed the same ``Step`` model.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import functools
|
|
29
|
+
import inspect
|
|
30
|
+
import os
|
|
31
|
+
import threading
|
|
32
|
+
from collections.abc import Callable
|
|
33
|
+
from contextvars import ContextVar
|
|
34
|
+
from typing import Any, Optional
|
|
35
|
+
|
|
36
|
+
from . import storage
|
|
37
|
+
from .errors import RunawayError
|
|
38
|
+
from .log import get_logger
|
|
39
|
+
from .models import Run, Step, StepKind, make_case_id
|
|
40
|
+
|
|
41
|
+
_log = get_logger()
|
|
42
|
+
|
|
43
|
+
# The handle currently being recorded via @record (so nested steps attach to it).
|
|
44
|
+
_current: ContextVar[Optional[RunHandle]] = ContextVar("brooder_current_run", default=None)
|
|
45
|
+
|
|
46
|
+
# Runs captured during the most recent CLI invocation.
|
|
47
|
+
SESSION: list[Run] = []
|
|
48
|
+
|
|
49
|
+
# Adapter-owned runs keyed by an external id (trace_id / session_id), for out-of-band capture.
|
|
50
|
+
_registry: dict[str, RunHandle] = {}
|
|
51
|
+
_registry_lock = threading.Lock()
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _mode() -> str:
    """Return the active Brooder mode from ``BROODER_MODE`` (``"off"`` when the variable is unset)."""
    return os.getenv("BROODER_MODE", "off")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _model() -> Optional[str]:
    """Return the model label from ``BROODER_MODEL``, or ``None`` when the variable is unset."""
    return os.getenv("BROODER_MODEL")
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _max_steps() -> int:
    """Return the ``trajectory.max_steps`` guardrail (0 = unlimited), passed in by the CLI.

    The value is read from ``BROODER_MAX_STEPS`` on every call.

    Returns:
        The step cap as a non-negative integer. A missing, non-integer or negative value yields
        ``0``, i.e. no limit.
    """
    raw = os.environ.get("BROODER_MAX_STEPS", "0")
    try:
        cap = int(raw)
    except ValueError:
        return 0
    # A negative cap would satisfy ``len(trajectory) >= cap`` for every step and flag each run as
    # a runaway immediately; treat it like any other invalid value instead.
    return max(cap, 0)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def reset_session() -> None:
    """Empty :data:`SESSION` and forget every registered adapter run before a new CLI invocation."""
    with _registry_lock:
        _registry.clear()
    # Clear in place so other references to the SESSION list observe the reset.
    del SESSION[:]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class RunHandle:
    """A handle to one run being captured — the single run-lifecycle implementation.

    Append steps with :meth:`turn` / :meth:`tool_call` / :meth:`add_step`, mark the terminal answer
    with :meth:`final`, and persist with :meth:`finish`. Every mutation of the wrapped run happens
    under one per-handle lock, so a framework adapter can deliver events from export threads.

    ``@record`` opens a handle with ``raise_on_runaway=True`` so exceeding ``max_steps`` raises
    :class:`~brooder.errors.RunawayError` (aborting the Python loop). Adapters leave it ``False``:
    they usually can't cancel the framework's loop, so a runaway is *flagged* (via
    ``run.meta["runaway"]``) and further steps are dropped, rather than raised into the framework.
    """

    __slots__ = ("_external_id", "_finished", "_lock", "_raise_on_runaway", "run")

    def __init__(
        self,
        run: Run,
        *,
        external_id: Optional[str] = None,
        raise_on_runaway: bool = False,
    ) -> None:
        """Wrap ``run`` for capture; see :func:`open_run` for the arguments."""
        self.run = run
        self._external_id = external_id
        self._raise_on_runaway = raise_on_runaway
        self._finished = False
        # Plain (non-reentrant) lock: methods must not call each other while holding it.
        self._lock = threading.Lock()

    def _append(self, step: Step) -> None:
        """Append a step, enforcing the ``max_steps`` guardrail (raise or flag per the mode).

        Raises:
            RunawayError: If the cap is reached and the handle was opened with
                ``raise_on_runaway=True``.
        """
        with self._lock:
            cap = _max_steps()
            if cap and len(self.run.trajectory) >= cap:
                self.run.meta["runaway"] = True
                if self._raise_on_runaway:
                    raise RunawayError(cap)
                _log.debug(
                    "run %s/%s hit max_steps=%d; dropping further steps",
                    self.run.agent,
                    self.run.case_id,
                    cap,
                )
                return
            self.run.trajectory.append(step)

    def turn(self, meta: Optional[dict[str, Any]] = None) -> None:
        """Append a model-turn step (a decision point)."""
        self._append(Step(kind=StepKind.TURN, name="turn", args=meta or {}))

    def tool_call(
        self, name: str, args: Optional[dict[str, Any]] = None, result: Any = None
    ) -> None:
        """Append a tool-call step; ``result`` is the step's observation."""
        self._append(Step(kind=StepKind.TOOL, name=name, args=args or {}, result=result))

    def add_step(self, step: Step) -> None:
        """Append a pre-built :class:`~brooder.models.Step` (escape hatch for adapters)."""
        self._append(step)

    def set_inputs(self, inputs: Any) -> None:
        """Set the run inputs after opening and recompute its case id.

        Framework adapters often learn the agent's input mid-run (e.g. from a root span). The case
        id is derived from the inputs so a run can be matched to its baseline, so it is recomputed.
        """
        with self._lock:
            self.run.inputs = inputs
            self.run.case_id = make_case_id(self.run.agent, inputs)

    def final(self, output: Any) -> None:
        """Record the terminal answer: set the run output and append a ``FINAL`` step.

        The ``FINAL`` step is appended directly, so it is not subject to the ``max_steps`` cap.
        """
        with self._lock:
            self.run.output = output
            self.run.trajectory.append(Step(kind=StepKind.FINAL, name="final", result=output))

    def mark_runaway(self) -> None:
        """Flag the run as a guardrail hit (exceeded ``max_steps``)."""
        # Take the same lock ``_append`` holds when it writes this key, so the flag is set
        # consistently with concurrent step appends.
        with self._lock:
            self.run.meta["runaway"] = True

    def finish(self, output: Any = None) -> Run:
        """Finalize and persist the run. Idempotent — extra calls are no-ops.

        Appends a ``FINAL`` step from ``output`` unless the run is a runaway or gave up (returned
        no answer). Then saves per the current mode (``record`` → baseline, ``run`` → run) and
        appends to :data:`SESSION`; ``off`` persists nothing. Unregisters any ``external_id``.

        Args:
            output: The agent's final output, or ``None`` if it gave up / was aborted.

        Returns:
            The finalized :class:`~brooder.models.Run`.
        """
        with self._lock:
            if self._finished:
                return self.run
            self._finished = True

        # The lock is released here on purpose: ``final`` acquires the same non-reentrant lock.
        # NOTE(review): ``Run.runaway`` lives in ``models``; presumably it mirrors
        # ``meta["runaway"]`` — confirm.
        if self.run.runaway:
            _log.debug("run %s/%s aborted: exceeded max_steps", self.run.agent, self.run.case_id)
        elif output is not None:
            self.final(output)
        else:
            _log.debug(
                "run %s/%s gave up: finished without a final answer",
                self.run.agent,
                self.run.case_id,
            )

        mode = _mode()
        if mode == "record":
            storage.save_baseline(self.run)
            SESSION.append(self.run)
            _log.debug("recorded baseline %s/%s", self.run.agent, self.run.case_id)
        elif mode == "run":
            storage.save_run(self.run)
            SESSION.append(self.run)
            _log.debug("captured run %s/%s", self.run.agent, self.run.case_id)
        # mode == "off": passthrough, no capture

        if self._external_id is not None:
            with _registry_lock:
                _registry.pop(self._external_id, None)
        return self.run
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def open_run(
    agent: str,
    *,
    inputs: Any = None,
    model: Optional[str] = None,
    external_id: Optional[str] = None,
    raise_on_runaway: bool = False,
) -> RunHandle:
    """Start capturing a run on behalf of a framework adapter (out-of-band capture).

    :func:`record` owns the lifecycle of a single Python call on one thread. This function is for
    runs driven by a framework's own events — a trace or a session — whose steps arrive over time,
    possibly out of order and from other threads. Pass ``external_id`` to register the handle so
    it can be found again with :func:`get_run`, and call :meth:`RunHandle.finish` when the run
    ends.

    Args:
        agent: Logical agent name (groups baselines).
        inputs: The run inputs; used to derive the case id. If not known yet, set it later with
            :meth:`RunHandle.set_inputs`.
        model: Model label; defaults to the CLI-provided ``BROODER_MODEL``.
        external_id: A stable id (trace/session) to register the handle under for later lookup.
        raise_on_runaway: If ``True``, exceeding ``max_steps`` raises ``RunawayError`` (used by
            ``@record``). Adapters leave this ``False`` so a runaway is flagged, not raised into
            the framework's loop.

    Returns:
        A :class:`RunHandle`.
    """
    case_id = make_case_id(agent, inputs)
    label = model or _model()
    handle = RunHandle(
        Run(agent=agent, case_id=case_id, inputs=inputs, model=label),
        external_id=external_id,
        raise_on_runaway=raise_on_runaway,
    )
    if external_id is None:
        return handle
    with _registry_lock:
        # A later run opened under the same id replaces the earlier registration.
        _registry[external_id] = handle
    return handle
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def get_run(external_id: str) -> Optional[RunHandle]:
    """Look up the open run registered under ``external_id``.

    Returns:
        The registered :class:`RunHandle`, or ``None`` if nothing is registered under that id.
    """
    with _registry_lock:
        found = _registry.get(external_id)
    return found
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def tool_call(name: str, args: Optional[dict[str, Any]] = None, result: Any = None) -> None:
    """Add a tool-call step to the trajectory of the run currently being recorded.

    Invoke it inside a recorded agent once per tool the agent uses. Outside a recorded agent
    there is no active run and the call does nothing, so it can stay in production code.

    Args:
        name: The tool/function name.
        args: The arguments passed to the tool.
        result: The value the tool returned (the step's observation).
    """
    active = _current.get()
    if active is None:
        return
    active.tool_call(name, args, result)
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def turn(meta: Optional[dict[str, Any]] = None) -> None:
    """Add a model-turn step (a decision point) to the run currently being recorded.

    Provider auto-capture normally emits turns; call this yourself to mark one in a
    hand-instrumented loop. Outside a recorded agent it does nothing.

    Args:
        meta: Optional metadata to attach to the turn (e.g. the provider).
    """
    active = _current.get()
    if active is None:
        return
    active.turn(meta)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _open_recorded(name: str, args: tuple[Any, ...], kwargs: dict[str, Any]) -> RunHandle:
    """Open the run behind a ``@record`` call; its inputs (and case id) come from the call args."""
    # Copy the positional and keyword arguments so the stored inputs are independent containers.
    call_inputs = {"args": [*args], "kwargs": {**kwargs}}
    return open_run(name, inputs=call_inputs, raise_on_runaway=True)
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def record(name: str) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Decorate an agent entrypoint so Brooder can record and replay it.

    Works on both plain and ``async def`` agents: an async agent is wrapped with a coroutine so the
    active-run :class:`~contextvars.ContextVar` stays bound across the agent's ``await`` points (and
    into any child tasks), letting nested :func:`tool_call` / provider auto-capture attach.

    When ``BROODER_MODE`` is ``off`` (the default) the wrapper calls the agent directly: no run is
    opened, no case id is derived from the arguments, and no exception is intercepted. In any other
    mode a ``RunawayError`` raised by the ``max_steps`` guardrail is caught, the run is flagged as
    a runaway, and the wrapper returns ``None``.

    Args:
        name: The logical agent name used to group its baselines.

    Returns:
        A decorator that wraps the agent function; the wrapper captures each call as a
        :class:`~brooder.models.Run` (its full trajectory) when Brooder is active, and is a
        passthrough otherwise.
    """

    def decorator(fn: Callable[..., Any]) -> Callable[..., Any]:
        if inspect.iscoroutinefunction(fn):

            @functools.wraps(fn)
            async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
                if _mode() == "off":
                    # True passthrough: normal execution of the app, no capture overhead.
                    return await fn(*args, **kwargs)
                handle = _open_recorded(name, args, kwargs)
                token = _current.set(handle)
                output = None
                try:
                    output = await fn(*args, **kwargs)
                except RunawayError:
                    # Guardrail hit: record the run as a runaway; no FINAL answer was produced.
                    handle.mark_runaway()
                finally:
                    # Always unbind the active run, even when the agent raised.
                    _current.reset(token)

                handle.finish(output)
                return output

            return async_wrapper

        @functools.wraps(fn)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            if _mode() == "off":
                # True passthrough: normal execution of the app, no capture overhead.
                return fn(*args, **kwargs)
            handle = _open_recorded(name, args, kwargs)
            token = _current.set(handle)
            output = None
            try:
                output = fn(*args, **kwargs)
            except RunawayError:
                # Guardrail hit: record the run as a runaway; no FINAL answer was produced.
                handle.mark_runaway()
            finally:
                # Always unbind the active run, even when the agent raised.
                _current.reset(token)

            handle.finish(output)
            return output

        return wrapper

    return decorator
|
brooder/report.py
ADDED
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
"""Rendering of diffs — terminal (rich), and machine-readable JSON / Markdown.
|
|
2
|
+
|
|
3
|
+
The terminal renderers (`print_summary` / `print_diff_detail`) are for humans; `diffs_to_summary` /
|
|
4
|
+
`render_json` / `render_markdown` produce the machine-readable output consumed by CI (`--json`), the
|
|
5
|
+
OTLP metric emitter, and the GitHub Action's PR comment (`--format markdown`).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
from typing import Any, Optional
|
|
12
|
+
|
|
13
|
+
from rich.console import Console
|
|
14
|
+
from rich.panel import Panel
|
|
15
|
+
from rich.table import Table
|
|
16
|
+
|
|
17
|
+
from .models import Diff, Verdict
|
|
18
|
+
|
|
19
|
+
console = Console()
|
|
20
|
+
|
|
21
|
+
# Bump when the shape of `diffs_to_summary` changes, so machine consumers can detect it.
|
|
22
|
+
SUMMARY_SCHEMA_VERSION = 1
|
|
23
|
+
|
|
24
|
+
_VERDICT: dict[Verdict, tuple[str, str]] = {
|
|
25
|
+
Verdict.PASS: ("PASS", "green"),
|
|
26
|
+
Verdict.REGRESSED: ("REGRESSED", "red"),
|
|
27
|
+
Verdict.NEW: ("NEW", "yellow"),
|
|
28
|
+
Verdict.FLAKY: ("FLAKY", "magenta"),
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _fmt_step(summary: Optional[dict[str, Any]]) -> str:
    """Format a step summary (``{kind, name, args}``) for display, e.g. ``TOOL search(q=x)``.

    A missing or empty summary renders as an em dash. Only the first two arguments are shown.
    """
    if not summary:
        return "—"
    kind = str(summary.get("kind", "")).upper()
    label = f"{kind} {summary.get('name', '')}"
    shown = list((summary.get("args") or {}).items())[:2]
    rendered = ", ".join(f"{key}={value}" for key, value in shown)
    if not rendered:
        return label
    return f"{label}({rendered})"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _trajectory_headline(diff: Diff) -> Optional[str]:
    """Build the one-line "path diverged at step N" note from the first per-step change.

    Returns:
        The headline for the first change whose path has the form ``trajectory[...]``, or ``None``
        when the diff contains no such change.
    """
    prefix = "trajectory["
    first = next(
        (c for c in diff.changes if c.path.startswith(prefix) and c.path.endswith("]")),
        None,
    )
    if first is None:
        return None

    # The step label is whatever sits between the brackets of the path.
    step = first.path[len(prefix) : -1]
    if first.kind == "removed":
        return f"path diverged at step {step}: was {_fmt_step(first.before)}, now dropped"
    if first.kind == "added":
        return f"path diverged at step {step}: nothing before, now {_fmt_step(first.after)}"
    return (
        f"path diverged at step {step}: "
        f"was {_fmt_step(first.before)}, now {_fmt_step(first.after)}"
    )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _guardrail_line(diff: Diff) -> Optional[str]:
    """Describe the guardrail terminal (runaway / gave-up) the current run newly hit, if any.

    Returns:
        The note for the first ``trajectory.runaway`` / ``trajectory.gave_up`` change whose
        ``after`` value is truthy, or ``None`` when there is none.
    """
    notes = {
        "trajectory.runaway": "runaway: the agent exceeded max_steps and was aborted",
        "trajectory.gave_up": "gave up: the agent returned without a final answer",
    }
    for change in diff.changes:
        note = notes.get(change.path)
        if note is not None and change.after:
            return note
    return None
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _trajectory_signal(diff: Diff) -> str:
    """Pick the single most salient trajectory signal to show in the summary table.

    Priority: guardrail terminals (runaway / gave-up) > turn-count > step-count > path change.
    An em dash is returned when none of them applies.
    """
    changes = diff.changes

    terminals = {"trajectory.runaway": "runaway!", "trajectory.gave_up": "gave up"}
    for change in changes:
        flag = terminals.get(change.path)
        if flag is not None and change.after:
            return flag

    turns = next((c for c in changes if c.path == "trajectory.turns"), None)
    if turns is not None:
        return f"turns {turns.before}→{turns.after}"

    steps = next((c for c in changes if c.path == "trajectory.steps"), None)
    if steps is not None:
        return f"steps {steps.before}→{steps.after}"

    for change in changes:
        if change.path.startswith("trajectory["):
            return "path changed"
    return "—"
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def print_summary(diffs: list[Diff]) -> None:
    """Print a one-row-per-case results table.

    Dynamic cell text (agent, case id, trajectory signal) is escaped so that bracketed text in it
    is printed literally instead of being parsed as Rich console markup.

    Args:
        diffs: The per-case diffs to summarize.
    """
    from rich.markup import escape

    table = Table(title="Brooder results", title_style="bold")
    table.add_column("Agent")
    table.add_column("Case")
    table.add_column("Verdict")
    table.add_column("Changes", justify="right")
    table.add_column("Trajectory")
    table.add_column("Stability", justify="right")
    for diff in diffs:
        label, style = _VERDICT[diff.verdict]
        table.add_row(
            escape(str(diff.agent)),
            escape(str(diff.case_id)),
            # The verdict cell is the only one that intentionally carries markup.
            f"[{style}]{label}[/]",
            str(len(diff.changes)),
            escape(_trajectory_signal(diff)),
            str(diff.stability),
        )
    console.print(table)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def print_diff_detail(diff: Diff) -> None:
    """Print a detailed, color-coded panel of the changes in a single diff.

    Removed changes are shown in red, added ones in green, and anything else in yellow with its
    before/after values. All text taken from the diff (agent, case id, paths, values, step
    summaries) is escaped first: it can contain arbitrary agent output, and unescaped square
    brackets would be interpreted as Rich console markup — dropping text or raising a markup
    error on a stray closing tag.

    Args:
        diff: The diff to render.
    """
    from rich.markup import escape

    label, style = _VERDICT[diff.verdict]
    title = f"{escape(str(diff.agent))} · {escape(str(diff.case_id))} · [{style}]{label}[/]"
    if not diff.changes:
        console.print(Panel("No behavioral changes.", title=title, border_style=style))
        return

    lines: list[str] = []
    guardrail = _guardrail_line(diff)
    headline = _trajectory_headline(diff)
    if guardrail:
        # The guardrail note is a fixed string, so it needs no escaping.
        lines.append(f"[bold red]⚠ {guardrail}[/]")
    if headline:
        # The headline embeds step names and argument values from the runs.
        lines.append(f"[bold]{escape(headline)}[/]")
    if lines:
        # Blank separator between the highlights and the per-change listing.
        lines.append("")
    for change in diff.changes:
        path = escape(str(change.path))
        if change.kind == "removed":
            lines.append(f"[red]- {path}[/] {escape(str(change.before))}")
        elif change.kind == "added":
            lines.append(f"[green]+ {path}[/] {escape(str(change.after))}")
        else:
            lines.append(f"[yellow]~ {path}[/]")
            lines.append(f"    before: {escape(str(change.before))}")
            lines.append(f"    after:  {escape(str(change.after))}")
    console.print(Panel("\n".join(lines), title=title, border_style=style))
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
# --- machine-readable output ------------------------------------------------
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def diffs_to_summary(diffs: list[Diff]) -> dict[str, Any]:
    """Turn per-case diffs into a plain summary dict for machine consumers.

    This is the one place the machine-readable shape is defined: `render_json`, the OTLP metric
    emitter, and the GitHub Action's Markdown comment are all built on it. Only the
    :class:`~brooder.models.Diff` objects are consulted (verdict, stability, changes); cost/latency
    are absent because the run model does not capture them yet.

    Args:
        diffs: The per-case diffs (one per case), as returned by ``analysis.analyze``.

    Returns:
        ``{schema_version, summary: {...counts, mean_stability}, cases: [...]}``.
    """
    total = len(diffs)
    tally = dict.fromkeys((verdict.value for verdict in Verdict), 0)
    stability_sum = 0
    failing = 0
    cases = []
    for diff in diffs:
        tally[diff.verdict.value] += 1
        stability_sum += diff.stability
        # "regressions" is what gates CI: anything that is not PASS/NEW (regressed + flaky).
        if not diff.ok:
            failing += 1
        entries = [
            {"path": c.path, "kind": c.kind, "before": c.before, "after": c.after}
            for c in diff.changes
        ]
        cases.append(
            {
                "agent": diff.agent,
                "case_id": diff.case_id,
                "verdict": diff.verdict.value,
                "ok": diff.ok,
                "stability": diff.stability,
                "change_count": len(diff.changes),
                "changes": entries,
            }
        )

    # With no cases there is nothing unstable to report, so the mean defaults to 100.
    if total:
        mean_stability = round(stability_sum / total)
    else:
        mean_stability = 100

    overview = {
        "total": total,
        "passed": tally[Verdict.PASS.value],
        "regressed": tally[Verdict.REGRESSED.value],
        "new": tally[Verdict.NEW.value],
        "flaky": tally[Verdict.FLAKY.value],
        "regressions": failing,
        "mean_stability": mean_stability,
    }
    return {"schema_version": SUMMARY_SCHEMA_VERSION, "summary": overview, "cases": cases}
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def render_json(diffs: list[Diff]) -> str:
    """Serialize the :func:`diffs_to_summary` dict as indented JSON with sorted keys.

    Values that are not natively JSON-serializable are converted with ``str``.
    """
    payload = diffs_to_summary(diffs)
    return json.dumps(payload, default=str, sort_keys=True, indent=2)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def render_markdown(diffs: list[Diff]) -> str:
    """Build the Markdown report used for a PR comment.

    The report is a headline, a one-row-per-case table, and — when any case is not ok — a
    collapsible details block listing what changed. It shares the trajectory-signal helpers with
    the terminal renderer so both outputs agree.
    """
    stats = diffs_to_summary(diffs)["summary"]
    if stats["regressions"] == 0:
        headline = "✅ no behavioral regressions"
    else:
        headline = f"❌ {stats['regressions']} behavioral regression(s)"

    out = [
        "## Brooder results",
        "",
        (
            f"**{headline}** — {stats['total']} case(s): "
            f"{stats['passed']} passed · {stats['regressed']} regressed · "
            f"{stats['flaky']} flaky · {stats['new']} new · "
            f"mean stability {stats['mean_stability']}"
        ),
        "",
        "| Agent | Case | Verdict | Changes | Trajectory | Stability |",
        "| --- | --- | --- | ---: | --- | ---: |",
    ]
    out.extend(
        f"| {diff.agent} | `{diff.case_id}` | {_VERDICT[diff.verdict][0]} | {len(diff.changes)} "
        f"| {_trajectory_signal(diff)} | {diff.stability} |"
        for diff in diffs
    )

    failing = [diff for diff in diffs if not diff.ok]
    if failing:
        out.extend(["", "<details><summary>What changed</summary>", ""])
        for diff in failing:
            out.append(f"#### {diff.agent} · {diff.case_id} — {_VERDICT[diff.verdict][0]}")
            guardrail_note = _guardrail_line(diff)
            path_note = _trajectory_headline(diff)
            if guardrail_note:
                out.append(f"- ⚠ {guardrail_note}")
            if path_note:
                out.append(f"- {path_note}")
            for change in diff.changes:
                if change.kind == "removed":
                    bullet = f"- `- {change.path}` — {change.before}"
                elif change.kind == "added":
                    bullet = f"- `+ {change.path}` — {change.after}"
                else:
                    bullet = f"- `~ {change.path}` — {change.before} → {change.after}"
                out.append(bullet)
            # Blank line between the sections of consecutive cases.
            out.append("")
        out.append("</details>")

    # Normalize the tail to exactly one trailing newline.
    return "\n".join(out).rstrip() + "\n"
|