brooder 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- brooder/__init__.py +31 -0
- brooder/analysis.py +79 -0
- brooder/cli.py +281 -0
- brooder/config.py +88 -0
- brooder/diffing.py +217 -0
- brooder/errors.py +31 -0
- brooder/integrations/__init__.py +75 -0
- brooder/integrations/anthropic.py +46 -0
- brooder/integrations/base.py +170 -0
- brooder/integrations/bedrock.py +49 -0
- brooder/integrations/claude_agent.py +164 -0
- brooder/integrations/google.py +61 -0
- brooder/integrations/langchain.py +321 -0
- brooder/integrations/openai.py +43 -0
- brooder/integrations/openai_agents.py +208 -0
- brooder/integrations/otel.py +216 -0
- brooder/judges.py +109 -0
- brooder/log.py +33 -0
- brooder/metrics.py +116 -0
- brooder/models.py +148 -0
- brooder/py.typed +1 -0
- brooder/recorder.py +342 -0
- brooder/report.py +261 -0
- brooder/storage.py +150 -0
- brooder-0.1.0.dist-info/METADATA +338 -0
- brooder-0.1.0.dist-info/RECORD +30 -0
- brooder-0.1.0.dist-info/WHEEL +4 -0
- brooder-0.1.0.dist-info/entry_points.txt +2 -0
- brooder-0.1.0.dist-info/licenses/LICENSE +201 -0
- brooder-0.1.0.dist-info/licenses/NOTICE +7 -0
brooder/storage.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""On-disk storage for baselines and runs.
|
|
2
|
+
|
|
3
|
+
Baselines are golden runs, committed to the repo so behavioral changes show up in code review.
|
|
4
|
+
Runs are the latest (uncommitted) executions used to compute diffs. Writes are atomic and
|
|
5
|
+
corrupt or unreadable records raise :class:`~brooder.errors.CorruptRecordError` instead of a raw parsing or I/O error.
|
|
6
|
+
|
|
7
|
+
.brooder/
|
|
8
|
+
baselines/<agent>/<case_id>.json # golden, committed
|
|
9
|
+
runs/<agent>/<case_id>.json # latest run, gitignored
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import os
|
|
15
|
+
import tempfile
|
|
16
|
+
from collections.abc import Iterator
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Optional
|
|
19
|
+
|
|
20
|
+
from pydantic import ValidationError
|
|
21
|
+
|
|
22
|
+
from .errors import CorruptRecordError
|
|
23
|
+
from .models import Run
|
|
24
|
+
|
|
25
|
+
# Name of the directory, directly under the project root, that holds all Brooder records.
BROODER_DIR = ".brooder"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _root(base: Optional[Path] = None) -> Path:
    """Return the ``BROODER_DIR`` directory under ``base``.

    Falls back to the current working directory when ``base`` is not given (or is falsy).
    """
    project_root = base if base else Path.cwd()
    return project_root / BROODER_DIR
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def baseline_path(agent: str, case_id: str, base: Optional[Path] = None) -> Path:
    """Build the location of the golden baseline file for one case.

    Args:
        agent: The agent name; used as the sub-directory under ``baselines``.
        case_id: The case id (see :func:`~brooder.models.make_case_id`); used as the file stem.
        base: Project root; defaults to the current working directory.

    Returns:
        The path to ``.brooder/baselines/<agent>/<case_id>.json``.
    """
    agent_dir = _root(base) / "baselines" / agent
    return agent_dir / f"{case_id}.json"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def run_path(agent: str, case_id: str, base: Optional[Path] = None) -> Path:
    """Build the location of the latest-run file for one case.

    Args:
        agent: The agent name; used as the sub-directory under ``runs``.
        case_id: The case id; used as the file stem.
        base: Project root; defaults to the current working directory.

    Returns:
        The path to ``.brooder/runs/<agent>/<case_id>.json``.
    """
    agent_dir = _root(base) / "runs" / agent
    return agent_dir / f"{case_id}.json"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _atomic_write(path: Path, text: str) -> None:
    """Write ``text`` to ``path`` atomically (temp file + ``os.replace``).

    The text is written, UTF-8 encoded, to a temporary ``*.tmp`` file created in ``path``'s
    own directory (which is created if missing); that file is then moved over ``path`` with
    ``os.replace``. If writing or replacing fails, the temporary file is removed on a
    best-effort basis and the original exception propagates unchanged.

    Args:
        path: Destination file; its parent directories are created as needed.
        text: The content to write.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # Same directory as the target so the final os.replace stays on one filesystem.
    fd, tmp = tempfile.mkstemp(dir=str(path.parent), suffix=".tmp")
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as handle:
            handle.write(text)
        os.replace(tmp, path)
    except BaseException:
        # Only the failure path can leave the temp file behind. Cleanup is best-effort so
        # that a failing remove never hides the error that actually aborted the write.
        try:
            os.remove(tmp)
        except OSError:
            pass
        raise
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _load(path: Path) -> Run:
    """Read ``path`` as UTF-8 text and parse it into a :class:`~brooder.models.Run`.

    Raises:
        CorruptRecordError: If reading the file raises ``OSError`` or parsing raises
            ``ValidationError`` / ``ValueError``; the original error is chained as the cause.
    """
    try:
        raw = path.read_text(encoding="utf-8")
        record = Run.model_validate_json(raw)
    except (ValidationError, ValueError, OSError) as cause:
        raise CorruptRecordError(f"{path} is not a valid Brooder record: {cause}") from cause
    return record
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def save_baseline(run: Run, base: Optional[Path] = None) -> None:
    """Store ``run`` as the golden baseline of its case.

    The destination is derived from ``run.agent`` and ``run.case_id``; the record is
    serialized as 2-space-indented JSON and written atomically.

    Args:
        run: The run to store as the baseline.
        base: Project root; defaults to the current working directory.
    """
    destination = baseline_path(run.agent, run.case_id, base)
    payload = run.model_dump_json(indent=2)
    _atomic_write(destination, payload)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def load_baseline(agent: str, case_id: str, base: Optional[Path] = None) -> Optional[Run]:
    """Fetch the golden baseline of a case when one has been stored.

    Args:
        agent: The agent name.
        case_id: The case id.
        base: Project root; defaults to the current working directory.

    Returns:
        The stored :class:`~brooder.models.Run`, or ``None`` if the baseline file does not exist.

    Raises:
        CorruptRecordError: If the baseline file exists but cannot be parsed.
    """
    path = baseline_path(agent, case_id, base)
    if not path.exists():
        return None
    return _load(path)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def save_run(run: Run, base: Optional[Path] = None) -> None:
    """Store ``run`` as the latest run of its case.

    The destination is derived from ``run.agent`` and ``run.case_id``; the record is
    serialized as 2-space-indented JSON and written atomically.

    Args:
        run: The run to store.
        base: Project root; defaults to the current working directory.
    """
    destination = run_path(run.agent, run.case_id, base)
    payload = run.model_dump_json(indent=2)
    _atomic_write(destination, payload)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def iter_runs(base: Optional[Path] = None) -> Iterator[Run]:
    """Lazily yield every latest-run record found under the ``runs`` directory.

    The directory is searched recursively for ``*.json`` files; nothing is yielded when the
    directory does not exist. Each file is parsed only when the iterator reaches it.

    Args:
        base: Project root; defaults to the current working directory.

    Yields:
        Each stored :class:`~brooder.models.Run`, sorted by path.

    Raises:
        CorruptRecordError: If a run file cannot be parsed.
    """
    runs_dir = _root(base) / "runs"
    if runs_dir.exists():
        record_files = sorted(runs_dir.rglob("*.json"))
        for record_file in record_files:
            yield _load(record_file)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def promote_runs_to_baselines(base: Optional[Path] = None) -> int:
    """Accept every latest run as the new baseline (like ``jest -u``).

    All run records are loaded before any baseline is written, so a corrupt run file aborts
    the promotion up front instead of leaving only part of the baselines updated.

    Args:
        base: Project root; defaults to the current working directory.

    Returns:
        The number of runs promoted to baselines.

    Raises:
        CorruptRecordError: If a run file cannot be parsed; no baseline is written in that case.
    """
    # Materialize first: iter_runs parses files lazily and would otherwise raise in the
    # middle of the writes, after some baselines had already been replaced.
    runs = list(iter_runs(base))
    for run in runs:
        save_baseline(run, base)
    return len(runs)
|
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: brooder
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Snapshot testing for AI agents — catch behavior regressions before they ship.
|
|
5
|
+
Project-URL: Homepage, https://brooder.dev
|
|
6
|
+
Project-URL: Repository, https://github.com/agentbrooder/brooder
|
|
7
|
+
Project-URL: Issues, https://github.com/agentbrooder/brooder/issues
|
|
8
|
+
Author: Brooder
|
|
9
|
+
License: Apache-2.0
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
License-File: NOTICE
|
|
12
|
+
Keywords: agents,ai,ci,evals,llm,regression,snapshot,testing
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Software Development :: Testing
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Requires-Dist: pydantic>=2.5
|
|
24
|
+
Requires-Dist: pyyaml>=6.0
|
|
25
|
+
Requires-Dist: rich>=13.0
|
|
26
|
+
Requires-Dist: typer>=0.12
|
|
27
|
+
Provides-Extra: claude-agent
|
|
28
|
+
Requires-Dist: claude-agent-sdk>=0.1; extra == 'claude-agent'
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: mypy>=1.11; extra == 'dev'
|
|
31
|
+
Requires-Dist: pre-commit>=3.7; extra == 'dev'
|
|
32
|
+
Requires-Dist: pytest-cov>=4.1; extra == 'dev'
|
|
33
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
34
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
35
|
+
Requires-Dist: types-pyyaml>=6.0; extra == 'dev'
|
|
36
|
+
Provides-Extra: docs
|
|
37
|
+
Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
|
|
38
|
+
Requires-Dist: mkdocstrings[python]>=0.25; extra == 'docs'
|
|
39
|
+
Provides-Extra: langchain
|
|
40
|
+
Requires-Dist: langchain-core>=0.3; extra == 'langchain'
|
|
41
|
+
Provides-Extra: openai-agents
|
|
42
|
+
Requires-Dist: openai-agents>=0.1; extra == 'openai-agents'
|
|
43
|
+
Provides-Extra: otel
|
|
44
|
+
Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.20; extra == 'otel'
|
|
45
|
+
Requires-Dist: opentelemetry-sdk>=1.20; extra == 'otel'
|
|
46
|
+
Description-Content-Type: text/markdown
|
|
47
|
+
|
|
48
|
+
<p align="center">
|
|
49
|
+
<img src="assets/banner.svg" alt="Brooder — snapshot testing for AI agents" width="760">
|
|
50
|
+
</p>
|
|
51
|
+
|
|
52
|
+
<p align="center">
|
|
53
|
+
<a href="https://github.com/agentbrooder/brooder/actions/workflows/ci.yml"><img src="https://github.com/agentbrooder/brooder/actions/workflows/ci.yml/badge.svg" alt="CI"></a>
|
|
54
|
+
<a href="https://pypi.org/project/brooder/"><img src="https://img.shields.io/pypi/v/brooder?color=3b82f6" alt="PyPI"></a>
|
|
55
|
+
<a href="https://pypi.org/project/brooder/"><img src="https://img.shields.io/pypi/pyversions/brooder" alt="Python versions"></a>
|
|
56
|
+
<a href="LICENSE"><img src="https://img.shields.io/badge/license-Apache--2.0-blue" alt="License: Apache-2.0"></a>
|
|
57
|
+
<a href="https://github.com/astral-sh/ruff"><img src="https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json" alt="Ruff"></a>
|
|
58
|
+
</p>
|
|
59
|
+
|
|
60
|
+
**Snapshot testing for AI agents. Catch behavior regressions before they ship.**
|
|
61
|
+
|
|
62
|
+
Your AI agent is one model upgrade away from silently breaking. You bump the model, tweak a
|
|
63
|
+
prompt, or change a tool — and the agent starts behaving differently. You find out from a customer.
|
|
64
|
+
|
|
65
|
+
Brooder is the safety net. Wrap your agent once, and Brooder records its real runs as **golden
|
|
66
|
+
baselines**. Every time you change the model, a prompt, or a tool, it re-runs and shows you a
|
|
67
|
+
**behavioral diff** — what changed, what broke — and fails your CI if it regressed.
|
|
68
|
+
|
|
69
|
+
No eval datasets to hand-write. One command. It's `jest --updateSnapshot`, but for agents.
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
pip install brooder
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
<p align="center">
|
|
76
|
+
<img src="assets/demo.svg" alt="brooder migrate catching a dropped tool call and a flipped answer" width="760">
|
|
77
|
+
</p>
|
|
78
|
+
|
|
79
|
+
> Status: early alpha, built in public. Apache-2.0.
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## 60-second demo (no API keys needed)
|
|
84
|
+
|
|
85
|
+
The included example agent simulates a model upgrade with an env var, so you can see Brooder catch
|
|
86
|
+
a real regression completely offline.
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
git clone https://github.com/agentbrooder/brooder && cd brooder
|
|
90
|
+
pip install -e .
|
|
91
|
+
|
|
92
|
+
# The signature move: what breaks if I migrate from one model to another?
|
|
93
|
+
brooder migrate --from gpt-4o --to gpt-5-new examples/regressing_agent.py
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Output (abridged):
|
|
97
|
+
|
|
98
|
+
```
|
|
99
|
+
──────────────────────── Model Migration Report ────────────────────────
|
|
100
|
+
1 of 3 cases change behavior when migrating gpt-4o → gpt-5-new.
|
|
101
|
+
|
|
102
|
+
support-agent · e1ded4070eee · REGRESSED · stability 40
|
|
103
|
+
path diverged at step 0: was TOOL create_ticket(order=12345), now dropped
|
|
104
|
+
- trajectory[0] {'name': 'create_ticket', 'args': {'order': '12345'}}
|
|
105
|
+
~ output
|
|
106
|
+
before: I've started your refund.
|
|
107
|
+
after: Refunds are not supported.
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
The "new model" silently stopped creating the refund ticket **and** flipped its answer. That would
|
|
111
|
+
have shipped to production unnoticed. Brooder caught it — and exited non-zero, so CI would block it.
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## The normal workflow
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
brooder record examples/regressing_agent.py # capture golden baselines from real runs
|
|
119
|
+
brooder run examples/regressing_agent.py # re-run after a change, diff vs baseline
|
|
120
|
+
brooder diff # see exactly what changed
|
|
121
|
+
brooder approve # accept the new behavior as the baseline
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
`brooder run` exits non-zero when behavior regressed — drop it into CI and it gates your PRs.
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
## Instrument your own agent
|
|
129
|
+
|
|
130
|
+
Add one decorator. Log tool calls with one function. That's the whole SDK.
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
import brooder
|
|
134
|
+
|
|
135
|
+
def search_kb(query):
|
|
136
|
+
brooder.tool_call("search_kb", {"query": query}, result="...")
|
|
137
|
+
return "..."
|
|
138
|
+
|
|
139
|
+
@brooder.record("support-agent")
|
|
140
|
+
def agent(question: str) -> str:
|
|
141
|
+
docs = search_kb(question)
|
|
142
|
+
return answer_from(docs)
|
|
143
|
+
|
|
144
|
+
# call it over your real inputs; brooder records/replays automatically
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
Then run it through the CLI. Baselines are plain JSON committed to your repo, so diffs show up in
|
|
148
|
+
code review like any other change.
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## Auto-capture (no manual `tool_call`)
|
|
153
|
+
|
|
154
|
+
Wrap your LLM client and Brooder records the model's tool-call decisions automatically:
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
import brooder
|
|
158
|
+
import openai
|
|
159
|
+
|
|
160
|
+
client = brooder.instrument(openai.OpenAI())
|
|
161
|
+
# now every client.chat.completions.create(...) call is captured while recording
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
Supported providers: **OpenAI**, **Azure OpenAI**, **Anthropic**, **AWS Bedrock**, and
|
|
165
|
+
**Google (Gemini / Vertex)**. The provider is auto-detected; override it with
|
|
166
|
+
`brooder.instrument(client, provider="bedrock")`. Model *names* are intentionally not diffed, so
|
|
167
|
+
switching models isn't itself a change — only the model's *behavior* (which tools it calls, with
|
|
168
|
+
what arguments) is.
|
|
169
|
+
|
|
170
|
+
**Async works too.** `@brooder.record` and `instrument(...)` handle `async def` agents and async
|
|
171
|
+
clients — `AsyncOpenAI`, `AsyncAzureOpenAI`, `AsyncAnthropic`, and Google's `generate_content_async`
|
|
172
|
+
— with no extra setup (the recording context follows your `await`s and into child tasks):
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
client = brooder.instrument(openai.AsyncOpenAI())
|
|
176
|
+
|
|
177
|
+
@brooder.record("support-agent")
|
|
178
|
+
async def agent(question: str) -> str:
|
|
179
|
+
await client.chat.completions.create(model="gpt-4o", messages=[...])
|
|
180
|
+
...
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
(Async AWS Bedrock via aioboto3 isn't covered yet — the sync boto3 client is.)
|
|
184
|
+
|
|
185
|
+
## Capture from agent frameworks (OpenTelemetry)
|
|
186
|
+
|
|
187
|
+
Building on an agent framework? If it emits OpenTelemetry GenAI spans — **LangGraph, CrewAI,
|
|
188
|
+
AutoGen**, and anything else on the convention — add one span processor and Brooder ingests the
|
|
189
|
+
whole trajectory, no manual `tool_call`:
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
from opentelemetry import trace
|
|
193
|
+
from brooder.integrations.otel import BrooderSpanProcessor
|
|
194
|
+
|
|
195
|
+
trace.get_tracer_provider().add_span_processor(BrooderSpanProcessor(agent="support-agent"))
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
It maps inference spans → turns, `execute_tool` spans → tool calls, and the agent-root span's
|
|
199
|
+
input/output → the case identity and final answer. It also drops straight into the OTel pipelines
|
|
200
|
+
you already run (Datadog / Arize / Honeycomb).
|
|
201
|
+
|
|
202
|
+
Building directly on the **Claude Agent SDK**? Register Brooder's hooks and it records the tool
|
|
203
|
+
trajectory automatically:
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
import brooder
|
|
207
|
+
from claude_agent_sdk import ClaudeSDKClient, ClaudeAgentOptions, ResultMessage
|
|
208
|
+
from brooder.integrations import claude_agent
|
|
209
|
+
|
|
210
|
+
options = ClaudeAgentOptions(hooks=brooder.claude_agent_hooks(agent="support-agent"))
|
|
211
|
+
async with ClaudeSDKClient(options=options) as client:
|
|
212
|
+
await client.query(prompt)
|
|
213
|
+
async for msg in client.receive_response():
|
|
214
|
+
if isinstance(msg, ResultMessage):
|
|
215
|
+
claude_agent.record_output(msg.session_id, msg.result) # optional: capture the answer
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
`UserPromptSubmit` opens a run (the prompt is the case identity), `PostToolUse` becomes a tool step,
|
|
219
|
+
and `Stop` finalizes it.
|
|
220
|
+
|
|
221
|
+
On the **OpenAI Agents SDK**? Its tracing is on by default — install Brooder's trace processor once
|
|
222
|
+
and every run is captured (no OpenAI API key required for capture):
|
|
223
|
+
|
|
224
|
+
```python
|
|
225
|
+
import brooder.integrations.openai_agents as bd_agents
|
|
226
|
+
|
|
227
|
+
bd_agents.install(agent="support-agent") # then run your agents as usual
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
It maps generation/response spans → turns, function spans → tool calls, and handoffs and triggered
|
|
231
|
+
guardrails into the trajectory too — so both tool selection *and* control-flow regressions get
|
|
232
|
+
diffed.
|
|
233
|
+
|
|
234
|
+
Using **LangChain or LangGraph**? Attach one callback handler — no OpenTelemetry setup required:
|
|
235
|
+
|
|
236
|
+
```python
|
|
237
|
+
import brooder.integrations.langchain as bd_lc
|
|
238
|
+
|
|
239
|
+
handler = bd_lc.callback_handler(agent="support-agent")
|
|
240
|
+
graph.invoke({"messages": [...]}, config={"callbacks": [handler]})
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
The root chain start opens a run (its input is the case identity), model calls become turns, and
|
|
244
|
+
tool calls become tool steps — one handler covers both LangChain and LangGraph.
|
|
245
|
+
|
|
246
|
+
## It tests agents (the whole trajectory), not single LLM calls
|
|
247
|
+
|
|
248
|
+
`@brooder.record` wraps your **entire agent** — every step of its plan → act → observe loop.
|
|
249
|
+
The baseline is the full **trajectory**: every tool call across every turn, in order, plus the
|
|
250
|
+
final output. So Brooder catches agent-level regressions, not just token changes in one model
|
|
251
|
+
response.
|
|
252
|
+
|
|
253
|
+
```bash
|
|
254
|
+
# A multi-step agent that silently stops verifying before answering on the newer model:
|
|
255
|
+
brooder migrate --from gpt-4o --to gpt-5-new examples/loop_agent.py
|
|
256
|
+
# -> REGRESSED: trajectory[1] "verify" removed
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
That dropped `verify` step happened *inside the loop* — the kind of thing an LLM-output eval
|
|
260
|
+
would never see.
|
|
261
|
+
|
|
262
|
+
## Why not just use observability / eval tools?
|
|
263
|
+
|
|
264
|
+
| Tool type | Examples | What it does | The gap Brooder fills |
|
|
265
|
+
| --- | --- | --- | --- |
|
|
266
|
+
| Observability | Langfuse, Laminar, Phoenix | Trace/monitor **after** it runs | Doesn't gate **before** you ship |
|
|
267
|
+
| Eval frameworks | DeepEval, Braintrust, Ragas | Score against **hand-written** datasets | Requires eval authoring nobody maintains |
|
|
268
|
+
| **Brooder** | — | **Record real runs → behavioral diff on every change → CI gate** | **Zero eval-writing, catches model-migration regressions** |
|
|
269
|
+
|
|
270
|
+
---
|
|
271
|
+
|
|
272
|
+
## Gate your PRs (GitHub Action)
|
|
273
|
+
|
|
274
|
+
Drop Brooder into CI and it re-runs your agent on every pull request, comments the behavioral diff,
|
|
275
|
+
and fails the check when behavior regresses. Copy [examples/github-action.yml](examples/github-action.yml)
|
|
276
|
+
to `.github/workflows/brooder.yml`:
|
|
277
|
+
|
|
278
|
+
```yaml
|
|
279
|
+
permissions:
|
|
280
|
+
contents: read
|
|
281
|
+
pull-requests: write # so it can comment the diff
|
|
282
|
+
|
|
283
|
+
jobs:
|
|
284
|
+
agent-snapshot:
|
|
285
|
+
runs-on: ubuntu-latest
|
|
286
|
+
steps:
|
|
287
|
+
- uses: actions/checkout@v4
|
|
288
|
+
- uses: agentbrooder/brooder@v1
|
|
289
|
+
with:
|
|
290
|
+
script: tests/agent_snapshot.py
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
The comment is upserted (updated in place, not spammed) and looks like the `--format markdown`
|
|
294
|
+
output below.
|
|
295
|
+
|
|
296
|
+
## Machine-readable output (`--json` / OTLP)
|
|
297
|
+
|
|
298
|
+
`run`, `ci`, and `diff` take `--format table|json|markdown` (`--json` is a shortcut). Exit codes are
|
|
299
|
+
unchanged, so you can gate *and* parse:
|
|
300
|
+
|
|
301
|
+
```bash
|
|
302
|
+
brooder run agent.py --json | jq '.summary'
|
|
303
|
+
# { "total": 3, "passed": 2, "regressed": 1, "flaky": 0, "regressions": 1, "mean_stability": 80 }
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
For dashboards, point Brooder at any OTLP endpoint and each run emits a snapshot of gauges
|
|
307
|
+
(`brooder.cases.*`, `brooder.stability.mean`) — **one exporter** that reaches Datadog, Grafana,
|
|
308
|
+
Honeycomb, and CloudWatch:
|
|
309
|
+
|
|
310
|
+
```bash
|
|
311
|
+
pip install 'brooder[otel]'
|
|
312
|
+
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318/v1/metrics # or metrics.otlp_endpoint in brooder.yaml
|
|
313
|
+
brooder ci agent.py
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
---
|
|
317
|
+
|
|
318
|
+
## What it checks
|
|
319
|
+
|
|
320
|
+
- **Structural diff** — the sequence of tool calls, their arguments, and the final output.
|
|
321
|
+
- **Semantic diff** — a pluggable judge (`judge: exact | llm`) so equivalent wording isn't a regression.
|
|
322
|
+
- **Flakiness** — `brooder run --runs 3` runs each case N times and flags non-determinism (`FLAKY`).
|
|
323
|
+
|
|
324
|
+
Each case gets a verdict — `PASS` / `REGRESSED` / `NEW` / `FLAKY` — and a stability score.
|
|
325
|
+
|
|
326
|
+
---
|
|
327
|
+
|
|
328
|
+
## Roadmap
|
|
329
|
+
|
|
330
|
+
See **[ROADMAP.md](ROADMAP.md)** for what's shipped and what's planned.
|
|
331
|
+
|
|
332
|
+
## Contributing
|
|
333
|
+
|
|
334
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md). Issues and PRs welcome — this is being built in public.
|
|
335
|
+
|
|
336
|
+
## License
|
|
337
|
+
|
|
338
|
+
[Apache-2.0](LICENSE).
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
brooder/__init__.py,sha256=Z2bDpeECu4SryoGTVZFw40mq7MazNGnjYjiNPLQjEe4,946
|
|
2
|
+
brooder/analysis.py,sha256=JR49w0_wRVpDUnMtiARfJ4ll5ADAUl3-QNCind0hcgY,2716
|
|
3
|
+
brooder/cli.py,sha256=xtnN5sXCQHJgx_MoZSaQ_xE7QBgWXDhbh2NVxXh8DiE,9191
|
|
4
|
+
brooder/config.py,sha256=qRfhjH3lZhoryC6wkO5RezLHlTb0A2tS8vZKyJ_6N0k,2911
|
|
5
|
+
brooder/diffing.py,sha256=zoqDrkvJoa6gnSi0aALobz5fYiEdwEwGimS90IX425M,8136
|
|
6
|
+
brooder/errors.py,sha256=GTDorMJiHavHWgDykVKsy4ScCKHa4tyM9ExbPc4UFNE,976
|
|
7
|
+
brooder/judges.py,sha256=MbMcwnQM9viJejPjCEW65XU-iC9ubF5BeYjgP95SJ0A,3696
|
|
8
|
+
brooder/log.py,sha256=JX_0Fd_N3QBjqqGF2WnX9Uh1gApqrusdnZUSdANgNM0,1055
|
|
9
|
+
brooder/metrics.py,sha256=OHDuWUYyTfFU77IhpPYaRE4CaFdZlLR-rzMR3wUz-k0,4398
|
|
10
|
+
brooder/models.py,sha256=IHfYmckuU_cV0rJZgFZloABNvJWrB9A5adFN9e9PS-Q,5131
|
|
11
|
+
brooder/py.typed,sha256=8AkVlIO-WJ6ehRAt6sfsojYu_5YFMRy28xwDjT12VUE,71
|
|
12
|
+
brooder/recorder.py,sha256=V6t8S7kRc93K3SwaYmUKs07_s87IlN_8uyVifiKBneo,13420
|
|
13
|
+
brooder/report.py,sha256=LnIO5gp12r2tyxu4pQD7zwk0NSd9crQbOqdquAVBSSk,9942
|
|
14
|
+
brooder/storage.py,sha256=dqVc5kAafL_Sx9AJLu4-2zSudigqEdfLM0vvXeUpSoo,4641
|
|
15
|
+
brooder/integrations/__init__.py,sha256=syhPKraqalZP-wIwgaStg82GfTL69LazYnSWYpqIL10,2628
|
|
16
|
+
brooder/integrations/anthropic.py,sha256=EvUOPnE9bnE18XcV8_5IKUh3xm_gTABXPl22KXhkBOU,1505
|
|
17
|
+
brooder/integrations/base.py,sha256=Qgqx5M9tUXhMmYiI-e_PAad0vNh9H6e95PKZGX6okrU,6100
|
|
18
|
+
brooder/integrations/bedrock.py,sha256=6S2k1q3G9L5RcKWzsfXtqw5U0YQjgBfF3hV5Yba-ftM,1594
|
|
19
|
+
brooder/integrations/claude_agent.py,sha256=UGqjVZwa_LwVeThaKU9A1GxospvbPftGREViw2dfx4M,6750
|
|
20
|
+
brooder/integrations/google.py,sha256=TQlE5nuzfrRRApz8inZTe9Rrob3yI9_vQRPCuphURzk,2322
|
|
21
|
+
brooder/integrations/langchain.py,sha256=ViB7ORu-cZ2b_ntDUlFc1x-q1FohiIMPZy8lu7Prdr8,12941
|
|
22
|
+
brooder/integrations/openai.py,sha256=YdEhVH30ZW_byT893pT86JcBr0R1KPRDVt6Q66C3Osg,1396
|
|
23
|
+
brooder/integrations/openai_agents.py,sha256=SxjTOeXTDHWx0ZH3FITlYAuG7WpPfKltXoVnqzPxvwE,8032
|
|
24
|
+
brooder/integrations/otel.py,sha256=w-0YjLaHV6fIDLkSwtvCqPlj_EBlxLRhIG_gmFxEc2g,9100
|
|
25
|
+
brooder-0.1.0.dist-info/METADATA,sha256=acAyHgk6xvaX035dgvxuUpLrX_DWwNYUl3oisK8JaaM,13012
|
|
26
|
+
brooder-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
27
|
+
brooder-0.1.0.dist-info/entry_points.txt,sha256=mAUYK2TXw5_8oLgemf05Q7QFjaU02tQmcltHmj-ARZI,45
|
|
28
|
+
brooder-0.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
|
29
|
+
brooder-0.1.0.dist-info/licenses/NOTICE,sha256=FwrMmOBXP-InAZhoIB2VXmxp8m9pVe3c2xiefhnJnX4,295
|
|
30
|
+
brooder-0.1.0.dist-info/RECORD,,
|