brooder 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {brooder-0.1.0 → brooder-0.2.0}/.github/workflows/ci.yml +8 -8
- {brooder-0.1.0 → brooder-0.2.0}/.github/workflows/release.yml +23 -5
- {brooder-0.1.0 → brooder-0.2.0}/CHANGELOG.md +46 -1
- {brooder-0.1.0 → brooder-0.2.0}/PKG-INFO +204 -95
- {brooder-0.1.0 → brooder-0.2.0}/README.md +201 -94
- brooder-0.2.0/ROADMAP.md +336 -0
- brooder-0.2.0/SECURITY.md +39 -0
- {brooder-0.1.0 → brooder-0.2.0}/action.yml +25 -8
- brooder-0.2.0/design/anti-flakiness.md +277 -0
- {brooder-0.1.0 → brooder-0.2.0}/examples/github-action.yml +10 -4
- {brooder-0.1.0 → brooder-0.2.0}/pyproject.toml +10 -1
- {brooder-0.1.0 → brooder-0.2.0}/src/brooder/__init__.py +3 -0
- brooder-0.2.0/src/brooder/analysis.py +143 -0
- brooder-0.2.0/src/brooder/budget.py +195 -0
- brooder-0.2.0/src/brooder/cli.py +486 -0
- brooder-0.2.0/src/brooder/config.py +266 -0
- {brooder-0.1.0 → brooder-0.2.0}/src/brooder/diffing.py +159 -12
- {brooder-0.1.0 → brooder-0.2.0}/src/brooder/errors.py +6 -3
- brooder-0.2.0/src/brooder/integrations/__init__.py +90 -0
- brooder-0.2.0/src/brooder/integrations/anthropic.py +127 -0
- brooder-0.2.0/src/brooder/integrations/base.py +568 -0
- brooder-0.2.0/src/brooder/integrations/bedrock.py +128 -0
- {brooder-0.1.0 → brooder-0.2.0}/src/brooder/integrations/claude_agent.py +83 -19
- {brooder-0.1.0 → brooder-0.2.0}/src/brooder/integrations/google.py +20 -4
- brooder-0.2.0/src/brooder/integrations/openai.py +112 -0
- brooder-0.2.0/src/brooder/judges.py +161 -0
- {brooder-0.1.0 → brooder-0.2.0}/src/brooder/metrics.py +52 -15
- brooder-0.2.0/src/brooder/models.py +379 -0
- brooder-0.2.0/src/brooder/pytest_plugin.py +309 -0
- {brooder-0.1.0 → brooder-0.2.0}/src/brooder/recorder.py +153 -2
- brooder-0.2.0/src/brooder/redaction.py +153 -0
- brooder-0.2.0/src/brooder/report.py +411 -0
- brooder-0.2.0/src/brooder/storage.py +309 -0
- brooder-0.2.0/tests/conftest.py +7 -0
- brooder-0.2.0/tests/test_action.py +143 -0
- {brooder-0.1.0 → brooder-0.2.0}/tests/test_analysis.py +19 -0
- {brooder-0.1.0 → brooder-0.2.0}/tests/test_async_capture.py +5 -3
- brooder-0.2.0/tests/test_budget.py +274 -0
- {brooder-0.1.0 → brooder-0.2.0}/tests/test_claude_agent.py +97 -9
- brooder-0.2.0/tests/test_cli.py +472 -0
- brooder-0.2.0/tests/test_config.py +98 -0
- brooder-0.2.0/tests/test_diffing.py +159 -0
- {brooder-0.1.0 → brooder-0.2.0}/tests/test_integrations.py +43 -2
- brooder-0.2.0/tests/test_judges.py +77 -0
- {brooder-0.1.0 → brooder-0.2.0}/tests/test_metrics.py +73 -1
- brooder-0.2.0/tests/test_models.py +122 -0
- brooder-0.2.0/tests/test_output.py +200 -0
- brooder-0.2.0/tests/test_pytest_plugin.py +159 -0
- {brooder-0.1.0 → brooder-0.2.0}/tests/test_recorder.py +62 -0
- brooder-0.2.0/tests/test_redaction.py +140 -0
- brooder-0.2.0/tests/test_report.py +93 -0
- brooder-0.2.0/tests/test_severity.py +110 -0
- brooder-0.2.0/tests/test_storage.py +227 -0
- brooder-0.2.0/tests/test_streaming.py +422 -0
- {brooder-0.1.0 → brooder-0.2.0}/tests/test_trajectory_diff.py +24 -0
- brooder-0.1.0/ROADMAP.md +0 -134
- brooder-0.1.0/SECURITY.md +0 -14
- brooder-0.1.0/src/brooder/analysis.py +0 -79
- brooder-0.1.0/src/brooder/cli.py +0 -281
- brooder-0.1.0/src/brooder/config.py +0 -88
- brooder-0.1.0/src/brooder/integrations/__init__.py +0 -75
- brooder-0.1.0/src/brooder/integrations/anthropic.py +0 -46
- brooder-0.1.0/src/brooder/integrations/base.py +0 -170
- brooder-0.1.0/src/brooder/integrations/bedrock.py +0 -49
- brooder-0.1.0/src/brooder/integrations/openai.py +0 -43
- brooder-0.1.0/src/brooder/judges.py +0 -109
- brooder-0.1.0/src/brooder/models.py +0 -148
- brooder-0.1.0/src/brooder/report.py +0 -261
- brooder-0.1.0/src/brooder/storage.py +0 -150
- brooder-0.1.0/tests/test_action.py +0 -57
- brooder-0.1.0/tests/test_cli.py +0 -194
- brooder-0.1.0/tests/test_config.py +0 -44
- brooder-0.1.0/tests/test_diffing.py +0 -54
- brooder-0.1.0/tests/test_judges.py +0 -31
- brooder-0.1.0/tests/test_output.py +0 -99
- brooder-0.1.0/tests/test_storage.py +0 -39
- {brooder-0.1.0 → brooder-0.2.0}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/.github/dependabot.yml +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/.github/pull_request_template.md +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/.gitignore +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/.pre-commit-config.yaml +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/CONTRIBUTING.md +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/DCO +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/LICENSE +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/LICENSING.md +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/NOTICE +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/TRADEMARKS.md +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/assets/banner.svg +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/assets/demo.svg +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/assets/record-demo.sh +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/design/framework-adapters.md +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/design/trajectory.md +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/docs/api.md +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/docs/index.md +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/examples/flaky_agent.py +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/examples/loop_agent.py +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/examples/regressing_agent.py +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/examples/stable_agent.py +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/mkdocs.yml +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/src/brooder/integrations/langchain.py +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/src/brooder/integrations/openai_agents.py +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/src/brooder/integrations/otel.py +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/src/brooder/log.py +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/src/brooder/py.typed +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/tests/test_capture_core.py +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/tests/test_langchain.py +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/tests/test_openai_agents.py +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/tests/test_otel.py +0 -0
- {brooder-0.1.0 → brooder-0.2.0}/tests/test_trajectory.py +0 -0
|
@@ -9,8 +9,8 @@ jobs:
|
|
|
9
9
|
lint:
|
|
10
10
|
runs-on: ubuntu-latest
|
|
11
11
|
steps:
|
|
12
|
-
- uses: actions/checkout@
|
|
13
|
-
- uses: actions/setup-python@
|
|
12
|
+
- uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
|
|
13
|
+
- uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6.3.0
|
|
14
14
|
with:
|
|
15
15
|
python-version: "3.12"
|
|
16
16
|
- run: pip install -e ".[dev]"
|
|
@@ -27,8 +27,8 @@ jobs:
|
|
|
27
27
|
matrix:
|
|
28
28
|
python-version: ["3.10", "3.11", "3.12"]
|
|
29
29
|
steps:
|
|
30
|
-
- uses: actions/checkout@
|
|
31
|
-
- uses: actions/setup-python@
|
|
30
|
+
- uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
|
|
31
|
+
- uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6.3.0
|
|
32
32
|
with:
|
|
33
33
|
python-version: ${{ matrix.python-version }}
|
|
34
34
|
- run: pip install -e ".[dev]"
|
|
@@ -42,8 +42,8 @@ jobs:
|
|
|
42
42
|
docs:
|
|
43
43
|
runs-on: ubuntu-latest
|
|
44
44
|
steps:
|
|
45
|
-
- uses: actions/checkout@
|
|
46
|
-
- uses: actions/setup-python@
|
|
45
|
+
- uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
|
|
46
|
+
- uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6.3.0
|
|
47
47
|
with:
|
|
48
48
|
python-version: "3.12"
|
|
49
49
|
- run: pip install -e ".[docs]"
|
|
@@ -54,8 +54,8 @@ jobs:
|
|
|
54
54
|
# Keep the Apache-2.0 core free of strong copyleft (GPL/AGPL/SSPL). LGPL is allowed.
|
|
55
55
|
runs-on: ubuntu-latest
|
|
56
56
|
steps:
|
|
57
|
-
- uses: actions/checkout@
|
|
58
|
-
- uses: actions/setup-python@
|
|
57
|
+
- uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
|
|
58
|
+
- uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6.3.0
|
|
59
59
|
with:
|
|
60
60
|
python-version: "3.12"
|
|
61
61
|
- name: Install runtime deps only
|
|
@@ -23,8 +23,8 @@ jobs:
|
|
|
23
23
|
build:
|
|
24
24
|
runs-on: ubuntu-latest
|
|
25
25
|
steps:
|
|
26
|
-
- uses: actions/checkout@
|
|
27
|
-
- uses: actions/setup-python@
|
|
26
|
+
- uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
|
|
27
|
+
- uses: actions/setup-python@ece7cb06caefa5fff74198d8649806c4678c61a1 # v6.3.0
|
|
28
28
|
with:
|
|
29
29
|
python-version: "3.12"
|
|
30
30
|
- name: Build sdist + wheel
|
|
@@ -35,7 +35,7 @@ jobs:
|
|
|
35
35
|
run: |
|
|
36
36
|
python -m pip install --upgrade twine
|
|
37
37
|
python -m twine check dist/*
|
|
38
|
-
- uses: actions/upload-artifact@
|
|
38
|
+
- uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
|
|
39
39
|
with:
|
|
40
40
|
name: dist
|
|
41
41
|
path: dist/
|
|
@@ -47,9 +47,27 @@ jobs:
|
|
|
47
47
|
permissions:
|
|
48
48
|
id-token: write # OIDC token for Trusted Publishing
|
|
49
49
|
steps:
|
|
50
|
-
- uses: actions/download-artifact@
|
|
50
|
+
- uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
|
|
51
51
|
with:
|
|
52
52
|
name: dist
|
|
53
53
|
path: dist/
|
|
54
54
|
- name: Publish to PyPI
|
|
55
|
-
uses: pypa/gh-action-pypi-publish@
|
|
55
|
+
uses: pypa/gh-action-pypi-publish@cef221092ed1bacb1cc03d23a2d87d1d172e277b # v1.14.0
|
|
56
|
+
|
|
57
|
+
major-tag:
|
|
58
|
+
# Move the `v1` (Action interface) tag to this release so `uses: agentbrooder/brooder@v1`
|
|
59
|
+
# always resolves to the latest release — no manual tag bumping. Runs only after publish.
|
|
60
|
+
# Safe from loops: `v1` doesn't match the `v*.*.*` trigger, and a GITHUB_TOKEN push doesn't
|
|
61
|
+
# re-trigger workflows anyway.
|
|
62
|
+
needs: publish
|
|
63
|
+
runs-on: ubuntu-latest
|
|
64
|
+
permissions:
|
|
65
|
+
contents: write
|
|
66
|
+
steps:
|
|
67
|
+
- uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0
|
|
68
|
+
- name: Update the v1 tag to point at this release
|
|
69
|
+
run: |
|
|
70
|
+
git config user.name "github-actions[bot]"
|
|
71
|
+
git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
|
|
72
|
+
git tag -f v1
|
|
73
|
+
git push -f origin v1
|
|
@@ -6,6 +6,50 @@ All notable changes to this project are documented here. The format is based on
|
|
|
6
6
|
|
|
7
7
|
## [Unreleased]
|
|
8
8
|
|
|
9
|
+
## [0.2.0] — 2026-07-04
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
- **Review UX — snapshot-fatigue defense** — every failing case is now classified as **suspicious**
|
|
13
|
+
(a material change: the output, a tool/final step, an observed tool result, or a guardrail
|
|
14
|
+
terminal changed — or the case is flaky) vs. **expected** (cosmetic reasoning-turn or
|
|
15
|
+
turn/step-count churn with the tool path and answer intact). The results table and the Markdown PR
|
|
16
|
+
comment headline "N suspicious · M expected", sort the worst cases first, and add a **Review**
|
|
17
|
+
column. `brooder approve` gained selective acceptance: `brooder approve [SELECTOR]... --only
|
|
18
|
+
{all,expected,suspicious} [--dry-run]` — e.g. `brooder approve --only expected` clears the cosmetic
|
|
19
|
+
drift in one command so you review the suspicious few individually. Classification is advisory and
|
|
20
|
+
**never changes the CI gate** — every regression still fails the build. The machine-readable
|
|
21
|
+
summary adds `suspicious`/`expected` counts and a per-case `severity` (summary schema **v1 → v2**,
|
|
22
|
+
additive).
|
|
23
|
+
- **Streaming capture** — auto-capture now records tool calls, content, and token usage from
|
|
24
|
+
streamed responses (`stream=True`) instead of only warning. The streamed iterator is teed as your
|
|
25
|
+
code consumes it and reduced into a normal captured call once the stream ends. Covers OpenAI
|
|
26
|
+
`create(stream=True)` (sync + async), Anthropic `messages.create(stream=True)`, and Bedrock
|
|
27
|
+
`converse_stream`. (The `messages.stream()` / `.stream()` context-manager helpers stay warn-only.)
|
|
28
|
+
- **Cost/latency budget gate** — runs now capture `Run.usage` (wall-clock `duration_ms` via
|
|
29
|
+
`@record`, token counts via provider auto-capture for OpenAI/Anthropic/Bedrock/Google, and derived
|
|
30
|
+
`cost_usd` when `budget.prices` are configured). `brooder ci --budget` / `run --budget` fail on a
|
|
31
|
+
`budget:` breach — absolute caps (`max_total_tokens` / `max_duration_ms` / `max_cost_usd`) or
|
|
32
|
+
per-baseline drift (`max_tokens_increase` etc.). Usage is **not** part of the behavioral diff, so a
|
|
33
|
+
latency blip or token drift never reads as a regression; the gate is orthogonal to the verdict.
|
|
34
|
+
OTLP now also emits mean `brooder.usage.*` gauges. Baseline schema bumped to **v2** (additive — v1
|
|
35
|
+
baselines still load).
|
|
36
|
+
- **pytest plugin** (`pip install brooder[pytest]`, auto-registers via the `pytest11` entry point):
|
|
37
|
+
a `brooder` fixture + `brooder.snapshot(result, inputs=...)` to snapshot-test agents inside pytest.
|
|
38
|
+
`pytest` checks each run against its committed baseline (a regression fails the test, a missing
|
|
39
|
+
baseline fails with a hint — never a silent pass); `pytest --brooder-update` records/refreshes
|
|
40
|
+
baselines. Configurable per-test with `@pytest.mark.brooder(agent=..., inputs=...)`. New public
|
|
41
|
+
`brooder.recorder.active_run(handle)` context manager and `brooder.analysis.analyze_with_config`
|
|
42
|
+
(shared by the CLI and the plugin so verdicts can't diverge).
|
|
43
|
+
|
|
44
|
+
### Changed
|
|
45
|
+
- **PR-comment hardening** — the Markdown report now opens with a machine-stable
|
|
46
|
+
`<!-- brooder-report -->` sentinel, and the GitHub Action slices the PR comment on that marker
|
|
47
|
+
instead of the content-derived `## Brooder results` headline (captured content escapes `<`, so it
|
|
48
|
+
can't forge the marker or truncate the comment).
|
|
49
|
+
- **Baseline schema versioning** — loading a baseline written by a newer Brooder now fails with a
|
|
50
|
+
clear "upgrade Brooder" error instead of a confusing validation failure, and there is a migration
|
|
51
|
+
seam so a future `Run`/`Step` change can upgrade old committed baselines on load.
|
|
52
|
+
|
|
9
53
|
## [0.1.0] — 2026-07-02
|
|
10
54
|
|
|
11
55
|
First public release.
|
|
@@ -92,5 +136,6 @@ First public release.
|
|
|
92
136
|
- Strict typing (`py.typed`), atomic storage writes, typed config, structured logging.
|
|
93
137
|
- Tooling: ruff, mypy (strict), pre-commit, pytest + coverage, CI matrix (3.10–3.12).
|
|
94
138
|
|
|
95
|
-
[Unreleased]: https://github.com/agentbrooder/brooder/compare/v0.
|
|
139
|
+
[Unreleased]: https://github.com/agentbrooder/brooder/compare/v0.2.0...HEAD
|
|
140
|
+
[0.2.0]: https://github.com/agentbrooder/brooder/compare/v0.1.0...v0.2.0
|
|
96
141
|
[0.1.0]: https://github.com/agentbrooder/brooder/releases/tag/v0.1.0
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: brooder
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Snapshot testing for AI agents — catch behavior regressions before they ship.
|
|
5
5
|
Project-URL: Homepage, https://brooder.dev
|
|
6
6
|
Project-URL: Repository, https://github.com/agentbrooder/brooder
|
|
@@ -43,6 +43,8 @@ Requires-Dist: openai-agents>=0.1; extra == 'openai-agents'
|
|
|
43
43
|
Provides-Extra: otel
|
|
44
44
|
Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.20; extra == 'otel'
|
|
45
45
|
Requires-Dist: opentelemetry-sdk>=1.20; extra == 'otel'
|
|
46
|
+
Provides-Extra: pytest
|
|
47
|
+
Requires-Dist: pytest>=8.0; extra == 'pytest'
|
|
46
48
|
Description-Content-Type: text/markdown
|
|
47
49
|
|
|
48
50
|
<p align="center">
|
|
@@ -112,60 +114,204 @@ have shipped to production unnoticed. Brooder caught it — and exited non-zero,
|
|
|
112
114
|
|
|
113
115
|
---
|
|
114
116
|
|
|
115
|
-
## The
|
|
117
|
+
## The workflow
|
|
116
118
|
|
|
117
119
|
```bash
|
|
118
120
|
brooder record examples/regressing_agent.py # capture golden baselines from real runs
|
|
119
121
|
brooder run examples/regressing_agent.py # re-run after a change, diff vs baseline
|
|
120
122
|
brooder diff # see exactly what changed
|
|
121
|
-
brooder approve
|
|
123
|
+
brooder approve --only expected # bulk-accept the cosmetic drift...
|
|
124
|
+
brooder approve <case> # ...then accept the reviewed ones case-by-case
|
|
122
125
|
```
|
|
123
126
|
|
|
124
127
|
`brooder run` exits non-zero when behavior regressed — drop it into CI and it gates your PRs.
|
|
125
128
|
|
|
129
|
+
**No snapshot fatigue.** When a model bump surfaces a wall of diffs, Brooder classifies each as
|
|
130
|
+
**suspicious** (the output, tool path, or a guardrail actually changed) or **expected** (cosmetic
|
|
131
|
+
reasoning-turn / count drift), headlines the summary `N suspicious · M expected`, and sorts the
|
|
132
|
+
scary ones first — so `brooder approve --only expected` clears the noise in one command and you spend
|
|
133
|
+
review on the few that matter. (`brooder approve` with no args still accepts everything.)
|
|
134
|
+
|
|
126
135
|
---
|
|
127
136
|
|
|
128
|
-
## Instrument your
|
|
137
|
+
## Instrument your agent
|
|
129
138
|
|
|
130
|
-
Add one decorator. Log tool calls with
|
|
139
|
+
Add one decorator. That's the whole SDK. Log tool calls explicitly with `brooder.tool_call`, or
|
|
140
|
+
wrap your LLM client with `brooder.instrument(...)` and Brooder captures the model's tool-call
|
|
141
|
+
decisions for you.
|
|
131
142
|
|
|
132
143
|
```python
|
|
133
144
|
import brooder
|
|
145
|
+
import openai
|
|
134
146
|
|
|
135
|
-
|
|
136
|
-
brooder.tool_call("search_kb", {"query": query}, result="...")
|
|
137
|
-
return "..."
|
|
147
|
+
client = brooder.instrument(openai.OpenAI()) # auto-captures tool calls while recording
|
|
138
148
|
|
|
139
149
|
@brooder.record("support-agent")
|
|
140
150
|
def agent(question: str) -> str:
|
|
141
|
-
docs =
|
|
151
|
+
docs = client.chat.completions.create(model="gpt-4o", messages=[...])
|
|
142
152
|
return answer_from(docs)
|
|
143
153
|
|
|
144
154
|
# call it over your real inputs; brooder records/replays automatically
|
|
145
155
|
```
|
|
146
156
|
|
|
147
|
-
|
|
148
|
-
|
|
157
|
+
Baselines are plain JSON committed to your repo, so diffs show up in code review like any other
|
|
158
|
+
change.
|
|
159
|
+
|
|
160
|
+
**It tests the whole trajectory, not single LLM calls.** `@brooder.record` wraps your *entire*
|
|
161
|
+
agent — every step of its plan → act → observe loop. The baseline is the full trajectory: every
|
|
162
|
+
tool call across every turn, in order, plus the final output. So Brooder catches a `verify` step
|
|
163
|
+
that silently disappears *inside the loop* — the kind of agent-level regression an LLM-output eval
|
|
164
|
+
never sees.
|
|
165
|
+
|
|
166
|
+
### Works with your stack
|
|
167
|
+
|
|
168
|
+
| Layer | Supported |
|
|
169
|
+
| --- | --- |
|
|
170
|
+
| **LLM providers** | OpenAI, Azure OpenAI, Anthropic, AWS Bedrock, Google (Gemini / Vertex) — auto-detected |
|
|
171
|
+
| **Agent frameworks** | LangChain, LangGraph, CrewAI, AutoGen (via OpenTelemetry), OpenAI Agents SDK, Claude Agent SDK |
|
|
172
|
+
| **Async** | `AsyncOpenAI`, `AsyncAzureOpenAI`, `AsyncAnthropic`, Google `generate_content_async` — no extra setup |
|
|
173
|
+
| **Custom endpoints** | Any base URL / proxy / OpenAI-compatible gateway — Brooder never touches credentials |
|
|
174
|
+
|
|
175
|
+
Setup for each is in **[Integrations](#integrations)** below.
|
|
176
|
+
|
|
177
|
+
---
|
|
178
|
+
|
|
179
|
+
## Why not just use observability / eval tools?
|
|
180
|
+
|
|
181
|
+
| Tool type | Examples | What it does | The gap Brooder fills |
|
|
182
|
+
| --- | --- | --- | --- |
|
|
183
|
+
| Observability | Langfuse, Laminar, Phoenix | Trace/monitor **after** it runs | Doesn't gate **before** you ship |
|
|
184
|
+
| Eval frameworks | DeepEval, Braintrust, Ragas | Score against **hand-written** datasets | Requires eval authoring nobody maintains |
|
|
185
|
+
| **Brooder** | — | **Record real runs → behavioral diff on every change → CI gate** | **Zero eval-writing, catches model-migration regressions** |
|
|
186
|
+
|
|
187
|
+
Your baselines are JSON files in **your** repo. No SaaS, no cloud account — nobody can acquire your
|
|
188
|
+
test suite out from under you.
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
## Gate your PRs (GitHub Action)
|
|
193
|
+
|
|
194
|
+
Drop Brooder into CI and it re-runs your agent on every pull request, comments the behavioral diff,
|
|
195
|
+
and fails the check when behavior regresses. Copy [examples/github-action.yml](examples/github-action.yml)
|
|
196
|
+
to `.github/workflows/brooder.yml`:
|
|
197
|
+
|
|
198
|
+
```yaml
|
|
199
|
+
permissions:
|
|
200
|
+
contents: read
|
|
201
|
+
pull-requests: write # so it can comment the diff
|
|
202
|
+
|
|
203
|
+
jobs:
|
|
204
|
+
agent-snapshot:
|
|
205
|
+
runs-on: ubuntu-latest
|
|
206
|
+
steps:
|
|
207
|
+
- uses: actions/checkout@v4
|
|
208
|
+
- uses: agentbrooder/brooder@v1
|
|
209
|
+
with:
|
|
210
|
+
script: tests/agent_snapshot.py
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
The comment is upserted (updated in place, not spammed) and looks like the `--format markdown`
|
|
214
|
+
output.
|
|
215
|
+
|
|
216
|
+
> **Security:** `brooder ci` runs your checked-out agent script, so don't wire live provider
|
|
217
|
+
> secrets into a `pull_request`-triggered job — see the
|
|
218
|
+
> [CI trust model](SECURITY.md#running-brooder-in-ci-safely-trust-model).
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
## Snapshot-test inside pytest
|
|
223
|
+
|
|
224
|
+
Prefer to stay in the test runner you already have? `pip install brooder[pytest]` and use the
|
|
225
|
+
`brooder` fixture — no separate CLI harness:
|
|
226
|
+
|
|
227
|
+
```python
|
|
228
|
+
def test_support_agent(brooder):
|
|
229
|
+
answer = support_agent("refund my order") # tool calls / LLM turns auto-captured
|
|
230
|
+
brooder.snapshot(answer, inputs="refund my order")
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
- `pytest --brooder-update` records the golden baseline (commit it, like any snapshot).
|
|
234
|
+
- `pytest` checks each run against that baseline: a behavioral regression **fails the test**, and a
|
|
235
|
+
missing baseline fails with a hint instead of passing silently.
|
|
236
|
+
|
|
237
|
+
It honors your `brooder.yaml` (judge, normalization, redaction) and reuses the same capture and diff
|
|
238
|
+
engine as the CLI. Configure a case with `@pytest.mark.brooder(agent="support", inputs=...)`.
|
|
239
|
+
|
|
240
|
+
---
|
|
241
|
+
|
|
242
|
+
## What it checks
|
|
243
|
+
|
|
244
|
+
- **Structural diff** — the sequence of tool calls, their arguments, and the final output.
|
|
245
|
+
- **Semantic diff** — a pluggable judge (`judge: exact | llm`) so equivalent wording isn't a regression.
|
|
246
|
+
- **Flakiness** — `brooder run --runs 3` runs each case N times and flags non-determinism (`FLAKY`).
|
|
247
|
+
- **Review triage** — each regression is tagged **suspicious** (material) or **expected** (cosmetic)
|
|
248
|
+
so a model bump's wall of diffs sorts by attention, not just count (see *The workflow* above).
|
|
249
|
+
|
|
250
|
+
Each case gets a verdict (`PASS` / `REGRESSED` / `NEW` / `FLAKY`), a review class, and a stability
|
|
251
|
+
score.
|
|
252
|
+
|
|
253
|
+
---
|
|
254
|
+
|
|
255
|
+
## Gate on cost & latency drift
|
|
256
|
+
|
|
257
|
+
Behavior isn't the only thing that regresses — a model swap can keep the *same* behavior while
|
|
258
|
+
quietly doubling your token bill. Brooder captures each run's **latency and token usage** (and cost,
|
|
259
|
+
if you configure prices) and can fail CI when they spike. Usage is tracked separately from behavior,
|
|
260
|
+
so a noisy latency blip never reads as a behavioral regression.
|
|
261
|
+
|
|
262
|
+
```yaml
|
|
263
|
+
# brooder.yaml
|
|
264
|
+
budget:
|
|
265
|
+
max_total_tokens: 3000 # absolute ceiling per case
|
|
266
|
+
max_tokens_increase: 0.2 # …or fail if tokens drift >20% vs the baseline
|
|
267
|
+
prices: # optional: enable USD cost caps
|
|
268
|
+
gpt-4o: { input_per_mtok: 2.5, output_per_mtok: 10.0 }
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
```console
|
|
272
|
+
$ brooder ci --budget agent.py
|
|
273
|
+
💸 Budget — 1 limit(s) exceeded
|
|
274
|
+
• assistant/19761739: total_tokens 2600 is +160% vs baseline 1000 (limit 1200)
|
|
275
|
+
```
|
|
149
276
|
|
|
150
277
|
---
|
|
151
278
|
|
|
152
|
-
##
|
|
279
|
+
## Integrations
|
|
280
|
+
|
|
281
|
+
Everything above works with one decorator. These sections show the exact setup for each provider,
|
|
282
|
+
framework, and output format — expand what you need.
|
|
153
283
|
|
|
154
|
-
|
|
284
|
+
<details>
|
|
285
|
+
<summary><b>All providers, custom endpoints & async</b></summary>
|
|
286
|
+
|
|
287
|
+
Wrap your LLM client and Brooder records the model's tool-call decisions automatically. The provider
|
|
288
|
+
is auto-detected from the client; override it with a name, an alias, or a `Provider`:
|
|
155
289
|
|
|
156
290
|
```python
|
|
157
291
|
import brooder
|
|
158
|
-
import
|
|
292
|
+
from brooder import Provider
|
|
159
293
|
|
|
160
|
-
|
|
161
|
-
|
|
294
|
+
brooder.instrument(openai.OpenAI()) # OpenAI
|
|
295
|
+
brooder.instrument(openai.AzureOpenAI(...)) # Azure OpenAI (or provider="azure")
|
|
296
|
+
brooder.instrument(anthropic.Anthropic()) # Anthropic (or provider=Provider.ANTHROPIC)
|
|
297
|
+
brooder.instrument(boto3.client("bedrock-runtime")) # AWS Bedrock (or provider="aws")
|
|
298
|
+
brooder.instrument(genai.GenerativeModel("gemini-1.5-pro")) # Google Gemini / Vertex (or provider="gemini")
|
|
162
299
|
```
|
|
163
300
|
|
|
164
|
-
|
|
165
|
-
**Google (Gemini / Vertex)**.
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
301
|
+
The canonical set is `brooder.Provider`: **OpenAI**, **Azure OpenAI**, **Anthropic**, **AWS
|
|
302
|
+
Bedrock**, and **Google (Gemini / Vertex)**.
|
|
303
|
+
|
|
304
|
+
**Custom endpoints & proxies.** Brooder never manages credentials or URLs — the provider's own SDK
|
|
305
|
+
does. Point a client at any base URL (an internal gateway, an OpenAI-compatible server, an
|
|
306
|
+
Azure-APIM-proxied Anthropic endpoint, …) and hand it to `instrument` unchanged:
|
|
307
|
+
|
|
308
|
+
```python
|
|
309
|
+
client = anthropic.Anthropic(base_url="https://your-proxy/…", api_key="…")
|
|
310
|
+
brooder.instrument(client) # captured exactly the same
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
Model *names* are intentionally not diffed, so switching models isn't itself a change — only the
|
|
314
|
+
model's *behavior* (which tools it calls, with what arguments) is.
|
|
169
315
|
|
|
170
316
|
**Async works too.** `@brooder.record` and `instrument(...)` handle `async def` agents and async
|
|
171
317
|
clients — `AsyncOpenAI`, `AsyncAzureOpenAI`, `AsyncAnthropic`, and Google's `generate_content_async`
|
|
@@ -182,11 +328,13 @@ async def agent(question: str) -> str:
|
|
|
182
328
|
|
|
183
329
|
(Async AWS Bedrock via aioboto3 isn't covered yet — the sync boto3 client is.)
|
|
184
330
|
|
|
185
|
-
|
|
331
|
+
</details>
|
|
332
|
+
|
|
333
|
+
<details>
|
|
334
|
+
<summary><b>OpenTelemetry (LangGraph, CrewAI, AutoGen, …)</b></summary>
|
|
186
335
|
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
whole trajectory, no manual `tool_call`:
|
|
336
|
+
If your framework emits OpenTelemetry GenAI spans, add one span processor and Brooder ingests the
|
|
337
|
+
whole trajectory — no manual `tool_call`:
|
|
190
338
|
|
|
191
339
|
```python
|
|
192
340
|
from opentelemetry import trace
|
|
@@ -199,27 +347,34 @@ It maps inference spans → turns, `execute_tool` spans → tool calls, and the
|
|
|
199
347
|
input/output → the case identity and final answer. It also drops straight into the OTel pipelines
|
|
200
348
|
you already run (Datadog / Arize / Honeycomb).
|
|
201
349
|
|
|
202
|
-
|
|
203
|
-
|
|
350
|
+
</details>
|
|
351
|
+
|
|
352
|
+
<details>
|
|
353
|
+
<summary><b>Claude Agent SDK</b></summary>
|
|
354
|
+
|
|
355
|
+
Register Brooder's hooks and it records the tool trajectory and the final answer automatically:
|
|
204
356
|
|
|
205
357
|
```python
|
|
206
358
|
import brooder
|
|
207
|
-
from claude_agent_sdk import ClaudeSDKClient, ClaudeAgentOptions
|
|
208
|
-
from brooder.integrations import claude_agent
|
|
359
|
+
from claude_agent_sdk import ClaudeSDKClient, ClaudeAgentOptions
|
|
209
360
|
|
|
210
361
|
options = ClaudeAgentOptions(hooks=brooder.claude_agent_hooks(agent="support-agent"))
|
|
211
362
|
async with ClaudeSDKClient(options=options) as client:
|
|
212
363
|
await client.query(prompt)
|
|
213
364
|
async for msg in client.receive_response():
|
|
214
|
-
|
|
215
|
-
claude_agent.record_output(msg.session_id, msg.result) # optional: capture the answer
|
|
365
|
+
... # nothing Brooder-specific needed
|
|
216
366
|
```
|
|
217
367
|
|
|
218
|
-
`UserPromptSubmit` opens a run (the prompt is the case identity),
|
|
368
|
+
`UserPromptSubmit` opens a run (the prompt is the case identity), tool-use hooks become tool steps,
|
|
219
369
|
and `Stop` finalizes it.
|
|
220
370
|
|
|
221
|
-
|
|
222
|
-
|
|
371
|
+
</details>
|
|
372
|
+
|
|
373
|
+
<details>
|
|
374
|
+
<summary><b>OpenAI Agents SDK</b></summary>
|
|
375
|
+
|
|
376
|
+
Its tracing is on by default — install Brooder's trace processor once and every run is captured (no
|
|
377
|
+
OpenAI API key required for capture):
|
|
223
378
|
|
|
224
379
|
```python
|
|
225
380
|
import brooder.integrations.openai_agents as bd_agents
|
|
@@ -231,7 +386,12 @@ It maps generation/response spans → turns, function spans → tool calls, and
|
|
|
231
386
|
guardrails into the trajectory too — so both tool selection *and* control-flow regressions get
|
|
232
387
|
diffed.
|
|
233
388
|
|
|
234
|
-
|
|
389
|
+
</details>
|
|
390
|
+
|
|
391
|
+
<details>
|
|
392
|
+
<summary><b>LangChain / LangGraph</b></summary>
|
|
393
|
+
|
|
394
|
+
Attach one callback handler — no OpenTelemetry setup required:
|
|
235
395
|
|
|
236
396
|
```python
|
|
237
397
|
import brooder.integrations.langchain as bd_lc
|
|
@@ -243,66 +403,23 @@ graph.invoke({"messages": [...]}, config={"callbacks": [handler]})
|
|
|
243
403
|
The root chain start opens a run (its input is the case identity), model calls become turns, and
|
|
244
404
|
tool calls become tool steps — one handler covers both LangChain and LangGraph.
|
|
245
405
|
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
`@brooder.record` wraps your **entire agent** — every step of its plan → act → observe loop.
|
|
249
|
-
The baseline is the full **trajectory**: every tool call across every turn, in order, plus the
|
|
250
|
-
final output. So Brooder catches agent-level regressions, not just token changes in one model
|
|
251
|
-
response.
|
|
252
|
-
|
|
253
|
-
```bash
|
|
254
|
-
# A multi-step agent that silently stops verifying before answering on the newer model:
|
|
255
|
-
brooder migrate --from gpt-4o --to gpt-5-new examples/loop_agent.py
|
|
256
|
-
# -> REGRESSED: trajectory[1] "verify" removed
|
|
257
|
-
```
|
|
258
|
-
|
|
259
|
-
That dropped `verify` step happened *inside the loop* — the kind of thing an LLM-output eval
|
|
260
|
-
would never see.
|
|
261
|
-
|
|
262
|
-
## Why not just use observability / eval tools?
|
|
263
|
-
|
|
264
|
-
| Tool type | Examples | What it does | The gap Brooder fills |
|
|
265
|
-
| --- | --- | --- | --- |
|
|
266
|
-
| Observability | Langfuse, Laminar, Phoenix | Trace/monitor **after** it runs | Doesn't gate **before** you ship |
|
|
267
|
-
| Eval frameworks | DeepEval, Braintrust, Ragas | Score against **hand-written** datasets | Requires eval authoring nobody maintains |
|
|
268
|
-
| **Brooder** | — | **Record real runs → behavioral diff on every change → CI gate** | **Zero eval-writing, catches model-migration regressions** |
|
|
269
|
-
|
|
270
|
-
---
|
|
406
|
+
</details>
|
|
271
407
|
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
Drop Brooder into CI and it re-runs your agent on every pull request, comments the behavioral diff,
|
|
275
|
-
and fails the check when behavior regresses. Copy [examples/github-action.yml](examples/github-action.yml)
|
|
276
|
-
to `.github/workflows/brooder.yml`:
|
|
277
|
-
|
|
278
|
-
```yaml
|
|
279
|
-
permissions:
|
|
280
|
-
contents: read
|
|
281
|
-
pull-requests: write # so it can comment the diff
|
|
282
|
-
|
|
283
|
-
jobs:
|
|
284
|
-
agent-snapshot:
|
|
285
|
-
runs-on: ubuntu-latest
|
|
286
|
-
steps:
|
|
287
|
-
- uses: actions/checkout@v4
|
|
288
|
-
- uses: agentbrooder/brooder@v1
|
|
289
|
-
with:
|
|
290
|
-
script: tests/agent_snapshot.py
|
|
291
|
-
```
|
|
292
|
-
|
|
293
|
-
The comment is upserted (updated in place, not spammed) and looks like the `--format markdown`
|
|
294
|
-
output below.
|
|
295
|
-
|
|
296
|
-
## Machine-readable output (`--json` / OTLP)
|
|
408
|
+
<details>
|
|
409
|
+
<summary><b>Machine-readable output & dashboards (<code>--json</code> / OTLP)</b></summary>
|
|
297
410
|
|
|
298
411
|
`run`, `ci`, and `diff` take `--format table|json|markdown` (`--json` is a shortcut). Exit codes are
|
|
299
412
|
unchanged, so you can gate *and* parse:
|
|
300
413
|
|
|
301
414
|
```bash
|
|
302
415
|
brooder run agent.py --json | jq '.summary'
|
|
303
|
-
# { "total": 3, "passed": 2, "regressed": 1, "flaky": 0, "regressions": 1,
|
|
416
|
+
# { "total": 3, "passed": 2, "regressed": 1, "flaky": 0, "regressions": 1,
|
|
417
|
+
# "suspicious": 1, "expected": 0, "mean_stability": 80 }
|
|
304
418
|
```
|
|
305
419
|
|
|
420
|
+
Each case also carries a `severity` (`suspicious` / `expected` / `none`) so a dashboard can rank the
|
|
421
|
+
regressions that need a human by attention, not just count them.
|
|
422
|
+
|
|
306
423
|
For dashboards, point Brooder at any OTLP endpoint and each run emits a snapshot of gauges
|
|
307
424
|
(`brooder.cases.*`, `brooder.stability.mean`) — **one exporter** that reaches Datadog, Grafana,
|
|
308
425
|
Honeycomb, and CloudWatch:
|
|
@@ -313,15 +430,7 @@ export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318/v1/metrics # or metri
|
|
|
313
430
|
brooder ci agent.py
|
|
314
431
|
```
|
|
315
432
|
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
## What it checks
|
|
319
|
-
|
|
320
|
-
- **Structural diff** — the sequence of tool calls, their arguments, and the final output.
|
|
321
|
-
- **Semantic diff** — a pluggable judge (`judge: exact | llm`) so equivalent wording isn't a regression.
|
|
322
|
-
- **Flakiness** — `brooder run --runs 3` runs each case N times and flags non-determinism (`FLAKY`).
|
|
323
|
-
|
|
324
|
-
Each case gets a verdict — `PASS` / `REGRESSED` / `NEW` / `FLAKY` — and a stability score.
|
|
433
|
+
</details>
|
|
325
434
|
|
|
326
435
|
---
|
|
327
436
|
|