brooder 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- brooder/__init__.py +31 -0
- brooder/analysis.py +79 -0
- brooder/cli.py +281 -0
- brooder/config.py +88 -0
- brooder/diffing.py +217 -0
- brooder/errors.py +31 -0
- brooder/integrations/__init__.py +75 -0
- brooder/integrations/anthropic.py +46 -0
- brooder/integrations/base.py +170 -0
- brooder/integrations/bedrock.py +49 -0
- brooder/integrations/claude_agent.py +164 -0
- brooder/integrations/google.py +61 -0
- brooder/integrations/langchain.py +321 -0
- brooder/integrations/openai.py +43 -0
- brooder/integrations/openai_agents.py +208 -0
- brooder/integrations/otel.py +216 -0
- brooder/judges.py +109 -0
- brooder/log.py +33 -0
- brooder/metrics.py +116 -0
- brooder/models.py +148 -0
- brooder/py.typed +1 -0
- brooder/recorder.py +342 -0
- brooder/report.py +261 -0
- brooder/storage.py +150 -0
- brooder-0.1.0.dist-info/METADATA +338 -0
- brooder-0.1.0.dist-info/RECORD +30 -0
- brooder-0.1.0.dist-info/WHEEL +4 -0
- brooder-0.1.0.dist-info/entry_points.txt +2 -0
- brooder-0.1.0.dist-info/licenses/LICENSE +201 -0
- brooder-0.1.0.dist-info/licenses/NOTICE +7 -0
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
"""OpenTelemetry GenAI ingestion — capture agent trajectories from OTel spans.
|
|
2
|
+
|
|
3
|
+
Add :class:`BrooderSpanProcessor` to your tracer provider and Brooder turns the GenAI/agent spans
|
|
4
|
+
your stack already emits into a diffable :class:`~brooder.models.Step` trajectory — no manual
|
|
5
|
+
``tool_call``. Because frameworks like **LangGraph, CrewAI and AutoGen** emit OpenTelemetry GenAI
|
|
6
|
+
spans, this one adapter covers all of them, and it slots into the OTel pipelines you already run
|
|
7
|
+
(Datadog / Arize / Honeycomb).
|
|
8
|
+
|
|
9
|
+
from opentelemetry import trace
|
|
10
|
+
from brooder.integrations.otel import BrooderSpanProcessor
|
|
11
|
+
|
|
12
|
+
trace.get_tracer_provider().add_span_processor(BrooderSpanProcessor(agent="support-agent"))
|
|
13
|
+
|
|
14
|
+
The processor is **duck-typed** against the OTel SDK's ``SpanProcessor`` protocol, so importing this
|
|
15
|
+
module does not require ``opentelemetry`` to be installed. Registration is explicit — Brooder never
|
|
16
|
+
mutates your tracer provider for you.
|
|
17
|
+
|
|
18
|
+
**How spans map to steps** (driven by ``gen_ai.operation.name``):
|
|
19
|
+
|
|
20
|
+
- inference (``chat`` / ``text_completion`` / ``generate_content``) → ``TURN``
|
|
21
|
+
- ``execute_tool`` → ``TOOL`` (name from ``gen_ai.tool.name``; args/result if present)
|
|
22
|
+
- the agent-root span (``invoke_agent`` or the trace's parentless span) → the run boundary: its
|
|
23
|
+
input becomes the case identity, its output the ``FINAL`` step.
|
|
24
|
+
|
|
25
|
+
**Content is opt-in.** Per the GenAI conventions, tool arguments/results and message content are
|
|
26
|
+
only emitted when content capture is enabled (it can be sensitive). Brooder always captures tool
|
|
27
|
+
*names* (the core diff signal) and degrades gracefully when content is absent — but a stable case id
|
|
28
|
+
needs the input, so enable content capture *or* set a ``brooder.case_id`` attribute on the root
|
|
29
|
+
span. A one-time warning is logged when a trace carries no content.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import json
|
|
35
|
+
import threading
|
|
36
|
+
from typing import Any, Optional
|
|
37
|
+
|
|
38
|
+
from .. import recorder
|
|
39
|
+
from ..log import get_logger
|
|
40
|
+
from ..models import Step, StepKind
|
|
41
|
+
from .base import parse_json
|
|
42
|
+
|
|
43
|
+
# Package logger shared by this module (returned by ``brooder.log.get_logger``).
_log = get_logger()

# GenAI semantic-convention attribute keys (see the OpenTelemetry GenAI conventions).
# Every key below is looked up in a span's ``attributes`` mapping by the helpers in this module.
_OP = "gen_ai.operation.name"
_TOOL_NAME = "gen_ai.tool.name"
_TOOL_ARGS = "gen_ai.tool.call.arguments"
_TOOL_RESULT = "gen_ai.tool.call.result"
_AGENT_NAME = "gen_ai.agent.name"
_PROVIDER = "gen_ai.provider.name"  # newer key; falls back to gen_ai.system
_SYSTEM = "gen_ai.system"
_INPUT_MESSAGES = "gen_ai.input.messages"
_OUTPUT_MESSAGES = "gen_ai.output.messages"
_CASE_ID = "brooder.case_id"  # Brooder escape hatch: a caller-set stable case key

# Operation names that are treated as model inference (mapped to ``TURN`` steps).
_INFERENCE_OPS = frozenset({"chat", "text_completion", "generate_content"})
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _attrs(span: Any) -> Any:
|
|
61
|
+
"""Return a span's attribute mapping (empty mapping if it has none)."""
|
|
62
|
+
return getattr(span, "attributes", None) or {}
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _start_time(span: Any) -> int:
|
|
66
|
+
"""Return a span's start time in ns for ordering (0 if unknown)."""
|
|
67
|
+
return getattr(span, "start_time", 0) or 0
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _span_suffix(span: Any, prefix: str) -> Optional[str]:
|
|
71
|
+
"""Return the ``<name>`` part of a ``"<prefix> <name>"`` span name, or None."""
|
|
72
|
+
name = getattr(span, "name", "") or ""
|
|
73
|
+
head = prefix + " "
|
|
74
|
+
return name[len(head) :] if name.startswith(head) else None
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _coerce(raw: Any) -> Any:
|
|
78
|
+
"""Decode a JSON-string attribute into structured data; pass other values through."""
|
|
79
|
+
if isinstance(raw, str):
|
|
80
|
+
try:
|
|
81
|
+
return json.loads(raw)
|
|
82
|
+
except json.JSONDecodeError:
|
|
83
|
+
return raw
|
|
84
|
+
return raw
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _root_span(spans: list[Any]) -> Any:
    """Choose the span that marks the agent boundary of a trace.

    Preference order: the first span whose operation is ``invoke_agent``, then the first span
    with no parent, then simply the first span in ``spans``. ``spans`` must be non-empty.
    """
    agent_span = next(
        (candidate for candidate in spans if _attrs(candidate).get(_OP) == "invoke_agent"),
        None,
    )
    if agent_span is not None:
        return agent_span

    parentless = next(
        (candidate for candidate in spans if getattr(candidate, "parent", None) is None),
        None,
    )
    if parentless is not None:
        return parentless

    return spans[0]
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _extract_inputs(root: Any) -> Any:
    """Read the run inputs used for case identity from the root span.

    An explicit ``brooder.case_id`` attribute wins and is returned as ``{"case_id": ...}``.
    Otherwise the span's input messages are decoded; ``None`` means neither was present.
    """
    attributes = _attrs(root)
    if _CASE_ID not in attributes:
        messages = attributes.get(_INPUT_MESSAGES)
        if messages is None:
            return None
        return _coerce(messages)
    return {"case_id": attributes[_CASE_ID]}
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _extract_output(root: Any, spans: list[Any]) -> Any:
    """Find the run's final output.

    The root span's output messages are used when present. Otherwise ``spans`` is scanned from
    the end for the first inference span that carries output messages. Returns ``None`` when
    no output can be found.
    """
    root_output = _attrs(root).get(_OUTPUT_MESSAGES)
    if root_output is not None:
        return _coerce(root_output)

    for candidate in reversed(spans):
        candidate_attrs = _attrs(candidate)
        if candidate_attrs.get(_OP) not in _INFERENCE_OPS:
            continue
        messages = candidate_attrs.get(_OUTPUT_MESSAGES)
        if messages is not None:
            return _coerce(messages)
    return None
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _to_step(span: Any) -> tuple[Optional[Step], bool]:
    """Convert one span into a trajectory step.

    Returns:
        ``(step, had_content)``. ``step`` is ``None`` for spans that are not part of the
        trajectory. ``had_content`` is ``True`` only for tool spans that carried arguments
        and/or a result.
    """
    attributes = _attrs(span)
    operation = attributes.get(_OP)

    # Inference spans become TURN steps; only the provider label is kept as metadata.
    if operation in _INFERENCE_OPS:
        provider = attributes.get(_PROVIDER) or attributes.get(_SYSTEM)
        turn_meta = {}
        if provider:
            turn_meta["provider"] = provider
        return Step(kind=StepKind.TURN, name="turn", args=turn_meta), False

    is_tool_span = operation == "execute_tool" or _TOOL_NAME in attributes
    if not is_tool_span:
        # invoke_agent / create_agent / non-GenAI spans are the boundary or noise, not a step.
        return None, False

    tool_name = attributes.get(_TOOL_NAME) or _span_suffix(span, "execute_tool") or "tool"
    raw_args = attributes.get(_TOOL_ARGS)
    raw_result = attributes.get(_TOOL_RESULT)
    tool_step = Step(
        kind=StepKind.TOOL,
        name=tool_name,
        args={} if raw_args is None else parse_json(raw_args),
        result=None if raw_result is None else _coerce(raw_result),
    )
    carried_content = not (raw_args is None and raw_result is None)
    return tool_step, carried_content
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
class BrooderSpanProcessor:
    """An OpenTelemetry ``SpanProcessor`` that ingests GenAI spans into Brooder trajectories.

    Spans are buffered per trace; when the trace's root span ends they are sorted by start time and
    mapped into one run (see the module docstring for the mapping). Buffering + sort makes
    out-of-order and cross-thread ``on_end`` delivery a non-issue. Incomplete traces (no parentless
    span seen) are drained on :meth:`force_flush` / :meth:`shutdown`.

    Recording is best-effort: a failure while turning a trace into a run is logged and that trace
    is dropped, so an error never propagates into the application code that ended the span and
    never prevents other buffered traces from being drained.

    Args:
        agent: Logical agent name used to group baselines. If omitted, falls back to the root span's
            ``gen_ai.agent.name``, then its span name, then ``"otel-agent"``.
    """

    def __init__(self, agent: Optional[str] = None) -> None:
        """Create a processor that records into runs named ``agent`` (see the class docstring)."""
        self._agent = agent
        # trace_id -> spans that have ended for that trace and are not yet recorded.
        self._buffers: dict[int, list[Any]] = {}
        # Guards ``_buffers``; ``on_end`` may be called from several threads.
        self._lock = threading.Lock()
        self._warned_no_content = False

    def on_start(self, span: Any, parent_context: Any = None) -> None:
        """Part of the ``SpanProcessor`` protocol; capture happens at :meth:`on_end`."""
        return None

    def on_end(self, span: Any) -> None:
        """Buffer an ended span under its trace id, finalizing the run when the root span ends.

        Spans without a ``context.trace_id`` are ignored. Never raises: a failure while
        recording the run is logged instead.
        """
        trace_id = getattr(getattr(span, "context", None), "trace_id", None)
        if trace_id is None:
            return
        with self._lock:
            self._buffers.setdefault(trace_id, []).append(span)
        # A parentless span closes the trace, so everything buffered for it can be recorded.
        if getattr(span, "parent", None) is None:
            self._finalize_safely(trace_id)

    def shutdown(self) -> None:
        """Drain every buffered (possibly incomplete) trace into a run."""
        # Snapshot the keys under the lock so concurrent ``on_end`` calls cannot disturb it.
        with self._lock:
            pending = list(self._buffers)
        for trace_id in pending:
            self._finalize_safely(trace_id)

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Drain every buffered trace; always returns ``True``.

        ``timeout_millis`` is accepted for protocol compatibility and is not used.
        """
        self.shutdown()
        return True

    def _finalize_safely(self, trace_id: int) -> None:
        """Run :meth:`_finalize`, logging (not raising) any error for this trace."""
        try:
            self._finalize(trace_id)
        except Exception:
            # Telemetry must not break the instrumented application or block other traces.
            _log.warning(
                "brooder: failed to record OTel trace %s; skipping it", trace_id, exc_info=True
            )

    def _finalize(self, trace_id: int) -> None:
        """Turn a trace's buffered spans into one recorded run.

        Does nothing when no spans are buffered for ``trace_id`` (for example when the trace
        was already finalized).
        """
        with self._lock:
            spans = self._buffers.pop(trace_id, None)
        if not spans:
            return
        spans.sort(key=_start_time)

        root = _root_span(spans)
        agent = self._agent or _attrs(root).get(_AGENT_NAME) or getattr(root, "name", None)
        inputs = _extract_inputs(root)
        handle = recorder.open_run(
            agent or "otel-agent", inputs=inputs, external_id=f"{trace_id:032x}"
        )

        # Track whether anything beyond bare tool names was captured for this trace.
        saw_content = inputs is not None
        for span in spans:
            step, had_content = _to_step(span)
            if step is not None:
                handle.add_step(step)
            saw_content = saw_content or had_content

        output = _extract_output(root, spans)
        if output is not None:
            saw_content = True
        if not saw_content and not self._warned_no_content:
            # Warn once per processor so a content-less pipeline does not flood the log.
            self._warned_no_content = True
            _log.warning(
                "brooder: OTel spans carry no GenAI content (tool args/results, messages); "
                "capturing tool names only. Enable content capture, or set a 'brooder.case_id' "
                "span attribute, for stable and richer baselines."
            )
        handle.finish(output)
|
brooder/judges.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Output-equivalence judges.
|
|
2
|
+
|
|
3
|
+
Structural diffing (tool calls) is always exact. *Output* equivalence, however, is where
|
|
4
|
+
agents are legitimately non-deterministic ("I've started your refund" vs "Your refund is on
|
|
5
|
+
its way" mean the same thing). A :class:`Judge` decides whether two outputs are equivalent.
|
|
6
|
+
|
|
7
|
+
- ``ExactJudge`` (default): byte-for-byte equality. Deterministic, offline, zero cost.
|
|
8
|
+
- ``LLMJudge``: asks a model whether two outputs are semantically equivalent. Provider-agnostic
|
|
9
|
+
via an injected ``complete`` callable, so it stays testable and never hard-depends on a vendor.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
from typing import Any, Optional, Protocol
|
|
16
|
+
|
|
17
|
+
from .log import get_logger
|
|
18
|
+
|
|
19
|
+
# Package logger shared by this module (returned by ``brooder.log.get_logger``).
_log = get_logger()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Judge(Protocol):
    """Structural interface for output-equivalence judges.

    Any object with a matching ``equivalent`` method satisfies this protocol; no inheritance
    is required.
    """

    def equivalent(self, baseline: Any, current: Any) -> bool:
        """Return ``True`` if ``current`` is an acceptable substitute for ``baseline``.

        Args:
            baseline: The golden output recorded previously.
            current: The output produced by the run under test.

        Returns:
            ``True`` if the two outputs are considered equivalent.
        """
        ...
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ExactJudge:
    """Judge that accepts an output only when it compares equal to the baseline."""

    def equivalent(self, baseline: Any, current: Any) -> bool:
        """Compare the two outputs with ``==``.

        Args:
            baseline: The golden output.
            current: The output under test.

        Returns:
            ``True`` when ``baseline == current``, otherwise ``False``.
        """
        outputs_match = baseline == current
        return True if outputs_match else False
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# Prompt sent to the completion backend; the model is asked for a one-word yes/no verdict.
_PROMPT = (
    "You are grading an AI agent's output against a known-good baseline. "
    "Answer with exactly 'yes' if they are semantically equivalent for an end user, "
    "or 'no' if the meaning or outcome differs.\n\n"
    "BASELINE:\n{baseline}\n\nCURRENT:\n{current}\n\nEquivalent?"
)


class LLMJudge:
    """Judge semantic equivalence with an LLM via an injected completion function."""

    # Characters models commonly wrap a one-word verdict in (quotes, markdown, punctuation).
    _VERDICT_NOISE = " \t\r\n\"'`*.,!?:;()[]"

    def __init__(self, complete: Callable[[str], str]) -> None:
        """Initialize the judge.

        Args:
            complete: A function that takes a prompt and returns the model's text response.
                Injecting it keeps the judge provider-agnostic and testable.
        """
        self._complete = complete

    def equivalent(self, baseline: Any, current: Any) -> bool:
        """Return ``True`` if the LLM judges the outputs semantically equivalent.

        Equal outputs short-circuit without calling the backend. Otherwise the first word of
        the model's reply is read as the verdict, ignoring case and surrounding quotes,
        markdown or punctuation. Only ``yes`` (or ``y``) counts as equivalent; anything else,
        including an empty or missing reply, is treated as *not* equivalent so that an
        ambiguous answer can never hide a regression.

        Args:
            baseline: The golden output.
            current: The output under test.

        Returns:
            ``True`` if outputs are equal, or if the model answers "yes".
        """
        if baseline == current:
            return True
        prompt = _PROMPT.format(baseline=baseline, current=current)
        reply = self._complete(prompt)
        # Tolerate a backend that returns None or a non-string object.
        words = str(reply or "").lower().split()
        if not words:
            return False
        verdict = words[0].strip(self._VERDICT_NOISE)
        return verdict in ("yes", "y")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def make_judge(kind: str, complete: Optional[Callable[[str], str]] = None) -> Judge:
    """Construct the judge selected by configuration.

    Args:
        kind: ``"llm"`` selects the LLM judge; any other value selects the exact judge.
        complete: Completion backend used by the LLM judge. When ``kind`` is ``"llm"`` but no
            backend is given, a warning is logged and the exact judge is returned instead.

    Returns:
        A :class:`Judge` implementation.
    """
    if kind != "llm":
        return ExactJudge()
    if complete is not None:
        return LLMJudge(complete)
    _log.warning("judge: llm is configured but no LLM backend is wired; using exact")
    return ExactJudge()
|
brooder/log.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Structured logging for Brooder.
|
|
2
|
+
|
|
3
|
+
The library logs through the ``brooder`` logger and stays quiet by default (WARNING).
|
|
4
|
+
``brooder -v`` raises it to DEBUG. User-facing output goes through ``report.console``,
|
|
5
|
+
not this logger — logging is for diagnostics.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
|
|
12
|
+
from rich.logging import RichHandler
|
|
13
|
+
|
|
14
|
+
# Name of the logger that all Brooder modules share.
_LOGGER_NAME = "brooder"
# Set to True by ``setup_logging`` once the handler has been attached, so it is added only once.
_configured = False
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def setup_logging(verbose: bool = False) -> None:
    """Set the ``brooder`` logger's level and attach its handler on first use.

    The level is updated on every call (DEBUG when ``verbose``, WARNING otherwise). The Rich
    handler is attached and propagation disabled only the first time, so repeated calls do not
    stack handlers.
    """
    global _configured
    level = logging.DEBUG if verbose else logging.WARNING
    brooder_logger = logging.getLogger(_LOGGER_NAME)
    brooder_logger.setLevel(level)
    if _configured:
        return

    rich_handler = RichHandler(rich_tracebacks=True, show_path=False, show_time=False)
    rich_handler.setFormatter(logging.Formatter("%(message)s"))
    brooder_logger.addHandler(rich_handler)
    brooder_logger.propagate = False
    _configured = True
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def get_logger() -> logging.Logger:
    """Look up the ``brooder`` logger that every module in the package shares."""
    shared_logger = logging.getLogger(_LOGGER_NAME)
    return shared_logger
|
brooder/metrics.py
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Optional OpenTelemetry (OTLP) metric emission for a Brooder run.
|
|
2
|
+
|
|
3
|
+
**One OTLP emitter — not per-vendor exporters.** A single ``OTEL_EXPORTER_OTLP_ENDPOINT`` reaches
|
|
4
|
+
Datadog, Grafana, Honeycomb, and CloudWatch (via the ADOT collector), so the OSS core stays lean
|
|
5
|
+
while feeding whatever telemetry backend a team already runs.
|
|
6
|
+
|
|
7
|
+
Duck-typed like the OTel span processor: the ``opentelemetry`` SDK is imported lazily inside
|
|
8
|
+
:func:`emit`, so the package imports fine without it. If the SDK is missing, :func:`emit` warns once
|
|
9
|
+
and returns — emitting metrics must never fail a CI run.
|
|
10
|
+
|
|
11
|
+
Each invocation emits a *snapshot* of the run (gauges, not monotonic counters): every ``brooder``
|
|
12
|
+
run reports its own ``total`` / ``regressed`` / ``mean_stability``, which is what a longitudinal
|
|
13
|
+
dashboard wants to chart over time.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
from typing import Any, Optional
|
|
20
|
+
|
|
21
|
+
from .log import get_logger
|
|
22
|
+
from .models import Diff
|
|
23
|
+
from .report import diffs_to_summary
|
|
24
|
+
|
|
25
|
+
# Package logger shared by this module (returned by ``brooder.log.get_logger``).
_log = get_logger()

# Emit the "SDK not installed" hint at most once per process (this may run in a loop over cases).
_warned = False

# Default flush budget (ms) so a slow/unreachable collector can't hang CI.
_FLUSH_TIMEOUT_MS = 5000
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _warn_once(message: str) -> None:
    """Log ``message`` as a warning the first time this is called; later calls are silent."""
    global _warned
    if _warned:
        return
    _log.warning("brooder: %s", message)
    _warned = True
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _metric_values(diffs: list[Diff]) -> dict[str, int]:
    """Build the flat ``{metric_name: value}`` snapshot for one run.

    Values are taken from the ``"summary"`` section of ``diffs_to_summary(diffs)``.
    """
    summary = diffs_to_summary(diffs)["summary"]
    # (emitted metric name, key in the summary mapping), in emission order.
    metric_fields = (
        ("brooder.cases.total", "total"),
        ("brooder.cases.passed", "passed"),
        ("brooder.cases.regressed", "regressed"),
        ("brooder.cases.new", "new"),
        ("brooder.cases.flaky", "flaky"),
        ("brooder.cases.regressions", "regressions"),
        ("brooder.stability.mean", "mean_stability"),
    )
    return {metric: summary[field] for metric, field in metric_fields}
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _build_provider(endpoint: str) -> tuple[Any, Any]:
|
|
56
|
+
"""Build an OTLP ``MeterProvider`` and a meter. Raises if the SDK isn't installed.
|
|
57
|
+
|
|
58
|
+
The HTTP exporter also reads ``OTEL_EXPORTER_OTLP_ENDPOINT`` itself; we pass ours explicitly.
|
|
59
|
+
"""
|
|
60
|
+
from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
|
|
61
|
+
from opentelemetry.sdk.metrics import MeterProvider
|
|
62
|
+
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
|
|
63
|
+
|
|
64
|
+
reader = PeriodicExportingMetricReader(OTLPMetricExporter(endpoint=endpoint))
|
|
65
|
+
provider = MeterProvider(metric_readers=[reader])
|
|
66
|
+
return provider, provider.get_meter("brooder")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _record(meter: Any, values: dict[str, int]) -> None:
    """Create one observable gauge per entry of ``values`` on ``meter``.

    Each gauge's callback reports the fixed value captured for that metric; the callbacks run
    when the SDK collects metrics.
    """
    from opentelemetry.metrics import Observation

    for metric_name, metric_value in values.items():
        # Bind the value as a default argument so each callback keeps its own number
        # instead of sharing the loop variable.
        meter.create_observable_gauge(
            metric_name,
            callbacks=[lambda _options, _captured=metric_value: [Observation(_captured)]],
        )
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def emit(diffs: list[Diff], endpoint: Optional[str] = None) -> None:
    """Emit a run's result metrics over OTLP, best-effort.

    A no-op unless an endpoint is configured (argument or ``OTEL_EXPORTER_OTLP_ENDPOINT``). If the
    ``opentelemetry`` SDK isn't installed it warns once and returns; any other failure to set up
    the exporter is logged as a warning with its traceback, and any export error is logged and
    swallowed. Never raises into the caller: metric emission must not fail a CI run.

    Args:
        diffs: The per-case diffs to summarize into metrics.
        endpoint: OTLP endpoint; falls back to ``OTEL_EXPORTER_OTLP_ENDPOINT`` if not given.
    """
    endpoint = endpoint or os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
    if not endpoint:
        return

    try:
        provider, meter = _build_provider(endpoint)
    except ImportError:
        # Only a missing SDK gets the install hint.
        _warn_once(
            "opentelemetry not installed; skipping OTLP metric emission "
            "(pip install 'brooder[otel]')"
        )
        return
    except Exception:
        # The SDK is present but setup failed (e.g. a bad endpoint): report the real cause
        # rather than misdirecting the user to reinstall.
        _log.warning(
            "brooder: could not set up the OTLP metric exporter; skipping metric emission",
            exc_info=True,
        )
        return

    try:
        _record(meter, _metric_values(diffs))
        provider.force_flush(_FLUSH_TIMEOUT_MS)
    except Exception:
        _log.debug("brooder: OTLP metric emission failed", exc_info=True)
    finally:
        # Always release the provider, even when recording or flushing failed.
        try:
            provider.shutdown()
        except Exception:
            _log.debug("brooder: OTLP meter shutdown failed", exc_info=True)
|
brooder/models.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""Core data model for Brooder.
|
|
2
|
+
|
|
3
|
+
A :class:`Run` is one execution of an agent over one input, captured as an ordered
|
|
4
|
+
:class:`Step` **trajectory** (the agent's path through its loop) plus the final output.
|
|
5
|
+
Baselines are golden ``Run``s; a :class:`Diff` compares a fresh ``Run`` to its baseline.
|
|
6
|
+
All models forbid unknown fields so malformed records fail loudly.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import hashlib
|
|
12
|
+
import json
|
|
13
|
+
from enum import Enum
|
|
14
|
+
from typing import Any, Optional
|
|
15
|
+
|
|
16
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class StepKind(str, Enum):
    """The kind of a step in an agent's trajectory.

    Members are also ``str`` instances, so they compare equal to their string values.

    Attributes:
        TURN: A model turn / decision point.
        TOOL: A tool invocation (its observation is in ``result``).
        FINAL: The agent's terminal answer.
    """

    TURN = "turn"
    TOOL = "tool"
    FINAL = "final"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class Step(BaseModel):
    """One step in an agent's trajectory (a model turn, a tool call, or the final answer).

    Instances are frozen, and unknown fields are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    # What sort of step this is.
    kind: StepKind = Field(..., description="Model turn, tool call, or final answer.")
    # Must be non-empty (``min_length=1``).
    name: str = Field(..., min_length=1, description='Tool name, or "turn" / "final".')
    # Defaults to a fresh empty dict per instance.
    args: dict[str, Any] = Field(
        default_factory=dict, description="Tool arguments or turn metadata."
    )
    # Optional payload; ``None`` when nothing was captured.
    result: Any = Field(default=None, description="Tool observation, or the final output.")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class Run(BaseModel):
    """A single agent execution over one input, stored as an ordered list of steps.

    Unknown fields are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    agent: str = Field(..., min_length=1, description="Logical name of the agent under test.")
    case_id: str = Field(..., min_length=1, description="Stable id derived from the inputs.")
    inputs: Any = Field(default=None, description="The inputs this run was executed with.")
    model: Optional[str] = Field(default=None, description="Model label (used by `migrate`).")
    trajectory: list[Step] = Field(
        default_factory=list, description="Ordered steps the agent took."
    )
    output: Any = Field(default=None, description="The agent's final output.")
    meta: dict[str, Any] = Field(default_factory=dict, description="Free-form metadata.")

    @field_validator("agent")
    @classmethod
    def _non_blank_agent(cls, value: str) -> str:
        """Strip surrounding whitespace from the agent name and reject a blank result."""
        stripped = value.strip()
        if stripped:
            return stripped
        raise ValueError("agent name must not be blank")

    @property
    def turns(self) -> int:
        """How many ``TURN`` steps the trajectory contains."""
        return len([step for step in self.trajectory if step.kind == StepKind.TURN])

    @property
    def step_count(self) -> int:
        """Length of the trajectory, counting every step kind."""
        steps = self.trajectory
        return len(steps)

    @property
    def gave_up(self) -> bool:
        """True when no step in the trajectory is a ``FINAL`` step."""
        return all(step.kind != StepKind.FINAL for step in self.trajectory)

    @property
    def runaway(self) -> bool:
        """True when ``meta["runaway"]`` is set to a truthy value."""
        flag = self.meta.get("runaway")
        return True if flag else False
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class Verdict(str, Enum):
    """The outcome of comparing a run (or set of runs) to a baseline.

    Members are also ``str`` instances, so they compare equal to their string values.

    Attributes:
        PASS: Behavior matched the baseline.
        REGRESSED: Behavior changed versus the baseline.
        NEW: No baseline existed yet for this case.
        FLAKY: Repeated runs of the same case disagreed with each other.
    """

    PASS = "pass"
    REGRESSED = "regressed"
    NEW = "new"
    FLAKY = "flaky"
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class Change(BaseModel):
    """One difference between a baseline run and the current run.

    Instances are frozen, and unknown fields are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    path: str = Field(..., description='Where the change is, e.g. "trajectory[1]".')
    # Restricted by the pattern to exactly "added", "removed" or "changed".
    kind: str = Field(..., pattern="^(added|removed|changed)$")
    # Presumably the baseline-side value; the field name is the only evidence here. TODO confirm.
    before: Any = None
    # Presumably the current-run value; the field name is the only evidence here. TODO confirm.
    after: Any = None
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class Diff(BaseModel):
    """Outcome of comparing one run against its baseline.

    Unknown fields are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    agent: str
    case_id: str
    verdict: Verdict
    changes: list[Change] = Field(default_factory=list)
    stability: int = Field(default=100, ge=0, le=100, description="100 = identical behavior.")

    @property
    def ok(self) -> bool:
        """True for a ``PASS`` or ``NEW`` verdict, False for every other verdict."""
        outcome = self.verdict
        return outcome == Verdict.PASS or outcome == Verdict.NEW
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def make_case_id(agent: str, inputs: Any) -> str:
    """Compute the stable identifier of a case from the agent name and its inputs.

    The pair is serialized to JSON with sorted keys (values JSON cannot encode are converted
    with ``str``) and hashed with SHA-1, so identical inputs give the same id on every
    execution regardless of dict key order.

    Args:
        agent: The logical agent name.
        inputs: The inputs the agent was called with.

    Returns:
        The first 12 hexadecimal characters of the digest.
    """
    identity = {"agent": agent, "inputs": inputs}
    serialized = json.dumps(identity, sort_keys=True, default=str)
    digest = hashlib.sha1(serialized.encode("utf-8"))
    return digest.hexdigest()[:12]
|
brooder/py.typed
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Marker file for PEP 561. This package ships inline type information.
|