brooder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,216 @@
1
+ """OpenTelemetry GenAI ingestion — capture agent trajectories from OTel spans.
2
+
3
+ Add :class:`BrooderSpanProcessor` to your tracer provider and Brooder turns the GenAI/agent spans
4
+ your stack already emits into a diffable :class:`~brooder.models.Step` trajectory — no manual
5
+ ``tool_call``. Because frameworks like **LangGraph, CrewAI and AutoGen** emit OpenTelemetry GenAI
6
+ spans, this one adapter covers all of them, and it slots into the OTel pipelines you already run
7
+ (Datadog / Arize / Honeycomb).
8
+
9
+ from opentelemetry import trace
10
+ from brooder.integrations.otel import BrooderSpanProcessor
11
+
12
+ trace.get_tracer_provider().add_span_processor(BrooderSpanProcessor(agent="support-agent"))
13
+
14
+ The processor is **duck-typed** against the OTel SDK's ``SpanProcessor`` protocol, so importing this
15
+ module does not require ``opentelemetry`` to be installed. Registration is explicit — Brooder never
16
+ mutates your tracer provider for you.
17
+
18
+ **How spans map to steps** (driven by ``gen_ai.operation.name``):
19
+
20
+ - inference (``chat`` / ``text_completion`` / ``generate_content``) → ``TURN``
21
+ - ``execute_tool`` → ``TOOL`` (name from ``gen_ai.tool.name``; args/result if present)
22
+ - the agent-root span (``invoke_agent`` or the trace's parentless span) → the run boundary: its
23
+ input becomes the case identity, its output the ``FINAL`` step.
24
+
25
+ **Content is opt-in.** Per the GenAI conventions, tool arguments/results and message content are
26
+ only emitted when content capture is enabled (it can be sensitive). Brooder always captures tool
27
+ *names* (the core diff signal) and degrades gracefully when content is absent — but a stable case id
28
+ needs the input, so enable content capture *or* set a ``brooder.case_id`` attribute on the root
29
+ span. A one-time warning is logged when a trace carries no content.
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import json
35
+ import threading
36
+ from typing import Any, Optional
37
+
38
+ from .. import recorder
39
+ from ..log import get_logger
40
+ from ..models import Step, StepKind
41
+ from .base import parse_json
42
+
43
+ _log = get_logger()
44
+
45
+ # GenAI semantic-convention attribute keys (see the OpenTelemetry GenAI conventions).
46
+ _OP = "gen_ai.operation.name"
47
+ _TOOL_NAME = "gen_ai.tool.name"
48
+ _TOOL_ARGS = "gen_ai.tool.call.arguments"
49
+ _TOOL_RESULT = "gen_ai.tool.call.result"
50
+ _AGENT_NAME = "gen_ai.agent.name"
51
+ _PROVIDER = "gen_ai.provider.name" # newer key; falls back to gen_ai.system
52
+ _SYSTEM = "gen_ai.system"
53
+ _INPUT_MESSAGES = "gen_ai.input.messages"
54
+ _OUTPUT_MESSAGES = "gen_ai.output.messages"
55
+ _CASE_ID = "brooder.case_id" # Brooder escape hatch: a caller-set stable case key
56
+
57
+ _INFERENCE_OPS = frozenset({"chat", "text_completion", "generate_content"})
58
+
59
+
60
+ def _attrs(span: Any) -> Any:
61
+ """Return a span's attribute mapping (empty mapping if it has none)."""
62
+ return getattr(span, "attributes", None) or {}
63
+
64
+
65
+ def _start_time(span: Any) -> int:
66
+ """Return a span's start time in ns for ordering (0 if unknown)."""
67
+ return getattr(span, "start_time", 0) or 0
68
+
69
+
70
+ def _span_suffix(span: Any, prefix: str) -> Optional[str]:
71
+ """Return the ``<name>`` part of a ``"<prefix> <name>"`` span name, or None."""
72
+ name = getattr(span, "name", "") or ""
73
+ head = prefix + " "
74
+ return name[len(head) :] if name.startswith(head) else None
75
+
76
+
77
+ def _coerce(raw: Any) -> Any:
78
+ """Decode a JSON-string attribute into structured data; pass other values through."""
79
+ if isinstance(raw, str):
80
+ try:
81
+ return json.loads(raw)
82
+ except json.JSONDecodeError:
83
+ return raw
84
+ return raw
85
+
86
+
87
def _root_span(spans: list[Any]) -> Any:
    """Choose the span that marks the agent boundary of a trace.

    Preference order: the first ``invoke_agent`` span, then the first span whose ``parent`` is
    ``None``, then simply the first span in ``spans``.
    """
    missing = object()  # sentinel, so a legitimately-falsy span is still returned

    agent_span = next(
        (candidate for candidate in spans if _attrs(candidate).get(_OP) == "invoke_agent"),
        missing,
    )
    if agent_span is not missing:
        return agent_span

    parentless = next(
        (candidate for candidate in spans if getattr(candidate, "parent", None) is None),
        missing,
    )
    return spans[0] if parentless is missing else parentless
96
+
97
+
98
def _extract_inputs(root: Any) -> Any:
    """Derive the run inputs (the basis of case identity) from the root span.

    An explicit ``brooder.case_id`` attribute wins and is wrapped as ``{"case_id": ...}``;
    otherwise the decoded ``gen_ai.input.messages`` attribute is used. Returns ``None`` when
    neither attribute is present.
    """
    attributes = _attrs(root)
    if _CASE_ID in attributes:
        return {"case_id": attributes[_CASE_ID]}
    messages = attributes.get(_INPUT_MESSAGES)
    if messages is None:
        return None
    return _coerce(messages)
105
+
106
+
107
def _extract_output(root: Any, spans: list[Any]) -> Any:
    """Find the run's final output.

    The root span's ``gen_ai.output.messages`` is preferred. Failing that, ``spans`` is walked
    from last to first and the first inference span carrying output messages supplies the
    value. Returns ``None`` if no span has output content.
    """
    own_output = _attrs(root).get(_OUTPUT_MESSAGES)
    if own_output is not None:
        return _coerce(own_output)

    for candidate in reversed(spans):
        candidate_attrs = _attrs(candidate)
        if candidate_attrs.get(_OP) not in _INFERENCE_OPS:
            continue
        messages = candidate_attrs.get(_OUTPUT_MESSAGES)
        if messages is not None:
            return _coerce(messages)
    return None
117
+
118
+
119
def _to_step(span: Any) -> tuple[Optional[Step], bool]:
    """Translate one span into a trajectory step.

    Returns:
        ``(step, had_content)``. ``step`` is ``None`` for spans that are not part of the
        trajectory (agent boundary spans and non-GenAI spans). ``had_content`` is ``True`` only
        for tool spans that carried arguments and/or a result.
    """
    attributes = _attrs(span)
    operation = attributes.get(_OP)

    # Inference spans become TURN steps; only the provider label is kept as metadata.
    if operation in _INFERENCE_OPS:
        provider = attributes.get(_PROVIDER) or attributes.get(_SYSTEM)
        turn_args = {}
        if provider:
            turn_args["provider"] = provider
        return Step(kind=StepKind.TURN, name="turn", args=turn_args), False

    # invoke_agent / create_agent / non-GenAI spans are the boundary or noise, not a step.
    if operation != "execute_tool" and _TOOL_NAME not in attributes:
        return None, False

    # Tool spans: the name comes from the attribute, else the span name, else a placeholder.
    tool_name = attributes.get(_TOOL_NAME) or _span_suffix(span, "execute_tool") or "tool"
    raw_args = attributes.get(_TOOL_ARGS)
    raw_result = attributes.get(_TOOL_RESULT)
    tool_step = Step(
        kind=StepKind.TOOL,
        name=tool_name,
        args={} if raw_args is None else parse_json(raw_args),
        result=None if raw_result is None else _coerce(raw_result),
    )
    return tool_step, not (raw_args is None and raw_result is None)
137
+
138
+
139
class BrooderSpanProcessor:
    """An OpenTelemetry ``SpanProcessor`` that ingests GenAI spans into Brooder trajectories.

    Spans are buffered per trace; when the trace's root span ends they are sorted by start time and
    mapped into one run (see the module docstring for the mapping). Buffering + sort makes
    out-of-order and cross-thread ``on_end`` delivery a non-issue. Incomplete traces (no parentless
    span seen) are drained on :meth:`force_flush` / :meth:`shutdown`.

    All access to the shared buffer and to the one-time-warning flag happens under ``self._lock``.
    Failures while recording a trace are logged and swallowed: this code runs inside the
    instrumented application's ``span.end()``, so it must not raise into it.

    Args:
        agent: Logical agent name used to group baselines. If omitted, falls back to the root span's
            ``gen_ai.agent.name``, then its span name, then ``"otel-agent"``.
    """

    def __init__(self, agent: Optional[str] = None) -> None:
        """Create a processor that records into runs named ``agent`` (see the class docstring)."""
        self._agent = agent
        # trace id -> spans that have ended for that trace and are not yet recorded.
        self._buffers: dict[int, list[Any]] = {}
        self._lock = threading.Lock()
        self._warned_no_content = False

    def on_start(self, span: Any, parent_context: Any = None) -> None:
        """Part of the ``SpanProcessor`` protocol; capture happens at :meth:`on_end`."""
        return None

    def on_end(self, span: Any) -> None:
        """Buffer an ended span under its trace id, finalizing the run when the root span ends.

        Spans without a ``context.trace_id`` are ignored. Never raises: a failure while recording
        the run is logged instead of propagating into the caller's ``span.end()``.
        """
        trace_id = getattr(getattr(span, "context", None), "trace_id", None)
        if trace_id is None:
            return
        with self._lock:
            self._buffers.setdefault(trace_id, []).append(span)
        if getattr(span, "parent", None) is None:
            self._finalize_safely(trace_id)

    def shutdown(self) -> None:
        """Drain every buffered (possibly incomplete) trace into a run."""
        # Snapshot the keys under the lock so a concurrent on_end cannot mutate the dict
        # while it is being iterated.
        with self._lock:
            pending = list(self._buffers)
        for trace_id in pending:
            # Isolate failures per trace so one bad trace does not strand the others.
            self._finalize_safely(trace_id)

    def force_flush(self, timeout_millis: int = 30000) -> bool:
        """Drain every buffered trace; always returns ``True``.

        ``timeout_millis`` is accepted for protocol compatibility and is not used.
        """
        self.shutdown()
        return True

    def _finalize_safely(self, trace_id: int) -> None:
        """Run :meth:`_finalize`, logging (not raising) any error it produces."""
        try:
            self._finalize(trace_id)
        except Exception:
            _log.exception("brooder: failed to record OTel trace %r", trace_id)

    def _claim_no_content_warning(self) -> bool:
        """Return ``True`` exactly once per processor — for the caller that should emit the warning."""
        with self._lock:
            if self._warned_no_content:
                return False
            self._warned_no_content = True
            return True

    def _finalize(self, trace_id: int) -> None:
        """Turn a trace's buffered spans into one recorded run.

        A no-op when nothing is buffered for ``trace_id`` (e.g. it was already finalized).
        """
        # pop() under the lock makes finalization exclusive: only one caller gets the spans.
        with self._lock:
            spans = self._buffers.pop(trace_id, None)
        if not spans:
            return
        spans.sort(key=_start_time)

        root = _root_span(spans)
        agent = self._agent or _attrs(root).get(_AGENT_NAME) or getattr(root, "name", None)
        inputs = _extract_inputs(root)
        handle = recorder.open_run(
            agent or "otel-agent", inputs=inputs, external_id=f"{trace_id:032x}"
        )

        # Track whether any content (inputs, tool args/results, output) was seen in this trace.
        saw_content = inputs is not None
        for span in spans:
            step, had_content = _to_step(span)
            if step is not None:
                handle.add_step(step)
            saw_content = saw_content or had_content

        output = _extract_output(root, spans)
        if output is not None:
            saw_content = True
        if not saw_content and self._claim_no_content_warning():
            _log.warning(
                "brooder: OTel spans carry no GenAI content (tool args/results, messages); "
                "capturing tool names only. Enable content capture, or set a 'brooder.case_id' "
                "span attribute, for stable and richer baselines."
            )
        handle.finish(output)
brooder/judges.py ADDED
@@ -0,0 +1,109 @@
1
+ """Output-equivalence judges.
2
+
3
+ Structural diffing (tool calls) is always exact. *Output* equivalence, however, is where
4
+ agents are legitimately non-deterministic ("I've started your refund" vs "Your refund is on
5
+ its way" mean the same thing). A :class:`Judge` decides whether two outputs are equivalent.
6
+
7
+ - ``ExactJudge`` (default): byte-for-byte equality. Deterministic, offline, zero cost.
8
+ - ``LLMJudge``: asks a model whether two outputs are semantically equivalent. Provider-agnostic
9
+ via an injected ``complete`` callable, so it stays testable and never hard-depends on a vendor.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from collections.abc import Callable
15
+ from typing import Any, Optional, Protocol
16
+
17
+ from .log import get_logger
18
+
19
+ _log = get_logger()
20
+
21
+
22
class Judge(Protocol):
    """Structural interface for anything that can compare a fresh output to a baseline."""

    def equivalent(self, baseline: Any, current: Any) -> bool:
        """Report whether ``current`` may stand in for ``baseline``.

        Args:
            baseline: The previously recorded golden output.
            current: The output from the run being checked.

        Returns:
            ``True`` when the implementation treats the two outputs as equivalent.
        """
        ...
36
+
37
+
38
class ExactJudge:
    """Strict judge: two outputs are equivalent only when ``==`` says so."""

    def equivalent(self, baseline: Any, current: Any) -> bool:
        """Compare the outputs with ``==`` and return the result as a plain ``bool``.

        Args:
            baseline: The golden output.
            current: The output under test.

        Returns:
            ``True`` when ``baseline == current`` is truthy, otherwise ``False``.
        """
        same = baseline == current
        return True if same else False
52
+
53
+
54
_PROMPT = (
    "You are grading an AI agent's output against a known-good baseline. "
    "Answer with exactly 'yes' if they are semantically equivalent for an end user, "
    "or 'no' if the meaning or outcome differs.\n\n"
    "BASELINE:\n{baseline}\n\nCURRENT:\n{current}\n\nEquivalent?"
)


class LLMJudge:
    """Judge semantic equivalence with an LLM via an injected completion function."""

    def __init__(self, complete: Callable[[str], str]) -> None:
        """Initialize the judge.

        Args:
            complete: A function that takes a prompt and returns the model's text response.
                Injecting it keeps the judge provider-agnostic and testable.
        """
        self._complete = complete

    def equivalent(self, baseline: Any, current: Any) -> bool:
        """Return ``True`` if the LLM judges the outputs semantically equivalent.

        Equal outputs short-circuit without calling the backend. Otherwise the model's reply is
        reduced to its first word, ignoring case and any surrounding quotes, markdown or
        punctuation, and that word must be ``yes`` (or ``y``). Anything else — including an empty
        or non-string reply — counts as *not* equivalent, so an unparseable answer surfaces as a
        difference rather than silently passing.

        Args:
            baseline: The golden output.
            current: The output under test.

        Returns:
            ``True`` if outputs are equal, or if the model answers "yes".
        """
        if baseline == current:
            return True
        prompt = _PROMPT.format(baseline=baseline, current=current)
        reply = self._complete(prompt)
        if not isinstance(reply, str):
            return False
        # Replace every non-alphanumeric character with a space so "'Yes.'" and "**Yes**"
        # both tokenize to ["yes"]; then only the leading word decides the verdict.
        words = "".join(ch if ch.isalnum() else " " for ch in reply.lower()).split()
        return bool(words) and words[0] in ("y", "yes")
91
+
92
+
93
def make_judge(kind: str, complete: Optional[Callable[[str], str]] = None) -> Judge:
    """Build a judge from configuration.

    Args:
        kind: ``"exact"`` or ``"llm"``. Any other value falls back to exact with a warning, so a
            misspelled setting is visible instead of silently changing how outputs are judged.
        complete: The completion backend for ``"llm"``. If omitted, falls back to exact
            (with a warning) so the tool never hard-fails on a missing backend.

    Returns:
        A :class:`Judge` implementation.
    """
    if kind == "llm":
        if complete is None:
            _log.warning("judge: llm is configured but no LLM backend is wired; using exact")
            return ExactJudge()
        return LLMJudge(complete)
    if kind != "exact":
        # Unknown kinds used to be accepted without any signal.
        _log.warning("judge: unknown judge kind %r; using exact", kind)
    return ExactJudge()
brooder/log.py ADDED
@@ -0,0 +1,33 @@
1
+ """Structured logging for Brooder.
2
+
3
+ The library logs through the ``brooder`` logger and stays quiet by default (WARNING).
4
+ ``brooder -v`` raises it to DEBUG. User-facing output goes through ``report.console``,
5
+ not this logger — logging is for diagnostics.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+
12
+ from rich.logging import RichHandler
13
+
14
+ _LOGGER_NAME = "brooder"
15
+ _configured = False
16
+
17
+
18
def setup_logging(verbose: bool = False) -> None:
    """Set the ``brooder`` logger's level and, on the first call only, attach its handler.

    The level is updated on every call (DEBUG when ``verbose``, WARNING otherwise); the Rich
    handler is installed and propagation disabled just once per process.
    """
    global _configured
    level = logging.DEBUG if verbose else logging.WARNING
    brooder_logger = logging.getLogger(_LOGGER_NAME)
    brooder_logger.setLevel(level)
    if _configured:
        return

    rich_handler = RichHandler(rich_tracebacks=True, show_path=False, show_time=False)
    rich_handler.setFormatter(logging.Formatter("%(message)s"))
    brooder_logger.addHandler(rich_handler)
    brooder_logger.propagate = False
    _configured = True
29
+
30
+
31
def get_logger() -> logging.Logger:
    """Look up and return the package-wide logger registered under ``_LOGGER_NAME``."""
    shared_logger = logging.getLogger(_LOGGER_NAME)
    return shared_logger
brooder/metrics.py ADDED
@@ -0,0 +1,116 @@
1
+ """Optional OpenTelemetry (OTLP) metric emission for a Brooder run.
2
+
3
+ **One OTLP emitter — not per-vendor exporters.** A single ``OTEL_EXPORTER_OTLP_ENDPOINT`` reaches
4
+ Datadog, Grafana, Honeycomb, and CloudWatch (via the ADOT collector), so the OSS core stays lean
5
+ while feeding whatever telemetry backend a team already runs.
6
+
7
+ Duck-typed like the OTel span processor: the ``opentelemetry`` SDK is imported lazily inside
8
+ :func:`emit`, so the package imports fine without it. If the SDK is missing, :func:`emit` warns once
9
+ and returns — emitting metrics must never fail a CI run.
10
+
11
+ Each invocation emits a *snapshot* of the run (gauges, not monotonic counters): every ``brooder``
12
+ run reports its own ``total`` / ``regressed`` / ``mean_stability``, which is what a longitudinal
13
+ dashboard wants to chart over time.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import os
19
+ from typing import Any, Optional
20
+
21
+ from .log import get_logger
22
+ from .models import Diff
23
+ from .report import diffs_to_summary
24
+
25
+ _log = get_logger()
26
+
27
+ # Emit the "SDK not installed" hint at most once per process (this may run in a loop over cases).
28
+ _warned = False
29
+
30
+ # Default flush budget (ms) so a slow/unreachable collector can't hang CI.
31
+ _FLUSH_TIMEOUT_MS = 5000
32
+
33
+
34
def _warn_once(message: str) -> None:
    """Log ``message`` as a warning the first time this is called; later calls do nothing."""
    global _warned
    if _warned:
        return
    _log.warning("brooder: %s", message)
    _warned = True
39
+
40
+
41
def _metric_values(diffs: list[Diff]) -> dict[str, int]:
    """Flatten the run summary for ``diffs`` into ``{metric_name: value}``.

    Each emitted metric is read from the matching key of ``diffs_to_summary(diffs)["summary"]``.
    """
    totals = diffs_to_summary(diffs)["summary"]
    # (metric name, summary key) pairs, in emission order.
    metric_sources = (
        ("brooder.cases.total", "total"),
        ("brooder.cases.passed", "passed"),
        ("brooder.cases.regressed", "regressed"),
        ("brooder.cases.new", "new"),
        ("brooder.cases.flaky", "flaky"),
        ("brooder.cases.regressions", "regressions"),
        ("brooder.stability.mean", "mean_stability"),
    )
    return {metric: totals[source] for metric, source in metric_sources}
53
+
54
+
55
def _build_provider(endpoint: str) -> tuple[Any, Any]:
    """Create an OTLP-over-HTTP ``MeterProvider`` plus a ``"brooder"`` meter obtained from it.

    The OpenTelemetry imports are deferred to call time, so this raises (``ImportError``) when
    the SDK or the OTLP HTTP exporter is not installed. ``endpoint`` is handed to the exporter
    explicitly rather than leaving it to read ``OTEL_EXPORTER_OTLP_ENDPOINT`` itself.

    Returns:
        ``(provider, meter)``.
    """
    from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter
    from opentelemetry.sdk.metrics import MeterProvider
    from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader

    exporter = OTLPMetricExporter(endpoint=endpoint)
    metric_reader = PeriodicExportingMetricReader(exporter)
    meter_provider = MeterProvider(metric_readers=[metric_reader])
    brooder_meter = meter_provider.get_meter("brooder")
    return meter_provider, brooder_meter
67
+
68
+
69
def _record(meter: Any, values: dict[str, int]) -> None:
    """Create one observable gauge per entry of ``values`` on ``meter``.

    Each gauge's callback reports the fixed value it was registered with; the callbacks run when
    the provider collects (e.g. at ``force_flush``).
    """
    from opentelemetry.metrics import Observation

    for metric_name, reading in values.items():
        # Bind the current reading as a default argument so each callback keeps its own value
        # instead of sharing the loop variable.
        def _observe(_options: Any, _reading: int = reading) -> list[Any]:
            return [Observation(_reading)]

        meter.create_observable_gauge(metric_name, callbacks=[_observe])
81
+
82
+
83
def emit(diffs: list[Diff], endpoint: Optional[str] = None) -> None:
    """Emit a run's result metrics over OTLP, best-effort.

    A no-op unless an endpoint is configured (argument or ``OTEL_EXPORTER_OTLP_ENDPOINT``). If the
    ``opentelemetry`` SDK isn't installed it warns once and returns; any other setup or export
    error is logged and swallowed. Never raises into the caller — metric emission must not fail a
    CI run.

    Endpoint resolution:

    - An explicit ``endpoint`` argument is passed to the exporter verbatim, so it must be the
      full metrics URL (e.g. ``http://collector:4318/v1/metrics``).
    - ``OTEL_EXPORTER_OTLP_ENDPOINT`` is a *base* URL by OTLP convention, so ``/v1/metrics`` is
      appended to it (unless it already ends with that path). The HTTP exporter only appends the
      signal path when it reads the variable itself, not when it is handed a URL explicitly.

    Args:
        diffs: The per-case diffs to summarize into metrics.
        endpoint: OTLP endpoint; falls back to ``OTEL_EXPORTER_OTLP_ENDPOINT`` if not given.
    """
    if not endpoint:
        base = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
        if not base:
            return
        if base.rstrip("/").endswith("/v1/metrics"):
            endpoint = base
        else:
            endpoint = base + ("" if base.endswith("/") else "/") + "v1/metrics"

    try:
        provider, meter = _build_provider(endpoint)
    except ImportError:
        _warn_once(
            "opentelemetry not installed; skipping OTLP metric emission "
            "(pip install 'brooder[otel]')"
        )
        return
    except Exception as exc:
        # Anything other than a missing package (e.g. a malformed endpoint) is a configuration
        # problem; report it as such instead of claiming the SDK is not installed.
        _log.warning("brooder: could not set up OTLP metric emission (%s); skipping", exc)
        return

    try:
        _record(meter, _metric_values(diffs))
        provider.force_flush(_FLUSH_TIMEOUT_MS)
    except Exception:
        _log.debug("brooder: OTLP metric emission failed", exc_info=True)
    finally:
        # Always release the provider, even when recording or flushing failed.
        try:
            provider.shutdown()
        except Exception:
            _log.debug("brooder: OTLP meter shutdown failed", exc_info=True)
brooder/models.py ADDED
@@ -0,0 +1,148 @@
1
+ """Core data model for Brooder.
2
+
3
+ A :class:`Run` is one execution of an agent over one input, captured as an ordered
4
+ :class:`Step` **trajectory** (the agent's path through its loop) plus the final output.
5
+ Baselines are golden ``Run``s; a :class:`Diff` compares a fresh ``Run`` to its baseline.
6
+ All models forbid unknown fields so malformed records fail loudly.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import hashlib
12
+ import json
13
+ from enum import Enum
14
+ from typing import Any, Optional
15
+
16
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
17
+
18
+
19
class StepKind(str, Enum):
    """The kind of a step in an agent's trajectory.

    Members are also ``str`` instances, so each one compares equal to its lowercase string value.

    Attributes:
        TURN: A model turn / decision point.
        TOOL: A tool invocation (its observation is in ``result``).
        FINAL: The agent's terminal answer.
    """

    TURN = "turn"
    TOOL = "tool"
    FINAL = "final"
31
+
32
+
33
class Step(BaseModel):
    """One step in an agent's trajectory (a model turn, a tool call, or the final answer).

    Configured with ``extra="forbid"`` and ``frozen=True``: unknown fields are rejected and
    instances cannot be mutated after construction.
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    # Required: which kind of step this is.
    kind: StepKind = Field(..., description="Model turn, tool call, or final answer.")
    # Required, non-empty.
    name: str = Field(..., min_length=1, description='Tool name, or "turn" / "final".')
    # Defaults to a fresh empty dict per instance.
    args: dict[str, Any] = Field(
        default_factory=dict, description="Tool arguments or turn metadata."
    )
    # Optional; ``None`` when nothing was recorded.
    result: Any = Field(default=None, description="Tool observation, or the final output.")
44
+
45
+
46
class Run(BaseModel):
    """A single agent execution over one input: its ordered steps plus the final output.

    Unknown fields are rejected (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    agent: str = Field(..., min_length=1, description="Logical name of the agent under test.")
    case_id: str = Field(..., min_length=1, description="Stable id derived from the inputs.")
    inputs: Any = Field(default=None, description="The inputs this run was executed with.")
    model: Optional[str] = Field(default=None, description="Model label (used by `migrate`).")
    trajectory: list[Step] = Field(
        default_factory=list, description="Ordered steps the agent took."
    )
    output: Any = Field(default=None, description="The agent's final output.")
    meta: dict[str, Any] = Field(default_factory=dict, description="Free-form metadata.")

    @field_validator("agent")
    @classmethod
    def _non_blank_agent(cls, value: str) -> str:
        """Strip surrounding whitespace from ``agent`` and reject a name that is then empty."""
        trimmed = value.strip()
        if trimmed:
            return trimmed
        raise ValueError("agent name must not be blank")

    @property
    def turns(self) -> int:
        """How many ``TURN`` steps the trajectory contains."""
        return len([step for step in self.trajectory if step.kind == StepKind.TURN])

    @property
    def step_count(self) -> int:
        """Length of the trajectory, counting steps of every kind."""
        trajectory = self.trajectory
        return len(trajectory)

    @property
    def gave_up(self) -> bool:
        """``True`` when the trajectory holds no ``FINAL`` step (no answer was produced)."""
        for step in self.trajectory:
            if step.kind == StepKind.FINAL:
                return False
        return True

    @property
    def runaway(self) -> bool:
        """``True`` when ``meta["runaway"]`` is set and truthy (the ``max_steps`` guardrail hit)."""
        return True if self.meta.get("runaway") else False
88
+
89
+
90
class Verdict(str, Enum):
    """The outcome of comparing a run (or set of runs) to a baseline.

    Members are also ``str`` instances, so each one compares equal to its lowercase string value.

    Attributes:
        PASS: Behavior matched the baseline.
        REGRESSED: Behavior changed versus the baseline.
        NEW: No baseline existed yet for this case.
        FLAKY: Repeated runs of the same case disagreed with each other.
    """

    PASS = "pass"
    REGRESSED = "regressed"
    NEW = "new"
    FLAKY = "flaky"
104
+
105
+
106
class Change(BaseModel):
    """One difference between a baseline run and the current run.

    Immutable (``frozen=True``) and rejects unknown fields (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid", frozen=True)

    # Required location of the difference.
    path: str = Field(..., description='Where the change is, e.g. "trajectory[1]".')
    # Required; validated against the pattern, so only "added", "removed" or "changed" pass.
    kind: str = Field(..., pattern="^(added|removed|changed)$")
    # Values on each side of the change; both default to None.
    before: Any = None
    after: Any = None
115
+
116
+
117
class Diff(BaseModel):
    """The result of comparing a run to its baseline.

    Rejects unknown fields (``extra="forbid"``).
    """

    model_config = ConfigDict(extra="forbid")

    agent: str
    case_id: str
    verdict: Verdict
    # Individual differences; a fresh empty list per instance by default.
    changes: list[Change] = Field(default_factory=list)
    # Integer score constrained to 0..100 (inclusive); defaults to 100.
    stability: int = Field(default=100, ge=0, le=100, description="100 = identical behavior.")

    @property
    def ok(self) -> bool:
        """True if the run matched the baseline (or there was no baseline yet).

        That is, the verdict is ``PASS`` or ``NEW``; ``REGRESSED`` and ``FLAKY`` are not ok.
        """
        return self.verdict in (Verdict.PASS, Verdict.NEW)
132
+
133
+
134
def _stringify_keys(value: Any) -> Any:
    """Return a copy of ``value`` with every dict key converted to ``str``, recursively."""
    if isinstance(value, dict):
        return {str(key): _stringify_keys(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_stringify_keys(item) for item in value]
    return value


def make_case_id(agent: str, inputs: Any) -> str:
    """Derive a stable case id from an agent name and its inputs.

    The same inputs always map to the same id, so a run can be matched to its baseline
    across executions. The id is a truncated SHA-1 of the canonical (key-sorted) JSON form of
    ``{"agent": agent, "inputs": inputs}``; values JSON cannot encode are rendered with ``str``.

    Dicts whose keys cannot be sorted or encoded as-is (for example a mix of ``int`` and ``str``
    keys) are handled by retrying with all keys converted to strings, instead of raising
    ``TypeError``. Inputs that already serialized successfully keep exactly the same id.

    Args:
        agent: The logical agent name.
        inputs: The inputs the agent was called with (must be JSON-serializable).

    Returns:
        A 12-character hex digest identifying the case.
    """
    record = {"agent": agent, "inputs": inputs}
    try:
        payload = json.dumps(record, sort_keys=True, default=str)
    except TypeError:
        # Key sorting (or key encoding) failed; normalize keys to strings and try again.
        payload = json.dumps(_stringify_keys(record), sort_keys=True, default=str)
    return hashlib.sha1(payload.encode("utf-8")).hexdigest()[:12]
brooder/py.typed ADDED
@@ -0,0 +1 @@
1
+ # Marker file for PEP 561. This package ships inline type information.