PyPI - brooder - Versions diffs - 0.1.0__py3-none-any.whl - Mend

brooder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

brooder/__init__.py +31 -0
brooder/analysis.py +79 -0
brooder/cli.py +281 -0
brooder/config.py +88 -0
brooder/diffing.py +217 -0
brooder/errors.py +31 -0
brooder/integrations/__init__.py +75 -0
brooder/integrations/anthropic.py +46 -0
brooder/integrations/base.py +170 -0
brooder/integrations/bedrock.py +49 -0
brooder/integrations/claude_agent.py +164 -0
brooder/integrations/google.py +61 -0
brooder/integrations/langchain.py +321 -0
brooder/integrations/openai.py +43 -0
brooder/integrations/openai_agents.py +208 -0
brooder/integrations/otel.py +216 -0
brooder/judges.py +109 -0
brooder/log.py +33 -0
brooder/metrics.py +116 -0
brooder/models.py +148 -0
brooder/py.typed +1 -0
brooder/recorder.py +342 -0
brooder/report.py +261 -0
brooder/storage.py +150 -0
brooder-0.1.0.dist-info/METADATA +338 -0
brooder-0.1.0.dist-info/RECORD +30 -0
brooder-0.1.0.dist-info/WHEEL +4 -0
brooder-0.1.0.dist-info/entry_points.txt +2 -0
brooder-0.1.0.dist-info/licenses/LICENSE +201 -0
brooder-0.1.0.dist-info/licenses/NOTICE +7 -0

brooder/errors.py ADDED Viewed

@@ -0,0 +1,31 @@
+"""Typed, user-facing exceptions.
+Anything raised as a ``BrooderError`` is considered safe to print to the user (no stack
+trace). The CLI's single error boundary (``cli.main``) catches these and exits cleanly.
+"""
+from __future__ import annotations
+class BrooderError(Exception):
+    """Base class for expected, user-facing Brooder errors."""
+class ScriptNotFoundError(BrooderError):
+    """The target agent script does not exist."""
+class CorruptRecordError(BrooderError):
+    """A stored baseline or run file could not be parsed as a Brooder record."""
+class ConfigError(BrooderError):
+    """brooder.yaml is present but invalid."""
+class RunawayError(Exception):
+    """Internal control-flow signal: a run exceeded ``trajectory.max_steps`` and was aborted.
+    Deliberately **not** a :class:`BrooderError`: it is always caught by the ``@record`` wrapper
+    (which turns it into a recorded ``runaway`` run) and must never reach the CLI error boundary.
+    """

brooder/integrations/__init__.py ADDED Viewed

@@ -0,0 +1,75 @@
+"""Provider auto-capture.
+Wrap an LLM client once and Brooder records the model's tool-call decisions automatically —
+no manual :func:`brooder.tool_call`. Supported: OpenAI, Azure OpenAI, Anthropic, AWS Bedrock,
+and Google (Gemini / Vertex).
+    import brooder, openai
+    client = brooder.instrument(openai.OpenAI())
+"""
+from __future__ import annotations
+from collections.abc import Callable
+from typing import Any, Optional
+from ..errors import BrooderError
+from . import anthropic, bedrock, google, openai
+# User-facing provider name -> internal adapter key.
+# Several public names share one adapter: "azure" uses the OpenAI adapter, "aws" the Bedrock
+# adapter, and "gcp" / "vertex" / "gemini" the Google adapter. Keys are lowercase.
+_ALIASES = {
+    "openai": "openai",
+    "azure": "openai",
+    "anthropic": "anthropic",
+    "bedrock": "bedrock",
+    "aws": "bedrock",
+    "google": "google",
+    "gcp": "google",
+    "vertex": "google",
+    "gemini": "google",
+}
+# Adapter key -> its instrument function.
+# Every value of ``_ALIASES`` has an entry here; each function is called as
+# ``fn(client, capture_content=...)``.
+_ADAPTERS: dict[str, Callable[..., Any]] = {
+    "openai": openai.instrument,
+    "anthropic": anthropic.instrument,
+    "bedrock": bedrock.instrument,
+    "google": google.instrument,
+}
+def _detect(client: Any) -> str:
+    """Infer the provider key from a client, or raise if it can't be determined."""
+    root = (type(client).__module__ or "").split(".")[0]
+    if root == "openai":
+        return "openai"
+    if root == "anthropic":
+        return "anthropic"
+    if root in ("botocore", "boto3") or hasattr(client, "converse"):
+        return "bedrock"
+    if root in ("google", "vertexai") or hasattr(client, "generate_content"):
+        return "google"
+    raise BrooderError("could not detect provider from client; pass provider=... explicitly")
+def instrument(client: Any, provider: Optional[str] = None, capture_content: bool = False) -> Any:
+    """Instrument an LLM client so Brooder auto-records the model's tool calls.
+    Args:
+        client: A provider SDK client — OpenAI/AzureOpenAI, Anthropic, a boto3 Bedrock-runtime
+            client, or a Google ``GenerativeModel``.
+        provider: Force a provider (``"openai"``, ``"azure"``, ``"anthropic"``, ``"bedrock"``/
+            ``"aws"``, ``"google"``/``"gcp"``/``"vertex"``/``"gemini"``). Auto-detected if omitted.
+        capture_content: Also record assistant text, not just tool calls.
+    Returns:
+        The same client, with the relevant method patched in place.
+    Raises:
+        BrooderError: If the provider cannot be resolved.
+    """
+    key = provider.lower() if provider else _detect(client)
+    resolved = _ALIASES.get(key)
+    if resolved is None:
+        raise BrooderError(f"unknown provider: {provider!r}")
+    return _ADAPTERS[resolved](client, capture_content=capture_content)

brooder/integrations/anthropic.py ADDED Viewed

@@ -0,0 +1,46 @@
+"""Anthropic auto-capture.
+Wraps ``client.messages.create`` and normalizes the content-block list (``text`` and
+``tool_use`` blocks). Also covers the Anthropic-on-Bedrock / Vertex clients from the same SDK.
+"""
+from __future__ import annotations
+from typing import Any, Optional
+from .base import NormalizedCall, ToolRequest, as_dict, get, wrap
+def _normalize(_kwargs: dict[str, Any], response: Any) -> Optional[NormalizedCall]:
+    texts: list[str] = []
+    tool_calls: list[ToolRequest] = []
+    for block in get(response, "content") or []:
+        kind = get(block, "type")
+        if kind == "text":
+            text = get(block, "text")
+            if text:
+                texts.append(text)
+        elif kind == "tool_use":
+            tool_calls.append(
+                ToolRequest(name=get(block, "name") or "", arguments=as_dict(get(block, "input")))
+            )
+    return NormalizedCall(
+        provider="anthropic",
+        model=get(response, "model"),
+        content="".join(texts) or None,
+        tool_calls=tool_calls,
+    )
+def instrument(client: Any, capture_content: bool = False) -> Any:
+    """Instrument an Anthropic client's ``messages.create``.
+    Args:
+        client: An ``Anthropic`` (or ``AnthropicBedrock`` / ``AnthropicVertex``) client.
+        capture_content: Also record assistant text, not just tool calls.
+    Returns:
+        The same client, patched in place.
+    """
+    wrap(client.messages, "create", _normalize, capture_content)
+    return client

brooder/integrations/base.py ADDED Viewed

@@ -0,0 +1,170 @@
+"""Provider-agnostic core for auto-capture.
+Each provider adapter turns its SDK's response into a :class:`NormalizedCall`, and :func:`wrap`
+patches the SDK method so every call is normalized and recorded into the active run. Capture is
+best-effort: if normalization ever fails it is logged at debug level and the user's original call
+result is returned unchanged — instrumentation must never break the app it observes.
+"""
+from __future__ import annotations
+import functools
+import inspect
+import json
+import weakref
+from collections.abc import Callable
+from dataclasses import dataclass, field
+from typing import Any, Optional
+from .. import recorder
+from ..errors import RunawayError
+from ..log import get_logger
+# Shared logger; capture failures are reported through it at debug level.
+_log = get_logger()
+# Methods we've already wrapped, so instrumenting twice is a no-op.
+# Held weakly, so this registry does not by itself keep a wrapper alive.
+_wrapped: weakref.WeakSet[Any] = weakref.WeakSet()
+@dataclass
+class ToolRequest:
+    """A tool/function the model asked to call, in provider-neutral form."""
+    name: str
+    arguments: dict[str, Any] = field(default_factory=dict)
+@dataclass
+class NormalizedCall:
+    """A single model call normalized across providers."""
+    provider: str
+    model: Optional[str] = None
+    content: Optional[str] = None
+    tool_calls: list[ToolRequest] = field(default_factory=list)
+# A capture function receives the call kwargs and the raw response and returns a NormalizedCall
+# (or None to skip). It must not raise for expected shapes.
+# Only keyword arguments of the wrapped call are passed in; positional arguments are not.
+CaptureFn = Callable[[dict[str, Any], Any], Optional[NormalizedCall]]
+def record_call(call: NormalizedCall, capture_content: bool = False) -> None:
+    """Record a normalized model call into the active run's trajectory.
+    Records a ``TURN`` step for the call (so turn counts are captured), then a tool-call step for
+    each tool the model requested (matching manual :func:`brooder.tool_call` semantics), so
+    behavioral regressions in tool selection are caught. The model name is deliberately *not*
+    recorded, so switching models is not itself a diff.
+    Args:
+        call: The normalized call.
+        capture_content: If ``True``, also record the assistant's text content (only when it made
+            no tool calls). Off by default to keep diffs focused on tool decisions.
+    """
+    recorder.turn({"provider": call.provider})
+    for tool in call.tool_calls:
+        recorder.tool_call(tool.name, tool.arguments)
+    if capture_content and call.content is not None and not call.tool_calls:
+        recorder.tool_call(f"llm:{call.provider}", {}, result=call.content)
+def _capture_into_run(
+    holder: Any,
+    attr: str,
+    capture: CaptureFn,
+    kwargs: dict[str, Any],
+    response: Any,
+    capture_content: bool,
+) -> None:
+    """Normalize ``response`` and record it into the active run — best-effort.
+    Shared by the sync and async wrappers. A capture failure is logged and swallowed so
+    instrumentation never breaks the user's call; only :class:`RunawayError` (the ``max_steps``
+    guardrail) is allowed to propagate.
+    """
+    try:
+        call = capture(kwargs, response)
+        if call is not None:
+            record_call(call, capture_content=capture_content)
+    except RunawayError:
+        # The max_steps guardrail must abort the run, not be swallowed as a capture failure.
+        raise
+    except Exception:
+        # Capture is best-effort and must never break the user's call.
+        _log.debug("brooder: capture failed for %s.%s", type(holder).__name__, attr, exc_info=True)
+def wrap(holder: Any, attr: str, capture: CaptureFn, capture_content: bool = False) -> None:
+    """Patch ``holder.attr`` so each call is normalized and recorded.
+    Works for both sync and ``async def`` methods: an async method (e.g. ``AsyncOpenAI``'s
+    ``chat.completions.create``) is wrapped with a coroutine that awaits the original before
+    recording. Idempotent: wrapping an already-wrapped method is a no-op.
+    Args:
+        holder: The object owning the method (e.g. ``client.chat.completions``).
+        attr: The method name to wrap (e.g. ``"create"``).
+        capture: Turns ``(kwargs, response)`` into a :class:`NormalizedCall`.
+        capture_content: Forwarded to :func:`record_call`.
+    """
+    original = getattr(holder, attr)
+    if original in _wrapped:
+        return
+    wrapper: Callable[..., Any]
+    if inspect.iscoroutinefunction(original):
+        @functools.wraps(original)
+        async def _async_wrapper(*args: Any, **kwargs: Any) -> Any:
+            response = await original(*args, **kwargs)
+            _capture_into_run(holder, attr, capture, kwargs, response, capture_content)
+            return response
+        wrapper = _async_wrapper
+    else:
+        @functools.wraps(original)
+        def _sync_wrapper(*args: Any, **kwargs: Any) -> Any:
+            response = original(*args, **kwargs)
+            _capture_into_run(holder, attr, capture, kwargs, response, capture_content)
+            return response
+        wrapper = _sync_wrapper
+    _wrapped.add(wrapper)
+    setattr(holder, attr, wrapper)
+def get(obj: Any, key: str, default: Any = None) -> Any:
+    """Read ``key`` from an object by attribute or a dict by item, tolerating either shape."""
+    if obj is None:
+        return default
+    if isinstance(obj, dict):
+        return obj.get(key, default)
+    return getattr(obj, key, default)
+def parse_json(raw: Any) -> dict[str, Any]:
+    """Coerce a tool-arguments payload (JSON string, dict, or None) into a dict."""
+    if isinstance(raw, dict):
+        return dict(raw)
+    if isinstance(raw, str):
+        try:
+            parsed = json.loads(raw)
+        except json.JSONDecodeError:
+            return {"_raw": raw}
+        return parsed if isinstance(parsed, dict) else {"_value": parsed}
+    return as_dict(raw)
+def as_dict(value: Any) -> dict[str, Any]:
+    """Best-effort conversion of a mapping-like value into a plain dict."""
+    if value is None:
+        return {}
+    if isinstance(value, dict):
+        return dict(value)
+    try:
+        return dict(value)
+    except (TypeError, ValueError):
+        return {"_value": str(value)}

brooder/integrations/bedrock.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""AWS Bedrock auto-capture.
+Wraps the boto3 ``bedrock-runtime`` client's ``converse`` method. The model id comes from the
+call kwargs (``modelId``); content and tool use come from the ``output.message.content`` blocks.
+"""
+from __future__ import annotations
+from typing import Any, Optional
+from .base import NormalizedCall, ToolRequest, as_dict, get, wrap
+def _normalize(kwargs: dict[str, Any], response: Any) -> Optional[NormalizedCall]:
+    message = get(get(response, "output"), "message")
+    texts: list[str] = []
+    tool_calls: list[ToolRequest] = []
+    for block in get(message, "content") or []:
+        text = get(block, "text")
+        if text is not None:
+            texts.append(text)
+        tool_use = get(block, "toolUse")
+        if tool_use:
+            tool_calls.append(
+                ToolRequest(
+                    name=get(tool_use, "name") or "",
+                    arguments=as_dict(get(tool_use, "input")),
+                )
+            )
+    return NormalizedCall(
+        provider="bedrock",
+        model=get(kwargs, "modelId"),
+        content="".join(texts) or None,
+        tool_calls=tool_calls,
+    )
+def instrument(client: Any, capture_content: bool = False) -> Any:
+    """Instrument a boto3 ``bedrock-runtime`` client's ``converse``.
+    Args:
+        client: A boto3 client created with ``boto3.client("bedrock-runtime")``.
+        capture_content: Also record assistant text, not just tool calls.
+    Returns:
+        The same client, patched in place.
+    """
+    wrap(client, "converse", _normalize, capture_content)
+    return client

brooder/integrations/claude_agent.py ADDED Viewed

@@ -0,0 +1,164 @@
+"""Claude Agent SDK capture — trajectories from the SDK's hooks.
+Register Brooder's hooks with the Claude Agent SDK and it records the agent's tool trajectory
+automatically — no manual ``tool_call``:
+    import brooder
+    from claude_agent_sdk import ClaudeSDKClient, ClaudeAgentOptions, ResultMessage
+    options = ClaudeAgentOptions(hooks=brooder.claude_agent_hooks(agent="support-agent"))
+    async with ClaudeSDKClient(options=options) as client:
+        await client.query(prompt)
+        async for message in client.receive_response():
+            if isinstance(message, ResultMessage):
+                brooder.integrations.claude_agent.record_output(message.session_id, message.result)
+**How hooks map to steps** (verified against the Python SDK):
+- ``UserPromptSubmit`` → open a run keyed by ``session_id``; the prompt becomes the case identity.
+- ``PostToolUse`` / ``PostToolUseFailure`` → a ``TOOL`` step (name, input, and the tool response).
+- ``Stop`` → a ``FINAL`` step and save.
+**SDK constraints this works around.** The Python SDK exposes no ``SessionStart`` / ``SessionEnd``
+hooks (those are settings-file only), so the prompt/stop cycle delimits a run — each user turn in a
+session is one case. And ``Stop`` does not carry the final assistant text (it lives in the
+``ResultMessage`` stream), so the tool *trajectory* — the core diff signal — is always captured, but
+the final *answer* is only recorded if you feed it via :func:`record_output`. Turn counts aren't
+exposed by hooks, so no ``TURN`` steps are fabricated.
+"""
+from __future__ import annotations
+import threading
+from typing import Any, Optional
+from .. import recorder
+from ..errors import BrooderError
+from ..log import get_logger
+from .base import as_dict
+# Shared logger; hook capture failures are reported through it at debug level.
+_log = get_logger()
+# Final answers recorded out-of-band (hooks don't expose them), keyed by session id.
+_outputs: dict[str, Any] = {}
+# Guards every read and write of ``_outputs``.
+_outputs_lock = threading.Lock()
+def record_output(session_id: str, output: Any) -> None:
+    """Record the agent's final answer for a session so its run gets a populated ``FINAL`` step.
+    The Claude Agent SDK's ``Stop`` hook does not carry the final assistant text — it comes from the
+    ``ResultMessage`` in the message stream. Call this from your loop (with ``message.result``) to
+    make the final output diffable. Optional: the tool trajectory is captured from hooks regardless.
+    The value is kept in a module-level, lock-protected mapping with one pending output per
+    session id; a later call for the same id replaces the earlier value.
+    Args:
+        session_id: The session id from the message / hook payload.
+        output: The agent's final answer.
+    """
+    with _outputs_lock:
+        _outputs[session_id] = output
+def _pop_output(session_id: str) -> Any:
+    """Take and clear any out-of-band output recorded for ``session_id``."""
+    with _outputs_lock:
+        return _outputs.pop(session_id, None)
+def _reset() -> None:
+    """Clear out-of-band output state (used by tests).
+    Drops every pending output for every session, under the module lock.
+    """
+    with _outputs_lock:
+        _outputs.clear()
+class _Capture:
+    """Builds the Brooder hook callbacks; each payload carries the ``session_id`` to key on."""
+    def __init__(self, agent: Optional[str]) -> None:
+        self._agent = agent
+    def _handle(self, session_id: str) -> recorder.RunHandle:
+        """Get the session's open run, opening one lazily if a hook fired before the prompt."""
+        handle = recorder.get_run(session_id)
+        if handle is None:
+            handle = recorder.open_run(self._agent or "claude-agent", external_id=session_id)
+        return handle
+    async def _on_user_prompt(
+        self, input_data: dict[str, Any], tool_use_id: Optional[str], context: Any
+    ) -> dict[str, Any]:
+        try:
+            session_id = input_data.get("session_id")
+            if session_id is not None:
+                stale = recorder.get_run(session_id)
+                if stale is not None:
+                    stale.finish(_pop_output(session_id))  # prior turn never hit Stop; close it
+                handle = recorder.open_run(self._agent or "claude-agent", external_id=session_id)
+                prompt = input_data.get("prompt")
+                if prompt is not None:
+                    handle.set_inputs(prompt)
+        except Exception:  # capture must never break the agent
+            _log.debug("brooder: claude-agent UserPromptSubmit capture failed", exc_info=True)
+        return {}
+    async def _on_tool(
+        self, input_data: dict[str, Any], tool_use_id: Optional[str], context: Any
+    ) -> dict[str, Any]:
+        try:
+            session_id = input_data.get("session_id")
+            if session_id is not None:
+                self._handle(session_id).tool_call(
+                    input_data.get("tool_name") or "tool",
+                    as_dict(input_data.get("tool_input")),
+                    result=input_data.get("tool_response"),
+                )
+        except Exception:
+            _log.debug("brooder: claude-agent PostToolUse capture failed", exc_info=True)
+        return {}
+    async def _on_stop(
+        self, input_data: dict[str, Any], tool_use_id: Optional[str], context: Any
+    ) -> dict[str, Any]:
+        try:
+            session_id = input_data.get("session_id")
+            if session_id is not None:
+                handle = recorder.get_run(session_id)
+                if handle is not None:
+                    handle.final(_pop_output(session_id))  # Stop = the agent produced an answer
+                    handle.finish()
+        except Exception:
+            _log.debug("brooder: claude-agent Stop capture failed", exc_info=True)
+        return {}
+    def callbacks(self) -> dict[str, list[Any]]:
+        """Return ``{event_name: [callback]}`` for the events Brooder captures."""
+        return {
+            "UserPromptSubmit": [self._on_user_prompt],
+            "PostToolUse": [self._on_tool],
+            "PostToolUseFailure": [self._on_tool],
+            "Stop": [self._on_stop],
+        }
+def claude_agent_hooks(agent: Optional[str] = None) -> dict[str, list[Any]]:
+    """Build the hooks mapping to pass to ``ClaudeAgentOptions(hooks=...)``.
+    Records the agent's tool trajectory into Brooder runs (see the module docstring for details).
+    Args:
+        agent: Logical agent name used to group baselines (defaults to ``"claude-agent"``).
+    Returns:
+        A ``{event_name: [HookMatcher(...)]}`` mapping for ``ClaudeAgentOptions``.
+    Raises:
+        BrooderError: If the Claude Agent SDK is not installed.
+    """
+    try:
+        from claude_agent_sdk import HookMatcher
+    except ImportError as exc:
+        raise BrooderError(
+            "claude_agent_hooks() needs the Claude Agent SDK — `pip install claude-agent-sdk`"
+        ) from exc
+    capture = _Capture(agent)
+    return {event: [HookMatcher(hooks=cbs)] for event, cbs in capture.callbacks().items()}

brooder/integrations/google.py ADDED Viewed

@@ -0,0 +1,61 @@
+"""Google Gemini / Vertex AI auto-capture.
+Wraps a ``GenerativeModel``'s ``generate_content``. The model name is read from the model object
+(``model_name``); content and function calls come from the first candidate's parts. Covers both
+the ``google-generativeai`` (Gemini API) and ``vertexai`` SDKs, which share this response shape.
+"""
+from __future__ import annotations
+from collections.abc import Callable
+from typing import Any, Optional
+from .base import NormalizedCall, ToolRequest, as_dict, get, wrap
+# Signature of a capture callback: (call kwargs, raw response) -> NormalizedCall or None.
+# Spelled identically to the ``CaptureFn`` alias defined in ``.base``.
+CaptureFn = Callable[[dict[str, Any], Any], Optional[NormalizedCall]]
+def _normalizer(model_name: Optional[str]) -> CaptureFn:
+    def _normalize(_kwargs: dict[str, Any], response: Any) -> Optional[NormalizedCall]:
+        candidates = get(response, "candidates") or []
+        content = get(candidates[0], "content") if candidates else None
+        texts: list[str] = []
+        tool_calls: list[ToolRequest] = []
+        for part in get(content, "parts") or []:
+            text = get(part, "text")
+            if text:
+                texts.append(text)
+            function_call = get(part, "function_call")
+            if function_call:
+                tool_calls.append(
+                    ToolRequest(
+                        name=get(function_call, "name") or "",
+                        arguments=as_dict(get(function_call, "args")),
+                    )
+                )
+        return NormalizedCall(
+            provider="google",
+            model=model_name,
+            content="".join(texts) or None,
+            tool_calls=tool_calls,
+        )
+    return _normalize
+def instrument(model: Any, capture_content: bool = False) -> Any:
+    """Instrument a Google ``GenerativeModel``'s ``generate_content``.
+    Args:
+        model: A ``google.generativeai`` or ``vertexai`` ``GenerativeModel`` instance.
+        capture_content: Also record assistant text, not just tool calls.
+    Returns:
+        The same model, patched in place.
+    """
+    normalize = _normalizer(get(model, "model_name"))
+    wrap(model, "generate_content", normalize, capture_content)
+    # Async Gemini/Vertex uses a distinct coroutine method; wrap it too when present.
+    if hasattr(model, "generate_content_async"):
+        wrap(model, "generate_content_async", normalize, capture_content)
+    return model