braintrust 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. braintrust/_generated_types.py +328 -126
  2. braintrust/cli/install/api.py +1 -1
  3. braintrust/conftest.py +24 -0
  4. braintrust/devserver/test_server_integration.py +0 -11
  5. braintrust/framework.py +98 -1
  6. braintrust/functions/invoke.py +4 -9
  7. braintrust/functions/test_invoke.py +61 -0
  8. braintrust/generated_types.py +13 -7
  9. braintrust/logger.py +107 -66
  10. braintrust/prompt_cache/test_disk_cache.py +3 -3
  11. braintrust/span_cache.py +337 -0
  12. braintrust/span_identifier_v3.py +21 -0
  13. braintrust/span_types.py +3 -0
  14. braintrust/test_bt_json.py +23 -19
  15. braintrust/test_logger.py +116 -0
  16. braintrust/test_span_cache.py +344 -0
  17. braintrust/test_trace.py +267 -0
  18. braintrust/trace.py +385 -0
  19. braintrust/version.py +2 -2
  20. braintrust/wrappers/claude_agent_sdk/_wrapper.py +48 -6
  21. braintrust/wrappers/claude_agent_sdk/test_wrapper.py +106 -0
  22. braintrust/wrappers/langsmith_wrapper.py +517 -0
  23. braintrust/wrappers/test_agno.py +0 -12
  24. braintrust/wrappers/test_anthropic.py +1 -11
  25. braintrust/wrappers/test_dspy.py +0 -11
  26. braintrust/wrappers/test_google_genai.py +6 -1
  27. braintrust/wrappers/test_langsmith_wrapper.py +338 -0
  28. braintrust/wrappers/test_litellm.py +0 -10
  29. braintrust/wrappers/test_oai_attachments.py +0 -10
  30. braintrust/wrappers/test_openai.py +3 -12
  31. braintrust/wrappers/test_openrouter.py +0 -9
  32. braintrust/wrappers/test_pydantic_ai_integration.py +0 -11
  33. braintrust/wrappers/test_pydantic_ai_wrap_openai.py +2 -0
  34. {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/METADATA +1 -1
  35. {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/RECORD +38 -31
  36. {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/WHEEL +1 -1
  37. {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/entry_points.txt +0 -0
  38. {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/top_level.txt +0 -0
braintrust/trace.py ADDED
@@ -0,0 +1,385 @@
1
+ """
2
+ Trace objects for accessing spans in evaluations.
3
+
4
+ This module provides the LocalTrace class which allows scorers to access
5
+ spans from the current evaluation task without making server round-trips.
6
+ """
7
+
8
+ import asyncio
9
+ from typing import Any, Awaitable, Callable, Optional, Protocol
10
+
11
+ from braintrust.logger import BraintrustState, ObjectFetcher
12
+
13
+
14
class SpanData:
    """Span data returned by get_spans()."""

    def __init__(
        self,
        input: Optional[Any] = None,
        output: Optional[Any] = None,
        metadata: Optional[dict[str, Any]] = None,
        span_id: Optional[str] = None,
        span_parents: Optional[list[str]] = None,
        span_attributes: Optional[dict[str, Any]] = None,
        **kwargs: Any,
    ):
        # Well-known span fields.
        self.input = input
        self.output = output
        self.metadata = metadata
        self.span_id = span_id
        self.span_parents = span_parents
        self.span_attributes = span_attributes
        # Any extra fields (e.g. id, _xact_id) become attributes as well.
        self.__dict__.update(kwargs)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "SpanData":
        """Create SpanData from a dictionary."""
        return cls(**data)

    def to_dict(self) -> dict[str, Any]:
        """Convert to a dictionary, omitting fields that are None."""
        return {key: value for key, value in self.__dict__.items() if value is not None}
49
+
50
+
51
class SpanFetcher(ObjectFetcher[dict[str, Any]]):
    """
    Fetcher for spans by root_span_id, using the ObjectFetcher pattern.
    Handles pagination automatically via cursor-based iteration.
    """

    def __init__(
        self,
        object_type: str,  # Literal["experiment", "project_logs", "playground_logs"]
        object_id: str,
        root_span_id: str,
        state: BraintrustState,
        span_type_filter: Optional[list[str]] = None,
    ):
        # The BTQL filter restricts rows to this root span (and optionally
        # to a set of span types), excluding scorer spans.
        super().__init__(
            object_type=object_type,
            _internal_btql={"filter": self._build_filter(root_span_id, span_type_filter)},
        )
        self._object_id = object_id
        self._state = state

    @staticmethod
    def _build_filter(root_span_id: str, span_type_filter: Optional[list[str]] = None) -> dict[str, Any]:
        """Build BTQL filter expression."""
        # root_span_id = '<value>'
        match_root = {
            "op": "eq",
            "left": {"op": "ident", "name": ["root_span_id"]},
            "right": {"op": "literal", "value": root_span_id},
        }
        # span_attributes.purpose IS NULL OR span_attributes.purpose != 'scorer'
        exclude_scorers = {
            "op": "or",
            "children": [
                {
                    "op": "isnull",
                    "expr": {"op": "ident", "name": ["span_attributes", "purpose"]},
                },
                {
                    "op": "ne",
                    "left": {"op": "ident", "name": ["span_attributes", "purpose"]},
                    "right": {"op": "literal", "value": "scorer"},
                },
            ],
        }
        clauses = [match_root, exclude_scorers]

        # span_attributes.type IN (<filter>) when a type filter is given.
        if span_type_filter:
            clauses.append(
                {
                    "op": "in",
                    "left": {"op": "ident", "name": ["span_attributes", "type"]},
                    "right": {"op": "literal", "value": span_type_filter},
                }
            )

        return {"op": "and", "children": clauses}

    @property
    def id(self) -> str:
        return self._object_id

    def _get_state(self) -> BraintrustState:
        return self._state
120
+
121
+
122
# Signature of a span-fetching coroutine: takes an optional list of span
# types to filter by and resolves to the matching SpanData rows.
SpanFetchFn = Callable[[Optional[list[str]]], Awaitable[list[SpanData]]]
123
+
124
+
125
class CachedSpanFetcher:
    """
    Cached span fetcher that handles fetching and caching spans by type.

    Caching strategy:
    - Cache spans by span type (dict[spanType, list[SpanData]])
    - Track if all spans have been fetched (all_fetched flag)
    - When filtering by spanType, only fetch types not already in cache
    - Requested types that come back empty are cached as empty lists, so a
      type with zero spans is not refetched on every call
    """

    def __init__(
        self,
        object_type: Optional[str] = None,  # Literal["experiment", "project_logs", "playground_logs"]
        object_id: Optional[str] = None,
        root_span_id: Optional[str] = None,
        get_state: Optional[Callable[[], Awaitable[BraintrustState]]] = None,
        fetch_fn: Optional[SpanFetchFn] = None,
    ):
        """Create a fetcher either from a raw fetch_fn (used by tests) or
        from the object coordinates plus a state getter (production path).

        Raises:
            ValueError: if neither fetch_fn nor the full set of
                object_type/object_id/root_span_id/get_state is provided.
        """
        # span type -> spans of that type fetched so far
        self._span_cache: dict[str, list[SpanData]] = {}
        # True once an unfiltered fetch has populated the cache completely
        self._all_fetched = False

        if fetch_fn is not None:
            # Direct fetch function injection (for testing)
            self._fetch_fn = fetch_fn
        else:
            # Standard constructor with SpanFetcher
            if object_type is None or object_id is None or root_span_id is None or get_state is None:
                raise ValueError("Must provide either fetch_fn or all of object_type, object_id, root_span_id, get_state")

            async def _fetch_fn(span_type: Optional[list[str]]) -> list[SpanData]:
                state = await get_state()
                fetcher = SpanFetcher(
                    object_type=object_type,
                    object_id=object_id,
                    root_span_id=root_span_id,
                    state=state,
                    span_type_filter=span_type,
                )
                rows = list(fetcher.fetch())
                # Filter out scorer spans. The BTQL filter should already
                # exclude them; this is a defensive second pass.
                filtered = [
                    row
                    for row in rows
                    if not (
                        isinstance(row.get("span_attributes"), dict)
                        and row.get("span_attributes", {}).get("purpose") == "scorer"
                    )
                ]
                return [
                    SpanData(
                        input=row.get("input"),
                        output=row.get("output"),
                        metadata=row.get("metadata"),
                        span_id=row.get("span_id"),
                        span_parents=row.get("span_parents"),
                        span_attributes=row.get("span_attributes"),
                        id=row.get("id"),
                        _xact_id=row.get("_xact_id"),
                        _pagination_key=row.get("_pagination_key"),
                        root_span_id=row.get("root_span_id"),
                    )
                    for row in filtered
                ]

            self._fetch_fn = _fetch_fn

    async def get_spans(self, span_type: Optional[list[str]] = None) -> list[SpanData]:
        """
        Get spans, using cache when possible.

        Args:
            span_type: Optional list of span types to filter by

        Returns:
            List of matching spans
        """
        # If we've fetched all spans, just filter from cache
        if self._all_fetched:
            return self._get_from_cache(span_type)

        # If no filter requested, fetch everything
        if not span_type:
            await self._fetch_spans(None)
            self._all_fetched = True
            return self._get_from_cache(None)

        # Find which span types we don't have in cache yet
        missing_types = [t for t in span_type if t not in self._span_cache]

        # If all requested types are cached, return from cache
        if not missing_types:
            return self._get_from_cache(span_type)

        # Fetch only the missing types
        await self._fetch_spans(missing_types)
        return self._get_from_cache(span_type)

    async def _fetch_spans(self, span_type: Optional[list[str]]) -> None:
        """Fetch spans from the server and merge them into the cache.

        Every explicitly requested type is seeded with an empty list first,
        so types with zero matching spans still count as fetched and won't
        be re-requested by later get_spans() calls (previously an empty type
        was refetched on every call).
        """
        spans = await self._fetch_fn(span_type)

        if span_type:
            for requested_type in span_type:
                self._span_cache.setdefault(requested_type, [])

        for span in spans:
            span_attrs = span.span_attributes or {}
            span_type_str = span_attrs.get("type", "")
            self._span_cache.setdefault(span_type_str, []).append(span)

    def _get_from_cache(self, span_type: Optional[list[str]]) -> list[SpanData]:
        """Get spans from cache, optionally filtering by type."""
        if not span_type:
            # Return all spans across every cached type
            result: list[SpanData] = []
            for spans in self._span_cache.values():
                result.extend(spans)
            return result

        # Return only requested types, in the requested order
        result = []
        for type_str in span_type:
            result.extend(self._span_cache.get(type_str, []))
        return result
248
+
249
+
250
class Trace(Protocol):
    """
    Interface for trace objects that can be used by scorers.
    Both the SDK's LocalTrace class and the API wrapper's WrapperTrace implement this.
    """

    # NOTE: this is a typing.Protocol, so conformance is structural
    # (duck-typed) — implementers do not need to subclass Trace.

    def get_configuration(self) -> dict[str, str]:
        """Get the trace configuration (object_type, object_id, root_span_id)."""
        ...

    async def get_spans(self, span_type: Optional[list[str]] = None) -> list[SpanData]:
        """
        Fetch all spans for this root span.

        Args:
            span_type: Optional list of span types to filter by

        Returns:
            List of matching spans
        """
        ...
271
+
272
+
273
class LocalTrace(dict):
    """
    SDK implementation of Trace that uses local span cache and falls back to BTQL.
    Carries identifying information about the evaluation so scorers can perform
    richer logging or side effects.

    Inherits from dict so that it serializes to {"trace_ref": {...}} when passed
    to json.dumps(). This allows LocalTrace to be transparently serialized when
    passed through invoke() or other JSON-serializing code paths.
    """

    def __init__(
        self,
        object_type: str,  # Literal["experiment", "project_logs", "playground_logs"]
        object_id: str,
        root_span_id: str,
        ensure_spans_flushed: Optional[Callable[[], Awaitable[None]]],
        state: BraintrustState,
    ):
        # Initialize dict with trace_ref for JSON serialization
        super().__init__(
            {
                "trace_ref": {
                    "object_type": object_type,
                    "object_id": object_id,
                    "root_span_id": root_span_id,
                }
            }
        )

        self._object_type = object_type
        self._object_id = object_id
        self._root_span_id = root_span_id
        self._ensure_spans_flushed = ensure_spans_flushed
        self._state = state
        self._spans_flushed = False
        self._spans_flush_promise: Optional[asyncio.Task[None]] = None

        async def get_state() -> BraintrustState:
            # Make sure buffered spans reach the server before querying it.
            await self._ensure_spans_ready()
            # login() may block on network I/O, so run it in the default
            # executor. get_running_loop() replaces the deprecated
            # get_event_loop() for use inside coroutines.
            await asyncio.get_running_loop().run_in_executor(None, state.login)
            return state

        self._cached_fetcher = CachedSpanFetcher(
            object_type=object_type,
            object_id=object_id,
            root_span_id=root_span_id,
            get_state=get_state,
        )

    def get_configuration(self) -> dict[str, str]:
        """Get the trace configuration."""
        return {
            "object_type": self._object_type,
            "object_id": self._object_id,
            "root_span_id": self._root_span_id,
        }

    async def get_spans(self, span_type: Optional[list[str]] = None) -> list[SpanData]:
        """
        Fetch all rows for this root span from its parent object (experiment or project logs).
        First checks the local span cache for recently logged spans, then falls
        back to CachedSpanFetcher which handles BTQL fetching and caching.

        Args:
            span_type: Optional list of span types to filter by

        Returns:
            List of matching spans
        """
        # Try local span cache first (for recently logged spans not yet flushed)
        cached_spans = self._state.span_cache.get_by_root_span_id(self._root_span_id)
        if cached_spans:
            # Drop scorer spans so scorers don't see their own output
            spans = [
                span
                for span in cached_spans
                if (span.span_attributes or {}).get("purpose") != "scorer"
            ]

            # Filter by span type if requested
            if span_type:
                spans = [span for span in spans if (span.span_attributes or {}).get("type", "") in span_type]

            # Convert to SpanData
            return [
                SpanData(
                    input=span.input,
                    output=span.output,
                    metadata=span.metadata,
                    span_id=span.span_id,
                    span_parents=span.span_parents,
                    span_attributes=span.span_attributes,
                )
                for span in spans
            ]

        # Fall back to CachedSpanFetcher for BTQL fetching with caching
        return await self._cached_fetcher.get_spans(span_type)

    async def _ensure_spans_ready(self) -> None:
        """Ensure spans are flushed (at most once) before fetching.

        Concurrent callers share a single flush task; on failure the task
        is cleared so the next caller retries the flush.
        """
        if self._spans_flushed or not self._ensure_spans_flushed:
            return

        if self._spans_flush_promise is None:

            async def flush_and_mark() -> None:
                try:
                    await self._ensure_spans_flushed()
                    self._spans_flushed = True
                except Exception:
                    # Clear the task so a later call can retry, then re-raise
                    # with the original traceback.
                    self._spans_flush_promise = None
                    raise

            self._spans_flush_promise = asyncio.create_task(flush_and_mark())

        await self._spans_flush_promise
braintrust/version.py CHANGED
@@ -1,4 +1,4 @@
1
- VERSION = "0.4.2"
1
+ VERSION = "0.5.0"
2
2
 
3
3
  # this will be templated during the build
4
- GIT_COMMIT = "3ca420e53e77d4665b91ccc7631c95dc97ce566d"
4
+ GIT_COMMIT = "617d9b730b37e96b7d05a099b95f5387944d0951"
@@ -2,7 +2,7 @@ import dataclasses
2
2
  import logging
3
3
  import threading
4
4
  import time
5
- from collections.abc import AsyncGenerator, Callable
5
+ from collections.abc import AsyncGenerator, AsyncIterable, Callable
6
6
  from typing import Any
7
7
 
8
8
  from braintrust.logger import start_span
@@ -191,17 +191,38 @@ def _create_client_wrapper_class(original_client_class: Any) -> Any:
191
191
  self.__client = client
192
192
  self.__last_prompt: str | None = None
193
193
  self.__query_start_time: float | None = None
194
+ self.__captured_messages: list[dict[str, Any]] | None = None
194
195
 
195
196
async def query(self, *args: Any, **kwargs: Any) -> Any:
    """Wrap query to capture the prompt and start time for tracing.

    Delegates to the wrapped client's query() unchanged; the captured
    prompt/start time are consumed later by receive_response() when it
    logs the trace span.
    """
    # Capture the time when query is called (when LLM call starts)
    self.__query_start_time = time.time()
    # Reset per-call capture state left over from any previous query.
    self.__captured_messages = None

    # Capture the prompt for use in receive_response. The prompt arrives
    # either positionally or as the `prompt` keyword.
    prompt = args[0] if args else kwargs.get("prompt")

    if prompt is not None:
        if isinstance(prompt, str):
            self.__last_prompt = prompt
        elif isinstance(prompt, AsyncIterable):
            # AsyncIterable[dict] - wrap it to capture messages as they're yielded
            captured: list[dict[str, Any]] = []
            self.__captured_messages = captured
            self.__last_prompt = None  # Will be set after messages are captured

            async def capturing_wrapper() -> AsyncGenerator[dict[str, Any], None]:
                # Tee each message into `captured` while forwarding it to
                # the underlying client untouched.
                async for msg in prompt:
                    captured.append(msg)
                    yield msg

            # Replace the prompt with our capturing wrapper
            if args:
                args = (capturing_wrapper(),) + args[1:]
            else:
                kwargs["prompt"] = capturing_wrapper()
        else:
            # Fallback for any other prompt type: keep its string form.
            self.__last_prompt = str(prompt)

    return await self.__client.query(*args, **kwargs)
207
228
 
@@ -215,11 +236,16 @@ def _create_client_wrapper_class(original_client_class: Any) -> Any:
215
236
  """
216
237
  generator = self.__client.receive_response()
217
238
 
239
+ # Determine the initial input - may be updated later if using async generator
240
+ initial_input = self.__last_prompt if self.__last_prompt else None
241
+
218
242
  with start_span(
219
243
  name="Claude Agent",
220
244
  span_attributes={"type": SpanTypeAttribute.TASK},
221
- input=self.__last_prompt if self.__last_prompt else None,
245
+ input=initial_input,
222
246
  ) as span:
247
+ # If we're capturing async messages, we'll update input after they're consumed
248
+ input_needs_update = self.__captured_messages is not None
223
249
  # Store the parent span export in thread-local storage for tool handlers
224
250
  _thread_local.parent_span_export = span.export()
225
251
 
@@ -228,6 +254,13 @@ def _create_client_wrapper_class(original_client_class: Any) -> Any:
228
254
 
229
255
  try:
230
256
  async for message in generator:
257
+ # Update input from captured async messages (once, after they're consumed)
258
+ if input_needs_update and self.__captured_messages:
259
+ captured_input = _format_captured_messages(self.__captured_messages)
260
+ if captured_input:
261
+ span.log(input=captured_input)
262
+ input_needs_update = False
263
+
231
264
  message_type = type(message).__name__
232
265
 
233
266
  if message_type == "AssistantMessage":
@@ -390,3 +423,12 @@ def _build_llm_input(prompt: Any, conversation_history: list[dict[str, Any]]) ->
390
423
  return [{"content": prompt, "role": "user"}] + conversation_history
391
424
 
392
425
  return conversation_history if conversation_history else None
426
+
427
+
428
+ def _format_captured_messages(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
429
+ """Formats captured async generator messages into structured input.
430
+
431
+ Returns the messages as-is to preserve structure for tracing.
432
+ Empty list returns empty list.
433
+ """
434
+ return messages if messages else []
@@ -177,3 +177,109 @@ async def test_calculator_with_multiple_operations(memory_logger):
177
177
  if span["span_id"] != root_span_id:
178
178
  assert span["root_span_id"] == root_span_id
179
179
  assert root_span_id in span["span_parents"]
180
+
181
+
182
+ def _make_message(content: str) -> dict:
183
+ """Create a streaming format message dict."""
184
+ return {"type": "user", "message": {"role": "user", "content": content}}
185
+
186
+
187
+ def _assert_structured_input(task_span: dict, expected_contents: list[str]) -> None:
188
+ """Assert that task span input is a structured list with expected content."""
189
+ inp = task_span.get("input")
190
+ assert isinstance(inp, list), f"Expected list input, got {type(inp).__name__}: {inp}"
191
+ assert [x["message"]["content"] for x in inp] == expected_contents
192
+
193
+
194
class CustomAsyncIterable:
    """Custom AsyncIterable class (not a generator) for testing."""

    def __init__(self, messages: list[dict]):
        self._messages = messages

    def __aiter__(self):
        # Each iteration gets its own fresh iterator over the same messages.
        return CustomAsyncIterator(self._messages)


class CustomAsyncIterator:
    """Iterator for CustomAsyncIterable."""

    def __init__(self, messages: list[dict]):
        self._messages = messages
        self._index = 0

    async def __anext__(self):
        try:
            msg = self._messages[self._index]
        except IndexError:
            # Exhausted: signal the end of async iteration.
            raise StopAsyncIteration from None
        self._index += 1
        return msg
217
+
218
+
219
@pytest.mark.skipif(not CLAUDE_SDK_AVAILABLE, reason="Claude Agent SDK not installed")
@pytest.mark.asyncio
@pytest.mark.parametrize(
    "input_factory,expected_contents",
    [
        # Factories are lambdas so every parametrized run builds a fresh,
        # unconsumed async iterable.
        pytest.param(
            lambda: (msg async for msg in _single_message_generator()),
            ["What is 2 + 2?"],
            id="asyncgen_single",
        ),
        pytest.param(
            lambda: (msg async for msg in _multi_message_generator()),
            ["Part 1", "Part 2"],
            id="asyncgen_multi",
        ),
        pytest.param(
            lambda: CustomAsyncIterable([_make_message("Custom 1"), _make_message("Custom 2")]),
            ["Custom 1", "Custom 2"],
            id="custom_async_iterable",
        ),
    ],
)
async def test_query_async_iterable(memory_logger, input_factory, expected_contents):
    """Test that async iterable inputs are captured as structured lists.

    Verifies that passing AsyncIterable[dict] to query() results in the span
    input showing the structured message list, not a flattened string or repr.
    """
    # Sanity check: no spans buffered from earlier tests.
    assert not memory_logger.pop()

    # Monkeypatch the SDK client with the tracing wrapper; restored in finally.
    original_client = claude_agent_sdk.ClaudeSDKClient
    claude_agent_sdk.ClaudeSDKClient = _create_client_wrapper_class(original_client)

    try:
        options = claude_agent_sdk.ClaudeAgentOptions(model=TEST_MODEL)

        async with claude_agent_sdk.ClaudeSDKClient(options=options) as client:
            await client.query(input_factory())
            # Drain the response stream; ResultMessage marks the end.
            async for message in client.receive_response():
                if type(message).__name__ == "ResultMessage":
                    break

        spans = memory_logger.pop()

        task_spans = [s for s in spans if s["span_attributes"]["type"] == SpanTypeAttribute.TASK]
        assert len(task_spans) >= 1, f"Should have at least one task span, got {len(task_spans)}"

        # Prefer the span named "Claude Agent"; fall back to the first task span.
        task_span = next(
            (s for s in task_spans if s["span_attributes"]["name"] == "Claude Agent"),
            task_spans[0],
        )

        _assert_structured_input(task_span, expected_contents)

    finally:
        claude_agent_sdk.ClaudeSDKClient = original_client
275
+
276
+
277
async def _single_message_generator():
    """Generator yielding a single message."""
    # Streaming-format user message (the _make_message helper inlined).
    yield {"type": "user", "message": {"role": "user", "content": "What is 2 + 2?"}}
280
+
281
+
282
async def _multi_message_generator():
    """Generator yielding multiple messages."""
    # Streaming-format user messages (the _make_message helper inlined).
    for content in ("Part 1", "Part 2"):
        yield {"type": "user", "message": {"role": "user", "content": content}}