braintrust 0.4.3__tar.gz → 0.5.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {braintrust-0.4.3 → braintrust-0.5.2}/PKG-INFO +1 -1
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/__init__.py +3 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/_generated_types.py +106 -6
- braintrust-0.5.2/src/braintrust/auto.py +179 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/conftest.py +23 -4
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/framework.py +113 -3
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/functions/invoke.py +3 -1
- braintrust-0.5.2/src/braintrust/functions/test_invoke.py +61 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/generated_types.py +7 -1
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/logger.py +127 -45
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/oai.py +51 -0
- braintrust-0.5.2/src/braintrust/span_cache.py +337 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/span_identifier_v3.py +21 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_bt_json.py +0 -5
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_framework.py +37 -0
- braintrust-0.5.2/src/braintrust/test_http.py +444 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_logger.py +295 -5
- braintrust-0.5.2/src/braintrust/test_span_cache.py +344 -0
- braintrust-0.5.2/src/braintrust/test_trace.py +267 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_util.py +58 -1
- braintrust-0.5.2/src/braintrust/trace.py +385 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/util.py +20 -0
- braintrust-0.5.2/src/braintrust/version.py +4 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/agno/__init__.py +2 -3
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/anthropic.py +64 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/claude_agent_sdk/__init__.py +2 -3
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py +48 -6
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py +115 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/dspy.py +52 -1
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/google_genai/__init__.py +9 -6
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/litellm.py +6 -43
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/pydantic_ai.py +2 -3
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_agno.py +9 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_anthropic.py +156 -0
- braintrust-0.5.2/src/braintrust/wrappers/test_dspy.py +177 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_google_genai.py +9 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_litellm.py +57 -55
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_openai.py +253 -1
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_pydantic_ai_integration.py +9 -0
- braintrust-0.5.2/src/braintrust/wrappers/test_utils.py +91 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust.egg-info/PKG-INFO +1 -1
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust.egg-info/SOURCES.txt +7 -0
- braintrust-0.4.3/src/braintrust/version.py +0 -4
- braintrust-0.4.3/src/braintrust/wrappers/test_dspy.py +0 -60
- braintrust-0.4.3/src/braintrust/wrappers/test_utils.py +0 -12
- {braintrust-0.4.3 → braintrust-0.5.2}/README.md +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/setup.cfg +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/setup.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/audit.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/aws.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/bt_json.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/__main__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/eval.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/install/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/install/api.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/install/bump_versions.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/install/logs.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/install/redshift.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/install/run_migrations.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/push.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/context.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/contrib/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/contrib/temporal/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/contrib/temporal/test_temporal.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/db_fields.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/auth.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/cache.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/cors.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/dataset.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/eval_hooks.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/schemas.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/server.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/test_cached_login.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/test_lru_cache.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/test_server_integration.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/framework2.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/functions/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/functions/constants.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/functions/stream.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/git_fields.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/gitutil.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/graph_util.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/http_headers.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/id_gen.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/merge_row_batch.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/object.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/otel/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/otel/context.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/otel/test_distributed_tracing.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/otel/test_otel_bt_integration.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/parameters.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/prompt.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/prompt_cache/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/prompt_cache/disk_cache.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/prompt_cache/lru_cache.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/prompt_cache/prompt_cache.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/prompt_cache/test_disk_cache.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/prompt_cache/test_lru_cache.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/prompt_cache/test_prompt_cache.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/py.typed +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/queue.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/resource_manager.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/score.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/serializable_data_class.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/span_identifier_v1.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/span_identifier_v2.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/span_identifier_v4.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/span_types.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_framework2.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_helpers.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_id_gen.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_otel.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_queue.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_score.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_serializable_data_class.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_span_components.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_version.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/_anthropic_utils.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/agno/agent.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/agno/function_call.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/agno/model.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/agno/team.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/agno/utils.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/langchain.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/langsmith_wrapper.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/openai.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_langsmith_wrapper.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_oai_attachments.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_openrouter.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_pydantic_ai_wrap_openai.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/xact_ids.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust.egg-info/dependency_links.txt +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust.egg-info/entry_points.txt +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust.egg-info/requires.txt +0 -0
- {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust.egg-info/top_level.txt +0 -0
{braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/__init__.py
@@ -50,6 +50,9 @@ BRAINTRUST_API_KEY=<YOUR_BRAINTRUST_API_KEY> braintrust eval eval_hello.py
 """
 
 from .audit import *
+from .auto import (
+    auto_instrument,  # noqa: F401 # type: ignore[reportUnusedImport]
+)
 from .framework import *
 from .framework2 import *
 from .functions.invoke import *
{braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/_generated_types.py
@@ -144,6 +144,11 @@ class AsyncScoringControlAsyncScoringControl5(TypedDict):
     triggered_xact_id: str
 
 
+class AsyncScoringControlAsyncScoringControl6(TypedDict):
+    kind: Literal['mark_attempt_failed']
+    function_ids: Sequence[Any]
+
+
 class AsyncScoringStateAsyncScoringState(TypedDict):
     status: Literal['enabled']
     token: str
@@ -484,6 +489,10 @@ class Dataset(TypedDict):
     """
     User-controlled metadata about the dataset
     """
+    url_slug: str
+    """
+    URL slug for the dataset. used to construct dataset URLs
+    """
 
 
 class DatasetEventMetadata(TypedDict):
@@ -532,6 +541,43 @@ class EnvVar(TypedDict):
     """
 
 
+class EvalStatusPageConfig(TypedDict):
+    score_columns: NotRequired[Sequence[str] | None]
+    """
+    The score columns to display on the page
+    """
+    metric_columns: NotRequired[Sequence[str] | None]
+    """
+    The metric columns to display on the page
+    """
+    grouping_field: NotRequired[str | None]
+    """
+    The metadata field to use for grouping experiments (model)
+    """
+    filter: NotRequired[str | None]
+    """
+    BTQL filter to apply to experiment data
+    """
+    sort_by: NotRequired[str | None]
+    """
+    Field to sort results by (format: 'score:<name>' or 'metric:<name>')
+    """
+    sort_order: NotRequired[Literal['asc', 'desc'] | None]
+    """
+    Sort order (ascending or descending)
+    """
+    api_key: NotRequired[str | None]
+    """
+    The API key used for fetching experiment data
+    """
+
+
+EvalStatusPageTheme: TypeAlias = Literal['light', 'dark']
+"""
+The theme for the page
+"""
+
+
 class ExperimentEventMetadata(TypedDict):
     model: NotRequired[str | None]
     """
@@ -749,20 +795,24 @@ FunctionIdRef: TypeAlias = Mapping[str, Any]
 
 
 FunctionObjectType: TypeAlias = Literal[
-    'prompt', 'tool', 'scorer', 'task', 'custom_view', 'preprocessor', 'facet'
+    'prompt', 'tool', 'scorer', 'task', 'workflow', 'custom_view', 'preprocessor', 'facet', 'classifier'
 ]
 
 
-FunctionOutputType: TypeAlias = Literal['completion', 'score', 'any']
+FunctionOutputType: TypeAlias = Literal['completion', 'score', 'facet', 'classification', 'any']
 
 
-FunctionTypeEnum: TypeAlias = Literal[
+FunctionTypeEnum: TypeAlias = Literal[
+    'llm', 'scorer', 'task', 'tool', 'custom_view', 'preprocessor', 'facet', 'classifier'
+]
 """
 The type of global function. Defaults to 'scorer'.
 """
 
 
-FunctionTypeEnumNullish: TypeAlias = Literal[
+FunctionTypeEnumNullish: TypeAlias = Literal[
+    'llm', 'scorer', 'task', 'tool', 'custom_view', 'preprocessor', 'facet', 'classifier'
+]
 
 
 class GitMetadataSettings(TypedDict):
@@ -1674,7 +1724,18 @@ class PromptDataNullishOrigin(TypedDict):
 class PromptParserNullish(TypedDict):
     type: Literal['llm_classifier']
     use_cot: bool
-    choice_scores: Mapping[str, float]
+    choice_scores: NotRequired[Mapping[str, float] | None]
+    """
+    Map of choices to scores (0-1). Used by scorers.
+    """
+    choice: NotRequired[Sequence[str] | None]
+    """
+    List of valid choices without score mapping. Used by classifiers that deposit output to tags.
+    """
+    allow_no_match: NotRequired[bool | None]
+    """
+    If true, adds a 'No match' option. When selected, no tag is deposited.
+    """
 
 
 class PromptSessionEvent(TypedDict):
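The widened parser shape distinguishes scorer-style parsers (`choice_scores`) from tag-depositing classifier parsers (`choice`). A minimal sketch grounded in the field docstrings above, with made-up choice values, showing one payload of each kind:

```python
# Hypothetical parser payloads illustrating the two PromptParserNullish variants.
scorer_parser = {
    "type": "llm_classifier",
    "use_cot": True,
    # Scorer variant: map each choice to a 0-1 score.
    "choice_scores": {"good": 1.0, "bad": 0.0},
}

classifier_parser = {
    "type": "llm_classifier",
    "use_cot": False,
    # Classifier variant: plain choices deposited as tags, no score mapping.
    "choice": ["billing", "support", "other"],
    "allow_no_match": True,  # adds a 'No match' option that deposits no tag
}
```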
@@ -2104,7 +2165,7 @@ class SpanScope(TypedDict):
 
 
 SpanType: TypeAlias = Literal[
-    'llm', 'score', 'function', 'eval', 'task', 'tool', 'automation', 'facet', 'preprocessor'
+    'llm', 'score', 'function', 'eval', 'task', 'tool', 'automation', 'facet', 'preprocessor', 'classifier'
 ]
 """
 Type of the span, for display purposes only
@@ -2384,6 +2445,7 @@ AsyncScoringControl: TypeAlias = (
     | AsyncScoringControlAsyncScoringControl3
     | AsyncScoringControlAsyncScoringControl4
     | AsyncScoringControlAsyncScoringControl5
+    | AsyncScoringControlAsyncScoringControl6
 )
 
 
@@ -2530,6 +2592,43 @@ class DatasetEvent(TypedDict):
     """
 
 
+class EvalStatusPage(TypedDict):
+    id: str
+    """
+    Unique identifier for the eval status page
+    """
+    project_id: str
+    """
+    Unique identifier for the project that the eval status page belongs under
+    """
+    user_id: NotRequired[str | None]
+    """
+    Identifies the user who created the eval status page
+    """
+    created: NotRequired[str | None]
+    """
+    Date of eval status page creation
+    """
+    deleted_at: NotRequired[str | None]
+    """
+    Date of eval status page deletion, or null if the eval status page is still active
+    """
+    name: str
+    """
+    Name of the eval status page
+    """
+    description: NotRequired[str | None]
+    """
+    Textual description of the eval status page
+    """
+    logo_url: NotRequired[str | None]
+    """
+    URL of the logo to display on the page
+    """
+    theme: EvalStatusPageTheme
+    config: EvalStatusPageConfig
+
+
 class Experiment(TypedDict):
     id: str
     """
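Taken together with the `EvalStatusPageConfig` and `EvalStatusPageTheme` definitions added earlier in this file, the new `EvalStatusPage` shape composes like any other generated TypedDict. A sketch with illustrative IDs and column names (all values are made up):

```python
from braintrust._generated_types import EvalStatusPage, EvalStatusPageConfig

config: EvalStatusPageConfig = {
    "score_columns": ["Factuality"],
    "metric_columns": ["duration"],
    "grouping_field": "model",      # metadata field used to group experiments
    "sort_by": "score:Factuality",  # format: 'score:<name>' or 'metric:<name>'
    "sort_order": "desc",
}

page: EvalStatusPage = {
    "id": "00000000-0000-0000-0000-000000000000",          # made-up ID
    "project_id": "00000000-0000-0000-0000-000000000001",  # made-up ID
    "name": "Nightly evals",
    "theme": "light",  # EvalStatusPageTheme: 'light' or 'dark'
    "config": config,
}
```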
@@ -3228,6 +3327,7 @@ class View(TypedDict):
         'prompts',
         'tools',
         'scorers',
+        'classifiers',
         'logs',
         'monitor',
         'for_review',
braintrust-0.5.2/src/braintrust/auto.py
@@ -0,0 +1,179 @@
+"""
+Auto-instrumentation for AI/ML libraries.
+
+Provides one-line instrumentation for supported libraries.
+"""
+
+from __future__ import annotations
+
+import logging
+from contextlib import contextmanager
+
+__all__ = ["auto_instrument"]
+
+logger = logging.getLogger(__name__)
+
+
+@contextmanager
+def _try_patch():
+    """Context manager that suppresses ImportError and logs other exceptions."""
+    try:
+        yield
+    except ImportError:
+        pass
+    except Exception:
+        logger.exception("Failed to instrument")
+
+
+def auto_instrument(
+    *,
+    openai: bool = True,
+    anthropic: bool = True,
+    litellm: bool = True,
+    pydantic_ai: bool = True,
+    google_genai: bool = True,
+    agno: bool = True,
+    claude_agent_sdk: bool = True,
+    dspy: bool = True,
+) -> dict[str, bool]:
+    """
+    Auto-instrument supported AI/ML libraries for Braintrust tracing.
+
+    Safe to call multiple times - already instrumented libraries are skipped.
+
+    Note on import order: If you use `from openai import OpenAI` style imports,
+    call auto_instrument() first. If you use `import openai` style imports,
+    order doesn't matter since attribute lookup happens dynamically.
+
+    Args:
+        openai: Enable OpenAI instrumentation (default: True)
+        anthropic: Enable Anthropic instrumentation (default: True)
+        litellm: Enable LiteLLM instrumentation (default: True)
+        pydantic_ai: Enable Pydantic AI instrumentation (default: True)
+        google_genai: Enable Google GenAI instrumentation (default: True)
+        agno: Enable Agno instrumentation (default: True)
+        claude_agent_sdk: Enable Claude Agent SDK instrumentation (default: True)
+        dspy: Enable DSPy instrumentation (default: True)
+
+    Returns:
+        Dict mapping integration name to whether it was successfully instrumented.
+
+    Example:
+        ```python
+        import braintrust
+        braintrust.auto_instrument()
+
+        # OpenAI
+        import openai
+        client = openai.OpenAI()
+        client.chat.completions.create(model="gpt-4o-mini", messages=[...])
+
+        # Anthropic
+        import anthropic
+        client = anthropic.Anthropic()
+        client.messages.create(model="claude-sonnet-4-20250514", messages=[...])
+
+        # LiteLLM
+        import litellm
+        litellm.completion(model="gpt-4o-mini", messages=[...])
+
+        # DSPy
+        import dspy
+        lm = dspy.LM("openai/gpt-4o-mini")
+        dspy.configure(lm=lm)
+
+        # Pydantic AI
+        from pydantic_ai import Agent
+        agent = Agent("openai:gpt-4o-mini")
+        result = agent.run_sync("Hello!")
+
+        # Google GenAI
+        from google.genai import Client
+        client = Client()
+        client.models.generate_content(model="gemini-2.0-flash", contents="Hello!")
+        ```
+    """
+    results = {}
+
+    if openai:
+        results["openai"] = _instrument_openai()
+    if anthropic:
+        results["anthropic"] = _instrument_anthropic()
+    if litellm:
+        results["litellm"] = _instrument_litellm()
+    if pydantic_ai:
+        results["pydantic_ai"] = _instrument_pydantic_ai()
+    if google_genai:
+        results["google_genai"] = _instrument_google_genai()
+    if agno:
+        results["agno"] = _instrument_agno()
+    if claude_agent_sdk:
+        results["claude_agent_sdk"] = _instrument_claude_agent_sdk()
+    if dspy:
+        results["dspy"] = _instrument_dspy()
+
+    return results
+
+
+def _instrument_openai() -> bool:
+    with _try_patch():
+        from braintrust.oai import patch_openai
+
+        return patch_openai()
+    return False
+
+
+def _instrument_anthropic() -> bool:
+    with _try_patch():
+        from braintrust.wrappers.anthropic import patch_anthropic
+
+        return patch_anthropic()
+    return False
+
+
+def _instrument_litellm() -> bool:
+    with _try_patch():
+        from braintrust.wrappers.litellm import patch_litellm
+
+        return patch_litellm()
+    return False
+
+
+def _instrument_pydantic_ai() -> bool:
+    with _try_patch():
+        from braintrust.wrappers.pydantic_ai import setup_pydantic_ai
+
+        return setup_pydantic_ai()
+    return False
+
+
+def _instrument_google_genai() -> bool:
+    with _try_patch():
+        from braintrust.wrappers.google_genai import setup_genai
+
+        return setup_genai()
+    return False
+
+
+def _instrument_agno() -> bool:
+    with _try_patch():
+        from braintrust.wrappers.agno import setup_agno
+
+        return setup_agno()
+    return False
+
+
+def _instrument_claude_agent_sdk() -> bool:
+    with _try_patch():
+        from braintrust.wrappers.claude_agent_sdk import setup_claude_agent_sdk
+
+        return setup_claude_agent_sdk()
+    return False
+
+
+def _instrument_dspy() -> bool:
+    with _try_patch():
+        from braintrust.wrappers.dspy import patch_dspy
+
+        return patch_dspy()
+    return False
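Since every flag in `auto_instrument` defaults to `True`, callers can opt out of individual integrations and inspect the returned dict to see what actually got patched. A sketch based directly on the signature above:

```python
import braintrust

# Instrument only the OpenAI and Anthropic SDKs; skip the rest.
results = braintrust.auto_instrument(
    litellm=False,
    pydantic_ai=False,
    google_genai=False,
    agno=False,
    claude_agent_sdk=False,
    dspy=False,
)

# Each value is False when the library isn't installed or patching failed:
# _try_patch swallows ImportError and logs any other exception.
for name, ok in results.items():
    print(f"{name}: {'instrumented' if ok else 'skipped'}")
```

Note the control flow in the `_instrument_*` helpers: a successful patch returns from inside the `with _try_patch():` block, so the trailing `return False` is reached only when `_try_patch` suppressed an exception.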
{braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/conftest.py
@@ -48,16 +48,29 @@ def reset_braintrust_state():
     logger._state = logger.BraintrustState()
 
 
-@pytest.fixture(
-def
+@pytest.fixture(autouse=True)
+def skip_vcr_tests_in_wheel_mode(request):
+    """Skip VCR tests when running from an installed wheel.
+
+    Wheel mode (BRAINTRUST_TESTING_WHEEL=1) is a pre-release sanity check
+    that verifies the built package installs and runs correctly. It's not
+    intended to be a full test suite - VCR cassettes are not included in
+    the wheel, so we skip those tests here. The full test suite with VCR
+    tests runs against source code during normal CI.
+    """
+    if os.environ.get("BRAINTRUST_TESTING_WHEEL") == "1":
+        if request.node.get_closest_marker("vcr"):
+            pytest.skip("VCR tests skipped in wheel mode (pre-release sanity check only)")
+
+
+def get_vcr_config():
     """
-    VCR configuration for recording/playing back HTTP interactions.
+    Get VCR configuration for recording/playing back HTTP interactions.
 
     In CI, use "none" to fail if cassette is missing.
     Locally, use "once" to record new cassettes if they don't exist.
     """
     record_mode = "none" if (os.environ.get("CI") or os.environ.get("GITHUB_ACTIONS")) else "once"
-
     return {
         "record_mode": record_mode,
         "filter_headers": [
@@ -70,3 +83,9 @@ def vcr_config():
             "x-bt-auth-token",
         ],
     }
+
+
+@pytest.fixture(scope="session")
+def vcr_config():
+    """Pytest fixture wrapper for get_vcr_config()."""
+    return get_vcr_config()
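Grounded in the autouse fixture above: any test carrying the `vcr` marker (matched via `request.node.get_closest_marker("vcr")`) is skipped when `BRAINTRUST_TESTING_WHEEL=1`. A sketch; the test body and invocation commands are illustrative, not from the package:

```python
import pytest

@pytest.mark.vcr  # marker checked by skip_vcr_tests_in_wheel_mode
def test_chat_completion_roundtrip():
    ...  # hypothetical test that replays a recorded cassette

# Normal CI run (cassettes present; record_mode="none" fails on missing ones):
#   pytest src/braintrust
# Wheel-mode sanity check (VCR-marked tests auto-skipped):
#   BRAINTRUST_TESTING_WHEEL=1 pytest
```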
{braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/framework.py
@@ -673,6 +673,7 @@ def _EvalCommon(
     stream: Callable[[SSEProgressEvent], None] | None = None,
     parent: str | None = None,
     state: BraintrustState | None = None,
+    enable_cache: bool = True,
 ) -> Callable[[], Coroutine[Any, Any, EvalResultWithSummary[Input, Output]]]:
     """
     This helper is needed because in case of `_lazy_load`, we need to update
@@ -759,7 +760,7 @@ def _EvalCommon(
     async def run_to_completion():
         with parent_context(parent, state):
             try:
-                ret = await run_evaluator(experiment, evaluator, 0, [], stream, state)
+                ret = await run_evaluator(experiment, evaluator, 0, [], stream, state, enable_cache)
                 reporter.report_eval(evaluator, ret, verbose=True, jsonl=False)
                 return ret
             finally:
@@ -798,6 +799,7 @@ async def EvalAsync(
     stream: Callable[[SSEProgressEvent], None] | None = None,
     parent: str | None = None,
     state: BraintrustState | None = None,
+    enable_cache: bool = True,
 ) -> EvalResultWithSummary[Input, Output]:
     """
     A function you can use to define an evaluator. This is a convenience wrapper around the `Evaluator` class.
@@ -855,6 +857,8 @@ async def EvalAsync(
     :param parent: If specified, instead of creating a new experiment object, the Eval() will populate
     the object or span specified by this parent.
     :param state: Optional BraintrustState to use for the evaluation. If not specified, the global login state will be used.
+    :param enable_cache: Whether to enable the span cache for this evaluation. Defaults to True. The span cache stores
+    span data on disk to minimize memory usage and allow scorers to read spans without server round-trips.
     :return: An `EvalResultWithSummary` object, which contains all results and a summary.
     """
     f = _EvalCommon(
@@ -883,6 +887,7 @@ async def EvalAsync(
         stream=stream,
         parent=parent,
         state=state,
+        enable_cache=enable_cache,
     )
 
     return await f()
@@ -918,6 +923,7 @@ def Eval(
     stream: Callable[[SSEProgressEvent], None] | None = None,
     parent: str | None = None,
     state: BraintrustState | None = None,
+    enable_cache: bool = True,
 ) -> EvalResultWithSummary[Input, Output]:
     """
     A function you can use to define an evaluator. This is a convenience wrapper around the `Evaluator` class.
@@ -975,6 +981,8 @@ def Eval(
     :param parent: If specified, instead of creating a new experiment object, the Eval() will populate
     the object or span specified by this parent.
     :param state: Optional BraintrustState to use for the evaluation. If not specified, the global login state will be used.
+    :param enable_cache: Whether to enable the span cache for this evaluation. Defaults to True. The span cache stores
+    span data on disk to minimize memory usage and allow scorers to read spans without server round-trips.
     :return: An `EvalResultWithSummary` object, which contains all results and a summary.
     """
 
@@ -1005,6 +1013,7 @@ def Eval(
         stream=stream,
         parent=parent,
         state=state,
+        enable_cache=enable_cache,
     )
 
     # https://stackoverflow.com/questions/55409641/asyncio-run-cannot-be-called-from-a-running-event-loop-when-using-jupyter-no
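The new `enable_cache` knob threads from `Eval`/`EvalAsync` down through `run_evaluator`. A sketch of opting out (useful when on-disk temp files are undesirable); the project name, dataset, task, and scorer are placeholders, and the scorer signature assumes keyword-style invocation with the kwargs shown elsewhere in this diff (`input`, `expected`, `output`, etc.):

```python
from braintrust import Eval

def exact_match(input, expected, output, **kwargs):
    # Extra kwargs (metadata, trace, ...) are accepted and ignored.
    return 1.0 if output == expected else 0.0

result = Eval(
    "my-project",  # placeholder project name
    data=lambda: [{"input": 1, "expected": 2}],
    task=lambda input: input + 1,
    scores=[exact_match],
    enable_cache=False,  # skip the disk-backed span cache for this eval
)
```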
@@ -1249,10 +1258,11 @@ async def run_evaluator(
     filters: list[Filter],
     stream: Callable[[SSEProgressEvent], None] | None = None,
     state: BraintrustState | None = None,
+    enable_cache: bool = True,
 ) -> EvalResultWithSummary[Input, Output]:
     """Wrapper on _run_evaluator_internal that times out execution after evaluator.timeout."""
     results = await asyncio.wait_for(
-        _run_evaluator_internal(experiment, evaluator, position, filters, stream, state), evaluator.timeout
+        _run_evaluator_internal(experiment, evaluator, position, filters, stream, state, enable_cache), evaluator.timeout
     )
 
     if experiment:
@@ -1280,6 +1290,32 @@ async def _run_evaluator_internal(
     filters: list[Filter],
     stream: Callable[[SSEProgressEvent], None] | None = None,
     state: BraintrustState | None = None,
+    enable_cache: bool = True,
+):
+    # Start span cache for this eval (it's disabled by default to avoid temp files outside of evals)
+    if state is None:
+        from braintrust.logger import _internal_get_global_state
+
+        state = _internal_get_global_state()
+
+    if enable_cache:
+        state.span_cache.start()
+    try:
+        return await _run_evaluator_internal_impl(experiment, evaluator, position, filters, stream, state)
+    finally:
+        # Clean up disk-based span cache after eval completes and stop caching
+        if enable_cache:
+            state.span_cache.dispose()
+            state.span_cache.stop()
+
+
+async def _run_evaluator_internal_impl(
+    experiment,
+    evaluator: Evaluator,
+    position: int | None,
+    filters: list[Filter],
+    stream: Callable[[SSEProgressEvent], None] | None = None,
+    state: BraintrustState | None = None,
 ):
     event_loop = asyncio.get_event_loop()
 
@@ -1290,11 +1326,13 @@ async def _run_evaluator_internal(
             {**parent_propagated},
             {"span_attributes": {"purpose": "scorer"}},
         )
+        # Strip trace from logged input - it's internal plumbing that shouldn't appear in spans
+        logged_input = {k: v for k, v in kwargs.items() if k != "trace"}
         with root_span.start_span(
             name=name,
             span_attributes={"type": SpanTypeAttribute.SCORE, "purpose": "scorer"},
             propagated_event=merged_propagated,
-            input=
+            input=logged_input,
         ) as span:
             score = scorer
             if hasattr(scorer, "eval_async"):
@@ -1415,6 +1453,77 @@ async def _run_evaluator_internal(
         tags = hooks.tags if hooks.tags else None
         root_span.log(output=output, metadata=metadata, tags=tags)
 
+        # Create trace object for scorers
+        from braintrust.trace import LocalTrace
+
+        async def ensure_spans_flushed():
+            # Flush native Braintrust spans
+            if experiment:
+                await asyncio.get_event_loop().run_in_executor(
+                    None, lambda: experiment.state.flush()
+                )
+            elif state:
+                await asyncio.get_event_loop().run_in_executor(None, lambda: state.flush())
+            else:
+                from braintrust.logger import flush as flush_logger
+
+                await asyncio.get_event_loop().run_in_executor(None, flush_logger)
+
+            # Also flush OTEL spans if registered
+            if state:
+                await state.flush_otel()
+
+        experiment_id = None
+        if experiment:
+            try:
+                experiment_id = experiment.id
+            except:
+                experiment_id = None
+
+        trace = None
+        if state or experiment:
+            # Get the state to use
+            trace_state = state
+            if not trace_state and experiment:
+                trace_state = experiment.state
+            if not trace_state:
+                # Fall back to global state
+                from braintrust.logger import _internal_get_global_state
+
+                trace_state = _internal_get_global_state()
+
+            # Access root_span_id from the concrete SpanImpl instance
+            # The Span interface doesn't expose this but SpanImpl has it
+            root_span_id_value = getattr(root_span, "root_span_id", root_span.id)
+
+            # Check if there's a parent in the context to determine object_type and object_id
+            from braintrust.span_identifier_v3 import SpanComponentsV3, span_object_type_v3_to_typed_string
+
+            parent_str = trace_state.current_parent.get()
+            parent_components = None
+            if parent_str:
+                try:
+                    parent_components = SpanComponentsV3.from_str(parent_str)
+                except Exception:
+                    # If parsing fails, parent_components stays None
+                    pass
+
+            # Determine object_type and object_id based on parent or experiment
+            if parent_components:
+                trace_object_type = span_object_type_v3_to_typed_string(parent_components.object_type)
+                trace_object_id = parent_components.object_id or ""
+            else:
+                trace_object_type = "experiment"
+                trace_object_id = experiment_id or ""
+
+            trace = LocalTrace(
+                object_type=trace_object_type,
+                object_id=trace_object_id,
+                root_span_id=root_span_id_value,
+                ensure_spans_flushed=ensure_spans_flushed,
+                state=trace_state,
+            )
+
         score_promises = [
             asyncio.create_task(
                 await_or_run_scorer(
@@ -1426,6 +1535,7 @@ async def _run_evaluator_internal(
                         "expected": datum.expected,
                         "metadata": metadata,
                         "output": output,
+                        "trace": trace,
                     },
                 )
             )
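Because the scorer kwargs now include `trace` (while `logged_input` strips it from the span's logged input), a scorer can opt in to receiving the `LocalTrace`. A sketch, assuming keyword-based scorer invocation as in the hunk above; the span-reading behavior itself lives in the new `trace.py`/`span_cache.py` modules:

```python
def trace_aware_scorer(input, expected, output, trace=None, **kwargs):
    # `trace` is the LocalTrace constructed above (it carries object_type,
    # object_id, root_span_id, and an ensure_spans_flushed callback), or None
    # when neither a state nor an experiment is available.
    if trace is not None:
        pass  # e.g. read the eval's spans locally instead of via the server
    return 1.0 if output == expected else 0.0
```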
{braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/functions/invoke.py
@@ -3,7 +3,7 @@ from typing import Any, Literal, TypedDict, TypeVar, overload
 from sseclient import SSEClient
 
 from .._generated_types import FunctionTypeEnum
-from ..logger import Exportable, get_span_parent_object, login, proxy_conn
+from ..logger import Exportable, _internal_get_global_state, get_span_parent_object, login, proxy_conn
 from ..util import response_raise_for_status
 from .constants import INVOKE_API_VERSION
 from .stream import BraintrustInvokeError, BraintrustStream
@@ -243,6 +243,8 @@ def init_function(project_name: str, slug: str, version: str | None = None):
     :param version: Optional version of the function to use. Defaults to latest.
     :return: A function that can be used as a task or scorer.
     """
+    # Disable span cache since remote function spans won't be in the local cache
+    _internal_get_global_state().span_cache.disable()
 
     def f(*args: Any, **kwargs: Any) -> Any:
         if len(args) > 0:
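For context, `init_function(project_name, slug, version=None)` returns a callable proxy for a remote Braintrust function, and the new first statement disables the span cache because those remote spans never land in the local cache. A sketch of using it as a scorer, per the docstring above; the project name and slug are placeholders, and top-level availability assumes the `from .functions.invoke import *` re-export in `__init__.py`:

```python
from braintrust import Eval, init_function

# Callable proxy for a remote function; usable as a task or scorer.
remote_scorer = init_function(project_name="my-project", slug="exact-match")  # placeholder names

Eval(
    "my-project",
    data=lambda: [{"input": "hi", "expected": "hi"}],
    task=lambda input: input,
    scores=[remote_scorer],
)
```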