PyPI - braintrust - Versions diffs - 0.5.0__tar.gz → 0.5.2__tar.gz - Mend

braintrust 0.5.0__tar.gz → 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (138) hide show

{braintrust-0.5.0 → braintrust-0.5.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: braintrust
-Version: 0.5.0
+Version: 0.5.2
 Summary: SDK for integrating Braintrust
 Home-page: https://www.braintrust.dev
 Author: Braintrust

{braintrust-0.5.0 → braintrust-0.5.2}/src/braintrust/__init__.py RENAMED Viewed

@@ -50,6 +50,9 @@ BRAINTRUST_API_KEY=<YOUR_BRAINTRUST_API_KEY> braintrust eval eval_hello.py
 """
 from .audit import *
+from .auto import (
+    auto_instrument,  # noqa: F401 # type: ignore[reportUnusedImport]
+)
 from .framework import *
 from .framework2 import *
 from .functions.invoke import *

braintrust-0.5.2/src/braintrust/auto.py ADDED Viewed

@@ -0,0 +1,179 @@
+"""
+Auto-instrumentation for AI/ML libraries.
+Provides one-line instrumentation for supported libraries.
+"""
+from __future__ import annotations
+import logging
+from contextlib import contextmanager
+__all__ = ["auto_instrument"]
+logger = logging.getLogger(__name__)
+@contextmanager
+def _try_patch():
+    """Context manager that suppresses ImportError and logs other exceptions."""
+    try:
+        yield
+    except ImportError:
+        pass
+    except Exception:
+        logger.exception("Failed to instrument")
+def auto_instrument(
+    *,
+    openai: bool = True,
+    anthropic: bool = True,
+    litellm: bool = True,
+    pydantic_ai: bool = True,
+    google_genai: bool = True,
+    agno: bool = True,
+    claude_agent_sdk: bool = True,
+    dspy: bool = True,
+) -> dict[str, bool]:
+    """
+    Auto-instrument supported AI/ML libraries for Braintrust tracing.
+    Safe to call multiple times - already instrumented libraries are skipped.
+    Note on import order: If you use `from openai import OpenAI` style imports,
+    call auto_instrument() first. If you use `import openai` style imports,
+    order doesn't matter since attribute lookup happens dynamically.
+    Args:
+        openai: Enable OpenAI instrumentation (default: True)
+        anthropic: Enable Anthropic instrumentation (default: True)
+        litellm: Enable LiteLLM instrumentation (default: True)
+        pydantic_ai: Enable Pydantic AI instrumentation (default: True)
+        google_genai: Enable Google GenAI instrumentation (default: True)
+        agno: Enable Agno instrumentation (default: True)
+        claude_agent_sdk: Enable Claude Agent SDK instrumentation (default: True)
+        dspy: Enable DSPy instrumentation (default: True)
+    Returns:
+        Dict mapping integration name to whether it was successfully instrumented.
+    Example:
+        ```python
+        import braintrust
+        braintrust.auto_instrument()
+        # OpenAI
+        import openai
+        client = openai.OpenAI()
+        client.chat.completions.create(model="gpt-4o-mini", messages=[...])
+        # Anthropic
+        import anthropic
+        client = anthropic.Anthropic()
+        client.messages.create(model="claude-sonnet-4-20250514", messages=[...])
+        # LiteLLM
+        import litellm
+        litellm.completion(model="gpt-4o-mini", messages=[...])
+        # DSPy
+        import dspy
+        lm = dspy.LM("openai/gpt-4o-mini")
+        dspy.configure(lm=lm)
+        # Pydantic AI
+        from pydantic_ai import Agent
+        agent = Agent("openai:gpt-4o-mini")
+        result = agent.run_sync("Hello!")
+        # Google GenAI
+        from google.genai import Client
+        client = Client()
+        client.models.generate_content(model="gemini-2.0-flash", contents="Hello!")
+        ```
+    """
+    results = {}
+    if openai:
+        results["openai"] = _instrument_openai()
+    if anthropic:
+        results["anthropic"] = _instrument_anthropic()
+    if litellm:
+        results["litellm"] = _instrument_litellm()
+    if pydantic_ai:
+        results["pydantic_ai"] = _instrument_pydantic_ai()
+    if google_genai:
+        results["google_genai"] = _instrument_google_genai()
+    if agno:
+        results["agno"] = _instrument_agno()
+    if claude_agent_sdk:
+        results["claude_agent_sdk"] = _instrument_claude_agent_sdk()
+    if dspy:
+        results["dspy"] = _instrument_dspy()
+    return results
+def _instrument_openai() -> bool:
+    with _try_patch():
+        from braintrust.oai import patch_openai
+        return patch_openai()
+    return False
+def _instrument_anthropic() -> bool:
+    with _try_patch():
+        from braintrust.wrappers.anthropic import patch_anthropic
+        return patch_anthropic()
+    return False
+def _instrument_litellm() -> bool:
+    with _try_patch():
+        from braintrust.wrappers.litellm import patch_litellm
+        return patch_litellm()
+    return False
+def _instrument_pydantic_ai() -> bool:
+    with _try_patch():
+        from braintrust.wrappers.pydantic_ai import setup_pydantic_ai
+        return setup_pydantic_ai()
+    return False
+def _instrument_google_genai() -> bool:
+    with _try_patch():
+        from braintrust.wrappers.google_genai import setup_genai
+        return setup_genai()
+    return False
+def _instrument_agno() -> bool:
+    with _try_patch():
+        from braintrust.wrappers.agno import setup_agno
+        return setup_agno()
+    return False
+def _instrument_claude_agent_sdk() -> bool:
+    with _try_patch():
+        from braintrust.wrappers.claude_agent_sdk import setup_claude_agent_sdk
+        return setup_claude_agent_sdk()
+    return False
+def _instrument_dspy() -> bool:
+    with _try_patch():
+        from braintrust.wrappers.dspy import patch_dspy
+        return patch_dspy()
+    return False

{braintrust-0.5.0 → braintrust-0.5.2}/src/braintrust/conftest.py RENAMED Viewed

@@ -48,16 +48,29 @@ def reset_braintrust_state():
     logger._state = logger.BraintrustState()
-@pytest.fixture(scope="session")
-def vcr_config():
+@pytest.fixture(autouse=True)
+def skip_vcr_tests_in_wheel_mode(request):
+    """Skip VCR tests when running from an installed wheel.
+    Wheel mode (BRAINTRUST_TESTING_WHEEL=1) is a pre-release sanity check
+    that verifies the built package installs and runs correctly. It's not
+    intended to be a full test suite - VCR cassettes are not included in
+    the wheel, so we skip those tests here. The full test suite with VCR
+    tests runs against source code during normal CI.
+    """
+    if os.environ.get("BRAINTRUST_TESTING_WHEEL") == "1":
+        if request.node.get_closest_marker("vcr"):
+            pytest.skip("VCR tests skipped in wheel mode (pre-release sanity check only)")
+def get_vcr_config():
     """
-    VCR configuration for recording/playing back HTTP interactions.
+    Get VCR configuration for recording/playing back HTTP interactions.
     In CI, use "none" to fail if cassette is missing.
     Locally, use "once" to record new cassettes if they don't exist.
     """
     record_mode = "none" if (os.environ.get("CI") or os.environ.get("GITHUB_ACTIONS")) else "once"
     return {
         "record_mode": record_mode,
         "filter_headers": [
@@ -70,3 +83,9 @@ def vcr_config():
             "x-bt-auth-token",
         ],
     }
+@pytest.fixture(scope="session")
+def vcr_config():
+    """Pytest fixture wrapper for get_vcr_config()."""
+    return get_vcr_config()

{braintrust-0.5.0 → braintrust-0.5.2}/src/braintrust/framework.py RENAMED Viewed

@@ -673,6 +673,7 @@ def _EvalCommon(
     stream: Callable[[SSEProgressEvent], None] | None = None,
     parent: str | None = None,
     state: BraintrustState | None = None,
+    enable_cache: bool = True,
 ) -> Callable[[], Coroutine[Any, Any, EvalResultWithSummary[Input, Output]]]:
     """
     This helper is needed because in case of `_lazy_load`, we need to update
@@ -759,7 +760,7 @@ def _EvalCommon(
         async def run_to_completion():
             with parent_context(parent, state):
                 try:
-                    ret = await run_evaluator(experiment, evaluator, 0, [], stream, state)
+                    ret = await run_evaluator(experiment, evaluator, 0, [], stream, state, enable_cache)
                     reporter.report_eval(evaluator, ret, verbose=True, jsonl=False)
                     return ret
                 finally:
@@ -798,6 +799,7 @@ async def EvalAsync(
     stream: Callable[[SSEProgressEvent], None] | None = None,
     parent: str | None = None,
     state: BraintrustState | None = None,
+    enable_cache: bool = True,
 ) -> EvalResultWithSummary[Input, Output]:
     """
     A function you can use to define an evaluator. This is a convenience wrapper around the `Evaluator` class.
@@ -855,6 +857,8 @@ async def EvalAsync(
     :param parent: If specified, instead of creating a new experiment object, the Eval() will populate
     the object or span specified by this parent.
     :param state: Optional BraintrustState to use for the evaluation. If not specified, the global login state will be used.
+    :param enable_cache: Whether to enable the span cache for this evaluation. Defaults to True. The span cache stores
+    span data on disk to minimize memory usage and allow scorers to read spans without server round-trips.
     :return: An `EvalResultWithSummary` object, which contains all results and a summary.
     """
     f = _EvalCommon(
@@ -883,6 +887,7 @@ async def EvalAsync(
         stream=stream,
         parent=parent,
         state=state,
+        enable_cache=enable_cache,
     )
     return await f()
@@ -918,6 +923,7 @@ def Eval(
     stream: Callable[[SSEProgressEvent], None] | None = None,
     parent: str | None = None,
     state: BraintrustState | None = None,
+    enable_cache: bool = True,
 ) -> EvalResultWithSummary[Input, Output]:
     """
     A function you can use to define an evaluator. This is a convenience wrapper around the `Evaluator` class.
@@ -975,6 +981,8 @@ def Eval(
     :param parent: If specified, instead of creating a new experiment object, the Eval() will populate
     the object or span specified by this parent.
     :param state: Optional BraintrustState to use for the evaluation. If not specified, the global login state will be used.
+    :param enable_cache: Whether to enable the span cache for this evaluation. Defaults to True. The span cache stores
+    span data on disk to minimize memory usage and allow scorers to read spans without server round-trips.
     :return: An `EvalResultWithSummary` object, which contains all results and a summary.
     """
@@ -1005,6 +1013,7 @@ def Eval(
         stream=stream,
         parent=parent,
         state=state,
+        enable_cache=enable_cache,
     )
     # https://stackoverflow.com/questions/55409641/asyncio-run-cannot-be-called-from-a-running-event-loop-when-using-jupyter-no
@@ -1249,10 +1258,11 @@ async def run_evaluator(
     filters: list[Filter],
     stream: Callable[[SSEProgressEvent], None] | None = None,
     state: BraintrustState | None = None,
+    enable_cache: bool = True,
 ) -> EvalResultWithSummary[Input, Output]:
     """Wrapper on _run_evaluator_internal that times out execution after evaluator.timeout."""
     results = await asyncio.wait_for(
-        _run_evaluator_internal(experiment, evaluator, position, filters, stream, state), evaluator.timeout
+        _run_evaluator_internal(experiment, evaluator, position, filters, stream, state, enable_cache), evaluator.timeout
     )
     if experiment:
@@ -1280,6 +1290,7 @@ async def _run_evaluator_internal(
     filters: list[Filter],
     stream: Callable[[SSEProgressEvent], None] | None = None,
     state: BraintrustState | None = None,
+    enable_cache: bool = True,
 ):
     # Start span cache for this eval (it's disabled by default to avoid temp files outside of evals)
     if state is None:
@@ -1287,13 +1298,15 @@ async def _run_evaluator_internal(
         state = _internal_get_global_state()
-    state.span_cache.start()
+    if enable_cache:
+        state.span_cache.start()
     try:
         return await _run_evaluator_internal_impl(experiment, evaluator, position, filters, stream, state)
     finally:
         # Clean up disk-based span cache after eval completes and stop caching
-        state.span_cache.dispose()
-        state.span_cache.stop()
+        if enable_cache:
+            state.span_cache.dispose()
+            state.span_cache.stop()
 async def _run_evaluator_internal_impl(

{braintrust-0.5.0 → braintrust-0.5.2}/src/braintrust/logger.py RENAMED Viewed

@@ -87,6 +87,7 @@ from .util import (
     get_caller_location,
     mask_api_key,
     merge_dicts,
+    parse_env_var_float,
     response_raise_for_status,
 )
@@ -349,9 +350,16 @@ class BraintrustState:
     def __init__(self):
         self.id = str(uuid.uuid4())
         self.current_experiment: Experiment | None = None
-        self.current_logger: contextvars.ContextVar[Logger | None] = contextvars.ContextVar(
+        # We use both a ContextVar and a plain attribute for the current logger:
+        # - _cv_logger (ContextVar): Provides async context isolation so different
+        #   async tasks can have different loggers without affecting each other.
+        # - _local_logger (plain attribute): Fallback for threads, since ContextVars
+        #   don't propagate to new threads. This way if users don't want to do
+        #   anything specific they'll always have a "global logger"
+        self._cv_logger: contextvars.ContextVar[Logger | None] = contextvars.ContextVar(
             "braintrust_current_logger", default=None
         )
+        self._local_logger: Logger | None = None
         self.current_parent: contextvars.ContextVar[str | None] = contextvars.ContextVar(
             "braintrust_current_parent", default=None
         )
@@ -425,7 +433,8 @@ class BraintrustState:
     def reset_parent_state(self):
         # reset possible parent state for tests
         self.current_experiment = None
-        self.current_logger.set(None)
+        self._cv_logger.set(None)
+        self._local_logger = None
         self.current_parent.set(None)
         self.current_span.set(NOOP_SPAN)
@@ -485,7 +494,8 @@ class BraintrustState:
             if k
             not in (
                 "current_experiment",
-                "current_logger",
+                "_cv_logger",
+                "_local_logger",
                 "current_parent",
                 "current_span",
                 "_global_bg_logger",
@@ -555,10 +565,6 @@ class BraintrustState:
             self._user_info = self.api_conn().get_json("ping")
         return self._user_info
-    def set_user_info_if_null(self, info: Mapping[str, Any]):
-        if not self._user_info:
-            self._user_info = info
     def global_bg_logger(self) -> "_BackgroundLogger":
         return getattr(self._override_bg_logger, "logger", None) or self._global_bg_logger.get()
@@ -620,14 +626,28 @@ class RetryRequestExceptionsAdapter(HTTPAdapter):
         base_num_retries: Maximum number of retries before giving up and re-raising the exception.
         backoff_factor: A multiplier used to determine the time to wait between retries.
                        The actual wait time is calculated as: backoff_factor * (2 ** retry_count).
+        default_timeout_secs: Default timeout in seconds for requests that don't specify one.
+                             Prevents indefinite hangs on stale connections.
     """
-    def __init__(self, *args: Any, base_num_retries: int = 0, backoff_factor: float = 0.5, **kwargs: Any):
+    def __init__(
+        self,
+        *args: Any,
+        base_num_retries: int = 0,
+        backoff_factor: float = 0.5,
+        default_timeout_secs: float = 60,
+        **kwargs: Any,
+    ):
         self.base_num_retries = base_num_retries
         self.backoff_factor = backoff_factor
+        self.default_timeout_secs = default_timeout_secs
         super().__init__(*args, **kwargs)
     def send(self, *args, **kwargs):
+        # Apply default timeout if none provided to prevent indefinite hangs
+        if kwargs.get("timeout") is None:
+            kwargs["timeout"] = self.default_timeout_secs
         num_prev_retries = 0
         while True:
             try:
@@ -639,6 +659,14 @@ class RetryRequestExceptionsAdapter(HTTPAdapter):
                 return response
             except (urllib3.exceptions.HTTPError, requests.exceptions.RequestException) as e:
                 if num_prev_retries < self.base_num_retries:
+                    if isinstance(e, requests.exceptions.ReadTimeout):
+                        # Clear all connection pools to discard stale connections. This
+                        # fixes hangs caused by NAT gateways silently dropping idle TCP
+                        # connections (e.g., Azure's ~4 min timeout). close() calls
+                        # PoolManager.clear() which is thread-safe: in-flight requests
+                        # keep their checked-out connections, and new requests create
+                        # fresh pools on demand.
+                        self.close()
                     # Emulates the sleeping logic in the backoff_factor of urllib3 Retry
                     sleep_s = self.backoff_factor * (2**num_prev_retries)
                     print("Retrying request after error:", e, file=sys.stderr)
@@ -660,14 +688,16 @@ class HTTPConnection:
     def ping(self) -> bool:
         try:
             resp = self.get("ping")
-            _state.set_user_info_if_null(resp.json())
             return resp.ok
         except requests.exceptions.ConnectionError:
             return False
     def make_long_lived(self) -> None:
         if not self.adapter:
-            self.adapter = RetryRequestExceptionsAdapter(base_num_retries=10, backoff_factor=0.5)
+            timeout_secs = parse_env_var_float("BRAINTRUST_HTTP_TIMEOUT", 60.0)
+            self.adapter = RetryRequestExceptionsAdapter(
+                base_num_retries=10, backoff_factor=0.5, default_timeout_secs=timeout_secs
+            )
         self._reset()
     @staticmethod
@@ -712,6 +742,8 @@ class HTTPConnection:
         return self.session.delete(_urljoin(self.base_url, path), *args, **kwargs)
     def get_json(self, object_type: str, args: Mapping[str, Any] | None = None, retries: int = 0) -> Mapping[str, Any]:
+        # FIXME[matt]: the retry logic seems to be unused and could be n*2 because of the the retry logic
+        # in the RetryRequestExceptionsAdapter. We should probably remove this.
         tries = retries + 1
         for i in range(tries):
             resp = self.get(f"/{object_type}", params=args)
@@ -1634,7 +1666,8 @@ def init_logger(
     if set_current:
         if _state is None:
             raise RuntimeError("_state is None in init_logger. This should never happen.")
-        _state.current_logger.set(ret)
+        _state._cv_logger.set(ret)
+        _state._local_logger = ret
     return ret
@@ -1955,7 +1988,7 @@ def current_experiment() -> Optional["Experiment"]:
 def current_logger() -> Optional["Logger"]:
     """Returns the currently-active logger (set by `braintrust.init_logger(...)`). Returns None if no current logger has been set."""
-    return _state.current_logger.get()
+    return _state._cv_logger.get() or _state._local_logger
 def current_span() -> Span:
@@ -3984,6 +4017,9 @@ class SpanImpl(Span):
         use_v4 = os.getenv("BRAINTRUST_OTEL_COMPAT", "false").lower() == "true"
         span_components_class = SpanComponentsV4 if use_v4 else SpanComponentsV3
+        # Disable span cache since remote function spans won't be in the local cache
+        self.state.span_cache.disable()
         return span_components_class(
             object_type=self.parent_object_type,
             object_id=object_id,
@@ -3997,7 +4033,7 @@ class SpanImpl(Span):
     def link(self) -> str:
         parent_type, info = self._get_parent_info()
         if parent_type == SpanObjectTypeV3.PROJECT_LOGS:
-            cur_logger = self.state.current_logger.get()
+            cur_logger = self.state._cv_logger.get() or self.state._local_logger
             if not cur_logger:
                 return NOOP_SPAN_PERMALINK
             base_url = cur_logger._get_link_base_url()

{braintrust-0.5.0 → braintrust-0.5.2}/src/braintrust/oai.py RENAMED Viewed

@@ -5,6 +5,8 @@ import time
 from collections.abc import Callable
 from typing import Any
+from wrapt import wrap_function_wrapper
 from .logger import Attachment, Span, start_span
 from .span_types import SpanTypeAttribute
 from .util import merge_dicts
@@ -986,3 +988,52 @@ def _is_not_given(value: Any) -> bool:
         return type_name == "NotGiven"
     except Exception:
         return False
+def _openai_init_wrapper(wrapped, instance, args, kwargs):
+    """Wrapper for OpenAI.__init__ that applies tracing after initialization."""
+    wrapped(*args, **kwargs)
+    _apply_openai_wrapper(instance)
+def patch_openai() -> bool:
+    """
+    Patch OpenAI to add Braintrust tracing globally.
+    After calling this, all new OpenAI() and AsyncOpenAI() clients
+    will automatically have tracing enabled.
+    Returns:
+        True if OpenAI was patched (or already patched), False if OpenAI is not installed.
+    Example:
+        ```python
+        import braintrust
+        braintrust.patch_openai()
+        import openai
+        client = openai.OpenAI()
+        # All calls are now traced!
+        ```
+    """
+    try:
+        import openai
+        if getattr(openai, "__braintrust_wrapped__", False):
+            return True  # Already patched
+        wrap_function_wrapper("openai", "OpenAI.__init__", _openai_init_wrapper)
+        wrap_function_wrapper("openai", "AsyncOpenAI.__init__", _openai_init_wrapper)
+        openai.__braintrust_wrapped__ = True
+        return True
+    except ImportError:
+        return False
+def _apply_openai_wrapper(client):
+    """Apply tracing wrapper to an OpenAI client instance in-place."""
+    wrapped = wrap_openai(client)
+    for attr in ("chat", "responses", "embeddings", "moderations", "beta"):
+        if hasattr(wrapped, attr):
+            setattr(client, attr, getattr(wrapped, attr))

{braintrust-0.5.0 → braintrust-0.5.2}/src/braintrust/test_bt_json.py RENAMED Viewed

@@ -302,11 +302,6 @@ def test_to_bt_safe_special_objects():
     assert _to_bt_safe(dataset) == "<dataset>"
     assert _to_bt_safe(logger) == "<logger>"
-    # Clean up
-    exp.flush()
-    dataset.flush()
-    logger.flush()
 class TestBTJsonAttachments(TestCase):
     def test_to_bt_safe_attachments(self):

{braintrust-0.5.0 → braintrust-0.5.2}/src/braintrust/test_framework.py RENAMED Viewed

@@ -1,6 +1,8 @@
 from typing import List
+from unittest.mock import MagicMock
 import pytest
+from braintrust.logger import BraintrustState
 from .framework import (
     Eval,
@@ -241,6 +243,7 @@ async def test_hooks_trial_index_multiple_inputs():
     assert sorted(input_2_trials) == [0, 1]
+@pytest.mark.vcr
 @pytest.mark.asyncio
 async def test_scorer_spans_have_purpose_attribute(with_memory_logger, with_simulate_login):
     """Test that scorer spans have span_attributes.purpose='scorer' and propagate to subspans."""
@@ -527,3 +530,37 @@ async def test_hooks_without_setting_tags(with_memory_logger, with_simulate_logi
     root_span = [log for log in logs if not log["span_parents"]]
     assert len(root_span) == 1
     assert root_span[0].get("tags") == None
+@pytest.mark.asyncio
+async def test_eval_enable_cache():
+    state = BraintrustState()
+    state.span_cache = MagicMock()
+    # Test enable_cache=False
+    await Eval(
+        "test-enable-cache-false",
+        data=[EvalCase(input=1, expected=1)],
+        task=lambda x: x,
+        scores=[],
+        state=state,
+        no_send_logs=True,
+        enable_cache=False,
+    )
+    state.span_cache.start.assert_not_called()
+    state.span_cache.stop.assert_not_called()
+    # Test enable_cache=True (default)
+    state.span_cache.start.reset_mock()
+    state.span_cache.stop.reset_mock()
+    await Eval(
+        "test-enable-cache-true",
+        data=[EvalCase(input=1, expected=1)],
+        task=lambda x: x,
+        scores=[],
+        state=state,
+        no_send_logs=True,
+        # enable_cache defaults to True
+    )
+    state.span_cache.start.assert_called()
+    state.span_cache.stop.assert_called()

braintrust 0.5.0__tar.gz → 0.5.2__tar.gz

braintrust 0.5.0__tar.gz → 0.5.2__tar.gz