PyPI - braintrust - Versions diffs - 0.4.3__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

braintrust 0.4.3__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

braintrust/__init__.py +3 -0
braintrust/_generated_types.py +106 -6
braintrust/auto.py +179 -0
braintrust/conftest.py +23 -4
braintrust/framework.py +113 -3
braintrust/functions/invoke.py +3 -1
braintrust/functions/test_invoke.py +61 -0
braintrust/generated_types.py +7 -1
braintrust/logger.py +127 -45
braintrust/oai.py +51 -0
braintrust/span_cache.py +337 -0
braintrust/span_identifier_v3.py +21 -0
braintrust/test_bt_json.py +0 -5
braintrust/test_framework.py +37 -0
braintrust/test_http.py +444 -0
braintrust/test_logger.py +295 -5
braintrust/test_span_cache.py +344 -0
braintrust/test_trace.py +267 -0
braintrust/test_util.py +58 -1
braintrust/trace.py +385 -0
braintrust/util.py +20 -0
braintrust/version.py +2 -2
braintrust/wrappers/agno/__init__.py +2 -3
braintrust/wrappers/anthropic.py +64 -0
braintrust/wrappers/claude_agent_sdk/__init__.py +2 -3
braintrust/wrappers/claude_agent_sdk/_wrapper.py +48 -6
braintrust/wrappers/claude_agent_sdk/test_wrapper.py +115 -0
braintrust/wrappers/dspy.py +52 -1
braintrust/wrappers/google_genai/__init__.py +9 -6
braintrust/wrappers/litellm.py +6 -43
braintrust/wrappers/pydantic_ai.py +2 -3
braintrust/wrappers/test_agno.py +9 -0
braintrust/wrappers/test_anthropic.py +156 -0
braintrust/wrappers/test_dspy.py +117 -0
braintrust/wrappers/test_google_genai.py +9 -0
braintrust/wrappers/test_litellm.py +57 -55
braintrust/wrappers/test_openai.py +253 -1
braintrust/wrappers/test_pydantic_ai_integration.py +9 -0
braintrust/wrappers/test_utils.py +79 -0
{braintrust-0.4.3.dist-info → braintrust-0.5.2.dist-info}/METADATA +1 -1
{braintrust-0.4.3.dist-info → braintrust-0.5.2.dist-info}/RECORD +44 -37
{braintrust-0.4.3.dist-info → braintrust-0.5.2.dist-info}/WHEEL +1 -1
{braintrust-0.4.3.dist-info → braintrust-0.5.2.dist-info}/entry_points.txt +0 -0
{braintrust-0.4.3.dist-info → braintrust-0.5.2.dist-info}/top_level.txt +0 -0

braintrust/functions/test_invoke.py ADDED Viewed

@@ -0,0 +1,61 @@
+"""Tests for the invoke module, particularly init_function."""
+from braintrust.functions.invoke import init_function
+from braintrust.logger import _internal_get_global_state, _internal_reset_global_state
+class TestInitFunction:
+    """Tests for init_function."""
+    def setup_method(self):
+        """Reset state before each test."""
+        _internal_reset_global_state()
+    def teardown_method(self):
+        """Clean up after each test."""
+        _internal_reset_global_state()
+    def test_init_function_disables_span_cache(self):
+        """Test that init_function disables the span cache."""
+        state = _internal_get_global_state()
+        # Cache should be disabled by default (it's only enabled during evals)
+        assert state.span_cache.disabled is True
+        # Enable the cache (simulating what happens during eval)
+        state.span_cache.start()
+        assert state.span_cache.disabled is False
+        # Call init_function
+        f = init_function("test-project", "test-function")
+        # Cache should now be disabled (init_function explicitly disables it)
+        assert state.span_cache.disabled is True
+        assert f.__name__ == "init_function-test-project-test-function-latest"
+    def test_init_function_with_version(self):
+        """Test that init_function creates a function with the correct name including version."""
+        f = init_function("my-project", "my-scorer", version="v1")
+        assert f.__name__ == "init_function-my-project-my-scorer-v1"
+    def test_init_function_without_version_uses_latest(self):
+        """Test that init_function uses 'latest' in name when version not specified."""
+        f = init_function("my-project", "my-scorer")
+        assert f.__name__ == "init_function-my-project-my-scorer-latest"
+    def test_init_function_permanently_disables_cache(self):
+        """Test that init_function permanently disables the cache (can't be re-enabled)."""
+        state = _internal_get_global_state()
+        # Enable the cache
+        state.span_cache.start()
+        assert state.span_cache.disabled is False
+        # Call init_function
+        init_function("test-project", "test-function")
+        assert state.span_cache.disabled is True
+        # Try to start again - should still be disabled because of explicit disable
+        state.span_cache.start()
+        assert state.span_cache.disabled is True

braintrust/generated_types.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""Auto-generated file (internal git SHA 87ac73f4945a47eff2d4e42775ba4dbc58854c73) -- do not modify"""
+"""Auto-generated file (internal git SHA 21146f64bf5ad1eadd3a99d186274728e25e5399) -- do not modify"""
 from ._generated_types import (
     Acl,
@@ -29,6 +29,9 @@ from ._generated_types import (
     Dataset,
     DatasetEvent,
     EnvVar,
+    EvalStatusPage,
+    EvalStatusPageConfig,
+    EvalStatusPageTheme,
     Experiment,
     ExperimentEvent,
     ExtendedSavedFunctionId,
@@ -136,6 +139,9 @@ __all__ = [
     "Dataset",
     "DatasetEvent",
     "EnvVar",
+    "EvalStatusPage",
+    "EvalStatusPageConfig",
+    "EvalStatusPageTheme",
     "Experiment",
     "ExperimentEvent",
     "ExtendedSavedFunctionId",

braintrust/logger.py CHANGED Viewed

@@ -47,12 +47,9 @@ from urllib3.util.retry import Retry
 from . import context, id_gen
 from .bt_json import bt_dumps, bt_safe_deep_copy
 from .db_fields import (
-    ASYNC_SCORING_CONTROL_FIELD,
     AUDIT_METADATA_FIELD,
     AUDIT_SOURCE_FIELD,
     IS_MERGE_FIELD,
-    MERGE_PATHS_FIELD,
-    SKIP_ASYNC_SCORING_FIELD,
     TRANSACTION_ID_FIELD,
     VALID_SOURCES,
 )
@@ -90,6 +87,7 @@ from .util import (
     get_caller_location,
     mask_api_key,
     merge_dicts,
+    parse_env_var_float,
     response_raise_for_status,
 )
@@ -101,6 +99,14 @@ from .xact_ids import prettify_xact
 Metadata = dict[str, Any]
 DATA_API_VERSION = 2
+class DatasetRef(TypedDict, total=False):
+    """Reference to a dataset by ID and optional version."""
+    id: str
+    version: str
 T = TypeVar("T")
 TMapping = TypeVar("TMapping", bound=Mapping[str, Any])
 TMutableMapping = TypeVar("TMutableMapping", bound=MutableMapping[str, Any])
@@ -344,9 +350,16 @@ class BraintrustState:
     def __init__(self):
         self.id = str(uuid.uuid4())
         self.current_experiment: Experiment | None = None
-        self.current_logger: contextvars.ContextVar[Logger | None] = contextvars.ContextVar(
+        # We use both a ContextVar and a plain attribute for the current logger:
+        # - _cv_logger (ContextVar): Provides async context isolation so different
+        #   async tasks can have different loggers without affecting each other.
+        # - _local_logger (plain attribute): Fallback for threads, since ContextVars
+        #   don't propagate to new threads. This way if users don't want to do
+        #   anything specific they'll always have a "global logger"
+        self._cv_logger: contextvars.ContextVar[Logger | None] = contextvars.ContextVar(
             "braintrust_current_logger", default=None
         )
+        self._local_logger: Logger | None = None
         self.current_parent: contextvars.ContextVar[str | None] = contextvars.ContextVar(
             "braintrust_current_parent", default=None
         )
@@ -396,6 +409,11 @@ class BraintrustState:
             ),
         )
+        from braintrust.span_cache import SpanCache
+        self.span_cache = SpanCache()
+        self._otel_flush_callback: Any | None = None
     def reset_login_info(self):
         self.app_url: str | None = None
         self.app_public_url: str | None = None
@@ -415,7 +433,8 @@ class BraintrustState:
     def reset_parent_state(self):
         # reset possible parent state for tests
         self.current_experiment = None
-        self.current_logger.set(None)
+        self._cv_logger.set(None)
+        self._local_logger = None
         self.current_parent.set(None)
         self.current_span.set(NOOP_SPAN)
@@ -452,6 +471,21 @@ class BraintrustState:
         return self._context_manager
+    def register_otel_flush(self, callback: Any) -> None:
+        """
+        Register an OTEL flush callback. This is called by the OTEL integration
+        when it initializes a span processor/exporter.
+        """
+        self._otel_flush_callback = callback
+    async def flush_otel(self) -> None:
+        """
+        Flush OTEL spans if a callback is registered.
+        Called during ensure_spans_flushed to ensure OTEL spans are visible in BTQL.
+        """
+        if self._otel_flush_callback:
+            await self._otel_flush_callback()
     def copy_state(self, other: "BraintrustState"):
         """Copy login information from another BraintrustState instance."""
         self.__dict__.update({
@@ -460,7 +494,8 @@ class BraintrustState:
             if k
             not in (
                 "current_experiment",
-                "current_logger",
+                "_cv_logger",
+                "_local_logger",
                 "current_parent",
                 "current_span",
                 "_global_bg_logger",
@@ -530,10 +565,6 @@ class BraintrustState:
             self._user_info = self.api_conn().get_json("ping")
         return self._user_info
-    def set_user_info_if_null(self, info: Mapping[str, Any]):
-        if not self._user_info:
-            self._user_info = info
     def global_bg_logger(self) -> "_BackgroundLogger":
         return getattr(self._override_bg_logger, "logger", None) or self._global_bg_logger.get()
@@ -595,14 +626,28 @@ class RetryRequestExceptionsAdapter(HTTPAdapter):
         base_num_retries: Maximum number of retries before giving up and re-raising the exception.
         backoff_factor: A multiplier used to determine the time to wait between retries.
                        The actual wait time is calculated as: backoff_factor * (2 ** retry_count).
+        default_timeout_secs: Default timeout in seconds for requests that don't specify one.
+                             Prevents indefinite hangs on stale connections.
     """
-    def __init__(self, *args: Any, base_num_retries: int = 0, backoff_factor: float = 0.5, **kwargs: Any):
+    def __init__(
+        self,
+        *args: Any,
+        base_num_retries: int = 0,
+        backoff_factor: float = 0.5,
+        default_timeout_secs: float = 60,
+        **kwargs: Any,
+    ):
         self.base_num_retries = base_num_retries
         self.backoff_factor = backoff_factor
+        self.default_timeout_secs = default_timeout_secs
         super().__init__(*args, **kwargs)
     def send(self, *args, **kwargs):
+        # Apply default timeout if none provided to prevent indefinite hangs
+        if kwargs.get("timeout") is None:
+            kwargs["timeout"] = self.default_timeout_secs
         num_prev_retries = 0
         while True:
             try:
@@ -614,6 +659,14 @@ class RetryRequestExceptionsAdapter(HTTPAdapter):
                 return response
             except (urllib3.exceptions.HTTPError, requests.exceptions.RequestException) as e:
                 if num_prev_retries < self.base_num_retries:
+                    if isinstance(e, requests.exceptions.ReadTimeout):
+                        # Clear all connection pools to discard stale connections. This
+                        # fixes hangs caused by NAT gateways silently dropping idle TCP
+                        # connections (e.g., Azure's ~4 min timeout). close() calls
+                        # PoolManager.clear() which is thread-safe: in-flight requests
+                        # keep their checked-out connections, and new requests create
+                        # fresh pools on demand.
+                        self.close()
                     # Emulates the sleeping logic in the backoff_factor of urllib3 Retry
                     sleep_s = self.backoff_factor * (2**num_prev_retries)
                     print("Retrying request after error:", e, file=sys.stderr)
@@ -635,14 +688,16 @@ class HTTPConnection:
     def ping(self) -> bool:
         try:
             resp = self.get("ping")
-            _state.set_user_info_if_null(resp.json())
             return resp.ok
         except requests.exceptions.ConnectionError:
             return False
     def make_long_lived(self) -> None:
         if not self.adapter:
-            self.adapter = RetryRequestExceptionsAdapter(base_num_retries=10, backoff_factor=0.5)
+            timeout_secs = parse_env_var_float("BRAINTRUST_HTTP_TIMEOUT", 60.0)
+            self.adapter = RetryRequestExceptionsAdapter(
+                base_num_retries=10, backoff_factor=0.5, default_timeout_secs=timeout_secs
+            )
         self._reset()
     @staticmethod
@@ -687,6 +742,8 @@ class HTTPConnection:
         return self.session.delete(_urljoin(self.base_url, path), *args, **kwargs)
     def get_json(self, object_type: str, args: Mapping[str, Any] | None = None, retries: int = 0) -> Mapping[str, Any]:
+        # FIXME[matt]: the retry logic seems to be unused and could be n*2 because of the retry logic
+        # in the RetryRequestExceptionsAdapter. We should probably remove this.
         tries = retries + 1
         for i in range(tries):
             resp = self.get(f"/{object_type}", params=args)
@@ -1297,7 +1354,7 @@ def init(
     project: str | None = None,
     experiment: str | None = None,
     description: str | None = None,
-    dataset: Optional["Dataset"] = None,
+    dataset: Optional["Dataset"] | DatasetRef = None,
     open: bool = False,
     base_experiment: str | None = None,
     is_public: bool = False,
@@ -1410,12 +1467,19 @@ def init(
             args["base_exp_id"] = base_experiment_id
         elif base_experiment is not None:
             args["base_experiment"] = base_experiment
-        else:
+        elif merged_git_metadata_settings and merged_git_metadata_settings.collect != "none":
             args["ancestor_commits"] = list(get_past_n_ancestors())
         if dataset is not None:
-            args["dataset_id"] = dataset.id
-            args["dataset_version"] = dataset.version
+            if isinstance(dataset, dict):
+                # Simple {"id": ..., "version": ...} dict
+                args["dataset_id"] = dataset["id"]
+                if "version" in dataset:
+                    args["dataset_version"] = dataset["version"]
+            else:
+                # Full Dataset object
+                args["dataset_id"] = dataset.id
+                args["dataset_version"] = dataset.version
         if is_public is not None:
             args["public"] = is_public
@@ -1446,7 +1510,11 @@ def init(
     # For experiments, disable queue size limit enforcement (unlimited queue)
     state.enforce_queue_size_limit(False)
-    ret = Experiment(lazy_metadata=LazyValue(compute_metadata, use_mutex=True), dataset=dataset, state=state)
+    ret = Experiment(
+        lazy_metadata=LazyValue(compute_metadata, use_mutex=True),
+        dataset=dataset if isinstance(dataset, Dataset) else None,
+        state=state,
+    )
     if set_current:
         state.current_experiment = ret
     return ret
@@ -1598,7 +1666,8 @@ def init_logger(
     if set_current:
         if _state is None:
             raise RuntimeError("_state is None in init_logger. This should never happen.")
-        _state.current_logger.set(ret)
+        _state._cv_logger.set(ret)
+        _state._local_logger = ret
     return ret
@@ -1761,6 +1830,25 @@ def login(
         _state.login(app_url=app_url, api_key=api_key, org_name=org_name, force_login=force_login)
+def register_otel_flush(callback: Any) -> None:
+    """
+    Register a callback to flush OTEL spans. This is called by the OTEL integration
+    when it initializes a span processor/exporter.
+    When ensure_spans_flushed is called (e.g., before a BTQL query in scorers),
+    this callback will be invoked to ensure OTEL spans are flushed to the server.
+    Also disables the span cache, since OTEL spans aren't in the local cache
+    and we need BTQL to see the complete span tree (both native + OTEL spans).
+    :param callback: The async callback function to flush OTEL spans.
+    """
+    global _state
+    _state.register_otel_flush(callback)
+    # Disable span cache since OTEL spans aren't in the local cache
+    _state.span_cache.disable()
 def login_to_state(
     app_url: str | None = None,
     api_key: str | None = None,
@@ -1900,7 +1988,7 @@ def current_experiment() -> Optional["Experiment"]:
 def current_logger() -> Optional["Logger"]:
     """Returns the currently-active logger (set by `braintrust.init_logger(...)`). Returns None if no current logger has been set."""
-    return _state.current_logger.get()
+    return _state._cv_logger.get() or _state._local_logger
 def current_span() -> Span:
@@ -2323,30 +2411,6 @@ def _enrich_attachments(event: TMutableMapping) -> TMutableMapping:
 def _validate_and_sanitize_experiment_log_partial_args(event: Mapping[str, Any]) -> dict[str, Any]:
-    # Make sure only certain keys are specified.
-    forbidden_keys = set(event.keys()) - {
-        "input",
-        "output",
-        "expected",
-        "tags",
-        "scores",
-        "metadata",
-        "metrics",
-        "error",
-        "dataset_record_id",
-        "origin",
-        "inputs",
-        "span_attributes",
-        ASYNC_SCORING_CONTROL_FIELD,
-        MERGE_PATHS_FIELD,
-        SKIP_ASYNC_SCORING_FIELD,
-        "span_id",
-        "root_span_id",
-        "_bt_internal_override_pagination_key",
-    }
-    if forbidden_keys:
-        raise ValueError(f"The following keys are not permitted: {forbidden_keys}")
     scores = event.get("scores")
     if scores:
         for name, score in scores.items():
@@ -3855,6 +3919,21 @@ class SpanImpl(Span):
         if serializable_partial_record.get("metrics", {}).get("end") is not None:
             self._logged_end_time = serializable_partial_record["metrics"]["end"]
+        # Write to local span cache for scorer access
+        # Only cache experiment spans - regular logs don't need caching
+        if self.parent_object_type == SpanObjectTypeV3.EXPERIMENT:
+            from braintrust.span_cache import CachedSpan
+            cached_span = CachedSpan(
+                span_id=self.span_id,
+                input=serializable_partial_record.get("input"),
+                output=serializable_partial_record.get("output"),
+                metadata=serializable_partial_record.get("metadata"),
+                span_parents=self.span_parents,
+                span_attributes=serializable_partial_record.get("span_attributes"),
+            )
+            self.state.span_cache.queue_write(self.root_span_id, self.span_id, cached_span)
         def compute_record() -> dict[str, Any]:
             exporter = _get_exporter()
             return dict(
@@ -3938,6 +4017,9 @@ class SpanImpl(Span):
         use_v4 = os.getenv("BRAINTRUST_OTEL_COMPAT", "false").lower() == "true"
         span_components_class = SpanComponentsV4 if use_v4 else SpanComponentsV3
+        # Disable span cache since remote function spans won't be in the local cache
+        self.state.span_cache.disable()
         return span_components_class(
             object_type=self.parent_object_type,
             object_id=object_id,
@@ -3951,7 +4033,7 @@ class SpanImpl(Span):
     def link(self) -> str:
         parent_type, info = self._get_parent_info()
         if parent_type == SpanObjectTypeV3.PROJECT_LOGS:
-            cur_logger = self.state.current_logger.get()
+            cur_logger = self.state._cv_logger.get() or self.state._local_logger
             if not cur_logger:
                 return NOOP_SPAN_PERMALINK
             base_url = cur_logger._get_link_base_url()

braintrust/oai.py CHANGED Viewed

@@ -5,6 +5,8 @@ import time
 from collections.abc import Callable
 from typing import Any
+from wrapt import wrap_function_wrapper
 from .logger import Attachment, Span, start_span
 from .span_types import SpanTypeAttribute
 from .util import merge_dicts
@@ -986,3 +988,52 @@ def _is_not_given(value: Any) -> bool:
         return type_name == "NotGiven"
     except Exception:
         return False
+def _openai_init_wrapper(wrapped, instance, args, kwargs):
+    """Wrapper for OpenAI.__init__ that applies tracing after initialization."""
+    wrapped(*args, **kwargs)
+    _apply_openai_wrapper(instance)
+def patch_openai() -> bool:
+    """
+    Patch OpenAI to add Braintrust tracing globally.
+    After calling this, all new OpenAI() and AsyncOpenAI() clients
+    will automatically have tracing enabled.
+    Returns:
+        True if OpenAI was patched (or already patched), False if OpenAI is not installed.
+    Example:
+        ```python
+        import braintrust
+        braintrust.patch_openai()
+        import openai
+        client = openai.OpenAI()
+        # All calls are now traced!
+        ```
+    """
+    try:
+        import openai
+        if getattr(openai, "__braintrust_wrapped__", False):
+            return True  # Already patched
+        wrap_function_wrapper("openai", "OpenAI.__init__", _openai_init_wrapper)
+        wrap_function_wrapper("openai", "AsyncOpenAI.__init__", _openai_init_wrapper)
+        openai.__braintrust_wrapped__ = True
+        return True
+    except ImportError:
+        return False
+def _apply_openai_wrapper(client):
+    """Apply tracing wrapper to an OpenAI client instance in-place."""
+    wrapped = wrap_openai(client)
+    for attr in ("chat", "responses", "embeddings", "moderations", "beta"):
+        if hasattr(wrapped, attr):
+            setattr(client, attr, getattr(wrapped, attr))

braintrust 0.4.3__py3-none-any.whl → 0.5.2__py3-none-any.whl

braintrust 0.4.3__py3-none-any.whl → 0.5.2__py3-none-any.whl