PyPI - braintrust - Versions diffs - 0.4.3__tar.gz → 0.5.0__tar.gz - Mend

braintrust 0.4.3__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (134) hide show

{braintrust-0.4.3 → braintrust-0.5.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: braintrust
-Version: 0.4.3
+Version: 0.5.0
 Summary: SDK for integrating Braintrust
 Home-page: https://www.braintrust.dev
 Author: Braintrust

{braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/_generated_types.py RENAMED Viewed

@@ -144,6 +144,11 @@ class AsyncScoringControlAsyncScoringControl5(TypedDict):
     triggered_xact_id: str
+class AsyncScoringControlAsyncScoringControl6(TypedDict):
+    kind: Literal['mark_attempt_failed']
+    function_ids: Sequence[Any]
 class AsyncScoringStateAsyncScoringState(TypedDict):
     status: Literal['enabled']
     token: str
@@ -484,6 +489,10 @@ class Dataset(TypedDict):
     """
     User-controlled metadata about the dataset
     """
+    url_slug: str
+    """
+    URL slug for the dataset. used to construct dataset URLs
+    """
 class DatasetEventMetadata(TypedDict):
@@ -532,6 +541,43 @@ class EnvVar(TypedDict):
     """
+class EvalStatusPageConfig(TypedDict):
+    score_columns: NotRequired[Sequence[str] | None]
+    """
+    The score columns to display on the page
+    """
+    metric_columns: NotRequired[Sequence[str] | None]
+    """
+    The metric columns to display on the page
+    """
+    grouping_field: NotRequired[str | None]
+    """
+    The metadata field to use for grouping experiments (model)
+    """
+    filter: NotRequired[str | None]
+    """
+    BTQL filter to apply to experiment data
+    """
+    sort_by: NotRequired[str | None]
+    """
+    Field to sort results by (format: 'score:<name>' or 'metric:<name>')
+    """
+    sort_order: NotRequired[Literal['asc', 'desc'] | None]
+    """
+    Sort order (ascending or descending)
+    """
+    api_key: NotRequired[str | None]
+    """
+    The API key used for fetching experiment data
+    """
+EvalStatusPageTheme: TypeAlias = Literal['light', 'dark']
+"""
+The theme for the page
+"""
 class ExperimentEventMetadata(TypedDict):
     model: NotRequired[str | None]
     """
@@ -749,20 +795,24 @@ FunctionIdRef: TypeAlias = Mapping[str, Any]
 FunctionObjectType: TypeAlias = Literal[
-    'prompt', 'tool', 'scorer', 'task', 'custom_view', 'preprocessor', 'facet'
+    'prompt', 'tool', 'scorer', 'task', 'workflow', 'custom_view', 'preprocessor', 'facet', 'classifier'
 ]
-FunctionOutputType: TypeAlias = Literal['completion', 'score', 'any']
+FunctionOutputType: TypeAlias = Literal['completion', 'score', 'facet', 'classification', 'any']
-FunctionTypeEnum: TypeAlias = Literal['llm', 'scorer', 'task', 'tool', 'custom_view', 'preprocessor', 'facet']
+FunctionTypeEnum: TypeAlias = Literal[
+    'llm', 'scorer', 'task', 'tool', 'custom_view', 'preprocessor', 'facet', 'classifier'
+]
 """
 The type of global function. Defaults to 'scorer'.
 """
-FunctionTypeEnumNullish: TypeAlias = Literal['llm', 'scorer', 'task', 'tool', 'custom_view', 'preprocessor', 'facet']
+FunctionTypeEnumNullish: TypeAlias = Literal[
+    'llm', 'scorer', 'task', 'tool', 'custom_view', 'preprocessor', 'facet', 'classifier'
+]
 class GitMetadataSettings(TypedDict):
@@ -1674,7 +1724,18 @@ class PromptDataNullishOrigin(TypedDict):
 class PromptParserNullish(TypedDict):
     type: Literal['llm_classifier']
     use_cot: bool
-    choice_scores: Mapping[str, float]
+    choice_scores: NotRequired[Mapping[str, float] | None]
+    """
+    Map of choices to scores (0-1). Used by scorers.
+    """
+    choice: NotRequired[Sequence[str] | None]
+    """
+    List of valid choices without score mapping. Used by classifiers that deposit output to tags.
+    """
+    allow_no_match: NotRequired[bool | None]
+    """
+    If true, adds a 'No match' option. When selected, no tag is deposited.
+    """
 class PromptSessionEvent(TypedDict):
@@ -2104,7 +2165,7 @@ class SpanScope(TypedDict):
 SpanType: TypeAlias = Literal[
-    'llm', 'score', 'function', 'eval', 'task', 'tool', 'automation', 'facet', 'preprocessor'
+    'llm', 'score', 'function', 'eval', 'task', 'tool', 'automation', 'facet', 'preprocessor', 'classifier'
 ]
 """
 Type of the span, for display purposes only
@@ -2384,6 +2445,7 @@ AsyncScoringControl: TypeAlias = (
     | AsyncScoringControlAsyncScoringControl3
     | AsyncScoringControlAsyncScoringControl4
     | AsyncScoringControlAsyncScoringControl5
+    | AsyncScoringControlAsyncScoringControl6
 )
@@ -2530,6 +2592,43 @@ class DatasetEvent(TypedDict):
     """
+class EvalStatusPage(TypedDict):
+    id: str
+    """
+    Unique identifier for the eval status page
+    """
+    project_id: str
+    """
+    Unique identifier for the project that the eval status page belongs under
+    """
+    user_id: NotRequired[str | None]
+    """
+    Identifies the user who created the eval status page
+    """
+    created: NotRequired[str | None]
+    """
+    Date of eval status page creation
+    """
+    deleted_at: NotRequired[str | None]
+    """
+    Date of eval status page deletion, or null if the eval status page is still active
+    """
+    name: str
+    """
+    Name of the eval status page
+    """
+    description: NotRequired[str | None]
+    """
+    Textual description of the eval status page
+    """
+    logo_url: NotRequired[str | None]
+    """
+    URL of the logo to display on the page
+    """
+    theme: EvalStatusPageTheme
+    config: EvalStatusPageConfig
 class Experiment(TypedDict):
     id: str
     """
@@ -3228,6 +3327,7 @@ class View(TypedDict):
         'prompts',
         'tools',
         'scorers',
+        'classifiers',
         'logs',
         'monitor',
         'for_review',

{braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/framework.py RENAMED Viewed

@@ -1280,6 +1280,29 @@ async def _run_evaluator_internal(
     filters: list[Filter],
     stream: Callable[[SSEProgressEvent], None] | None = None,
     state: BraintrustState | None = None,
+):
+    # Start span cache for this eval (it's disabled by default to avoid temp files outside of evals)
+    if state is None:
+        from braintrust.logger import _internal_get_global_state
+        state = _internal_get_global_state()
+    state.span_cache.start()
+    try:
+        return await _run_evaluator_internal_impl(experiment, evaluator, position, filters, stream, state)
+    finally:
+        # Clean up disk-based span cache after eval completes and stop caching
+        state.span_cache.dispose()
+        state.span_cache.stop()
+async def _run_evaluator_internal_impl(
+    experiment,
+    evaluator: Evaluator,
+    position: int | None,
+    filters: list[Filter],
+    stream: Callable[[SSEProgressEvent], None] | None = None,
+    state: BraintrustState | None = None,
 ):
     event_loop = asyncio.get_event_loop()
@@ -1290,11 +1313,13 @@ async def _run_evaluator_internal(
             {**parent_propagated},
             {"span_attributes": {"purpose": "scorer"}},
         )
+        # Strip trace from logged input - it's internal plumbing that shouldn't appear in spans
+        logged_input = {k: v for k, v in kwargs.items() if k != "trace"}
         with root_span.start_span(
             name=name,
             span_attributes={"type": SpanTypeAttribute.SCORE, "purpose": "scorer"},
             propagated_event=merged_propagated,
-            input=dict(**kwargs),
+            input=logged_input,
         ) as span:
             score = scorer
             if hasattr(scorer, "eval_async"):
@@ -1415,6 +1440,77 @@ async def _run_evaluator_internal(
                 tags = hooks.tags if hooks.tags else None
                 root_span.log(output=output, metadata=metadata, tags=tags)
+                # Create trace object for scorers
+                from braintrust.trace import LocalTrace
+                async def ensure_spans_flushed():
+                    # Flush native Braintrust spans
+                    if experiment:
+                        await asyncio.get_event_loop().run_in_executor(
+                            None, lambda: experiment.state.flush()
+                        )
+                    elif state:
+                        await asyncio.get_event_loop().run_in_executor(None, lambda: state.flush())
+                    else:
+                        from braintrust.logger import flush as flush_logger
+                        await asyncio.get_event_loop().run_in_executor(None, flush_logger)
+                    # Also flush OTEL spans if registered
+                    if state:
+                        await state.flush_otel()
+                experiment_id = None
+                if experiment:
+                    try:
+                        experiment_id = experiment.id
+                    except:
+                        experiment_id = None
+                trace = None
+                if state or experiment:
+                    # Get the state to use
+                    trace_state = state
+                    if not trace_state and experiment:
+                        trace_state = experiment.state
+                    if not trace_state:
+                        # Fall back to global state
+                        from braintrust.logger import _internal_get_global_state
+                        trace_state = _internal_get_global_state()
+                    # Access root_span_id from the concrete SpanImpl instance
+                    # The Span interface doesn't expose this but SpanImpl has it
+                    root_span_id_value = getattr(root_span, "root_span_id", root_span.id)
+                    # Check if there's a parent in the context to determine object_type and object_id
+                    from braintrust.span_identifier_v3 import SpanComponentsV3, span_object_type_v3_to_typed_string
+                    parent_str = trace_state.current_parent.get()
+                    parent_components = None
+                    if parent_str:
+                        try:
+                            parent_components = SpanComponentsV3.from_str(parent_str)
+                        except Exception:
+                            # If parsing fails, parent_components stays None
+                            pass
+                    # Determine object_type and object_id based on parent or experiment
+                    if parent_components:
+                        trace_object_type = span_object_type_v3_to_typed_string(parent_components.object_type)
+                        trace_object_id = parent_components.object_id or ""
+                    else:
+                        trace_object_type = "experiment"
+                        trace_object_id = experiment_id or ""
+                    trace = LocalTrace(
+                        object_type=trace_object_type,
+                        object_id=trace_object_id,
+                        root_span_id=root_span_id_value,
+                        ensure_spans_flushed=ensure_spans_flushed,
+                        state=trace_state,
+                    )
                 score_promises = [
                     asyncio.create_task(
                         await_or_run_scorer(
@@ -1426,6 +1522,7 @@ async def _run_evaluator_internal(
                                 "expected": datum.expected,
                                 "metadata": metadata,
                                 "output": output,
+                                "trace": trace,
                             },
                         )
                     )

{braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/functions/invoke.py RENAMED Viewed

@@ -3,7 +3,7 @@ from typing import Any, Literal, TypedDict, TypeVar, overload
 from sseclient import SSEClient
 from .._generated_types import FunctionTypeEnum
-from ..logger import Exportable, get_span_parent_object, login, proxy_conn
+from ..logger import Exportable, _internal_get_global_state, get_span_parent_object, login, proxy_conn
 from ..util import response_raise_for_status
 from .constants import INVOKE_API_VERSION
 from .stream import BraintrustInvokeError, BraintrustStream
@@ -243,6 +243,8 @@ def init_function(project_name: str, slug: str, version: str | None = None):
     :param version: Optional version of the function to use. Defaults to latest.
     :return: A function that can be used as a task or scorer.
     """
+    # Disable span cache since remote function spans won't be in the local cache
+    _internal_get_global_state().span_cache.disable()
     def f(*args: Any, **kwargs: Any) -> Any:
         if len(args) > 0:

braintrust-0.5.0/src/braintrust/functions/test_invoke.py ADDED Viewed

@@ -0,0 +1,61 @@
+"""Tests for the invoke module, particularly init_function."""
+from braintrust.functions.invoke import init_function
+from braintrust.logger import _internal_get_global_state, _internal_reset_global_state
+class TestInitFunction:
+    """Tests for init_function."""
+    def setup_method(self):
+        """Reset state before each test."""
+        _internal_reset_global_state()
+    def teardown_method(self):
+        """Clean up after each test."""
+        _internal_reset_global_state()
+    def test_init_function_disables_span_cache(self):
+        """Test that init_function disables the span cache."""
+        state = _internal_get_global_state()
+        # Cache should be disabled by default (it's only enabled during evals)
+        assert state.span_cache.disabled is True
+        # Enable the cache (simulating what happens during eval)
+        state.span_cache.start()
+        assert state.span_cache.disabled is False
+        # Call init_function
+        f = init_function("test-project", "test-function")
+        # Cache should now be disabled (init_function explicitly disables it)
+        assert state.span_cache.disabled is True
+        assert f.__name__ == "init_function-test-project-test-function-latest"
+    def test_init_function_with_version(self):
+        """Test that init_function creates a function with the correct name including version."""
+        f = init_function("my-project", "my-scorer", version="v1")
+        assert f.__name__ == "init_function-my-project-my-scorer-v1"
+    def test_init_function_without_version_uses_latest(self):
+        """Test that init_function uses 'latest' in name when version not specified."""
+        f = init_function("my-project", "my-scorer")
+        assert f.__name__ == "init_function-my-project-my-scorer-latest"
+    def test_init_function_permanently_disables_cache(self):
+        """Test that init_function permanently disables the cache (can't be re-enabled)."""
+        state = _internal_get_global_state()
+        # Enable the cache
+        state.span_cache.start()
+        assert state.span_cache.disabled is False
+        # Call init_function
+        init_function("test-project", "test-function")
+        assert state.span_cache.disabled is True
+        # Try to start again - should still be disabled because of explicit disable
+        state.span_cache.start()
+        assert state.span_cache.disabled is True

{braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/generated_types.py RENAMED Viewed

@@ -1,4 +1,4 @@
-"""Auto-generated file (internal git SHA 87ac73f4945a47eff2d4e42775ba4dbc58854c73) -- do not modify"""
+"""Auto-generated file (internal git SHA 21146f64bf5ad1eadd3a99d186274728e25e5399) -- do not modify"""
 from ._generated_types import (
     Acl,
@@ -29,6 +29,9 @@ from ._generated_types import (
     Dataset,
     DatasetEvent,
     EnvVar,
+    EvalStatusPage,
+    EvalStatusPageConfig,
+    EvalStatusPageTheme,
     Experiment,
     ExperimentEvent,
     ExtendedSavedFunctionId,
@@ -136,6 +139,9 @@ __all__ = [
     "Dataset",
     "DatasetEvent",
     "EnvVar",
+    "EvalStatusPage",
+    "EvalStatusPageConfig",
+    "EvalStatusPageTheme",
     "Experiment",
     "ExperimentEvent",
     "ExtendedSavedFunctionId",

{braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/logger.py RENAMED Viewed

@@ -47,12 +47,9 @@ from urllib3.util.retry import Retry
 from . import context, id_gen
 from .bt_json import bt_dumps, bt_safe_deep_copy
 from .db_fields import (
-    ASYNC_SCORING_CONTROL_FIELD,
     AUDIT_METADATA_FIELD,
     AUDIT_SOURCE_FIELD,
     IS_MERGE_FIELD,
-    MERGE_PATHS_FIELD,
-    SKIP_ASYNC_SCORING_FIELD,
     TRANSACTION_ID_FIELD,
     VALID_SOURCES,
 )
@@ -101,6 +98,14 @@ from .xact_ids import prettify_xact
 Metadata = dict[str, Any]
 DATA_API_VERSION = 2
+class DatasetRef(TypedDict, total=False):
+    """Reference to a dataset by ID and optional version."""
+    id: str
+    version: str
 T = TypeVar("T")
 TMapping = TypeVar("TMapping", bound=Mapping[str, Any])
 TMutableMapping = TypeVar("TMutableMapping", bound=MutableMapping[str, Any])
@@ -396,6 +401,11 @@ class BraintrustState:
             ),
         )
+        from braintrust.span_cache import SpanCache
+        self.span_cache = SpanCache()
+        self._otel_flush_callback: Any | None = None
     def reset_login_info(self):
         self.app_url: str | None = None
         self.app_public_url: str | None = None
@@ -452,6 +462,21 @@ class BraintrustState:
         return self._context_manager
+    def register_otel_flush(self, callback: Any) -> None:
+        """
+        Register an OTEL flush callback. This is called by the OTEL integration
+        when it initializes a span processor/exporter.
+        """
+        self._otel_flush_callback = callback
+    async def flush_otel(self) -> None:
+        """
+        Flush OTEL spans if a callback is registered.
+        Called during ensure_spans_flushed to ensure OTEL spans are visible in BTQL.
+        """
+        if self._otel_flush_callback:
+            await self._otel_flush_callback()
     def copy_state(self, other: "BraintrustState"):
         """Copy login information from another BraintrustState instance."""
         self.__dict__.update({
@@ -1297,7 +1322,7 @@ def init(
     project: str | None = None,
     experiment: str | None = None,
     description: str | None = None,
-    dataset: Optional["Dataset"] = None,
+    dataset: Optional["Dataset"] | DatasetRef = None,
     open: bool = False,
     base_experiment: str | None = None,
     is_public: bool = False,
@@ -1410,12 +1435,19 @@ def init(
             args["base_exp_id"] = base_experiment_id
         elif base_experiment is not None:
             args["base_experiment"] = base_experiment
-        else:
+        elif merged_git_metadata_settings and merged_git_metadata_settings.collect != "none":
             args["ancestor_commits"] = list(get_past_n_ancestors())
         if dataset is not None:
-            args["dataset_id"] = dataset.id
-            args["dataset_version"] = dataset.version
+            if isinstance(dataset, dict):
+                # Simple {"id": ..., "version": ...} dict
+                args["dataset_id"] = dataset["id"]
+                if "version" in dataset:
+                    args["dataset_version"] = dataset["version"]
+            else:
+                # Full Dataset object
+                args["dataset_id"] = dataset.id
+                args["dataset_version"] = dataset.version
         if is_public is not None:
             args["public"] = is_public
@@ -1446,7 +1478,11 @@ def init(
     # For experiments, disable queue size limit enforcement (unlimited queue)
     state.enforce_queue_size_limit(False)
-    ret = Experiment(lazy_metadata=LazyValue(compute_metadata, use_mutex=True), dataset=dataset, state=state)
+    ret = Experiment(
+        lazy_metadata=LazyValue(compute_metadata, use_mutex=True),
+        dataset=dataset if isinstance(dataset, Dataset) else None,
+        state=state,
+    )
     if set_current:
         state.current_experiment = ret
     return ret
@@ -1761,6 +1797,25 @@ def login(
         _state.login(app_url=app_url, api_key=api_key, org_name=org_name, force_login=force_login)
+def register_otel_flush(callback: Any) -> None:
+    """
+    Register a callback to flush OTEL spans. This is called by the OTEL integration
+    when it initializes a span processor/exporter.
+    When ensure_spans_flushed is called (e.g., before a BTQL query in scorers),
+    this callback will be invoked to ensure OTEL spans are flushed to the server.
+    Also disables the span cache, since OTEL spans aren't in the local cache
+    and we need BTQL to see the complete span tree (both native + OTEL spans).
+    :param callback: The async callback function to flush OTEL spans.
+    """
+    global _state
+    _state.register_otel_flush(callback)
+    # Disable span cache since OTEL spans aren't in the local cache
+    _state.span_cache.disable()
 def login_to_state(
     app_url: str | None = None,
     api_key: str | None = None,
@@ -2323,30 +2378,6 @@ def _enrich_attachments(event: TMutableMapping) -> TMutableMapping:
 def _validate_and_sanitize_experiment_log_partial_args(event: Mapping[str, Any]) -> dict[str, Any]:
-    # Make sure only certain keys are specified.
-    forbidden_keys = set(event.keys()) - {
-        "input",
-        "output",
-        "expected",
-        "tags",
-        "scores",
-        "metadata",
-        "metrics",
-        "error",
-        "dataset_record_id",
-        "origin",
-        "inputs",
-        "span_attributes",
-        ASYNC_SCORING_CONTROL_FIELD,
-        MERGE_PATHS_FIELD,
-        SKIP_ASYNC_SCORING_FIELD,
-        "span_id",
-        "root_span_id",
-        "_bt_internal_override_pagination_key",
-    }
-    if forbidden_keys:
-        raise ValueError(f"The following keys are not permitted: {forbidden_keys}")
     scores = event.get("scores")
     if scores:
         for name, score in scores.items():
@@ -3855,6 +3886,21 @@ class SpanImpl(Span):
         if serializable_partial_record.get("metrics", {}).get("end") is not None:
             self._logged_end_time = serializable_partial_record["metrics"]["end"]
+        # Write to local span cache for scorer access
+        # Only cache experiment spans - regular logs don't need caching
+        if self.parent_object_type == SpanObjectTypeV3.EXPERIMENT:
+            from braintrust.span_cache import CachedSpan
+            cached_span = CachedSpan(
+                span_id=self.span_id,
+                input=serializable_partial_record.get("input"),
+                output=serializable_partial_record.get("output"),
+                metadata=serializable_partial_record.get("metadata"),
+                span_parents=self.span_parents,
+                span_attributes=serializable_partial_record.get("span_attributes"),
+            )
+            self.state.span_cache.queue_write(self.root_span_id, self.span_id, cached_span)
         def compute_record() -> dict[str, Any]:
             exporter = _get_exporter()
             return dict(

braintrust 0.4.3__tar.gz → 0.5.0__tar.gz

braintrust 0.4.3__tar.gz → 0.5.0__tar.gz