braintrust 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Files changed (38)
  1. braintrust/_generated_types.py +328 -126
  2. braintrust/cli/install/api.py +1 -1
  3. braintrust/conftest.py +24 -0
  4. braintrust/devserver/test_server_integration.py +0 -11
  5. braintrust/framework.py +98 -1
  6. braintrust/functions/invoke.py +4 -9
  7. braintrust/functions/test_invoke.py +61 -0
  8. braintrust/generated_types.py +13 -7
  9. braintrust/logger.py +107 -66
  10. braintrust/prompt_cache/test_disk_cache.py +3 -3
  11. braintrust/span_cache.py +337 -0
  12. braintrust/span_identifier_v3.py +21 -0
  13. braintrust/span_types.py +3 -0
  14. braintrust/test_bt_json.py +23 -19
  15. braintrust/test_logger.py +116 -0
  16. braintrust/test_span_cache.py +344 -0
  17. braintrust/test_trace.py +267 -0
  18. braintrust/trace.py +385 -0
  19. braintrust/version.py +2 -2
  20. braintrust/wrappers/claude_agent_sdk/_wrapper.py +48 -6
  21. braintrust/wrappers/claude_agent_sdk/test_wrapper.py +106 -0
  22. braintrust/wrappers/langsmith_wrapper.py +517 -0
  23. braintrust/wrappers/test_agno.py +0 -12
  24. braintrust/wrappers/test_anthropic.py +1 -11
  25. braintrust/wrappers/test_dspy.py +0 -11
  26. braintrust/wrappers/test_google_genai.py +6 -1
  27. braintrust/wrappers/test_langsmith_wrapper.py +338 -0
  28. braintrust/wrappers/test_litellm.py +0 -10
  29. braintrust/wrappers/test_oai_attachments.py +0 -10
  30. braintrust/wrappers/test_openai.py +3 -12
  31. braintrust/wrappers/test_openrouter.py +0 -9
  32. braintrust/wrappers/test_pydantic_ai_integration.py +0 -11
  33. braintrust/wrappers/test_pydantic_ai_wrap_openai.py +2 -0
  34. {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/METADATA +1 -1
  35. {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/RECORD +38 -31
  36. {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/WHEEL +1 -1
  37. {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/entry_points.txt +0 -0
  38. {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/top_level.txt +0 -0
braintrust/cli/install/api.py CHANGED
@@ -326,7 +326,7 @@ def main(args):
  textwrap.dedent(
  f"""\
  Stack with name {args.name} does not exist. Either create it manually by following
- https://www.braintrust.dev/docs/guides/self-hosting/aws or use the --create flag."""
+ https://www.braintrust.dev/docs/admin/self-hosting/aws or use the --create flag."""
  )
  )
  exit(1)
braintrust/conftest.py CHANGED
@@ -46,3 +46,27 @@ def reset_braintrust_state():
  from braintrust import logger

  logger._state = logger.BraintrustState()
+
+
+ @pytest.fixture(scope="session")
+ def vcr_config():
+ """
+ VCR configuration for recording/playing back HTTP interactions.
+
+ In CI, use "none" to fail if cassette is missing.
+ Locally, use "once" to record new cassettes if they don't exist.
+ """
+ record_mode = "none" if (os.environ.get("CI") or os.environ.get("GITHUB_ACTIONS")) else "once"
+
+ return {
+ "record_mode": record_mode,
+ "filter_headers": [
+ "authorization",
+ "openai-organization",
+ "x-api-key",
+ "api-key",
+ "openai-api-key",
+ "x-goog-api-key",
+ "x-bt-auth-token",
+ ],
+ }
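The session-scoped vcr_config fixture above follows the convention used by the pytest-recording plugin, which reads record_mode and filter_headers from a fixture of that name. A minimal sketch of a test that would use it (the plugin usage, test name, and endpoint are assumptions for illustration, not part of this package):

import pytest
import requests


@pytest.mark.vcr
def test_replayed_api_call():
    # Locally (record_mode="once") the first run records a cassette; in CI
    # (record_mode="none") a missing cassette fails instead of hitting the network.
    # The filter_headers list keeps API keys out of the recorded cassette.
    resp = requests.get("https://api.example.com/health")
    assert resp.status_code == 200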
braintrust/devserver/test_server_integration.py CHANGED
@@ -8,17 +8,6 @@ from braintrust.framework import _evals
  from braintrust.test_helpers import has_devserver_installed


- @pytest.fixture(scope="module")
- def vcr_config():
- """VCR configuration to filter sensitive headers."""
- return {
- "filter_headers": [
- "x-bt-auth-token",
- "authorization",
- ]
- }
-
-
  @pytest.fixture
  def client():
  """Create test client using the real simple_eval.py example."""
braintrust/framework.py CHANGED
@@ -1280,6 +1280,29 @@ async def _run_evaluator_internal(
  filters: list[Filter],
  stream: Callable[[SSEProgressEvent], None] | None = None,
  state: BraintrustState | None = None,
+ ):
+ # Start span cache for this eval (it's disabled by default to avoid temp files outside of evals)
+ if state is None:
+ from braintrust.logger import _internal_get_global_state
+
+ state = _internal_get_global_state()
+
+ state.span_cache.start()
+ try:
+ return await _run_evaluator_internal_impl(experiment, evaluator, position, filters, stream, state)
+ finally:
+ # Clean up disk-based span cache after eval completes and stop caching
+ state.span_cache.dispose()
+ state.span_cache.stop()
+
+
+ async def _run_evaluator_internal_impl(
+ experiment,
+ evaluator: Evaluator,
+ position: int | None,
+ filters: list[Filter],
+ stream: Callable[[SSEProgressEvent], None] | None = None,
+ state: BraintrustState | None = None,
  ):
  event_loop = asyncio.get_event_loop()

@@ -1290,11 +1313,13 @@ async def _run_evaluator_internal(
  {**parent_propagated},
  {"span_attributes": {"purpose": "scorer"}},
  )
+ # Strip trace from logged input - it's internal plumbing that shouldn't appear in spans
+ logged_input = {k: v for k, v in kwargs.items() if k != "trace"}
  with root_span.start_span(
  name=name,
  span_attributes={"type": SpanTypeAttribute.SCORE, "purpose": "scorer"},
  propagated_event=merged_propagated,
- input=dict(**kwargs),
+ input=logged_input,
  ) as span:
  score = scorer
  if hasattr(scorer, "eval_async"):
@@ -1415,6 +1440,77 @@ async def _run_evaluator_internal(
  tags = hooks.tags if hooks.tags else None
  root_span.log(output=output, metadata=metadata, tags=tags)

+ # Create trace object for scorers
+ from braintrust.trace import LocalTrace
+
+ async def ensure_spans_flushed():
+ # Flush native Braintrust spans
+ if experiment:
+ await asyncio.get_event_loop().run_in_executor(
+ None, lambda: experiment.state.flush()
+ )
+ elif state:
+ await asyncio.get_event_loop().run_in_executor(None, lambda: state.flush())
+ else:
+ from braintrust.logger import flush as flush_logger
+
+ await asyncio.get_event_loop().run_in_executor(None, flush_logger)
+
+ # Also flush OTEL spans if registered
+ if state:
+ await state.flush_otel()
+
+ experiment_id = None
+ if experiment:
+ try:
+ experiment_id = experiment.id
+ except:
+ experiment_id = None
+
+ trace = None
+ if state or experiment:
+ # Get the state to use
+ trace_state = state
+ if not trace_state and experiment:
+ trace_state = experiment.state
+ if not trace_state:
+ # Fall back to global state
+ from braintrust.logger import _internal_get_global_state
+
+ trace_state = _internal_get_global_state()
+
+ # Access root_span_id from the concrete SpanImpl instance
+ # The Span interface doesn't expose this but SpanImpl has it
+ root_span_id_value = getattr(root_span, "root_span_id", root_span.id)
+
+ # Check if there's a parent in the context to determine object_type and object_id
+ from braintrust.span_identifier_v3 import SpanComponentsV3, span_object_type_v3_to_typed_string
+
+ parent_str = trace_state.current_parent.get()
+ parent_components = None
+ if parent_str:
+ try:
+ parent_components = SpanComponentsV3.from_str(parent_str)
+ except Exception:
+ # If parsing fails, parent_components stays None
+ pass
+
+ # Determine object_type and object_id based on parent or experiment
+ if parent_components:
+ trace_object_type = span_object_type_v3_to_typed_string(parent_components.object_type)
+ trace_object_id = parent_components.object_id or ""
+ else:
+ trace_object_type = "experiment"
+ trace_object_id = experiment_id or ""
+
+ trace = LocalTrace(
+ object_type=trace_object_type,
+ object_id=trace_object_id,
+ root_span_id=root_span_id_value,
+ ensure_spans_flushed=ensure_spans_flushed,
+ state=trace_state,
+ )
+
  score_promises = [
  asyncio.create_task(
  await_or_run_scorer(
@@ -1426,6 +1522,7 @@ async def _run_evaluator_internal(
  "expected": datum.expected,
  "metadata": metadata,
  "output": output,
+ "trace": trace,
  },
  )
  )
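With this change, the kwargs passed to each scorer now include a trace keyword carrying the LocalTrace handle for the current test case. A minimal sketch of a scorer that opts into it; the body is illustrative only, and the LocalTrace query helpers live in braintrust/trace.py, which this diff does not show:

# Illustrative scorer: accepts the new `trace` kwarg alongside the usual fields.
def exact_match_with_trace(input, output, expected=None, metadata=None, trace=None, **kwargs):
    # `trace` identifies this case's span tree (object_type, object_id, root_span_id);
    # its query/flush helpers are defined in braintrust/trace.py.
    return 1.0 if output == expected else 0.0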
braintrust/functions/invoke.py CHANGED
@@ -2,8 +2,8 @@ from typing import Any, Literal, TypedDict, TypeVar, overload

  from sseclient import SSEClient

- from .._generated_types import FunctionTypeEnum, InvokeContext
- from ..logger import Exportable, get_span_parent_object, login, proxy_conn
+ from .._generated_types import FunctionTypeEnum
+ from ..logger import Exportable, _internal_get_global_state, get_span_parent_object, login, proxy_conn
  from ..util import response_raise_for_status
  from .constants import INVOKE_API_VERSION
  from .stream import BraintrustInvokeError, BraintrustStream
@@ -43,7 +43,6 @@ def invoke(
  # arguments to the function
  input: Any = None,
  messages: list[Any] | None = None,
- context: InvokeContext | None = None,
  metadata: dict[str, Any] | None = None,
  tags: list[str] | None = None,
  parent: Exportable | str | None = None,
@@ -72,7 +71,6 @@ def invoke(
  # arguments to the function
  input: Any = None,
  messages: list[Any] | None = None,
- context: InvokeContext | None = None,
  metadata: dict[str, Any] | None = None,
  tags: list[str] | None = None,
  parent: Exportable | str | None = None,
@@ -100,7 +98,6 @@ def invoke(
  # arguments to the function
  input: Any = None,
  messages: list[Any] | None = None,
- context: InvokeContext | None = None,
  metadata: dict[str, Any] | None = None,
  tags: list[str] | None = None,
  parent: Exportable | str | None = None,
@@ -119,8 +116,6 @@ def invoke(
  Args:
  input: The input to the function. This will be logged as the `input` field in the span.
  messages: Additional OpenAI-style messages to add to the prompt (only works for llm functions).
- context: Context for functions that operate on spans/traces (e.g., facets). Should contain
- `object_type`, `object_id`, and `scope` fields.
  metadata: Additional metadata to add to the span. This will be logged as the `metadata` field in the span.
  It will also be available as the {{metadata}} field in the prompt and as the `metadata` argument
  to the function.
@@ -195,8 +190,6 @@ def invoke(
  )
  if messages is not None:
  request["messages"] = messages
- if context is not None:
- request["context"] = context
  if mode is not None:
  request["mode"] = mode
  if strict is not None:
@@ -250,6 +243,8 @@ def init_function(project_name: str, slug: str, version: str | None = None):
  :param version: Optional version of the function to use. Defaults to latest.
  :return: A function that can be used as a task or scorer.
  """
+ # Disable span cache since remote function spans won't be in the local cache
+ _internal_get_global_state().span_cache.disable()

  def f(*args: Any, **kwargs: Any) -> Any:
  if len(args) > 0:
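As the docstring notes, the callable returned by init_function can be used as a task or scorer; the cache is disabled up front because the remote function's spans never pass through this process. A minimal sketch of wiring it into an eval (project, slug, and data are placeholders):

from braintrust import Eval
from braintrust.functions.invoke import init_function

# Placeholder project/slug; the returned callable invokes the remote function
# when the eval runs it as a scorer.
Eval(
    "my-project",
    data=lambda: [{"input": "hi", "expected": "hi"}],
    task=lambda input: input,
    scores=[init_function("my-project", "my-remote-scorer")],
)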
braintrust/functions/test_invoke.py ADDED
@@ -0,0 +1,61 @@
+ """Tests for the invoke module, particularly init_function."""
+
+
+ from braintrust.functions.invoke import init_function
+ from braintrust.logger import _internal_get_global_state, _internal_reset_global_state
+
+
+ class TestInitFunction:
+ """Tests for init_function."""
+
+ def setup_method(self):
+ """Reset state before each test."""
+ _internal_reset_global_state()
+
+ def teardown_method(self):
+ """Clean up after each test."""
+ _internal_reset_global_state()
+
+ def test_init_function_disables_span_cache(self):
+ """Test that init_function disables the span cache."""
+ state = _internal_get_global_state()
+
+ # Cache should be disabled by default (it's only enabled during evals)
+ assert state.span_cache.disabled is True
+
+ # Enable the cache (simulating what happens during eval)
+ state.span_cache.start()
+ assert state.span_cache.disabled is False
+
+ # Call init_function
+ f = init_function("test-project", "test-function")
+
+ # Cache should now be disabled (init_function explicitly disables it)
+ assert state.span_cache.disabled is True
+ assert f.__name__ == "init_function-test-project-test-function-latest"
+
+ def test_init_function_with_version(self):
+ """Test that init_function creates a function with the correct name including version."""
+ f = init_function("my-project", "my-scorer", version="v1")
+ assert f.__name__ == "init_function-my-project-my-scorer-v1"
+
+ def test_init_function_without_version_uses_latest(self):
+ """Test that init_function uses 'latest' in name when version not specified."""
+ f = init_function("my-project", "my-scorer")
+ assert f.__name__ == "init_function-my-project-my-scorer-latest"
+
+ def test_init_function_permanently_disables_cache(self):
+ """Test that init_function permanently disables the cache (can't be re-enabled)."""
+ state = _internal_get_global_state()
+
+ # Enable the cache
+ state.span_cache.start()
+ assert state.span_cache.disabled is False
+
+ # Call init_function
+ init_function("test-project", "test-function")
+ assert state.span_cache.disabled is True
+
+ # Try to start again - should still be disabled because of explicit disable
+ state.span_cache.start()
+ assert state.span_cache.disabled is True
braintrust/generated_types.py CHANGED
@@ -1,4 +1,4 @@
- """Auto-generated file (internal git SHA 547fa17c0937e0e25fdf9214487be6f31c91a37a) -- do not modify"""
+ """Auto-generated file (internal git SHA 21146f64bf5ad1eadd3a99d186274728e25e5399) -- do not modify"""

  from ._generated_types import (
  Acl,
@@ -10,6 +10,7 @@ from ._generated_types import (
  AsyncScoringState,
  AttachmentReference,
  AttachmentStatus,
+ BatchedFacetData,
  BraintrustAttachmentReference,
  BraintrustModelParams,
  CallEvent,
@@ -28,6 +29,9 @@ from ._generated_types import (
  Dataset,
  DatasetEvent,
  EnvVar,
+ EvalStatusPage,
+ EvalStatusPageConfig,
+ EvalStatusPageTheme,
  Experiment,
  ExperimentEvent,
  ExtendedSavedFunctionId,
@@ -47,15 +51,13 @@ from ._generated_types import (
  GraphEdge,
  GraphNode,
  Group,
+ GroupScope,
  IfExists,
- InvokeContext,
  InvokeFunction,
  InvokeParent,
- InvokeScope,
  MCPServer,
  MessageRole,
  ModelParams,
- NullableFunctionTypeEnum,
  NullableSavedFunctionId,
  ObjectReference,
  ObjectReferenceNullish,
@@ -99,6 +101,7 @@ from ._generated_types import (
  StreamingMode,
  ToolFunctionDefinition,
  TraceScope,
+ TriggeredFunctionState,
  UploadStatus,
  User,
  View,
@@ -117,6 +120,7 @@ __all__ = [
  "AsyncScoringState",
  "AttachmentReference",
  "AttachmentStatus",
+ "BatchedFacetData",
  "BraintrustAttachmentReference",
  "BraintrustModelParams",
  "CallEvent",
@@ -135,6 +139,9 @@ __all__ = [
  "Dataset",
  "DatasetEvent",
  "EnvVar",
+ "EvalStatusPage",
+ "EvalStatusPageConfig",
+ "EvalStatusPageTheme",
  "Experiment",
  "ExperimentEvent",
  "ExtendedSavedFunctionId",
@@ -154,15 +161,13 @@ __all__ = [
  "GraphEdge",
  "GraphNode",
  "Group",
+ "GroupScope",
  "IfExists",
- "InvokeContext",
  "InvokeFunction",
  "InvokeParent",
- "InvokeScope",
  "MCPServer",
  "MessageRole",
  "ModelParams",
- "NullableFunctionTypeEnum",
  "NullableSavedFunctionId",
  "ObjectReference",
  "ObjectReferenceNullish",
@@ -206,6 +211,7 @@ __all__ = [
  "StreamingMode",
  "ToolFunctionDefinition",
  "TraceScope",
+ "TriggeredFunctionState",
  "UploadStatus",
  "User",
  "View",
braintrust/logger.py CHANGED
@@ -47,12 +47,9 @@ from urllib3.util.retry import Retry
  from . import context, id_gen
  from .bt_json import bt_dumps, bt_safe_deep_copy
  from .db_fields import (
- ASYNC_SCORING_CONTROL_FIELD,
  AUDIT_METADATA_FIELD,
  AUDIT_SOURCE_FIELD,
  IS_MERGE_FIELD,
- MERGE_PATHS_FIELD,
- SKIP_ASYNC_SCORING_FIELD,
  TRANSACTION_ID_FIELD,
  VALID_SOURCES,
  )
@@ -101,6 +98,14 @@ from .xact_ids import prettify_xact
  Metadata = dict[str, Any]
  DATA_API_VERSION = 2

+
+ class DatasetRef(TypedDict, total=False):
+ """Reference to a dataset by ID and optional version."""
+
+ id: str
+ version: str
+
+
  T = TypeVar("T")
  TMapping = TypeVar("TMapping", bound=Mapping[str, Any])
  TMutableMapping = TypeVar("TMutableMapping", bound=MutableMapping[str, Any])
@@ -396,6 +401,11 @@ class BraintrustState:
  ),
  )

+ from braintrust.span_cache import SpanCache
+
+ self.span_cache = SpanCache()
+ self._otel_flush_callback: Any | None = None
+
  def reset_login_info(self):
  self.app_url: str | None = None
  self.app_public_url: str | None = None
@@ -452,26 +462,39 @@ class BraintrustState:

  return self._context_manager

+ def register_otel_flush(self, callback: Any) -> None:
+ """
+ Register an OTEL flush callback. This is called by the OTEL integration
+ when it initializes a span processor/exporter.
+ """
+ self._otel_flush_callback = callback
+
+ async def flush_otel(self) -> None:
+ """
+ Flush OTEL spans if a callback is registered.
+ Called during ensure_spans_flushed to ensure OTEL spans are visible in BTQL.
+ """
+ if self._otel_flush_callback:
+ await self._otel_flush_callback()
+
  def copy_state(self, other: "BraintrustState"):
  """Copy login information from another BraintrustState instance."""
- self.__dict__.update(
- {
- k: v
- for (k, v) in other.__dict__.items()
- if k
- not in (
- "current_experiment",
- "current_logger",
- "current_parent",
- "current_span",
- "_global_bg_logger",
- "_override_bg_logger",
- "_context_manager",
- "_last_otel_setting",
- "_context_manager_lock",
- )
- }
- )
+ self.__dict__.update({
+ k: v
+ for (k, v) in other.__dict__.items()
+ if k
+ not in (
+ "current_experiment",
+ "current_logger",
+ "current_parent",
+ "current_span",
+ "_global_bg_logger",
+ "_override_bg_logger",
+ "_context_manager",
+ "_last_otel_setting",
+ "_context_manager_lock",
+ )
+ })

  def login(
  self,
@@ -1299,7 +1322,7 @@ def init(
  project: str | None = None,
  experiment: str | None = None,
  description: str | None = None,
- dataset: Optional["Dataset"] = None,
+ dataset: Optional["Dataset"] | DatasetRef = None,
  open: bool = False,
  base_experiment: str | None = None,
  is_public: bool = False,
@@ -1412,12 +1435,19 @@ def init(
  args["base_exp_id"] = base_experiment_id
  elif base_experiment is not None:
  args["base_experiment"] = base_experiment
- else:
+ elif merged_git_metadata_settings and merged_git_metadata_settings.collect != "none":
  args["ancestor_commits"] = list(get_past_n_ancestors())

  if dataset is not None:
- args["dataset_id"] = dataset.id
- args["dataset_version"] = dataset.version
+ if isinstance(dataset, dict):
+ # Simple {"id": ..., "version": ...} dict
+ args["dataset_id"] = dataset["id"]
+ if "version" in dataset:
+ args["dataset_version"] = dataset["version"]
+ else:
+ # Full Dataset object
+ args["dataset_id"] = dataset.id
+ args["dataset_version"] = dataset.version

  if is_public is not None:
  args["public"] = is_public
@@ -1448,7 +1478,11 @@ def init(
  # For experiments, disable queue size limit enforcement (unlimited queue)
  state.enforce_queue_size_limit(False)

- ret = Experiment(lazy_metadata=LazyValue(compute_metadata, use_mutex=True), dataset=dataset, state=state)
+ ret = Experiment(
+ lazy_metadata=LazyValue(compute_metadata, use_mutex=True),
+ dataset=dataset if isinstance(dataset, Dataset) else None,
+ state=state,
+ )
  if set_current:
  state.current_experiment = ret
  return ret
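Together with the DatasetRef TypedDict added above, init() now accepts a plain dict in place of a full Dataset object. A minimal sketch (the project name and IDs are placeholders):

import braintrust

# A {"id": ..., "version": ...} dict now works where a Dataset object was
# previously required; "version" is optional.
experiment = braintrust.init(
    project="my-project",
    dataset={"id": "00000000-0000-0000-0000-000000000000", "version": "1234"},
)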
@@ -1763,6 +1797,25 @@
  _state.login(app_url=app_url, api_key=api_key, org_name=org_name, force_login=force_login)


+ def register_otel_flush(callback: Any) -> None:
+ """
+ Register a callback to flush OTEL spans. This is called by the OTEL integration
+ when it initializes a span processor/exporter.
+
+ When ensure_spans_flushed is called (e.g., before a BTQL query in scorers),
+ this callback will be invoked to ensure OTEL spans are flushed to the server.
+
+ Also disables the span cache, since OTEL spans aren't in the local cache
+ and we need BTQL to see the complete span tree (both native + OTEL spans).
+
+ :param callback: The async callback function to flush OTEL spans.
+ """
+ global _state
+ _state.register_otel_flush(callback)
+ # Disable span cache since OTEL spans aren't in the local cache
+ _state.span_cache.disable()
+
+
  def login_to_state(
  app_url: str | None = None,
  api_key: str | None = None,
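A caller of the new module-level register_otel_flush would typically be an OTEL exporter setup path. A minimal sketch, assuming the global tracer provider is the OpenTelemetry SDK TracerProvider (the setup function name is hypothetical):

import asyncio

from opentelemetry import trace

from braintrust.logger import register_otel_flush


def hook_braintrust_otel_flush():
    provider = trace.get_tracer_provider()  # assumed to be an SDK TracerProvider

    async def flush_otel_spans():
        # force_flush blocks, so run it off the event loop thread.
        await asyncio.get_running_loop().run_in_executor(None, provider.force_flush)

    # Registers the callback and, per the docstring above, disables the local span cache.
    register_otel_flush(flush_otel_spans)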
@@ -2325,29 +2378,6 @@ def _enrich_attachments(event: TMutableMapping) -> TMutableMapping:


  def _validate_and_sanitize_experiment_log_partial_args(event: Mapping[str, Any]) -> dict[str, Any]:
- # Make sure only certain keys are specified.
- forbidden_keys = set(event.keys()) - {
- "input",
- "output",
- "expected",
- "tags",
- "scores",
- "metadata",
- "metrics",
- "error",
- "dataset_record_id",
- "origin",
- "inputs",
- "span_attributes",
- ASYNC_SCORING_CONTROL_FIELD,
- MERGE_PATHS_FIELD,
- SKIP_ASYNC_SCORING_FIELD,
- "span_id",
- "root_span_id",
- }
- if forbidden_keys:
- raise ValueError(f"The following keys are not permitted: {forbidden_keys}")
-
  scores = event.get("scores")
  if scores:
  for name, score in scores.items():
@@ -3856,6 +3886,21 @@ class SpanImpl(Span):
  if serializable_partial_record.get("metrics", {}).get("end") is not None:
  self._logged_end_time = serializable_partial_record["metrics"]["end"]

+ # Write to local span cache for scorer access
+ # Only cache experiment spans - regular logs don't need caching
+ if self.parent_object_type == SpanObjectTypeV3.EXPERIMENT:
+ from braintrust.span_cache import CachedSpan
+
+ cached_span = CachedSpan(
+ span_id=self.span_id,
+ input=serializable_partial_record.get("input"),
+ output=serializable_partial_record.get("output"),
+ metadata=serializable_partial_record.get("metadata"),
+ span_parents=self.span_parents,
+ span_attributes=serializable_partial_record.get("span_attributes"),
+ )
+ self.state.span_cache.queue_write(self.root_span_id, self.span_id, cached_span)
+
  def compute_record() -> dict[str, Any]:
  exporter = _get_exporter()
  return dict(
@@ -4403,24 +4448,20 @@ def render_message(render: Callable[[str], str], message: PromptMessage):
  if c["type"] == "text":
  rendered_content.append({**c, "text": render(c["text"])})
  elif c["type"] == "image_url":
- rendered_content.append(
- {
- **c,
- "image_url": {**c["image_url"], "url": render(c["image_url"]["url"])},
- }
- )
+ rendered_content.append({
+ **c,
+ "image_url": {**c["image_url"], "url": render(c["image_url"]["url"])},
+ })
  elif c["type"] == "file":
- rendered_content.append(
- {
- **c,
- "file": {
- **c["file"],
- "file_data": render(c["file"]["file_data"]),
- **({} if "file_id" not in c["file"] else {"file_id": render(c["file"]["file_id"])}),
- **({} if "filename" not in c["file"] else {"filename": render(c["file"]["filename"])}),
- },
- }
- )
+ rendered_content.append({
+ **c,
+ "file": {
+ **c["file"],
+ "file_data": render(c["file"]["file_data"]),
+ **({} if "file_id" not in c["file"] else {"file_id": render(c["file"]["file_id"])}),
+ **({} if "filename" not in c["file"] else {"filename": render(c["file"]["filename"])}),
+ },
+ })
  else:
  raise ValueError(f"Unknown content type: {c['type']}")

braintrust/prompt_cache/test_disk_cache.py CHANGED
@@ -39,7 +39,7 @@ class TestDiskCache(unittest.TestCase):
  "a\nb",
  ]
  for k in weird_keys:
- time.sleep(0.05) # make sure the mtimes are different
+ time.sleep(0.01) # make sure the mtimes are different
  self.cache.set(k, data)
  result = self.cache.get(k)
  assert data == result
@@ -61,7 +61,7 @@ class TestDiskCache(unittest.TestCase):
  # Fill cache beyond max size (3).
  for i in range(3):
  self.cache.set(f"key{i}", {"value": i})
- time.sleep(0.1) # wait to ensure different mtimes
+ time.sleep(0.01) # wait to ensure different mtimes

  # Add one more to trigger eviction.
  self.cache.set("key3", {"value": 3})
@@ -75,7 +75,7 @@ class TestDiskCache(unittest.TestCase):
  # Fill cache beyond max size (3).
  for i in range(3):
  self.cache.set(f"key{i}", {"value": i})
- time.sleep(0.1) # wait to ensure different mtimes
+ time.sleep(0.01) # wait to ensure different mtimes

  # Add one more to trigger eviction.
  self.cache.set("key3", {"value": 3})