braintrust 0.4.3__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. braintrust/__init__.py +3 -0
  2. braintrust/_generated_types.py +106 -6
  3. braintrust/auto.py +179 -0
  4. braintrust/conftest.py +23 -4
  5. braintrust/framework.py +113 -3
  6. braintrust/functions/invoke.py +3 -1
  7. braintrust/functions/test_invoke.py +61 -0
  8. braintrust/generated_types.py +7 -1
  9. braintrust/logger.py +127 -45
  10. braintrust/oai.py +51 -0
  11. braintrust/span_cache.py +337 -0
  12. braintrust/span_identifier_v3.py +21 -0
  13. braintrust/test_bt_json.py +0 -5
  14. braintrust/test_framework.py +37 -0
  15. braintrust/test_http.py +444 -0
  16. braintrust/test_logger.py +295 -5
  17. braintrust/test_span_cache.py +344 -0
  18. braintrust/test_trace.py +267 -0
  19. braintrust/test_util.py +58 -1
  20. braintrust/trace.py +385 -0
  21. braintrust/util.py +20 -0
  22. braintrust/version.py +2 -2
  23. braintrust/wrappers/agno/__init__.py +2 -3
  24. braintrust/wrappers/anthropic.py +64 -0
  25. braintrust/wrappers/claude_agent_sdk/__init__.py +2 -3
  26. braintrust/wrappers/claude_agent_sdk/_wrapper.py +48 -6
  27. braintrust/wrappers/claude_agent_sdk/test_wrapper.py +115 -0
  28. braintrust/wrappers/dspy.py +52 -1
  29. braintrust/wrappers/google_genai/__init__.py +9 -6
  30. braintrust/wrappers/litellm.py +6 -43
  31. braintrust/wrappers/pydantic_ai.py +2 -3
  32. braintrust/wrappers/test_agno.py +9 -0
  33. braintrust/wrappers/test_anthropic.py +156 -0
  34. braintrust/wrappers/test_dspy.py +117 -0
  35. braintrust/wrappers/test_google_genai.py +9 -0
  36. braintrust/wrappers/test_litellm.py +57 -55
  37. braintrust/wrappers/test_openai.py +253 -1
  38. braintrust/wrappers/test_pydantic_ai_integration.py +9 -0
  39. braintrust/wrappers/test_utils.py +79 -0
  40. {braintrust-0.4.3.dist-info → braintrust-0.5.2.dist-info}/METADATA +1 -1
  41. {braintrust-0.4.3.dist-info → braintrust-0.5.2.dist-info}/RECORD +44 -37
  42. {braintrust-0.4.3.dist-info → braintrust-0.5.2.dist-info}/WHEEL +1 -1
  43. {braintrust-0.4.3.dist-info → braintrust-0.5.2.dist-info}/entry_points.txt +0 -0
  44. {braintrust-0.4.3.dist-info → braintrust-0.5.2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,337 @@
1
+ """
2
+ SpanCache provides a disk-based cache for span data, allowing
3
+ scorers to read spans without making server round-trips when possible.
4
+
5
+ Spans are stored on disk to minimize memory usage during evaluations.
6
+ The cache file is automatically cleaned up when dispose() is called.
7
+ """
8
+
9
+ import atexit
10
+ import json
11
+ import os
12
+ import tempfile
13
+ import uuid
14
+ from typing import Any, Optional
15
+
16
+ from braintrust.util import merge_dicts
17
+
18
+ # Global registry of active span caches for process exit cleanup
19
+ _active_caches: set["SpanCache"] = set()
20
+ _exit_handlers_registered = False
21
+
22
+
23
+ class CachedSpan:
24
+ """Cached span data structure."""
25
+
26
+ def __init__(
27
+ self,
28
+ span_id: str,
29
+ input: Optional[Any] = None,
30
+ output: Optional[Any] = None,
31
+ metadata: Optional[dict[str, Any]] = None,
32
+ span_parents: Optional[list[str]] = None,
33
+ span_attributes: Optional[dict[str, Any]] = None,
34
+ ):
35
+ self.span_id = span_id
36
+ self.input = input
37
+ self.output = output
38
+ self.metadata = metadata
39
+ self.span_parents = span_parents
40
+ self.span_attributes = span_attributes
41
+
42
+ def to_dict(self) -> dict[str, Any]:
43
+ """Convert to dictionary for serialization."""
44
+ result = {"span_id": self.span_id}
45
+ if self.input is not None:
46
+ result["input"] = self.input
47
+ if self.output is not None:
48
+ result["output"] = self.output
49
+ if self.metadata is not None:
50
+ result["metadata"] = self.metadata
51
+ if self.span_parents is not None:
52
+ result["span_parents"] = self.span_parents
53
+ if self.span_attributes is not None:
54
+ result["span_attributes"] = self.span_attributes
55
+ return result
56
+
57
+ @classmethod
58
+ def from_dict(cls, data: dict[str, Any]) -> "CachedSpan":
59
+ """Create from dictionary."""
60
+ return cls(
61
+ span_id=data["span_id"],
62
+ input=data.get("input"),
63
+ output=data.get("output"),
64
+ metadata=data.get("metadata"),
65
+ span_parents=data.get("span_parents"),
66
+ span_attributes=data.get("span_attributes"),
67
+ )
68
+
69
+
70
+ class DiskSpanRecord:
71
+ """Record structure for disk storage."""
72
+
73
+ def __init__(self, root_span_id: str, span_id: str, data: CachedSpan):
74
+ self.root_span_id = root_span_id
75
+ self.span_id = span_id
76
+ self.data = data
77
+
78
+ def to_dict(self) -> dict[str, Any]:
79
+ """Convert to dictionary for JSON serialization."""
80
+ return {
81
+ "rootSpanId": self.root_span_id,
82
+ "spanId": self.span_id,
83
+ "data": self.data.to_dict(),
84
+ }
85
+
86
+ @classmethod
87
+ def from_dict(cls, data: dict[str, Any]) -> "DiskSpanRecord":
88
+ """Create from dictionary."""
89
+ return cls(
90
+ root_span_id=data["rootSpanId"],
91
+ span_id=data["spanId"],
92
+ data=CachedSpan.from_dict(data["data"]),
93
+ )
94
+
95
+
96
+ class SpanCache:
97
+ """
98
+ Disk-based cache for span data, keyed by rootSpanId.
99
+
100
+ This cache writes spans to a temporary file to minimize memory usage.
101
+ It uses append-only writes and reads the full file when querying.
102
+ """
103
+
104
+ def __init__(self, disabled: bool = False):
105
+ self._cache_file_path: Optional[str] = None
106
+ self._initialized = False
107
+ # Tracks whether the cache was explicitly disabled (via constructor or disable())
108
+ self._explicitly_disabled = disabled
109
+ # Tracks whether the cache has been enabled (for evals only)
110
+ self._enabled = False
111
+ # Reference count of active evals using this cache
112
+ self._active_eval_count = 0
113
+ # Small in-memory index tracking which rootSpanIds have data
114
+ self._root_span_index: set[str] = set()
115
+ # Buffer for pending writes
116
+ self._write_buffer: list[DiskSpanRecord] = []
117
+
118
+ def disable(self) -> None:
119
+ """
120
+ Disable the cache at runtime. This is called automatically when
121
+ OTEL is registered, since OTEL spans won't be in the cache.
122
+ """
123
+ self._explicitly_disabled = True
124
+
125
+ def start(self) -> None:
126
+ """
127
+ Start caching spans for use during evaluations.
128
+ This only starts caching if the cache wasn't permanently disabled.
129
+ Called by Eval() to turn on caching for the duration of the eval.
130
+ Uses reference counting to support parallel evals.
131
+ """
132
+ if not self._explicitly_disabled:
133
+ self._enabled = True
134
+ self._active_eval_count += 1
135
+
136
+ def stop(self) -> None:
137
+ """
138
+ Stop caching spans and return to the default disabled state.
139
+ Unlike disable(), this allows start() to work again for future evals.
140
+ Called after an eval completes to return to the default state.
141
+ Uses reference counting - only disables when all evals are complete.
142
+ """
143
+ self._active_eval_count -= 1
144
+ if self._active_eval_count <= 0:
145
+ self._active_eval_count = 0
146
+ self._enabled = False
147
+
148
+ @property
149
+ def disabled(self) -> bool:
150
+ """Check if cache is disabled."""
151
+ return self._explicitly_disabled or not self._enabled
152
+
153
+ def _ensure_initialized(self) -> None:
154
+ """Initialize the cache file if not already done."""
155
+ if self.disabled or self._initialized:
156
+ return
157
+
158
+ try:
159
+ # Create temporary file
160
+ unique_id = f"{int(os.times().elapsed * 1000000)}-{uuid.uuid4().hex[:8]}"
161
+ self._cache_file_path = os.path.join(tempfile.gettempdir(), f"braintrust-span-cache-{unique_id}.jsonl")
162
+
163
+ # Create the file
164
+ with open(self._cache_file_path, "w") as f:
165
+ pass
166
+
167
+ self._initialized = True
168
+ self._register_exit_handler()
169
+ except Exception:
170
+ # Silently fail if filesystem is unavailable - cache is best-effort
171
+ # This can happen if temp directory is not writable or disk is full
172
+ self._explicitly_disabled = True
173
+ return
174
+
175
+ def _register_exit_handler(self) -> None:
176
+ """Register a handler to clean up the temp file on process exit."""
177
+ global _exit_handlers_registered
178
+ _active_caches.add(self)
179
+
180
+ if not _exit_handlers_registered:
181
+ _exit_handlers_registered = True
182
+
183
+ def cleanup_all_caches():
184
+ """Clean up all active caches."""
185
+ for cache in _active_caches:
186
+ if cache._cache_file_path and os.path.exists(cache._cache_file_path):
187
+ try:
188
+ os.unlink(cache._cache_file_path)
189
+ except Exception:
190
+ # Ignore cleanup errors - file might not exist or already deleted
191
+ pass
192
+
193
+ atexit.register(cleanup_all_caches)
194
+
195
+ def queue_write(self, root_span_id: str, span_id: str, data: CachedSpan) -> None:
196
+ """
197
+ Write a span to the cache.
198
+ In Python, we write synchronously (no async queue like in TS).
199
+ """
200
+ if self.disabled:
201
+ return
202
+
203
+ self._ensure_initialized()
204
+
205
+ record = DiskSpanRecord(root_span_id, span_id, data)
206
+ self._write_buffer.append(record)
207
+ self._root_span_index.add(root_span_id)
208
+
209
+ # Write to disk immediately (simplified compared to TS async version)
210
+ self._flush_write_buffer()
211
+
212
+ def _flush_write_buffer(self) -> None:
213
+ """Flush the write buffer to disk."""
214
+ if not self._write_buffer or not self._cache_file_path:
215
+ return
216
+
217
+ try:
218
+ with open(self._cache_file_path, "a") as f:
219
+ for record in self._write_buffer:
220
+ f.write(json.dumps(record.to_dict()) + "\n")
221
+ self._write_buffer.clear()
222
+ except Exception:
223
+ # Silently fail if write fails - cache is best-effort
224
+ # This can happen if disk is full or file permissions changed
225
+ pass
226
+
227
+ def get_by_root_span_id(self, root_span_id: str) -> Optional[list[CachedSpan]]:
228
+ """
229
+ Get all cached spans for a given rootSpanId.
230
+
231
+ This reads the file and merges all records for the given rootSpanId.
232
+
233
+ Args:
234
+ root_span_id: The root span ID to look up
235
+
236
+ Returns:
237
+ List of cached spans, or None if not in cache
238
+ """
239
+ if self.disabled:
240
+ return None
241
+
242
+ # Quick check using in-memory index
243
+ if root_span_id not in self._root_span_index:
244
+ return None
245
+
246
+ # Accumulate spans by spanId, merging updates
247
+ span_map: dict[str, dict[str, Any]] = {}
248
+
249
+ # Read from disk if initialized
250
+ if self._initialized and self._cache_file_path and os.path.exists(self._cache_file_path):
251
+ try:
252
+ with open(self._cache_file_path, "r") as f:
253
+ for line in f:
254
+ line = line.strip()
255
+ if not line:
256
+ continue
257
+ try:
258
+ record_dict = json.loads(line)
259
+ record = DiskSpanRecord.from_dict(record_dict)
260
+ if record.root_span_id != root_span_id:
261
+ continue
262
+
263
+ if record.span_id in span_map:
264
+ merge_dicts(span_map[record.span_id], record.data.to_dict())
265
+ else:
266
+ span_map[record.span_id] = record.data.to_dict()
267
+ except Exception:
268
+ # Skip malformed lines - may occur if file was corrupted or truncated
269
+ pass
270
+ except Exception:
271
+ # Continue to check buffer even if disk read fails
272
+ # This can happen if file was deleted or permissions changed
273
+ pass
274
+
275
+ # Also check the in-memory write buffer for unflushed data
276
+ for record in self._write_buffer:
277
+ if record.root_span_id != root_span_id:
278
+ continue
279
+ if record.span_id in span_map:
280
+ merge_dicts(span_map[record.span_id], record.data.to_dict())
281
+ else:
282
+ span_map[record.span_id] = record.data.to_dict()
283
+
284
+ if not span_map:
285
+ return None
286
+
287
+ return [CachedSpan.from_dict(data) for data in span_map.values()]
288
+
289
+ def has(self, root_span_id: str) -> bool:
290
+ """Check if a rootSpanId has cached data."""
291
+ if self.disabled:
292
+ return False
293
+ return root_span_id in self._root_span_index
294
+
295
+ def clear(self, root_span_id: str) -> None:
296
+ """
297
+ Clear all cached spans for a given rootSpanId.
298
+ Note: This only removes from the index. The data remains in the file
299
+ but will be ignored on reads.
300
+ """
301
+ self._root_span_index.discard(root_span_id)
302
+
303
+ def clear_all(self) -> None:
304
+ """Clear all cached data and remove the cache file."""
305
+ self._root_span_index.clear()
306
+ self.dispose()
307
+
308
+ @property
309
+ def size(self) -> int:
310
+ """Get the number of root spans currently tracked."""
311
+ return len(self._root_span_index)
312
+
313
+ def dispose(self) -> None:
314
+ """
315
+ Clean up the cache file. Call this when the eval is complete.
316
+ Only performs cleanup when all active evals have completed (refcount = 0).
317
+ """
318
+ # Only dispose if no active evals are using this cache
319
+ if self._active_eval_count > 0:
320
+ return
321
+
322
+ # Remove from global registry
323
+ _active_caches.discard(self)
324
+
325
+ # Clear pending writes
326
+ self._write_buffer.clear()
327
+
328
+ if self._cache_file_path and os.path.exists(self._cache_file_path):
329
+ try:
330
+ os.unlink(self._cache_file_path)
331
+ except Exception:
332
+ # Ignore cleanup errors - file might not exist or already deleted
333
+ pass
334
+ self._cache_file_path = None
335
+
336
+ self._initialized = False
337
+ self._root_span_index.clear()
@@ -38,6 +38,27 @@ class SpanObjectTypeV3(Enum):
38
38
  }[self]
39
39
 
40
40
 
41
+ def span_object_type_v3_to_typed_string(
42
+ object_type: SpanObjectTypeV3,
43
+ ) -> str:
44
+ """Convert SpanObjectTypeV3 enum to typed string literal.
45
+
46
+ Args:
47
+ object_type: The SpanObjectTypeV3 enum value
48
+
49
+ Returns:
50
+ One of "experiment", "project_logs", or "playground_logs"
51
+ """
52
+ if object_type == SpanObjectTypeV3.EXPERIMENT:
53
+ return "experiment"
54
+ elif object_type == SpanObjectTypeV3.PROJECT_LOGS:
55
+ return "project_logs"
56
+ elif object_type == SpanObjectTypeV3.PLAYGROUND_LOGS:
57
+ return "playground_logs"
58
+ else:
59
+ raise ValueError(f"Unknown SpanObjectTypeV3: {object_type}")
60
+
61
+
41
62
  class InternalSpanComponentUUIDFields(Enum):
42
63
  OBJECT_ID = 1
43
64
  ROW_ID = 2
@@ -302,11 +302,6 @@ def test_to_bt_safe_special_objects():
302
302
  assert _to_bt_safe(dataset) == "<dataset>"
303
303
  assert _to_bt_safe(logger) == "<logger>"
304
304
 
305
- # Clean up
306
- exp.flush()
307
- dataset.flush()
308
- logger.flush()
309
-
310
305
 
311
306
  class TestBTJsonAttachments(TestCase):
312
307
  def test_to_bt_safe_attachments(self):
@@ -1,6 +1,8 @@
1
1
  from typing import List
2
+ from unittest.mock import MagicMock
2
3
 
3
4
  import pytest
5
+ from braintrust.logger import BraintrustState
4
6
 
5
7
  from .framework import (
6
8
  Eval,
@@ -241,6 +243,7 @@ async def test_hooks_trial_index_multiple_inputs():
241
243
  assert sorted(input_2_trials) == [0, 1]
242
244
 
243
245
 
246
+ @pytest.mark.vcr
244
247
  @pytest.mark.asyncio
245
248
  async def test_scorer_spans_have_purpose_attribute(with_memory_logger, with_simulate_login):
246
249
  """Test that scorer spans have span_attributes.purpose='scorer' and propagate to subspans."""
@@ -527,3 +530,37 @@ async def test_hooks_without_setting_tags(with_memory_logger, with_simulate_logi
527
530
  root_span = [log for log in logs if not log["span_parents"]]
528
531
  assert len(root_span) == 1
529
532
  assert root_span[0].get("tags") == None
533
+
534
+ @pytest.mark.asyncio
535
+ async def test_eval_enable_cache():
536
+ state = BraintrustState()
537
+ state.span_cache = MagicMock()
538
+
539
+ # Test enable_cache=False
540
+ await Eval(
541
+ "test-enable-cache-false",
542
+ data=[EvalCase(input=1, expected=1)],
543
+ task=lambda x: x,
544
+ scores=[],
545
+ state=state,
546
+ no_send_logs=True,
547
+ enable_cache=False,
548
+ )
549
+ state.span_cache.start.assert_not_called()
550
+ state.span_cache.stop.assert_not_called()
551
+
552
+ # Test enable_cache=True (default)
553
+ state.span_cache.start.reset_mock()
554
+ state.span_cache.stop.reset_mock()
555
+
556
+ await Eval(
557
+ "test-enable-cache-true",
558
+ data=[EvalCase(input=1, expected=1)],
559
+ task=lambda x: x,
560
+ scores=[],
561
+ state=state,
562
+ no_send_logs=True,
563
+ # enable_cache defaults to True
564
+ )
565
+ state.span_cache.start.assert_called()
566
+ state.span_cache.stop.assert_called()