braintrust 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. braintrust/_generated_types.py +328 -126
  2. braintrust/cli/install/api.py +1 -1
  3. braintrust/conftest.py +24 -0
  4. braintrust/devserver/test_server_integration.py +0 -11
  5. braintrust/framework.py +98 -1
  6. braintrust/functions/invoke.py +4 -9
  7. braintrust/functions/test_invoke.py +61 -0
  8. braintrust/generated_types.py +13 -7
  9. braintrust/logger.py +107 -66
  10. braintrust/prompt_cache/test_disk_cache.py +3 -3
  11. braintrust/span_cache.py +337 -0
  12. braintrust/span_identifier_v3.py +21 -0
  13. braintrust/span_types.py +3 -0
  14. braintrust/test_bt_json.py +23 -19
  15. braintrust/test_logger.py +116 -0
  16. braintrust/test_span_cache.py +344 -0
  17. braintrust/test_trace.py +267 -0
  18. braintrust/trace.py +385 -0
  19. braintrust/version.py +2 -2
  20. braintrust/wrappers/claude_agent_sdk/_wrapper.py +48 -6
  21. braintrust/wrappers/claude_agent_sdk/test_wrapper.py +106 -0
  22. braintrust/wrappers/langsmith_wrapper.py +517 -0
  23. braintrust/wrappers/test_agno.py +0 -12
  24. braintrust/wrappers/test_anthropic.py +1 -11
  25. braintrust/wrappers/test_dspy.py +0 -11
  26. braintrust/wrappers/test_google_genai.py +6 -1
  27. braintrust/wrappers/test_langsmith_wrapper.py +338 -0
  28. braintrust/wrappers/test_litellm.py +0 -10
  29. braintrust/wrappers/test_oai_attachments.py +0 -10
  30. braintrust/wrappers/test_openai.py +3 -12
  31. braintrust/wrappers/test_openrouter.py +0 -9
  32. braintrust/wrappers/test_pydantic_ai_integration.py +0 -11
  33. braintrust/wrappers/test_pydantic_ai_wrap_openai.py +2 -0
  34. {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/METADATA +1 -1
  35. {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/RECORD +38 -31
  36. {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/WHEEL +1 -1
  37. {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/entry_points.txt +0 -0
  38. {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,344 @@
1
+ """Tests for SpanCache (disk-based cache)."""
2
+
3
+
4
+ from braintrust.span_cache import CachedSpan, SpanCache
5
+
6
+
7
def test_span_cache_write_and_read():
    """Test storing and retrieving spans by rootSpanId.

    Two distinct spans are queued under the same root span id; reading the
    root back must yield both of them.
    """
    cache = SpanCache()
    cache.start()  # Start for testing (cache is disabled by default)
    try:
        root_span_id = "root-123"
        span1 = CachedSpan(
            span_id="span-1",
            input={"text": "hello"},
            output={"response": "world"},
        )
        span2 = CachedSpan(
            span_id="span-2",
            input={"text": "foo"},
            output={"response": "bar"},
        )

        cache.queue_write(root_span_id, span1.span_id, span1)
        cache.queue_write(root_span_id, span2.span_id, span2)

        spans = cache.get_by_root_span_id(root_span_id)
        assert spans is not None
        assert len(spans) == 2

        span_ids = {s.span_id for s in spans}
        assert "span-1" in span_ids
        assert "span-2" in span_ids
    finally:
        # Always release the cache, even when an assertion above fails.
        cache.stop()
        cache.dispose()
37
+
38
+
39
def test_span_cache_return_none_for_unknown():
    """Test that unknown rootSpanId returns None."""
    cache = SpanCache()
    cache.start()
    try:
        spans = cache.get_by_root_span_id("nonexistent")
        assert spans is None
    finally:
        # Always release the cache, even when the assertion above fails.
        cache.stop()
        cache.dispose()
49
+
50
+
51
def test_span_cache_merge_on_duplicate_writes():
    """Test that subsequent writes to same spanId merge data.

    The first write carries only ``input`` and the second only ``output``;
    the cached span must end up with both fields.
    """
    cache = SpanCache()
    cache.start()
    try:
        root_span_id = "root-123"
        span_id = "span-1"

        cache.queue_write(
            root_span_id,
            span_id,
            CachedSpan(span_id=span_id, input={"text": "hello"}),
        )
        cache.queue_write(
            root_span_id,
            span_id,
            CachedSpan(span_id=span_id, output={"response": "world"}),
        )

        spans = cache.get_by_root_span_id(root_span_id)
        assert spans is not None
        assert len(spans) == 1
        assert spans[0].span_id == span_id
        assert spans[0].input == {"text": "hello"}
        assert spans[0].output == {"response": "world"}
    finally:
        # Always release the cache, even when an assertion above fails.
        cache.stop()
        cache.dispose()
80
+
81
+
82
def test_span_cache_merge_metadata():
    """Test that metadata objects are merged.

    Two writes to the same span id carry disjoint metadata keys; the cached
    span's metadata must contain both keys.
    """
    cache = SpanCache()
    cache.start()
    try:
        root_span_id = "root-123"
        span_id = "span-1"

        cache.queue_write(
            root_span_id,
            span_id,
            CachedSpan(span_id=span_id, metadata={"key1": "value1"}),
        )
        cache.queue_write(
            root_span_id,
            span_id,
            CachedSpan(span_id=span_id, metadata={"key2": "value2"}),
        )

        spans = cache.get_by_root_span_id(root_span_id)
        assert spans is not None
        # Same span id written twice must stay a single entry, so that
        # spans[0] below is unambiguous.
        assert len(spans) == 1
        assert spans[0].metadata == {"key1": "value1", "key2": "value2"}
    finally:
        # Always release the cache, even when an assertion above fails.
        cache.stop()
        cache.dispose()
108
+
109
+
110
def test_span_cache_has():
    """Test the has() method for a written root and for an unknown root."""
    cache = SpanCache()
    cache.start()
    try:
        cache.queue_write("root-123", "span-1", CachedSpan(span_id="span-1"))
        assert cache.has("root-123") is True
        assert cache.has("nonexistent") is False
    finally:
        # Always release the cache, even when an assertion above fails.
        cache.stop()
        cache.dispose()
121
+
122
+
123
def test_span_cache_clear():
    """Test clearing spans for a specific rootSpanId.

    Clearing one root must leave the other root's spans in place.
    """
    cache = SpanCache()
    cache.start()
    try:
        cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1"))
        cache.queue_write("root-2", "span-2", CachedSpan(span_id="span-2"))

        cache.clear("root-1")

        assert cache.has("root-1") is False
        assert cache.has("root-2") is True
    finally:
        # Always release the cache, even when an assertion above fails.
        cache.stop()
        cache.dispose()
138
+
139
+
140
def test_span_cache_clear_all():
    """Test clearing all cached spans."""
    cache = SpanCache()
    cache.start()
    try:
        cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1"))
        cache.queue_write("root-2", "span-2", CachedSpan(span_id="span-2"))

        cache.clear_all()

        assert cache.size == 0
    finally:
        # Always release the cache, even when the assertion above fails.
        cache.stop()
        cache.dispose()
154
+
155
+
156
def test_span_cache_size():
    """Test the size property.

    The assertions show ``size`` counting distinct root span ids: a second
    span under an existing root leaves it unchanged.
    """
    cache = SpanCache()
    cache.start()
    try:
        assert cache.size == 0

        cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1"))
        assert cache.size == 1

        cache.queue_write("root-1", "span-2", CachedSpan(span_id="span-2"))  # Same root
        assert cache.size == 1

        cache.queue_write("root-2", "span-3", CachedSpan(span_id="span-3"))  # Different root
        assert cache.size == 2
    finally:
        # Always release the cache, even when an assertion above fails.
        cache.stop()
        cache.dispose()
174
+
175
+
176
def test_span_cache_dispose():
    """Test that dispose cleans up and allows reuse."""
    cache = SpanCache()
    cache.start()
    try:
        cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1"))
        assert cache.size == 1

        # Stop first to decrement refcount, then dispose
        cache.stop()
        cache.dispose()

        assert cache.size == 0
        assert cache.has("root-1") is False

        # Should be able to write again after dispose (if we start again)
        cache.start()
        cache.queue_write("root-2", "span-2", CachedSpan(span_id="span-2"))
        assert cache.size == 1
    finally:
        # Final cleanup runs even when an assertion above fails.
        cache.stop()
        cache.dispose()
198
+
199
+
200
def test_span_cache_disable():
    """Test that disable() prevents writes."""
    cache = SpanCache()
    cache.start()
    try:
        cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1"))
        assert cache.size == 1

        cache.disable()

        # Writes after disable should be no-ops
        cache.queue_write("root-2", "span-2", CachedSpan(span_id="span-2"))
        assert cache.size == 1  # Still 1, not 2
    finally:
        # Always release the cache, even when an assertion above fails.
        cache.stop()
        cache.dispose()
216
+
217
+
218
def test_span_cache_disabled_getter():
    """Test the disabled property across construct -> start() -> disable()."""
    # Cache is disabled by default until start() is called
    cache = SpanCache()
    try:
        assert cache.disabled is True

        cache.start()
        assert cache.disabled is False

        cache.disable()
        assert cache.disabled is True
    finally:
        # Balance the start() above before disposing, as the other tests in
        # this module do; a stop() without a matching start() is covered by
        # the refcount-underflow test.
        cache.stop()
        cache.dispose()
231
+
232
+
233
def test_span_cache_disabled_from_constructor():
    """Test that cache can be disabled via constructor."""
    cache = SpanCache(disabled=True)
    try:
        assert cache.disabled is True

        # Writes should be no-ops
        cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1"))
        assert cache.size == 0
        assert cache.get_by_root_span_id("root-1") is None
    finally:
        # Dispose even when an assertion above fails.
        cache.dispose()
244
+
245
+
246
def test_span_cache_start_stop_lifecycle():
    """Test that stop() allows start() to work again.

    Simulates two consecutive "evals" sharing one cache instance.
    """
    cache = SpanCache()
    try:
        # Initially disabled by default
        assert cache.disabled is True

        # Start for first "eval"
        cache.start()
        assert cache.disabled is False
        cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1"))
        assert cache.size == 1

        # Stop after first "eval"
        cache.stop()
        cache.dispose()
        assert cache.disabled is True

        # Start for second "eval" - should work!
        cache.start()
        assert cache.disabled is False
        cache.queue_write("root-2", "span-2", CachedSpan(span_id="span-2"))
        assert cache.size == 1
    finally:
        # Final cleanup runs even when an assertion above fails.
        cache.stop()
        cache.dispose()
272
+
273
+
274
def test_span_cache_disable_prevents_start():
    """Test that disable() prevents start() from working."""
    cache = SpanCache()
    try:
        # Simulate disable being called
        cache.disable()
        assert cache.disabled is True

        # start() should be a no-op after disable()
        cache.start()
        assert cache.disabled is True

        # Writes should still be no-ops
        cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1"))
        assert cache.size == 0
    finally:
        # Dispose even when an assertion above fails.
        cache.dispose()
291
+
292
+
293
def test_span_cache_parallel_eval_refcount():
    """Test reference counting for parallel evals.

    Two overlapping start() calls keep the cache enabled until both have
    been matched by a stop(); dispose() only takes effect after that.
    """
    cache = SpanCache()

    # Simulate two evals starting (Eval 1, then Eval 2)
    for _eval in range(2):
        cache.start()
        assert cache.disabled is False

    # Write data from both evals
    written = (("root-1", "span-1"), ("root-2", "span-2"))
    for root_id, span_id in written:
        cache.queue_write(root_id, span_id, CachedSpan(span_id=span_id))
    assert cache.size == 2

    # Eval 1 finishes first
    cache.dispose()  # Should NOT dispose (refcount = 2)
    cache.stop()  # Decrements to 1

    # Cache should still be enabled and data intact
    assert cache.disabled is False
    assert cache.size == 2
    for root_id, _span_id in written:
        assert cache.get_by_root_span_id(root_id) is not None

    # Eval 2 finishes
    cache.dispose()  # Should NOT dispose yet (refcount = 1)
    cache.stop()  # Decrements to 0, disables cache

    # Now cache should be disabled
    assert cache.disabled is True

    # Final dispose should now work
    cache.dispose()  # NOW it disposes (refcount = 0)
    assert cache.size == 0
329
+
330
+
331
def test_span_cache_refcount_underflow():
    """Test that refcount handles underflow gracefully.

    A stop() with no preceding start() must not prevent a later
    start()/write cycle from working.
    """
    cache = SpanCache()
    try:
        # Call stop without start
        cache.stop()

        # Should work normally after
        cache.start()
        cache.queue_write("root-1", "span-1", CachedSpan(span_id="span-1"))
        assert cache.size == 1
    finally:
        # Always release the cache, even when the assertion above fails.
        cache.stop()
        cache.dispose()
@@ -0,0 +1,267 @@
1
+ """Tests for Trace functionality."""
2
+
3
+ import pytest
4
+ from braintrust.trace import CachedSpanFetcher, SpanData
5
+
6
+
7
+ # Helper to create mock spans
8
def make_span(span_id: str, span_type: str, **extra) -> SpanData:
    """Build a ``SpanData`` fixture for tests.

    ``input`` and ``output`` are derived from ``span_id`` and the span type is
    stored under ``span_attributes["type"]``. Any ``extra`` keyword arguments
    are forwarded to ``SpanData`` unchanged.
    """
    base_fields = {
        "span_id": span_id,
        "input": {"text": f"input-{span_id}"},
        "output": {"text": f"output-{span_id}"},
        "span_attributes": {"type": span_type},
    }
    # Two separate unpackings (not a merged dict) so a key duplicated in
    # ``extra`` still raises TypeError instead of silently overriding.
    return SpanData(**base_fields, **extra)
16
+
17
+
18
class TestCachedSpanFetcher:
    """Test CachedSpanFetcher caching behavior.

    Each test injects an async ``fetch_fn`` that records the ``span_type``
    arguments it receives, so tests can assert both the returned spans and
    which fetches were actually issued.
    """

    @staticmethod
    def _recording_fetch(respond):
        """Create a recording ``fetch_fn`` for ``CachedSpanFetcher``.

        Args:
            respond: Callable taking the requested ``span_type`` and returning
                the list of spans the fake backend should yield.

        Returns:
            A ``(fetch_fn, calls)`` pair. ``calls`` grows by one entry per
            invocation and holds a snapshot of each ``span_type`` argument.
        """
        calls = []

        async def fetch_fn(span_type):
            # Snapshot the argument so later mutation of the list by the
            # code under test cannot alter what was recorded.
            calls.append(None if span_type is None else list(span_type))
            return respond(span_type)

        return fetch_fn, calls

    @pytest.mark.asyncio
    async def test_fetch_all_spans_without_filter(self):
        """Test fetching all spans when no filter specified."""
        mock_spans = [
            make_span("span-1", "llm"),
            make_span("span-2", "function"),
            make_span("span-3", "llm"),
        ]
        fetch_fn, calls = self._recording_fetch(lambda _span_type: mock_spans)

        fetcher = CachedSpanFetcher(fetch_fn=fetch_fn)
        result = await fetcher.get_spans()

        assert len(calls) == 1
        assert len(result) == 3
        assert {s.span_id for s in result} == {"span-1", "span-2", "span-3"}

    @pytest.mark.asyncio
    async def test_fetch_specific_span_types(self):
        """Test fetching specific span types when filter specified."""
        llm_spans = [make_span("span-1", "llm"), make_span("span-2", "llm")]
        fetch_fn, calls = self._recording_fetch(lambda _span_type: llm_spans)

        fetcher = CachedSpanFetcher(fetch_fn=fetch_fn)
        result = await fetcher.get_spans(span_type=["llm"])

        # Checked here rather than inside fetch_fn so the failure cannot be
        # swallowed by the code under test.
        assert calls == [["llm"]]
        assert len(result) == 2

    @pytest.mark.asyncio
    async def test_return_cached_spans_after_fetching_all(self):
        """Test that cached spans are returned without re-fetching after fetching all."""
        mock_spans = [
            make_span("span-1", "llm"),
            make_span("span-2", "function"),
        ]
        fetch_fn, calls = self._recording_fetch(lambda _span_type: mock_spans)

        fetcher = CachedSpanFetcher(fetch_fn=fetch_fn)

        # First call - fetches
        await fetcher.get_spans()
        assert len(calls) == 1

        # Second call - should use cache
        result = await fetcher.get_spans()
        assert len(calls) == 1  # Still 1
        assert len(result) == 2

    @pytest.mark.asyncio
    async def test_return_cached_spans_for_previously_fetched_types(self):
        """Test that previously fetched types are returned from cache."""
        llm_spans = [make_span("span-1", "llm"), make_span("span-2", "llm")]
        fetch_fn, calls = self._recording_fetch(lambda _span_type: llm_spans)

        fetcher = CachedSpanFetcher(fetch_fn=fetch_fn)

        # First call - fetches llm spans
        await fetcher.get_spans(span_type=["llm"])
        assert len(calls) == 1

        # Second call for same type - should use cache
        result = await fetcher.get_spans(span_type=["llm"])
        assert len(calls) == 1  # Still 1
        assert len(result) == 2

    @pytest.mark.asyncio
    async def test_only_fetch_missing_span_types(self):
        """Test that only missing span types are fetched."""
        llm_spans = [make_span("span-1", "llm")]
        function_spans = [make_span("span-2", "function")]

        def respond(span_type):
            if span_type == ["llm"]:
                return llm_spans
            if span_type == ["function"]:
                return function_spans
            return []

        fetch_fn, calls = self._recording_fetch(respond)
        fetcher = CachedSpanFetcher(fetch_fn=fetch_fn)

        # First call - fetches llm spans
        await fetcher.get_spans(span_type=["llm"])
        assert calls == [["llm"]]

        # Second call for both types - should only fetch function
        result = await fetcher.get_spans(span_type=["llm", "function"])
        assert calls == [["llm"], ["function"]]
        assert len(result) == 2

    @pytest.mark.asyncio
    async def test_no_refetch_after_fetching_all_spans(self):
        """Test that no re-fetching occurs after fetching all spans."""
        all_spans = [
            make_span("span-1", "llm"),
            make_span("span-2", "function"),
            make_span("span-3", "tool"),
        ]
        fetch_fn, calls = self._recording_fetch(lambda _span_type: all_spans)

        fetcher = CachedSpanFetcher(fetch_fn=fetch_fn)

        # Fetch all spans
        await fetcher.get_spans()
        assert len(calls) == 1

        # Subsequent filtered calls should use cache
        llm_result = await fetcher.get_spans(span_type=["llm"])
        assert len(calls) == 1  # Still 1
        assert len(llm_result) == 1
        assert llm_result[0].span_id == "span-1"

        function_result = await fetcher.get_spans(span_type=["function"])
        assert len(calls) == 1  # Still 1
        assert len(function_result) == 1
        assert function_result[0].span_id == "span-2"

    @pytest.mark.asyncio
    async def test_filter_by_multiple_span_types_from_cache(self):
        """Test filtering by multiple span types from cache."""
        all_spans = [
            make_span("span-1", "llm"),
            make_span("span-2", "function"),
            make_span("span-3", "tool"),
            make_span("span-4", "llm"),
        ]
        fetch_fn, _calls = self._recording_fetch(lambda _span_type: all_spans)

        fetcher = CachedSpanFetcher(fetch_fn=fetch_fn)

        # Fetch all first
        await fetcher.get_spans()

        # Filter for llm and tool
        result = await fetcher.get_spans(span_type=["llm", "tool"])
        assert len(result) == 3
        assert {s.span_id for s in result} == {"span-1", "span-3", "span-4"}

    @pytest.mark.asyncio
    async def test_return_empty_for_nonexistent_span_type(self):
        """Test that empty array is returned for non-existent span type."""
        all_spans = [make_span("span-1", "llm")]
        fetch_fn, _calls = self._recording_fetch(lambda _span_type: all_spans)

        fetcher = CachedSpanFetcher(fetch_fn=fetch_fn)

        # Fetch all first
        await fetcher.get_spans()

        # Query for non-existent type
        result = await fetcher.get_spans(span_type=["nonexistent"])
        assert len(result) == 0

    @pytest.mark.asyncio
    async def test_handle_spans_with_no_type(self):
        """Test handling spans without type (empty string type)."""
        spans = [
            make_span("span-1", "llm"),
            SpanData(span_id="span-2", input={}, span_attributes={}),  # No type
            SpanData(span_id="span-3", input={}),  # No span_attributes
        ]
        fetch_fn, _calls = self._recording_fetch(lambda _span_type: spans)

        fetcher = CachedSpanFetcher(fetch_fn=fetch_fn)

        # Fetch all
        result = await fetcher.get_spans()
        assert len(result) == 3

        # Spans without type go into "" bucket
        no_type_result = await fetcher.get_spans(span_type=[""])
        assert len(no_type_result) == 2

    @pytest.mark.asyncio
    async def test_handle_empty_results(self):
        """Test handling empty results.

        An unfiltered fetch that yields nothing must still count as "all
        fetched", so a later filtered query is served without another fetch.
        """
        fetch_fn, calls = self._recording_fetch(lambda _span_type: [])

        fetcher = CachedSpanFetcher(fetch_fn=fetch_fn)

        result = await fetcher.get_spans()
        assert len(result) == 0
        assert len(calls) == 1

        # Should still mark as fetched
        llm_result = await fetcher.get_spans(span_type=["llm"])
        assert len(llm_result) == 0
        assert len(calls) == 1  # No second fetch

    @pytest.mark.asyncio
    async def test_handle_empty_span_type_array(self):
        """Test that empty spanType array is handled same as undefined."""
        mock_spans = [make_span("span-1", "llm")]
        fetch_fn, calls = self._recording_fetch(lambda _span_type: mock_spans)

        fetcher = CachedSpanFetcher(fetch_fn=fetch_fn)

        result = await fetcher.get_spans(span_type=[])

        # Either "no filter" spelling is acceptable for the backend call.
        assert calls[0] is None or calls[0] == []
        assert len(result) == 1