braintrust 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- braintrust/_generated_types.py +328 -126
- braintrust/cli/install/api.py +1 -1
- braintrust/conftest.py +24 -0
- braintrust/devserver/test_server_integration.py +0 -11
- braintrust/framework.py +98 -1
- braintrust/functions/invoke.py +4 -9
- braintrust/functions/test_invoke.py +61 -0
- braintrust/generated_types.py +13 -7
- braintrust/logger.py +107 -66
- braintrust/prompt_cache/test_disk_cache.py +3 -3
- braintrust/span_cache.py +337 -0
- braintrust/span_identifier_v3.py +21 -0
- braintrust/span_types.py +3 -0
- braintrust/test_bt_json.py +23 -19
- braintrust/test_logger.py +116 -0
- braintrust/test_span_cache.py +344 -0
- braintrust/test_trace.py +267 -0
- braintrust/trace.py +385 -0
- braintrust/version.py +2 -2
- braintrust/wrappers/claude_agent_sdk/_wrapper.py +48 -6
- braintrust/wrappers/claude_agent_sdk/test_wrapper.py +106 -0
- braintrust/wrappers/langsmith_wrapper.py +517 -0
- braintrust/wrappers/test_agno.py +0 -12
- braintrust/wrappers/test_anthropic.py +1 -11
- braintrust/wrappers/test_dspy.py +0 -11
- braintrust/wrappers/test_google_genai.py +6 -1
- braintrust/wrappers/test_langsmith_wrapper.py +338 -0
- braintrust/wrappers/test_litellm.py +0 -10
- braintrust/wrappers/test_oai_attachments.py +0 -10
- braintrust/wrappers/test_openai.py +3 -12
- braintrust/wrappers/test_openrouter.py +0 -9
- braintrust/wrappers/test_pydantic_ai_integration.py +0 -11
- braintrust/wrappers/test_pydantic_ai_wrap_openai.py +2 -0
- {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/METADATA +1 -1
- {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/RECORD +38 -31
- {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/WHEEL +1 -1
- {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/entry_points.txt +0 -0
- {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/top_level.txt +0 -0
braintrust/wrappers/langsmith_wrapper.py
ADDED
@@ -0,0 +1,517 @@
+"""
+Braintrust integration for LangSmith - provides a migration path from LangSmith to Braintrust.
+
+This module patches LangSmith's tracing and evaluation APIs to use Braintrust under the hood,
+allowing users to migrate with minimal code changes.
+
+Usage:
+    ```python
+    import os
+
+    # Enable LangSmith tracing and set project name (used by both services)
+    os.environ.setdefault("LANGCHAIN_TRACING_V2", "true")
+    os.environ.setdefault("LANGCHAIN_PROJECT", "my-project")
+
+    from braintrust.wrappers.langsmith_wrapper import setup_langsmith
+
+    # Call setup BEFORE importing from langsmith
+    # project_name defaults to LANGCHAIN_PROJECT env var
+    setup_langsmith()
+
+    # Continue using langsmith imports - they now use Braintrust
+    from langsmith import traceable, Client
+
+    @traceable
+    def my_function(inputs: dict) -> dict:
+        return {"result": inputs["x"] * 2}
+
+    client = Client()
+    results = client.evaluate(
+        my_function,
+        data=[{"inputs": {"x": 1}, "outputs": {"result": 2}}],
+        evaluators=[my_evaluator],
+    )
+    ```
+
+Set BRAINTRUST_STANDALONE=1 to completely replace LangSmith with Braintrust
+(no LangSmith code runs). Otherwise, both services run in tandem.
+"""
+
+import inspect
+import logging
+import os
+from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, ParamSpec, TypeVar
+
+from braintrust.framework import EvalCase
+from braintrust.logger import NOOP_SPAN, current_span, init_logger, traced
+from wrapt import wrap_function_wrapper
+
+logger = logging.getLogger(__name__)
+
+# Global list to store Braintrust eval results when running in tandem mode
+_braintrust_eval_results: List[Any] = []
+
+# TODO: langsmith.test/unit/expect, langsmith.AsyncClient, trace
+__all__ = [
+    "setup_langsmith",
+    "wrap_traceable",
+    "wrap_client",
+    "wrap_evaluate",
+    "wrap_aevaluate",
+    "get_braintrust_results",
+    "clear_braintrust_results",
+]
+
+F = TypeVar("F", bound=Callable[..., Any])
+P = ParamSpec("P")
+R = TypeVar("R")
+
+
+def get_braintrust_results() -> List[Any]:
+    """Get all Braintrust eval results collected during tandem mode."""
+    return _braintrust_eval_results.copy()
+
+
+def clear_braintrust_results() -> None:
+    """Clear all stored Braintrust eval results."""
+    _braintrust_eval_results.clear()
+
+
+def setup_langsmith(
+    api_key: Optional[str] = None,
+    project_id: Optional[str] = None,
+    project_name: Optional[str] = None,
+    standalone: bool = False,
+) -> bool:
+    """
+    Setup Braintrust integration with LangSmith.
+
+    This patches LangSmith's @traceable, Client.evaluate(), and aevaluate()
+    to use Braintrust under the hood.
+
+    Args:
+        api_key: Braintrust API key (optional, can use env var BRAINTRUST_API_KEY)
+        project_id: Braintrust project ID (optional)
+        project_name: Braintrust project name (optional, falls back to LANGCHAIN_PROJECT
+            env var, then BRAINTRUST_PROJECT env var)
+        standalone: If True, completely replace LangSmith with Braintrust (no LangSmith
+            code runs). If False (default), run both LangSmith and Braintrust
+            in tandem.
+
+    Returns:
+        True if setup was successful, False otherwise
+    """
+    # Use LANGCHAIN_PROJECT as fallback for project_name to keep both services in sync
+    if project_name is None:
+        project_name = os.environ.get("LANGCHAIN_PROJECT")
+
+    span = current_span()
+    if span == NOOP_SPAN:
+        init_logger(project=project_name, api_key=api_key, project_id=project_id)
+
+    try:
+        import langsmith
+
+        langsmith.traceable = wrap_traceable(langsmith.traceable, standalone=standalone)
+        wrap_client(langsmith.Client, project_name=project_name, project_id=project_id, standalone=standalone)
+        langsmith.evaluate = wrap_evaluate(
+            langsmith.evaluate, project_name=project_name, project_id=project_id, standalone=standalone
+        )
+        langsmith.aevaluate = wrap_aevaluate(
+            langsmith.aevaluate, project_name=project_name, project_id=project_id, standalone=standalone
+        )
+
+        logger.info("LangSmith integration with Braintrust enabled")
+        return True
+
+    except ImportError as e:
+        logger.error(f"Failed to import langsmith: {e}")
+        logger.error("langsmith is not installed. Please install it with: pip install langsmith")
+        return False
+
+
+def wrap_traceable(traceable: F, standalone: bool = False) -> F:
+    """
+    Wrap langsmith.traceable to also use Braintrust's @traced decorator.
+
+    Args:
+        traceable: The langsmith.traceable function
+        standalone: If True, replace LangSmith tracing entirely with Braintrust.
+            If False, add Braintrust tracing alongside LangSmith tracing.
+
+    Returns:
+        The wrapped traceable function (or the original if already patched)
+    """
+    if _is_patched(traceable):
+        return traceable
+
+    def traceable_wrapper(*args: Any, **kwargs: Any) -> Any:
+        # Handle both @traceable and @traceable(...) patterns
+        func = args[0] if args and callable(args[0]) else None
+
+        def decorator(fn: Callable[P, R]) -> Callable[P, R]:
+            span_name = kwargs.get("name") or fn.__name__
+
+            # Conditionally apply LangSmith decorator first
+            if not standalone:
+                fn = traceable(fn, **kwargs)
+
+            # Always apply Braintrust tracing
+            return traced(name=span_name)(fn)  # type: ignore[return-value]
+
+        if func is not None:
+            return decorator(func)
+        return decorator
+
+    traceable_wrapper._braintrust_patched = True  # type: ignore[attr-defined]
+    return traceable_wrapper  # type: ignore[return-value]
+
+
+def wrap_client(
+    Client: Any, project_name: Optional[str] = None, project_id: Optional[str] = None, standalone: bool = False
+) -> Any:
+    """
+    Wrap langsmith.Client to redirect evaluate() and aevaluate() to Braintrust's Eval.
+
+    Args:
+        Client: The langsmith.Client class
+        project_name: Braintrust project name to use for evaluations
+        project_id: Braintrust project ID to use for evaluations
+        standalone: If True, only run Braintrust. If False, run both LangSmith and Braintrust.
+
+    Returns:
+        The Client class (modified in place)
+    """
+
+    if hasattr(Client, "evaluate") and not _is_patched(Client.evaluate):
+        wrap_function_wrapper(
+            Client,
+            "evaluate",
+            make_evaluate_wrapper(standalone=standalone, project_name=project_name, project_id=project_id),
+        )
+        Client.evaluate._braintrust_patched = True  # type: ignore[attr-defined]
+
+    if hasattr(Client, "aevaluate") and not _is_patched(Client.aevaluate):
+        wrap_function_wrapper(
+            Client,
+            "aevaluate",
+            make_aevaluate_wrapper(standalone=standalone, project_name=project_name, project_id=project_id),
+        )
+        Client.aevaluate._braintrust_patched = True  # type: ignore[attr-defined]
+
+    return Client
+
+
+def make_evaluate_wrapper(
+    *, project_name: Optional[str] = None, project_id: Optional[str] = None, standalone: bool = False
+):
+    def evaluate_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: Any) -> Any:
+        result = None
+        if not standalone:
+            result = wrapped(*args, **kwargs)
+
+        try:
+            result = _run_braintrust_eval(
+                args,
+                kwargs,
+                project_name,
+                project_id,
+            )
+            _braintrust_eval_results.append(result)
+        except Exception as e:
+            if standalone:
+                raise e
+            else:
+                logger.warning(f"Braintrust evaluate failed: {e}")
+
+        return result
+
+    return evaluate_wrapper
+
+
+def make_aevaluate_wrapper(
+    *, project_name: Optional[str] = None, project_id: Optional[str] = None, standalone: bool = False
+):
+    async def aevaluate_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: Any) -> Any:
+        result = None
+        if not standalone:
+            result = await wrapped(*args, **kwargs)
+
+        try:
+            result = await _run_braintrust_eval_async(
+                args,
+                kwargs,
+                project_name,
+                project_id,
+            )
+            _braintrust_eval_results.append(result)
+        except Exception as e:
+            if standalone:
+                raise e
+            else:
+                logger.warning(f"Braintrust aevaluate failed: {e}")
+
+        return result
+
+    return aevaluate_wrapper
+
+
+def wrap_evaluate(
+    evaluate: F, project_name: Optional[str] = None, project_id: Optional[str] = None, standalone: bool = False
+) -> F:
+    """
+    Wrap module-level langsmith.evaluate to redirect to Braintrust's Eval.
+
+    Args:
+        evaluate: The langsmith.evaluate function
+        project_name: Braintrust project name to use for evaluations
+        project_id: Braintrust project ID to use for evaluations
+        standalone: If True, only run Braintrust. If False, run both LangSmith and Braintrust.
+
+    Returns:
+        The wrapped evaluate function (or the original if already patched)
+    """
+    if _is_patched(evaluate):
+        return evaluate
+
+    evaluate_wrapper = make_evaluate_wrapper(standalone=standalone, project_name=project_name, project_id=project_id)
+    evaluate_wrapper._braintrust_patched = True  # type: ignore[attr-defined]
+    return evaluate_wrapper  # type: ignore[return-value]
+
+
+def wrap_aevaluate(
+    aevaluate: F,
+    project_name: Optional[str] = None,
+    project_id: Optional[str] = None,
+    standalone: bool = False,
+) -> F:
+    """
+    Wrap module-level langsmith.aevaluate to redirect to Braintrust's EvalAsync.
+
+    Args:
+        aevaluate: The langsmith.aevaluate function
+        project_name: Braintrust project name to use for evaluations
+        project_id: Braintrust project ID to use for evaluations
+        standalone: If True, only run Braintrust. If False, run both LangSmith and Braintrust.
+
+    Returns:
+        The wrapped aevaluate function (or the original if already patched)
+    """
+    if _is_patched(aevaluate):
+        return aevaluate
+
+    aevaluate_wrapper = make_aevaluate_wrapper(standalone=standalone, project_name=project_name, project_id=project_id)
+    aevaluate_wrapper._braintrust_patched = True  # type: ignore[attr-defined]
+    return aevaluate_wrapper  # type: ignore[return-value]
+
+
+def _is_patched(obj: Any) -> bool:
+    return getattr(obj, "_braintrust_patched", False)
+
+
+# =============================================================================
+# Braintrust evaluation logic
+# =============================================================================
+
+
+def _run_braintrust_eval(
+    args: Any,
+    kwargs: Any,
+    project_name: Optional[str] = None,
+    project_id: Optional[str] = None,
+) -> Any:
+    """Run Braintrust Eval with LangSmith-style arguments."""
+    from braintrust.framework import Eval
+
+    target = args[0] if args else kwargs.get("target")
+    data = args[1] if len(args) > 1 else kwargs.get("data")
+    evaluators = kwargs.get("evaluators")
+    experiment_prefix = kwargs.get("experiment_prefix")
+    description = kwargs.get("description")
+    metadata = kwargs.get("metadata")
+    max_concurrency = kwargs.get("max_concurrency")
+    num_repetitions = kwargs.get("num_repetitions", 1)
+
+    # Convert evaluators to scorers
+    scorers = []
+    if evaluators:
+        for e in evaluators:
+            scorers.append(_make_braintrust_scorer(e))
+
+    return Eval(
+        name=project_name or "langsmith-migration",
+        data=_convert_langsmith_data(data),
+        task=_make_braintrust_task(target),
+        scores=scorers,
+        experiment_name=experiment_prefix,
+        project_id=project_id,
+        description=description,
+        metadata=metadata,
+        max_concurrency=max_concurrency,
+        trial_count=num_repetitions,
+    )
+
+
+async def _run_braintrust_eval_async(
+    args: Any,
+    kwargs: Any,
+    project_name: Optional[str] = None,
+    project_id: Optional[str] = None,
+) -> Any:
+    """Run Braintrust EvalAsync with LangSmith-style arguments."""
+    from braintrust.framework import EvalAsync
+
+    target = args[0] if args else kwargs.get("target")
+    data = args[1] if len(args) > 1 else kwargs.get("data")
+    evaluators = kwargs.get("evaluators")
+    experiment_prefix = kwargs.get("experiment_prefix")
+    description = kwargs.get("description")
+    metadata = kwargs.get("metadata")
+    max_concurrency = kwargs.get("max_concurrency")
+    num_repetitions = kwargs.get("num_repetitions", 1)
+
+    # Convert evaluators to scorers
+    scorers = []
+    if evaluators:
+        for e in evaluators:
+            scorers.append(_make_braintrust_scorer(e))
+
+    return await EvalAsync(
+        name=project_name or "langsmith-migration",
+        data=_convert_langsmith_data(data),
+        task=_make_braintrust_task(target),
+        scores=scorers,
+        experiment_name=experiment_prefix,
+        project_id=project_id,
+        description=description,
+        metadata=metadata,
+        max_concurrency=max_concurrency,
+        trial_count=num_repetitions,
+    )
+
+
+# =============================================================================
+# Data conversion helpers
+# =============================================================================
+
+
+def _wrap_output(output: Any) -> Dict[str, Any]:
+    """Wrap non-dict outputs the same way LangSmith does."""
+    if not isinstance(output, dict):
+        return {"output": output}
+    return output
+
+
+def _make_braintrust_scorer(
+    evaluator: Callable[..., Any],
+) -> Callable[..., Any]:
+    """
+    Create a Braintrust scorer from a LangSmith evaluator.
+
+    Always runs the evaluator through Braintrust for full tracing (span duration, child LLM calls, etc.).
+    """
+    evaluator_name = getattr(evaluator, "__name__", "score")
+
+    def braintrust_scorer(input: Any, output: Any, expected: Optional[Any] = None, **kwargs: Any) -> Any:
+        from braintrust.score import Score
+
+        # Run the evaluator with LangSmith's signature
+        # LangSmith evaluators use: (inputs, outputs, reference_outputs) -> bool | dict
+        # LangSmith auto-wraps non-dict outputs as {"output": value}
+        outputs = _wrap_output(output)
+
+        # expected is the real LangSmith Example object passed through from data loading
+        reference_outputs = expected.outputs if hasattr(expected, "outputs") else expected
+
+        result = evaluator(input, outputs, reference_outputs)
+
+        return Score(
+            name=result.get("key", evaluator_name),
+            score=result.get("score"),
+            metadata=result.get("metadata", {}),
+        )
+
+    braintrust_scorer.__name__ = evaluator_name
+    return braintrust_scorer
+
+
+def _convert_langsmith_data(data: Any) -> Callable[[], Iterator[EvalCase[Any, Any]]]:
+    """Convert LangSmith data format to Braintrust data format."""
+
+    def load_data() -> Iterator[EvalCase[Any, Any]]:
+        # Determine the source iterable without loading everything into memory
+        source: Iterable[Any]
+        if callable(data):
+            source = data()  # type: ignore
+        elif isinstance(data, str):
+            # Load examples from LangSmith dataset by name
+            try:
+                from langsmith import Client  # pylint: disable=import-error
+
+                client = Client()
+                source = client.list_examples(dataset_name=data)
+            except Exception as e:
+                logger.warning(f"Failed to load LangSmith dataset '{data}': {e}")
+                return
+        elif hasattr(data, "__iter__"):
+            source = data
+        else:
+            source = [data]
+
+        # Process items as a generator - yield one at a time
+        for item in source:
+            # Pass through LangSmith Example objects directly
+            if hasattr(item, "inputs"):
+                yield EvalCase(
+                    input=item.inputs,
+                    expected=item,  # Pass the whole Example object
+                    metadata=getattr(item, "metadata", None),
+                )
+            elif isinstance(item, dict):
+                if "inputs" in item:
+                    # LangSmith dict format
+                    yield EvalCase(
+                        input=item["inputs"],
+                        expected=item,  # Pass the whole dict
+                        metadata=item.get("metadata"),
+                    )
+                elif "input" in item:
+                    # Braintrust format
+                    yield EvalCase(
+                        input=item["input"],
+                        expected=item.get("expected"),
+                        metadata=item.get("metadata"),
+                    )
+                else:
+                    yield EvalCase(input=item)
+            else:
+                yield EvalCase(input=item)
+
+    return load_data
+
+
+def _make_braintrust_task(target: Callable[..., Any]) -> Callable[..., Any]:
+    """Convert a LangSmith target function to Braintrust task format."""
+
+    def task_fn(task_input: Any, hooks: Any) -> Any:
+        if isinstance(task_input, dict):
+            # Try to get the original function's signature (unwrap decorators)
+            unwrapped = inspect.unwrap(target)
+
+            try:
+                sig = inspect.signature(unwrapped)
+                params = list(sig.parameters.keys())
+                if len(params) == 1:
+                    return target(task_input)
+                if all(p in task_input for p in params):
+                    return target(**task_input)
+                return target(task_input)
+            except (ValueError, TypeError):
+                # Fallback: try kwargs first, then single arg
+                try:
+                    return target(**task_input)
+                except TypeError:
+                    return target(task_input)
+        return target(task_input)
+
+    return task_fn
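The module docstring above covers the basic flow; as a supplement, here is a minimal sketch (not part of the package) of how the tandem-mode result helpers might be used together. The inline dataset, the lambda task, and `my_evaluator` are hypothetical; only `setup_langsmith`, `get_braintrust_results`, and `clear_braintrust_results` come from the new file above.

```python
# Hypothetical migration script exercising the new wrapper; assumes LangSmith and
# Braintrust credentials are configured via their usual environment variables.
import os

os.environ.setdefault("LANGCHAIN_PROJECT", "my-project")

from braintrust.wrappers.langsmith_wrapper import (
    clear_braintrust_results,
    get_braintrust_results,
    setup_langsmith,
)

# Patch langsmith before importing from it; standalone=True would skip LangSmith entirely.
setup_langsmith()

from langsmith import Client


def my_evaluator(inputs, outputs, reference_outputs):
    # LangSmith-style evaluator: a dict with "key"/"score" maps onto a Braintrust Score.
    return {"key": "doubled", "score": 1.0 if outputs.get("result") == inputs["x"] * 2 else 0.0}


client = Client()
client.evaluate(
    lambda inputs: {"result": inputs["x"] * 2},
    data=[{"inputs": {"x": 1}, "outputs": {"result": 2}}],
    evaluators=[my_evaluator],
)

# In tandem mode the Braintrust Eval results are collected on the side.
for result in get_braintrust_results():
    print(result)
clear_braintrust_results()
```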
braintrust/wrappers/test_agno.py
CHANGED
@@ -13,18 +13,6 @@ TEST_ORG_ID = "test-org-123"
 PROJECT_NAME = "test-agno-app"


-@pytest.fixture(scope="module")
-def vcr_config():
-    return {
-        "filter_headers": [
-            "authorization",
-            "x-api-key",
-            "api-key",
-            "openai-api-key",
-        ]
-    }
-
-
 @pytest.fixture
 def memory_logger():
     init_test_logger(PROJECT_NAME)
@@ -15,16 +15,6 @@ PROJECT_NAME = "test-anthropic-app"
|
|
|
15
15
|
MODEL = "claude-3-haiku-20240307" # use the cheapest model since answers dont matter
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
@pytest.fixture(scope="module")
|
|
19
|
-
def vcr_config():
|
|
20
|
-
return {
|
|
21
|
-
"filter_headers": [
|
|
22
|
-
"authorization",
|
|
23
|
-
"x-api-key",
|
|
24
|
-
]
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
|
|
28
18
|
def _get_client():
|
|
29
19
|
return anthropic.Anthropic()
|
|
30
20
|
|
|
@@ -457,7 +447,7 @@ async def test_anthropic_beta_messages_create_async(memory_logger):
|
|
|
457
447
|
assert "10" in span["output"]["content"][0]["text"]
|
|
458
448
|
|
|
459
449
|
|
|
460
|
-
@pytest.mark.vcr
|
|
450
|
+
@pytest.mark.vcr(match_on=["method", "scheme", "host", "port", "path", "body"]) # exclude query - varies by SDK version
|
|
461
451
|
@pytest.mark.asyncio
|
|
462
452
|
async def test_anthropic_beta_messages_streaming_async(memory_logger):
|
|
463
453
|
assert not memory_logger.pop()
|
braintrust/wrappers/test_dspy.py
CHANGED
@@ -12,17 +12,6 @@ PROJECT_NAME = "test-dspy-app"
 MODEL = "openai/gpt-4o-mini"


-@pytest.fixture(scope="module")
-def vcr_config():
-    return {
-        "filter_headers": [
-            "authorization",
-            "x-api-key",
-            "openai-api-key",
-        ]
-    }
-
-
 @pytest.fixture
 def memory_logger():
     init_test_logger(PROJECT_NAME)
braintrust/wrappers/test_google_genai.py
CHANGED
@@ -1,3 +1,4 @@
+import os
 import time
 from pathlib import Path

@@ -15,12 +16,16 @@ FIXTURES_DIR = Path(__file__).parent.parent.parent.parent.parent / "internal/gol

 @pytest.fixture(scope="module")
 def vcr_config():
+    """Google-specific VCR config - needs to uppercase HTTP methods."""
+    record_mode = "none" if (os.environ.get("CI") or os.environ.get("GITHUB_ACTIONS")) else "once"
+
     def before_record_request(request):
-        # Normalize HTTP method to uppercase for consistency
+        # Normalize HTTP method to uppercase for consistency (Google API quirk)
         request.method = request.method.upper()
         return request

     return {
+        "record_mode": record_mode,
         "filter_headers": [
             "authorization",
             "x-api-key",