braintrust 0.4.2__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,517 @@
1
+ """
2
+ Braintrust integration for LangSmith - provides a migration path from LangSmith to Braintrust.
3
+
4
+ This module patches LangSmith's tracing and evaluation APIs to use Braintrust under the hood,
5
+ allowing users to migrate with minimal code changes.
6
+
7
+ Usage:
8
+ ```python
9
+ import os
10
+
11
+ # Enable LangSmith tracing and set project name (used by both services)
12
+ os.environ.setdefault("LANGCHAIN_TRACING_V2", "true")
13
+ os.environ.setdefault("LANGCHAIN_PROJECT", "my-project")
14
+
15
+ from braintrust.wrappers.langsmith_wrapper import setup_langsmith
16
+
17
+ # Call setup BEFORE importing from langsmith
18
+ # project_name defaults to LANGCHAIN_PROJECT env var
19
+ setup_langsmith()
20
+
21
+ # Continue using langsmith imports - they now use Braintrust
22
+ from langsmith import traceable, Client
23
+
24
+ @traceable
25
+ def my_function(inputs: dict) -> dict:
26
+ return {"result": inputs["x"] * 2}
27
+
28
+ client = Client()
29
+ results = client.evaluate(
30
+ my_function,
31
+ data=[{"inputs": {"x": 1}, "outputs": {"result": 2}}],
32
+ evaluators=[my_evaluator],
33
+ )
34
+ ```
35
+
36
+ Pass standalone=True to setup_langsmith() to completely replace LangSmith with Braintrust
37
+ (no LangSmith code runs). Otherwise (the default), both services run in tandem.
38
+ """
39
+
40
+ import inspect
41
+ import logging
42
+ import os
43
+ from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, ParamSpec, TypeVar
44
+
45
+ from braintrust.framework import EvalCase
46
+ from braintrust.logger import NOOP_SPAN, current_span, init_logger, traced
47
+ from wrapt import wrap_function_wrapper
48
+
49
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Braintrust eval results appended by the evaluate/aevaluate wrappers.
# Read with get_braintrust_results(), reset with clear_braintrust_results().
_braintrust_eval_results: List[Any] = []

# TODO: langsmith.test/unit/expect, langsmith.AsyncClient, trace
__all__ = [
    "setup_langsmith",
    "wrap_traceable",
    "wrap_client",
    "wrap_evaluate",
    "wrap_aevaluate",
    "get_braintrust_results",
    "clear_braintrust_results",
]

# Type variables used in the wrapper signatures of this module.
F = TypeVar("F", bound=Callable[..., Any])
P = ParamSpec("P")
R = TypeVar("R")
68
+
69
+
70
def get_braintrust_results() -> List[Any]:
    """Return a shallow copy of the Braintrust eval results recorded so far.

    A new list is returned, so mutating it does not affect the module-level
    store.
    """
    return list(_braintrust_eval_results)
73
+
74
+
75
def clear_braintrust_results() -> None:
    """Empty the module-level store of Braintrust eval results in place."""
    del _braintrust_eval_results[:]
78
+
79
+
80
def setup_langsmith(
    api_key: Optional[str] = None,
    project_id: Optional[str] = None,
    project_name: Optional[str] = None,
    standalone: bool = False,
) -> bool:
    """
    Setup Braintrust integration with LangSmith.

    This patches LangSmith's @traceable, Client.evaluate(), and aevaluate()
    to use Braintrust under the hood.

    Args:
        api_key: Braintrust API key (optional, can use env var BRAINTRUST_API_KEY)
        project_id: Braintrust project ID (optional)
        project_name: Braintrust project name (optional, falls back to LANGCHAIN_PROJECT
                      env var, then BRAINTRUST_PROJECT env var)
        standalone: If True, completely replace LangSmith with Braintrust (no LangSmith
                    code runs). If False (default), run both LangSmith and Braintrust
                    in tandem.

    Returns:
        True if langsmith was imported and patched, False if langsmith could
        not be imported.
    """
    # Resolve the project name in the documented order: explicit argument,
    # then LANGCHAIN_PROJECT (keeps both services in sync), then
    # BRAINTRUST_PROJECT.
    if project_name is None:
        project_name = os.environ.get("LANGCHAIN_PROJECT") or os.environ.get("BRAINTRUST_PROJECT")

    # Only initialise a logger when current_span() reports the no-op span.
    if current_span() == NOOP_SPAN:
        init_logger(project=project_name, api_key=api_key, project_id=project_id)

    # Keep the try body to the one statement that can raise ImportError.
    try:
        import langsmith
    except ImportError as e:
        logger.error("Failed to import langsmith: %s", e)
        logger.error("langsmith is not installed. Please install it with: pip install langsmith")
        return False

    langsmith.traceable = wrap_traceable(langsmith.traceable, standalone=standalone)
    wrap_client(langsmith.Client, project_name=project_name, project_id=project_id, standalone=standalone)
    langsmith.evaluate = wrap_evaluate(
        langsmith.evaluate, project_name=project_name, project_id=project_id, standalone=standalone
    )
    langsmith.aevaluate = wrap_aevaluate(
        langsmith.aevaluate, project_name=project_name, project_id=project_id, standalone=standalone
    )

    logger.info("LangSmith integration with Braintrust enabled")
    return True
131
+
132
+
133
def wrap_traceable(traceable: F, standalone: bool = False) -> F:
    """
    Wrap langsmith.traceable to also use Braintrust's @traced decorator.

    Args:
        traceable: The langsmith.traceable function
        standalone: If True, replace LangSmith tracing entirely with Braintrust.
            If False, add Braintrust tracing alongside LangSmith tracing.

    Returns:
        The wrapped traceable function (or the original if already patched)
    """
    if _is_patched(traceable):
        return traceable

    def traceable_wrapper(*args: Any, **kwargs: Any) -> Any:
        # Bare ``@traceable`` passes the function as the first positional
        # argument; ``@traceable(...)`` passes only options.
        func = args[0] if args and callable(args[0]) else None

        def decorator(fn: Callable[P, R]) -> Callable[P, R]:
            # Callables without __name__ (e.g. functools.partial) fall back
            # to their type name instead of raising AttributeError.
            span_name = kwargs.get("name") or getattr(fn, "__name__", None) or type(fn).__name__

            # Conditionally apply LangSmith decorator first
            if not standalone:
                if func is None:
                    # Factory form: forward every option, including
                    # positional ones, and apply the resulting decorator.
                    fn = traceable(*args, **kwargs)(fn)
                else:
                    fn = traceable(fn, **kwargs)

            # Always apply Braintrust tracing
            return traced(name=span_name)(fn)  # type: ignore[return-value]

        if func is not None:
            return decorator(func)
        return decorator

    # Marker checked by _is_patched() so the wrapper is never applied twice.
    traceable_wrapper._braintrust_patched = True  # type: ignore[attr-defined]
    return traceable_wrapper  # type: ignore[return-value]
168
+
169
+
170
def wrap_client(
    Client: Any, project_name: Optional[str] = None, project_id: Optional[str] = None, standalone: bool = False
) -> Any:
    """
    Patch ``evaluate`` and ``aevaluate`` on the given LangSmith Client class.

    Each method that exists on the class and is not yet marked as patched is
    wrapped via wrapt with the matching Braintrust wrapper, then marked.

    Args:
        Client: The langsmith.Client class
        project_name: Braintrust project name to use for evaluations
        project_id: Braintrust project ID to use for evaluations
        standalone: If True, only run Braintrust. If False, run both LangSmith and Braintrust.

    Returns:
        The Client class (modified in place)
    """
    wrapper_factories = (
        ("evaluate", make_evaluate_wrapper),
        ("aevaluate", make_aevaluate_wrapper),
    )

    for method_name, factory in wrapper_factories:
        if not hasattr(Client, method_name):
            continue
        if _is_patched(getattr(Client, method_name)):
            continue

        wrap_function_wrapper(
            Client,
            method_name,
            factory(standalone=standalone, project_name=project_name, project_id=project_id),
        )
        # Mark the freshly wrapped attribute so a repeated call skips it.
        getattr(Client, method_name)._braintrust_patched = True  # type: ignore[attr-defined]

    return Client
203
+
204
+
205
def make_evaluate_wrapper(
    *, project_name: Optional[str] = None, project_id: Optional[str] = None, standalone: bool = False
):
    """
    Build a wrapt-style wrapper ``(wrapped, instance, args, kwargs)`` for a
    synchronous LangSmith ``evaluate`` callable.

    Args:
        project_name: Braintrust project name to use for evaluations
        project_id: Braintrust project ID to use for evaluations
        standalone: If True, only the Braintrust eval runs and its result is
            returned; a Braintrust failure propagates. If False, the wrapped
            LangSmith call runs first and its result is returned; the
            Braintrust eval runs afterwards on a best-effort basis.

    Returns:
        The wrapper function. Every successful Braintrust result is appended
        to the module-level results store.
    """

    def evaluate_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: Any) -> Any:
        langsmith_result = None if standalone else wrapped(*args, **kwargs)

        try:
            braintrust_result = _run_braintrust_eval(
                args,
                kwargs,
                project_name,
                project_id,
            )
        except Exception as e:
            if standalone:
                # Bare raise keeps the original traceback intact.
                raise
            logger.warning("Braintrust evaluate failed: %s", e)
            return langsmith_result

        _braintrust_eval_results.append(braintrust_result)

        # In tandem mode always hand back the LangSmith result, so the return
        # type does not depend on whether the Braintrust run succeeded. The
        # Braintrust result stays available via the results store.
        return braintrust_result if standalone else langsmith_result

    return evaluate_wrapper
230
+
231
+
232
def make_aevaluate_wrapper(
    *, project_name: Optional[str] = None, project_id: Optional[str] = None, standalone: bool = False
):
    """
    Build an async wrapt-style wrapper ``(wrapped, instance, args, kwargs)``
    for a LangSmith ``aevaluate`` coroutine function.

    Args:
        project_name: Braintrust project name to use for evaluations
        project_id: Braintrust project ID to use for evaluations
        standalone: If True, only the Braintrust eval runs and its result is
            returned; a Braintrust failure propagates. If False, the wrapped
            LangSmith call is awaited first and its result is returned; the
            Braintrust eval runs afterwards on a best-effort basis.

    Returns:
        The coroutine wrapper. Every successful Braintrust result is appended
        to the module-level results store.
    """

    async def aevaluate_wrapper(wrapped: Any, instance: Any, args: Any, kwargs: Any) -> Any:
        langsmith_result = None if standalone else await wrapped(*args, **kwargs)

        try:
            braintrust_result = await _run_braintrust_eval_async(
                args,
                kwargs,
                project_name,
                project_id,
            )
        except Exception as e:
            if standalone:
                # Bare raise keeps the original traceback intact.
                raise
            logger.warning("Braintrust aevaluate failed: %s", e)
            return langsmith_result

        _braintrust_eval_results.append(braintrust_result)

        # In tandem mode always hand back the LangSmith result, so the return
        # type does not depend on whether the Braintrust run succeeded. The
        # Braintrust result stays available via the results store.
        return braintrust_result if standalone else langsmith_result

    return aevaluate_wrapper
257
+
258
+
259
def wrap_evaluate(
    evaluate: F, project_name: Optional[str] = None, project_id: Optional[str] = None, standalone: bool = False
) -> F:
    """
    Wrap module-level langsmith.evaluate to redirect to Braintrust's Eval.

    Args:
        evaluate: The langsmith.evaluate function
        project_name: Braintrust project name to use for evaluations
        project_id: Braintrust project ID to use for evaluations
        standalone: If True, only run Braintrust. If False, run both LangSmith and Braintrust.

    Returns:
        The wrapped evaluate function (or the original if already patched)
    """
    import functools

    if _is_patched(evaluate):
        return evaluate

    wrapt_style_wrapper = make_evaluate_wrapper(
        standalone=standalone, project_name=project_name, project_id=project_id
    )

    @functools.wraps(evaluate)
    def evaluate_wrapper(*args: Any, **kwargs: Any) -> Any:
        # make_evaluate_wrapper() returns a wrapt-style callable taking
        # (wrapped, instance, args, kwargs). Adapt it to the plain call
        # signature of the module-level function; there is no bound
        # instance here, hence None.
        return wrapt_style_wrapper(evaluate, None, args, kwargs)

    evaluate_wrapper._braintrust_patched = True  # type: ignore[attr-defined]
    return evaluate_wrapper  # type: ignore[return-value]
280
+
281
+
282
def wrap_aevaluate(
    aevaluate: F,
    project_name: Optional[str] = None,
    project_id: Optional[str] = None,
    standalone: bool = False,
) -> F:
    """
    Wrap module-level langsmith.aevaluate to redirect to Braintrust's EvalAsync.

    Args:
        aevaluate: The langsmith.aevaluate function
        project_name: Braintrust project name to use for evaluations
        project_id: Braintrust project ID to use for evaluations
        standalone: If True, only run Braintrust. If False, run both LangSmith and Braintrust.

    Returns:
        The wrapped aevaluate function (or the original if already patched)
    """
    import functools

    if _is_patched(aevaluate):
        return aevaluate

    wrapt_style_wrapper = make_aevaluate_wrapper(
        standalone=standalone, project_name=project_name, project_id=project_id
    )

    @functools.wraps(aevaluate)
    async def aevaluate_wrapper(*args: Any, **kwargs: Any) -> Any:
        # make_aevaluate_wrapper() returns a wrapt-style coroutine function
        # taking (wrapped, instance, args, kwargs). Adapt it to the plain
        # call signature of the module-level function; there is no bound
        # instance here, hence None.
        return await wrapt_style_wrapper(aevaluate, None, args, kwargs)

    aevaluate_wrapper._braintrust_patched = True  # type: ignore[attr-defined]
    return aevaluate_wrapper  # type: ignore[return-value]
306
+
307
+
308
+ def _is_patched(obj: Any) -> bool:
309
+ return getattr(obj, "_braintrust_patched", False)
310
+
311
+
312
+ # =============================================================================
313
+ # Braintrust evaluation logic
314
+ # =============================================================================
315
+
316
+
317
def _build_braintrust_eval_kwargs(
    args: Any,
    kwargs: Any,
    project_name: Optional[str] = None,
    project_id: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Translate LangSmith evaluate()/aevaluate() arguments into the keyword
    arguments passed to Braintrust's Eval / EvalAsync.

    ``target``, ``data`` and ``evaluators`` are read from the positional
    arguments when present there, otherwise from ``kwargs``. All other
    options are read from ``kwargs`` only.
    """

    def pick(position: int, name: str) -> Any:
        return args[position] if len(args) > position else kwargs.get(name)

    target = pick(0, "target")
    data = pick(1, "data")
    evaluators = pick(2, "evaluators")

    return {
        "name": project_name or "langsmith-migration",
        "data": _convert_langsmith_data(data),
        "task": _make_braintrust_task(target),
        # Convert evaluators to scorers (none given -> empty list).
        "scores": [_make_braintrust_scorer(e) for e in (evaluators or ())],
        "experiment_name": kwargs.get("experiment_prefix"),
        "project_id": project_id,
        "description": kwargs.get("description"),
        "metadata": kwargs.get("metadata"),
        "max_concurrency": kwargs.get("max_concurrency"),
        "trial_count": kwargs.get("num_repetitions", 1),
    }


def _run_braintrust_eval(
    args: Any,
    kwargs: Any,
    project_name: Optional[str] = None,
    project_id: Optional[str] = None,
) -> Any:
    """Run Braintrust Eval with LangSmith-style arguments and return its result."""
    from braintrust.framework import Eval

    return Eval(**_build_braintrust_eval_kwargs(args, kwargs, project_name, project_id))


async def _run_braintrust_eval_async(
    args: Any,
    kwargs: Any,
    project_name: Optional[str] = None,
    project_id: Optional[str] = None,
) -> Any:
    """Run Braintrust EvalAsync with LangSmith-style arguments and return its result."""
    from braintrust.framework import EvalAsync

    return await EvalAsync(**_build_braintrust_eval_kwargs(args, kwargs, project_name, project_id))
391
+
392
+
393
+ # =============================================================================
394
+ # Data conversion helpers
395
+ # =============================================================================
396
+
397
+
398
+ def _wrap_output(output: Any) -> Dict[str, Any]:
399
+ """Wrap non-dict outputs the same way LangSmith does."""
400
+ if not isinstance(output, dict):
401
+ return {"output": output}
402
+ return output
403
+
404
+
405
def _make_braintrust_scorer(
    evaluator: Callable[..., Any],
) -> Callable[..., Any]:
    """
    Create a Braintrust scorer from a LangSmith evaluator.

    The evaluator is called positionally as
    ``evaluator(inputs, outputs, reference_outputs)`` and its result is
    converted into a ``braintrust.score.Score``.

    Args:
        evaluator: LangSmith-style evaluator. It may return a dict with
            optional ``key``/``score``/``metadata`` entries, a bare
            bool/int/float score, None, or an object exposing a ``score``
            attribute.

    Returns:
        A scorer ``(input, output, expected=None, **kwargs) -> Score`` named
        after the evaluator.

    Raises:
        TypeError: (from the scorer) if the evaluator returns an unsupported type.
    """
    evaluator_name = getattr(evaluator, "__name__", "score")

    def braintrust_scorer(input: Any, output: Any, expected: Optional[Any] = None, **kwargs: Any) -> Any:
        from braintrust.score import Score

        # Non-dict outputs are wrapped as {"output": value} before the
        # evaluator sees them.
        outputs = _wrap_output(output)

        # `expected` is whatever the data loader attached to the case:
        #  - an Example-like object exposing .outputs,
        #  - a whole LangSmith-format item dict (it carries "inputs"), or
        #  - a plain expected value.
        if hasattr(expected, "outputs"):
            reference_outputs = expected.outputs
        elif isinstance(expected, dict) and "inputs" in expected:
            reference_outputs = expected.get("outputs")
        else:
            reference_outputs = expected

        result = evaluator(input, outputs, reference_outputs)

        if isinstance(result, dict):
            return Score(
                name=result.get("key", evaluator_name),
                score=result.get("score"),
                metadata=result.get("metadata", {}),
            )

        if result is None or isinstance(result, (bool, int, float)):
            # Bare scores: name them after the evaluator; turn bools into
            # 0.0 / 1.0 so the score is numeric.
            score = float(result) if isinstance(result, bool) else result
            return Score(name=evaluator_name, score=score, metadata={})

        if hasattr(result, "score"):
            # Result object carrying key/score/metadata as attributes.
            return Score(
                name=getattr(result, "key", None) or evaluator_name,
                score=result.score,
                metadata=getattr(result, "metadata", None) or {},
            )

        raise TypeError(
            f"Evaluator '{evaluator_name}' returned unsupported result type: {type(result).__name__}"
        )

    braintrust_scorer.__name__ = evaluator_name
    return braintrust_scorer
436
+
437
+
438
def _convert_langsmith_data(data: Any) -> Callable[[], Iterator[EvalCase[Any, Any]]]:
    """
    Build a zero-argument loader that lazily yields EvalCase objects.

    ``data`` may be a callable producing items, a LangSmith dataset name
    (str), any iterable of items, or a single item.
    """

    def to_eval_case(entry: Any) -> EvalCase[Any, Any]:
        # Objects exposing .inputs are passed through whole as `expected`.
        if hasattr(entry, "inputs"):
            return EvalCase(
                input=entry.inputs,
                expected=entry,
                metadata=getattr(entry, "metadata", None),
            )
        if isinstance(entry, dict):
            if "inputs" in entry:
                # LangSmith dict format: the whole dict becomes `expected`.
                return EvalCase(
                    input=entry["inputs"],
                    expected=entry,
                    metadata=entry.get("metadata"),
                )
            if "input" in entry:
                # Braintrust format
                return EvalCase(
                    input=entry["input"],
                    expected=entry.get("expected"),
                    metadata=entry.get("metadata"),
                )
        # Anything else is treated as the raw input.
        return EvalCase(input=entry)

    def load_data() -> Iterator[EvalCase[Any, Any]]:
        # Resolve the source iterable without materialising it.
        examples: Iterable[Any]
        if callable(data):
            examples = data()  # type: ignore
        elif isinstance(data, str):
            # A string is a LangSmith dataset name; load its examples.
            try:
                from langsmith import Client  # pylint: disable=import-error

                examples = Client().list_examples(dataset_name=data)
            except Exception as e:
                logger.warning(f"Failed to load LangSmith dataset '{data}': {e}")
                return
        elif hasattr(data, "__iter__"):
            examples = data
        else:
            examples = [data]

        # Yield one case at a time.
        for example in examples:
            yield to_eval_case(example)

    return load_data
491
+
492
+
493
+ def _make_braintrust_task(target: Callable[..., Any]) -> Callable[..., Any]:
494
+ """Convert a LangSmith target function to Braintrust task format."""
495
+
496
+ def task_fn(task_input: Any, hooks: Any) -> Any:
497
+ if isinstance(task_input, dict):
498
+ # Try to get the original function's signature (unwrap decorators)
499
+ unwrapped = inspect.unwrap(target)
500
+
501
+ try:
502
+ sig = inspect.signature(unwrapped)
503
+ params = list(sig.parameters.keys())
504
+ if len(params) == 1:
505
+ return target(task_input)
506
+ if all(p in task_input for p in params):
507
+ return target(**task_input)
508
+ return target(task_input)
509
+ except (ValueError, TypeError):
510
+ # Fallback: try kwargs first, then single arg
511
+ try:
512
+ return target(**task_input)
513
+ except TypeError:
514
+ return target(task_input)
515
+ return target(task_input)
516
+
517
+ return task_fn
@@ -13,18 +13,6 @@ TEST_ORG_ID = "test-org-123"
13
13
  PROJECT_NAME = "test-agno-app"
14
14
 
15
15
 
16
- @pytest.fixture(scope="module")
17
- def vcr_config():
18
- return {
19
- "filter_headers": [
20
- "authorization",
21
- "x-api-key",
22
- "api-key",
23
- "openai-api-key",
24
- ]
25
- }
26
-
27
-
28
16
  @pytest.fixture
29
17
  def memory_logger():
30
18
  init_test_logger(PROJECT_NAME)
@@ -15,16 +15,6 @@ PROJECT_NAME = "test-anthropic-app"
15
15
  MODEL = "claude-3-haiku-20240307" # use the cheapest model since answers dont matter
16
16
 
17
17
 
18
- @pytest.fixture(scope="module")
19
- def vcr_config():
20
- return {
21
- "filter_headers": [
22
- "authorization",
23
- "x-api-key",
24
- ]
25
- }
26
-
27
-
28
18
  def _get_client():
29
19
  return anthropic.Anthropic()
30
20
 
@@ -457,7 +447,7 @@ async def test_anthropic_beta_messages_create_async(memory_logger):
457
447
  assert "10" in span["output"]["content"][0]["text"]
458
448
 
459
449
 
460
- @pytest.mark.vcr
450
+ @pytest.mark.vcr(match_on=["method", "scheme", "host", "port", "path", "body"]) # exclude query - varies by SDK version
461
451
  @pytest.mark.asyncio
462
452
  async def test_anthropic_beta_messages_streaming_async(memory_logger):
463
453
  assert not memory_logger.pop()
@@ -12,17 +12,6 @@ PROJECT_NAME = "test-dspy-app"
12
12
  MODEL = "openai/gpt-4o-mini"
13
13
 
14
14
 
15
- @pytest.fixture(scope="module")
16
- def vcr_config():
17
- return {
18
- "filter_headers": [
19
- "authorization",
20
- "x-api-key",
21
- "openai-api-key",
22
- ]
23
- }
24
-
25
-
26
15
  @pytest.fixture
27
16
  def memory_logger():
28
17
  init_test_logger(PROJECT_NAME)
@@ -1,3 +1,4 @@
1
+ import os
1
2
  import time
2
3
  from pathlib import Path
3
4
 
@@ -15,12 +16,16 @@ FIXTURES_DIR = Path(__file__).parent.parent.parent.parent.parent / "internal/gol
15
16
 
16
17
  @pytest.fixture(scope="module")
17
18
  def vcr_config():
19
+ """Google-specific VCR config - needs to uppercase HTTP methods."""
20
+ record_mode = "none" if (os.environ.get("CI") or os.environ.get("GITHUB_ACTIONS")) else "once"
21
+
18
22
  def before_record_request(request):
19
- # Normalize HTTP method to uppercase for consistency
23
+ # Normalize HTTP method to uppercase for consistency (Google API quirk)
20
24
  request.method = request.method.upper()
21
25
  return request
22
26
 
23
27
  return {
28
+ "record_mode": record_mode,
24
29
  "filter_headers": [
25
30
  "authorization",
26
31
  "x-api-key",