braintrust 0.4.3__tar.gz → 0.5.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. {braintrust-0.4.3 → braintrust-0.5.2}/PKG-INFO +1 -1
  2. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/__init__.py +3 -0
  3. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/_generated_types.py +106 -6
  4. braintrust-0.5.2/src/braintrust/auto.py +179 -0
  5. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/conftest.py +23 -4
  6. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/framework.py +113 -3
  7. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/functions/invoke.py +3 -1
  8. braintrust-0.5.2/src/braintrust/functions/test_invoke.py +61 -0
  9. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/generated_types.py +7 -1
  10. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/logger.py +127 -45
  11. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/oai.py +51 -0
  12. braintrust-0.5.2/src/braintrust/span_cache.py +337 -0
  13. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/span_identifier_v3.py +21 -0
  14. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_bt_json.py +0 -5
  15. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_framework.py +37 -0
  16. braintrust-0.5.2/src/braintrust/test_http.py +444 -0
  17. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_logger.py +295 -5
  18. braintrust-0.5.2/src/braintrust/test_span_cache.py +344 -0
  19. braintrust-0.5.2/src/braintrust/test_trace.py +267 -0
  20. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_util.py +58 -1
  21. braintrust-0.5.2/src/braintrust/trace.py +385 -0
  22. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/util.py +20 -0
  23. braintrust-0.5.2/src/braintrust/version.py +4 -0
  24. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/agno/__init__.py +2 -3
  25. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/anthropic.py +64 -0
  26. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/claude_agent_sdk/__init__.py +2 -3
  27. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py +48 -6
  28. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py +115 -0
  29. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/dspy.py +52 -1
  30. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/google_genai/__init__.py +9 -6
  31. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/litellm.py +6 -43
  32. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/pydantic_ai.py +2 -3
  33. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_agno.py +9 -0
  34. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_anthropic.py +156 -0
  35. braintrust-0.5.2/src/braintrust/wrappers/test_dspy.py +177 -0
  36. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_google_genai.py +9 -0
  37. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_litellm.py +57 -55
  38. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_openai.py +253 -1
  39. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_pydantic_ai_integration.py +9 -0
  40. braintrust-0.5.2/src/braintrust/wrappers/test_utils.py +91 -0
  41. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust.egg-info/PKG-INFO +1 -1
  42. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust.egg-info/SOURCES.txt +7 -0
  43. braintrust-0.4.3/src/braintrust/version.py +0 -4
  44. braintrust-0.4.3/src/braintrust/wrappers/test_dspy.py +0 -60
  45. braintrust-0.4.3/src/braintrust/wrappers/test_utils.py +0 -12
  46. {braintrust-0.4.3 → braintrust-0.5.2}/README.md +0 -0
  47. {braintrust-0.4.3 → braintrust-0.5.2}/setup.cfg +0 -0
  48. {braintrust-0.4.3 → braintrust-0.5.2}/setup.py +0 -0
  49. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/audit.py +0 -0
  50. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/aws.py +0 -0
  51. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/bt_json.py +0 -0
  52. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/__init__.py +0 -0
  53. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/__main__.py +0 -0
  54. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/eval.py +0 -0
  55. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/install/__init__.py +0 -0
  56. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/install/api.py +0 -0
  57. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/install/bump_versions.py +0 -0
  58. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/install/logs.py +0 -0
  59. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/install/redshift.py +0 -0
  60. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/install/run_migrations.py +0 -0
  61. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/cli/push.py +0 -0
  62. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/context.py +0 -0
  63. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/contrib/__init__.py +0 -0
  64. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/contrib/temporal/__init__.py +0 -0
  65. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/contrib/temporal/test_temporal.py +0 -0
  66. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/db_fields.py +0 -0
  67. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/__init__.py +0 -0
  68. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/auth.py +0 -0
  69. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/cache.py +0 -0
  70. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/cors.py +0 -0
  71. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/dataset.py +0 -0
  72. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/eval_hooks.py +0 -0
  73. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/schemas.py +0 -0
  74. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/server.py +0 -0
  75. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/test_cached_login.py +0 -0
  76. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/test_lru_cache.py +0 -0
  77. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/devserver/test_server_integration.py +0 -0
  78. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/framework2.py +0 -0
  79. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/functions/__init__.py +0 -0
  80. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/functions/constants.py +0 -0
  81. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/functions/stream.py +0 -0
  82. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/git_fields.py +0 -0
  83. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/gitutil.py +0 -0
  84. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/graph_util.py +0 -0
  85. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/http_headers.py +0 -0
  86. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/id_gen.py +0 -0
  87. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/merge_row_batch.py +0 -0
  88. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/object.py +0 -0
  89. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/otel/__init__.py +0 -0
  90. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/otel/context.py +0 -0
  91. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/otel/test_distributed_tracing.py +0 -0
  92. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/otel/test_otel_bt_integration.py +0 -0
  93. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/parameters.py +0 -0
  94. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/prompt.py +0 -0
  95. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/prompt_cache/__init__.py +0 -0
  96. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/prompt_cache/disk_cache.py +0 -0
  97. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/prompt_cache/lru_cache.py +0 -0
  98. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/prompt_cache/prompt_cache.py +0 -0
  99. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/prompt_cache/test_disk_cache.py +0 -0
  100. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/prompt_cache/test_lru_cache.py +0 -0
  101. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/prompt_cache/test_prompt_cache.py +0 -0
  102. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/py.typed +0 -0
  103. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/queue.py +0 -0
  104. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/resource_manager.py +0 -0
  105. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/score.py +0 -0
  106. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/serializable_data_class.py +0 -0
  107. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/span_identifier_v1.py +0 -0
  108. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/span_identifier_v2.py +0 -0
  109. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/span_identifier_v4.py +0 -0
  110. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/span_types.py +0 -0
  111. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_framework2.py +0 -0
  112. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_helpers.py +0 -0
  113. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_id_gen.py +0 -0
  114. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_otel.py +0 -0
  115. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_queue.py +0 -0
  116. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_score.py +0 -0
  117. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_serializable_data_class.py +0 -0
  118. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_span_components.py +0 -0
  119. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/test_version.py +0 -0
  120. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/__init__.py +0 -0
  121. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/_anthropic_utils.py +0 -0
  122. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/agno/agent.py +0 -0
  123. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/agno/function_call.py +0 -0
  124. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/agno/model.py +0 -0
  125. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/agno/team.py +0 -0
  126. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/agno/utils.py +0 -0
  127. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/langchain.py +0 -0
  128. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/langsmith_wrapper.py +0 -0
  129. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/openai.py +0 -0
  130. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_langsmith_wrapper.py +0 -0
  131. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_oai_attachments.py +0 -0
  132. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_openrouter.py +0 -0
  133. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/wrappers/test_pydantic_ai_wrap_openai.py +0 -0
  134. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust/xact_ids.py +0 -0
  135. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust.egg-info/dependency_links.txt +0 -0
  136. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust.egg-info/entry_points.txt +0 -0
  137. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust.egg-info/requires.txt +0 -0
  138. {braintrust-0.4.3 → braintrust-0.5.2}/src/braintrust.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: braintrust
3
- Version: 0.4.3
3
+ Version: 0.5.2
4
4
  Summary: SDK for integrating Braintrust
5
5
  Home-page: https://www.braintrust.dev
6
6
  Author: Braintrust
@@ -50,6 +50,9 @@ BRAINTRUST_API_KEY=<YOUR_BRAINTRUST_API_KEY> braintrust eval eval_hello.py
50
50
  """
51
51
 
52
52
  from .audit import *
53
+ from .auto import (
54
+ auto_instrument, # noqa: F401 # type: ignore[reportUnusedImport]
55
+ )
53
56
  from .framework import *
54
57
  from .framework2 import *
55
58
  from .functions.invoke import *
@@ -144,6 +144,11 @@ class AsyncScoringControlAsyncScoringControl5(TypedDict):
144
144
  triggered_xact_id: str
145
145
 
146
146
 
147
+ class AsyncScoringControlAsyncScoringControl6(TypedDict):
148
+ kind: Literal['mark_attempt_failed']
149
+ function_ids: Sequence[Any]
150
+
151
+
147
152
  class AsyncScoringStateAsyncScoringState(TypedDict):
148
153
  status: Literal['enabled']
149
154
  token: str
@@ -484,6 +489,10 @@ class Dataset(TypedDict):
484
489
  """
485
490
  User-controlled metadata about the dataset
486
491
  """
492
+ url_slug: str
493
+ """
494
+ URL slug for the dataset. used to construct dataset URLs
495
+ """
487
496
 
488
497
 
489
498
  class DatasetEventMetadata(TypedDict):
@@ -532,6 +541,43 @@ class EnvVar(TypedDict):
532
541
  """
533
542
 
534
543
 
544
+ class EvalStatusPageConfig(TypedDict):
545
+ score_columns: NotRequired[Sequence[str] | None]
546
+ """
547
+ The score columns to display on the page
548
+ """
549
+ metric_columns: NotRequired[Sequence[str] | None]
550
+ """
551
+ The metric columns to display on the page
552
+ """
553
+ grouping_field: NotRequired[str | None]
554
+ """
555
+ The metadata field to use for grouping experiments (model)
556
+ """
557
+ filter: NotRequired[str | None]
558
+ """
559
+ BTQL filter to apply to experiment data
560
+ """
561
+ sort_by: NotRequired[str | None]
562
+ """
563
+ Field to sort results by (format: 'score:<name>' or 'metric:<name>')
564
+ """
565
+ sort_order: NotRequired[Literal['asc', 'desc'] | None]
566
+ """
567
+ Sort order (ascending or descending)
568
+ """
569
+ api_key: NotRequired[str | None]
570
+ """
571
+ The API key used for fetching experiment data
572
+ """
573
+
574
+
575
+ EvalStatusPageTheme: TypeAlias = Literal['light', 'dark']
576
+ """
577
+ The theme for the page
578
+ """
579
+
580
+
535
581
  class ExperimentEventMetadata(TypedDict):
536
582
  model: NotRequired[str | None]
537
583
  """
@@ -749,20 +795,24 @@ FunctionIdRef: TypeAlias = Mapping[str, Any]
749
795
 
750
796
 
751
797
  FunctionObjectType: TypeAlias = Literal[
752
- 'prompt', 'tool', 'scorer', 'task', 'custom_view', 'preprocessor', 'facet'
798
+ 'prompt', 'tool', 'scorer', 'task', 'workflow', 'custom_view', 'preprocessor', 'facet', 'classifier'
753
799
  ]
754
800
 
755
801
 
756
- FunctionOutputType: TypeAlias = Literal['completion', 'score', 'any']
802
+ FunctionOutputType: TypeAlias = Literal['completion', 'score', 'facet', 'classification', 'any']
757
803
 
758
804
 
759
- FunctionTypeEnum: TypeAlias = Literal['llm', 'scorer', 'task', 'tool', 'custom_view', 'preprocessor', 'facet']
805
+ FunctionTypeEnum: TypeAlias = Literal[
806
+ 'llm', 'scorer', 'task', 'tool', 'custom_view', 'preprocessor', 'facet', 'classifier'
807
+ ]
760
808
  """
761
809
  The type of global function. Defaults to 'scorer'.
762
810
  """
763
811
 
764
812
 
765
- FunctionTypeEnumNullish: TypeAlias = Literal['llm', 'scorer', 'task', 'tool', 'custom_view', 'preprocessor', 'facet']
813
+ FunctionTypeEnumNullish: TypeAlias = Literal[
814
+ 'llm', 'scorer', 'task', 'tool', 'custom_view', 'preprocessor', 'facet', 'classifier'
815
+ ]
766
816
 
767
817
 
768
818
  class GitMetadataSettings(TypedDict):
@@ -1674,7 +1724,18 @@ class PromptDataNullishOrigin(TypedDict):
1674
1724
  class PromptParserNullish(TypedDict):
1675
1725
  type: Literal['llm_classifier']
1676
1726
  use_cot: bool
1677
- choice_scores: Mapping[str, float]
1727
+ choice_scores: NotRequired[Mapping[str, float] | None]
1728
+ """
1729
+ Map of choices to scores (0-1). Used by scorers.
1730
+ """
1731
+ choice: NotRequired[Sequence[str] | None]
1732
+ """
1733
+ List of valid choices without score mapping. Used by classifiers that deposit output to tags.
1734
+ """
1735
+ allow_no_match: NotRequired[bool | None]
1736
+ """
1737
+ If true, adds a 'No match' option. When selected, no tag is deposited.
1738
+ """
1678
1739
 
1679
1740
 
1680
1741
  class PromptSessionEvent(TypedDict):
@@ -2104,7 +2165,7 @@ class SpanScope(TypedDict):
2104
2165
 
2105
2166
 
2106
2167
  SpanType: TypeAlias = Literal[
2107
- 'llm', 'score', 'function', 'eval', 'task', 'tool', 'automation', 'facet', 'preprocessor'
2168
+ 'llm', 'score', 'function', 'eval', 'task', 'tool', 'automation', 'facet', 'preprocessor', 'classifier'
2108
2169
  ]
2109
2170
  """
2110
2171
  Type of the span, for display purposes only
@@ -2384,6 +2445,7 @@ AsyncScoringControl: TypeAlias = (
2384
2445
  | AsyncScoringControlAsyncScoringControl3
2385
2446
  | AsyncScoringControlAsyncScoringControl4
2386
2447
  | AsyncScoringControlAsyncScoringControl5
2448
+ | AsyncScoringControlAsyncScoringControl6
2387
2449
  )
2388
2450
 
2389
2451
 
@@ -2530,6 +2592,43 @@ class DatasetEvent(TypedDict):
2530
2592
  """
2531
2593
 
2532
2594
 
2595
+ class EvalStatusPage(TypedDict):
2596
+ id: str
2597
+ """
2598
+ Unique identifier for the eval status page
2599
+ """
2600
+ project_id: str
2601
+ """
2602
+ Unique identifier for the project that the eval status page belongs under
2603
+ """
2604
+ user_id: NotRequired[str | None]
2605
+ """
2606
+ Identifies the user who created the eval status page
2607
+ """
2608
+ created: NotRequired[str | None]
2609
+ """
2610
+ Date of eval status page creation
2611
+ """
2612
+ deleted_at: NotRequired[str | None]
2613
+ """
2614
+ Date of eval status page deletion, or null if the eval status page is still active
2615
+ """
2616
+ name: str
2617
+ """
2618
+ Name of the eval status page
2619
+ """
2620
+ description: NotRequired[str | None]
2621
+ """
2622
+ Textual description of the eval status page
2623
+ """
2624
+ logo_url: NotRequired[str | None]
2625
+ """
2626
+ URL of the logo to display on the page
2627
+ """
2628
+ theme: EvalStatusPageTheme
2629
+ config: EvalStatusPageConfig
2630
+
2631
+
2533
2632
  class Experiment(TypedDict):
2534
2633
  id: str
2535
2634
  """
@@ -3228,6 +3327,7 @@ class View(TypedDict):
3228
3327
  'prompts',
3229
3328
  'tools',
3230
3329
  'scorers',
3330
+ 'classifiers',
3231
3331
  'logs',
3232
3332
  'monitor',
3233
3333
  'for_review',
@@ -0,0 +1,179 @@
1
+ """
2
+ Auto-instrumentation for AI/ML libraries.
3
+
4
+ Provides one-line instrumentation for supported libraries.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from contextlib import contextmanager
11
+
12
+ __all__ = ["auto_instrument"]
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ @contextmanager
18
+ def _try_patch():
19
+ """Context manager that suppresses ImportError and logs other exceptions."""
20
+ try:
21
+ yield
22
+ except ImportError:
23
+ pass
24
+ except Exception:
25
+ logger.exception("Failed to instrument")
26
+
27
+
28
+ def auto_instrument(
29
+ *,
30
+ openai: bool = True,
31
+ anthropic: bool = True,
32
+ litellm: bool = True,
33
+ pydantic_ai: bool = True,
34
+ google_genai: bool = True,
35
+ agno: bool = True,
36
+ claude_agent_sdk: bool = True,
37
+ dspy: bool = True,
38
+ ) -> dict[str, bool]:
39
+ """
40
+ Auto-instrument supported AI/ML libraries for Braintrust tracing.
41
+
42
+ Safe to call multiple times - already instrumented libraries are skipped.
43
+
44
+ Note on import order: If you use `from openai import OpenAI` style imports,
45
+ call auto_instrument() first. If you use `import openai` style imports,
46
+ order doesn't matter since attribute lookup happens dynamically.
47
+
48
+ Args:
49
+ openai: Enable OpenAI instrumentation (default: True)
50
+ anthropic: Enable Anthropic instrumentation (default: True)
51
+ litellm: Enable LiteLLM instrumentation (default: True)
52
+ pydantic_ai: Enable Pydantic AI instrumentation (default: True)
53
+ google_genai: Enable Google GenAI instrumentation (default: True)
54
+ agno: Enable Agno instrumentation (default: True)
55
+ claude_agent_sdk: Enable Claude Agent SDK instrumentation (default: True)
56
+ dspy: Enable DSPy instrumentation (default: True)
57
+
58
+ Returns:
59
+ Dict mapping integration name to whether it was successfully instrumented.
60
+
61
+ Example:
62
+ ```python
63
+ import braintrust
64
+ braintrust.auto_instrument()
65
+
66
+ # OpenAI
67
+ import openai
68
+ client = openai.OpenAI()
69
+ client.chat.completions.create(model="gpt-4o-mini", messages=[...])
70
+
71
+ # Anthropic
72
+ import anthropic
73
+ client = anthropic.Anthropic()
74
+ client.messages.create(model="claude-sonnet-4-20250514", messages=[...])
75
+
76
+ # LiteLLM
77
+ import litellm
78
+ litellm.completion(model="gpt-4o-mini", messages=[...])
79
+
80
+ # DSPy
81
+ import dspy
82
+ lm = dspy.LM("openai/gpt-4o-mini")
83
+ dspy.configure(lm=lm)
84
+
85
+ # Pydantic AI
86
+ from pydantic_ai import Agent
87
+ agent = Agent("openai:gpt-4o-mini")
88
+ result = agent.run_sync("Hello!")
89
+
90
+ # Google GenAI
91
+ from google.genai import Client
92
+ client = Client()
93
+ client.models.generate_content(model="gemini-2.0-flash", contents="Hello!")
94
+ ```
95
+ """
96
+ results = {}
97
+
98
+ if openai:
99
+ results["openai"] = _instrument_openai()
100
+ if anthropic:
101
+ results["anthropic"] = _instrument_anthropic()
102
+ if litellm:
103
+ results["litellm"] = _instrument_litellm()
104
+ if pydantic_ai:
105
+ results["pydantic_ai"] = _instrument_pydantic_ai()
106
+ if google_genai:
107
+ results["google_genai"] = _instrument_google_genai()
108
+ if agno:
109
+ results["agno"] = _instrument_agno()
110
+ if claude_agent_sdk:
111
+ results["claude_agent_sdk"] = _instrument_claude_agent_sdk()
112
+ if dspy:
113
+ results["dspy"] = _instrument_dspy()
114
+
115
+ return results
116
+
117
+
118
+ def _instrument_openai() -> bool:
119
+ with _try_patch():
120
+ from braintrust.oai import patch_openai
121
+
122
+ return patch_openai()
123
+ return False
124
+
125
+
126
+ def _instrument_anthropic() -> bool:
127
+ with _try_patch():
128
+ from braintrust.wrappers.anthropic import patch_anthropic
129
+
130
+ return patch_anthropic()
131
+ return False
132
+
133
+
134
+ def _instrument_litellm() -> bool:
135
+ with _try_patch():
136
+ from braintrust.wrappers.litellm import patch_litellm
137
+
138
+ return patch_litellm()
139
+ return False
140
+
141
+
142
+ def _instrument_pydantic_ai() -> bool:
143
+ with _try_patch():
144
+ from braintrust.wrappers.pydantic_ai import setup_pydantic_ai
145
+
146
+ return setup_pydantic_ai()
147
+ return False
148
+
149
+
150
+ def _instrument_google_genai() -> bool:
151
+ with _try_patch():
152
+ from braintrust.wrappers.google_genai import setup_genai
153
+
154
+ return setup_genai()
155
+ return False
156
+
157
+
158
+ def _instrument_agno() -> bool:
159
+ with _try_patch():
160
+ from braintrust.wrappers.agno import setup_agno
161
+
162
+ return setup_agno()
163
+ return False
164
+
165
+
166
+ def _instrument_claude_agent_sdk() -> bool:
167
+ with _try_patch():
168
+ from braintrust.wrappers.claude_agent_sdk import setup_claude_agent_sdk
169
+
170
+ return setup_claude_agent_sdk()
171
+ return False
172
+
173
+
174
+ def _instrument_dspy() -> bool:
175
+ with _try_patch():
176
+ from braintrust.wrappers.dspy import patch_dspy
177
+
178
+ return patch_dspy()
179
+ return False
@@ -48,16 +48,29 @@ def reset_braintrust_state():
48
48
  logger._state = logger.BraintrustState()
49
49
 
50
50
 
51
- @pytest.fixture(scope="session")
52
- def vcr_config():
51
+ @pytest.fixture(autouse=True)
52
+ def skip_vcr_tests_in_wheel_mode(request):
53
+ """Skip VCR tests when running from an installed wheel.
54
+
55
+ Wheel mode (BRAINTRUST_TESTING_WHEEL=1) is a pre-release sanity check
56
+ that verifies the built package installs and runs correctly. It's not
57
+ intended to be a full test suite - VCR cassettes are not included in
58
+ the wheel, so we skip those tests here. The full test suite with VCR
59
+ tests runs against source code during normal CI.
60
+ """
61
+ if os.environ.get("BRAINTRUST_TESTING_WHEEL") == "1":
62
+ if request.node.get_closest_marker("vcr"):
63
+ pytest.skip("VCR tests skipped in wheel mode (pre-release sanity check only)")
64
+
65
+
66
+ def get_vcr_config():
53
67
  """
54
- VCR configuration for recording/playing back HTTP interactions.
68
+ Get VCR configuration for recording/playing back HTTP interactions.
55
69
 
56
70
  In CI, use "none" to fail if cassette is missing.
57
71
  Locally, use "once" to record new cassettes if they don't exist.
58
72
  """
59
73
  record_mode = "none" if (os.environ.get("CI") or os.environ.get("GITHUB_ACTIONS")) else "once"
60
-
61
74
  return {
62
75
  "record_mode": record_mode,
63
76
  "filter_headers": [
@@ -70,3 +83,9 @@ def vcr_config():
70
83
  "x-bt-auth-token",
71
84
  ],
72
85
  }
86
+
87
+
88
+ @pytest.fixture(scope="session")
89
+ def vcr_config():
90
+ """Pytest fixture wrapper for get_vcr_config()."""
91
+ return get_vcr_config()
@@ -673,6 +673,7 @@ def _EvalCommon(
673
673
  stream: Callable[[SSEProgressEvent], None] | None = None,
674
674
  parent: str | None = None,
675
675
  state: BraintrustState | None = None,
676
+ enable_cache: bool = True,
676
677
  ) -> Callable[[], Coroutine[Any, Any, EvalResultWithSummary[Input, Output]]]:
677
678
  """
678
679
  This helper is needed because in case of `_lazy_load`, we need to update
@@ -759,7 +760,7 @@ def _EvalCommon(
759
760
  async def run_to_completion():
760
761
  with parent_context(parent, state):
761
762
  try:
762
- ret = await run_evaluator(experiment, evaluator, 0, [], stream, state)
763
+ ret = await run_evaluator(experiment, evaluator, 0, [], stream, state, enable_cache)
763
764
  reporter.report_eval(evaluator, ret, verbose=True, jsonl=False)
764
765
  return ret
765
766
  finally:
@@ -798,6 +799,7 @@ async def EvalAsync(
798
799
  stream: Callable[[SSEProgressEvent], None] | None = None,
799
800
  parent: str | None = None,
800
801
  state: BraintrustState | None = None,
802
+ enable_cache: bool = True,
801
803
  ) -> EvalResultWithSummary[Input, Output]:
802
804
  """
803
805
  A function you can use to define an evaluator. This is a convenience wrapper around the `Evaluator` class.
@@ -855,6 +857,8 @@ async def EvalAsync(
855
857
  :param parent: If specified, instead of creating a new experiment object, the Eval() will populate
856
858
  the object or span specified by this parent.
857
859
  :param state: Optional BraintrustState to use for the evaluation. If not specified, the global login state will be used.
860
+ :param enable_cache: Whether to enable the span cache for this evaluation. Defaults to True. The span cache stores
861
+ span data on disk to minimize memory usage and allow scorers to read spans without server round-trips.
858
862
  :return: An `EvalResultWithSummary` object, which contains all results and a summary.
859
863
  """
860
864
  f = _EvalCommon(
@@ -883,6 +887,7 @@ async def EvalAsync(
883
887
  stream=stream,
884
888
  parent=parent,
885
889
  state=state,
890
+ enable_cache=enable_cache,
886
891
  )
887
892
 
888
893
  return await f()
@@ -918,6 +923,7 @@ def Eval(
918
923
  stream: Callable[[SSEProgressEvent], None] | None = None,
919
924
  parent: str | None = None,
920
925
  state: BraintrustState | None = None,
926
+ enable_cache: bool = True,
921
927
  ) -> EvalResultWithSummary[Input, Output]:
922
928
  """
923
929
  A function you can use to define an evaluator. This is a convenience wrapper around the `Evaluator` class.
@@ -975,6 +981,8 @@ def Eval(
975
981
  :param parent: If specified, instead of creating a new experiment object, the Eval() will populate
976
982
  the object or span specified by this parent.
977
983
  :param state: Optional BraintrustState to use for the evaluation. If not specified, the global login state will be used.
984
+ :param enable_cache: Whether to enable the span cache for this evaluation. Defaults to True. The span cache stores
985
+ span data on disk to minimize memory usage and allow scorers to read spans without server round-trips.
978
986
  :return: An `EvalResultWithSummary` object, which contains all results and a summary.
979
987
  """
980
988
 
@@ -1005,6 +1013,7 @@ def Eval(
1005
1013
  stream=stream,
1006
1014
  parent=parent,
1007
1015
  state=state,
1016
+ enable_cache=enable_cache,
1008
1017
  )
1009
1018
 
1010
1019
  # https://stackoverflow.com/questions/55409641/asyncio-run-cannot-be-called-from-a-running-event-loop-when-using-jupyter-no
@@ -1249,10 +1258,11 @@ async def run_evaluator(
1249
1258
  filters: list[Filter],
1250
1259
  stream: Callable[[SSEProgressEvent], None] | None = None,
1251
1260
  state: BraintrustState | None = None,
1261
+ enable_cache: bool = True,
1252
1262
  ) -> EvalResultWithSummary[Input, Output]:
1253
1263
  """Wrapper on _run_evaluator_internal that times out execution after evaluator.timeout."""
1254
1264
  results = await asyncio.wait_for(
1255
- _run_evaluator_internal(experiment, evaluator, position, filters, stream, state), evaluator.timeout
1265
+ _run_evaluator_internal(experiment, evaluator, position, filters, stream, state, enable_cache), evaluator.timeout
1256
1266
  )
1257
1267
 
1258
1268
  if experiment:
@@ -1280,6 +1290,32 @@ async def _run_evaluator_internal(
1280
1290
  filters: list[Filter],
1281
1291
  stream: Callable[[SSEProgressEvent], None] | None = None,
1282
1292
  state: BraintrustState | None = None,
1293
+ enable_cache: bool = True,
1294
+ ):
1295
+ # Start span cache for this eval (it's disabled by default to avoid temp files outside of evals)
1296
+ if state is None:
1297
+ from braintrust.logger import _internal_get_global_state
1298
+
1299
+ state = _internal_get_global_state()
1300
+
1301
+ if enable_cache:
1302
+ state.span_cache.start()
1303
+ try:
1304
+ return await _run_evaluator_internal_impl(experiment, evaluator, position, filters, stream, state)
1305
+ finally:
1306
+ # Clean up disk-based span cache after eval completes and stop caching
1307
+ if enable_cache:
1308
+ state.span_cache.dispose()
1309
+ state.span_cache.stop()
1310
+
1311
+
1312
+ async def _run_evaluator_internal_impl(
1313
+ experiment,
1314
+ evaluator: Evaluator,
1315
+ position: int | None,
1316
+ filters: list[Filter],
1317
+ stream: Callable[[SSEProgressEvent], None] | None = None,
1318
+ state: BraintrustState | None = None,
1283
1319
  ):
1284
1320
  event_loop = asyncio.get_event_loop()
1285
1321
 
@@ -1290,11 +1326,13 @@ async def _run_evaluator_internal(
1290
1326
  {**parent_propagated},
1291
1327
  {"span_attributes": {"purpose": "scorer"}},
1292
1328
  )
1329
+ # Strip trace from logged input - it's internal plumbing that shouldn't appear in spans
1330
+ logged_input = {k: v for k, v in kwargs.items() if k != "trace"}
1293
1331
  with root_span.start_span(
1294
1332
  name=name,
1295
1333
  span_attributes={"type": SpanTypeAttribute.SCORE, "purpose": "scorer"},
1296
1334
  propagated_event=merged_propagated,
1297
- input=dict(**kwargs),
1335
+ input=logged_input,
1298
1336
  ) as span:
1299
1337
  score = scorer
1300
1338
  if hasattr(scorer, "eval_async"):
@@ -1415,6 +1453,77 @@ async def _run_evaluator_internal(
1415
1453
  tags = hooks.tags if hooks.tags else None
1416
1454
  root_span.log(output=output, metadata=metadata, tags=tags)
1417
1455
 
1456
+ # Create trace object for scorers
1457
+ from braintrust.trace import LocalTrace
1458
+
1459
+ async def ensure_spans_flushed():
1460
+ # Flush native Braintrust spans
1461
+ if experiment:
1462
+ await asyncio.get_event_loop().run_in_executor(
1463
+ None, lambda: experiment.state.flush()
1464
+ )
1465
+ elif state:
1466
+ await asyncio.get_event_loop().run_in_executor(None, lambda: state.flush())
1467
+ else:
1468
+ from braintrust.logger import flush as flush_logger
1469
+
1470
+ await asyncio.get_event_loop().run_in_executor(None, flush_logger)
1471
+
1472
+ # Also flush OTEL spans if registered
1473
+ if state:
1474
+ await state.flush_otel()
1475
+
1476
+ experiment_id = None
1477
+ if experiment:
1478
+ try:
1479
+ experiment_id = experiment.id
1480
+ except:
1481
+ experiment_id = None
1482
+
1483
+ trace = None
1484
+ if state or experiment:
1485
+ # Get the state to use
1486
+ trace_state = state
1487
+ if not trace_state and experiment:
1488
+ trace_state = experiment.state
1489
+ if not trace_state:
1490
+ # Fall back to global state
1491
+ from braintrust.logger import _internal_get_global_state
1492
+
1493
+ trace_state = _internal_get_global_state()
1494
+
1495
+ # Access root_span_id from the concrete SpanImpl instance
1496
+ # The Span interface doesn't expose this but SpanImpl has it
1497
+ root_span_id_value = getattr(root_span, "root_span_id", root_span.id)
1498
+
1499
+ # Check if there's a parent in the context to determine object_type and object_id
1500
+ from braintrust.span_identifier_v3 import SpanComponentsV3, span_object_type_v3_to_typed_string
1501
+
1502
+ parent_str = trace_state.current_parent.get()
1503
+ parent_components = None
1504
+ if parent_str:
1505
+ try:
1506
+ parent_components = SpanComponentsV3.from_str(parent_str)
1507
+ except Exception:
1508
+ # If parsing fails, parent_components stays None
1509
+ pass
1510
+
1511
+ # Determine object_type and object_id based on parent or experiment
1512
+ if parent_components:
1513
+ trace_object_type = span_object_type_v3_to_typed_string(parent_components.object_type)
1514
+ trace_object_id = parent_components.object_id or ""
1515
+ else:
1516
+ trace_object_type = "experiment"
1517
+ trace_object_id = experiment_id or ""
1518
+
1519
+ trace = LocalTrace(
1520
+ object_type=trace_object_type,
1521
+ object_id=trace_object_id,
1522
+ root_span_id=root_span_id_value,
1523
+ ensure_spans_flushed=ensure_spans_flushed,
1524
+ state=trace_state,
1525
+ )
1526
+
1418
1527
  score_promises = [
1419
1528
  asyncio.create_task(
1420
1529
  await_or_run_scorer(
@@ -1426,6 +1535,7 @@ async def _run_evaluator_internal(
1426
1535
  "expected": datum.expected,
1427
1536
  "metadata": metadata,
1428
1537
  "output": output,
1538
+ "trace": trace,
1429
1539
  },
1430
1540
  )
1431
1541
  )
@@ -3,7 +3,7 @@ from typing import Any, Literal, TypedDict, TypeVar, overload
3
3
  from sseclient import SSEClient
4
4
 
5
5
  from .._generated_types import FunctionTypeEnum
6
- from ..logger import Exportable, get_span_parent_object, login, proxy_conn
6
+ from ..logger import Exportable, _internal_get_global_state, get_span_parent_object, login, proxy_conn
7
7
  from ..util import response_raise_for_status
8
8
  from .constants import INVOKE_API_VERSION
9
9
  from .stream import BraintrustInvokeError, BraintrustStream
@@ -243,6 +243,8 @@ def init_function(project_name: str, slug: str, version: str | None = None):
243
243
  :param version: Optional version of the function to use. Defaults to latest.
244
244
  :return: A function that can be used as a task or scorer.
245
245
  """
246
+ # Disable span cache since remote function spans won't be in the local cache
247
+ _internal_get_global_state().span_cache.disable()
246
248
 
247
249
  def f(*args: Any, **kwargs: Any) -> Any:
248
250
  if len(args) > 0: