braintrust 0.4.3__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. {braintrust-0.4.3 → braintrust-0.5.0}/PKG-INFO +1 -1
  2. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/_generated_types.py +106 -6
  3. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/framework.py +98 -1
  4. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/functions/invoke.py +3 -1
  5. braintrust-0.5.0/src/braintrust/functions/test_invoke.py +61 -0
  6. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/generated_types.py +7 -1
  7. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/logger.py +78 -32
  8. braintrust-0.5.0/src/braintrust/span_cache.py +337 -0
  9. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/span_identifier_v3.py +21 -0
  10. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_logger.py +116 -0
  11. braintrust-0.5.0/src/braintrust/test_span_cache.py +344 -0
  12. braintrust-0.5.0/src/braintrust/test_trace.py +267 -0
  13. braintrust-0.5.0/src/braintrust/trace.py +385 -0
  14. braintrust-0.5.0/src/braintrust/version.py +4 -0
  15. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py +48 -6
  16. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py +106 -0
  17. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust.egg-info/PKG-INFO +1 -1
  18. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust.egg-info/SOURCES.txt +5 -0
  19. braintrust-0.4.3/src/braintrust/version.py +0 -4
  20. {braintrust-0.4.3 → braintrust-0.5.0}/README.md +0 -0
  21. {braintrust-0.4.3 → braintrust-0.5.0}/setup.cfg +0 -0
  22. {braintrust-0.4.3 → braintrust-0.5.0}/setup.py +0 -0
  23. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/__init__.py +0 -0
  24. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/audit.py +0 -0
  25. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/aws.py +0 -0
  26. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/bt_json.py +0 -0
  27. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/__init__.py +0 -0
  28. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/__main__.py +0 -0
  29. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/eval.py +0 -0
  30. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/install/__init__.py +0 -0
  31. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/install/api.py +0 -0
  32. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/install/bump_versions.py +0 -0
  33. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/install/logs.py +0 -0
  34. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/install/redshift.py +0 -0
  35. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/install/run_migrations.py +0 -0
  36. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/push.py +0 -0
  37. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/conftest.py +0 -0
  38. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/context.py +0 -0
  39. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/contrib/__init__.py +0 -0
  40. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/contrib/temporal/__init__.py +0 -0
  41. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/contrib/temporal/test_temporal.py +0 -0
  42. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/db_fields.py +0 -0
  43. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/__init__.py +0 -0
  44. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/auth.py +0 -0
  45. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/cache.py +0 -0
  46. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/cors.py +0 -0
  47. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/dataset.py +0 -0
  48. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/eval_hooks.py +0 -0
  49. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/schemas.py +0 -0
  50. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/server.py +0 -0
  51. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/test_cached_login.py +0 -0
  52. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/test_lru_cache.py +0 -0
  53. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/test_server_integration.py +0 -0
  54. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/framework2.py +0 -0
  55. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/functions/__init__.py +0 -0
  56. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/functions/constants.py +0 -0
  57. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/functions/stream.py +0 -0
  58. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/git_fields.py +0 -0
  59. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/gitutil.py +0 -0
  60. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/graph_util.py +0 -0
  61. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/http_headers.py +0 -0
  62. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/id_gen.py +0 -0
  63. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/merge_row_batch.py +0 -0
  64. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/oai.py +0 -0
  65. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/object.py +0 -0
  66. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/otel/__init__.py +0 -0
  67. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/otel/context.py +0 -0
  68. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/otel/test_distributed_tracing.py +0 -0
  69. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/otel/test_otel_bt_integration.py +0 -0
  70. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/parameters.py +0 -0
  71. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/prompt.py +0 -0
  72. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/prompt_cache/__init__.py +0 -0
  73. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/prompt_cache/disk_cache.py +0 -0
  74. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/prompt_cache/lru_cache.py +0 -0
  75. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/prompt_cache/prompt_cache.py +0 -0
  76. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/prompt_cache/test_disk_cache.py +0 -0
  77. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/prompt_cache/test_lru_cache.py +0 -0
  78. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/prompt_cache/test_prompt_cache.py +0 -0
  79. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/py.typed +0 -0
  80. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/queue.py +0 -0
  81. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/resource_manager.py +0 -0
  82. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/score.py +0 -0
  83. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/serializable_data_class.py +0 -0
  84. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/span_identifier_v1.py +0 -0
  85. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/span_identifier_v2.py +0 -0
  86. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/span_identifier_v4.py +0 -0
  87. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/span_types.py +0 -0
  88. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_bt_json.py +0 -0
  89. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_framework.py +0 -0
  90. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_framework2.py +0 -0
  91. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_helpers.py +0 -0
  92. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_id_gen.py +0 -0
  93. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_otel.py +0 -0
  94. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_queue.py +0 -0
  95. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_score.py +0 -0
  96. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_serializable_data_class.py +0 -0
  97. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_span_components.py +0 -0
  98. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_util.py +0 -0
  99. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_version.py +0 -0
  100. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/util.py +0 -0
  101. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/__init__.py +0 -0
  102. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/_anthropic_utils.py +0 -0
  103. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/agno/__init__.py +0 -0
  104. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/agno/agent.py +0 -0
  105. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/agno/function_call.py +0 -0
  106. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/agno/model.py +0 -0
  107. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/agno/team.py +0 -0
  108. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/agno/utils.py +0 -0
  109. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/anthropic.py +0 -0
  110. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/claude_agent_sdk/__init__.py +0 -0
  111. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/dspy.py +0 -0
  112. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/google_genai/__init__.py +0 -0
  113. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/langchain.py +0 -0
  114. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/langsmith_wrapper.py +0 -0
  115. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/litellm.py +0 -0
  116. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/openai.py +0 -0
  117. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/pydantic_ai.py +0 -0
  118. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_agno.py +0 -0
  119. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_anthropic.py +0 -0
  120. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_dspy.py +0 -0
  121. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_google_genai.py +0 -0
  122. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_langsmith_wrapper.py +0 -0
  123. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_litellm.py +0 -0
  124. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_oai_attachments.py +0 -0
  125. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_openai.py +0 -0
  126. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_openrouter.py +0 -0
  127. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_pydantic_ai_integration.py +0 -0
  128. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_pydantic_ai_wrap_openai.py +0 -0
  129. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_utils.py +0 -0
  130. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/xact_ids.py +0 -0
  131. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust.egg-info/dependency_links.txt +0 -0
  132. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust.egg-info/entry_points.txt +0 -0
  133. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust.egg-info/requires.txt +0 -0
  134. {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: braintrust
3
- Version: 0.4.3
3
+ Version: 0.5.0
4
4
  Summary: SDK for integrating Braintrust
5
5
  Home-page: https://www.braintrust.dev
6
6
  Author: Braintrust
@@ -144,6 +144,11 @@ class AsyncScoringControlAsyncScoringControl5(TypedDict):
144
144
  triggered_xact_id: str
145
145
 
146
146
 
147
+ class AsyncScoringControlAsyncScoringControl6(TypedDict):
148
+ kind: Literal['mark_attempt_failed']
149
+ function_ids: Sequence[Any]
150
+
151
+
147
152
  class AsyncScoringStateAsyncScoringState(TypedDict):
148
153
  status: Literal['enabled']
149
154
  token: str
@@ -484,6 +489,10 @@ class Dataset(TypedDict):
484
489
  """
485
490
  User-controlled metadata about the dataset
486
491
  """
492
+ url_slug: str
493
+ """
494
+ URL slug for the dataset. used to construct dataset URLs
495
+ """
487
496
 
488
497
 
489
498
  class DatasetEventMetadata(TypedDict):
@@ -532,6 +541,43 @@ class EnvVar(TypedDict):
532
541
  """
533
542
 
534
543
 
544
+ class EvalStatusPageConfig(TypedDict):
545
+ score_columns: NotRequired[Sequence[str] | None]
546
+ """
547
+ The score columns to display on the page
548
+ """
549
+ metric_columns: NotRequired[Sequence[str] | None]
550
+ """
551
+ The metric columns to display on the page
552
+ """
553
+ grouping_field: NotRequired[str | None]
554
+ """
555
+ The metadata field to use for grouping experiments (model)
556
+ """
557
+ filter: NotRequired[str | None]
558
+ """
559
+ BTQL filter to apply to experiment data
560
+ """
561
+ sort_by: NotRequired[str | None]
562
+ """
563
+ Field to sort results by (format: 'score:<name>' or 'metric:<name>')
564
+ """
565
+ sort_order: NotRequired[Literal['asc', 'desc'] | None]
566
+ """
567
+ Sort order (ascending or descending)
568
+ """
569
+ api_key: NotRequired[str | None]
570
+ """
571
+ The API key used for fetching experiment data
572
+ """
573
+
574
+
575
+ EvalStatusPageTheme: TypeAlias = Literal['light', 'dark']
576
+ """
577
+ The theme for the page
578
+ """
579
+
580
+
535
581
  class ExperimentEventMetadata(TypedDict):
536
582
  model: NotRequired[str | None]
537
583
  """
@@ -749,20 +795,24 @@ FunctionIdRef: TypeAlias = Mapping[str, Any]
749
795
 
750
796
 
751
797
  FunctionObjectType: TypeAlias = Literal[
752
- 'prompt', 'tool', 'scorer', 'task', 'custom_view', 'preprocessor', 'facet'
798
+ 'prompt', 'tool', 'scorer', 'task', 'workflow', 'custom_view', 'preprocessor', 'facet', 'classifier'
753
799
  ]
754
800
 
755
801
 
756
- FunctionOutputType: TypeAlias = Literal['completion', 'score', 'any']
802
+ FunctionOutputType: TypeAlias = Literal['completion', 'score', 'facet', 'classification', 'any']
757
803
 
758
804
 
759
- FunctionTypeEnum: TypeAlias = Literal['llm', 'scorer', 'task', 'tool', 'custom_view', 'preprocessor', 'facet']
805
+ FunctionTypeEnum: TypeAlias = Literal[
806
+ 'llm', 'scorer', 'task', 'tool', 'custom_view', 'preprocessor', 'facet', 'classifier'
807
+ ]
760
808
  """
761
809
  The type of global function. Defaults to 'scorer'.
762
810
  """
763
811
 
764
812
 
765
- FunctionTypeEnumNullish: TypeAlias = Literal['llm', 'scorer', 'task', 'tool', 'custom_view', 'preprocessor', 'facet']
813
+ FunctionTypeEnumNullish: TypeAlias = Literal[
814
+ 'llm', 'scorer', 'task', 'tool', 'custom_view', 'preprocessor', 'facet', 'classifier'
815
+ ]
766
816
 
767
817
 
768
818
  class GitMetadataSettings(TypedDict):
@@ -1674,7 +1724,18 @@ class PromptDataNullishOrigin(TypedDict):
1674
1724
  class PromptParserNullish(TypedDict):
1675
1725
  type: Literal['llm_classifier']
1676
1726
  use_cot: bool
1677
- choice_scores: Mapping[str, float]
1727
+ choice_scores: NotRequired[Mapping[str, float] | None]
1728
+ """
1729
+ Map of choices to scores (0-1). Used by scorers.
1730
+ """
1731
+ choice: NotRequired[Sequence[str] | None]
1732
+ """
1733
+ List of valid choices without score mapping. Used by classifiers that deposit output to tags.
1734
+ """
1735
+ allow_no_match: NotRequired[bool | None]
1736
+ """
1737
+ If true, adds a 'No match' option. When selected, no tag is deposited.
1738
+ """
1678
1739
 
1679
1740
 
1680
1741
  class PromptSessionEvent(TypedDict):
@@ -2104,7 +2165,7 @@ class SpanScope(TypedDict):
2104
2165
 
2105
2166
 
2106
2167
  SpanType: TypeAlias = Literal[
2107
- 'llm', 'score', 'function', 'eval', 'task', 'tool', 'automation', 'facet', 'preprocessor'
2168
+ 'llm', 'score', 'function', 'eval', 'task', 'tool', 'automation', 'facet', 'preprocessor', 'classifier'
2108
2169
  ]
2109
2170
  """
2110
2171
  Type of the span, for display purposes only
@@ -2384,6 +2445,7 @@ AsyncScoringControl: TypeAlias = (
2384
2445
  | AsyncScoringControlAsyncScoringControl3
2385
2446
  | AsyncScoringControlAsyncScoringControl4
2386
2447
  | AsyncScoringControlAsyncScoringControl5
2448
+ | AsyncScoringControlAsyncScoringControl6
2387
2449
  )
2388
2450
 
2389
2451
 
@@ -2530,6 +2592,43 @@ class DatasetEvent(TypedDict):
2530
2592
  """
2531
2593
 
2532
2594
 
2595
+ class EvalStatusPage(TypedDict):
2596
+ id: str
2597
+ """
2598
+ Unique identifier for the eval status page
2599
+ """
2600
+ project_id: str
2601
+ """
2602
+ Unique identifier for the project that the eval status page belongs under
2603
+ """
2604
+ user_id: NotRequired[str | None]
2605
+ """
2606
+ Identifies the user who created the eval status page
2607
+ """
2608
+ created: NotRequired[str | None]
2609
+ """
2610
+ Date of eval status page creation
2611
+ """
2612
+ deleted_at: NotRequired[str | None]
2613
+ """
2614
+ Date of eval status page deletion, or null if the eval status page is still active
2615
+ """
2616
+ name: str
2617
+ """
2618
+ Name of the eval status page
2619
+ """
2620
+ description: NotRequired[str | None]
2621
+ """
2622
+ Textual description of the eval status page
2623
+ """
2624
+ logo_url: NotRequired[str | None]
2625
+ """
2626
+ URL of the logo to display on the page
2627
+ """
2628
+ theme: EvalStatusPageTheme
2629
+ config: EvalStatusPageConfig
2630
+
2631
+
2533
2632
  class Experiment(TypedDict):
2534
2633
  id: str
2535
2634
  """
@@ -3228,6 +3327,7 @@ class View(TypedDict):
3228
3327
  'prompts',
3229
3328
  'tools',
3230
3329
  'scorers',
3330
+ 'classifiers',
3231
3331
  'logs',
3232
3332
  'monitor',
3233
3333
  'for_review',
@@ -1280,6 +1280,29 @@ async def _run_evaluator_internal(
1280
1280
  filters: list[Filter],
1281
1281
  stream: Callable[[SSEProgressEvent], None] | None = None,
1282
1282
  state: BraintrustState | None = None,
1283
+ ):
1284
+ # Start span cache for this eval (it's disabled by default to avoid temp files outside of evals)
1285
+ if state is None:
1286
+ from braintrust.logger import _internal_get_global_state
1287
+
1288
+ state = _internal_get_global_state()
1289
+
1290
+ state.span_cache.start()
1291
+ try:
1292
+ return await _run_evaluator_internal_impl(experiment, evaluator, position, filters, stream, state)
1293
+ finally:
1294
+ # Clean up disk-based span cache after eval completes and stop caching
1295
+ state.span_cache.dispose()
1296
+ state.span_cache.stop()
1297
+
1298
+
1299
+ async def _run_evaluator_internal_impl(
1300
+ experiment,
1301
+ evaluator: Evaluator,
1302
+ position: int | None,
1303
+ filters: list[Filter],
1304
+ stream: Callable[[SSEProgressEvent], None] | None = None,
1305
+ state: BraintrustState | None = None,
1283
1306
  ):
1284
1307
  event_loop = asyncio.get_event_loop()
1285
1308
 
@@ -1290,11 +1313,13 @@ async def _run_evaluator_internal(
1290
1313
  {**parent_propagated},
1291
1314
  {"span_attributes": {"purpose": "scorer"}},
1292
1315
  )
1316
+ # Strip trace from logged input - it's internal plumbing that shouldn't appear in spans
1317
+ logged_input = {k: v for k, v in kwargs.items() if k != "trace"}
1293
1318
  with root_span.start_span(
1294
1319
  name=name,
1295
1320
  span_attributes={"type": SpanTypeAttribute.SCORE, "purpose": "scorer"},
1296
1321
  propagated_event=merged_propagated,
1297
- input=dict(**kwargs),
1322
+ input=logged_input,
1298
1323
  ) as span:
1299
1324
  score = scorer
1300
1325
  if hasattr(scorer, "eval_async"):
@@ -1415,6 +1440,77 @@ async def _run_evaluator_internal(
1415
1440
  tags = hooks.tags if hooks.tags else None
1416
1441
  root_span.log(output=output, metadata=metadata, tags=tags)
1417
1442
 
1443
+ # Create trace object for scorers
1444
+ from braintrust.trace import LocalTrace
1445
+
1446
+ async def ensure_spans_flushed():
1447
+ # Flush native Braintrust spans
1448
+ if experiment:
1449
+ await asyncio.get_event_loop().run_in_executor(
1450
+ None, lambda: experiment.state.flush()
1451
+ )
1452
+ elif state:
1453
+ await asyncio.get_event_loop().run_in_executor(None, lambda: state.flush())
1454
+ else:
1455
+ from braintrust.logger import flush as flush_logger
1456
+
1457
+ await asyncio.get_event_loop().run_in_executor(None, flush_logger)
1458
+
1459
+ # Also flush OTEL spans if registered
1460
+ if state:
1461
+ await state.flush_otel()
1462
+
1463
+ experiment_id = None
1464
+ if experiment:
1465
+ try:
1466
+ experiment_id = experiment.id
1467
+ except:
1468
+ experiment_id = None
1469
+
1470
+ trace = None
1471
+ if state or experiment:
1472
+ # Get the state to use
1473
+ trace_state = state
1474
+ if not trace_state and experiment:
1475
+ trace_state = experiment.state
1476
+ if not trace_state:
1477
+ # Fall back to global state
1478
+ from braintrust.logger import _internal_get_global_state
1479
+
1480
+ trace_state = _internal_get_global_state()
1481
+
1482
+ # Access root_span_id from the concrete SpanImpl instance
1483
+ # The Span interface doesn't expose this but SpanImpl has it
1484
+ root_span_id_value = getattr(root_span, "root_span_id", root_span.id)
1485
+
1486
+ # Check if there's a parent in the context to determine object_type and object_id
1487
+ from braintrust.span_identifier_v3 import SpanComponentsV3, span_object_type_v3_to_typed_string
1488
+
1489
+ parent_str = trace_state.current_parent.get()
1490
+ parent_components = None
1491
+ if parent_str:
1492
+ try:
1493
+ parent_components = SpanComponentsV3.from_str(parent_str)
1494
+ except Exception:
1495
+ # If parsing fails, parent_components stays None
1496
+ pass
1497
+
1498
+ # Determine object_type and object_id based on parent or experiment
1499
+ if parent_components:
1500
+ trace_object_type = span_object_type_v3_to_typed_string(parent_components.object_type)
1501
+ trace_object_id = parent_components.object_id or ""
1502
+ else:
1503
+ trace_object_type = "experiment"
1504
+ trace_object_id = experiment_id or ""
1505
+
1506
+ trace = LocalTrace(
1507
+ object_type=trace_object_type,
1508
+ object_id=trace_object_id,
1509
+ root_span_id=root_span_id_value,
1510
+ ensure_spans_flushed=ensure_spans_flushed,
1511
+ state=trace_state,
1512
+ )
1513
+
1418
1514
  score_promises = [
1419
1515
  asyncio.create_task(
1420
1516
  await_or_run_scorer(
@@ -1426,6 +1522,7 @@ async def _run_evaluator_internal(
1426
1522
  "expected": datum.expected,
1427
1523
  "metadata": metadata,
1428
1524
  "output": output,
1525
+ "trace": trace,
1429
1526
  },
1430
1527
  )
1431
1528
  )
@@ -3,7 +3,7 @@ from typing import Any, Literal, TypedDict, TypeVar, overload
3
3
  from sseclient import SSEClient
4
4
 
5
5
  from .._generated_types import FunctionTypeEnum
6
- from ..logger import Exportable, get_span_parent_object, login, proxy_conn
6
+ from ..logger import Exportable, _internal_get_global_state, get_span_parent_object, login, proxy_conn
7
7
  from ..util import response_raise_for_status
8
8
  from .constants import INVOKE_API_VERSION
9
9
  from .stream import BraintrustInvokeError, BraintrustStream
@@ -243,6 +243,8 @@ def init_function(project_name: str, slug: str, version: str | None = None):
243
243
  :param version: Optional version of the function to use. Defaults to latest.
244
244
  :return: A function that can be used as a task or scorer.
245
245
  """
246
+ # Disable span cache since remote function spans won't be in the local cache
247
+ _internal_get_global_state().span_cache.disable()
246
248
 
247
249
  def f(*args: Any, **kwargs: Any) -> Any:
248
250
  if len(args) > 0:
@@ -0,0 +1,61 @@
1
+ """Tests for the invoke module, particularly init_function."""
2
+
3
+
4
+ from braintrust.functions.invoke import init_function
5
+ from braintrust.logger import _internal_get_global_state, _internal_reset_global_state
6
+
7
+
8
+ class TestInitFunction:
9
+ """Tests for init_function."""
10
+
11
+ def setup_method(self):
12
+ """Reset state before each test."""
13
+ _internal_reset_global_state()
14
+
15
+ def teardown_method(self):
16
+ """Clean up after each test."""
17
+ _internal_reset_global_state()
18
+
19
+ def test_init_function_disables_span_cache(self):
20
+ """Test that init_function disables the span cache."""
21
+ state = _internal_get_global_state()
22
+
23
+ # Cache should be disabled by default (it's only enabled during evals)
24
+ assert state.span_cache.disabled is True
25
+
26
+ # Enable the cache (simulating what happens during eval)
27
+ state.span_cache.start()
28
+ assert state.span_cache.disabled is False
29
+
30
+ # Call init_function
31
+ f = init_function("test-project", "test-function")
32
+
33
+ # Cache should now be disabled (init_function explicitly disables it)
34
+ assert state.span_cache.disabled is True
35
+ assert f.__name__ == "init_function-test-project-test-function-latest"
36
+
37
+ def test_init_function_with_version(self):
38
+ """Test that init_function creates a function with the correct name including version."""
39
+ f = init_function("my-project", "my-scorer", version="v1")
40
+ assert f.__name__ == "init_function-my-project-my-scorer-v1"
41
+
42
+ def test_init_function_without_version_uses_latest(self):
43
+ """Test that init_function uses 'latest' in name when version not specified."""
44
+ f = init_function("my-project", "my-scorer")
45
+ assert f.__name__ == "init_function-my-project-my-scorer-latest"
46
+
47
+ def test_init_function_permanently_disables_cache(self):
48
+ """Test that init_function permanently disables the cache (can't be re-enabled)."""
49
+ state = _internal_get_global_state()
50
+
51
+ # Enable the cache
52
+ state.span_cache.start()
53
+ assert state.span_cache.disabled is False
54
+
55
+ # Call init_function
56
+ init_function("test-project", "test-function")
57
+ assert state.span_cache.disabled is True
58
+
59
+ # Try to start again - should still be disabled because of explicit disable
60
+ state.span_cache.start()
61
+ assert state.span_cache.disabled is True
@@ -1,4 +1,4 @@
1
- """Auto-generated file (internal git SHA 87ac73f4945a47eff2d4e42775ba4dbc58854c73) -- do not modify"""
1
+ """Auto-generated file (internal git SHA 21146f64bf5ad1eadd3a99d186274728e25e5399) -- do not modify"""
2
2
 
3
3
  from ._generated_types import (
4
4
  Acl,
@@ -29,6 +29,9 @@ from ._generated_types import (
29
29
  Dataset,
30
30
  DatasetEvent,
31
31
  EnvVar,
32
+ EvalStatusPage,
33
+ EvalStatusPageConfig,
34
+ EvalStatusPageTheme,
32
35
  Experiment,
33
36
  ExperimentEvent,
34
37
  ExtendedSavedFunctionId,
@@ -136,6 +139,9 @@ __all__ = [
136
139
  "Dataset",
137
140
  "DatasetEvent",
138
141
  "EnvVar",
142
+ "EvalStatusPage",
143
+ "EvalStatusPageConfig",
144
+ "EvalStatusPageTheme",
139
145
  "Experiment",
140
146
  "ExperimentEvent",
141
147
  "ExtendedSavedFunctionId",
@@ -47,12 +47,9 @@ from urllib3.util.retry import Retry
47
47
  from . import context, id_gen
48
48
  from .bt_json import bt_dumps, bt_safe_deep_copy
49
49
  from .db_fields import (
50
- ASYNC_SCORING_CONTROL_FIELD,
51
50
  AUDIT_METADATA_FIELD,
52
51
  AUDIT_SOURCE_FIELD,
53
52
  IS_MERGE_FIELD,
54
- MERGE_PATHS_FIELD,
55
- SKIP_ASYNC_SCORING_FIELD,
56
53
  TRANSACTION_ID_FIELD,
57
54
  VALID_SOURCES,
58
55
  )
@@ -101,6 +98,14 @@ from .xact_ids import prettify_xact
101
98
  Metadata = dict[str, Any]
102
99
  DATA_API_VERSION = 2
103
100
 
101
+
102
+ class DatasetRef(TypedDict, total=False):
103
+ """Reference to a dataset by ID and optional version."""
104
+
105
+ id: str
106
+ version: str
107
+
108
+
104
109
  T = TypeVar("T")
105
110
  TMapping = TypeVar("TMapping", bound=Mapping[str, Any])
106
111
  TMutableMapping = TypeVar("TMutableMapping", bound=MutableMapping[str, Any])
@@ -396,6 +401,11 @@ class BraintrustState:
396
401
  ),
397
402
  )
398
403
 
404
+ from braintrust.span_cache import SpanCache
405
+
406
+ self.span_cache = SpanCache()
407
+ self._otel_flush_callback: Any | None = None
408
+
399
409
  def reset_login_info(self):
400
410
  self.app_url: str | None = None
401
411
  self.app_public_url: str | None = None
@@ -452,6 +462,21 @@ class BraintrustState:
452
462
 
453
463
  return self._context_manager
454
464
 
465
+ def register_otel_flush(self, callback: Any) -> None:
466
+ """
467
+ Register an OTEL flush callback. This is called by the OTEL integration
468
+ when it initializes a span processor/exporter.
469
+ """
470
+ self._otel_flush_callback = callback
471
+
472
+ async def flush_otel(self) -> None:
473
+ """
474
+ Flush OTEL spans if a callback is registered.
475
+ Called during ensure_spans_flushed to ensure OTEL spans are visible in BTQL.
476
+ """
477
+ if self._otel_flush_callback:
478
+ await self._otel_flush_callback()
479
+
455
480
  def copy_state(self, other: "BraintrustState"):
456
481
  """Copy login information from another BraintrustState instance."""
457
482
  self.__dict__.update({
@@ -1297,7 +1322,7 @@ def init(
1297
1322
  project: str | None = None,
1298
1323
  experiment: str | None = None,
1299
1324
  description: str | None = None,
1300
- dataset: Optional["Dataset"] = None,
1325
+ dataset: Optional["Dataset"] | DatasetRef = None,
1301
1326
  open: bool = False,
1302
1327
  base_experiment: str | None = None,
1303
1328
  is_public: bool = False,
@@ -1410,12 +1435,19 @@ def init(
1410
1435
  args["base_exp_id"] = base_experiment_id
1411
1436
  elif base_experiment is not None:
1412
1437
  args["base_experiment"] = base_experiment
1413
- else:
1438
+ elif merged_git_metadata_settings and merged_git_metadata_settings.collect != "none":
1414
1439
  args["ancestor_commits"] = list(get_past_n_ancestors())
1415
1440
 
1416
1441
  if dataset is not None:
1417
- args["dataset_id"] = dataset.id
1418
- args["dataset_version"] = dataset.version
1442
+ if isinstance(dataset, dict):
1443
+ # Simple {"id": ..., "version": ...} dict
1444
+ args["dataset_id"] = dataset["id"]
1445
+ if "version" in dataset:
1446
+ args["dataset_version"] = dataset["version"]
1447
+ else:
1448
+ # Full Dataset object
1449
+ args["dataset_id"] = dataset.id
1450
+ args["dataset_version"] = dataset.version
1419
1451
 
1420
1452
  if is_public is not None:
1421
1453
  args["public"] = is_public
@@ -1446,7 +1478,11 @@ def init(
1446
1478
  # For experiments, disable queue size limit enforcement (unlimited queue)
1447
1479
  state.enforce_queue_size_limit(False)
1448
1480
 
1449
- ret = Experiment(lazy_metadata=LazyValue(compute_metadata, use_mutex=True), dataset=dataset, state=state)
1481
+ ret = Experiment(
1482
+ lazy_metadata=LazyValue(compute_metadata, use_mutex=True),
1483
+ dataset=dataset if isinstance(dataset, Dataset) else None,
1484
+ state=state,
1485
+ )
1450
1486
  if set_current:
1451
1487
  state.current_experiment = ret
1452
1488
  return ret
@@ -1761,6 +1797,25 @@ def login(
1761
1797
  _state.login(app_url=app_url, api_key=api_key, org_name=org_name, force_login=force_login)
1762
1798
 
1763
1799
 
1800
+ def register_otel_flush(callback: Any) -> None:
1801
+ """
1802
+ Register a callback to flush OTEL spans. This is called by the OTEL integration
1803
+ when it initializes a span processor/exporter.
1804
+
1805
+ When ensure_spans_flushed is called (e.g., before a BTQL query in scorers),
1806
+ this callback will be invoked to ensure OTEL spans are flushed to the server.
1807
+
1808
+ Also disables the span cache, since OTEL spans aren't in the local cache
1809
+ and we need BTQL to see the complete span tree (both native + OTEL spans).
1810
+
1811
+ :param callback: The async callback function to flush OTEL spans.
1812
+ """
1813
+ global _state
1814
+ _state.register_otel_flush(callback)
1815
+ # Disable span cache since OTEL spans aren't in the local cache
1816
+ _state.span_cache.disable()
1817
+
1818
+
1764
1819
  def login_to_state(
1765
1820
  app_url: str | None = None,
1766
1821
  api_key: str | None = None,
@@ -2323,30 +2378,6 @@ def _enrich_attachments(event: TMutableMapping) -> TMutableMapping:
2323
2378
 
2324
2379
 
2325
2380
  def _validate_and_sanitize_experiment_log_partial_args(event: Mapping[str, Any]) -> dict[str, Any]:
2326
- # Make sure only certain keys are specified.
2327
- forbidden_keys = set(event.keys()) - {
2328
- "input",
2329
- "output",
2330
- "expected",
2331
- "tags",
2332
- "scores",
2333
- "metadata",
2334
- "metrics",
2335
- "error",
2336
- "dataset_record_id",
2337
- "origin",
2338
- "inputs",
2339
- "span_attributes",
2340
- ASYNC_SCORING_CONTROL_FIELD,
2341
- MERGE_PATHS_FIELD,
2342
- SKIP_ASYNC_SCORING_FIELD,
2343
- "span_id",
2344
- "root_span_id",
2345
- "_bt_internal_override_pagination_key",
2346
- }
2347
- if forbidden_keys:
2348
- raise ValueError(f"The following keys are not permitted: {forbidden_keys}")
2349
-
2350
2381
  scores = event.get("scores")
2351
2382
  if scores:
2352
2383
  for name, score in scores.items():
@@ -3855,6 +3886,21 @@ class SpanImpl(Span):
3855
3886
  if serializable_partial_record.get("metrics", {}).get("end") is not None:
3856
3887
  self._logged_end_time = serializable_partial_record["metrics"]["end"]
3857
3888
 
3889
+ # Write to local span cache for scorer access
3890
+ # Only cache experiment spans - regular logs don't need caching
3891
+ if self.parent_object_type == SpanObjectTypeV3.EXPERIMENT:
3892
+ from braintrust.span_cache import CachedSpan
3893
+
3894
+ cached_span = CachedSpan(
3895
+ span_id=self.span_id,
3896
+ input=serializable_partial_record.get("input"),
3897
+ output=serializable_partial_record.get("output"),
3898
+ metadata=serializable_partial_record.get("metadata"),
3899
+ span_parents=self.span_parents,
3900
+ span_attributes=serializable_partial_record.get("span_attributes"),
3901
+ )
3902
+ self.state.span_cache.queue_write(self.root_span_id, self.span_id, cached_span)
3903
+
3858
3904
  def compute_record() -> dict[str, Any]:
3859
3905
  exporter = _get_exporter()
3860
3906
  return dict(