braintrust 0.4.3__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {braintrust-0.4.3 → braintrust-0.5.0}/PKG-INFO +1 -1
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/_generated_types.py +106 -6
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/framework.py +98 -1
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/functions/invoke.py +3 -1
- braintrust-0.5.0/src/braintrust/functions/test_invoke.py +61 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/generated_types.py +7 -1
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/logger.py +78 -32
- braintrust-0.5.0/src/braintrust/span_cache.py +337 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/span_identifier_v3.py +21 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_logger.py +116 -0
- braintrust-0.5.0/src/braintrust/test_span_cache.py +344 -0
- braintrust-0.5.0/src/braintrust/test_trace.py +267 -0
- braintrust-0.5.0/src/braintrust/trace.py +385 -0
- braintrust-0.5.0/src/braintrust/version.py +4 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/claude_agent_sdk/_wrapper.py +48 -6
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/claude_agent_sdk/test_wrapper.py +106 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust.egg-info/PKG-INFO +1 -1
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust.egg-info/SOURCES.txt +5 -0
- braintrust-0.4.3/src/braintrust/version.py +0 -4
- {braintrust-0.4.3 → braintrust-0.5.0}/README.md +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/setup.cfg +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/setup.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/audit.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/aws.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/bt_json.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/__main__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/eval.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/install/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/install/api.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/install/bump_versions.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/install/logs.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/install/redshift.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/install/run_migrations.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/cli/push.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/conftest.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/context.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/contrib/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/contrib/temporal/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/contrib/temporal/test_temporal.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/db_fields.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/auth.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/cache.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/cors.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/dataset.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/eval_hooks.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/schemas.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/server.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/test_cached_login.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/test_lru_cache.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/devserver/test_server_integration.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/framework2.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/functions/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/functions/constants.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/functions/stream.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/git_fields.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/gitutil.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/graph_util.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/http_headers.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/id_gen.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/merge_row_batch.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/oai.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/object.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/otel/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/otel/context.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/otel/test_distributed_tracing.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/otel/test_otel_bt_integration.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/parameters.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/prompt.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/prompt_cache/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/prompt_cache/disk_cache.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/prompt_cache/lru_cache.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/prompt_cache/prompt_cache.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/prompt_cache/test_disk_cache.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/prompt_cache/test_lru_cache.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/prompt_cache/test_prompt_cache.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/py.typed +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/queue.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/resource_manager.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/score.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/serializable_data_class.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/span_identifier_v1.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/span_identifier_v2.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/span_identifier_v4.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/span_types.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_bt_json.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_framework.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_framework2.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_helpers.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_id_gen.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_otel.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_queue.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_score.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_serializable_data_class.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_span_components.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_util.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/test_version.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/util.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/_anthropic_utils.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/agno/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/agno/agent.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/agno/function_call.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/agno/model.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/agno/team.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/agno/utils.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/anthropic.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/claude_agent_sdk/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/dspy.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/google_genai/__init__.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/langchain.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/langsmith_wrapper.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/litellm.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/openai.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/pydantic_ai.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_agno.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_anthropic.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_dspy.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_google_genai.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_langsmith_wrapper.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_litellm.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_oai_attachments.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_openai.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_openrouter.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_pydantic_ai_integration.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_pydantic_ai_wrap_openai.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/wrappers/test_utils.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust/xact_ids.py +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust.egg-info/dependency_links.txt +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust.egg-info/entry_points.txt +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust.egg-info/requires.txt +0 -0
- {braintrust-0.4.3 → braintrust-0.5.0}/src/braintrust.egg-info/top_level.txt +0 -0
|
@@ -144,6 +144,11 @@ class AsyncScoringControlAsyncScoringControl5(TypedDict):
|
|
|
144
144
|
triggered_xact_id: str
|
|
145
145
|
|
|
146
146
|
|
|
147
|
+
class AsyncScoringControlAsyncScoringControl6(TypedDict):
|
|
148
|
+
kind: Literal['mark_attempt_failed']
|
|
149
|
+
function_ids: Sequence[Any]
|
|
150
|
+
|
|
151
|
+
|
|
147
152
|
class AsyncScoringStateAsyncScoringState(TypedDict):
|
|
148
153
|
status: Literal['enabled']
|
|
149
154
|
token: str
|
|
@@ -484,6 +489,10 @@ class Dataset(TypedDict):
|
|
|
484
489
|
"""
|
|
485
490
|
User-controlled metadata about the dataset
|
|
486
491
|
"""
|
|
492
|
+
url_slug: str
|
|
493
|
+
"""
|
|
494
|
+
URL slug for the dataset. used to construct dataset URLs
|
|
495
|
+
"""
|
|
487
496
|
|
|
488
497
|
|
|
489
498
|
class DatasetEventMetadata(TypedDict):
|
|
@@ -532,6 +541,43 @@ class EnvVar(TypedDict):
|
|
|
532
541
|
"""
|
|
533
542
|
|
|
534
543
|
|
|
544
|
+
class EvalStatusPageConfig(TypedDict):
|
|
545
|
+
score_columns: NotRequired[Sequence[str] | None]
|
|
546
|
+
"""
|
|
547
|
+
The score columns to display on the page
|
|
548
|
+
"""
|
|
549
|
+
metric_columns: NotRequired[Sequence[str] | None]
|
|
550
|
+
"""
|
|
551
|
+
The metric columns to display on the page
|
|
552
|
+
"""
|
|
553
|
+
grouping_field: NotRequired[str | None]
|
|
554
|
+
"""
|
|
555
|
+
The metadata field to use for grouping experiments (model)
|
|
556
|
+
"""
|
|
557
|
+
filter: NotRequired[str | None]
|
|
558
|
+
"""
|
|
559
|
+
BTQL filter to apply to experiment data
|
|
560
|
+
"""
|
|
561
|
+
sort_by: NotRequired[str | None]
|
|
562
|
+
"""
|
|
563
|
+
Field to sort results by (format: 'score:<name>' or 'metric:<name>')
|
|
564
|
+
"""
|
|
565
|
+
sort_order: NotRequired[Literal['asc', 'desc'] | None]
|
|
566
|
+
"""
|
|
567
|
+
Sort order (ascending or descending)
|
|
568
|
+
"""
|
|
569
|
+
api_key: NotRequired[str | None]
|
|
570
|
+
"""
|
|
571
|
+
The API key used for fetching experiment data
|
|
572
|
+
"""
|
|
573
|
+
|
|
574
|
+
|
|
575
|
+
EvalStatusPageTheme: TypeAlias = Literal['light', 'dark']
|
|
576
|
+
"""
|
|
577
|
+
The theme for the page
|
|
578
|
+
"""
|
|
579
|
+
|
|
580
|
+
|
|
535
581
|
class ExperimentEventMetadata(TypedDict):
|
|
536
582
|
model: NotRequired[str | None]
|
|
537
583
|
"""
|
|
@@ -749,20 +795,24 @@ FunctionIdRef: TypeAlias = Mapping[str, Any]
|
|
|
749
795
|
|
|
750
796
|
|
|
751
797
|
FunctionObjectType: TypeAlias = Literal[
|
|
752
|
-
'prompt', 'tool', 'scorer', 'task', 'custom_view', 'preprocessor', 'facet'
|
|
798
|
+
'prompt', 'tool', 'scorer', 'task', 'workflow', 'custom_view', 'preprocessor', 'facet', 'classifier'
|
|
753
799
|
]
|
|
754
800
|
|
|
755
801
|
|
|
756
|
-
FunctionOutputType: TypeAlias = Literal['completion', 'score', 'any']
|
|
802
|
+
FunctionOutputType: TypeAlias = Literal['completion', 'score', 'facet', 'classification', 'any']
|
|
757
803
|
|
|
758
804
|
|
|
759
|
-
FunctionTypeEnum: TypeAlias = Literal[
|
|
805
|
+
FunctionTypeEnum: TypeAlias = Literal[
|
|
806
|
+
'llm', 'scorer', 'task', 'tool', 'custom_view', 'preprocessor', 'facet', 'classifier'
|
|
807
|
+
]
|
|
760
808
|
"""
|
|
761
809
|
The type of global function. Defaults to 'scorer'.
|
|
762
810
|
"""
|
|
763
811
|
|
|
764
812
|
|
|
765
|
-
FunctionTypeEnumNullish: TypeAlias = Literal[
|
|
813
|
+
FunctionTypeEnumNullish: TypeAlias = Literal[
|
|
814
|
+
'llm', 'scorer', 'task', 'tool', 'custom_view', 'preprocessor', 'facet', 'classifier'
|
|
815
|
+
]
|
|
766
816
|
|
|
767
817
|
|
|
768
818
|
class GitMetadataSettings(TypedDict):
|
|
@@ -1674,7 +1724,18 @@ class PromptDataNullishOrigin(TypedDict):
|
|
|
1674
1724
|
class PromptParserNullish(TypedDict):
|
|
1675
1725
|
type: Literal['llm_classifier']
|
|
1676
1726
|
use_cot: bool
|
|
1677
|
-
choice_scores: Mapping[str, float]
|
|
1727
|
+
choice_scores: NotRequired[Mapping[str, float] | None]
|
|
1728
|
+
"""
|
|
1729
|
+
Map of choices to scores (0-1). Used by scorers.
|
|
1730
|
+
"""
|
|
1731
|
+
choice: NotRequired[Sequence[str] | None]
|
|
1732
|
+
"""
|
|
1733
|
+
List of valid choices without score mapping. Used by classifiers that deposit output to tags.
|
|
1734
|
+
"""
|
|
1735
|
+
allow_no_match: NotRequired[bool | None]
|
|
1736
|
+
"""
|
|
1737
|
+
If true, adds a 'No match' option. When selected, no tag is deposited.
|
|
1738
|
+
"""
|
|
1678
1739
|
|
|
1679
1740
|
|
|
1680
1741
|
class PromptSessionEvent(TypedDict):
|
|
@@ -2104,7 +2165,7 @@ class SpanScope(TypedDict):
|
|
|
2104
2165
|
|
|
2105
2166
|
|
|
2106
2167
|
SpanType: TypeAlias = Literal[
|
|
2107
|
-
'llm', 'score', 'function', 'eval', 'task', 'tool', 'automation', 'facet', 'preprocessor'
|
|
2168
|
+
'llm', 'score', 'function', 'eval', 'task', 'tool', 'automation', 'facet', 'preprocessor', 'classifier'
|
|
2108
2169
|
]
|
|
2109
2170
|
"""
|
|
2110
2171
|
Type of the span, for display purposes only
|
|
@@ -2384,6 +2445,7 @@ AsyncScoringControl: TypeAlias = (
|
|
|
2384
2445
|
| AsyncScoringControlAsyncScoringControl3
|
|
2385
2446
|
| AsyncScoringControlAsyncScoringControl4
|
|
2386
2447
|
| AsyncScoringControlAsyncScoringControl5
|
|
2448
|
+
| AsyncScoringControlAsyncScoringControl6
|
|
2387
2449
|
)
|
|
2388
2450
|
|
|
2389
2451
|
|
|
@@ -2530,6 +2592,43 @@ class DatasetEvent(TypedDict):
|
|
|
2530
2592
|
"""
|
|
2531
2593
|
|
|
2532
2594
|
|
|
2595
|
+
class EvalStatusPage(TypedDict):
|
|
2596
|
+
id: str
|
|
2597
|
+
"""
|
|
2598
|
+
Unique identifier for the eval status page
|
|
2599
|
+
"""
|
|
2600
|
+
project_id: str
|
|
2601
|
+
"""
|
|
2602
|
+
Unique identifier for the project that the eval status page belongs under
|
|
2603
|
+
"""
|
|
2604
|
+
user_id: NotRequired[str | None]
|
|
2605
|
+
"""
|
|
2606
|
+
Identifies the user who created the eval status page
|
|
2607
|
+
"""
|
|
2608
|
+
created: NotRequired[str | None]
|
|
2609
|
+
"""
|
|
2610
|
+
Date of eval status page creation
|
|
2611
|
+
"""
|
|
2612
|
+
deleted_at: NotRequired[str | None]
|
|
2613
|
+
"""
|
|
2614
|
+
Date of eval status page deletion, or null if the eval status page is still active
|
|
2615
|
+
"""
|
|
2616
|
+
name: str
|
|
2617
|
+
"""
|
|
2618
|
+
Name of the eval status page
|
|
2619
|
+
"""
|
|
2620
|
+
description: NotRequired[str | None]
|
|
2621
|
+
"""
|
|
2622
|
+
Textual description of the eval status page
|
|
2623
|
+
"""
|
|
2624
|
+
logo_url: NotRequired[str | None]
|
|
2625
|
+
"""
|
|
2626
|
+
URL of the logo to display on the page
|
|
2627
|
+
"""
|
|
2628
|
+
theme: EvalStatusPageTheme
|
|
2629
|
+
config: EvalStatusPageConfig
|
|
2630
|
+
|
|
2631
|
+
|
|
2533
2632
|
class Experiment(TypedDict):
|
|
2534
2633
|
id: str
|
|
2535
2634
|
"""
|
|
@@ -3228,6 +3327,7 @@ class View(TypedDict):
|
|
|
3228
3327
|
'prompts',
|
|
3229
3328
|
'tools',
|
|
3230
3329
|
'scorers',
|
|
3330
|
+
'classifiers',
|
|
3231
3331
|
'logs',
|
|
3232
3332
|
'monitor',
|
|
3233
3333
|
'for_review',
|
|
@@ -1280,6 +1280,29 @@ async def _run_evaluator_internal(
|
|
|
1280
1280
|
filters: list[Filter],
|
|
1281
1281
|
stream: Callable[[SSEProgressEvent], None] | None = None,
|
|
1282
1282
|
state: BraintrustState | None = None,
|
|
1283
|
+
):
|
|
1284
|
+
# Start span cache for this eval (it's disabled by default to avoid temp files outside of evals)
|
|
1285
|
+
if state is None:
|
|
1286
|
+
from braintrust.logger import _internal_get_global_state
|
|
1287
|
+
|
|
1288
|
+
state = _internal_get_global_state()
|
|
1289
|
+
|
|
1290
|
+
state.span_cache.start()
|
|
1291
|
+
try:
|
|
1292
|
+
return await _run_evaluator_internal_impl(experiment, evaluator, position, filters, stream, state)
|
|
1293
|
+
finally:
|
|
1294
|
+
# Clean up disk-based span cache after eval completes and stop caching
|
|
1295
|
+
state.span_cache.dispose()
|
|
1296
|
+
state.span_cache.stop()
|
|
1297
|
+
|
|
1298
|
+
|
|
1299
|
+
async def _run_evaluator_internal_impl(
|
|
1300
|
+
experiment,
|
|
1301
|
+
evaluator: Evaluator,
|
|
1302
|
+
position: int | None,
|
|
1303
|
+
filters: list[Filter],
|
|
1304
|
+
stream: Callable[[SSEProgressEvent], None] | None = None,
|
|
1305
|
+
state: BraintrustState | None = None,
|
|
1283
1306
|
):
|
|
1284
1307
|
event_loop = asyncio.get_event_loop()
|
|
1285
1308
|
|
|
@@ -1290,11 +1313,13 @@ async def _run_evaluator_internal(
|
|
|
1290
1313
|
{**parent_propagated},
|
|
1291
1314
|
{"span_attributes": {"purpose": "scorer"}},
|
|
1292
1315
|
)
|
|
1316
|
+
# Strip trace from logged input - it's internal plumbing that shouldn't appear in spans
|
|
1317
|
+
logged_input = {k: v for k, v in kwargs.items() if k != "trace"}
|
|
1293
1318
|
with root_span.start_span(
|
|
1294
1319
|
name=name,
|
|
1295
1320
|
span_attributes={"type": SpanTypeAttribute.SCORE, "purpose": "scorer"},
|
|
1296
1321
|
propagated_event=merged_propagated,
|
|
1297
|
-
input=
|
|
1322
|
+
input=logged_input,
|
|
1298
1323
|
) as span:
|
|
1299
1324
|
score = scorer
|
|
1300
1325
|
if hasattr(scorer, "eval_async"):
|
|
@@ -1415,6 +1440,77 @@ async def _run_evaluator_internal(
|
|
|
1415
1440
|
tags = hooks.tags if hooks.tags else None
|
|
1416
1441
|
root_span.log(output=output, metadata=metadata, tags=tags)
|
|
1417
1442
|
|
|
1443
|
+
# Create trace object for scorers
|
|
1444
|
+
from braintrust.trace import LocalTrace
|
|
1445
|
+
|
|
1446
|
+
async def ensure_spans_flushed():
|
|
1447
|
+
# Flush native Braintrust spans
|
|
1448
|
+
if experiment:
|
|
1449
|
+
await asyncio.get_event_loop().run_in_executor(
|
|
1450
|
+
None, lambda: experiment.state.flush()
|
|
1451
|
+
)
|
|
1452
|
+
elif state:
|
|
1453
|
+
await asyncio.get_event_loop().run_in_executor(None, lambda: state.flush())
|
|
1454
|
+
else:
|
|
1455
|
+
from braintrust.logger import flush as flush_logger
|
|
1456
|
+
|
|
1457
|
+
await asyncio.get_event_loop().run_in_executor(None, flush_logger)
|
|
1458
|
+
|
|
1459
|
+
# Also flush OTEL spans if registered
|
|
1460
|
+
if state:
|
|
1461
|
+
await state.flush_otel()
|
|
1462
|
+
|
|
1463
|
+
experiment_id = None
|
|
1464
|
+
if experiment:
|
|
1465
|
+
try:
|
|
1466
|
+
experiment_id = experiment.id
|
|
1467
|
+
except:
|
|
1468
|
+
experiment_id = None
|
|
1469
|
+
|
|
1470
|
+
trace = None
|
|
1471
|
+
if state or experiment:
|
|
1472
|
+
# Get the state to use
|
|
1473
|
+
trace_state = state
|
|
1474
|
+
if not trace_state and experiment:
|
|
1475
|
+
trace_state = experiment.state
|
|
1476
|
+
if not trace_state:
|
|
1477
|
+
# Fall back to global state
|
|
1478
|
+
from braintrust.logger import _internal_get_global_state
|
|
1479
|
+
|
|
1480
|
+
trace_state = _internal_get_global_state()
|
|
1481
|
+
|
|
1482
|
+
# Access root_span_id from the concrete SpanImpl instance
|
|
1483
|
+
# The Span interface doesn't expose this but SpanImpl has it
|
|
1484
|
+
root_span_id_value = getattr(root_span, "root_span_id", root_span.id)
|
|
1485
|
+
|
|
1486
|
+
# Check if there's a parent in the context to determine object_type and object_id
|
|
1487
|
+
from braintrust.span_identifier_v3 import SpanComponentsV3, span_object_type_v3_to_typed_string
|
|
1488
|
+
|
|
1489
|
+
parent_str = trace_state.current_parent.get()
|
|
1490
|
+
parent_components = None
|
|
1491
|
+
if parent_str:
|
|
1492
|
+
try:
|
|
1493
|
+
parent_components = SpanComponentsV3.from_str(parent_str)
|
|
1494
|
+
except Exception:
|
|
1495
|
+
# If parsing fails, parent_components stays None
|
|
1496
|
+
pass
|
|
1497
|
+
|
|
1498
|
+
# Determine object_type and object_id based on parent or experiment
|
|
1499
|
+
if parent_components:
|
|
1500
|
+
trace_object_type = span_object_type_v3_to_typed_string(parent_components.object_type)
|
|
1501
|
+
trace_object_id = parent_components.object_id or ""
|
|
1502
|
+
else:
|
|
1503
|
+
trace_object_type = "experiment"
|
|
1504
|
+
trace_object_id = experiment_id or ""
|
|
1505
|
+
|
|
1506
|
+
trace = LocalTrace(
|
|
1507
|
+
object_type=trace_object_type,
|
|
1508
|
+
object_id=trace_object_id,
|
|
1509
|
+
root_span_id=root_span_id_value,
|
|
1510
|
+
ensure_spans_flushed=ensure_spans_flushed,
|
|
1511
|
+
state=trace_state,
|
|
1512
|
+
)
|
|
1513
|
+
|
|
1418
1514
|
score_promises = [
|
|
1419
1515
|
asyncio.create_task(
|
|
1420
1516
|
await_or_run_scorer(
|
|
@@ -1426,6 +1522,7 @@ async def _run_evaluator_internal(
|
|
|
1426
1522
|
"expected": datum.expected,
|
|
1427
1523
|
"metadata": metadata,
|
|
1428
1524
|
"output": output,
|
|
1525
|
+
"trace": trace,
|
|
1429
1526
|
},
|
|
1430
1527
|
)
|
|
1431
1528
|
)
|
|
@@ -3,7 +3,7 @@ from typing import Any, Literal, TypedDict, TypeVar, overload
|
|
|
3
3
|
from sseclient import SSEClient
|
|
4
4
|
|
|
5
5
|
from .._generated_types import FunctionTypeEnum
|
|
6
|
-
from ..logger import Exportable, get_span_parent_object, login, proxy_conn
|
|
6
|
+
from ..logger import Exportable, _internal_get_global_state, get_span_parent_object, login, proxy_conn
|
|
7
7
|
from ..util import response_raise_for_status
|
|
8
8
|
from .constants import INVOKE_API_VERSION
|
|
9
9
|
from .stream import BraintrustInvokeError, BraintrustStream
|
|
@@ -243,6 +243,8 @@ def init_function(project_name: str, slug: str, version: str | None = None):
|
|
|
243
243
|
:param version: Optional version of the function to use. Defaults to latest.
|
|
244
244
|
:return: A function that can be used as a task or scorer.
|
|
245
245
|
"""
|
|
246
|
+
# Disable span cache since remote function spans won't be in the local cache
|
|
247
|
+
_internal_get_global_state().span_cache.disable()
|
|
246
248
|
|
|
247
249
|
def f(*args: Any, **kwargs: Any) -> Any:
|
|
248
250
|
if len(args) > 0:
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Tests for the invoke module, particularly init_function."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
from braintrust.functions.invoke import init_function
|
|
5
|
+
from braintrust.logger import _internal_get_global_state, _internal_reset_global_state
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TestInitFunction:
|
|
9
|
+
"""Tests for init_function."""
|
|
10
|
+
|
|
11
|
+
def setup_method(self):
|
|
12
|
+
"""Reset state before each test."""
|
|
13
|
+
_internal_reset_global_state()
|
|
14
|
+
|
|
15
|
+
def teardown_method(self):
|
|
16
|
+
"""Clean up after each test."""
|
|
17
|
+
_internal_reset_global_state()
|
|
18
|
+
|
|
19
|
+
def test_init_function_disables_span_cache(self):
|
|
20
|
+
"""Test that init_function disables the span cache."""
|
|
21
|
+
state = _internal_get_global_state()
|
|
22
|
+
|
|
23
|
+
# Cache should be disabled by default (it's only enabled during evals)
|
|
24
|
+
assert state.span_cache.disabled is True
|
|
25
|
+
|
|
26
|
+
# Enable the cache (simulating what happens during eval)
|
|
27
|
+
state.span_cache.start()
|
|
28
|
+
assert state.span_cache.disabled is False
|
|
29
|
+
|
|
30
|
+
# Call init_function
|
|
31
|
+
f = init_function("test-project", "test-function")
|
|
32
|
+
|
|
33
|
+
# Cache should now be disabled (init_function explicitly disables it)
|
|
34
|
+
assert state.span_cache.disabled is True
|
|
35
|
+
assert f.__name__ == "init_function-test-project-test-function-latest"
|
|
36
|
+
|
|
37
|
+
def test_init_function_with_version(self):
|
|
38
|
+
"""Test that init_function creates a function with the correct name including version."""
|
|
39
|
+
f = init_function("my-project", "my-scorer", version="v1")
|
|
40
|
+
assert f.__name__ == "init_function-my-project-my-scorer-v1"
|
|
41
|
+
|
|
42
|
+
def test_init_function_without_version_uses_latest(self):
|
|
43
|
+
"""Test that init_function uses 'latest' in name when version not specified."""
|
|
44
|
+
f = init_function("my-project", "my-scorer")
|
|
45
|
+
assert f.__name__ == "init_function-my-project-my-scorer-latest"
|
|
46
|
+
|
|
47
|
+
def test_init_function_permanently_disables_cache(self):
|
|
48
|
+
"""Test that init_function permanently disables the cache (can't be re-enabled)."""
|
|
49
|
+
state = _internal_get_global_state()
|
|
50
|
+
|
|
51
|
+
# Enable the cache
|
|
52
|
+
state.span_cache.start()
|
|
53
|
+
assert state.span_cache.disabled is False
|
|
54
|
+
|
|
55
|
+
# Call init_function
|
|
56
|
+
init_function("test-project", "test-function")
|
|
57
|
+
assert state.span_cache.disabled is True
|
|
58
|
+
|
|
59
|
+
# Try to start again - should still be disabled because of explicit disable
|
|
60
|
+
state.span_cache.start()
|
|
61
|
+
assert state.span_cache.disabled is True
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""Auto-generated file (internal git SHA
|
|
1
|
+
"""Auto-generated file (internal git SHA 21146f64bf5ad1eadd3a99d186274728e25e5399) -- do not modify"""
|
|
2
2
|
|
|
3
3
|
from ._generated_types import (
|
|
4
4
|
Acl,
|
|
@@ -29,6 +29,9 @@ from ._generated_types import (
|
|
|
29
29
|
Dataset,
|
|
30
30
|
DatasetEvent,
|
|
31
31
|
EnvVar,
|
|
32
|
+
EvalStatusPage,
|
|
33
|
+
EvalStatusPageConfig,
|
|
34
|
+
EvalStatusPageTheme,
|
|
32
35
|
Experiment,
|
|
33
36
|
ExperimentEvent,
|
|
34
37
|
ExtendedSavedFunctionId,
|
|
@@ -136,6 +139,9 @@ __all__ = [
|
|
|
136
139
|
"Dataset",
|
|
137
140
|
"DatasetEvent",
|
|
138
141
|
"EnvVar",
|
|
142
|
+
"EvalStatusPage",
|
|
143
|
+
"EvalStatusPageConfig",
|
|
144
|
+
"EvalStatusPageTheme",
|
|
139
145
|
"Experiment",
|
|
140
146
|
"ExperimentEvent",
|
|
141
147
|
"ExtendedSavedFunctionId",
|
|
@@ -47,12 +47,9 @@ from urllib3.util.retry import Retry
|
|
|
47
47
|
from . import context, id_gen
|
|
48
48
|
from .bt_json import bt_dumps, bt_safe_deep_copy
|
|
49
49
|
from .db_fields import (
|
|
50
|
-
ASYNC_SCORING_CONTROL_FIELD,
|
|
51
50
|
AUDIT_METADATA_FIELD,
|
|
52
51
|
AUDIT_SOURCE_FIELD,
|
|
53
52
|
IS_MERGE_FIELD,
|
|
54
|
-
MERGE_PATHS_FIELD,
|
|
55
|
-
SKIP_ASYNC_SCORING_FIELD,
|
|
56
53
|
TRANSACTION_ID_FIELD,
|
|
57
54
|
VALID_SOURCES,
|
|
58
55
|
)
|
|
@@ -101,6 +98,14 @@ from .xact_ids import prettify_xact
|
|
|
101
98
|
Metadata = dict[str, Any]
|
|
102
99
|
DATA_API_VERSION = 2
|
|
103
100
|
|
|
101
|
+
|
|
102
|
+
class DatasetRef(TypedDict, total=False):
|
|
103
|
+
"""Reference to a dataset by ID and optional version."""
|
|
104
|
+
|
|
105
|
+
id: str
|
|
106
|
+
version: str
|
|
107
|
+
|
|
108
|
+
|
|
104
109
|
T = TypeVar("T")
|
|
105
110
|
TMapping = TypeVar("TMapping", bound=Mapping[str, Any])
|
|
106
111
|
TMutableMapping = TypeVar("TMutableMapping", bound=MutableMapping[str, Any])
|
|
@@ -396,6 +401,11 @@ class BraintrustState:
|
|
|
396
401
|
),
|
|
397
402
|
)
|
|
398
403
|
|
|
404
|
+
from braintrust.span_cache import SpanCache
|
|
405
|
+
|
|
406
|
+
self.span_cache = SpanCache()
|
|
407
|
+
self._otel_flush_callback: Any | None = None
|
|
408
|
+
|
|
399
409
|
def reset_login_info(self):
|
|
400
410
|
self.app_url: str | None = None
|
|
401
411
|
self.app_public_url: str | None = None
|
|
@@ -452,6 +462,21 @@ class BraintrustState:
|
|
|
452
462
|
|
|
453
463
|
return self._context_manager
|
|
454
464
|
|
|
465
|
+
def register_otel_flush(self, callback: Any) -> None:
|
|
466
|
+
"""
|
|
467
|
+
Register an OTEL flush callback. This is called by the OTEL integration
|
|
468
|
+
when it initializes a span processor/exporter.
|
|
469
|
+
"""
|
|
470
|
+
self._otel_flush_callback = callback
|
|
471
|
+
|
|
472
|
+
async def flush_otel(self) -> None:
|
|
473
|
+
"""
|
|
474
|
+
Flush OTEL spans if a callback is registered.
|
|
475
|
+
Called during ensure_spans_flushed to ensure OTEL spans are visible in BTQL.
|
|
476
|
+
"""
|
|
477
|
+
if self._otel_flush_callback:
|
|
478
|
+
await self._otel_flush_callback()
|
|
479
|
+
|
|
455
480
|
def copy_state(self, other: "BraintrustState"):
|
|
456
481
|
"""Copy login information from another BraintrustState instance."""
|
|
457
482
|
self.__dict__.update({
|
|
@@ -1297,7 +1322,7 @@ def init(
|
|
|
1297
1322
|
project: str | None = None,
|
|
1298
1323
|
experiment: str | None = None,
|
|
1299
1324
|
description: str | None = None,
|
|
1300
|
-
dataset: Optional["Dataset"] = None,
|
|
1325
|
+
dataset: Optional["Dataset"] | DatasetRef = None,
|
|
1301
1326
|
open: bool = False,
|
|
1302
1327
|
base_experiment: str | None = None,
|
|
1303
1328
|
is_public: bool = False,
|
|
@@ -1410,12 +1435,19 @@ def init(
|
|
|
1410
1435
|
args["base_exp_id"] = base_experiment_id
|
|
1411
1436
|
elif base_experiment is not None:
|
|
1412
1437
|
args["base_experiment"] = base_experiment
|
|
1413
|
-
|
|
1438
|
+
elif merged_git_metadata_settings and merged_git_metadata_settings.collect != "none":
|
|
1414
1439
|
args["ancestor_commits"] = list(get_past_n_ancestors())
|
|
1415
1440
|
|
|
1416
1441
|
if dataset is not None:
|
|
1417
|
-
|
|
1418
|
-
|
|
1442
|
+
if isinstance(dataset, dict):
|
|
1443
|
+
# Simple {"id": ..., "version": ...} dict
|
|
1444
|
+
args["dataset_id"] = dataset["id"]
|
|
1445
|
+
if "version" in dataset:
|
|
1446
|
+
args["dataset_version"] = dataset["version"]
|
|
1447
|
+
else:
|
|
1448
|
+
# Full Dataset object
|
|
1449
|
+
args["dataset_id"] = dataset.id
|
|
1450
|
+
args["dataset_version"] = dataset.version
|
|
1419
1451
|
|
|
1420
1452
|
if is_public is not None:
|
|
1421
1453
|
args["public"] = is_public
|
|
@@ -1446,7 +1478,11 @@ def init(
|
|
|
1446
1478
|
# For experiments, disable queue size limit enforcement (unlimited queue)
|
|
1447
1479
|
state.enforce_queue_size_limit(False)
|
|
1448
1480
|
|
|
1449
|
-
ret = Experiment(
|
|
1481
|
+
ret = Experiment(
|
|
1482
|
+
lazy_metadata=LazyValue(compute_metadata, use_mutex=True),
|
|
1483
|
+
dataset=dataset if isinstance(dataset, Dataset) else None,
|
|
1484
|
+
state=state,
|
|
1485
|
+
)
|
|
1450
1486
|
if set_current:
|
|
1451
1487
|
state.current_experiment = ret
|
|
1452
1488
|
return ret
|
|
@@ -1761,6 +1797,25 @@ def login(
|
|
|
1761
1797
|
_state.login(app_url=app_url, api_key=api_key, org_name=org_name, force_login=force_login)
|
|
1762
1798
|
|
|
1763
1799
|
|
|
1800
|
+
def register_otel_flush(callback: Any) -> None:
    """
    Install an OTEL flush callback on the global Braintrust state.

    The OTEL integration calls this when it sets up a span processor/exporter.
    The callback is forwarded to the global state's ``register_otel_flush``,
    so that ensure_spans_flushed (e.g. before a BTQL query in scorers) can
    push pending OTEL spans to the server.

    As a side effect the global span cache is disabled: OTEL spans are never
    written to the local cache, and BTQL must be able to see the complete span
    tree (both native and OTEL spans).

    :param callback: The async callback function to flush OTEL spans.
    """
    # Read-only access to the module global; no `global` declaration needed.
    state = _state
    state.register_otel_flush(callback)
    # OTEL spans bypass the local cache, so stop serving reads from it.
    state.span_cache.disable()
|
|
1817
|
+
|
|
1818
|
+
|
|
1764
1819
|
def login_to_state(
|
|
1765
1820
|
app_url: str | None = None,
|
|
1766
1821
|
api_key: str | None = None,
|
|
@@ -2323,30 +2378,6 @@ def _enrich_attachments(event: TMutableMapping) -> TMutableMapping:
|
|
|
2323
2378
|
|
|
2324
2379
|
|
|
2325
2380
|
def _validate_and_sanitize_experiment_log_partial_args(event: Mapping[str, Any]) -> dict[str, Any]:
|
|
2326
|
-
# Make sure only certain keys are specified.
|
|
2327
|
-
forbidden_keys = set(event.keys()) - {
|
|
2328
|
-
"input",
|
|
2329
|
-
"output",
|
|
2330
|
-
"expected",
|
|
2331
|
-
"tags",
|
|
2332
|
-
"scores",
|
|
2333
|
-
"metadata",
|
|
2334
|
-
"metrics",
|
|
2335
|
-
"error",
|
|
2336
|
-
"dataset_record_id",
|
|
2337
|
-
"origin",
|
|
2338
|
-
"inputs",
|
|
2339
|
-
"span_attributes",
|
|
2340
|
-
ASYNC_SCORING_CONTROL_FIELD,
|
|
2341
|
-
MERGE_PATHS_FIELD,
|
|
2342
|
-
SKIP_ASYNC_SCORING_FIELD,
|
|
2343
|
-
"span_id",
|
|
2344
|
-
"root_span_id",
|
|
2345
|
-
"_bt_internal_override_pagination_key",
|
|
2346
|
-
}
|
|
2347
|
-
if forbidden_keys:
|
|
2348
|
-
raise ValueError(f"The following keys are not permitted: {forbidden_keys}")
|
|
2349
|
-
|
|
2350
2381
|
scores = event.get("scores")
|
|
2351
2382
|
if scores:
|
|
2352
2383
|
for name, score in scores.items():
|
|
@@ -3855,6 +3886,21 @@ class SpanImpl(Span):
|
|
|
3855
3886
|
if serializable_partial_record.get("metrics", {}).get("end") is not None:
|
|
3856
3887
|
self._logged_end_time = serializable_partial_record["metrics"]["end"]
|
|
3857
3888
|
|
|
3889
|
+
# Write to local span cache for scorer access
|
|
3890
|
+
# Only cache experiment spans - regular logs don't need caching
|
|
3891
|
+
if self.parent_object_type == SpanObjectTypeV3.EXPERIMENT:
|
|
3892
|
+
from braintrust.span_cache import CachedSpan
|
|
3893
|
+
|
|
3894
|
+
cached_span = CachedSpan(
|
|
3895
|
+
span_id=self.span_id,
|
|
3896
|
+
input=serializable_partial_record.get("input"),
|
|
3897
|
+
output=serializable_partial_record.get("output"),
|
|
3898
|
+
metadata=serializable_partial_record.get("metadata"),
|
|
3899
|
+
span_parents=self.span_parents,
|
|
3900
|
+
span_attributes=serializable_partial_record.get("span_attributes"),
|
|
3901
|
+
)
|
|
3902
|
+
self.state.span_cache.queue_write(self.root_span_id, self.span_id, cached_span)
|
|
3903
|
+
|
|
3858
3904
|
def compute_record() -> dict[str, Any]:
|
|
3859
3905
|
exporter = _get_exporter()
|
|
3860
3906
|
return dict(
|