braintrust 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- braintrust/_generated_types.py +328 -126
- braintrust/cli/install/api.py +1 -1
- braintrust/conftest.py +24 -0
- braintrust/devserver/test_server_integration.py +0 -11
- braintrust/framework.py +98 -1
- braintrust/functions/invoke.py +4 -9
- braintrust/functions/test_invoke.py +61 -0
- braintrust/generated_types.py +13 -7
- braintrust/logger.py +107 -66
- braintrust/prompt_cache/test_disk_cache.py +3 -3
- braintrust/span_cache.py +337 -0
- braintrust/span_identifier_v3.py +21 -0
- braintrust/span_types.py +3 -0
- braintrust/test_bt_json.py +23 -19
- braintrust/test_logger.py +116 -0
- braintrust/test_span_cache.py +344 -0
- braintrust/test_trace.py +267 -0
- braintrust/trace.py +385 -0
- braintrust/version.py +2 -2
- braintrust/wrappers/claude_agent_sdk/_wrapper.py +48 -6
- braintrust/wrappers/claude_agent_sdk/test_wrapper.py +106 -0
- braintrust/wrappers/langsmith_wrapper.py +517 -0
- braintrust/wrappers/test_agno.py +0 -12
- braintrust/wrappers/test_anthropic.py +1 -11
- braintrust/wrappers/test_dspy.py +0 -11
- braintrust/wrappers/test_google_genai.py +6 -1
- braintrust/wrappers/test_langsmith_wrapper.py +338 -0
- braintrust/wrappers/test_litellm.py +0 -10
- braintrust/wrappers/test_oai_attachments.py +0 -10
- braintrust/wrappers/test_openai.py +3 -12
- braintrust/wrappers/test_openrouter.py +0 -9
- braintrust/wrappers/test_pydantic_ai_integration.py +0 -11
- braintrust/wrappers/test_pydantic_ai_wrap_openai.py +2 -0
- {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/METADATA +1 -1
- {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/RECORD +38 -31
- {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/WHEEL +1 -1
- {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/entry_points.txt +0 -0
- {braintrust-0.4.2.dist-info → braintrust-0.5.0.dist-info}/top_level.txt +0 -0
braintrust/cli/install/api.py
CHANGED
|
@@ -326,7 +326,7 @@ def main(args):
|
|
|
326
326
|
textwrap.dedent(
|
|
327
327
|
f"""\
|
|
328
328
|
Stack with name {args.name} does not exist. Either create it manually by following
|
|
329
|
-
https://www.braintrust.dev/docs/
|
|
329
|
+
https://www.braintrust.dev/docs/admin/self-hosting/aws or use the --create flag."""
|
|
330
330
|
)
|
|
331
331
|
)
|
|
332
332
|
exit(1)
|
braintrust/conftest.py
CHANGED
|
@@ -46,3 +46,27 @@ def reset_braintrust_state():
|
|
|
46
46
|
from braintrust import logger
|
|
47
47
|
|
|
48
48
|
logger._state = logger.BraintrustState()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@pytest.fixture(scope="session")
|
|
52
|
+
def vcr_config():
|
|
53
|
+
"""
|
|
54
|
+
VCR configuration for recording/playing back HTTP interactions.
|
|
55
|
+
|
|
56
|
+
In CI, use "none" to fail if cassette is missing.
|
|
57
|
+
Locally, use "once" to record new cassettes if they don't exist.
|
|
58
|
+
"""
|
|
59
|
+
record_mode = "none" if (os.environ.get("CI") or os.environ.get("GITHUB_ACTIONS")) else "once"
|
|
60
|
+
|
|
61
|
+
return {
|
|
62
|
+
"record_mode": record_mode,
|
|
63
|
+
"filter_headers": [
|
|
64
|
+
"authorization",
|
|
65
|
+
"openai-organization",
|
|
66
|
+
"x-api-key",
|
|
67
|
+
"api-key",
|
|
68
|
+
"openai-api-key",
|
|
69
|
+
"x-goog-api-key",
|
|
70
|
+
"x-bt-auth-token",
|
|
71
|
+
],
|
|
72
|
+
}
|
|
@@ -8,17 +8,6 @@ from braintrust.framework import _evals
|
|
|
8
8
|
from braintrust.test_helpers import has_devserver_installed
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
@pytest.fixture(scope="module")
|
|
12
|
-
def vcr_config():
|
|
13
|
-
"""VCR configuration to filter sensitive headers."""
|
|
14
|
-
return {
|
|
15
|
-
"filter_headers": [
|
|
16
|
-
"x-bt-auth-token",
|
|
17
|
-
"authorization",
|
|
18
|
-
]
|
|
19
|
-
}
|
|
20
|
-
|
|
21
|
-
|
|
22
11
|
@pytest.fixture
|
|
23
12
|
def client():
|
|
24
13
|
"""Create test client using the real simple_eval.py example."""
|
braintrust/framework.py
CHANGED
|
@@ -1280,6 +1280,29 @@ async def _run_evaluator_internal(
|
|
|
1280
1280
|
filters: list[Filter],
|
|
1281
1281
|
stream: Callable[[SSEProgressEvent], None] | None = None,
|
|
1282
1282
|
state: BraintrustState | None = None,
|
|
1283
|
+
):
|
|
1284
|
+
# Start span cache for this eval (it's disabled by default to avoid temp files outside of evals)
|
|
1285
|
+
if state is None:
|
|
1286
|
+
from braintrust.logger import _internal_get_global_state
|
|
1287
|
+
|
|
1288
|
+
state = _internal_get_global_state()
|
|
1289
|
+
|
|
1290
|
+
state.span_cache.start()
|
|
1291
|
+
try:
|
|
1292
|
+
return await _run_evaluator_internal_impl(experiment, evaluator, position, filters, stream, state)
|
|
1293
|
+
finally:
|
|
1294
|
+
# Clean up disk-based span cache after eval completes and stop caching
|
|
1295
|
+
state.span_cache.dispose()
|
|
1296
|
+
state.span_cache.stop()
|
|
1297
|
+
|
|
1298
|
+
|
|
1299
|
+
async def _run_evaluator_internal_impl(
|
|
1300
|
+
experiment,
|
|
1301
|
+
evaluator: Evaluator,
|
|
1302
|
+
position: int | None,
|
|
1303
|
+
filters: list[Filter],
|
|
1304
|
+
stream: Callable[[SSEProgressEvent], None] | None = None,
|
|
1305
|
+
state: BraintrustState | None = None,
|
|
1283
1306
|
):
|
|
1284
1307
|
event_loop = asyncio.get_event_loop()
|
|
1285
1308
|
|
|
@@ -1290,11 +1313,13 @@ async def _run_evaluator_internal(
|
|
|
1290
1313
|
{**parent_propagated},
|
|
1291
1314
|
{"span_attributes": {"purpose": "scorer"}},
|
|
1292
1315
|
)
|
|
1316
|
+
# Strip trace from logged input - it's internal plumbing that shouldn't appear in spans
|
|
1317
|
+
logged_input = {k: v for k, v in kwargs.items() if k != "trace"}
|
|
1293
1318
|
with root_span.start_span(
|
|
1294
1319
|
name=name,
|
|
1295
1320
|
span_attributes={"type": SpanTypeAttribute.SCORE, "purpose": "scorer"},
|
|
1296
1321
|
propagated_event=merged_propagated,
|
|
1297
|
-
input=
|
|
1322
|
+
input=logged_input,
|
|
1298
1323
|
) as span:
|
|
1299
1324
|
score = scorer
|
|
1300
1325
|
if hasattr(scorer, "eval_async"):
|
|
@@ -1415,6 +1440,77 @@ async def _run_evaluator_internal(
|
|
|
1415
1440
|
tags = hooks.tags if hooks.tags else None
|
|
1416
1441
|
root_span.log(output=output, metadata=metadata, tags=tags)
|
|
1417
1442
|
|
|
1443
|
+
# Create trace object for scorers
|
|
1444
|
+
from braintrust.trace import LocalTrace
|
|
1445
|
+
|
|
1446
|
+
async def ensure_spans_flushed():
|
|
1447
|
+
# Flush native Braintrust spans
|
|
1448
|
+
if experiment:
|
|
1449
|
+
await asyncio.get_event_loop().run_in_executor(
|
|
1450
|
+
None, lambda: experiment.state.flush()
|
|
1451
|
+
)
|
|
1452
|
+
elif state:
|
|
1453
|
+
await asyncio.get_event_loop().run_in_executor(None, lambda: state.flush())
|
|
1454
|
+
else:
|
|
1455
|
+
from braintrust.logger import flush as flush_logger
|
|
1456
|
+
|
|
1457
|
+
await asyncio.get_event_loop().run_in_executor(None, flush_logger)
|
|
1458
|
+
|
|
1459
|
+
# Also flush OTEL spans if registered
|
|
1460
|
+
if state:
|
|
1461
|
+
await state.flush_otel()
|
|
1462
|
+
|
|
1463
|
+
experiment_id = None
|
|
1464
|
+
if experiment:
|
|
1465
|
+
try:
|
|
1466
|
+
experiment_id = experiment.id
|
|
1467
|
+
except:
|
|
1468
|
+
experiment_id = None
|
|
1469
|
+
|
|
1470
|
+
trace = None
|
|
1471
|
+
if state or experiment:
|
|
1472
|
+
# Get the state to use
|
|
1473
|
+
trace_state = state
|
|
1474
|
+
if not trace_state and experiment:
|
|
1475
|
+
trace_state = experiment.state
|
|
1476
|
+
if not trace_state:
|
|
1477
|
+
# Fall back to global state
|
|
1478
|
+
from braintrust.logger import _internal_get_global_state
|
|
1479
|
+
|
|
1480
|
+
trace_state = _internal_get_global_state()
|
|
1481
|
+
|
|
1482
|
+
# Access root_span_id from the concrete SpanImpl instance
|
|
1483
|
+
# The Span interface doesn't expose this but SpanImpl has it
|
|
1484
|
+
root_span_id_value = getattr(root_span, "root_span_id", root_span.id)
|
|
1485
|
+
|
|
1486
|
+
# Check if there's a parent in the context to determine object_type and object_id
|
|
1487
|
+
from braintrust.span_identifier_v3 import SpanComponentsV3, span_object_type_v3_to_typed_string
|
|
1488
|
+
|
|
1489
|
+
parent_str = trace_state.current_parent.get()
|
|
1490
|
+
parent_components = None
|
|
1491
|
+
if parent_str:
|
|
1492
|
+
try:
|
|
1493
|
+
parent_components = SpanComponentsV3.from_str(parent_str)
|
|
1494
|
+
except Exception:
|
|
1495
|
+
# If parsing fails, parent_components stays None
|
|
1496
|
+
pass
|
|
1497
|
+
|
|
1498
|
+
# Determine object_type and object_id based on parent or experiment
|
|
1499
|
+
if parent_components:
|
|
1500
|
+
trace_object_type = span_object_type_v3_to_typed_string(parent_components.object_type)
|
|
1501
|
+
trace_object_id = parent_components.object_id or ""
|
|
1502
|
+
else:
|
|
1503
|
+
trace_object_type = "experiment"
|
|
1504
|
+
trace_object_id = experiment_id or ""
|
|
1505
|
+
|
|
1506
|
+
trace = LocalTrace(
|
|
1507
|
+
object_type=trace_object_type,
|
|
1508
|
+
object_id=trace_object_id,
|
|
1509
|
+
root_span_id=root_span_id_value,
|
|
1510
|
+
ensure_spans_flushed=ensure_spans_flushed,
|
|
1511
|
+
state=trace_state,
|
|
1512
|
+
)
|
|
1513
|
+
|
|
1418
1514
|
score_promises = [
|
|
1419
1515
|
asyncio.create_task(
|
|
1420
1516
|
await_or_run_scorer(
|
|
@@ -1426,6 +1522,7 @@ async def _run_evaluator_internal(
|
|
|
1426
1522
|
"expected": datum.expected,
|
|
1427
1523
|
"metadata": metadata,
|
|
1428
1524
|
"output": output,
|
|
1525
|
+
"trace": trace,
|
|
1429
1526
|
},
|
|
1430
1527
|
)
|
|
1431
1528
|
)
|
braintrust/functions/invoke.py
CHANGED
|
@@ -2,8 +2,8 @@ from typing import Any, Literal, TypedDict, TypeVar, overload
|
|
|
2
2
|
|
|
3
3
|
from sseclient import SSEClient
|
|
4
4
|
|
|
5
|
-
from .._generated_types import FunctionTypeEnum
|
|
6
|
-
from ..logger import Exportable, get_span_parent_object, login, proxy_conn
|
|
5
|
+
from .._generated_types import FunctionTypeEnum
|
|
6
|
+
from ..logger import Exportable, _internal_get_global_state, get_span_parent_object, login, proxy_conn
|
|
7
7
|
from ..util import response_raise_for_status
|
|
8
8
|
from .constants import INVOKE_API_VERSION
|
|
9
9
|
from .stream import BraintrustInvokeError, BraintrustStream
|
|
@@ -43,7 +43,6 @@ def invoke(
|
|
|
43
43
|
# arguments to the function
|
|
44
44
|
input: Any = None,
|
|
45
45
|
messages: list[Any] | None = None,
|
|
46
|
-
context: InvokeContext | None = None,
|
|
47
46
|
metadata: dict[str, Any] | None = None,
|
|
48
47
|
tags: list[str] | None = None,
|
|
49
48
|
parent: Exportable | str | None = None,
|
|
@@ -72,7 +71,6 @@ def invoke(
|
|
|
72
71
|
# arguments to the function
|
|
73
72
|
input: Any = None,
|
|
74
73
|
messages: list[Any] | None = None,
|
|
75
|
-
context: InvokeContext | None = None,
|
|
76
74
|
metadata: dict[str, Any] | None = None,
|
|
77
75
|
tags: list[str] | None = None,
|
|
78
76
|
parent: Exportable | str | None = None,
|
|
@@ -100,7 +98,6 @@ def invoke(
|
|
|
100
98
|
# arguments to the function
|
|
101
99
|
input: Any = None,
|
|
102
100
|
messages: list[Any] | None = None,
|
|
103
|
-
context: InvokeContext | None = None,
|
|
104
101
|
metadata: dict[str, Any] | None = None,
|
|
105
102
|
tags: list[str] | None = None,
|
|
106
103
|
parent: Exportable | str | None = None,
|
|
@@ -119,8 +116,6 @@ def invoke(
|
|
|
119
116
|
Args:
|
|
120
117
|
input: The input to the function. This will be logged as the `input` field in the span.
|
|
121
118
|
messages: Additional OpenAI-style messages to add to the prompt (only works for llm functions).
|
|
122
|
-
context: Context for functions that operate on spans/traces (e.g., facets). Should contain
|
|
123
|
-
`object_type`, `object_id`, and `scope` fields.
|
|
124
119
|
metadata: Additional metadata to add to the span. This will be logged as the `metadata` field in the span.
|
|
125
120
|
It will also be available as the {{metadata}} field in the prompt and as the `metadata` argument
|
|
126
121
|
to the function.
|
|
@@ -195,8 +190,6 @@ def invoke(
|
|
|
195
190
|
)
|
|
196
191
|
if messages is not None:
|
|
197
192
|
request["messages"] = messages
|
|
198
|
-
if context is not None:
|
|
199
|
-
request["context"] = context
|
|
200
193
|
if mode is not None:
|
|
201
194
|
request["mode"] = mode
|
|
202
195
|
if strict is not None:
|
|
@@ -250,6 +243,8 @@ def init_function(project_name: str, slug: str, version: str | None = None):
|
|
|
250
243
|
:param version: Optional version of the function to use. Defaults to latest.
|
|
251
244
|
:return: A function that can be used as a task or scorer.
|
|
252
245
|
"""
|
|
246
|
+
# Disable span cache since remote function spans won't be in the local cache
|
|
247
|
+
_internal_get_global_state().span_cache.disable()
|
|
253
248
|
|
|
254
249
|
def f(*args: Any, **kwargs: Any) -> Any:
|
|
255
250
|
if len(args) > 0:
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""Tests for the invoke module, particularly init_function."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
from braintrust.functions.invoke import init_function
|
|
5
|
+
from braintrust.logger import _internal_get_global_state, _internal_reset_global_state
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class TestInitFunction:
|
|
9
|
+
"""Tests for init_function."""
|
|
10
|
+
|
|
11
|
+
def setup_method(self):
|
|
12
|
+
"""Reset state before each test."""
|
|
13
|
+
_internal_reset_global_state()
|
|
14
|
+
|
|
15
|
+
def teardown_method(self):
|
|
16
|
+
"""Clean up after each test."""
|
|
17
|
+
_internal_reset_global_state()
|
|
18
|
+
|
|
19
|
+
def test_init_function_disables_span_cache(self):
|
|
20
|
+
"""Test that init_function disables the span cache."""
|
|
21
|
+
state = _internal_get_global_state()
|
|
22
|
+
|
|
23
|
+
# Cache should be disabled by default (it's only enabled during evals)
|
|
24
|
+
assert state.span_cache.disabled is True
|
|
25
|
+
|
|
26
|
+
# Enable the cache (simulating what happens during eval)
|
|
27
|
+
state.span_cache.start()
|
|
28
|
+
assert state.span_cache.disabled is False
|
|
29
|
+
|
|
30
|
+
# Call init_function
|
|
31
|
+
f = init_function("test-project", "test-function")
|
|
32
|
+
|
|
33
|
+
# Cache should now be disabled (init_function explicitly disables it)
|
|
34
|
+
assert state.span_cache.disabled is True
|
|
35
|
+
assert f.__name__ == "init_function-test-project-test-function-latest"
|
|
36
|
+
|
|
37
|
+
def test_init_function_with_version(self):
|
|
38
|
+
"""Test that init_function creates a function with the correct name including version."""
|
|
39
|
+
f = init_function("my-project", "my-scorer", version="v1")
|
|
40
|
+
assert f.__name__ == "init_function-my-project-my-scorer-v1"
|
|
41
|
+
|
|
42
|
+
def test_init_function_without_version_uses_latest(self):
|
|
43
|
+
"""Test that init_function uses 'latest' in name when version not specified."""
|
|
44
|
+
f = init_function("my-project", "my-scorer")
|
|
45
|
+
assert f.__name__ == "init_function-my-project-my-scorer-latest"
|
|
46
|
+
|
|
47
|
+
def test_init_function_permanently_disables_cache(self):
|
|
48
|
+
"""Test that init_function permanently disables the cache (can't be re-enabled)."""
|
|
49
|
+
state = _internal_get_global_state()
|
|
50
|
+
|
|
51
|
+
# Enable the cache
|
|
52
|
+
state.span_cache.start()
|
|
53
|
+
assert state.span_cache.disabled is False
|
|
54
|
+
|
|
55
|
+
# Call init_function
|
|
56
|
+
init_function("test-project", "test-function")
|
|
57
|
+
assert state.span_cache.disabled is True
|
|
58
|
+
|
|
59
|
+
# Try to start again - should still be disabled because of explicit disable
|
|
60
|
+
state.span_cache.start()
|
|
61
|
+
assert state.span_cache.disabled is True
|
braintrust/generated_types.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""Auto-generated file (internal git SHA
|
|
1
|
+
"""Auto-generated file (internal git SHA 21146f64bf5ad1eadd3a99d186274728e25e5399) -- do not modify"""
|
|
2
2
|
|
|
3
3
|
from ._generated_types import (
|
|
4
4
|
Acl,
|
|
@@ -10,6 +10,7 @@ from ._generated_types import (
|
|
|
10
10
|
AsyncScoringState,
|
|
11
11
|
AttachmentReference,
|
|
12
12
|
AttachmentStatus,
|
|
13
|
+
BatchedFacetData,
|
|
13
14
|
BraintrustAttachmentReference,
|
|
14
15
|
BraintrustModelParams,
|
|
15
16
|
CallEvent,
|
|
@@ -28,6 +29,9 @@ from ._generated_types import (
|
|
|
28
29
|
Dataset,
|
|
29
30
|
DatasetEvent,
|
|
30
31
|
EnvVar,
|
|
32
|
+
EvalStatusPage,
|
|
33
|
+
EvalStatusPageConfig,
|
|
34
|
+
EvalStatusPageTheme,
|
|
31
35
|
Experiment,
|
|
32
36
|
ExperimentEvent,
|
|
33
37
|
ExtendedSavedFunctionId,
|
|
@@ -47,15 +51,13 @@ from ._generated_types import (
|
|
|
47
51
|
GraphEdge,
|
|
48
52
|
GraphNode,
|
|
49
53
|
Group,
|
|
54
|
+
GroupScope,
|
|
50
55
|
IfExists,
|
|
51
|
-
InvokeContext,
|
|
52
56
|
InvokeFunction,
|
|
53
57
|
InvokeParent,
|
|
54
|
-
InvokeScope,
|
|
55
58
|
MCPServer,
|
|
56
59
|
MessageRole,
|
|
57
60
|
ModelParams,
|
|
58
|
-
NullableFunctionTypeEnum,
|
|
59
61
|
NullableSavedFunctionId,
|
|
60
62
|
ObjectReference,
|
|
61
63
|
ObjectReferenceNullish,
|
|
@@ -99,6 +101,7 @@ from ._generated_types import (
|
|
|
99
101
|
StreamingMode,
|
|
100
102
|
ToolFunctionDefinition,
|
|
101
103
|
TraceScope,
|
|
104
|
+
TriggeredFunctionState,
|
|
102
105
|
UploadStatus,
|
|
103
106
|
User,
|
|
104
107
|
View,
|
|
@@ -117,6 +120,7 @@ __all__ = [
|
|
|
117
120
|
"AsyncScoringState",
|
|
118
121
|
"AttachmentReference",
|
|
119
122
|
"AttachmentStatus",
|
|
123
|
+
"BatchedFacetData",
|
|
120
124
|
"BraintrustAttachmentReference",
|
|
121
125
|
"BraintrustModelParams",
|
|
122
126
|
"CallEvent",
|
|
@@ -135,6 +139,9 @@ __all__ = [
|
|
|
135
139
|
"Dataset",
|
|
136
140
|
"DatasetEvent",
|
|
137
141
|
"EnvVar",
|
|
142
|
+
"EvalStatusPage",
|
|
143
|
+
"EvalStatusPageConfig",
|
|
144
|
+
"EvalStatusPageTheme",
|
|
138
145
|
"Experiment",
|
|
139
146
|
"ExperimentEvent",
|
|
140
147
|
"ExtendedSavedFunctionId",
|
|
@@ -154,15 +161,13 @@ __all__ = [
|
|
|
154
161
|
"GraphEdge",
|
|
155
162
|
"GraphNode",
|
|
156
163
|
"Group",
|
|
164
|
+
"GroupScope",
|
|
157
165
|
"IfExists",
|
|
158
|
-
"InvokeContext",
|
|
159
166
|
"InvokeFunction",
|
|
160
167
|
"InvokeParent",
|
|
161
|
-
"InvokeScope",
|
|
162
168
|
"MCPServer",
|
|
163
169
|
"MessageRole",
|
|
164
170
|
"ModelParams",
|
|
165
|
-
"NullableFunctionTypeEnum",
|
|
166
171
|
"NullableSavedFunctionId",
|
|
167
172
|
"ObjectReference",
|
|
168
173
|
"ObjectReferenceNullish",
|
|
@@ -206,6 +211,7 @@ __all__ = [
|
|
|
206
211
|
"StreamingMode",
|
|
207
212
|
"ToolFunctionDefinition",
|
|
208
213
|
"TraceScope",
|
|
214
|
+
"TriggeredFunctionState",
|
|
209
215
|
"UploadStatus",
|
|
210
216
|
"User",
|
|
211
217
|
"View",
|
braintrust/logger.py
CHANGED
|
@@ -47,12 +47,9 @@ from urllib3.util.retry import Retry
|
|
|
47
47
|
from . import context, id_gen
|
|
48
48
|
from .bt_json import bt_dumps, bt_safe_deep_copy
|
|
49
49
|
from .db_fields import (
|
|
50
|
-
ASYNC_SCORING_CONTROL_FIELD,
|
|
51
50
|
AUDIT_METADATA_FIELD,
|
|
52
51
|
AUDIT_SOURCE_FIELD,
|
|
53
52
|
IS_MERGE_FIELD,
|
|
54
|
-
MERGE_PATHS_FIELD,
|
|
55
|
-
SKIP_ASYNC_SCORING_FIELD,
|
|
56
53
|
TRANSACTION_ID_FIELD,
|
|
57
54
|
VALID_SOURCES,
|
|
58
55
|
)
|
|
@@ -101,6 +98,14 @@ from .xact_ids import prettify_xact
|
|
|
101
98
|
Metadata = dict[str, Any]
|
|
102
99
|
DATA_API_VERSION = 2
|
|
103
100
|
|
|
101
|
+
|
|
102
|
+
class DatasetRef(TypedDict, total=False):
|
|
103
|
+
"""Reference to a dataset by ID and optional version."""
|
|
104
|
+
|
|
105
|
+
id: str
|
|
106
|
+
version: str
|
|
107
|
+
|
|
108
|
+
|
|
104
109
|
T = TypeVar("T")
|
|
105
110
|
TMapping = TypeVar("TMapping", bound=Mapping[str, Any])
|
|
106
111
|
TMutableMapping = TypeVar("TMutableMapping", bound=MutableMapping[str, Any])
|
|
@@ -396,6 +401,11 @@ class BraintrustState:
|
|
|
396
401
|
),
|
|
397
402
|
)
|
|
398
403
|
|
|
404
|
+
from braintrust.span_cache import SpanCache
|
|
405
|
+
|
|
406
|
+
self.span_cache = SpanCache()
|
|
407
|
+
self._otel_flush_callback: Any | None = None
|
|
408
|
+
|
|
399
409
|
def reset_login_info(self):
|
|
400
410
|
self.app_url: str | None = None
|
|
401
411
|
self.app_public_url: str | None = None
|
|
@@ -452,26 +462,39 @@ class BraintrustState:
|
|
|
452
462
|
|
|
453
463
|
return self._context_manager
|
|
454
464
|
|
|
465
|
+
def register_otel_flush(self, callback: Any) -> None:
|
|
466
|
+
"""
|
|
467
|
+
Register an OTEL flush callback. This is called by the OTEL integration
|
|
468
|
+
when it initializes a span processor/exporter.
|
|
469
|
+
"""
|
|
470
|
+
self._otel_flush_callback = callback
|
|
471
|
+
|
|
472
|
+
async def flush_otel(self) -> None:
|
|
473
|
+
"""
|
|
474
|
+
Flush OTEL spans if a callback is registered.
|
|
475
|
+
Called during ensure_spans_flushed to ensure OTEL spans are visible in BTQL.
|
|
476
|
+
"""
|
|
477
|
+
if self._otel_flush_callback:
|
|
478
|
+
await self._otel_flush_callback()
|
|
479
|
+
|
|
455
480
|
def copy_state(self, other: "BraintrustState"):
|
|
456
481
|
"""Copy login information from another BraintrustState instance."""
|
|
457
|
-
self.__dict__.update(
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
}
|
|
474
|
-
)
|
|
482
|
+
self.__dict__.update({
|
|
483
|
+
k: v
|
|
484
|
+
for (k, v) in other.__dict__.items()
|
|
485
|
+
if k
|
|
486
|
+
not in (
|
|
487
|
+
"current_experiment",
|
|
488
|
+
"current_logger",
|
|
489
|
+
"current_parent",
|
|
490
|
+
"current_span",
|
|
491
|
+
"_global_bg_logger",
|
|
492
|
+
"_override_bg_logger",
|
|
493
|
+
"_context_manager",
|
|
494
|
+
"_last_otel_setting",
|
|
495
|
+
"_context_manager_lock",
|
|
496
|
+
)
|
|
497
|
+
})
|
|
475
498
|
|
|
476
499
|
def login(
|
|
477
500
|
self,
|
|
@@ -1299,7 +1322,7 @@ def init(
|
|
|
1299
1322
|
project: str | None = None,
|
|
1300
1323
|
experiment: str | None = None,
|
|
1301
1324
|
description: str | None = None,
|
|
1302
|
-
dataset: Optional["Dataset"] = None,
|
|
1325
|
+
dataset: Optional["Dataset"] | DatasetRef = None,
|
|
1303
1326
|
open: bool = False,
|
|
1304
1327
|
base_experiment: str | None = None,
|
|
1305
1328
|
is_public: bool = False,
|
|
@@ -1412,12 +1435,19 @@ def init(
|
|
|
1412
1435
|
args["base_exp_id"] = base_experiment_id
|
|
1413
1436
|
elif base_experiment is not None:
|
|
1414
1437
|
args["base_experiment"] = base_experiment
|
|
1415
|
-
|
|
1438
|
+
elif merged_git_metadata_settings and merged_git_metadata_settings.collect != "none":
|
|
1416
1439
|
args["ancestor_commits"] = list(get_past_n_ancestors())
|
|
1417
1440
|
|
|
1418
1441
|
if dataset is not None:
|
|
1419
|
-
|
|
1420
|
-
|
|
1442
|
+
if isinstance(dataset, dict):
|
|
1443
|
+
# Simple {"id": ..., "version": ...} dict
|
|
1444
|
+
args["dataset_id"] = dataset["id"]
|
|
1445
|
+
if "version" in dataset:
|
|
1446
|
+
args["dataset_version"] = dataset["version"]
|
|
1447
|
+
else:
|
|
1448
|
+
# Full Dataset object
|
|
1449
|
+
args["dataset_id"] = dataset.id
|
|
1450
|
+
args["dataset_version"] = dataset.version
|
|
1421
1451
|
|
|
1422
1452
|
if is_public is not None:
|
|
1423
1453
|
args["public"] = is_public
|
|
@@ -1448,7 +1478,11 @@ def init(
|
|
|
1448
1478
|
# For experiments, disable queue size limit enforcement (unlimited queue)
|
|
1449
1479
|
state.enforce_queue_size_limit(False)
|
|
1450
1480
|
|
|
1451
|
-
ret = Experiment(
|
|
1481
|
+
ret = Experiment(
|
|
1482
|
+
lazy_metadata=LazyValue(compute_metadata, use_mutex=True),
|
|
1483
|
+
dataset=dataset if isinstance(dataset, Dataset) else None,
|
|
1484
|
+
state=state,
|
|
1485
|
+
)
|
|
1452
1486
|
if set_current:
|
|
1453
1487
|
state.current_experiment = ret
|
|
1454
1488
|
return ret
|
|
@@ -1763,6 +1797,25 @@ def login(
|
|
|
1763
1797
|
_state.login(app_url=app_url, api_key=api_key, org_name=org_name, force_login=force_login)
|
|
1764
1798
|
|
|
1765
1799
|
|
|
1800
|
+
def register_otel_flush(callback: Any) -> None:
|
|
1801
|
+
"""
|
|
1802
|
+
Register a callback to flush OTEL spans. This is called by the OTEL integration
|
|
1803
|
+
when it initializes a span processor/exporter.
|
|
1804
|
+
|
|
1805
|
+
When ensure_spans_flushed is called (e.g., before a BTQL query in scorers),
|
|
1806
|
+
this callback will be invoked to ensure OTEL spans are flushed to the server.
|
|
1807
|
+
|
|
1808
|
+
Also disables the span cache, since OTEL spans aren't in the local cache
|
|
1809
|
+
and we need BTQL to see the complete span tree (both native + OTEL spans).
|
|
1810
|
+
|
|
1811
|
+
:param callback: The async callback function to flush OTEL spans.
|
|
1812
|
+
"""
|
|
1813
|
+
global _state
|
|
1814
|
+
_state.register_otel_flush(callback)
|
|
1815
|
+
# Disable span cache since OTEL spans aren't in the local cache
|
|
1816
|
+
_state.span_cache.disable()
|
|
1817
|
+
|
|
1818
|
+
|
|
1766
1819
|
def login_to_state(
|
|
1767
1820
|
app_url: str | None = None,
|
|
1768
1821
|
api_key: str | None = None,
|
|
@@ -2325,29 +2378,6 @@ def _enrich_attachments(event: TMutableMapping) -> TMutableMapping:
|
|
|
2325
2378
|
|
|
2326
2379
|
|
|
2327
2380
|
def _validate_and_sanitize_experiment_log_partial_args(event: Mapping[str, Any]) -> dict[str, Any]:
|
|
2328
|
-
# Make sure only certain keys are specified.
|
|
2329
|
-
forbidden_keys = set(event.keys()) - {
|
|
2330
|
-
"input",
|
|
2331
|
-
"output",
|
|
2332
|
-
"expected",
|
|
2333
|
-
"tags",
|
|
2334
|
-
"scores",
|
|
2335
|
-
"metadata",
|
|
2336
|
-
"metrics",
|
|
2337
|
-
"error",
|
|
2338
|
-
"dataset_record_id",
|
|
2339
|
-
"origin",
|
|
2340
|
-
"inputs",
|
|
2341
|
-
"span_attributes",
|
|
2342
|
-
ASYNC_SCORING_CONTROL_FIELD,
|
|
2343
|
-
MERGE_PATHS_FIELD,
|
|
2344
|
-
SKIP_ASYNC_SCORING_FIELD,
|
|
2345
|
-
"span_id",
|
|
2346
|
-
"root_span_id",
|
|
2347
|
-
}
|
|
2348
|
-
if forbidden_keys:
|
|
2349
|
-
raise ValueError(f"The following keys are not permitted: {forbidden_keys}")
|
|
2350
|
-
|
|
2351
2381
|
scores = event.get("scores")
|
|
2352
2382
|
if scores:
|
|
2353
2383
|
for name, score in scores.items():
|
|
@@ -3856,6 +3886,21 @@ class SpanImpl(Span):
|
|
|
3856
3886
|
if serializable_partial_record.get("metrics", {}).get("end") is not None:
|
|
3857
3887
|
self._logged_end_time = serializable_partial_record["metrics"]["end"]
|
|
3858
3888
|
|
|
3889
|
+
# Write to local span cache for scorer access
|
|
3890
|
+
# Only cache experiment spans - regular logs don't need caching
|
|
3891
|
+
if self.parent_object_type == SpanObjectTypeV3.EXPERIMENT:
|
|
3892
|
+
from braintrust.span_cache import CachedSpan
|
|
3893
|
+
|
|
3894
|
+
cached_span = CachedSpan(
|
|
3895
|
+
span_id=self.span_id,
|
|
3896
|
+
input=serializable_partial_record.get("input"),
|
|
3897
|
+
output=serializable_partial_record.get("output"),
|
|
3898
|
+
metadata=serializable_partial_record.get("metadata"),
|
|
3899
|
+
span_parents=self.span_parents,
|
|
3900
|
+
span_attributes=serializable_partial_record.get("span_attributes"),
|
|
3901
|
+
)
|
|
3902
|
+
self.state.span_cache.queue_write(self.root_span_id, self.span_id, cached_span)
|
|
3903
|
+
|
|
3859
3904
|
def compute_record() -> dict[str, Any]:
|
|
3860
3905
|
exporter = _get_exporter()
|
|
3861
3906
|
return dict(
|
|
@@ -4403,24 +4448,20 @@ def render_message(render: Callable[[str], str], message: PromptMessage):
|
|
|
4403
4448
|
if c["type"] == "text":
|
|
4404
4449
|
rendered_content.append({**c, "text": render(c["text"])})
|
|
4405
4450
|
elif c["type"] == "image_url":
|
|
4406
|
-
rendered_content.append(
|
|
4407
|
-
|
|
4408
|
-
|
|
4409
|
-
|
|
4410
|
-
}
|
|
4411
|
-
)
|
|
4451
|
+
rendered_content.append({
|
|
4452
|
+
**c,
|
|
4453
|
+
"image_url": {**c["image_url"], "url": render(c["image_url"]["url"])},
|
|
4454
|
+
})
|
|
4412
4455
|
elif c["type"] == "file":
|
|
4413
|
-
rendered_content.append(
|
|
4414
|
-
|
|
4415
|
-
|
|
4416
|
-
"file"
|
|
4417
|
-
|
|
4418
|
-
|
|
4419
|
-
|
|
4420
|
-
|
|
4421
|
-
|
|
4422
|
-
}
|
|
4423
|
-
)
|
|
4456
|
+
rendered_content.append({
|
|
4457
|
+
**c,
|
|
4458
|
+
"file": {
|
|
4459
|
+
**c["file"],
|
|
4460
|
+
"file_data": render(c["file"]["file_data"]),
|
|
4461
|
+
**({} if "file_id" not in c["file"] else {"file_id": render(c["file"]["file_id"])}),
|
|
4462
|
+
**({} if "filename" not in c["file"] else {"filename": render(c["file"]["filename"])}),
|
|
4463
|
+
},
|
|
4464
|
+
})
|
|
4424
4465
|
else:
|
|
4425
4466
|
raise ValueError(f"Unknown content type: {c['type']}")
|
|
4426
4467
|
|
|
@@ -39,7 +39,7 @@ class TestDiskCache(unittest.TestCase):
|
|
|
39
39
|
"a\nb",
|
|
40
40
|
]
|
|
41
41
|
for k in weird_keys:
|
|
42
|
-
time.sleep(0.
|
|
42
|
+
time.sleep(0.01) # make sure the mtimes are different
|
|
43
43
|
self.cache.set(k, data)
|
|
44
44
|
result = self.cache.get(k)
|
|
45
45
|
assert data == result
|
|
@@ -61,7 +61,7 @@ class TestDiskCache(unittest.TestCase):
|
|
|
61
61
|
# Fill cache beyond max size (3).
|
|
62
62
|
for i in range(3):
|
|
63
63
|
self.cache.set(f"key{i}", {"value": i})
|
|
64
|
-
time.sleep(0.
|
|
64
|
+
time.sleep(0.01) # wait to ensure different mtimes
|
|
65
65
|
|
|
66
66
|
# Add one more to trigger eviction.
|
|
67
67
|
self.cache.set("key3", {"value": 3})
|
|
@@ -75,7 +75,7 @@ class TestDiskCache(unittest.TestCase):
|
|
|
75
75
|
# Fill cache beyond max size (3).
|
|
76
76
|
for i in range(3):
|
|
77
77
|
self.cache.set(f"key{i}", {"value": i})
|
|
78
|
-
time.sleep(0.
|
|
78
|
+
time.sleep(0.01) # wait to ensure different mtimes
|
|
79
79
|
|
|
80
80
|
# Add one more to trigger eviction.
|
|
81
81
|
self.cache.set("key3", {"value": 3})
|