braintrust 0.3.14__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in a supported public registry. It is provided for informational purposes only.
Files changed (83)
  1. braintrust/__init__.py +4 -0
  2. braintrust/_generated_types.py +1200 -611
  3. braintrust/audit.py +2 -2
  4. braintrust/cli/eval.py +6 -7
  5. braintrust/cli/push.py +11 -11
  6. braintrust/conftest.py +1 -0
  7. braintrust/context.py +12 -17
  8. braintrust/contrib/temporal/__init__.py +16 -27
  9. braintrust/contrib/temporal/test_temporal.py +8 -3
  10. braintrust/devserver/auth.py +8 -8
  11. braintrust/devserver/cache.py +3 -4
  12. braintrust/devserver/cors.py +8 -7
  13. braintrust/devserver/dataset.py +3 -5
  14. braintrust/devserver/eval_hooks.py +7 -6
  15. braintrust/devserver/schemas.py +22 -19
  16. braintrust/devserver/server.py +19 -12
  17. braintrust/devserver/test_cached_login.py +4 -4
  18. braintrust/framework.py +128 -140
  19. braintrust/framework2.py +88 -87
  20. braintrust/functions/invoke.py +93 -53
  21. braintrust/functions/stream.py +3 -2
  22. braintrust/generated_types.py +17 -1
  23. braintrust/git_fields.py +11 -11
  24. braintrust/gitutil.py +2 -3
  25. braintrust/graph_util.py +10 -10
  26. braintrust/id_gen.py +2 -2
  27. braintrust/logger.py +346 -357
  28. braintrust/merge_row_batch.py +10 -9
  29. braintrust/oai.py +107 -24
  30. braintrust/otel/__init__.py +49 -49
  31. braintrust/otel/context.py +16 -30
  32. braintrust/otel/test_distributed_tracing.py +14 -11
  33. braintrust/otel/test_otel_bt_integration.py +32 -31
  34. braintrust/parameters.py +8 -8
  35. braintrust/prompt.py +14 -14
  36. braintrust/prompt_cache/disk_cache.py +5 -4
  37. braintrust/prompt_cache/lru_cache.py +3 -2
  38. braintrust/prompt_cache/prompt_cache.py +13 -14
  39. braintrust/queue.py +4 -4
  40. braintrust/score.py +4 -4
  41. braintrust/serializable_data_class.py +4 -4
  42. braintrust/span_identifier_v1.py +1 -2
  43. braintrust/span_identifier_v2.py +3 -4
  44. braintrust/span_identifier_v3.py +23 -20
  45. braintrust/span_identifier_v4.py +34 -25
  46. braintrust/test_framework.py +16 -6
  47. braintrust/test_helpers.py +5 -5
  48. braintrust/test_id_gen.py +2 -3
  49. braintrust/test_otel.py +61 -53
  50. braintrust/test_queue.py +0 -1
  51. braintrust/test_score.py +1 -3
  52. braintrust/test_span_components.py +29 -44
  53. braintrust/util.py +9 -8
  54. braintrust/version.py +2 -2
  55. braintrust/wrappers/_anthropic_utils.py +4 -4
  56. braintrust/wrappers/agno/__init__.py +3 -4
  57. braintrust/wrappers/agno/agent.py +1 -2
  58. braintrust/wrappers/agno/function_call.py +1 -2
  59. braintrust/wrappers/agno/model.py +1 -2
  60. braintrust/wrappers/agno/team.py +1 -2
  61. braintrust/wrappers/agno/utils.py +12 -12
  62. braintrust/wrappers/anthropic.py +7 -8
  63. braintrust/wrappers/claude_agent_sdk/__init__.py +3 -4
  64. braintrust/wrappers/claude_agent_sdk/_wrapper.py +29 -27
  65. braintrust/wrappers/dspy.py +15 -17
  66. braintrust/wrappers/google_genai/__init__.py +16 -16
  67. braintrust/wrappers/langchain.py +22 -24
  68. braintrust/wrappers/litellm.py +4 -3
  69. braintrust/wrappers/openai.py +15 -15
  70. braintrust/wrappers/pydantic_ai.py +1204 -0
  71. braintrust/wrappers/test_agno.py +0 -1
  72. braintrust/wrappers/test_dspy.py +0 -1
  73. braintrust/wrappers/test_google_genai.py +2 -3
  74. braintrust/wrappers/test_litellm.py +0 -1
  75. braintrust/wrappers/test_oai_attachments.py +322 -0
  76. braintrust/wrappers/test_pydantic_ai_integration.py +1788 -0
  77. braintrust/wrappers/{test_pydantic_ai.py → test_pydantic_ai_wrap_openai.py} +1 -2
  78. {braintrust-0.3.14.dist-info → braintrust-0.4.0.dist-info}/METADATA +3 -2
  79. braintrust-0.4.0.dist-info/RECORD +120 -0
  80. braintrust-0.3.14.dist-info/RECORD +0 -117
  81. {braintrust-0.3.14.dist-info → braintrust-0.4.0.dist-info}/WHEEL +0 -0
  82. {braintrust-0.3.14.dist-info → braintrust-0.4.0.dist-info}/entry_points.txt +0 -0
  83. {braintrust-0.3.14.dist-info → braintrust-0.4.0.dist-info}/top_level.txt +0 -0
braintrust/devserver/schemas.py
@@ -1,7 +1,8 @@
 import json
-from typing import Any, Dict, List, Optional, Sequence, Union, get_args, get_origin
+from collections.abc import Sequence
+from typing import Any, Union, get_args, get_origin, get_type_hints
 
-from typing_extensions import TypedDict, get_type_hints
+from typing_extensions import TypedDict
 
 # This is not beautiful code, but it saves us from introducing Pydantic as a dependency, and it is fairly
 # straightforward for an LLM to keep it up to date with runEvalBodySchema in JS.
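Nearly every hunk in this release applies the same mechanical migration: PEP 585 builtin generics (`dict[str, Any]` for `typing.Dict`), PEP 604 unions (`str | None` for `Optional[str]`), and ABCs imported from `collections.abc` rather than `typing`. Note that `Union`, `get_args`, and `get_origin` stay imported, presumably because `validate_value` below still inspects `Union` annotations at runtime, while `get_type_hints` moves from `typing_extensions` to the stdlib `typing`. A minimal before/after sketch of the annotation change (illustrative only, not code from the package):

# Old style, as removed throughout this diff:
from typing import Dict, List, Optional, Union

def lookup(ids: List[str], table: Dict[str, int]) -> Optional[Union[int, str]]:
    return table.get(ids[0]) if ids else None

# New style (PEP 585/604), as added throughout this diff:
def lookup(ids: list[str], table: dict[str, int]) -> int | str | None:
    return table.get(ids[0]) if ids else None

The two spellings are equivalent at runtime; builtin generics need Python 3.9+, and `X | Y` annotations need 3.10+ (or `from __future__ import annotations` on earlier versions).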
@@ -16,12 +17,12 @@ class ValidationError(Exception):
 class ParsedFunctionId(TypedDict, total=False):
     """Parsed function identifier."""
 
-    function_id: Optional[str]
-    version: Optional[str]
-    name: Optional[str]
-    prompt_session_id: Optional[str]
-    inline_code: Optional[str]
-    global_function: Optional[str]
+    function_id: str | None
+    version: str | None
+    name: str | None
+    prompt_session_id: str | None
+    inline_code: str | None
+    global_function: str | None
 
 
 class ParsedParent(TypedDict):
@@ -35,16 +36,16 @@ class ParsedEvalBody(TypedDict, total=False):
     """Type for parsed eval request body."""
 
     name: str  # Required
-    parameters: Dict[str, Any]
+    parameters: dict[str, Any]
     data: Any
-    scores: List[ParsedFunctionId]
+    scores: list[ParsedFunctionId]
     experiment_name: str
     project_id: str
-    parent: Union[str, ParsedParent]
+    parent: str | ParsedParent
     stream: bool
 
 
-def validate_typed_dict(data: Any, typed_dict_class: type, path: str = "") -> Dict[str, Any]:
+def validate_typed_dict(data: Any, typed_dict_class: type, path: str = "") -> dict[str, Any]:
     """Validate data against a TypedDict definition."""
     if not isinstance(data, dict):
         raise ValidationError(f"{path or 'Root'} must be a dictionary, got {type(data).__name__}")
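`validate_typed_dict` is the hand-rolled substitute for Pydantic that the module comment mentions. The general pattern it relies on, introspecting a TypedDict via `get_type_hints` plus the `__required_keys__`/`__optional_keys__` class attributes, looks roughly like this (a sketch under those assumptions, not the package's actual implementation):

from typing import get_type_hints

from typing_extensions import TypedDict

class Point(TypedDict, total=False):
    x: int
    y: int

hints = get_type_hints(Point)       # {'x': <class 'int'>, 'y': <class 'int'>}
required = Point.__required_keys__  # frozenset(), since total=False
optional = Point.__optional_keys__  # frozenset({'x', 'y'})

def check(data: dict) -> dict:
    """Minimal dict-against-TypedDict check in the spirit of validate_typed_dict."""
    for key in required:
        if key not in data:
            raise ValueError(f"missing required key: {key}")
    for key, value in data.items():
        if key in hints and not isinstance(value, hints[key]):
            raise ValueError(f"{key} must be {hints[key].__name__}, got {type(value).__name__}")
    return data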
@@ -107,7 +108,7 @@ def validate_value(value: Any, expected_type: type, path: str) -> Any:
         return validate_value(value, inner_type, path)
 
     # Handle List/Sequence
-    if origin in (list, List, Sequence):
+    if origin in (list, list, Sequence):
         if not isinstance(value, list):
             raise ValidationError(f"{path} must be a list, got {type(value).__name__}")
 
@@ -115,7 +116,7 @@ def validate_value(value: Any, expected_type: type, path: str) -> Any:
         return [validate_value(item, item_type, f"{path}[{i}]") for i, item in enumerate(value)]
 
     # Handle Dict/Mapping
-    if origin in (dict, Dict):
+    if origin in (dict, dict):
         if not isinstance(value, dict):
             raise ValidationError(f"{path} must be a dict, got {type(value).__name__}")
 
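One quirk worth noting: the automated `List` → `list` and `Dict` → `dict` rewrite turned these membership tests into `(list, list, Sequence)` and `(dict, dict)`, leaving a duplicate element behind. That is harmless at runtime (tuple membership simply tests each element in turn) but redundant; `(list, Sequence)` and `(dict,)` would behave identically. The underlying technique, dispatching on an annotation's origin and arguments, works like this (a sketch; `is_list_of` is not a package function):

from collections.abc import Sequence
from typing import get_args, get_origin

def is_list_of(value: object, annotation: object) -> bool:
    """Runtime check in the style of validate_value."""
    origin = get_origin(annotation)  # list[str] -> list; Sequence[int] -> collections.abc.Sequence
    if origin not in (list, Sequence):
        return False
    (item_type,) = get_args(annotation)  # list[str] -> (str,)
    return isinstance(value, list) and all(isinstance(item, item_type) for item in value)

assert is_list_of(["a", "b"], list[str])
assert not is_list_of(["a", 1], list[str])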
@@ -172,7 +173,7 @@ def parse_function_id(data: Any, path: str = "function") -> ParsedFunctionId:
     raise ValidationError(f"{path} must specify function_id, name, prompt_session_id, or inline_code")
 
 
-def parse_eval_body(request_data: Union[str, bytes, dict]) -> ParsedEvalBody:
+def parse_eval_body(request_data: str | bytes | dict) -> ParsedEvalBody:
     """
     Parse request body for eval execution.
 
@@ -221,10 +222,12 @@ def parse_eval_body(request_data: Union[str, bytes, dict]) -> ParsedEvalBody:
     parsed_scores = []
     for i, score in enumerate(scores_data):
         try:
-            parsed_scores.append({
-                "name": score["name"],
-                "function_id": parse_function_id(score["function_id"], f"scores[{i}]"),
-            })
+            parsed_scores.append(
+                {
+                    "name": score["name"],
+                    "function_id": parse_function_id(score["function_id"], f"scores[{i}]"),
+                }
+            )
         except ValidationError as e:
             raise ValidationError(f"Invalid score at index {i}: {e}")
 
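Putting the pieces together, a call to the parser would look roughly like this; the payload shape follows the ParsedEvalBody and ParsedFunctionId fields above, but the exact values are illustrative assumptions:

from braintrust.devserver.schemas import ValidationError, parse_eval_body

body = parse_eval_body(
    {
        "name": "my-eval",  # the one required field
        "scores": [
            {"name": "accuracy", "function_id": {"function_id": "fn-123"}},
        ],
        "stream": True,
    }
)

try:
    parse_eval_body('{"stream": true}')  # str/bytes payloads are JSON-decoded first; no "name" here
except ValidationError as e:
    print(e)  # expected to complain about the missing required field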
braintrust/devserver/server.py
@@ -2,7 +2,7 @@ import asyncio
 import json
 import sys
 import textwrap
-from typing import Any, Optional, Union
+from typing import Any
 
 try:
     import uvicorn
@@ -40,7 +40,7 @@ _all_evaluators: dict[str, Evaluator[Any, Any]] = {}
 
 
 class CheckAuthorizedMiddleware(BaseHTTPMiddleware):
-    def __init__(self, app, allowed_org_name: Optional[str] = None):
+    def __init__(self, app, allowed_org_name: str | None = None):
         super().__init__(app)
         self.allowed_org_name = allowed_org_name
         self.protected_paths = ["/list", "/eval"]
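CheckAuthorizedMiddleware guards the /list and /eval endpoints; its dispatch method is not shown in this hunk. For orientation, a Starlette BaseHTTPMiddleware of this shape would be wired up roughly as follows (the authorization check itself is a hypothetical placeholder, not the package's logic):

from starlette.middleware.base import BaseHTTPMiddleware
from starlette.requests import Request
from starlette.responses import JSONResponse

class AuthSketchMiddleware(BaseHTTPMiddleware):
    def __init__(self, app, allowed_org_name: str | None = None):
        super().__init__(app)
        self.allowed_org_name = allowed_org_name
        self.protected_paths = ["/list", "/eval"]

    async def dispatch(self, request: Request, call_next):
        # Only the protected endpoints require credentials; everything else passes through.
        if any(request.url.path.startswith(p) for p in self.protected_paths):
            # Hypothetical check: require a bearer token before forwarding the request.
            if not request.headers.get("Authorization", "").startswith("Bearer "):
                return JSONResponse({"error": "unauthorized"}, status_code=401)
        return await call_next(request)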
@@ -100,7 +100,7 @@ async def list_evaluators(request: Request) -> JSONResponse:
     return JSONResponse(evaluator_list)
 
 
-async def run_eval(request: Request) -> Union[JSONResponse, StreamingResponse]:
+async def run_eval(request: Request) -> JSONResponse | StreamingResponse:
     """Handle eval execution requests."""
     try:
         # Get request body
@@ -157,12 +157,14 @@ async def run_eval(request: Request) -> Union[JSONResponse, StreamingResponse]:
             result = await evaluator.task(input, hooks)
         else:
             result = evaluator.task(input, hooks)
-        hooks.report_progress({
-            "format": "code",
-            "output_type": "completion",
-            "event": "json_delta",
-            "data": json.dumps(result),
-        })
+        hooks.report_progress(
+            {
+                "format": "code",
+                "output_type": "completion",
+                "event": "json_delta",
+                "data": json.dumps(result),
+            }
+        )
         return result
 
     def on_start_fn(summary: ExperimentSummary):
@@ -214,6 +216,7 @@ async def run_eval(request: Request) -> Union[JSONResponse, StreamingResponse]:
 
     async def event_generator():
         """Generate SSE events from the queue."""
+
         # Create a task to run the eval and signal completion
         async def run_and_complete():
             try:
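run_eval streams progress back to the client as server-sent events drained from a queue by event_generator; the blank line added in this hunk is cosmetic. For context, the general shape of such an SSE endpoint in Starlette is sketched below (assuming an asyncio.Queue of JSON-serializable events terminated by a None sentinel; the package's actual wiring differs in detail):

import asyncio
import json

from starlette.responses import StreamingResponse

async def sse_endpoint(request):
    queue: asyncio.Queue = asyncio.Queue()
    # In real code a producer task would queue.put(...) progress events
    # while the eval runs, then put None to end the stream.

    async def event_generator():
        while True:
            item = await queue.get()
            if item is None:
                break
            yield f"data: {json.dumps(item)}\n\n"  # one SSE frame per event

    return StreamingResponse(event_generator(), media_type="text/event-stream")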
@@ -255,7 +258,7 @@ async def run_eval(request: Request) -> Union[JSONResponse, StreamingResponse]:
         return JSONResponse({"error": f"Failed to run evaluation: {str(e)}"}, status_code=500)
 
 
-def create_app(evaluators: list[Evaluator[Any, Any]], org_name: Optional[str] = None):
+def create_app(evaluators: list[Evaluator[Any, Any]], org_name: str | None = None):
     """Create and configure the Starlette app for the dev server.
 
     Args:
@@ -283,7 +286,9 @@ def create_app(evaluators: list[Evaluator[Any, Any]], org_name: Optional[str] =
     return app
 
 
-def run_dev_server(evaluators: list[Evaluator[Any, Any]], host: str = "localhost", port: int = 8300, org_name: Optional[str] = None):
+def run_dev_server(
+    evaluators: list[Evaluator[Any, Any]], host: str = "localhost", port: int = 8300, org_name: str | None = None
+):
     """Start the dev server.
 
     Args:
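From the reflowed signature, starting the server from user code would look roughly like this (the module path is inferred from the file list above; how Evaluator objects get constructed is out of scope here):

from braintrust.devserver.server import run_dev_server

evaluators = []  # in real use, Evaluator[Any, Any] instances registered by the framework
run_dev_server(evaluators, host="0.0.0.0", port=8300, org_name="my-org")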
@@ -305,7 +310,9 @@ def snake_to_camel(snake_str: str) -> str:
     return components[0] + "".join(x.title() for x in components[1:]) if components else snake_str
 
 
-def make_scorer(state: BraintrustState, name: str, score: FunctionId, project_id: Optional[str] = None) -> EvalScorer[Any, Any]:
+def make_scorer(
+    state: BraintrustState, name: str, score: FunctionId, project_id: str | None = None
+) -> EvalScorer[Any, Any]:
     def scorer_fn(input, output, expected, metadata):
         request = {
             **score,
braintrust/devserver/test_cached_login.py
@@ -10,7 +10,7 @@ class TestCachedLogin(unittest.TestCase):
         """Clear the cache before each test."""
         cache._login_cache = cache.LRUCache(max_size=32)
 
-    @patch('braintrust.devserver.cache.login_to_state')
+    @patch("braintrust.devserver.cache.login_to_state")
     def test_cached_login_caches_results(self, mock_login):
         """Test that cached_login caches and reuses results."""
         mock_state = MagicMock()
@@ -26,7 +26,7 @@ class TestCachedLogin(unittest.TestCase):
         self.assertEqual(result2, mock_state)
         self.assertEqual(mock_login.call_count, 1)  # Still 1, not called again
 
-    @patch('braintrust.devserver.cache.login_to_state')
+    @patch("braintrust.devserver.cache.login_to_state")
     def test_cached_login_different_keys(self, mock_login):
         """Test that different cache keys create separate entries."""
         mock_state1 = MagicMock()
@@ -48,7 +48,7 @@ class TestCachedLogin(unittest.TestCase):
         self.assertEqual(result3, mock_state3)
         self.assertEqual(mock_login.call_count, 3)
 
-    @patch('braintrust.devserver.cache.login_to_state')
+    @patch("braintrust.devserver.cache.login_to_state")
     def test_cached_login_with_org_name(self, mock_login):
         """Test caching with org_name parameter."""
         mock_state = MagicMock()
@@ -68,7 +68,7 @@ class TestCachedLogin(unittest.TestCase):
         result3 = asyncio.run(cache.cached_login("api_key_1", "https://app.braintrust.com", org_name="other_org"))
         self.assertEqual(mock_login.call_count, 2)
 
-    @patch('braintrust.devserver.cache.login_to_state')
+    @patch("braintrust.devserver.cache.login_to_state")
     def test_cached_login_propagates_exceptions(self, mock_login):
         """Test that exceptions from login_to_state are propagated."""
         mock_login.side_effect = ValueError("Invalid API key")
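These tests pin down the cache contract: login_to_state runs once per distinct (api_key, app_url, org_name) key, repeated calls reuse the cached state, and a login failure propagates instead of being cached. A minimal cache with that behavior (a sketch; the package's cached_login and LRUCache internals may differ):

import asyncio

def login_to_state(api_key: str, app_url: str, org_name: str | None = None) -> object:
    """Hypothetical stand-in for braintrust.devserver.cache.login_to_state."""
    return object()

_login_cache: dict[tuple[str, str, str | None], object] = {}

async def cached_login_sketch(api_key: str, app_url: str, org_name: str | None = None) -> object:
    key = (api_key, app_url, org_name)
    if key not in _login_cache:
        # An exception raised here is never stored, so it propagates to the caller
        # and the next call retries -- the behavior test_cached_login_propagates_exceptions asserts.
        _login_cache[key] = login_to_state(api_key, app_url, org_name)
    return _login_cache[key]

state = asyncio.run(cached_login_sketch("api_key_1", "https://app.braintrust.com", org_name="my_org"))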