braintrust 0.3.15__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82)
  1. braintrust/_generated_types.py +737 -672
  2. braintrust/audit.py +2 -2
  3. braintrust/bt_json.py +178 -19
  4. braintrust/cli/eval.py +6 -7
  5. braintrust/cli/push.py +11 -11
  6. braintrust/context.py +12 -17
  7. braintrust/contrib/temporal/__init__.py +16 -27
  8. braintrust/contrib/temporal/test_temporal.py +8 -3
  9. braintrust/devserver/auth.py +8 -8
  10. braintrust/devserver/cache.py +3 -4
  11. braintrust/devserver/cors.py +8 -7
  12. braintrust/devserver/dataset.py +3 -5
  13. braintrust/devserver/eval_hooks.py +7 -6
  14. braintrust/devserver/schemas.py +22 -19
  15. braintrust/devserver/server.py +19 -12
  16. braintrust/devserver/test_cached_login.py +4 -4
  17. braintrust/framework.py +139 -142
  18. braintrust/framework2.py +88 -87
  19. braintrust/functions/invoke.py +66 -59
  20. braintrust/functions/stream.py +3 -2
  21. braintrust/generated_types.py +3 -1
  22. braintrust/git_fields.py +11 -11
  23. braintrust/gitutil.py +2 -3
  24. braintrust/graph_util.py +10 -10
  25. braintrust/id_gen.py +2 -2
  26. braintrust/logger.py +373 -471
  27. braintrust/merge_row_batch.py +10 -9
  28. braintrust/oai.py +21 -20
  29. braintrust/otel/__init__.py +49 -49
  30. braintrust/otel/context.py +16 -30
  31. braintrust/otel/test_distributed_tracing.py +14 -11
  32. braintrust/otel/test_otel_bt_integration.py +32 -31
  33. braintrust/parameters.py +8 -8
  34. braintrust/prompt.py +14 -14
  35. braintrust/prompt_cache/disk_cache.py +5 -4
  36. braintrust/prompt_cache/lru_cache.py +3 -2
  37. braintrust/prompt_cache/prompt_cache.py +13 -14
  38. braintrust/queue.py +4 -4
  39. braintrust/score.py +4 -4
  40. braintrust/serializable_data_class.py +4 -4
  41. braintrust/span_identifier_v1.py +1 -2
  42. braintrust/span_identifier_v2.py +3 -4
  43. braintrust/span_identifier_v3.py +23 -20
  44. braintrust/span_identifier_v4.py +34 -25
  45. braintrust/test_bt_json.py +644 -0
  46. braintrust/test_framework.py +72 -6
  47. braintrust/test_helpers.py +5 -5
  48. braintrust/test_id_gen.py +2 -3
  49. braintrust/test_logger.py +211 -107
  50. braintrust/test_otel.py +61 -53
  51. braintrust/test_queue.py +0 -1
  52. braintrust/test_score.py +1 -3
  53. braintrust/test_span_components.py +29 -44
  54. braintrust/util.py +9 -8
  55. braintrust/version.py +2 -2
  56. braintrust/wrappers/_anthropic_utils.py +4 -4
  57. braintrust/wrappers/agno/__init__.py +3 -4
  58. braintrust/wrappers/agno/agent.py +1 -2
  59. braintrust/wrappers/agno/function_call.py +1 -2
  60. braintrust/wrappers/agno/model.py +1 -2
  61. braintrust/wrappers/agno/team.py +1 -2
  62. braintrust/wrappers/agno/utils.py +12 -12
  63. braintrust/wrappers/anthropic.py +7 -8
  64. braintrust/wrappers/claude_agent_sdk/__init__.py +3 -4
  65. braintrust/wrappers/claude_agent_sdk/_wrapper.py +29 -27
  66. braintrust/wrappers/dspy.py +15 -17
  67. braintrust/wrappers/google_genai/__init__.py +17 -30
  68. braintrust/wrappers/langchain.py +22 -24
  69. braintrust/wrappers/litellm.py +4 -3
  70. braintrust/wrappers/openai.py +15 -15
  71. braintrust/wrappers/pydantic_ai.py +225 -110
  72. braintrust/wrappers/test_agno.py +0 -1
  73. braintrust/wrappers/test_dspy.py +0 -1
  74. braintrust/wrappers/test_google_genai.py +64 -4
  75. braintrust/wrappers/test_litellm.py +0 -1
  76. braintrust/wrappers/test_pydantic_ai_integration.py +819 -22
  77. {braintrust-0.3.15.dist-info → braintrust-0.4.1.dist-info}/METADATA +3 -2
  78. braintrust-0.4.1.dist-info/RECORD +121 -0
  79. braintrust-0.3.15.dist-info/RECORD +0 -120
  80. {braintrust-0.3.15.dist-info → braintrust-0.4.1.dist-info}/WHEEL +0 -0
  81. {braintrust-0.3.15.dist-info → braintrust-0.4.1.dist-info}/entry_points.txt +0 -0
  82. {braintrust-0.3.15.dist-info → braintrust-0.4.1.dist-info}/top_level.txt +0 -0
braintrust/framework.py CHANGED
@@ -9,23 +9,15 @@ import sys
  import traceback
  import warnings
  from collections import defaultdict
+ from collections.abc import Awaitable, Callable, Coroutine, Iterable, Iterator, Sequence
  from concurrent.futures import ThreadPoolExecutor
  from contextlib import contextmanager
  from multiprocessing import cpu_count
  from typing import (
  Any,
- Awaitable,
- Callable,
- Coroutine,
- Dict,
  Generic,
- Iterable,
- Iterator,
- List,
  Literal,
  Optional,
- Sequence,
- Type,
  TypeVar,
  Union,
  )
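
The hunk above moves annotations off `typing.Awaitable`/`Callable`/`Dict`/`List`/etc. onto `collections.abc`, builtin generics, and PEP 604 unions, while `Optional` and `Union` stay imported for the type aliases that remain below. A small hypothetical sketch of the new annotation style (not braintrust code); bare `X | None` annotations need Python 3.10+, or `from __future__ import annotations` on older interpreters:

from __future__ import annotations


def summarize(scores: dict[str, float | None], tags: list[str] | None = None) -> str:
    # Old spelling: Dict[str, Optional[float]] and Optional[List[str]]
    present = {k: v for k, v in scores.items() if v is not None}
    return f"{len(present)} scored, tags={list(tags or [])}"
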
@@ -55,7 +47,7 @@ from .resource_manager import ResourceManager
  from .score import Score, is_score, is_scorer
  from .serializable_data_class import SerializableDataClass
  from .span_types import SpanTypeAttribute
- from .util import bt_iscoroutinefunction, eprint
+ from .util import bt_iscoroutinefunction, eprint, merge_dicts

  Input = TypeVar("Input")
  Output = TypeVar("Output")
@@ -82,14 +74,14 @@ class EvalCase(SerializableDataClass, Generic[Input, Output]):
  """

  input: Input
- expected: Optional[Output] = None
- metadata: Optional[Metadata] = None
- tags: Optional[Sequence[str]] = None
+ expected: Output | None = None
+ metadata: Metadata | None = None
+ tags: Sequence[str] | None = None

  # These fields are only set if the EvalCase is part of a Dataset.
- id: Optional[str] = None
- _xact_id: Optional[str] = None
- created: Optional[str] = None
+ id: str | None = None
+ _xact_id: str | None = None
+ created: str | None = None


  class _EvalCaseDictNoOutput(Generic[Input], TypedDict):
@@ -101,11 +93,11 @@ class _EvalCaseDictNoOutput(Generic[Input], TypedDict):
  """

  input: Input
- metadata: NotRequired[Optional[Metadata]]
- tags: NotRequired[Optional[Sequence[str]]]
+ metadata: NotRequired[Metadata | None]
+ tags: NotRequired[Sequence[str] | None]

- id: NotRequired[Optional[str]]
- _xact_id: NotRequired[Optional[str]]
+ id: NotRequired[str | None]
+ _xact_id: NotRequired[str | None]


  class _EvalCaseDict(Generic[Input, Output], _EvalCaseDictNoOutput[Input]):
@@ -113,7 +105,7 @@ class _EvalCaseDict(Generic[Input, Output], _EvalCaseDictNoOutput[Input]):
  Mirrors EvalCase for callers who pass a dict instead of dataclass.
  """

- expected: NotRequired[Optional[Output]]
+ expected: NotRequired[Output | None]


  # Inheritance doesn't quite work for dataclasses, so we redefine the fields
@@ -124,12 +116,12 @@ class EvalResult(SerializableDataClass, Generic[Input, Output]):

  input: Input
  output: Output
- scores: Dict[str, Optional[float]]
- expected: Optional[Output] = None
- metadata: Optional[Metadata] = None
- tags: Optional[List[str]] = None
- error: Optional[Exception] = None
- exc_info: Optional[str] = None
+ scores: dict[str, float | None]
+ expected: Output | None = None
+ metadata: Metadata | None = None
+ tags: list[str] | None = None
+ error: Exception | None = None
+ exc_info: str | None = None


  @dataclasses.dataclass
@@ -177,7 +169,7 @@ class EvalHooks(abc.ABC, Generic[Output]):

  @property
  @abc.abstractmethod
- def expected(self) -> Optional[Output]:
+ def expected(self) -> Output | None:
  """
  The expected output for the current evaluation.
  """
@@ -222,7 +214,7 @@ class EvalHooks(abc.ABC, Generic[Output]):

  @property
  @abc.abstractmethod
- def parameters(self) -> Optional[Dict[str, Any]]:
+ def parameters(self) -> dict[str, Any] | None:
  """
  The parameters for the current evaluation. These are the validated parameter values
  that were passed to the evaluator.
@@ -236,11 +228,11 @@ class EvalScorerArgs(SerializableDataClass, Generic[Input, Output]):

  input: Input
  output: Output
- expected: Optional[Output] = None
- metadata: Optional[Metadata] = None
+ expected: Output | None = None
+ metadata: Metadata | None = None


- OneOrMoreScores = Union[float, int, bool, None, Score, List[Score]]
+ OneOrMoreScores = Union[float, int, bool, None, Score, list[Score]]


  # Synchronous scorer interface - implements callable
@@ -251,7 +243,7 @@ class SyncScorerLike(Protocol, Generic[Input, Output]):
  """

  def __call__(
- self, input: Input, output: Output, expected: Optional[Output] = None, **kwargs: Any
+ self, input: Input, output: Output, expected: Output | None = None, **kwargs: Any
  ) -> OneOrMoreScores: ...


@@ -262,9 +254,7 @@ class AsyncScorerLike(Protocol, Generic[Input, Output]):
  The framework will prefer this interface if available.
  """

- async def eval_async(
- self, output: Output, expected: Optional[Output] = None, **kwargs: Any
- ) -> OneOrMoreScores: ...
+ async def eval_async(self, output: Output, expected: Output | None = None, **kwargs: Any) -> OneOrMoreScores: ...


  # Union type for any kind of scorer (for typing)
@@ -272,7 +262,7 @@ ScorerLike = Union[SyncScorerLike[Input, Output], AsyncScorerLike[Input, Output]

  EvalScorer = Union[
  ScorerLike[Input, Output],
- Type[ScorerLike[Input, Output]],
+ type[ScorerLike[Input, Output]],
  Callable[[Input, Output, Output], OneOrMoreScores],
  Callable[[Input, Output, Output], Awaitable[OneOrMoreScores]],
  ]
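
Given the `SyncScorerLike` and `EvalScorer` shapes above, a scorer can be a plain function of `input`, `output`, and `expected` (or an object exposing `eval_async`). A minimal hypothetical sketch, not part of the package:

def exact_match(input, output, expected=None, **kwargs):
    # A bare float is a valid OneOrMoreScores value; a Score or list[Score] also works.
    return 1.0 if output == expected else 0.0
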
@@ -286,7 +276,7 @@ class BaseExperiment:
  use based on your git history (or fall back to timestamps).
  """

- name: Optional[str] = None
+ name: str | None = None
  """
  The name of the base experiment to use. If unspecified, Braintrust will automatically figure out the best base
  using your git history (or fall back to timestamps).
@@ -308,14 +298,14 @@ _EvalDataObject = Union[
  BaseExperiment,
  ]

- EvalData = Union[_EvalDataObject[Input, Output], Type[_EvalDataObject[Input, Output]], Dataset]
+ EvalData = Union[_EvalDataObject[Input, Output], type[_EvalDataObject[Input, Output]], Dataset]

  EvalTask = Union[
  Callable[[Input], Union[Output, Awaitable[Output]]],
  Callable[[Input, EvalHooks[Output]], Union[Output, Awaitable[Output]]],
  ]

- ErrorScoreHandler = Callable[[Span, EvalCase[Input, Output], List[str]], Optional[Dict[str, float]]]
+ ErrorScoreHandler = Callable[[Span, EvalCase[Input, Output], list[str]], Optional[dict[str, float]]]


  @dataclasses.dataclass
@@ -350,18 +340,18 @@ class Evaluator(Generic[Input, Output]):
  Runs the evaluation task on a single input. The `hooks` object can be used to add metadata to the evaluation.
  """

- scores: List[EvalScorer[Input, Output]]
+ scores: list[EvalScorer[Input, Output]]
  """
  A list of scorers to evaluate the results of the task. Each scorer can be a Scorer object or a function
  that takes `input`, `output`, and `expected` arguments and returns a `Score` object. The function can be async.
  """

- experiment_name: Optional[str]
+ experiment_name: str | None
  """
  Optional experiment name. If not specified, a name will be generated automatically.
  """

- metadata: Optional[Metadata]
+ metadata: Metadata | None
  """
  A dictionary with additional data about the test example, model outputs, or just about anything else that's
  relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`,
@@ -386,54 +376,54 @@ class Evaluator(Generic[Input, Output]):
  Whether to update an existing experiment with `experiment_name` if one exists. Defaults to false.
  """

- timeout: Optional[float] = None
+ timeout: float | None = None
  """
  The duration, in seconds, after which to time out the evaluation.
  Defaults to None, in which case there is no timeout.
  """

- max_concurrency: Optional[int] = None
+ max_concurrency: int | None = None
  """
  The maximum number of tasks/scorers that will be run concurrently.
  Defaults to None, in which case there is no max concurrency.
  """

- project_id: Optional[str] = None
+ project_id: str | None = None
  """
  If specified, uses the given project ID instead of the evaluator's name to identify the project.
  """

- base_experiment_name: Optional[str] = None
+ base_experiment_name: str | None = None
  """
  An optional experiment name to use as a base. If specified, the new experiment will be summarized and
  compared to this experiment.
  """

- base_experiment_id: Optional[str] = None
+ base_experiment_id: str | None = None
  """
  An optional experiment id to use as a base. If specified, the new experiment will be summarized and
  compared to this experiment. This takes precedence over `base_experiment_name` if specified.
  """

- git_metadata_settings: Optional[GitMetadataSettings] = None
+ git_metadata_settings: GitMetadataSettings | None = None
  """
  Optional settings for collecting git metadata. By default, will collect all
  git metadata fields allowed in org-level settings.
  """

- repo_info: Optional[RepoInfo] = None
+ repo_info: RepoInfo | None = None
  """
  Optionally explicitly specify the git metadata for this experiment. This
  takes precedence over `git_metadata_settings` if specified.
  """

- error_score_handler: Optional[ErrorScoreHandler] = None
+ error_score_handler: ErrorScoreHandler | None = None
  """
  Optionally supply a custom function to specifically handle score values when tasks or scoring functions have errored.
  A default implementation is exported as `default_error_score_handler` which will log a 0 score to the root span for any scorer that was not run.
  """

- description: Optional[str] = None
+ description: str | None = None
  """
  An optional description for the experiment.
  """
@@ -443,7 +433,7 @@ class Evaluator(Generic[Input, Output]):
  Whether to summarize the scores of the experiment after it has run.
  """

- parameters: Optional[EvalParameters] = None
+ parameters: EvalParameters | None = None
  """
  A set of parameters that will be passed to the evaluator.
  Can be used to define prompts or other configurable values.
@@ -453,7 +443,7 @@ class Evaluator(Generic[Input, Output]):
  @dataclasses.dataclass
  class EvalResultWithSummary(SerializableDataClass, Generic[Input, Output]):
  summary: ExperimentSummary
- results: List[EvalResult[Input, Output]]
+ results: list[EvalResult[Input, Output]]

  def _repr_pretty_(self, p, cycle):
  p.text(f'EvalResultWithSummary(summary="...", results=[...])')
@@ -529,13 +519,13 @@ class ReporterDef(SerializableDataClass, Generic[Input, Output, EvalReport]):

  report_eval: Callable[
  [Evaluator[Input, Output], EvalResultWithSummary[Input, Output], bool, bool],
- Union[EvalReport, Awaitable[EvalReport]],
+ EvalReport | Awaitable[EvalReport],
  ]
  """
  A function that takes an evaluator and its result and returns a report.
  """

- report_run: Callable[[List[EvalReport], bool, bool], Union[bool, Awaitable[bool]]]
+ report_run: Callable[[list[EvalReport], bool, bool], bool | Awaitable[bool]]
  """
  A function that takes all evaluator results and returns a boolean indicating whether the run was successful.
  If you return false, the `braintrust eval` command will exit with a non-zero status code.
@@ -547,15 +537,13 @@ class ReporterDef(SerializableDataClass, Generic[Input, Output, EvalReport]):
  result: EvalResultWithSummary[Input, Output],
  verbose: bool,
  jsonl: bool,
- ) -> Union[EvalReport, Awaitable[EvalReport]]:
+ ) -> EvalReport | Awaitable[EvalReport]:
  event_loop = asyncio.get_event_loop()
  return await call_user_fn(
  event_loop, self.report_eval, evaluator=evaluator, result=result, verbose=verbose, jsonl=jsonl
  )

- async def _call_report_run(
- self, results: List[EvalReport], verbose: bool, jsonl: bool
- ) -> Union[bool, Awaitable[bool]]:
+ async def _call_report_run(self, results: list[EvalReport], verbose: bool, jsonl: bool) -> bool | Awaitable[bool]:
  event_loop = asyncio.get_event_loop()
  return await call_user_fn(event_loop, self.report_run, results=results, verbose=verbose, jsonl=jsonl)

@@ -563,13 +551,13 @@ class ReporterDef(SerializableDataClass, Generic[Input, Output, EvalReport]):
  @dataclasses.dataclass
  class EvaluatorInstance(SerializableDataClass, Generic[Input, Output, EvalReport]):
  evaluator: Evaluator[Input, Output]
- reporter: Optional[Union[ReporterDef[Input, Output, EvalReport], str]]
+ reporter: ReporterDef[Input, Output, EvalReport] | str | None


  @dataclasses.dataclass
  class EvaluatorFile(SerializableDataClass):
- evaluators: Dict[str, EvaluatorInstance] = dataclasses.field(default_factory=dict)
- reporters: Dict[str, ReporterDef] = dataclasses.field(default_factory=dict)
+ evaluators: dict[str, EvaluatorInstance] = dataclasses.field(default_factory=dict)
+ reporters: dict[str, ReporterDef] = dataclasses.field(default_factory=dict)

  def clear(self):
  self.evaluators.clear()
@@ -651,7 +639,7 @@ default_reporter = ReporterDef(
  )


- def _make_eval_name(name: str, experiment_name: Optional[str]):
+ def _make_eval_name(name: str, experiment_name: str | None):
  out = name
  if experiment_name is not None:
  out += f" [experiment_name={experiment_name}]"
@@ -663,28 +651,28 @@ def _EvalCommon(
  data: EvalData[Input, Output],
  task: EvalTask[Input, Output],
  scores: Sequence[EvalScorer[Input, Output]],
- experiment_name: Optional[str],
+ experiment_name: str | None,
  trial_count: int,
- metadata: Optional[Metadata],
+ metadata: Metadata | None,
  is_public: bool,
  update: bool,
- reporter: Optional[ReporterDef[Input, Output, EvalReport]],
- timeout: Optional[float],
- max_concurrency: Optional[int],
- project_id: Optional[str],
- base_experiment_name: Optional[str],
- base_experiment_id: Optional[str],
- git_metadata_settings: Optional[GitMetadataSettings],
- repo_info: Optional[RepoInfo],
- description: Optional[str],
+ reporter: ReporterDef[Input, Output, EvalReport] | None,
+ timeout: float | None,
+ max_concurrency: int | None,
+ project_id: str | None,
+ base_experiment_name: str | None,
+ base_experiment_id: str | None,
+ git_metadata_settings: GitMetadataSettings | None,
+ repo_info: RepoInfo | None,
+ description: str | None,
  summarize_scores: bool,
  no_send_logs: bool,
- error_score_handler: Optional[ErrorScoreHandler] = None,
- parameters: Optional[EvalParameters] = None,
- on_start: Optional[Callable[[ExperimentSummary], None]] = None,
- stream: Optional[Callable[[SSEProgressEvent], None]] = None,
- parent: Optional[str] = None,
- state: Optional[BraintrustState] = None,
+ error_score_handler: ErrorScoreHandler | None = None,
+ parameters: EvalParameters | None = None,
+ on_start: Callable[[ExperimentSummary], None] | None = None,
+ stream: Callable[[SSEProgressEvent], None] | None = None,
+ parent: str | None = None,
+ state: BraintrustState | None = None,
  ) -> Callable[[], Coroutine[Any, Any, EvalResultWithSummary[Input, Output]]]:
  """
  This helper is needed because in case of `_lazy_load`, we need to update
@@ -788,28 +776,28 @@ async def EvalAsync(
  data: EvalData[Input, Output],
  task: EvalTask[Input, Output],
  scores: Sequence[EvalScorer[Input, Output]],
- experiment_name: Optional[str] = None,
+ experiment_name: str | None = None,
  trial_count: int = 1,
- metadata: Optional[Metadata] = None,
+ metadata: Metadata | None = None,
  is_public: bool = False,
  update: bool = False,
- reporter: Optional[ReporterDef[Input, Output, EvalReport]] = None,
- timeout: Optional[float] = None,
- max_concurrency: Optional[int] = None,
- project_id: Optional[str] = None,
- base_experiment_name: Optional[str] = None,
- base_experiment_id: Optional[str] = None,
- git_metadata_settings: Optional[GitMetadataSettings] = None,
- repo_info: Optional[RepoInfo] = None,
- error_score_handler: Optional[ErrorScoreHandler] = None,
- description: Optional[str] = None,
+ reporter: ReporterDef[Input, Output, EvalReport] | None = None,
+ timeout: float | None = None,
+ max_concurrency: int | None = None,
+ project_id: str | None = None,
+ base_experiment_name: str | None = None,
+ base_experiment_id: str | None = None,
+ git_metadata_settings: GitMetadataSettings | None = None,
+ repo_info: RepoInfo | None = None,
+ error_score_handler: ErrorScoreHandler | None = None,
+ description: str | None = None,
  summarize_scores: bool = True,
  no_send_logs: bool = False,
- parameters: Optional[EvalParameters] = None,
- on_start: Optional[Callable[[ExperimentSummary], None]] = None,
- stream: Optional[Callable[[SSEProgressEvent], None]] = None,
- parent: Optional[str] = None,
- state: Optional[BraintrustState] = None,
+ parameters: EvalParameters | None = None,
+ on_start: Callable[[ExperimentSummary], None] | None = None,
+ stream: Callable[[SSEProgressEvent], None] | None = None,
+ parent: str | None = None,
+ state: BraintrustState | None = None,
  ) -> EvalResultWithSummary[Input, Output]:
  """
  A function you can use to define an evaluator. This is a convenience wrapper around the `Evaluator` class.
@@ -908,28 +896,28 @@ def Eval(
  data: EvalData[Input, Output],
  task: EvalTask[Input, Output],
  scores: Sequence[EvalScorer[Input, Output]],
- experiment_name: Optional[str] = None,
+ experiment_name: str | None = None,
  trial_count: int = 1,
- metadata: Optional[Metadata] = None,
+ metadata: Metadata | None = None,
  is_public: bool = False,
  update: bool = False,
- reporter: Optional[ReporterDef[Input, Output, EvalReport]] = None,
- timeout: Optional[float] = None,
- max_concurrency: Optional[int] = None,
- project_id: Optional[str] = None,
- base_experiment_name: Optional[str] = None,
- base_experiment_id: Optional[str] = None,
- git_metadata_settings: Optional[GitMetadataSettings] = None,
- repo_info: Optional[RepoInfo] = None,
- error_score_handler: Optional[ErrorScoreHandler] = None,
- description: Optional[str] = None,
+ reporter: ReporterDef[Input, Output, EvalReport] | None = None,
+ timeout: float | None = None,
+ max_concurrency: int | None = None,
+ project_id: str | None = None,
+ base_experiment_name: str | None = None,
+ base_experiment_id: str | None = None,
+ git_metadata_settings: GitMetadataSettings | None = None,
+ repo_info: RepoInfo | None = None,
+ error_score_handler: ErrorScoreHandler | None = None,
+ description: str | None = None,
  summarize_scores: bool = True,
  no_send_logs: bool = False,
- parameters: Optional[EvalParameters] = None,
- on_start: Optional[Callable[[ExperimentSummary], None]] = None,
- stream: Optional[Callable[[SSEProgressEvent], None]] = None,
- parent: Optional[str] = None,
- state: Optional[BraintrustState] = None,
+ parameters: EvalParameters | None = None,
+ on_start: Callable[[ExperimentSummary], None] | None = None,
+ stream: Callable[[SSEProgressEvent], None] | None = None,
+ parent: str | None = None,
+ state: BraintrustState | None = None,
  ) -> EvalResultWithSummary[Input, Output]:
  """
  A function you can use to define an evaluator. This is a convenience wrapper around the `Evaluator` class.
@@ -1046,9 +1034,9 @@ def Reporter(
  name: str,
  report_eval: Callable[
  [Evaluator[Input, Output], EvalResultWithSummary[Input, Output], bool, bool],
- Union[EvalReport, Awaitable[EvalReport]],
+ EvalReport | Awaitable[EvalReport],
  ],
- report_run: Callable[[List[EvalReport], bool, bool], Union[bool, Awaitable[bool]]],
+ report_run: Callable[[list[EvalReport], bool, bool], bool | Awaitable[bool]],
  ):
  """
  A function you can use to define a reporter. This is a convenience wrapper around the `ReporterDef` class.
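
A hypothetical reporter wired through the `Reporter` signature above; the report bodies are illustrative only, using the `experiment_name` and `results` fields shown earlier in this diff:

def report_eval(evaluator, result, verbose, jsonl):
    # Any value can serve as this reporter's EvalReport.
    return {"experiment": evaluator.experiment_name, "n_results": len(result.results)}


def report_run(reports, verbose, jsonl):
    # Returning False makes `braintrust eval` exit with a non-zero status code.
    return all(report["n_results"] > 0 for report in reports)


Reporter("minimal-reporter", report_eval=report_eval, report_run=report_run)
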
@@ -1086,7 +1074,7 @@ def Reporter(

  @dataclasses.dataclass
  class Filter:
- path: List[str]
+ path: list[str]
  pattern: re.Pattern


@@ -1104,7 +1092,7 @@ def deserialize_plain_string_as_json(s: str) -> Any:
  return {"value": s, "error": e}


- def parse_filters(filters: List[str]) -> List[Filter]:
+ def parse_filters(filters: list[str]) -> list[Filter]:
  result = []
  for f in filters:
  equals_idx = f.index("=")
@@ -1133,15 +1121,15 @@ def evaluate_filter(object, filter: Filter):
  return filter.pattern.match(serialize_json_with_plain_string(key)) is not None


- class DictEvalHooks(Dict[str, Any]):
+ class DictEvalHooks(dict[str, Any]):
  def __init__(
  self,
- metadata: Optional[Any] = None,
- expected: Optional[Any] = None,
+ metadata: Any | None = None,
+ expected: Any | None = None,
  trial_index: int = 0,
- tags: Optional[Sequence[str]] = None,
+ tags: Sequence[str] | None = None,
  report_progress: Callable[[TaskProgressEvent], None] = None,
- parameters: Optional[Dict[str, Any]] = None,
+ parameters: dict[str, Any] | None = None,
  ):
  if metadata is not None:
  self.update({"metadata": metadata})
@@ -1170,10 +1158,10 @@ class DictEvalHooks(Dict[str, Any]):
  return self.get("trial_index", 0)

  @property
- def span(self) -> Optional[Span]:
+ def span(self) -> Span | None:
  return self._span

- def set_span(self, span: Optional[Span]):
+ def set_span(self, span: Span | None):
  self._span = span

  @property
@@ -1181,8 +1169,8 @@ class DictEvalHooks(Dict[str, Any]):
  return self["tags"]

  @tags.setter
- def tags(self, tags: Optional[Sequence[str]]) -> None:
- self["tags"] = [] if tags is None else list(tags)
+ def tags(self, tags: Sequence[str] | None) -> None:
+ self["tags"] = [] if tags is None else list(tags)

  def meta(self, **info: Any):
  warnings.warn(
@@ -1199,12 +1187,12 @@ class DictEvalHooks(Dict[str, Any]):
  return self._report_progress(event)

  @property
- def parameters(self) -> Optional[Dict[str, Any]]:
+ def parameters(self) -> dict[str, Any] | None:
  return self._parameters


  def init_experiment(
- project_name: Optional[str] = None, experiment_name: Optional[str] = None, set_current: bool = False, **kwargs: Any
+ project_name: str | None = None, experiment_name: str | None = None, set_current: bool = False, **kwargs: Any
  ) -> Experiment:
  ret = _init_experiment(project=project_name, experiment=experiment_name, set_current=set_current, **kwargs)
  summary = ret.summarize(summarize_scores=False)
@@ -1255,12 +1243,12 @@ def _scorer_name(scorer, scorer_idx):


  async def run_evaluator(
- experiment: Optional[Experiment],
+ experiment: Experiment | None,
  evaluator: Evaluator[Input, Output],
- position: Optional[int],
- filters: List[Filter],
- stream: Optional[Callable[[SSEProgressEvent], None]] = None,
- state: Optional[BraintrustState] = None,
+ position: int | None,
+ filters: list[Filter],
+ stream: Callable[[SSEProgressEvent], None] | None = None,
+ state: BraintrustState | None = None,
  ) -> EvalResultWithSummary[Input, Output]:
  """Wrapper on _run_evaluator_internal that times out execution after evaluator.timeout."""
  results = await asyncio.wait_for(
@@ -1278,7 +1266,7 @@ async def run_evaluator(
  def default_error_score_handler(
  root_span: Span,
  data: EvalCase[Input, Output],
- unhandled_scores: List[str],
+ unhandled_scores: list[str],
  ):
  scores = {s: 0 for s in unhandled_scores}
  root_span.log(scores=scores)
@@ -1288,16 +1276,25 @@ def default_error_score_handler(
  async def _run_evaluator_internal(
  experiment,
  evaluator: Evaluator,
- position: Optional[int],
- filters: List[Filter],
- stream: Optional[Callable[[SSEProgressEvent], None]] = None,
- state: Optional[BraintrustState] = None,
+ position: int | None,
+ filters: list[Filter],
+ stream: Callable[[SSEProgressEvent], None] | None = None,
+ state: BraintrustState | None = None,
  ):
  event_loop = asyncio.get_event_loop()

  async def await_or_run_scorer(root_span, scorer, name, **kwargs):
+ # Merge purpose into parent's propagated_event rather than replacing it
+ parent_propagated = root_span.propagated_event or {}
+ merged_propagated = merge_dicts(
+ {**parent_propagated},
+ {"span_attributes": {"purpose": "scorer"}},
+ )
  with root_span.start_span(
- name=name, span_attributes={"type": SpanTypeAttribute.SCORE}, input=dict(**kwargs)
+ name=name,
+ span_attributes={"type": SpanTypeAttribute.SCORE, "purpose": "scorer"},
+ propagated_event=merged_propagated,
+ input=dict(**kwargs),
  ) as span:
  score = scorer
  if hasattr(scorer, "eval_async"):
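
The scorer-span change above deep-merges {"span_attributes": {"purpose": "scorer"}} into the parent span's `propagated_event` instead of overwriting it. A minimal sketch of that merge behavior (an assumption for illustration, not braintrust's `merge_dicts` from braintrust/util.py):

def merge_dicts_sketch(base: dict, patch: dict) -> dict:
    # Recursively merge `patch` into `base`: nested dicts merge, other values overwrite.
    for key, value in patch.items():
        if isinstance(value, dict) and isinstance(base.get(key), dict):
            merge_dicts_sketch(base[key], value)
        else:
            base[key] = value
    return base


parent = {"span_attributes": {"name": "eval"}, "metadata": {"run": 1}}
merged = merge_dicts_sketch({**parent}, {"span_attributes": {"purpose": "scorer"}})
# merged == {"span_attributes": {"name": "eval", "purpose": "scorer"}, "metadata": {"run": 1}}
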
@@ -1557,7 +1554,7 @@


  def build_local_summary(
- evaluator: Evaluator[Input, Output], results: List[EvalResultWithSummary[Input, Output]]
+ evaluator: Evaluator[Input, Output], results: list[EvalResultWithSummary[Input, Output]]
  ) -> ExperimentSummary:
  scores_by_name = defaultdict(lambda: (0, 0))
  for result in results: