braintrust 0.3.15__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- braintrust/_generated_types.py +737 -672
- braintrust/audit.py +2 -2
- braintrust/cli/eval.py +6 -7
- braintrust/cli/push.py +11 -11
- braintrust/context.py +12 -17
- braintrust/contrib/temporal/__init__.py +16 -27
- braintrust/contrib/temporal/test_temporal.py +8 -3
- braintrust/devserver/auth.py +8 -8
- braintrust/devserver/cache.py +3 -4
- braintrust/devserver/cors.py +8 -7
- braintrust/devserver/dataset.py +3 -5
- braintrust/devserver/eval_hooks.py +7 -6
- braintrust/devserver/schemas.py +22 -19
- braintrust/devserver/server.py +19 -12
- braintrust/devserver/test_cached_login.py +4 -4
- braintrust/framework.py +128 -140
- braintrust/framework2.py +88 -87
- braintrust/functions/invoke.py +66 -59
- braintrust/functions/stream.py +3 -2
- braintrust/generated_types.py +3 -1
- braintrust/git_fields.py +11 -11
- braintrust/gitutil.py +2 -3
- braintrust/graph_util.py +10 -10
- braintrust/id_gen.py +2 -2
- braintrust/logger.py +346 -357
- braintrust/merge_row_batch.py +10 -9
- braintrust/oai.py +21 -20
- braintrust/otel/__init__.py +49 -49
- braintrust/otel/context.py +16 -30
- braintrust/otel/test_distributed_tracing.py +14 -11
- braintrust/otel/test_otel_bt_integration.py +32 -31
- braintrust/parameters.py +8 -8
- braintrust/prompt.py +14 -14
- braintrust/prompt_cache/disk_cache.py +5 -4
- braintrust/prompt_cache/lru_cache.py +3 -2
- braintrust/prompt_cache/prompt_cache.py +13 -14
- braintrust/queue.py +4 -4
- braintrust/score.py +4 -4
- braintrust/serializable_data_class.py +4 -4
- braintrust/span_identifier_v1.py +1 -2
- braintrust/span_identifier_v2.py +3 -4
- braintrust/span_identifier_v3.py +23 -20
- braintrust/span_identifier_v4.py +34 -25
- braintrust/test_framework.py +16 -6
- braintrust/test_helpers.py +5 -5
- braintrust/test_id_gen.py +2 -3
- braintrust/test_otel.py +61 -53
- braintrust/test_queue.py +0 -1
- braintrust/test_score.py +1 -3
- braintrust/test_span_components.py +29 -44
- braintrust/util.py +9 -8
- braintrust/version.py +2 -2
- braintrust/wrappers/_anthropic_utils.py +4 -4
- braintrust/wrappers/agno/__init__.py +3 -4
- braintrust/wrappers/agno/agent.py +1 -2
- braintrust/wrappers/agno/function_call.py +1 -2
- braintrust/wrappers/agno/model.py +1 -2
- braintrust/wrappers/agno/team.py +1 -2
- braintrust/wrappers/agno/utils.py +12 -12
- braintrust/wrappers/anthropic.py +7 -8
- braintrust/wrappers/claude_agent_sdk/__init__.py +3 -4
- braintrust/wrappers/claude_agent_sdk/_wrapper.py +29 -27
- braintrust/wrappers/dspy.py +15 -17
- braintrust/wrappers/google_genai/__init__.py +16 -16
- braintrust/wrappers/langchain.py +22 -24
- braintrust/wrappers/litellm.py +4 -3
- braintrust/wrappers/openai.py +15 -15
- braintrust/wrappers/pydantic_ai.py +21 -20
- braintrust/wrappers/test_agno.py +0 -1
- braintrust/wrappers/test_dspy.py +0 -1
- braintrust/wrappers/test_google_genai.py +2 -3
- braintrust/wrappers/test_litellm.py +0 -1
- {braintrust-0.3.15.dist-info → braintrust-0.4.0.dist-info}/METADATA +3 -2
- braintrust-0.4.0.dist-info/RECORD +120 -0
- braintrust-0.3.15.dist-info/RECORD +0 -120
- {braintrust-0.3.15.dist-info → braintrust-0.4.0.dist-info}/WHEEL +0 -0
- {braintrust-0.3.15.dist-info → braintrust-0.4.0.dist-info}/entry_points.txt +0 -0
- {braintrust-0.3.15.dist-info → braintrust-0.4.0.dist-info}/top_level.txt +0 -0
braintrust/framework.py
CHANGED
@@ -9,23 +9,15 @@ import sys
 import traceback
 import warnings
 from collections import defaultdict
+from collections.abc import Awaitable, Callable, Coroutine, Iterable, Iterator, Sequence
 from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
 from multiprocessing import cpu_count
 from typing import (
     Any,
-    Awaitable,
-    Callable,
-    Coroutine,
-    Dict,
     Generic,
-    Iterable,
-    Iterator,
-    List,
     Literal,
     Optional,
-    Sequence,
-    Type,
     TypeVar,
     Union,
 )
@@ -82,14 +74,14 @@ class EvalCase(SerializableDataClass, Generic[Input, Output]):
     """

     input: Input
-    expected:
-    metadata:
-    tags:
+    expected: Output | None = None
+    metadata: Metadata | None = None
+    tags: Sequence[str] | None = None

     # These fields are only set if the EvalCase is part of a Dataset.
-    id:
-    _xact_id:
-    created:
+    id: str | None = None
+    _xact_id: str | None = None
+    created: str | None = None


 class _EvalCaseDictNoOutput(Generic[Input], TypedDict):
@@ -101,11 +93,11 @@ class _EvalCaseDictNoOutput(Generic[Input], TypedDict):
     """

     input: Input
-    metadata: NotRequired[
-    tags: NotRequired[
+    metadata: NotRequired[Metadata | None]
+    tags: NotRequired[Sequence[str] | None]

-    id: NotRequired[
-    _xact_id: NotRequired[
+    id: NotRequired[str | None]
+    _xact_id: NotRequired[str | None]


 class _EvalCaseDict(Generic[Input, Output], _EvalCaseDictNoOutput[Input]):
@@ -113,7 +105,7 @@ class _EvalCaseDict(Generic[Input, Output], _EvalCaseDictNoOutput[Input]):
     Mirrors EvalCase for callers who pass a dict instead of dataclass.
     """

-    expected: NotRequired[
+    expected: NotRequired[Output | None]


 # Inheritance doesn't quite work for dataclasses, so we redefine the fields
@@ -124,12 +116,12 @@ class EvalResult(SerializableDataClass, Generic[Input, Output]):

     input: Input
     output: Output
-    scores:
-    expected:
-    metadata:
-    tags:
-    error:
-    exc_info:
+    scores: dict[str, float | None]
+    expected: Output | None = None
+    metadata: Metadata | None = None
+    tags: list[str] | None = None
+    error: Exception | None = None
+    exc_info: str | None = None


 @dataclasses.dataclass
@@ -177,7 +169,7 @@ class EvalHooks(abc.ABC, Generic[Output]):

     @property
     @abc.abstractmethod
-    def expected(self) ->
+    def expected(self) -> Output | None:
         """
         The expected output for the current evaluation.
         """
@@ -222,7 +214,7 @@ class EvalHooks(abc.ABC, Generic[Output]):

     @property
     @abc.abstractmethod
-    def parameters(self) ->
+    def parameters(self) -> dict[str, Any] | None:
         """
         The parameters for the current evaluation. These are the validated parameter values
         that were passed to the evaluator.
@@ -236,11 +228,11 @@ class EvalScorerArgs(SerializableDataClass, Generic[Input, Output]):

     input: Input
     output: Output
-    expected:
-    metadata:
+    expected: Output | None = None
+    metadata: Metadata | None = None


-OneOrMoreScores = Union[float, int, bool, None, Score,
+OneOrMoreScores = Union[float, int, bool, None, Score, list[Score]]


 # Synchronous scorer interface - implements callable
@@ -251,7 +243,7 @@ class SyncScorerLike(Protocol, Generic[Input, Output]):
     """

     def __call__(
-        self, input: Input, output: Output, expected:
+        self, input: Input, output: Output, expected: Output | None = None, **kwargs: Any
     ) -> OneOrMoreScores: ...


@@ -262,9 +254,7 @@ class AsyncScorerLike(Protocol, Generic[Input, Output]):
     The framework will prefer this interface if available.
     """

-    async def eval_async(
-        self, output: Output, expected: Optional[Output] = None, **kwargs: Any
-    ) -> OneOrMoreScores: ...
+    async def eval_async(self, output: Output, expected: Output | None = None, **kwargs: Any) -> OneOrMoreScores: ...


 # Union type for any kind of scorer (for typing)
@@ -272,7 +262,7 @@ ScorerLike = Union[SyncScorerLike[Input, Output], AsyncScorerLike[Input, Output]

 EvalScorer = Union[
     ScorerLike[Input, Output],
-    Type[ScorerLike[Input, Output]],
+    type[ScorerLike[Input, Output]],
     Callable[[Input, Output, Output], OneOrMoreScores],
     Callable[[Input, Output, Output], Awaitable[OneOrMoreScores]],
 ]
@@ -286,7 +276,7 @@ class BaseExperiment:
     use based on your git history (or fall back to timestamps).
     """

-    name:
+    name: str | None = None
     """
     The name of the base experiment to use. If unspecified, Braintrust will automatically figure out the best base
     using your git history (or fall back to timestamps).
@@ -308,14 +298,14 @@ _EvalDataObject = Union[
     BaseExperiment,
 ]

-EvalData = Union[_EvalDataObject[Input, Output],
+EvalData = Union[_EvalDataObject[Input, Output], type[_EvalDataObject[Input, Output]], Dataset]

 EvalTask = Union[
     Callable[[Input], Union[Output, Awaitable[Output]]],
     Callable[[Input, EvalHooks[Output]], Union[Output, Awaitable[Output]]],
 ]

-ErrorScoreHandler = Callable[[Span, EvalCase[Input, Output],
+ErrorScoreHandler = Callable[[Span, EvalCase[Input, Output], list[str]], Optional[dict[str, float]]]


 @dataclasses.dataclass
@@ -350,18 +340,18 @@ class Evaluator(Generic[Input, Output]):
     Runs the evaluation task on a single input. The `hooks` object can be used to add metadata to the evaluation.
     """

-    scores:
+    scores: list[EvalScorer[Input, Output]]
     """
     A list of scorers to evaluate the results of the task. Each scorer can be a Scorer object or a function
     that takes `input`, `output`, and `expected` arguments and returns a `Score` object. The function can be async.
     """

-    experiment_name:
+    experiment_name: str | None
     """
     Optional experiment name. If not specified, a name will be generated automatically.
     """

-    metadata:
+    metadata: Metadata | None
     """
     A dictionary with additional data about the test example, model outputs, or just about anything else that's
     relevant, that you can use to help find and analyze examples later. For example, you could log the `prompt`,
@@ -386,54 +376,54 @@ class Evaluator(Generic[Input, Output]):
     Whether to update an existing experiment with `experiment_name` if one exists. Defaults to false.
     """

-    timeout:
+    timeout: float | None = None
     """
     The duration, in seconds, after which to time out the evaluation.
     Defaults to None, in which case there is no timeout.
     """

-    max_concurrency:
+    max_concurrency: int | None = None
     """
     The maximum number of tasks/scorers that will be run concurrently.
     Defaults to None, in which case there is no max concurrency.
     """

-    project_id:
+    project_id: str | None = None
     """
     If specified, uses the given project ID instead of the evaluator's name to identify the project.
     """

-    base_experiment_name:
+    base_experiment_name: str | None = None
     """
     An optional experiment name to use as a base. If specified, the new experiment will be summarized and
     compared to this experiment.
     """

-    base_experiment_id:
+    base_experiment_id: str | None = None
     """
     An optional experiment id to use as a base. If specified, the new experiment will be summarized and
     compared to this experiment. This takes precedence over `base_experiment_name` if specified.
     """

-    git_metadata_settings:
+    git_metadata_settings: GitMetadataSettings | None = None
     """
     Optional settings for collecting git metadata. By default, will collect all
     git metadata fields allowed in org-level settings.
     """

-    repo_info:
+    repo_info: RepoInfo | None = None
     """
     Optionally explicitly specify the git metadata for this experiment. This
     takes precedence over `git_metadata_settings` if specified.
     """

-    error_score_handler:
+    error_score_handler: ErrorScoreHandler | None = None
     """
     Optionally supply a custom function to specifically handle score values when tasks or scoring functions have errored.
     A default implementation is exported as `default_error_score_handler` which will log a 0 score to the root span for any scorer that was not run.
     """

-    description:
+    description: str | None = None
     """
     An optional description for the experiment.
     """
@@ -443,7 +433,7 @@ class Evaluator(Generic[Input, Output]):
     Whether to summarize the scores of the experiment after it has run.
     """

-    parameters:
+    parameters: EvalParameters | None = None
     """
     A set of parameters that will be passed to the evaluator.
     Can be used to define prompts or other configurable values.
@@ -453,7 +443,7 @@ class Evaluator(Generic[Input, Output]):
 @dataclasses.dataclass
 class EvalResultWithSummary(SerializableDataClass, Generic[Input, Output]):
     summary: ExperimentSummary
-    results:
+    results: list[EvalResult[Input, Output]]

     def _repr_pretty_(self, p, cycle):
         p.text(f'EvalResultWithSummary(summary="...", results=[...])')
@@ -529,13 +519,13 @@ class ReporterDef(SerializableDataClass, Generic[Input, Output, EvalReport]):

     report_eval: Callable[
         [Evaluator[Input, Output], EvalResultWithSummary[Input, Output], bool, bool],
-        Union[EvalReport, Awaitable[EvalReport]],
+        EvalReport | Awaitable[EvalReport],
     ]
     """
     A function that takes an evaluator and its result and returns a report.
     """

-    report_run: Callable[[
+    report_run: Callable[[list[EvalReport], bool, bool], bool | Awaitable[bool]]
     """
     A function that takes all evaluator results and returns a boolean indicating whether the run was successful.
     If you return false, the `braintrust eval` command will exit with a non-zero status code.
@@ -547,15 +537,13 @@ class ReporterDef(SerializableDataClass, Generic[Input, Output, EvalReport]):
         result: EvalResultWithSummary[Input, Output],
         verbose: bool,
         jsonl: bool,
-    ) ->
+    ) -> EvalReport | Awaitable[EvalReport]:
         event_loop = asyncio.get_event_loop()
         return await call_user_fn(
             event_loop, self.report_eval, evaluator=evaluator, result=result, verbose=verbose, jsonl=jsonl
         )

-    async def _call_report_run(
-        self, results: List[EvalReport], verbose: bool, jsonl: bool
-    ) -> Union[bool, Awaitable[bool]]:
+    async def _call_report_run(self, results: list[EvalReport], verbose: bool, jsonl: bool) -> bool | Awaitable[bool]:
         event_loop = asyncio.get_event_loop()
         return await call_user_fn(event_loop, self.report_run, results=results, verbose=verbose, jsonl=jsonl)

@@ -563,13 +551,13 @@ class ReporterDef(SerializableDataClass, Generic[Input, Output, EvalReport]):
 @dataclasses.dataclass
 class EvaluatorInstance(SerializableDataClass, Generic[Input, Output, EvalReport]):
     evaluator: Evaluator[Input, Output]
-    reporter:
+    reporter: ReporterDef[Input, Output, EvalReport] | str | None


 @dataclasses.dataclass
 class EvaluatorFile(SerializableDataClass):
-    evaluators:
-    reporters:
+    evaluators: dict[str, EvaluatorInstance] = dataclasses.field(default_factory=dict)
+    reporters: dict[str, ReporterDef] = dataclasses.field(default_factory=dict)

     def clear(self):
         self.evaluators.clear()
@@ -651,7 +639,7 @@ default_reporter = ReporterDef(
 )


-def _make_eval_name(name: str, experiment_name:
+def _make_eval_name(name: str, experiment_name: str | None):
     out = name
     if experiment_name is not None:
         out += f" [experiment_name={experiment_name}]"
@@ -663,28 +651,28 @@ def _EvalCommon(
     data: EvalData[Input, Output],
     task: EvalTask[Input, Output],
     scores: Sequence[EvalScorer[Input, Output]],
-    experiment_name:
+    experiment_name: str | None,
     trial_count: int,
-    metadata:
+    metadata: Metadata | None,
     is_public: bool,
     update: bool,
-    reporter:
-    timeout:
-    max_concurrency:
-    project_id:
-    base_experiment_name:
-    base_experiment_id:
-    git_metadata_settings:
-    repo_info:
-    description:
+    reporter: ReporterDef[Input, Output, EvalReport] | None,
+    timeout: float | None,
+    max_concurrency: int | None,
+    project_id: str | None,
+    base_experiment_name: str | None,
+    base_experiment_id: str | None,
+    git_metadata_settings: GitMetadataSettings | None,
+    repo_info: RepoInfo | None,
+    description: str | None,
     summarize_scores: bool,
     no_send_logs: bool,
-    error_score_handler:
-    parameters:
-    on_start:
-    stream:
-    parent:
-    state:
+    error_score_handler: ErrorScoreHandler | None = None,
+    parameters: EvalParameters | None = None,
+    on_start: Callable[[ExperimentSummary], None] | None = None,
+    stream: Callable[[SSEProgressEvent], None] | None = None,
+    parent: str | None = None,
+    state: BraintrustState | None = None,
 ) -> Callable[[], Coroutine[Any, Any, EvalResultWithSummary[Input, Output]]]:
     """
     This helper is needed because in case of `_lazy_load`, we need to update
@@ -788,28 +776,28 @@ async def EvalAsync(
     data: EvalData[Input, Output],
     task: EvalTask[Input, Output],
     scores: Sequence[EvalScorer[Input, Output]],
-    experiment_name:
+    experiment_name: str | None = None,
     trial_count: int = 1,
-    metadata:
+    metadata: Metadata | None = None,
     is_public: bool = False,
     update: bool = False,
-    reporter:
-    timeout:
-    max_concurrency:
-    project_id:
-    base_experiment_name:
-    base_experiment_id:
-    git_metadata_settings:
-    repo_info:
-    error_score_handler:
-    description:
+    reporter: ReporterDef[Input, Output, EvalReport] | None = None,
+    timeout: float | None = None,
+    max_concurrency: int | None = None,
+    project_id: str | None = None,
+    base_experiment_name: str | None = None,
+    base_experiment_id: str | None = None,
+    git_metadata_settings: GitMetadataSettings | None = None,
+    repo_info: RepoInfo | None = None,
+    error_score_handler: ErrorScoreHandler | None = None,
+    description: str | None = None,
     summarize_scores: bool = True,
     no_send_logs: bool = False,
-    parameters:
-    on_start:
-    stream:
-    parent:
-    state:
+    parameters: EvalParameters | None = None,
+    on_start: Callable[[ExperimentSummary], None] | None = None,
+    stream: Callable[[SSEProgressEvent], None] | None = None,
+    parent: str | None = None,
+    state: BraintrustState | None = None,
 ) -> EvalResultWithSummary[Input, Output]:
     """
     A function you can use to define an evaluator. This is a convenience wrapper around the `Evaluator` class.
@@ -908,28 +896,28 @@ def Eval(
     data: EvalData[Input, Output],
     task: EvalTask[Input, Output],
     scores: Sequence[EvalScorer[Input, Output]],
-    experiment_name:
+    experiment_name: str | None = None,
    trial_count: int = 1,
-    metadata:
+    metadata: Metadata | None = None,
     is_public: bool = False,
     update: bool = False,
-    reporter:
-    timeout:
-    max_concurrency:
-    project_id:
-    base_experiment_name:
-    base_experiment_id:
-    git_metadata_settings:
-    repo_info:
-    error_score_handler:
-    description:
+    reporter: ReporterDef[Input, Output, EvalReport] | None = None,
+    timeout: float | None = None,
+    max_concurrency: int | None = None,
+    project_id: str | None = None,
+    base_experiment_name: str | None = None,
+    base_experiment_id: str | None = None,
+    git_metadata_settings: GitMetadataSettings | None = None,
+    repo_info: RepoInfo | None = None,
+    error_score_handler: ErrorScoreHandler | None = None,
+    description: str | None = None,
     summarize_scores: bool = True,
     no_send_logs: bool = False,
-    parameters:
-    on_start:
-    stream:
-    parent:
-    state:
+    parameters: EvalParameters | None = None,
+    on_start: Callable[[ExperimentSummary], None] | None = None,
+    stream: Callable[[SSEProgressEvent], None] | None = None,
+    parent: str | None = None,
+    state: BraintrustState | None = None,
 ) -> EvalResultWithSummary[Input, Output]:
     """
     A function you can use to define an evaluator. This is a convenience wrapper around the `Evaluator` class.
@@ -1046,9 +1034,9 @@ def Reporter(
     name: str,
     report_eval: Callable[
         [Evaluator[Input, Output], EvalResultWithSummary[Input, Output], bool, bool],
-        Union[EvalReport, Awaitable[EvalReport]],
+        EvalReport | Awaitable[EvalReport],
     ],
-    report_run: Callable[[
+    report_run: Callable[[list[EvalReport], bool, bool], bool | Awaitable[bool]],
 ):
     """
     A function you can use to define a reporter. This is a convenience wrapper around the `ReporterDef` class.
@@ -1086,7 +1074,7 @@ def Reporter(

 @dataclasses.dataclass
 class Filter:
-    path:
+    path: list[str]
     pattern: re.Pattern


@@ -1104,7 +1092,7 @@ def deserialize_plain_string_as_json(s: str) -> Any:
         return {"value": s, "error": e}


-def parse_filters(filters:
+def parse_filters(filters: list[str]) -> list[Filter]:
     result = []
     for f in filters:
         equals_idx = f.index("=")
@@ -1133,15 +1121,15 @@ def evaluate_filter(object, filter: Filter):
     return filter.pattern.match(serialize_json_with_plain_string(key)) is not None


-class DictEvalHooks(
+class DictEvalHooks(dict[str, Any]):
     def __init__(
         self,
-        metadata:
-        expected:
+        metadata: Any | None = None,
+        expected: Any | None = None,
         trial_index: int = 0,
-        tags:
+        tags: Sequence[str] | None = None,
         report_progress: Callable[[TaskProgressEvent], None] = None,
-        parameters:
+        parameters: dict[str, Any] | None = None,
     ):
         if metadata is not None:
             self.update({"metadata": metadata})
@@ -1170,10 +1158,10 @@ class DictEvalHooks(Dict[str, Any]):
         return self.get("trial_index", 0)

     @property
-    def span(self) ->
+    def span(self) -> Span | None:
         return self._span

-    def set_span(self, span:
+    def set_span(self, span: Span | None):
         self._span = span

     @property
@@ -1181,8 +1169,8 @@ class DictEvalHooks(Dict[str, Any]):
         return self["tags"]

     @tags.setter
-    def tags(self, tags:
-        self["tags"] =
+    def tags(self, tags: Sequence[str] | None) -> None:
+        self["tags"] = [] if tags is None else list(tags)

     def meta(self, **info: Any):
         warnings.warn(
@@ -1199,12 +1187,12 @@ class DictEvalHooks(Dict[str, Any]):
         return self._report_progress(event)

     @property
-    def parameters(self) ->
+    def parameters(self) -> dict[str, Any] | None:
         return self._parameters


 def init_experiment(
-    project_name:
+    project_name: str | None = None, experiment_name: str | None = None, set_current: bool = False, **kwargs: Any
 ) -> Experiment:
     ret = _init_experiment(project=project_name, experiment=experiment_name, set_current=set_current, **kwargs)
     summary = ret.summarize(summarize_scores=False)
@@ -1255,12 +1243,12 @@ def _scorer_name(scorer, scorer_idx):


 async def run_evaluator(
-    experiment:
+    experiment: Experiment | None,
     evaluator: Evaluator[Input, Output],
-    position:
-    filters:
-    stream:
-    state:
+    position: int | None,
+    filters: list[Filter],
+    stream: Callable[[SSEProgressEvent], None] | None = None,
+    state: BraintrustState | None = None,
 ) -> EvalResultWithSummary[Input, Output]:
     """Wrapper on _run_evaluator_internal that times out execution after evaluator.timeout."""
     results = await asyncio.wait_for(
@@ -1278,7 +1266,7 @@ async def run_evaluator(
 def default_error_score_handler(
     root_span: Span,
     data: EvalCase[Input, Output],
-    unhandled_scores:
+    unhandled_scores: list[str],
 ):
     scores = {s: 0 for s in unhandled_scores}
     root_span.log(scores=scores)
@@ -1288,10 +1276,10 @@ def default_error_score_handler(
 async def _run_evaluator_internal(
     experiment,
     evaluator: Evaluator,
-    position:
-    filters:
-    stream:
-    state:
+    position: int | None,
+    filters: list[Filter],
+    stream: Callable[[SSEProgressEvent], None] | None = None,
+    state: BraintrustState | None = None,
 ):
     event_loop = asyncio.get_event_loop()

@@ -1557,7 +1545,7 @@ async def _run_evaluator_internal(


 def build_local_summary(
-    evaluator: Evaluator[Input, Output], results:
+    evaluator: Evaluator[Input, Output], results: list[EvalResultWithSummary[Input, Output]]
 ) -> ExperimentSummary:
     scores_by_name = defaultdict(lambda: (0, 0))
     for result in results:
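
Most hunks in the framework.py diff above follow a single pattern: `Optional[X]` becomes `X | None`, the `List`/`Dict`/`Type` aliases give way to the builtin `list`/`dict`/`type` generics, and `Awaitable`, `Callable`, `Coroutine`, `Iterable`, `Iterator`, and `Sequence` are now imported from `collections.abc` rather than `typing`. The snippet below is not taken from the package; it is a minimal before/after sketch of that annotation style on a hypothetical dataclass, assuming Python 3.10+ (where the `X | None` union syntax is available).

```python
import dataclasses
from collections.abc import Callable  # previously imported from typing

# Hypothetical example -- not part of the braintrust package.
# 0.3.15-era spelling (for comparison):
#     from typing import Callable, Dict, List, Optional
#     name: Optional[str] = None
#     scores: Dict[str, Optional[float]] = dataclasses.field(default_factory=dict)
#     tags: Optional[List[str]] = None
#     on_start: Optional[Callable[[], None]] = None


@dataclasses.dataclass
class ExampleConfig:
    # 0.4.0-era spelling: PEP 604 unions and builtin generics.
    name: str | None = None
    scores: dict[str, float | None] = dataclasses.field(default_factory=dict)
    tags: list[str] | None = None
    on_start: Callable[[], None] | None = None


print(ExampleConfig(name="demo", tags=["smoke"]))
```

Both spellings describe the same types; the newer form simply uses the builtin generics and union syntax standardized by PEP 585 and PEP 604.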
|