pydantic-evals 1.0.11__py3-none-any.whl → 1.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pydantic-evals might be problematic.
- pydantic_evals/dataset.py +57 -15
- pydantic_evals/evaluators/llm_as_a_judge.py +1 -2
- pydantic_evals/generation.py +2 -1
- {pydantic_evals-1.0.11.dist-info → pydantic_evals-1.0.13.dist-info}/METADATA +2 -2
- {pydantic_evals-1.0.11.dist-info → pydantic_evals-1.0.13.dist-info}/RECORD +7 -7
- {pydantic_evals-1.0.11.dist-info → pydantic_evals-1.0.13.dist-info}/WHEEL +0 -0
- {pydantic_evals-1.0.11.dist-info → pydantic_evals-1.0.13.dist-info}/licenses/LICENSE +0 -0
pydantic_evals/dataset.py
CHANGED
@@ -98,6 +98,7 @@ class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forb

     # $schema is included to avoid validation fails from the `$schema` key, see `_add_json_schema` below for context
     json_schema_path: str | None = Field(default=None, alias='$schema')
+    name: str | None = None
     cases: list[_CaseModel[InputsT, OutputT, MetadataT]]
     evaluators: list[EvaluatorSpec] = Field(default_factory=list)

@@ -218,6 +219,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
     ```
     """

+    name: str | None = None
+    """Optional name of the dataset."""
     cases: list[Case[InputsT, OutputT, MetadataT]]
     """List of test cases in the dataset."""
     evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = []
@@ -226,12 +229,14 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
     def __init__(
         self,
         *,
+        name: str | None = None,
         cases: Sequence[Case[InputsT, OutputT, MetadataT]],
         evaluators: Sequence[Evaluator[InputsT, OutputT, MetadataT]] = (),
     ):
         """Initialize a new dataset with test cases and optional evaluators.

         Args:
+            name: Optional name for the dataset.
             cases: Sequence of test cases to include in the dataset.
             evaluators: Optional sequence of evaluators to apply to all cases in the dataset.
         """
@@ -244,10 +249,12 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             case_names.add(case.name)

         super().__init__(
+            name=name,
             cases=cases,
             evaluators=list(evaluators),
         )

+    # TODO in v2: Make everything not required keyword-only
     async def evaluate(
         self,
         task: Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT],
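The hunks above thread the new optional `name` through `_DatasetModel`, the `Dataset` fields, and `Dataset.__init__`. A minimal usage sketch, assuming the public `Case`/`Dataset` API exported by `pydantic_evals` (the example values are illustrative, not taken from the diff):

from pydantic_evals import Case, Dataset

dataset = Dataset(
    name='capital-cities',  # new optional keyword in 1.0.13; defaults to None
    cases=[
        Case(name='paris', inputs='What is the capital of France?', expected_output='Paris'),
    ],
)
print(dataset.name)  # 'capital-cities'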
@@ -256,6 +263,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         progress: bool = True,
         retry_task: RetryConfig | None = None,
         retry_evaluators: RetryConfig | None = None,
+        *,
+        task_name: str | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -265,28 +274,38 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         Args:
             task: The task to evaluate. This should be a callable that takes the inputs of the case
                 and returns the output.
-            name: The name of the
-                If omitted, the name of the task function
+            name: The name of the experiment being run, this is used to identify the experiment in the report.
+                If omitted, the task_name will be used; if that is not specified, the name of the task function is used.
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
             progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
             retry_task: Optional retry configuration for the task execution.
             retry_evaluators: Optional retry configuration for evaluator execution.
+            task_name: Optional override to the name of the task being executed, otherwise the name of the task
+                function will be used.

         Returns:
             A report containing the results of the evaluation.
         """
-
+        task_name = task_name or get_unwrapped_function_name(task)
+        name = name or task_name
         total_cases = len(self.cases)
         progress_bar = Progress() if progress else None

         limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()

         with (
-            logfire_span(
+            logfire_span(
+                'evaluate {name}',
+                name=name,
+                task_name=task_name,
+                dataset_name=self.name,
+                n_cases=len(self.cases),
+                **{'gen_ai.operation.name': 'experiment'},  # pyright: ignore[reportArgumentType]
+            ) as eval_span,
             progress_bar or nullcontext(),
         ):
-            task_id = progress_bar.add_task(f'Evaluating {
+            task_id = progress_bar.add_task(f'Evaluating {task_name}', total=total_cases) if progress_bar else None

             async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name: str):
                 async with limiter:
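A hedged sketch of the updated `evaluate` call: `name` now labels the experiment (and the `evaluate {name}` span), while the new keyword-only `task_name` overrides the task function's name. `answer_question` is a stand-in task for illustration, not part of the diff:

import asyncio

from pydantic_evals import Case, Dataset

async def answer_question(question: str) -> str:  # hypothetical task, not from the diff
    return 'Paris'

async def main() -> None:
    dataset = Dataset(
        name='capital-cities',
        cases=[Case(name='paris', inputs='What is the capital of France?', expected_output='Paris')],
    )
    report = await dataset.evaluate(
        answer_question,
        name='baseline-experiment',  # experiment name; falls back to task_name if omitted
        task_name='qa-task',         # new keyword-only override in 1.0.13
        progress=False,
    )
    report.print()

asyncio.run(main())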
@@ -333,6 +352,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
+        retry_task: RetryConfig | None = None,
+        retry_evaluators: RetryConfig | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -346,12 +367,21 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
             progress: Whether to show a progress bar for the evaluation. Defaults to True.
+            retry_task: Optional retry configuration for the task execution.
+            retry_evaluators: Optional retry configuration for evaluator execution.

         Returns:
             A report containing the results of the evaluation.
         """
         return get_event_loop().run_until_complete(
-            self.evaluate(
+            self.evaluate(
+                task,
+                task_name=name,
+                max_concurrency=max_concurrency,
+                progress=progress,
+                retry_task=retry_task,
+                retry_evaluators=retry_evaluators,
+            )
         )

     def add_case(
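For the synchronous wrapper, the hunks above add the retry parameters and forward `name` as `task_name` to `evaluate`. A small sketch under those assumptions (`lookup_capital` is hypothetical, and the retry arguments are left as `None` here because the `RetryConfig` shape is not shown in this diff):

from pydantic_evals import Case, Dataset

def lookup_capital(country: str) -> str:  # hypothetical synchronous task
    return {'France': 'Paris'}.get(country, 'unknown')

dataset = Dataset(cases=[Case(name='france', inputs='France', expected_output='Paris')])
report = dataset.evaluate_sync(
    lookup_capital,
    name='lookup-capital',   # forwarded as task_name per the change above
    progress=False,
    retry_task=None,         # new pass-through parameter in 1.0.13
    retry_evaluators=None,   # new pass-through parameter in 1.0.13
)
report.print()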
@@ -463,7 +493,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a

         raw = Path(path).read_text()
         try:
-            return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types)
+            return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types, default_name=path.stem)
         except ValidationError as e: # pragma: no cover
             raise ValueError(f'{path} contains data that does not match the schema for {cls.__name__}:\n{e}.') from e

@@ -473,6 +503,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         contents: str,
         fmt: Literal['yaml', 'json'] = 'yaml',
         custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
+        *,
+        default_name: str | None = None,
     ) -> Self:
         """Load a dataset from a string.

@@ -481,6 +513,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             fmt: Format of the content. Must be either 'yaml' or 'json'.
             custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
                 These are additional evaluators beyond the default ones.
+            default_name: Default name of the dataset, to be used if not specified in the serialized contents.

         Returns:
             A new Dataset instance parsed from the string.
@@ -490,17 +523,19 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         """
         if fmt == 'yaml':
             loaded = yaml.safe_load(contents)
-            return cls.from_dict(loaded, custom_evaluator_types)
+            return cls.from_dict(loaded, custom_evaluator_types, default_name=default_name)
         else:
             dataset_model_type = cls._serialization_type()
             dataset_model = dataset_model_type.model_validate_json(contents)
-            return cls._from_dataset_model(dataset_model, custom_evaluator_types)
+            return cls._from_dataset_model(dataset_model, custom_evaluator_types, default_name)

     @classmethod
     def from_dict(
         cls,
         data: dict[str, Any],
         custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
+        *,
+        default_name: str | None = None,
     ) -> Self:
         """Load a dataset from a dictionary.

@@ -508,6 +543,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             data: Dictionary representation of the dataset.
             custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
                 These are additional evaluators beyond the default ones.
+            default_name: Default name of the dataset, to be used if not specified in the data.

         Returns:
             A new Dataset instance created from the dictionary.
@@ -517,19 +553,21 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         """
         dataset_model_type = cls._serialization_type()
         dataset_model = dataset_model_type.model_validate(data)
-        return cls._from_dataset_model(dataset_model, custom_evaluator_types)
+        return cls._from_dataset_model(dataset_model, custom_evaluator_types, default_name)

     @classmethod
     def _from_dataset_model(
         cls,
         dataset_model: _DatasetModel[InputsT, OutputT, MetadataT],
         custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
+        default_name: str | None = None,
     ) -> Self:
         """Create a Dataset from a _DatasetModel.

         Args:
             dataset_model: The _DatasetModel to convert.
             custom_evaluator_types: Custom evaluator classes to register for deserialization.
+            default_name: Default name of the dataset, to be used if the value is `None` in the provided model.

         Returns:
             A new Dataset instance created from the _DatasetModel.
@@ -566,7 +604,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             cases.append(row)
         if errors:
             raise ExceptionGroup(f'{len(errors)} error(s) loading evaluators from registry', errors[:3])
-        result = cls(cases=cases)
+        result = cls(name=dataset_model.name, cases=cases)
+        if result.name is None:
+            result.name = default_name
         result.evaluators = dataset_evaluators
         return result

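The loader hunks above plumb a `default_name` through `from_file` → `from_text` → `from_dict` → `_from_dataset_model`, with `from_file` defaulting it to the file stem. A sketch of the resulting behaviour, assuming the documented YAML case format (the keys and file name below are illustrative):

from typing import Any

from pydantic_evals import Dataset

yaml_contents = """\
cases:
  - name: paris
    inputs: What is the capital of France?
    expected_output: Paris
"""

dataset = Dataset[str, str, Any].from_text(yaml_contents, fmt='yaml', default_name='capitals')
print(dataset.name)  # 'capitals', used because the YAML has no top-level `name` key

# Dataset[str, str, Any].from_file('capitals.yaml') would now pass
# default_name=path.stem ('capitals') automatically, per the change above.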
@@ -886,12 +926,14 @@ async def _run_task(
     # That way users can customize this logic. We'd default to a function that does the current thing but also
     # allow `None` to disable it entirely.
     for node in span_tree:
-        if node.attributes.get('gen_ai.operation.name') == 'chat':
-            task_run.increment_metric('requests', 1)
         for k, v in node.attributes.items():
-            if
+            if k == 'gen_ai.operation.name' and v == 'chat':
+                task_run.increment_metric('requests', 1)
+            elif not isinstance(v, int | float):
                 continue
-
+            elif k == 'operation.cost':
+                task_run.increment_metric('cost', v)
+            elif k.startswith('gen_ai.usage.details.'):
                 task_run.increment_metric(k.removeprefix('gen_ai.usage.details.'), v)
             elif k.startswith('gen_ai.usage.'):
                 task_run.increment_metric(k.removeprefix('gen_ai.usage.'), v)
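The hunk above reworks how span attributes are folded into task-run metrics: the chat-request count moves inside the attribute loop, non-numeric values are skipped, and the new `operation.cost` attribute is aggregated as a `cost` metric alongside the existing `gen_ai.usage.*` counters. A standalone paraphrase of that mapping, where `increment` is a stand-in for `task_run.increment_metric`:

from collections.abc import Callable

def apply_span_attributes(attributes: dict[str, object], increment: Callable[[str, int | float], None]) -> None:
    """Mirror of the metric mapping in the diff above, over a plain attribute dict."""
    for k, v in attributes.items():
        if k == 'gen_ai.operation.name' and v == 'chat':
            increment('requests', 1)  # each chat span counts as one request
        elif not isinstance(v, int | float):
            continue  # only numeric attribute values become metrics
        elif k == 'operation.cost':
            increment('cost', v)  # new in 1.0.13: per-span cost is aggregated too
        elif k.startswith('gen_ai.usage.details.'):
            increment(k.removeprefix('gen_ai.usage.details.'), v)
        elif k.startswith('gen_ai.usage.'):
            increment(k.removeprefix('gen_ai.usage.'), v)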
pydantic_evals/evaluators/llm_as_a_judge.py
CHANGED
@@ -7,8 +7,7 @@ from typing import Any
 from pydantic import BaseModel, Field
 from pydantic_core import to_json

-from pydantic_ai import Agent, models
-from pydantic_ai.messages import MultiModalContent, UserContent
+from pydantic_ai import Agent, MultiModalContent, UserContent, models
 from pydantic_ai.settings import ModelSettings

 __all__ = (
pydantic_evals/generation.py
CHANGED
@@ -59,7 +59,8 @@ async def generate_dataset(
     """
     output_schema = dataset_type.model_json_schema_with_evaluators(custom_evaluator_types)

-    # TODO
+    # TODO: Use `output_type=StructuredDict(output_schema)` (and `from_dict` below) once https://github.com/pydantic/pydantic/issues/12145
+    # is fixed and `StructuredDict` no longer needs to use `InlineDefsJsonSchemaTransformer`.
     agent = Agent(
         model,
         system_prompt=(
{pydantic_evals-1.0.11.dist-info → pydantic_evals-1.0.13.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.0.11
+Version: 1.0.13
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.0.11
+Requires-Dist: pydantic-ai-slim==1.0.13
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
{pydantic_evals-1.0.11.dist-info → pydantic_evals-1.0.13.dist-info}/RECORD
CHANGED
@@ -1,14 +1,14 @@
 pydantic_evals/__init__.py,sha256=OKRbfhdc8UZPzrPJMZUQwvzIxLhXmEZxz1ZuD921fy4,839
 pydantic_evals/_utils.py,sha256=1muGTc2zqjwxqngz6quRSLoZM88onjp0Xgt-a9n2aPQ,4111
-pydantic_evals/dataset.py,sha256=
-pydantic_evals/generation.py,sha256=
+pydantic_evals/dataset.py,sha256=IfaS65LqHW0654iAxc7bxA7mETo1qUqhFwITS_wFZ5s,50447
+pydantic_evals/generation.py,sha256=ROB8bZ6XKFquWNjWTd3lsXXwsx8-VgSCu_okbovNw9s,3619
 pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pydantic_evals/evaluators/__init__.py,sha256=E_JT6o96Ef-oS_IZ1Hyy95NRLwz7EOHewp-o13IdXEM,1032
 pydantic_evals/evaluators/_run_evaluator.py,sha256=uGmH67gCTeF9BSprCiBC4DtKEpKLrKYaXgsAQiCbCLY,3630
 pydantic_evals/evaluators/common.py,sha256=Cc9RMsSf5P2gcq3IDwmZxgfo1xnu7HEehiAS2Hgibz4,11609
 pydantic_evals/evaluators/context.py,sha256=mTxcm0Hvkev9htpqwoJMCJIqEYBtY5g86SXcjoqQxHY,3884
 pydantic_evals/evaluators/evaluator.py,sha256=ylfKRytoM9KzbZkSsFkEEnsg4XhK4usuyy1Rb1emoPo,11474
-pydantic_evals/evaluators/llm_as_a_judge.py,sha256=
+pydantic_evals/evaluators/llm_as_a_judge.py,sha256=BPdUfEsLPSxN2kJPt3dtJBRCBP46ctRoW_n24WubaB0,9567
 pydantic_evals/evaluators/spec.py,sha256=szAUsY4gb8KK_l1R81HYrByh4Rawrjav7w9835FZg1w,6690
 pydantic_evals/otel/__init__.py,sha256=i2p3vDrOW039N4XM-UkozDhCm0ZmE6ZSs1yV5t03vd0,117
 pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=FrG0pXKjuvTp3bXNdrUyzdPkqm0DQWe4ehkiHaxSvz4,6742
@@ -17,7 +17,7 @@ pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD
 pydantic_evals/otel/span_tree.py,sha256=RzX4VGpEqc2QUhkyxMTXtBRo5yHHO1c0hI7QJJuiXPU,23043
 pydantic_evals/reporting/__init__.py,sha256=4S8q_KfOflQlJYTISWM1Vp6_wPDHOMjbh9mSc3dU4-8,51562
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-1.0.
-pydantic_evals-1.0.
-pydantic_evals-1.0.
-pydantic_evals-1.0.
+pydantic_evals-1.0.13.dist-info/METADATA,sha256=D6mudJtjmS-SJhv8xawBYvm3Sd8bRzjL-AKaK9cc4vA,7846
+pydantic_evals-1.0.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-1.0.13.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-1.0.13.dist-info/RECORD,,
{pydantic_evals-1.0.11.dist-info → pydantic_evals-1.0.13.dist-info}/WHEEL
File without changes
{pydantic_evals-1.0.11.dist-info → pydantic_evals-1.0.13.dist-info}/licenses/LICENSE
File without changes