pydantic-evals 1.0.12__tar.gz → 1.0.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/PKG-INFO +2 -2
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/dataset.py +40 -11
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/.gitignore +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/LICENSE +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/README.md +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/__init__.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/_utils.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/evaluators/__init__.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/evaluators/common.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/evaluators/context.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/evaluators/evaluator.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/evaluators/llm_as_a_judge.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/evaluators/spec.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/generation.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/otel/__init__.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/otel/_context_subtree.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/otel/_errors.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/otel/span_tree.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/py.typed +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/reporting/__init__.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/reporting/render_numbers.py +0 -0
- {pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pyproject.toml +0 -0
{pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.0.12
+Version: 1.0.14
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.0.12
+Requires-Dist: pydantic-ai-slim==1.0.14
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
```
{pydantic_evals-1.0.12 → pydantic_evals-1.0.14}/pydantic_evals/dataset.py

```diff
@@ -98,6 +98,7 @@ class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forb
 
     # $schema is included to avoid validation fails from the `$schema` key, see `_add_json_schema` below for context
     json_schema_path: str | None = Field(default=None, alias='$schema')
+    name: str | None = None
     cases: list[_CaseModel[InputsT, OutputT, MetadataT]]
     evaluators: list[EvaluatorSpec] = Field(default_factory=list)
 
@@ -218,6 +219,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
     ```
     """
 
+    name: str | None = None
+    """Optional name of the dataset."""
     cases: list[Case[InputsT, OutputT, MetadataT]]
     """List of test cases in the dataset."""
     evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = []
@@ -226,12 +229,14 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
     def __init__(
         self,
         *,
+        name: str | None = None,
         cases: Sequence[Case[InputsT, OutputT, MetadataT]],
         evaluators: Sequence[Evaluator[InputsT, OutputT, MetadataT]] = (),
     ):
         """Initialize a new dataset with test cases and optional evaluators.
 
         Args:
+            name: Optional name for the dataset.
             cases: Sequence of test cases to include in the dataset.
             evaluators: Optional sequence of evaluators to apply to all cases in the dataset.
         """
@@ -244,10 +249,12 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             case_names.add(case.name)
 
         super().__init__(
+            name=name,
             cases=cases,
             evaluators=list(evaluators),
         )
 
+    # TODO in v2: Make everything not required keyword-only
     async def evaluate(
         self,
         task: Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT],
```
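Taken together, these hunks add an optional `name` to both the serialized `_DatasetModel` and the public `Dataset` class, wired through `__init__`. A minimal sketch of constructing a named dataset (the case contents below are invented for illustration; only the `name=` keyword is new in 1.0.14):

```python
from pydantic_evals import Case, Dataset

# Hypothetical example data; the new `name=` keyword labels the dataset itself.
dataset = Dataset(
    name='capital_questions',
    cases=[
        Case(
            name='capital_of_france',
            inputs='What is the capital of France?',
            expected_output='Paris',
        ),
    ],
)
```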
```diff
@@ -256,6 +263,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         progress: bool = True,
         retry_task: RetryConfig | None = None,
         retry_evaluators: RetryConfig | None = None,
+        *,
+        task_name: str | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.
 
@@ -265,28 +274,38 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         Args:
             task: The task to evaluate. This should be a callable that takes the inputs of the case
                 and returns the output.
-            name: The name of the
-                If omitted, the name of the task function
+            name: The name of the experiment being run, this is used to identify the experiment in the report.
+                If omitted, the task_name will be used; if that is not specified, the name of the task function is used.
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
             progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
             retry_task: Optional retry configuration for the task execution.
             retry_evaluators: Optional retry configuration for evaluator execution.
+            task_name: Optional override to the name of the task being executed, otherwise the name of the task
+                function will be used.
 
         Returns:
             A report containing the results of the evaluation.
         """
-        name = name or get_unwrapped_function_name(task)
+        task_name = task_name or get_unwrapped_function_name(task)
+        name = name or task_name
         total_cases = len(self.cases)
         progress_bar = Progress() if progress else None
 
         limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()
 
         with (
-            logfire_span(
+            logfire_span(
+                'evaluate {name}',
+                name=name,
+                task_name=task_name,
+                dataset_name=self.name,
+                n_cases=len(self.cases),
+                **{'gen_ai.operation.name': 'experiment'},  # pyright: ignore[reportArgumentType]
+            ) as eval_span,
             progress_bar or nullcontext(),
         ):
-            task_id = progress_bar.add_task(f'Evaluating {name}', total=total_cases) if progress_bar else None
+            task_id = progress_bar.add_task(f'Evaluating {task_name}', total=total_cases) if progress_bar else None
 
             async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name: str):
                 async with limiter:
```
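The `evaluate` changes separate the experiment label (`name`) from the new keyword-only task label (`task_name`), and both now flow into the `logfire_span` attributes. A sketch of how the call might look, assuming the public `Dataset.evaluate` API; the task function and string values are invented for illustration:

```python
import asyncio

from pydantic_evals import Case, Dataset


async def answer(question: str) -> str:
    # Stand-in task; in real use this would call an LLM agent or other stochastic code.
    return 'Paris' if 'France' in question else 'unknown'


dataset = Dataset(
    name='capital_questions',
    cases=[Case(name='capital_of_france', inputs='What is the capital of France?', expected_output='Paris')],
)

# `name` identifies the experiment in the report; `task_name` overrides the label
# that would otherwise be derived from the task function's name.
report = asyncio.run(dataset.evaluate(answer, name='baseline-run', task_name='answer'))
report.print()
```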
```diff
@@ -357,7 +376,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         return get_event_loop().run_until_complete(
             self.evaluate(
                 task,
-                name=name,
+                task_name=name,
                 max_concurrency=max_concurrency,
                 progress=progress,
                 retry_task=retry_task,
@@ -474,7 +493,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
 
         raw = Path(path).read_text()
         try:
-            return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types)
+            return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types, default_name=path.stem)
         except ValidationError as e:  # pragma: no cover
             raise ValueError(f'{path} contains data that does not match the schema for {cls.__name__}:\n{e}.') from e
 
@@ -484,6 +503,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         contents: str,
         fmt: Literal['yaml', 'json'] = 'yaml',
         custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
+        *,
+        default_name: str | None = None,
     ) -> Self:
         """Load a dataset from a string.
 
@@ -492,6 +513,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             fmt: Format of the content. Must be either 'yaml' or 'json'.
             custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
                 These are additional evaluators beyond the default ones.
+            default_name: Default name of the dataset, to be used if not specified in the serialized contents.
 
         Returns:
             A new Dataset instance parsed from the string.
```
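Per the updated signature and docstring, `default_name` is only a fallback: it applies when the serialized contents carry no `name` of their own. A small sketch, with the YAML contents invented for illustration:

```python
from pydantic_evals import Dataset

yaml_contents = """\
cases:
  - name: capital_of_france
    inputs: What is the capital of France?
    expected_output: Paris
"""

# The YAML has no top-level `name`, so the dataset is expected to take `default_name`.
dataset = Dataset.from_text(yaml_contents, fmt='yaml', default_name='capital_questions')
print(dataset.name)  # expected: capital_questions
```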
```diff
@@ -501,17 +523,19 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         """
         if fmt == 'yaml':
             loaded = yaml.safe_load(contents)
-            return cls.from_dict(loaded, custom_evaluator_types)
+            return cls.from_dict(loaded, custom_evaluator_types, default_name=default_name)
         else:
             dataset_model_type = cls._serialization_type()
             dataset_model = dataset_model_type.model_validate_json(contents)
-            return cls._from_dataset_model(dataset_model, custom_evaluator_types)
+            return cls._from_dataset_model(dataset_model, custom_evaluator_types, default_name)
 
     @classmethod
     def from_dict(
         cls,
         data: dict[str, Any],
         custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
+        *,
+        default_name: str | None = None,
     ) -> Self:
         """Load a dataset from a dictionary.
 
@@ -519,6 +543,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             data: Dictionary representation of the dataset.
             custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
                 These are additional evaluators beyond the default ones.
+            default_name: Default name of the dataset, to be used if not specified in the data.
 
         Returns:
             A new Dataset instance created from the dictionary.
@@ -528,19 +553,21 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         """
         dataset_model_type = cls._serialization_type()
         dataset_model = dataset_model_type.model_validate(data)
-        return cls._from_dataset_model(dataset_model, custom_evaluator_types)
+        return cls._from_dataset_model(dataset_model, custom_evaluator_types, default_name)
 
     @classmethod
     def _from_dataset_model(
         cls,
         dataset_model: _DatasetModel[InputsT, OutputT, MetadataT],
         custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
+        default_name: str | None = None,
     ) -> Self:
         """Create a Dataset from a _DatasetModel.
 
         Args:
             dataset_model: The _DatasetModel to convert.
             custom_evaluator_types: Custom evaluator classes to register for deserialization.
+            default_name: Default name of the dataset, to be used if the value is `None` in the provided model.
 
         Returns:
             A new Dataset instance created from the _DatasetModel.
@@ -577,7 +604,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             cases.append(row)
         if errors:
             raise ExceptionGroup(f'{len(errors)} error(s) loading evaluators from registry', errors[:3])
-        result = cls(cases=cases)
+        result = cls(name=dataset_model.name, cases=cases)
+        if result.name is None:
+            result.name = default_name
         result.evaluators = dataset_evaluators
         return result
 
```
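Since `from_file` now passes `default_name=path.stem` and `_from_dataset_model` applies it only when the loaded model's `name` is `None`, a dataset file without a `name` key should end up named after its file stem. A sketch, with the file name invented for illustration:

```python
from pathlib import Path

from pydantic_evals import Dataset

# Write a dataset file with no top-level `name` key (illustrative data).
Path('capital_questions.yaml').write_text(
    'cases:\n'
    '  - name: capital_of_france\n'
    '    inputs: What is the capital of France?\n'
    '    expected_output: Paris\n'
)

dataset = Dataset.from_file('capital_questions.yaml')
print(dataset.name)  # expected: capital_questions (taken from the file stem)
```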