pydantic-evals 0.3.7__tar.gz → 0.4.1__tar.gz
This diff reflects the publicly released contents of the two package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of pydantic-evals might be problematic.
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/PKG-INFO +2 -2
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/dataset.py +14 -9
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/reporting/__init__.py +30 -16
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/.gitignore +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/LICENSE +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/README.md +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/__init__.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/_utils.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/evaluators/__init__.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/evaluators/_spec.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/evaluators/common.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/evaluators/context.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/evaluators/evaluator.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/evaluators/llm_as_a_judge.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/generation.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/otel/__init__.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/otel/_context_subtree.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/otel/_errors.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/otel/span_tree.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/py.typed +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/reporting/render_numbers.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pyproject.toml +0 -0
```diff
--- pydantic_evals-0.3.7/PKG-INFO
+++ pydantic_evals-0.4.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.3.7
+Version: 0.4.1
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.3.7
+Requires-Dist: pydantic-ai-slim==0.4.1
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
```
```diff
--- pydantic_evals-0.3.7/pydantic_evals/dataset.py
+++ pydantic_evals-0.4.1/pydantic_evals/dataset.py
@@ -18,12 +18,14 @@ from collections.abc import Awaitable, Mapping, Sequence
 from contextlib import AsyncExitStack, nullcontext
 from contextvars import ContextVar
 from dataclasses import dataclass, field
+from inspect import iscoroutinefunction
 from pathlib import Path
 from typing import Any, Callable, Generic, Literal, Union, cast
 
 import anyio
 import logfire_api
 import yaml
+from anyio import to_thread
 from pydantic import BaseModel, ConfigDict, Field, TypeAdapter, ValidationError, model_serializer
 from pydantic._internal import _typing_extra
 from pydantic_core import to_json
@@ -253,11 +255,11 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
 
     async def evaluate(
         self,
-        task: Callable[[InputsT], Awaitable[OutputT]],
+        task: Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT],
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
-    ) -> EvaluationReport:
+    ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.
 
         This method runs the task on each case in the dataset, applies evaluators,
@@ -308,11 +310,11 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
 
     def evaluate_sync(
         self,
-        task: Callable[[InputsT], Awaitable[OutputT]],
+        task: Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT],
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
-    ) -> EvaluationReport:
+    ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.
 
         This is a synchronous wrapper around [`evaluate`][pydantic_evals.Dataset.evaluate] provided for convenience.
@@ -811,7 +813,7 @@ class _TaskRun:
 
 
 async def _run_task(
-    task: Callable[[InputsT], Awaitable[OutputT]], case: Case[InputsT, OutputT, MetadataT]
+    task: Callable[[InputsT], Awaitable[OutputT] | OutputT], case: Case[InputsT, OutputT, MetadataT]
 ) -> EvaluatorContext[InputsT, OutputT, MetadataT]:
     """Run a task on a case and return the context for evaluators.
 
@@ -836,7 +838,10 @@ async def _run_task(
         with _logfire.span('execute {task}', task=get_unwrapped_function_name(task)) as task_span:
             with context_subtree() as span_tree:
                 t0 = time.perf_counter()
-                task_output = await task(case.inputs)
+                if iscoroutinefunction(task):
+                    task_output = cast(OutputT, await task(case.inputs))
+                else:
+                    task_output = cast(OutputT, await to_thread.run_sync(task, case.inputs))
                 fallback_duration = time.perf_counter() - t0
     finally:
         _CURRENT_TASK_RUN.reset(token)
@@ -873,11 +878,11 @@ async def _run_task(
 
 
 async def _run_task_and_evaluators(
-    task: Callable[[InputsT], Awaitable[OutputT]],
+    task: Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT],
     case: Case[InputsT, OutputT, MetadataT],
     report_case_name: str,
     dataset_evaluators: list[Evaluator[InputsT, OutputT, MetadataT]],
-) -> ReportCase:
+) -> ReportCase[InputsT, OutputT, MetadataT]:
     """Run a task on a case and evaluate the results.
 
     Args:
@@ -927,7 +932,7 @@ async def _run_task_and_evaluators(
     span_id = f'{context.span_id:016x}'
     fallback_duration = time.time() - t0
 
-    return ReportCase(
+    return ReportCase[InputsT, OutputT, MetadataT](
         name=report_case_name,
         inputs=case.inputs,
         metadata=case.metadata,
```
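Taken together, the dataset.py hunks let `Dataset.evaluate` and `Dataset.evaluate_sync` accept either a coroutine function or a plain synchronous task: coroutine functions are awaited directly, while sync functions are dispatched to a worker thread via `anyio.to_thread.run_sync`. A minimal usage sketch (the dataset, case values, and task bodies below are invented for illustration; only the ability to pass a sync task comes from this diff):

```python
from pydantic_evals import Case, Dataset

# Hypothetical one-case dataset, purely for illustration.
dataset = Dataset(cases=[Case(name='double', inputs=2, expected_output=4)])


async def double_async(x: int) -> int:
    # Coroutine tasks are awaited directly, as in 0.3.x.
    return x * 2


def double_sync(x: int) -> int:
    # New in 0.4.x per this diff: plain functions are accepted and run
    # in a worker thread via anyio.to_thread.run_sync.
    return x * 2


report = dataset.evaluate_sync(double_async)  # still supported
report = dataset.evaluate_sync(double_sync)   # now also accepted
report.print(include_input=True)
```

Running sync tasks in a worker thread keeps the event loop responsive even when the task blocks, so `max_concurrency` scheduling and progress reporting continue to work as before.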
```diff
--- pydantic_evals-0.3.7/pydantic_evals/reporting/__init__.py
+++ pydantic_evals-0.4.1/pydantic_evals/reporting/__init__.py
@@ -2,14 +2,14 @@ from __future__ import annotations as _annotations
 
 from collections import defaultdict
 from collections.abc import Mapping
-from dataclasses import dataclass
+from dataclasses import dataclass
 from io import StringIO
-from typing import Any, Callable, Literal, Protocol
+from typing import Any, Callable, Generic, Literal, Protocol
 
-from pydantic import BaseModel
+from pydantic import BaseModel, TypeAdapter
 from rich.console import Console
 from rich.table import Table
-from typing_extensions import TypedDict
+from typing_extensions import TypedDict, TypeVar
 
 from pydantic_evals._utils import UNSET, Unset
 
@@ -24,7 +24,9 @@ from .render_numbers import (
 
 __all__ = (
     'EvaluationReport',
+    'EvaluationReportAdapter',
     'ReportCase',
+    'ReportCaseAdapter',
     'EvaluationRenderer',
     'RenderValueConfig',
     'RenderNumberConfig',
@@ -35,27 +37,32 @@ MISSING_VALUE_STR = '[i]<missing>[/i]'
 EMPTY_CELL_STR = '-'
 EMPTY_AGGREGATE_CELL_STR = ''
 
+InputsT = TypeVar('InputsT', default=Any)
+OutputT = TypeVar('OutputT', default=Any)
+MetadataT = TypeVar('MetadataT', default=Any)
 
-class ReportCase(BaseModel):
+
+@dataclass
+class ReportCase(Generic[InputsT, OutputT, MetadataT]):
     """A single case in an evaluation report."""
 
     name: str
     """The name of the [case][pydantic_evals.Case]."""
-    inputs: Any
+    inputs: InputsT
    """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
-    metadata: Any
+    metadata: MetadataT | None
    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
-    expected_output: Any
+    expected_output: OutputT | None
    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
-    output: Any
+    output: OutputT
    """The output of the task execution."""
 
     metrics: dict[str, float | int]
     attributes: dict[str, Any]
 
-    scores: dict[str, EvaluationResult[int | float]]
-    labels: dict[str, EvaluationResult[str]]
-    assertions: dict[str, EvaluationResult[bool]]
+    scores: dict[str, EvaluationResult[int | float]]
+    labels: dict[str, EvaluationResult[str]]
+    assertions: dict[str, EvaluationResult[bool]]
 
     task_duration: float
     total_duration: float  # includes evaluator execution time
@@ -65,6 +72,9 @@ class ReportCase(BaseModel):
     span_id: str
 
 
+ReportCaseAdapter = TypeAdapter(ReportCase[Any, Any, Any])
+
+
 class ReportCaseAggregate(BaseModel):
     """A synthetic case that summarizes a set of cases."""
 
@@ -142,12 +152,13 @@ class ReportCaseAggregate(BaseModel):
         )
 
 
-class EvaluationReport(BaseModel):
+@dataclass
+class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
     """A report of the results of evaluating a model on a set of cases."""
 
     name: str
     """The name of the report."""
-    cases: list[ReportCase]
+    cases: list[ReportCase[InputsT, OutputT, MetadataT]]
     """The cases in the report."""
 
     def averages(self) -> ReportCaseAggregate:
@@ -156,7 +167,7 @@ class EvaluationReport(BaseModel):
     def print(
         self,
         width: int | None = None,
-        baseline: EvaluationReport | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -199,7 +210,7 @@ class EvaluationReport(BaseModel):
 
     def console_table(
         self,
-        baseline: EvaluationReport | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -250,6 +261,9 @@ class EvaluationReport(BaseModel):
         return io_file.getvalue()
 
 
+EvaluationReportAdapter = TypeAdapter(EvaluationReport[Any, Any, Any])
+
+
 class RenderValueConfig(TypedDict, total=False):
     """A configuration for rendering a values in an Evaluation report."""
 
```
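The reporting hunks turn `ReportCase` and `EvaluationReport` into generic dataclasses parameterized by `InputsT`, `OutputT`, and `MetadataT`, and export module-level `TypeAdapter`s (`ReportCaseAdapter`, `EvaluationReportAdapter`) over their `[Any, Any, Any]` specializations. Since the report types are no longer `BaseModel` subclasses, these adapters presumably become the serialization entry point. A hedged sketch (the dataset and task are invented; the dump calls shown are ordinary pydantic `TypeAdapter` usage, not something this diff prescribes):

```python
from pydantic_evals import Case, Dataset
from pydantic_evals.reporting import EvaluationReportAdapter, ReportCaseAdapter

# Hypothetical dataset and task, for illustration only.
dataset = Dataset(cases=[Case(name='double', inputs=2, expected_output=4)])


def double(x: int) -> int:
    return x * 2


# evaluate_sync() now returns an EvaluationReport[InputsT, OutputT, MetadataT] dataclass.
report = dataset.evaluate_sync(double)

# The new module-level adapters can serialize the dataclass-based report
# and its individual cases, e.g. to JSON.
print(EvaluationReportAdapter.dump_json(report, indent=2).decode())
print(ReportCaseAdapter.dump_json(report.cases[0], indent=2).decode())
```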