pydantic-evals 0.3.7__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- pydantic_evals/dataset.py +4 -4
- pydantic_evals/reporting/__init__.py +30 -16
- {pydantic_evals-0.3.7.dist-info → pydantic_evals-0.4.0.dist-info}/METADATA +2 -2
- {pydantic_evals-0.3.7.dist-info → pydantic_evals-0.4.0.dist-info}/RECORD +6 -6
- {pydantic_evals-0.3.7.dist-info → pydantic_evals-0.4.0.dist-info}/WHEEL +0 -0
- {pydantic_evals-0.3.7.dist-info → pydantic_evals-0.4.0.dist-info}/licenses/LICENSE +0 -0
pydantic_evals/dataset.py
CHANGED
@@ -257,7 +257,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
-    ) -> EvaluationReport:
+    ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.
 
         This method runs the task on each case in the dataset, applies evaluators,
@@ -312,7 +312,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
-    ) -> EvaluationReport:
+    ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.
 
         This is a synchronous wrapper around [`evaluate`][pydantic_evals.Dataset.evaluate] provided for convenience.
@@ -877,7 +877,7 @@ async def _run_task_and_evaluators(
     case: Case[InputsT, OutputT, MetadataT],
    report_case_name: str,
    dataset_evaluators: list[Evaluator[InputsT, OutputT, MetadataT]],
-) -> ReportCase:
+) -> ReportCase[InputsT, OutputT, MetadataT]:
    """Run a task on a case and evaluate the results.
 
    Args:
@@ -927,7 +927,7 @@ async def _run_task_and_evaluators(
     span_id = f'{context.span_id:016x}'
     fallback_duration = time.time() - t0
 
-    return ReportCase(
+    return ReportCase[InputsT, OutputT, MetadataT](
         name=report_case_name,
         inputs=case.inputs,
         metadata=case.metadata,
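The dataset.py changes are purely type-level: `Dataset.evaluate`, `Dataset.evaluate_sync`, and the internal `_run_task_and_evaluators` helper now carry the dataset's `InputsT`, `OutputT`, and `MetadataT` parameters through to the returned report instead of erasing them to a bare `EvaluationReport`. A minimal sketch of what this buys a type-checked caller (the `greet` task and its types are invented for illustration; only `Case` and `Dataset` are real pydantic_evals names):

```python
from pydantic_evals import Case, Dataset

async def greet(name: str) -> str:  # hypothetical task
    return f'Hello, {name}!'

dataset = Dataset[str, str, dict](
    cases=[Case(name='example', inputs='World', expected_output='Hello, World!')],
)

# As of 0.4.0, evaluate_sync returns EvaluationReport[str, str, dict]
# rather than a bare EvaluationReport, so per-case fields are typed.
report = dataset.evaluate_sync(greet)
for case in report.cases:  # each case is a ReportCase[str, str, dict]
    print(case.name, case.inputs.upper())  # case.inputs checks as str, not Any
```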
pydantic_evals/reporting/__init__.py
CHANGED
@@ -2,14 +2,14 @@ from __future__ import annotations as _annotations
 
 from collections import defaultdict
 from collections.abc import Mapping
-from dataclasses import dataclass
+from dataclasses import dataclass
 from io import StringIO
-from typing import Any, Callable, Literal, Protocol
+from typing import Any, Callable, Generic, Literal, Protocol
 
-from pydantic import BaseModel
+from pydantic import BaseModel, TypeAdapter
 from rich.console import Console
 from rich.table import Table
-from typing_extensions import TypedDict
+from typing_extensions import TypedDict, TypeVar
 
 from pydantic_evals._utils import UNSET, Unset
 
@@ -24,7 +24,9 @@ from .render_numbers import (
 
 __all__ = (
     'EvaluationReport',
+    'EvaluationReportAdapter',
     'ReportCase',
+    'ReportCaseAdapter',
     'EvaluationRenderer',
     'RenderValueConfig',
     'RenderNumberConfig',
@@ -35,27 +37,32 @@ MISSING_VALUE_STR = '[i]<missing>[/i]'
 EMPTY_CELL_STR = '-'
 EMPTY_AGGREGATE_CELL_STR = ''
 
+InputsT = TypeVar('InputsT', default=Any)
+OutputT = TypeVar('OutputT', default=Any)
+MetadataT = TypeVar('MetadataT', default=Any)
 
-class ReportCase(BaseModel):
+
+@dataclass
+class ReportCase(Generic[InputsT, OutputT, MetadataT]):
     """A single case in an evaluation report."""
 
     name: str
     """The name of the [case][pydantic_evals.Case]."""
-    inputs:
+    inputs: InputsT
     """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
-    metadata:
+    metadata: MetadataT | None
     """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
-    expected_output:
+    expected_output: OutputT | None
     """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
-    output:
+    output: OutputT
     """The output of the task execution."""
 
     metrics: dict[str, float | int]
     attributes: dict[str, Any]
 
-    scores: dict[str, EvaluationResult[int | float]]
-    labels: dict[str, EvaluationResult[str]]
-    assertions: dict[str, EvaluationResult[bool]]
+    scores: dict[str, EvaluationResult[int | float]]
+    labels: dict[str, EvaluationResult[str]]
+    assertions: dict[str, EvaluationResult[bool]]
 
     task_duration: float
     total_duration: float  # includes evaluator execution time
@@ -65,6 +72,9 @@ class ReportCase(BaseModel):
     span_id: str
 
 
+ReportCaseAdapter = TypeAdapter(ReportCase[Any, Any, Any])
+
+
 class ReportCaseAggregate(BaseModel):
     """A synthetic case that summarizes a set of cases."""
 
@@ -142,12 +152,13 @@ class ReportCaseAggregate(BaseModel):
     )
 
 
-class EvaluationReport(BaseModel):
+@dataclass
+class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
     """A report of the results of evaluating a model on a set of cases."""
 
     name: str
     """The name of the report."""
-    cases: list[ReportCase]
+    cases: list[ReportCase[InputsT, OutputT, MetadataT]]
     """The cases in the report."""
 
     def averages(self) -> ReportCaseAggregate:
@@ -156,7 +167,7 @@ class EvaluationReport(BaseModel):
     def print(
         self,
         width: int | None = None,
-        baseline: EvaluationReport | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -199,7 +210,7 @@ class EvaluationReport(BaseModel):
 
     def console_table(
         self,
-        baseline: EvaluationReport | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -250,6 +261,9 @@ class EvaluationReport(BaseModel):
         return io_file.getvalue()
 
 
+EvaluationReportAdapter = TypeAdapter(EvaluationReport[Any, Any, Any])
+
+
 class RenderValueConfig(TypedDict, total=False):
     """A configuration for rendering a values in an Evaluation report."""
 
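In reporting/__init__.py, `ReportCase` and `EvaluationReport` change from pydantic `BaseModel` subclasses into generic dataclasses, parametrized by `InputsT`, `OutputT`, and `MetadataT` with `default=Any` (via `typing_extensions.TypeVar` defaults), so existing un-parametrized annotations keep their old meaning. Since plain dataclasses have no `.model_dump_json()`, the module now exports `ReportCaseAdapter` and `EvaluationReportAdapter` as `TypeAdapter` instances. A sketch of how they might be used, continuing the example above (whether evaluator results round-trip losslessly is not shown by this diff):

```python
from pydantic_evals.reporting import EvaluationReportAdapter, ReportCaseAdapter

# Serialize the whole report, or a single case, via the new adapters.
report_json = EvaluationReportAdapter.dump_json(report, indent=2)  # -> bytes
case_dict = ReportCaseAdapter.dump_python(report.cases[0], mode='json')

# Validate back into the Any-parametrized forms the adapters were built with,
# i.e. EvaluationReport[Any, Any, Any] / ReportCase[Any, Any, Any].
restored = EvaluationReportAdapter.validate_json(report_json)
```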
{pydantic_evals-0.3.7.dist-info → pydantic_evals-0.4.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.3.7
+Version: 0.4.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.3.7
+Requires-Dist: pydantic-ai-slim==0.4.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
{pydantic_evals-0.3.7.dist-info → pydantic_evals-0.4.0.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 pydantic_evals/__init__.py,sha256=OKRbfhdc8UZPzrPJMZUQwvzIxLhXmEZxz1ZuD921fy4,839
 pydantic_evals/_utils.py,sha256=PfhmPbdQp-q90s568LuG45zDDXxgO13BEz8MQJK8qw4,2922
-pydantic_evals/dataset.py,sha256=…
+pydantic_evals/dataset.py,sha256=-wLreOfr7fsr2NqPHeVbrHh_dIlyjjTrY_QK4eBZFnw,46126
 pydantic_evals/generation.py,sha256=-w-4-zpJuW8mLj5ed60LUYm--b-2G42p-UDuPhOQgRE,3492
 pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pydantic_evals/evaluators/__init__.py,sha256=uj110viFiDvqrIcuDcWexk_uBgJqhKMGPHT2YvDb7LA,624
@@ -15,9 +15,9 @@ pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=vIDF9-6lDuNKZuSM5
 pydantic_evals/otel/_context_subtree.py,sha256=Iazp4w3IIBMCrkqWL-hTG-2QG_-2X81p794WG9MAsGk,1175
 pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD4,268
 pydantic_evals/otel/span_tree.py,sha256=LV5Hsyo4riJzevHyBz8wxP82S-ry5zeKYi9bKWjGCS8,23057
-pydantic_evals/reporting/__init__.py,sha256=…
+pydantic_evals/reporting/__init__.py,sha256=k_3tteqXGh0yGvgpN68gB0CjG9wzrakzDTve2GHend4,42148
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-0.3.7.dist-info/METADATA,sha256=…
-pydantic_evals-0.3.7.dist-info/WHEEL,sha256=…
-pydantic_evals-0.3.7.dist-info/licenses/LICENSE,sha256=…
-pydantic_evals-0.3.7.dist-info/RECORD,,
+pydantic_evals-0.4.0.dist-info/METADATA,sha256=Fj6Jpt6VisJsz97AID-AEzcpfRWPVuaocmKfVTmyaHY,7785
+pydantic_evals-0.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-0.4.0.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-0.4.0.dist-info/RECORD,,
{pydantic_evals-0.3.7.dist-info → pydantic_evals-0.4.0.dist-info}/WHEEL
File without changes

{pydantic_evals-0.3.7.dist-info → pydantic_evals-0.4.0.dist-info}/licenses/LICENSE
File without changes