pydantic-evals 0.3.7__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pydantic-evals might be problematic.

pydantic_evals/dataset.py CHANGED
@@ -18,12 +18,14 @@ from collections.abc import Awaitable, Mapping, Sequence
 from contextlib import AsyncExitStack, nullcontext
 from contextvars import ContextVar
 from dataclasses import dataclass, field
+from inspect import iscoroutinefunction
 from pathlib import Path
 from typing import Any, Callable, Generic, Literal, Union, cast
 
 import anyio
 import logfire_api
 import yaml
+from anyio import to_thread
 from pydantic import BaseModel, ConfigDict, Field, TypeAdapter, ValidationError, model_serializer
 from pydantic._internal import _typing_extra
 from pydantic_core import to_json
@@ -253,11 +255,11 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
 
     async def evaluate(
         self,
-        task: Callable[[InputsT], Awaitable[OutputT]],
+        task: Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT],
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
-    ) -> EvaluationReport:
+    ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.
 
         This method runs the task on each case in the dataset, applies evaluators,
@@ -308,11 +310,11 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
 
     def evaluate_sync(
         self,
-        task: Callable[[InputsT], Awaitable[OutputT]],
+        task: Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT],
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
-    ) -> EvaluationReport:
+    ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.
 
         This is a synchronous wrapper around [`evaluate`][pydantic_evals.Dataset.evaluate] provided for convenience.
@@ -811,7 +813,7 @@ class _TaskRun:
 
 
 async def _run_task(
-    task: Callable[[InputsT], Awaitable[OutputT]], case: Case[InputsT, OutputT, MetadataT]
+    task: Callable[[InputsT], Awaitable[OutputT] | OutputT], case: Case[InputsT, OutputT, MetadataT]
 ) -> EvaluatorContext[InputsT, OutputT, MetadataT]:
     """Run a task on a case and return the context for evaluators.
 
@@ -836,7 +838,10 @@ async def _run_task(
         with _logfire.span('execute {task}', task=get_unwrapped_function_name(task)) as task_span:
             with context_subtree() as span_tree:
                 t0 = time.perf_counter()
-                task_output = await task(case.inputs)
+                if iscoroutinefunction(task):
+                    task_output = cast(OutputT, await task(case.inputs))
+                else:
+                    task_output = cast(OutputT, await to_thread.run_sync(task, case.inputs))
                 fallback_duration = time.perf_counter() - t0
     finally:
         _CURRENT_TASK_RUN.reset(token)
@@ -873,11 +878,11 @@ async def _run_task(
 
 
 async def _run_task_and_evaluators(
-    task: Callable[[InputsT], Awaitable[OutputT]],
+    task: Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT],
     case: Case[InputsT, OutputT, MetadataT],
     report_case_name: str,
     dataset_evaluators: list[Evaluator[InputsT, OutputT, MetadataT]],
-) -> ReportCase:
+) -> ReportCase[InputsT, OutputT, MetadataT]:
     """Run a task on a case and evaluate the results.
 
     Args:
@@ -927,7 +932,7 @@ async def _run_task_and_evaluators(
             span_id = f'{context.span_id:016x}'
         fallback_duration = time.time() - t0
 
-    return ReportCase(
+    return ReportCase[InputsT, OutputT, MetadataT](
         name=report_case_name,
         inputs=case.inputs,
         metadata=case.metadata,
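
The dataset.py changes above widen `task` so that a plain synchronous function is accepted alongside a coroutine function, with sync tasks dispatched to a worker thread via anyio's to_thread.run_sync. A minimal sketch of what this enables, using the public Case/Dataset API; the case data and task bodies below are illustrative, not taken from the package:

from pydantic_evals import Case, Dataset

# One trivial case: the task is expected to double its input.
dataset = Dataset(cases=[Case(name='double', inputs=2, expected_output=4)])

async def double_async(x: int) -> int:  # previously the only supported task shape
    return x * 2

def double_sync(x: int) -> int:  # now also accepted; run via anyio.to_thread.run_sync
    return x * 2

report = dataset.evaluate_sync(double_async)
report = dataset.evaluate_sync(double_sync)  # dispatched based on iscoroutinefunction()
report.print()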
pydantic_evals/reporting/__init__.py CHANGED
@@ -2,14 +2,14 @@ from __future__ import annotations as _annotations
 
 from collections import defaultdict
 from collections.abc import Mapping
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from io import StringIO
-from typing import Any, Callable, Literal, Protocol, TypeVar
+from typing import Any, Callable, Generic, Literal, Protocol
 
-from pydantic import BaseModel
+from pydantic import BaseModel, TypeAdapter
 from rich.console import Console
 from rich.table import Table
-from typing_extensions import TypedDict
+from typing_extensions import TypedDict, TypeVar
 
 from pydantic_evals._utils import UNSET, Unset
 
@@ -24,7 +24,9 @@ from .render_numbers import (
 
 __all__ = (
     'EvaluationReport',
+    'EvaluationReportAdapter',
     'ReportCase',
+    'ReportCaseAdapter',
     'EvaluationRenderer',
     'RenderValueConfig',
     'RenderNumberConfig',
@@ -35,27 +37,32 @@ MISSING_VALUE_STR = '[i]<missing>[/i]'
 EMPTY_CELL_STR = '-'
 EMPTY_AGGREGATE_CELL_STR = ''
 
+InputsT = TypeVar('InputsT', default=Any)
+OutputT = TypeVar('OutputT', default=Any)
+MetadataT = TypeVar('MetadataT', default=Any)
 
-class ReportCase(BaseModel):
+
+@dataclass
+class ReportCase(Generic[InputsT, OutputT, MetadataT]):
     """A single case in an evaluation report."""
 
     name: str
     """The name of the [case][pydantic_evals.Case]."""
-    inputs: Any
+    inputs: InputsT
     """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
-    metadata: Any
+    metadata: MetadataT | None
     """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
-    expected_output: Any
+    expected_output: OutputT | None
     """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
-    output: Any
+    output: OutputT
     """The output of the task execution."""
 
     metrics: dict[str, float | int]
     attributes: dict[str, Any]
 
-    scores: dict[str, EvaluationResult[int | float]] = field(init=False)
-    labels: dict[str, EvaluationResult[str]] = field(init=False)
-    assertions: dict[str, EvaluationResult[bool]] = field(init=False)
+    scores: dict[str, EvaluationResult[int | float]]
+    labels: dict[str, EvaluationResult[str]]
+    assertions: dict[str, EvaluationResult[bool]]
 
     task_duration: float
     total_duration: float  # includes evaluator execution time
@@ -65,6 +72,9 @@ class ReportCase(BaseModel):
     span_id: str
 
 
+ReportCaseAdapter = TypeAdapter(ReportCase[Any, Any, Any])
+
+
 class ReportCaseAggregate(BaseModel):
     """A synthetic case that summarizes a set of cases."""
 
@@ -142,12 +152,13 @@ class ReportCaseAggregate(BaseModel):
         )
 
 
-class EvaluationReport(BaseModel):
+@dataclass
+class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
     """A report of the results of evaluating a model on a set of cases."""
 
     name: str
     """The name of the report."""
-    cases: list[ReportCase]
+    cases: list[ReportCase[InputsT, OutputT, MetadataT]]
     """The cases in the report."""
 
     def averages(self) -> ReportCaseAggregate:
@@ -156,7 +167,7 @@ class EvaluationReport(BaseModel):
     def print(
         self,
         width: int | None = None,
-        baseline: EvaluationReport | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -199,7 +210,7 @@ class EvaluationReport(BaseModel):
 
     def console_table(
         self,
-        baseline: EvaluationReport | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -250,6 +261,9 @@ class EvaluationReport(BaseModel):
         return io_file.getvalue()
 
 
+EvaluationReportAdapter = TypeAdapter(EvaluationReport[Any, Any, Any])
+
+
 class RenderValueConfig(TypedDict, total=False):
     """A configuration for rendering a values in an Evaluation report."""
 
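
In reporting/__init__.py, ReportCase and EvaluationReport become generic dataclasses (parametrized by InputsT, OutputT, MetadataT with Any defaults) instead of BaseModel subclasses, and module-level pydantic TypeAdapters are exported for serializing them. A minimal sketch of how the new adapters might be used; the helper names are hypothetical, and the report passed in is assumed to come from Dataset.evaluate / evaluate_sync:

from typing import Any

from pydantic_evals.reporting import EvaluationReport, EvaluationReportAdapter, ReportCaseAdapter

def report_to_json(report: EvaluationReport[Any, Any, Any]) -> bytes:
    # Dump the whole report through the new module-level TypeAdapter.
    return EvaluationReportAdapter.dump_json(report, indent=2)

def first_case_to_dict(report: EvaluationReport[Any, Any, Any]) -> dict[str, Any]:
    # Dump a single case to JSON-compatible Python data.
    return ReportCaseAdapter.dump_python(report.cases[0], mode='json')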
pydantic_evals-{0.3.7 → 0.4.1}.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.3.7
+Version: 0.4.1
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.3.7
+Requires-Dist: pydantic-ai-slim==0.4.1
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
pydantic_evals-{0.3.7 → 0.4.1}.dist-info/RECORD RENAMED
@@ -1,6 +1,6 @@
 pydantic_evals/__init__.py,sha256=OKRbfhdc8UZPzrPJMZUQwvzIxLhXmEZxz1ZuD921fy4,839
 pydantic_evals/_utils.py,sha256=PfhmPbdQp-q90s568LuG45zDDXxgO13BEz8MQJK8qw4,2922
-pydantic_evals/dataset.py,sha256=UPyl8Jey18LlcvXQKZ4et5F3AFZ_ar100KREEO5Zfd0,46010
+pydantic_evals/dataset.py,sha256=SY0k2htYG0d0KRRem3pnQdN7rPztJ_TCFnCb0zkXbCk,46477
 pydantic_evals/generation.py,sha256=-w-4-zpJuW8mLj5ed60LUYm--b-2G42p-UDuPhOQgRE,3492
 pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pydantic_evals/evaluators/__init__.py,sha256=uj110viFiDvqrIcuDcWexk_uBgJqhKMGPHT2YvDb7LA,624
@@ -15,9 +15,9 @@ pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=vIDF9-6lDuNKZuSM5
 pydantic_evals/otel/_context_subtree.py,sha256=Iazp4w3IIBMCrkqWL-hTG-2QG_-2X81p794WG9MAsGk,1175
 pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD4,268
 pydantic_evals/otel/span_tree.py,sha256=LV5Hsyo4riJzevHyBz8wxP82S-ry5zeKYi9bKWjGCS8,23057
-pydantic_evals/reporting/__init__.py,sha256=tknRGM2fm8EUENxbq4K5duHZ_DgNzrVWhpGHFkoQ9zo,41677
+pydantic_evals/reporting/__init__.py,sha256=k_3tteqXGh0yGvgpN68gB0CjG9wzrakzDTve2GHend4,42148
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-0.3.7.dist-info/METADATA,sha256=fAByT-yJm5MsLv76cvJVSx8kHMj9XjBU7VudwlporzU,7785
-pydantic_evals-0.3.7.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-pydantic_evals-0.3.7.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
-pydantic_evals-0.3.7.dist-info/RECORD,,
+pydantic_evals-0.4.1.dist-info/METADATA,sha256=IXq49FDCjJBQQ_mMPuZyljAXQaDvN-OZk21js1DlN9Q,7785
+pydantic_evals-0.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-0.4.1.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-0.4.1.dist-info/RECORD,,