pydantic-evals 0.3.7__tar.gz → 0.4.1__tar.gz
This diff reflects the publicly released contents of the two package versions as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of pydantic-evals might be problematic.
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/PKG-INFO +2 -2
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/dataset.py +14 -9
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/reporting/__init__.py +30 -16
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/.gitignore +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/LICENSE +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/README.md +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/__init__.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/_utils.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/evaluators/__init__.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/evaluators/_spec.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/evaluators/common.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/evaluators/context.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/evaluators/evaluator.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/evaluators/llm_as_a_judge.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/generation.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/otel/__init__.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/otel/_context_subtree.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/otel/_errors.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/otel/span_tree.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/py.typed +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pydantic_evals/reporting/render_numbers.py +0 -0
- {pydantic_evals-0.3.7 → pydantic_evals-0.4.1}/pyproject.toml +0 -0
```diff
--- pydantic_evals-0.3.7/PKG-INFO
+++ pydantic_evals-0.4.1/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.3.7
+Version: 0.4.1
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.3.7
+Requires-Dist: pydantic-ai-slim==0.4.1
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
```
```diff
--- pydantic_evals-0.3.7/pydantic_evals/dataset.py
+++ pydantic_evals-0.4.1/pydantic_evals/dataset.py
@@ -18,12 +18,14 @@ from collections.abc import Awaitable, Mapping, Sequence
 from contextlib import AsyncExitStack, nullcontext
 from contextvars import ContextVar
 from dataclasses import dataclass, field
+from inspect import iscoroutinefunction
 from pathlib import Path
 from typing import Any, Callable, Generic, Literal, Union, cast
 
 import anyio
 import logfire_api
 import yaml
+from anyio import to_thread
 from pydantic import BaseModel, ConfigDict, Field, TypeAdapter, ValidationError, model_serializer
 from pydantic._internal import _typing_extra
 from pydantic_core import to_json
@@ -253,11 +255,11 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
 
     async def evaluate(
         self,
-        task: Callable[[InputsT], Awaitable[OutputT]],
+        task: Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT],
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
-    ) -> EvaluationReport:
+    ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.
 
         This method runs the task on each case in the dataset, applies evaluators,
@@ -308,11 +310,11 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
 
     def evaluate_sync(
         self,
-        task: Callable[[InputsT], Awaitable[OutputT]],
+        task: Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT],
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
-    ) -> EvaluationReport:
+    ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.
 
         This is a synchronous wrapper around [`evaluate`][pydantic_evals.Dataset.evaluate] provided for convenience.
@@ -811,7 +813,7 @@ class _TaskRun:
 
 
 async def _run_task(
-    task: Callable[[InputsT], Awaitable[OutputT]], case: Case[InputsT, OutputT, MetadataT]
+    task: Callable[[InputsT], Awaitable[OutputT] | OutputT], case: Case[InputsT, OutputT, MetadataT]
 ) -> EvaluatorContext[InputsT, OutputT, MetadataT]:
     """Run a task on a case and return the context for evaluators.
 
@@ -836,7 +838,10 @@ async def _run_task(
         with _logfire.span('execute {task}', task=get_unwrapped_function_name(task)) as task_span:
             with context_subtree() as span_tree:
                 t0 = time.perf_counter()
-                task_output = await task(case.inputs)
+                if iscoroutinefunction(task):
+                    task_output = cast(OutputT, await task(case.inputs))
+                else:
+                    task_output = cast(OutputT, await to_thread.run_sync(task, case.inputs))
                 fallback_duration = time.perf_counter() - t0
     finally:
         _CURRENT_TASK_RUN.reset(token)
@@ -873,11 +878,11 @@ async def _run_task(
 
 
 async def _run_task_and_evaluators(
-    task: Callable[[InputsT], Awaitable[OutputT]],
+    task: Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT],
     case: Case[InputsT, OutputT, MetadataT],
     report_case_name: str,
     dataset_evaluators: list[Evaluator[InputsT, OutputT, MetadataT]],
-) -> ReportCase:
+) -> ReportCase[InputsT, OutputT, MetadataT]:
     """Run a task on a case and evaluate the results.
 
     Args:
@@ -927,7 +932,7 @@ async def _run_task_and_evaluators(
     span_id = f'{context.span_id:016x}'
     fallback_duration = time.time() - t0
 
-    return ReportCase(
+    return ReportCase[InputsT, OutputT, MetadataT](
         name=report_case_name,
         inputs=case.inputs,
         metadata=case.metadata,
```
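Taken together, the dataset.py hunks let `Dataset.evaluate` and `Dataset.evaluate_sync` accept either a coroutine function or a plain synchronous task: coroutine functions are awaited directly, while sync functions are dispatched to a worker thread via `anyio.to_thread.run_sync`. A minimal usage sketch (the dataset, case values, and task bodies below are invented for illustration; only the ability to pass a sync task comes from this diff):

```python
from pydantic_evals import Case, Dataset

# Hypothetical one-case dataset, purely for illustration.
dataset = Dataset(cases=[Case(name='double', inputs=2, expected_output=4)])


async def double_async(x: int) -> int:
    # Coroutine tasks are awaited directly, as in 0.3.x.
    return x * 2


def double_sync(x: int) -> int:
    # New in 0.4.x per this diff: plain functions are accepted and run
    # in a worker thread via anyio.to_thread.run_sync.
    return x * 2


report = dataset.evaluate_sync(double_async)  # still supported
report = dataset.evaluate_sync(double_sync)   # now also accepted
report.print(include_input=True)
```

Running sync tasks in a worker thread keeps the event loop responsive even when the task blocks, so `max_concurrency` scheduling and progress reporting continue to work as before.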
```diff
--- pydantic_evals-0.3.7/pydantic_evals/reporting/__init__.py
+++ pydantic_evals-0.4.1/pydantic_evals/reporting/__init__.py
@@ -2,14 +2,14 @@ from __future__ import annotations as _annotations
 
 from collections import defaultdict
 from collections.abc import Mapping
-from dataclasses import dataclass
+from dataclasses import dataclass
 from io import StringIO
-from typing import Any, Callable, Literal, Protocol
+from typing import Any, Callable, Generic, Literal, Protocol
 
-from pydantic import BaseModel
+from pydantic import BaseModel, TypeAdapter
 from rich.console import Console
 from rich.table import Table
-from typing_extensions import TypedDict
+from typing_extensions import TypedDict, TypeVar
 
 from pydantic_evals._utils import UNSET, Unset
 
@@ -24,7 +24,9 @@ from .render_numbers import (
 
 __all__ = (
     'EvaluationReport',
+    'EvaluationReportAdapter',
     'ReportCase',
+    'ReportCaseAdapter',
     'EvaluationRenderer',
     'RenderValueConfig',
     'RenderNumberConfig',
@@ -35,27 +37,32 @@ MISSING_VALUE_STR = '[i]<missing>[/i]'
 EMPTY_CELL_STR = '-'
 EMPTY_AGGREGATE_CELL_STR = ''
 
+InputsT = TypeVar('InputsT', default=Any)
+OutputT = TypeVar('OutputT', default=Any)
+MetadataT = TypeVar('MetadataT', default=Any)
 
-class ReportCase(BaseModel):
+
+@dataclass
+class ReportCase(Generic[InputsT, OutputT, MetadataT]):
     """A single case in an evaluation report."""
 
     name: str
     """The name of the [case][pydantic_evals.Case]."""
-    inputs: Any
+    inputs: InputsT
    """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
-    metadata: Any
+    metadata: MetadataT | None
    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
-    expected_output: Any
+    expected_output: OutputT | None
    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
-    output: Any
+    output: OutputT
    """The output of the task execution."""
 
     metrics: dict[str, float | int]
     attributes: dict[str, Any]
 
-    scores: dict[str, EvaluationResult[int | float]]
-    labels: dict[str, EvaluationResult[str]]
-    assertions: dict[str, EvaluationResult[bool]]
+    scores: dict[str, EvaluationResult[int | float]]
+    labels: dict[str, EvaluationResult[str]]
+    assertions: dict[str, EvaluationResult[bool]]
 
     task_duration: float
     total_duration: float  # includes evaluator execution time
@@ -65,6 +72,9 @@ class ReportCase(BaseModel):
     span_id: str
 
 
+ReportCaseAdapter = TypeAdapter(ReportCase[Any, Any, Any])
+
+
 class ReportCaseAggregate(BaseModel):
     """A synthetic case that summarizes a set of cases."""
 
@@ -142,12 +152,13 @@ class ReportCaseAggregate(BaseModel):
         )
 
 
-class EvaluationReport(BaseModel):
+@dataclass
+class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
     """A report of the results of evaluating a model on a set of cases."""
 
     name: str
     """The name of the report."""
-    cases: list[ReportCase]
+    cases: list[ReportCase[InputsT, OutputT, MetadataT]]
     """The cases in the report."""
 
     def averages(self) -> ReportCaseAggregate:
@@ -156,7 +167,7 @@ class EvaluationReport(BaseModel):
     def print(
         self,
         width: int | None = None,
-        baseline: EvaluationReport | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -199,7 +210,7 @@ class EvaluationReport(BaseModel):
 
     def console_table(
         self,
-        baseline: EvaluationReport | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -250,6 +261,9 @@ class EvaluationReport(BaseModel):
         return io_file.getvalue()
 
 
+EvaluationReportAdapter = TypeAdapter(EvaluationReport[Any, Any, Any])
+
+
 class RenderValueConfig(TypedDict, total=False):
     """A configuration for rendering a values in an Evaluation report."""
 
```
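The reporting hunks turn `ReportCase` and `EvaluationReport` into generic dataclasses parameterized by `InputsT`, `OutputT`, and `MetadataT`, and export module-level `TypeAdapter`s (`ReportCaseAdapter`, `EvaluationReportAdapter`) over their `[Any, Any, Any]` specializations. Since the report types are no longer `BaseModel` subclasses, these adapters presumably become the serialization entry point. A hedged sketch (the dataset and task are invented; the dump calls shown are ordinary pydantic `TypeAdapter` usage, not something this diff prescribes):

```python
from pydantic_evals import Case, Dataset
from pydantic_evals.reporting import EvaluationReportAdapter, ReportCaseAdapter

# Hypothetical dataset and task, for illustration only.
dataset = Dataset(cases=[Case(name='double', inputs=2, expected_output=4)])


def double(x: int) -> int:
    return x * 2


# evaluate_sync() now returns an EvaluationReport[InputsT, OutputT, MetadataT] dataclass.
report = dataset.evaluate_sync(double)

# The new module-level adapters can serialize the dataclass-based report
# and its individual cases, e.g. to JSON.
print(EvaluationReportAdapter.dump_json(report, indent=2).decode())
print(ReportCaseAdapter.dump_json(report.cases[0], indent=2).decode())
```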