pydantic-evals 0.3.6.tar.gz → 0.4.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pydantic-evals might be problematic.

Files changed (24)
  1. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/PKG-INFO +2 -2
  2. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/dataset.py +4 -4
  3. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/reporting/__init__.py +30 -16
  4. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/.gitignore +0 -0
  5. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/LICENSE +0 -0
  6. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/README.md +0 -0
  7. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/__init__.py +0 -0
  8. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/_utils.py +0 -0
  9. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/evaluators/__init__.py +0 -0
  10. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
  11. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/evaluators/_spec.py +0 -0
  12. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/evaluators/common.py +0 -0
  13. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/evaluators/context.py +0 -0
  14. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/evaluators/evaluator.py +0 -0
  15. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/evaluators/llm_as_a_judge.py +0 -0
  16. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/generation.py +0 -0
  17. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/otel/__init__.py +0 -0
  18. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
  19. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/otel/_context_subtree.py +0 -0
  20. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/otel/_errors.py +0 -0
  21. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/otel/span_tree.py +0 -0
  22. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/py.typed +0 -0
  23. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/reporting/render_numbers.py +0 -0
  24. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pyproject.toml +0 -0
{pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/PKG-INFO +2 -2

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.3.6
+Version: 0.4.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.3.6
+Requires-Dist: pydantic-ai-slim==0.4.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
{pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/dataset.py +4 -4

@@ -257,7 +257,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
-    ) -> EvaluationReport:
+    ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

         This method runs the task on each case in the dataset, applies evaluators,
@@ -312,7 +312,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
-    ) -> EvaluationReport:
+    ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

         This is a synchronous wrapper around [`evaluate`][pydantic_evals.Dataset.evaluate] provided for convenience.
@@ -877,7 +877,7 @@ async def _run_task_and_evaluators(
     case: Case[InputsT, OutputT, MetadataT],
     report_case_name: str,
     dataset_evaluators: list[Evaluator[InputsT, OutputT, MetadataT]],
-) -> ReportCase:
+) -> ReportCase[InputsT, OutputT, MetadataT]:
     """Run a task on a case and evaluate the results.

     Args:
@@ -927,7 +927,7 @@ async def _run_task_and_evaluators(
     span_id = f'{context.span_id:016x}'
     fallback_duration = time.time() - t0

-    return ReportCase(
+    return ReportCase[InputsT, OutputT, MetadataT](
         name=report_case_name,
         inputs=case.inputs,
         metadata=case.metadata,
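
The dataset.py changes are purely about typing: `evaluate`, `evaluate_sync`, and the internal `_run_task_and_evaluators` now return `EvaluationReport`/`ReportCase` parametrized with the dataset's `InputsT`, `OutputT`, and `MetadataT` instead of the bare classes. A minimal sketch of what a caller might see, assuming a toy string-to-string task (the `greet` task and the case values below are invented for illustration):

from pydantic_evals import Case, Dataset

# Hypothetical one-case dataset; the name and values are illustrative only.
dataset = Dataset(cases=[Case(name='greet', inputs='world', expected_output='hello world')])


async def greet(inputs: str) -> str:
    # Toy task: prepend a greeting to the input string.
    return f'hello {inputs}'


report = dataset.evaluate_sync(greet)
# Under 0.4.0 the report is parametrized (e.g. EvaluationReport[str, str, Any] when the
# dataset's type parameters are known), so type checkers no longer see plain Any here:
first = report.cases[0]
print(first.inputs, '->', first.output)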
{pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/reporting/__init__.py +30 -16

@@ -2,14 +2,14 @@ from __future__ import annotations as _annotations

 from collections import defaultdict
 from collections.abc import Mapping
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from io import StringIO
-from typing import Any, Callable, Literal, Protocol, TypeVar
+from typing import Any, Callable, Generic, Literal, Protocol

-from pydantic import BaseModel
+from pydantic import BaseModel, TypeAdapter
 from rich.console import Console
 from rich.table import Table
-from typing_extensions import TypedDict
+from typing_extensions import TypedDict, TypeVar

 from pydantic_evals._utils import UNSET, Unset

@@ -24,7 +24,9 @@ from .render_numbers import (

 __all__ = (
     'EvaluationReport',
+    'EvaluationReportAdapter',
     'ReportCase',
+    'ReportCaseAdapter',
     'EvaluationRenderer',
     'RenderValueConfig',
     'RenderNumberConfig',
@@ -35,27 +37,32 @@ MISSING_VALUE_STR = '[i]<missing>[/i]'
 EMPTY_CELL_STR = '-'
 EMPTY_AGGREGATE_CELL_STR = ''

+InputsT = TypeVar('InputsT', default=Any)
+OutputT = TypeVar('OutputT', default=Any)
+MetadataT = TypeVar('MetadataT', default=Any)

-class ReportCase(BaseModel):
+
+@dataclass
+class ReportCase(Generic[InputsT, OutputT, MetadataT]):
     """A single case in an evaluation report."""

     name: str
     """The name of the [case][pydantic_evals.Case]."""
-    inputs: Any
+    inputs: InputsT
     """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
-    metadata: Any
+    metadata: MetadataT | None
     """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
-    expected_output: Any
+    expected_output: OutputT | None
     """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
-    output: Any
+    output: OutputT
     """The output of the task execution."""

     metrics: dict[str, float | int]
     attributes: dict[str, Any]

-    scores: dict[str, EvaluationResult[int | float]] = field(init=False)
-    labels: dict[str, EvaluationResult[str]] = field(init=False)
-    assertions: dict[str, EvaluationResult[bool]] = field(init=False)
+    scores: dict[str, EvaluationResult[int | float]]
+    labels: dict[str, EvaluationResult[str]]
+    assertions: dict[str, EvaluationResult[bool]]

     task_duration: float
     total_duration: float  # includes evaluator execution time
@@ -65,6 +72,9 @@ class ReportCase(BaseModel):
     span_id: str


+ReportCaseAdapter = TypeAdapter(ReportCase[Any, Any, Any])
+
+
 class ReportCaseAggregate(BaseModel):
     """A synthetic case that summarizes a set of cases."""

@@ -142,12 +152,13 @@ class ReportCaseAggregate(BaseModel):
         )


-class EvaluationReport(BaseModel):
+@dataclass
+class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
     """A report of the results of evaluating a model on a set of cases."""

     name: str
     """The name of the report."""
-    cases: list[ReportCase]
+    cases: list[ReportCase[InputsT, OutputT, MetadataT]]
     """The cases in the report."""

     def averages(self) -> ReportCaseAggregate:
@@ -156,7 +167,7 @@ class EvaluationReport(BaseModel):
     def print(
         self,
         width: int | None = None,
-        baseline: EvaluationReport | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -199,7 +210,7 @@ class EvaluationReport(BaseModel):

     def console_table(
         self,
-        baseline: EvaluationReport | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -250,6 +261,9 @@ class EvaluationReport(BaseModel):
         return io_file.getvalue()


+EvaluationReportAdapter = TypeAdapter(EvaluationReport[Any, Any, Any])
+
+
 class RenderValueConfig(TypedDict, total=False):
     """A configuration for rendering a values in an Evaluation report."""

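In reporting/__init__.py, `ReportCase` and `EvaluationReport` change from pydantic `BaseModel` subclasses to plain dataclasses that are generic over `InputsT`, `OutputT`, and `MetadataT` (each defaulting to `Any` via `typing_extensions.TypeVar`), and module-level `TypeAdapter` instances are exported as `ReportCaseAdapter` and `EvaluationReportAdapter` for serialization. A sketch of how the new adapters might be used, reusing the toy task from the previous example:

from pydantic_evals import Case, Dataset
from pydantic_evals.reporting import EvaluationReportAdapter, ReportCaseAdapter


async def greet(inputs: str) -> str:
    return f'hello {inputs}'


report = Dataset(cases=[Case(name='greet', inputs='world')]).evaluate_sync(greet)

# The exported TypeAdapters handle serialization now that the report types are dataclasses:
report_dict = EvaluationReportAdapter.dump_python(report)          # nested plain-Python structure
report_json = EvaluationReportAdapter.dump_json(report, indent=2)  # JSON as bytes
case_json = ReportCaseAdapter.dump_json(report.cases[0])           # a single serialized case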