pydantic-evals 1.0.14__tar.gz → 1.22.0__tar.gz
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/.gitignore +3 -1
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/PKG-INFO +2 -2
- pydantic_evals-1.22.0/pydantic_evals/__init__.py +16 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/dataset.py +30 -9
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/evaluators/llm_as_a_judge.py +1 -1
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/generation.py +3 -1
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/reporting/__init__.py +191 -18
- pydantic_evals-1.0.14/pydantic_evals/__init__.py +0 -19
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/LICENSE +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/README.md +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/_utils.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/evaluators/__init__.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/evaluators/common.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/evaluators/context.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/evaluators/evaluator.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/evaluators/spec.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/otel/__init__.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/otel/_context_subtree.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/otel/_errors.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/otel/span_tree.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/py.typed +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/reporting/render_numbers.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pyproject.toml +0 -0

--- pydantic_evals-1.0.14/.gitignore
+++ pydantic_evals-1.22.0/.gitignore
@@ -10,7 +10,7 @@ env*/
 /TODO.md
 /postgres-data/
 .DS_Store
-
+.chat_app_messages.sqlite
 .cache/
 .vscode/
 /question_graph_history.json
@@ -21,3 +21,5 @@ node_modules/
 /test_tmp/
 .mcp.json
 .claude/
+/.cursor/
+/.devcontainer/

--- pydantic_evals-1.0.14/PKG-INFO
+++ pydantic_evals-1.22.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.0.14
+Version: 1.22.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.0.14
+Requires-Dist: pydantic-ai-slim==1.22.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4

--- /dev/null
+++ pydantic_evals-1.22.0/pydantic_evals/__init__.py
@@ -0,0 +1,16 @@
+"""A toolkit for evaluating the execution of arbitrary "stochastic functions", such as LLM calls.
+
+This package provides functionality for:
+- Creating and loading test datasets with structured inputs and outputs
+- Evaluating model performance using various metrics and evaluators
+- Generating reports for evaluation results
+"""
+
+from .dataset import Case, Dataset, increment_eval_metric, set_eval_attribute
+
+__all__ = (
+    'Case',
+    'Dataset',
+    'increment_eval_metric',
+    'set_eval_attribute',
+)
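
The new top-level module re-exports increment_eval_metric and set_eval_attribute alongside Case and Dataset. As a rough usage sketch (the call signatures below are assumptions based on the names and the pydantic-evals documentation, not something this diff confirms), a task can record extra metrics and attributes against the case currently being evaluated:

    from pydantic_evals import Case, Dataset, increment_eval_metric, set_eval_attribute

    def answer_question(question: str) -> str:
        # Assumed signatures: add to a numeric metric and set an attribute
        # on the case currently being evaluated.
        increment_eval_metric('tool_calls', 1)
        set_eval_attribute('question_length', len(question))
        return 'Paris' if 'capital of France' in question else 'unknown'

    dataset = Dataset(
        cases=[Case(name='capital', inputs='What is the capital of France?', expected_output='Paris')]
    )
    report = dataset.evaluate_sync(answer_question)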

--- pydantic_evals-1.0.14/pydantic_evals/dataset.py
+++ pydantic_evals-1.22.0/pydantic_evals/dataset.py
@@ -265,6 +265,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         retry_evaluators: RetryConfig | None = None,
         *,
         task_name: str | None = None,
+        metadata: dict[str, Any] | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -283,6 +284,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             retry_evaluators: Optional retry configuration for evaluator execution.
             task_name: Optional override to the name of the task being executed, otherwise the name of the task
                 function will be used.
+            metadata: Optional dict of experiment metadata.

         Returns:
             A report containing the results of the evaluation.
@@ -294,6 +296,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a

         limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()

+        extra_attributes: dict[str, Any] = {'gen_ai.operation.name': 'experiment'}
+        if metadata is not None:
+            extra_attributes['metadata'] = metadata
         with (
             logfire_span(
                 'evaluate {name}',
@@ -301,7 +306,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                 task_name=task_name,
                 dataset_name=self.name,
                 n_cases=len(self.cases),
-                **
+                **extra_attributes,
             ) as eval_span,
             progress_bar or nullcontext(),
         ):
@@ -339,11 +344,18 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                 name=name,
                 cases=cases,
                 failures=failures,
+                experiment_metadata=metadata,
                 span_id=span_id,
                 trace_id=trace_id,
             )
-
-
+            full_experiment_metadata: dict[str, Any] = {'n_cases': len(self.cases)}
+            if metadata is not None:
+                full_experiment_metadata['metadata'] = metadata
+            if (averages := report.averages()) is not None:
+                full_experiment_metadata['averages'] = averages
+                if averages.assertions is not None:
+                    eval_span.set_attribute('assertion_pass_rate', averages.assertions)
+            eval_span.set_attribute('logfire.experiment.metadata', full_experiment_metadata)
         return report

     def evaluate_sync(
@@ -354,6 +366,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         progress: bool = True,
         retry_task: RetryConfig | None = None,
         retry_evaluators: RetryConfig | None = None,
+        *,
+        task_name: str | None = None,
+        metadata: dict[str, Any] | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -362,13 +377,16 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         Args:
             task: The task to evaluate. This should be a callable that takes the inputs of the case
                 and returns the output.
-            name: The name of the
-                If omitted, the name of the task function
+            name: The name of the experiment being run, this is used to identify the experiment in the report.
+                If omitted, the task_name will be used; if that is not specified, the name of the task function is used.
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
-            progress: Whether to show a progress bar for the evaluation. Defaults to True
+            progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
             retry_task: Optional retry configuration for the task execution.
             retry_evaluators: Optional retry configuration for evaluator execution.
+            task_name: Optional override to the name of the task being executed, otherwise the name of the task
+                function will be used.
+            metadata: Optional dict of experiment metadata.

         Returns:
             A report containing the results of the evaluation.
@@ -376,11 +394,13 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         return get_event_loop().run_until_complete(
             self.evaluate(
                 task,
-
+                name=name,
                 max_concurrency=max_concurrency,
                 progress=progress,
                 retry_task=retry_task,
                 retry_evaluators=retry_evaluators,
+                task_name=task_name,
+                metadata=metadata,
             )
         )

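The hunks above add a metadata keyword to Dataset.evaluate and Dataset.evaluate_sync, store it on the report as experiment_metadata, and attach it to the evaluation span under logfire.experiment.metadata. A minimal sketch of how the new keyword might be used; the task and cases below are made up for illustration:

    from pydantic_evals import Case, Dataset

    def classify(text: str) -> str:
        # Made-up task, used only to illustrate the new keyword arguments.
        return 'positive' if 'good' in text else 'negative'

    dataset = Dataset(cases=[Case(name='simple', inputs='a good day', expected_output='positive')])

    report = dataset.evaluate_sync(
        classify,
        task_name='sentiment-classifier',                   # optional override of the task name
        metadata={'model': 'gpt-4o', 'prompt_version': 3},   # new in 1.22.0
    )
    print(report.experiment_metadata)  # {'model': 'gpt-4o', 'prompt_version': 3}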
Still in pydantic_evals/dataset.py:

@@ -646,7 +666,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a

         context: dict[str, Any] = {'use_short_form': True}
         if fmt == 'yaml':
-            dumped_data = self.model_dump(mode='json', by_alias=True,
+            dumped_data = self.model_dump(mode='json', by_alias=True, context=context)
             content = yaml.dump(dumped_data, sort_keys=False)
             if schema_ref:  # pragma: no branch
                 yaml_language_server_line = f'{_YAML_SCHEMA_LINE_PREFIX}{schema_ref}'
@@ -654,7 +674,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             path.write_text(content)
         else:
             context['$schema'] = schema_ref
-            json_data = self.model_dump_json(indent=2, by_alias=True,
+            json_data = self.model_dump_json(indent=2, by_alias=True, context=context)
             path.write_text(json_data + '\n')

     @classmethod
@@ -724,6 +744,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                 evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007

         class Dataset(BaseModel, extra='forbid'):
+            name: str | None = None
             cases: list[Case]
             if evaluator_schema_types:  # pragma: no branch
                 evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007
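These two hunks pass the serialization context through to model_dump and model_dump_json when a dataset is written to disk, so the short-form representation and the $schema reference are actually applied. A hedged sketch of the call site this affects; keyword arguments beyond the path are omitted because they are not shown in this diff:

    from pathlib import Path

    from pydantic_evals import Case, Dataset

    dataset = Dataset(cases=[Case(name='one', inputs={'x': 1}, expected_output={'y': 2})])

    # Writing YAML (or JSON) now applies the short-form serialization context when dumping
    # cases and evaluators, per the hunks above.
    dataset.to_file(Path('cases.yaml'))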

--- pydantic_evals-1.0.14/pydantic_evals/evaluators/llm_as_a_judge.py
+++ pydantic_evals-1.22.0/pydantic_evals/evaluators/llm_as_a_judge.py
@@ -201,7 +201,7 @@ async def judge_output_expected(
     ).output


-def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:
+def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:
     """Set the default model used for judging.

     This model is used if `None` is passed to the `model` argument of `judge_output` and `judge_input_output`.
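For reference, a minimal sketch of calling set_default_judge_model with a known model name string; the import path follows the file location shown above:

    from pydantic_evals.evaluators.llm_as_a_judge import set_default_judge_model

    # Used whenever judge_output / judge_input_output are called with model=None.
    set_default_judge_model('openai:gpt-4o')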

--- pydantic_evals-1.0.14/pydantic_evals/generation.py
+++ pydantic_evals-1.22.0/pydantic_evals/generation.py
@@ -14,6 +14,7 @@ from pydantic import ValidationError
 from typing_extensions import TypeVar

 from pydantic_ai import Agent, models
+from pydantic_ai._utils import strip_markdown_fences
 from pydantic_evals import Dataset
 from pydantic_evals.evaluators.evaluator import Evaluator

@@ -73,8 +74,9 @@ async def generate_dataset(
     )

     result = await agent.run(extra_instructions or 'Please generate the object.')
+    output = strip_markdown_fences(result.output)
     try:
-        result = dataset_type.from_text(
+        result = dataset_type.from_text(output, fmt='json', custom_evaluator_types=custom_evaluator_types)
     except ValidationError as e:  # pragma: no cover
         print(f'Raw response from model:\n{result.output}')
         raise e
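generate_dataset now strips Markdown code fences from the model output before parsing it as JSON. A rough usage sketch follows; the dataset_type and n_examples parameter names are taken from the pydantic-evals documentation rather than from this diff, so treat them as assumptions:

    import asyncio
    from pathlib import Path

    from pydantic import BaseModel

    from pydantic_evals import Dataset
    from pydantic_evals.generation import generate_dataset

    class Inputs(BaseModel):
        question: str

    class Output(BaseModel):
        answer: str

    class Metadata(BaseModel):
        difficulty: str

    async def main() -> None:
        # Requires a configured LLM provider (e.g. an OPENAI_API_KEY) at runtime.
        dataset = await generate_dataset(
            dataset_type=Dataset[Inputs, Output, Metadata],  # assumed parameter name
            n_examples=3,                                    # assumed parameter name
            extra_instructions='Questions should be about European capitals.',
        )
        dataset.to_file(Path('generated_cases.yaml'))

    asyncio.run(main())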

--- pydantic_evals-1.0.14/pydantic_evals/reporting/__init__.py
+++ pydantic_evals-1.22.0/pydantic_evals/reporting/__init__.py
@@ -7,8 +7,10 @@ from io import StringIO
 from typing import Any, Generic, Literal, Protocol, cast

 from pydantic import BaseModel, TypeAdapter
-from rich.console import Console
+from rich.console import Console, Group, RenderableType
+from rich.panel import Panel
 from rich.table import Table
+from rich.text import Text
 from typing_extensions import TypedDict, TypeVar

 from pydantic_evals._utils import UNSET, Unset
@@ -196,6 +198,8 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
     failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(default_factory=list)
     """The failures in the report. These are cases where task execution raised an exception."""

+    experiment_metadata: dict[str, Any] | None = None
+    """Metadata associated with the specific experiment represented by this report."""
     trace_id: str | None = None
     """The trace ID of the evaluation."""
     span_id: str | None = None
@@ -206,11 +210,69 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             return ReportCaseAggregate.average(self.cases)
         return None

+    def render(
+        self,
+        width: int | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
+        *,
+        include_input: bool = False,
+        include_metadata: bool = False,
+        include_expected_output: bool = False,
+        include_output: bool = False,
+        include_durations: bool = True,
+        include_total_duration: bool = False,
+        include_removed_cases: bool = False,
+        include_averages: bool = True,
+        include_errors: bool = True,
+        include_error_stacktrace: bool = False,
+        include_evaluator_failures: bool = True,
+        input_config: RenderValueConfig | None = None,
+        metadata_config: RenderValueConfig | None = None,
+        output_config: RenderValueConfig | None = None,
+        score_configs: dict[str, RenderNumberConfig] | None = None,
+        label_configs: dict[str, RenderValueConfig] | None = None,
+        metric_configs: dict[str, RenderNumberConfig] | None = None,
+        duration_config: RenderNumberConfig | None = None,
+        include_reasons: bool = False,
+    ) -> str:
+        """Render this report to a nicely-formatted string, optionally comparing it to a baseline report.
+
+        If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
+        """
+        io_file = StringIO()
+        console = Console(width=width, file=io_file)
+        self.print(
+            width=width,
+            baseline=baseline,
+            console=console,
+            include_input=include_input,
+            include_metadata=include_metadata,
+            include_expected_output=include_expected_output,
+            include_output=include_output,
+            include_durations=include_durations,
+            include_total_duration=include_total_duration,
+            include_removed_cases=include_removed_cases,
+            include_averages=include_averages,
+            include_errors=include_errors,
+            include_error_stacktrace=include_error_stacktrace,
+            include_evaluator_failures=include_evaluator_failures,
+            input_config=input_config,
+            metadata_config=metadata_config,
+            output_config=output_config,
+            score_configs=score_configs,
+            label_configs=label_configs,
+            metric_configs=metric_configs,
+            duration_config=duration_config,
+            include_reasons=include_reasons,
+        )
+        return io_file.getvalue()
+
     def print(
         self,
         width: int | None = None,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         *,
+        console: Console | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
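The new render method captures the same output that print produces into a string, by printing to a rich Console backed by a StringIO. A short sketch of the difference, using a made-up dataset:

    from pydantic_evals import Case, Dataset

    def double(x: int) -> int:
        return x * 2

    dataset = Dataset(cases=[Case(name='one', inputs=2, expected_output=4)])
    report = dataset.evaluate_sync(double)

    # Print directly to the terminal, as before:
    report.print(include_input=True, include_output=True)

    # New in 1.22.0: capture the same rendering as a string, e.g. for logging or snapshot tests.
    text = report.render(width=120, include_input=True, include_output=True)
    assert isinstance(text, str)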
Still in pydantic_evals/reporting/__init__.py:

@@ -230,12 +292,16 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
         include_reasons: bool = False,
-    )
+    ) -> None:
         """Print this report to the console, optionally comparing it to a baseline report.

         If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
         """
-
+        if console is None:  # pragma: no branch
+            console = Console(width=width)
+
+        metadata_panel = self._metadata_panel(baseline=baseline)
+        renderable: RenderableType = self.console_table(
             baseline=baseline,
             include_input=include_input,
             include_metadata=include_metadata,
@@ -254,10 +320,13 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             metric_configs=metric_configs,
             duration_config=duration_config,
             include_reasons=include_reasons,
+            with_title=not metadata_panel,
         )
-
-
-
+        # Wrap table with experiment metadata panel if present
+        if metadata_panel:
+            renderable = Group(metadata_panel, renderable)
+        console.print(renderable)
+        if include_errors and self.failures:  # pragma: no cover
             failures_table = self.failures_table(
                 include_input=include_input,
                 include_metadata=include_metadata,
@@ -269,6 +338,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             )
             console.print(failures_table, style='red')

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def console_table(
         self,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
@@ -290,9 +360,11 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
         include_reasons: bool = False,
+        with_title: bool = True,
     ) -> Table:
-        """Return a table containing the data from this report
+        """Return a table containing the data from this report.

+        If a baseline is provided, returns a diff between this report and the baseline report.
         Optionally include input and output details.
         """
         renderer = EvaluationRenderer(
@@ -317,10 +389,82 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             include_reasons=include_reasons,
         )
         if baseline is None:
-            return renderer.build_table(self)
-        else:
-            return renderer.build_diff_table(self, baseline)
+            return renderer.build_table(self, with_title=with_title)
+        else:
+            return renderer.build_diff_table(self, baseline, with_title=with_title)
+
+    def _metadata_panel(
+        self, baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None
+    ) -> RenderableType | None:
+        """Wrap a table with an experiment metadata panel if metadata exists.
+
+        Args:
+            table: The table to wrap
+            baseline: Optional baseline report for diff metadata
+
+        Returns:
+            Either the table unchanged or a Group with Panel and Table
+        """
+        if baseline is None:
+            # Single report - show metadata if present
+            if self.experiment_metadata:
+                metadata_text = Text()
+                items = list(self.experiment_metadata.items())
+                for i, (key, value) in enumerate(items):
+                    metadata_text.append(f'{key}: {value}', style='dim')
+                    if i < len(items) - 1:
+                        metadata_text.append('\n')
+                return Panel(
+                    metadata_text,
+                    title=f'Evaluation Summary: {self.name}',
+                    title_align='left',
+                    border_style='dim',
+                    padding=(0, 1),
+                    expand=False,
+                )
+        else:
+            # Diff report - show metadata diff if either has metadata
+            if self.experiment_metadata or baseline.experiment_metadata:
+                diff_name = baseline.name if baseline.name == self.name else f'{baseline.name} → {self.name}'
+                metadata_text = Text()
+                lines_styles: list[tuple[str, str]] = []
+                if baseline.experiment_metadata and self.experiment_metadata:
+                    # Collect all keys from both
+                    all_keys = sorted(set(baseline.experiment_metadata.keys()) | set(self.experiment_metadata.keys()))
+                    for key in all_keys:
+                        baseline_val = baseline.experiment_metadata.get(key)
+                        report_val = self.experiment_metadata.get(key)
+                        if baseline_val == report_val:
+                            lines_styles.append((f'{key}: {report_val}', 'dim'))
+                        elif baseline_val is None:
+                            lines_styles.append((f'+ {key}: {report_val}', 'green'))
+                        elif report_val is None:
+                            lines_styles.append((f'- {key}: {baseline_val}', 'red'))
+                        else:
+                            lines_styles.append((f'{key}: {baseline_val} → {report_val}', 'yellow'))
+                elif self.experiment_metadata:
+                    lines_styles = [(f'+ {k}: {v}', 'green') for k, v in self.experiment_metadata.items()]
+                else:  # baseline.experiment_metadata only
+                    assert baseline.experiment_metadata is not None
+                    lines_styles = [(f'- {k}: {v}', 'red') for k, v in baseline.experiment_metadata.items()]
+
+                for i, (line, style) in enumerate(lines_styles):
+                    metadata_text.append(line, style=style)
+                    if i < len(lines_styles) - 1:
+                        metadata_text.append('\n')
+
+                return Panel(
+                    metadata_text,
+                    title=f'Evaluation Diff: {diff_name}',
+                    title_align='left',
+                    border_style='dim',
+                    padding=(0, 1),
+                    expand=False,
+                )
+
+        return None

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def failures_table(
         self,
         *,
@@ -358,10 +502,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):

     def __str__(self) -> str:  # pragma: lax no cover
         """Return a string representation of the report."""
-
-        io_file = StringIO()
-        Console(file=io_file).print(table)
-        return io_file.getvalue()
+        return self.render()


 EvaluationReportAdapter = TypeAdapter(EvaluationReport[Any, Any, Any])
@@ -647,6 +788,7 @@ class ReportCaseRenderer:
     metric_renderers: Mapping[str, _NumberRenderer]
     duration_renderer: _NumberRenderer

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_base_table(self, title: str) -> Table:
         """Build and return a Rich Table for the diff output."""
         table = Table(title=title, show_lines=True)
@@ -673,6 +815,7 @@ class ReportCaseRenderer:
         table.add_column('Durations' if self.include_total_duration else 'Duration', justify='right')
         return table

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_failures_table(self, title: str) -> Table:
         """Build and return a Rich Table for the failures output."""
         table = Table(title=title, show_lines=True)
@@ -1132,9 +1275,22 @@ class EvaluationRenderer:
             duration_renderer=duration_renderer,
         )

-
+    # TODO(DavidM): in v2, change the return type here to RenderableType
+    def build_table(self, report: EvaluationReport, *, with_title: bool = True) -> Table:
+        """Build a table for the report.
+
+        Args:
+            report: The evaluation report to render
+            with_title: Whether to include the title in the table (default True)
+
+        Returns:
+            A Rich Table object
+        """
         case_renderer = self._get_case_renderer(report)
-
+
+        title = f'Evaluation Summary: {report.name}' if with_title else ''
+        table = case_renderer.build_base_table(title)
+
         for case in report.cases:
             table.add_row(*case_renderer.build_row(case))

@@ -1145,7 +1301,20 @@ class EvaluationRenderer:

         return table

-
+    # TODO(DavidM): in v2, change the return type here to RenderableType
+    def build_diff_table(
+        self, report: EvaluationReport, baseline: EvaluationReport, *, with_title: bool = True
+    ) -> Table:
+        """Build a diff table comparing report to baseline.
+
+        Args:
+            report: The evaluation report to compare
+            baseline: The baseline report to compare against
+            with_title: Whether to include the title in the table (default True)
+
+        Returns:
+            A Rich Table object
+        """
         report_cases = report.cases
         baseline_cases = self._baseline_cases_to_include(report, baseline)

@@ -1170,7 +1339,10 @@ class EvaluationRenderer:

         case_renderer = self._get_case_renderer(report, baseline)
         diff_name = baseline.name if baseline.name == report.name else f'{baseline.name} → {report.name}'
-
+
+        title = f'Evaluation Diff: {diff_name}' if with_title else ''
+        table = case_renderer.build_base_table(title)
+
         for baseline_case, new_case in diff_cases:
             table.add_row(*case_renderer.build_diff_row(new_case, baseline_case))
         for case in added_cases:
@@ -1189,6 +1361,7 @@ class EvaluationRenderer:

         return table

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_failures_table(self, report: EvaluationReport) -> Table:
         case_renderer = self._get_case_renderer(report)
         table = case_renderer.build_failures_table('Case Failures')
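Taken together, the reporting changes let a report carry experiment metadata and show it in a panel above the results table, both for a single report and when diffing against a baseline. A hedged sketch of the comparison flow, again with made-up tasks:

    from pydantic_evals import Case, Dataset

    def old_task(x: int) -> int:
        return x + 1

    def new_task(x: int) -> int:
        return x + 2

    dataset = Dataset(cases=[Case(name='one', inputs=1)])

    baseline = dataset.evaluate_sync(old_task, metadata={'prompt_version': 1})
    candidate = dataset.evaluate_sync(new_task, metadata={'prompt_version': 2})

    # Per the _metadata_panel hunk above, a changed key is rendered as
    # "prompt_version: 1 → 2" in an "Evaluation Diff" panel above the comparison table.
    candidate.print(baseline=baseline)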

--- pydantic_evals-1.0.14/pydantic_evals/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-"""A toolkit for evaluating the execution of arbitrary "stochastic functions", such as LLM calls.
-
-This package provides functionality for:
-- Creating and loading test datasets with structured inputs and outputs
-- Evaluating model performance using various metrics and evaluators
-- Generating reports for evaluation results
-
-TODO(DavidM): Implement serialization of reports for later comparison, and add git hashes etc.
-Note: I made pydantic_ai.evals.reports.EvalReport a BaseModel specifically to make this easier
-TODO(DavidM): Add commit hash, timestamp, and other metadata to reports (like pytest-speed does), possibly in a dedicated struct
-TODO(DavidM): Implement a CLI with some pytest-like filtering API to make it easier to run only specific cases
-"""
-
-from .dataset import Case, Dataset
-
-__all__ = (
-    'Case',
-    'Dataset',
-)