pydantic-evals 1.0.14__tar.gz → 1.50.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/.gitignore +3 -1
  2. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/PKG-INFO +2 -2
  3. pydantic_evals-1.50.0/pydantic_evals/__init__.py +16 -0
  4. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/_utils.py +1 -1
  5. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/dataset.py +49 -24
  6. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/common.py +1 -1
  7. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/llm_as_a_judge.py +36 -31
  8. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/generation.py +3 -1
  9. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/otel/span_tree.py +3 -3
  10. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/reporting/__init__.py +201 -26
  11. pydantic_evals-1.0.14/pydantic_evals/__init__.py +0 -19
  12. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/LICENSE +0 -0
  13. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/README.md +0 -0
  14. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/__init__.py +0 -0
  15. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
  16. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/context.py +0 -0
  17. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/evaluator.py +0 -0
  18. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/spec.py +0 -0
  19. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/otel/__init__.py +0 -0
  20. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
  21. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/otel/_context_subtree.py +0 -0
  22. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/otel/_errors.py +0 -0
  23. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/py.typed +0 -0
  24. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/reporting/render_numbers.py +0 -0
  25. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pyproject.toml +0 -0
@@ -10,7 +10,7 @@ env*/
  /TODO.md
  /postgres-data/
  .DS_Store
- examples/pydantic_ai_examples/.chat_app_messages.sqlite
+ .chat_app_messages.sqlite
  .cache/
  .vscode/
  /question_graph_history.json
@@ -21,3 +21,5 @@ node_modules/
  /test_tmp/
  .mcp.json
  .claude/
+ /.cursor/
+ /.devcontainer/
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pydantic-evals
- Version: 1.0.14
+ Version: 1.50.0
  Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
  Project-URL: Homepage, https://ai.pydantic.dev/evals
  Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Requires-Python: >=3.10
  Requires-Dist: anyio>=0
  Requires-Dist: logfire-api>=3.14.1
- Requires-Dist: pydantic-ai-slim==1.0.14
+ Requires-Dist: pydantic-ai-slim==1.50.0
  Requires-Dist: pydantic>=2.10
  Requires-Dist: pyyaml>=6.0.2
  Requires-Dist: rich>=13.9.4
@@ -0,0 +1,16 @@
+ """A toolkit for evaluating the execution of arbitrary "stochastic functions", such as LLM calls.
+
+ This package provides functionality for:
+ - Creating and loading test datasets with structured inputs and outputs
+ - Evaluating model performance using various metrics and evaluators
+ - Generating reports for evaluation results
+ """
+
+ from .dataset import Case, Dataset, increment_eval_metric, set_eval_attribute
+
+ __all__ = (
+ 'Case',
+ 'Dataset',
+ 'increment_eval_metric',
+ 'set_eval_attribute',
+ )
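The rewritten package `__init__.py` now re-exports `increment_eval_metric` and `set_eval_attribute` from `pydantic_evals.dataset` at the top level. A minimal sketch of how a task might use them while being evaluated; the task body, metric name, and attribute name are illustrative, not taken from this release:

```python
# Sketch only: the task and the metric/attribute names are hypothetical.
from pydantic_evals import Case, Dataset, increment_eval_metric, set_eval_attribute


async def answer_question(question: str) -> str:
    increment_eval_metric('retries', 1)         # recorded as a metric on the current case's task run
    set_eval_attribute('prompt_variant', 'v2')  # attached as an attribute of the case
    return '4' if '2 + 2' in question else 'unknown'


dataset = Dataset(cases=[Case(name='simple', inputs='What is 2 + 2?', expected_output='4')])
report = dataset.evaluate_sync(answer_question)
```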
@@ -112,7 +112,7 @@ async def task_group_gather(tasks: Sequence[Callable[[], Awaitable[T]]]) -> list

  try:
  from logfire._internal.config import (
- LogfireNotConfiguredWarning, # pyright: ignore[reportAssignmentType,reportPrivateImportUsage]
+ LogfireNotConfiguredWarning, # pyright: ignore[reportAssignmentType]
  )
  # TODO: Remove this `pragma: no cover` once we test evals without pydantic-ai (which includes logfire)
  except ImportError: # pragma: no cover
@@ -90,7 +90,7 @@ class _CaseModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid'
  inputs: InputsT
  metadata: MetadataT | None = None
  expected_output: OutputT | None = None
- evaluators: list[EvaluatorSpec] = Field(default_factory=list)
+ evaluators: list[EvaluatorSpec] = Field(default_factory=list[EvaluatorSpec])


  class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid'):
@@ -100,7 +100,7 @@ class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forb
  json_schema_path: str | None = Field(default=None, alias='$schema')
  name: str | None = None
  cases: list[_CaseModel[InputsT, OutputT, MetadataT]]
- evaluators: list[EvaluatorSpec] = Field(default_factory=list)
+ evaluators: list[EvaluatorSpec] = Field(default_factory=list[EvaluatorSpec])


  @dataclass(init=False)
@@ -136,7 +136,9 @@ class Case(Generic[InputsT, OutputT, MetadataT]):
  """
  expected_output: OutputT | None = None
  """Expected output of the task. This is the expected output of the task that will be evaluated."""
- evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = field(default_factory=list)
+ evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = field(
+ default_factory=list[Evaluator[InputsT, OutputT, MetadataT]]
+ )
  """Evaluators to be used just on this case."""

  def __init__(
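Several hunks in this file (and in `span_tree.py` and `reporting/__init__.py` below) switch `default_factory=list` to a parameterized form such as `default_factory=list[EvaluatorSpec]`. A subscripted builtin like `list[int]` is still callable and returns an ordinary empty list at runtime, so behaviour is unchanged; the payoff is that type checkers can infer the element type of the default. A standalone illustration of the pattern (not code from the package):

```python
# Illustration of the default_factory=list[...] pattern; Bucket is a made-up example class.
from dataclasses import dataclass, field


@dataclass
class Bucket:
    # Runtime behaviour is identical to default_factory=list, but the default is typed as list[int].
    values: list[int] = field(default_factory=list[int])


assert list[int]() == []      # a parameterized generic is callable and builds a plain list
assert Bucket().values == []
```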
@@ -265,6 +267,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  retry_evaluators: RetryConfig | None = None,
  *,
  task_name: str | None = None,
+ metadata: dict[str, Any] | None = None,
  ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
  """Evaluates the test cases in the dataset using the given task.

@@ -283,6 +286,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  retry_evaluators: Optional retry configuration for evaluator execution.
  task_name: Optional override to the name of the task being executed, otherwise the name of the task
  function will be used.
+ metadata: Optional dict of experiment metadata.

  Returns:
  A report containing the results of the evaluation.
@@ -294,6 +298,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a

  limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()

+ extra_attributes: dict[str, Any] = {'gen_ai.operation.name': 'experiment'}
+ if metadata is not None:
+ extra_attributes['metadata'] = metadata
  with (
  logfire_span(
@@ -301,7 +308,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  task_name=task_name,
  dataset_name=self.name,
  n_cases=len(self.cases),
- **{'gen_ai.operation.name': 'experiment'}, # pyright: ignore[reportArgumentType]
+ **extra_attributes,
  ) as eval_span,
  progress_bar or nullcontext(),
  ):
@@ -339,11 +346,18 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  name=name,
  cases=cases,
  failures=failures,
+ experiment_metadata=metadata,
  span_id=span_id,
  trace_id=trace_id,
  )
- if (averages := report.averages()) is not None and averages.assertions is not None:
- eval_span.set_attribute('assertion_pass_rate', averages.assertions)
+ full_experiment_metadata: dict[str, Any] = {'n_cases': len(self.cases)}
+ if metadata is not None:
+ full_experiment_metadata['metadata'] = metadata
+ if (averages := report.averages()) is not None:
+ full_experiment_metadata['averages'] = averages
+ if averages.assertions is not None:
+ eval_span.set_attribute('assertion_pass_rate', averages.assertions)
+ eval_span.set_attribute('logfire.experiment.metadata', full_experiment_metadata)
  return report

  def evaluate_sync(
@@ -354,21 +368,27 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  progress: bool = True,
  retry_task: RetryConfig | None = None,
  retry_evaluators: RetryConfig | None = None,
+ *,
+ task_name: str | None = None,
+ metadata: dict[str, Any] | None = None,
  ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
  """Evaluates the test cases in the dataset using the given task.

- This is a synchronous wrapper around [`evaluate`][pydantic_evals.Dataset.evaluate] provided for convenience.
+ This is a synchronous wrapper around [`evaluate`][pydantic_evals.dataset.Dataset.evaluate] provided for convenience.

  Args:
  task: The task to evaluate. This should be a callable that takes the inputs of the case
  and returns the output.
- name: The name of the task being evaluated, this is used to identify the task in the report.
- If omitted, the name of the task function will be used.
+ name: The name of the experiment being run, this is used to identify the experiment in the report.
+ If omitted, the task_name will be used; if that is not specified, the name of the task function is used.
  max_concurrency: The maximum number of concurrent evaluations of the task to allow.
  If None, all cases will be evaluated concurrently.
- progress: Whether to show a progress bar for the evaluation. Defaults to True.
+ progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
  retry_task: Optional retry configuration for the task execution.
  retry_evaluators: Optional retry configuration for evaluator execution.
+ task_name: Optional override to the name of the task being executed, otherwise the name of the task
+ function will be used.
+ metadata: Optional dict of experiment metadata.

  Returns:
  A report containing the results of the evaluation.
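The new keyword-only `metadata` argument on `evaluate`/`evaluate_sync` is stored on the report as `experiment_metadata` and folded into the `logfire.experiment.metadata` span attribute. A rough usage sketch; the task and metadata values are made up:

```python
# Sketch of the new experiment-metadata plumbing; the task and metadata values are examples.
from pydantic_evals import Case, Dataset


async def double(x: int) -> int:
    return x * 2


dataset = Dataset(cases=[Case(name='doubles', inputs=2, expected_output=4)])
report = dataset.evaluate_sync(
    double,
    name='doubling-experiment',
    metadata={'prompt_version': 'v3', 'temperature': 0.0},  # surfaced in the report and on the span
)
print(report.experiment_metadata)  # {'prompt_version': 'v3', 'temperature': 0.0}
```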
@@ -376,11 +396,13 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  return get_event_loop().run_until_complete(
  self.evaluate(
  task,
- task_name=name,
+ name=name,
  max_concurrency=max_concurrency,
  progress=progress,
  retry_task=retry_task,
  retry_evaluators=retry_evaluators,
+ task_name=task_name,
+ metadata=metadata,
  )
  )

@@ -491,7 +513,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  path = Path(path)
  fmt = cls._infer_fmt(path, fmt)

- raw = Path(path).read_text()
+ raw = Path(path).read_text(encoding='utf-8')
  try:
  return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types, default_name=path.stem)
  except ValidationError as e: # pragma: no cover
@@ -646,16 +668,16 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a

  context: dict[str, Any] = {'use_short_form': True}
  if fmt == 'yaml':
- dumped_data = self.model_dump(mode='json', by_alias=True, exclude_defaults=True, context=context)
+ dumped_data = self.model_dump(mode='json', by_alias=True, context=context)
  content = yaml.dump(dumped_data, sort_keys=False)
  if schema_ref: # pragma: no branch
  yaml_language_server_line = f'{_YAML_SCHEMA_LINE_PREFIX}{schema_ref}'
  content = f'{yaml_language_server_line}\n{content}'
- path.write_text(content)
+ path.write_text(content, encoding='utf-8')
  else:
  context['$schema'] = schema_ref
- json_data = self.model_dump_json(indent=2, by_alias=True, exclude_defaults=True, context=context)
- path.write_text(json_data + '\n')
+ json_data = self.model_dump_json(indent=2, by_alias=True, context=context)
+ path.write_text(json_data + '\n', encoding='utf-8')

  @classmethod
  def model_json_schema_with_evaluators(
@@ -718,15 +740,16 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  class Case(BaseModel, extra='forbid'): # pyright: ignore[reportUnusedClass] # this _is_ used below, but pyright doesn't seem to notice..
  name: str | None = None
  inputs: in_type # pyright: ignore[reportInvalidTypeForm]
- metadata: meta_type | None = None # pyright: ignore[reportInvalidTypeForm,reportUnknownVariableType]
- expected_output: out_type | None = None # pyright: ignore[reportInvalidTypeForm,reportUnknownVariableType]
+ metadata: meta_type | None = None # pyright: ignore[reportInvalidTypeForm]
+ expected_output: out_type | None = None # pyright: ignore[reportInvalidTypeForm]
  if evaluator_schema_types: # pragma: no branch
- evaluators: list[Union[tuple(evaluator_schema_types)]] = [] # pyright: ignore # noqa UP007
+ evaluators: list[Union[tuple(evaluator_schema_types)]] = [] # pyright: ignore # noqa: UP007

  class Dataset(BaseModel, extra='forbid'):
+ name: str | None = None
  cases: list[Case]
  if evaluator_schema_types: # pragma: no branch
- evaluators: list[Union[tuple(evaluator_schema_types)]] = [] # pyright: ignore # noqa UP007
+ evaluators: list[Union[tuple(evaluator_schema_types)]] = [] # pyright: ignore # noqa: UP007

  json_schema = Dataset.model_json_schema()
  # See `_add_json_schema` below, since `$schema` is added to the JSON, it has to be supported in the JSON
@@ -746,8 +769,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  path = Path(path)
  json_schema = cls.model_json_schema_with_evaluators(custom_evaluator_types)
  schema_content = to_json(json_schema, indent=2).decode() + '\n'
- if not path.exists() or path.read_text() != schema_content: # pragma: no branch
- path.write_text(schema_content)
+ if not path.exists() or path.read_text(encoding='utf-8') != schema_content: # pragma: no branch
+ path.write_text(schema_content, encoding='utf-8')

  @classmethod
  @functools.cache
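The file I/O changes in this file all pin `encoding='utf-8'`. Without it, `Path.read_text()`/`write_text()` fall back to the locale's preferred encoding (frequently not UTF-8 on Windows), so datasets containing non-ASCII text could fail to round-trip. A small illustration of the difference (not package code):

```python
# Why the explicit encoding matters: without it the platform locale decides how this file is written.
from pathlib import Path

path = Path('cases.yaml')
path.write_text('name: café ☕\n', encoding='utf-8')  # deterministic on every platform
assert 'café' in path.read_text(encoding='utf-8')     # round-trips non-ASCII content safely
```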
@@ -833,8 +856,8 @@ def _get_relative_path_reference(target: Path, source: Path, _prefix: str = '')
  class _TaskRun:
  """Internal class to track metrics and attributes for a task run."""

- attributes: dict[str, Any] = field(init=False, default_factory=dict)
- metrics: dict[str, int | float] = field(init=False, default_factory=dict)
+ attributes: dict[str, Any] = field(init=False, default_factory=dict[str, Any])
+ metrics: dict[str, int | float] = field(init=False, default_factory=dict[str, int | float])

  def record_metric(self, name: str, value: int | float) -> None:
  """Record a metric value.
@@ -926,6 +949,8 @@ async def _run_task(
  # That way users can customize this logic. We'd default to a function that does the current thing but also
  # allow `None` to disable it entirely.
  for node in span_tree:
+ if 'gen_ai.request.model' not in node.attributes:
+ continue # we only want to count the below specifically for the individual LLM requests, not agent runs
  for k, v in node.attributes.items():
  if k == 'gen_ai.operation.name' and v == 'chat':
  task_run.increment_metric('requests', 1)
@@ -191,7 +191,7 @@ class LLMJudge(Evaluator[object, object, object]):
  """

  rubric: str
- model: models.Model | models.KnownModelName | None = None
+ model: models.Model | models.KnownModelName | str | None = None
  include_input: bool = False
  include_expected_output: bool = False
  model_settings: ModelSettings | None = None
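`LLMJudge.model` (and the `judge_*` functions below) now also accept a plain model-name string, in addition to a `Model` instance or a `KnownModelName`. A sketch of a dataset-level judge configured with a string; the rubric and model name are only examples:

```python
# Sketch: LLMJudge configured with a plain string model name (rubric/model are examples).
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge

dataset = Dataset(
    cases=[Case(name='greeting', inputs='Say hello', expected_output='Hello!')],
    evaluators=[
        LLMJudge(
            rubric='The response is a polite greeting.',
            model='openai:gpt-4o',  # any model string understood by pydantic-ai should work here
            include_input=True,
        ),
    ],
)
```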
@@ -55,7 +55,7 @@ _judge_output_agent = Agent(
  async def judge_output(
  output: Any,
  rubric: str,
- model: models.Model | models.KnownModelName | None = None,
+ model: models.Model | models.KnownModelName | str | None = None,
  model_settings: ModelSettings | None = None,
  ) -> GradingOutput:
  """Judge the output of a model based on a rubric.
@@ -96,7 +96,7 @@ async def judge_input_output(
  inputs: Any,
  output: Any,
  rubric: str,
- model: models.Model | models.KnownModelName | None = None,
+ model: models.Model | models.KnownModelName | str | None = None,
  model_settings: ModelSettings | None = None,
  ) -> GradingOutput:
  """Judge the output of a model based on the inputs and a rubric.
@@ -141,7 +141,7 @@ async def judge_input_output_expected(
  output: Any,
  expected_output: Any,
  rubric: str,
- model: models.Model | models.KnownModelName | None = None,
+ model: models.Model | models.KnownModelName | str | None = None,
  model_settings: ModelSettings | None = None,
  ) -> GradingOutput:
  """Judge the output of a model based on the inputs and a rubric.
@@ -185,7 +185,7 @@ async def judge_output_expected(
  output: Any,
  expected_output: Any,
  rubric: str,
- model: models.Model | models.KnownModelName | None = None,
+ model: models.Model | models.KnownModelName | str | None = None,
  model_settings: ModelSettings | None = None,
  ) -> GradingOutput:
  """Judge the output of a model based on the expected output, output, and a rubric.
@@ -201,7 +201,7 @@ async def judge_output_expected(
  ).output


- def set_default_judge_model(model: models.Model | models.KnownModelName) -> None: # pragma: no cover
+ def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:
  """Set the default model used for judging.

  This model is used if `None` is passed to the `model` argument of `judge_output` and `judge_input_output`.
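`set_default_judge_model` drops its `pragma: no cover` marker; it still sets the model used whenever `model=None` is passed to the judge functions. A hedged sketch of using it together with `judge_output` (the model name and rubric are examples, and actually awaiting the call requires credentials for the chosen provider):

```python
# Sketch: a process-wide default judge model plus a judge_output call that relies on it.
from pydantic_evals.evaluators.llm_as_a_judge import judge_output, set_default_judge_model

set_default_judge_model('openai:gpt-4o')  # used whenever model=None is passed to the judge functions


async def grade(answer: str) -> bool:
    grading = await judge_output(
        output=answer,
        rubric='The answer correctly names the capital of France.',
    )
    return grading.pass_  # GradingOutput also carries a reason and a score
```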
@@ -221,39 +221,44 @@ def _stringify(value: Any) -> str:
  return repr(value)


+ def _make_section(content: Any, tag: str) -> list[str | UserContent]:
+ """Create a tagged section, handling different content types, for use in the LLMJudge's prompt.
+
+ Args:
+ content (Any): content to include in the section_
+ tag (str): tag name for the section
+
+ Returns:
+ list[str | UserContent]: the tagged section as a list of strings or UserContent
+ """
+ sections: list[str | UserContent] = []
+ items: Sequence[str | UserContent] = ( # pyright: ignore[reportUnknownVariableType]
+ content if isinstance(content, Sequence) and not isinstance(content, str) else [content]
+ )
+
+ sections.append(f'<{tag}>')
+ for item in items:
+ sections.append(item if isinstance(item, str | MultiModalContent) else _stringify(item))
+ sections.append(f'</{tag}>')
+ return sections
+
+
  def _build_prompt(
  output: Any,
  rubric: str,
  inputs: Any | None = None,
  expected_output: Any | None = None,
  ) -> str | Sequence[str | UserContent]:
- """Build a prompt that includes input, output, and rubric."""
+ """Build a prompt that includes input, output, expected output, and rubric."""
  sections: list[str | UserContent] = []
-
  if inputs is not None:
- if isinstance(inputs, str):
- sections.append(f'<Input>\n{inputs}\n</Input>')
- else:
- sections.append('<Input>\n')
- if isinstance(inputs, Sequence):
- for item in inputs: # type: ignore
- if isinstance(item, str | MultiModalContent):
- sections.append(item)
- else:
- sections.append(_stringify(item))
- elif isinstance(inputs, MultiModalContent):
- sections.append(inputs)
- else:
- sections.append(_stringify(inputs))
- sections.append('</Input>')
-
- sections.append(f'<Output>\n{_stringify(output)}\n</Output>')
- sections.append(f'<Rubric>\n{rubric}\n</Rubric>')
+ sections.extend(_make_section(inputs, 'Input'))

- if expected_output is not None:
- sections.append(f'<ExpectedOutput>\n{_stringify(expected_output)}\n</ExpectedOutput>')
+ sections.extend(_make_section(output, 'Output'))
+ sections.extend(_make_section(rubric, 'Rubric'))

- if inputs is None or isinstance(inputs, str):
- return '\n\n'.join(sections) # type: ignore[arg-type]
- else:
- return sections
+ if expected_output is not None:
+ sections.extend(_make_section(expected_output, 'ExpectedOutput'))
+ if all(isinstance(section, str) for section in sections):
+ return '\n'.join(sections) # type: ignore[arg-type]
+ return sections
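The refactor above replaces the bespoke `<Input>` branching with a single `_make_section` helper, so the judge prompt wraps input, output, rubric, and expected output in uniform XML-style tags, with multimodal items passed through unchanged. A text-only illustration of the resulting layout (this is not the package's helper, just the string case):

```python
# Text-only illustration of the tagged-section layout used by the judge prompt.
def make_section(content: str, tag: str) -> list[str]:
    return [f'<{tag}>', content, f'</{tag}>']


sections = [
    *make_section('What is the capital of France?', 'Input'),
    *make_section('Paris', 'Output'),
    *make_section('The answer is factually correct.', 'Rubric'),
]
print('\n'.join(sections))  # when every section is a string, the prompt is joined with newlines
```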
@@ -14,6 +14,7 @@ from pydantic import ValidationError
  from typing_extensions import TypeVar

  from pydantic_ai import Agent, models
+ from pydantic_ai._utils import strip_markdown_fences
  from pydantic_evals import Dataset
  from pydantic_evals.evaluators.evaluator import Evaluator

@@ -73,8 +74,9 @@ async def generate_dataset(
  )

  result = await agent.run(extra_instructions or 'Please generate the object.')
+ output = strip_markdown_fences(result.output)
  try:
- result = dataset_type.from_text(result.output, fmt='json', custom_evaluator_types=custom_evaluator_types)
+ result = dataset_type.from_text(output, fmt='json', custom_evaluator_types=custom_evaluator_types)
  except ValidationError as e: # pragma: no cover
  print(f'Raw response from model:\n{result.output}')
  raise e
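`generate_dataset` now runs the model output through `strip_markdown_fences` (from `pydantic_ai._utils`) before parsing it as JSON, so a response wrapped in a Markdown code fence (e.g. a json-labelled fence) no longer fails validation. A rough standalone sketch of the idea, not pydantic-ai's implementation:

```python
# Rough, standalone sketch of the idea (not pydantic-ai's implementation): drop a single
# surrounding markdown code fence so the remaining text can be parsed as JSON.
import re


def strip_fences(text: str) -> str:
    match = re.match(r'^\s*```[\w-]*\n(.*)\n```\s*$', text, re.DOTALL)
    return match.group(1) if match else text


assert strip_fences('```json\n{"cases": []}\n```') == '{"cases": []}'
assert strip_fences('{"cases": []}') == '{"cases": []}'
```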
@@ -241,7 +241,7 @@ class SpanNode:

  return self._matches_query(query)

- def _matches_query(self, query: SpanQuery) -> bool: # noqa C901
+ def _matches_query(self, query: SpanQuery) -> bool: # noqa: C901
  """Check if the span matches the query conditions."""
  # Logical combinations
  if or_ := query.get('or_'):
@@ -433,8 +433,8 @@ class SpanTree:
  You can then search or iterate the tree to make your assertions (using DFS for traversal).
  """

- roots: list[SpanNode] = field(default_factory=list)
- nodes_by_id: dict[str, SpanNode] = field(default_factory=dict)
+ roots: list[SpanNode] = field(default_factory=list[SpanNode])
+ nodes_by_id: dict[str, SpanNode] = field(default_factory=dict[str, SpanNode])

  # -------------------------------------------------------------------------
  # Construction
@@ -7,8 +7,10 @@ from io import StringIO
  from typing import Any, Generic, Literal, Protocol, cast

  from pydantic import BaseModel, TypeAdapter
- from rich.console import Console
+ from rich.console import Console, Group, RenderableType
+ from rich.panel import Panel
  from rich.table import Table
+ from rich.text import Text
  from typing_extensions import TypedDict, TypeVar

  from pydantic_evals._utils import UNSET, Unset
@@ -53,11 +55,11 @@ class ReportCase(Generic[InputsT, OutputT, MetadataT]):
  name: str
  """The name of the [case][pydantic_evals.Case]."""
  inputs: InputsT
- """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
+ """The inputs to the task, from [`Case.inputs`][pydantic_evals.dataset.Case.inputs]."""
  metadata: MetadataT | None
- """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
+ """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.dataset.Case.metadata]."""
  expected_output: OutputT | None
- """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
+ """The expected output of the task, from [`Case.expected_output`][pydantic_evals.dataset.Case.expected_output]."""
  output: OutputT
  """The output of the task execution."""

@@ -76,7 +78,7 @@ class ReportCase(Generic[InputsT, OutputT, MetadataT]):
  span_id: str | None = None
  """The span ID of the case span."""

- evaluator_failures: list[EvaluatorFailure] = field(default_factory=list)
+ evaluator_failures: list[EvaluatorFailure] = field(default_factory=list[EvaluatorFailure])


  @dataclass(kw_only=True)
@@ -86,11 +88,11 @@ class ReportCaseFailure(Generic[InputsT, OutputT, MetadataT]):
  name: str
  """The name of the [case][pydantic_evals.Case]."""
  inputs: InputsT
- """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
+ """The inputs to the task, from [`Case.inputs`][pydantic_evals.dataset.Case.inputs]."""
  metadata: MetadataT | None
- """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
+ """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.dataset.Case.metadata]."""
  expected_output: OutputT | None
- """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
+ """The expected output of the task, from [`Case.expected_output`][pydantic_evals.dataset.Case.expected_output]."""

  error_message: str
  """The message of the exception that caused the failure."""
@@ -193,9 +195,13 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):

  cases: list[ReportCase[InputsT, OutputT, MetadataT]]
  """The cases in the report."""
- failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(default_factory=list)
+ failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(
+ default_factory=list[ReportCaseFailure[InputsT, OutputT, MetadataT]]
+ )
  """The failures in the report. These are cases where task execution raised an exception."""

+ experiment_metadata: dict[str, Any] | None = None
+ """Metadata associated with the specific experiment represented by this report."""
  trace_id: str | None = None
  """The trace ID of the evaluation."""
  span_id: str | None = None
@@ -206,11 +212,69 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
  return ReportCaseAggregate.average(self.cases)
  return None

+ def render(
+ self,
+ width: int | None = None,
+ baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
+ *,
+ include_input: bool = False,
+ include_metadata: bool = False,
+ include_expected_output: bool = False,
+ include_output: bool = False,
+ include_durations: bool = True,
+ include_total_duration: bool = False,
+ include_removed_cases: bool = False,
+ include_averages: bool = True,
+ include_errors: bool = True,
+ include_error_stacktrace: bool = False,
+ include_evaluator_failures: bool = True,
+ input_config: RenderValueConfig | None = None,
+ metadata_config: RenderValueConfig | None = None,
+ output_config: RenderValueConfig | None = None,
+ score_configs: dict[str, RenderNumberConfig] | None = None,
+ label_configs: dict[str, RenderValueConfig] | None = None,
+ metric_configs: dict[str, RenderNumberConfig] | None = None,
+ duration_config: RenderNumberConfig | None = None,
+ include_reasons: bool = False,
+ ) -> str:
+ """Render this report to a nicely-formatted string, optionally comparing it to a baseline report.
+
+ If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
+ """
+ io_file = StringIO()
+ console = Console(width=width, file=io_file)
+ self.print(
+ width=width,
+ baseline=baseline,
+ console=console,
+ include_input=include_input,
+ include_metadata=include_metadata,
+ include_expected_output=include_expected_output,
+ include_output=include_output,
+ include_durations=include_durations,
+ include_total_duration=include_total_duration,
+ include_removed_cases=include_removed_cases,
+ include_averages=include_averages,
+ include_errors=include_errors,
+ include_error_stacktrace=include_error_stacktrace,
+ include_evaluator_failures=include_evaluator_failures,
+ input_config=input_config,
+ metadata_config=metadata_config,
+ output_config=output_config,
+ score_configs=score_configs,
+ label_configs=label_configs,
+ metric_configs=metric_configs,
+ duration_config=duration_config,
+ include_reasons=include_reasons,
+ )
+ return io_file.getvalue()
+
  def print(
  self,
  width: int | None = None,
  baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
  *,
+ console: Console | None = None,
  include_input: bool = False,
  include_metadata: bool = False,
  include_expected_output: bool = False,
230
294
  metric_configs: dict[str, RenderNumberConfig] | None = None,
231
295
  duration_config: RenderNumberConfig | None = None,
232
296
  include_reasons: bool = False,
233
- ): # pragma: no cover
297
+ ) -> None:
234
298
  """Print this report to the console, optionally comparing it to a baseline report.
235
299
 
236
300
  If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
237
301
  """
238
- table = self.console_table(
302
+ if console is None: # pragma: no branch
303
+ console = Console(width=width)
304
+
305
+ metadata_panel = self._metadata_panel(baseline=baseline)
306
+ renderable: RenderableType = self.console_table(
239
307
  baseline=baseline,
240
308
  include_input=include_input,
241
309
  include_metadata=include_metadata,
@@ -254,10 +322,13 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
  metric_configs=metric_configs,
  duration_config=duration_config,
  include_reasons=include_reasons,
+ with_title=not metadata_panel,
  )
- console = Console(width=width)
- console.print(table)
- if include_errors and self.failures:
+ # Wrap table with experiment metadata panel if present
+ if metadata_panel:
+ renderable = Group(metadata_panel, renderable)
+ console.print(renderable)
+ if include_errors and self.failures: # pragma: no cover
  failures_table = self.failures_table(
  include_input=include_input,
  include_metadata=include_metadata,
@@ -269,6 +340,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
  )
  console.print(failures_table, style='red')

+ # TODO(DavidM): in v2, change the return type here to RenderableType
  def console_table(
  self,
  baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
@@ -290,9 +362,11 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
  metric_configs: dict[str, RenderNumberConfig] | None = None,
  duration_config: RenderNumberConfig | None = None,
  include_reasons: bool = False,
+ with_title: bool = True,
  ) -> Table:
- """Return a table containing the data from this report, or the diff between this report and a baseline report.
+ """Return a table containing the data from this report.

+ If a baseline is provided, returns a diff between this report and the baseline report.
  Optionally include input and output details.
  """
  renderer = EvaluationRenderer(
@@ -317,10 +391,82 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
  include_reasons=include_reasons,
  )
  if baseline is None:
- return renderer.build_table(self)
- else: # pragma: no cover
- return renderer.build_diff_table(self, baseline)
+ return renderer.build_table(self, with_title=with_title)
+ else:
+ return renderer.build_diff_table(self, baseline, with_title=with_title)
+
+ def _metadata_panel(
+ self, baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None
+ ) -> RenderableType | None:
+ """Wrap a table with an experiment metadata panel if metadata exists.
+
+ Args:
+ table: The table to wrap
+ baseline: Optional baseline report for diff metadata
+
+ Returns:
+ Either the table unchanged or a Group with Panel and Table
+ """
+ if baseline is None:
+ # Single report - show metadata if present
+ if self.experiment_metadata:
+ metadata_text = Text()
+ items = list(self.experiment_metadata.items())
+ for i, (key, value) in enumerate(items):
+ metadata_text.append(f'{key}: {value}', style='dim')
+ if i < len(items) - 1:
+ metadata_text.append('\n')
+ return Panel(
+ metadata_text,
+ title=f'Evaluation Summary: {self.name}',
+ title_align='left',
+ border_style='dim',
+ padding=(0, 1),
+ expand=False,
+ )
+ else:
+ # Diff report - show metadata diff if either has metadata
+ if self.experiment_metadata or baseline.experiment_metadata:
+ diff_name = baseline.name if baseline.name == self.name else f'{baseline.name} → {self.name}'
+ metadata_text = Text()
+ lines_styles: list[tuple[str, str]] = []
+ if baseline.experiment_metadata and self.experiment_metadata:
+ # Collect all keys from both
+ all_keys = sorted(set(baseline.experiment_metadata.keys()) | set(self.experiment_metadata.keys()))
+ for key in all_keys:
+ baseline_val = baseline.experiment_metadata.get(key)
+ report_val = self.experiment_metadata.get(key)
+ if baseline_val == report_val:
+ lines_styles.append((f'{key}: {report_val}', 'dim'))
+ elif baseline_val is None:
+ lines_styles.append((f'+ {key}: {report_val}', 'green'))
+ elif report_val is None:
+ lines_styles.append((f'- {key}: {baseline_val}', 'red'))
+ else:
+ lines_styles.append((f'{key}: {baseline_val} → {report_val}', 'yellow'))
+ elif self.experiment_metadata:
+ lines_styles = [(f'+ {k}: {v}', 'green') for k, v in self.experiment_metadata.items()]
+ else: # baseline.experiment_metadata only
+ assert baseline.experiment_metadata is not None
+ lines_styles = [(f'- {k}: {v}', 'red') for k, v in baseline.experiment_metadata.items()]
+
+ for i, (line, style) in enumerate(lines_styles):
+ metadata_text.append(line, style=style)
+ if i < len(lines_styles) - 1:
+ metadata_text.append('\n')
+
+ return Panel(
+ metadata_text,
+ title=f'Evaluation Diff: {diff_name}',
+ title_align='left',
+ border_style='dim',
+ padding=(0, 1),
+ expand=False,
+ )
+
+ return None

+ # TODO(DavidM): in v2, change the return type here to RenderableType
  def failures_table(
  self,
  *,
@@ -358,10 +504,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):

  def __str__(self) -> str: # pragma: lax no cover
  """Return a string representation of the report."""
- table = self.console_table()
- io_file = StringIO()
- Console(file=io_file).print(table)
- return io_file.getvalue()
+ return self.render()


  EvaluationReportAdapter = TypeAdapter(EvaluationReport[Any, Any, Any])
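`__str__` now delegates to the new `render()` method, which returns the same formatted summary that `print()` writes to the console; `print()` additionally accepts an injected `Console` and prefixes the table with an experiment-metadata panel when metadata is present. A usage sketch with a throwaway dataset (the task and metadata are placeholders):

```python
# Sketch of the new rendering APIs; the dataset, task, and metadata are placeholders.
from rich.console import Console

from pydantic_evals import Case, Dataset


async def add(pair: tuple[int, int]) -> int:
    return pair[0] + pair[1]


dataset = Dataset(cases=[Case(name='add', inputs=(1, 2), expected_output=3)])
report = dataset.evaluate_sync(add, metadata={'prompt_version': 'v3'})

text = report.render(width=120)           # same content print() writes, returned as a string
assert 'Evaluation Summary' in text

report.print(console=Console(width=120))  # print() now accepts an injected Console
# report.print(baseline=other_report)     # with a baseline, renders a diff plus a metadata diff panel
```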
@@ -647,6 +790,7 @@ class ReportCaseRenderer:
  metric_renderers: Mapping[str, _NumberRenderer]
  duration_renderer: _NumberRenderer

+ # TODO(DavidM): in v2, change the return type here to RenderableType
  def build_base_table(self, title: str) -> Table:
  """Build and return a Rich Table for the diff output."""
  table = Table(title=title, show_lines=True)
@@ -673,6 +817,7 @@ class ReportCaseRenderer:
  table.add_column('Durations' if self.include_total_duration else 'Duration', justify='right')
  return table

+ # TODO(DavidM): in v2, change the return type here to RenderableType
  def build_failures_table(self, title: str) -> Table:
  """Build and return a Rich Table for the failures output."""
  table = Table(title=title, show_lines=True)
@@ -1132,9 +1277,22 @@ class EvaluationRenderer:
  duration_renderer=duration_renderer,
  )

- def build_table(self, report: EvaluationReport) -> Table:
+ # TODO(DavidM): in v2, change the return type here to RenderableType
+ def build_table(self, report: EvaluationReport, *, with_title: bool = True) -> Table:
+ """Build a table for the report.
+
+ Args:
+ report: The evaluation report to render
+ with_title: Whether to include the title in the table (default True)
+
+ Returns:
+ A Rich Table object
+ """
  case_renderer = self._get_case_renderer(report)
- table = case_renderer.build_base_table(f'Evaluation Summary: {report.name}')
+
+ title = f'Evaluation Summary: {report.name}' if with_title else ''
+ table = case_renderer.build_base_table(title)
+
  for case in report.cases:
  table.add_row(*case_renderer.build_row(case))

@@ -1145,7 +1303,20 @@ class EvaluationRenderer:

  return table

- def build_diff_table(self, report: EvaluationReport, baseline: EvaluationReport) -> Table:
+ # TODO(DavidM): in v2, change the return type here to RenderableType
+ def build_diff_table(
+ self, report: EvaluationReport, baseline: EvaluationReport, *, with_title: bool = True
+ ) -> Table:
+ """Build a diff table comparing report to baseline.
+
+ Args:
+ report: The evaluation report to compare
+ baseline: The baseline report to compare against
+ with_title: Whether to include the title in the table (default True)
+
+ Returns:
+ A Rich Table object
+ """
  report_cases = report.cases
  baseline_cases = self._baseline_cases_to_include(report, baseline)

@@ -1170,7 +1341,10 @@ class EvaluationRenderer:

  case_renderer = self._get_case_renderer(report, baseline)
  diff_name = baseline.name if baseline.name == report.name else f'{baseline.name} → {report.name}'
- table = case_renderer.build_base_table(f'Evaluation Diff: {diff_name}')
+
+ title = f'Evaluation Diff: {diff_name}' if with_title else ''
+ table = case_renderer.build_base_table(title)
+
  for baseline_case, new_case in diff_cases:
  table.add_row(*case_renderer.build_diff_row(new_case, baseline_case))
  for case in added_cases:
@@ -1189,6 +1363,7 @@ class EvaluationRenderer:

  return table

+ # TODO(DavidM): in v2, change the return type here to RenderableType
  def build_failures_table(self, report: EvaluationReport) -> Table:
  case_renderer = self._get_case_renderer(report)
  table = case_renderer.build_failures_table('Case Failures')
@@ -1,19 +0,0 @@
- """A toolkit for evaluating the execution of arbitrary "stochastic functions", such as LLM calls.
-
- This package provides functionality for:
- - Creating and loading test datasets with structured inputs and outputs
- - Evaluating model performance using various metrics and evaluators
- - Generating reports for evaluation results
-
- TODO(DavidM): Implement serialization of reports for later comparison, and add git hashes etc.
- Note: I made pydantic_ai.evals.reports.EvalReport a BaseModel specifically to make this easier
- TODO(DavidM): Add commit hash, timestamp, and other metadata to reports (like pytest-speed does), possibly in a dedicated struct
- TODO(DavidM): Implement a CLI with some pytest-like filtering API to make it easier to run only specific cases
- """
-
- from .dataset import Case, Dataset
-
- __all__ = (
- 'Case',
- 'Dataset',
- )