pydantic-evals 0.8.0__tar.gz → 1.0.0b1__tar.gz

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of pydantic-evals might be problematic.

Files changed (24)
  1. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/PKG-INFO +3 -4
  2. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/_utils.py +2 -2
  3. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/dataset.py +147 -106
  4. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/evaluators/__init__.py +3 -1
  5. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/evaluators/_run_evaluator.py +47 -9
  6. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/evaluators/context.py +1 -1
  7. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/evaluators/evaluator.py +15 -4
  8. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/evaluators/llm_as_a_judge.py +3 -3
  9. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/evaluators/spec.py +3 -3
  10. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/otel/span_tree.py +5 -14
  11. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/reporting/__init__.py +214 -21
  12. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pyproject.toml +1 -2
  13. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/.gitignore +0 -0
  14. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/LICENSE +0 -0
  15. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/README.md +0 -0
  16. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/__init__.py +0 -0
  17. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/evaluators/common.py +0 -0
  18. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/generation.py +0 -0
  19. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/otel/__init__.py +0 -0
  20. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
  21. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/otel/_context_subtree.py +0 -0
  22. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/otel/_errors.py +0 -0
  23. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/py.typed +0 -0
  24. {pydantic_evals-0.8.0 → pydantic_evals-1.0.0b1}/pydantic_evals/reporting/render_numbers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pydantic-evals
3
- Version: 0.8.0
3
+ Version: 1.0.0b1
4
4
  Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
5
5
  Project-URL: Homepage, https://ai.pydantic.dev/evals
6
6
  Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -21,18 +21,17 @@ Classifier: Operating System :: Unix
21
21
  Classifier: Programming Language :: Python
22
22
  Classifier: Programming Language :: Python :: 3
23
23
  Classifier: Programming Language :: Python :: 3 :: Only
24
- Classifier: Programming Language :: Python :: 3.9
25
24
  Classifier: Programming Language :: Python :: 3.10
26
25
  Classifier: Programming Language :: Python :: 3.11
27
26
  Classifier: Programming Language :: Python :: 3.12
28
27
  Classifier: Programming Language :: Python :: 3.13
29
28
  Classifier: Topic :: Internet
30
29
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
31
- Requires-Python: >=3.9
30
+ Requires-Python: >=3.10
32
31
  Requires-Dist: anyio>=0
33
32
  Requires-Dist: eval-type-backport>=0; python_version < '3.11'
34
33
  Requires-Dist: logfire-api>=3.14.1
35
- Requires-Dist: pydantic-ai-slim==0.8.0
34
+ Requires-Dist: pydantic-ai-slim==1.0.0b1
36
35
  Requires-Dist: pydantic>=2.10
37
36
  Requires-Dist: pyyaml>=6.0.2
38
37
  Requires-Dist: rich>=13.9.4
@@ -2,9 +2,9 @@ from __future__ import annotations as _annotations
2
2
 
3
3
  import asyncio
4
4
  import inspect
5
- from collections.abc import Awaitable, Sequence
5
+ from collections.abc import Awaitable, Callable, Sequence
6
6
  from functools import partial
7
- from typing import Any, Callable, TypeVar
7
+ from typing import Any, TypeVar
8
8
 
9
9
  import anyio
10
10
  from typing_extensions import ParamSpec, TypeIs
@@ -13,14 +13,15 @@ import functools
13
13
  import inspect
14
14
  import sys
15
15
  import time
16
+ import traceback
16
17
  import warnings
17
- from collections.abc import Awaitable, Mapping, Sequence
18
+ from collections.abc import Awaitable, Callable, Mapping, Sequence
18
19
  from contextlib import AsyncExitStack, nullcontext
19
20
  from contextvars import ContextVar
20
21
  from dataclasses import dataclass, field
21
22
  from inspect import iscoroutinefunction
22
23
  from pathlib import Path
23
- from typing import Any, Callable, Generic, Literal, Union, cast
24
+ from typing import TYPE_CHECKING, Any, Generic, Literal, Union, cast
24
25
 
25
26
  import anyio
26
27
  import logfire_api
@@ -40,26 +41,20 @@ from .evaluators import EvaluationResult, Evaluator
40
41
  from .evaluators._run_evaluator import run_evaluator
41
42
  from .evaluators.common import DEFAULT_EVALUATORS
42
43
  from .evaluators.context import EvaluatorContext
44
+ from .evaluators.evaluator import EvaluatorFailure
43
45
  from .evaluators.spec import EvaluatorSpec
44
46
  from .otel import SpanTree
45
47
  from .otel._context_subtree import context_subtree
46
- from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate
48
+ from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate, ReportCaseFailure
49
+
50
+ if TYPE_CHECKING:
51
+ from pydantic_ai.retries import RetryConfig
47
52
 
48
53
  if sys.version_info < (3, 11):
49
54
  from exceptiongroup import ExceptionGroup # pragma: lax no cover
50
55
  else:
51
56
  ExceptionGroup = ExceptionGroup # pragma: lax no cover
52
57
 
53
- # while waiting for https://github.com/pydantic/logfire/issues/745
54
- try:
55
- import logfire._internal.stack_info
56
- except ImportError:
57
- pass
58
- else:
59
- from pathlib import Path
60
-
61
- logfire._internal.stack_info.NON_USER_CODE_PREFIXES += (str(Path(__file__).parent.absolute()),) # pyright: ignore[reportPrivateImportUsage]
62
-
63
58
  __all__ = (
64
59
  'Case',
65
60
  'Dataset',
@@ -84,6 +79,7 @@ _YAML_SCHEMA_LINE_PREFIX = '# yaml-language-server: $schema='
84
79
 
85
80
 
86
81
  _REPORT_CASES_ADAPTER = TypeAdapter(list[ReportCase])
82
+ _REPORT_CASE_FAILURES_ADAPTER = TypeAdapter(list[ReportCaseFailure])
87
83
  _REPORT_CASE_AGGREGATE_ADAPTER = TypeAdapter(ReportCaseAggregate)
88
84
 
89
85
 
@@ -171,11 +167,6 @@ class Case(Generic[InputsT, OutputT, MetadataT]):
171
167
  self.evaluators = list(evaluators)
172
168
 
173
169
 
174
- # TODO: Consider making one or more of the following changes to this type:
175
- # * Add `task: Callable[[InputsT], Awaitable[OutputT]` as a field
176
- # * Add `inputs_type`, `output_type`, etc. as kwargs on `__init__`
177
- # * Rename to `Evaluation`
178
- # TODO: Allow `task` to be sync _or_ async
179
170
  class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', arbitrary_types_allowed=True):
180
171
  """A dataset of test [cases][pydantic_evals.Case].
181
172
 
@@ -263,6 +254,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
263
254
  name: str | None = None,
264
255
  max_concurrency: int | None = None,
265
256
  progress: bool = True,
257
+ retry_task: RetryConfig | None = None,
258
+ retry_evaluators: RetryConfig | None = None,
266
259
  ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
267
260
  """Evaluates the test cases in the dataset using the given task.
268
261
 
@@ -277,6 +270,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
277
270
  max_concurrency: The maximum number of concurrent evaluations of the task to allow.
278
271
  If None, all cases will be evaluated concurrently.
279
272
  progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
273
+ retry_task: Optional retry configuration for the task execution.
274
+ retry_evaluators: Optional retry configuration for evaluator execution.
280
275
 
281
276
  Returns:
282
277
  A report containing the results of the evaluation.
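
A rough usage sketch for the new retry arguments (not taken from this diff; the task, cases, and config values are invented). Judging by the implementation further down, both configs are unpacked into tenacity's retry() decorator via pydantic_ai.retries, so tenacity-style keyword arguments such as stop and wait should be accepted, and tenacity must be installed for them to take effect.

import asyncio

from tenacity import stop_after_attempt, wait_exponential

from pydantic_evals import Case, Dataset


async def answer(question: str) -> str:
    # invented task; imagine it is flaky enough to benefit from retries
    return 'Paris' if 'capital of France' in question else 'unknown'


dataset = Dataset(
    cases=[Case(name='capital', inputs='What is the capital of France?', expected_output='Paris')],
)


async def main() -> None:
    report = await dataset.evaluate(
        answer,
        retry_task={'stop': stop_after_attempt(3), 'wait': wait_exponential()},  # assumed tenacity kwargs
        retry_evaluators={'stop': stop_after_attempt(2)},
    )
    report.print()


asyncio.run(main())
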
@@ -287,12 +282,17 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
287
282
 
288
283
  limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()
289
284
 
290
- with _logfire.span('evaluate {name}', name=name) as eval_span, progress_bar or nullcontext():
285
+ with (
286
+ _logfire.span('evaluate {name}', name=name, n_cases=len(self.cases)) as eval_span,
287
+ progress_bar or nullcontext(),
288
+ ):
291
289
  task_id = progress_bar.add_task(f'Evaluating {name}', total=total_cases) if progress_bar else None
292
290
 
293
291
  async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name: str):
294
292
  async with limiter:
295
- result = await _run_task_and_evaluators(task, case, report_case_name, self.evaluators)
293
+ result = await _run_task_and_evaluators(
294
+ task, case, report_case_name, self.evaluators, retry_task, retry_evaluators
295
+ )
296
296
  if progress_bar and task_id is not None: # pragma: no branch
297
297
  progress_bar.update(task_id, advance=1)
298
298
  return result
@@ -303,21 +303,28 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
303
303
  else:
304
304
  trace_id = f'{context.trace_id:032x}'
305
305
  span_id = f'{context.span_id:016x}'
306
+ cases_and_failures = await task_group_gather(
307
+ [
308
+ lambda case=case, i=i: _handle_case(case, case.name or f'Case {i}')
309
+ for i, case in enumerate(self.cases, 1)
310
+ ]
311
+ )
312
+ cases: list[ReportCase] = []
313
+ failures: list[ReportCaseFailure] = []
314
+ for item in cases_and_failures:
315
+ if isinstance(item, ReportCase):
316
+ cases.append(item)
317
+ else:
318
+ failures.append(item)
306
319
  report = EvaluationReport(
307
320
  name=name,
308
- cases=await task_group_gather(
309
- [
310
- lambda case=case, i=i: _handle_case(case, case.name or f'Case {i}')
311
- for i, case in enumerate(self.cases, 1)
312
- ]
313
- ),
321
+ cases=cases,
322
+ failures=failures,
314
323
  span_id=span_id,
315
324
  trace_id=trace_id,
316
325
  )
317
- # TODO(DavidM): This attribute will be too big in general; remove it once we can use child spans in details panel:
318
- eval_span.set_attribute('cases', _REPORT_CASES_ADAPTER.dump_python(report.cases))
319
- # TODO(DavidM): Remove this 'averages' attribute once we compute it in the details panel
320
- eval_span.set_attribute('averages', _REPORT_CASE_AGGREGATE_ADAPTER.dump_python(report.averages()))
326
+ if (averages := report.averages()) is not None and averages.assertions is not None:
327
+ eval_span.set_attribute('assertion_pass_rate', averages.assertions)
321
328
  return report
322
329
 
323
330
  def evaluate_sync(
@@ -644,7 +651,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
644
651
  def _make_typed_dict(cls_name_prefix: str, fields: dict[str, Any]) -> Any:
645
652
  td = TypedDict(f'{cls_name_prefix}_{name}', fields) # pyright: ignore[reportArgumentType]
646
653
  config = ConfigDict(extra='forbid', arbitrary_types_allowed=True)
647
- # TODO: Replace with pydantic.with_config after pydantic 2.11 is released
654
+ # TODO: Replace with pydantic.with_config once pydantic 2.11 is the min supported version
648
655
  td.__pydantic_config__ = config # pyright: ignore[reportAttributeAccessIssue]
649
656
  return td
650
657
 
@@ -745,7 +752,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
745
752
  See <https://github.com/json-schema-org/json-schema-spec/issues/828> for context, that seems to be the nearest
746
753
  there is to a spec for this.
747
754
  """
748
- context = cast(Union[dict[str, Any], None], info.context)
755
+ context = cast(dict[str, Any] | None, info.context)
749
756
  if isinstance(context, dict) and (schema := context.get('$schema')):
750
757
  return {'$schema': schema} | nxt(self)
751
758
  else:
@@ -825,13 +832,16 @@ class _TaskRun:
825
832
 
826
833
 
827
834
  async def _run_task(
828
- task: Callable[[InputsT], Awaitable[OutputT] | OutputT], case: Case[InputsT, OutputT, MetadataT]
835
+ task: Callable[[InputsT], Awaitable[OutputT] | OutputT],
836
+ case: Case[InputsT, OutputT, MetadataT],
837
+ retry: RetryConfig | None = None,
829
838
  ) -> EvaluatorContext[InputsT, OutputT, MetadataT]:
830
839
  """Run a task on a case and return the context for evaluators.
831
840
 
832
841
  Args:
833
842
  task: The task to run.
834
843
  case: The case to run the task on.
844
+ retry: The retry config to use.
835
845
 
836
846
  Returns:
837
847
  An EvaluatorContext containing the inputs, actual output, expected output, and metadata.
@@ -839,38 +849,48 @@ async def _run_task(
839
849
  Raises:
840
850
  Exception: Any exception raised by the task.
841
851
  """
842
- task_run = _TaskRun()
843
- if _CURRENT_TASK_RUN.get() is not None: # pragma: no cover
844
- raise RuntimeError('A task run has already been entered. Task runs should not be nested')
845
852
 
846
- # Note: the current behavior is for task execution errors to just bubble up all the way and kill the evaluation.
847
- # Should we handle them for the user in some way? If so, I guess we'd want to do that here.
848
- token = _CURRENT_TASK_RUN.set(task_run)
849
- try:
850
- with _logfire.span('execute {task}', task=get_unwrapped_function_name(task)) as task_span:
851
- with context_subtree() as span_tree:
853
+ async def _run_once():
854
+ task_run_ = _TaskRun()
855
+ if _CURRENT_TASK_RUN.get() is not None: # pragma: no cover
856
+ raise RuntimeError('A task run has already been entered. Task runs should not be nested')
857
+
858
+ token = _CURRENT_TASK_RUN.set(task_run_)
859
+ try:
860
+ with (
861
+ _logfire.span('execute {task}', task=get_unwrapped_function_name(task)) as task_span,
862
+ context_subtree() as span_tree_,
863
+ ):
852
864
  t0 = time.perf_counter()
853
865
  if iscoroutinefunction(task):
854
- task_output = cast(OutputT, await task(case.inputs))
866
+ task_output_ = cast(OutputT, await task(case.inputs))
855
867
  else:
856
- task_output = cast(OutputT, await to_thread.run_sync(task, case.inputs))
868
+ task_output_ = cast(OutputT, await to_thread.run_sync(task, case.inputs))
857
869
  fallback_duration = time.perf_counter() - t0
858
- finally:
859
- _CURRENT_TASK_RUN.reset(token)
870
+ duration_ = _get_span_duration(task_span, fallback_duration)
871
+ return task_run_, task_output_, duration_, span_tree_
872
+ finally:
873
+ _CURRENT_TASK_RUN.reset(token)
874
+
875
+ if retry:
876
+ # import from pydantic_ai.retries to trigger more descriptive import error if tenacity is missing
877
+ from pydantic_ai.retries import retry as tenacity_retry
878
+
879
+ _run_once = tenacity_retry(**retry)(_run_once)
880
+
881
+ task_run, task_output, duration, span_tree = await _run_once()
860
882
 
861
883
  if isinstance(span_tree, SpanTree): # pragma: no branch
862
- # TODO: Question: Should we make this metric-attributes functionality more user-configurable in some way before merging?
863
- # Note: the use of otel for collecting these metrics is the main reason why I think we should require at least otel as a dependency, if not logfire;
864
- # otherwise, we don't have a great way to get usage data from arbitrary frameworks.
865
- # Ideally we wouldn't need to hard-code the specific logic here, but I'm not sure a great way to expose it to
866
- # users. Maybe via an argument of type Callable[[SpanTree], dict[str, int | float]] or similar?
884
+ # Idea for making this more configurable: replace the following logic with a call to a user-provided function
885
+ # of type Callable[[_TaskRun, SpanTree], None] or similar, (maybe no _TaskRun and just use the public APIs).
886
+ # That way users can customize this logic. We'd default to a function that does the current thing but also
887
+ # allow `None` to disable it entirely.
867
888
  for node in span_tree:
868
889
  if node.attributes.get('gen_ai.operation.name') == 'chat':
869
890
  task_run.increment_metric('requests', 1)
870
891
  for k, v in node.attributes.items():
871
- if not isinstance(v, (int, float)):
892
+ if not isinstance(v, int | float):
872
893
  continue
873
- # TODO: Revisit this choice to strip the prefix..
874
894
  if k.startswith('gen_ai.usage.details.'):
875
895
  task_run.increment_metric(k.removeprefix('gen_ai.usage.details.'), v)
876
896
  elif k.startswith('gen_ai.usage.'):
@@ -882,7 +902,7 @@ async def _run_task(
882
902
  metadata=case.metadata,
883
903
  expected_output=case.expected_output,
884
904
  output=task_output,
885
- duration=_get_span_duration(task_span, fallback_duration),
905
+ duration=duration,
886
906
  _span_tree=span_tree,
887
907
  attributes=task_run.attributes,
888
908
  metrics=task_run.metrics,
@@ -894,7 +914,9 @@ async def _run_task_and_evaluators(
894
914
  case: Case[InputsT, OutputT, MetadataT],
895
915
  report_case_name: str,
896
916
  dataset_evaluators: list[Evaluator[InputsT, OutputT, MetadataT]],
897
- ) -> ReportCase[InputsT, OutputT, MetadataT]:
917
+ retry_task: RetryConfig | None,
918
+ retry_evaluators: RetryConfig | None,
919
+ ) -> ReportCase[InputsT, OutputT, MetadataT] | ReportCaseFailure[InputsT, OutputT, MetadataT]:
898
920
  """Run a task on a case and evaluate the results.
899
921
 
900
922
  Args:
@@ -902,64 +924,83 @@ async def _run_task_and_evaluators(
902
924
  case: The case to run the task on.
903
925
  report_case_name: The name to use for this case in the report.
904
926
  dataset_evaluators: Evaluators from the dataset to apply to this case.
927
+ retry_task: The retry config to use for running the task.
928
+ retry_evaluators: The retry config to use for running the evaluators.
905
929
 
906
930
  Returns:
907
931
  A ReportCase containing the evaluation results.
908
932
  """
909
- with _logfire.span(
910
- 'case: {case_name}',
911
- task_name=get_unwrapped_function_name(task),
912
- case_name=report_case_name,
913
- inputs=case.inputs,
914
- metadata=case.metadata,
915
- expected_output=case.expected_output,
916
- ) as case_span:
917
- t0 = time.time()
918
- scoring_context = await _run_task(task, case)
919
-
920
- case_span.set_attribute('output', scoring_context.output)
921
- case_span.set_attribute('task_duration', scoring_context.duration)
922
- case_span.set_attribute('metrics', scoring_context.metrics)
923
- case_span.set_attribute('attributes', scoring_context.attributes)
924
-
925
- evaluators = case.evaluators + dataset_evaluators
926
- evaluator_outputs: list[EvaluationResult] = []
927
- if evaluators:
928
- evaluator_outputs_by_task = await task_group_gather(
929
- [lambda ev=ev: run_evaluator(ev, scoring_context) for ev in evaluators]
930
- )
931
- evaluator_outputs += [out for outputs in evaluator_outputs_by_task for out in outputs]
932
-
933
- assertions, scores, labels = _group_evaluator_outputs_by_type(evaluator_outputs)
934
- case_span.set_attribute('assertions', _evaluation_results_adapter.dump_python(assertions))
935
- case_span.set_attribute('scores', _evaluation_results_adapter.dump_python(scores))
936
- case_span.set_attribute('labels', _evaluation_results_adapter.dump_python(labels))
933
+ trace_id: str | None = None
934
+ span_id: str | None = None
935
+ try:
936
+ with _logfire.span(
937
+ 'case: {case_name}',
938
+ task_name=get_unwrapped_function_name(task),
939
+ case_name=report_case_name,
940
+ inputs=case.inputs,
941
+ metadata=case.metadata,
942
+ expected_output=case.expected_output,
943
+ ) as case_span:
944
+ context = case_span.context
945
+ if context is not None: # pragma: no branch
946
+ trace_id = f'{context.trace_id:032x}'
947
+ span_id = f'{context.span_id:016x}'
937
948
 
938
- context = case_span.context
939
- if context is None: # pragma: no cover
940
- trace_id = None
941
- span_id = None
942
- else:
943
- trace_id = f'{context.trace_id:032x}'
944
- span_id = f'{context.span_id:016x}'
949
+ t0 = time.time()
950
+ scoring_context = await _run_task(task, case, retry_task)
951
+
952
+ case_span.set_attribute('output', scoring_context.output)
953
+ case_span.set_attribute('task_duration', scoring_context.duration)
954
+ case_span.set_attribute('metrics', scoring_context.metrics)
955
+ case_span.set_attribute('attributes', scoring_context.attributes)
956
+
957
+ evaluators = case.evaluators + dataset_evaluators
958
+ evaluator_outputs: list[EvaluationResult] = []
959
+ evaluator_failures: list[EvaluatorFailure] = []
960
+ if evaluators:
961
+ evaluator_outputs_by_task = await task_group_gather(
962
+ [lambda ev=ev: run_evaluator(ev, scoring_context, retry_evaluators) for ev in evaluators]
963
+ )
964
+ for outputs in evaluator_outputs_by_task:
965
+ if isinstance(outputs, EvaluatorFailure):
966
+ evaluator_failures.append(outputs)
967
+ else:
968
+ evaluator_outputs.extend(outputs)
969
+
970
+ assertions, scores, labels = _group_evaluator_outputs_by_type(evaluator_outputs)
971
+ case_span.set_attribute('assertions', _evaluation_results_adapter.dump_python(assertions))
972
+ case_span.set_attribute('scores', _evaluation_results_adapter.dump_python(scores))
973
+ case_span.set_attribute('labels', _evaluation_results_adapter.dump_python(labels))
945
974
  fallback_duration = time.time() - t0
946
975
 
947
- return ReportCase[InputsT, OutputT, MetadataT](
948
- name=report_case_name,
949
- inputs=case.inputs,
950
- metadata=case.metadata,
951
- expected_output=case.expected_output,
952
- output=scoring_context.output,
953
- metrics=scoring_context.metrics,
954
- attributes=scoring_context.attributes,
955
- scores=scores,
956
- labels=labels,
957
- assertions=assertions,
958
- task_duration=scoring_context.duration,
959
- total_duration=_get_span_duration(case_span, fallback_duration),
960
- trace_id=trace_id,
961
- span_id=span_id,
962
- )
976
+ return ReportCase[InputsT, OutputT, MetadataT](
977
+ name=report_case_name,
978
+ inputs=case.inputs,
979
+ metadata=case.metadata,
980
+ expected_output=case.expected_output,
981
+ output=scoring_context.output,
982
+ metrics=scoring_context.metrics,
983
+ attributes=scoring_context.attributes,
984
+ scores=scores,
985
+ labels=labels,
986
+ assertions=assertions,
987
+ task_duration=scoring_context.duration,
988
+ total_duration=_get_span_duration(case_span, fallback_duration),
989
+ trace_id=trace_id,
990
+ span_id=span_id,
991
+ evaluator_failures=evaluator_failures,
992
+ )
993
+ except Exception as exc:
994
+ return ReportCaseFailure[InputsT, OutputT, MetadataT](
995
+ name=report_case_name,
996
+ inputs=case.inputs,
997
+ metadata=case.metadata,
998
+ expected_output=case.expected_output,
999
+ error_message=f'{type(exc).__name__}: {exc}',
1000
+ error_stacktrace=traceback.format_exc(),
1001
+ trace_id=trace_id,
1002
+ span_id=span_id,
1003
+ )
963
1004
 
964
1005
 
965
1006
  _evaluation_results_adapter = TypeAdapter(Mapping[str, EvaluationResult])
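
A sketch of the resulting behavior (task and case invented; behavior inferred from the hunk above): an exception raised by the task no longer aborts evaluate(); the case is recorded as a ReportCaseFailure under report.failures, and averages() returns None when no cases succeeded.

from pydantic_evals import Case, Dataset


def flaky_task(inputs: str) -> str:
    # invented task that always fails, to show how failures are captured
    raise ValueError(f'cannot handle {inputs!r}')


report = Dataset(cases=[Case(name='bad', inputs='x')]).evaluate_sync(flaky_task)

assert report.cases == [] and len(report.failures) == 1
print(report.failures[0].error_message)        # ValueError: cannot handle 'x'
assert report.averages() is None               # nothing succeeded, so nothing to aggregate
report.print(include_error_stacktrace=True)    # appends a red 'Case Failures' table
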
@@ -10,7 +10,7 @@ from .common import (
10
10
  Python,
11
11
  )
12
12
  from .context import EvaluatorContext
13
- from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorOutput, EvaluatorSpec
13
+ from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorFailure, EvaluatorOutput, EvaluatorSpec
14
14
 
15
15
  __all__ = (
16
16
  # common
@@ -27,6 +27,8 @@ __all__ = (
27
27
  'EvaluatorContext',
28
28
  # evaluator
29
29
  'Evaluator',
30
+ 'EvaluationReason',
31
+ 'EvaluatorFailure',
30
32
  'EvaluatorOutput',
31
33
  'EvaluatorSpec',
32
34
  'EvaluationReason',
@@ -1,8 +1,11 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import traceback
3
4
  from collections.abc import Mapping
4
- from typing import Any
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Any
5
7
 
8
+ import logfire_api
6
9
  from pydantic import (
7
10
  TypeAdapter,
8
11
  ValidationError,
@@ -10,7 +13,20 @@ from pydantic import (
10
13
  from typing_extensions import TypeVar
11
14
 
12
15
  from .context import EvaluatorContext
13
- from .evaluator import EvaluationReason, EvaluationResult, EvaluationScalar, Evaluator, EvaluatorOutput
16
+ from .evaluator import (
17
+ EvaluationReason,
18
+ EvaluationResult,
19
+ EvaluationScalar,
20
+ Evaluator,
21
+ EvaluatorFailure,
22
+ EvaluatorOutput,
23
+ )
24
+
25
+ if TYPE_CHECKING:
26
+ from pydantic_ai.retries import RetryConfig
27
+
28
+ _logfire = logfire_api.Logfire(otel_scope='pydantic-evals')
29
+ logfire_api.add_non_user_code_prefix(Path(__file__).parent.absolute())
14
30
 
15
31
  InputsT = TypeVar('InputsT', default=Any, contravariant=True)
16
32
  OutputT = TypeVar('OutputT', default=Any, contravariant=True)
@@ -18,8 +34,10 @@ MetadataT = TypeVar('MetadataT', default=Any, contravariant=True)
18
34
 
19
35
 
20
36
  async def run_evaluator(
21
- evaluator: Evaluator[InputsT, OutputT, MetadataT], ctx: EvaluatorContext[InputsT, OutputT, MetadataT]
22
- ) -> list[EvaluationResult]:
37
+ evaluator: Evaluator[InputsT, OutputT, MetadataT],
38
+ ctx: EvaluatorContext[InputsT, OutputT, MetadataT],
39
+ retry: RetryConfig | None = None,
40
+ ) -> list[EvaluationResult] | EvaluatorFailure:
23
41
  """Run an evaluator and return the results.
24
42
 
25
43
  This function runs an evaluator on the given context and processes the results into
@@ -28,19 +46,39 @@ async def run_evaluator(
28
46
  Args:
29
47
  evaluator: The evaluator to run.
30
48
  ctx: The context containing the inputs, outputs, and metadata for evaluation.
49
+ retry: The retry configuration to use for running the evaluator.
31
50
 
32
51
  Returns:
33
- A list of evaluation results.
52
+ A list of evaluation results, or an evaluator failure if an exception is raised during its execution.
34
53
 
35
54
  Raises:
36
55
  ValueError: If the evaluator returns a value of an invalid type.
37
56
  """
38
- raw_results = await evaluator.evaluate_async(ctx)
57
+ evaluate = evaluator.evaluate_async
58
+ if retry is not None:
59
+ # import from pydantic_ai.retries to trigger more descriptive import error if tenacity is missing
60
+ from pydantic_ai.retries import retry as tenacity_retry
61
+
62
+ evaluate = tenacity_retry(**retry)(evaluate)
39
63
 
40
64
  try:
41
- results = _EVALUATOR_OUTPUT_ADAPTER.validate_python(raw_results)
42
- except ValidationError as e:
43
- raise ValueError(f'{evaluator!r}.evaluate returned a value of an invalid type: {raw_results!r}.') from e
65
+ with _logfire.span(
66
+ 'evaluator: {evaluator_name}',
67
+ evaluator_name=evaluator.get_default_evaluation_name(),
68
+ ):
69
+ raw_results = await evaluate(ctx)
70
+
71
+ try:
72
+ results = _EVALUATOR_OUTPUT_ADAPTER.validate_python(raw_results)
73
+ except ValidationError as e:
74
+ raise ValueError(f'{evaluator!r}.evaluate returned a value of an invalid type: {raw_results!r}.') from e
75
+ except Exception as e:
76
+ return EvaluatorFailure(
77
+ name=evaluator.get_default_evaluation_name(),
78
+ error_message=f'{type(e).__name__}: {e}',
79
+ error_stacktrace=traceback.format_exc(),
80
+ source=evaluator.as_spec(),
81
+ )
44
82
 
45
83
  results = _convert_to_mapping(results, scalar_name=evaluator.get_default_evaluation_name())
46
84
 
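
A minimal sketch of the new failure handling from the caller's perspective (the AlwaysBroken evaluator is invented for illustration): an evaluator that raises no longer kills the run; its error comes back as an EvaluatorFailure and ends up on the report case.

from dataclasses import dataclass

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import Evaluator, EvaluatorContext


@dataclass
class AlwaysBroken(Evaluator):
    # invented evaluator that always raises
    def evaluate(self, ctx: EvaluatorContext) -> bool:
        raise RuntimeError('judge unavailable')


dataset = Dataset(
    cases=[Case(name='one', inputs=1, expected_output=1)],
    evaluators=[AlwaysBroken()],
)
report = dataset.evaluate_sync(lambda x: x)

case = report.cases[0]                  # the case itself still completes
failure = case.evaluator_failures[0]    # ...but the evaluator error is recorded
print(failure.name, failure.error_message)  # e.g. AlwaysBroken RuntimeError: judge unavailable
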
@@ -27,7 +27,7 @@ MetadataT = TypeVar('MetadataT', default=Any, covariant=True)
27
27
  """Type variable for the metadata associated with the task being evaluated."""
28
28
 
29
29
 
30
- @dataclass
30
+ @dataclass(kw_only=True)
31
31
  class EvaluatorContext(Generic[InputsT, OutputT, MetadataT]):
32
32
  """Context for evaluating a task execution.
33
33
 
@@ -4,7 +4,7 @@ import inspect
4
4
  from abc import ABCMeta, abstractmethod
5
5
  from collections.abc import Awaitable, Mapping
6
6
  from dataclasses import MISSING, dataclass, fields
7
- from typing import Any, Generic, Union, cast
7
+ from typing import Any, Generic, cast
8
8
 
9
9
  from pydantic import (
10
10
  ConfigDict,
@@ -25,11 +25,12 @@ __all__ = (
25
25
  'EvaluationResult',
26
26
  'EvaluationScalar',
27
27
  'Evaluator',
28
+ 'EvaluatorFailure',
28
29
  'EvaluatorOutput',
29
30
  'EvaluatorSpec',
30
31
  )
31
32
 
32
- EvaluationScalar = Union[bool, int, float, str]
33
+ EvaluationScalar = bool | int | float | str
33
34
  """The most primitive output allowed as an output from an Evaluator.
34
35
 
35
36
  `int` and `float` are treated as scores, `str` as labels, and `bool` as assertions.
@@ -51,11 +52,11 @@ class EvaluationReason:
51
52
  reason: str | None = None
52
53
 
53
54
 
54
- EvaluatorOutput = Union[EvaluationScalar, EvaluationReason, Mapping[str, Union[EvaluationScalar, EvaluationReason]]]
55
+ EvaluatorOutput = EvaluationScalar | EvaluationReason | Mapping[str, EvaluationScalar | EvaluationReason]
55
56
  """Type for the output of an evaluator, which can be a scalar, an EvaluationReason, or a mapping of names to either."""
56
57
 
57
58
 
58
- # TODO(DavidM): Add bound=EvaluationScalar to the following typevar after we upgrade to pydantic 2.11
59
+ # TODO(DavidM): Add bound=EvaluationScalar to the following typevar once pydantic 2.11 is the min supported version
59
60
  EvaluationScalarT = TypeVar('EvaluationScalarT', default=EvaluationScalar, covariant=True)
60
61
  """Type variable for the scalar result type of an evaluation."""
61
62
 
@@ -100,6 +101,16 @@ class EvaluationResult(Generic[EvaluationScalarT]):
100
101
  return None
101
102
 
102
103
 
104
+ @dataclass
105
+ class EvaluatorFailure:
106
+ """Represents a failure raised during the execution of an evaluator."""
107
+
108
+ name: str
109
+ error_message: str
110
+ error_stacktrace: str
111
+ source: EvaluatorSpec
112
+
113
+
103
114
  # Evaluators are contravariant in all of its parameters.
104
115
  InputsT = TypeVar('InputsT', default=Any, contravariant=True)
105
116
  """Type variable for the inputs type of the task being evaluated."""
@@ -8,7 +8,7 @@ from pydantic import BaseModel, Field
8
8
  from pydantic_core import to_json
9
9
 
10
10
  from pydantic_ai import Agent, models
11
- from pydantic_ai.messages import MultiModalContentTypes, UserContent
11
+ from pydantic_ai.messages import MultiModalContent, UserContent
12
12
  from pydantic_ai.settings import ModelSettings
13
13
 
14
14
  __all__ = (
@@ -238,11 +238,11 @@ def _build_prompt(
238
238
  sections.append('<Input>\n')
239
239
  if isinstance(inputs, Sequence):
240
240
  for item in inputs: # type: ignore
241
- if isinstance(item, (str, MultiModalContentTypes)):
241
+ if isinstance(item, str | MultiModalContent):
242
242
  sections.append(item)
243
243
  else:
244
244
  sections.append(_stringify(item))
245
- elif isinstance(inputs, MultiModalContentTypes):
245
+ elif isinstance(inputs, MultiModalContent):
246
246
  sections.append(inputs)
247
247
  else:
248
248
  sections.append(_stringify(inputs))
@@ -2,7 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from typing import TYPE_CHECKING, Any, Union, cast
5
+ from typing import TYPE_CHECKING, Any, cast
6
6
 
7
7
  from pydantic import (
8
8
  BaseModel,
@@ -17,7 +17,7 @@ from pydantic_core.core_schema import SerializationInfo, SerializerFunctionWrapH
17
17
  if TYPE_CHECKING:
18
18
  # This import seems to fail on Pydantic 2.10.1 in CI
19
19
  from pydantic import ModelWrapValidatorHandler
20
- # TODO: Try removing this when we update to pydantic 2.11
20
+ # TODO: Remove this once pydantic 2.11 is the min supported version
21
21
 
22
22
 
23
23
  class EvaluatorSpec(BaseModel):
@@ -112,7 +112,7 @@ class EvaluatorSpec(BaseModel):
112
112
  return handler(self)
113
113
 
114
114
 
115
- class _SerializedEvaluatorSpec(RootModel[Union[str, dict[str, Any]]]):
115
+ class _SerializedEvaluatorSpec(RootModel[str | dict[str, Any]]):
116
116
  """Internal class for handling the serialized form of an EvaluatorSpec.
117
117
 
118
118
  This is an auxiliary class used to serialize/deserialize instances of EvaluatorSpec
@@ -1,12 +1,12 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import re
4
- from collections.abc import Iterator, Sequence
4
+ from collections.abc import Callable, Iterator, Sequence
5
5
  from dataclasses import dataclass, field
6
6
  from datetime import datetime, timedelta, timezone
7
7
  from functools import cache
8
8
  from textwrap import indent
9
- from typing import TYPE_CHECKING, Any, Callable, Union
9
+ from typing import TYPE_CHECKING, Any
10
10
 
11
11
  from pydantic import TypeAdapter
12
12
  from typing_extensions import TypedDict
@@ -16,16 +16,7 @@ if TYPE_CHECKING: # pragma: no cover
16
16
  from opentelemetry.sdk.trace import ReadableSpan
17
17
 
18
18
  # Should match opentelemetry.util.types.AttributeValue
19
- AttributeValue = Union[
20
- str,
21
- bool,
22
- int,
23
- float,
24
- Sequence[str],
25
- Sequence[bool],
26
- Sequence[int],
27
- Sequence[float],
28
- ]
19
+ AttributeValue = str | bool | int | float | Sequence[str] | Sequence[bool] | Sequence[int] | Sequence[float]
29
20
 
30
21
 
31
22
  __all__ = 'SpanNode', 'SpanTree', 'SpanQuery'
@@ -87,7 +78,7 @@ class SpanQuery(TypedDict, total=False):
87
78
  no_ancestor_has: SpanQuery
88
79
 
89
80
 
90
- @dataclass(repr=False)
81
+ @dataclass(repr=False, kw_only=True)
91
82
  class SpanNode:
92
83
  """A node in the span tree; provides references to parents/children for easy traversal and queries."""
93
84
 
@@ -435,7 +426,7 @@ class SpanNode:
435
426
  SpanPredicate = Callable[[SpanNode], bool]
436
427
 
437
428
 
438
- @dataclass(repr=False)
429
+ @dataclass(repr=False, kw_only=True)
439
430
  class SpanTree:
440
431
  """A container that builds a hierarchy of SpanNode objects from a list of finished spans.
441
432
 
@@ -1,10 +1,10 @@
1
1
  from __future__ import annotations as _annotations
2
2
 
3
3
  from collections import defaultdict
4
- from collections.abc import Mapping
5
- from dataclasses import dataclass
4
+ from collections.abc import Callable, Mapping
5
+ from dataclasses import dataclass, field
6
6
  from io import StringIO
7
- from typing import Any, Callable, Generic, Literal, Protocol, cast
7
+ from typing import Any, Generic, Literal, Protocol, cast
8
8
 
9
9
  from pydantic import BaseModel, TypeAdapter
10
10
  from rich.console import Console
@@ -27,12 +27,16 @@ __all__ = (
27
27
  'EvaluationReportAdapter',
28
28
  'ReportCase',
29
29
  'ReportCaseAdapter',
30
+ 'ReportCaseFailure',
31
+ 'ReportCaseFailureAdapter',
30
32
  'EvaluationRenderer',
31
33
  'RenderValueConfig',
32
34
  'RenderNumberConfig',
33
35
  'ReportCaseAggregate',
34
36
  )
35
37
 
38
+ from ..evaluators.evaluator import EvaluatorFailure
39
+
36
40
  MISSING_VALUE_STR = '[i]<missing>[/i]'
37
41
  EMPTY_CELL_STR = '-'
38
42
  EMPTY_AGGREGATE_CELL_STR = ''
@@ -42,7 +46,7 @@ OutputT = TypeVar('OutputT', default=Any)
42
46
  MetadataT = TypeVar('MetadataT', default=Any)
43
47
 
44
48
 
45
- @dataclass
49
+ @dataclass(kw_only=True)
46
50
  class ReportCase(Generic[InputsT, OutputT, MetadataT]):
47
51
  """A single case in an evaluation report."""
48
52
 
@@ -67,12 +71,40 @@ class ReportCase(Generic[InputsT, OutputT, MetadataT]):
67
71
  task_duration: float
68
72
  total_duration: float # includes evaluator execution time
69
73
 
70
- # TODO(DavidM): Drop these once we can reference child spans in details panel:
71
- trace_id: str | None
72
- span_id: str | None
74
+ trace_id: str | None = None
75
+ """The trace ID of the case span."""
76
+ span_id: str | None = None
77
+ """The span ID of the case span."""
78
+
79
+ evaluator_failures: list[EvaluatorFailure] = field(default_factory=list)
80
+
81
+
82
+ @dataclass(kw_only=True)
83
+ class ReportCaseFailure(Generic[InputsT, OutputT, MetadataT]):
84
+ """A single case in an evaluation report that failed due to an error during task execution."""
85
+
86
+ name: str
87
+ """The name of the [case][pydantic_evals.Case]."""
88
+ inputs: InputsT
89
+ """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
90
+ metadata: MetadataT | None
91
+ """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
92
+ expected_output: OutputT | None
93
+ """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
94
+
95
+ error_message: str
96
+ """The message of the exception that caused the failure."""
97
+ error_stacktrace: str
98
+ """The stacktrace of the exception that caused the failure."""
99
+
100
+ trace_id: str | None = None
101
+ """The trace ID of the case span."""
102
+ span_id: str | None = None
103
+ """The span ID of the case span."""
73
104
 
74
105
 
75
106
  ReportCaseAdapter = TypeAdapter(ReportCase[Any, Any, Any])
107
+ ReportCaseFailureAdapter = TypeAdapter(ReportCaseFailure[Any, Any, Any])
76
108
 
77
109
 
78
110
  class ReportCaseAggregate(BaseModel):
@@ -152,7 +184,7 @@ class ReportCaseAggregate(BaseModel):
152
184
  )
153
185
 
154
186
 
155
- @dataclass
187
+ @dataclass(kw_only=True)
156
188
  class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
157
189
  """A report of the results of evaluating a model on a set of cases."""
158
190
 
@@ -161,15 +193,18 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
161
193
 
162
194
  cases: list[ReportCase[InputsT, OutputT, MetadataT]]
163
195
  """The cases in the report."""
164
-
165
- span_id: str | None = None
166
- """The span ID of the evaluation."""
196
+ failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(default_factory=list)
197
+ """The failures in the report. These are cases where task execution raised an exception."""
167
198
 
168
199
  trace_id: str | None = None
169
200
  """The trace ID of the evaluation."""
201
+ span_id: str | None = None
202
+ """The span ID of the evaluation."""
170
203
 
171
- def averages(self) -> ReportCaseAggregate:
172
- return ReportCaseAggregate.average(self.cases)
204
+ def averages(self) -> ReportCaseAggregate | None:
205
+ if self.cases:
206
+ return ReportCaseAggregate.average(self.cases)
207
+ return None
173
208
 
174
209
  def print(
175
210
  self,
@@ -184,6 +219,9 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
184
219
  include_total_duration: bool = False,
185
220
  include_removed_cases: bool = False,
186
221
  include_averages: bool = True,
222
+ include_errors: bool = True,
223
+ include_error_stacktrace: bool = False,
224
+ include_evaluator_failures: bool = True,
187
225
  input_config: RenderValueConfig | None = None,
188
226
  metadata_config: RenderValueConfig | None = None,
189
227
  output_config: RenderValueConfig | None = None,
@@ -207,6 +245,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
207
245
  include_total_duration=include_total_duration,
208
246
  include_removed_cases=include_removed_cases,
209
247
  include_averages=include_averages,
248
+ include_evaluator_failures=include_evaluator_failures,
210
249
  input_config=input_config,
211
250
  metadata_config=metadata_config,
212
251
  output_config=output_config,
@@ -216,7 +255,19 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
216
255
  duration_config=duration_config,
217
256
  include_reasons=include_reasons,
218
257
  )
219
- Console(width=width).print(table)
258
+ console = Console(width=width)
259
+ console.print(table)
260
+ if include_errors and self.failures:
261
+ failures_table = self.failures_table(
262
+ include_input=include_input,
263
+ include_metadata=include_metadata,
264
+ include_expected_output=include_expected_output,
265
+ include_error_message=True,
266
+ include_error_stacktrace=include_error_stacktrace,
267
+ input_config=input_config,
268
+ metadata_config=metadata_config,
269
+ )
270
+ console.print(failures_table, style='red')
220
271
 
221
272
  def console_table(
222
273
  self,
@@ -230,6 +281,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
230
281
  include_total_duration: bool = False,
231
282
  include_removed_cases: bool = False,
232
283
  include_averages: bool = True,
284
+ include_evaluator_failures: bool = True,
233
285
  input_config: RenderValueConfig | None = None,
234
286
  metadata_config: RenderValueConfig | None = None,
235
287
  output_config: RenderValueConfig | None = None,
@@ -252,6 +304,9 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
252
304
  include_total_duration=include_total_duration,
253
305
  include_removed_cases=include_removed_cases,
254
306
  include_averages=include_averages,
307
+ include_error_message=False,
308
+ include_error_stacktrace=False,
309
+ include_evaluator_failures=include_evaluator_failures,
255
310
  input_config={**_DEFAULT_VALUE_CONFIG, **(input_config or {})},
256
311
  metadata_config={**_DEFAULT_VALUE_CONFIG, **(metadata_config or {})},
257
312
  output_config=output_config or _DEFAULT_VALUE_CONFIG,
@@ -266,6 +321,41 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
266
321
  else: # pragma: no cover
267
322
  return renderer.build_diff_table(self, baseline)
268
323
 
324
+ def failures_table(
325
+ self,
326
+ *,
327
+ include_input: bool = False,
328
+ include_metadata: bool = False,
329
+ include_expected_output: bool = False,
330
+ include_error_message: bool = True,
331
+ include_error_stacktrace: bool = True,
332
+ input_config: RenderValueConfig | None = None,
333
+ metadata_config: RenderValueConfig | None = None,
334
+ ) -> Table:
335
+ """Return a table containing the failures in this report."""
336
+ renderer = EvaluationRenderer(
337
+ include_input=include_input,
338
+ include_metadata=include_metadata,
339
+ include_expected_output=include_expected_output,
340
+ include_output=False,
341
+ include_durations=False,
342
+ include_total_duration=False,
343
+ include_removed_cases=False,
344
+ include_averages=False,
345
+ input_config={**_DEFAULT_VALUE_CONFIG, **(input_config or {})},
346
+ metadata_config={**_DEFAULT_VALUE_CONFIG, **(metadata_config or {})},
347
+ output_config=_DEFAULT_VALUE_CONFIG,
348
+ score_configs={},
349
+ label_configs={},
350
+ metric_configs={},
351
+ duration_config=_DEFAULT_DURATION_CONFIG,
352
+ include_reasons=False,
353
+ include_error_message=include_error_message,
354
+ include_error_stacktrace=include_error_stacktrace,
355
+ include_evaluator_failures=False, # Not applicable for failures table
356
+ )
357
+ return renderer.build_failures_table(self)
358
+
269
359
  def __str__(self) -> str: # pragma: lax no cover
270
360
  """Return a string representation of the report."""
271
361
  table = self.console_table()
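
The new failures_table() returns a rich Table, so failed cases can also be rendered on their own; a small self-contained sketch (task and case invented):

from rich.console import Console

from pydantic_evals import Case, Dataset


def broken(inputs: int) -> int:
    # invented task that always raises
    raise RuntimeError('task crashed')


report = Dataset(cases=[Case(name='crash', inputs=0)]).evaluate_sync(broken)
Console().print(report.failures_table(include_input=True), style='red')
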
@@ -286,7 +376,7 @@ class RenderValueConfig(TypedDict, total=False):
286
376
  diff_style: str
287
377
 
288
378
 
289
- @dataclass
379
+ @dataclass(kw_only=True)
290
380
  class _ValueRenderer:
291
381
  value_formatter: str | Callable[[Any], str] = '{}'
292
382
  diff_checker: Callable[[Any, Any], bool] | None = lambda x, y: x != y
@@ -401,7 +491,7 @@ class RenderNumberConfig(TypedDict, total=False):
401
491
  """
402
492
 
403
493
 
404
- @dataclass
494
+ @dataclass(kw_only=True)
405
495
  class _NumberRenderer:
406
496
  """See documentation of `RenderNumberConfig` for more details about the parameters here."""
407
497
 
@@ -503,7 +593,7 @@ class _NumberRenderer:
503
593
  return None
504
594
 
505
595
  diff = new - old
506
- if abs(diff) < self.diff_atol + self.diff_rtol * abs(old): # pragma: no cover
596
+ if abs(diff) < self.diff_atol + self.diff_rtol * abs(old):
507
597
  return None
508
598
  return self.diff_increase_style if diff > 0 else self.diff_decrease_style
509
599
 
@@ -532,7 +622,7 @@ _DEFAULT_DURATION_CONFIG = RenderNumberConfig(
532
622
  T = TypeVar('T')
533
623
 
534
624
 
535
- @dataclass
625
+ @dataclass(kw_only=True)
536
626
  class ReportCaseRenderer:
537
627
  include_input: bool
538
628
  include_metadata: bool
@@ -545,6 +635,9 @@ class ReportCaseRenderer:
545
635
  include_reasons: bool
546
636
  include_durations: bool
547
637
  include_total_duration: bool
638
+ include_error_message: bool
639
+ include_error_stacktrace: bool
640
+ include_evaluator_failures: bool
548
641
 
549
642
  input_renderer: _ValueRenderer
550
643
  metadata_renderer: _ValueRenderer
@@ -574,10 +667,28 @@ class ReportCaseRenderer:
574
667
  table.add_column('Metrics', overflow='fold')
575
668
  if self.include_assertions:
576
669
  table.add_column('Assertions', overflow='fold')
670
+ if self.include_evaluator_failures:
671
+ table.add_column('Evaluator Failures', overflow='fold')
577
672
  if self.include_durations:
578
673
  table.add_column('Durations' if self.include_total_duration else 'Duration', justify='right')
579
674
  return table
580
675
 
676
+ def build_failures_table(self, title: str) -> Table:
677
+ """Build and return a Rich Table for the failures output."""
678
+ table = Table(title=title, show_lines=True)
679
+ table.add_column('Case ID', style='bold')
680
+ if self.include_input:
681
+ table.add_column('Inputs', overflow='fold')
682
+ if self.include_metadata:
683
+ table.add_column('Metadata', overflow='fold')
684
+ if self.include_expected_output:
685
+ table.add_column('Expected Output', overflow='fold')
686
+ if self.include_error_message:
687
+ table.add_column('Error Message', overflow='fold')
688
+ if self.include_error_stacktrace:
689
+ table.add_column('Error Stacktrace', overflow='fold')
690
+ return table
691
+
581
692
  def build_row(self, case: ReportCase) -> list[str]:
582
693
  """Build a table row for a single case."""
583
694
  row = [case.name]
@@ -606,6 +717,9 @@ class ReportCaseRenderer:
606
717
  if self.include_assertions:
607
718
  row.append(self._render_assertions(list(case.assertions.values())))
608
719
 
720
+ if self.include_evaluator_failures:
721
+ row.append(self._render_evaluator_failures(case.evaluator_failures))
722
+
609
723
  if self.include_durations:
610
724
  row.append(self._render_durations(case))
611
725
 
@@ -639,6 +753,9 @@ class ReportCaseRenderer:
639
753
  if self.include_assertions:
640
754
  row.append(self._render_aggregate_assertions(aggregate.assertions))
641
755
 
756
+ if self.include_evaluator_failures:
757
+ row.append(EMPTY_AGGREGATE_CELL_STR)
758
+
642
759
  if self.include_durations:
643
760
  row.append(self._render_durations(aggregate))
644
761
 
@@ -700,6 +817,12 @@ class ReportCaseRenderer:
700
817
  )
701
818
  row.append(assertions_diff)
702
819
 
820
+ if self.include_evaluator_failures: # pragma: no branch
821
+ evaluator_failures_diff = self._render_evaluator_failures_diff(
822
+ baseline.evaluator_failures, new_case.evaluator_failures
823
+ )
824
+ row.append(evaluator_failures_diff)
825
+
703
826
  if self.include_durations: # pragma: no branch
704
827
  durations_diff = self._render_durations_diff(baseline, new_case)
705
828
  row.append(durations_diff)
@@ -743,12 +866,36 @@ class ReportCaseRenderer:
743
866
  assertions_diff = self._render_aggregate_assertions_diff(baseline.assertions, new.assertions)
744
867
  row.append(assertions_diff)
745
868
 
869
+ if self.include_evaluator_failures: # pragma: no branch
870
+ row.append(EMPTY_AGGREGATE_CELL_STR)
871
+
746
872
  if self.include_durations: # pragma: no branch
747
873
  durations_diff = self._render_durations_diff(baseline, new)
748
874
  row.append(durations_diff)
749
875
 
750
876
  return row
751
877
 
878
+ def build_failure_row(self, case: ReportCaseFailure) -> list[str]:
879
+ """Build a table row for a single case failure."""
880
+ row = [case.name]
881
+
882
+ if self.include_input:
883
+ row.append(self.input_renderer.render_value(None, case.inputs) or EMPTY_CELL_STR)
884
+
885
+ if self.include_metadata:
886
+ row.append(self.metadata_renderer.render_value(None, case.metadata) or EMPTY_CELL_STR)
887
+
888
+ if self.include_expected_output:
889
+ row.append(self.output_renderer.render_value(None, case.expected_output) or EMPTY_CELL_STR)
890
+
891
+ if self.include_error_message:
892
+ row.append(case.error_message or EMPTY_CELL_STR)
893
+
894
+ if self.include_error_stacktrace:
895
+ row.append(case.error_stacktrace or EMPTY_CELL_STR)
896
+
897
+ return row
898
+
752
899
  def _render_durations(self, case: ReportCase | ReportCaseAggregate) -> str:
753
900
  """Build the diff string for a duration value."""
754
901
  case_durations: dict[str, float] = {'task': case.task_duration}
@@ -862,8 +1009,33 @@ class ReportCaseRenderer:
862
1009
  rendered_new = default_render_percentage(new) + ' [green]✔[/]' if new is not None else EMPTY_CELL_STR
863
1010
  return rendered_new if rendered_baseline == rendered_new else f'{rendered_baseline} → {rendered_new}'
864
1011
 
1012
+ def _render_evaluator_failures(
1013
+ self,
1014
+ failures: list[EvaluatorFailure],
1015
+ ) -> str:
1016
+ if not failures:
1017
+ return EMPTY_CELL_STR # pragma: no cover
1018
+ lines: list[str] = []
1019
+ for failure in failures:
1020
+ line = f'[red]{failure.name}[/]'
1021
+ if failure.error_message:
1022
+ line += f': {failure.error_message}'
1023
+ lines.append(line)
1024
+ return '\n'.join(lines)
1025
+
1026
+ def _render_evaluator_failures_diff(
1027
+ self,
1028
+ baseline_failures: list[EvaluatorFailure],
1029
+ new_failures: list[EvaluatorFailure],
1030
+ ) -> str:
1031
+ baseline_str = self._render_evaluator_failures(baseline_failures)
1032
+ new_str = self._render_evaluator_failures(new_failures)
1033
+ if baseline_str == new_str:
1034
+ return baseline_str # pragma: no cover
1035
+ return f'{baseline_str}\n→\n{new_str}'
1036
+
865
1037
 
866
- @dataclass
1038
+ @dataclass(kw_only=True)
867
1039
  class EvaluationRenderer:
868
1040
  """A class for rendering an EvalReport or the diff between two EvalReports."""
869
1041
 
@@ -887,10 +1059,13 @@ class EvaluationRenderer:
887
1059
  metric_configs: dict[str, RenderNumberConfig]
888
1060
  duration_config: RenderNumberConfig
889
1061
 
890
- # TODO: Make this class kw-only so we can reorder the kwargs
891
1062
  # Data to include
892
1063
  include_reasons: bool # only applies to reports, not to diffs
893
1064
 
1065
+ include_error_message: bool
1066
+ include_error_stacktrace: bool
1067
+ include_evaluator_failures: bool
1068
+
894
1069
  def include_scores(self, report: EvaluationReport, baseline: EvaluationReport | None = None):
895
1070
  return any(case.scores for case in self._all_cases(report, baseline))
896
1071
 
@@ -903,6 +1078,11 @@ class EvaluationRenderer:
903
1078
  def include_assertions(self, report: EvaluationReport, baseline: EvaluationReport | None = None):
904
1079
  return any(case.assertions for case in self._all_cases(report, baseline))
905
1080
 
1081
+ def include_evaluator_failures_column(self, report: EvaluationReport, baseline: EvaluationReport | None = None):
1082
+ return self.include_evaluator_failures and any(
1083
+ case.evaluator_failures for case in self._all_cases(report, baseline)
1084
+ )
1085
+
906
1086
  def _all_cases(self, report: EvaluationReport, baseline: EvaluationReport | None) -> list[ReportCase]:
907
1087
  if not baseline:
908
1088
  return report.cases
@@ -940,6 +1120,9 @@ class EvaluationRenderer:
940
1120
  include_reasons=self.include_reasons,
941
1121
  include_durations=self.include_durations,
942
1122
  include_total_duration=self.include_total_duration,
1123
+ include_error_message=self.include_error_message,
1124
+ include_error_stacktrace=self.include_error_stacktrace,
1125
+ include_evaluator_failures=self.include_evaluator_failures_column(report, baseline),
943
1126
  input_renderer=input_renderer,
944
1127
  metadata_renderer=metadata_renderer,
945
1128
  output_renderer=output_renderer,
@@ -957,7 +1140,9 @@ class EvaluationRenderer:
957
1140
 
958
1141
  if self.include_averages: # pragma: no branch
959
1142
  average = report.averages()
960
- table.add_row(*case_renderer.build_aggregate_row(average))
1143
+ if average: # pragma: no branch
1144
+ table.add_row(*case_renderer.build_aggregate_row(average))
1145
+
961
1146
  return table
962
1147
 
963
1148
  def build_diff_table(self, report: EvaluationReport, baseline: EvaluationReport) -> Table:
@@ -1004,6 +1189,14 @@ class EvaluationRenderer:
1004
1189
 
1005
1190
  return table
1006
1191
 
1192
+ def build_failures_table(self, report: EvaluationReport) -> Table:
1193
+ case_renderer = self._get_case_renderer(report)
1194
+ table = case_renderer.build_failures_table('Case Failures')
1195
+ for case in report.failures:
1196
+ table.add_row(*case_renderer.build_failure_row(case))
1197
+
1198
+ return table
1199
+
1007
1200
  def _infer_score_renderers(
1008
1201
  self, report: EvaluationReport, baseline: EvaluationReport | None
1009
1202
  ) -> dict[str, _NumberRenderer]:
@@ -28,7 +28,6 @@ classifiers = [
28
28
  "Programming Language :: Python",
29
29
  "Programming Language :: Python :: 3",
30
30
  "Programming Language :: Python :: 3 :: Only",
31
- "Programming Language :: Python :: 3.9",
32
31
  "Programming Language :: Python :: 3.10",
33
32
  "Programming Language :: Python :: 3.11",
34
33
  "Programming Language :: Python :: 3.12",
@@ -44,7 +43,7 @@ classifiers = [
44
43
  "Topic :: Software Development :: Libraries :: Python Modules",
45
44
  "Topic :: Internet",
46
45
  ]
47
- requires-python = ">=3.9"
46
+ requires-python = ">=3.10"
48
47
 
49
48
  [tool.hatch.metadata.hooks.uv-dynamic-versioning]
50
49
  dependencies = [