pydantic-evals 0.4.11__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- pydantic_evals/dataset.py +1 -1
- pydantic_evals/evaluators/__init__.py +3 -2
- pydantic_evals/evaluators/_run_evaluator.py +3 -1
- pydantic_evals/evaluators/evaluator.py +13 -8
- pydantic_evals/evaluators/{_spec.py → spec.py} +0 -6
- pydantic_evals/reporting/__init__.py +38 -12
- {pydantic_evals-0.4.11.dist-info → pydantic_evals-0.5.0.dist-info}/METADATA +2 -2
- {pydantic_evals-0.4.11.dist-info → pydantic_evals-0.5.0.dist-info}/RECORD +10 -10
- {pydantic_evals-0.4.11.dist-info → pydantic_evals-0.5.0.dist-info}/WHEEL +0 -0
- {pydantic_evals-0.4.11.dist-info → pydantic_evals-0.5.0.dist-info}/licenses/LICENSE +0 -0
pydantic_evals/dataset.py
CHANGED
@@ -38,9 +38,9 @@ from pydantic_evals._utils import get_event_loop
 from ._utils import get_unwrapped_function_name, task_group_gather
 from .evaluators import EvaluationResult, Evaluator
 from .evaluators._run_evaluator import run_evaluator
-from .evaluators._spec import EvaluatorSpec
 from .evaluators.common import DEFAULT_EVALUATORS
 from .evaluators.context import EvaluatorContext
+from .evaluators.spec import EvaluatorSpec
 from .otel import SpanTree
 from .otel._context_subtree import context_subtree
 from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate
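
The only change to dataset.py is the import path: EvaluatorSpec now comes from the public spec module rather than the private _spec module. A minimal sketch of the new module path, assuming pydantic-evals 0.5.0 is installed; downstream code importing from the old private module would need the same update:

    from pydantic_evals.evaluators.spec import EvaluatorSpec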
pydantic_evals/evaluators/__init__.py
CHANGED
@@ -10,7 +10,7 @@ from .common import (
     Python,
 )
 from .context import EvaluatorContext
-from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorOutput
+from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorOutput, EvaluatorSpec

 __all__ = (
     # common
@@ -27,7 +27,8 @@ __all__ = (
     'EvaluatorContext',
     # evaluator
     'Evaluator',
-    'EvaluationReason',
     'EvaluatorOutput',
+    'EvaluatorSpec',
+    'EvaluationReason',
     'EvaluationResult',
 )
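
With this change EvaluatorSpec is part of the package's public surface, re-exported alongside the other evaluator types. A minimal sketch of the public import, using only names the diff shows being exported:

    from pydantic_evals.evaluators import Evaluator, EvaluatorSpec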
pydantic_evals/evaluators/_run_evaluator.py
CHANGED
@@ -48,7 +48,9 @@ async def run_evaluator(
     for name, result in results.items():
         if not isinstance(result, EvaluationReason):
             result = EvaluationReason(value=result)
-        details.append(
+        details.append(
+            EvaluationResult(name=name, value=result.value, reason=result.reason, source=evaluator.as_spec())
+        )

     return details

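
Each per-evaluator entry is now recorded with source=evaluator.as_spec(), so the result carries the evaluator's serializable spec rather than the evaluator instance. A hedged sketch of what one such entry looks like; the evaluator name and values here are made up for illustration:

    from pydantic_evals.evaluators import EvaluationResult, EvaluatorSpec

    # Mirrors what run_evaluator now builds for each named result (illustrative values).
    entry = EvaluationResult(
        name='correct',
        value=True,
        reason='output matched the expected output',
        source=EvaluatorSpec(name='MyEvaluator', arguments=None),
    )
    print(entry.source.name)  # the spec's name, not an evaluator instance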
pydantic_evals/evaluators/evaluator.py
CHANGED
@@ -17,8 +17,8 @@ from typing_extensions import TypeVar, deprecated
 from pydantic_ai import _utils

 from .._utils import get_event_loop
-from ._spec import EvaluatorSpec
 from .context import EvaluatorContext
+from .spec import EvaluatorSpec

 __all__ = (
     'EvaluationReason',
@@ -26,6 +26,7 @@ __all__ = (
     'EvaluationScalar',
     'Evaluator',
     'EvaluatorOutput',
+    'EvaluatorSpec',
 )

 EvaluationScalar = Union[bool, int, float, str]
@@ -71,13 +72,13 @@ class EvaluationResult(Generic[EvaluationScalarT]):
         name: The name of the evaluation.
         value: The scalar result of the evaluation.
         reason: An optional explanation of the evaluation result.
-        source: The evaluator that produced this result.
+        source: The spec of the evaluator that produced this result.
     """

     name: str
     value: EvaluationScalarT
     reason: str | None
-    source:
+    source: EvaluatorSpec

     def downcast(self, *value_types: type[T]) -> EvaluationResult[T] | None:
         """Attempt to downcast this result to a more specific type.
@@ -246,6 +247,13 @@ class Evaluator(Generic[InputsT, OutputT, MetadataT], metaclass=_StrictABCMeta):
         Returns:
             A JSON-serializable representation of this evaluator as an EvaluatorSpec.
         """
+        return to_jsonable_python(
+            self.as_spec(),
+            context=info.context,
+            serialize_unknown=True,
+        )
+
+    def as_spec(self) -> EvaluatorSpec:
         raw_arguments = self.build_serialization_arguments()

         arguments: None | tuple[Any,] | dict[str, Any]
@@ -255,11 +263,8 @@ class Evaluator(Generic[InputsT, OutputT, MetadataT], metaclass=_StrictABCMeta):
             arguments = (next(iter(raw_arguments.values())),)
         else:
             arguments = raw_arguments
-
-
-            context=info.context,
-            serialize_unknown=True,
-        )
+
+        return EvaluatorSpec(name=self.get_serialization_name(), arguments=arguments)

     def build_serialization_arguments(self) -> dict[str, Any]:
         """Build the arguments for serialization.
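
Evaluator.as_spec() is new: serialization now goes through an explicit EvaluatorSpec instead of building the JSON-able form inline. A hedged sketch of calling it on a hypothetical custom evaluator; the ExactMatch class, its dataclass form, and the EvaluatorContext attributes used in evaluate() are assumptions for illustration, not part of this diff:

    from dataclasses import dataclass

    from pydantic_evals.evaluators import Evaluator, EvaluatorContext

    @dataclass
    class ExactMatch(Evaluator):
        # Hypothetical evaluator: passes when the output equals the expected output.
        def evaluate(self, ctx: EvaluatorContext) -> bool:
            return ctx.output == ctx.expected_output

    spec = ExactMatch().as_spec()
    # spec.name is the serialization name (usually just the class name);
    # spec.arguments is None here because __init__ takes no arguments.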
pydantic_evals/evaluators/{_spec.py → spec.py}
RENAMED
@@ -30,12 +30,6 @@ class EvaluatorSpec(BaseModel):
     * `'MyEvaluator'` - Just the (string) name of the Evaluator subclass is used if its `__init__` takes no arguments
     * `{'MyEvaluator': first_arg}` - A single argument is passed as the first positional argument to `MyEvaluator.__init__`
     * `{'MyEvaluator': {k1: v1, k2: v2}}` - Multiple kwargs are passed to `MyEvaluator.__init__`
-
-    Args:
-        name: The serialization name of the evaluator class returned by `EvaluatorClass.get_serialization_name()`;
-            this is usually just the class name itself.
-        arguments: The arguments to pass to the evaluator's constructor. Can be None (for no arguments),
-            a tuple (for a single positional argument), or a dict (for multiple keyword arguments).
     """

     name: str
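
Only the duplicated Args: section of the docstring was dropped; the model's fields are unchanged. A hedged sketch of the three serialized forms listed in the docstring, expressed as direct EvaluatorSpec constructions with placeholder names and values:

    from pydantic_evals.evaluators import EvaluatorSpec

    # No __init__ arguments -> serializes as just the class name, e.g. 'MyEvaluator'
    EvaluatorSpec(name='MyEvaluator', arguments=None)

    # A single positional argument -> serializes as {'MyEvaluator': first_arg}
    EvaluatorSpec(name='MyEvaluator', arguments=('first_arg',))

    # Multiple keyword arguments -> serializes as {'MyEvaluator': {'k1': 'v1', 'k2': 'v2'}}
    EvaluatorSpec(name='MyEvaluator', arguments={'k1': 'v1', 'k2': 'v2'})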
pydantic_evals/reporting/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from collections import defaultdict
 from collections.abc import Mapping
 from dataclasses import dataclass
 from io import StringIO
-from typing import Any, Callable, Generic, Literal, Protocol
+from typing import Any, Callable, Generic, Literal, Protocol, cast

 from pydantic import BaseModel, TypeAdapter
 from rich.console import Console
@@ -168,6 +168,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         self,
         width: int | None = None,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
+        *,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -183,6 +184,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         label_configs: dict[str, RenderValueConfig] | None = None,
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
+        include_reasons: bool = False,
     ):  # pragma: no cover
         """Print this report to the console, optionally comparing it to a baseline report.

@@ -205,12 +207,14 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             label_configs=label_configs,
             metric_configs=metric_configs,
             duration_config=duration_config,
+            include_reasons=include_reasons,
         )
         Console(width=width).print(table)

     def console_table(
         self,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
+        *,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -226,6 +230,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         label_configs: dict[str, RenderValueConfig] | None = None,
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
+        include_reasons: bool = False,
     ) -> Table:
         """Return a table containing the data from this report, or the diff between this report and a baseline report.

@@ -247,6 +252,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             label_configs=label_configs or {},
             metric_configs=metric_configs or {},
             duration_config=duration_config or _DEFAULT_DURATION_CONFIG,
+            include_reasons=include_reasons,
         )
         if baseline is None:
             return renderer.build_table(self)
@@ -529,15 +535,16 @@ class ReportCaseRenderer:
     include_labels: bool
     include_metrics: bool
     include_assertions: bool
+    include_reasons: bool
     include_durations: bool
     include_total_duration: bool

     input_renderer: _ValueRenderer
     metadata_renderer: _ValueRenderer
     output_renderer: _ValueRenderer
-    score_renderers:
-    label_renderers:
-    metric_renderers:
+    score_renderers: Mapping[str, _NumberRenderer]
+    label_renderers: Mapping[str, _ValueRenderer]
+    metric_renderers: Mapping[str, _NumberRenderer]
     duration_renderer: _NumberRenderer

     def build_base_table(self, title: str) -> Table:
@@ -581,10 +588,10 @@ class ReportCaseRenderer:
         row.append(self.output_renderer.render_value(None, case.output) or EMPTY_CELL_STR)

         if self.include_scores:
-            row.append(self._render_dict({k: v
+            row.append(self._render_dict({k: v for k, v in case.scores.items()}, self.score_renderers))

         if self.include_labels:
-            row.append(self._render_dict({k: v
+            row.append(self._render_dict({k: v for k, v in case.labels.items()}, self.label_renderers))

         if self.include_metrics:
             row.append(self._render_dict(case.metrics, self.metric_renderers))
@@ -669,7 +676,11 @@ class ReportCaseRenderer:
             row.append(scores_diff)

         if self.include_labels:  # pragma: no branch
-            labels_diff = self._render_dicts_diff(
+            labels_diff = self._render_dicts_diff(
+                {k: v.value for k, v in baseline.labels.items()},
+                {k: v.value for k, v in new_case.labels.items()},
+                self.label_renderers,
+            )
             row.append(labels_diff)

         if self.include_metrics:  # pragma: no branch
@@ -779,26 +790,36 @@ class ReportCaseRenderer:
             diff_lines.append(rendered)
         return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR

-    @staticmethod
     def _render_dict(
-
+        self,
+        case_dict: Mapping[str, EvaluationResult[T] | T],
         renderers: Mapping[str, _AbstractRenderer[T]],
         *,
         include_names: bool = True,
     ) -> str:
         diff_lines: list[str] = []
         for key, val in case_dict.items():
-
+            value = cast(EvaluationResult[T], val).value if isinstance(val, EvaluationResult) else val
+            rendered = renderers[key].render_value(key if include_names else None, value)
+            if self.include_reasons and isinstance(val, EvaluationResult) and (reason := val.reason):
+                rendered += f'\n Reason: {reason}\n'
             diff_lines.append(rendered)
         return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR

-    @staticmethod
     def _render_assertions(
+        self,
         assertions: list[EvaluationResult[bool]],
     ) -> str:
         if not assertions:
             return EMPTY_CELL_STR
-
+        lines: list[str] = []
+        for a in assertions:
+            line = '[green]✔[/]' if a.value else '[red]✗[/]'
+            if self.include_reasons:
+                line = f'{a.name}: {line}\n'
+                line = f'{line} Reason: {a.reason}\n\n' if a.reason else line
+            lines.append(line)
+        return ''.join(lines)

     @staticmethod
     def _render_aggregate_assertions(
@@ -859,6 +880,10 @@ class EvaluationRenderer:
     metric_configs: dict[str, RenderNumberConfig]
     duration_config: RenderNumberConfig

+    # TODO: Make this class kw-only so we can reorder the kwargs
+    # Data to include
+    include_reasons: bool  # only applies to reports, not to diffs
+
     def include_scores(self, report: EvaluationReport, baseline: EvaluationReport | None = None):
         return any(case.scores for case in self._all_cases(report, baseline))

@@ -905,6 +930,7 @@ class EvaluationRenderer:
             include_labels=self.include_labels(report, baseline),
             include_metrics=self.include_metrics(report, baseline),
             include_assertions=self.include_assertions(report, baseline),
+            include_reasons=self.include_reasons,
             include_durations=self.include_durations,
             include_total_duration=self.include_total_duration,
             input_renderer=input_renderer,
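
The user-visible change here is the new include_reasons flag on EvaluationReport.print and EvaluationReport.console_table, with the include_*/config parameters now keyword-only. A hedged usage sketch, assuming an EvaluationReport produced elsewhere:

    from pydantic_evals.reporting import EvaluationReport

    def print_with_reasons(report: EvaluationReport) -> None:
        # include_reasons=True appends each evaluator's reason text beneath its
        # score/label/assertion cell in the rendered table (reports only, not diffs).
        report.print(width=120, include_reasons=True)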
{pydantic_evals-0.4.11.dist-info → pydantic_evals-0.5.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.4.11
+Version: 0.5.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.
+Requires-Dist: pydantic-ai-slim==0.5.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
{pydantic_evals-0.4.11.dist-info → pydantic_evals-0.5.0.dist-info}/RECORD
CHANGED
@@ -1,23 +1,23 @@
 pydantic_evals/__init__.py,sha256=OKRbfhdc8UZPzrPJMZUQwvzIxLhXmEZxz1ZuD921fy4,839
 pydantic_evals/_utils.py,sha256=PfhmPbdQp-q90s568LuG45zDDXxgO13BEz8MQJK8qw4,2922
-pydantic_evals/dataset.py,sha256=
+pydantic_evals/dataset.py,sha256=uOH7vW8YbF5NYx9jtcXeYLountSwl93DnO5qBmpyRVw,46698
 pydantic_evals/generation.py,sha256=Yd1rfbsDjjBBHDk-1KDu48hlITjM2-74rTnPBD_sqbA,3494
 pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydantic_evals/evaluators/__init__.py,sha256=
-pydantic_evals/evaluators/_run_evaluator.py,sha256=
-pydantic_evals/evaluators/_spec.py,sha256=Xi_FHwnmAZ1x2hoJFw4MBZuG0TilNKqMRW3P74se32I,7080
+pydantic_evals/evaluators/__init__.py,sha256=k_sTzRezIOUj7L2x3WC0_r8943jRo72uqb2vKplD5EU,660
+pydantic_evals/evaluators/_run_evaluator.py,sha256=YQgddaB4zxMnoK1yRhxpAJ_aQhwClExXcVrq2QPjyqs,2480
 pydantic_evals/evaluators/common.py,sha256=ZBrNTfPJoOpT4WNXTRGS0UcKhnuhfYJxjNzum-zHFk8,12064
 pydantic_evals/evaluators/context.py,sha256=8osQRCFW8Vekw7JiiOOCHHH3HOGdhDaUlr8i-twSetg,3870
-pydantic_evals/evaluators/evaluator.py,sha256=
+pydantic_evals/evaluators/evaluator.py,sha256=CvPLh4SITopiRoiZg_dkLuMUStjhxlRXxWJkBVOoIb4,11255
 pydantic_evals/evaluators/llm_as_a_judge.py,sha256=xQjaGuCRXZdlExacFyR4Y4kFmwBh2QxAfEyaed_aqvk,9615
+pydantic_evals/evaluators/spec.py,sha256=zsJ3BZQHTGX7crd3SyGsqVsLrEyoxHyVfJkYy3JEbsU,6693
 pydantic_evals/otel/__init__.py,sha256=i2p3vDrOW039N4XM-UkozDhCm0ZmE6ZSs1yV5t03vd0,117
 pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=vIDF9-6lDuNKZuSM5hN_R8VRK4jzmdfe1DgWdXwxVbc,6758
 pydantic_evals/otel/_context_subtree.py,sha256=Iazp4w3IIBMCrkqWL-hTG-2QG_-2X81p794WG9MAsGk,1175
 pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD4,268
 pydantic_evals/otel/span_tree.py,sha256=LV5Hsyo4riJzevHyBz8wxP82S-ry5zeKYi9bKWjGCS8,23057
-pydantic_evals/reporting/__init__.py,sha256=
+pydantic_evals/reporting/__init__.py,sha256=lRDlveMIOzYHaqqTIEiD2PQi6FSSejEMmY4aQbmIzTc,43233
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-0.
-pydantic_evals-0.
-pydantic_evals-0.
-pydantic_evals-0.
+pydantic_evals-0.5.0.dist-info/METADATA,sha256=ea5Ph75O8epHLQjRn7mKc9UvOqYqy2f8GUE_Y-ztTlY,7938
+pydantic_evals-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-0.5.0.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-0.5.0.dist-info/RECORD,,
{pydantic_evals-0.4.11.dist-info → pydantic_evals-0.5.0.dist-info}/WHEEL
File without changes
{pydantic_evals-0.4.11.dist-info → pydantic_evals-0.5.0.dist-info}/licenses/LICENSE
File without changes