pydantic-evals 0.4.11__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

This version of pydantic-evals has been flagged as potentially problematic.

pydantic_evals/dataset.py CHANGED
@@ -38,9 +38,9 @@ from pydantic_evals._utils import get_event_loop
 from ._utils import get_unwrapped_function_name, task_group_gather
 from .evaluators import EvaluationResult, Evaluator
 from .evaluators._run_evaluator import run_evaluator
-from .evaluators._spec import EvaluatorSpec
 from .evaluators.common import DEFAULT_EVALUATORS
 from .evaluators.context import EvaluatorContext
+from .evaluators.spec import EvaluatorSpec
 from .otel import SpanTree
 from .otel._context_subtree import context_subtree
 from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate
@@ -1038,7 +1038,7 @@ def _get_span_duration(span: logfire_api.LogfireSpan, fallback: float) -> float:
     """
     try:
         return (span.end_time - span.start_time) / 1_000_000_000  # type: ignore
-    except (AttributeError, TypeError):  # pragma: no cover
+    except (AttributeError, TypeError):  # pragma: lax no cover
         return fallback


pydantic_evals/evaluators/__init__.py CHANGED
@@ -10,7 +10,7 @@ from .common import (
     Python,
 )
 from .context import EvaluatorContext
-from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorOutput
+from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorOutput, EvaluatorSpec

 __all__ = (
     # common
@@ -27,7 +27,8 @@ __all__ = (
     'EvaluatorContext',
     # evaluator
     'Evaluator',
-    'EvaluationReason',
     'EvaluatorOutput',
+    'EvaluatorSpec',
+    'EvaluationReason',
     'EvaluationResult',
 )
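
EvaluatorSpec is now re-exported from the public pydantic_evals.evaluators package (it previously lived in the private _spec module). A minimal sketch of the new import path; the evaluator name and argument values below are illustrative placeholders:

    from pydantic_evals.evaluators import EvaluatorSpec

    # EvaluatorSpec is a pydantic BaseModel with `name` and `arguments` fields.
    spec = EvaluatorSpec(name='MyEvaluator', arguments={'threshold': 0.5})
    print(spec.name, spec.arguments)
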

pydantic_evals/evaluators/_run_evaluator.py CHANGED
@@ -48,7 +48,9 @@ async def run_evaluator(
     for name, result in results.items():
         if not isinstance(result, EvaluationReason):
             result = EvaluationReason(value=result)
-        details.append(EvaluationResult(name=name, value=result.value, reason=result.reason, source=evaluator))
+        details.append(
+            EvaluationResult(name=name, value=result.value, reason=result.reason, source=evaluator.as_spec())
+        )

     return details


pydantic_evals/evaluators/evaluator.py CHANGED
@@ -17,8 +17,8 @@ from typing_extensions import TypeVar, deprecated
 from pydantic_ai import _utils

 from .._utils import get_event_loop
-from ._spec import EvaluatorSpec
 from .context import EvaluatorContext
+from .spec import EvaluatorSpec

 __all__ = (
     'EvaluationReason',
@@ -26,6 +26,7 @@ __all__ = (
     'EvaluationScalar',
     'Evaluator',
     'EvaluatorOutput',
+    'EvaluatorSpec',
 )

 EvaluationScalar = Union[bool, int, float, str]
@@ -71,13 +72,13 @@ class EvaluationResult(Generic[EvaluationScalarT]):
         name: The name of the evaluation.
         value: The scalar result of the evaluation.
         reason: An optional explanation of the evaluation result.
-        source: The evaluator that produced this result.
+        source: The spec of the evaluator that produced this result.
     """

     name: str
     value: EvaluationScalarT
     reason: str | None
-    source: Evaluator
+    source: EvaluatorSpec

     def downcast(self, *value_types: type[T]) -> EvaluationResult[T] | None:
         """Attempt to downcast this result to a more specific type.
@@ -246,6 +247,13 @@ class Evaluator(Generic[InputsT, OutputT, MetadataT], metaclass=_StrictABCMeta):
         Returns:
             A JSON-serializable representation of this evaluator as an EvaluatorSpec.
         """
+        return to_jsonable_python(
+            self.as_spec(),
+            context=info.context,
+            serialize_unknown=True,
+        )
+
+    def as_spec(self) -> EvaluatorSpec:
         raw_arguments = self.build_serialization_arguments()

         arguments: None | tuple[Any,] | dict[str, Any]
@@ -255,11 +263,8 @@ class Evaluator(Generic[InputsT, OutputT, MetadataT], metaclass=_StrictABCMeta):
             arguments = (next(iter(raw_arguments.values())),)
         else:
             arguments = raw_arguments
-        return to_jsonable_python(
-            EvaluatorSpec(name=self.get_serialization_name(), arguments=arguments),
-            context=info.context,
-            serialize_unknown=True,
-        )
+
+        return EvaluatorSpec(name=self.get_serialization_name(), arguments=arguments)

     def build_serialization_arguments(self) -> dict[str, Any]:
         """Build the arguments for serialization.

pydantic_evals/evaluators/_spec.py → pydantic_evals/evaluators/spec.py RENAMED
@@ -30,12 +30,6 @@ class EvaluatorSpec(BaseModel):
     * `'MyEvaluator'` - Just the (string) name of the Evaluator subclass is used if its `__init__` takes no arguments
     * `{'MyEvaluator': first_arg}` - A single argument is passed as the first positional argument to `MyEvaluator.__init__`
     * `{'MyEvaluator': {k1: v1, k2: v2}}` - Multiple kwargs are passed to `MyEvaluator.__init__`
-
-    Args:
-        name: The serialization name of the evaluator class returned by `EvaluatorClass.get_serialization_name()`;
-            this is usually just the class name itself.
-        arguments: The arguments to pass to the evaluator's constructor. Can be None (for no arguments),
-            a tuple (for a single positional argument), or a dict (for multiple keyword arguments).
     """

     name: str
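
For reference, the three serialized forms listed in that docstring map roughly onto the following EvaluatorSpec values (a sketch; the evaluator names and arguments are illustrative, not taken from the diff):

    from pydantic_evals.evaluators import EvaluatorSpec

    EvaluatorSpec(name='MyEvaluator', arguments=None)                        # 'MyEvaluator'
    EvaluatorSpec(name='Contains', arguments=('hello',))                     # {'Contains': 'hello'}
    EvaluatorSpec(name='LLMJudge', arguments={'rubric': 'Reply is polite'})  # {'LLMJudge': {'rubric': ...}}
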

pydantic_evals/reporting/__init__.py CHANGED
@@ -4,7 +4,7 @@ from collections import defaultdict
 from collections.abc import Mapping
 from dataclasses import dataclass
 from io import StringIO
-from typing import Any, Callable, Generic, Literal, Protocol
+from typing import Any, Callable, Generic, Literal, Protocol, cast

 from pydantic import BaseModel, TypeAdapter
 from rich.console import Console
@@ -168,6 +168,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         self,
         width: int | None = None,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
+        *,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -183,6 +184,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         label_configs: dict[str, RenderValueConfig] | None = None,
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
+        include_reasons: bool = False,
     ):  # pragma: no cover
         """Print this report to the console, optionally comparing it to a baseline report.

@@ -205,12 +207,14 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             label_configs=label_configs,
             metric_configs=metric_configs,
             duration_config=duration_config,
+            include_reasons=include_reasons,
         )
         Console(width=width).print(table)

     def console_table(
         self,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
+        *,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -226,6 +230,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         label_configs: dict[str, RenderValueConfig] | None = None,
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
+        include_reasons: bool = False,
     ) -> Table:
         """Return a table containing the data from this report, or the diff between this report and a baseline report.

@@ -247,6 +252,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             label_configs=label_configs or {},
             metric_configs=metric_configs or {},
             duration_config=duration_config or _DEFAULT_DURATION_CONFIG,
+            include_reasons=include_reasons,
         )
         if baseline is None:
             return renderer.build_table(self)
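
With the display options now keyword-only and the new include_reasons flag threaded through to the renderer, printing a report with per-evaluator reasons looks roughly like this (a sketch; `report` is assumed to be an EvaluationReport, e.g. returned by Dataset.evaluate_sync):

    # All options after `baseline` must now be passed by keyword.
    report.print(include_input=True, include_reasons=True)

    # Or build the rich Table directly:
    table = report.console_table(include_reasons=True)
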
@@ -529,15 +535,16 @@ class ReportCaseRenderer:
     include_labels: bool
     include_metrics: bool
     include_assertions: bool
+    include_reasons: bool
     include_durations: bool
     include_total_duration: bool

     input_renderer: _ValueRenderer
     metadata_renderer: _ValueRenderer
     output_renderer: _ValueRenderer
-    score_renderers: dict[str, _NumberRenderer]
-    label_renderers: dict[str, _ValueRenderer]
-    metric_renderers: dict[str, _NumberRenderer]
+    score_renderers: Mapping[str, _NumberRenderer]
+    label_renderers: Mapping[str, _ValueRenderer]
+    metric_renderers: Mapping[str, _NumberRenderer]
     duration_renderer: _NumberRenderer

     def build_base_table(self, title: str) -> Table:
@@ -581,10 +588,10 @@ class ReportCaseRenderer:
         row.append(self.output_renderer.render_value(None, case.output) or EMPTY_CELL_STR)

         if self.include_scores:
-            row.append(self._render_dict({k: v.value for k, v in case.scores.items()}, self.score_renderers))
+            row.append(self._render_dict({k: v for k, v in case.scores.items()}, self.score_renderers))

         if self.include_labels:
-            row.append(self._render_dict({k: v.value for k, v in case.labels.items()}, self.label_renderers))
+            row.append(self._render_dict({k: v for k, v in case.labels.items()}, self.label_renderers))

         if self.include_metrics:
             row.append(self._render_dict(case.metrics, self.metric_renderers))
@@ -669,7 +676,11 @@ class ReportCaseRenderer:
             row.append(scores_diff)

         if self.include_labels:  # pragma: no branch
-            labels_diff = self._render_dicts_diff(baseline.labels, new_case.labels, self.label_renderers)
+            labels_diff = self._render_dicts_diff(
+                {k: v.value for k, v in baseline.labels.items()},
+                {k: v.value for k, v in new_case.labels.items()},
+                self.label_renderers,
+            )
             row.append(labels_diff)

         if self.include_metrics:  # pragma: no branch
@@ -779,26 +790,36 @@ class ReportCaseRenderer:
             diff_lines.append(rendered)
         return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR

-    @staticmethod
     def _render_dict(
-        case_dict: dict[str, T],
+        self,
+        case_dict: Mapping[str, EvaluationResult[T] | T],
         renderers: Mapping[str, _AbstractRenderer[T]],
         *,
         include_names: bool = True,
     ) -> str:
         diff_lines: list[str] = []
         for key, val in case_dict.items():
-            rendered = renderers[key].render_value(key if include_names else None, val)
+            value = cast(EvaluationResult[T], val).value if isinstance(val, EvaluationResult) else val
+            rendered = renderers[key].render_value(key if include_names else None, value)
+            if self.include_reasons and isinstance(val, EvaluationResult) and (reason := val.reason):
+                rendered += f'\n Reason: {reason}\n'
             diff_lines.append(rendered)
         return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR

-    @staticmethod
     def _render_assertions(
+        self,
         assertions: list[EvaluationResult[bool]],
     ) -> str:
         if not assertions:
             return EMPTY_CELL_STR
-        return ''.join(['[green]✔[/]' if a.value else '[red]✗[/]' for a in assertions])
+        lines: list[str] = []
+        for a in assertions:
+            line = '[green]✔[/]' if a.value else '[red]✗[/]'
+            if self.include_reasons:
+                line = f'{a.name}: {line}\n'
+                line = f'{line} Reason: {a.reason}\n\n' if a.reason else line
+            lines.append(line)
+        return ''.join(lines)

     @staticmethod
     def _render_aggregate_assertions(
@@ -859,6 +880,10 @@ class EvaluationRenderer:
     metric_configs: dict[str, RenderNumberConfig]
     duration_config: RenderNumberConfig

+    # TODO: Make this class kw-only so we can reorder the kwargs
+    # Data to include
+    include_reasons: bool  # only applies to reports, not to diffs
+
     def include_scores(self, report: EvaluationReport, baseline: EvaluationReport | None = None):
         return any(case.scores for case in self._all_cases(report, baseline))

@@ -905,6 +930,7 @@ class EvaluationRenderer:
             include_labels=self.include_labels(report, baseline),
             include_metrics=self.include_metrics(report, baseline),
             include_assertions=self.include_assertions(report, baseline),
+            include_reasons=self.include_reasons,
             include_durations=self.include_durations,
             include_total_duration=self.include_total_duration,
             input_renderer=input_renderer,

pydantic_evals-0.4.11.dist-info/METADATA → pydantic_evals-0.5.1.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.4.11
+Version: 0.5.1
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.4.11
+Requires-Dist: pydantic-ai-slim==0.5.1
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4

pydantic_evals-0.4.11.dist-info/RECORD → pydantic_evals-0.5.1.dist-info/RECORD RENAMED
@@ -1,23 +1,23 @@
 pydantic_evals/__init__.py,sha256=OKRbfhdc8UZPzrPJMZUQwvzIxLhXmEZxz1ZuD921fy4,839
 pydantic_evals/_utils.py,sha256=PfhmPbdQp-q90s568LuG45zDDXxgO13BEz8MQJK8qw4,2922
-pydantic_evals/dataset.py,sha256=yk6nHzzbEJqh9p3Y_MuBQyP0szp5oh-oFUDavi4N9D8,46699
+pydantic_evals/dataset.py,sha256=ZtDSqsLQYe2ExdLeVpwZo1akdSU8mkBrZ8Xgm7jh7n4,46702
 pydantic_evals/generation.py,sha256=Yd1rfbsDjjBBHDk-1KDu48hlITjM2-74rTnPBD_sqbA,3494
 pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydantic_evals/evaluators/__init__.py,sha256=uj110viFiDvqrIcuDcWexk_uBgJqhKMGPHT2YvDb7LA,624
-pydantic_evals/evaluators/_run_evaluator.py,sha256=Dsnqxno7CrcKWYcnkLuwvPKWQGDRBmbBTwwstcmc0ak,2448
-pydantic_evals/evaluators/_spec.py,sha256=Xi_FHwnmAZ1x2hoJFw4MBZuG0TilNKqMRW3P74se32I,7080
+pydantic_evals/evaluators/__init__.py,sha256=k_sTzRezIOUj7L2x3WC0_r8943jRo72uqb2vKplD5EU,660
+pydantic_evals/evaluators/_run_evaluator.py,sha256=YQgddaB4zxMnoK1yRhxpAJ_aQhwClExXcVrq2QPjyqs,2480
 pydantic_evals/evaluators/common.py,sha256=ZBrNTfPJoOpT4WNXTRGS0UcKhnuhfYJxjNzum-zHFk8,12064
 pydantic_evals/evaluators/context.py,sha256=8osQRCFW8Vekw7JiiOOCHHH3HOGdhDaUlr8i-twSetg,3870
-pydantic_evals/evaluators/evaluator.py,sha256=yOEKLOxElm7_4tLcq6_myXI0e4Ei9svZP9y5DTq4SYI,11147
+pydantic_evals/evaluators/evaluator.py,sha256=CvPLh4SITopiRoiZg_dkLuMUStjhxlRXxWJkBVOoIb4,11255
 pydantic_evals/evaluators/llm_as_a_judge.py,sha256=xQjaGuCRXZdlExacFyR4Y4kFmwBh2QxAfEyaed_aqvk,9615
+pydantic_evals/evaluators/spec.py,sha256=zsJ3BZQHTGX7crd3SyGsqVsLrEyoxHyVfJkYy3JEbsU,6693
 pydantic_evals/otel/__init__.py,sha256=i2p3vDrOW039N4XM-UkozDhCm0ZmE6ZSs1yV5t03vd0,117
 pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=vIDF9-6lDuNKZuSM5hN_R8VRK4jzmdfe1DgWdXwxVbc,6758
 pydantic_evals/otel/_context_subtree.py,sha256=Iazp4w3IIBMCrkqWL-hTG-2QG_-2X81p794WG9MAsGk,1175
 pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD4,268
 pydantic_evals/otel/span_tree.py,sha256=LV5Hsyo4riJzevHyBz8wxP82S-ry5zeKYi9bKWjGCS8,23057
-pydantic_evals/reporting/__init__.py,sha256=k_3tteqXGh0yGvgpN68gB0CjG9wzrakzDTve2GHend4,42148
+pydantic_evals/reporting/__init__.py,sha256=lRDlveMIOzYHaqqTIEiD2PQi6FSSejEMmY4aQbmIzTc,43233
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-0.4.11.dist-info/METADATA,sha256=FFoTkvtho_TVAbGn25UYu4CaxZLvXsGtJcBpVBqGoFM,7940
-pydantic_evals-0.4.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-pydantic_evals-0.4.11.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
-pydantic_evals-0.4.11.dist-info/RECORD,,
+pydantic_evals-0.5.1.dist-info/METADATA,sha256=ZuyB1AMzigg09oD0v49aiaHwRoIxZeNa-y6lvOzcty0,7938
+pydantic_evals-0.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-0.5.1.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-0.5.1.dist-info/RECORD,,