pydantic-evals 0.4.11__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- pydantic_evals/dataset.py +1 -1
- pydantic_evals/evaluators/__init__.py +3 -2
- pydantic_evals/evaluators/_run_evaluator.py +3 -1
- pydantic_evals/evaluators/evaluator.py +13 -8
- pydantic_evals/evaluators/{_spec.py → spec.py} +0 -6
- pydantic_evals/reporting/__init__.py +38 -12
- {pydantic_evals-0.4.11.dist-info → pydantic_evals-0.5.0.dist-info}/METADATA +2 -2
- {pydantic_evals-0.4.11.dist-info → pydantic_evals-0.5.0.dist-info}/RECORD +10 -10
- {pydantic_evals-0.4.11.dist-info → pydantic_evals-0.5.0.dist-info}/WHEEL +0 -0
- {pydantic_evals-0.4.11.dist-info → pydantic_evals-0.5.0.dist-info}/licenses/LICENSE +0 -0
pydantic_evals/dataset.py
CHANGED
@@ -38,9 +38,9 @@ from pydantic_evals._utils import get_event_loop
 from ._utils import get_unwrapped_function_name, task_group_gather
 from .evaluators import EvaluationResult, Evaluator
 from .evaluators._run_evaluator import run_evaluator
-from .evaluators._spec import EvaluatorSpec
 from .evaluators.common import DEFAULT_EVALUATORS
 from .evaluators.context import EvaluatorContext
+from .evaluators.spec import EvaluatorSpec
 from .otel import SpanTree
 from .otel._context_subtree import context_subtree
 from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate
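
The only change to dataset.py is the import path: EvaluatorSpec now comes from the public spec module rather than the private _spec module. A minimal sketch of the new module path, assuming pydantic-evals 0.5.0 is installed; downstream code importing from the old private module would need the same update:

    from pydantic_evals.evaluators.spec import EvaluatorSpec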
pydantic_evals/evaluators/__init__.py
CHANGED
@@ -10,7 +10,7 @@ from .common import (
     Python,
 )
 from .context import EvaluatorContext
-from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorOutput
+from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorOutput, EvaluatorSpec

 __all__ = (
     # common
@@ -27,7 +27,8 @@ __all__ = (
     'EvaluatorContext',
     # evaluator
     'Evaluator',
-    'EvaluationReason',
     'EvaluatorOutput',
+    'EvaluatorSpec',
+    'EvaluationReason',
     'EvaluationResult',
 )
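
With this change EvaluatorSpec is part of the package's public surface, re-exported alongside the other evaluator types. A minimal sketch of the public import, using only names the diff shows being exported:

    from pydantic_evals.evaluators import Evaluator, EvaluatorSpec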
pydantic_evals/evaluators/_run_evaluator.py
CHANGED
@@ -48,7 +48,9 @@ async def run_evaluator(
     for name, result in results.items():
         if not isinstance(result, EvaluationReason):
             result = EvaluationReason(value=result)
-        details.append(
+        details.append(
+            EvaluationResult(name=name, value=result.value, reason=result.reason, source=evaluator.as_spec())
+        )

     return details

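
Each per-evaluator entry is now recorded with source=evaluator.as_spec(), so the result carries the evaluator's serializable spec rather than the evaluator instance. A hedged sketch of what one such entry looks like; the evaluator name and values here are made up for illustration:

    from pydantic_evals.evaluators import EvaluationResult, EvaluatorSpec

    # Mirrors what run_evaluator now builds for each named result (illustrative values).
    entry = EvaluationResult(
        name='correct',
        value=True,
        reason='output matched the expected output',
        source=EvaluatorSpec(name='MyEvaluator', arguments=None),
    )
    print(entry.source.name)  # the spec's name, not an evaluator instance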
pydantic_evals/evaluators/evaluator.py
CHANGED
@@ -17,8 +17,8 @@ from typing_extensions import TypeVar, deprecated
 from pydantic_ai import _utils

 from .._utils import get_event_loop
-from ._spec import EvaluatorSpec
 from .context import EvaluatorContext
+from .spec import EvaluatorSpec

 __all__ = (
     'EvaluationReason',
@@ -26,6 +26,7 @@ __all__ = (
     'EvaluationScalar',
     'Evaluator',
     'EvaluatorOutput',
+    'EvaluatorSpec',
 )

 EvaluationScalar = Union[bool, int, float, str]
@@ -71,13 +72,13 @@ class EvaluationResult(Generic[EvaluationScalarT]):
         name: The name of the evaluation.
         value: The scalar result of the evaluation.
         reason: An optional explanation of the evaluation result.
-        source: The evaluator that produced this result.
+        source: The spec of the evaluator that produced this result.
     """

     name: str
     value: EvaluationScalarT
     reason: str | None
-    source:
+    source: EvaluatorSpec

     def downcast(self, *value_types: type[T]) -> EvaluationResult[T] | None:
         """Attempt to downcast this result to a more specific type.
@@ -246,6 +247,13 @@ class Evaluator(Generic[InputsT, OutputT, MetadataT], metaclass=_StrictABCMeta):
         Returns:
             A JSON-serializable representation of this evaluator as an EvaluatorSpec.
         """
+        return to_jsonable_python(
+            self.as_spec(),
+            context=info.context,
+            serialize_unknown=True,
+        )
+
+    def as_spec(self) -> EvaluatorSpec:
         raw_arguments = self.build_serialization_arguments()

         arguments: None | tuple[Any,] | dict[str, Any]
@@ -255,11 +263,8 @@ class Evaluator(Generic[InputsT, OutputT, MetadataT], metaclass=_StrictABCMeta):
             arguments = (next(iter(raw_arguments.values())),)
         else:
             arguments = raw_arguments
-
-
-            context=info.context,
-            serialize_unknown=True,
-        )
+
+        return EvaluatorSpec(name=self.get_serialization_name(), arguments=arguments)

     def build_serialization_arguments(self) -> dict[str, Any]:
         """Build the arguments for serialization.
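
Evaluator.as_spec() is new: serialization now goes through an explicit EvaluatorSpec instead of building the JSON-able form inline. A hedged sketch of calling it on a hypothetical custom evaluator; the ExactMatch class, its dataclass form, and the EvaluatorContext attributes used in evaluate() are assumptions for illustration, not part of this diff:

    from dataclasses import dataclass

    from pydantic_evals.evaluators import Evaluator, EvaluatorContext

    @dataclass
    class ExactMatch(Evaluator):
        # Hypothetical evaluator: passes when the output equals the expected output.
        def evaluate(self, ctx: EvaluatorContext) -> bool:
            return ctx.output == ctx.expected_output

    spec = ExactMatch().as_spec()
    # spec.name is the serialization name (usually just the class name);
    # spec.arguments is None here because __init__ takes no arguments.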
pydantic_evals/evaluators/{_spec.py → spec.py}
RENAMED
@@ -30,12 +30,6 @@ class EvaluatorSpec(BaseModel):
     * `'MyEvaluator'` - Just the (string) name of the Evaluator subclass is used if its `__init__` takes no arguments
     * `{'MyEvaluator': first_arg}` - A single argument is passed as the first positional argument to `MyEvaluator.__init__`
     * `{'MyEvaluator': {k1: v1, k2: v2}}` - Multiple kwargs are passed to `MyEvaluator.__init__`
-
-    Args:
-        name: The serialization name of the evaluator class returned by `EvaluatorClass.get_serialization_name()`;
-            this is usually just the class name itself.
-        arguments: The arguments to pass to the evaluator's constructor. Can be None (for no arguments),
-            a tuple (for a single positional argument), or a dict (for multiple keyword arguments).
     """

     name: str
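
Only the duplicated Args: section of the docstring was dropped; the model's fields are unchanged. A hedged sketch of the three serialized forms listed in the docstring, expressed as direct EvaluatorSpec constructions with placeholder names and values:

    from pydantic_evals.evaluators import EvaluatorSpec

    # No __init__ arguments -> serializes as just the class name, e.g. 'MyEvaluator'
    EvaluatorSpec(name='MyEvaluator', arguments=None)

    # A single positional argument -> serializes as {'MyEvaluator': first_arg}
    EvaluatorSpec(name='MyEvaluator', arguments=('first_arg',))

    # Multiple keyword arguments -> serializes as {'MyEvaluator': {'k1': 'v1', 'k2': 'v2'}}
    EvaluatorSpec(name='MyEvaluator', arguments={'k1': 'v1', 'k2': 'v2'})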
pydantic_evals/reporting/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from collections import defaultdict
 from collections.abc import Mapping
 from dataclasses import dataclass
 from io import StringIO
-from typing import Any, Callable, Generic, Literal, Protocol
+from typing import Any, Callable, Generic, Literal, Protocol, cast

 from pydantic import BaseModel, TypeAdapter
 from rich.console import Console
@@ -168,6 +168,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         self,
         width: int | None = None,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
+        *,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -183,6 +184,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         label_configs: dict[str, RenderValueConfig] | None = None,
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
+        include_reasons: bool = False,
     ):  # pragma: no cover
         """Print this report to the console, optionally comparing it to a baseline report.

@@ -205,12 +207,14 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             label_configs=label_configs,
             metric_configs=metric_configs,
             duration_config=duration_config,
+            include_reasons=include_reasons,
         )
         Console(width=width).print(table)

     def console_table(
         self,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
+        *,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -226,6 +230,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         label_configs: dict[str, RenderValueConfig] | None = None,
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
+        include_reasons: bool = False,
     ) -> Table:
         """Return a table containing the data from this report, or the diff between this report and a baseline report.

@@ -247,6 +252,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             label_configs=label_configs or {},
             metric_configs=metric_configs or {},
             duration_config=duration_config or _DEFAULT_DURATION_CONFIG,
+            include_reasons=include_reasons,
         )
         if baseline is None:
             return renderer.build_table(self)
@@ -529,15 +535,16 @@ class ReportCaseRenderer:
     include_labels: bool
     include_metrics: bool
     include_assertions: bool
+    include_reasons: bool
     include_durations: bool
     include_total_duration: bool

     input_renderer: _ValueRenderer
     metadata_renderer: _ValueRenderer
     output_renderer: _ValueRenderer
-    score_renderers:
-    label_renderers:
-    metric_renderers:
+    score_renderers: Mapping[str, _NumberRenderer]
+    label_renderers: Mapping[str, _ValueRenderer]
+    metric_renderers: Mapping[str, _NumberRenderer]
     duration_renderer: _NumberRenderer

     def build_base_table(self, title: str) -> Table:
@@ -581,10 +588,10 @@ class ReportCaseRenderer:
         row.append(self.output_renderer.render_value(None, case.output) or EMPTY_CELL_STR)

         if self.include_scores:
-            row.append(self._render_dict({k: v
+            row.append(self._render_dict({k: v for k, v in case.scores.items()}, self.score_renderers))

         if self.include_labels:
-            row.append(self._render_dict({k: v
+            row.append(self._render_dict({k: v for k, v in case.labels.items()}, self.label_renderers))

         if self.include_metrics:
             row.append(self._render_dict(case.metrics, self.metric_renderers))
@@ -669,7 +676,11 @@ class ReportCaseRenderer:
             row.append(scores_diff)

         if self.include_labels:  # pragma: no branch
-            labels_diff = self._render_dicts_diff(
+            labels_diff = self._render_dicts_diff(
+                {k: v.value for k, v in baseline.labels.items()},
+                {k: v.value for k, v in new_case.labels.items()},
+                self.label_renderers,
+            )
             row.append(labels_diff)

         if self.include_metrics:  # pragma: no branch
@@ -779,26 +790,36 @@ class ReportCaseRenderer:
             diff_lines.append(rendered)
         return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR

-    @staticmethod
     def _render_dict(
-
+        self,
+        case_dict: Mapping[str, EvaluationResult[T] | T],
         renderers: Mapping[str, _AbstractRenderer[T]],
         *,
         include_names: bool = True,
     ) -> str:
         diff_lines: list[str] = []
         for key, val in case_dict.items():
-
+            value = cast(EvaluationResult[T], val).value if isinstance(val, EvaluationResult) else val
+            rendered = renderers[key].render_value(key if include_names else None, value)
+            if self.include_reasons and isinstance(val, EvaluationResult) and (reason := val.reason):
+                rendered += f'\n Reason: {reason}\n'
             diff_lines.append(rendered)
         return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR

-    @staticmethod
     def _render_assertions(
+        self,
         assertions: list[EvaluationResult[bool]],
     ) -> str:
         if not assertions:
             return EMPTY_CELL_STR
-
+        lines: list[str] = []
+        for a in assertions:
+            line = '[green]✔[/]' if a.value else '[red]✗[/]'
+            if self.include_reasons:
+                line = f'{a.name}: {line}\n'
+                line = f'{line} Reason: {a.reason}\n\n' if a.reason else line
+            lines.append(line)
+        return ''.join(lines)

     @staticmethod
     def _render_aggregate_assertions(
@@ -859,6 +880,10 @@ class EvaluationRenderer:
     metric_configs: dict[str, RenderNumberConfig]
     duration_config: RenderNumberConfig

+    # TODO: Make this class kw-only so we can reorder the kwargs
+    # Data to include
+    include_reasons: bool  # only applies to reports, not to diffs
+
     def include_scores(self, report: EvaluationReport, baseline: EvaluationReport | None = None):
         return any(case.scores for case in self._all_cases(report, baseline))

@@ -905,6 +930,7 @@ class EvaluationRenderer:
             include_labels=self.include_labels(report, baseline),
             include_metrics=self.include_metrics(report, baseline),
             include_assertions=self.include_assertions(report, baseline),
+            include_reasons=self.include_reasons,
             include_durations=self.include_durations,
             include_total_duration=self.include_total_duration,
             input_renderer=input_renderer,
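
The user-visible change here is the new include_reasons flag on EvaluationReport.print and EvaluationReport.console_table, with the include_*/config parameters now keyword-only. A hedged usage sketch, assuming an EvaluationReport produced elsewhere:

    from pydantic_evals.reporting import EvaluationReport

    def print_with_reasons(report: EvaluationReport) -> None:
        # include_reasons=True appends each evaluator's reason text beneath its
        # score/label/assertion cell in the rendered table (reports only, not diffs).
        report.print(width=120, include_reasons=True)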
{pydantic_evals-0.4.11.dist-info → pydantic_evals-0.5.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.4.11
+Version: 0.5.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.
+Requires-Dist: pydantic-ai-slim==0.5.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
{pydantic_evals-0.4.11.dist-info → pydantic_evals-0.5.0.dist-info}/RECORD
CHANGED
@@ -1,23 +1,23 @@
 pydantic_evals/__init__.py,sha256=OKRbfhdc8UZPzrPJMZUQwvzIxLhXmEZxz1ZuD921fy4,839
 pydantic_evals/_utils.py,sha256=PfhmPbdQp-q90s568LuG45zDDXxgO13BEz8MQJK8qw4,2922
-pydantic_evals/dataset.py,sha256=
+pydantic_evals/dataset.py,sha256=uOH7vW8YbF5NYx9jtcXeYLountSwl93DnO5qBmpyRVw,46698
 pydantic_evals/generation.py,sha256=Yd1rfbsDjjBBHDk-1KDu48hlITjM2-74rTnPBD_sqbA,3494
 pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydantic_evals/evaluators/__init__.py,sha256=
-pydantic_evals/evaluators/_run_evaluator.py,sha256=
-pydantic_evals/evaluators/_spec.py,sha256=Xi_FHwnmAZ1x2hoJFw4MBZuG0TilNKqMRW3P74se32I,7080
+pydantic_evals/evaluators/__init__.py,sha256=k_sTzRezIOUj7L2x3WC0_r8943jRo72uqb2vKplD5EU,660
+pydantic_evals/evaluators/_run_evaluator.py,sha256=YQgddaB4zxMnoK1yRhxpAJ_aQhwClExXcVrq2QPjyqs,2480
 pydantic_evals/evaluators/common.py,sha256=ZBrNTfPJoOpT4WNXTRGS0UcKhnuhfYJxjNzum-zHFk8,12064
 pydantic_evals/evaluators/context.py,sha256=8osQRCFW8Vekw7JiiOOCHHH3HOGdhDaUlr8i-twSetg,3870
-pydantic_evals/evaluators/evaluator.py,sha256=
+pydantic_evals/evaluators/evaluator.py,sha256=CvPLh4SITopiRoiZg_dkLuMUStjhxlRXxWJkBVOoIb4,11255
 pydantic_evals/evaluators/llm_as_a_judge.py,sha256=xQjaGuCRXZdlExacFyR4Y4kFmwBh2QxAfEyaed_aqvk,9615
+pydantic_evals/evaluators/spec.py,sha256=zsJ3BZQHTGX7crd3SyGsqVsLrEyoxHyVfJkYy3JEbsU,6693
 pydantic_evals/otel/__init__.py,sha256=i2p3vDrOW039N4XM-UkozDhCm0ZmE6ZSs1yV5t03vd0,117
 pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=vIDF9-6lDuNKZuSM5hN_R8VRK4jzmdfe1DgWdXwxVbc,6758
 pydantic_evals/otel/_context_subtree.py,sha256=Iazp4w3IIBMCrkqWL-hTG-2QG_-2X81p794WG9MAsGk,1175
 pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD4,268
 pydantic_evals/otel/span_tree.py,sha256=LV5Hsyo4riJzevHyBz8wxP82S-ry5zeKYi9bKWjGCS8,23057
-pydantic_evals/reporting/__init__.py,sha256=
+pydantic_evals/reporting/__init__.py,sha256=lRDlveMIOzYHaqqTIEiD2PQi6FSSejEMmY4aQbmIzTc,43233
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-0.
-pydantic_evals-0.
-pydantic_evals-0.
-pydantic_evals-0.
+pydantic_evals-0.5.0.dist-info/METADATA,sha256=ea5Ph75O8epHLQjRn7mKc9UvOqYqy2f8GUE_Y-ztTlY,7938
+pydantic_evals-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-0.5.0.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-0.5.0.dist-info/RECORD,,
{pydantic_evals-0.4.11.dist-info → pydantic_evals-0.5.0.dist-info}/WHEEL
File without changes
{pydantic_evals-0.4.11.dist-info → pydantic_evals-0.5.0.dist-info}/licenses/LICENSE
File without changes