pydantic-evals 0.4.4__py3-none-any.whl → 0.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pydantic-evals was flagged as potentially problematic by the registry.
- pydantic_evals/dataset.py +7 -3
- pydantic_evals/evaluators/llm_as_a_judge.py +45 -52
- {pydantic_evals-0.4.4.dist-info → pydantic_evals-0.4.6.dist-info}/METADATA +2 -2
- {pydantic_evals-0.4.4.dist-info → pydantic_evals-0.4.6.dist-info}/RECORD +6 -6
- {pydantic_evals-0.4.4.dist-info → pydantic_evals-0.4.6.dist-info}/WHEEL +0 -0
- {pydantic_evals-0.4.4.dist-info → pydantic_evals-0.4.6.dist-info}/licenses/LICENSE +0 -0
pydantic_evals/dataset.py
CHANGED
@@ -43,7 +43,7 @@ from .evaluators.common import DEFAULT_EVALUATORS
 from .evaluators.context import EvaluatorContext
 from .otel import SpanTree
 from .otel._context_subtree import context_subtree
-from .reporting import EvaluationReport, ReportCase
+from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate
 
 if sys.version_info < (3, 11):
     from exceptiongroup import ExceptionGroup  # pragma: lax no cover
@@ -83,6 +83,10 @@ DEFAULT_SCHEMA_PATH_TEMPLATE = './{stem}_schema.json'
 _YAML_SCHEMA_LINE_PREFIX = '# yaml-language-server: $schema='
 
 
+_REPORT_CASES_ADAPTER = TypeAdapter(list[ReportCase])
+_REPORT_CASE_AGGREGATE_ADAPTER = TypeAdapter(ReportCaseAggregate)
+
+
 class _CaseModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid'):
     """Internal model for a case, used for serialization/deserialization."""
 
@@ -303,9 +307,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                 ),
             )
             # TODO(DavidM): This attribute will be too big in general; remove it once we can use child spans in details panel:
-            eval_span.set_attribute('cases', report.cases)
+            eval_span.set_attribute('cases', _REPORT_CASES_ADAPTER.dump_python(report.cases))
             # TODO(DavidM): Remove this 'averages' attribute once we compute it in the details panel
-            eval_span.set_attribute('averages', report.averages())
+            eval_span.set_attribute('averages', _REPORT_CASE_AGGREGATE_ADAPTER.dump_python(report.averages()))
         return report
 
     def evaluate_sync(
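The dataset.py change above stops attaching raw `ReportCase` models to the evaluation span and instead dumps them (and the `ReportCaseAggregate`) to plain Python structures via the new module-level `TypeAdapter`s, which serialize more cleanly as span attribute values. A minimal sketch of that pattern, where `CaseSummary` and its fields are hypothetical stand-ins rather than the real `ReportCase` schema:

```python
# Illustrative sketch of the TypeAdapter pattern used in dataset.py above.
# `CaseSummary` is a hypothetical model, not the real ReportCase.
from pydantic import BaseModel, TypeAdapter


class CaseSummary(BaseModel):
    name: str
    score: float


_CASES_ADAPTER = TypeAdapter(list[CaseSummary])

cases = [CaseSummary(name='capital_question', score=1.0)]

# dump_python converts the model instances into plain dicts/lists,
# which are easier to record as a span attribute than the models themselves.
plain = _CASES_ADAPTER.dump_python(cases)
print(plain)  # [{'name': 'capital_question', 'score': 1.0}]
```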

pydantic_evals/evaluators/llm_as_a_judge.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from collections.abc import Sequence
 from textwrap import dedent
 from typing import Any
 
@@ -7,6 +8,7 @@ from pydantic import BaseModel, Field
 from pydantic_core import to_json
 
 from pydantic_ai import Agent, models
+from pydantic_ai.messages import MultiModalContentTypes, UserContent
 from pydantic_ai.settings import ModelSettings
 
 __all__ = (
@@ -62,16 +64,7 @@ async def judge_output
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(output=output, rubric=rubric)
     return (
         await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output
@@ -112,19 +105,8 @@ async def judge_input_output
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <Input>
-        {_stringify(inputs)}
-        </Input>
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(inputs=inputs, output=output, rubric=rubric)
+
     return (
         await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output
@@ -168,22 +150,7 @@ async def judge_input_output_expected
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <Input>
-        {_stringify(inputs)}
-        </Input>
-        <ExpectedOutput>
-        {_stringify(expected_output)}
-        </ExpectedOutput>
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(inputs=inputs, output=output, rubric=rubric, expected_output=expected_output)
 
     return (
         await _judge_input_output_expected_agent.run(
@@ -227,19 +194,7 @@ async def judge_output_expected
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <ExpectedOutput>
-        {_stringify(expected_output)}
-        </ExpectedOutput>
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(output=output, rubric=rubric, expected_output=expected_output)
     return (
         await _judge_output_expected_agent.run(
             user_prompt, model=model or _default_model, model_settings=model_settings
@@ -265,3 +220,41 @@ def _stringify(value: Any) -> str:
         return to_json(value).decode()
     except Exception:
         return repr(value)
+
+
+def _build_prompt(
+    output: Any,
+    rubric: str,
+    inputs: Any | None = None,
+    expected_output: Any | None = None,
+) -> str | Sequence[str | UserContent]:
+    """Build a prompt that includes input, output, and rubric."""
+    sections: list[str | UserContent] = []
+
+    if inputs is not None:
+        if isinstance(inputs, str):
+            sections.append(f'<Input>\n{inputs}\n</Input>')
+        else:
+            sections.append('<Input>\n')
+            if isinstance(inputs, Sequence):
+                for item in inputs:  # type: ignore
+                    if isinstance(item, (str, MultiModalContentTypes)):
+                        sections.append(item)
+                    else:
+                        sections.append(_stringify(item))
+            elif isinstance(inputs, MultiModalContentTypes):
+                sections.append(inputs)
+            else:
+                sections.append(_stringify(inputs))
+            sections.append('</Input>')
+
+    sections.append(f'<Output>\n{_stringify(output)}\n</Output>')
+    sections.append(f'<Rubric>\n{rubric}\n</Rubric>')
+
+    if expected_output is not None:
+        sections.append(f'<ExpectedOutput>\n{_stringify(expected_output)}\n</ExpectedOutput>')
+
+    if inputs is None or isinstance(inputs, str):
+        return '\n\n'.join(sections)  # type: ignore[arg-type]
+    else:
+        return sections
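The new `_build_prompt` helper above replaces the per-function f-strings and, when `inputs` is not a plain string, returns a sequence of user-content parts so multimodal inputs (anything matching `MultiModalContentTypes`) can be judged directly; text-only callers still get a single joined prompt string. A rough usage sketch, assuming a configured judge model (the default is 'openai:gpt-4o') and using placeholder image bytes:

```python
# Hypothetical usage sketch: judge an answer about an image input.
# BinaryContent comes from pydantic_ai.messages; the bytes below are placeholders.
import asyncio

from pydantic_ai.messages import BinaryContent
from pydantic_evals.evaluators.llm_as_a_judge import judge_input_output


async def main():
    grading = await judge_input_output(
        inputs=[
            'Describe this image.',
            BinaryContent(data=b'...', media_type='image/png'),  # placeholder image bytes
        ],
        output='A photo of a cat sitting on a windowsill.',
        rubric='The description must mention the main subject of the image.',
    )
    print(grading)


if __name__ == '__main__':
    asyncio.run(main())
```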

{pydantic_evals-0.4.4.dist-info → pydantic_evals-0.4.6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.4.4
+Version: 0.4.6
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.4.4
+Requires-Dist: pydantic-ai-slim==0.4.6
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4

{pydantic_evals-0.4.4.dist-info → pydantic_evals-0.4.6.dist-info}/RECORD
CHANGED
@@ -1,6 +1,6 @@
 pydantic_evals/__init__.py,sha256=OKRbfhdc8UZPzrPJMZUQwvzIxLhXmEZxz1ZuD921fy4,839
 pydantic_evals/_utils.py,sha256=PfhmPbdQp-q90s568LuG45zDDXxgO13BEz8MQJK8qw4,2922
-pydantic_evals/dataset.py,sha256=
+pydantic_evals/dataset.py,sha256=yk6nHzzbEJqh9p3Y_MuBQyP0szp5oh-oFUDavi4N9D8,46699
 pydantic_evals/generation.py,sha256=Yd1rfbsDjjBBHDk-1KDu48hlITjM2-74rTnPBD_sqbA,3494
 pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pydantic_evals/evaluators/__init__.py,sha256=uj110viFiDvqrIcuDcWexk_uBgJqhKMGPHT2YvDb7LA,624
@@ -9,7 +9,7 @@ pydantic_evals/evaluators/_spec.py,sha256=Xi_FHwnmAZ1x2hoJFw4MBZuG0TilNKqMRW3P74
 pydantic_evals/evaluators/common.py,sha256=ZBrNTfPJoOpT4WNXTRGS0UcKhnuhfYJxjNzum-zHFk8,12064
 pydantic_evals/evaluators/context.py,sha256=8osQRCFW8Vekw7JiiOOCHHH3HOGdhDaUlr8i-twSetg,3870
 pydantic_evals/evaluators/evaluator.py,sha256=yOEKLOxElm7_4tLcq6_myXI0e4Ei9svZP9y5DTq4SYI,11147
-pydantic_evals/evaluators/llm_as_a_judge.py,sha256=
+pydantic_evals/evaluators/llm_as_a_judge.py,sha256=xQjaGuCRXZdlExacFyR4Y4kFmwBh2QxAfEyaed_aqvk,9615
 pydantic_evals/otel/__init__.py,sha256=i2p3vDrOW039N4XM-UkozDhCm0ZmE6ZSs1yV5t03vd0,117
 pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=vIDF9-6lDuNKZuSM5hN_R8VRK4jzmdfe1DgWdXwxVbc,6758
 pydantic_evals/otel/_context_subtree.py,sha256=Iazp4w3IIBMCrkqWL-hTG-2QG_-2X81p794WG9MAsGk,1175
@@ -17,7 +17,7 @@ pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD
 pydantic_evals/otel/span_tree.py,sha256=LV5Hsyo4riJzevHyBz8wxP82S-ry5zeKYi9bKWjGCS8,23057
 pydantic_evals/reporting/__init__.py,sha256=k_3tteqXGh0yGvgpN68gB0CjG9wzrakzDTve2GHend4,42148
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-0.4.
-pydantic_evals-0.4.
-pydantic_evals-0.4.
-pydantic_evals-0.4.
+pydantic_evals-0.4.6.dist-info/METADATA,sha256=zw53LmYNzSu9Zvg1oC3Xjhv4X7XOdBeIVE45MNRJvz4,7938
+pydantic_evals-0.4.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-0.4.6.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-0.4.6.dist-info/RECORD,,
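For reference, each RECORD entry pairs a path with a urlsafe-base64 SHA-256 digest (padding stripped) and a size in bytes, the standard wheel RECORD format. A small sketch of recomputing such an entry for a file on disk (the path in the usage comment is a placeholder):

```python
# Sketch: recompute a wheel RECORD-style entry for a file.
# RECORD digests are urlsafe base64 of the SHA-256 digest with '=' padding stripped.
import base64
import hashlib
from pathlib import Path


def record_entry(path: str) -> str:
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b'=').decode()
    return f'{path},sha256={digest},{len(data)}'


# Hypothetical usage against an installed file:
# print(record_entry('pydantic_evals/dataset.py'))
```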

{pydantic_evals-0.4.4.dist-info → pydantic_evals-0.4.6.dist-info}/WHEEL
File without changes

{pydantic_evals-0.4.4.dist-info → pydantic_evals-0.4.6.dist-info}/licenses/LICENSE
File without changes