pydantic-evals 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- pydantic_evals/evaluators/llm_as_a_judge.py +45 -52
- {pydantic_evals-0.4.4.dist-info → pydantic_evals-0.4.5.dist-info}/METADATA +2 -2
- {pydantic_evals-0.4.4.dist-info → pydantic_evals-0.4.5.dist-info}/RECORD +5 -5
- {pydantic_evals-0.4.4.dist-info → pydantic_evals-0.4.5.dist-info}/WHEEL +0 -0
- {pydantic_evals-0.4.4.dist-info → pydantic_evals-0.4.5.dist-info}/licenses/LICENSE +0 -0

pydantic_evals/evaluators/llm_as_a_judge.py

@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from collections.abc import Sequence
 from textwrap import dedent
 from typing import Any
 
@@ -7,6 +8,7 @@ from pydantic import BaseModel, Field
 from pydantic_core import to_json
 
 from pydantic_ai import Agent, models
+from pydantic_ai.messages import MultiModalContentTypes, UserContent
 from pydantic_ai.settings import ModelSettings
 
 __all__ = (
@@ -62,16 +64,7 @@ async def judge_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(output=output, rubric=rubric)
     return (
         await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output
@@ -112,19 +105,8 @@ async def judge_input_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <Input>
-        {_stringify(inputs)}
-        </Input>
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(inputs=inputs, output=output, rubric=rubric)
+
     return (
         await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output
@@ -168,22 +150,7 @@ async def judge_input_output_expected(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <Input>
-        {_stringify(inputs)}
-        </Input>
-        <ExpectedOutput>
-        {_stringify(expected_output)}
-        </ExpectedOutput>
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(inputs=inputs, output=output, rubric=rubric, expected_output=expected_output)
 
     return (
         await _judge_input_output_expected_agent.run(
@@ -227,19 +194,7 @@ async def judge_output_expected(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <ExpectedOutput>
-        {_stringify(expected_output)}
-        </ExpectedOutput>
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(output=output, rubric=rubric, expected_output=expected_output)
     return (
         await _judge_output_expected_agent.run(
             user_prompt, model=model or _default_model, model_settings=model_settings
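
The four `judge_*` coroutines above all delegate prompt construction to the shared `_build_prompt` helper introduced in the hunk below. For orientation, a minimal sketch of how these functions are typically driven; the rubric, output text, and fallback model name are illustrative assumptions, and an OpenAI API key must be configured because the default judge model is 'openai:gpt-4o'.

```python
# Hedged sketch, not part of the diff: exercises judge_output(), whose prompt is
# now assembled by _build_prompt() instead of an inline f-string.
import asyncio

from pydantic_evals.evaluators.llm_as_a_judge import judge_output, set_default_judge_model


async def main() -> None:
    # Optionally replace the module-level default judge model mentioned in the
    # docstrings above; any known model name string is accepted.
    set_default_judge_model('openai:gpt-4o-mini')

    grading = await judge_output(
        output='The capital of France is Paris.',
        rubric='The answer names the correct capital city.',
    )
    print(grading.pass_, grading.score, grading.reason)


if __name__ == '__main__':
    asyncio.run(main())
```

`set_default_judge_model` only changes the fallback used when no `model` argument is passed; an explicit `model=` on any `judge_*` call still takes precedence, as the `model or _default_model` expressions above show.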
@@ -265,3 +220,41 @@ def _stringify(value: Any) -> str:
         return to_json(value).decode()
     except Exception:
         return repr(value)
+
+
+def _build_prompt(
+    output: Any,
+    rubric: str,
+    inputs: Any | None = None,
+    expected_output: Any | None = None,
+) -> str | Sequence[str | UserContent]:
+    """Build a prompt that includes input, output, and rubric."""
+    sections: list[str | UserContent] = []
+
+    if inputs is not None:
+        if isinstance(inputs, str):
+            sections.append(f'<Input>\n{inputs}\n</Input>')
+        else:
+            sections.append('<Input>\n')
+            if isinstance(inputs, Sequence):
+                for item in inputs:  # type: ignore
+                    if isinstance(item, (str, MultiModalContentTypes)):
+                        sections.append(item)
+                    else:
+                        sections.append(_stringify(item))
+            elif isinstance(inputs, MultiModalContentTypes):
+                sections.append(inputs)
+            else:
+                sections.append(_stringify(inputs))
+            sections.append('</Input>')
+
+    sections.append(f'<Output>\n{_stringify(output)}\n</Output>')
+    sections.append(f'<Rubric>\n{rubric}\n</Rubric>')
+
+    if expected_output is not None:
+        sections.append(f'<ExpectedOutput>\n{_stringify(expected_output)}\n</ExpectedOutput>')
+
+    if inputs is None or isinstance(inputs, str):
+        return '\n\n'.join(sections)  # type: ignore[arg-type]
+    else:
+        return sections
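
The practical effect of `_build_prompt` is visible in its return type: when `inputs` is a plain string (or absent) it still returns one joined string of `<Input>`, `<Output>`, and `<Rubric>` sections, but when `inputs` is a sequence or multimodal content it returns a list of text and `UserContent` parts, so images and other attachments reach the judge agent intact instead of being collapsed to JSON text. A hedged sketch of that multimodal path; the question, image URL, and rubric are placeholders rather than values from the package, and an OpenAI API key is assumed.

```python
# Hedged sketch, not part of the diff: passes a mixed text/image `inputs` value,
# which the new _build_prompt() forwards as a sequence of UserContent parts.
import asyncio

from pydantic_ai.messages import ImageUrl

from pydantic_evals.evaluators.llm_as_a_judge import judge_input_output


async def main() -> None:
    grading = await judge_input_output(
        inputs=[
            'What animal is shown in this picture?',
            ImageUrl(url='https://example.com/photo.jpg'),  # placeholder URL
        ],
        output='It is a cat.',
        rubric='The answer correctly identifies the animal in the image.',
    )
    print(grading.pass_, grading.reason)


if __name__ == '__main__':
    asyncio.run(main())
```

String-only callers are unaffected in substance: for them `_build_prompt` still produces a single tagged prompt string, though the `<ExpectedOutput>` section now follows `<Rubric>` and the indentation differs from the removed f-strings.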

{pydantic_evals-0.4.4.dist-info → pydantic_evals-0.4.5.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.4.4
+Version: 0.4.5
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.4.4
+Requires-Dist: pydantic-ai-slim==0.4.5
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4

{pydantic_evals-0.4.4.dist-info → pydantic_evals-0.4.5.dist-info}/RECORD

@@ -9,7 +9,7 @@ pydantic_evals/evaluators/_spec.py,sha256=Xi_FHwnmAZ1x2hoJFw4MBZuG0TilNKqMRW3P74
 pydantic_evals/evaluators/common.py,sha256=ZBrNTfPJoOpT4WNXTRGS0UcKhnuhfYJxjNzum-zHFk8,12064
 pydantic_evals/evaluators/context.py,sha256=8osQRCFW8Vekw7JiiOOCHHH3HOGdhDaUlr8i-twSetg,3870
 pydantic_evals/evaluators/evaluator.py,sha256=yOEKLOxElm7_4tLcq6_myXI0e4Ei9svZP9y5DTq4SYI,11147
-pydantic_evals/evaluators/llm_as_a_judge.py,sha256=
+pydantic_evals/evaluators/llm_as_a_judge.py,sha256=xQjaGuCRXZdlExacFyR4Y4kFmwBh2QxAfEyaed_aqvk,9615
 pydantic_evals/otel/__init__.py,sha256=i2p3vDrOW039N4XM-UkozDhCm0ZmE6ZSs1yV5t03vd0,117
 pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=vIDF9-6lDuNKZuSM5hN_R8VRK4jzmdfe1DgWdXwxVbc,6758
 pydantic_evals/otel/_context_subtree.py,sha256=Iazp4w3IIBMCrkqWL-hTG-2QG_-2X81p794WG9MAsGk,1175
@@ -17,7 +17,7 @@ pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD
 pydantic_evals/otel/span_tree.py,sha256=LV5Hsyo4riJzevHyBz8wxP82S-ry5zeKYi9bKWjGCS8,23057
 pydantic_evals/reporting/__init__.py,sha256=k_3tteqXGh0yGvgpN68gB0CjG9wzrakzDTve2GHend4,42148
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-0.4.
-pydantic_evals-0.4.
-pydantic_evals-0.4.
-pydantic_evals-0.4.
+pydantic_evals-0.4.5.dist-info/METADATA,sha256=6axvUhXnQNjpGjtEIsUv2FHDNQnOQeBMEZmaqzfoo_s,7938
+pydantic_evals-0.4.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-0.4.5.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-0.4.5.dist-info/RECORD,,

{pydantic_evals-0.4.4.dist-info → pydantic_evals-0.4.5.dist-info}/WHEEL

File without changes

{pydantic_evals-0.4.4.dist-info → pydantic_evals-0.4.5.dist-info}/licenses/LICENSE

File without changes