pydantic-evals 0.2.12__py3-none-any.whl → 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydantic_evals/evaluators/common.py +22 -7
- pydantic_evals/evaluators/llm_as_a_judge.py +148 -3
- {pydantic_evals-0.2.12.dist-info → pydantic_evals-0.2.13.dist-info}/METADATA +2 -2
- {pydantic_evals-0.2.12.dist-info → pydantic_evals-0.2.13.dist-info}/RECORD +6 -6
- {pydantic_evals-0.2.12.dist-info → pydantic_evals-0.2.13.dist-info}/WHEEL +0 -0
- {pydantic_evals-0.2.12.dist-info → pydantic_evals-0.2.13.dist-info}/licenses/LICENSE +0 -0
pydantic_evals/evaluators/common.py

@@ -194,6 +194,7 @@ class LLMJudge(Evaluator[object, object, object]):
     rubric: str
     model: models.Model | models.KnownModelName | None = None
     include_input: bool = False
+    include_expected_output: bool = False
     model_settings: ModelSettings | None = None
     score: OutputConfig | Literal[False] = False
     assertion: OutputConfig | Literal[False] = field(default_factory=lambda: OutputConfig(include_reason=True))

@@ -203,15 +204,29 @@ class LLMJudge(Evaluator[object, object, object]):
         ctx: EvaluatorContext[object, object, object],
     ) -> EvaluatorOutput:
         if self.include_input:
-            from .llm_as_a_judge import judge_input_output
-
-            grading_output = await judge_input_output(
-                ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
-            )
+            if self.include_expected_output:
+                from .llm_as_a_judge import judge_input_output_expected
+
+                grading_output = await judge_input_output_expected(
+                    ctx.inputs, ctx.output, ctx.expected_output, self.rubric, self.model, self.model_settings
+                )
+            else:
+                from .llm_as_a_judge import judge_input_output
+
+                grading_output = await judge_input_output(
+                    ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
+                )
         else:
-            from .llm_as_a_judge import judge_output
+            if self.include_expected_output:
+                from .llm_as_a_judge import judge_output_expected
+
+                grading_output = await judge_output_expected(
+                    ctx.output, ctx.expected_output, self.rubric, self.model, self.model_settings
+                )
+            else:
+                from .llm_as_a_judge import judge_output
 
-            grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)
+                grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)
 
         output: dict[str, EvaluationScalar | EvaluationReason] = {}
         include_both = self.score is not False and self.assertion is not False
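In practice, the `common.py` change adds a single new `LLMJudge` option, `include_expected_output`, which forwards a case's `expected_output` to the judge model. A minimal sketch of how the flag might be used, assuming the documented `Case`/`Dataset` API; the task function and case data are hypothetical, and running it would invoke the default `openai:gpt-4o` judge:

```python
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge


async def answer_question(question: str) -> str:
    # Hypothetical task under evaluation; in practice this would call an agent or model.
    return 'Cerulean'


dataset = Dataset(
    cases=[Case(name='sky', inputs='What color is the sky?', expected_output='Blue')],
    evaluators=[
        LLMJudge(
            rubric='The output is consistent with the expected output',
            include_input=True,
            include_expected_output=True,  # new in 0.2.13
        )
    ],
)

report = dataset.evaluate_sync(answer_question)
report.print()  # summary table; assumes the reporting API from the pydantic_evals docs
```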
pydantic_evals/evaluators/llm_as_a_judge.py

@@ -9,7 +9,14 @@ from pydantic_core import to_json
 from pydantic_ai import Agent, models
 from pydantic_ai.settings import ModelSettings
 
-__all__ = (
+__all__ = (
+    'GradingOutput',
+    'judge_input_output',
+    'judge_input_output_expected',
+    'judge_output',
+    'judge_output_expected',
+    'set_default_judge_model',
+)
 
 
 _default_model: models.Model | models.KnownModelName = 'openai:gpt-4o'

@@ -55,7 +62,16 @@ async def judge_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt =
+    user_prompt = dedent(
+        f"""
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
     return (
         await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output

@@ -96,12 +112,141 @@ async def judge_input_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt =
+    user_prompt = dedent(
+        f"""
+        <Input>
+        {_stringify(inputs)}
+        </Input>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
     return (
         await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output
 
 
+_judge_input_output_expected_agent = Agent(
+    name='judge_input_output_expected',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input, expected output, and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <Input>What color is the sky?</Input>
+        <ExpectedOutput>Blue</ExpectedOutput>
+        <Output>Cerulean</Output>
+        <Rubric>The output is consistent with the expected output but doesn't have to match exactly</Rubric>
+        {"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
+
+        <Input>How many legs does a spider have?</Input>
+        <ExpectedOutput>8</ExpectedOutput>
+        <Output>Six</Output>
+        <Rubric>The output is factually consistent with the expected output</Rubric>
+        {"reason": "Spiders have 8 legs", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_input_output_expected(
+    inputs: Any,
+    output: Any,
+    expected_output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on the inputs and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <Input>
+        {_stringify(inputs)}
+        </Input>
+        <ExpectedOutput>
+        {_stringify(expected_output)}
+        </ExpectedOutput>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+
+    return (
+        await _judge_input_output_expected_agent.run(
+            user_prompt, model=model or _default_model, model_settings=model_settings
+        )
+    ).output
+
+
+_judge_output_expected_agent = Agent(
+    name='judge_output_expected',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided expected output and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <ExpectedOutput>Blue</ExpectedOutput>
+        <Output>Cerulean</Output>
+        <Rubric>The output should be a shade of the expected output color</Rubric>
+        {"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
+
+        <ExpectedOutput>8</ExpectedOutput>
+        <Output>Six</Output>
+        <Rubric>The output should be a number written in words which matches the number written in digits in the expected output</Rubric>
+        {"reason": "The output is 'Six' which is a different number than 8", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_output_expected(
+    output: Any,
+    expected_output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on the expected output, output, and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <ExpectedOutput>
+        {_stringify(expected_output)}
+        </ExpectedOutput>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+    return (
+        await _judge_output_expected_agent.run(
+            user_prompt, model=model or _default_model, model_settings=model_settings
+        )
+    ).output
+
+
 def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:  # pragma: no cover
     """Set the default model used for judging.
 
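The new `judge_output_expected` and `judge_input_output_expected` helpers are also exported for direct use. A minimal sketch based on the signatures above; the `GradingOutput` field names (`reason`, `pass_`, `score`) are assumed from the existing model and are not part of this diff, and the call uses the default `openai:gpt-4o` judge unless `model` is passed:

```python
import asyncio

from pydantic_evals.evaluators.llm_as_a_judge import judge_output_expected


async def main() -> None:
    # Grade a single output against an expected output and a rubric.
    grading = await judge_output_expected(
        output='Six',
        expected_output=8,
        rubric='The output is factually consistent with the expected output',
    )
    # Field names assumed from the existing GradingOutput model, not shown in this diff.
    print(grading.pass_, grading.score, grading.reason)


if __name__ == '__main__':
    asyncio.run(main())
```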
{pydantic_evals-0.2.12.dist-info → pydantic_evals-0.2.13.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.2.12
+Version: 0.2.13
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai

@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.2.12
+Requires-Dist: pydantic-ai-slim==0.2.13
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
{pydantic_evals-0.2.12.dist-info → pydantic_evals-0.2.13.dist-info}/RECORD

@@ -6,10 +6,10 @@ pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pydantic_evals/evaluators/__init__.py,sha256=uj110viFiDvqrIcuDcWexk_uBgJqhKMGPHT2YvDb7LA,624
 pydantic_evals/evaluators/_run_evaluator.py,sha256=Dsnqxno7CrcKWYcnkLuwvPKWQGDRBmbBTwwstcmc0ak,2448
 pydantic_evals/evaluators/_spec.py,sha256=Xi_FHwnmAZ1x2hoJFw4MBZuG0TilNKqMRW3P74se32I,7080
-pydantic_evals/evaluators/common.py,sha256
+pydantic_evals/evaluators/common.py,sha256=ZBrNTfPJoOpT4WNXTRGS0UcKhnuhfYJxjNzum-zHFk8,12064
 pydantic_evals/evaluators/context.py,sha256=8osQRCFW8Vekw7JiiOOCHHH3HOGdhDaUlr8i-twSetg,3870
 pydantic_evals/evaluators/evaluator.py,sha256=yOEKLOxElm7_4tLcq6_myXI0e4Ei9svZP9y5DTq4SYI,11147
-pydantic_evals/evaluators/llm_as_a_judge.py,sha256=
+pydantic_evals/evaluators/llm_as_a_judge.py,sha256=raty91NWdu_FEt4ze_ugQHCouj1o72gYe3abJBtMqlU,8793
 pydantic_evals/otel/__init__.py,sha256=i2p3vDrOW039N4XM-UkozDhCm0ZmE6ZSs1yV5t03vd0,117
 pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=vIDF9-6lDuNKZuSM5hN_R8VRK4jzmdfe1DgWdXwxVbc,6758
 pydantic_evals/otel/_context_subtree.py,sha256=Iazp4w3IIBMCrkqWL-hTG-2QG_-2X81p794WG9MAsGk,1175

@@ -17,7 +17,7 @@ pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD
 pydantic_evals/otel/span_tree.py,sha256=LV5Hsyo4riJzevHyBz8wxP82S-ry5zeKYi9bKWjGCS8,23057
 pydantic_evals/reporting/__init__.py,sha256=tknRGM2fm8EUENxbq4K5duHZ_DgNzrVWhpGHFkoQ9zo,41677
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-0.2.
-pydantic_evals-0.2.
-pydantic_evals-0.2.
-pydantic_evals-0.2.
+pydantic_evals-0.2.13.dist-info/METADATA,sha256=Q3gMew8mckZeKRn7FTu-HENMFEJHknwoE4dcXGH5fHU,7787
+pydantic_evals-0.2.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-0.2.13.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-0.2.13.dist-info/RECORD,,

{pydantic_evals-0.2.12.dist-info → pydantic_evals-0.2.13.dist-info}/WHEEL
File without changes

{pydantic_evals-0.2.12.dist-info → pydantic_evals-0.2.13.dist-info}/licenses/LICENSE
File without changes