pydantic-evals 0.2.12__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
--- a/pydantic_evals/evaluators/common.py
+++ b/pydantic_evals/evaluators/common.py
@@ -194,6 +194,7 @@ class LLMJudge(Evaluator[object, object, object]):
     rubric: str
     model: models.Model | models.KnownModelName | None = None
     include_input: bool = False
+    include_expected_output: bool = False
     model_settings: ModelSettings | None = None
     score: OutputConfig | Literal[False] = False
     assertion: OutputConfig | Literal[False] = field(default_factory=lambda: OutputConfig(include_reason=True))
@@ -203,15 +204,29 @@ class LLMJudge(Evaluator[object, object, object]):
         ctx: EvaluatorContext[object, object, object],
     ) -> EvaluatorOutput:
         if self.include_input:
-            from .llm_as_a_judge import judge_input_output
-
-            grading_output = await judge_input_output(
-                ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
-            )
+            if self.include_expected_output:
+                from .llm_as_a_judge import judge_input_output_expected
+
+                grading_output = await judge_input_output_expected(
+                    ctx.inputs, ctx.output, ctx.expected_output, self.rubric, self.model, self.model_settings
+                )
+            else:
+                from .llm_as_a_judge import judge_input_output
+
+                grading_output = await judge_input_output(
+                    ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
+                )
         else:
-            from .llm_as_a_judge import judge_output
+            if self.include_expected_output:
+                from .llm_as_a_judge import judge_output_expected
+
+                grading_output = await judge_output_expected(
+                    ctx.output, ctx.expected_output, self.rubric, self.model, self.model_settings
+                )
+            else:
+                from .llm_as_a_judge import judge_output
 
-            grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)
+                grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)
 
         output: dict[str, EvaluationScalar | EvaluationReason] = {}
         include_both = self.score is not False and self.assertion is not False
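
The hunks above give LLMJudge an include_expected_output flag and make it dispatch to one of four judge helpers depending on which of include_input / include_expected_output are set. A minimal usage sketch (the task function and rubric are illustrative, not taken from the diff; running it requires credentials for the default 'openai:gpt-4o' judge model):

    from pydantic_evals import Case, Dataset
    from pydantic_evals.evaluators import LLMJudge

    async def answer(question: str) -> str:
        # Hypothetical task under evaluation.
        return 'Cerulean'

    dataset = Dataset(
        cases=[Case(inputs='What color is the sky?', expected_output='Blue')],
        evaluators=[
            LLMJudge(
                rubric='The output is consistent with the expected output.',
                include_input=True,
                include_expected_output=True,  # new flag: adds <ExpectedOutput> to the judge prompt
            )
        ],
    )
    report = dataset.evaluate_sync(answer)
    report.print()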
--- a/pydantic_evals/evaluators/llm_as_a_judge.py
+++ b/pydantic_evals/evaluators/llm_as_a_judge.py
@@ -9,7 +9,14 @@ from pydantic_core import to_json
 from pydantic_ai import Agent, models
 from pydantic_ai.settings import ModelSettings
 
-__all__ = ('GradingOutput', 'judge_input_output', 'judge_output', 'set_default_judge_model')
+__all__ = (
+    'GradingOutput',
+    'judge_input_output',
+    'judge_input_output_expected',
+    'judge_output',
+    'judge_output_expected',
+    'set_default_judge_model',
+)
 
 
 _default_model: models.Model | models.KnownModelName = 'openai:gpt-4o'
@@ -55,7 +62,16 @@ async def judge_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = f'<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
+    user_prompt = dedent(
+        f"""
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
     return (
         await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output
@@ -96,12 +112,141 @@ async def judge_input_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = f'<Input>\n{_stringify(inputs)}\n</Input>\n<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
+    user_prompt = dedent(
+        f"""
+        <Input>
+        {_stringify(inputs)}
+        </Input>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
     return (
         await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output
 
 
+_judge_input_output_expected_agent = Agent(
+    name='judge_input_output_expected',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input, expected output, and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <Input>What color is the sky?</Input>
+        <ExpectedOutput>Blue</ExpectedOutput>
+        <Output>Cerulean</Output>
+        <Rubric>The output is consistent with the expected output but doesn't have to match exactly</Rubric>
+        {"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
+
+        <Input>How many legs does a spider have?</Input>
+        <ExpectedOutput>8</ExpectedOutput>
+        <Output>Six</Output>
+        <Rubric>The output is factually consistent with the expected output</Rubric>
+        {"reason": "Spiders have 8 legs", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_input_output_expected(
+    inputs: Any,
+    output: Any,
+    expected_output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on the inputs and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <Input>
+        {_stringify(inputs)}
+        </Input>
+        <ExpectedOutput>
+        {_stringify(expected_output)}
+        </ExpectedOutput>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+
+    return (
+        await _judge_input_output_expected_agent.run(
+            user_prompt, model=model or _default_model, model_settings=model_settings
+        )
+    ).output
+
+
+_judge_output_expected_agent = Agent(
+    name='judge_output_expected',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided expected output and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <ExpectedOutput>Blue</ExpectedOutput>
+        <Output>Cerulean</Output>
+        <Rubric>The output should be a shade of the expected output color</Rubric>
+        {"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
+
+        <ExpectedOutput>8</ExpectedOutput>
+        <Output>Six</Output>
+        <Rubric>The output should be a number written in words which matches the number written in digits in the expected output</Rubric>
+        {"reason": "The output is 'Six' which is a different number than 8", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_output_expected(
+    output: Any,
+    expected_output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on the expected output, output, and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <ExpectedOutput>
+        {_stringify(expected_output)}
+        </ExpectedOutput>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+    return (
+        await _judge_output_expected_agent.run(
+            user_prompt, model=model or _default_model, model_settings=model_settings
+        )
+    ).output
+
+
 def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:  # pragma: no cover
     """Set the default model used for judging.
 
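
Because the expanded __all__ exports the new helpers, they can also be called directly, outside of the LLMJudge evaluator. A minimal sketch, assuming credentials are configured for the default 'openai:gpt-4o' judge model (the values passed are illustrative):

    import asyncio

    from pydantic_evals.evaluators.llm_as_a_judge import judge_output_expected

    async def main() -> None:
        grading = await judge_output_expected(
            output='Cerulean',
            expected_output='Blue',
            rubric='The output should be a shade of the expected output color',
        )
        # grading is a GradingOutput carrying the judge's reason, pass/fail verdict, and score.
        print(grading)

    asyncio.run(main())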
--- a/pydantic_evals-0.2.12.dist-info/METADATA
+++ b/pydantic_evals-0.2.14.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.2.12
+Version: 0.2.14
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.2.12
+Requires-Dist: pydantic-ai-slim==0.2.14
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
--- a/pydantic_evals-0.2.12.dist-info/RECORD
+++ b/pydantic_evals-0.2.14.dist-info/RECORD
@@ -6,10 +6,10 @@ pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pydantic_evals/evaluators/__init__.py,sha256=uj110viFiDvqrIcuDcWexk_uBgJqhKMGPHT2YvDb7LA,624
 pydantic_evals/evaluators/_run_evaluator.py,sha256=Dsnqxno7CrcKWYcnkLuwvPKWQGDRBmbBTwwstcmc0ak,2448
 pydantic_evals/evaluators/_spec.py,sha256=Xi_FHwnmAZ1x2hoJFw4MBZuG0TilNKqMRW3P74se32I,7080
-pydantic_evals/evaluators/common.py,sha256=-c1g9HMnUDcuswzHDkkFZbjWzbV909lPmusU2TybjI0,11358
+pydantic_evals/evaluators/common.py,sha256=ZBrNTfPJoOpT4WNXTRGS0UcKhnuhfYJxjNzum-zHFk8,12064
 pydantic_evals/evaluators/context.py,sha256=8osQRCFW8Vekw7JiiOOCHHH3HOGdhDaUlr8i-twSetg,3870
 pydantic_evals/evaluators/evaluator.py,sha256=yOEKLOxElm7_4tLcq6_myXI0e4Ei9svZP9y5DTq4SYI,11147
-pydantic_evals/evaluators/llm_as_a_judge.py,sha256=GZDFKagp4X2u0moXvIRnNizB_-7d_igLJLNBxRjmtr0,4470
+pydantic_evals/evaluators/llm_as_a_judge.py,sha256=raty91NWdu_FEt4ze_ugQHCouj1o72gYe3abJBtMqlU,8793
 pydantic_evals/otel/__init__.py,sha256=i2p3vDrOW039N4XM-UkozDhCm0ZmE6ZSs1yV5t03vd0,117
 pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=vIDF9-6lDuNKZuSM5hN_R8VRK4jzmdfe1DgWdXwxVbc,6758
 pydantic_evals/otel/_context_subtree.py,sha256=Iazp4w3IIBMCrkqWL-hTG-2QG_-2X81p794WG9MAsGk,1175
@@ -17,7 +17,7 @@ pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD
 pydantic_evals/otel/span_tree.py,sha256=LV5Hsyo4riJzevHyBz8wxP82S-ry5zeKYi9bKWjGCS8,23057
 pydantic_evals/reporting/__init__.py,sha256=tknRGM2fm8EUENxbq4K5duHZ_DgNzrVWhpGHFkoQ9zo,41677
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-0.2.12.dist-info/METADATA,sha256=O_9pav94phGfodGNWhD0Oz5K6mfiJ7svvVkypIy2-ts,7787
-pydantic_evals-0.2.12.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-pydantic_evals-0.2.12.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
-pydantic_evals-0.2.12.dist-info/RECORD,,
+pydantic_evals-0.2.14.dist-info/METADATA,sha256=NLz39IXiT9ZpWzEEMePg8pkTslORYQaZIlYqDsZFgaI,7787
+pydantic_evals-0.2.14.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-0.2.14.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-0.2.14.dist-info/RECORD,,