pydantic-evals 0.2.11__tar.gz → 0.2.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/PKG-INFO +2 -2
  2. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/evaluators/common.py +22 -7
  3. pydantic_evals-0.2.13/pydantic_evals/evaluators/llm_as_a_judge.py +267 -0
  4. pydantic_evals-0.2.11/pydantic_evals/evaluators/llm_as_a_judge.py +0 -122
  5. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/.gitignore +0 -0
  6. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/LICENSE +0 -0
  7. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/README.md +0 -0
  8. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/__init__.py +0 -0
  9. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/_utils.py +0 -0
  10. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/dataset.py +0 -0
  11. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/evaluators/__init__.py +0 -0
  12. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
  13. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/evaluators/_spec.py +0 -0
  14. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/evaluators/context.py +0 -0
  15. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/evaluators/evaluator.py +0 -0
  16. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/generation.py +0 -0
  17. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/otel/__init__.py +0 -0
  18. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
  19. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/otel/_context_subtree.py +0 -0
  20. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/otel/_errors.py +0 -0
  21. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/otel/span_tree.py +0 -0
  22. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/py.typed +0 -0
  23. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/reporting/__init__.py +0 -0
  24. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/reporting/render_numbers.py +0 -0
  25. {pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pyproject.toml +0 -0
{pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.2.11
+Version: 0.2.13
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.2.11
+Requires-Dist: pydantic-ai-slim==0.2.13
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
{pydantic_evals-0.2.11 → pydantic_evals-0.2.13}/pydantic_evals/evaluators/common.py
@@ -194,6 +194,7 @@ class LLMJudge(Evaluator[object, object, object]):
     rubric: str
     model: models.Model | models.KnownModelName | None = None
     include_input: bool = False
+    include_expected_output: bool = False
     model_settings: ModelSettings | None = None
     score: OutputConfig | Literal[False] = False
     assertion: OutputConfig | Literal[False] = field(default_factory=lambda: OutputConfig(include_reason=True))
@@ -203,15 +204,29 @@ class LLMJudge(Evaluator[object, object, object]):
         ctx: EvaluatorContext[object, object, object],
     ) -> EvaluatorOutput:
         if self.include_input:
-            from .llm_as_a_judge import judge_input_output
-
-            grading_output = await judge_input_output(
-                ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
-            )
+            if self.include_expected_output:
+                from .llm_as_a_judge import judge_input_output_expected
+
+                grading_output = await judge_input_output_expected(
+                    ctx.inputs, ctx.output, ctx.expected_output, self.rubric, self.model, self.model_settings
+                )
+            else:
+                from .llm_as_a_judge import judge_input_output
+
+                grading_output = await judge_input_output(
+                    ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
+                )
         else:
-            from .llm_as_a_judge import judge_output
+            if self.include_expected_output:
+                from .llm_as_a_judge import judge_output_expected
+
+                grading_output = await judge_output_expected(
+                    ctx.output, ctx.expected_output, self.rubric, self.model, self.model_settings
+                )
+            else:
+                from .llm_as_a_judge import judge_output

-            grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)
+                grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)

         output: dict[str, EvaluationScalar | EvaluationReason] = {}
         include_both = self.score is not False and self.assertion is not False
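
In other words, the new flag is opt-in: a case's expected output is only shown to the judge when include_expected_output=True is set on the evaluator. A minimal usage sketch under that assumption follows; the Case/Dataset wiring, rubric text, and stand-in task are illustrative and not part of this diff.

    from pydantic_evals import Case, Dataset
    from pydantic_evals.evaluators import LLMJudge

    # The judge now sees each case's expected_output alongside the actual output.
    judge = LLMJudge(
        rubric='The output agrees with the expected output, allowing paraphrasing.',
        include_input=True,
        include_expected_output=True,  # new in 0.2.13
    )

    dataset = Dataset(
        cases=[Case(inputs='What color is the sky?', expected_output='Blue')],
        evaluators=[judge],
    )

    async def answer(question: str) -> str:
        # Stand-in task for illustration; a real task would call a model or application code.
        return 'Cerulean'

    report = dataset.evaluate_sync(answer)
    report.print()

Running this requires credentials for the judge model (by default 'openai:gpt-4o', per the new module below).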
pydantic_evals-0.2.13/pydantic_evals/evaluators/llm_as_a_judge.py (new file)
@@ -0,0 +1,267 @@
+from __future__ import annotations
+
+from textwrap import dedent
+from typing import Any
+
+from pydantic import BaseModel, Field
+from pydantic_core import to_json
+
+from pydantic_ai import Agent, models
+from pydantic_ai.settings import ModelSettings
+
+__all__ = (
+    'GradingOutput',
+    'judge_input_output',
+    'judge_input_output_expected',
+    'judge_output',
+    'judge_output_expected',
+    'set_default_judge_model',
+)
+
+
+_default_model: models.Model | models.KnownModelName = 'openai:gpt-4o'
+
+
+class GradingOutput(BaseModel, populate_by_name=True):
+    """The output of a grading operation."""
+
+    reason: str
+    pass_: bool = Field(validation_alias='pass', serialization_alias='pass')
+    score: float
+
+
+_judge_output_agent = Agent(
+    name='judge_output',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <Output>Hello world</Output>
+        <Rubric>Content contains a greeting</Rubric>
+        {"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
+
+        <Output>Avast ye swabs, repel the invaders!</Output>
+        <Rubric>Does not speak like a pirate</Rubric>
+        {"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_output(
+    output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+    return (
+        await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
+    ).output
+
+
+_judge_input_output_agent = Agent(
+    name='judge_input_output',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <Input>Hello world</Input>
+        <Output>Hello</Output>
+        <Rubric>Content contains a greeting word which is present in the input</Rubric>
+        {"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
+
+        <Input>Pirate</Input>
+        <Output>Avast ye swabs, repel the invaders!</Output>
+        <Rubric>Does not speak in the style described by the input</Rubric>
+        {"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_input_output(
+    inputs: Any,
+    output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on the inputs and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <Input>
+        {_stringify(inputs)}
+        </Input>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+    return (
+        await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
+    ).output
+
+
+_judge_input_output_expected_agent = Agent(
+    name='judge_input_output_expected',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input, expected output, and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <Input>What color is the sky?</Input>
+        <ExpectedOutput>Blue</ExpectedOutput>
+        <Output>Cerulean</Output>
+        <Rubric>The output is consistent with the expected output but doesn't have to match exactly</Rubric>
+        {"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
+
+        <Input>How many legs does a spider have?</Input>
+        <ExpectedOutput>8</ExpectedOutput>
+        <Output>Six</Output>
+        <Rubric>The output is factually consistent with the expected output</Rubric>
+        {"reason": "Spiders have 8 legs", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_input_output_expected(
+    inputs: Any,
+    output: Any,
+    expected_output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on the inputs and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <Input>
+        {_stringify(inputs)}
+        </Input>
+        <ExpectedOutput>
+        {_stringify(expected_output)}
+        </ExpectedOutput>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+
+    return (
+        await _judge_input_output_expected_agent.run(
+            user_prompt, model=model or _default_model, model_settings=model_settings
+        )
+    ).output
+
+
+_judge_output_expected_agent = Agent(
+    name='judge_output_expected',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided expected output and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <ExpectedOutput>Blue</ExpectedOutput>
+        <Output>Cerulean</Output>
+        <Rubric>The output should be a shade of the expected output color</Rubric>
+        {"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
+
+        <ExpectedOutput>8</ExpectedOutput>
+        <Output>Six</Output>
+        <Rubric>The output should be a number written in words which matches the number written in digits in the expected output</Rubric>
+        {"reason": "The output is 'Six' which is a different number than 8", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_output_expected(
+    output: Any,
+    expected_output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on the expected output, output, and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <ExpectedOutput>
+        {_stringify(expected_output)}
+        </ExpectedOutput>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+    return (
+        await _judge_output_expected_agent.run(
+            user_prompt, model=model or _default_model, model_settings=model_settings
+        )
+    ).output
+
+
+def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:  # pragma: no cover
+    """Set the default model used for judging.
+
+    This model is used if `None` is passed to the `model` argument of `judge_output` and `judge_input_output`.
+    """
+    global _default_model
+    _default_model = model
+
+
+def _stringify(value: Any) -> str:
+    if isinstance(value, str):
+        return value
+    try:
+        # If the value can be serialized to JSON, use that.
+        # If that behavior is undesirable, the user could manually call repr on the arguments to the judge_* functions
+        return to_json(value).decode()
+    except Exception:
+        return repr(value)
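
The two new *_expected helpers can also be called directly, outside of an LLMJudge evaluator. Below is a minimal sketch of calling judge_output_expected; the rubric, the output strings, and the 'openai:gpt-4o-mini' override are illustrative, and credentials for the judge model are assumed to be configured.

    import asyncio

    from pydantic_evals.evaluators.llm_as_a_judge import (
        judge_output_expected,
        set_default_judge_model,
    )

    async def main() -> None:
        # Optional: swap the default 'openai:gpt-4o' judge for a cheaper model.
        set_default_judge_model('openai:gpt-4o-mini')

        grading = await judge_output_expected(
            output='Cerulean',
            expected_output='Blue',
            rubric='The output should be a shade of the expected output color',
        )
        # GradingOutput exposes reason, pass_ (serialized as "pass"), and score.
        print(grading.pass_, grading.score, grading.reason)

    asyncio.run(main())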
pydantic_evals-0.2.11/pydantic_evals/evaluators/llm_as_a_judge.py (removed)
@@ -1,122 +0,0 @@
-from __future__ import annotations
-
-from textwrap import dedent
-from typing import Any
-
-from pydantic import BaseModel, Field
-from pydantic_core import to_json
-
-from pydantic_ai import Agent, models
-from pydantic_ai.settings import ModelSettings
-
-__all__ = ('GradingOutput', 'judge_input_output', 'judge_output', 'set_default_judge_model')
-
-
-_default_model: models.Model | models.KnownModelName = 'openai:gpt-4o'
-
-
-class GradingOutput(BaseModel, populate_by_name=True):
-    """The output of a grading operation."""
-
-    reason: str
-    pass_: bool = Field(validation_alias='pass', serialization_alias='pass')
-    score: float
-
-
-_judge_output_agent = Agent(
-    name='judge_output',
-    system_prompt=dedent(
-        """
-        You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
-
-        Examples:
-
-        <Output>Hello world</Output>
-        <Rubric>Content contains a greeting</Rubric>
-        {"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
-
-        <Output>Avast ye swabs, repel the invaders!</Output>
-        <Rubric>Does not speak like a pirate</Rubric>
-        {"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}
-        """
-    ),
-    output_type=GradingOutput,
-)
-
-
-async def judge_output(
-    output: Any,
-    rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
-    model_settings: ModelSettings | None = None,
-) -> GradingOutput:
-    """Judge the output of a model based on a rubric.
-
-    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
-    but this can be changed using the `set_default_judge_model` function.
-    """
-    user_prompt = f'<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
-    return (
-        await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
-    ).output
-
-
-_judge_input_output_agent = Agent(
-    name='judge_input_output',
-    system_prompt=dedent(
-        """
-        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
-
-        Examples:
-
-        <Input>Hello world</Input>
-        <Output>Hello</Output>
-        <Rubric>Content contains a greeting word which is present in the input</Rubric>
-        {"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
-
-        <Input>Pirate</Input>
-        <Output>Avast ye swabs, repel the invaders!</Output>
-        <Rubric>Does not speak in the style described by the input</Rubric>
-        {"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}
-        """
-    ),
-    output_type=GradingOutput,
-)
-
-
-async def judge_input_output(
-    inputs: Any,
-    output: Any,
-    rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
-    model_settings: ModelSettings | None = None,
-) -> GradingOutput:
-    """Judge the output of a model based on the inputs and a rubric.
-
-    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
-    but this can be changed using the `set_default_judge_model` function.
-    """
-    user_prompt = f'<Input>\n{_stringify(inputs)}\n</Input>\n<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
-    return (
-        await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
-    ).output
-
-
-def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:  # pragma: no cover
-    """Set the default model used for judging.
-
-    This model is used if `None` is passed to the `model` argument of `judge_output` and `judge_input_output`.
-    """
-    global _default_model
-    _default_model = model
-
-
-def _stringify(value: Any) -> str:
-    if isinstance(value, str):
-        return value
-    try:
-        # If the value can be serialized to JSON, use that.
-        # If that behavior is undesirable, the user could manually call repr on the arguments to the judge_* functions
-        return to_json(value).decode()
-    except Exception:
-        return repr(value)
The remaining 21 files listed above (marked +0 -0) are unchanged between 0.2.11 and 0.2.13.