pydantic-evals 0.2.12__tar.gz → 0.2.13__tar.gz
This diff compares publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/PKG-INFO +2 -2
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/evaluators/common.py +22 -7
- pydantic_evals-0.2.13/pydantic_evals/evaluators/llm_as_a_judge.py +267 -0
- pydantic_evals-0.2.12/pydantic_evals/evaluators/llm_as_a_judge.py +0 -122
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/.gitignore +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/LICENSE +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/README.md +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/__init__.py +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/_utils.py +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/dataset.py +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/evaluators/__init__.py +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/evaluators/_spec.py +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/evaluators/context.py +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/evaluators/evaluator.py +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/generation.py +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/otel/__init__.py +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/otel/_context_subtree.py +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/otel/_errors.py +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/otel/span_tree.py +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/py.typed +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/reporting/__init__.py +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/reporting/render_numbers.py +0 -0
- {pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pyproject.toml +0 -0
{pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.2.12
+Version: 0.2.13
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai

@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.2.12
+Requires-Dist: pydantic-ai-slim==0.2.13
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
{pydantic_evals-0.2.12 → pydantic_evals-0.2.13}/pydantic_evals/evaluators/common.py

@@ -194,6 +194,7 @@ class LLMJudge(Evaluator[object, object, object]):
     rubric: str
     model: models.Model | models.KnownModelName | None = None
     include_input: bool = False
+    include_expected_output: bool = False
     model_settings: ModelSettings | None = None
     score: OutputConfig | Literal[False] = False
     assertion: OutputConfig | Literal[False] = field(default_factory=lambda: OutputConfig(include_reason=True))

@@ -203,15 +204,29 @@ class LLMJudge(Evaluator[object, object, object]):
         ctx: EvaluatorContext[object, object, object],
     ) -> EvaluatorOutput:
         if self.include_input:
-            from .llm_as_a_judge import judge_input_output
-
-            grading_output = await judge_input_output(
-                ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
-            )
+            if self.include_expected_output:
+                from .llm_as_a_judge import judge_input_output_expected
+
+                grading_output = await judge_input_output_expected(
+                    ctx.inputs, ctx.output, ctx.expected_output, self.rubric, self.model, self.model_settings
+                )
+            else:
+                from .llm_as_a_judge import judge_input_output
+
+                grading_output = await judge_input_output(
+                    ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
+                )
         else:
-            from .llm_as_a_judge import judge_output
+            if self.include_expected_output:
+                from .llm_as_a_judge import judge_output_expected
+
+                grading_output = await judge_output_expected(
+                    ctx.output, ctx.expected_output, self.rubric, self.model, self.model_settings
+                )
+            else:
+                from .llm_as_a_judge import judge_output
 
-            grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)
+                grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)
 
         output: dict[str, EvaluationScalar | EvaluationReason] = {}
         include_both = self.score is not False and self.assertion is not False
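The change to common.py wires the new `include_expected_output` field into `LLMJudge`, which now dispatches to one of four helpers (`judge_output`, `judge_input_output`, `judge_output_expected`, `judge_input_output_expected`) depending on which of `include_input` and `include_expected_output` are set. A minimal usage sketch, assuming the `Case`/`Dataset` API from `pydantic_evals` and that `LLMJudge` is re-exported from `pydantic_evals.evaluators` (neither import is shown in this diff):

```python
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge  # defined in evaluators/common.py

dataset = Dataset(
    cases=[
        Case(
            name='sky_color',
            inputs='What color is the sky?',
            expected_output='Blue',  # made visible to the judge via the new flag
        ),
    ],
    evaluators=[
        LLMJudge(
            rubric='The output is consistent with the expected output, but need not match it exactly',
            include_input=True,
            include_expected_output=True,  # new in 0.2.13; selects judge_input_output_expected
        ),
    ],
)


async def answer(question: str) -> str:
    # Stand-in task; a real task would call an agent or model.
    return 'Cerulean'


report = dataset.evaluate_sync(answer)
report.print()  # assumed reporting helper; adjust to your installed version of pydantic-evals
```

When `include_expected_output=True` but `include_input=False`, the evaluator instead calls `judge_output_expected`, so the rubric only sees the output and the expected output.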
pydantic_evals-0.2.13/pydantic_evals/evaluators/llm_as_a_judge.py (new file)

@@ -0,0 +1,267 @@
+from __future__ import annotations
+
+from textwrap import dedent
+from typing import Any
+
+from pydantic import BaseModel, Field
+from pydantic_core import to_json
+
+from pydantic_ai import Agent, models
+from pydantic_ai.settings import ModelSettings
+
+__all__ = (
+    'GradingOutput',
+    'judge_input_output',
+    'judge_input_output_expected',
+    'judge_output',
+    'judge_output_expected',
+    'set_default_judge_model',
+)
+
+
+_default_model: models.Model | models.KnownModelName = 'openai:gpt-4o'
+
+
+class GradingOutput(BaseModel, populate_by_name=True):
+    """The output of a grading operation."""
+
+    reason: str
+    pass_: bool = Field(validation_alias='pass', serialization_alias='pass')
+    score: float
+
+
+_judge_output_agent = Agent(
+    name='judge_output',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <Output>Hello world</Output>
+        <Rubric>Content contains a greeting</Rubric>
+        {"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
+
+        <Output>Avast ye swabs, repel the invaders!</Output>
+        <Rubric>Does not speak like a pirate</Rubric>
+        {"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_output(
+    output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+    return (
+        await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
+    ).output
+
+
+_judge_input_output_agent = Agent(
+    name='judge_input_output',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <Input>Hello world</Input>
+        <Output>Hello</Output>
+        <Rubric>Content contains a greeting word which is present in the input</Rubric>
+        {"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
+
+        <Input>Pirate</Input>
+        <Output>Avast ye swabs, repel the invaders!</Output>
+        <Rubric>Does not speak in the style described by the input</Rubric>
+        {"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_input_output(
+    inputs: Any,
+    output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on the inputs and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <Input>
+        {_stringify(inputs)}
+        </Input>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+    return (
+        await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
+    ).output
+
+
+_judge_input_output_expected_agent = Agent(
+    name='judge_input_output_expected',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input, expected output, and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <Input>What color is the sky?</Input>
+        <ExpectedOutput>Blue</ExpectedOutput>
+        <Output>Cerulean</Output>
+        <Rubric>The output is consistent with the expected output but doesn't have to match exactly</Rubric>
+        {"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
+
+        <Input>How many legs does a spider have?</Input>
+        <ExpectedOutput>8</ExpectedOutput>
+        <Output>Six</Output>
+        <Rubric>The output is factually consistent with the expected output</Rubric>
+        {"reason": "Spiders have 8 legs", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_input_output_expected(
+    inputs: Any,
+    output: Any,
+    expected_output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on the inputs and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <Input>
+        {_stringify(inputs)}
+        </Input>
+        <ExpectedOutput>
+        {_stringify(expected_output)}
+        </ExpectedOutput>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+
+    return (
+        await _judge_input_output_expected_agent.run(
+            user_prompt, model=model or _default_model, model_settings=model_settings
+        )
+    ).output
+
+
+_judge_output_expected_agent = Agent(
+    name='judge_output_expected',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided expected output and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <ExpectedOutput>Blue</ExpectedOutput>
+        <Output>Cerulean</Output>
+        <Rubric>The output should be a shade of the expected output color</Rubric>
+        {"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
+
+        <ExpectedOutput>8</ExpectedOutput>
+        <Output>Six</Output>
+        <Rubric>The output should be a number written in words which matches the number written in digits in the expected output</Rubric>
+        {"reason": "The output is 'Six' which is a different number than 8", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_output_expected(
+    output: Any,
+    expected_output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on the expected output, output, and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <ExpectedOutput>
+        {_stringify(expected_output)}
+        </ExpectedOutput>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+    return (
+        await _judge_output_expected_agent.run(
+            user_prompt, model=model or _default_model, model_settings=model_settings
+        )
+    ).output
+
+
+def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:  # pragma: no cover
+    """Set the default model used for judging.
+
+    This model is used if `None` is passed to the `model` argument of `judge_output` and `judge_input_output`.
+    """
+    global _default_model
+    _default_model = model
+
+
+def _stringify(value: Any) -> str:
+    if isinstance(value, str):
+        return value
+    try:
+        # If the value can be serialized to JSON, use that.
+        # If that behavior is undesirable, the user could manually call repr on the arguments to the judge_* functions
+        return to_json(value).decode()
+    except Exception:
+        return repr(value)
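The new module exposes the four judge helpers and `set_default_judge_model` as standalone functions, so they can also be called outside of an `LLMJudge` evaluator. A minimal sketch using the signatures above; the model name and example values are illustrative only:

```python
import asyncio

from pydantic_evals.evaluators.llm_as_a_judge import (
    judge_output_expected,
    set_default_judge_model,
)


async def main() -> None:
    # Optional: override the default judge model (starts as 'openai:gpt-4o').
    set_default_judge_model('openai:gpt-4o-mini')

    grading = await judge_output_expected(
        output='Cerulean',
        expected_output='Blue',
        rubric='The output should be a shade of the expected output color',
    )
    # GradingOutput exposes `reason`, `pass_` (serialized as "pass"), and `score`.
    print(grading.pass_, grading.score, grading.reason)


asyncio.run(main())
```

Note that `_stringify` serializes non-string inputs and outputs with `pydantic_core.to_json`, falling back to `repr`, so structured values can be passed to the judge functions directly.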
pydantic_evals-0.2.12/pydantic_evals/evaluators/llm_as_a_judge.py (removed)

@@ -1,122 +0,0 @@
-from __future__ import annotations
-
-from textwrap import dedent
-from typing import Any
-
-from pydantic import BaseModel, Field
-from pydantic_core import to_json
-
-from pydantic_ai import Agent, models
-from pydantic_ai.settings import ModelSettings
-
-__all__ = ('GradingOutput', 'judge_input_output', 'judge_output', 'set_default_judge_model')
-
-
-_default_model: models.Model | models.KnownModelName = 'openai:gpt-4o'
-
-
-class GradingOutput(BaseModel, populate_by_name=True):
-    """The output of a grading operation."""
-
-    reason: str
-    pass_: bool = Field(validation_alias='pass', serialization_alias='pass')
-    score: float
-
-
-_judge_output_agent = Agent(
-    name='judge_output',
-    system_prompt=dedent(
-        """
-        You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
-
-        Examples:
-
-        <Output>Hello world</Output>
-        <Rubric>Content contains a greeting</Rubric>
-        {"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
-
-        <Output>Avast ye swabs, repel the invaders!</Output>
-        <Rubric>Does not speak like a pirate</Rubric>
-        {"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}
-        """
-    ),
-    output_type=GradingOutput,
-)
-
-
-async def judge_output(
-    output: Any,
-    rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
-    model_settings: ModelSettings | None = None,
-) -> GradingOutput:
-    """Judge the output of a model based on a rubric.
-
-    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
-    but this can be changed using the `set_default_judge_model` function.
-    """
-    user_prompt = f'<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
-    return (
-        await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
-    ).output
-
-
-_judge_input_output_agent = Agent(
-    name='judge_input_output',
-    system_prompt=dedent(
-        """
-        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
-
-        Examples:
-
-        <Input>Hello world</Input>
-        <Output>Hello</Output>
-        <Rubric>Content contains a greeting word which is present in the input</Rubric>
-        {"reason": "the content contains the word 'Hello'", "pass": true, "score": 1.0}
-
-        <Input>Pirate</Input>
-        <Output>Avast ye swabs, repel the invaders!</Output>
-        <Rubric>Does not speak in the style described by the input</Rubric>
-        {"reason": "'avast ye' is a common pirate term", "pass": false, "score": 0.0}
-        """
-    ),
-    output_type=GradingOutput,
-)
-
-
-async def judge_input_output(
-    inputs: Any,
-    output: Any,
-    rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
-    model_settings: ModelSettings | None = None,
-) -> GradingOutput:
-    """Judge the output of a model based on the inputs and a rubric.
-
-    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
-    but this can be changed using the `set_default_judge_model` function.
-    """
-    user_prompt = f'<Input>\n{_stringify(inputs)}\n</Input>\n<Output>\n{_stringify(output)}\n</Output>\n<Rubric>\n{rubric}\n</Rubric>'
-    return (
-        await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
-    ).output
-
-
-def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:  # pragma: no cover
-    """Set the default model used for judging.
-
-    This model is used if `None` is passed to the `model` argument of `judge_output` and `judge_input_output`.
-    """
-    global _default_model
-    _default_model = model
-
-
-def _stringify(value: Any) -> str:
-    if isinstance(value, str):
-        return value
-    try:
-        # If the value can be serialized to JSON, use that.
-        # If that behavior is undesirable, the user could manually call repr on the arguments to the judge_* functions
-        return to_json(value).decode()
-    except Exception:
-        return repr(value)