pydantic-evals 0.4.4__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff shows the changes between the two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.


pydantic_evals/dataset.py CHANGED
@@ -43,7 +43,7 @@ from .evaluators.common import DEFAULT_EVALUATORS
  from .evaluators.context import EvaluatorContext
  from .otel import SpanTree
  from .otel._context_subtree import context_subtree
- from .reporting import EvaluationReport, ReportCase
+ from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate

  if sys.version_info < (3, 11):
      from exceptiongroup import ExceptionGroup  # pragma: lax no cover
@@ -83,6 +83,10 @@ DEFAULT_SCHEMA_PATH_TEMPLATE = './{stem}_schema.json'
  _YAML_SCHEMA_LINE_PREFIX = '# yaml-language-server: $schema='


+ _REPORT_CASES_ADAPTER = TypeAdapter(list[ReportCase])
+ _REPORT_CASE_AGGREGATE_ADAPTER = TypeAdapter(ReportCaseAggregate)
+
+
  class _CaseModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid'):
      """Internal model for a case, used for serialization/deserialization."""

@@ -303,9 +307,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  ),
  )
  # TODO(DavidM): This attribute will be too big in general; remove it once we can use child spans in details panel:
- eval_span.set_attribute('cases', report.cases)
+ eval_span.set_attribute('cases', _REPORT_CASES_ADAPTER.dump_python(report.cases))
  # TODO(DavidM): Remove this 'averages' attribute once we compute it in the details panel
- eval_span.set_attribute('averages', report.averages())
+ eval_span.set_attribute('averages', _REPORT_CASE_AGGREGATE_ADAPTER.dump_python(report.averages()))
  return report

  def evaluate_sync(
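
The dataset.py change above stops attaching raw ReportCase/ReportCaseAggregate objects to the evaluation span and dumps them to plain Python data first. A minimal sketch of that TypeAdapter pattern, using a hypothetical Case model rather than the real reporting types:

from pydantic import BaseModel, TypeAdapter

class Case(BaseModel):  # hypothetical stand-in for ReportCase
    name: str
    score: float

# Mirrors _REPORT_CASES_ADAPTER = TypeAdapter(list[ReportCase]) in the diff.
cases_adapter = TypeAdapter(list[Case])

cases = [Case(name='simple', score=1.0), Case(name='tricky', score=0.5)]

# dump_python turns the models into plain dicts/lists, which span attributes
# (and other JSON-ish sinks) can store, unlike BaseModel instances.
print(cases_adapter.dump_python(cases))
# [{'name': 'simple', 'score': 1.0}, {'name': 'tricky', 'score': 0.5}]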
pydantic_evals/evaluators/llm_as_a_judge.py CHANGED
@@ -1,5 +1,6 @@
  from __future__ import annotations

+ from collections.abc import Sequence
  from textwrap import dedent
  from typing import Any

@@ -7,6 +8,7 @@ from pydantic import BaseModel, Field
  from pydantic_core import to_json

  from pydantic_ai import Agent, models
+ from pydantic_ai.messages import MultiModalContentTypes, UserContent
  from pydantic_ai.settings import ModelSettings

  __all__ = (
@@ -62,16 +64,7 @@ async def judge_output(
      If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
      but this can be changed using the `set_default_judge_model` function.
      """
-     user_prompt = dedent(
-         f"""
-         <Output>
-         {_stringify(output)}
-         </Output>
-         <Rubric>
-         {rubric}
-         </Rubric>
-         """
-     )
+     user_prompt = _build_prompt(output=output, rubric=rubric)
      return (
          await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
      ).output
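
For plain text arguments, the new _build_prompt helper produces essentially the same XML-tagged prompt that the removed dedent blocks built by hand, with the sections joined by blank lines. As an illustration (assuming _stringify returns string values unchanged), judge_output(output='Paris', rubric='Answer names the capital of France') now sends roughly:

<Output>
Paris
</Output>

<Rubric>
Answer names the capital of France
</Rubric>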
@@ -112,19 +105,8 @@ async def judge_input_output(
      If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
      but this can be changed using the `set_default_judge_model` function.
      """
-     user_prompt = dedent(
-         f"""
-         <Input>
-         {_stringify(inputs)}
-         </Input>
-         <Output>
-         {_stringify(output)}
-         </Output>
-         <Rubric>
-         {rubric}
-         </Rubric>
-         """
-     )
+     user_prompt = _build_prompt(inputs=inputs, output=output, rubric=rubric)
+
      return (
          await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
      ).output
@@ -168,22 +150,7 @@ async def judge_input_output_expected(
      If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
      but this can be changed using the `set_default_judge_model` function.
      """
-     user_prompt = dedent(
-         f"""
-         <Input>
-         {_stringify(inputs)}
-         </Input>
-         <ExpectedOutput>
-         {_stringify(expected_output)}
-         </ExpectedOutput>
-         <Output>
-         {_stringify(output)}
-         </Output>
-         <Rubric>
-         {rubric}
-         </Rubric>
-         """
-     )
+     user_prompt = _build_prompt(inputs=inputs, output=output, rubric=rubric, expected_output=expected_output)

      return (
          await _judge_input_output_expected_agent.run(
@@ -227,19 +194,7 @@ async def judge_output_expected(
      If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
      but this can be changed using the `set_default_judge_model` function.
      """
-     user_prompt = dedent(
-         f"""
-         <ExpectedOutput>
-         {_stringify(expected_output)}
-         </ExpectedOutput>
-         <Output>
-         {_stringify(output)}
-         </Output>
-         <Rubric>
-         {rubric}
-         </Rubric>
-         """
-     )
+     user_prompt = _build_prompt(output=output, rubric=rubric, expected_output=expected_output)
      return (
          await _judge_output_expected_agent.run(
              user_prompt, model=model or _default_model, model_settings=model_settings
@@ -265,3 +220,41 @@ def _stringify(value: Any) -> str:
          return to_json(value).decode()
      except Exception:
          return repr(value)
+
+
+ def _build_prompt(
+     output: Any,
+     rubric: str,
+     inputs: Any | None = None,
+     expected_output: Any | None = None,
+ ) -> str | Sequence[str | UserContent]:
+     """Build a prompt that includes input, output, and rubric."""
+     sections: list[str | UserContent] = []
+
+     if inputs is not None:
+         if isinstance(inputs, str):
+             sections.append(f'<Input>\n{inputs}\n</Input>')
+         else:
+             sections.append('<Input>\n')
+             if isinstance(inputs, Sequence):
+                 for item in inputs:  # type: ignore
+                     if isinstance(item, (str, MultiModalContentTypes)):
+                         sections.append(item)
+                     else:
+                         sections.append(_stringify(item))
+             elif isinstance(inputs, MultiModalContentTypes):
+                 sections.append(inputs)
+             else:
+                 sections.append(_stringify(inputs))
+             sections.append('</Input>')
+
+     sections.append(f'<Output>\n{_stringify(output)}\n</Output>')
+     sections.append(f'<Rubric>\n{rubric}\n</Rubric>')
+
+     if expected_output is not None:
+         sections.append(f'<ExpectedOutput>\n{_stringify(expected_output)}\n</ExpectedOutput>')
+
+     if inputs is None or isinstance(inputs, str):
+         return '\n\n'.join(sections)  # type: ignore[arg-type]
+     else:
+         return sections
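
The payoff of the _build_prompt refactor is that the judge functions can now forward multimodal inputs (images, audio, documents) to the judge model as separate user-content parts instead of flattening everything through _stringify. A hedged usage sketch; the image URL is a placeholder, and running it requires credentials for the judge model (by default 'openai:gpt-4o'):

import asyncio

from pydantic_ai import ImageUrl
from pydantic_evals.evaluators.llm_as_a_judge import judge_input_output


async def main() -> None:
    # Plain string inputs: _build_prompt joins the sections into a single string, as before.
    text_grade = await judge_input_output(
        inputs='What is shown in this image?',
        output='A cat sitting on a windowsill',
        rubric='Output plausibly answers the question',
    )
    print(text_grade)

    # Sequence inputs containing multimodal content: _build_prompt returns a list of
    # user-content parts (strings plus ImageUrl, etc.) that is passed to the agent as-is.
    image_grade = await judge_input_output(
        inputs=['What is shown in this image?', ImageUrl(url='https://example.com/cat.png')],
        output='A cat sitting on a windowsill',
        rubric='Output matches what the image actually shows',
    )
    print(image_grade)


asyncio.run(main())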
pydantic_evals-0.4.4.dist-info/METADATA → pydantic_evals-0.4.6.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pydantic-evals
- Version: 0.4.4
+ Version: 0.4.6
  Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
  Project-URL: Homepage, https://ai.pydantic.dev/evals
  Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -32,7 +32,7 @@ Requires-Python: >=3.9
  Requires-Dist: anyio>=0
  Requires-Dist: eval-type-backport>=0; python_version < '3.11'
  Requires-Dist: logfire-api>=1.2.0
- Requires-Dist: pydantic-ai-slim==0.4.4
+ Requires-Dist: pydantic-ai-slim==0.4.6
  Requires-Dist: pydantic>=2.10
  Requires-Dist: pyyaml>=6.0.2
  Requires-Dist: rich>=13.9.4
pydantic_evals-0.4.4.dist-info/RECORD → pydantic_evals-0.4.6.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
  pydantic_evals/__init__.py,sha256=OKRbfhdc8UZPzrPJMZUQwvzIxLhXmEZxz1ZuD921fy4,839
  pydantic_evals/_utils.py,sha256=PfhmPbdQp-q90s568LuG45zDDXxgO13BEz8MQJK8qw4,2922
- pydantic_evals/dataset.py,sha256=SY0k2htYG0d0KRRem3pnQdN7rPztJ_TCFnCb0zkXbCk,46477
+ pydantic_evals/dataset.py,sha256=yk6nHzzbEJqh9p3Y_MuBQyP0szp5oh-oFUDavi4N9D8,46699
  pydantic_evals/generation.py,sha256=Yd1rfbsDjjBBHDk-1KDu48hlITjM2-74rTnPBD_sqbA,3494
  pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  pydantic_evals/evaluators/__init__.py,sha256=uj110viFiDvqrIcuDcWexk_uBgJqhKMGPHT2YvDb7LA,624
@@ -9,7 +9,7 @@ pydantic_evals/evaluators/_spec.py,sha256=Xi_FHwnmAZ1x2hoJFw4MBZuG0TilNKqMRW3P74
  pydantic_evals/evaluators/common.py,sha256=ZBrNTfPJoOpT4WNXTRGS0UcKhnuhfYJxjNzum-zHFk8,12064
  pydantic_evals/evaluators/context.py,sha256=8osQRCFW8Vekw7JiiOOCHHH3HOGdhDaUlr8i-twSetg,3870
  pydantic_evals/evaluators/evaluator.py,sha256=yOEKLOxElm7_4tLcq6_myXI0e4Ei9svZP9y5DTq4SYI,11147
- pydantic_evals/evaluators/llm_as_a_judge.py,sha256=raty91NWdu_FEt4ze_ugQHCouj1o72gYe3abJBtMqlU,8793
+ pydantic_evals/evaluators/llm_as_a_judge.py,sha256=xQjaGuCRXZdlExacFyR4Y4kFmwBh2QxAfEyaed_aqvk,9615
  pydantic_evals/otel/__init__.py,sha256=i2p3vDrOW039N4XM-UkozDhCm0ZmE6ZSs1yV5t03vd0,117
  pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=vIDF9-6lDuNKZuSM5hN_R8VRK4jzmdfe1DgWdXwxVbc,6758
  pydantic_evals/otel/_context_subtree.py,sha256=Iazp4w3IIBMCrkqWL-hTG-2QG_-2X81p794WG9MAsGk,1175
@@ -17,7 +17,7 @@ pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD
  pydantic_evals/otel/span_tree.py,sha256=LV5Hsyo4riJzevHyBz8wxP82S-ry5zeKYi9bKWjGCS8,23057
  pydantic_evals/reporting/__init__.py,sha256=k_3tteqXGh0yGvgpN68gB0CjG9wzrakzDTve2GHend4,42148
  pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
- pydantic_evals-0.4.4.dist-info/METADATA,sha256=DyDqmxe9d_3gC3QhRuUffzRb1O5Ul9bT_xRNb9_9Rr4,7938
- pydantic_evals-0.4.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- pydantic_evals-0.4.4.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
- pydantic_evals-0.4.4.dist-info/RECORD,,
+ pydantic_evals-0.4.6.dist-info/METADATA,sha256=zw53LmYNzSu9Zvg1oC3Xjhv4X7XOdBeIVE45MNRJvz4,7938
+ pydantic_evals-0.4.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ pydantic_evals-0.4.6.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+ pydantic_evals-0.4.6.dist-info/RECORD,,