pydantic-evals 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pydantic-evals might be problematic.
- pydantic_evals/evaluators/llm_as_a_judge.py +45 -52
- pydantic_evals/generation.py +2 -2
- {pydantic_evals-0.4.3.dist-info → pydantic_evals-0.4.5.dist-info}/METADATA +8 -8
- {pydantic_evals-0.4.3.dist-info → pydantic_evals-0.4.5.dist-info}/RECORD +6 -6
- {pydantic_evals-0.4.3.dist-info → pydantic_evals-0.4.5.dist-info}/WHEEL +0 -0
- {pydantic_evals-0.4.3.dist-info → pydantic_evals-0.4.5.dist-info}/licenses/LICENSE +0 -0
pydantic_evals/evaluators/llm_as_a_judge.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations

+from collections.abc import Sequence
 from textwrap import dedent
 from typing import Any

@@ -7,6 +8,7 @@ from pydantic import BaseModel, Field
 from pydantic_core import to_json

 from pydantic_ai import Agent, models
+from pydantic_ai.messages import MultiModalContentTypes, UserContent
 from pydantic_ai.settings import ModelSettings

 __all__ = (
@@ -62,16 +64,7 @@ async def judge_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(output=output, rubric=rubric)
     return (
         await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output
@@ -112,19 +105,8 @@ async def judge_input_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <Input>
-        {_stringify(inputs)}
-        </Input>
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(inputs=inputs, output=output, rubric=rubric)
+
     return (
         await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output
@@ -168,22 +150,7 @@ async def judge_input_output_expected(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <Input>
-        {_stringify(inputs)}
-        </Input>
-        <ExpectedOutput>
-        {_stringify(expected_output)}
-        </ExpectedOutput>
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(inputs=inputs, output=output, rubric=rubric, expected_output=expected_output)

     return (
         await _judge_input_output_expected_agent.run(
@@ -227,19 +194,7 @@ async def judge_output_expected(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt = dedent(
-        f"""
-        <ExpectedOutput>
-        {_stringify(expected_output)}
-        </ExpectedOutput>
-        <Output>
-        {_stringify(output)}
-        </Output>
-        <Rubric>
-        {rubric}
-        </Rubric>
-        """
-    )
+    user_prompt = _build_prompt(output=output, rubric=rubric, expected_output=expected_output)
     return (
         await _judge_output_expected_agent.run(
             user_prompt, model=model or _default_model, model_settings=model_settings
@@ -265,3 +220,41 @@ def _stringify(value: Any) -> str:
         return to_json(value).decode()
     except Exception:
         return repr(value)
+
+
+def _build_prompt(
+    output: Any,
+    rubric: str,
+    inputs: Any | None = None,
+    expected_output: Any | None = None,
+) -> str | Sequence[str | UserContent]:
+    """Build a prompt that includes input, output, and rubric."""
+    sections: list[str | UserContent] = []
+
+    if inputs is not None:
+        if isinstance(inputs, str):
+            sections.append(f'<Input>\n{inputs}\n</Input>')
+        else:
+            sections.append('<Input>\n')
+            if isinstance(inputs, Sequence):
+                for item in inputs:  # type: ignore
+                    if isinstance(item, (str, MultiModalContentTypes)):
+                        sections.append(item)
+                    else:
+                        sections.append(_stringify(item))
+            elif isinstance(inputs, MultiModalContentTypes):
+                sections.append(inputs)
+            else:
+                sections.append(_stringify(inputs))
+            sections.append('</Input>')
+
+    sections.append(f'<Output>\n{_stringify(output)}\n</Output>')
+    sections.append(f'<Rubric>\n{rubric}\n</Rubric>')
+
+    if expected_output is not None:
+        sections.append(f'<ExpectedOutput>\n{_stringify(expected_output)}\n</ExpectedOutput>')
+
+    if inputs is None or isinstance(inputs, str):
+        return '\n\n'.join(sections)  # type: ignore[arg-type]
+    else:
+        return sections
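The change above collapses four nearly identical dedent-ed f-string prompts into a single `_build_prompt` helper that returns either a plain string or, for non-string inputs, a sequence of `str | UserContent` parts that can carry multimodal content. As a rough illustration, here is a hedged sketch of how the public judge helpers might be called after this refactor; the call signatures are inferred from the hunk headers and docstrings shown here, and no assumptions are made about the fields of the returned grading object.

```python
# Hedged sketch, not the library's documented example: exercises judge_output /
# judge_input_output as their signatures appear in the hunks above.
import asyncio

from pydantic_evals.evaluators.llm_as_a_judge import (
    judge_input_output,
    judge_output,
    set_default_judge_model,
)


async def main() -> None:
    # Per the docstrings, the default judge model starts as 'openai:gpt-4o'
    # and can be swapped globally.
    set_default_judge_model('openai:gpt-4o')

    # String-only arguments: _build_prompt returns a single prompt string.
    grading = await judge_output(
        'The capital of France is Paris.',
        rubric='The answer must be factually correct.',
    )
    print(grading)

    # Non-string or multimodal inputs are what the new
    # Sequence[str | UserContent] return path is for.
    grading = await judge_input_output(
        inputs='What is the capital of France?',
        output='Paris',
        rubric='The output answers the question in the input.',
    )
    print(grading)


asyncio.run(main())  # calling the judge for real requires OpenAI credentials
```

Because `_build_prompt` only returns a list when the inputs are neither `None` nor a string, existing string-based callers keep receiving a single prompt string.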
pydantic_evals/generation.py
CHANGED
@@ -47,7 +47,7 @@ async def generate_dataset(
         path: Optional path to save the generated dataset. If provided, the dataset will be saved to this location.
         dataset_type: The type of dataset to generate, with the desired input, output, and metadata types.
         custom_evaluator_types: Optional sequence of custom evaluator classes to include in the schema.
-        model: The PydanticAI model to use for generation. Defaults to 'gpt-4o'.
+        model: The Pydantic AI model to use for generation. Defaults to 'gpt-4o'.
         n_examples: Number of examples to generate. Defaults to 3.
         extra_instructions: Optional additional instructions to provide to the LLM.

@@ -59,7 +59,7 @@ async def generate_dataset(
     """
     output_schema = dataset_type.model_json_schema_with_evaluators(custom_evaluator_types)

-    # TODO(DavidM): Update this once we add better response_format and/or ResultTool support to PydanticAI
+    # TODO(DavidM): Update this once we add better response_format and/or ResultTool support to Pydantic AI
     agent = Agent(
         model,
         system_prompt=(
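The generation.py hunks only touch a docstring and a comment (the PydanticAI to Pydantic AI rename), but the Args section above spells out the `generate_dataset` parameters. Below is a hedged sketch of a call using those keyword names; the `Dataset[...]` type parameters and the `to_file` persistence call are assumptions drawn from the wider pydantic-evals API, not something this diff confirms.

```python
# Hedged sketch based on the parameter names in the docstring above; the
# Dataset generic arguments and to_file() call are assumptions for illustration.
from pydantic_evals import Dataset
from pydantic_evals.generation import generate_dataset


async def build_examples() -> None:
    dataset = await generate_dataset(
        dataset_type=Dataset[str, str, dict],  # desired input/output/metadata types
        model='openai:gpt-4o',                 # per the docstring, the default model
        n_examples=3,
        extra_instructions='Questions should be about European geography.',
    )
    dataset.to_file('generated_dataset.yaml')  # assumed persistence helper
```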
{pydantic_evals-0.4.3.dist-info → pydantic_evals-0.4.5.dist-info}/METADATA
CHANGED
@@ -1,12 +1,12 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.4.3
+Version: 0.4.5
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
 Project-URL: Documentation, https://ai.pydantic.dev/evals
 Project-URL: Changelog, https://github.com/pydantic/pydantic-ai/releases
-Author-email: David Montague <david@pydantic.dev>
+Author-email: Samuel Colvin <samuel@pydantic.dev>, Marcelo Trylesinski <marcelotryle@gmail.com>, David Montague <david@pydantic.dev>, Alex Hall <alex@pydantic.dev>, Douwe Maan <douwe@pydantic.dev>
 License-Expression: MIT
 License-File: LICENSE
 Classifier: Development Status :: 4 - Beta
@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.4.3
+Requires-Dist: pydantic-ai-slim==0.4.5
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
@@ -51,18 +51,18 @@ Description-Content-Type: text/markdown
 This is a library for evaluating non-deterministic (or "stochastic") functions in Python. It provides a simple,
 Pythonic interface for defining and running stochastic functions, and analyzing the results of running those functions.

-While this library is developed as part of [PydanticAI](https://ai.pydantic.dev), it only uses PydanticAI for a small
+While this library is developed as part of [Pydantic AI](https://ai.pydantic.dev), it only uses Pydantic AI for a small
 subset of generative functionality internally, and it is designed to be used with arbitrary "stochastic function"
-implementations. In particular, it can be used with other (non-PydanticAI) AI libraries, agent frameworks, etc.
+implementations. In particular, it can be used with other (non-Pydantic AI) AI libraries, agent frameworks, etc.

-As with PydanticAI, this library prioritizes type safety and use of common Python syntax over esoteric, domain-specific
+As with Pydantic AI, this library prioritizes type safety and use of common Python syntax over esoteric, domain-specific
 use of Python syntax.

 Full documentation is available at [ai.pydantic.dev/evals](https://ai.pydantic.dev/evals).

 ## Example

-While you'd typically use Pydantic Evals with more complex functions (such as PydanticAI agents or graphs), here's a
+While you'd typically use Pydantic Evals with more complex functions (such as Pydantic AI agents or graphs), here's a
 quick example that evaluates a simple function against a test case using both custom and built-in evaluators:

 ```python
@@ -110,7 +110,7 @@ report.print(include_input=True, include_output=True)
 """
 ```

-Using the library with more complex functions, such as PydanticAI agents, is similar — all you need to do is define a
+Using the library with more complex functions, such as Pydantic AI agents, is similar — all you need to do is define a
 task function wrapping the function you want to evaluate, with a signature that matches the inputs and outputs of your
 test cases.

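The README excerpt above notes that evaluating a more complex function only requires a task function whose signature matches the test cases. A minimal hedged sketch of that pattern follows, assuming the `Case`, `Dataset`, and `IsInstance` names from the pydantic-evals evaluators API; the `guess_city` task is illustrative, and the full documented example lives at ai.pydantic.dev/evals.

```python
# Hedged sketch of the "wrap your function in a task" pattern described above;
# the guess_city task and the case contents are illustrative, not from the package.
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import IsInstance


async def guess_city(question: str) -> str:
    # Stand-in for a more complex stochastic function (e.g. an agent call).
    return 'Paris'


dataset = Dataset(
    cases=[
        Case(
            name='capital_question',
            inputs='What city is the capital of France?',
            expected_output='Paris',
        )
    ],
    evaluators=[IsInstance(type_name='str')],
)

report = dataset.evaluate_sync(guess_city)
report.print(include_input=True, include_output=True)
```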
{pydantic_evals-0.4.3.dist-info → pydantic_evals-0.4.5.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
 pydantic_evals/__init__.py,sha256=OKRbfhdc8UZPzrPJMZUQwvzIxLhXmEZxz1ZuD921fy4,839
 pydantic_evals/_utils.py,sha256=PfhmPbdQp-q90s568LuG45zDDXxgO13BEz8MQJK8qw4,2922
 pydantic_evals/dataset.py,sha256=SY0k2htYG0d0KRRem3pnQdN7rPztJ_TCFnCb0zkXbCk,46477
-pydantic_evals/generation.py,sha256=
+pydantic_evals/generation.py,sha256=Yd1rfbsDjjBBHDk-1KDu48hlITjM2-74rTnPBD_sqbA,3494
 pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pydantic_evals/evaluators/__init__.py,sha256=uj110viFiDvqrIcuDcWexk_uBgJqhKMGPHT2YvDb7LA,624
 pydantic_evals/evaluators/_run_evaluator.py,sha256=Dsnqxno7CrcKWYcnkLuwvPKWQGDRBmbBTwwstcmc0ak,2448
@@ -9,7 +9,7 @@ pydantic_evals/evaluators/_spec.py,sha256=Xi_FHwnmAZ1x2hoJFw4MBZuG0TilNKqMRW3P74
 pydantic_evals/evaluators/common.py,sha256=ZBrNTfPJoOpT4WNXTRGS0UcKhnuhfYJxjNzum-zHFk8,12064
 pydantic_evals/evaluators/context.py,sha256=8osQRCFW8Vekw7JiiOOCHHH3HOGdhDaUlr8i-twSetg,3870
 pydantic_evals/evaluators/evaluator.py,sha256=yOEKLOxElm7_4tLcq6_myXI0e4Ei9svZP9y5DTq4SYI,11147
-pydantic_evals/evaluators/llm_as_a_judge.py,sha256=
+pydantic_evals/evaluators/llm_as_a_judge.py,sha256=xQjaGuCRXZdlExacFyR4Y4kFmwBh2QxAfEyaed_aqvk,9615
 pydantic_evals/otel/__init__.py,sha256=i2p3vDrOW039N4XM-UkozDhCm0ZmE6ZSs1yV5t03vd0,117
 pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=vIDF9-6lDuNKZuSM5hN_R8VRK4jzmdfe1DgWdXwxVbc,6758
 pydantic_evals/otel/_context_subtree.py,sha256=Iazp4w3IIBMCrkqWL-hTG-2QG_-2X81p794WG9MAsGk,1175
@@ -17,7 +17,7 @@ pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD
 pydantic_evals/otel/span_tree.py,sha256=LV5Hsyo4riJzevHyBz8wxP82S-ry5zeKYi9bKWjGCS8,23057
 pydantic_evals/reporting/__init__.py,sha256=k_3tteqXGh0yGvgpN68gB0CjG9wzrakzDTve2GHend4,42148
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-0.4.3.dist-info/METADATA,sha256=
-pydantic_evals-0.4.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-pydantic_evals-0.4.3.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
-pydantic_evals-0.4.3.dist-info/RECORD,,
+pydantic_evals-0.4.5.dist-info/METADATA,sha256=6axvUhXnQNjpGjtEIsUv2FHDNQnOQeBMEZmaqzfoo_s,7938
+pydantic_evals-0.4.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-0.4.5.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-0.4.5.dist-info/RECORD,,
{pydantic_evals-0.4.3.dist-info → pydantic_evals-0.4.5.dist-info}/WHEEL
File without changes

{pydantic_evals-0.4.3.dist-info → pydantic_evals-0.4.5.dist-info}/licenses/LICENSE
File without changes