pydantic-evals 0.2.12__py3-none-any.whl → 0.2.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydantic_evals/evaluators/common.py +22 -7
- pydantic_evals/evaluators/llm_as_a_judge.py +148 -3
- {pydantic_evals-0.2.12.dist-info → pydantic_evals-0.2.13.dist-info}/METADATA +2 -2
- {pydantic_evals-0.2.12.dist-info → pydantic_evals-0.2.13.dist-info}/RECORD +6 -6
- {pydantic_evals-0.2.12.dist-info → pydantic_evals-0.2.13.dist-info}/WHEEL +0 -0
- {pydantic_evals-0.2.12.dist-info → pydantic_evals-0.2.13.dist-info}/licenses/LICENSE +0 -0
pydantic_evals/evaluators/common.py

@@ -194,6 +194,7 @@ class LLMJudge(Evaluator[object, object, object]):
     rubric: str
     model: models.Model | models.KnownModelName | None = None
     include_input: bool = False
+    include_expected_output: bool = False
     model_settings: ModelSettings | None = None
     score: OutputConfig | Literal[False] = False
     assertion: OutputConfig | Literal[False] = field(default_factory=lambda: OutputConfig(include_reason=True))

@@ -203,15 +204,29 @@ class LLMJudge(Evaluator[object, object, object]):
         ctx: EvaluatorContext[object, object, object],
     ) -> EvaluatorOutput:
         if self.include_input:
-            from .llm_as_a_judge import judge_input_output
-
-            grading_output = await judge_input_output(
-                ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
-            )
+            if self.include_expected_output:
+                from .llm_as_a_judge import judge_input_output_expected
+
+                grading_output = await judge_input_output_expected(
+                    ctx.inputs, ctx.output, ctx.expected_output, self.rubric, self.model, self.model_settings
+                )
+            else:
+                from .llm_as_a_judge import judge_input_output
+
+                grading_output = await judge_input_output(
+                    ctx.inputs, ctx.output, self.rubric, self.model, self.model_settings
+                )
         else:
-            from .llm_as_a_judge import judge_output
+            if self.include_expected_output:
+                from .llm_as_a_judge import judge_output_expected
+
+                grading_output = await judge_output_expected(
+                    ctx.output, ctx.expected_output, self.rubric, self.model, self.model_settings
+                )
+            else:
+                from .llm_as_a_judge import judge_output
 
-            grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)
+                grading_output = await judge_output(ctx.output, self.rubric, self.model, self.model_settings)
 
         output: dict[str, EvaluationScalar | EvaluationReason] = {}
         include_both = self.score is not False and self.assertion is not False
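In practice, the `common.py` change adds a single new `LLMJudge` option, `include_expected_output`, which forwards a case's `expected_output` to the judge model. A minimal sketch of how the flag might be used, assuming the documented `Case`/`Dataset` API; the task function and case data are hypothetical, and running it would invoke the default `openai:gpt-4o` judge:

```python
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge


async def answer_question(question: str) -> str:
    # Hypothetical task under evaluation; in practice this would call an agent or model.
    return 'Cerulean'


dataset = Dataset(
    cases=[Case(name='sky', inputs='What color is the sky?', expected_output='Blue')],
    evaluators=[
        LLMJudge(
            rubric='The output is consistent with the expected output',
            include_input=True,
            include_expected_output=True,  # new in 0.2.13
        )
    ],
)

report = dataset.evaluate_sync(answer_question)
report.print()  # summary table; assumes the reporting API from the pydantic_evals docs
```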
pydantic_evals/evaluators/llm_as_a_judge.py

@@ -9,7 +9,14 @@ from pydantic_core import to_json
 from pydantic_ai import Agent, models
 from pydantic_ai.settings import ModelSettings
 
-__all__ = (
+__all__ = (
+    'GradingOutput',
+    'judge_input_output',
+    'judge_input_output_expected',
+    'judge_output',
+    'judge_output_expected',
+    'set_default_judge_model',
+)
 
 
 _default_model: models.Model | models.KnownModelName = 'openai:gpt-4o'

@@ -55,7 +62,16 @@ async def judge_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt =
+    user_prompt = dedent(
+        f"""
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
     return (
         await _judge_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output

@@ -96,12 +112,141 @@ async def judge_input_output(
     If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
     but this can be changed using the `set_default_judge_model` function.
     """
-    user_prompt =
+    user_prompt = dedent(
+        f"""
+        <Input>
+        {_stringify(inputs)}
+        </Input>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
     return (
         await _judge_input_output_agent.run(user_prompt, model=model or _default_model, model_settings=model_settings)
     ).output
 
 
+_judge_input_output_expected_agent = Agent(
+    name='judge_input_output_expected',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided input, expected output, and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <Input>What color is the sky?</Input>
+        <ExpectedOutput>Blue</ExpectedOutput>
+        <Output>Cerulean</Output>
+        <Rubric>The output is consistent with the expected output but doesn't have to match exactly</Rubric>
+        {"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
+
+        <Input>How many legs does a spider have?</Input>
+        <ExpectedOutput>8</ExpectedOutput>
+        <Output>Six</Output>
+        <Rubric>The output is factually consistent with the expected output</Rubric>
+        {"reason": "Spiders have 8 legs", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_input_output_expected(
+    inputs: Any,
+    output: Any,
+    expected_output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on the inputs and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <Input>
+        {_stringify(inputs)}
+        </Input>
+        <ExpectedOutput>
+        {_stringify(expected_output)}
+        </ExpectedOutput>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+
+    return (
+        await _judge_input_output_expected_agent.run(
+            user_prompt, model=model or _default_model, model_settings=model_settings
+        )
+    ).output
+
+
+_judge_output_expected_agent = Agent(
+    name='judge_output_expected',
+    system_prompt=dedent(
+        """
+        You are grading output according to a user-specified rubric. If the statement in the rubric is true for the provided expected output and output, then the output passes the test. You respond with a JSON object with this structure: {reason: string, pass: boolean, score: number}
+
+        Examples:
+
+        <ExpectedOutput>Blue</ExpectedOutput>
+        <Output>Cerulean</Output>
+        <Rubric>The output should be a shade of the expected output color</Rubric>
+        {"reason": "'Cerulean' is a shade of blue", "pass": true, "score": 1.0}
+
+        <ExpectedOutput>8</ExpectedOutput>
+        <Output>Six</Output>
+        <Rubric>The output should be a number written in words which matches the number written in digits in the expected output</Rubric>
+        {"reason": "The output is 'Six' which is a different number than 8", "pass": false, "score": 0.0}
+        """
+    ),
+    output_type=GradingOutput,
+)
+
+
+async def judge_output_expected(
+    output: Any,
+    expected_output: Any,
+    rubric: str,
+    model: models.Model | models.KnownModelName | None = None,
+    model_settings: ModelSettings | None = None,
+) -> GradingOutput:
+    """Judge the output of a model based on the expected output, output, and a rubric.
+
+    If the model is not specified, a default model is used. The default model starts as 'openai:gpt-4o',
+    but this can be changed using the `set_default_judge_model` function.
+    """
+    user_prompt = dedent(
+        f"""
+        <ExpectedOutput>
+        {_stringify(expected_output)}
+        </ExpectedOutput>
+        <Output>
+        {_stringify(output)}
+        </Output>
+        <Rubric>
+        {rubric}
+        </Rubric>
+        """
+    )
+    return (
+        await _judge_output_expected_agent.run(
+            user_prompt, model=model or _default_model, model_settings=model_settings
+        )
+    ).output
+
+
 def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:  # pragma: no cover
     """Set the default model used for judging.
 
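The new `judge_output_expected` and `judge_input_output_expected` helpers are also exported for direct use. A minimal sketch based on the signatures above; the `GradingOutput` field names (`reason`, `pass_`, `score`) are assumed from the existing model and are not part of this diff, and the call uses the default `openai:gpt-4o` judge unless `model` is passed:

```python
import asyncio

from pydantic_evals.evaluators.llm_as_a_judge import judge_output_expected


async def main() -> None:
    # Grade a single output against an expected output and a rubric.
    grading = await judge_output_expected(
        output='Six',
        expected_output=8,
        rubric='The output is factually consistent with the expected output',
    )
    # Field names assumed from the existing GradingOutput model, not shown in this diff.
    print(grading.pass_, grading.score, grading.reason)


if __name__ == '__main__':
    asyncio.run(main())
```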
{pydantic_evals-0.2.12.dist-info → pydantic_evals-0.2.13.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.2.12
+Version: 0.2.13
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai

@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.2.12
+Requires-Dist: pydantic-ai-slim==0.2.13
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
{pydantic_evals-0.2.12.dist-info → pydantic_evals-0.2.13.dist-info}/RECORD

@@ -6,10 +6,10 @@ pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pydantic_evals/evaluators/__init__.py,sha256=uj110viFiDvqrIcuDcWexk_uBgJqhKMGPHT2YvDb7LA,624
 pydantic_evals/evaluators/_run_evaluator.py,sha256=Dsnqxno7CrcKWYcnkLuwvPKWQGDRBmbBTwwstcmc0ak,2448
 pydantic_evals/evaluators/_spec.py,sha256=Xi_FHwnmAZ1x2hoJFw4MBZuG0TilNKqMRW3P74se32I,7080
-pydantic_evals/evaluators/common.py,sha256
+pydantic_evals/evaluators/common.py,sha256=ZBrNTfPJoOpT4WNXTRGS0UcKhnuhfYJxjNzum-zHFk8,12064
 pydantic_evals/evaluators/context.py,sha256=8osQRCFW8Vekw7JiiOOCHHH3HOGdhDaUlr8i-twSetg,3870
 pydantic_evals/evaluators/evaluator.py,sha256=yOEKLOxElm7_4tLcq6_myXI0e4Ei9svZP9y5DTq4SYI,11147
-pydantic_evals/evaluators/llm_as_a_judge.py,sha256=
+pydantic_evals/evaluators/llm_as_a_judge.py,sha256=raty91NWdu_FEt4ze_ugQHCouj1o72gYe3abJBtMqlU,8793
 pydantic_evals/otel/__init__.py,sha256=i2p3vDrOW039N4XM-UkozDhCm0ZmE6ZSs1yV5t03vd0,117
 pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=vIDF9-6lDuNKZuSM5hN_R8VRK4jzmdfe1DgWdXwxVbc,6758
 pydantic_evals/otel/_context_subtree.py,sha256=Iazp4w3IIBMCrkqWL-hTG-2QG_-2X81p794WG9MAsGk,1175

@@ -17,7 +17,7 @@ pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD
 pydantic_evals/otel/span_tree.py,sha256=LV5Hsyo4riJzevHyBz8wxP82S-ry5zeKYi9bKWjGCS8,23057
 pydantic_evals/reporting/__init__.py,sha256=tknRGM2fm8EUENxbq4K5duHZ_DgNzrVWhpGHFkoQ9zo,41677
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-0.2.
-pydantic_evals-0.2.
-pydantic_evals-0.2.
-pydantic_evals-0.2.
+pydantic_evals-0.2.13.dist-info/METADATA,sha256=Q3gMew8mckZeKRn7FTu-HENMFEJHknwoE4dcXGH5fHU,7787
+pydantic_evals-0.2.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-0.2.13.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-0.2.13.dist-info/RECORD,,

{pydantic_evals-0.2.12.dist-info → pydantic_evals-0.2.13.dist-info}/WHEEL
File without changes

{pydantic_evals-0.2.12.dist-info → pydantic_evals-0.2.13.dist-info}/licenses/LICENSE
File without changes