pydantic-ai 0.0.49__tar.gz → 0.0.51__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pydantic-ai might be problematic.
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/Makefile +1 -1
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/PKG-INFO +3 -3
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_dataset.py +85 -38
- pydantic_ai-0.0.51/tests/evals/test_reporting.py +437 -0
- pydantic_ai-0.0.51/tests/evals/utils.py +14 -0
- pydantic_ai-0.0.51/tests/models/cassettes/test_gemini/test_gemini_drop_exclusive_maximum.yaml +326 -0
- pydantic_ai-0.0.51/tests/models/cassettes/test_gemini/test_gemini_exclusive_minimum_and_maximum.yaml +158 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_gemini.py +17 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_cli.py +3 -3
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_examples.py +1 -0
- pydantic_ai-0.0.49/tests/evals/test_reporting.py +0 -380
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/.gitignore +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/LICENSE +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/README.md +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/pyproject.toml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/__init__.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/assets/dummy.pdf +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/assets/kiwi.png +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/assets/marcelo.mp3 +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/cassettes/test_mcp/test_agent_with_stdio_server.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/conftest.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/__init__.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_evaluator_base.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_evaluator_common.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_evaluator_context.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_evaluator_spec.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_evaluators.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_llm_as_a_judge.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_otel.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_render_numbers.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_reports.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_utils.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/example_modules/README.md +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/example_modules/bank_database.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/example_modules/fake_database.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/example_modules/weather_service.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/graph/__init__.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/graph/test_file_persistence.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/graph/test_graph.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/graph/test_mermaid.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/graph/test_persistence.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/graph/test_state.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/graph/test_utils.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/import_examples.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/json_body_serializer.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/mcp_server.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/__init__.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_anthropic/test_document_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_anthropic/test_document_url_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_anthropic/test_image_url_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_anthropic/test_image_url_input_invalid_mime_type.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_anthropic/test_multiple_parallel_tool_calls.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_anthropic/test_text_document_url_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_bedrock_model.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_bedrock_model_anthropic_model_without_tools.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_bedrock_model_iter_stream.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_bedrock_model_max_tokens.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_bedrock_model_retry.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_bedrock_model_stream.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_bedrock_model_structured_response.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_bedrock_model_top_p.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_document_url_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_image_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_image_url_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_text_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_bedrock/test_text_document_url_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_cohere/test_request_simple_success_with_vcr.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_gemini/test_document_url_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_gemini/test_image_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_gemini/test_image_url_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_groq/test_image_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_groq/test_image_url_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_audio_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_document_url_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_image_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_max_completion_tokens[gpt-4.5-preview].yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_max_completion_tokens[gpt-4o-mini].yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_max_completion_tokens[o3-mini].yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_multiple_agent_tool_calls.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_openai_o1_mini_system_role[developer].yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_openai_o1_mini_system_role[system].yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai/test_user_id.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_audio_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_image_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_document_as_binary_content_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_document_url_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_image_url_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_builtin_tools.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_http_error.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_retry.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_simple_response.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_model_simple_response_with_tool_call.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_reasoning_effort.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_reasoning_generate_summary.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_result_type.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_stream.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_system_prompt.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/cassettes/test_openai_responses/test_openai_responses_text_document_url_input.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/mock_async_stream.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_anthropic.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_bedrock.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_cohere.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_fallback.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_groq.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_instrumented.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_mistral.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_model.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_model_function.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_model_names.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_model_test.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_openai.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/models/test_openai_responses.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/__init__.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/cassettes/test_azure/test_azure_provider_call.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/cassettes/test_google_vertex/test_vertexai_provider.yaml +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_anthropic.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_azure.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_bedrock.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_cohere.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_deepseek.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_google_gla.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_google_vertex.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_groq.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_mistral.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_openai.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/providers/test_provider_names.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_agent.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_deps.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_format_as_xml.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_json_body_serializer.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_live.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_logfire.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_mcp.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_messages.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_parts_manager.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_settings.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_streaming.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_tools.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_usage_limits.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/test_utils.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/typed_agent.py +0 -0
- {pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/typed_graph.py +0 -0
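The new Gemini cassettes (test_gemini_drop_exclusive_maximum, test_gemini_exclusive_minimum_and_maximum) and the 17 lines added to tests/models/test_gemini.py concern the JSON-schema exclusiveMinimum/exclusiveMaximum keywords, which Pydantic emits for gt/lt field constraints; the test names suggest this release drops or translates those keywords before sending tool schemas to Gemini. A minimal sketch of the schema shape involved (illustrative only, not taken from the diff; Chance is a made-up example model):

    # Pydantic turns gt/lt constraints into exclusiveMinimum/exclusiveMaximum
    # in the generated JSON schema, which is what these cassettes exercise.
    from pydantic import BaseModel, Field

    class Chance(BaseModel):
        probability: float = Field(gt=0, lt=1)

    print(Chance.model_json_schema()['properties']['probability'])
    # roughly: {'exclusiveMinimum': 0, 'exclusiveMaximum': 1, 'title': 'Probability', 'type': 'number'}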
{pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-ai
-Version: 0.0.49
+Version: 0.0.51
 Summary: Agent Framework / shim to use Pydantic with LLMs
 Project-URL: Homepage, https://ai.pydantic.dev
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -28,9 +28,9 @@ Classifier: Topic :: Internet
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.9
-Requires-Dist: pydantic-ai-slim[anthropic,bedrock,cli,cohere,evals,groq,mcp,mistral,openai,vertexai]==0.0.49
+Requires-Dist: pydantic-ai-slim[anthropic,bedrock,cli,cohere,evals,groq,mcp,mistral,openai,vertexai]==0.0.51
 Provides-Extra: examples
-Requires-Dist: pydantic-ai-examples==0.0.49; extra == 'examples'
+Requires-Dist: pydantic-ai-examples==0.0.51; extra == 'examples'
 Provides-Extra: logfire
 Requires-Dist: logfire>=3.11.0; extra == 'logfire'
 Description-Content-Type: text/markdown
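Both pinned requirements move in lockstep with the package version, so after an upgrade the installed releases can be confirmed with the standard library alone (a usage sketch, not part of the diff):

    from importlib.metadata import version

    print(version('pydantic-ai'))       # expected: 0.0.51
    print(version('pydantic-ai-slim'))  # pinned to the same release by PKG-INFO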
{pydantic_ai-0.0.49 → pydantic_ai-0.0.51}/tests/evals/test_dataset.py

@@ -13,6 +13,7 @@ from inline_snapshot import snapshot
 from pydantic import BaseModel

 from ..conftest import try_import
+from .utils import render_table

 with try_import() as imports_successful:
     from pydantic_evals import Case, Dataset
@@ -342,40 +343,42 @@ async def test_increment_eval_metric(example_dataset: Dataset[TaskInput, TaskOut
         return TaskOutput(answer=f'answer to {inputs.query}')

     report = await example_dataset.evaluate(my_task)
-    assert report.cases == …  # old 34-line snapshot elided by the diff viewer
+    assert report.cases == snapshot(
+        [
+            ReportCase(
+                name='case1',
+                inputs=TaskInput(query='What is 2+2?'),
+                metadata=TaskMetadata(difficulty='easy', category='general'),
+                expected_output=TaskOutput(answer='4', confidence=1.0),
+                output=TaskOutput(answer='answer to What is 2+2?', confidence=1.0),
+                metrics={'chars': 12},
+                attributes={'is_about_france': False},
+                scores={},
+                labels={},
+                assertions={},
+                task_duration=1.0,
+                total_duration=3.0,
+                trace_id='00000000000000000000000000000001',
+                span_id='0000000000000003',
+            ),
+            ReportCase(
+                name='case2',
+                inputs=TaskInput(query='What is the capital of France?'),
+                metadata=TaskMetadata(difficulty='medium', category='geography'),
+                expected_output=TaskOutput(answer='Paris', confidence=1.0),
+                output=TaskOutput(answer='answer to What is the capital of France?', confidence=1.0),
+                metrics={'chars': 30},
+                attributes={'is_about_france': True},
+                scores={},
+                labels={},
+                assertions={},
+                task_duration=1.0,
+                total_duration=3.0,
+                trace_id='00000000000000000000000000000001',
+                span_id='0000000000000007',
+            ),
+        ]
+    )


 async def test_repeated_name_outputs(example_dataset: Dataset[TaskInput, TaskOutput, TaskMetadata]):
@@ -393,7 +396,7 @@ async def test_repeated_name_outputs(example_dataset: Dataset[TaskInput, TaskOut
         [
             ReportCase(
                 name='case1',
-                inputs=…
+                inputs=TaskInput(query='What is 2+2?'),
                 metadata=TaskMetadata(difficulty='easy', category='general'),
                 expected_output=TaskOutput(answer='4', confidence=1.0),
                 output=TaskOutput(answer='answer to What is 2+2?', confidence=1.0),
@@ -419,7 +422,7 @@ async def test_repeated_name_outputs(example_dataset: Dataset[TaskInput, TaskOut
             ),
             ReportCase(
                 name='case2',
-                inputs=…
+                inputs=TaskInput(query='What is the capital of France?'),
                 metadata=TaskMetadata(difficulty='medium', category='geography'),
                 expected_output=TaskOutput(answer='Paris', confidence=1.0),
                 output=TaskOutput(answer='answer to What is the capital of France?', confidence=1.0),
@@ -467,7 +470,7 @@ async def test_genai_attribute_collection(example_dataset: Dataset[TaskInput, Ta
         [
             ReportCase(
                 name='case1',
-                inputs=…
+                inputs=TaskInput(query='What is 2+2?'),
                 metadata=TaskMetadata(difficulty='easy', category='general'),
                 expected_output=TaskOutput(answer='4', confidence=1.0),
                 output=TaskOutput(answer='answer to What is 2+2?', confidence=1.0),
@@ -483,7 +486,7 @@ async def test_genai_attribute_collection(example_dataset: Dataset[TaskInput, Ta
             ),
             ReportCase(
                 name='case2',
-                inputs=…
+                inputs=TaskInput(query='What is the capital of France?'),
                 metadata=TaskMetadata(difficulty='medium', category='geography'),
                 expected_output=TaskOutput(answer='Paris', confidence=1.0),
                 output=TaskOutput(answer='answer to What is the capital of France?', confidence=1.0),
@@ -988,3 +991,47 @@ def test_import_generate_dataset():
     from pydantic_evals.generation import generate_dataset

     assert generate_dataset
+
+
+def test_evaluate_non_serializable_inputs():
+    @dataclass
+    class MyInputs:
+        result_type: type[str] | type[int]
+
+    my_dataset = Dataset[MyInputs, Any, Any](
+        cases=[
+            Case(
+                name='str',
+                inputs=MyInputs(result_type=str),
+                expected_output='abc',
+            ),
+            Case(
+                name='int',
+                inputs=MyInputs(result_type=int),
+                expected_output=123,
+            ),
+        ],
+    )
+
+    async def my_task(my_inputs: MyInputs) -> int | str:
+        if issubclass(my_inputs.result_type, str):
+            return my_inputs.result_type('abc')
+        else:
+            return my_inputs.result_type(123)
+
+    report = my_dataset.evaluate_sync(task=my_task)
+    assert [c.inputs for c in report.cases] == snapshot([MyInputs(result_type=str), MyInputs(result_type=int)])
+
+    table = report.console_table(include_input=True)
+    assert render_table(table) == snapshot("""\
+Evaluation Summary: my_task
+┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
+┃ Case ID  ┃ Inputs                                                                             ┃ Duration ┃
+┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
+│ str      │ test_evaluate_non_serializable_inputs.<locals>.MyInputs(result_type=<class 'str'>) │ 1.0s     │
+├──────────┼────────────────────────────────────────────────────────────────────────────────────┼──────────┤
+│ int      │ test_evaluate_non_serializable_inputs.<locals>.MyInputs(result_type=<class 'int'>) │ 1.0s     │
+├──────────┼────────────────────────────────────────────────────────────────────────────────────┼──────────┤
+│ Averages │                                                                                    │ 1.0s     │
+└──────────┴────────────────────────────────────────────────────────────────────────────────────┴──────────┘
+""")
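Both test files import render_table from the new tests/evals/utils.py (+14 lines), whose body is not shown in this diff. A helper with the behaviour the snapshot assertions rely on (rendering a Rich Table to a plain string) could look like this sketch; treat it as an illustration under that assumption, not the actual implementation:

    from rich.console import Console
    from rich.table import Table

    def render_table(table: Table) -> str:
        # Record the console output and export it as plain text, so the
        # snapshot comparison sees exactly what a terminal would print.
        console = Console(record=True, width=400)  # wide enough to avoid wrapping
        console.print(table)
        return console.export_text()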
pydantic_ai-0.0.51/tests/evals/test_reporting.py (new file; all 437 lines added)

@@ -0,0 +1,437 @@
from __future__ import annotations as _annotations

from dataclasses import dataclass

import pytest
from inline_snapshot import snapshot
from pydantic import BaseModel

from ..conftest import try_import
from .utils import render_table

with try_import() as imports_successful:
    from pydantic_evals.evaluators import EvaluationResult, Evaluator, EvaluatorContext
    from pydantic_evals.reporting import (
        EvaluationRenderer,
        EvaluationReport,
        ReportCase,
        ReportCaseAggregate,
    )

pytestmark = [pytest.mark.skipif(not imports_successful(), reason='pydantic-evals not installed'), pytest.mark.anyio]


class TaskInput(BaseModel):
    query: str


class TaskOutput(BaseModel):
    answer: str


class TaskMetadata(BaseModel):
    difficulty: str


@pytest.fixture
def mock_evaluator() -> Evaluator[TaskInput, TaskOutput, TaskMetadata]:
    class MockEvaluator(Evaluator[TaskInput, TaskOutput, TaskMetadata]):
        def evaluate(self, ctx: EvaluatorContext[TaskInput, TaskOutput, TaskMetadata]) -> bool:
            raise NotImplementedError

    return MockEvaluator()


@pytest.fixture
def sample_assertion(mock_evaluator: Evaluator[TaskInput, TaskOutput, TaskMetadata]) -> EvaluationResult[bool]:
    return EvaluationResult(
        name='MockEvaluator',
        value=True,
        reason=None,
        source=mock_evaluator,
    )


@pytest.fixture
def sample_score(mock_evaluator: Evaluator[TaskInput, TaskOutput, TaskMetadata]) -> EvaluationResult[float]:
    return EvaluationResult(
        name='MockEvaluator',
        value=2.5,
        reason=None,
        source=mock_evaluator,
    )


@pytest.fixture
def sample_label(mock_evaluator: Evaluator[TaskInput, TaskOutput, TaskMetadata]) -> EvaluationResult[str]:
    return EvaluationResult(
        name='MockEvaluator',
        value='hello',
        reason=None,
        source=mock_evaluator,
    )


@pytest.fixture
def sample_report_case(
    sample_assertion: EvaluationResult[bool], sample_score: EvaluationResult[float], sample_label: EvaluationResult[str]
) -> ReportCase:
    return ReportCase(
        name='test_case',
        inputs={'query': 'What is 2+2?'},
        output={'answer': '4'},
        expected_output={'answer': '4'},
        metadata={'difficulty': 'easy'},
        metrics={'accuracy': 0.95},
        attributes={},
        scores={'score1': sample_score},
        labels={'label1': sample_label},
        assertions={sample_assertion.name: sample_assertion},
        task_duration=0.1,
        total_duration=0.2,
        trace_id='test-trace-id',
        span_id='test-span-id',
    )


@pytest.fixture
def sample_report(sample_report_case: ReportCase) -> EvaluationReport:
    return EvaluationReport(
        cases=[sample_report_case],
        name='test_report',
    )


async def test_evaluation_renderer_basic(sample_report: EvaluationReport):
    """Test basic functionality of EvaluationRenderer."""
    renderer = EvaluationRenderer(
        include_input=True,
        include_output=True,
        include_metadata=True,
        include_expected_output=True,
        include_durations=True,
        include_total_duration=True,
        include_removed_cases=False,
        include_averages=True,
        input_config={},
        metadata_config={},
        output_config={},
        score_configs={},
        label_configs={},
        metric_configs={},
        duration_config={},
    )

    table = renderer.build_table(sample_report)
    assert render_table(table) == snapshot("""\
Evaluation Summary: test_report
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
┃ Case ID ┃ Inputs ┃ Metadata ┃ Expected Output ┃ Outputs ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Durations ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩
│ test_case │ {'query': 'What is 2+2?'} │ {'difficulty': 'easy'} │ {'answer': '4'} │ {'answer': '4'} │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ task: 0.100 │
│ │ │ │ │ │ │ │ │ │ total: 0.200 │
├───────────┼───────────────────────────┼────────────────────────┼─────────────────┼─────────────────┼──────────────┼────────────────────────┼─────────────────┼────────────┼──────────────┤
│ Averages │ │ │ │ │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.950 │ 100.0% ✔ │ task: 0.100 │
│ │ │ │ │ │ │ │ │ │ total: 0.200 │
└───────────┴───────────────────────────┴────────────────────────┴─────────────────┴─────────────────┴──────────────┴────────────────────────┴─────────────────┴────────────┴──────────────┘
""")


async def test_evaluation_renderer_with_baseline(sample_report: EvaluationReport):
    """Test EvaluationRenderer with baseline comparison."""
    baseline_report = EvaluationReport(
        cases=[
            ReportCase(
                name='test_case',
                inputs={'query': 'What is 2+2?'},
                output={'answer': '4'},
                expected_output={'answer': '4'},
                metadata={'difficulty': 'easy'},
                metrics={'accuracy': 0.90},
                attributes={},
                scores={
                    'score1': EvaluationResult(
                        name='MockEvaluator',
                        value=2.5,
                        reason=None,
                        source=sample_report.cases[0].scores['score1'].source,
                    )
                },
                labels={
                    'label1': EvaluationResult(
                        name='MockEvaluator',
                        value='hello',
                        reason=None,
                        source=sample_report.cases[0].labels['label1'].source,
                    )
                },
                assertions={},
                task_duration=0.15,
                total_duration=0.25,
                trace_id='test-trace-id',
                span_id='test-span-id',
            )
        ],
        name='baseline_report',
    )

    renderer = EvaluationRenderer(
        include_input=True,
        include_metadata=True,
        include_expected_output=True,
        include_output=True,
        include_durations=True,
        include_total_duration=True,
        include_removed_cases=False,
        include_averages=True,
        input_config={},
        metadata_config={},
        output_config={},
        score_configs={},
        label_configs={},
        metric_configs={},
        duration_config={},
    )

    table = renderer.build_diff_table(sample_report, baseline_report)
    assert render_table(table) == snapshot("""\
Evaluation Diff: baseline_report → test_report
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Case ID ┃ Inputs ┃ Metadata ┃ Expected Output ┃ Outputs ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Durations ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ test_case │ {'query': 'What is 2+2?'} │ {'difficulty': 'easy'} │ {'answer': '4'} │ {'answer': '4'} │ score1: 2.50 │ label1: EvaluationResult(name='MockEvaluator', value='hello', reason=None, │ accuracy: 0.900 → 0.950 (+0.05 / +5.6%) │ → ✔ │ task: 0.150 → 0.100 (-0.05 / -33.3%) │
│ │ │ │ │ │ │ source=mock_evaluator.<locals>.MockEvaluator()) │ │ │ total: 0.250 → 0.200 (-0.05 / -20.0%) │
├───────────┼───────────────────────────┼────────────────────────┼─────────────────┼─────────────────┼──────────────┼─────────────────────────────────────────────────────────────────────────────────────┼─────────────────────────────────────────┼──────────────┼───────────────────────────────────────┤
│ Averages │ │ │ │ │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.900 → 0.950 (+0.05 / +5.6%) │ - → 100.0% ✔ │ task: 0.150 → 0.100 (-0.05 / -33.3%) │
│ │ │ │ │ │ │ │ │ │ total: 0.250 → 0.200 (-0.05 / -20.0%) │
└───────────┴───────────────────────────┴────────────────────────┴─────────────────┴─────────────────┴──────────────┴─────────────────────────────────────────────────────────────────────────────────────┴─────────────────────────────────────────┴──────────────┴───────────────────────────────────────┘
""")


async def test_evaluation_renderer_with_removed_cases(sample_report: EvaluationReport):
    """Test EvaluationRenderer with removed cases."""
    baseline_report = EvaluationReport(
        cases=[
            ReportCase(
                name='removed_case',
                inputs={'query': 'What is 3+3?'},
                output={'answer': '6'},
                expected_output={'answer': '6'},
                metadata={'difficulty': 'medium'},
                metrics={'accuracy': 0.85},
                attributes={},
                scores={},
                labels={},
                assertions={},
                task_duration=0.1,
                total_duration=0.15,
                trace_id='test-trace-id-2',
                span_id='test-span-id-2',
            )
        ],
        name='baseline_report',
    )

    renderer = EvaluationRenderer(
        include_input=True,
        include_metadata=True,
        include_expected_output=True,
        include_output=True,
        include_durations=True,
        include_total_duration=True,
        include_removed_cases=True,
        include_averages=True,
        input_config={},
        metadata_config={},
        output_config={},
        score_configs={},
        label_configs={},
        metric_configs={},
        duration_config={},
    )

    table = renderer.build_diff_table(sample_report, baseline_report)
    assert render_table(table) == snapshot("""\
Evaluation Diff: baseline_report → test_report
┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ Case ID ┃ Inputs ┃ Metadata ┃ Expected Output ┃ Outputs ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Durations ┃
┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ + Added Case │ {'query': 'What is 2+2?'} │ {'difficulty': 'easy'} │ {'answer': '4'} │ {'answer': '4'} │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ task: 0.100 │
│ test_case │ │ │ │ │ │ │ │ │ total: 0.200 │
├────────────────┼───────────────────────────┼──────────────────────────┼─────────────────┼─────────────────┼──────────────────────────┼────────────────────────────────────┼─────────────────────────────────────────┼──────────────┼───────────────────────────────────────┤
│ - Removed Case │ {'query': 'What is 3+3?'} │ {'difficulty': 'medium'} │ {'answer': '6'} │ {'answer': '6'} │ - │ - │ accuracy: 0.850 │ - │ task: 0.100 │
│ removed_case │ │ │ │ │ │ │ │ │ total: 0.150 │
├────────────────┼───────────────────────────┼──────────────────────────┼─────────────────┼─────────────────┼──────────────────────────┼────────────────────────────────────┼─────────────────────────────────────────┼──────────────┼───────────────────────────────────────┤
│ Averages │ │ │ │ │ score1: <missing> → 2.50 │ label1: <missing> → {'hello': 1.0} │ accuracy: 0.850 → 0.950 (+0.1 / +11.8%) │ - → 100.0% ✔ │ task: 0.100 │
│ │ │ │ │ │ │ │ │ │ total: 0.150 → 0.200 (+0.05 / +33.3%) │
└────────────────┴───────────────────────────┴──────────────────────────┴─────────────────┴─────────────────┴──────────────────────────┴────────────────────────────────────┴─────────────────────────────────────────┴──────────────┴───────────────────────────────────────┘
""")


async def test_evaluation_renderer_with_custom_configs(sample_report: EvaluationReport):
    """Test EvaluationRenderer with custom render configurations."""
    renderer = EvaluationRenderer(
        include_input=True,
        include_metadata=True,
        include_expected_output=True,
        include_output=True,
        include_durations=True,
        include_total_duration=True,
        include_removed_cases=False,
        include_averages=True,
        input_config={'value_formatter': lambda x: str(x)},
        metadata_config={'value_formatter': lambda x: str(x)},
        output_config={'value_formatter': lambda x: str(x)},
        score_configs={
            'score1': {
                'value_formatter': '{:.2f}',
                'diff_formatter': '{:+.2f}',
                'diff_atol': 0.01,
                'diff_rtol': 0.05,
                'diff_increase_style': 'bold green',
                'diff_decrease_style': 'bold red',
            }
        },
        label_configs={'label1': {'value_formatter': lambda x: str(x)}},
        metric_configs={
            'accuracy': {
                'value_formatter': '{:.1%}',
                'diff_formatter': '{:+.1%}',
                'diff_atol': 0.01,
                'diff_rtol': 0.05,
                'diff_increase_style': 'bold green',
                'diff_decrease_style': 'bold red',
            }
        },
        duration_config={
            'value_formatter': '{:.3f}s',
            'diff_formatter': '{:+.3f}s',
            'diff_atol': 0.001,
            'diff_rtol': 0.05,
            'diff_increase_style': 'bold red',
            'diff_decrease_style': 'bold green',
        },
    )

    table = renderer.build_table(sample_report)
    assert render_table(table) == snapshot("""\
Evaluation Summary: test_report
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Case ID ┃ Inputs ┃ Metadata ┃ Expected Output ┃ Outputs ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Durations ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ test_case │ {'query': 'What is 2+2?'} │ {'difficulty': 'easy'} │ {'answer': '4'} │ {'answer': '4'} │ score1: 2.50 │ label1: hello │ accuracy: 95.0% │ ✔ │ task: 0.100s │
│ │ │ │ │ │ │ │ │ │ total: 0.200s │
├───────────┼───────────────────────────┼────────────────────────┼─────────────────┼─────────────────┼──────────────┼────────────────────────┼─────────────────┼────────────┼───────────────┤
│ Averages │ │ │ │ │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 95.0% │ 100.0% ✔ │ task: 0.100s │
│ │ │ │ │ │ │ │ │ │ total: 0.200s │
└───────────┴───────────────────────────┴────────────────────────┴─────────────────┴─────────────────┴──────────────┴────────────────────────┴─────────────────┴────────────┴───────────────┘
""")


async def test_report_case_aggregate_average():
    """Test ReportCaseAggregate.average() method."""

    @dataclass
    class MockEvaluator(Evaluator[TaskInput, TaskOutput, TaskMetadata]):
        def evaluate(self, ctx: EvaluatorContext[TaskInput, TaskOutput, TaskMetadata]) -> float:
            raise NotImplementedError

    cases = [
        ReportCase(
            name='case1',
            inputs={'query': 'What is 2+2?'},
            output={'answer': '4'},
            expected_output={'answer': '4'},
            metadata={'difficulty': 'easy'},
            metrics={'accuracy': 0.95},
            attributes={},
            scores={
                'score1': EvaluationResult(
                    name='MockEvaluator',
                    value=0.8,
                    reason=None,
                    source=MockEvaluator(),
                )
            },
            labels={
                'label1': EvaluationResult(
                    name='MockEvaluator',
                    value='good',
                    reason=None,
                    source=MockEvaluator(),
                )
            },
            assertions={
                'assert1': EvaluationResult(
                    name='MockEvaluator',
                    value=True,
                    reason=None,
                    source=MockEvaluator(),
                )
            },
            task_duration=0.1,
            total_duration=0.2,
            trace_id='test-trace-id-1',
            span_id='test-span-id-1',
        ),
        ReportCase(
            name='case2',
            inputs={'query': 'What is 3+3?'},
            output={'answer': '6'},
            expected_output={'answer': '6'},
            metadata={'difficulty': 'medium'},
            metrics={'accuracy': 0.85},
            attributes={},
            scores={
                'score1': EvaluationResult(
                    name='MockEvaluator',
                    value=0.7,
                    reason=None,
                    source=MockEvaluator(),
                )
            },
            labels={
                'label1': EvaluationResult(
                    name='MockEvaluator',
                    value='good',
                    reason=None,
                    source=MockEvaluator(),
                )
            },
            assertions={
                'assert1': EvaluationResult(
                    name='MockEvaluator',
                    value=False,
                    reason=None,
                    source=MockEvaluator(),
                )
            },
            task_duration=0.15,
            total_duration=0.25,
            trace_id='test-trace-id-2',
            span_id='test-span-id-2',
        ),
    ]

    aggregate = ReportCaseAggregate.average(cases)

    assert aggregate.name == 'Averages'
    assert aggregate.scores['score1'] == 0.75  # (0.8 + 0.7) / 2
    assert aggregate.labels['label1']['good'] == 1.0  # both cases have the 'good' label
    assert abs(aggregate.metrics['accuracy'] - 0.90) < 1e-10  # (0.95 + 0.85) / 2, allowing for floating-point error
    assert aggregate.assertions == 0.5  # 1 passing out of 2 assertions
    assert aggregate.task_duration == 0.125  # (0.1 + 0.15) / 2
    assert aggregate.total_duration == 0.225  # (0.2 + 0.25) / 2


async def test_report_case_aggregate_empty():
    """Test ReportCaseAggregate.average() with an empty cases list."""
    assert ReportCaseAggregate.average([]).model_dump() == {
        'assertions': None,
        'labels': {},
        'metrics': {},
        'name': 'Averages',
        'scores': {},
        'task_duration': 0.0,
        'total_duration': 0.0,
    }