pydantic-evals 1.0.10__tar.gz → 1.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/PKG-INFO +2 -2
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/dataset.py +18 -5
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/llm_as_a_judge.py +1 -2
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/generation.py +2 -1
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/.gitignore +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/LICENSE +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/README.md +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/__init__.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/_utils.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/__init__.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/common.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/context.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/evaluator.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/spec.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/otel/__init__.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/otel/_context_subtree.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/otel/_errors.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/otel/span_tree.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/py.typed +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/reporting/__init__.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/reporting/render_numbers.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pyproject.toml +0 -0
{pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.0.10
+Version: 1.0.12
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai

@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.0.10
+Requires-Dist: pydantic-ai-slim==1.0.12
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
{pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/dataset.py

@@ -333,6 +333,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
+        retry_task: RetryConfig | None = None,
+        retry_evaluators: RetryConfig | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -346,12 +348,21 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
             progress: Whether to show a progress bar for the evaluation. Defaults to True.
+            retry_task: Optional retry configuration for the task execution.
+            retry_evaluators: Optional retry configuration for evaluator execution.

         Returns:
             A report containing the results of the evaluation.
         """
         return get_event_loop().run_until_complete(
-            self.evaluate(task, name=name, max_concurrency=max_concurrency, progress=progress)
+            self.evaluate(
+                task,
+                name=name,
+                max_concurrency=max_concurrency,
+                progress=progress,
+                retry_task=retry_task,
+                retry_evaluators=retry_evaluators,
+            )
         )

     def add_case(
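The new keyword arguments are simply threaded through from the synchronous wrapper to `self.evaluate`. Below is a minimal usage sketch, assuming the public `Case`/`Dataset` API of pydantic-evals; the task and case are made up, and both retry settings are left at `None` because the concrete shape of `RetryConfig` is not visible in this diff.

```python
# Minimal sketch (assumed usage, not from this diff): the dataset, task and case
# names are illustrative. retry_task / retry_evaluators default to None; the
# concrete RetryConfig type is not shown in the diff, so none is constructed here.
from pydantic_evals import Case, Dataset


async def upper_task(inputs: str) -> str:
    # Stand-in for a real (possibly LLM-backed) task under evaluation.
    return inputs.upper()


dataset = Dataset(cases=[Case(name='hello', inputs='hello', expected_output='HELLO')])

# New in 1.0.12: the sync wrapper accepts retry_task / retry_evaluators and
# forwards them to Dataset.evaluate().
report = dataset.evaluate_sync(upper_task, retry_task=None, retry_evaluators=None)
report.print()
```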
@@ -886,12 +897,14 @@ async def _run_task(
     # That way users can customize this logic. We'd default to a function that does the current thing but also
     # allow `None` to disable it entirely.
     for node in span_tree:
-        if node.attributes.get('gen_ai.operation.name') == 'chat':
-            task_run.increment_metric('requests', 1)
         for k, v in node.attributes.items():
-            if not isinstance(v, int | float):
+            if k == 'gen_ai.operation.name' and v == 'chat':
+                task_run.increment_metric('requests', 1)
+            elif not isinstance(v, int | float):
                 continue
-            if k.startswith('gen_ai.usage.details.'):
+            elif k == 'operation.cost':
+                task_run.increment_metric('cost', v)
+            elif k.startswith('gen_ai.usage.details.'):
                 task_run.increment_metric(k.removeprefix('gen_ai.usage.details.'), v)
             elif k.startswith('gen_ai.usage.'):
                 task_run.increment_metric(k.removeprefix('gen_ai.usage.'), v)
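This change folds the per-node request counting into the attribute loop and adds a `cost` metric sourced from the `operation.cost` span attribute. As a standalone illustration, here is the same branch logic extracted into a hypothetical helper; `collect_metrics` is written for this example and is not part of pydantic-evals.

```python
# Standalone sketch of the attribute-to-metric mapping added above.
def collect_metrics(attributes: dict[str, object]) -> dict[str, int | float]:
    metrics: dict[str, int | float] = {}

    def increment(name: str, amount: int | float) -> None:
        metrics[name] = metrics.get(name, 0) + amount

    for k, v in attributes.items():
        if k == 'gen_ai.operation.name' and v == 'chat':
            increment('requests', 1)
        elif not isinstance(v, int | float):
            continue
        elif k == 'operation.cost':
            increment('cost', v)
        elif k.startswith('gen_ai.usage.details.'):
            increment(k.removeprefix('gen_ai.usage.details.'), v)
        elif k.startswith('gen_ai.usage.'):
            increment(k.removeprefix('gen_ai.usage.'), v)
    return metrics


assert collect_metrics({
    'gen_ai.operation.name': 'chat',
    'gen_ai.usage.input_tokens': 12,
    'operation.cost': 0.003,
}) == {'requests': 1, 'input_tokens': 12, 'cost': 0.003}
```

Placing the non-numeric `isinstance` check as an `elif` after the `gen_ai.operation.name` branch is what lets the string-valued `'chat'` attribute still count toward `requests`.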
{pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/llm_as_a_judge.py

@@ -7,8 +7,7 @@ from typing import Any
 from pydantic import BaseModel, Field
 from pydantic_core import to_json

-from pydantic_ai import Agent, models
-from pydantic_ai.messages import MultiModalContent, UserContent
+from pydantic_ai import Agent, MultiModalContent, UserContent, models
 from pydantic_ai.settings import ModelSettings

 __all__ = (
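The only change to this module is the import location: `MultiModalContent` and `UserContent` now come from the `pydantic_ai` package root rather than `pydantic_ai.messages`. Code that mirrors these imports would update the same way:

```python
# Import style used by llm_as_a_judge.py as of this release:
from pydantic_ai import Agent, MultiModalContent, UserContent, models

# Previously (1.0.10) the content types were imported from the messages submodule:
# from pydantic_ai.messages import MultiModalContent, UserContent
```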
{pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/generation.py

@@ -59,7 +59,8 @@ async def generate_dataset(
     """
     output_schema = dataset_type.model_json_schema_with_evaluators(custom_evaluator_types)

-    # TODO
+    # TODO: Use `output_type=StructuredDict(output_schema)` (and `from_dict` below) once https://github.com/pydantic/pydantic/issues/12145
+    # is fixed and `StructuredDict` no longer needs to use `InlineDefsJsonSchemaTransformer`.
     agent = Agent(
         model,
         system_prompt=(