pydantic-evals 1.0.11__py3-none-any.whl → 1.0.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pydantic-evals might be problematic.
- pydantic_evals/dataset.py +57 -15
- pydantic_evals/evaluators/llm_as_a_judge.py +1 -2
- pydantic_evals/generation.py +2 -1
- {pydantic_evals-1.0.11.dist-info → pydantic_evals-1.0.13.dist-info}/METADATA +2 -2
- {pydantic_evals-1.0.11.dist-info → pydantic_evals-1.0.13.dist-info}/RECORD +7 -7
- {pydantic_evals-1.0.11.dist-info → pydantic_evals-1.0.13.dist-info}/WHEEL +0 -0
- {pydantic_evals-1.0.11.dist-info → pydantic_evals-1.0.13.dist-info}/licenses/LICENSE +0 -0
pydantic_evals/dataset.py
CHANGED
@@ -98,6 +98,7 @@ class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forb

     # $schema is included to avoid validation fails from the `$schema` key, see `_add_json_schema` below for context
     json_schema_path: str | None = Field(default=None, alias='$schema')
+    name: str | None = None
     cases: list[_CaseModel[InputsT, OutputT, MetadataT]]
     evaluators: list[EvaluatorSpec] = Field(default_factory=list)

@@ -218,6 +219,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
     ```
     """

+    name: str | None = None
+    """Optional name of the dataset."""
     cases: list[Case[InputsT, OutputT, MetadataT]]
     """List of test cases in the dataset."""
     evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = []
@@ -226,12 +229,14 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
     def __init__(
         self,
         *,
+        name: str | None = None,
         cases: Sequence[Case[InputsT, OutputT, MetadataT]],
         evaluators: Sequence[Evaluator[InputsT, OutputT, MetadataT]] = (),
     ):
         """Initialize a new dataset with test cases and optional evaluators.

         Args:
+            name: Optional name for the dataset.
             cases: Sequence of test cases to include in the dataset.
             evaluators: Optional sequence of evaluators to apply to all cases in the dataset.
         """
@@ -244,10 +249,12 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             case_names.add(case.name)

         super().__init__(
+            name=name,
             cases=cases,
             evaluators=list(evaluators),
         )

+    # TODO in v2: Make everything not required keyword-only
     async def evaluate(
         self,
         task: Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT],
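The hunks above thread the new optional `name` through `_DatasetModel`, the `Dataset` fields, and `Dataset.__init__`. A minimal usage sketch, assuming the public `Case`/`Dataset` API exported by `pydantic_evals` (the example values are illustrative, not taken from the diff):

from pydantic_evals import Case, Dataset

dataset = Dataset(
    name='capital-cities',  # new optional keyword in 1.0.13; defaults to None
    cases=[
        Case(name='paris', inputs='What is the capital of France?', expected_output='Paris'),
    ],
)
print(dataset.name)  # 'capital-cities'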
@@ -256,6 +263,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         progress: bool = True,
         retry_task: RetryConfig | None = None,
         retry_evaluators: RetryConfig | None = None,
+        *,
+        task_name: str | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -265,28 +274,38 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         Args:
             task: The task to evaluate. This should be a callable that takes the inputs of the case
                 and returns the output.
-            name: The name of the
-                If omitted, the name of the task function
+            name: The name of the experiment being run, this is used to identify the experiment in the report.
+                If omitted, the task_name will be used; if that is not specified, the name of the task function is used.
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
             progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
             retry_task: Optional retry configuration for the task execution.
             retry_evaluators: Optional retry configuration for evaluator execution.
+            task_name: Optional override to the name of the task being executed, otherwise the name of the task
+                function will be used.

         Returns:
             A report containing the results of the evaluation.
         """
-
+        task_name = task_name or get_unwrapped_function_name(task)
+        name = name or task_name
         total_cases = len(self.cases)
         progress_bar = Progress() if progress else None

         limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()

         with (
-            logfire_span(
+            logfire_span(
+                'evaluate {name}',
+                name=name,
+                task_name=task_name,
+                dataset_name=self.name,
+                n_cases=len(self.cases),
+                **{'gen_ai.operation.name': 'experiment'},  # pyright: ignore[reportArgumentType]
+            ) as eval_span,
             progress_bar or nullcontext(),
         ):
-            task_id = progress_bar.add_task(f'Evaluating {
+            task_id = progress_bar.add_task(f'Evaluating {task_name}', total=total_cases) if progress_bar else None

             async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name: str):
                 async with limiter:
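A hedged sketch of the updated `evaluate` call: `name` now labels the experiment (and the `evaluate {name}` span), while the new keyword-only `task_name` overrides the task function's name. `answer_question` is a stand-in task for illustration, not part of the diff:

import asyncio

from pydantic_evals import Case, Dataset

async def answer_question(question: str) -> str:  # hypothetical task, not from the diff
    return 'Paris'

async def main() -> None:
    dataset = Dataset(
        name='capital-cities',
        cases=[Case(name='paris', inputs='What is the capital of France?', expected_output='Paris')],
    )
    report = await dataset.evaluate(
        answer_question,
        name='baseline-experiment',  # experiment name; falls back to task_name if omitted
        task_name='qa-task',         # new keyword-only override in 1.0.13
        progress=False,
    )
    report.print()

asyncio.run(main())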
@@ -333,6 +352,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
+        retry_task: RetryConfig | None = None,
+        retry_evaluators: RetryConfig | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -346,12 +367,21 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
             progress: Whether to show a progress bar for the evaluation. Defaults to True.
+            retry_task: Optional retry configuration for the task execution.
+            retry_evaluators: Optional retry configuration for evaluator execution.

         Returns:
             A report containing the results of the evaluation.
         """
         return get_event_loop().run_until_complete(
-            self.evaluate(
+            self.evaluate(
+                task,
+                task_name=name,
+                max_concurrency=max_concurrency,
+                progress=progress,
+                retry_task=retry_task,
+                retry_evaluators=retry_evaluators,
+            )
         )

     def add_case(
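For the synchronous wrapper, the hunks above add the retry parameters and forward `name` as `task_name` to `evaluate`. A small sketch under those assumptions (`lookup_capital` is hypothetical, and the retry arguments are left as `None` here because the `RetryConfig` shape is not shown in this diff):

from pydantic_evals import Case, Dataset

def lookup_capital(country: str) -> str:  # hypothetical synchronous task
    return {'France': 'Paris'}.get(country, 'unknown')

dataset = Dataset(cases=[Case(name='france', inputs='France', expected_output='Paris')])
report = dataset.evaluate_sync(
    lookup_capital,
    name='lookup-capital',   # forwarded as task_name per the change above
    progress=False,
    retry_task=None,         # new pass-through parameter in 1.0.13
    retry_evaluators=None,   # new pass-through parameter in 1.0.13
)
report.print()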
@@ -463,7 +493,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a

         raw = Path(path).read_text()
         try:
-            return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types)
+            return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types, default_name=path.stem)
         except ValidationError as e: # pragma: no cover
             raise ValueError(f'{path} contains data that does not match the schema for {cls.__name__}:\n{e}.') from e

@@ -473,6 +503,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         contents: str,
         fmt: Literal['yaml', 'json'] = 'yaml',
         custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
+        *,
+        default_name: str | None = None,
     ) -> Self:
         """Load a dataset from a string.

@@ -481,6 +513,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             fmt: Format of the content. Must be either 'yaml' or 'json'.
             custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
                 These are additional evaluators beyond the default ones.
+            default_name: Default name of the dataset, to be used if not specified in the serialized contents.

         Returns:
             A new Dataset instance parsed from the string.
@@ -490,17 +523,19 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         """
         if fmt == 'yaml':
             loaded = yaml.safe_load(contents)
-            return cls.from_dict(loaded, custom_evaluator_types)
+            return cls.from_dict(loaded, custom_evaluator_types, default_name=default_name)
         else:
             dataset_model_type = cls._serialization_type()
             dataset_model = dataset_model_type.model_validate_json(contents)
-            return cls._from_dataset_model(dataset_model, custom_evaluator_types)
+            return cls._from_dataset_model(dataset_model, custom_evaluator_types, default_name)

     @classmethod
     def from_dict(
         cls,
         data: dict[str, Any],
         custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
+        *,
+        default_name: str | None = None,
     ) -> Self:
         """Load a dataset from a dictionary.

@@ -508,6 +543,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             data: Dictionary representation of the dataset.
             custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
                 These are additional evaluators beyond the default ones.
+            default_name: Default name of the dataset, to be used if not specified in the data.

         Returns:
             A new Dataset instance created from the dictionary.
@@ -517,19 +553,21 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         """
         dataset_model_type = cls._serialization_type()
         dataset_model = dataset_model_type.model_validate(data)
-        return cls._from_dataset_model(dataset_model, custom_evaluator_types)
+        return cls._from_dataset_model(dataset_model, custom_evaluator_types, default_name)

     @classmethod
     def _from_dataset_model(
         cls,
         dataset_model: _DatasetModel[InputsT, OutputT, MetadataT],
         custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
+        default_name: str | None = None,
     ) -> Self:
         """Create a Dataset from a _DatasetModel.

         Args:
             dataset_model: The _DatasetModel to convert.
             custom_evaluator_types: Custom evaluator classes to register for deserialization.
+            default_name: Default name of the dataset, to be used if the value is `None` in the provided model.

         Returns:
             A new Dataset instance created from the _DatasetModel.
@@ -566,7 +604,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             cases.append(row)
         if errors:
             raise ExceptionGroup(f'{len(errors)} error(s) loading evaluators from registry', errors[:3])
-        result = cls(cases=cases)
+        result = cls(name=dataset_model.name, cases=cases)
+        if result.name is None:
+            result.name = default_name
         result.evaluators = dataset_evaluators
         return result

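The loader hunks above plumb a `default_name` through `from_file` → `from_text` → `from_dict` → `_from_dataset_model`, with `from_file` defaulting it to the file stem. A sketch of the resulting behaviour, assuming the documented YAML case format (the keys and file name below are illustrative):

from typing import Any

from pydantic_evals import Dataset

yaml_contents = """\
cases:
  - name: paris
    inputs: What is the capital of France?
    expected_output: Paris
"""

dataset = Dataset[str, str, Any].from_text(yaml_contents, fmt='yaml', default_name='capitals')
print(dataset.name)  # 'capitals', used because the YAML has no top-level `name` key

# Dataset[str, str, Any].from_file('capitals.yaml') would now pass
# default_name=path.stem ('capitals') automatically, per the change above.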
@@ -886,12 +926,14 @@ async def _run_task(
     # That way users can customize this logic. We'd default to a function that does the current thing but also
     # allow `None` to disable it entirely.
     for node in span_tree:
-        if node.attributes.get('gen_ai.operation.name') == 'chat':
-            task_run.increment_metric('requests', 1)
         for k, v in node.attributes.items():
-            if
+            if k == 'gen_ai.operation.name' and v == 'chat':
+                task_run.increment_metric('requests', 1)
+            elif not isinstance(v, int | float):
                 continue
-
+            elif k == 'operation.cost':
+                task_run.increment_metric('cost', v)
+            elif k.startswith('gen_ai.usage.details.'):
                 task_run.increment_metric(k.removeprefix('gen_ai.usage.details.'), v)
             elif k.startswith('gen_ai.usage.'):
                 task_run.increment_metric(k.removeprefix('gen_ai.usage.'), v)
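The hunk above reworks how span attributes are folded into task-run metrics: the chat-request count moves inside the attribute loop, non-numeric values are skipped, and the new `operation.cost` attribute is aggregated as a `cost` metric alongside the existing `gen_ai.usage.*` counters. A standalone paraphrase of that mapping, where `increment` is a stand-in for `task_run.increment_metric`:

from collections.abc import Callable

def apply_span_attributes(attributes: dict[str, object], increment: Callable[[str, int | float], None]) -> None:
    """Mirror of the metric mapping in the diff above, over a plain attribute dict."""
    for k, v in attributes.items():
        if k == 'gen_ai.operation.name' and v == 'chat':
            increment('requests', 1)  # each chat span counts as one request
        elif not isinstance(v, int | float):
            continue  # only numeric attribute values become metrics
        elif k == 'operation.cost':
            increment('cost', v)  # new in 1.0.13: per-span cost is aggregated too
        elif k.startswith('gen_ai.usage.details.'):
            increment(k.removeprefix('gen_ai.usage.details.'), v)
        elif k.startswith('gen_ai.usage.'):
            increment(k.removeprefix('gen_ai.usage.'), v)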
pydantic_evals/evaluators/llm_as_a_judge.py
CHANGED
@@ -7,8 +7,7 @@ from typing import Any
 from pydantic import BaseModel, Field
 from pydantic_core import to_json

-from pydantic_ai import Agent, models
-from pydantic_ai.messages import MultiModalContent, UserContent
+from pydantic_ai import Agent, MultiModalContent, UserContent, models
 from pydantic_ai.settings import ModelSettings

 __all__ = (
pydantic_evals/generation.py
CHANGED
@@ -59,7 +59,8 @@ async def generate_dataset(
     """
     output_schema = dataset_type.model_json_schema_with_evaluators(custom_evaluator_types)

-    # TODO
+    # TODO: Use `output_type=StructuredDict(output_schema)` (and `from_dict` below) once https://github.com/pydantic/pydantic/issues/12145
+    # is fixed and `StructuredDict` no longer needs to use `InlineDefsJsonSchemaTransformer`.
     agent = Agent(
         model,
         system_prompt=(
{pydantic_evals-1.0.11.dist-info → pydantic_evals-1.0.13.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.0.11
+Version: 1.0.13
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.0.11
+Requires-Dist: pydantic-ai-slim==1.0.13
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
{pydantic_evals-1.0.11.dist-info → pydantic_evals-1.0.13.dist-info}/RECORD
CHANGED
@@ -1,14 +1,14 @@
 pydantic_evals/__init__.py,sha256=OKRbfhdc8UZPzrPJMZUQwvzIxLhXmEZxz1ZuD921fy4,839
 pydantic_evals/_utils.py,sha256=1muGTc2zqjwxqngz6quRSLoZM88onjp0Xgt-a9n2aPQ,4111
-pydantic_evals/dataset.py,sha256=
-pydantic_evals/generation.py,sha256=
+pydantic_evals/dataset.py,sha256=IfaS65LqHW0654iAxc7bxA7mETo1qUqhFwITS_wFZ5s,50447
+pydantic_evals/generation.py,sha256=ROB8bZ6XKFquWNjWTd3lsXXwsx8-VgSCu_okbovNw9s,3619
 pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pydantic_evals/evaluators/__init__.py,sha256=E_JT6o96Ef-oS_IZ1Hyy95NRLwz7EOHewp-o13IdXEM,1032
 pydantic_evals/evaluators/_run_evaluator.py,sha256=uGmH67gCTeF9BSprCiBC4DtKEpKLrKYaXgsAQiCbCLY,3630
 pydantic_evals/evaluators/common.py,sha256=Cc9RMsSf5P2gcq3IDwmZxgfo1xnu7HEehiAS2Hgibz4,11609
 pydantic_evals/evaluators/context.py,sha256=mTxcm0Hvkev9htpqwoJMCJIqEYBtY5g86SXcjoqQxHY,3884
 pydantic_evals/evaluators/evaluator.py,sha256=ylfKRytoM9KzbZkSsFkEEnsg4XhK4usuyy1Rb1emoPo,11474
-pydantic_evals/evaluators/llm_as_a_judge.py,sha256=
+pydantic_evals/evaluators/llm_as_a_judge.py,sha256=BPdUfEsLPSxN2kJPt3dtJBRCBP46ctRoW_n24WubaB0,9567
 pydantic_evals/evaluators/spec.py,sha256=szAUsY4gb8KK_l1R81HYrByh4Rawrjav7w9835FZg1w,6690
 pydantic_evals/otel/__init__.py,sha256=i2p3vDrOW039N4XM-UkozDhCm0ZmE6ZSs1yV5t03vd0,117
 pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=FrG0pXKjuvTp3bXNdrUyzdPkqm0DQWe4ehkiHaxSvz4,6742
@@ -17,7 +17,7 @@ pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD
 pydantic_evals/otel/span_tree.py,sha256=RzX4VGpEqc2QUhkyxMTXtBRo5yHHO1c0hI7QJJuiXPU,23043
 pydantic_evals/reporting/__init__.py,sha256=4S8q_KfOflQlJYTISWM1Vp6_wPDHOMjbh9mSc3dU4-8,51562
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-1.0.
-pydantic_evals-1.0.
-pydantic_evals-1.0.
-pydantic_evals-1.0.
+pydantic_evals-1.0.13.dist-info/METADATA,sha256=D6mudJtjmS-SJhv8xawBYvm3Sd8bRzjL-AKaK9cc4vA,7846
+pydantic_evals-1.0.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-1.0.13.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-1.0.13.dist-info/RECORD,,
{pydantic_evals-1.0.11.dist-info → pydantic_evals-1.0.13.dist-info}/WHEEL
File without changes
{pydantic_evals-1.0.11.dist-info → pydantic_evals-1.0.13.dist-info}/licenses/LICENSE
File without changes