pydantic-evals 1.22.0__tar.gz → 1.50.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/PKG-INFO +2 -2
  2. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/_utils.py +1 -1
  3. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/dataset.py +19 -15
  4. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/common.py +1 -1
  5. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/llm_as_a_judge.py +35 -30
  6. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/otel/span_tree.py +3 -3
  7. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/reporting/__init__.py +10 -8
  8. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/.gitignore +0 -0
  9. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/LICENSE +0 -0
  10. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/README.md +0 -0
  11. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/__init__.py +0 -0
  12. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/__init__.py +0 -0
  13. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
  14. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/context.py +0 -0
  15. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/evaluator.py +0 -0
  16. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/spec.py +0 -0
  17. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/generation.py +0 -0
  18. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/otel/__init__.py +0 -0
  19. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
  20. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/otel/_context_subtree.py +0 -0
  21. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/otel/_errors.py +0 -0
  22. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/py.typed +0 -0
  23. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/reporting/render_numbers.py +0 -0
  24. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pyproject.toml +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.22.0
+Version: 1.50.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.22.0
+Requires-Dist: pydantic-ai-slim==1.50.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4

pydantic_evals/_utils.py
@@ -112,7 +112,7 @@ async def task_group_gather(tasks: Sequence[Callable[[], Awaitable[T]]]) -> list

 try:
     from logfire._internal.config import (
-        LogfireNotConfiguredWarning,  # pyright: ignore[reportAssignmentType,reportPrivateImportUsage]
+        LogfireNotConfiguredWarning,  # pyright: ignore[reportAssignmentType]
     )
 # TODO: Remove this `pragma: no cover` once we test evals without pydantic-ai (which includes logfire)
 except ImportError:  # pragma: no cover

pydantic_evals/dataset.py
@@ -90,7 +90,7 @@ class _CaseModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid'
     inputs: InputsT
     metadata: MetadataT | None = None
     expected_output: OutputT | None = None
-    evaluators: list[EvaluatorSpec] = Field(default_factory=list)
+    evaluators: list[EvaluatorSpec] = Field(default_factory=list[EvaluatorSpec])


 class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid'):
@@ -100,7 +100,7 @@ class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forb
     json_schema_path: str | None = Field(default=None, alias='$schema')
     name: str | None = None
     cases: list[_CaseModel[InputsT, OutputT, MetadataT]]
-    evaluators: list[EvaluatorSpec] = Field(default_factory=list)
+    evaluators: list[EvaluatorSpec] = Field(default_factory=list[EvaluatorSpec])


 @dataclass(init=False)
@@ -136,7 +136,9 @@ class Case(Generic[InputsT, OutputT, MetadataT]):
     """
     expected_output: OutputT | None = None
     """Expected output of the task. This is the expected output of the task that will be evaluated."""
-    evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = field(default_factory=list)
+    evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = field(
+        default_factory=list[Evaluator[InputsT, OutputT, MetadataT]]
+    )
     """Evaluators to be used just on this case."""

     def __init__(
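
Note: several hunks in this release follow the same pattern, replacing bare `list`/`dict` default factories with parameterized ones such as `list[EvaluatorSpec]`. At runtime the two are equivalent (a parameterized alias like `list[str]` is still callable and returns a plain empty list); the parameterized form simply lets strict type checkers match the factory's return type against the field annotation. A minimal sketch of the idea — the `Example` class below is illustrative and not part of the package:

```python
# Minimal sketch (illustrative, not package code): a parameterized builtin such as `list[str]`
# is callable and returns an ordinary empty list, but it carries the element type, so strict
# type checkers can verify the default factory against the annotated field type.
from dataclasses import dataclass, field


@dataclass
class Example:
    tags: list[str] = field(default_factory=list[str])
    counts: dict[str, int] = field(default_factory=dict[str, int])


e = Example()
assert e.tags == [] and e.counts == {}
assert list[str]() == []  # the alias still constructs a plain list at runtime
```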
@@ -372,7 +374,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

-        This is a synchronous wrapper around [`evaluate`][pydantic_evals.Dataset.evaluate] provided for convenience.
+        This is a synchronous wrapper around [`evaluate`][pydantic_evals.dataset.Dataset.evaluate] provided for convenience.

         Args:
             task: The task to evaluate. This should be a callable that takes the inputs of the case
@@ -511,7 +513,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         path = Path(path)
         fmt = cls._infer_fmt(path, fmt)

-        raw = Path(path).read_text()
+        raw = Path(path).read_text(encoding='utf-8')
         try:
             return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types, default_name=path.stem)
         except ValidationError as e:  # pragma: no cover
@@ -671,11 +673,11 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             if schema_ref:  # pragma: no branch
                 yaml_language_server_line = f'{_YAML_SCHEMA_LINE_PREFIX}{schema_ref}'
                 content = f'{yaml_language_server_line}\n{content}'
-            path.write_text(content)
+            path.write_text(content, encoding='utf-8')
         else:
             context['$schema'] = schema_ref
             json_data = self.model_dump_json(indent=2, by_alias=True, context=context)
-            path.write_text(json_data + '\n')
+            path.write_text(json_data + '\n', encoding='utf-8')

     @classmethod
     def model_json_schema_with_evaluators(
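
Note: the `read_text`/`write_text` changes in this and the following hunks pin the file encoding to UTF-8 instead of relying on the platform default (`locale.getpreferredencoding()`, e.g. a legacy code page on some Windows setups), which matters for datasets containing non-ASCII text. A small standalone illustration, not taken from the package:

```python
# Standalone illustration: pinning the encoding makes YAML/JSON round-trips deterministic
# across platforms; without it, Path.read_text()/write_text() use the locale's default encoding.
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / 'cases.yaml'
    content = 'name: café ☕\n'  # non-ASCII content that a legacy code page may not represent
    path.write_text(content, encoding='utf-8')
    assert path.read_text(encoding='utf-8') == content
```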
@@ -738,16 +740,16 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         class Case(BaseModel, extra='forbid'):  # pyright: ignore[reportUnusedClass]  # this _is_ used below, but pyright doesn't seem to notice..
             name: str | None = None
             inputs: in_type  # pyright: ignore[reportInvalidTypeForm]
-            metadata: meta_type | None = None  # pyright: ignore[reportInvalidTypeForm,reportUnknownVariableType]
-            expected_output: out_type | None = None  # pyright: ignore[reportInvalidTypeForm,reportUnknownVariableType]
+            metadata: meta_type | None = None  # pyright: ignore[reportInvalidTypeForm]
+            expected_output: out_type | None = None  # pyright: ignore[reportInvalidTypeForm]
             if evaluator_schema_types:  # pragma: no branch
-                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007
+                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa: UP007

         class Dataset(BaseModel, extra='forbid'):
             name: str | None = None
             cases: list[Case]
             if evaluator_schema_types:  # pragma: no branch
-                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007
+                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa: UP007

         json_schema = Dataset.model_json_schema()
         # See `_add_json_schema` below, since `$schema` is added to the JSON, it has to be supported in the JSON
@@ -767,8 +769,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         path = Path(path)
         json_schema = cls.model_json_schema_with_evaluators(custom_evaluator_types)
         schema_content = to_json(json_schema, indent=2).decode() + '\n'
-        if not path.exists() or path.read_text() != schema_content:  # pragma: no branch
-            path.write_text(schema_content)
+        if not path.exists() or path.read_text(encoding='utf-8') != schema_content:  # pragma: no branch
+            path.write_text(schema_content, encoding='utf-8')

     @classmethod
     @functools.cache
@@ -854,8 +856,8 @@ def _get_relative_path_reference(target: Path, source: Path, _prefix: str = '')
 class _TaskRun:
     """Internal class to track metrics and attributes for a task run."""

-    attributes: dict[str, Any] = field(init=False, default_factory=dict)
-    metrics: dict[str, int | float] = field(init=False, default_factory=dict)
+    attributes: dict[str, Any] = field(init=False, default_factory=dict[str, Any])
+    metrics: dict[str, int | float] = field(init=False, default_factory=dict[str, int | float])

     def record_metric(self, name: str, value: int | float) -> None:
         """Record a metric value.
@@ -947,6 +949,8 @@ async def _run_task(
         # That way users can customize this logic. We'd default to a function that does the current thing but also
         # allow `None` to disable it entirely.
         for node in span_tree:
+            if 'gen_ai.request.model' not in node.attributes:
+                continue  # we only want to count the below specifically for the individual LLM requests, not agent runs
             for k, v in node.attributes.items():
                 if k == 'gen_ai.operation.name' and v == 'chat':
                     task_run.increment_metric('requests', 1)
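
Note: the added guard skips spans that lack a `gen_ai.request.model` attribute, so request metrics are only accumulated from spans representing individual LLM requests rather than from enclosing agent-run spans that may carry similar `gen_ai.*` attributes. A simplified standalone sketch of that filtering logic, using plain dicts in place of span nodes:

```python
# Simplified sketch (not package code): only spans that look like individual LLM requests,
# identified here by the GenAI semantic-convention attribute 'gen_ai.request.model', are counted.
from typing import Any

spans: list[dict[str, Any]] = [
    {'gen_ai.operation.name': 'chat'},                                    # e.g. an agent-run span: skipped
    {'gen_ai.request.model': 'gpt-4o', 'gen_ai.operation.name': 'chat'},  # an LLM request span: counted
]

requests = 0
for attributes in spans:
    if 'gen_ai.request.model' not in attributes:
        continue  # not an individual LLM request
    if attributes.get('gen_ai.operation.name') == 'chat':
        requests += 1

assert requests == 1
```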

pydantic_evals/evaluators/common.py
@@ -191,7 +191,7 @@ class LLMJudge(Evaluator[object, object, object]):
     """

     rubric: str
-    model: models.Model | models.KnownModelName | None = None
+    model: models.Model | models.KnownModelName | str | None = None
     include_input: bool = False
     include_expected_output: bool = False
     model_settings: ModelSettings | None = None
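
Note: `LLMJudge.model` now also accepts a plain model-name string, in addition to a `Model` instance or `KnownModelName`. A usage sketch — the rubric and the `'openai:gpt-4o'` identifier are illustrative:

```python
# Usage sketch (illustrative values): configuring the LLMJudge evaluator with a model name string.
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge

dataset = Dataset(
    cases=[Case(name='greeting', inputs='Say hello', expected_output='Hello!')],
    evaluators=[
        LLMJudge(
            rubric='The response should be a polite greeting.',
            model='openai:gpt-4o',  # a plain string is now accepted alongside Model instances
            include_input=True,
        ),
    ],
)
```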

pydantic_evals/evaluators/llm_as_a_judge.py
@@ -55,7 +55,7 @@ _judge_output_agent = Agent(
 async def judge_output(
     output: Any,
     rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
+    model: models.Model | models.KnownModelName | str | None = None,
     model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on a rubric.
@@ -96,7 +96,7 @@ async def judge_input_output(
     inputs: Any,
     output: Any,
     rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
+    model: models.Model | models.KnownModelName | str | None = None,
     model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on the inputs and a rubric.
@@ -141,7 +141,7 @@ async def judge_input_output_expected(
     output: Any,
     expected_output: Any,
     rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
+    model: models.Model | models.KnownModelName | str | None = None,
     model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on the inputs and a rubric.
@@ -185,7 +185,7 @@ async def judge_output_expected(
     output: Any,
     expected_output: Any,
     rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
+    model: models.Model | models.KnownModelName | str | None = None,
     model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on the expected output, output, and a rubric.
@@ -221,39 +221,44 @@ def _stringify(value: Any) -> str:
     return repr(value)


+def _make_section(content: Any, tag: str) -> list[str | UserContent]:
+    """Create a tagged section, handling different content types, for use in the LLMJudge's prompt.
+
+    Args:
+        content (Any): content to include in the section_
+        tag (str): tag name for the section
+
+    Returns:
+        list[str | UserContent]: the tagged section as a list of strings or UserContent
+    """
+    sections: list[str | UserContent] = []
+    items: Sequence[str | UserContent] = (  # pyright: ignore[reportUnknownVariableType]
+        content if isinstance(content, Sequence) and not isinstance(content, str) else [content]
+    )
+
+    sections.append(f'<{tag}>')
+    for item in items:
+        sections.append(item if isinstance(item, str | MultiModalContent) else _stringify(item))
+    sections.append(f'</{tag}>')
+    return sections
+
+
 def _build_prompt(
     output: Any,
     rubric: str,
     inputs: Any | None = None,
     expected_output: Any | None = None,
 ) -> str | Sequence[str | UserContent]:
-    """Build a prompt that includes input, output, and rubric."""
+    """Build a prompt that includes input, output, expected output, and rubric."""
     sections: list[str | UserContent] = []
-
     if inputs is not None:
-        if isinstance(inputs, str):
-            sections.append(f'<Input>\n{inputs}\n</Input>')
-        else:
-            sections.append('<Input>\n')
-            if isinstance(inputs, Sequence):
-                for item in inputs:  # type: ignore
-                    if isinstance(item, str | MultiModalContent):
-                        sections.append(item)
-                    else:
-                        sections.append(_stringify(item))
-            elif isinstance(inputs, MultiModalContent):
-                sections.append(inputs)
-            else:
-                sections.append(_stringify(inputs))
-        sections.append('</Input>')
-
-    sections.append(f'<Output>\n{_stringify(output)}\n</Output>')
-    sections.append(f'<Rubric>\n{rubric}\n</Rubric>')
+        sections.extend(_make_section(inputs, 'Input'))

-    if expected_output is not None:
-        sections.append(f'<ExpectedOutput>\n{_stringify(expected_output)}\n</ExpectedOutput>')
+    sections.extend(_make_section(output, 'Output'))
+    sections.extend(_make_section(rubric, 'Rubric'))

-    if inputs is None or isinstance(inputs, str):
-        return '\n\n'.join(sections)  # type: ignore[arg-type]
-    else:
-        return sections
+    if expected_output is not None:
+        sections.extend(_make_section(expected_output, 'ExpectedOutput'))
+    if all(isinstance(section, str) for section in sections):
+        return '\n'.join(sections)  # type: ignore[arg-type]
+    return sections
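
Note: with this refactor every part of the judge prompt goes through the same tagging helper, so a purely textual prompt is simply the tag-wrapped sections joined by newlines. A standalone re-implementation of the tagging idea for illustration (the package's private `_make_section`/`_build_prompt` additionally handle multi-modal content and sequences of items):

```python
# Standalone illustration of the tagging idea; not the package's private functions.
def make_section(content: str, tag: str) -> list[str]:
    return [f'<{tag}>', content, f'</{tag}>']


sections: list[str] = []
sections.extend(make_section('What is 2 + 2?', 'Input'))
sections.extend(make_section('4', 'Output'))
sections.extend(make_section('The answer must be arithmetically correct.', 'Rubric'))
sections.extend(make_section('4', 'ExpectedOutput'))

print('\n'.join(sections))
# <Input>
# What is 2 + 2?
# </Input>
# <Output>
# 4
# </Output>
# <Rubric>
# The answer must be arithmetically correct.
# </Rubric>
# <ExpectedOutput>
# 4
# </ExpectedOutput>
```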

pydantic_evals/otel/span_tree.py
@@ -241,7 +241,7 @@ class SpanNode:

         return self._matches_query(query)

-    def _matches_query(self, query: SpanQuery) -> bool:  # noqa C901
+    def _matches_query(self, query: SpanQuery) -> bool:  # noqa: C901
         """Check if the span matches the query conditions."""
         # Logical combinations
         if or_ := query.get('or_'):
@@ -433,8 +433,8 @@ class SpanTree:
     You can then search or iterate the tree to make your assertions (using DFS for traversal).
     """

-    roots: list[SpanNode] = field(default_factory=list)
-    nodes_by_id: dict[str, SpanNode] = field(default_factory=dict)
+    roots: list[SpanNode] = field(default_factory=list[SpanNode])
+    nodes_by_id: dict[str, SpanNode] = field(default_factory=dict[str, SpanNode])

     # -------------------------------------------------------------------------
     # Construction

pydantic_evals/reporting/__init__.py
@@ -55,11 +55,11 @@ class ReportCase(Generic[InputsT, OutputT, MetadataT]):
     name: str
     """The name of the [case][pydantic_evals.Case]."""
     inputs: InputsT
-    """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
+    """The inputs to the task, from [`Case.inputs`][pydantic_evals.dataset.Case.inputs]."""
     metadata: MetadataT | None
-    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
+    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.dataset.Case.metadata]."""
     expected_output: OutputT | None
-    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
+    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.dataset.Case.expected_output]."""
     output: OutputT
     """The output of the task execution."""

@@ -78,7 +78,7 @@ class ReportCase(Generic[InputsT, OutputT, MetadataT]):
     span_id: str | None = None
     """The span ID of the case span."""

-    evaluator_failures: list[EvaluatorFailure] = field(default_factory=list)
+    evaluator_failures: list[EvaluatorFailure] = field(default_factory=list[EvaluatorFailure])


 @dataclass(kw_only=True)
@@ -88,11 +88,11 @@ class ReportCaseFailure(Generic[InputsT, OutputT, MetadataT]):
     name: str
     """The name of the [case][pydantic_evals.Case]."""
     inputs: InputsT
-    """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
+    """The inputs to the task, from [`Case.inputs`][pydantic_evals.dataset.Case.inputs]."""
     metadata: MetadataT | None
-    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
+    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.dataset.Case.metadata]."""
     expected_output: OutputT | None
-    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
+    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.dataset.Case.expected_output]."""

     error_message: str
     """The message of the exception that caused the failure."""
@@ -195,7 +195,9 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):

     cases: list[ReportCase[InputsT, OutputT, MetadataT]]
     """The cases in the report."""
-    failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(default_factory=list)
+    failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(
+        default_factory=list[ReportCaseFailure[InputsT, OutputT, MetadataT]]
+    )
     """The failures in the report. These are cases where task execution raised an exception."""

     experiment_metadata: dict[str, Any] | None = None