pydantic-evals 1.22.0__tar.gz → 1.50.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/PKG-INFO +2 -2
  2. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/_utils.py +1 -1
  3. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/dataset.py +19 -15
  4. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/common.py +1 -1
  5. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/llm_as_a_judge.py +35 -30
  6. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/otel/span_tree.py +3 -3
  7. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/reporting/__init__.py +10 -8
  8. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/.gitignore +0 -0
  9. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/LICENSE +0 -0
  10. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/README.md +0 -0
  11. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/__init__.py +0 -0
  12. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/__init__.py +0 -0
  13. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
  14. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/context.py +0 -0
  15. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/evaluator.py +0 -0
  16. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/spec.py +0 -0
  17. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/generation.py +0 -0
  18. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/otel/__init__.py +0 -0
  19. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
  20. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/otel/_context_subtree.py +0 -0
  21. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/otel/_errors.py +0 -0
  22. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/py.typed +0 -0
  23. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/reporting/render_numbers.py +0 -0
  24. {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pyproject.toml +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.22.0
+Version: 1.50.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.22.0
+Requires-Dist: pydantic-ai-slim==1.50.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4

pydantic_evals/_utils.py
@@ -112,7 +112,7 @@ async def task_group_gather(tasks: Sequence[Callable[[], Awaitable[T]]]) -> list

 try:
     from logfire._internal.config import (
-        LogfireNotConfiguredWarning,  # pyright: ignore[reportAssignmentType,reportPrivateImportUsage]
+        LogfireNotConfiguredWarning,  # pyright: ignore[reportAssignmentType]
     )
 # TODO: Remove this `pragma: no cover` once we test evals without pydantic-ai (which includes logfire)
 except ImportError:  # pragma: no cover

pydantic_evals/dataset.py
@@ -90,7 +90,7 @@ class _CaseModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid'
     inputs: InputsT
     metadata: MetadataT | None = None
     expected_output: OutputT | None = None
-    evaluators: list[EvaluatorSpec] = Field(default_factory=list)
+    evaluators: list[EvaluatorSpec] = Field(default_factory=list[EvaluatorSpec])


 class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid'):
@@ -100,7 +100,7 @@ class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forb
     json_schema_path: str | None = Field(default=None, alias='$schema')
     name: str | None = None
     cases: list[_CaseModel[InputsT, OutputT, MetadataT]]
-    evaluators: list[EvaluatorSpec] = Field(default_factory=list)
+    evaluators: list[EvaluatorSpec] = Field(default_factory=list[EvaluatorSpec])


 @dataclass(init=False)
@@ -136,7 +136,9 @@ class Case(Generic[InputsT, OutputT, MetadataT]):
     """
     expected_output: OutputT | None = None
     """Expected output of the task. This is the expected output of the task that will be evaluated."""
-    evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = field(default_factory=list)
+    evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = field(
+        default_factory=list[Evaluator[InputsT, OutputT, MetadataT]]
+    )
     """Evaluators to be used just on this case."""

     def __init__(
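
Note: several hunks in this release follow the same pattern, replacing bare `list`/`dict` default factories with parameterized ones such as `list[EvaluatorSpec]`. At runtime the two are equivalent (a parameterized alias like `list[str]` is still callable and returns a plain empty list); the parameterized form simply lets strict type checkers match the factory's return type against the field annotation. A minimal sketch of the idea — the `Example` class below is illustrative and not part of the package:

```python
# Minimal sketch (illustrative, not package code): a parameterized builtin such as `list[str]`
# is callable and returns an ordinary empty list, but it carries the element type, so strict
# type checkers can verify the default factory against the annotated field type.
from dataclasses import dataclass, field


@dataclass
class Example:
    tags: list[str] = field(default_factory=list[str])
    counts: dict[str, int] = field(default_factory=dict[str, int])


e = Example()
assert e.tags == [] and e.counts == {}
assert list[str]() == []  # the alias still constructs a plain list at runtime
```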
@@ -372,7 +374,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

-        This is a synchronous wrapper around [`evaluate`][pydantic_evals.Dataset.evaluate] provided for convenience.
+        This is a synchronous wrapper around [`evaluate`][pydantic_evals.dataset.Dataset.evaluate] provided for convenience.

         Args:
             task: The task to evaluate. This should be a callable that takes the inputs of the case
@@ -511,7 +513,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         path = Path(path)
         fmt = cls._infer_fmt(path, fmt)

-        raw = Path(path).read_text()
+        raw = Path(path).read_text(encoding='utf-8')
         try:
             return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types, default_name=path.stem)
         except ValidationError as e:  # pragma: no cover
@@ -671,11 +673,11 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             if schema_ref:  # pragma: no branch
                 yaml_language_server_line = f'{_YAML_SCHEMA_LINE_PREFIX}{schema_ref}'
                 content = f'{yaml_language_server_line}\n{content}'
-            path.write_text(content)
+            path.write_text(content, encoding='utf-8')
         else:
             context['$schema'] = schema_ref
             json_data = self.model_dump_json(indent=2, by_alias=True, context=context)
-            path.write_text(json_data + '\n')
+            path.write_text(json_data + '\n', encoding='utf-8')

     @classmethod
     def model_json_schema_with_evaluators(
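
Note: the `read_text`/`write_text` changes in this and the following hunks pin the file encoding to UTF-8 instead of relying on the platform default (`locale.getpreferredencoding()`, e.g. a legacy code page on some Windows setups), which matters for datasets containing non-ASCII text. A small standalone illustration, not taken from the package:

```python
# Standalone illustration: pinning the encoding makes YAML/JSON round-trips deterministic
# across platforms; without it, Path.read_text()/write_text() use the locale's default encoding.
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    path = Path(tmp) / 'cases.yaml'
    content = 'name: café ☕\n'  # non-ASCII content that a legacy code page may not represent
    path.write_text(content, encoding='utf-8')
    assert path.read_text(encoding='utf-8') == content
```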
@@ -738,16 +740,16 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         class Case(BaseModel, extra='forbid'):  # pyright: ignore[reportUnusedClass]  # this _is_ used below, but pyright doesn't seem to notice..
             name: str | None = None
             inputs: in_type  # pyright: ignore[reportInvalidTypeForm]
-            metadata: meta_type | None = None  # pyright: ignore[reportInvalidTypeForm,reportUnknownVariableType]
-            expected_output: out_type | None = None  # pyright: ignore[reportInvalidTypeForm,reportUnknownVariableType]
+            metadata: meta_type | None = None  # pyright: ignore[reportInvalidTypeForm]
+            expected_output: out_type | None = None  # pyright: ignore[reportInvalidTypeForm]
             if evaluator_schema_types:  # pragma: no branch
-                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007
+                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa: UP007

         class Dataset(BaseModel, extra='forbid'):
             name: str | None = None
             cases: list[Case]
             if evaluator_schema_types:  # pragma: no branch
-                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007
+                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa: UP007

         json_schema = Dataset.model_json_schema()
         # See `_add_json_schema` below, since `$schema` is added to the JSON, it has to be supported in the JSON
@@ -767,8 +769,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         path = Path(path)
         json_schema = cls.model_json_schema_with_evaluators(custom_evaluator_types)
         schema_content = to_json(json_schema, indent=2).decode() + '\n'
-        if not path.exists() or path.read_text() != schema_content:  # pragma: no branch
-            path.write_text(schema_content)
+        if not path.exists() or path.read_text(encoding='utf-8') != schema_content:  # pragma: no branch
+            path.write_text(schema_content, encoding='utf-8')

     @classmethod
     @functools.cache
@@ -854,8 +856,8 @@ def _get_relative_path_reference(target: Path, source: Path, _prefix: str = '')
 class _TaskRun:
     """Internal class to track metrics and attributes for a task run."""

-    attributes: dict[str, Any] = field(init=False, default_factory=dict)
-    metrics: dict[str, int | float] = field(init=False, default_factory=dict)
+    attributes: dict[str, Any] = field(init=False, default_factory=dict[str, Any])
+    metrics: dict[str, int | float] = field(init=False, default_factory=dict[str, int | float])

     def record_metric(self, name: str, value: int | float) -> None:
         """Record a metric value.
@@ -947,6 +949,8 @@ async def _run_task(
         # That way users can customize this logic. We'd default to a function that does the current thing but also
         # allow `None` to disable it entirely.
         for node in span_tree:
+            if 'gen_ai.request.model' not in node.attributes:
+                continue  # we only want to count the below specifically for the individual LLM requests, not agent runs
             for k, v in node.attributes.items():
                 if k == 'gen_ai.operation.name' and v == 'chat':
                     task_run.increment_metric('requests', 1)
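
Note: the added guard skips spans that lack a `gen_ai.request.model` attribute, so request metrics are only accumulated from spans representing individual LLM requests rather than from enclosing agent-run spans that may carry similar `gen_ai.*` attributes. A simplified standalone sketch of that filtering logic, using plain dicts in place of span nodes:

```python
# Simplified sketch (not package code): only spans that look like individual LLM requests,
# identified here by the GenAI semantic-convention attribute 'gen_ai.request.model', are counted.
from typing import Any

spans: list[dict[str, Any]] = [
    {'gen_ai.operation.name': 'chat'},                                    # e.g. an agent-run span: skipped
    {'gen_ai.request.model': 'gpt-4o', 'gen_ai.operation.name': 'chat'},  # an LLM request span: counted
]

requests = 0
for attributes in spans:
    if 'gen_ai.request.model' not in attributes:
        continue  # not an individual LLM request
    if attributes.get('gen_ai.operation.name') == 'chat':
        requests += 1

assert requests == 1
```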

pydantic_evals/evaluators/common.py
@@ -191,7 +191,7 @@ class LLMJudge(Evaluator[object, object, object]):
     """

     rubric: str
-    model: models.Model | models.KnownModelName | None = None
+    model: models.Model | models.KnownModelName | str | None = None
     include_input: bool = False
     include_expected_output: bool = False
     model_settings: ModelSettings | None = None
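
Note: `LLMJudge.model` now also accepts a plain model-name string, in addition to a `Model` instance or `KnownModelName`. A usage sketch — the rubric and the `'openai:gpt-4o'` identifier are illustrative:

```python
# Usage sketch (illustrative values): configuring the LLMJudge evaluator with a model name string.
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge

dataset = Dataset(
    cases=[Case(name='greeting', inputs='Say hello', expected_output='Hello!')],
    evaluators=[
        LLMJudge(
            rubric='The response should be a polite greeting.',
            model='openai:gpt-4o',  # a plain string is now accepted alongside Model instances
            include_input=True,
        ),
    ],
)
```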

pydantic_evals/evaluators/llm_as_a_judge.py
@@ -55,7 +55,7 @@ _judge_output_agent = Agent(
 async def judge_output(
     output: Any,
     rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
+    model: models.Model | models.KnownModelName | str | None = None,
     model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on a rubric.
@@ -96,7 +96,7 @@ async def judge_input_output(
     inputs: Any,
     output: Any,
     rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
+    model: models.Model | models.KnownModelName | str | None = None,
     model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on the inputs and a rubric.
@@ -141,7 +141,7 @@ async def judge_input_output_expected(
     output: Any,
     expected_output: Any,
     rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
+    model: models.Model | models.KnownModelName | str | None = None,
     model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on the inputs and a rubric.
@@ -185,7 +185,7 @@ async def judge_output_expected(
     output: Any,
     expected_output: Any,
     rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
+    model: models.Model | models.KnownModelName | str | None = None,
     model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on the expected output, output, and a rubric.
@@ -221,39 +221,44 @@ def _stringify(value: Any) -> str:
     return repr(value)


+def _make_section(content: Any, tag: str) -> list[str | UserContent]:
+    """Create a tagged section, handling different content types, for use in the LLMJudge's prompt.
+
+    Args:
+        content (Any): content to include in the section_
+        tag (str): tag name for the section
+
+    Returns:
+        list[str | UserContent]: the tagged section as a list of strings or UserContent
+    """
+    sections: list[str | UserContent] = []
+    items: Sequence[str | UserContent] = (  # pyright: ignore[reportUnknownVariableType]
+        content if isinstance(content, Sequence) and not isinstance(content, str) else [content]
+    )
+
+    sections.append(f'<{tag}>')
+    for item in items:
+        sections.append(item if isinstance(item, str | MultiModalContent) else _stringify(item))
+    sections.append(f'</{tag}>')
+    return sections
+
+
 def _build_prompt(
     output: Any,
     rubric: str,
     inputs: Any | None = None,
     expected_output: Any | None = None,
 ) -> str | Sequence[str | UserContent]:
-    """Build a prompt that includes input, output, and rubric."""
+    """Build a prompt that includes input, output, expected output, and rubric."""
     sections: list[str | UserContent] = []
-
     if inputs is not None:
-        if isinstance(inputs, str):
-            sections.append(f'<Input>\n{inputs}\n</Input>')
-        else:
-            sections.append('<Input>\n')
-            if isinstance(inputs, Sequence):
-                for item in inputs:  # type: ignore
-                    if isinstance(item, str | MultiModalContent):
-                        sections.append(item)
-                    else:
-                        sections.append(_stringify(item))
-            elif isinstance(inputs, MultiModalContent):
-                sections.append(inputs)
-            else:
-                sections.append(_stringify(inputs))
-        sections.append('</Input>')
-
-    sections.append(f'<Output>\n{_stringify(output)}\n</Output>')
-    sections.append(f'<Rubric>\n{rubric}\n</Rubric>')
+        sections.extend(_make_section(inputs, 'Input'))

-    if expected_output is not None:
-        sections.append(f'<ExpectedOutput>\n{_stringify(expected_output)}\n</ExpectedOutput>')
+    sections.extend(_make_section(output, 'Output'))
+    sections.extend(_make_section(rubric, 'Rubric'))

-    if inputs is None or isinstance(inputs, str):
-        return '\n\n'.join(sections)  # type: ignore[arg-type]
-    else:
-        return sections
+    if expected_output is not None:
+        sections.extend(_make_section(expected_output, 'ExpectedOutput'))
+    if all(isinstance(section, str) for section in sections):
+        return '\n'.join(sections)  # type: ignore[arg-type]
+    return sections
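
Note: with this refactor every part of the judge prompt goes through the same tagging helper, so a purely textual prompt is simply the tag-wrapped sections joined by newlines. A standalone re-implementation of the tagging idea for illustration (the package's private `_make_section`/`_build_prompt` additionally handle multi-modal content and sequences of items):

```python
# Standalone illustration of the tagging idea; not the package's private functions.
def make_section(content: str, tag: str) -> list[str]:
    return [f'<{tag}>', content, f'</{tag}>']


sections: list[str] = []
sections.extend(make_section('What is 2 + 2?', 'Input'))
sections.extend(make_section('4', 'Output'))
sections.extend(make_section('The answer must be arithmetically correct.', 'Rubric'))
sections.extend(make_section('4', 'ExpectedOutput'))

print('\n'.join(sections))
# <Input>
# What is 2 + 2?
# </Input>
# <Output>
# 4
# </Output>
# <Rubric>
# The answer must be arithmetically correct.
# </Rubric>
# <ExpectedOutput>
# 4
# </ExpectedOutput>
```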

pydantic_evals/otel/span_tree.py
@@ -241,7 +241,7 @@ class SpanNode:

         return self._matches_query(query)

-    def _matches_query(self, query: SpanQuery) -> bool:  # noqa C901
+    def _matches_query(self, query: SpanQuery) -> bool:  # noqa: C901
         """Check if the span matches the query conditions."""
         # Logical combinations
         if or_ := query.get('or_'):
@@ -433,8 +433,8 @@ class SpanTree:
     You can then search or iterate the tree to make your assertions (using DFS for traversal).
     """

-    roots: list[SpanNode] = field(default_factory=list)
-    nodes_by_id: dict[str, SpanNode] = field(default_factory=dict)
+    roots: list[SpanNode] = field(default_factory=list[SpanNode])
+    nodes_by_id: dict[str, SpanNode] = field(default_factory=dict[str, SpanNode])

     # -------------------------------------------------------------------------
     # Construction

pydantic_evals/reporting/__init__.py
@@ -55,11 +55,11 @@ class ReportCase(Generic[InputsT, OutputT, MetadataT]):
     name: str
     """The name of the [case][pydantic_evals.Case]."""
     inputs: InputsT
-    """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
+    """The inputs to the task, from [`Case.inputs`][pydantic_evals.dataset.Case.inputs]."""
     metadata: MetadataT | None
-    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
+    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.dataset.Case.metadata]."""
     expected_output: OutputT | None
-    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
+    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.dataset.Case.expected_output]."""
     output: OutputT
     """The output of the task execution."""

@@ -78,7 +78,7 @@ class ReportCase(Generic[InputsT, OutputT, MetadataT]):
     span_id: str | None = None
     """The span ID of the case span."""

-    evaluator_failures: list[EvaluatorFailure] = field(default_factory=list)
+    evaluator_failures: list[EvaluatorFailure] = field(default_factory=list[EvaluatorFailure])


 @dataclass(kw_only=True)
@@ -88,11 +88,11 @@ class ReportCaseFailure(Generic[InputsT, OutputT, MetadataT]):
     name: str
     """The name of the [case][pydantic_evals.Case]."""
     inputs: InputsT
-    """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
+    """The inputs to the task, from [`Case.inputs`][pydantic_evals.dataset.Case.inputs]."""
     metadata: MetadataT | None
-    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
+    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.dataset.Case.metadata]."""
     expected_output: OutputT | None
-    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
+    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.dataset.Case.expected_output]."""

     error_message: str
     """The message of the exception that caused the failure."""
@@ -195,7 +195,9 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):

     cases: list[ReportCase[InputsT, OutputT, MetadataT]]
     """The cases in the report."""
-    failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(default_factory=list)
+    failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(
+        default_factory=list[ReportCaseFailure[InputsT, OutputT, MetadataT]]
+    )
     """The failures in the report. These are cases where task execution raised an exception."""

     experiment_metadata: dict[str, Any] | None = None