pydantic-evals 0.3.6.tar.gz → 0.4.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pydantic-evals might be problematic.

Files changed (24)
  1. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/PKG-INFO +2 -2
  2. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/dataset.py +4 -4
  3. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/reporting/__init__.py +30 -16
  4. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/.gitignore +0 -0
  5. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/LICENSE +0 -0
  6. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/README.md +0 -0
  7. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/__init__.py +0 -0
  8. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/_utils.py +0 -0
  9. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/evaluators/__init__.py +0 -0
  10. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
  11. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/evaluators/_spec.py +0 -0
  12. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/evaluators/common.py +0 -0
  13. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/evaluators/context.py +0 -0
  14. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/evaluators/evaluator.py +0 -0
  15. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/evaluators/llm_as_a_judge.py +0 -0
  16. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/generation.py +0 -0
  17. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/otel/__init__.py +0 -0
  18. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
  19. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/otel/_context_subtree.py +0 -0
  20. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/otel/_errors.py +0 -0
  21. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/otel/span_tree.py +0 -0
  22. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/py.typed +0 -0
  23. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/reporting/render_numbers.py +0 -0
  24. {pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pyproject.toml +0 -0
{pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/PKG-INFO +2 -2

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.3.6
+Version: 0.4.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -32,7 +32,7 @@ Requires-Python: >=3.9
 Requires-Dist: anyio>=0
 Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.3.6
+Requires-Dist: pydantic-ai-slim==0.4.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
{pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/dataset.py +4 -4

@@ -257,7 +257,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
-    ) -> EvaluationReport:
+    ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

         This method runs the task on each case in the dataset, applies evaluators,
@@ -312,7 +312,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
-    ) -> EvaluationReport:
+    ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

         This is a synchronous wrapper around [`evaluate`][pydantic_evals.Dataset.evaluate] provided for convenience.
@@ -877,7 +877,7 @@ async def _run_task_and_evaluators(
     case: Case[InputsT, OutputT, MetadataT],
     report_case_name: str,
     dataset_evaluators: list[Evaluator[InputsT, OutputT, MetadataT]],
-) -> ReportCase:
+) -> ReportCase[InputsT, OutputT, MetadataT]:
     """Run a task on a case and evaluate the results.

     Args:
@@ -927,7 +927,7 @@ async def _run_task_and_evaluators(
     span_id = f'{context.span_id:016x}'
     fallback_duration = time.time() - t0

-    return ReportCase(
+    return ReportCase[InputsT, OutputT, MetadataT](
         name=report_case_name,
         inputs=case.inputs,
         metadata=case.metadata,
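
The dataset.py changes are purely about typing: `evaluate`, `evaluate_sync`, and the internal `_run_task_and_evaluators` now return `EvaluationReport`/`ReportCase` parametrized with the dataset's `InputsT`, `OutputT`, and `MetadataT` instead of the bare classes. A minimal sketch of what a caller might see, assuming a toy string-to-string task (the `greet` task and the case values below are invented for illustration):

from pydantic_evals import Case, Dataset

# Hypothetical one-case dataset; the name and values are illustrative only.
dataset = Dataset(cases=[Case(name='greet', inputs='world', expected_output='hello world')])


async def greet(inputs: str) -> str:
    # Toy task: prepend a greeting to the input string.
    return f'hello {inputs}'


report = dataset.evaluate_sync(greet)
# Under 0.4.0 the report is parametrized (e.g. EvaluationReport[str, str, Any] when the
# dataset's type parameters are known), so type checkers no longer see plain Any here:
first = report.cases[0]
print(first.inputs, '->', first.output)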
{pydantic_evals-0.3.6 → pydantic_evals-0.4.0}/pydantic_evals/reporting/__init__.py +30 -16

@@ -2,14 +2,14 @@ from __future__ import annotations as _annotations

 from collections import defaultdict
 from collections.abc import Mapping
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from io import StringIO
-from typing import Any, Callable, Literal, Protocol, TypeVar
+from typing import Any, Callable, Generic, Literal, Protocol

-from pydantic import BaseModel
+from pydantic import BaseModel, TypeAdapter
 from rich.console import Console
 from rich.table import Table
-from typing_extensions import TypedDict
+from typing_extensions import TypedDict, TypeVar

 from pydantic_evals._utils import UNSET, Unset

@@ -24,7 +24,9 @@ from .render_numbers import (

 __all__ = (
     'EvaluationReport',
+    'EvaluationReportAdapter',
     'ReportCase',
+    'ReportCaseAdapter',
     'EvaluationRenderer',
     'RenderValueConfig',
     'RenderNumberConfig',
@@ -35,27 +37,32 @@ MISSING_VALUE_STR = '[i]<missing>[/i]'
 EMPTY_CELL_STR = '-'
 EMPTY_AGGREGATE_CELL_STR = ''

+InputsT = TypeVar('InputsT', default=Any)
+OutputT = TypeVar('OutputT', default=Any)
+MetadataT = TypeVar('MetadataT', default=Any)

-class ReportCase(BaseModel):
+
+@dataclass
+class ReportCase(Generic[InputsT, OutputT, MetadataT]):
     """A single case in an evaluation report."""

     name: str
     """The name of the [case][pydantic_evals.Case]."""
-    inputs: Any
+    inputs: InputsT
     """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
-    metadata: Any
+    metadata: MetadataT | None
     """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
-    expected_output: Any
+    expected_output: OutputT | None
     """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
-    output: Any
+    output: OutputT
     """The output of the task execution."""

     metrics: dict[str, float | int]
     attributes: dict[str, Any]

-    scores: dict[str, EvaluationResult[int | float]] = field(init=False)
-    labels: dict[str, EvaluationResult[str]] = field(init=False)
-    assertions: dict[str, EvaluationResult[bool]] = field(init=False)
+    scores: dict[str, EvaluationResult[int | float]]
+    labels: dict[str, EvaluationResult[str]]
+    assertions: dict[str, EvaluationResult[bool]]

     task_duration: float
     total_duration: float  # includes evaluator execution time
@@ -65,6 +72,9 @@ class ReportCase(BaseModel):
     span_id: str


+ReportCaseAdapter = TypeAdapter(ReportCase[Any, Any, Any])
+
+
 class ReportCaseAggregate(BaseModel):
     """A synthetic case that summarizes a set of cases."""

@@ -142,12 +152,13 @@ class ReportCaseAggregate(BaseModel):
         )


-class EvaluationReport(BaseModel):
+@dataclass
+class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
     """A report of the results of evaluating a model on a set of cases."""

     name: str
     """The name of the report."""
-    cases: list[ReportCase]
+    cases: list[ReportCase[InputsT, OutputT, MetadataT]]
     """The cases in the report."""

     def averages(self) -> ReportCaseAggregate:
@@ -156,7 +167,7 @@ class EvaluationReport(BaseModel):
     def print(
         self,
         width: int | None = None,
-        baseline: EvaluationReport | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -199,7 +210,7 @@ class EvaluationReport(BaseModel):

     def console_table(
         self,
-        baseline: EvaluationReport | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -250,6 +261,9 @@ class EvaluationReport(BaseModel):
         return io_file.getvalue()


+EvaluationReportAdapter = TypeAdapter(EvaluationReport[Any, Any, Any])
+
+
 class RenderValueConfig(TypedDict, total=False):
     """A configuration for rendering a values in an Evaluation report."""

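In reporting/__init__.py, `ReportCase` and `EvaluationReport` change from pydantic `BaseModel` subclasses to plain dataclasses that are generic over `InputsT`, `OutputT`, and `MetadataT` (each defaulting to `Any` via `typing_extensions.TypeVar`), and module-level `TypeAdapter` instances are exported as `ReportCaseAdapter` and `EvaluationReportAdapter` for serialization. A sketch of how the new adapters might be used, reusing the toy task from the previous example:

from pydantic_evals import Case, Dataset
from pydantic_evals.reporting import EvaluationReportAdapter, ReportCaseAdapter


async def greet(inputs: str) -> str:
    return f'hello {inputs}'


report = Dataset(cases=[Case(name='greet', inputs='world')]).evaluate_sync(greet)

# The exported TypeAdapters handle serialization now that the report types are dataclasses:
report_dict = EvaluationReportAdapter.dump_python(report)          # nested plain-Python structure
report_json = EvaluationReportAdapter.dump_json(report, indent=2)  # JSON as bytes
case_json = ReportCaseAdapter.dump_json(report.cases[0])           # a single serialized case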