pydantic-evals 0.2.15.tar.gz → 1.12.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (26)
  1. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/.gitignore +4 -2
  2. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/PKG-INFO +12 -14
  3. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/README.md +5 -5
  4. pydantic_evals-1.12.0/pydantic_evals/__init__.py +16 -0
  5. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/_utils.py +34 -2
  6. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/dataset.py +273 -133
  7. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/evaluators/__init__.py +12 -3
  8. pydantic_evals-1.12.0/pydantic_evals/evaluators/_run_evaluator.py +111 -0
  9. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/evaluators/common.py +8 -18
  10. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/evaluators/context.py +1 -1
  11. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/evaluators/evaluator.py +28 -12
  12. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/evaluators/llm_as_a_judge.py +46 -54
  13. pydantic_evals-0.2.15/pydantic_evals/evaluators/_spec.py → pydantic_evals-1.12.0/pydantic_evals/evaluators/spec.py +3 -9
  14. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/generation.py +6 -3
  15. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/otel/_context_in_memory_span_exporter.py +2 -2
  16. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/otel/span_tree.py +5 -14
  17. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/reporting/__init__.py +467 -54
  18. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pyproject.toml +8 -6
  19. pydantic_evals-0.2.15/pydantic_evals/__init__.py +0 -19
  20. pydantic_evals-0.2.15/pydantic_evals/evaluators/_run_evaluator.py +0 -73
  21. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/LICENSE +0 -0
  22. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/otel/__init__.py +0 -0
  23. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/otel/_context_subtree.py +0 -0
  24. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/otel/_errors.py +0 -0
  25. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/py.typed +0 -0
  26. {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/reporting/render_numbers.py +0 -0
--- pydantic_evals-0.2.15/.gitignore
+++ pydantic_evals-1.12.0/.gitignore
@@ -10,12 +10,14 @@ env*/
 /TODO.md
 /postgres-data/
 .DS_Store
-examples/pydantic_ai_examples/.chat_app_messages.sqlite
+.chat_app_messages.sqlite
 .cache/
 .vscode/
 /question_graph_history.json
 /docs-site/.wrangler/
-/CLAUDE.md
 node_modules/
 **.idea/
 .coverage*
+/test_tmp/
+.mcp.json
+.claude/
--- pydantic_evals-0.2.15/PKG-INFO
+++ pydantic_evals-1.12.0/PKG-INFO
@@ -1,15 +1,15 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 0.2.15
+Version: 1.12.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
 Project-URL: Documentation, https://ai.pydantic.dev/evals
 Project-URL: Changelog, https://github.com/pydantic/pydantic-ai/releases
-Author-email: David Montague <david@pydantic.dev>
+Author-email: Samuel Colvin <samuel@pydantic.dev>, Marcelo Trylesinski <marcelotryle@gmail.com>, David Montague <david@pydantic.dev>, Alex Hall <alex@pydantic.dev>, Douwe Maan <douwe@pydantic.dev>
 License-Expression: MIT
 License-File: LICENSE
-Classifier: Development Status :: 4 - Beta
+Classifier: Development Status :: 5 - Production/Stable
 Classifier: Environment :: Console
 Classifier: Environment :: MacOS X
 Classifier: Intended Audience :: Developers
@@ -21,23 +21,21 @@ Classifier: Operating System :: Unix
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Internet
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Requires-Python: >=3.9
+Requires-Python: >=3.10
 Requires-Dist: anyio>=0
-Requires-Dist: eval-type-backport>=0; python_version < '3.11'
-Requires-Dist: logfire-api>=1.2.0
-Requires-Dist: pydantic-ai-slim==0.2.15
+Requires-Dist: logfire-api>=3.14.1
+Requires-Dist: pydantic-ai-slim==1.12.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
 Provides-Extra: logfire
-Requires-Dist: logfire>=2.3; extra == 'logfire'
+Requires-Dist: logfire>=3.14.1; extra == 'logfire'
 Description-Content-Type: text/markdown
 
 # Pydantic Evals
@@ -51,18 +49,18 @@ Description-Content-Type: text/markdown
 This is a library for evaluating non-deterministic (or "stochastic") functions in Python. It provides a simple,
 Pythonic interface for defining and running stochastic functions, and analyzing the results of running those functions.
 
-While this library is developed as part of [PydanticAI](https://ai.pydantic.dev), it only uses PydanticAI for a small
+While this library is developed as part of [Pydantic AI](https://ai.pydantic.dev), it only uses Pydantic AI for a small
 subset of generative functionality internally, and it is designed to be used with arbitrary "stochastic function"
-implementations. In particular, it can be used with other (non-PydanticAI) AI libraries, agent frameworks, etc.
+implementations. In particular, it can be used with other (non-Pydantic AI) AI libraries, agent frameworks, etc.
 
-As with PydanticAI, this library prioritizes type safety and use of common Python syntax over esoteric, domain-specific
+As with Pydantic AI, this library prioritizes type safety and use of common Python syntax over esoteric, domain-specific
 use of Python syntax.
 
 Full documentation is available at [ai.pydantic.dev/evals](https://ai.pydantic.dev/evals).
 
 ## Example
 
-While you'd typically use Pydantic Evals with more complex functions (such as PydanticAI agents or graphs), here's a
+While you'd typically use Pydantic Evals with more complex functions (such as Pydantic AI agents or graphs), here's a
 quick example that evaluates a simple function against a test case using both custom and built-in evaluators:
 
 ```python
@@ -110,7 +108,7 @@ report.print(include_input=True, include_output=True)
 """
 ```
 
-Using the library with more complex functions, such as PydanticAI agents, is similar — all you need to do is define a
+Using the library with more complex functions, such as Pydantic AI agents, is similar — all you need to do is define a
 task function wrapping the function you want to evaluate, with a signature that matches the inputs and outputs of your
 test cases.
 
--- pydantic_evals-0.2.15/README.md
+++ pydantic_evals-1.12.0/README.md
@@ -9,18 +9,18 @@
 This is a library for evaluating non-deterministic (or "stochastic") functions in Python. It provides a simple,
 Pythonic interface for defining and running stochastic functions, and analyzing the results of running those functions.
 
-While this library is developed as part of [PydanticAI](https://ai.pydantic.dev), it only uses PydanticAI for a small
+While this library is developed as part of [Pydantic AI](https://ai.pydantic.dev), it only uses Pydantic AI for a small
 subset of generative functionality internally, and it is designed to be used with arbitrary "stochastic function"
-implementations. In particular, it can be used with other (non-PydanticAI) AI libraries, agent frameworks, etc.
+implementations. In particular, it can be used with other (non-Pydantic AI) AI libraries, agent frameworks, etc.
 
-As with PydanticAI, this library prioritizes type safety and use of common Python syntax over esoteric, domain-specific
+As with Pydantic AI, this library prioritizes type safety and use of common Python syntax over esoteric, domain-specific
 use of Python syntax.
 
 Full documentation is available at [ai.pydantic.dev/evals](https://ai.pydantic.dev/evals).
 
 ## Example
 
-While you'd typically use Pydantic Evals with more complex functions (such as PydanticAI agents or graphs), here's a
+While you'd typically use Pydantic Evals with more complex functions (such as Pydantic AI agents or graphs), here's a
 quick example that evaluates a simple function against a test case using both custom and built-in evaluators:
 
 ```python
@@ -68,7 +68,7 @@ report.print(include_input=True, include_output=True)
 """
 ```
 
-Using the library with more complex functions, such as PydanticAI agents, is similar — all you need to do is define a
+Using the library with more complex functions, such as Pydantic AI agents, is similar — all you need to do is define a
 task function wrapping the function you want to evaluate, with a signature that matches the inputs and outputs of your
 test cases.
 
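To make the README's "task function" pattern concrete: below is a minimal sketch of evaluating a simple function against a one-case dataset, assuming the `Case`/`Dataset`/`IsInstance` API exported by this release. The `answer_question` function and the case contents are illustrative stand-ins, not code from the package.

```python
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import IsInstance


async def answer_question(question: str) -> str:
    # Stand-in for a stochastic function, e.g. a Pydantic AI agent call.
    return 'Paris' if 'capital of France' in question else "I don't know"


dataset = Dataset(
    cases=[
        Case(
            name='capital_question',
            inputs='What is the capital of France?',
            expected_output='Paris',
        )
    ],
    evaluators=[IsInstance(type_name='str')],
)

report = dataset.evaluate_sync(answer_question)
report.print(include_input=True, include_output=True)
```

Any callable (sync or async) whose signature matches the case inputs can serve as the task, so wrapping a Pydantic AI agent means replacing the stub body with the agent call.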
--- /dev/null
+++ pydantic_evals-1.12.0/pydantic_evals/__init__.py
@@ -0,0 +1,16 @@
+"""A toolkit for evaluating the execution of arbitrary "stochastic functions", such as LLM calls.
+
+This package provides functionality for:
+- Creating and loading test datasets with structured inputs and outputs
+- Evaluating model performance using various metrics and evaluators
+- Generating reports for evaluation results
+"""
+
+from .dataset import Case, Dataset, increment_eval_metric, set_eval_attribute
+
+__all__ = (
+    'Case',
+    'Dataset',
+    'increment_eval_metric',
+    'set_eval_attribute',
+)
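Two of the newly exported names, `increment_eval_metric` and `set_eval_attribute`, are imported from `.dataset` above and appear designed to be called from inside a task while it runs under an evaluation. A hedged sketch of that usage (the task body is invented for illustration):

```python
from pydantic_evals import increment_eval_metric, set_eval_attribute


async def my_task(inputs: str) -> str:
    # Record an attribute and a numeric metric against the currently running case.
    set_eval_attribute('input_length', len(inputs))
    increment_eval_metric('calls', 1)
    return inputs.upper()  # placeholder for real (stochastic) logic
```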
--- pydantic_evals-0.2.15/pydantic_evals/_utils.py
+++ pydantic_evals-1.12.0/pydantic_evals/_utils.py
@@ -2,13 +2,20 @@ from __future__ import annotations as _annotations
 
 import asyncio
 import inspect
-from collections.abc import Awaitable, Sequence
+import warnings
+from collections.abc import Awaitable, Callable, Generator, Sequence
+from contextlib import contextmanager
 from functools import partial
-from typing import Any, Callable, TypeVar
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, TypeVar
 
 import anyio
+import logfire_api
 from typing_extensions import ParamSpec, TypeIs
 
+_logfire = logfire_api.Logfire(otel_scope='pydantic-evals')
+logfire_api.add_non_user_code_prefix(Path(__file__).parent.absolute())
+
 
 class Unset:
     """A singleton to represent an unset value.
@@ -101,3 +108,28 @@ async def task_group_gather(tasks: Sequence[Callable[[], Awaitable[T]]]) -> list
             tg.start_soon(_run_task, task, i)
 
     return results
+
+
+try:
+    from logfire._internal.config import (
+        LogfireNotConfiguredWarning,  # pyright: ignore[reportAssignmentType,reportPrivateImportUsage]
+    )
+# TODO: Remove this `pragma: no cover` once we test evals without pydantic-ai (which includes logfire)
+except ImportError:  # pragma: no cover
+
+    class LogfireNotConfiguredWarning(UserWarning):
+        pass
+
+
+if TYPE_CHECKING:
+    logfire_span = _logfire.span
+else:
+
+    @contextmanager
+    def logfire_span(*args: Any, **kwargs: Any) -> Generator[logfire_api.LogfireSpan, None, None]:
+        """Create a Logfire span without warning if logfire is not configured."""
+        # TODO: Remove once Logfire has the ability to suppress this warning from non-user code
+        with warnings.catch_warnings():
+            warnings.filterwarnings('ignore', category=LogfireNotConfiguredWarning)
+            with _logfire.span(*args, **kwargs) as span:
+                yield span
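A note on the design above: under `TYPE_CHECKING`, `logfire_span` is aliased directly to `_logfire.span` so type checkers see the real span signature, while the runtime wrapper filters `LogfireNotConfiguredWarning` so library-internal spans stay quiet when the host application never configured logfire. A minimal usage sketch, assuming the private `pydantic_evals._utils` module as shown in this diff (the span name and attribute are illustrative):

```python
from pydantic_evals._utils import logfire_span

# Same call shape as Logfire.span, but no LogfireNotConfiguredWarning
# is emitted if logfire has not been configured by the application.
with logfire_span('evaluate case {name}', name='capital_question'):
    ...  # work covered by the span
```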