pydantic-evals 0.2.15__tar.gz → 1.12.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pydantic-evals might be problematic. Click here for more details.
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/.gitignore +4 -2
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/PKG-INFO +12 -14
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/README.md +5 -5
- pydantic_evals-1.12.0/pydantic_evals/__init__.py +16 -0
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/_utils.py +34 -2
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/dataset.py +273 -133
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/evaluators/__init__.py +12 -3
- pydantic_evals-1.12.0/pydantic_evals/evaluators/_run_evaluator.py +111 -0
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/evaluators/common.py +8 -18
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/evaluators/context.py +1 -1
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/evaluators/evaluator.py +28 -12
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/evaluators/llm_as_a_judge.py +46 -54
- pydantic_evals-0.2.15/pydantic_evals/evaluators/_spec.py → pydantic_evals-1.12.0/pydantic_evals/evaluators/spec.py +3 -9
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/generation.py +6 -3
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/otel/_context_in_memory_span_exporter.py +2 -2
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/otel/span_tree.py +5 -14
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/reporting/__init__.py +467 -54
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pyproject.toml +8 -6
- pydantic_evals-0.2.15/pydantic_evals/__init__.py +0 -19
- pydantic_evals-0.2.15/pydantic_evals/evaluators/_run_evaluator.py +0 -73
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/LICENSE +0 -0
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/otel/__init__.py +0 -0
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/otel/_context_subtree.py +0 -0
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/otel/_errors.py +0 -0
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/py.typed +0 -0
- {pydantic_evals-0.2.15 → pydantic_evals-1.12.0}/pydantic_evals/reporting/render_numbers.py +0 -0
|
@@ -10,12 +10,14 @@ env*/
|
|
|
10
10
|
/TODO.md
|
|
11
11
|
/postgres-data/
|
|
12
12
|
.DS_Store
|
|
13
|
-
|
|
13
|
+
.chat_app_messages.sqlite
|
|
14
14
|
.cache/
|
|
15
15
|
.vscode/
|
|
16
16
|
/question_graph_history.json
|
|
17
17
|
/docs-site/.wrangler/
|
|
18
|
-
/CLAUDE.md
|
|
19
18
|
node_modules/
|
|
20
19
|
**.idea/
|
|
21
20
|
.coverage*
|
|
21
|
+
/test_tmp/
|
|
22
|
+
.mcp.json
|
|
23
|
+
.claude/
|
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pydantic-evals
|
|
3
|
-
Version: 0.2.15
|
|
3
|
+
Version: 1.12.0
|
|
4
4
|
Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
|
|
5
5
|
Project-URL: Homepage, https://ai.pydantic.dev/evals
|
|
6
6
|
Project-URL: Source, https://github.com/pydantic/pydantic-ai
|
|
7
7
|
Project-URL: Documentation, https://ai.pydantic.dev/evals
|
|
8
8
|
Project-URL: Changelog, https://github.com/pydantic/pydantic-ai/releases
|
|
9
|
-
Author-email: David Montague <david@pydantic.dev>
|
|
9
|
+
Author-email: Samuel Colvin <samuel@pydantic.dev>, Marcelo Trylesinski <marcelotryle@gmail.com>, David Montague <david@pydantic.dev>, Alex Hall <alex@pydantic.dev>, Douwe Maan <douwe@pydantic.dev>
|
|
10
10
|
License-Expression: MIT
|
|
11
11
|
License-File: LICENSE
|
|
12
|
-
Classifier: Development Status ::
|
|
12
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
13
13
|
Classifier: Environment :: Console
|
|
14
14
|
Classifier: Environment :: MacOS X
|
|
15
15
|
Classifier: Intended Audience :: Developers
|
|
@@ -21,23 +21,21 @@ Classifier: Operating System :: Unix
|
|
|
21
21
|
Classifier: Programming Language :: Python
|
|
22
22
|
Classifier: Programming Language :: Python :: 3
|
|
23
23
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
24
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
25
24
|
Classifier: Programming Language :: Python :: 3.10
|
|
26
25
|
Classifier: Programming Language :: Python :: 3.11
|
|
27
26
|
Classifier: Programming Language :: Python :: 3.12
|
|
28
27
|
Classifier: Programming Language :: Python :: 3.13
|
|
29
28
|
Classifier: Topic :: Internet
|
|
30
29
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
31
|
-
Requires-Python: >=3.9
|
|
30
|
+
Requires-Python: >=3.10
|
|
32
31
|
Requires-Dist: anyio>=0
|
|
33
|
-
Requires-Dist:
|
|
34
|
-
Requires-Dist:
|
|
35
|
-
Requires-Dist: pydantic-ai-slim==0.2.15
|
|
32
|
+
Requires-Dist: logfire-api>=3.14.1
|
|
33
|
+
Requires-Dist: pydantic-ai-slim==1.12.0
|
|
36
34
|
Requires-Dist: pydantic>=2.10
|
|
37
35
|
Requires-Dist: pyyaml>=6.0.2
|
|
38
36
|
Requires-Dist: rich>=13.9.4
|
|
39
37
|
Provides-Extra: logfire
|
|
40
|
-
Requires-Dist: logfire>=
|
|
38
|
+
Requires-Dist: logfire>=3.14.1; extra == 'logfire'
|
|
41
39
|
Description-Content-Type: text/markdown
|
|
42
40
|
|
|
43
41
|
# Pydantic Evals
|
|
@@ -51,18 +49,18 @@ Description-Content-Type: text/markdown
|
|
|
51
49
|
This is a library for evaluating non-deterministic (or "stochastic") functions in Python. It provides a simple,
|
|
52
50
|
Pythonic interface for defining and running stochastic functions, and analyzing the results of running those functions.
|
|
53
51
|
|
|
54
|
-
While this library is developed as part of [
|
|
52
|
+
While this library is developed as part of [Pydantic AI](https://ai.pydantic.dev), it only uses Pydantic AI for a small
|
|
55
53
|
subset of generative functionality internally, and it is designed to be used with arbitrary "stochastic function"
|
|
56
|
-
implementations. In particular, it can be used with other (non-
|
|
54
|
+
implementations. In particular, it can be used with other (non-Pydantic AI) AI libraries, agent frameworks, etc.
|
|
57
55
|
|
|
58
|
-
As with
|
|
56
|
+
As with Pydantic AI, this library prioritizes type safety and use of common Python syntax over esoteric, domain-specific
|
|
59
57
|
use of Python syntax.
|
|
60
58
|
|
|
61
59
|
Full documentation is available at [ai.pydantic.dev/evals](https://ai.pydantic.dev/evals).
|
|
62
60
|
|
|
63
61
|
## Example
|
|
64
62
|
|
|
65
|
-
While you'd typically use Pydantic Evals with more complex functions (such as
|
|
63
|
+
While you'd typically use Pydantic Evals with more complex functions (such as Pydantic AI agents or graphs), here's a
|
|
66
64
|
quick example that evaluates a simple function against a test case using both custom and built-in evaluators:
|
|
67
65
|
|
|
68
66
|
```python
|
|
@@ -110,7 +108,7 @@ report.print(include_input=True, include_output=True)
|
|
|
110
108
|
"""
|
|
111
109
|
```
|
|
112
110
|
|
|
113
|
-
Using the library with more complex functions, such as
|
|
111
|
+
Using the library with more complex functions, such as Pydantic AI agents, is similar — all you need to do is define a
|
|
114
112
|
task function wrapping the function you want to evaluate, with a signature that matches the inputs and outputs of your
|
|
115
113
|
test cases.
|
|
116
114
|
|
|
@@ -9,18 +9,18 @@
|
|
|
9
9
|
This is a library for evaluating non-deterministic (or "stochastic") functions in Python. It provides a simple,
|
|
10
10
|
Pythonic interface for defining and running stochastic functions, and analyzing the results of running those functions.
|
|
11
11
|
|
|
12
|
-
While this library is developed as part of [
|
|
12
|
+
While this library is developed as part of [Pydantic AI](https://ai.pydantic.dev), it only uses Pydantic AI for a small
|
|
13
13
|
subset of generative functionality internally, and it is designed to be used with arbitrary "stochastic function"
|
|
14
|
-
implementations. In particular, it can be used with other (non-
|
|
14
|
+
implementations. In particular, it can be used with other (non-Pydantic AI) AI libraries, agent frameworks, etc.
|
|
15
15
|
|
|
16
|
-
As with
|
|
16
|
+
As with Pydantic AI, this library prioritizes type safety and use of common Python syntax over esoteric, domain-specific
|
|
17
17
|
use of Python syntax.
|
|
18
18
|
|
|
19
19
|
Full documentation is available at [ai.pydantic.dev/evals](https://ai.pydantic.dev/evals).
|
|
20
20
|
|
|
21
21
|
## Example
|
|
22
22
|
|
|
23
|
-
While you'd typically use Pydantic Evals with more complex functions (such as
|
|
23
|
+
While you'd typically use Pydantic Evals with more complex functions (such as Pydantic AI agents or graphs), here's a
|
|
24
24
|
quick example that evaluates a simple function against a test case using both custom and built-in evaluators:
|
|
25
25
|
|
|
26
26
|
```python
|
|
@@ -68,7 +68,7 @@ report.print(include_input=True, include_output=True)
|
|
|
68
68
|
"""
|
|
69
69
|
```
|
|
70
70
|
|
|
71
|
-
Using the library with more complex functions, such as
|
|
71
|
+
Using the library with more complex functions, such as Pydantic AI agents, is similar — all you need to do is define a
|
|
72
72
|
task function wrapping the function you want to evaluate, with a signature that matches the inputs and outputs of your
|
|
73
73
|
test cases.
|
|
74
74
|
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""A toolkit for evaluating the execution of arbitrary "stochastic functions", such as LLM calls.
|
|
2
|
+
|
|
3
|
+
This package provides functionality for:
|
|
4
|
+
- Creating and loading test datasets with structured inputs and outputs
|
|
5
|
+
- Evaluating model performance using various metrics and evaluators
|
|
6
|
+
- Generating reports for evaluation results
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .dataset import Case, Dataset, increment_eval_metric, set_eval_attribute
|
|
10
|
+
|
|
11
|
+
__all__ = (
|
|
12
|
+
'Case',
|
|
13
|
+
'Dataset',
|
|
14
|
+
'increment_eval_metric',
|
|
15
|
+
'set_eval_attribute',
|
|
16
|
+
)
|
|
@@ -2,13 +2,20 @@ from __future__ import annotations as _annotations
|
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
4
|
import inspect
|
|
5
|
-
|
|
5
|
+
import warnings
|
|
6
|
+
from collections.abc import Awaitable, Callable, Generator, Sequence
|
|
7
|
+
from contextlib import contextmanager
|
|
6
8
|
from functools import partial
|
|
7
|
-
from
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import TYPE_CHECKING, Any, TypeVar
|
|
8
11
|
|
|
9
12
|
import anyio
|
|
13
|
+
import logfire_api
|
|
10
14
|
from typing_extensions import ParamSpec, TypeIs
|
|
11
15
|
|
|
16
|
+
_logfire = logfire_api.Logfire(otel_scope='pydantic-evals')
|
|
17
|
+
logfire_api.add_non_user_code_prefix(Path(__file__).parent.absolute())
|
|
18
|
+
|
|
12
19
|
|
|
13
20
|
class Unset:
|
|
14
21
|
"""A singleton to represent an unset value.
|
|
@@ -101,3 +108,28 @@ async def task_group_gather(tasks: Sequence[Callable[[], Awaitable[T]]]) -> list
|
|
|
101
108
|
tg.start_soon(_run_task, task, i)
|
|
102
109
|
|
|
103
110
|
return results
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
try:
|
|
114
|
+
from logfire._internal.config import (
|
|
115
|
+
LogfireNotConfiguredWarning, # pyright: ignore[reportAssignmentType,reportPrivateImportUsage]
|
|
116
|
+
)
|
|
117
|
+
# TODO: Remove this `pragma: no cover` once we test evals without pydantic-ai (which includes logfire)
|
|
118
|
+
except ImportError: # pragma: no cover
|
|
119
|
+
|
|
120
|
+
class LogfireNotConfiguredWarning(UserWarning):
|
|
121
|
+
pass
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
if TYPE_CHECKING:
|
|
125
|
+
logfire_span = _logfire.span
|
|
126
|
+
else:
|
|
127
|
+
|
|
128
|
+
@contextmanager
|
|
129
|
+
def logfire_span(*args: Any, **kwargs: Any) -> Generator[logfire_api.LogfireSpan, None, None]:
|
|
130
|
+
"""Create a Logfire span without warning if logfire is not configured."""
|
|
131
|
+
# TODO: Remove once Logfire has the ability to suppress this warning from non-user code
|
|
132
|
+
with warnings.catch_warnings():
|
|
133
|
+
warnings.filterwarnings('ignore', category=LogfireNotConfiguredWarning)
|
|
134
|
+
with _logfire.span(*args, **kwargs) as span:
|
|
135
|
+
yield span
|