pydantic-evals 1.0.0b1__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydantic_evals/_utils.py +34 -2
- pydantic_evals/dataset.py +4 -4
- pydantic_evals/evaluators/__init__.py +8 -2
- pydantic_evals/evaluators/_run_evaluator.py +3 -5
- pydantic_evals/evaluators/common.py +8 -18
- {pydantic_evals-1.0.0b1.dist-info → pydantic_evals-1.0.1.dist-info}/METADATA +3 -4
- {pydantic_evals-1.0.0b1.dist-info → pydantic_evals-1.0.1.dist-info}/RECORD +9 -9
- {pydantic_evals-1.0.0b1.dist-info → pydantic_evals-1.0.1.dist-info}/WHEEL +0 -0
- {pydantic_evals-1.0.0b1.dist-info → pydantic_evals-1.0.1.dist-info}/licenses/LICENSE +0 -0
pydantic_evals/_utils.py
CHANGED
@@ -2,13 +2,20 @@ from __future__ import annotations as _annotations
 
 import asyncio
 import inspect
-from collections.abc import Awaitable, Callable, Sequence
+import warnings
+from collections.abc import Awaitable, Callable, Generator, Sequence
+from contextlib import contextmanager
 from functools import partial
-from typing import Any, TypeVar
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, TypeVar
 
 import anyio
+import logfire_api
 from typing_extensions import ParamSpec, TypeIs
 
+_logfire = logfire_api.Logfire(otel_scope='pydantic-evals')
+logfire_api.add_non_user_code_prefix(Path(__file__).parent.absolute())
+
 
 class Unset:
     """A singleton to represent an unset value.

@@ -101,3 +108,28 @@ async def task_group_gather(tasks: Sequence[Callable[[], Awaitable[T]]]) -> list
             tg.start_soon(_run_task, task, i)
 
     return results
+
+
+try:
+    from logfire._internal.config import (
+        LogfireNotConfiguredWarning,  # pyright: ignore[reportAssignmentType,reportPrivateImportUsage]
+    )
+# TODO: Remove this `pragma: no cover` once we test evals without pydantic-ai (which includes logfire)
+except ImportError:  # pragma: no cover
+
+    class LogfireNotConfiguredWarning(UserWarning):
+        pass
+
+
+if TYPE_CHECKING:
+    logfire_span = _logfire.span
+else:
+
+    @contextmanager
+    def logfire_span(*args: Any, **kwargs: Any) -> Generator[logfire_api.LogfireSpan, None, None]:
+        """Create a Logfire span without warning if logfire is not configured."""
+        # TODO: Remove once Logfire has the ability to suppress this warning from non-user code
+        with warnings.catch_warnings():
+            warnings.filterwarnings('ignore', category=LogfireNotConfiguredWarning)
+            with _logfire.span(*args, **kwargs) as span:
+                yield span
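The new `logfire_span` helper is the core of this release: it wraps `_logfire.span` in a `warnings.catch_warnings()` block so spans created during evaluation no longer emit `LogfireNotConfiguredWarning` when Logfire is not set up, while the `TYPE_CHECKING` alias keeps the real signature visible to type checkers. A minimal, self-contained sketch of the same suppression pattern, using stand-in names (`NotConfiguredWarning`, `noisy_span`, `quiet_span`) rather than the real Logfire internals:

    import warnings
    from collections.abc import Generator
    from contextlib import contextmanager


    class NotConfiguredWarning(UserWarning):
        """Stand-in for logfire's LogfireNotConfiguredWarning."""


    @contextmanager
    def noisy_span(name: str) -> Generator[str, None, None]:
        # Stand-in for _logfire.span: warns when nothing is configured.
        warnings.warn('not configured', NotConfiguredWarning)
        yield name


    @contextmanager
    def quiet_span(name: str) -> Generator[str, None, None]:
        # Same pattern as logfire_span: filter only this one warning
        # category, then delegate to the underlying span.
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=NotConfiguredWarning)
            with noisy_span(name) as span:
                yield span


    with quiet_span('evaluate demo') as span:
        print(span)  # prints 'evaluate demo'; no warning is emitted

The key design point is that only the one warning category is filtered; any other warning raised inside the span still propagates to the caller.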
pydantic_evals/dataset.py
CHANGED
@@ -36,7 +36,7 @@ from typing_extensions import NotRequired, Self, TypedDict, TypeVar
 
 from pydantic_evals._utils import get_event_loop
 
-from ._utils import get_unwrapped_function_name, task_group_gather
+from ._utils import get_unwrapped_function_name, logfire_span, task_group_gather
 from .evaluators import EvaluationResult, Evaluator
 from .evaluators._run_evaluator import run_evaluator
 from .evaluators.common import DEFAULT_EVALUATORS

@@ -283,7 +283,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()
 
         with (
-            _logfire.span('evaluate {name}', name=name, n_cases=len(self.cases)) as eval_span,
+            logfire_span('evaluate {name}', name=name, n_cases=len(self.cases)) as eval_span,
             progress_bar or nullcontext(),
         ):
             task_id = progress_bar.add_task(f'Evaluating {name}', total=total_cases) if progress_bar else None

@@ -858,7 +858,7 @@ async def _run_task(
     token = _CURRENT_TASK_RUN.set(task_run_)
     try:
         with (
-            _logfire.span('execute {task}', task=get_unwrapped_function_name(task)) as task_span,
+            logfire_span('execute {task}', task=get_unwrapped_function_name(task)) as task_span,
             context_subtree() as span_tree_,
         ):
             t0 = time.perf_counter()

@@ -933,7 +933,7 @@ async def _run_task_and_evaluators(
     trace_id: str | None = None
     span_id: str | None = None
     try:
-        with _logfire.span(
+        with logfire_span(
             'case: {case_name}',
             task_name=get_unwrapped_function_name(task),
             case_name=report_case_name,
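All four changes in dataset.py are the same mechanical swap: `_logfire.span(...)` becomes the warning-suppressing `logfire_span(...)` for the run-level ('evaluate {name}'), case-level ('case: {case_name}'), and task-level ('execute {task}') spans. A sketch of a run that would exercise all three, based on the documented pydantic-evals API (exact signatures may vary by version):

    from pydantic_evals import Case, Dataset


    async def task(inputs: str) -> str:
        # The function under test, wrapped in the 'execute {task}' span.
        return inputs.upper()


    dataset = Dataset(cases=[Case(name='one', inputs='hello', expected_output='HELLO')])

    # With this release, running without Logfire configured no longer emits
    # LogfireNotConfiguredWarning for the evaluate/case/execute spans.
    report = dataset.evaluate_sync(task)
    report.print()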
pydantic_evals/evaluators/__init__.py
CHANGED

@@ -7,7 +7,6 @@ from .common import (
     LLMJudge,
     MaxDuration,
     OutputConfig,
-    Python,
 )
 from .context import EvaluatorContext
 from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorFailure, EvaluatorOutput, EvaluatorSpec

@@ -22,7 +21,6 @@ __all__ = (
     'LLMJudge',
     'HasMatchingSpan',
     'OutputConfig',
-    'Python',
     # context
     'EvaluatorContext',
     # evaluator

@@ -34,3 +32,11 @@ __all__ = (
     'EvaluationReason',
     'EvaluationResult',
 )
+
+
+def __getattr__(name: str):
+    if name == 'Python':
+        raise ImportError(
+            'The `Python` evaluator has been removed for security reasons. See https://github.com/pydantic/pydantic-ai/pull/2808 for more details and a workaround.'
+        )
+    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
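Rather than letting `from pydantic_evals.evaluators import Python` fail with a generic `cannot import name` error, the module adds a module-level `__getattr__` (PEP 562), which Python consults whenever normal module attribute lookup fails. A minimal sketch of the pattern with hypothetical names (`mymodule`, `REMOVED`):

    # mymodule.py: a hypothetical module using the PEP 562 hook.
    KEPT = 'still here'


    def __getattr__(name: str):
        # Invoked only when normal attribute lookup on the module fails,
        # so existing names like KEPT are unaffected.
        if name == 'REMOVED':
            raise ImportError("'REMOVED' was removed; see the changelog for a workaround.")
        raise AttributeError(f'module {__name__!r} has no attribute {name!r}')

Both `mymodule.REMOVED` and `from mymodule import REMOVED` surface the explanatory `ImportError`, while any other missing name still raises the usual `AttributeError`.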
pydantic_evals/evaluators/_run_evaluator.py
CHANGED

@@ -2,16 +2,16 @@ from __future__ import annotations
 
 import traceback
 from collections.abc import Mapping
-from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
-import logfire_api
 from pydantic import (
     TypeAdapter,
     ValidationError,
 )
 from typing_extensions import TypeVar
 
+from pydantic_evals._utils import logfire_span
+
 from .context import EvaluatorContext
 from .evaluator import (
     EvaluationReason,

@@ -25,8 +25,6 @@ from .evaluator import (
 if TYPE_CHECKING:
     from pydantic_ai.retries import RetryConfig
 
-_logfire = logfire_api.Logfire(otel_scope='pydantic-evals')
-logfire_api.add_non_user_code_prefix(Path(__file__).parent.absolute())
 
 InputsT = TypeVar('InputsT', default=Any, contravariant=True)
 OutputT = TypeVar('OutputT', default=Any, contravariant=True)

@@ -62,7 +60,7 @@ async def run_evaluator(
         evaluate = tenacity_retry(**retry)(evaluate)
 
     try:
-        with _logfire.span(
+        with logfire_span(
             'evaluator: {evaluator_name}',
             evaluator_name=evaluator.get_default_evaluation_name(),
         ):
pydantic_evals/evaluators/common.py
CHANGED

@@ -21,7 +21,6 @@ __all__ = (
     'MaxDuration',
     'LLMJudge',
     'HasMatchingSpan',
-    'Python',
     'OutputConfig',
 )
 

@@ -268,22 +267,6 @@ class HasMatchingSpan(Evaluator[object, object, object]):
         return ctx.span_tree.any(self.query)
 
 
-# TODO: Consider moving this to docs rather than providing it with the library, given the security implications
-@dataclass(repr=False)
-class Python(Evaluator[object, object, object]):
-    """The output of this evaluator is the result of evaluating the provided Python expression.
-
-    ***WARNING***: this evaluator runs arbitrary Python code, so you should ***NEVER*** use it with untrusted inputs.
-    """
-
-    expression: str
-    evaluation_name: str | None = field(default=None)
-
-    def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOutput:
-        # Evaluate the condition, exposing access to the evaluator context as `ctx`.
-        return eval(self.expression, {'ctx': ctx})
-
-
 DEFAULT_EVALUATORS: tuple[type[Evaluator[object, object, object]], ...] = (
     Equals,
     EqualsExpected,

@@ -292,5 +275,12 @@ DEFAULT_EVALUATORS: tuple[type[Evaluator[object, object, object]], ...] = (
     MaxDuration,
     LLMJudge,
     HasMatchingSpan,
-    # Python, # not included by default for security reasons
 )
+
+
+def __getattr__(name: str):
+    if name == 'Python':
+        raise ImportError(
+            'The `Python` evaluator has been removed for security reasons. See https://github.com/pydantic/pydantic-ai/pull/2808 for more details and a workaround.'
+        )
+    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')
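Since the deleted class appears in full above, users who accept the risk can keep a local copy instead of importing it from the library. A sketch of such a copy, mirroring the removed code and using the `Evaluator`, `EvaluatorContext`, and `EvaluatorOutput` names exported by `pydantic_evals.evaluators` (the `LocalPython` name is illustrative, not part of the package):

    from dataclasses import dataclass, field

    from pydantic_evals.evaluators import Evaluator, EvaluatorContext, EvaluatorOutput


    @dataclass(repr=False)
    class LocalPython(Evaluator[object, object, object]):
        """Local replacement for the removed Python evaluator.

        WARNING: eval() executes arbitrary code; only use expressions you wrote yourself.
        """

        expression: str
        evaluation_name: str | None = field(default=None)

        def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOutput:
            # Expose the evaluator context to the expression as `ctx`,
            # exactly as the removed evaluator did.
            return eval(self.expression, {'ctx': ctx})

The same caveat from the removed docstring applies: `eval` runs arbitrary code, so never pass it untrusted input.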
{pydantic_evals-1.0.0b1.dist-info → pydantic_evals-1.0.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.0.0b1
+Version: 1.0.1
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai

@@ -9,7 +9,7 @@ Project-URL: Changelog, https://github.com/pydantic/pydantic-ai/releases
 Author-email: Samuel Colvin <samuel@pydantic.dev>, Marcelo Trylesinski <marcelotryle@gmail.com>, David Montague <david@pydantic.dev>, Alex Hall <alex@pydantic.dev>, Douwe Maan <douwe@pydantic.dev>
 License-Expression: MIT
 License-File: LICENSE
-Classifier: Development Status :: 4 - Beta
+Classifier: Development Status :: 5 - Production/Stable
 Classifier: Environment :: Console
 Classifier: Environment :: MacOS X
 Classifier: Intended Audience :: Developers

@@ -29,9 +29,8 @@ Classifier: Topic :: Internet
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
-Requires-Dist: eval-type-backport>=0; python_version < '3.11'
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.0.0b1
+Requires-Dist: pydantic-ai-slim==1.0.1
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
{pydantic_evals-1.0.0b1.dist-info → pydantic_evals-1.0.1.dist-info}/RECORD
CHANGED

@@ -1,11 +1,11 @@
 pydantic_evals/__init__.py,sha256=OKRbfhdc8UZPzrPJMZUQwvzIxLhXmEZxz1ZuD921fy4,839
-pydantic_evals/_utils.py,sha256=
-pydantic_evals/dataset.py,sha256=
+pydantic_evals/_utils.py,sha256=1muGTc2zqjwxqngz6quRSLoZM88onjp0Xgt-a9n2aPQ,4111
+pydantic_evals/dataset.py,sha256=8rcw_hJb9H01M22NInn-2Pi27xtZgfADUboMCW-nrj4,48468
 pydantic_evals/generation.py,sha256=Yd1rfbsDjjBBHDk-1KDu48hlITjM2-74rTnPBD_sqbA,3494
 pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pydantic_evals/evaluators/__init__.py,sha256=
-pydantic_evals/evaluators/_run_evaluator.py,sha256=
-pydantic_evals/evaluators/common.py,sha256=
+pydantic_evals/evaluators/__init__.py,sha256=E_JT6o96Ef-oS_IZ1Hyy95NRLwz7EOHewp-o13IdXEM,1032
+pydantic_evals/evaluators/_run_evaluator.py,sha256=uGmH67gCTeF9BSprCiBC4DtKEpKLrKYaXgsAQiCbCLY,3630
+pydantic_evals/evaluators/common.py,sha256=Cc9RMsSf5P2gcq3IDwmZxgfo1xnu7HEehiAS2Hgibz4,11609
 pydantic_evals/evaluators/context.py,sha256=mTxcm0Hvkev9htpqwoJMCJIqEYBtY5g86SXcjoqQxHY,3884
 pydantic_evals/evaluators/evaluator.py,sha256=ylfKRytoM9KzbZkSsFkEEnsg4XhK4usuyy1Rb1emoPo,11474
 pydantic_evals/evaluators/llm_as_a_judge.py,sha256=i20c506j9f5J2VMzPeUky677lfGq27xaZ7xcYIFltiA,9599

@@ -17,7 +17,7 @@ pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD
 pydantic_evals/otel/span_tree.py,sha256=RzX4VGpEqc2QUhkyxMTXtBRo5yHHO1c0hI7QJJuiXPU,23043
 pydantic_evals/reporting/__init__.py,sha256=4S8q_KfOflQlJYTISWM1Vp6_wPDHOMjbh9mSc3dU4-8,51562
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-1.0.0b1.dist-info/METADATA,sha256=
-pydantic_evals-1.0.0b1.dist-info/WHEEL,sha256=
-pydantic_evals-1.0.0b1.dist-info/licenses/LICENSE,sha256=
-pydantic_evals-1.0.0b1.dist-info/RECORD,,
+pydantic_evals-1.0.1.dist-info/METADATA,sha256=sXCoSsXg3p6Ww1Lccq8pbidUSkkzxr2D36MeL4Ir4dc,7844
+pydantic_evals-1.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-1.0.1.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-1.0.1.dist-info/RECORD,,
{pydantic_evals-1.0.0b1.dist-info → pydantic_evals-1.0.1.dist-info}/WHEEL
File without changes

{pydantic_evals-1.0.0b1.dist-info → pydantic_evals-1.0.1.dist-info}/licenses/LICENSE
File without changes