pydantic-evals 1.0.10__tar.gz → 1.0.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/PKG-INFO +2 -2
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/dataset.py +18 -5
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/llm_as_a_judge.py +1 -2
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/generation.py +2 -1
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/.gitignore +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/LICENSE +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/README.md +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/__init__.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/_utils.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/__init__.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/common.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/context.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/evaluator.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/spec.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/otel/__init__.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/otel/_context_subtree.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/otel/_errors.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/otel/span_tree.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/py.typed +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/reporting/__init__.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/reporting/render_numbers.py +0 -0
- {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pyproject.toml +0 -0
{pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.0.10
+Version: 1.0.12
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai

@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.0.10
+Requires-Dist: pydantic-ai-slim==1.0.12
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
{pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/dataset.py

@@ -333,6 +333,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
+        retry_task: RetryConfig | None = None,
+        retry_evaluators: RetryConfig | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -346,12 +348,21 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
             progress: Whether to show a progress bar for the evaluation. Defaults to True.
+            retry_task: Optional retry configuration for the task execution.
+            retry_evaluators: Optional retry configuration for evaluator execution.

         Returns:
             A report containing the results of the evaluation.
         """
         return get_event_loop().run_until_complete(
-            self.evaluate(task, name=name, max_concurrency=max_concurrency, progress=progress)
+            self.evaluate(
+                task,
+                name=name,
+                max_concurrency=max_concurrency,
+                progress=progress,
+                retry_task=retry_task,
+                retry_evaluators=retry_evaluators,
+            )
         )

     def add_case(
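The new keyword arguments are simply threaded through from the synchronous wrapper to `self.evaluate`. Below is a minimal usage sketch, assuming the public `Case`/`Dataset` API of pydantic-evals; the task and case are made up, and both retry settings are left at `None` because the concrete shape of `RetryConfig` is not visible in this diff.

```python
# Minimal sketch (assumed usage, not from this diff): the dataset, task and case
# names are illustrative. retry_task / retry_evaluators default to None; the
# concrete RetryConfig type is not shown in the diff, so none is constructed here.
from pydantic_evals import Case, Dataset


async def upper_task(inputs: str) -> str:
    # Stand-in for a real (possibly LLM-backed) task under evaluation.
    return inputs.upper()


dataset = Dataset(cases=[Case(name='hello', inputs='hello', expected_output='HELLO')])

# New in 1.0.12: the sync wrapper accepts retry_task / retry_evaluators and
# forwards them to Dataset.evaluate().
report = dataset.evaluate_sync(upper_task, retry_task=None, retry_evaluators=None)
report.print()
```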
@@ -886,12 +897,14 @@ async def _run_task(
     # That way users can customize this logic. We'd default to a function that does the current thing but also
     # allow `None` to disable it entirely.
     for node in span_tree:
-        if node.attributes.get('gen_ai.operation.name') == 'chat':
-            task_run.increment_metric('requests', 1)
         for k, v in node.attributes.items():
-            if not isinstance(v, int | float):
+            if k == 'gen_ai.operation.name' and v == 'chat':
+                task_run.increment_metric('requests', 1)
+            elif not isinstance(v, int | float):
                 continue
-            if k.startswith('gen_ai.usage.details.'):
+            elif k == 'operation.cost':
+                task_run.increment_metric('cost', v)
+            elif k.startswith('gen_ai.usage.details.'):
                 task_run.increment_metric(k.removeprefix('gen_ai.usage.details.'), v)
             elif k.startswith('gen_ai.usage.'):
                 task_run.increment_metric(k.removeprefix('gen_ai.usage.'), v)
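This change folds the per-node request counting into the attribute loop and adds a `cost` metric sourced from the `operation.cost` span attribute. As a standalone illustration, here is the same branch logic extracted into a hypothetical helper; `collect_metrics` is written for this example and is not part of pydantic-evals.

```python
# Standalone sketch of the attribute-to-metric mapping added above.
def collect_metrics(attributes: dict[str, object]) -> dict[str, int | float]:
    metrics: dict[str, int | float] = {}

    def increment(name: str, amount: int | float) -> None:
        metrics[name] = metrics.get(name, 0) + amount

    for k, v in attributes.items():
        if k == 'gen_ai.operation.name' and v == 'chat':
            increment('requests', 1)
        elif not isinstance(v, int | float):
            continue
        elif k == 'operation.cost':
            increment('cost', v)
        elif k.startswith('gen_ai.usage.details.'):
            increment(k.removeprefix('gen_ai.usage.details.'), v)
        elif k.startswith('gen_ai.usage.'):
            increment(k.removeprefix('gen_ai.usage.'), v)
    return metrics


assert collect_metrics({
    'gen_ai.operation.name': 'chat',
    'gen_ai.usage.input_tokens': 12,
    'operation.cost': 0.003,
}) == {'requests': 1, 'input_tokens': 12, 'cost': 0.003}
```

Placing the non-numeric `isinstance` check as an `elif` after the `gen_ai.operation.name` branch is what lets the string-valued `'chat'` attribute still count toward `requests`.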
{pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/llm_as_a_judge.py

@@ -7,8 +7,7 @@ from typing import Any
 from pydantic import BaseModel, Field
 from pydantic_core import to_json

-from pydantic_ai import Agent, models
-from pydantic_ai.messages import MultiModalContent, UserContent
+from pydantic_ai import Agent, MultiModalContent, UserContent, models
 from pydantic_ai.settings import ModelSettings

 __all__ = (
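The only change to this module is the import location: `MultiModalContent` and `UserContent` now come from the `pydantic_ai` package root rather than `pydantic_ai.messages`. Code that mirrors these imports would update the same way:

```python
# Import style used by llm_as_a_judge.py as of this release:
from pydantic_ai import Agent, MultiModalContent, UserContent, models

# Previously (1.0.10) the content types were imported from the messages submodule:
# from pydantic_ai.messages import MultiModalContent, UserContent
```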
{pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/generation.py

@@ -59,7 +59,8 @@ async def generate_dataset(
     """
     output_schema = dataset_type.model_json_schema_with_evaluators(custom_evaluator_types)

-    # TODO
+    # TODO: Use `output_type=StructuredDict(output_schema)` (and `from_dict` below) once https://github.com/pydantic/pydantic/issues/12145
+    # is fixed and `StructuredDict` no longer needs to use `InlineDefsJsonSchemaTransformer`.
     agent = Agent(
         model,
         system_prompt=(