pydantic-evals 1.3.0.tar.gz → 1.4.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pydantic-evals has been flagged as potentially problematic by the registry.
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/PKG-INFO +2 -2
- pydantic_evals-1.4.0/pydantic_evals/__init__.py +16 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/evaluators/llm_as_a_judge.py +1 -1
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/reporting/__init__.py +3 -3
- pydantic_evals-1.3.0/pydantic_evals/__init__.py +0 -19
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/.gitignore +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/LICENSE +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/README.md +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/_utils.py +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/dataset.py +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/evaluators/__init__.py +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/evaluators/common.py +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/evaluators/context.py +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/evaluators/evaluator.py +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/evaluators/spec.py +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/generation.py +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/otel/__init__.py +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/otel/_context_subtree.py +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/otel/_errors.py +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/otel/span_tree.py +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/py.typed +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/reporting/render_numbers.py +0 -0
- {pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pyproject.toml +0 -0
{pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.3.0
+Version: 1.4.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.3.0
+Requires-Dist: pydantic-ai-slim==1.4.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
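The only metadata changes are the version bump and the matching pin of `pydantic-ai-slim`. If you want to confirm which pair is installed in an environment, a small standard-library check (illustrative only, not part of this release):

```python
from importlib.metadata import version

# Both distributions should report the same version when installed from this release.
print(version('pydantic-evals'), version('pydantic-ai-slim'))
```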
pydantic_evals-1.4.0/pydantic_evals/__init__.py

@@ -0,0 +1,16 @@
+"""A toolkit for evaluating the execution of arbitrary "stochastic functions", such as LLM calls.
+
+This package provides functionality for:
+- Creating and loading test datasets with structured inputs and outputs
+- Evaluating model performance using various metrics and evaluators
+- Generating reports for evaluation results
+"""
+
+from .dataset import Case, Dataset, increment_eval_metric, set_eval_attribute
+
+__all__ = (
+    'Case',
+    'Dataset',
+    'increment_eval_metric',
+    'set_eval_attribute',
+)
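The net effect of the new `__init__.py` is that `increment_eval_metric` and `set_eval_attribute` are re-exported at the package root alongside `Case` and `Dataset`. A minimal sketch of how the new top-level imports might be used; the task function and case values are illustrative assumptions, not taken from this release:

```python
from pydantic_evals import Case, Dataset, increment_eval_metric, set_eval_attribute


async def classify(text: str) -> str:
    # Record an attribute and a numeric metric for the case currently being evaluated.
    set_eval_attribute('mode', 'sketch')
    increment_eval_metric('llm_calls', 1)
    return 'positive' if 'good' in text else 'negative'


dataset = Dataset(cases=[Case(name='simple', inputs='a good day', expected_output='positive')])
report = dataset.evaluate_sync(classify)
```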
{pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/evaluators/llm_as_a_judge.py

@@ -201,7 +201,7 @@ async def judge_output_expected(
     ).output
 
 
-def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:
+def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:
     """Set the default model used for judging.
 
     This model is used if `None` is passed to the `model` argument of `judge_output` and `judge_input_output`.
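As the signature shows, `set_default_judge_model` accepts either a `Model` instance or a known model name string, and the default is used whenever `None` is passed as the judge model. A hedged sketch of setting it before running LLM-judge evaluators; the model name is an illustrative assumption:

```python
from pydantic_evals.evaluators.llm_as_a_judge import set_default_judge_model

# Any model name accepted by pydantic-ai's KnownModelName should work here;
# 'openai:gpt-4o' is only an illustrative choice.
set_default_judge_model('openai:gpt-4o')
```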
{pydantic_evals-1.3.0 → pydantic_evals-1.4.0}/pydantic_evals/reporting/__init__.py

@@ -289,12 +289,12 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
         include_reasons: bool = False,
-    ) -> None:
+    ) -> None:
         """Print this report to the console, optionally comparing it to a baseline report.
 
         If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
         """
-        if console is None:
+        if console is None:  # pragma: no branch
             console = Console(width=width)
 
         table = self.console_table(
@@ -318,7 +318,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             include_reasons=include_reasons,
         )
         console.print(table)
-        if include_errors and self.failures:
+        if include_errors and self.failures:  # pragma: no cover
             failures_table = self.failures_table(
                 include_input=include_input,
                 include_metadata=include_metadata,
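The two `# pragma` comments only affect coverage measurement; the runtime behaviour of `EvaluationReport.print` is unchanged. For context, a hedged sketch of calling it with some of the keyword arguments visible in these hunks; the toy dataset and task are assumptions for illustration:

```python
from rich.console import Console

from pydantic_evals import Case, Dataset


async def add_one(x: int) -> int:
    return x + 1


dataset = Dataset(cases=[Case(name='one', inputs=1, expected_output=2)])
report = dataset.evaluate_sync(add_one)

report.print(
    console=Console(width=120),  # if omitted, print() builds its own Console(width=width)
    include_reasons=True,        # keyword visible in the signature hunk above
    include_input=True,
    include_metadata=True,
)
```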
pydantic_evals-1.3.0/pydantic_evals/__init__.py

@@ -1,19 +0,0 @@
-"""A toolkit for evaluating the execution of arbitrary "stochastic functions", such as LLM calls.
-
-This package provides functionality for:
-- Creating and loading test datasets with structured inputs and outputs
-- Evaluating model performance using various metrics and evaluators
-- Generating reports for evaluation results
-
-TODO(DavidM): Implement serialization of reports for later comparison, and add git hashes etc.
-Note: I made pydantic_ai.evals.reports.EvalReport a BaseModel specifically to make this easier
-TODO(DavidM): Add commit hash, timestamp, and other metadata to reports (like pytest-speed does), possibly in a dedicated struct
-TODO(DavidM): Implement a CLI with some pytest-like filtering API to make it easier to run only specific cases
-"""
-
-from .dataset import Case, Dataset
-
-__all__ = (
-    'Case',
-    'Dataset',
-)