pydantic-evals 1.0.10__tar.gz → 1.0.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pydantic-evals might be problematic.

Files changed (24)
  1. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/PKG-INFO +2 -2
  2. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/dataset.py +18 -5
  3. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/llm_as_a_judge.py +1 -2
  4. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/generation.py +2 -1
  5. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/.gitignore +0 -0
  6. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/LICENSE +0 -0
  7. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/README.md +0 -0
  8. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/__init__.py +0 -0
  9. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/_utils.py +0 -0
  10. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/__init__.py +0 -0
  11. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
  12. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/common.py +0 -0
  13. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/context.py +0 -0
  14. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/evaluator.py +0 -0
  15. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/spec.py +0 -0
  16. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/otel/__init__.py +0 -0
  17. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
  18. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/otel/_context_subtree.py +0 -0
  19. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/otel/_errors.py +0 -0
  20. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/otel/span_tree.py +0 -0
  21. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/py.typed +0 -0
  22. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/reporting/__init__.py +0 -0
  23. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/reporting/render_numbers.py +0 -0
  24. {pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pyproject.toml +0 -0
{pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.0.10
+Version: 1.0.12
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.0.10
+Requires-Dist: pydantic-ai-slim==1.0.12
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4

{pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/dataset.py

@@ -333,6 +333,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
+        retry_task: RetryConfig | None = None,
+        retry_evaluators: RetryConfig | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -346,12 +348,21 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
             progress: Whether to show a progress bar for the evaluation. Defaults to True.
+            retry_task: Optional retry configuration for the task execution.
+            retry_evaluators: Optional retry configuration for evaluator execution.

         Returns:
             A report containing the results of the evaluation.
         """
         return get_event_loop().run_until_complete(
-            self.evaluate(task, name=name, max_concurrency=max_concurrency, progress=progress)
+            self.evaluate(
+                task,
+                name=name,
+                max_concurrency=max_concurrency,
+                progress=progress,
+                retry_task=retry_task,
+                retry_evaluators=retry_evaluators,
+            )
         )

     def add_case(
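
The new `retry_task` and `retry_evaluators` keyword arguments expose retry configuration for the task and for evaluator execution respectively, and both default to None so existing callers are unaffected. Below is a minimal usage sketch assuming a trivial Dataset and Case; the import path and construction of a concrete RetryConfig are not shown in this diff, so the placeholder None values and the hypothetical `my_retry_config` mentioned in the comments are assumptions.

from pydantic_evals import Case, Dataset


async def my_task(inputs: str) -> str:
    # Hypothetical task under evaluation; replace with real, possibly flaky, logic.
    return inputs.upper()


dataset = Dataset(cases=[Case(name='upper', inputs='hello', expected_output='HELLO')])

# Both retry arguments default to None, preserving the pre-1.0.12 behavior of no retries.
# Passing a RetryConfig (e.g. a hypothetical `my_retry_config`) would retry transient
# failures of the task and/or of each evaluator.
report = dataset.evaluate_sync(
    my_task,
    max_concurrency=4,
    retry_task=None,
    retry_evaluators=None,
)
report.print()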

@@ -886,12 +897,14 @@ async def _run_task(
         # That way users can customize this logic. We'd default to a function that does the current thing but also
         # allow `None` to disable it entirely.
         for node in span_tree:
-            if node.attributes.get('gen_ai.operation.name') == 'chat':
-                task_run.increment_metric('requests', 1)
             for k, v in node.attributes.items():
-                if not isinstance(v, int | float):
+                if k == 'gen_ai.operation.name' and v == 'chat':
+                    task_run.increment_metric('requests', 1)
+                elif not isinstance(v, int | float):
                     continue
-                if k.startswith('gen_ai.usage.details.'):
+                elif k == 'operation.cost':
+                    task_run.increment_metric('cost', v)
+                elif k.startswith('gen_ai.usage.details.'):
                     task_run.increment_metric(k.removeprefix('gen_ai.usage.details.'), v)
                 elif k.startswith('gen_ai.usage.'):
                     task_run.increment_metric(k.removeprefix('gen_ai.usage.'), v)
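
This hunk folds the per-span request counting into the attribute loop and adds a 'cost' metric accumulated from the 'operation.cost' span attribute. Below is a standalone sketch of the same folding logic, run over a plain dict of attributes rather than a real span tree; `fold_span_attributes` and the `metrics` defaultdict are illustrative stand-ins for `task_run.increment_metric`, not library API.

from collections import defaultdict


def fold_span_attributes(attributes: dict[str, object], metrics: defaultdict[str, float]) -> None:
    for k, v in attributes.items():
        if k == 'gen_ai.operation.name' and v == 'chat':
            metrics['requests'] += 1  # one LLM request per chat span
        elif not isinstance(v, int | float):
            continue  # only numeric attributes can become metrics
        elif k == 'operation.cost':
            metrics['cost'] += v  # new in 1.0.12: accumulate per-span cost
        elif k.startswith('gen_ai.usage.details.'):
            metrics[k.removeprefix('gen_ai.usage.details.')] += v
        elif k.startswith('gen_ai.usage.'):
            metrics[k.removeprefix('gen_ai.usage.')] += v


metrics: defaultdict[str, float] = defaultdict(float)
fold_span_attributes(
    {'gen_ai.operation.name': 'chat', 'operation.cost': 0.0012, 'gen_ai.usage.input_tokens': 57},
    metrics,
)
# metrics == {'requests': 1.0, 'cost': 0.0012, 'input_tokens': 57.0}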

{pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/evaluators/llm_as_a_judge.py

@@ -7,8 +7,7 @@ from typing import Any
 from pydantic import BaseModel, Field
 from pydantic_core import to_json

-from pydantic_ai import Agent, models
-from pydantic_ai.messages import MultiModalContent, UserContent
+from pydantic_ai import Agent, MultiModalContent, UserContent, models
 from pydantic_ai.settings import ModelSettings

 __all__ = (

{pydantic_evals-1.0.10 → pydantic_evals-1.0.12}/pydantic_evals/generation.py

@@ -59,7 +59,8 @@ async def generate_dataset(
     """
     output_schema = dataset_type.model_json_schema_with_evaluators(custom_evaluator_types)

-    # TODO(DavidM): Update this once we add better response_format and/or ResultTool support to Pydantic AI
+    # TODO: Use `output_type=StructuredDict(output_schema)` (and `from_dict` below) once https://github.com/pydantic/pydantic/issues/12145
+    # is fixed and `StructuredDict` no longer needs to use `InlineDefsJsonSchemaTransformer`.
     agent = Agent(
         model,
         system_prompt=(