pydantic-evals 1.0.11__py3-none-any.whl → 1.0.13__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of pydantic-evals might be problematic.

pydantic_evals/dataset.py CHANGED
@@ -98,6 +98,7 @@ class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forb
 
     # $schema is included to avoid validation fails from the `$schema` key, see `_add_json_schema` below for context
     json_schema_path: str | None = Field(default=None, alias='$schema')
+    name: str | None = None
     cases: list[_CaseModel[InputsT, OutputT, MetadataT]]
     evaluators: list[EvaluatorSpec] = Field(default_factory=list)
 
@@ -218,6 +219,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
     ```
     """
 
+    name: str | None = None
+    """Optional name of the dataset."""
     cases: list[Case[InputsT, OutputT, MetadataT]]
     """List of test cases in the dataset."""
     evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = []
@@ -226,12 +229,14 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
     def __init__(
         self,
         *,
+        name: str | None = None,
         cases: Sequence[Case[InputsT, OutputT, MetadataT]],
         evaluators: Sequence[Evaluator[InputsT, OutputT, MetadataT]] = (),
     ):
         """Initialize a new dataset with test cases and optional evaluators.
 
         Args:
+            name: Optional name for the dataset.
             cases: Sequence of test cases to include in the dataset.
             evaluators: Optional sequence of evaluators to apply to all cases in the dataset.
         """
@@ -244,10 +249,12 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             case_names.add(case.name)
 
         super().__init__(
+            name=name,
             cases=cases,
             evaluators=list(evaluators),
         )
 
+    # TODO in v2: Make everything not required keyword-only
     async def evaluate(
         self,
         task: Callable[[InputsT], Awaitable[OutputT]] | Callable[[InputsT], OutputT],
@@ -256,6 +263,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         progress: bool = True,
         retry_task: RetryConfig | None = None,
         retry_evaluators: RetryConfig | None = None,
+        *,
+        task_name: str | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.
 
@@ -265,28 +274,38 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         Args:
             task: The task to evaluate. This should be a callable that takes the inputs of the case
                 and returns the output.
-            name: The name of the task being evaluated, this is used to identify the task in the report.
-                If omitted, the name of the task function will be used.
+            name: The name of the experiment being run, this is used to identify the experiment in the report.
+                If omitted, the task_name will be used; if that is not specified, the name of the task function is used.
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
             progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
             retry_task: Optional retry configuration for the task execution.
             retry_evaluators: Optional retry configuration for evaluator execution.
+            task_name: Optional override to the name of the task being executed, otherwise the name of the task
+                function will be used.
 
         Returns:
            A report containing the results of the evaluation.
        """
-        name = name or get_unwrapped_function_name(task)
+        task_name = task_name or get_unwrapped_function_name(task)
+        name = name or task_name
        total_cases = len(self.cases)
        progress_bar = Progress() if progress else None
 
        limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()
 
        with (
-            logfire_span('evaluate {name}', name=name, n_cases=len(self.cases)) as eval_span,
+            logfire_span(
+                'evaluate {name}',
+                name=name,
+                task_name=task_name,
+                dataset_name=self.name,
+                n_cases=len(self.cases),
+                **{'gen_ai.operation.name': 'experiment'},  # pyright: ignore[reportArgumentType]
+            ) as eval_span,
            progress_bar or nullcontext(),
        ):
-            task_id = progress_bar.add_task(f'Evaluating {name}', total=total_cases) if progress_bar else None
+            task_id = progress_bar.add_task(f'Evaluating {task_name}', total=total_cases) if progress_bar else None
 
            async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name: str):
                async with limiter:
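
The `evaluate` hunk above separates the experiment name from the task name: the new keyword-only `task_name` falls back to the task function's name, `name` falls back to `task_name`, and both (plus `dataset_name`) are attached to the logfire span. A hedged sketch of calling the updated method (the task and case are placeholders, not from this diff):

```python
import asyncio

from pydantic_evals import Case, Dataset

async def answer(question: str) -> str:  # stand-in task for illustration
    return 'Paris'

dataset = Dataset(
    name='capitals',
    cases=[Case(name='france', inputs='What is the capital of France?', expected_output='Paris')],
)

async def main() -> None:
    # `name` labels the experiment in the report and span; `task_name` (new, keyword-only)
    # overrides the task label, which would otherwise default to 'answer'.
    report = await dataset.evaluate(answer, name='capitals-run-1', task_name='capital-answerer')
    print(report)

asyncio.run(main())
```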
@@ -333,6 +352,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         name: str | None = None,
         max_concurrency: int | None = None,
         progress: bool = True,
+        retry_task: RetryConfig | None = None,
+        retry_evaluators: RetryConfig | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.
 
@@ -346,12 +367,21 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
             progress: Whether to show a progress bar for the evaluation. Defaults to True.
+            retry_task: Optional retry configuration for the task execution.
+            retry_evaluators: Optional retry configuration for evaluator execution.
 
         Returns:
             A report containing the results of the evaluation.
         """
         return get_event_loop().run_until_complete(
-            self.evaluate(task, name=name, max_concurrency=max_concurrency, progress=progress)
+            self.evaluate(
+                task,
+                task_name=name,
+                max_concurrency=max_concurrency,
+                progress=progress,
+                retry_task=retry_task,
+                retry_evaluators=retry_evaluators,
+            )
         )
 
     def add_case(
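
`evaluate_sync` now forwards `retry_task` and `retry_evaluators` to `evaluate`, and passes its `name` argument through as `task_name` (so the experiment name is derived from it). A brief sketch with the retry arguments left at their defaults, since constructing a `RetryConfig` isn't shown in this diff (the task is a placeholder):

```python
from pydantic_evals import Case, Dataset

def classify(text: str) -> str:  # synchronous tasks are accepted as well
    return 'positive'

dataset = Dataset(cases=[Case(name='simple', inputs='great product', expected_output='positive')])

# `name` here becomes the task name, and the experiment name falls back to it;
# retry_task/retry_evaluators default to None, i.e. no retries.
report = dataset.evaluate_sync(classify, name='sentiment', max_concurrency=4, progress=False)
print(report)
```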
@@ -463,7 +493,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
 
         raw = Path(path).read_text()
         try:
-            return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types)
+            return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types, default_name=path.stem)
         except ValidationError as e:  # pragma: no cover
             raise ValueError(f'{path} contains data that does not match the schema for {cls.__name__}:\n{e}.') from e
 
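
`from_file` now passes the file stem as `default_name`, so a dataset loaded from disk gets a name even when the serialized data doesn't set one. A hedged sketch (file name and contents are invented for illustration):

```python
from pathlib import Path

from pydantic_evals import Dataset

Path('capitals.yaml').write_text(
    """\
cases:
  - name: france
    inputs: What is the capital of France?
    expected_output: Paris
"""
)

dataset = Dataset.from_file('capitals.yaml')
print(dataset.name)  # 'capitals' -- taken from the file stem, since the YAML sets no name
```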
@@ -473,6 +503,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         contents: str,
         fmt: Literal['yaml', 'json'] = 'yaml',
         custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
+        *,
+        default_name: str | None = None,
     ) -> Self:
         """Load a dataset from a string.
 
@@ -481,6 +513,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             fmt: Format of the content. Must be either 'yaml' or 'json'.
             custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
                 These are additional evaluators beyond the default ones.
+            default_name: Default name of the dataset, to be used if not specified in the serialized contents.
 
         Returns:
             A new Dataset instance parsed from the string.
@@ -490,17 +523,19 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         """
         if fmt == 'yaml':
             loaded = yaml.safe_load(contents)
-            return cls.from_dict(loaded, custom_evaluator_types)
+            return cls.from_dict(loaded, custom_evaluator_types, default_name=default_name)
         else:
             dataset_model_type = cls._serialization_type()
             dataset_model = dataset_model_type.model_validate_json(contents)
-            return cls._from_dataset_model(dataset_model, custom_evaluator_types)
+            return cls._from_dataset_model(dataset_model, custom_evaluator_types, default_name)
 
     @classmethod
     def from_dict(
         cls,
         data: dict[str, Any],
         custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
+        *,
+        default_name: str | None = None,
     ) -> Self:
         """Load a dataset from a dictionary.
 
@@ -508,6 +543,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             data: Dictionary representation of the dataset.
             custom_evaluator_types: Custom evaluator classes to use when deserializing the dataset.
                 These are additional evaluators beyond the default ones.
+            default_name: Default name of the dataset, to be used if not specified in the data.
 
         Returns:
             A new Dataset instance created from the dictionary.
@@ -517,19 +553,21 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         """
         dataset_model_type = cls._serialization_type()
         dataset_model = dataset_model_type.model_validate(data)
-        return cls._from_dataset_model(dataset_model, custom_evaluator_types)
+        return cls._from_dataset_model(dataset_model, custom_evaluator_types, default_name)
 
     @classmethod
     def _from_dataset_model(
         cls,
         dataset_model: _DatasetModel[InputsT, OutputT, MetadataT],
         custom_evaluator_types: Sequence[type[Evaluator[InputsT, OutputT, MetadataT]]] = (),
+        default_name: str | None = None,
     ) -> Self:
         """Create a Dataset from a _DatasetModel.
 
         Args:
             dataset_model: The _DatasetModel to convert.
             custom_evaluator_types: Custom evaluator classes to register for deserialization.
+            default_name: Default name of the dataset, to be used if the value is `None` in the provided model.
 
         Returns:
             A new Dataset instance created from the _DatasetModel.
@@ -566,7 +604,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             cases.append(row)
         if errors:
             raise ExceptionGroup(f'{len(errors)} error(s) loading evaluators from registry', errors[:3])
-        result = cls(cases=cases)
+        result = cls(name=dataset_model.name, cases=cases)
+        if result.name is None:
+            result.name = default_name
         result.evaluators = dataset_evaluators
         return result
 
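
Taken together, the loading hunks establish a simple precedence for the dataset name: a `name` in the serialized data wins, otherwise `default_name` (e.g. the file stem from `from_file`) is used, otherwise the name stays `None`. A small illustration via `from_dict` (the data is made up):

```python
from pydantic_evals import Dataset

data = {'cases': [{'name': 'france', 'inputs': 'What is the capital of France?', 'expected_output': 'Paris'}]}

unnamed = Dataset.from_dict(data, default_name='capitals')
print(unnamed.name)  # 'capitals' -- no name in the data, so the default is used

named = Dataset.from_dict({'name': 'geo-evals', **data}, default_name='capitals')
print(named.name)  # 'geo-evals' -- a serialized name takes precedence over the default
```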
@@ -886,12 +926,14 @@ async def _run_task(
     # That way users can customize this logic. We'd default to a function that does the current thing but also
     # allow `None` to disable it entirely.
     for node in span_tree:
-        if node.attributes.get('gen_ai.operation.name') == 'chat':
-            task_run.increment_metric('requests', 1)
         for k, v in node.attributes.items():
-            if not isinstance(v, int | float):
+            if k == 'gen_ai.operation.name' and v == 'chat':
+                task_run.increment_metric('requests', 1)
+            elif not isinstance(v, int | float):
                 continue
-            if k.startswith('gen_ai.usage.details.'):
+            elif k == 'operation.cost':
+                task_run.increment_metric('cost', v)
+            elif k.startswith('gen_ai.usage.details.'):
                 task_run.increment_metric(k.removeprefix('gen_ai.usage.details.'), v)
             elif k.startswith('gen_ai.usage.'):
                 task_run.increment_metric(k.removeprefix('gen_ai.usage.'), v)
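
The metric-collection hunk folds the old standalone `requests` check into the attribute loop and adds a `cost` metric sourced from the `operation.cost` attribute. For reference, the new dispatch can be restated as a plain function over a span's attribute dict (an illustrative paraphrase of the loop above, not library code; `metric_increments` is a hypothetical name):

```python
def metric_increments(attributes: dict) -> dict:
    """Map span attributes to metric increments, mirroring the updated loop."""
    increments: dict = {}

    def bump(metric: str, amount) -> None:
        increments[metric] = increments.get(metric, 0) + amount

    for k, v in attributes.items():
        if k == 'gen_ai.operation.name' and v == 'chat':
            bump('requests', 1)  # each chat span counts as one model request
        elif not isinstance(v, (int, float)):
            continue  # only numeric attributes become metrics
        elif k == 'operation.cost':
            bump('cost', v)  # new in 1.0.13: accumulate cost as a metric
        elif k.startswith('gen_ai.usage.details.'):
            bump(k.removeprefix('gen_ai.usage.details.'), v)
        elif k.startswith('gen_ai.usage.'):
            bump(k.removeprefix('gen_ai.usage.'), v)
    return increments

print(metric_increments({
    'gen_ai.operation.name': 'chat',
    'operation.cost': 0.0012,
    'gen_ai.usage.input_tokens': 103,
}))
# {'requests': 1, 'cost': 0.0012, 'input_tokens': 103}
```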
pydantic_evals/generation.py CHANGED
@@ -7,8 +7,7 @@ from typing import Any
 from pydantic import BaseModel, Field
 from pydantic_core import to_json
 
-from pydantic_ai import Agent, models
-from pydantic_ai.messages import MultiModalContent, UserContent
+from pydantic_ai import Agent, MultiModalContent, UserContent, models
 from pydantic_ai.settings import ModelSettings
 
 __all__ = (
@@ -59,7 +59,8 @@ async def generate_dataset(
     """
     output_schema = dataset_type.model_json_schema_with_evaluators(custom_evaluator_types)
 
-    # TODO(DavidM): Update this once we add better response_format and/or ResultTool support to Pydantic AI
+    # TODO: Use `output_type=StructuredDict(output_schema)` (and `from_dict` below) once https://github.com/pydantic/pydantic/issues/12145
+    # is fixed and `StructuredDict` no longer needs to use `InlineDefsJsonSchemaTransformer`.
     agent = Agent(
         model,
         system_prompt=(
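
The only functional change in `pydantic_evals/generation.py` is the import path: `MultiModalContent` and `UserContent` now come from the `pydantic_ai` package root rather than `pydantic_ai.messages`, and the TODO comment is updated. Per this diff, downstream code pinned to the matching pydantic-ai-slim release can rely on the same top-level re-export:

```python
# Re-exports used by generation.py as of this release, per the hunk above.
from pydantic_ai import Agent, MultiModalContent, UserContent, models
```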
pydantic_evals-1.0.11.dist-info/METADATA → pydantic_evals-1.0.13.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.0.11
+Version: 1.0.13
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.0.11
+Requires-Dist: pydantic-ai-slim==1.0.13
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
pydantic_evals-1.0.11.dist-info/RECORD → pydantic_evals-1.0.13.dist-info/RECORD RENAMED
@@ -1,14 +1,14 @@
 pydantic_evals/__init__.py,sha256=OKRbfhdc8UZPzrPJMZUQwvzIxLhXmEZxz1ZuD921fy4,839
 pydantic_evals/_utils.py,sha256=1muGTc2zqjwxqngz6quRSLoZM88onjp0Xgt-a9n2aPQ,4111
-pydantic_evals/dataset.py,sha256=8rcw_hJb9H01M22NInn-2Pi27xtZgfADUboMCW-nrj4,48468
-pydantic_evals/generation.py,sha256=Yd1rfbsDjjBBHDk-1KDu48hlITjM2-74rTnPBD_sqbA,3494
+pydantic_evals/dataset.py,sha256=IfaS65LqHW0654iAxc7bxA7mETo1qUqhFwITS_wFZ5s,50447
+pydantic_evals/generation.py,sha256=ROB8bZ6XKFquWNjWTd3lsXXwsx8-VgSCu_okbovNw9s,3619
 pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pydantic_evals/evaluators/__init__.py,sha256=E_JT6o96Ef-oS_IZ1Hyy95NRLwz7EOHewp-o13IdXEM,1032
 pydantic_evals/evaluators/_run_evaluator.py,sha256=uGmH67gCTeF9BSprCiBC4DtKEpKLrKYaXgsAQiCbCLY,3630
 pydantic_evals/evaluators/common.py,sha256=Cc9RMsSf5P2gcq3IDwmZxgfo1xnu7HEehiAS2Hgibz4,11609
 pydantic_evals/evaluators/context.py,sha256=mTxcm0Hvkev9htpqwoJMCJIqEYBtY5g86SXcjoqQxHY,3884
 pydantic_evals/evaluators/evaluator.py,sha256=ylfKRytoM9KzbZkSsFkEEnsg4XhK4usuyy1Rb1emoPo,11474
-pydantic_evals/evaluators/llm_as_a_judge.py,sha256=i20c506j9f5J2VMzPeUky677lfGq27xaZ7xcYIFltiA,9599
+pydantic_evals/evaluators/llm_as_a_judge.py,sha256=BPdUfEsLPSxN2kJPt3dtJBRCBP46ctRoW_n24WubaB0,9567
 pydantic_evals/evaluators/spec.py,sha256=szAUsY4gb8KK_l1R81HYrByh4Rawrjav7w9835FZg1w,6690
 pydantic_evals/otel/__init__.py,sha256=i2p3vDrOW039N4XM-UkozDhCm0ZmE6ZSs1yV5t03vd0,117
 pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=FrG0pXKjuvTp3bXNdrUyzdPkqm0DQWe4ehkiHaxSvz4,6742
@@ -17,7 +17,7 @@ pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD
 pydantic_evals/otel/span_tree.py,sha256=RzX4VGpEqc2QUhkyxMTXtBRo5yHHO1c0hI7QJJuiXPU,23043
 pydantic_evals/reporting/__init__.py,sha256=4S8q_KfOflQlJYTISWM1Vp6_wPDHOMjbh9mSc3dU4-8,51562
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-1.0.11.dist-info/METADATA,sha256=AQAeNQ19UK65CJKbxg5Igu2TxYIYbVeZaPpyBLE6_FE,7846
-pydantic_evals-1.0.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-pydantic_evals-1.0.11.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
-pydantic_evals-1.0.11.dist-info/RECORD,,
+pydantic_evals-1.0.13.dist-info/METADATA,sha256=D6mudJtjmS-SJhv8xawBYvm3Sd8bRzjL-AKaK9cc4vA,7846
+pydantic_evals-1.0.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-1.0.13.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-1.0.13.dist-info/RECORD,,