pydantic-evals: pydantic_evals-0.3.3-py3-none-any.whl → pydantic_evals-0.3.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of pydantic-evals has been flagged as a potentially problematic release.

pydantic_evals/dataset.py CHANGED
@@ -15,7 +15,7 @@ import sys
  import time
  import warnings
  from collections.abc import Awaitable, Mapping, Sequence
- from contextlib import AsyncExitStack
+ from contextlib import AsyncExitStack, nullcontext
  from contextvars import ContextVar
  from dataclasses import dataclass, field
  from pathlib import Path
@@ -28,6 +28,7 @@ from pydantic import BaseModel, ConfigDict, Field, TypeAdapter, ValidationError,
  from pydantic._internal import _typing_extra
  from pydantic_core import to_json
  from pydantic_core.core_schema import SerializationInfo, SerializerFunctionWrapHandler
+ from rich.progress import Progress
  from typing_extensions import NotRequired, Self, TypedDict, TypeVar

  from pydantic_evals._utils import get_event_loop
@@ -251,7 +252,11 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
      )

      async def evaluate(
-         self, task: Callable[[InputsT], Awaitable[OutputT]], name: str | None = None, max_concurrency: int | None = None
+         self,
+         task: Callable[[InputsT], Awaitable[OutputT]],
+         name: str | None = None,
+         max_concurrency: int | None = None,
+         progress: bool = True,
      ) -> EvaluationReport:
          """Evaluates the test cases in the dataset using the given task.

@@ -265,18 +270,26 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                  If omitted, the name of the task function will be used.
              max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                  If None, all cases will be evaluated concurrently.
+             progress: Whether to show a progress bar for the evaluation. Defaults to `True`.

          Returns:
              A report containing the results of the evaluation.
          """
          name = name or get_unwrapped_function_name(task)
+         total_cases = len(self.cases)
+         progress_bar = Progress() if progress else None

          limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()
-         with _logfire.span('evaluate {name}', name=name) as eval_span:
+
+         with _logfire.span('evaluate {name}', name=name) as eval_span, progress_bar or nullcontext():
+             task_id = progress_bar.add_task(f'Evaluating {name}', total=total_cases) if progress_bar else None

              async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name: str):
                  async with limiter:
-                     return await _run_task_and_evaluators(task, case, report_case_name, self.evaluators)
+                     result = await _run_task_and_evaluators(task, case, report_case_name, self.evaluators)
+                     if progress_bar and task_id is not None:  # pragma: no branch
+                         progress_bar.update(task_id, advance=1)
+                     return result

              report = EvaluationReport(
                  name=name,
@@ -291,11 +304,14 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
              eval_span.set_attribute('cases', report.cases)
              # TODO(DavidM): Remove this 'averages' attribute once we compute it in the details panel
              eval_span.set_attribute('averages', report.averages())
-
          return report

      def evaluate_sync(
-         self, task: Callable[[InputsT], Awaitable[OutputT]], name: str | None = None, max_concurrency: int | None = None
+         self,
+         task: Callable[[InputsT], Awaitable[OutputT]],
+         name: str | None = None,
+         max_concurrency: int | None = None,
+         progress: bool = True,
      ) -> EvaluationReport:
          """Evaluates the test cases in the dataset using the given task.

@@ -308,11 +324,14 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                  If omitted, the name of the task function will be used.
              max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                  If None, all cases will be evaluated concurrently.
+             progress: Whether to show a progress bar for the evaluation. Defaults to True.

          Returns:
              A report containing the results of the evaluation.
          """
-         return get_event_loop().run_until_complete(self.evaluate(task, name=name, max_concurrency=max_concurrency))
+         return get_event_loop().run_until_complete(
+             self.evaluate(task, name=name, max_concurrency=max_concurrency, progress=progress)
+         )

      def add_case(
          self,
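
The functional change in this file is a new `progress` keyword argument on `Dataset.evaluate` and `Dataset.evaluate_sync`: when enabled (the default), the run is wrapped in a `rich.progress.Progress` bar that advances as each case finishes. A minimal usage sketch against the new signature follows; the echo task, the dataset contents, and the variable names are illustrative only and are not part of the package:

from pydantic_evals import Case, Dataset

# Hypothetical task, used only to demonstrate the new signature.
async def echo_task(inputs: str) -> str:
    return inputs

dataset = Dataset(
    cases=[
        Case(name='case-1', inputs='hello', expected_output='hello'),
        Case(name='case-2', inputs='world', expected_output='world'),
    ]
)

# New in 0.3.5: progress defaults to True; pass progress=False to suppress
# the rich progress bar, e.g. in CI logs or other non-interactive sessions.
report = dataset.evaluate_sync(echo_task, max_concurrency=2, progress=False)
print(report)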
pydantic_evals-0.3.3.dist-info/METADATA → pydantic_evals-0.3.5.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pydantic-evals
- Version: 0.3.3
+ Version: 0.3.5
  Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
  Project-URL: Homepage, https://ai.pydantic.dev/evals
  Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -32,7 +32,7 @@ Requires-Python: >=3.9
  Requires-Dist: anyio>=0
  Requires-Dist: eval-type-backport>=0; python_version < '3.11'
  Requires-Dist: logfire-api>=1.2.0
- Requires-Dist: pydantic-ai-slim==0.3.3
+ Requires-Dist: pydantic-ai-slim==0.3.5
  Requires-Dist: pydantic>=2.10
  Requires-Dist: pyyaml>=6.0.2
  Requires-Dist: rich>=13.9.4
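
The METADATA hunk keeps `pydantic-ai-slim` pinned to the exact same version as pydantic-evals, so the two distributions move in lockstep, and the `rich>=13.9.4` dependency that backs the new progress bar was already present in 0.3.3. A quick way to confirm the installed pair after upgrading, assuming both distributions are installed in the current environment:

from importlib.metadata import version

# The exact pin in METADATA means these two should report matching versions.
print(version('pydantic-evals'), version('pydantic-ai-slim'))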
pydantic_evals-0.3.3.dist-info/RECORD → pydantic_evals-0.3.5.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
  pydantic_evals/__init__.py,sha256=OKRbfhdc8UZPzrPJMZUQwvzIxLhXmEZxz1ZuD921fy4,839
  pydantic_evals/_utils.py,sha256=PfhmPbdQp-q90s568LuG45zDDXxgO13BEz8MQJK8qw4,2922
- pydantic_evals/dataset.py,sha256=BVeC1yaohXyAO07RJc51j1JtkpULJKIugcEpla8jB3Y,45207
+ pydantic_evals/dataset.py,sha256=UPyl8Jey18LlcvXQKZ4et5F3AFZ_ar100KREEO5Zfd0,46010
  pydantic_evals/generation.py,sha256=-w-4-zpJuW8mLj5ed60LUYm--b-2G42p-UDuPhOQgRE,3492
  pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  pydantic_evals/evaluators/__init__.py,sha256=uj110viFiDvqrIcuDcWexk_uBgJqhKMGPHT2YvDb7LA,624
@@ -17,7 +17,7 @@ pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD
  pydantic_evals/otel/span_tree.py,sha256=LV5Hsyo4riJzevHyBz8wxP82S-ry5zeKYi9bKWjGCS8,23057
  pydantic_evals/reporting/__init__.py,sha256=tknRGM2fm8EUENxbq4K5duHZ_DgNzrVWhpGHFkoQ9zo,41677
  pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
- pydantic_evals-0.3.3.dist-info/METADATA,sha256=TnZcmIIK5ZZMSXbCk3RifRmWoZAw_6EwPgGhsWWJ5Dc,7785
- pydantic_evals-0.3.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- pydantic_evals-0.3.3.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
- pydantic_evals-0.3.3.dist-info/RECORD,,
+ pydantic_evals-0.3.5.dist-info/METADATA,sha256=spwNRtGWmgUy_EjgJZjoSYbfNgEHzaKCUSZ6dElhk60,7785
+ pydantic_evals-0.3.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ pydantic_evals-0.3.5.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+ pydantic_evals-0.3.5.dist-info/RECORD,,