pydantic-evals 1.5.0__py3-none-any.whl → 1.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


pydantic_evals/dataset.py CHANGED
@@ -265,6 +265,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         retry_evaluators: RetryConfig | None = None,
         *,
         task_name: str | None = None,
+        metadata: dict[str, Any] | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.
 
@@ -283,6 +284,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             retry_evaluators: Optional retry configuration for evaluator execution.
             task_name: Optional override to the name of the task being executed, otherwise the name of the task
                 function will be used.
+            metadata: Optional dict of experiment metadata.
 
         Returns:
             A report containing the results of the evaluation.
@@ -294,6 +296,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
 
         limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()
 
+        extra_attributes: dict[str, Any] = {'gen_ai.operation.name': 'experiment'}
+        if metadata is not None:
+            extra_attributes['metadata'] = metadata
         with (
             logfire_span(
                 'evaluate {name}',
@@ -301,7 +306,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                 task_name=task_name,
                 dataset_name=self.name,
                 n_cases=len(self.cases),
-                **{'gen_ai.operation.name': 'experiment'},  # pyright: ignore[reportArgumentType]
+                **extra_attributes,
             ) as eval_span,
             progress_bar or nullcontext(),
         ):
@@ -339,13 +344,18 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                 name=name,
                 cases=cases,
                 failures=failures,
+                experiment_metadata=metadata,
                 span_id=span_id,
                 trace_id=trace_id,
             )
-            if (averages := report.averages()) is not None and averages.assertions is not None:
-                experiment_metadata = {'n_cases': len(self.cases), 'averages': averages}
-                eval_span.set_attribute('logfire.experiment.metadata', experiment_metadata)
-                eval_span.set_attribute('assertion_pass_rate', averages.assertions)
+            full_experiment_metadata: dict[str, Any] = {'n_cases': len(self.cases)}
+            if metadata is not None:
+                full_experiment_metadata['metadata'] = metadata
+            if (averages := report.averages()) is not None:
+                full_experiment_metadata['averages'] = averages
+                if averages.assertions is not None:
+                    eval_span.set_attribute('assertion_pass_rate', averages.assertions)
+            eval_span.set_attribute('logfire.experiment.metadata', full_experiment_metadata)
         return report
 
     def evaluate_sync(
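
Editor's note: as a rough illustration of what the hunk above changes, the `logfire.experiment.metadata` span attribute now always carries the case count, plus the caller-supplied metadata and averages when available; the values below are hypothetical, and the exact shape of the averages entry comes from `report.averages()`, which is not shown in this diff.

```python
# Sketch only: hypothetical values illustrating the attribute assembled above.
metadata = {'model': 'gpt-4o', 'prompt_version': 'v2'}  # hypothetical caller-supplied metadata

full_experiment_metadata = {
    'n_cases': 25,         # always present
    'metadata': metadata,  # only when the caller passed metadata
    'averages': ...,       # the report.averages() aggregate row, when available
}
# assertion_pass_rate (e.g. 0.84) is now set as its own span attribute only when
# the averages include assertion results.
```
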
@@ -356,6 +366,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         progress: bool = True,
         retry_task: RetryConfig | None = None,
         retry_evaluators: RetryConfig | None = None,
+        *,
+        task_name: str | None = None,
+        metadata: dict[str, Any] | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.
 
@@ -364,13 +377,16 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         Args:
             task: The task to evaluate. This should be a callable that takes the inputs of the case
                 and returns the output.
-            name: The name of the task being evaluated, this is used to identify the task in the report.
-                If omitted, the name of the task function will be used.
+            name: The name of the experiment being run, this is used to identify the experiment in the report.
+                If omitted, the task_name will be used; if that is not specified, the name of the task function is used.
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
-            progress: Whether to show a progress bar for the evaluation. Defaults to True.
+            progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
             retry_task: Optional retry configuration for the task execution.
             retry_evaluators: Optional retry configuration for evaluator execution.
+            task_name: Optional override to the name of the task being executed, otherwise the name of the task
+                function will be used.
+            metadata: Optional dict of experiment metadata.
 
         Returns:
             A report containing the results of the evaluation.
@@ -378,11 +394,13 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         return get_event_loop().run_until_complete(
             self.evaluate(
                 task,
-                task_name=name,
+                name=name,
                 max_concurrency=max_concurrency,
                 progress=progress,
                 retry_task=retry_task,
                 retry_evaluators=retry_evaluators,
+                task_name=task_name,
+                metadata=metadata,
             )
         )
 
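Editor's note: a minimal usage sketch of the new keyword-only arguments added to `evaluate`/`evaluate_sync` in this release; the dataset, task, and metadata values are illustrative, not taken from the diff.

```python
from pydantic_evals import Case, Dataset

dataset = Dataset(cases=[Case(name='add', inputs=2, expected_output=4)])

def double(x: int) -> int:
    return x * 2

# `task_name` overrides the reported task name; `metadata` is attached to the
# experiment span and to the resulting report as `experiment_metadata`.
report = dataset.evaluate_sync(
    double,
    task_name='double-task',                         # hypothetical name
    metadata={'model': 'gpt-4o', 'run': 'nightly'},  # hypothetical experiment metadata
)
report.print()
```
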
pydantic_evals/reporting/__init__.py CHANGED
@@ -7,8 +7,10 @@ from io import StringIO
 from typing import Any, Generic, Literal, Protocol, cast
 
 from pydantic import BaseModel, TypeAdapter
-from rich.console import Console
+from rich.console import Console, Group, RenderableType
+from rich.panel import Panel
 from rich.table import Table
+from rich.text import Text
 from typing_extensions import TypedDict, TypeVar
 
 from pydantic_evals._utils import UNSET, Unset
@@ -196,6 +198,8 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
     failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(default_factory=list)
     """The failures in the report. These are cases where task execution raised an exception."""
 
+    experiment_metadata: dict[str, Any] | None = None
+    """Metadata associated with the specific experiment represented by this report."""
     trace_id: str | None = None
     """The trace ID of the evaluation."""
     span_id: str | None = None
@@ -230,7 +234,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
         include_reasons: bool = False,
-    ) -> str:  # pragma: no cover
+    ) -> str:
         """Render this report to a nicely-formatted string, optionally comparing it to a baseline report.
 
         If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
@@ -261,7 +265,6 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             duration_config=duration_config,
             include_reasons=include_reasons,
         )
-        Console(file=io_file)
         return io_file.getvalue()
 
     def print(
@@ -297,7 +300,8 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         if console is None:  # pragma: no branch
            console = Console(width=width)
 
-        table = self.console_table(
+        metadata_panel = self._metadata_panel(baseline=baseline)
+        renderable: RenderableType = self.console_table(
            baseline=baseline,
            include_input=include_input,
            include_metadata=include_metadata,
@@ -316,8 +320,12 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
            metric_configs=metric_configs,
            duration_config=duration_config,
            include_reasons=include_reasons,
+            with_title=not metadata_panel,
        )
-        console.print(table)
+        # Wrap table with experiment metadata panel if present
+        if metadata_panel:
+            renderable = Group(metadata_panel, renderable)
+        console.print(renderable)
        if include_errors and self.failures:  # pragma: no cover
            failures_table = self.failures_table(
                include_input=include_input,
@@ -330,6 +338,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
            )
            console.print(failures_table, style='red')
 
+    # TODO(DavidM): in v2, change the return type here to RenderableType
    def console_table(
        self,
        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
@@ -351,9 +360,11 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
        metric_configs: dict[str, RenderNumberConfig] | None = None,
        duration_config: RenderNumberConfig | None = None,
        include_reasons: bool = False,
+        with_title: bool = True,
    ) -> Table:
-        """Return a table containing the data from this report, or the diff between this report and a baseline report.
+        """Return a table containing the data from this report.
 
+        If a baseline is provided, returns a diff between this report and the baseline report.
        Optionally include input and output details.
        """
        renderer = EvaluationRenderer(
@@ -378,10 +389,82 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
            include_reasons=include_reasons,
        )
        if baseline is None:
-            return renderer.build_table(self)
-        else:  # pragma: no cover
-            return renderer.build_diff_table(self, baseline)
+            return renderer.build_table(self, with_title=with_title)
+        else:
+            return renderer.build_diff_table(self, baseline, with_title=with_title)
+
+    def _metadata_panel(
+        self, baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None
+    ) -> RenderableType | None:
+        """Wrap a table with an experiment metadata panel if metadata exists.
 
+        Args:
+            table: The table to wrap
+            baseline: Optional baseline report for diff metadata
+
+        Returns:
+            Either the table unchanged or a Group with Panel and Table
+        """
+        if baseline is None:
+            # Single report - show metadata if present
+            if self.experiment_metadata:
+                metadata_text = Text()
+                items = list(self.experiment_metadata.items())
+                for i, (key, value) in enumerate(items):
+                    metadata_text.append(f'{key}: {value}', style='dim')
+                    if i < len(items) - 1:
+                        metadata_text.append('\n')
+                return Panel(
+                    metadata_text,
+                    title=f'Evaluation Summary: {self.name}',
+                    title_align='left',
+                    border_style='dim',
+                    padding=(0, 1),
+                    expand=False,
+                )
+        else:
+            # Diff report - show metadata diff if either has metadata
+            if self.experiment_metadata or baseline.experiment_metadata:
+                diff_name = baseline.name if baseline.name == self.name else f'{baseline.name} → {self.name}'
+                metadata_text = Text()
+                lines_styles: list[tuple[str, str]] = []
+                if baseline.experiment_metadata and self.experiment_metadata:
+                    # Collect all keys from both
+                    all_keys = sorted(set(baseline.experiment_metadata.keys()) | set(self.experiment_metadata.keys()))
+                    for key in all_keys:
+                        baseline_val = baseline.experiment_metadata.get(key)
+                        report_val = self.experiment_metadata.get(key)
+                        if baseline_val == report_val:
+                            lines_styles.append((f'{key}: {report_val}', 'dim'))
+                        elif baseline_val is None:
+                            lines_styles.append((f'+ {key}: {report_val}', 'green'))
+                        elif report_val is None:
+                            lines_styles.append((f'- {key}: {baseline_val}', 'red'))
+                        else:
+                            lines_styles.append((f'{key}: {baseline_val} → {report_val}', 'yellow'))
+                elif self.experiment_metadata:
+                    lines_styles = [(f'+ {k}: {v}', 'green') for k, v in self.experiment_metadata.items()]
+                else:  # baseline.experiment_metadata only
+                    assert baseline.experiment_metadata is not None
+                    lines_styles = [(f'- {k}: {v}', 'red') for k, v in baseline.experiment_metadata.items()]
+
+                for i, (line, style) in enumerate(lines_styles):
+                    metadata_text.append(line, style=style)
+                    if i < len(lines_styles) - 1:
+                        metadata_text.append('\n')
+
+                return Panel(
+                    metadata_text,
+                    title=f'Evaluation Diff: {diff_name}',
+                    title_align='left',
+                    border_style='dim',
+                    padding=(0, 1),
+                    expand=False,
+                )
+
+        return None
+
+    # TODO(DavidM): in v2, change the return type here to RenderableType
    def failures_table(
        self,
        *,
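
Editor's note: for readers less familiar with rich, a standalone sketch of the composition the new `_metadata_panel` method feeds into — a dim panel of `key: value` lines grouped above the (now untitled) results table. This is an illustration of the rendering approach with hypothetical values, not code from the package.

```python
from rich.console import Console, Group
from rich.panel import Panel
from rich.table import Table
from rich.text import Text

# Hypothetical experiment metadata, rendered one 'key: value' line per entry.
text = Text()
text.append('model: gpt-4o', style='dim')
text.append('\n')
text.append('prompt_version: v2', style='dim')

panel = Panel(
    text,
    title='Evaluation Summary: my-experiment',  # the report title moves into the panel
    title_align='left',
    border_style='dim',
    padding=(0, 1),
    expand=False,
)

table = Table(show_lines=True)  # stands in for the untitled results table
table.add_column('Case ID')
table.add_row('add')

Console().print(Group(panel, table))  # panel stacked above the table, as in report.print()
```
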
@@ -705,6 +788,7 @@ class ReportCaseRenderer:
     metric_renderers: Mapping[str, _NumberRenderer]
     duration_renderer: _NumberRenderer
 
+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_base_table(self, title: str) -> Table:
         """Build and return a Rich Table for the diff output."""
         table = Table(title=title, show_lines=True)
@@ -731,6 +815,7 @@ class ReportCaseRenderer:
         table.add_column('Durations' if self.include_total_duration else 'Duration', justify='right')
         return table
 
+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_failures_table(self, title: str) -> Table:
         """Build and return a Rich Table for the failures output."""
         table = Table(title=title, show_lines=True)
@@ -1190,9 +1275,22 @@ class EvaluationRenderer:
             duration_renderer=duration_renderer,
         )
 
-    def build_table(self, report: EvaluationReport) -> Table:
+    # TODO(DavidM): in v2, change the return type here to RenderableType
+    def build_table(self, report: EvaluationReport, *, with_title: bool = True) -> Table:
+        """Build a table for the report.
+
+        Args:
+            report: The evaluation report to render
+            with_title: Whether to include the title in the table (default True)
+
+        Returns:
+            A Rich Table object
+        """
         case_renderer = self._get_case_renderer(report)
-        table = case_renderer.build_base_table(f'Evaluation Summary: {report.name}')
+
+        title = f'Evaluation Summary: {report.name}' if with_title else ''
+        table = case_renderer.build_base_table(title)
+
         for case in report.cases:
             table.add_row(*case_renderer.build_row(case))
 
@@ -1203,7 +1301,20 @@ class EvaluationRenderer:
 
         return table
 
-    def build_diff_table(self, report: EvaluationReport, baseline: EvaluationReport) -> Table:
+    # TODO(DavidM): in v2, change the return type here to RenderableType
+    def build_diff_table(
+        self, report: EvaluationReport, baseline: EvaluationReport, *, with_title: bool = True
+    ) -> Table:
+        """Build a diff table comparing report to baseline.
+
+        Args:
+            report: The evaluation report to compare
+            baseline: The baseline report to compare against
+            with_title: Whether to include the title in the table (default True)
+
+        Returns:
+            A Rich Table object
+        """
         report_cases = report.cases
         baseline_cases = self._baseline_cases_to_include(report, baseline)
 
@@ -1228,7 +1339,10 @@ class EvaluationRenderer:
 
         case_renderer = self._get_case_renderer(report, baseline)
         diff_name = baseline.name if baseline.name == report.name else f'{baseline.name} → {report.name}'
-        table = case_renderer.build_base_table(f'Evaluation Diff: {diff_name}')
+
+        title = f'Evaluation Diff: {diff_name}' if with_title else ''
+        table = case_renderer.build_base_table(title)
+
         for baseline_case, new_case in diff_cases:
             table.add_row(*case_renderer.build_diff_row(new_case, baseline_case))
         for case in added_cases:
@@ -1247,6 +1361,7 @@ class EvaluationRenderer:
 
         return table
 
+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_failures_table(self, report: EvaluationReport) -> Table:
         case_renderer = self._get_case_renderer(report)
         table = case_renderer.build_failures_table('Case Failures')
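
Editor's note: a sketch of how the new metadata diff panel surfaces when printing a report against a baseline; it reuses the same hypothetical dataset/task setup as the earlier sketch, and the metadata values are illustrative.

```python
from pydantic_evals import Case, Dataset

dataset = Dataset(cases=[Case(name='add', inputs=2, expected_output=4)])

def double(x: int) -> int:
    return x * 2

# Hypothetical metadata values; the point is the per-key diff rendering.
baseline = dataset.evaluate_sync(double, metadata={'model': 'gpt-4o'})
candidate = dataset.evaluate_sync(double, metadata={'model': 'gpt-4o-mini', 'run': 'nightly'})

# With a baseline, the metadata panel renders a per-key diff: unchanged keys dim,
# added keys green (+ key), removed keys red (- key), changed keys yellow (old → new).
candidate.print(baseline=baseline)
```
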
pydantic_evals-{1.5.0 → 1.12.0}.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.5.0
+Version: 1.12.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.5.0
+Requires-Dist: pydantic-ai-slim==1.12.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
pydantic_evals-{1.5.0 → 1.12.0}.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
 pydantic_evals/__init__.py,sha256=X5m0fcEZ4e8hVhToX5PludEp8t7NTBmdNInFFM5hM_I,504
 pydantic_evals/_utils.py,sha256=1muGTc2zqjwxqngz6quRSLoZM88onjp0Xgt-a9n2aPQ,4111
-pydantic_evals/dataset.py,sha256=XobDGjjTj0oR5CARw8sWwC0KrIg0tpRzRiOkg8-Eeyc,50618
+pydantic_evals/dataset.py,sha256=ugknLn5oHac3GRBW4FAa8kuGn_zMPaRtwa2C2La8rxE,51491
 pydantic_evals/generation.py,sha256=Qy03z7vGvE14cUBsqjorEx7Ar1KkR7Fb5SItZB429fc,3715
 pydantic_evals/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pydantic_evals/evaluators/__init__.py,sha256=E_JT6o96Ef-oS_IZ1Hyy95NRLwz7EOHewp-o13IdXEM,1032
@@ -15,9 +15,9 @@ pydantic_evals/otel/_context_in_memory_span_exporter.py,sha256=FrG0pXKjuvTp3bXNd
 pydantic_evals/otel/_context_subtree.py,sha256=Iazp4w3IIBMCrkqWL-hTG-2QG_-2X81p794WG9MAsGk,1175
 pydantic_evals/otel/_errors.py,sha256=aW1414eTofpA7R_DUgOeT-gj7YA6OXmm8Y4oYeFukD4,268
 pydantic_evals/otel/span_tree.py,sha256=RzX4VGpEqc2QUhkyxMTXtBRo5yHHO1c0hI7QJJuiXPU,23043
-pydantic_evals/reporting/__init__.py,sha256=LGPZRKyRAl7Apx44-UnYENsAltknakf3dcYkjwoTSFw,54088
+pydantic_evals/reporting/__init__.py,sha256=a1kM-1zPjgBOWMxIe-cQ0Qg_nx7GyrXLz5fGWocIqO0,59350
 pydantic_evals/reporting/render_numbers.py,sha256=8SKlK3etbD7HnSWWHCE993ceCNLZCepVQ-SsqUIhyxk,6916
-pydantic_evals-1.5.0.dist-info/METADATA,sha256=O1Alvy7xqrLx4N-S_HVdTlHPRRrQBQeb-7SpuNsi_OM,7844
-pydantic_evals-1.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-pydantic_evals-1.5.0.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
-pydantic_evals-1.5.0.dist-info/RECORD,,
+pydantic_evals-1.12.0.dist-info/METADATA,sha256=OWzXfoJk8aActDdES6GDHECeZrjnLhQp1yqs-ZzIt_s,7846
+pydantic_evals-1.12.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+pydantic_evals-1.12.0.dist-info/licenses/LICENSE,sha256=vA6Jc482lEyBBuGUfD1pYx-cM7jxvLYOxPidZ30t_PQ,1100
+pydantic_evals-1.12.0.dist-info/RECORD,,