pydantic-evals 1.0.14__tar.gz → 1.50.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25)
  1. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/.gitignore +3 -1
  2. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/PKG-INFO +2 -2
  3. pydantic_evals-1.50.0/pydantic_evals/__init__.py +16 -0
  4. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/_utils.py +1 -1
  5. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/dataset.py +49 -24
  6. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/common.py +1 -1
  7. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/llm_as_a_judge.py +36 -31
  8. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/generation.py +3 -1
  9. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/otel/span_tree.py +3 -3
  10. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/reporting/__init__.py +201 -26
  11. pydantic_evals-1.0.14/pydantic_evals/__init__.py +0 -19
  12. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/LICENSE +0 -0
  13. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/README.md +0 -0
  14. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/__init__.py +0 -0
  15. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
  16. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/context.py +0 -0
  17. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/evaluator.py +0 -0
  18. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/spec.py +0 -0
  19. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/otel/__init__.py +0 -0
  20. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
  21. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/otel/_context_subtree.py +0 -0
  22. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/otel/_errors.py +0 -0
  23. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/py.typed +0 -0
  24. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/reporting/render_numbers.py +0 -0
  25. {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pyproject.toml +0 -0
@@ -10,7 +10,7 @@ env*/
  /TODO.md
  /postgres-data/
  .DS_Store
- examples/pydantic_ai_examples/.chat_app_messages.sqlite
+ .chat_app_messages.sqlite
  .cache/
  .vscode/
  /question_graph_history.json
@@ -21,3 +21,5 @@ node_modules/
  /test_tmp/
  .mcp.json
  .claude/
+ /.cursor/
+ /.devcontainer/
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: pydantic-evals
- Version: 1.0.14
+ Version: 1.50.0
  Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
  Project-URL: Homepage, https://ai.pydantic.dev/evals
  Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Requires-Python: >=3.10
  Requires-Dist: anyio>=0
  Requires-Dist: logfire-api>=3.14.1
- Requires-Dist: pydantic-ai-slim==1.0.14
+ Requires-Dist: pydantic-ai-slim==1.50.0
  Requires-Dist: pydantic>=2.10
  Requires-Dist: pyyaml>=6.0.2
  Requires-Dist: rich>=13.9.4
@@ -0,0 +1,16 @@
+ """A toolkit for evaluating the execution of arbitrary "stochastic functions", such as LLM calls.
+
+ This package provides functionality for:
+ - Creating and loading test datasets with structured inputs and outputs
+ - Evaluating model performance using various metrics and evaluators
+ - Generating reports for evaluation results
+ """
+
+ from .dataset import Case, Dataset, increment_eval_metric, set_eval_attribute
+
+ __all__ = (
+ 'Case',
+ 'Dataset',
+ 'increment_eval_metric',
+ 'set_eval_attribute',
+ )
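The rewritten package `__init__.py` now re-exports `increment_eval_metric` and `set_eval_attribute` from `pydantic_evals.dataset` at the top level. A minimal sketch of how a task might use them while being evaluated; the task body, metric name, and attribute name are illustrative, not taken from this release:

```python
# Sketch only: the task and the metric/attribute names are hypothetical.
from pydantic_evals import Case, Dataset, increment_eval_metric, set_eval_attribute


async def answer_question(question: str) -> str:
    increment_eval_metric('retries', 1)         # recorded as a metric on the current case's task run
    set_eval_attribute('prompt_variant', 'v2')  # attached as an attribute of the case
    return '4' if '2 + 2' in question else 'unknown'


dataset = Dataset(cases=[Case(name='simple', inputs='What is 2 + 2?', expected_output='4')])
report = dataset.evaluate_sync(answer_question)
```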
@@ -112,7 +112,7 @@ async def task_group_gather(tasks: Sequence[Callable[[], Awaitable[T]]]) -> list

  try:
  from logfire._internal.config import (
- LogfireNotConfiguredWarning, # pyright: ignore[reportAssignmentType,reportPrivateImportUsage]
+ LogfireNotConfiguredWarning, # pyright: ignore[reportAssignmentType]
  )
  # TODO: Remove this `pragma: no cover` once we test evals without pydantic-ai (which includes logfire)
  except ImportError: # pragma: no cover
@@ -90,7 +90,7 @@ class _CaseModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid'
  inputs: InputsT
  metadata: MetadataT | None = None
  expected_output: OutputT | None = None
- evaluators: list[EvaluatorSpec] = Field(default_factory=list)
+ evaluators: list[EvaluatorSpec] = Field(default_factory=list[EvaluatorSpec])


  class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid'):
@@ -100,7 +100,7 @@ class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forb
  json_schema_path: str | None = Field(default=None, alias='$schema')
  name: str | None = None
  cases: list[_CaseModel[InputsT, OutputT, MetadataT]]
- evaluators: list[EvaluatorSpec] = Field(default_factory=list)
+ evaluators: list[EvaluatorSpec] = Field(default_factory=list[EvaluatorSpec])


  @dataclass(init=False)
@@ -136,7 +136,9 @@ class Case(Generic[InputsT, OutputT, MetadataT]):
  """
  expected_output: OutputT | None = None
  """Expected output of the task. This is the expected output of the task that will be evaluated."""
- evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = field(default_factory=list)
+ evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = field(
+ default_factory=list[Evaluator[InputsT, OutputT, MetadataT]]
+ )
  """Evaluators to be used just on this case."""

  def __init__(
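Several hunks in this file (and in `span_tree.py` and `reporting/__init__.py` below) switch `default_factory=list` to a parameterized form such as `default_factory=list[EvaluatorSpec]`. A subscripted builtin like `list[int]` is still callable and returns an ordinary empty list at runtime, so behaviour is unchanged; the payoff is that type checkers can infer the element type of the default. A standalone illustration of the pattern (not code from the package):

```python
# Illustration of the default_factory=list[...] pattern; Bucket is a made-up example class.
from dataclasses import dataclass, field


@dataclass
class Bucket:
    # Runtime behaviour is identical to default_factory=list, but the default is typed as list[int].
    values: list[int] = field(default_factory=list[int])


assert list[int]() == []      # a parameterized generic is callable and builds a plain list
assert Bucket().values == []
```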
@@ -265,6 +267,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  retry_evaluators: RetryConfig | None = None,
  *,
  task_name: str | None = None,
+ metadata: dict[str, Any] | None = None,
  ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
  """Evaluates the test cases in the dataset using the given task.

@@ -283,6 +286,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  retry_evaluators: Optional retry configuration for evaluator execution.
  task_name: Optional override to the name of the task being executed, otherwise the name of the task
  function will be used.
+ metadata: Optional dict of experiment metadata.

  Returns:
  A report containing the results of the evaluation.
@@ -294,6 +298,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a

  limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()

+ extra_attributes: dict[str, Any] = {'gen_ai.operation.name': 'experiment'}
+ if metadata is not None:
+ extra_attributes['metadata'] = metadata
  with (
  logfire_span(
@@ -301,7 +308,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  task_name=task_name,
  dataset_name=self.name,
  n_cases=len(self.cases),
- **{'gen_ai.operation.name': 'experiment'}, # pyright: ignore[reportArgumentType]
+ **extra_attributes,
  ) as eval_span,
  progress_bar or nullcontext(),
  ):
@@ -339,11 +346,18 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  name=name,
  cases=cases,
  failures=failures,
+ experiment_metadata=metadata,
  span_id=span_id,
  trace_id=trace_id,
  )
- if (averages := report.averages()) is not None and averages.assertions is not None:
- eval_span.set_attribute('assertion_pass_rate', averages.assertions)
+ full_experiment_metadata: dict[str, Any] = {'n_cases': len(self.cases)}
+ if metadata is not None:
+ full_experiment_metadata['metadata'] = metadata
+ if (averages := report.averages()) is not None:
+ full_experiment_metadata['averages'] = averages
+ if averages.assertions is not None:
+ eval_span.set_attribute('assertion_pass_rate', averages.assertions)
+ eval_span.set_attribute('logfire.experiment.metadata', full_experiment_metadata)
  return report

  def evaluate_sync(
@@ -354,21 +368,27 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  progress: bool = True,
  retry_task: RetryConfig | None = None,
  retry_evaluators: RetryConfig | None = None,
+ *,
+ task_name: str | None = None,
+ metadata: dict[str, Any] | None = None,
  ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
  """Evaluates the test cases in the dataset using the given task.

- This is a synchronous wrapper around [`evaluate`][pydantic_evals.Dataset.evaluate] provided for convenience.
+ This is a synchronous wrapper around [`evaluate`][pydantic_evals.dataset.Dataset.evaluate] provided for convenience.

  Args:
  task: The task to evaluate. This should be a callable that takes the inputs of the case
  and returns the output.
- name: The name of the task being evaluated, this is used to identify the task in the report.
- If omitted, the name of the task function will be used.
+ name: The name of the experiment being run, this is used to identify the experiment in the report.
+ If omitted, the task_name will be used; if that is not specified, the name of the task function is used.
  max_concurrency: The maximum number of concurrent evaluations of the task to allow.
  If None, all cases will be evaluated concurrently.
- progress: Whether to show a progress bar for the evaluation. Defaults to True.
+ progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
  retry_task: Optional retry configuration for the task execution.
  retry_evaluators: Optional retry configuration for evaluator execution.
+ task_name: Optional override to the name of the task being executed, otherwise the name of the task
+ function will be used.
+ metadata: Optional dict of experiment metadata.

  Returns:
  A report containing the results of the evaluation.
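The new keyword-only `metadata` argument on `evaluate`/`evaluate_sync` is stored on the report as `experiment_metadata` and folded into the `logfire.experiment.metadata` span attribute. A rough usage sketch; the task and metadata values are made up:

```python
# Sketch of the new experiment-metadata plumbing; the task and metadata values are examples.
from pydantic_evals import Case, Dataset


async def double(x: int) -> int:
    return x * 2


dataset = Dataset(cases=[Case(name='doubles', inputs=2, expected_output=4)])
report = dataset.evaluate_sync(
    double,
    name='doubling-experiment',
    metadata={'prompt_version': 'v3', 'temperature': 0.0},  # surfaced in the report and on the span
)
print(report.experiment_metadata)  # {'prompt_version': 'v3', 'temperature': 0.0}
```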
@@ -376,11 +396,13 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  return get_event_loop().run_until_complete(
  self.evaluate(
  task,
- task_name=name,
+ name=name,
  max_concurrency=max_concurrency,
  progress=progress,
  retry_task=retry_task,
  retry_evaluators=retry_evaluators,
+ task_name=task_name,
+ metadata=metadata,
  )
  )

@@ -491,7 +513,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  path = Path(path)
  fmt = cls._infer_fmt(path, fmt)

- raw = Path(path).read_text()
+ raw = Path(path).read_text(encoding='utf-8')
  try:
  return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types, default_name=path.stem)
  except ValidationError as e: # pragma: no cover
@@ -646,16 +668,16 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a

  context: dict[str, Any] = {'use_short_form': True}
  if fmt == 'yaml':
- dumped_data = self.model_dump(mode='json', by_alias=True, exclude_defaults=True, context=context)
+ dumped_data = self.model_dump(mode='json', by_alias=True, context=context)
  content = yaml.dump(dumped_data, sort_keys=False)
  if schema_ref: # pragma: no branch
  yaml_language_server_line = f'{_YAML_SCHEMA_LINE_PREFIX}{schema_ref}'
  content = f'{yaml_language_server_line}\n{content}'
- path.write_text(content)
+ path.write_text(content, encoding='utf-8')
  else:
  context['$schema'] = schema_ref
- json_data = self.model_dump_json(indent=2, by_alias=True, exclude_defaults=True, context=context)
- path.write_text(json_data + '\n')
+ json_data = self.model_dump_json(indent=2, by_alias=True, context=context)
+ path.write_text(json_data + '\n', encoding='utf-8')

  @classmethod
  def model_json_schema_with_evaluators(
@@ -718,15 +740,16 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  class Case(BaseModel, extra='forbid'): # pyright: ignore[reportUnusedClass] # this _is_ used below, but pyright doesn't seem to notice..
  name: str | None = None
  inputs: in_type # pyright: ignore[reportInvalidTypeForm]
- metadata: meta_type | None = None # pyright: ignore[reportInvalidTypeForm,reportUnknownVariableType]
- expected_output: out_type | None = None # pyright: ignore[reportInvalidTypeForm,reportUnknownVariableType]
+ metadata: meta_type | None = None # pyright: ignore[reportInvalidTypeForm]
+ expected_output: out_type | None = None # pyright: ignore[reportInvalidTypeForm]
  if evaluator_schema_types: # pragma: no branch
- evaluators: list[Union[tuple(evaluator_schema_types)]] = [] # pyright: ignore # noqa UP007
+ evaluators: list[Union[tuple(evaluator_schema_types)]] = [] # pyright: ignore # noqa: UP007

  class Dataset(BaseModel, extra='forbid'):
+ name: str | None = None
  cases: list[Case]
  if evaluator_schema_types: # pragma: no branch
- evaluators: list[Union[tuple(evaluator_schema_types)]] = [] # pyright: ignore # noqa UP007
+ evaluators: list[Union[tuple(evaluator_schema_types)]] = [] # pyright: ignore # noqa: UP007

  json_schema = Dataset.model_json_schema()
  # See `_add_json_schema` below, since `$schema` is added to the JSON, it has to be supported in the JSON
@@ -746,8 +769,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
  path = Path(path)
  json_schema = cls.model_json_schema_with_evaluators(custom_evaluator_types)
  schema_content = to_json(json_schema, indent=2).decode() + '\n'
- if not path.exists() or path.read_text() != schema_content: # pragma: no branch
- path.write_text(schema_content)
+ if not path.exists() or path.read_text(encoding='utf-8') != schema_content: # pragma: no branch
+ path.write_text(schema_content, encoding='utf-8')

  @classmethod
  @functools.cache
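The file I/O changes in this file all pin `encoding='utf-8'`. Without it, `Path.read_text()`/`write_text()` fall back to the locale's preferred encoding (frequently not UTF-8 on Windows), so datasets containing non-ASCII text could fail to round-trip. A small illustration of the difference (not package code):

```python
# Why the explicit encoding matters: without it the platform locale decides how this file is written.
from pathlib import Path

path = Path('cases.yaml')
path.write_text('name: café ☕\n', encoding='utf-8')  # deterministic on every platform
assert 'café' in path.read_text(encoding='utf-8')     # round-trips non-ASCII content safely
```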
@@ -833,8 +856,8 @@ def _get_relative_path_reference(target: Path, source: Path, _prefix: str = '')
  class _TaskRun:
  """Internal class to track metrics and attributes for a task run."""

- attributes: dict[str, Any] = field(init=False, default_factory=dict)
- metrics: dict[str, int | float] = field(init=False, default_factory=dict)
+ attributes: dict[str, Any] = field(init=False, default_factory=dict[str, Any])
+ metrics: dict[str, int | float] = field(init=False, default_factory=dict[str, int | float])

  def record_metric(self, name: str, value: int | float) -> None:
  """Record a metric value.
@@ -926,6 +949,8 @@ async def _run_task(
  # That way users can customize this logic. We'd default to a function that does the current thing but also
  # allow `None` to disable it entirely.
  for node in span_tree:
+ if 'gen_ai.request.model' not in node.attributes:
+ continue # we only want to count the below specifically for the individual LLM requests, not agent runs
  for k, v in node.attributes.items():
  if k == 'gen_ai.operation.name' and v == 'chat':
  task_run.increment_metric('requests', 1)
@@ -191,7 +191,7 @@ class LLMJudge(Evaluator[object, object, object]):
  """

  rubric: str
- model: models.Model | models.KnownModelName | None = None
+ model: models.Model | models.KnownModelName | str | None = None
  include_input: bool = False
  include_expected_output: bool = False
  model_settings: ModelSettings | None = None
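`LLMJudge.model` (and the `judge_*` functions below) now also accept a plain model-name string, in addition to a `Model` instance or a `KnownModelName`. A sketch of a dataset-level judge configured with a string; the rubric and model name are only examples:

```python
# Sketch: LLMJudge configured with a plain string model name (rubric/model are examples).
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge

dataset = Dataset(
    cases=[Case(name='greeting', inputs='Say hello', expected_output='Hello!')],
    evaluators=[
        LLMJudge(
            rubric='The response is a polite greeting.',
            model='openai:gpt-4o',  # any model string understood by pydantic-ai should work here
            include_input=True,
        ),
    ],
)
```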
@@ -55,7 +55,7 @@ _judge_output_agent = Agent(
  async def judge_output(
  output: Any,
  rubric: str,
- model: models.Model | models.KnownModelName | None = None,
+ model: models.Model | models.KnownModelName | str | None = None,
  model_settings: ModelSettings | None = None,
  ) -> GradingOutput:
  """Judge the output of a model based on a rubric.
@@ -96,7 +96,7 @@ async def judge_input_output(
  inputs: Any,
  output: Any,
  rubric: str,
- model: models.Model | models.KnownModelName | None = None,
+ model: models.Model | models.KnownModelName | str | None = None,
  model_settings: ModelSettings | None = None,
  ) -> GradingOutput:
  """Judge the output of a model based on the inputs and a rubric.
@@ -141,7 +141,7 @@ async def judge_input_output_expected(
  output: Any,
  expected_output: Any,
  rubric: str,
- model: models.Model | models.KnownModelName | None = None,
+ model: models.Model | models.KnownModelName | str | None = None,
  model_settings: ModelSettings | None = None,
  ) -> GradingOutput:
  """Judge the output of a model based on the inputs and a rubric.
@@ -185,7 +185,7 @@ async def judge_output_expected(
  output: Any,
  expected_output: Any,
  rubric: str,
- model: models.Model | models.KnownModelName | None = None,
+ model: models.Model | models.KnownModelName | str | None = None,
  model_settings: ModelSettings | None = None,
  ) -> GradingOutput:
  """Judge the output of a model based on the expected output, output, and a rubric.
@@ -201,7 +201,7 @@ async def judge_output_expected(
  ).output


- def set_default_judge_model(model: models.Model | models.KnownModelName) -> None: # pragma: no cover
+ def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:
  """Set the default model used for judging.

  This model is used if `None` is passed to the `model` argument of `judge_output` and `judge_input_output`.
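`set_default_judge_model` drops its `pragma: no cover` marker; it still sets the model used whenever `model=None` is passed to the judge functions. A hedged sketch of using it together with `judge_output` (the model name and rubric are examples, and actually awaiting the call requires credentials for the chosen provider):

```python
# Sketch: a process-wide default judge model plus a judge_output call that relies on it.
from pydantic_evals.evaluators.llm_as_a_judge import judge_output, set_default_judge_model

set_default_judge_model('openai:gpt-4o')  # used whenever model=None is passed to the judge functions


async def grade(answer: str) -> bool:
    grading = await judge_output(
        output=answer,
        rubric='The answer correctly names the capital of France.',
    )
    return grading.pass_  # GradingOutput also carries a reason and a score
```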
@@ -221,39 +221,44 @@ def _stringify(value: Any) -> str:
  return repr(value)


+ def _make_section(content: Any, tag: str) -> list[str | UserContent]:
+ """Create a tagged section, handling different content types, for use in the LLMJudge's prompt.
+
+ Args:
+ content (Any): content to include in the section_
+ tag (str): tag name for the section
+
+ Returns:
+ list[str | UserContent]: the tagged section as a list of strings or UserContent
+ """
+ sections: list[str | UserContent] = []
+ items: Sequence[str | UserContent] = ( # pyright: ignore[reportUnknownVariableType]
+ content if isinstance(content, Sequence) and not isinstance(content, str) else [content]
+ )
+
+ sections.append(f'<{tag}>')
+ for item in items:
+ sections.append(item if isinstance(item, str | MultiModalContent) else _stringify(item))
+ sections.append(f'</{tag}>')
+ return sections
+
+
  def _build_prompt(
  output: Any,
  rubric: str,
  inputs: Any | None = None,
  expected_output: Any | None = None,
  ) -> str | Sequence[str | UserContent]:
- """Build a prompt that includes input, output, and rubric."""
+ """Build a prompt that includes input, output, expected output, and rubric."""
  sections: list[str | UserContent] = []
-
  if inputs is not None:
- if isinstance(inputs, str):
- sections.append(f'<Input>\n{inputs}\n</Input>')
- else:
- sections.append('<Input>\n')
- if isinstance(inputs, Sequence):
- for item in inputs: # type: ignore
- if isinstance(item, str | MultiModalContent):
- sections.append(item)
- else:
- sections.append(_stringify(item))
- elif isinstance(inputs, MultiModalContent):
- sections.append(inputs)
- else:
- sections.append(_stringify(inputs))
- sections.append('</Input>')
-
- sections.append(f'<Output>\n{_stringify(output)}\n</Output>')
- sections.append(f'<Rubric>\n{rubric}\n</Rubric>')
+ sections.extend(_make_section(inputs, 'Input'))

- if expected_output is not None:
- sections.append(f'<ExpectedOutput>\n{_stringify(expected_output)}\n</ExpectedOutput>')
+ sections.extend(_make_section(output, 'Output'))
+ sections.extend(_make_section(rubric, 'Rubric'))

- if inputs is None or isinstance(inputs, str):
- return '\n\n'.join(sections) # type: ignore[arg-type]
- else:
- return sections
+ if expected_output is not None:
+ sections.extend(_make_section(expected_output, 'ExpectedOutput'))
+ if all(isinstance(section, str) for section in sections):
+ return '\n'.join(sections) # type: ignore[arg-type]
+ return sections
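The refactor above replaces the bespoke `<Input>` branching with a single `_make_section` helper, so the judge prompt wraps input, output, rubric, and expected output in uniform XML-style tags, with multimodal items passed through unchanged. A text-only illustration of the resulting layout (this is not the package's helper, just the string case):

```python
# Text-only illustration of the tagged-section layout used by the judge prompt.
def make_section(content: str, tag: str) -> list[str]:
    return [f'<{tag}>', content, f'</{tag}>']


sections = [
    *make_section('What is the capital of France?', 'Input'),
    *make_section('Paris', 'Output'),
    *make_section('The answer is factually correct.', 'Rubric'),
]
print('\n'.join(sections))  # when every section is a string, the prompt is joined with newlines
```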
@@ -14,6 +14,7 @@ from pydantic import ValidationError
  from typing_extensions import TypeVar

  from pydantic_ai import Agent, models
+ from pydantic_ai._utils import strip_markdown_fences
  from pydantic_evals import Dataset
  from pydantic_evals.evaluators.evaluator import Evaluator

@@ -73,8 +74,9 @@ async def generate_dataset(
  )

  result = await agent.run(extra_instructions or 'Please generate the object.')
+ output = strip_markdown_fences(result.output)
  try:
- result = dataset_type.from_text(result.output, fmt='json', custom_evaluator_types=custom_evaluator_types)
+ result = dataset_type.from_text(output, fmt='json', custom_evaluator_types=custom_evaluator_types)
  except ValidationError as e: # pragma: no cover
  print(f'Raw response from model:\n{result.output}')
  raise e
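`generate_dataset` now runs the model output through `strip_markdown_fences` (from `pydantic_ai._utils`) before parsing it as JSON, so a response wrapped in a Markdown code fence (e.g. a json-labelled fence) no longer fails validation. A rough standalone sketch of the idea, not pydantic-ai's implementation:

```python
# Rough, standalone sketch of the idea (not pydantic-ai's implementation): drop a single
# surrounding markdown code fence so the remaining text can be parsed as JSON.
import re


def strip_fences(text: str) -> str:
    match = re.match(r'^\s*```[\w-]*\n(.*)\n```\s*$', text, re.DOTALL)
    return match.group(1) if match else text


assert strip_fences('```json\n{"cases": []}\n```') == '{"cases": []}'
assert strip_fences('{"cases": []}') == '{"cases": []}'
```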
@@ -241,7 +241,7 @@ class SpanNode:

  return self._matches_query(query)

- def _matches_query(self, query: SpanQuery) -> bool: # noqa C901
+ def _matches_query(self, query: SpanQuery) -> bool: # noqa: C901
  """Check if the span matches the query conditions."""
  # Logical combinations
  if or_ := query.get('or_'):
@@ -433,8 +433,8 @@ class SpanTree:
  You can then search or iterate the tree to make your assertions (using DFS for traversal).
  """

- roots: list[SpanNode] = field(default_factory=list)
- nodes_by_id: dict[str, SpanNode] = field(default_factory=dict)
+ roots: list[SpanNode] = field(default_factory=list[SpanNode])
+ nodes_by_id: dict[str, SpanNode] = field(default_factory=dict[str, SpanNode])

  # -------------------------------------------------------------------------
  # Construction
@@ -7,8 +7,10 @@ from io import StringIO
  from typing import Any, Generic, Literal, Protocol, cast

  from pydantic import BaseModel, TypeAdapter
- from rich.console import Console
+ from rich.console import Console, Group, RenderableType
+ from rich.panel import Panel
  from rich.table import Table
+ from rich.text import Text
  from typing_extensions import TypedDict, TypeVar

  from pydantic_evals._utils import UNSET, Unset
@@ -53,11 +55,11 @@ class ReportCase(Generic[InputsT, OutputT, MetadataT]):
  name: str
  """The name of the [case][pydantic_evals.Case]."""
  inputs: InputsT
- """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
+ """The inputs to the task, from [`Case.inputs`][pydantic_evals.dataset.Case.inputs]."""
  metadata: MetadataT | None
- """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
+ """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.dataset.Case.metadata]."""
  expected_output: OutputT | None
- """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
+ """The expected output of the task, from [`Case.expected_output`][pydantic_evals.dataset.Case.expected_output]."""
  output: OutputT
  """The output of the task execution."""

@@ -76,7 +78,7 @@ class ReportCase(Generic[InputsT, OutputT, MetadataT]):
  span_id: str | None = None
  """The span ID of the case span."""

- evaluator_failures: list[EvaluatorFailure] = field(default_factory=list)
+ evaluator_failures: list[EvaluatorFailure] = field(default_factory=list[EvaluatorFailure])


  @dataclass(kw_only=True)
@@ -86,11 +88,11 @@ class ReportCaseFailure(Generic[InputsT, OutputT, MetadataT]):
  name: str
  """The name of the [case][pydantic_evals.Case]."""
  inputs: InputsT
- """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
+ """The inputs to the task, from [`Case.inputs`][pydantic_evals.dataset.Case.inputs]."""
  metadata: MetadataT | None
- """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
+ """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.dataset.Case.metadata]."""
  expected_output: OutputT | None
- """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
+ """The expected output of the task, from [`Case.expected_output`][pydantic_evals.dataset.Case.expected_output]."""

  error_message: str
  """The message of the exception that caused the failure."""
@@ -193,9 +195,13 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):

  cases: list[ReportCase[InputsT, OutputT, MetadataT]]
  """The cases in the report."""
- failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(default_factory=list)
+ failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(
+ default_factory=list[ReportCaseFailure[InputsT, OutputT, MetadataT]]
+ )
  """The failures in the report. These are cases where task execution raised an exception."""

+ experiment_metadata: dict[str, Any] | None = None
+ """Metadata associated with the specific experiment represented by this report."""
  trace_id: str | None = None
  """The trace ID of the evaluation."""
  span_id: str | None = None
@@ -206,11 +212,69 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
  return ReportCaseAggregate.average(self.cases)
  return None

+ def render(
+ self,
+ width: int | None = None,
+ baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
+ *,
+ include_input: bool = False,
+ include_metadata: bool = False,
+ include_expected_output: bool = False,
+ include_output: bool = False,
+ include_durations: bool = True,
+ include_total_duration: bool = False,
+ include_removed_cases: bool = False,
+ include_averages: bool = True,
+ include_errors: bool = True,
+ include_error_stacktrace: bool = False,
+ include_evaluator_failures: bool = True,
+ input_config: RenderValueConfig | None = None,
+ metadata_config: RenderValueConfig | None = None,
+ output_config: RenderValueConfig | None = None,
+ score_configs: dict[str, RenderNumberConfig] | None = None,
+ label_configs: dict[str, RenderValueConfig] | None = None,
+ metric_configs: dict[str, RenderNumberConfig] | None = None,
+ duration_config: RenderNumberConfig | None = None,
+ include_reasons: bool = False,
+ ) -> str:
+ """Render this report to a nicely-formatted string, optionally comparing it to a baseline report.
+
+ If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
+ """
+ io_file = StringIO()
+ console = Console(width=width, file=io_file)
+ self.print(
+ width=width,
+ baseline=baseline,
+ console=console,
+ include_input=include_input,
+ include_metadata=include_metadata,
+ include_expected_output=include_expected_output,
+ include_output=include_output,
+ include_durations=include_durations,
+ include_total_duration=include_total_duration,
+ include_removed_cases=include_removed_cases,
+ include_averages=include_averages,
+ include_errors=include_errors,
+ include_error_stacktrace=include_error_stacktrace,
+ include_evaluator_failures=include_evaluator_failures,
+ input_config=input_config,
+ metadata_config=metadata_config,
+ output_config=output_config,
+ score_configs=score_configs,
+ label_configs=label_configs,
+ metric_configs=metric_configs,
+ duration_config=duration_config,
+ include_reasons=include_reasons,
+ )
+ return io_file.getvalue()
+
  def print(
  self,
  width: int | None = None,
  baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
  *,
+ console: Console | None = None,
  include_input: bool = False,
  include_metadata: bool = False,
  include_expected_output: bool = False,
230
294
  metric_configs: dict[str, RenderNumberConfig] | None = None,
231
295
  duration_config: RenderNumberConfig | None = None,
232
296
  include_reasons: bool = False,
233
- ): # pragma: no cover
297
+ ) -> None:
234
298
  """Print this report to the console, optionally comparing it to a baseline report.
235
299
 
236
300
  If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
237
301
  """
238
- table = self.console_table(
302
+ if console is None: # pragma: no branch
303
+ console = Console(width=width)
304
+
305
+ metadata_panel = self._metadata_panel(baseline=baseline)
306
+ renderable: RenderableType = self.console_table(
239
307
  baseline=baseline,
240
308
  include_input=include_input,
241
309
  include_metadata=include_metadata,
@@ -254,10 +322,13 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
  metric_configs=metric_configs,
  duration_config=duration_config,
  include_reasons=include_reasons,
+ with_title=not metadata_panel,
  )
- console = Console(width=width)
- console.print(table)
- if include_errors and self.failures:
+ # Wrap table with experiment metadata panel if present
+ if metadata_panel:
+ renderable = Group(metadata_panel, renderable)
+ console.print(renderable)
+ if include_errors and self.failures: # pragma: no cover
  failures_table = self.failures_table(
  include_input=include_input,
  include_metadata=include_metadata,
@@ -269,6 +340,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
  )
  console.print(failures_table, style='red')

+ # TODO(DavidM): in v2, change the return type here to RenderableType
  def console_table(
  self,
  baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
@@ -290,9 +362,11 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
  metric_configs: dict[str, RenderNumberConfig] | None = None,
  duration_config: RenderNumberConfig | None = None,
  include_reasons: bool = False,
+ with_title: bool = True,
  ) -> Table:
- """Return a table containing the data from this report, or the diff between this report and a baseline report.
+ """Return a table containing the data from this report.

+ If a baseline is provided, returns a diff between this report and the baseline report.
  Optionally include input and output details.
  """
  renderer = EvaluationRenderer(
@@ -317,10 +391,82 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
  include_reasons=include_reasons,
  )
  if baseline is None:
- return renderer.build_table(self)
- else: # pragma: no cover
- return renderer.build_diff_table(self, baseline)
+ return renderer.build_table(self, with_title=with_title)
+ else:
+ return renderer.build_diff_table(self, baseline, with_title=with_title)
+
+ def _metadata_panel(
+ self, baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None
+ ) -> RenderableType | None:
+ """Wrap a table with an experiment metadata panel if metadata exists.
+
+ Args:
+ table: The table to wrap
+ baseline: Optional baseline report for diff metadata
+
+ Returns:
+ Either the table unchanged or a Group with Panel and Table
+ """
+ if baseline is None:
+ # Single report - show metadata if present
+ if self.experiment_metadata:
+ metadata_text = Text()
+ items = list(self.experiment_metadata.items())
+ for i, (key, value) in enumerate(items):
+ metadata_text.append(f'{key}: {value}', style='dim')
+ if i < len(items) - 1:
+ metadata_text.append('\n')
+ return Panel(
+ metadata_text,
+ title=f'Evaluation Summary: {self.name}',
+ title_align='left',
+ border_style='dim',
+ padding=(0, 1),
+ expand=False,
+ )
+ else:
+ # Diff report - show metadata diff if either has metadata
+ if self.experiment_metadata or baseline.experiment_metadata:
+ diff_name = baseline.name if baseline.name == self.name else f'{baseline.name} → {self.name}'
+ metadata_text = Text()
+ lines_styles: list[tuple[str, str]] = []
+ if baseline.experiment_metadata and self.experiment_metadata:
+ # Collect all keys from both
+ all_keys = sorted(set(baseline.experiment_metadata.keys()) | set(self.experiment_metadata.keys()))
+ for key in all_keys:
+ baseline_val = baseline.experiment_metadata.get(key)
+ report_val = self.experiment_metadata.get(key)
+ if baseline_val == report_val:
+ lines_styles.append((f'{key}: {report_val}', 'dim'))
+ elif baseline_val is None:
+ lines_styles.append((f'+ {key}: {report_val}', 'green'))
+ elif report_val is None:
+ lines_styles.append((f'- {key}: {baseline_val}', 'red'))
+ else:
+ lines_styles.append((f'{key}: {baseline_val} → {report_val}', 'yellow'))
+ elif self.experiment_metadata:
+ lines_styles = [(f'+ {k}: {v}', 'green') for k, v in self.experiment_metadata.items()]
+ else: # baseline.experiment_metadata only
+ assert baseline.experiment_metadata is not None
+ lines_styles = [(f'- {k}: {v}', 'red') for k, v in baseline.experiment_metadata.items()]
+
+ for i, (line, style) in enumerate(lines_styles):
+ metadata_text.append(line, style=style)
+ if i < len(lines_styles) - 1:
+ metadata_text.append('\n')
+
+ return Panel(
+ metadata_text,
+ title=f'Evaluation Diff: {diff_name}',
+ title_align='left',
+ border_style='dim',
+ padding=(0, 1),
+ expand=False,
+ )
+
+ return None

+ # TODO(DavidM): in v2, change the return type here to RenderableType
  def failures_table(
  self,
  *,
@@ -358,10 +504,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):

  def __str__(self) -> str: # pragma: lax no cover
  """Return a string representation of the report."""
- table = self.console_table()
- io_file = StringIO()
- Console(file=io_file).print(table)
- return io_file.getvalue()
+ return self.render()


  EvaluationReportAdapter = TypeAdapter(EvaluationReport[Any, Any, Any])
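`__str__` now delegates to the new `render()` method, which returns the same formatted summary that `print()` writes to the console; `print()` additionally accepts an injected `Console` and prefixes the table with an experiment-metadata panel when metadata is present. A usage sketch with a throwaway dataset (the task and metadata are placeholders):

```python
# Sketch of the new rendering APIs; the dataset, task, and metadata are placeholders.
from rich.console import Console

from pydantic_evals import Case, Dataset


async def add(pair: tuple[int, int]) -> int:
    return pair[0] + pair[1]


dataset = Dataset(cases=[Case(name='add', inputs=(1, 2), expected_output=3)])
report = dataset.evaluate_sync(add, metadata={'prompt_version': 'v3'})

text = report.render(width=120)           # same content print() writes, returned as a string
assert 'Evaluation Summary' in text

report.print(console=Console(width=120))  # print() now accepts an injected Console
# report.print(baseline=other_report)     # with a baseline, renders a diff plus a metadata diff panel
```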
@@ -647,6 +790,7 @@ class ReportCaseRenderer:
  metric_renderers: Mapping[str, _NumberRenderer]
  duration_renderer: _NumberRenderer

+ # TODO(DavidM): in v2, change the return type here to RenderableType
  def build_base_table(self, title: str) -> Table:
  """Build and return a Rich Table for the diff output."""
  table = Table(title=title, show_lines=True)
@@ -673,6 +817,7 @@ class ReportCaseRenderer:
  table.add_column('Durations' if self.include_total_duration else 'Duration', justify='right')
  return table

+ # TODO(DavidM): in v2, change the return type here to RenderableType
  def build_failures_table(self, title: str) -> Table:
  """Build and return a Rich Table for the failures output."""
  table = Table(title=title, show_lines=True)
@@ -1132,9 +1277,22 @@ class EvaluationRenderer:
  duration_renderer=duration_renderer,
  )

- def build_table(self, report: EvaluationReport) -> Table:
+ # TODO(DavidM): in v2, change the return type here to RenderableType
+ def build_table(self, report: EvaluationReport, *, with_title: bool = True) -> Table:
+ """Build a table for the report.
+
+ Args:
+ report: The evaluation report to render
+ with_title: Whether to include the title in the table (default True)
+
+ Returns:
+ A Rich Table object
+ """
  case_renderer = self._get_case_renderer(report)
- table = case_renderer.build_base_table(f'Evaluation Summary: {report.name}')
+
+ title = f'Evaluation Summary: {report.name}' if with_title else ''
+ table = case_renderer.build_base_table(title)
+
  for case in report.cases:
  table.add_row(*case_renderer.build_row(case))

@@ -1145,7 +1303,20 @@ class EvaluationRenderer:

  return table

- def build_diff_table(self, report: EvaluationReport, baseline: EvaluationReport) -> Table:
+ # TODO(DavidM): in v2, change the return type here to RenderableType
+ def build_diff_table(
+ self, report: EvaluationReport, baseline: EvaluationReport, *, with_title: bool = True
+ ) -> Table:
+ """Build a diff table comparing report to baseline.
+
+ Args:
+ report: The evaluation report to compare
+ baseline: The baseline report to compare against
+ with_title: Whether to include the title in the table (default True)
+
+ Returns:
+ A Rich Table object
+ """
  report_cases = report.cases
  baseline_cases = self._baseline_cases_to_include(report, baseline)

@@ -1170,7 +1341,10 @@ class EvaluationRenderer:

  case_renderer = self._get_case_renderer(report, baseline)
  diff_name = baseline.name if baseline.name == report.name else f'{baseline.name} → {report.name}'
- table = case_renderer.build_base_table(f'Evaluation Diff: {diff_name}')
+
+ title = f'Evaluation Diff: {diff_name}' if with_title else ''
+ table = case_renderer.build_base_table(title)
+
  for baseline_case, new_case in diff_cases:
  table.add_row(*case_renderer.build_diff_row(new_case, baseline_case))
  for case in added_cases:
@@ -1189,6 +1363,7 @@ class EvaluationRenderer:

  return table

+ # TODO(DavidM): in v2, change the return type here to RenderableType
  def build_failures_table(self, report: EvaluationReport) -> Table:
  case_renderer = self._get_case_renderer(report)
  table = case_renderer.build_failures_table('Case Failures')
@@ -1,19 +0,0 @@
- """A toolkit for evaluating the execution of arbitrary "stochastic functions", such as LLM calls.
-
- This package provides functionality for:
- - Creating and loading test datasets with structured inputs and outputs
- - Evaluating model performance using various metrics and evaluators
- - Generating reports for evaluation results
-
- TODO(DavidM): Implement serialization of reports for later comparison, and add git hashes etc.
- Note: I made pydantic_ai.evals.reports.EvalReport a BaseModel specifically to make this easier
- TODO(DavidM): Add commit hash, timestamp, and other metadata to reports (like pytest-speed does), possibly in a dedicated struct
- TODO(DavidM): Implement a CLI with some pytest-like filtering API to make it easier to run only specific cases
- """
-
- from .dataset import Case, Dataset
-
- __all__ = (
- 'Case',
- 'Dataset',
- )