pydantic-evals 1.0.14__tar.gz → 1.22.0__tar.gz
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/.gitignore +3 -1
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/PKG-INFO +2 -2
- pydantic_evals-1.22.0/pydantic_evals/__init__.py +16 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/dataset.py +30 -9
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/evaluators/llm_as_a_judge.py +1 -1
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/generation.py +3 -1
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/reporting/__init__.py +191 -18
- pydantic_evals-1.0.14/pydantic_evals/__init__.py +0 -19
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/LICENSE +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/README.md +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/_utils.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/evaluators/__init__.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/evaluators/common.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/evaluators/context.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/evaluators/evaluator.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/evaluators/spec.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/otel/__init__.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/otel/_context_subtree.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/otel/_errors.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/otel/span_tree.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/py.typed +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pydantic_evals/reporting/render_numbers.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.22.0}/pyproject.toml +0 -0

--- pydantic_evals-1.0.14/.gitignore
+++ pydantic_evals-1.22.0/.gitignore
@@ -10,7 +10,7 @@ env*/
 /TODO.md
 /postgres-data/
 .DS_Store
-
+.chat_app_messages.sqlite
 .cache/
 .vscode/
 /question_graph_history.json
@@ -21,3 +21,5 @@ node_modules/
 /test_tmp/
 .mcp.json
 .claude/
+/.cursor/
+/.devcontainer/

--- pydantic_evals-1.0.14/PKG-INFO
+++ pydantic_evals-1.22.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.0.14
+Version: 1.22.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.0.14
+Requires-Dist: pydantic-ai-slim==1.22.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4

--- /dev/null
+++ pydantic_evals-1.22.0/pydantic_evals/__init__.py
@@ -0,0 +1,16 @@
+"""A toolkit for evaluating the execution of arbitrary "stochastic functions", such as LLM calls.
+
+This package provides functionality for:
+- Creating and loading test datasets with structured inputs and outputs
+- Evaluating model performance using various metrics and evaluators
+- Generating reports for evaluation results
+"""
+
+from .dataset import Case, Dataset, increment_eval_metric, set_eval_attribute
+
+__all__ = (
+    'Case',
+    'Dataset',
+    'increment_eval_metric',
+    'set_eval_attribute',
+)
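
The new top-level module re-exports increment_eval_metric and set_eval_attribute alongside Case and Dataset. As a rough usage sketch (the call signatures below are assumptions based on the names and the pydantic-evals documentation, not something this diff confirms), a task can record extra metrics and attributes against the case currently being evaluated:

    from pydantic_evals import Case, Dataset, increment_eval_metric, set_eval_attribute

    def answer_question(question: str) -> str:
        # Assumed signatures: add to a numeric metric and set an attribute
        # on the case currently being evaluated.
        increment_eval_metric('tool_calls', 1)
        set_eval_attribute('question_length', len(question))
        return 'Paris' if 'capital of France' in question else 'unknown'

    dataset = Dataset(
        cases=[Case(name='capital', inputs='What is the capital of France?', expected_output='Paris')]
    )
    report = dataset.evaluate_sync(answer_question)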

--- pydantic_evals-1.0.14/pydantic_evals/dataset.py
+++ pydantic_evals-1.22.0/pydantic_evals/dataset.py
@@ -265,6 +265,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         retry_evaluators: RetryConfig | None = None,
         *,
         task_name: str | None = None,
+        metadata: dict[str, Any] | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -283,6 +284,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             retry_evaluators: Optional retry configuration for evaluator execution.
             task_name: Optional override to the name of the task being executed, otherwise the name of the task
                 function will be used.
+            metadata: Optional dict of experiment metadata.

         Returns:
             A report containing the results of the evaluation.
@@ -294,6 +296,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a

         limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()

+        extra_attributes: dict[str, Any] = {'gen_ai.operation.name': 'experiment'}
+        if metadata is not None:
+            extra_attributes['metadata'] = metadata
         with (
             logfire_span(
                 'evaluate {name}',
@@ -301,7 +306,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                 task_name=task_name,
                 dataset_name=self.name,
                 n_cases=len(self.cases),
-                **
+                **extra_attributes,
             ) as eval_span,
             progress_bar or nullcontext(),
         ):
@@ -339,11 +344,18 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                 name=name,
                 cases=cases,
                 failures=failures,
+                experiment_metadata=metadata,
                 span_id=span_id,
                 trace_id=trace_id,
             )
-
-
+            full_experiment_metadata: dict[str, Any] = {'n_cases': len(self.cases)}
+            if metadata is not None:
+                full_experiment_metadata['metadata'] = metadata
+            if (averages := report.averages()) is not None:
+                full_experiment_metadata['averages'] = averages
+                if averages.assertions is not None:
+                    eval_span.set_attribute('assertion_pass_rate', averages.assertions)
+            eval_span.set_attribute('logfire.experiment.metadata', full_experiment_metadata)
         return report

     def evaluate_sync(
@@ -354,6 +366,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         progress: bool = True,
         retry_task: RetryConfig | None = None,
         retry_evaluators: RetryConfig | None = None,
+        *,
+        task_name: str | None = None,
+        metadata: dict[str, Any] | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -362,13 +377,16 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         Args:
             task: The task to evaluate. This should be a callable that takes the inputs of the case
                 and returns the output.
-            name: The name of the
-                If omitted, the name of the task function
+            name: The name of the experiment being run, this is used to identify the experiment in the report.
+                If omitted, the task_name will be used; if that is not specified, the name of the task function is used.
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
-            progress: Whether to show a progress bar for the evaluation. Defaults to True
+            progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
             retry_task: Optional retry configuration for the task execution.
             retry_evaluators: Optional retry configuration for evaluator execution.
+            task_name: Optional override to the name of the task being executed, otherwise the name of the task
+                function will be used.
+            metadata: Optional dict of experiment metadata.

         Returns:
             A report containing the results of the evaluation.
@@ -376,11 +394,13 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         return get_event_loop().run_until_complete(
             self.evaluate(
                 task,
-
+                name=name,
                 max_concurrency=max_concurrency,
                 progress=progress,
                 retry_task=retry_task,
                 retry_evaluators=retry_evaluators,
+                task_name=task_name,
+                metadata=metadata,
             )
         )

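The hunks above add a metadata keyword to Dataset.evaluate and Dataset.evaluate_sync, store it on the report as experiment_metadata, and attach it to the evaluation span under logfire.experiment.metadata. A minimal sketch of how the new keyword might be used; the task and cases below are made up for illustration:

    from pydantic_evals import Case, Dataset

    def classify(text: str) -> str:
        # Made-up task, used only to illustrate the new keyword arguments.
        return 'positive' if 'good' in text else 'negative'

    dataset = Dataset(cases=[Case(name='simple', inputs='a good day', expected_output='positive')])

    report = dataset.evaluate_sync(
        classify,
        task_name='sentiment-classifier',                   # optional override of the task name
        metadata={'model': 'gpt-4o', 'prompt_version': 3},   # new in 1.22.0
    )
    print(report.experiment_metadata)  # {'model': 'gpt-4o', 'prompt_version': 3}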
Still in pydantic_evals/dataset.py:

@@ -646,7 +666,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a

         context: dict[str, Any] = {'use_short_form': True}
         if fmt == 'yaml':
-            dumped_data = self.model_dump(mode='json', by_alias=True,
+            dumped_data = self.model_dump(mode='json', by_alias=True, context=context)
             content = yaml.dump(dumped_data, sort_keys=False)
             if schema_ref:  # pragma: no branch
                 yaml_language_server_line = f'{_YAML_SCHEMA_LINE_PREFIX}{schema_ref}'
@@ -654,7 +674,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             path.write_text(content)
         else:
             context['$schema'] = schema_ref
-            json_data = self.model_dump_json(indent=2, by_alias=True,
+            json_data = self.model_dump_json(indent=2, by_alias=True, context=context)
             path.write_text(json_data + '\n')

     @classmethod
@@ -724,6 +744,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                 evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007

         class Dataset(BaseModel, extra='forbid'):
+            name: str | None = None
             cases: list[Case]
             if evaluator_schema_types:  # pragma: no branch
                 evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007
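These two hunks pass the serialization context through to model_dump and model_dump_json when a dataset is written to disk, so the short-form representation and the $schema reference are actually applied. A hedged sketch of the call site this affects; keyword arguments beyond the path are omitted because they are not shown in this diff:

    from pathlib import Path

    from pydantic_evals import Case, Dataset

    dataset = Dataset(cases=[Case(name='one', inputs={'x': 1}, expected_output={'y': 2})])

    # Writing YAML (or JSON) now applies the short-form serialization context when dumping
    # cases and evaluators, per the hunks above.
    dataset.to_file(Path('cases.yaml'))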

--- pydantic_evals-1.0.14/pydantic_evals/evaluators/llm_as_a_judge.py
+++ pydantic_evals-1.22.0/pydantic_evals/evaluators/llm_as_a_judge.py
@@ -201,7 +201,7 @@ async def judge_output_expected(
     ).output


-def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:
+def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:
     """Set the default model used for judging.

     This model is used if `None` is passed to the `model` argument of `judge_output` and `judge_input_output`.
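For reference, a minimal sketch of calling set_default_judge_model with a known model name string; the import path follows the file location shown above:

    from pydantic_evals.evaluators.llm_as_a_judge import set_default_judge_model

    # Used whenever judge_output / judge_input_output are called with model=None.
    set_default_judge_model('openai:gpt-4o')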

--- pydantic_evals-1.0.14/pydantic_evals/generation.py
+++ pydantic_evals-1.22.0/pydantic_evals/generation.py
@@ -14,6 +14,7 @@ from pydantic import ValidationError
 from typing_extensions import TypeVar

 from pydantic_ai import Agent, models
+from pydantic_ai._utils import strip_markdown_fences
 from pydantic_evals import Dataset
 from pydantic_evals.evaluators.evaluator import Evaluator

@@ -73,8 +74,9 @@ async def generate_dataset(
     )

     result = await agent.run(extra_instructions or 'Please generate the object.')
+    output = strip_markdown_fences(result.output)
     try:
-        result = dataset_type.from_text(
+        result = dataset_type.from_text(output, fmt='json', custom_evaluator_types=custom_evaluator_types)
     except ValidationError as e:  # pragma: no cover
         print(f'Raw response from model:\n{result.output}')
         raise e
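generate_dataset now strips Markdown code fences from the model output before parsing it as JSON. A rough usage sketch follows; the dataset_type and n_examples parameter names are taken from the pydantic-evals documentation rather than from this diff, so treat them as assumptions:

    import asyncio
    from pathlib import Path

    from pydantic import BaseModel

    from pydantic_evals import Dataset
    from pydantic_evals.generation import generate_dataset

    class Inputs(BaseModel):
        question: str

    class Output(BaseModel):
        answer: str

    class Metadata(BaseModel):
        difficulty: str

    async def main() -> None:
        # Requires a configured LLM provider (e.g. an OPENAI_API_KEY) at runtime.
        dataset = await generate_dataset(
            dataset_type=Dataset[Inputs, Output, Metadata],  # assumed parameter name
            n_examples=3,                                    # assumed parameter name
            extra_instructions='Questions should be about European capitals.',
        )
        dataset.to_file(Path('generated_cases.yaml'))

    asyncio.run(main())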

--- pydantic_evals-1.0.14/pydantic_evals/reporting/__init__.py
+++ pydantic_evals-1.22.0/pydantic_evals/reporting/__init__.py
@@ -7,8 +7,10 @@ from io import StringIO
 from typing import Any, Generic, Literal, Protocol, cast

 from pydantic import BaseModel, TypeAdapter
-from rich.console import Console
+from rich.console import Console, Group, RenderableType
+from rich.panel import Panel
 from rich.table import Table
+from rich.text import Text
 from typing_extensions import TypedDict, TypeVar

 from pydantic_evals._utils import UNSET, Unset
@@ -196,6 +198,8 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
     failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(default_factory=list)
     """The failures in the report. These are cases where task execution raised an exception."""

+    experiment_metadata: dict[str, Any] | None = None
+    """Metadata associated with the specific experiment represented by this report."""
     trace_id: str | None = None
     """The trace ID of the evaluation."""
     span_id: str | None = None
@@ -206,11 +210,69 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             return ReportCaseAggregate.average(self.cases)
         return None

+    def render(
+        self,
+        width: int | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
+        *,
+        include_input: bool = False,
+        include_metadata: bool = False,
+        include_expected_output: bool = False,
+        include_output: bool = False,
+        include_durations: bool = True,
+        include_total_duration: bool = False,
+        include_removed_cases: bool = False,
+        include_averages: bool = True,
+        include_errors: bool = True,
+        include_error_stacktrace: bool = False,
+        include_evaluator_failures: bool = True,
+        input_config: RenderValueConfig | None = None,
+        metadata_config: RenderValueConfig | None = None,
+        output_config: RenderValueConfig | None = None,
+        score_configs: dict[str, RenderNumberConfig] | None = None,
+        label_configs: dict[str, RenderValueConfig] | None = None,
+        metric_configs: dict[str, RenderNumberConfig] | None = None,
+        duration_config: RenderNumberConfig | None = None,
+        include_reasons: bool = False,
+    ) -> str:
+        """Render this report to a nicely-formatted string, optionally comparing it to a baseline report.
+
+        If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
+        """
+        io_file = StringIO()
+        console = Console(width=width, file=io_file)
+        self.print(
+            width=width,
+            baseline=baseline,
+            console=console,
+            include_input=include_input,
+            include_metadata=include_metadata,
+            include_expected_output=include_expected_output,
+            include_output=include_output,
+            include_durations=include_durations,
+            include_total_duration=include_total_duration,
+            include_removed_cases=include_removed_cases,
+            include_averages=include_averages,
+            include_errors=include_errors,
+            include_error_stacktrace=include_error_stacktrace,
+            include_evaluator_failures=include_evaluator_failures,
+            input_config=input_config,
+            metadata_config=metadata_config,
+            output_config=output_config,
+            score_configs=score_configs,
+            label_configs=label_configs,
+            metric_configs=metric_configs,
+            duration_config=duration_config,
+            include_reasons=include_reasons,
+        )
+        return io_file.getvalue()
+
     def print(
         self,
         width: int | None = None,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         *,
+        console: Console | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
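The new render method captures the same output that print produces into a string, by printing to a rich Console backed by a StringIO. A short sketch of the difference, using a made-up dataset:

    from pydantic_evals import Case, Dataset

    def double(x: int) -> int:
        return x * 2

    dataset = Dataset(cases=[Case(name='one', inputs=2, expected_output=4)])
    report = dataset.evaluate_sync(double)

    # Print directly to the terminal, as before:
    report.print(include_input=True, include_output=True)

    # New in 1.22.0: capture the same rendering as a string, e.g. for logging or snapshot tests.
    text = report.render(width=120, include_input=True, include_output=True)
    assert isinstance(text, str)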
Still in pydantic_evals/reporting/__init__.py:

@@ -230,12 +292,16 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
         include_reasons: bool = False,
-    )
+    ) -> None:
         """Print this report to the console, optionally comparing it to a baseline report.

         If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
         """
-
+        if console is None:  # pragma: no branch
+            console = Console(width=width)
+
+        metadata_panel = self._metadata_panel(baseline=baseline)
+        renderable: RenderableType = self.console_table(
             baseline=baseline,
             include_input=include_input,
             include_metadata=include_metadata,
@@ -254,10 +320,13 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             metric_configs=metric_configs,
             duration_config=duration_config,
             include_reasons=include_reasons,
+            with_title=not metadata_panel,
         )
-
-
-
+        # Wrap table with experiment metadata panel if present
+        if metadata_panel:
+            renderable = Group(metadata_panel, renderable)
+        console.print(renderable)
+        if include_errors and self.failures:  # pragma: no cover
             failures_table = self.failures_table(
                 include_input=include_input,
                 include_metadata=include_metadata,
@@ -269,6 +338,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             )
             console.print(failures_table, style='red')

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def console_table(
         self,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
@@ -290,9 +360,11 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
         include_reasons: bool = False,
+        with_title: bool = True,
     ) -> Table:
-        """Return a table containing the data from this report
+        """Return a table containing the data from this report.

+        If a baseline is provided, returns a diff between this report and the baseline report.
         Optionally include input and output details.
         """
         renderer = EvaluationRenderer(
@@ -317,10 +389,82 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             include_reasons=include_reasons,
         )
         if baseline is None:
-            return renderer.build_table(self)
-        else:
-            return renderer.build_diff_table(self, baseline)
+            return renderer.build_table(self, with_title=with_title)
+        else:
+            return renderer.build_diff_table(self, baseline, with_title=with_title)
+
+    def _metadata_panel(
+        self, baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None
+    ) -> RenderableType | None:
+        """Wrap a table with an experiment metadata panel if metadata exists.
+
+        Args:
+            table: The table to wrap
+            baseline: Optional baseline report for diff metadata
+
+        Returns:
+            Either the table unchanged or a Group with Panel and Table
+        """
+        if baseline is None:
+            # Single report - show metadata if present
+            if self.experiment_metadata:
+                metadata_text = Text()
+                items = list(self.experiment_metadata.items())
+                for i, (key, value) in enumerate(items):
+                    metadata_text.append(f'{key}: {value}', style='dim')
+                    if i < len(items) - 1:
+                        metadata_text.append('\n')
+                return Panel(
+                    metadata_text,
+                    title=f'Evaluation Summary: {self.name}',
+                    title_align='left',
+                    border_style='dim',
+                    padding=(0, 1),
+                    expand=False,
+                )
+        else:
+            # Diff report - show metadata diff if either has metadata
+            if self.experiment_metadata or baseline.experiment_metadata:
+                diff_name = baseline.name if baseline.name == self.name else f'{baseline.name} → {self.name}'
+                metadata_text = Text()
+                lines_styles: list[tuple[str, str]] = []
+                if baseline.experiment_metadata and self.experiment_metadata:
+                    # Collect all keys from both
+                    all_keys = sorted(set(baseline.experiment_metadata.keys()) | set(self.experiment_metadata.keys()))
+                    for key in all_keys:
+                        baseline_val = baseline.experiment_metadata.get(key)
+                        report_val = self.experiment_metadata.get(key)
+                        if baseline_val == report_val:
+                            lines_styles.append((f'{key}: {report_val}', 'dim'))
+                        elif baseline_val is None:
+                            lines_styles.append((f'+ {key}: {report_val}', 'green'))
+                        elif report_val is None:
+                            lines_styles.append((f'- {key}: {baseline_val}', 'red'))
+                        else:
+                            lines_styles.append((f'{key}: {baseline_val} → {report_val}', 'yellow'))
+                elif self.experiment_metadata:
+                    lines_styles = [(f'+ {k}: {v}', 'green') for k, v in self.experiment_metadata.items()]
+                else:  # baseline.experiment_metadata only
+                    assert baseline.experiment_metadata is not None
+                    lines_styles = [(f'- {k}: {v}', 'red') for k, v in baseline.experiment_metadata.items()]
+
+                for i, (line, style) in enumerate(lines_styles):
+                    metadata_text.append(line, style=style)
+                    if i < len(lines_styles) - 1:
+                        metadata_text.append('\n')
+
+                return Panel(
+                    metadata_text,
+                    title=f'Evaluation Diff: {diff_name}',
+                    title_align='left',
+                    border_style='dim',
+                    padding=(0, 1),
+                    expand=False,
+                )
+
+        return None

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def failures_table(
         self,
         *,
@@ -358,10 +502,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):

     def __str__(self) -> str:  # pragma: lax no cover
         """Return a string representation of the report."""
-
-        io_file = StringIO()
-        Console(file=io_file).print(table)
-        return io_file.getvalue()
+        return self.render()


 EvaluationReportAdapter = TypeAdapter(EvaluationReport[Any, Any, Any])
@@ -647,6 +788,7 @@ class ReportCaseRenderer:
     metric_renderers: Mapping[str, _NumberRenderer]
     duration_renderer: _NumberRenderer

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_base_table(self, title: str) -> Table:
         """Build and return a Rich Table for the diff output."""
         table = Table(title=title, show_lines=True)
@@ -673,6 +815,7 @@ class ReportCaseRenderer:
         table.add_column('Durations' if self.include_total_duration else 'Duration', justify='right')
         return table

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_failures_table(self, title: str) -> Table:
         """Build and return a Rich Table for the failures output."""
         table = Table(title=title, show_lines=True)
@@ -1132,9 +1275,22 @@ class EvaluationRenderer:
             duration_renderer=duration_renderer,
         )

-
+    # TODO(DavidM): in v2, change the return type here to RenderableType
+    def build_table(self, report: EvaluationReport, *, with_title: bool = True) -> Table:
+        """Build a table for the report.
+
+        Args:
+            report: The evaluation report to render
+            with_title: Whether to include the title in the table (default True)
+
+        Returns:
+            A Rich Table object
+        """
         case_renderer = self._get_case_renderer(report)
-
+
+        title = f'Evaluation Summary: {report.name}' if with_title else ''
+        table = case_renderer.build_base_table(title)
+
         for case in report.cases:
             table.add_row(*case_renderer.build_row(case))

@@ -1145,7 +1301,20 @@ class EvaluationRenderer:

         return table

-
+    # TODO(DavidM): in v2, change the return type here to RenderableType
+    def build_diff_table(
+        self, report: EvaluationReport, baseline: EvaluationReport, *, with_title: bool = True
+    ) -> Table:
+        """Build a diff table comparing report to baseline.
+
+        Args:
+            report: The evaluation report to compare
+            baseline: The baseline report to compare against
+            with_title: Whether to include the title in the table (default True)
+
+        Returns:
+            A Rich Table object
+        """
         report_cases = report.cases
         baseline_cases = self._baseline_cases_to_include(report, baseline)

@@ -1170,7 +1339,10 @@ class EvaluationRenderer:

         case_renderer = self._get_case_renderer(report, baseline)
         diff_name = baseline.name if baseline.name == report.name else f'{baseline.name} → {report.name}'
-
+
+        title = f'Evaluation Diff: {diff_name}' if with_title else ''
+        table = case_renderer.build_base_table(title)
+
         for baseline_case, new_case in diff_cases:
             table.add_row(*case_renderer.build_diff_row(new_case, baseline_case))
         for case in added_cases:
@@ -1189,6 +1361,7 @@ class EvaluationRenderer:

         return table

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_failures_table(self, report: EvaluationReport) -> Table:
         case_renderer = self._get_case_renderer(report)
         table = case_renderer.build_failures_table('Case Failures')
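Taken together, the reporting changes let a report carry experiment metadata and show it in a panel above the results table, both for a single report and when diffing against a baseline. A hedged sketch of the comparison flow, again with made-up tasks:

    from pydantic_evals import Case, Dataset

    def old_task(x: int) -> int:
        return x + 1

    def new_task(x: int) -> int:
        return x + 2

    dataset = Dataset(cases=[Case(name='one', inputs=1)])

    baseline = dataset.evaluate_sync(old_task, metadata={'prompt_version': 1})
    candidate = dataset.evaluate_sync(new_task, metadata={'prompt_version': 2})

    # Per the _metadata_panel hunk above, a changed key is rendered as
    # "prompt_version: 1 → 2" in an "Evaluation Diff" panel above the comparison table.
    candidate.print(baseline=baseline)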

--- pydantic_evals-1.0.14/pydantic_evals/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-"""A toolkit for evaluating the execution of arbitrary "stochastic functions", such as LLM calls.
-
-This package provides functionality for:
-- Creating and loading test datasets with structured inputs and outputs
-- Evaluating model performance using various metrics and evaluators
-- Generating reports for evaluation results
-
-TODO(DavidM): Implement serialization of reports for later comparison, and add git hashes etc.
-Note: I made pydantic_ai.evals.reports.EvalReport a BaseModel specifically to make this easier
-TODO(DavidM): Add commit hash, timestamp, and other metadata to reports (like pytest-speed does), possibly in a dedicated struct
-TODO(DavidM): Implement a CLI with some pytest-like filtering API to make it easier to run only specific cases
-"""
-
-from .dataset import Case, Dataset
-
-__all__ = (
-    'Case',
-    'Dataset',
-)