pydantic-evals 1.1.0__tar.gz → 1.10.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/.gitignore +1 -1
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/PKG-INFO +2 -2
- pydantic_evals-1.10.0/pydantic_evals/__init__.py +16 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/dataset.py +27 -7
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/evaluators/llm_as_a_judge.py +1 -1
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/generation.py +3 -1
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/reporting/__init__.py +131 -16
- pydantic_evals-1.1.0/pydantic_evals/__init__.py +0 -19
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/LICENSE +0 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/README.md +0 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/_utils.py +0 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/evaluators/__init__.py +0 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/evaluators/common.py +0 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/evaluators/context.py +0 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/evaluators/evaluator.py +0 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/evaluators/spec.py +0 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/otel/__init__.py +0 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/otel/_context_subtree.py +0 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/otel/_errors.py +0 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/otel/span_tree.py +0 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/py.typed +0 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/reporting/render_numbers.py +0 -0
- {pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pyproject.toml +0 -0
{pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.1.0
+Version: 1.10.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai

@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.1.0
+Requires-Dist: pydantic-ai-slim==1.10.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
pydantic_evals-1.10.0/pydantic_evals/__init__.py (new file)

@@ -0,0 +1,16 @@
+"""A toolkit for evaluating the execution of arbitrary "stochastic functions", such as LLM calls.
+
+This package provides functionality for:
+- Creating and loading test datasets with structured inputs and outputs
+- Evaluating model performance using various metrics and evaluators
+- Generating reports for evaluation results
+"""
+
+from .dataset import Case, Dataset, increment_eval_metric, set_eval_attribute
+
+__all__ = (
+    'Case',
+    'Dataset',
+    'increment_eval_metric',
+    'set_eval_attribute',
+)
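For orientation, the two new top-level exports let a task record custom metrics and attributes for the case currently being evaluated. A minimal sketch, assuming the exported signatures are `increment_eval_metric(name, amount)` and `set_eval_attribute(name, value)`; the task and case below are illustrative, not taken from the diff:

```python
from pydantic_evals import Case, Dataset, increment_eval_metric, set_eval_attribute


def classify(text: str) -> str:
    # Illustrative task: record a custom metric and attribute for the current case.
    increment_eval_metric('keyword_hits', text.count('good'))
    set_eval_attribute('input_length', len(text))
    return 'positive' if 'good' in text else 'negative'


dataset = Dataset(cases=[Case(name='simple', inputs='a good day', expected_output='positive')])
report = dataset.evaluate_sync(classify)
report.print()
```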
{pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/dataset.py

@@ -265,6 +265,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         retry_evaluators: RetryConfig | None = None,
         *,
         task_name: str | None = None,
+        metadata: dict[str, Any] | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -283,6 +284,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             retry_evaluators: Optional retry configuration for evaluator execution.
             task_name: Optional override to the name of the task being executed, otherwise the name of the task
                 function will be used.
+            metadata: Optional dict of experiment metadata.

         Returns:
             A report containing the results of the evaluation.

@@ -294,6 +296,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a

         limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()

+        extra_attributes: dict[str, Any] = {'gen_ai.operation.name': 'experiment'}
+        if metadata is not None:
+            extra_attributes['metadata'] = metadata
         with (
             logfire_span(
                 'evaluate {name}',

@@ -301,7 +306,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                 task_name=task_name,
                 dataset_name=self.name,
                 n_cases=len(self.cases),
-                **
+                **extra_attributes,
             ) as eval_span,
             progress_bar or nullcontext(),
         ):

@@ -339,11 +344,18 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                 name=name,
                 cases=cases,
                 failures=failures,
+                experiment_metadata=metadata,
                 span_id=span_id,
                 trace_id=trace_id,
             )
-
-
+            full_experiment_metadata: dict[str, Any] = {'n_cases': len(self.cases)}
+            if metadata is not None:
+                full_experiment_metadata['metadata'] = metadata
+            if (averages := report.averages()) is not None:
+                full_experiment_metadata['averages'] = averages
+                if averages.assertions is not None:
+                    eval_span.set_attribute('assertion_pass_rate', averages.assertions)
+            eval_span.set_attribute('logfire.experiment.metadata', full_experiment_metadata)
         return report

     def evaluate_sync(
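Taken together, these hunks add an optional `metadata` dict to `Dataset.evaluate`: it is attached to the returned report as `experiment_metadata` and included, along with `n_cases` and averaged scores, in the `logfire.experiment.metadata` span attribute. A hedged usage sketch; the task and metadata values are illustrative:

```python
import asyncio

from pydantic_evals import Case, Dataset


async def answer(question: str) -> str:
    # Stand-in for an LLM call.
    return 'Paris' if 'capital of France' in question else 'unknown'


dataset = Dataset(
    cases=[Case(name='capital', inputs='What is the capital of France?', expected_output='Paris')],
)

report = asyncio.run(
    dataset.evaluate(answer, metadata={'model': 'example-model', 'prompt_version': 3})
)
print(report.experiment_metadata)  # {'model': 'example-model', 'prompt_version': 3}
```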
{pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/dataset.py (continued)

@@ -354,6 +366,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         progress: bool = True,
         retry_task: RetryConfig | None = None,
         retry_evaluators: RetryConfig | None = None,
+        *,
+        task_name: str | None = None,
+        metadata: dict[str, Any] | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -362,13 +377,16 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         Args:
             task: The task to evaluate. This should be a callable that takes the inputs of the case
                 and returns the output.
-            name: The name of the
-                If omitted, the name of the task function
+            name: The name of the experiment being run, this is used to identify the experiment in the report.
+                If omitted, the task_name will be used; if that is not specified, the name of the task function is used.
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
-            progress: Whether to show a progress bar for the evaluation. Defaults to True
+            progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
             retry_task: Optional retry configuration for the task execution.
             retry_evaluators: Optional retry configuration for evaluator execution.
+            task_name: Optional override to the name of the task being executed, otherwise the name of the task
+                function will be used.
+            metadata: Optional dict of experiment metadata.

         Returns:
             A report containing the results of the evaluation.

@@ -376,11 +394,13 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         return get_event_loop().run_until_complete(
             self.evaluate(
                 task,
-
+                name=name,
                 max_concurrency=max_concurrency,
                 progress=progress,
                 retry_task=retry_task,
                 retry_evaluators=retry_evaluators,
+                task_name=task_name,
+                metadata=metadata,
             )
         )
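`evaluate_sync` gains the same keyword-only `task_name` and `metadata` parameters and simply forwards them, together with `name`, to `evaluate`. A minimal sketch with illustrative values:

```python
from pydantic_evals import Case, Dataset


def summarize(text: str) -> str:
    # Stand-in for the task under evaluation.
    return text[:20]


dataset = Dataset(cases=[Case(name='short', inputs='hello world')])

report = dataset.evaluate_sync(
    summarize,
    task_name='summarize-v2',               # overrides the function name used in the report/span
    metadata={'experiment': 'truncation'},  # surfaced as report.experiment_metadata
)
print(report.experiment_metadata)
```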
{pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/evaluators/llm_as_a_judge.py

@@ -201,7 +201,7 @@ async def judge_output_expected(
     ).output


-def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:
+def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:
     """Set the default model used for judging.

     This model is used if `None` is passed to the `model` argument of `judge_output` and `judge_input_output`.
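For context, `set_default_judge_model` sets the fallback model used by the LLM-judge helpers when no explicit `model` argument is passed. A short usage sketch; the model name here is illustrative:

```python
from pydantic_evals.evaluators.llm_as_a_judge import set_default_judge_model

# Any model instance or known model name accepted by pydantic-ai can be used here.
set_default_judge_model('openai:gpt-4o')
```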
{pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/generation.py

@@ -14,6 +14,7 @@ from pydantic import ValidationError
 from typing_extensions import TypeVar

 from pydantic_ai import Agent, models
+from pydantic_ai._utils import strip_markdown_fences
 from pydantic_evals import Dataset
 from pydantic_evals.evaluators.evaluator import Evaluator

@@ -73,8 +74,9 @@ async def generate_dataset(
     )

     result = await agent.run(extra_instructions or 'Please generate the object.')
+    output = strip_markdown_fences(result.output)
     try:
-        result = dataset_type.from_text(
+        result = dataset_type.from_text(output, fmt='json', custom_evaluator_types=custom_evaluator_types)
     except ValidationError as e:  # pragma: no cover
         print(f'Raw response from model:\n{result.output}')
         raise e
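The effect of this change is that a model response wrapped in markdown code fences is unwrapped before being parsed as JSON. `strip_markdown_fences` is a private helper in `pydantic_ai._utils`; the snippet below is only an illustrative approximation of the idea, not its actual implementation:

```python
import re

FENCE = '`' * 3  # a markdown code fence


def strip_fences_example(text: str) -> str:
    # Illustrative approximation: if the response is wrapped in a fenced code
    # block (optionally tagged, e.g. json), return just the fenced body.
    match = re.search(FENCE + r'(?:\w+)?\n(.*?)\n?' + FENCE, text, flags=re.DOTALL)
    return match.group(1) if match else text


raw = FENCE + 'json\n{"cases": []}\n' + FENCE
print(strip_fences_example(raw))  # {"cases": []}
```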
{pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/reporting/__init__.py

@@ -7,8 +7,10 @@ from io import StringIO
 from typing import Any, Generic, Literal, Protocol, cast

 from pydantic import BaseModel, TypeAdapter
-from rich.console import Console
+from rich.console import Console, Group, RenderableType
+from rich.panel import Panel
 from rich.table import Table
+from rich.text import Text
 from typing_extensions import TypedDict, TypeVar

 from pydantic_evals._utils import UNSET, Unset

@@ -196,6 +198,8 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
     failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(default_factory=list)
     """The failures in the report. These are cases where task execution raised an exception."""

+    experiment_metadata: dict[str, Any] | None = None
+    """Metadata associated with the specific experiment represented by this report."""
     trace_id: str | None = None
     """The trace ID of the evaluation."""
     span_id: str | None = None

@@ -230,7 +234,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
         include_reasons: bool = False,
-    ) -> str:
+    ) -> str:
         """Render this report to a nicely-formatted string, optionally comparing it to a baseline report.

         If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.

@@ -261,7 +265,6 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             duration_config=duration_config,
             include_reasons=include_reasons,
         )
-        Console(file=io_file)
         return io_file.getvalue()

     def print(

@@ -289,15 +292,16 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
         include_reasons: bool = False,
-    ) -> None:
+    ) -> None:
         """Print this report to the console, optionally comparing it to a baseline report.

         If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
         """
-        if console is None:
+        if console is None:  # pragma: no branch
             console = Console(width=width)

-
+        metadata_panel = self._metadata_panel(baseline=baseline)
+        renderable: RenderableType = self.console_table(
             baseline=baseline,
             include_input=include_input,
             include_metadata=include_metadata,

@@ -316,9 +320,13 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             metric_configs=metric_configs,
             duration_config=duration_config,
             include_reasons=include_reasons,
+            with_title=not metadata_panel,
         )
-
-        if
+        # Wrap table with experiment metadata panel if present
+        if metadata_panel:
+            renderable = Group(metadata_panel, renderable)
+        console.print(renderable)
+        if include_errors and self.failures:  # pragma: no cover
             failures_table = self.failures_table(
                 include_input=include_input,
                 include_metadata=include_metadata,

@@ -330,6 +338,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             )
             console.print(failures_table, style='red')

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def console_table(
         self,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,

@@ -351,9 +360,11 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
         include_reasons: bool = False,
+        with_title: bool = True,
     ) -> Table:
-        """Return a table containing the data from this report
+        """Return a table containing the data from this report.

+        If a baseline is provided, returns a diff between this report and the baseline report.
         Optionally include input and output details.
         """
         renderer = EvaluationRenderer(
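The new `with_title` flag on `console_table` lets callers drop the table's built-in title, which `print` now does whenever the title is carried by the metadata panel instead. A hedged sketch of using it directly with rich; it assumes `report` is an `EvaluationReport` produced by one of the evaluate calls shown earlier:

```python
from rich.console import Console
from rich.panel import Panel


def render_in_custom_panel(report) -> None:
    # Suppress the table's own title and supply one on a wrapping Panel instead.
    table = report.console_table(with_title=False)
    Console().print(Panel(table, title=f'My experiment: {report.name}', title_align='left'))
```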
{pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/reporting/__init__.py (continued)

@@ -378,10 +389,82 @@
             include_reasons=include_reasons,
         )
         if baseline is None:
-            return renderer.build_table(self)
-        else:
-            return renderer.build_diff_table(self, baseline)
+            return renderer.build_table(self, with_title=with_title)
+        else:
+            return renderer.build_diff_table(self, baseline, with_title=with_title)
+
+    def _metadata_panel(
+        self, baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None
+    ) -> RenderableType | None:
+        """Wrap a table with an experiment metadata panel if metadata exists.

+        Args:
+            table: The table to wrap
+            baseline: Optional baseline report for diff metadata
+
+        Returns:
+            Either the table unchanged or a Group with Panel and Table
+        """
+        if baseline is None:
+            # Single report - show metadata if present
+            if self.experiment_metadata:
+                metadata_text = Text()
+                items = list(self.experiment_metadata.items())
+                for i, (key, value) in enumerate(items):
+                    metadata_text.append(f'{key}: {value}', style='dim')
+                    if i < len(items) - 1:
+                        metadata_text.append('\n')
+                return Panel(
+                    metadata_text,
+                    title=f'Evaluation Summary: {self.name}',
+                    title_align='left',
+                    border_style='dim',
+                    padding=(0, 1),
+                    expand=False,
+                )
+        else:
+            # Diff report - show metadata diff if either has metadata
+            if self.experiment_metadata or baseline.experiment_metadata:
+                diff_name = baseline.name if baseline.name == self.name else f'{baseline.name} → {self.name}'
+                metadata_text = Text()
+                lines_styles: list[tuple[str, str]] = []
+                if baseline.experiment_metadata and self.experiment_metadata:
+                    # Collect all keys from both
+                    all_keys = sorted(set(baseline.experiment_metadata.keys()) | set(self.experiment_metadata.keys()))
+                    for key in all_keys:
+                        baseline_val = baseline.experiment_metadata.get(key)
+                        report_val = self.experiment_metadata.get(key)
+                        if baseline_val == report_val:
+                            lines_styles.append((f'{key}: {report_val}', 'dim'))
+                        elif baseline_val is None:
+                            lines_styles.append((f'+ {key}: {report_val}', 'green'))
+                        elif report_val is None:
+                            lines_styles.append((f'- {key}: {baseline_val}', 'red'))
+                        else:
+                            lines_styles.append((f'{key}: {baseline_val} → {report_val}', 'yellow'))
+                elif self.experiment_metadata:
+                    lines_styles = [(f'+ {k}: {v}', 'green') for k, v in self.experiment_metadata.items()]
+                else:  # baseline.experiment_metadata only
+                    assert baseline.experiment_metadata is not None
+                    lines_styles = [(f'- {k}: {v}', 'red') for k, v in baseline.experiment_metadata.items()]
+
+                for i, (line, style) in enumerate(lines_styles):
+                    metadata_text.append(line, style=style)
+                    if i < len(lines_styles) - 1:
+                        metadata_text.append('\n')
+
+                return Panel(
+                    metadata_text,
+                    title=f'Evaluation Diff: {diff_name}',
+                    title_align='left',
+                    border_style='dim',
+                    padding=(0, 1),
+                    expand=False,
+                )
+
+        return None
+
+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def failures_table(
         self,
         *,
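Because `_metadata_panel` also renders a metadata diff when a baseline is passed, printing one report against another highlights added, removed, and changed metadata keys above the comparison table. A hedged end-to-end sketch with illustrative tasks and metadata values:

```python
from pydantic_evals import Case, Dataset


def task_v1(x: int) -> int:
    return x + 1


def task_v2(x: int) -> int:
    return x + 2


dataset = Dataset(cases=[Case(name='one', inputs=1)])

baseline = dataset.evaluate_sync(task_v1, metadata={'prompt_version': 1, 'temperature': 0.0})
report = dataset.evaluate_sync(task_v2, metadata={'prompt_version': 2, 'model': 'example-model'})

# Unchanged keys render dimmed, added keys green ('+ key'), removed keys red
# ('- key'), and changed values yellow ('old → new') in the panel above the table.
report.print(baseline=baseline)
```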
{pydantic_evals-1.1.0 → pydantic_evals-1.10.0}/pydantic_evals/reporting/__init__.py (continued)

@@ -705,6 +788,7 @@ class ReportCaseRenderer:
     metric_renderers: Mapping[str, _NumberRenderer]
     duration_renderer: _NumberRenderer

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_base_table(self, title: str) -> Table:
         """Build and return a Rich Table for the diff output."""
         table = Table(title=title, show_lines=True)

@@ -731,6 +815,7 @@ class ReportCaseRenderer:
         table.add_column('Durations' if self.include_total_duration else 'Duration', justify='right')
         return table

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_failures_table(self, title: str) -> Table:
         """Build and return a Rich Table for the failures output."""
         table = Table(title=title, show_lines=True)

@@ -1190,9 +1275,22 @@ class EvaluationRenderer:
             duration_renderer=duration_renderer,
         )

-
+    # TODO(DavidM): in v2, change the return type here to RenderableType
+    def build_table(self, report: EvaluationReport, *, with_title: bool = True) -> Table:
+        """Build a table for the report.
+
+        Args:
+            report: The evaluation report to render
+            with_title: Whether to include the title in the table (default True)
+
+        Returns:
+            A Rich Table object
+        """
         case_renderer = self._get_case_renderer(report)
-
+
+        title = f'Evaluation Summary: {report.name}' if with_title else ''
+        table = case_renderer.build_base_table(title)
+
         for case in report.cases:
             table.add_row(*case_renderer.build_row(case))

@@ -1203,7 +1301,20 @@ class EvaluationRenderer:

         return table

-
+    # TODO(DavidM): in v2, change the return type here to RenderableType
+    def build_diff_table(
+        self, report: EvaluationReport, baseline: EvaluationReport, *, with_title: bool = True
+    ) -> Table:
+        """Build a diff table comparing report to baseline.
+
+        Args:
+            report: The evaluation report to compare
+            baseline: The baseline report to compare against
+            with_title: Whether to include the title in the table (default True)
+
+        Returns:
+            A Rich Table object
+        """
         report_cases = report.cases
         baseline_cases = self._baseline_cases_to_include(report, baseline)

@@ -1228,7 +1339,10 @@ class EvaluationRenderer:

         case_renderer = self._get_case_renderer(report, baseline)
         diff_name = baseline.name if baseline.name == report.name else f'{baseline.name} → {report.name}'
-
+
+        title = f'Evaluation Diff: {diff_name}' if with_title else ''
+        table = case_renderer.build_base_table(title)
+
         for baseline_case, new_case in diff_cases:
             table.add_row(*case_renderer.build_diff_row(new_case, baseline_case))
         for case in added_cases:

@@ -1247,6 +1361,7 @@ class EvaluationRenderer:

         return table

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_failures_table(self, report: EvaluationReport) -> Table:
         case_renderer = self._get_case_renderer(report)
         table = case_renderer.build_failures_table('Case Failures')
pydantic_evals-1.1.0/pydantic_evals/__init__.py (removed)

@@ -1,19 +0,0 @@
-"""A toolkit for evaluating the execution of arbitrary "stochastic functions", such as LLM calls.
-
-This package provides functionality for:
-- Creating and loading test datasets with structured inputs and outputs
-- Evaluating model performance using various metrics and evaluators
-- Generating reports for evaluation results
-
-TODO(DavidM): Implement serialization of reports for later comparison, and add git hashes etc.
-  Note: I made pydantic_ai.evals.reports.EvalReport a BaseModel specifically to make this easier
-TODO(DavidM): Add commit hash, timestamp, and other metadata to reports (like pytest-speed does), possibly in a dedicated struct
-TODO(DavidM): Implement a CLI with some pytest-like filtering API to make it easier to run only specific cases
-"""
-
-from .dataset import Case, Dataset
-
-__all__ = (
-    'Case',
-    'Dataset',
-)
All other files listed above (+0 -0) are unchanged between 1.1.0 and 1.10.0.