pydantic-evals 1.0.14__tar.gz → 1.50.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/.gitignore +3 -1
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/PKG-INFO +2 -2
- pydantic_evals-1.50.0/pydantic_evals/__init__.py +16 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/_utils.py +1 -1
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/dataset.py +49 -24
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/common.py +1 -1
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/llm_as_a_judge.py +36 -31
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/generation.py +3 -1
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/otel/span_tree.py +3 -3
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/reporting/__init__.py +201 -26
- pydantic_evals-1.0.14/pydantic_evals/__init__.py +0 -19
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/LICENSE +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/README.md +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/__init__.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/context.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/evaluator.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/spec.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/otel/__init__.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/otel/_context_subtree.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/otel/_errors.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/py.typed +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/reporting/render_numbers.py +0 -0
- {pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pyproject.toml +0 -0
{pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/.gitignore

@@ -10,7 +10,7 @@ env*/
 /TODO.md
 /postgres-data/
 .DS_Store
-
+.chat_app_messages.sqlite
 .cache/
 .vscode/
 /question_graph_history.json

@@ -21,3 +21,5 @@ node_modules/
 /test_tmp/
 .mcp.json
 .claude/
+/.cursor/
+/.devcontainer/
{pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.0.14
+Version: 1.50.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai

@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.0.14
+Requires-Dist: pydantic-ai-slim==1.50.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
pydantic_evals-1.50.0/pydantic_evals/__init__.py

@@ -0,0 +1,16 @@
+"""A toolkit for evaluating the execution of arbitrary "stochastic functions", such as LLM calls.
+
+This package provides functionality for:
+- Creating and loading test datasets with structured inputs and outputs
+- Evaluating model performance using various metrics and evaluators
+- Generating reports for evaluation results
+"""
+
+from .dataset import Case, Dataset, increment_eval_metric, set_eval_attribute
+
+__all__ = (
+    'Case',
+    'Dataset',
+    'increment_eval_metric',
+    'set_eval_attribute',
+)
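For orientation, here is a minimal sketch of how the package's public API, including the two newly exported helpers, might be used. The task, dataset contents, and metric/attribute names below are illustrative assumptions, not taken from this diff.

```python
from pydantic_evals import Case, Dataset, increment_eval_metric, set_eval_attribute


async def classify(text: str) -> str:
    # Record extra context on the current case's task run while it executes.
    set_eval_attribute('prompt_version', 'v2')
    increment_eval_metric('characters_seen', len(text))
    return 'positive' if 'good' in text else 'negative'


dataset = Dataset(
    cases=[
        Case(name='simple', inputs='this is good', expected_output='positive'),
        Case(name='negated', inputs='this is not good', expected_output='negative'),
    ]
)

report = dataset.evaluate_sync(classify)
report.print()
```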
{pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/_utils.py

@@ -112,7 +112,7 @@ async def task_group_gather(tasks: Sequence[Callable[[], Awaitable[T]]]) -> list

 try:
     from logfire._internal.config import (
-        LogfireNotConfiguredWarning,  # pyright: ignore[reportAssignmentType
+        LogfireNotConfiguredWarning,  # pyright: ignore[reportAssignmentType]
     )
 # TODO: Remove this `pragma: no cover` once we test evals without pydantic-ai (which includes logfire)
 except ImportError:  # pragma: no cover
{pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/dataset.py

@@ -90,7 +90,7 @@ class _CaseModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid'
     inputs: InputsT
     metadata: MetadataT | None = None
     expected_output: OutputT | None = None
-    evaluators: list[EvaluatorSpec] = Field(default_factory=list)
+    evaluators: list[EvaluatorSpec] = Field(default_factory=list[EvaluatorSpec])


 class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid'):

@@ -100,7 +100,7 @@ class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forb
     json_schema_path: str | None = Field(default=None, alias='$schema')
     name: str | None = None
     cases: list[_CaseModel[InputsT, OutputT, MetadataT]]
-    evaluators: list[EvaluatorSpec] = Field(default_factory=list)
+    evaluators: list[EvaluatorSpec] = Field(default_factory=list[EvaluatorSpec])


 @dataclass(init=False)
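The `default_factory=list` → `default_factory=list[EvaluatorSpec]` changes in this file (and the similar ones in `span_tree.py` and `reporting/__init__.py` further down) rely on the fact that subscripted builtin generics are callable: calling the alias constructs the underlying type while giving static type checkers a precise element type. A quick self-contained illustration, not taken from the package:

```python
from dataclasses import dataclass, field

# Calling a parameterized alias constructs the origin type.
assert list[int]() == []
assert dict[str, float]() == {}


@dataclass
class Basket:
    # The annotation and the factory now agree on the element type, so static
    # checkers can infer `items: list[str]` directly from the factory.
    items: list[str] = field(default_factory=list[str])


print(Basket())  # Basket(items=[])
```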
@@ -136,7 +136,9 @@ class Case(Generic[InputsT, OutputT, MetadataT]):
     """
     expected_output: OutputT | None = None
     """Expected output of the task. This is the expected output of the task that will be evaluated."""
-    evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = field(
+    evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = field(
+        default_factory=list[Evaluator[InputsT, OutputT, MetadataT]]
+    )
     """Evaluators to be used just on this case."""

     def __init__(

@@ -265,6 +267,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         retry_evaluators: RetryConfig | None = None,
         *,
         task_name: str | None = None,
+        metadata: dict[str, Any] | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.


@@ -283,6 +286,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             retry_evaluators: Optional retry configuration for evaluator execution.
             task_name: Optional override to the name of the task being executed, otherwise the name of the task
                 function will be used.
+            metadata: Optional dict of experiment metadata.

         Returns:
             A report containing the results of the evaluation.

@@ -294,6 +298,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a

         limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()

+        extra_attributes: dict[str, Any] = {'gen_ai.operation.name': 'experiment'}
+        if metadata is not None:
+            extra_attributes['metadata'] = metadata
         with (
             logfire_span(
                 'evaluate {name}',

@@ -301,7 +308,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                 task_name=task_name,
                 dataset_name=self.name,
                 n_cases=len(self.cases),
-                **
+                **extra_attributes,
             ) as eval_span,
             progress_bar or nullcontext(),
         ):

@@ -339,11 +346,18 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                 name=name,
                 cases=cases,
                 failures=failures,
+                experiment_metadata=metadata,
                 span_id=span_id,
                 trace_id=trace_id,
             )
-
-
+            full_experiment_metadata: dict[str, Any] = {'n_cases': len(self.cases)}
+            if metadata is not None:
+                full_experiment_metadata['metadata'] = metadata
+            if (averages := report.averages()) is not None:
+                full_experiment_metadata['averages'] = averages
+                if averages.assertions is not None:
+                    eval_span.set_attribute('assertion_pass_rate', averages.assertions)
+            eval_span.set_attribute('logfire.experiment.metadata', full_experiment_metadata)
         return report

     def evaluate_sync(

@@ -354,21 +368,27 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         progress: bool = True,
         retry_task: RetryConfig | None = None,
         retry_evaluators: RetryConfig | None = None,
+        *,
+        task_name: str | None = None,
+        metadata: dict[str, Any] | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

-        This is a synchronous wrapper around [`evaluate`][pydantic_evals.Dataset.evaluate] provided for convenience.
+        This is a synchronous wrapper around [`evaluate`][pydantic_evals.dataset.Dataset.evaluate] provided for convenience.

         Args:
             task: The task to evaluate. This should be a callable that takes the inputs of the case
                 and returns the output.
-            name: The name of the
-                If omitted, the name of the task function
+            name: The name of the experiment being run, this is used to identify the experiment in the report.
+                If omitted, the task_name will be used; if that is not specified, the name of the task function is used.
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
-            progress: Whether to show a progress bar for the evaluation. Defaults to True
+            progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
             retry_task: Optional retry configuration for the task execution.
             retry_evaluators: Optional retry configuration for evaluator execution.
+            task_name: Optional override to the name of the task being executed, otherwise the name of the task
+                function will be used.
+            metadata: Optional dict of experiment metadata.

         Returns:
             A report containing the results of the evaluation.

@@ -376,11 +396,13 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         return get_event_loop().run_until_complete(
             self.evaluate(
                 task,
-
+                name=name,
                 max_concurrency=max_concurrency,
                 progress=progress,
                 retry_task=retry_task,
                 retry_evaluators=retry_evaluators,
+                task_name=task_name,
+                metadata=metadata,
             )
         )

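Taken together, these `dataset.py` changes add an optional `metadata` dict to `Dataset.evaluate` and `Dataset.evaluate_sync`, store it on the resulting report as `experiment_metadata`, and attach it (plus case counts and averages) to the evaluation span. A hedged sketch of how an experiment might pass metadata; the task and the metadata values are illustrative:

```python
from pydantic_evals import Case, Dataset


def shout(text: str) -> str:
    return text.upper()


dataset = Dataset(cases=[Case(name='hello', inputs='hello', expected_output='HELLO')])

report = dataset.evaluate_sync(
    shout,
    name='uppercase-experiment',
    metadata={'model': 'baseline', 'prompt_version': 3},
)

# The metadata travels with the report, and per the hunks above it is also set
# on the evaluation span under the `logfire.experiment.metadata` attribute.
assert report.experiment_metadata == {'model': 'baseline', 'prompt_version': 3}
report.print()
```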
@@ -491,7 +513,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         path = Path(path)
         fmt = cls._infer_fmt(path, fmt)

-        raw = Path(path).read_text()
+        raw = Path(path).read_text(encoding='utf-8')
         try:
             return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types, default_name=path.stem)
         except ValidationError as e:  # pragma: no cover

@@ -646,16 +668,16 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a

         context: dict[str, Any] = {'use_short_form': True}
         if fmt == 'yaml':
-            dumped_data = self.model_dump(mode='json', by_alias=True,
+            dumped_data = self.model_dump(mode='json', by_alias=True, context=context)
             content = yaml.dump(dumped_data, sort_keys=False)
             if schema_ref:  # pragma: no branch
                 yaml_language_server_line = f'{_YAML_SCHEMA_LINE_PREFIX}{schema_ref}'
                 content = f'{yaml_language_server_line}\n{content}'
-            path.write_text(content)
+            path.write_text(content, encoding='utf-8')
         else:
             context['$schema'] = schema_ref
-            json_data = self.model_dump_json(indent=2, by_alias=True,
-            path.write_text(json_data + '\n')
+            json_data = self.model_dump_json(indent=2, by_alias=True, context=context)
+            path.write_text(json_data + '\n', encoding='utf-8')

     @classmethod
     def model_json_schema_with_evaluators(

@@ -718,15 +740,16 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         class Case(BaseModel, extra='forbid'):  # pyright: ignore[reportUnusedClass] # this _is_ used below, but pyright doesn't seem to notice..
             name: str | None = None
             inputs: in_type  # pyright: ignore[reportInvalidTypeForm]
-            metadata: meta_type | None = None  # pyright: ignore[reportInvalidTypeForm
-            expected_output: out_type | None = None  # pyright: ignore[reportInvalidTypeForm
+            metadata: meta_type | None = None  # pyright: ignore[reportInvalidTypeForm]
+            expected_output: out_type | None = None  # pyright: ignore[reportInvalidTypeForm]
             if evaluator_schema_types:  # pragma: no branch
-                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007
+                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa: UP007

         class Dataset(BaseModel, extra='forbid'):
+            name: str | None = None
             cases: list[Case]
             if evaluator_schema_types:  # pragma: no branch
-                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007
+                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa: UP007

         json_schema = Dataset.model_json_schema()
         # See `_add_json_schema` below, since `$schema` is added to the JSON, it has to be supported in the JSON

@@ -746,8 +769,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         path = Path(path)
         json_schema = cls.model_json_schema_with_evaluators(custom_evaluator_types)
         schema_content = to_json(json_schema, indent=2).decode() + '\n'
-        if not path.exists() or path.read_text() != schema_content:  # pragma: no branch
-            path.write_text(schema_content)
+        if not path.exists() or path.read_text(encoding='utf-8') != schema_content:  # pragma: no branch
+            path.write_text(schema_content, encoding='utf-8')

     @classmethod
     @functools.cache

@@ -833,8 +856,8 @@ def _get_relative_path_reference(target: Path, source: Path, _prefix: str = '')
 class _TaskRun:
     """Internal class to track metrics and attributes for a task run."""

-    attributes: dict[str, Any] = field(init=False, default_factory=dict)
-    metrics: dict[str, int | float] = field(init=False, default_factory=dict)
+    attributes: dict[str, Any] = field(init=False, default_factory=dict[str, Any])
+    metrics: dict[str, int | float] = field(init=False, default_factory=dict[str, int | float])

     def record_metric(self, name: str, value: int | float) -> None:
         """Record a metric value.

@@ -926,6 +949,8 @@ async def _run_task(
         # That way users can customize this logic. We'd default to a function that does the current thing but also
         # allow `None` to disable it entirely.
         for node in span_tree:
+            if 'gen_ai.request.model' not in node.attributes:
+                continue  # we only want to count the below specifically for the individual LLM requests, not agent runs
             for k, v in node.attributes.items():
                 if k == 'gen_ai.operation.name' and v == 'chat':
                     task_run.increment_metric('requests', 1)
{pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/common.py

@@ -191,7 +191,7 @@ class LLMJudge(Evaluator[object, object, object]):
     """

     rubric: str
-    model: models.Model | models.KnownModelName | None = None
+    model: models.Model | models.KnownModelName | str | None = None
     include_input: bool = False
     include_expected_output: bool = False
     model_settings: ModelSettings | None = None
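With this change, `LLMJudge.model` (like the `judge_*` helpers in `llm_as_a_judge.py` below) accepts a plain model-name string as well as a `Model` instance or `KnownModelName`. A minimal sketch of what that enables as a dataset-level evaluator; the model identifier and rubric are illustrative assumptions:

```python
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge

dataset = Dataset(
    cases=[Case(name='greeting', inputs='Say hello to Ada')],
    evaluators=[
        # `model` may now be an arbitrary model-name string rather than only a
        # KnownModelName literal; 'openai:gpt-4o' here is just an example identifier.
        LLMJudge(
            rubric='The response greets the person named in the input.',
            model='openai:gpt-4o',
            include_input=True,
        ),
    ],
)
```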
{pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/llm_as_a_judge.py

@@ -55,7 +55,7 @@ _judge_output_agent = Agent(
 async def judge_output(
     output: Any,
     rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
+    model: models.Model | models.KnownModelName | str | None = None,
     model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on a rubric.

@@ -96,7 +96,7 @@ async def judge_input_output(
     inputs: Any,
     output: Any,
     rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
+    model: models.Model | models.KnownModelName | str | None = None,
     model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on the inputs and a rubric.

@@ -141,7 +141,7 @@ async def judge_input_output_expected(
     output: Any,
     expected_output: Any,
     rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
+    model: models.Model | models.KnownModelName | str | None = None,
     model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on the inputs and a rubric.

@@ -185,7 +185,7 @@ async def judge_output_expected(
     output: Any,
     expected_output: Any,
     rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
+    model: models.Model | models.KnownModelName | str | None = None,
     model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on the expected output, output, and a rubric.

@@ -201,7 +201,7 @@ async def judge_output_expected(
     ).output


-def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:
+def set_default_judge_model(model: models.Model | models.KnownModelName) -> None:
     """Set the default model used for judging.

     This model is used if `None` is passed to the `model` argument of `judge_output` and `judge_input_output`.

@@ -221,39 +221,44 @@ def _stringify(value: Any) -> str:
     return repr(value)


+def _make_section(content: Any, tag: str) -> list[str | UserContent]:
+    """Create a tagged section, handling different content types, for use in the LLMJudge's prompt.
+
+    Args:
+        content (Any): content to include in the section_
+        tag (str): tag name for the section
+
+    Returns:
+        list[str | UserContent]: the tagged section as a list of strings or UserContent
+    """
+    sections: list[str | UserContent] = []
+    items: Sequence[str | UserContent] = (  # pyright: ignore[reportUnknownVariableType]
+        content if isinstance(content, Sequence) and not isinstance(content, str) else [content]
+    )
+
+    sections.append(f'<{tag}>')
+    for item in items:
+        sections.append(item if isinstance(item, str | MultiModalContent) else _stringify(item))
+    sections.append(f'</{tag}>')
+    return sections
+
+
 def _build_prompt(
     output: Any,
     rubric: str,
     inputs: Any | None = None,
     expected_output: Any | None = None,
 ) -> str | Sequence[str | UserContent]:
-    """Build a prompt that includes input, output, and rubric."""
+    """Build a prompt that includes input, output, expected output, and rubric."""
     sections: list[str | UserContent] = []
-
     if inputs is not None:
-
-            sections.append(f'<Input>\n{inputs}\n</Input>')
-        else:
-            sections.append('<Input>\n')
-            if isinstance(inputs, Sequence):
-                for item in inputs:  # type: ignore
-                    if isinstance(item, str | MultiModalContent):
-                        sections.append(item)
-                    else:
-                        sections.append(_stringify(item))
-            elif isinstance(inputs, MultiModalContent):
-                sections.append(inputs)
-            else:
-                sections.append(_stringify(inputs))
-            sections.append('</Input>')
-
-    sections.append(f'<Output>\n{_stringify(output)}\n</Output>')
-    sections.append(f'<Rubric>\n{rubric}\n</Rubric>')
+        sections.extend(_make_section(inputs, 'Input'))

-
-
+    sections.extend(_make_section(output, 'Output'))
+    sections.extend(_make_section(rubric, 'Rubric'))

-    if
-
-
-    return sections
+    if expected_output is not None:
+        sections.extend(_make_section(expected_output, 'ExpectedOutput'))
+    if all(isinstance(section, str) for section in sections):
+        return '\n'.join(sections)  # type: ignore[arg-type]
+    return sections
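To make the new prompt construction concrete, here is a small standalone sketch of the same tagging idea, restricted to plain strings (no multi-modal content); the helper name and example values are illustrative, not part of the package:

```python
from typing import Any


def make_section(content: Any, tag: str) -> list[str]:
    # Simplified analogue of the helper above: wrap one or more items in <Tag> ... </Tag> lines.
    items = content if isinstance(content, (list, tuple)) else [content]
    return [f'<{tag}>', *(item if isinstance(item, str) else repr(item) for item in items), f'</{tag}>']


sections: list[str] = []
sections.extend(make_section('What is 2 + 2?', 'Input'))
sections.extend(make_section('4', 'Output'))
sections.extend(make_section('The answer must be mathematically correct.', 'Rubric'))
sections.extend(make_section('4', 'ExpectedOutput'))

# All sections are plain strings here, so they can be joined into a single prompt,
# mirroring the `'\n'.join(sections)` branch added in `_build_prompt`.
print('\n'.join(sections))
```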
{pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/generation.py

@@ -14,6 +14,7 @@ from pydantic import ValidationError
 from typing_extensions import TypeVar

 from pydantic_ai import Agent, models
+from pydantic_ai._utils import strip_markdown_fences
 from pydantic_evals import Dataset
 from pydantic_evals.evaluators.evaluator import Evaluator


@@ -73,8 +74,9 @@ async def generate_dataset(
     )

     result = await agent.run(extra_instructions or 'Please generate the object.')
+    output = strip_markdown_fences(result.output)
     try:
-        result = dataset_type.from_text(
+        result = dataset_type.from_text(output, fmt='json', custom_evaluator_types=custom_evaluator_types)
     except ValidationError as e:  # pragma: no cover
         print(f'Raw response from model:\n{result.output}')
         raise e
{pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/otel/span_tree.py

@@ -241,7 +241,7 @@ class SpanNode:

         return self._matches_query(query)

-    def _matches_query(self, query: SpanQuery) -> bool:  # noqa C901
+    def _matches_query(self, query: SpanQuery) -> bool:  # noqa: C901
         """Check if the span matches the query conditions."""
         # Logical combinations
         if or_ := query.get('or_'):

@@ -433,8 +433,8 @@ class SpanTree:
     You can then search or iterate the tree to make your assertions (using DFS for traversal).
     """

-    roots: list[SpanNode] = field(default_factory=list)
-    nodes_by_id: dict[str, SpanNode] = field(default_factory=dict)
+    roots: list[SpanNode] = field(default_factory=list[SpanNode])
+    nodes_by_id: dict[str, SpanNode] = field(default_factory=dict[str, SpanNode])

     # -------------------------------------------------------------------------
     # Construction
{pydantic_evals-1.0.14 → pydantic_evals-1.50.0}/pydantic_evals/reporting/__init__.py

@@ -7,8 +7,10 @@ from io import StringIO
 from typing import Any, Generic, Literal, Protocol, cast

 from pydantic import BaseModel, TypeAdapter
-from rich.console import Console
+from rich.console import Console, Group, RenderableType
+from rich.panel import Panel
 from rich.table import Table
+from rich.text import Text
 from typing_extensions import TypedDict, TypeVar

 from pydantic_evals._utils import UNSET, Unset

@@ -53,11 +55,11 @@ class ReportCase(Generic[InputsT, OutputT, MetadataT]):
     name: str
     """The name of the [case][pydantic_evals.Case]."""
     inputs: InputsT
-    """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
+    """The inputs to the task, from [`Case.inputs`][pydantic_evals.dataset.Case.inputs]."""
     metadata: MetadataT | None
-    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
+    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.dataset.Case.metadata]."""
     expected_output: OutputT | None
-    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
+    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.dataset.Case.expected_output]."""
     output: OutputT
     """The output of the task execution."""

@@ -76,7 +78,7 @@ class ReportCase(Generic[InputsT, OutputT, MetadataT]):
     span_id: str | None = None
     """The span ID of the case span."""

-    evaluator_failures: list[EvaluatorFailure] = field(default_factory=list)
+    evaluator_failures: list[EvaluatorFailure] = field(default_factory=list[EvaluatorFailure])


 @dataclass(kw_only=True)

@@ -86,11 +88,11 @@ class ReportCaseFailure(Generic[InputsT, OutputT, MetadataT]):
     name: str
     """The name of the [case][pydantic_evals.Case]."""
     inputs: InputsT
-    """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
+    """The inputs to the task, from [`Case.inputs`][pydantic_evals.dataset.Case.inputs]."""
     metadata: MetadataT | None
-    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
+    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.dataset.Case.metadata]."""
     expected_output: OutputT | None
-    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
+    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.dataset.Case.expected_output]."""

     error_message: str
     """The message of the exception that caused the failure."""

@@ -193,9 +195,13 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):

     cases: list[ReportCase[InputsT, OutputT, MetadataT]]
     """The cases in the report."""
-    failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(
+    failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(
+        default_factory=list[ReportCaseFailure[InputsT, OutputT, MetadataT]]
+    )
     """The failures in the report. These are cases where task execution raised an exception."""

+    experiment_metadata: dict[str, Any] | None = None
+    """Metadata associated with the specific experiment represented by this report."""
     trace_id: str | None = None
     """The trace ID of the evaluation."""
     span_id: str | None = None
@@ -206,11 +212,69 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             return ReportCaseAggregate.average(self.cases)
         return None

+    def render(
+        self,
+        width: int | None = None,
+        baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
+        *,
+        include_input: bool = False,
+        include_metadata: bool = False,
+        include_expected_output: bool = False,
+        include_output: bool = False,
+        include_durations: bool = True,
+        include_total_duration: bool = False,
+        include_removed_cases: bool = False,
+        include_averages: bool = True,
+        include_errors: bool = True,
+        include_error_stacktrace: bool = False,
+        include_evaluator_failures: bool = True,
+        input_config: RenderValueConfig | None = None,
+        metadata_config: RenderValueConfig | None = None,
+        output_config: RenderValueConfig | None = None,
+        score_configs: dict[str, RenderNumberConfig] | None = None,
+        label_configs: dict[str, RenderValueConfig] | None = None,
+        metric_configs: dict[str, RenderNumberConfig] | None = None,
+        duration_config: RenderNumberConfig | None = None,
+        include_reasons: bool = False,
+    ) -> str:
+        """Render this report to a nicely-formatted string, optionally comparing it to a baseline report.
+
+        If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
+        """
+        io_file = StringIO()
+        console = Console(width=width, file=io_file)
+        self.print(
+            width=width,
+            baseline=baseline,
+            console=console,
+            include_input=include_input,
+            include_metadata=include_metadata,
+            include_expected_output=include_expected_output,
+            include_output=include_output,
+            include_durations=include_durations,
+            include_total_duration=include_total_duration,
+            include_removed_cases=include_removed_cases,
+            include_averages=include_averages,
+            include_errors=include_errors,
+            include_error_stacktrace=include_error_stacktrace,
+            include_evaluator_failures=include_evaluator_failures,
+            input_config=input_config,
+            metadata_config=metadata_config,
+            output_config=output_config,
+            score_configs=score_configs,
+            label_configs=label_configs,
+            metric_configs=metric_configs,
+            duration_config=duration_config,
+            include_reasons=include_reasons,
+        )
+        return io_file.getvalue()
+
     def print(
         self,
         width: int | None = None,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
         *,
+        console: Console | None = None,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
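The new `render()` method produces the same output as `print()` but returns it as a string (by pointing a `rich` console at a `StringIO`), and a later hunk makes `__str__` delegate to it. A hedged sketch of how it might be used; the dataset and task are illustrative assumptions:

```python
from pydantic_evals import Case, Dataset


def double(x: int) -> int:
    return x * 2


dataset = Dataset(cases=[Case(name='two', inputs=2, expected_output=4)])
report = dataset.evaluate_sync(double, name='doubling', metadata={'run': 'local'})

# Capture the formatted report for logs, snapshot tests, or CI artifacts
# instead of printing it straight to the terminal.
text = report.render(width=120, include_output=True, include_durations=False)
print(text)

# str(report) now goes through the same rendering path.
assert str(report) == report.render()
```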
@@ -230,12 +294,16 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
         include_reasons: bool = False,
-    )
+    ) -> None:
         """Print this report to the console, optionally comparing it to a baseline report.

         If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
         """
-
+        if console is None:  # pragma: no branch
+            console = Console(width=width)
+
+        metadata_panel = self._metadata_panel(baseline=baseline)
+        renderable: RenderableType = self.console_table(
             baseline=baseline,
             include_input=include_input,
             include_metadata=include_metadata,

@@ -254,10 +322,13 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             metric_configs=metric_configs,
             duration_config=duration_config,
             include_reasons=include_reasons,
+            with_title=not metadata_panel,
         )
-
-
-
+        # Wrap table with experiment metadata panel if present
+        if metadata_panel:
+            renderable = Group(metadata_panel, renderable)
+        console.print(renderable)
+        if include_errors and self.failures:  # pragma: no cover
             failures_table = self.failures_table(
                 include_input=include_input,
                 include_metadata=include_metadata,

@@ -269,6 +340,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             )
             console.print(failures_table, style='red')

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def console_table(
         self,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,

@@ -290,9 +362,11 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
         include_reasons: bool = False,
+        with_title: bool = True,
     ) -> Table:
-        """Return a table containing the data from this report
+        """Return a table containing the data from this report.

+        If a baseline is provided, returns a diff between this report and the baseline report.
         Optionally include input and output details.
         """
         renderer = EvaluationRenderer(

@@ -317,10 +391,82 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             include_reasons=include_reasons,
         )
         if baseline is None:
-            return renderer.build_table(self)
-        else:
-            return renderer.build_diff_table(self, baseline)
+            return renderer.build_table(self, with_title=with_title)
+        else:
+            return renderer.build_diff_table(self, baseline, with_title=with_title)
+
+    def _metadata_panel(
+        self, baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None
+    ) -> RenderableType | None:
+        """Wrap a table with an experiment metadata panel if metadata exists.
+
+        Args:
+            table: The table to wrap
+            baseline: Optional baseline report for diff metadata
+
+        Returns:
+            Either the table unchanged or a Group with Panel and Table
+        """
+        if baseline is None:
+            # Single report - show metadata if present
+            if self.experiment_metadata:
+                metadata_text = Text()
+                items = list(self.experiment_metadata.items())
+                for i, (key, value) in enumerate(items):
+                    metadata_text.append(f'{key}: {value}', style='dim')
+                    if i < len(items) - 1:
+                        metadata_text.append('\n')
+                return Panel(
+                    metadata_text,
+                    title=f'Evaluation Summary: {self.name}',
+                    title_align='left',
+                    border_style='dim',
+                    padding=(0, 1),
+                    expand=False,
+                )
+        else:
+            # Diff report - show metadata diff if either has metadata
+            if self.experiment_metadata or baseline.experiment_metadata:
+                diff_name = baseline.name if baseline.name == self.name else f'{baseline.name} → {self.name}'
+                metadata_text = Text()
+                lines_styles: list[tuple[str, str]] = []
+                if baseline.experiment_metadata and self.experiment_metadata:
+                    # Collect all keys from both
+                    all_keys = sorted(set(baseline.experiment_metadata.keys()) | set(self.experiment_metadata.keys()))
+                    for key in all_keys:
+                        baseline_val = baseline.experiment_metadata.get(key)
+                        report_val = self.experiment_metadata.get(key)
+                        if baseline_val == report_val:
+                            lines_styles.append((f'{key}: {report_val}', 'dim'))
+                        elif baseline_val is None:
+                            lines_styles.append((f'+ {key}: {report_val}', 'green'))
+                        elif report_val is None:
+                            lines_styles.append((f'- {key}: {baseline_val}', 'red'))
+                        else:
+                            lines_styles.append((f'{key}: {baseline_val} → {report_val}', 'yellow'))
+                elif self.experiment_metadata:
+                    lines_styles = [(f'+ {k}: {v}', 'green') for k, v in self.experiment_metadata.items()]
+                else:  # baseline.experiment_metadata only
+                    assert baseline.experiment_metadata is not None
+                    lines_styles = [(f'- {k}: {v}', 'red') for k, v in baseline.experiment_metadata.items()]
+
+                for i, (line, style) in enumerate(lines_styles):
+                    metadata_text.append(line, style=style)
+                    if i < len(lines_styles) - 1:
+                        metadata_text.append('\n')
+
+                return Panel(
+                    metadata_text,
+                    title=f'Evaluation Diff: {diff_name}',
+                    title_align='left',
+                    border_style='dim',
+                    padding=(0, 1),
+                    expand=False,
+                )
+
+        return None

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def failures_table(
         self,
         *,

@@ -358,10 +504,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):

     def __str__(self) -> str:  # pragma: lax no cover
         """Return a string representation of the report."""
-
-        io_file = StringIO()
-        Console(file=io_file).print(table)
-        return io_file.getvalue()
+        return self.render()


 EvaluationReportAdapter = TypeAdapter(EvaluationReport[Any, Any, Any])

@@ -647,6 +790,7 @@ class ReportCaseRenderer:
     metric_renderers: Mapping[str, _NumberRenderer]
     duration_renderer: _NumberRenderer

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_base_table(self, title: str) -> Table:
         """Build and return a Rich Table for the diff output."""
         table = Table(title=title, show_lines=True)

@@ -673,6 +817,7 @@ class ReportCaseRenderer:
         table.add_column('Durations' if self.include_total_duration else 'Duration', justify='right')
         return table

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_failures_table(self, title: str) -> Table:
         """Build and return a Rich Table for the failures output."""
         table = Table(title=title, show_lines=True)

@@ -1132,9 +1277,22 @@ class EvaluationRenderer:
             duration_renderer=duration_renderer,
         )

-
+    # TODO(DavidM): in v2, change the return type here to RenderableType
+    def build_table(self, report: EvaluationReport, *, with_title: bool = True) -> Table:
+        """Build a table for the report.
+
+        Args:
+            report: The evaluation report to render
+            with_title: Whether to include the title in the table (default True)
+
+        Returns:
+            A Rich Table object
+        """
         case_renderer = self._get_case_renderer(report)
-
+
+        title = f'Evaluation Summary: {report.name}' if with_title else ''
+        table = case_renderer.build_base_table(title)
+
         for case in report.cases:
             table.add_row(*case_renderer.build_row(case))

@@ -1145,7 +1303,20 @@ class EvaluationRenderer:

         return table

-
+    # TODO(DavidM): in v2, change the return type here to RenderableType
+    def build_diff_table(
+        self, report: EvaluationReport, baseline: EvaluationReport, *, with_title: bool = True
+    ) -> Table:
+        """Build a diff table comparing report to baseline.
+
+        Args:
+            report: The evaluation report to compare
+            baseline: The baseline report to compare against
+            with_title: Whether to include the title in the table (default True)
+
+        Returns:
+            A Rich Table object
+        """
         report_cases = report.cases
         baseline_cases = self._baseline_cases_to_include(report, baseline)


@@ -1170,7 +1341,10 @@ class EvaluationRenderer:

         case_renderer = self._get_case_renderer(report, baseline)
         diff_name = baseline.name if baseline.name == report.name else f'{baseline.name} → {report.name}'
-
+
+        title = f'Evaluation Diff: {diff_name}' if with_title else ''
+        table = case_renderer.build_base_table(title)
+
         for baseline_case, new_case in diff_cases:
             table.add_row(*case_renderer.build_diff_row(new_case, baseline_case))
         for case in added_cases:

@@ -1189,6 +1363,7 @@ class EvaluationRenderer:

         return table

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_failures_table(self, report: EvaluationReport) -> Table:
         case_renderer = self._get_case_renderer(report)
         table = case_renderer.build_failures_table('Case Failures')
pydantic_evals-1.0.14/pydantic_evals/__init__.py

@@ -1,19 +0,0 @@
-"""A toolkit for evaluating the execution of arbitrary "stochastic functions", such as LLM calls.
-
-This package provides functionality for:
-- Creating and loading test datasets with structured inputs and outputs
-- Evaluating model performance using various metrics and evaluators
-- Generating reports for evaluation results
-
-TODO(DavidM): Implement serialization of reports for later comparison, and add git hashes etc.
-Note: I made pydantic_ai.evals.reports.EvalReport a BaseModel specifically to make this easier
-TODO(DavidM): Add commit hash, timestamp, and other metadata to reports (like pytest-speed does), possibly in a dedicated struct
-TODO(DavidM): Implement a CLI with some pytest-like filtering API to make it easier to run only specific cases
-"""
-
-from .dataset import Case, Dataset
-
-__all__ = (
-    'Case',
-    'Dataset',
-)