pydantic-evals 1.22.0__tar.gz → 1.50.0__tar.gz
This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/PKG-INFO +2 -2
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/_utils.py +1 -1
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/dataset.py +19 -15
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/common.py +1 -1
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/llm_as_a_judge.py +35 -30
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/otel/span_tree.py +3 -3
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/reporting/__init__.py +10 -8
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/.gitignore +0 -0
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/LICENSE +0 -0
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/README.md +0 -0
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/__init__.py +0 -0
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/__init__.py +0 -0
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/context.py +0 -0
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/evaluator.py +0 -0
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/spec.py +0 -0
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/generation.py +0 -0
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/otel/__init__.py +0 -0
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/otel/_context_subtree.py +0 -0
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/otel/_errors.py +0 -0
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/py.typed +0 -0
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/reporting/render_numbers.py +0 -0
- {pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pyproject.toml +0 -0
{pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.22.0
+Version: 1.50.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.22.0
+Requires-Dist: pydantic-ai-slim==1.50.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
{pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/_utils.py

@@ -112,7 +112,7 @@ async def task_group_gather(tasks: Sequence[Callable[[], Awaitable[T]]]) -> list
 
 try:
     from logfire._internal.config import (
-        LogfireNotConfiguredWarning,  # pyright: ignore[reportAssignmentType
+        LogfireNotConfiguredWarning,  # pyright: ignore[reportAssignmentType]
     )
     # TODO: Remove this `pragma: no cover` once we test evals without pydantic-ai (which includes logfire)
 except ImportError:  # pragma: no cover
{pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/dataset.py

@@ -90,7 +90,7 @@ class _CaseModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid'
     inputs: InputsT
     metadata: MetadataT | None = None
     expected_output: OutputT | None = None
-    evaluators: list[EvaluatorSpec] = Field(default_factory=list)
+    evaluators: list[EvaluatorSpec] = Field(default_factory=list[EvaluatorSpec])
 
 
 class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forb
@@ -100,7 +100,7 @@ class _DatasetModel(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forb
     json_schema_path: str | None = Field(default=None, alias='$schema')
     name: str | None = None
     cases: list[_CaseModel[InputsT, OutputT, MetadataT]]
-    evaluators: list[EvaluatorSpec] = Field(default_factory=list)
+    evaluators: list[EvaluatorSpec] = Field(default_factory=list[EvaluatorSpec])
 
 
 @dataclass(init=False)
@@ -136,7 +136,9 @@ class Case(Generic[InputsT, OutputT, MetadataT]):
     """
     expected_output: OutputT | None = None
     """Expected output of the task. This is the expected output of the task that will be evaluated."""
-    evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = field(default_factory=list)
+    evaluators: list[Evaluator[InputsT, OutputT, MetadataT]] = field(
+        default_factory=list[Evaluator[InputsT, OutputT, MetadataT]]
+    )
     """Evaluators to be used just on this case."""
 
     def __init__(
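A recurring change in the dataset.py hunks above (and in the span_tree.py and reporting hunks further down) is swapping a bare `default_factory=list` or `dict` for a parameterized alias such as `list[EvaluatorSpec]`. A minimal, self-contained sketch of why that works, using only the standard library; the `Box` class below is illustrative and not part of pydantic-evals:

from dataclasses import dataclass, field


@dataclass
class Box:
    # A parameterized generic alias like list[int] is itself a zero-argument
    # callable (list[int]() == [] at runtime), so it is a valid default_factory,
    # and it lets type checkers infer the element type from the factory itself.
    items: list[int] = field(default_factory=list[int])
    counts: dict[str, int] = field(default_factory=dict[str, int])


assert Box().items == []
assert Box().counts == {}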
@@ -372,7 +374,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.
 
-        This is a synchronous wrapper around [`evaluate`][pydantic_evals.Dataset.evaluate] provided for convenience.
+        This is a synchronous wrapper around [`evaluate`][pydantic_evals.dataset.Dataset.evaluate] provided for convenience.
 
         Args:
             task: The task to evaluate. This should be a callable that takes the inputs of the case
@@ -511,7 +513,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         path = Path(path)
         fmt = cls._infer_fmt(path, fmt)
 
-        raw = Path(path).read_text()
+        raw = Path(path).read_text(encoding='utf-8')
         try:
             return cls.from_text(raw, fmt=fmt, custom_evaluator_types=custom_evaluator_types, default_name=path.stem)
         except ValidationError as e:  # pragma: no cover
@@ -671,11 +673,11 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             if schema_ref:  # pragma: no branch
                 yaml_language_server_line = f'{_YAML_SCHEMA_LINE_PREFIX}{schema_ref}'
                 content = f'{yaml_language_server_line}\n{content}'
-            path.write_text(content)
+            path.write_text(content, encoding='utf-8')
         else:
             context['$schema'] = schema_ref
             json_data = self.model_dump_json(indent=2, by_alias=True, context=context)
-            path.write_text(json_data + '\n')
+            path.write_text(json_data + '\n', encoding='utf-8')
 
     @classmethod
     def model_json_schema_with_evaluators(
@@ -738,16 +740,16 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         class Case(BaseModel, extra='forbid'):  # pyright: ignore[reportUnusedClass]  # this _is_ used below, but pyright doesn't seem to notice..
             name: str | None = None
             inputs: in_type  # pyright: ignore[reportInvalidTypeForm]
-            metadata: meta_type | None = None  # pyright: ignore[reportInvalidTypeForm
-            expected_output: out_type | None = None  # pyright: ignore[reportInvalidTypeForm
+            metadata: meta_type | None = None  # pyright: ignore[reportInvalidTypeForm]
+            expected_output: out_type | None = None  # pyright: ignore[reportInvalidTypeForm]
             if evaluator_schema_types:  # pragma: no branch
-                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007
+                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa: UP007
 
         class Dataset(BaseModel, extra='forbid'):
             name: str | None = None
             cases: list[Case]
             if evaluator_schema_types:  # pragma: no branch
-                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007
+                evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa: UP007
 
         json_schema = Dataset.model_json_schema()
         # See `_add_json_schema` below, since `$schema` is added to the JSON, it has to be supported in the JSON
@@ -767,8 +769,8 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         path = Path(path)
         json_schema = cls.model_json_schema_with_evaluators(custom_evaluator_types)
         schema_content = to_json(json_schema, indent=2).decode() + '\n'
-        if not path.exists() or path.read_text() != schema_content:  # pragma: no branch
-            path.write_text(schema_content)
+        if not path.exists() or path.read_text(encoding='utf-8') != schema_content:  # pragma: no branch
+            path.write_text(schema_content, encoding='utf-8')
 
     @classmethod
     @functools.cache
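The read_text/write_text changes above pin dataset.py's file I/O to UTF-8 instead of the platform default encoding. A short usage sketch of the public round trip these methods back; the file name and case contents are illustrative:

from pydantic_evals import Case, Dataset

dataset = Dataset(
    cases=[Case(name='capital', inputs='What is the capital of France?', expected_output='Paris')],
)
# Both calls now pass encoding='utf-8' explicitly, so datasets containing
# non-ASCII text serialize and reload identically on every platform.
dataset.to_file('cases.yaml')
reloaded = Dataset.from_file('cases.yaml')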
@@ -854,8 +856,8 @@ def _get_relative_path_reference(target: Path, source: Path, _prefix: str = '')
 class _TaskRun:
     """Internal class to track metrics and attributes for a task run."""
 
-    attributes: dict[str, Any] = field(init=False, default_factory=dict)
-    metrics: dict[str, int | float] = field(init=False, default_factory=dict)
+    attributes: dict[str, Any] = field(init=False, default_factory=dict[str, Any])
+    metrics: dict[str, int | float] = field(init=False, default_factory=dict[str, int | float])
 
     def record_metric(self, name: str, value: int | float) -> None:
         """Record a metric value.
@@ -947,6 +949,8 @@ async def _run_task(
         # That way users can customize this logic. We'd default to a function that does the current thing but also
         # allow `None` to disable it entirely.
         for node in span_tree:
+            if 'gen_ai.request.model' not in node.attributes:
+                continue  # we only want to count the below specifically for the individual LLM requests, not agent runs
             for k, v in node.attributes.items():
                 if k == 'gen_ai.operation.name' and v == 'chat':
                     task_run.increment_metric('requests', 1)
{pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/common.py

@@ -191,7 +191,7 @@ class LLMJudge(Evaluator[object, object, object]):
     """
 
     rubric: str
-    model: models.Model | models.KnownModelName | None = None
+    model: models.Model | models.KnownModelName | str | None = None
    include_input: bool = False
    include_expected_output: bool = False
    model_settings: ModelSettings | None = None
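The common.py hunk above (and the llm_as_a_judge.py hunks below) widen the judge's `model` parameter to accept a plain string in addition to a `Model` instance or a `KnownModelName` literal. A minimal sketch of what that enables through the public API; the model name string is illustrative:

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import LLMJudge

dataset = Dataset(
    cases=[Case(name='greeting', inputs='Say hello', expected_output='Hello!')],
    evaluators=[
        # `model` may now be any 'provider:model' string, not only the KnownModelName literals.
        LLMJudge(rubric='The response is a polite greeting', model='openai:gpt-4o'),
    ],
)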
{pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/evaluators/llm_as_a_judge.py

@@ -55,7 +55,7 @@ _judge_output_agent = Agent(
 async def judge_output(
     output: Any,
     rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
+    model: models.Model | models.KnownModelName | str | None = None,
     model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on a rubric.
@@ -96,7 +96,7 @@ async def judge_input_output(
     inputs: Any,
     output: Any,
     rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
+    model: models.Model | models.KnownModelName | str | None = None,
     model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on the inputs and a rubric.
@@ -141,7 +141,7 @@ async def judge_input_output_expected(
     output: Any,
     expected_output: Any,
     rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
+    model: models.Model | models.KnownModelName | str | None = None,
     model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on the inputs and a rubric.
@@ -185,7 +185,7 @@ async def judge_output_expected(
     output: Any,
     expected_output: Any,
     rubric: str,
-    model: models.Model | models.KnownModelName | None = None,
+    model: models.Model | models.KnownModelName | str | None = None,
     model_settings: ModelSettings | None = None,
 ) -> GradingOutput:
     """Judge the output of a model based on the expected output, output, and a rubric.
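The same `str` widening applies to the standalone judge helpers. A runnable sketch of calling one of them directly, assuming credentials for the named provider are configured; the model name and strings are illustrative:

import asyncio

from pydantic_evals.evaluators.llm_as_a_judge import judge_output


async def main() -> None:
    # `model` now also accepts a plain model-name string.
    grading = await judge_output(
        output='Paris',
        rubric='The answer correctly names the capital of France',
        model='openai:gpt-4o',
    )
    print(grading)  # a GradingOutput carrying the judge's verdict


asyncio.run(main())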
@@ -221,39 +221,44 @@ def _stringify(value: Any) -> str:
     return repr(value)
 
 
+def _make_section(content: Any, tag: str) -> list[str | UserContent]:
+    """Create a tagged section, handling different content types, for use in the LLMJudge's prompt.
+
+    Args:
+        content (Any): content to include in the section_
+        tag (str): tag name for the section
+
+    Returns:
+        list[str | UserContent]: the tagged section as a list of strings or UserContent
+    """
+    sections: list[str | UserContent] = []
+    items: Sequence[str | UserContent] = (  # pyright: ignore[reportUnknownVariableType]
+        content if isinstance(content, Sequence) and not isinstance(content, str) else [content]
+    )
+
+    sections.append(f'<{tag}>')
+    for item in items:
+        sections.append(item if isinstance(item, str | MultiModalContent) else _stringify(item))
+    sections.append(f'</{tag}>')
+    return sections
+
+
 def _build_prompt(
     output: Any,
     rubric: str,
     inputs: Any | None = None,
     expected_output: Any | None = None,
 ) -> str | Sequence[str | UserContent]:
-    """Build a prompt that includes input, output, and rubric."""
+    """Build a prompt that includes input, output, expected output, and rubric."""
     sections: list[str | UserContent] = []
-
     if inputs is not None:
-        if isinstance(inputs, str):
-            sections.append(f'<Input>\n{inputs}\n</Input>')
-        else:
-            sections.append('<Input>\n')
-            if isinstance(inputs, Sequence):
-                for item in inputs:  # type: ignore
-                    if isinstance(item, str | MultiModalContent):
-                        sections.append(item)
-                    else:
-                        sections.append(_stringify(item))
-            elif isinstance(inputs, MultiModalContent):
-                sections.append(inputs)
-            else:
-                sections.append(_stringify(inputs))
-            sections.append('</Input>')
-
-    sections.append(f'<Output>\n{_stringify(output)}\n</Output>')
-    sections.append(f'<Rubric>\n{rubric}\n</Rubric>')
+        sections.extend(_make_section(inputs, 'Input'))
 
-    if expected_output is not None:
-        sections.append(f'<ExpectedOutput>\n{_stringify(expected_output)}\n</ExpectedOutput>')
+    sections.extend(_make_section(output, 'Output'))
+    sections.extend(_make_section(rubric, 'Rubric'))
 
-    if
-
-
-    return sections
+    if expected_output is not None:
+        sections.extend(_make_section(expected_output, 'ExpectedOutput'))
+    if all(isinstance(section, str) for section in sections):
+        return '\n'.join(sections)  # type: ignore[arg-type]
+    return sections
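The refactor above routes every prompt section through the new `_make_section` helper. A sketch (not the library's code) of the shape `_build_prompt` now produces when every piece of content is a plain string; the values are illustrative:

# Each _make_section call contributes an opening tag, the content item(s), and a closing tag;
# when every element is a str, _build_prompt joins them with newlines into one prompt string.
sections = [
    '<Input>', 'What is the capital of France?', '</Input>',
    '<Output>', 'Paris', '</Output>',
    '<Rubric>', 'The answer names the capital of France', '</Rubric>',
]
prompt = '\n'.join(sections)
# Multimodal content (e.g. images) is kept as-is instead, and the list of parts is returned.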
{pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/otel/span_tree.py

@@ -241,7 +241,7 @@ class SpanNode:
 
         return self._matches_query(query)
 
-    def _matches_query(self, query: SpanQuery) -> bool:  # noqa C901
+    def _matches_query(self, query: SpanQuery) -> bool:  # noqa: C901
         """Check if the span matches the query conditions."""
         # Logical combinations
         if or_ := query.get('or_'):
@@ -433,8 +433,8 @@ class SpanTree:
     You can then search or iterate the tree to make your assertions (using DFS for traversal).
     """
 
-    roots: list[SpanNode] = field(default_factory=list)
-    nodes_by_id: dict[str, SpanNode] = field(default_factory=dict)
+    roots: list[SpanNode] = field(default_factory=list[SpanNode])
+    nodes_by_id: dict[str, SpanNode] = field(default_factory=dict[str, SpanNode])
 
     # -------------------------------------------------------------------------
     # Construction
{pydantic_evals-1.22.0 → pydantic_evals-1.50.0}/pydantic_evals/reporting/__init__.py

@@ -55,11 +55,11 @@ class ReportCase(Generic[InputsT, OutputT, MetadataT]):
     name: str
     """The name of the [case][pydantic_evals.Case]."""
     inputs: InputsT
-    """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
+    """The inputs to the task, from [`Case.inputs`][pydantic_evals.dataset.Case.inputs]."""
     metadata: MetadataT | None
-    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
+    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.dataset.Case.metadata]."""
     expected_output: OutputT | None
-    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
+    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.dataset.Case.expected_output]."""
     output: OutputT
     """The output of the task execution."""
 
@@ -78,7 +78,7 @@ class ReportCase(Generic[InputsT, OutputT, MetadataT]):
     span_id: str | None = None
     """The span ID of the case span."""
 
-    evaluator_failures: list[EvaluatorFailure] = field(default_factory=list)
+    evaluator_failures: list[EvaluatorFailure] = field(default_factory=list[EvaluatorFailure])
 
 
 @dataclass(kw_only=True)
@@ -88,11 +88,11 @@ class ReportCaseFailure(Generic[InputsT, OutputT, MetadataT]):
     name: str
     """The name of the [case][pydantic_evals.Case]."""
     inputs: InputsT
-    """The inputs to the task, from [`Case.inputs`][pydantic_evals.Case.inputs]."""
+    """The inputs to the task, from [`Case.inputs`][pydantic_evals.dataset.Case.inputs]."""
     metadata: MetadataT | None
-    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.Case.metadata]."""
+    """Any metadata associated with the case, from [`Case.metadata`][pydantic_evals.dataset.Case.metadata]."""
     expected_output: OutputT | None
-    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.Case.expected_output]."""
+    """The expected output of the task, from [`Case.expected_output`][pydantic_evals.dataset.Case.expected_output]."""
 
     error_message: str
     """The message of the exception that caused the failure."""
@@ -195,7 +195,9 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
 
     cases: list[ReportCase[InputsT, OutputT, MetadataT]]
     """The cases in the report."""
-    failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(default_factory=list)
+    failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(
+        default_factory=list[ReportCaseFailure[InputsT, OutputT, MetadataT]]
+    )
     """The failures in the report. These are cases where task execution raised an exception."""
 
     experiment_metadata: dict[str, Any] | None = None