pydantic-evals 1.7.0__tar.gz → 1.8.0__tar.gz
This diff shows the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/PKG-INFO +2 -2
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/dataset.py +27 -9
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/reporting/__init__.py +128 -13
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/.gitignore +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/LICENSE +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/README.md +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/__init__.py +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/_utils.py +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/evaluators/__init__.py +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/evaluators/_run_evaluator.py +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/evaluators/common.py +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/evaluators/context.py +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/evaluators/evaluator.py +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/evaluators/llm_as_a_judge.py +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/evaluators/spec.py +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/generation.py +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/otel/__init__.py +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/otel/_context_in_memory_span_exporter.py +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/otel/_context_subtree.py +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/otel/_errors.py +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/otel/span_tree.py +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/py.typed +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/reporting/render_numbers.py +0 -0
- {pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pyproject.toml +0 -0
{pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: pydantic-evals
-Version: 1.7.0
+Version: 1.8.0
 Summary: Framework for evaluating stochastic code execution, especially code making use of LLMs
 Project-URL: Homepage, https://ai.pydantic.dev/evals
 Project-URL: Source, https://github.com/pydantic/pydantic-ai
@@ -30,7 +30,7 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Requires-Python: >=3.10
 Requires-Dist: anyio>=0
 Requires-Dist: logfire-api>=3.14.1
-Requires-Dist: pydantic-ai-slim==1.7.0
+Requires-Dist: pydantic-ai-slim==1.8.0
 Requires-Dist: pydantic>=2.10
 Requires-Dist: pyyaml>=6.0.2
 Requires-Dist: rich>=13.9.4
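Since pydantic-evals and pydantic-ai-slim are released in lockstep, the only change in PKG-INFO is the matching version bump. To confirm the pins after upgrading, a minimal sketch using only the standard library (package names as published on PyPI):

```python
from importlib.metadata import version

# Both distributions are published together, so these should report the same
# version (1.8.0 after upgrading to this release).
print('pydantic-evals:', version('pydantic-evals'))
print('pydantic-ai-slim:', version('pydantic-ai-slim'))
```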
{pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/dataset.py

@@ -265,6 +265,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         retry_evaluators: RetryConfig | None = None,
         *,
         task_name: str | None = None,
+        metadata: dict[str, Any] | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -283,6 +284,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
             retry_evaluators: Optional retry configuration for evaluator execution.
             task_name: Optional override to the name of the task being executed, otherwise the name of the task
                 function will be used.
+            metadata: Optional dict of experiment metadata.

         Returns:
             A report containing the results of the evaluation.
@@ -294,6 +296,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a

         limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()

+        extra_attributes: dict[str, Any] = {'gen_ai.operation.name': 'experiment'}
+        if metadata is not None:
+            extra_attributes['metadata'] = metadata
         with (
             logfire_span(
                 'evaluate {name}',
@@ -301,7 +306,7 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                 task_name=task_name,
                 dataset_name=self.name,
                 n_cases=len(self.cases),
-                **
+                **extra_attributes,
             ) as eval_span,
             progress_bar or nullcontext(),
         ):
@@ -339,13 +344,18 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
                 name=name,
                 cases=cases,
                 failures=failures,
+                experiment_metadata=metadata,
                 span_id=span_id,
                 trace_id=trace_id,
             )
-
-
-
-
+            full_experiment_metadata: dict[str, Any] = {'n_cases': len(self.cases)}
+            if metadata is not None:
+                full_experiment_metadata['metadata'] = metadata
+            if (averages := report.averages()) is not None:
+                full_experiment_metadata['averages'] = averages
+                if averages.assertions is not None:
+                    eval_span.set_attribute('assertion_pass_rate', averages.assertions)
+            eval_span.set_attribute('logfire.experiment.metadata', full_experiment_metadata)
             return report

     def evaluate_sync(
@@ -356,6 +366,9 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         progress: bool = True,
         retry_task: RetryConfig | None = None,
         retry_evaluators: RetryConfig | None = None,
+        *,
+        task_name: str | None = None,
+        metadata: dict[str, Any] | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -364,13 +377,16 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         Args:
             task: The task to evaluate. This should be a callable that takes the inputs of the case
                 and returns the output.
-            name: The name of the
-                If omitted, the name of the task function
+            name: The name of the experiment being run, this is used to identify the experiment in the report.
+                If omitted, the task_name will be used; if that is not specified, the name of the task function is used.
             max_concurrency: The maximum number of concurrent evaluations of the task to allow.
                 If None, all cases will be evaluated concurrently.
-            progress: Whether to show a progress bar for the evaluation. Defaults to True
+            progress: Whether to show a progress bar for the evaluation. Defaults to `True`.
             retry_task: Optional retry configuration for the task execution.
             retry_evaluators: Optional retry configuration for evaluator execution.
+            task_name: Optional override to the name of the task being executed, otherwise the name of the task
+                function will be used.
+            metadata: Optional dict of experiment metadata.

         Returns:
             A report containing the results of the evaluation.
@@ -378,11 +394,13 @@ class Dataset(BaseModel, Generic[InputsT, OutputT, MetadataT], extra='forbid', a
         return get_event_loop().run_until_complete(
             self.evaluate(
                 task,
-
+                name=name,
                 max_concurrency=max_concurrency,
                 progress=progress,
                 retry_task=retry_task,
                 retry_evaluators=retry_evaluators,
+                task_name=task_name,
+                metadata=metadata,
             )
         )

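The dataset.py changes add an optional `metadata` keyword to `Dataset.evaluate` and `Dataset.evaluate_sync`. The dict is stored on the report as `experiment_metadata` and recorded on the evaluation span under `logfire.experiment.metadata`, alongside the computed `n_cases` and averages. A minimal usage sketch; the task, case, and metadata values below are hypothetical and not part of the diff:

```python
from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EqualsExpected


async def answer(question: str) -> str:
    # Hypothetical task under evaluation; a real task would call a model or agent.
    return 'Paris' if 'capital of France' in question else 'unknown'


dataset = Dataset(
    cases=[Case(name='capital', inputs='What is the capital of France?', expected_output='Paris')],
    evaluators=[EqualsExpected()],
)

# New in 1.8.0: experiment metadata flows through to the report (and the logfire span).
report = dataset.evaluate_sync(answer, metadata={'model': 'gpt-4o', 'prompt_version': 3})
assert report.experiment_metadata == {'model': 'gpt-4o', 'prompt_version': 3}
report.print()
```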
{pydantic_evals-1.7.0 → pydantic_evals-1.8.0}/pydantic_evals/reporting/__init__.py

@@ -7,8 +7,10 @@ from io import StringIO
 from typing import Any, Generic, Literal, Protocol, cast

 from pydantic import BaseModel, TypeAdapter
-from rich.console import Console
+from rich.console import Console, Group, RenderableType
+from rich.panel import Panel
 from rich.table import Table
+from rich.text import Text
 from typing_extensions import TypedDict, TypeVar

 from pydantic_evals._utils import UNSET, Unset
@@ -196,6 +198,8 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
     failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(default_factory=list)
     """The failures in the report. These are cases where task execution raised an exception."""

+    experiment_metadata: dict[str, Any] | None = None
+    """Metadata associated with the specific experiment represented by this report."""
     trace_id: str | None = None
     """The trace ID of the evaluation."""
     span_id: str | None = None
@@ -230,7 +234,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
         include_reasons: bool = False,
-    ) -> str:
+    ) -> str:
         """Render this report to a nicely-formatted string, optionally comparing it to a baseline report.

         If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`.
@@ -261,7 +265,6 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             duration_config=duration_config,
             include_reasons=include_reasons,
         )
-        Console(file=io_file)
         return io_file.getvalue()

     def print(
@@ -297,7 +300,8 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         if console is None:  # pragma: no branch
             console = Console(width=width)

-
+        metadata_panel = self._metadata_panel(baseline=baseline)
+        renderable: RenderableType = self.console_table(
             baseline=baseline,
             include_input=include_input,
             include_metadata=include_metadata,
@@ -316,8 +320,12 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             metric_configs=metric_configs,
             duration_config=duration_config,
             include_reasons=include_reasons,
+            with_title=not metadata_panel,
         )
-
+        # Wrap table with experiment metadata panel if present
+        if metadata_panel:
+            renderable = Group(metadata_panel, renderable)
+        console.print(renderable)
         if include_errors and self.failures:  # pragma: no cover
             failures_table = self.failures_table(
                 include_input=include_input,
@@ -330,6 +338,7 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             )
             console.print(failures_table, style='red')

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def console_table(
         self,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
@@ -351,9 +360,11 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
         include_reasons: bool = False,
+        with_title: bool = True,
     ) -> Table:
-        """Return a table containing the data from this report
+        """Return a table containing the data from this report.

+        If a baseline is provided, returns a diff between this report and the baseline report.
         Optionally include input and output details.
         """
         renderer = EvaluationRenderer(
@@ -378,10 +389,82 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]):
             include_reasons=include_reasons,
         )
         if baseline is None:
-            return renderer.build_table(self)
-        else:
-            return renderer.build_diff_table(self, baseline)
+            return renderer.build_table(self, with_title=with_title)
+        else:
+            return renderer.build_diff_table(self, baseline, with_title=with_title)
+
+    def _metadata_panel(
+        self, baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None
+    ) -> RenderableType | None:
+        """Wrap a table with an experiment metadata panel if metadata exists.

+        Args:
+            table: The table to wrap
+            baseline: Optional baseline report for diff metadata
+
+        Returns:
+            Either the table unchanged or a Group with Panel and Table
+        """
+        if baseline is None:
+            # Single report - show metadata if present
+            if self.experiment_metadata:
+                metadata_text = Text()
+                items = list(self.experiment_metadata.items())
+                for i, (key, value) in enumerate(items):
+                    metadata_text.append(f'{key}: {value}', style='dim')
+                    if i < len(items) - 1:
+                        metadata_text.append('\n')
+                return Panel(
+                    metadata_text,
+                    title=f'Evaluation Summary: {self.name}',
+                    title_align='left',
+                    border_style='dim',
+                    padding=(0, 1),
+                    expand=False,
+                )
+        else:
+            # Diff report - show metadata diff if either has metadata
+            if self.experiment_metadata or baseline.experiment_metadata:
+                diff_name = baseline.name if baseline.name == self.name else f'{baseline.name} → {self.name}'
+                metadata_text = Text()
+                lines_styles: list[tuple[str, str]] = []
+                if baseline.experiment_metadata and self.experiment_metadata:
+                    # Collect all keys from both
+                    all_keys = sorted(set(baseline.experiment_metadata.keys()) | set(self.experiment_metadata.keys()))
+                    for key in all_keys:
+                        baseline_val = baseline.experiment_metadata.get(key)
+                        report_val = self.experiment_metadata.get(key)
+                        if baseline_val == report_val:
+                            lines_styles.append((f'{key}: {report_val}', 'dim'))
+                        elif baseline_val is None:
+                            lines_styles.append((f'+ {key}: {report_val}', 'green'))
+                        elif report_val is None:
+                            lines_styles.append((f'- {key}: {baseline_val}', 'red'))
+                        else:
+                            lines_styles.append((f'{key}: {baseline_val} → {report_val}', 'yellow'))
+                elif self.experiment_metadata:
+                    lines_styles = [(f'+ {k}: {v}', 'green') for k, v in self.experiment_metadata.items()]
+                else:  # baseline.experiment_metadata only
+                    assert baseline.experiment_metadata is not None
+                    lines_styles = [(f'- {k}: {v}', 'red') for k, v in baseline.experiment_metadata.items()]
+
+                for i, (line, style) in enumerate(lines_styles):
+                    metadata_text.append(line, style=style)
+                    if i < len(lines_styles) - 1:
+                        metadata_text.append('\n')
+
+                return Panel(
+                    metadata_text,
+                    title=f'Evaluation Diff: {diff_name}',
+                    title_align='left',
+                    border_style='dim',
+                    padding=(0, 1),
+                    expand=False,
+                )
+
+        return None
+
+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def failures_table(
         self,
         *,
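With `experiment_metadata` populated, `EvaluationReport.print()` now renders a metadata panel above the results table, and `print(baseline=...)` renders a key-by-key diff: unchanged keys dim, added keys green, removed keys red, changed keys yellow with an `old → new` arrow. Continuing the hypothetical sketch above:

```python
# Two runs of the same dataset with different experiment metadata (hypothetical values).
baseline_report = dataset.evaluate_sync(answer, name='baseline', metadata={'model': 'gpt-4o-mini', 'temperature': 0.0})
candidate_report = dataset.evaluate_sync(answer, name='candidate', metadata={'model': 'gpt-4o', 'temperature': 0.0})

# The panel shows `temperature: 0.0` unchanged (dim) and `model: gpt-4o-mini → gpt-4o`
# as changed (yellow); the comparison table's own title is suppressed to avoid duplication.
candidate_report.print(baseline=baseline_report)
```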
@@ -705,6 +788,7 @@ class ReportCaseRenderer:
     metric_renderers: Mapping[str, _NumberRenderer]
     duration_renderer: _NumberRenderer

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_base_table(self, title: str) -> Table:
         """Build and return a Rich Table for the diff output."""
         table = Table(title=title, show_lines=True)
@@ -731,6 +815,7 @@ class ReportCaseRenderer:
         table.add_column('Durations' if self.include_total_duration else 'Duration', justify='right')
         return table

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_failures_table(self, title: str) -> Table:
         """Build and return a Rich Table for the failures output."""
         table = Table(title=title, show_lines=True)
@@ -1190,9 +1275,22 @@ class EvaluationRenderer:
             duration_renderer=duration_renderer,
         )

-
+    # TODO(DavidM): in v2, change the return type here to RenderableType
+    def build_table(self, report: EvaluationReport, *, with_title: bool = True) -> Table:
+        """Build a table for the report.
+
+        Args:
+            report: The evaluation report to render
+            with_title: Whether to include the title in the table (default True)
+
+        Returns:
+            A Rich Table object
+        """
         case_renderer = self._get_case_renderer(report)
-
+
+        title = f'Evaluation Summary: {report.name}' if with_title else ''
+        table = case_renderer.build_base_table(title)
+
         for case in report.cases:
             table.add_row(*case_renderer.build_row(case))

@@ -1203,7 +1301,20 @@ class EvaluationRenderer:

         return table

-
+    # TODO(DavidM): in v2, change the return type here to RenderableType
+    def build_diff_table(
+        self, report: EvaluationReport, baseline: EvaluationReport, *, with_title: bool = True
+    ) -> Table:
+        """Build a diff table comparing report to baseline.
+
+        Args:
+            report: The evaluation report to compare
+            baseline: The baseline report to compare against
+            with_title: Whether to include the title in the table (default True)
+
+        Returns:
+            A Rich Table object
+        """
         report_cases = report.cases
         baseline_cases = self._baseline_cases_to_include(report, baseline)

@@ -1228,7 +1339,10 @@ class EvaluationRenderer:

         case_renderer = self._get_case_renderer(report, baseline)
         diff_name = baseline.name if baseline.name == report.name else f'{baseline.name} → {report.name}'
-
+
+        title = f'Evaluation Diff: {diff_name}' if with_title else ''
+        table = case_renderer.build_base_table(title)
+
         for baseline_case, new_case in diff_cases:
             table.add_row(*case_renderer.build_diff_row(new_case, baseline_case))
         for case in added_cases:
@@ -1247,6 +1361,7 @@ class EvaluationRenderer:

         return table

+    # TODO(DavidM): in v2, change the return type here to RenderableType
     def build_failures_table(self, report: EvaluationReport) -> Table:
         case_renderer = self._get_case_renderer(report)
         table = case_renderer.build_failures_table('Case Failures')
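The renderer changes thread a `with_title` flag through `build_table` and `build_diff_table` so the table's own title can be dropped when the metadata panel already provides the heading. The flag is also exposed on the public `console_table` method, which can be useful if you embed the table in your own layout. A sketch, reusing the hypothetical `report` from above:

```python
from rich.console import Console

console = Console()

# Default: the table carries the 'Evaluation Summary: ...' title itself.
console.print(report.console_table())

# New in 1.8.0: suppress the built-in title, e.g. when wrapping the table elsewhere.
console.print(report.console_table(with_title=False))
```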