themis-eval 0.1.1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +429 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/commands/results.py +252 -0
  8. themis/cli/main.py +427 -57
  9. themis/comparison/__init__.py +25 -0
  10. themis/comparison/engine.py +348 -0
  11. themis/comparison/reports.py +283 -0
  12. themis/comparison/statistics.py +402 -0
  13. themis/core/entities.py +23 -3
  14. themis/evaluation/metrics/code/__init__.py +19 -0
  15. themis/evaluation/metrics/code/codebleu.py +144 -0
  16. themis/evaluation/metrics/code/execution.py +280 -0
  17. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  18. themis/evaluation/metrics/nlp/__init__.py +21 -0
  19. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  20. themis/evaluation/metrics/nlp/bleu.py +129 -0
  21. themis/evaluation/metrics/nlp/meteor.py +153 -0
  22. themis/evaluation/metrics/nlp/rouge.py +136 -0
  23. themis/evaluation/pipelines/standard_pipeline.py +68 -8
  24. themis/experiment/cache_manager.py +8 -3
  25. themis/experiment/export.py +110 -2
  26. themis/experiment/orchestrator.py +109 -11
  27. themis/experiment/storage.py +1457 -110
  28. themis/generation/providers/litellm_provider.py +46 -0
  29. themis/generation/runner.py +22 -6
  30. themis/integrations/huggingface.py +12 -1
  31. themis/integrations/wandb.py +13 -1
  32. themis/interfaces/__init__.py +86 -0
  33. themis/presets/__init__.py +10 -0
  34. themis/presets/benchmarks.py +354 -0
  35. themis/presets/models.py +190 -0
  36. themis/server/__init__.py +28 -0
  37. themis/server/app.py +337 -0
  38. themis_eval-0.2.1.dist-info/METADATA +596 -0
  39. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/RECORD +42 -19
  40. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/WHEEL +1 -1
  41. themis_eval-0.1.1.dist-info/METADATA +0 -758
  42. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/licenses/LICENSE +0 -0
  43. {themis_eval-0.1.1.dist-info → themis_eval-0.2.1.dist-info}/top_level.txt +0 -0
themis/comparison/engine.py (new file)
@@ -0,0 +1,348 @@
+ """Comparison engine for analyzing multiple experiment runs.
+
+ This module provides the main ComparisonEngine class that orchestrates
+ loading runs, computing statistics, and generating comparison reports.
+ """
+
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import Sequence
+
+ from themis.comparison import reports, statistics
+ from themis.comparison.statistics import StatisticalTest
+ from themis.experiment import storage as experiment_storage
+
+
+ class ComparisonEngine:
+     """Engine for comparing multiple experiment runs.
+
+     This class loads experiment results from storage and performs
+     pairwise comparisons across all metrics with statistical testing.
+     """
+
+     def __init__(
+         self,
+         *,
+         storage: experiment_storage.ExperimentStorage | None = None,
+         storage_path: str | Path | None = None,
+         statistical_test: StatisticalTest = StatisticalTest.BOOTSTRAP,
+         alpha: float = 0.05,
+         n_bootstrap: int = 10000,
+         n_permutations: int = 10000,
+     ):
+         """Initialize comparison engine.
+
+         Args:
+             storage: Experiment storage instance
+             storage_path: Path to storage (if storage not provided)
+             statistical_test: Type of statistical test to use
+             alpha: Significance level for tests
+             n_bootstrap: Number of bootstrap iterations
+             n_permutations: Number of permutations for permutation test
+         """
+         if storage is None and storage_path is None:
+             raise ValueError("Either storage or storage_path must be provided")
+
+         self._storage = storage or experiment_storage.ExperimentStorage(storage_path)
+         self._statistical_test = statistical_test
+         self._alpha = alpha
+         self._n_bootstrap = n_bootstrap
+         self._n_permutations = n_permutations
+
+     def compare_runs(
+         self,
+         run_ids: Sequence[str],
+         *,
+         metrics: Sequence[str] | None = None,
+         statistical_test: StatisticalTest | None = None,
+     ) -> reports.ComparisonReport:
+         """Compare multiple runs across specified metrics.
+
+         Args:
+             run_ids: List of run IDs to compare
+             metrics: List of metrics to compare (None = all available)
+             statistical_test: Override default statistical test
+
+         Returns:
+             ComparisonReport with all comparisons and statistics
+
+         Raises:
+             ValueError: If fewer than 2 runs provided or runs not found
+         """
+         if len(run_ids) < 2:
+             raise ValueError("Need at least 2 runs to compare")
+
+         # Load all runs
+         run_data = {}
+         for run_id in run_ids:
+             try:
+                 data = self._load_run_metrics(run_id)
+                 run_data[run_id] = data
+             except FileNotFoundError:
+                 raise ValueError(f"Run not found: {run_id}")
+
+         # Determine metrics to compare
+         if metrics is None:
+             # Use all metrics that appear in all runs
+             all_metrics = set(run_data[run_ids[0]].keys())
+             for run_id in run_ids[1:]:
+                 all_metrics &= set(run_data[run_id].keys())
+             metrics = sorted(all_metrics)
+
+         if not metrics:
+             raise ValueError("No common metrics found across all runs")
+
+         # Perform pairwise comparisons
+         pairwise_results = []
+         for metric in metrics:
+             for i, run_a in enumerate(run_ids):
+                 for run_b in run_ids[i + 1:]:
+                     result = self._compare_pair(
+                         run_a,
+                         run_b,
+                         metric,
+                         run_data[run_a][metric],
+                         run_data[run_b][metric],
+                         statistical_test or self._statistical_test,
+                     )
+                     pairwise_results.append(result)
+
+         # Build win/loss matrices
+         win_loss_matrices = {}
+         for metric in metrics:
+             matrix = self._build_win_loss_matrix(run_ids, metric, pairwise_results)
+             win_loss_matrices[metric] = matrix
+
+         # Determine best run per metric
+         best_run_per_metric = {}
+         for metric in metrics:
+             # Find run with highest mean
+             best_run = max(
+                 run_ids,
+                 key=lambda rid: sum(run_data[rid][metric]) / len(run_data[rid][metric])
+             )
+             best_run_per_metric[metric] = best_run
+
+         # Determine overall best run (most wins across all metrics)
+         overall_wins = {run_id: 0 for run_id in run_ids}
+         for matrix in win_loss_matrices.values():
+             for run_id in run_ids:
+                 overall_wins[run_id] += matrix.win_counts.get(run_id, 0)
+
+         overall_best_run = max(overall_wins, key=overall_wins.get)
+
+         return reports.ComparisonReport(
+             run_ids=list(run_ids),
+             metrics=list(metrics),
+             pairwise_results=pairwise_results,
+             win_loss_matrices=win_loss_matrices,
+             best_run_per_metric=best_run_per_metric,
+             overall_best_run=overall_best_run,
+             metadata={
+                 "statistical_test": self._statistical_test.value,
+                 "alpha": self._alpha,
+                 "n_runs": len(run_ids),
+                 "n_metrics": len(metrics),
+             },
+         )
+
+     def _load_run_metrics(self, run_id: str) -> dict[str, list[float]]:
+         """Load all metric scores for a run.
+
+         Returns:
+             Dictionary mapping metric names to lists of scores
+         """
+         # Load evaluation records from storage (returns dict of cache_key -> EvaluationRecord)
+         eval_dict = self._storage.load_cached_evaluations(run_id)
+
+         # Organize scores by metric
+         metric_scores: dict[str, list[float]] = {}
+
+         # eval_dict is a dict, so iterate over values
+         for record in eval_dict.values():
+             for metric_name, score_obj in record.scores.items():
+                 if metric_name not in metric_scores:
+                     metric_scores[metric_name] = []
+
+                 # Get numeric score
+                 if hasattr(score_obj, 'value'):
+                     score = score_obj.value
+                 elif isinstance(score_obj, (int, float)):
+                     score = float(score_obj)
+                 else:
+                     continue  # Skip non-numeric scores
+
+                 metric_scores[metric_name].append(score)
+
+         return metric_scores
+
+     def _compare_pair(
+         self,
+         run_a_id: str,
+         run_b_id: str,
+         metric_name: str,
+         samples_a: list[float],
+         samples_b: list[float],
+         test_type: StatisticalTest,
+     ) -> reports.ComparisonResult:
+         """Compare two runs on a single metric.
+
+         Args:
+             run_a_id: First run identifier
+             run_b_id: Second run identifier
+             metric_name: Name of metric being compared
+             samples_a: Scores for first run
+             samples_b: Scores for second run
+             test_type: Type of statistical test to perform
+
+         Returns:
+             ComparisonResult with comparison statistics
+         """
+         # Calculate means
+         mean_a = sum(samples_a) / len(samples_a)
+         mean_b = sum(samples_b) / len(samples_b)
+
+         # Calculate delta
+         delta = mean_a - mean_b
+         delta_percent = (delta / mean_b * 100) if mean_b != 0 else 0.0
+
+         # Perform statistical test
+         test_result = None
+         if test_type == StatisticalTest.T_TEST:
+             test_result = statistics.t_test(
+                 samples_a, samples_b, alpha=self._alpha, paired=True
+             )
+         elif test_type == StatisticalTest.BOOTSTRAP:
+             test_result = statistics.bootstrap_confidence_interval(
+                 samples_a,
+                 samples_b,
+                 n_bootstrap=self._n_bootstrap,
+                 confidence_level=1 - self._alpha,
+             )
+         elif test_type == StatisticalTest.PERMUTATION:
+             test_result = statistics.permutation_test(
+                 samples_a,
+                 samples_b,
+                 n_permutations=self._n_permutations,
+                 alpha=self._alpha,
+             )
+
+         # Determine winner
+         if test_result and test_result.significant:
+             winner = run_a_id if delta > 0 else run_b_id
+         else:
+             winner = "tie"
+
+         return reports.ComparisonResult(
+             metric_name=metric_name,
+             run_a_id=run_a_id,
+             run_b_id=run_b_id,
+             run_a_mean=mean_a,
+             run_b_mean=mean_b,
+             delta=delta,
+             delta_percent=delta_percent,
+             winner=winner,
+             test_result=test_result,
+             run_a_samples=samples_a,
+             run_b_samples=samples_b,
+         )
+
+     def _build_win_loss_matrix(
+         self,
+         run_ids: Sequence[str],
+         metric: str,
+         pairwise_results: list[reports.ComparisonResult],
+     ) -> reports.WinLossMatrix:
+         """Build win/loss matrix for a specific metric.
+
+         Args:
+             run_ids: List of run IDs
+             metric: Metric name
+             pairwise_results: All pairwise comparison results
+
+         Returns:
+             WinLossMatrix for the metric
+         """
+         n = len(run_ids)
+         matrix = [["—" for _ in range(n)] for _ in range(n)]
+
+         win_counts = {rid: 0 for rid in run_ids}
+         loss_counts = {rid: 0 for rid in run_ids}
+         tie_counts = {rid: 0 for rid in run_ids}
+
+         # Fill matrix from pairwise results
+         for result in pairwise_results:
+             if result.metric_name != metric:
+                 continue
+
+             idx_a = run_ids.index(result.run_a_id)
+             idx_b = run_ids.index(result.run_b_id)
+
+             if result.winner == result.run_a_id:
+                 matrix[idx_a][idx_b] = "win"
+                 matrix[idx_b][idx_a] = "loss"
+                 win_counts[result.run_a_id] += 1
+                 loss_counts[result.run_b_id] += 1
+             elif result.winner == result.run_b_id:
+                 matrix[idx_a][idx_b] = "loss"
+                 matrix[idx_b][idx_a] = "win"
+                 loss_counts[result.run_a_id] += 1
+                 win_counts[result.run_b_id] += 1
+             else:  # tie
+                 matrix[idx_a][idx_b] = "tie"
+                 matrix[idx_b][idx_a] = "tie"
+                 tie_counts[result.run_a_id] += 1
+                 tie_counts[result.run_b_id] += 1
+
+         return reports.WinLossMatrix(
+             run_ids=list(run_ids),
+             metric_name=metric,
+             matrix=matrix,
+             win_counts=win_counts,
+             loss_counts=loss_counts,
+             tie_counts=tie_counts,
+         )
+
+
+ def compare_runs(
+     run_ids: Sequence[str],
+     *,
+     storage_path: str | Path,
+     metrics: Sequence[str] | None = None,
+     statistical_test: StatisticalTest = StatisticalTest.BOOTSTRAP,
+     alpha: float = 0.05,
+ ) -> reports.ComparisonReport:
+     """Convenience function to compare runs.
+
+     Args:
+         run_ids: List of run IDs to compare
+         storage_path: Path to experiment storage
+         metrics: List of metrics to compare (None = all)
+         statistical_test: Type of statistical test
+         alpha: Significance level
+
+     Returns:
+         ComparisonReport with all comparisons
+
+     Example:
+         >>> report = compare_runs(
+         ...     ["run-gpt4", "run-claude"],
+         ...     storage_path=".cache/experiments",
+         ...     metrics=["ExactMatch", "BLEU"],
+         ... )
+         >>> print(report.summary())
+     """
+     engine = ComparisonEngine(
+         storage_path=storage_path,
+         statistical_test=statistical_test,
+         alpha=alpha,
+     )
+
+     return engine.compare_runs(run_ids, metrics=metrics)
+
+
+ __all__ = [
+     "ComparisonEngine",
+     "compare_runs",
+ ]
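
The new comparison API can be driven either through the ComparisonEngine class or the module-level compare_runs() wrapper defined above. The sketch below is a minimal usage example based only on the code in this diff; the run IDs and storage path are illustrative placeholders and assume results were previously written by themis.experiment.storage.ExperimentStorage.

    from themis.comparison.engine import ComparisonEngine, compare_runs
    from themis.comparison.statistics import StatisticalTest

    # One-shot comparison with the defaults (bootstrap test, alpha=0.05).
    report = compare_runs(
        ["run-gpt4", "run-claude"],          # run IDs are illustrative
        storage_path=".cache/experiments",   # path is illustrative
        metrics=["ExactMatch", "BLEU"],
    )
    print(report.summary(include_details=True))

    # Constructing the engine directly exposes the statistical test parameters.
    engine = ComparisonEngine(
        storage_path=".cache/experiments",
        statistical_test=StatisticalTest.PERMUTATION,
        alpha=0.01,
        n_permutations=5000,
    )
    report = engine.compare_runs(["run-gpt4", "run-claude", "run-llama"])
    print(report.overall_best_run)

Per the engine code, omitting metrics compares only the metrics common to every run, and a pair is declared a "win" only when the configured test reports a significant difference; otherwise it is recorded as a tie.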
themis/comparison/reports.py (new file)
@@ -0,0 +1,283 @@
+ """Comparison reports for analyzing experiment results.
+
+ This module provides structured reports for comparing multiple runs,
+ including win/loss matrices, metric deltas, and statistical significance.
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import dataclass, field
+ from typing import Any, Sequence
+
+ from themis.comparison.statistics import StatisticalTestResult
+
+
+ @dataclass
+ class ComparisonResult:
+     """Result of comparing two runs on a single metric.
+
+     Attributes:
+         metric_name: Name of the metric being compared
+         run_a_id: Identifier for first run
+         run_b_id: Identifier for second run
+         run_a_mean: Mean value for first run
+         run_b_mean: Mean value for second run
+         delta: Difference (run_a - run_b)
+         delta_percent: Percentage difference
+         winner: ID of the winning run ("tie" if no significant difference)
+         test_result: Statistical test result (if performed)
+         run_a_samples: Individual sample scores for run A
+         run_b_samples: Individual sample scores for run B
+     """
+
+     metric_name: str
+     run_a_id: str
+     run_b_id: str
+     run_a_mean: float
+     run_b_mean: float
+     delta: float
+     delta_percent: float
+     winner: str  # run_a_id, run_b_id, or "tie"
+     test_result: StatisticalTestResult | None = None
+     run_a_samples: list[float] = field(default_factory=list)
+     run_b_samples: list[float] = field(default_factory=list)
+
+     def is_significant(self) -> bool:
+         """Check if the difference is statistically significant."""
+         return self.test_result is not None and self.test_result.significant
+
+     def summary(self) -> str:
+         """Generate a human-readable summary."""
+         direction = "↑" if self.delta > 0 else "↓" if self.delta < 0 else "="
+
+         summary = (
+             f"{self.metric_name}: {self.run_a_id} "
+             f"({self.run_a_mean:.3f}) vs {self.run_b_id} "
+             f"({self.run_b_mean:.3f}) = {direction}{abs(self.delta):.3f} "
+             f"({self.delta_percent:+.1f}%)"
+         )
+
+         if self.test_result:
+             sig_marker = "***" if self.is_significant() else "n.s."
+             summary += f" [{sig_marker}, p={self.test_result.p_value:.4f}]"
+
+         return summary
+
+
+ @dataclass
+ class WinLossMatrix:
+     """Win/loss/tie matrix for comparing multiple runs.
+
+     Attributes:
+         run_ids: List of run IDs in the matrix
+         metric_name: Name of the metric being compared
+         matrix: 2D matrix of results
+             matrix[i][j] = result of comparing run i vs run j
+             Values: "win", "loss", "tie"
+         win_counts: Number of wins for each run
+         loss_counts: Number of losses for each run
+         tie_counts: Number of ties for each run
+     """
+
+     run_ids: list[str]
+     metric_name: str
+     matrix: list[list[str]]
+     win_counts: dict[str, int] = field(default_factory=dict)
+     loss_counts: dict[str, int] = field(default_factory=dict)
+     tie_counts: dict[str, int] = field(default_factory=dict)
+
+     def get_result(self, run_a: str, run_b: str) -> str:
+         """Get comparison result between two runs."""
+         try:
+             idx_a = self.run_ids.index(run_a)
+             idx_b = self.run_ids.index(run_b)
+             return self.matrix[idx_a][idx_b]
+         except (ValueError, IndexError):
+             return "unknown"
+
+     def rank_runs(self) -> list[tuple[str, int, int, int]]:
+         """Rank runs by wins (descending), then losses (ascending).
+
+         Returns:
+             List of (run_id, wins, losses, ties) sorted by performance
+         """
+         rankings = [
+             (
+                 run_id,
+                 self.win_counts.get(run_id, 0),
+                 self.loss_counts.get(run_id, 0),
+                 self.tie_counts.get(run_id, 0),
+             )
+             for run_id in self.run_ids
+         ]
+
+         # Sort by wins (desc), then losses (asc)
+         rankings.sort(key=lambda x: (-x[1], x[2]))
+         return rankings
+
+     def to_table(self) -> str:
+         """Generate a formatted table representation."""
+         lines = []
+
+         # Header
+         header = f"{'Run':<20} | " + " | ".join(f"{rid:<12}" for rid in self.run_ids)
+         lines.append(header)
+         lines.append("-" * len(header))
+
+         # Rows
+         for i, run_id in enumerate(self.run_ids):
+             row = f"{run_id:<20} | "
+             row += " | ".join(f"{self.matrix[i][j]:<12}" for j in range(len(self.run_ids)))
+             lines.append(row)
+
+         # Summary
+         lines.append("")
+         lines.append("Summary (W/L/T):")
+         for run_id, wins, losses, ties in self.rank_runs():
+             lines.append(f"  {run_id}: {wins}/{losses}/{ties}")
+
+         return "\n".join(lines)
+
+
+ @dataclass
+ class ComparisonReport:
+     """Comprehensive comparison report for multiple runs.
+
+     Attributes:
+         run_ids: List of all run IDs being compared
+         metrics: List of metric names being compared
+         pairwise_results: List of all pairwise comparison results
+         win_loss_matrices: Win/loss matrices for each metric
+         best_run_per_metric: Best run for each metric
+         overall_best_run: Overall best run across all metrics
+         metadata: Additional metadata about the comparison
+     """
+
+     run_ids: list[str]
+     metrics: list[str]
+     pairwise_results: list[ComparisonResult] = field(default_factory=list)
+     win_loss_matrices: dict[str, WinLossMatrix] = field(default_factory=dict)
+     best_run_per_metric: dict[str, str] = field(default_factory=dict)
+     overall_best_run: str | None = None
+     metadata: dict[str, Any] = field(default_factory=dict)
+
+     def get_comparison(
+         self, run_a: str, run_b: str, metric: str
+     ) -> ComparisonResult | None:
+         """Get comparison result for specific runs and metric."""
+         for result in self.pairwise_results:
+             if (
+                 result.metric_name == metric
+                 and result.run_a_id == run_a
+                 and result.run_b_id == run_b
+             ):
+                 return result
+         return None
+
+     def get_metric_results(self, metric: str) -> list[ComparisonResult]:
+         """Get all comparison results for a specific metric."""
+         return [r for r in self.pairwise_results if r.metric_name == metric]
+
+     def summary(self, include_details: bool = False) -> str:
+         """Generate a human-readable summary of the comparison.
+
+         Args:
+             include_details: Whether to include detailed pairwise comparisons
+
+         Returns:
+             Formatted summary string
+         """
+         lines = []
+         lines.append("=" * 80)
+         lines.append("COMPARISON REPORT")
+         lines.append("=" * 80)
+         lines.append("")
+
+         # Overall summary
+         lines.append(f"Comparing {len(self.run_ids)} runs across {len(self.metrics)} metrics")
+         lines.append(f"Runs: {', '.join(self.run_ids)}")
+         lines.append(f"Metrics: {', '.join(self.metrics)}")
+         lines.append("")
+
+         # Best run per metric
+         if self.best_run_per_metric:
+             lines.append("Best Run Per Metric:")
+             for metric, run_id in self.best_run_per_metric.items():
+                 lines.append(f"  {metric}: {run_id}")
+             lines.append("")
+
+         # Overall best
+         if self.overall_best_run:
+             lines.append(f"Overall Best Run: {self.overall_best_run}")
+             lines.append("")
+
+         # Win/loss matrices
+         if self.win_loss_matrices and include_details:
+             lines.append("=" * 80)
+             lines.append("WIN/LOSS MATRICES")
+             lines.append("=" * 80)
+             for metric, matrix in self.win_loss_matrices.items():
+                 lines.append("")
+                 lines.append(f"Metric: {metric}")
+                 lines.append("-" * 40)
+                 lines.append(matrix.to_table())
+                 lines.append("")
+
+         # Pairwise comparisons
+         if include_details and self.pairwise_results:
+             lines.append("=" * 80)
+             lines.append("PAIRWISE COMPARISONS")
+             lines.append("=" * 80)
+
+             for metric in self.metrics:
+                 results = self.get_metric_results(metric)
+                 if results:
+                     lines.append("")
+                     lines.append(f"Metric: {metric}")
+                     lines.append("-" * 40)
+                     for result in results:
+                         lines.append(f"  {result.summary()}")
+                     lines.append("")
+
+         return "\n".join(lines)
+
+     def to_dict(self) -> dict[str, Any]:
+         """Convert report to dictionary for serialization."""
+         return {
+             "run_ids": self.run_ids,
+             "metrics": self.metrics,
+             "best_run_per_metric": self.best_run_per_metric,
+             "overall_best_run": self.overall_best_run,
+             "pairwise_results": [
+                 {
+                     "metric": r.metric_name,
+                     "run_a": r.run_a_id,
+                     "run_b": r.run_b_id,
+                     "run_a_mean": r.run_a_mean,
+                     "run_b_mean": r.run_b_mean,
+                     "delta": r.delta,
+                     "delta_percent": r.delta_percent,
+                     "winner": r.winner,
+                     "significant": r.is_significant(),
+                     "p_value": r.test_result.p_value if r.test_result else None,
+                 }
+                 for r in self.pairwise_results
+             ],
+             "win_loss_summary": {
+                 metric: {
+                     "rankings": [
+                         {"run_id": rid, "wins": w, "losses": l, "ties": t}
+                         for rid, w, l, t in matrix.rank_runs()
+                     ]
+                 }
+                 for metric, matrix in self.win_loss_matrices.items()
+             },
+             "metadata": self.metadata,
+         }
+
+
+ __all__ = [
+     "ComparisonResult",
+     "WinLossMatrix",
+     "ComparisonReport",
+ ]
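
A ComparisonReport is a plain dataclass, so downstream tooling can consume it directly. The sketch below, again using illustrative run IDs, metric names, and paths, exercises the drill-down and serialization helpers defined above.

    import json

    from themis.comparison.engine import compare_runs

    report = compare_runs(
        ["run-a", "run-b", "run-c"],         # illustrative run IDs
        storage_path=".cache/experiments",   # illustrative path
    )

    # Per-metric drill-down: each pairwise result carries means, delta, and significance.
    for result in report.get_metric_results("ExactMatch"):
        print(result.summary())

    # Win/loss ranking for a single metric.
    matrix = report.win_loss_matrices.get("ExactMatch")
    if matrix is not None:
        for run_id, wins, losses, ties in matrix.rank_runs():
            print(f"{run_id}: {wins}W / {losses}L / {ties}T")

    # Serialize the whole report, e.g. for dashboards or archival.
    with open("comparison_report.json", "w") as fh:
        json.dump(report.to_dict(), fh, indent=2)

Note that to_dict() keeps only aggregate values and p-values; the raw per-sample scores stay on the ComparisonResult objects (run_a_samples / run_b_samples) and are not serialized.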