themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
Files changed (158)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0
themis/experiment/export.py (new file; entry 109 above)
@@ -0,0 +1,798 @@
+"""Utilities for exporting experiment results to CSV, JSON, and HTML."""
+
+from __future__ import annotations
+
+import csv
+import html
+import json
+from collections import OrderedDict
+from pathlib import Path
+from typing import Mapping, MutableMapping, Protocol, Sequence
+
+from themis.core import entities as core_entities
+from themis.experiment import orchestrator
+
+
+class ChartPointLike(Protocol):
+    label: str
+    x_value: object
+    metric_value: float
+    metric_name: str
+    count: int
+
+
+class ChartLike(Protocol):
+    title: str
+    x_label: str
+    y_label: str
+    metric_name: str
+    points: Sequence[ChartPointLike]
+
+
+def export_report_csv(
+    report: orchestrator.ExperimentReport,
+    path: str | Path,
+    *,
+    include_failures: bool = True,
+) -> Path:
+    """Write per-sample metrics to a CSV file for offline analysis."""
+
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    metadata_by_condition, metadata_fields = _collect_sample_metadata(
+        report.generation_results
+    )
+
+    # Evaluation records are assumed to be in the same order as generation
+    # records, so rows are matched to generation tasks positionally below.
+    metric_names = sorted(report.evaluation_report.metrics.keys())
+    fieldnames = (
+        ["sample_id"] + metadata_fields + [f"metric:{name}" for name in metric_names]
+    )
+    if include_failures:
+        fieldnames.append("failures")
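+    # Resulting CSV header, for illustration (the metadata fields and the
+    # metric name shown are hypothetical and depend on the run):
+    # sample_id, model_identifier, prompt_template, ..., metric:exact_match, failures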
+
+    with path.open("w", encoding="utf-8", newline="") as handle:
+        writer = csv.DictWriter(handle, fieldnames=fieldnames)
+        writer.writeheader()
+
+        # Process evaluation records in the same order as generation records
+        for i, eval_record in enumerate(report.evaluation_report.records):
+            # Find the corresponding generation record by index
+            if i < len(report.generation_results):
+                gen_record = report.generation_results[i]
+                sample_id = gen_record.task.metadata.get(
+                    "dataset_id"
+                ) or gen_record.task.metadata.get("sample_id")
+                prompt_template = gen_record.task.prompt.spec.name
+                model_identifier = gen_record.task.model.identifier
+                sampling_temp = gen_record.task.sampling.temperature
+                sampling_max_tokens = gen_record.task.sampling.max_tokens
+                condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
+                metadata = metadata_by_condition.get(condition_id, {})
+            else:
+                # Fallback for extra evaluation records
+                sample_id = eval_record.sample_id or ""
+                metadata = {}
+
+            row: dict[str, object] = {"sample_id": sample_id}
+            for field in metadata_fields:
+                row[field] = metadata.get(field, "")
+            score_by_name = {
+                score.metric_name: score.value for score in eval_record.scores
+            }
+            for name in metric_names:
+                row[f"metric:{name}"] = score_by_name.get(name, "")
+            if include_failures:
+                row["failures"] = "; ".join(eval_record.failures)
+            writer.writerow(row)
+    return path
+
+
+def export_html_report(
+    report: orchestrator.ExperimentReport,
+    path: str | Path,
+    *,
+    charts: Sequence[ChartLike] | None = None,
+    title: str = "Experiment report",
+    sample_limit: int = 100,
+) -> Path:
+    """Render the experiment report as an HTML document."""
+
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    html_content = render_html_report(
+        report,
+        charts=charts,
+        title=title,
+        sample_limit=sample_limit,
+    )
+    path.write_text(html_content, encoding="utf-8")
+    return path
+
+
+def export_report_json(
+    report: orchestrator.ExperimentReport,
+    path: str | Path,
+    *,
+    charts: Sequence[ChartLike] | None = None,
+    title: str = "Experiment report",
+    sample_limit: int | None = None,
+    indent: int = 2,
+) -> Path:
+    """Serialize the report details to JSON for downstream tooling."""
+
+    payload = build_json_report(
+        report,
+        charts=charts,
+        title=title,
+        sample_limit=sample_limit,
+    )
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(payload, indent=indent), encoding="utf-8")
+    return path
+
+
+def export_summary_json(
+    report: orchestrator.ExperimentReport,
+    path: str | Path,
+    *,
+    run_id: str | None = None,
+    indent: int = 2,
+) -> Path:
+    """Export a lightweight summary JSON file for quick results viewing.
+
+    This creates a small summary file (~1 KB) containing only the essential
+    metrics and metadata, without the full sample-level details, which makes
+    it ideal for quickly comparing runs without parsing large report files.
+
+    Args:
+        report: Experiment report to summarize
+        path: Output path for summary.json
+        run_id: Optional run identifier to include in summary
+        indent: JSON indentation level
+
+    Returns:
+        Path to the created summary file
+
+    Example:
+        >>> export_summary_json(report, "outputs/run-123/summary.json", run_id="run-123")
+        >>> # Quick comparison: cat outputs/*/summary.json | jq '.accuracy'
+
+    Note:
+        The summary file is typically ~1 KB, versus ~1.6 MB for the full
+        report: roughly three orders of magnitude smaller, and correspondingly
+        faster to view and compare across runs.
+    """
+    # Extract key metrics
+    metrics_summary = {}
+    for name, aggregate in report.evaluation_report.metrics.items():
+        metrics_summary[name] = {
+            "mean": aggregate.mean,
+            "count": aggregate.count,
+        }
+
+    # Extract metadata from the first generation record
+    metadata = {}
+    if report.generation_results:
+        first_record = report.generation_results[0]
+        metadata = {
+            "model": first_record.task.model.identifier,
+            "prompt_template": first_record.task.prompt.spec.name,
+            "sampling": {
+                "temperature": first_record.task.sampling.temperature,
+                "top_p": first_record.task.sampling.top_p,
+                "max_tokens": first_record.task.sampling.max_tokens,
+            },
+        }
+
+    # Calculate total cost if available
+    total_cost = 0.0
+    for record in report.generation_results:
+        if "cost_usd" in record.metrics:
+            total_cost += record.metrics["cost_usd"]
+
+    # Count failures
+    failure_count = len(report.evaluation_report.failures)
+
+    # Build summary
+    summary = {
+        "run_id": run_id,
+        "total_samples": len(report.generation_results),
+        "metrics": metrics_summary,
+        "metadata": metadata,
+        "cost_usd": round(total_cost, 4) if total_cost > 0 else None,
+        "failures": failure_count,
+        "failure_rate": (
+            round(failure_count / len(report.generation_results), 4)
+            if report.generation_results
+            else 0.0
+        ),
+    }
+
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(summary, indent=indent), encoding="utf-8")
+    return path
+
+
+def export_report_bundle(
+    report: orchestrator.ExperimentReport,
+    *,
+    csv_path: str | Path | None = None,
+    html_path: str | Path | None = None,
+    json_path: str | Path | None = None,
+    summary_path: str | Path | None = None,
+    run_id: str | None = None,
+    charts: Sequence[ChartLike] | None = None,
+    title: str = "Experiment report",
+    sample_limit: int = 100,
+    indent: int = 2,
+) -> OrderedDict[str, Path]:
+    """Convenience helper that writes multiple export formats at once.
+
+    Args:
+        report: Experiment report to export
+        csv_path: Optional path for CSV export
+        html_path: Optional path for HTML export
+        json_path: Optional path for full JSON export
+        summary_path: Optional path for lightweight summary JSON export
+        run_id: Optional run identifier for summary
+        charts: Optional charts to include in visualizations
+        title: Report title
+        sample_limit: Maximum samples to include in detailed exports
+        indent: JSON indentation level
+
+    Returns:
+        Ordered dict of format -> path for created files
+
+    Note:
+        The summary export is highly recommended as it provides quick access
+        to key metrics without parsing large report files.
+    """
+    outputs: OrderedDict[str, Path] = OrderedDict()
+    if csv_path is not None:
+        outputs["csv"] = export_report_csv(report, csv_path)
+    if html_path is not None:
+        outputs["html"] = export_html_report(
+            report,
+            html_path,
+            charts=charts,
+            title=title,
+            sample_limit=sample_limit,
+        )
+    if json_path is not None:
+        outputs["json"] = export_report_json(
+            report,
+            json_path,
+            charts=charts,
+            title=title,
+            sample_limit=sample_limit,
+            indent=indent,
+        )
+    if summary_path is not None:
+        outputs["summary"] = export_summary_json(
+            report, summary_path, run_id=run_id, indent=indent
+        )
+    return outputs
+
+
+def render_html_report(
+    report: orchestrator.ExperimentReport,
+    *,
+    charts: Sequence[ChartLike] | None = None,
+    title: str = "Experiment report",
+    sample_limit: int = 100,
+) -> str:
+    """Return an HTML string summarizing the experiment results."""
+
+    metadata_by_sample, metadata_fields = _collect_sample_metadata(
+        report.generation_results
+    )
+    metric_names = sorted(report.evaluation_report.metrics.keys())
+    summary_section = _render_summary(report)
+    cost_section = _render_cost_section(report)
+    metrics_table = _render_metric_table(report)
+    samples_table = _render_sample_table(
+        report,
+        metadata_by_sample,
+        metadata_fields,
+        metric_names,
+        limit=sample_limit,
+    )
+    chart_sections = "\n".join(_render_chart_section(chart) for chart in charts or ())
+    html_doc = f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8" />
+<title>{html.escape(title)}</title>
+<style>
+body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; margin: 32px; background: #f6f8fb; color: #1f2933; }}
+h1 {{ font-size: 1.8rem; margin-bottom: 0.5rem; }}
+section {{ margin-bottom: 2rem; }}
+table {{ border-collapse: collapse; width: 100%; background: white; box-shadow: 0 1px 2px rgba(15,23,42,0.08); }}
+th, td {{ padding: 0.5rem 0.75rem; border-bottom: 1px solid #e5e7eb; font-size: 0.95rem; text-align: left; }}
+th {{ background: #f0f2f8; font-weight: 600; }}
+tbody tr:nth-child(odd) {{ background: #fafbff; }}
+.summary-list {{ list-style: none; padding: 0; margin: 0; display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 0.75rem; }}
+.summary-item {{ background: white; padding: 0.75rem 1rem; border-radius: 8px; box-shadow: inset 0 0 0 1px #e5e7eb; }}
+.chart-section {{ background: white; padding: 1rem; border-radius: 10px; box-shadow: 0 1px 2px rgba(15,23,42,0.08); margin-bottom: 1.5rem; }}
+.chart-title {{ margin: 0 0 0.5rem 0; font-size: 1.1rem; }}
+.chart-svg {{ width: 100%; height: 320px; }}
+.chart-table {{ margin-top: 0.75rem; }}
+.subtle {{ color: #6b7280; font-size: 0.9rem; }}
+.cost-highlight {{ color: #059669; font-size: 1.2rem; font-weight: 600; }}
+.cost-section {{ background: white; padding: 1rem; border-radius: 10px; box-shadow: 0 1px 2px rgba(15,23,42,0.08); margin-bottom: 1.5rem; }}
+.cost-section h2 {{ font-size: 1.3rem; margin-top: 0; margin-bottom: 1rem; }}
+.cost-section h3 {{ font-size: 1.1rem; margin-top: 1.5rem; margin-bottom: 0.75rem; }}
+</style>
+</head>
+<body>
+<h1>{html.escape(title)}</h1>
+{summary_section}
+{cost_section}
+{metrics_table}
+{chart_sections}
+{samples_table}
+</body>
+</html>"""
+    return html_doc
+
+
+def build_json_report(
+    report: orchestrator.ExperimentReport,
+    *,
+    charts: Sequence[ChartLike] | None = None,
+    title: str = "Experiment report",
+    sample_limit: int | None = None,
+) -> dict[str, object]:
+    metadata_by_sample, metadata_fields = _collect_sample_metadata(
+        report.generation_results
+    )
+    metric_names = sorted(report.evaluation_report.metrics.keys())
+    samples = []
+    limit = (
+        sample_limit
+        if sample_limit is not None
+        else len(report.evaluation_report.records)
+    )
+
+    # Build mapping from sample_id to generation records to get task info
+    gen_records_by_sample: dict[str, core_entities.GenerationRecord] = {}
+    for gen_record in report.generation_results:
+        sid = _extract_sample_id(gen_record.task.metadata)
+        if sid:
+            # Use first generation record for each sample (may have multiple with different conditions)
+            if sid not in gen_records_by_sample:
+                gen_records_by_sample[sid] = gen_record
+
+    for index, record in enumerate(report.evaluation_report.records):
+        if index >= limit:
+            break
+        sample_id = record.sample_id or ""
+
+        # Try to find corresponding generation record for this evaluation record
+        gen_record = gen_records_by_sample.get(sample_id)
+
+        # Build condition_id if we have the generation record
+        sample_metadata = {}
+        if gen_record is not None:
+            prompt_template = gen_record.task.prompt.spec.name
+            model_identifier = gen_record.task.model.identifier
+            sampling_temp = gen_record.task.sampling.temperature
+            sampling_max_tokens = gen_record.task.sampling.max_tokens
+            condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
+            sample_metadata = dict(metadata_by_sample.get(condition_id, {}))
+
+        scores = [
+            {
+                "metric": score.metric_name,
+                "value": score.value,
+                "details": score.details,
+                "metadata": score.metadata,
+            }
+            for score in record.scores
+        ]
+        samples.append(
+            {
+                "sample_id": sample_id,
+                "metadata": sample_metadata,
+                "scores": scores,
+                "failures": list(record.failures),
+            }
+        )
+
+    payload = {
+        "title": title,
+        "summary": {
+            **report.metadata,
+            "run_failures": len(report.failures),
+            "evaluation_failures": len(report.evaluation_report.failures),
+        },
+        "metrics": [
+            {
+                "name": name,
+                "count": metric.count,
+                "mean": metric.mean,
+            }
+            for name, metric in sorted(
+                report.evaluation_report.metrics.items(), key=lambda item: item[0]
+            )
+        ],
+        "samples": samples,
+        "rendered_sample_limit": limit,
+        "total_samples": len(report.evaluation_report.records),
+        "charts": [
+            chart.as_dict() if hasattr(chart, "as_dict") else _chart_to_dict(chart)
+            for chart in charts or ()
+        ],
+        "run_failures": [
+            {"sample_id": failure.sample_id, "message": failure.message}
+            for failure in report.failures
+        ],
+        "evaluation_failures": [
+            {"sample_id": failure.sample_id, "message": failure.message}
+            for failure in report.evaluation_report.failures
+        ],
+        "metrics_rendered": metric_names,
+    }
+    return payload
+
+
+def _row_from_evaluation_record(
+    record: core_entities.EvaluationRecord,
+    *,
+    metadata_by_sample: Mapping[str, MutableMapping[str, object]],
+    metadata_fields: Sequence[str],
+    metric_names: Sequence[str],
+    include_failures: bool,
+) -> dict[str, object]:
+    sample_id = record.sample_id or ""
+
+    # Rebuild the condition ID used in _collect_sample_metadata. The
+    # EvaluationRecord does not carry its originating task, so the condition
+    # details are recovered heuristically from score metadata; passing the
+    # generation record (or task) through explicitly would make this exact.
+    condition_metadata = {}
+    for score in record.scores:
+        if hasattr(score, "metadata") and score.metadata:
+            condition_metadata.update(score.metadata)
+
+    prompt_template = condition_metadata.get("prompt_template", "unknown")
+    model_identifier = condition_metadata.get("model_identifier", "unknown")
+    sampling_temp = condition_metadata.get("sampling_temperature", 0.0)
+    sampling_max_tokens = condition_metadata.get("sampling_max_tokens", 100)
+
+    condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
+
+    metadata = metadata_by_sample.get(condition_id, {})
+    row: dict[str, object] = {"sample_id": sample_id}
+    for field in metadata_fields:
+        row[field] = metadata.get(field, "")
+    score_by_name = {score.metric_name: score.value for score in record.scores}
+    for name in metric_names:
+        row[f"metric:{name}"] = score_by_name.get(name, "")
+    if include_failures:
+        row["failures"] = "; ".join(record.failures)
+    return row
+
+
+def _collect_sample_metadata(
+    records: Sequence[core_entities.GenerationRecord],
+) -> tuple[dict[str, MutableMapping[str, object]], list[str]]:
+    metadata: dict[str, MutableMapping[str, object]] = {}
+    for index, record in enumerate(records):
+        sample_id = _extract_sample_id(record.task.metadata)
+        if sample_id is None:
+            sample_id = f"sample-{index}"
+
+        # Create unique identifier for each experimental condition
+        # Include prompt template, model, and sampling to distinguish conditions
+        prompt_template = record.task.prompt.spec.name
+        model_identifier = record.task.model.identifier
+        sampling_temp = record.task.sampling.temperature
+        sampling_max_tokens = record.task.sampling.max_tokens
+
+        # Create unique condition key
+        condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
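+        # For illustration (hypothetical values): sample "gsm8k-17" prompted
+        # with template "cot" on model "gpt-4o-mini" at temperature 0.0 and
+        # max_tokens 512 gets the key "gsm8k-17_cot_gpt-4o-mini_0.0_512".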
+
+        # Store metadata with unique condition ID
+        condition_metadata = _metadata_from_task(record)
+        metadata[condition_id] = condition_metadata
+
+    # Collect all field names from all conditions
+    fields = sorted({field for meta in metadata.values() for field in meta.keys()})
+
+    return metadata, fields
+
+
+def _extract_sample_id(metadata: Mapping[str, object]) -> str | None:
+    value = metadata.get("dataset_id") or metadata.get("sample_id")
+    if value is None:
+        return None
+    return str(value)
+
+
+def _metadata_from_task(record: core_entities.GenerationRecord) -> dict[str, object]:
+    metadata = dict(record.task.metadata)
+    metadata.setdefault("model_identifier", record.task.model.identifier)
+    metadata.setdefault("model_provider", record.task.model.provider)
+    metadata.setdefault("prompt_template", record.task.prompt.spec.name)
+    metadata.setdefault("sampling_temperature", record.task.sampling.temperature)
+    metadata.setdefault("sampling_top_p", record.task.sampling.top_p)
+    metadata.setdefault("sampling_max_tokens", record.task.sampling.max_tokens)
+    return metadata
+
+
+def _render_summary(report: orchestrator.ExperimentReport) -> str:
+    # Filter out cost from main summary (we'll show it separately)
+    metadata_items = sorted(
+        (k, v) for k, v in report.metadata.items() if k != "cost"
+    )
+    failures = len(report.failures)
+    metadata_html = "\n".join(
+        f'<li class="summary-item"><strong>{html.escape(str(key))}</strong><br /><span class="subtle">{html.escape(str(value))}</span></li>'
+        for key, value in metadata_items
+    )
+    failure_block = f'<li class="summary-item"><strong>Run failures</strong><br /><span class="subtle">{failures}</span></li>'
+    return f'<section><h2>Summary</h2><ul class="summary-list">{metadata_html}{failure_block}</ul></section>'
+
+
+def _render_cost_section(report: orchestrator.ExperimentReport) -> str:
+    """Render cost breakdown section if cost data is available."""
+    cost_data = report.metadata.get("cost")
+    if not cost_data or not isinstance(cost_data, dict):
+        return ""
+
+    total_cost = cost_data.get("total_cost", 0.0)
+    generation_cost = cost_data.get("generation_cost", 0.0)
+    evaluation_cost = cost_data.get("evaluation_cost", 0.0)
+    currency = cost_data.get("currency", "USD")
+    token_counts = cost_data.get("token_counts", {})
+    per_model_costs = cost_data.get("per_model_costs", {})
+    api_calls = cost_data.get("api_calls", 0)
+
+    # Main cost summary
+    cost_items = [
+        f'<li class="summary-item"><strong>Total Cost</strong><br /><span class="cost-highlight">${total_cost:.4f} {currency}</span></li>',
+        f'<li class="summary-item"><strong>Generation</strong><br /><span class="subtle">${generation_cost:.4f}</span></li>',
+        f'<li class="summary-item"><strong>Evaluation</strong><br /><span class="subtle">${evaluation_cost:.4f}</span></li>',
+        f'<li class="summary-item"><strong>API Calls</strong><br /><span class="subtle">{api_calls}</span></li>',
+    ]
+
+    # Token counts
+    if token_counts:
+        prompt_tokens = token_counts.get("prompt_tokens", 0)
+        completion_tokens = token_counts.get("completion_tokens", 0)
+        total_tokens = token_counts.get("total_tokens", 0)
+        cost_items.append(
+            f'<li class="summary-item"><strong>Tokens</strong><br />'
+            f'<span class="subtle">{total_tokens:,} total ({prompt_tokens:,} prompt + {completion_tokens:,} completion)</span></li>'
+        )
+
+    cost_summary = "\n".join(cost_items)
+
+    # Per-model breakdown if available
+    model_breakdown = ""
+    if per_model_costs:
+        model_rows = []
+        for model, cost in sorted(
+            per_model_costs.items(), key=lambda x: x[1], reverse=True
+        ):
+            percentage = (cost / total_cost * 100) if total_cost > 0 else 0
+            model_rows.append(
+                f"<tr><td>{html.escape(model)}</td><td>${cost:.4f}</td><td>{percentage:.1f}%</td></tr>"
+            )
+        model_table = "\n".join(model_rows)
+        model_breakdown = f"""
+        <h3>Cost by Model</h3>
+        <table>
+            <thead>
+                <tr><th>Model</th><th>Cost</th><th>% of Total</th></tr>
+            </thead>
+            <tbody>
+                {model_table}
+            </tbody>
+        </table>
+        """
+
+    return f"""
+    <section>
+        <h2>💰 Cost Breakdown</h2>
+        <ul class="summary-list">
+            {cost_summary}
+        </ul>
+        {model_breakdown}
+    </section>
+    """
+
+
+def _render_metric_table(report: orchestrator.ExperimentReport) -> str:
+    rows = []
+    for name in sorted(report.evaluation_report.metrics.keys()):
+        metric = report.evaluation_report.metrics[name]
+        rows.append(
+            f"<tr><td>{html.escape(name)}</td><td>{metric.count}</td><td>{metric.mean:.4f}</td></tr>"
+        )
+    table_body = "\n".join(rows) or '<tr><td colspan="3">No metrics recorded</td></tr>'
+    return (
+        "<section><h2>Metrics</h2><table><thead><tr><th>Metric</th><th>Count"
+        "</th><th>Mean</th></tr></thead><tbody>"
+        + table_body
+        + "</tbody></table></section>"
+    )
+
+
+def _render_sample_table(
+    report: orchestrator.ExperimentReport,
+    metadata_by_sample: Mapping[str, MutableMapping[str, object]],
+    metadata_fields: Sequence[str],
+    metric_names: Sequence[str],
+    *,
+    limit: int,
+) -> str:
+    head_cells = [
+        "sample_id",
+        *metadata_fields,
+        *[f"metric:{name}" for name in metric_names],
+    ]
+    head_html = "".join(f"<th>{html.escape(label)}</th>" for label in head_cells)
+    body_rows: list[str] = []
+    for index, record in enumerate(report.evaluation_report.records):
+        if index >= limit:
+            break
+        row = _row_from_evaluation_record(
+            record,
+            metadata_by_sample=metadata_by_sample,
+            metadata_fields=metadata_fields,
+            metric_names=metric_names,
+            include_failures=True,
+        )
+        cells = [html.escape(str(row.get(label, ""))) for label in head_cells]
+        cells.append(html.escape(str(row.get("failures", ""))))
+        body_rows.append(
+            "<tr>" + "".join(f"<td>{cell}</td>" for cell in cells) + "</tr>"
+        )
+    if not body_rows:
+        body_rows.append(
+            f'<tr><td colspan="{len(head_cells) + 1}">No evaluation records</td></tr>'
+        )
+    footer = ""
+    if len(report.evaluation_report.records) > limit:
+        remaining = len(report.evaluation_report.records) - limit
+        footer = f'<p class="subtle">Showing first {limit} rows ({remaining} more not rendered).</p>'
+    return (
+        "<section><h2>Sample breakdown</h2><table><thead><tr>"
+        + head_html
+        + "<th>failures</th></tr></thead><tbody>"
+        + "\n".join(body_rows)
+        + "</tbody></table>"
+        + footer
+        + "</section>"
+    )
+
+
+def _render_chart_section(chart: ChartLike) -> str:
+    if not chart.points:
+        return (
+            f'<section class="chart-section"><h3 class="chart-title">{html.escape(chart.title)}</h3>'
+            '<p class="subtle">No data points</p></section>'
+        )
+    svg_markup = _chart_to_svg(chart)
+    rows = "\n".join(
+        f"<tr><td>{html.escape(point.label)}</td><td>{html.escape(str(point.x_value))}</td>"
+        f"<td>{point.metric_value:.4f}</td><td>{point.count}</td></tr>"
+        for point in chart.points
+    )
+    table = (
+        '<table class="chart-table"><thead><tr><th>Label</th><th>X value</th><th>Metric'
+        "</th><th>Count</th></tr></thead><tbody>" + rows + "</tbody></table>"
+    )
+    return (
+        f'<section class="chart-section"><h3 class="chart-title">{html.escape(chart.title)}</h3>'
+        + svg_markup
+        + table
+        + "</section>"
+    )
+
+
+def _chart_to_svg(chart: ChartLike) -> str:
+    width, height, margin = 640, 320, 42
+    plot_width = width - 2 * margin
+    plot_height = height - 2 * margin
+    values = [point.metric_value for point in chart.points]
+    min_value = min(values)
+    max_value = max(values)
+    if min_value == max_value:
+        min_value -= 0.5
+        max_value += 0.5
+    count = len(chart.points)
+    if count == 1:
+        x_positions = [margin + plot_width / 2]
+    else:
+        step = plot_width / (count - 1)
+        x_positions = [margin + index * step for index in range(count)]
+
+    def scale_y(value: float) -> float:
+        ratio = (value - min_value) / (max_value - min_value)
+        return margin + (plot_height * (1 - ratio))
+
+    y_positions = [scale_y(point.metric_value) for point in chart.points]
+    polyline = " ".join(f"{x:.2f},{y:.2f}" for x, y in zip(x_positions, y_positions))
+    circles = "\n".join(
+        f'<circle cx="{x:.2f}" cy="{y:.2f}" r="5" fill="#2563eb"></circle>'
+        for x, y in zip(x_positions, y_positions)
+    )
+    labels = "\n".join(
+        f'<text x="{x:.2f}" y="{height - margin / 4:.2f}" text-anchor="middle" font-size="12">{html.escape(point.label)}</text>'
+        for x, point in zip(x_positions, chart.points)
+    )
+    y_labels = (
+        f'<text x="{margin / 2:.2f}" y="{height - margin:.2f}" font-size="12">{min_value:.2f}</text>'
+        f'<text x="{margin / 2:.2f}" y="{margin:.2f}" font-size="12">{max_value:.2f}</text>'
+    )
+    axis_lines = (
+        f'<line x1="{margin}" y1="{height - margin}" x2="{width - margin}" y2="{height - margin}" stroke="#94a3b8" />'
+        f'<line x1="{margin}" y1="{margin}" x2="{margin}" y2="{height - margin}" stroke="#94a3b8" />'
+    )
+    polyline_markup = (
+        f'<polyline fill="none" stroke="#2563eb" stroke-width="2" points="{polyline}"></polyline>'
+        if count > 1
+        else ""
+    )
+    return (
+        f'<svg class="chart-svg" viewBox="0 0 {width} {height}" role="img" aria-label="{html.escape(chart.metric_name)} vs {html.escape(chart.x_label)}">'
+        + axis_lines
+        + polyline_markup
+        + circles
+        + labels
+        + y_labels
+        + "</svg>"
+    )
+
+
+def _chart_to_dict(chart: ChartLike) -> dict[str, object]:
+    return {
+        "title": chart.title,
+        "x_label": chart.x_label,
+        "y_label": chart.y_label,
+        "metric": chart.metric_name,
+        "points": [
+            {
+                "label": point.label,
+                "x": getattr(point, "x_value", getattr(point, "x", None)),
+                "value": point.metric_value,
+                "count": point.count,
+            }
+            for point in chart.points
+        ],
+    }
+
+
+__all__ = [
+    "export_report_csv",
+    "export_html_report",
+    "export_report_json",
+    "export_summary_json",
+    "export_report_bundle",
+    "render_html_report",
+    "build_json_report",
+]
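
For orientation, a minimal usage sketch of the new export API; the report object, paths, and run ID below are hypothetical, and `export_report_bundle` skips any format whose path is None:

    from themis.experiment.export import export_report_bundle

    # `report` is assumed to be an orchestrator.ExperimentReport from a finished run.
    outputs = export_report_bundle(
        report,
        csv_path="outputs/run-123/report.csv",
        html_path="outputs/run-123/report.html",
        json_path="outputs/run-123/report.json",
        summary_path="outputs/run-123/summary.json",  # lightweight quick-look file
        run_id="run-123",
        title="Experiment report",
        sample_limit=100,  # rows rendered in the HTML/JSON sample tables
    )
    for fmt, written in outputs.items():
        print(f"{fmt}: {written}")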