themis_eval-0.1.0-py3-none-any.whl → themis_eval-0.1.1-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (132)
  1. themis/cli/__init__.py +5 -0
  2. themis/cli/__main__.py +6 -0
  3. themis/cli/commands/__init__.py +19 -0
  4. themis/cli/commands/benchmarks.py +221 -0
  5. themis/cli/commands/comparison.py +394 -0
  6. themis/cli/commands/config_commands.py +244 -0
  7. themis/cli/commands/cost.py +214 -0
  8. themis/cli/commands/demo.py +68 -0
  9. themis/cli/commands/info.py +90 -0
  10. themis/cli/commands/leaderboard.py +362 -0
  11. themis/cli/commands/math_benchmarks.py +318 -0
  12. themis/cli/commands/mcq_benchmarks.py +207 -0
  13. themis/cli/commands/sample_run.py +244 -0
  14. themis/cli/commands/visualize.py +299 -0
  15. themis/cli/main.py +93 -0
  16. themis/cli/new_project.py +33 -0
  17. themis/cli/utils.py +51 -0
  18. themis/config/__init__.py +19 -0
  19. themis/config/loader.py +27 -0
  20. themis/config/registry.py +34 -0
  21. themis/config/runtime.py +214 -0
  22. themis/config/schema.py +112 -0
  23. themis/core/__init__.py +5 -0
  24. themis/core/conversation.py +354 -0
  25. themis/core/entities.py +164 -0
  26. themis/core/serialization.py +231 -0
  27. themis/core/tools.py +393 -0
  28. themis/core/types.py +141 -0
  29. themis/datasets/__init__.py +273 -0
  30. themis/datasets/base.py +264 -0
  31. themis/datasets/commonsense_qa.py +174 -0
  32. themis/datasets/competition_math.py +265 -0
  33. themis/datasets/coqa.py +133 -0
  34. themis/datasets/gpqa.py +190 -0
  35. themis/datasets/gsm8k.py +123 -0
  36. themis/datasets/gsm_symbolic.py +124 -0
  37. themis/datasets/math500.py +122 -0
  38. themis/datasets/med_qa.py +179 -0
  39. themis/datasets/medmcqa.py +169 -0
  40. themis/datasets/mmlu_pro.py +262 -0
  41. themis/datasets/piqa.py +146 -0
  42. themis/datasets/registry.py +201 -0
  43. themis/datasets/schema.py +245 -0
  44. themis/datasets/sciq.py +150 -0
  45. themis/datasets/social_i_qa.py +151 -0
  46. themis/datasets/super_gpqa.py +263 -0
  47. themis/evaluation/__init__.py +1 -0
  48. themis/evaluation/conditional.py +410 -0
  49. themis/evaluation/extractors/__init__.py +19 -0
  50. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  51. themis/evaluation/extractors/exceptions.py +7 -0
  52. themis/evaluation/extractors/identity_extractor.py +29 -0
  53. themis/evaluation/extractors/json_field_extractor.py +45 -0
  54. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  55. themis/evaluation/extractors/regex_extractor.py +43 -0
  56. themis/evaluation/math_verify_utils.py +87 -0
  57. themis/evaluation/metrics/__init__.py +21 -0
  58. themis/evaluation/metrics/composite_metric.py +47 -0
  59. themis/evaluation/metrics/consistency_metric.py +80 -0
  60. themis/evaluation/metrics/exact_match.py +51 -0
  61. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  62. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  63. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  64. themis/evaluation/metrics/response_length.py +33 -0
  65. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  66. themis/evaluation/pipeline.py +49 -0
  67. themis/evaluation/pipelines/__init__.py +15 -0
  68. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  69. themis/evaluation/pipelines/standard_pipeline.py +288 -0
  70. themis/evaluation/reports.py +293 -0
  71. themis/evaluation/statistics/__init__.py +53 -0
  72. themis/evaluation/statistics/bootstrap.py +79 -0
  73. themis/evaluation/statistics/confidence_intervals.py +121 -0
  74. themis/evaluation/statistics/distributions.py +207 -0
  75. themis/evaluation/statistics/effect_sizes.py +124 -0
  76. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  77. themis/evaluation/statistics/types.py +139 -0
  78. themis/evaluation/strategies/__init__.py +13 -0
  79. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  80. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  81. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  82. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  83. themis/experiment/__init__.py +5 -0
  84. themis/experiment/builder.py +151 -0
  85. themis/experiment/cache_manager.py +129 -0
  86. themis/experiment/comparison.py +631 -0
  87. themis/experiment/cost.py +310 -0
  88. themis/experiment/definitions.py +62 -0
  89. themis/experiment/export.py +690 -0
  90. themis/experiment/export_csv.py +159 -0
  91. themis/experiment/integration_manager.py +104 -0
  92. themis/experiment/math.py +192 -0
  93. themis/experiment/mcq.py +169 -0
  94. themis/experiment/orchestrator.py +373 -0
  95. themis/experiment/pricing.py +317 -0
  96. themis/experiment/storage.py +255 -0
  97. themis/experiment/visualization.py +588 -0
  98. themis/generation/__init__.py +1 -0
  99. themis/generation/agentic_runner.py +420 -0
  100. themis/generation/batching.py +254 -0
  101. themis/generation/clients.py +143 -0
  102. themis/generation/conversation_runner.py +236 -0
  103. themis/generation/plan.py +456 -0
  104. themis/generation/providers/litellm_provider.py +221 -0
  105. themis/generation/providers/vllm_provider.py +135 -0
  106. themis/generation/router.py +34 -0
  107. themis/generation/runner.py +207 -0
  108. themis/generation/strategies.py +98 -0
  109. themis/generation/templates.py +71 -0
  110. themis/generation/turn_strategies.py +393 -0
  111. themis/generation/types.py +9 -0
  112. themis/integrations/__init__.py +0 -0
  113. themis/integrations/huggingface.py +61 -0
  114. themis/integrations/wandb.py +65 -0
  115. themis/interfaces/__init__.py +83 -0
  116. themis/project/__init__.py +20 -0
  117. themis/project/definitions.py +98 -0
  118. themis/project/patterns.py +230 -0
  119. themis/providers/__init__.py +5 -0
  120. themis/providers/registry.py +39 -0
  121. themis/utils/api_generator.py +379 -0
  122. themis/utils/cost_tracking.py +376 -0
  123. themis/utils/dashboard.py +452 -0
  124. themis/utils/logging_utils.py +41 -0
  125. themis/utils/progress.py +58 -0
  126. themis/utils/tracing.py +320 -0
  127. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
  128. themis_eval-0.1.1.dist-info/RECORD +134 -0
  129. themis_eval-0.1.0.dist-info/RECORD +0 -8
  130. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
  131. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
  132. {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0
themis/experiment/export.py (new file)
@@ -0,0 +1,690 @@
+"""Utilities for exporting experiment results to CSV, JSON, and HTML."""
+
+from __future__ import annotations
+
+import csv
+import html
+import json
+from collections import OrderedDict
+from pathlib import Path
+from typing import Mapping, MutableMapping, Protocol, Sequence
+
+from themis.core import entities as core_entities
+from themis.experiment import orchestrator
+
+
+class ChartPointLike(Protocol):
+    label: str
+    x_value: object
+    metric_value: float
+    metric_name: str
+    count: int
+
+
+class ChartLike(Protocol):
+    title: str
+    x_label: str
+    y_label: str
+    metric_name: str
+    points: Sequence[ChartPointLike]
+
+
+def export_report_csv(
+    report: orchestrator.ExperimentReport,
+    path: str | Path,
+    *,
+    include_failures: bool = True,
+) -> Path:
+    """Write per-sample metrics to a CSV file for offline analysis."""
+
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    metadata_by_condition, metadata_fields = _collect_sample_metadata(
+        report.generation_results
+    )
+
+    # Create a proper index mapping generation records to their metadata
+    # We assume evaluation records are in the same order as generation records
+    gen_record_index = {}
+    for gen_record in report.generation_results:
+        sample_id = gen_record.task.metadata.get(
+            "dataset_id"
+        ) or gen_record.task.metadata.get("sample_id")
+        prompt_template = gen_record.task.prompt.spec.name
+        model_identifier = gen_record.task.model.identifier
+        sampling_temp = gen_record.task.sampling.temperature
+        sampling_max_tokens = gen_record.task.sampling.max_tokens
+        condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
+        gen_record_index[condition_id] = gen_record
+
+    metric_names = sorted(report.evaluation_report.metrics.keys())
+    fieldnames = (
+        ["sample_id"] + metadata_fields + [f"metric:{name}" for name in metric_names]
+    )
+    if include_failures:
+        fieldnames.append("failures")
+
+    with path.open("w", encoding="utf-8", newline="") as handle:
+        writer = csv.DictWriter(handle, fieldnames=fieldnames)
+        writer.writeheader()
+
+        # Process evaluation records in the same order as generation records
+        for i, eval_record in enumerate(report.evaluation_report.records):
+            # Find the corresponding generation record by index
+            if i < len(report.generation_results):
+                gen_record = report.generation_results[i]
+                sample_id = gen_record.task.metadata.get(
+                    "dataset_id"
+                ) or gen_record.task.metadata.get("sample_id")
+                prompt_template = gen_record.task.prompt.spec.name
+                model_identifier = gen_record.task.model.identifier
+                sampling_temp = gen_record.task.sampling.temperature
+                sampling_max_tokens = gen_record.task.sampling.max_tokens
+                condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
+                metadata = metadata_by_condition.get(condition_id, {})
+            else:
+                # Fallback for extra evaluation records
+                sample_id = eval_record.sample_id or ""
+                metadata = {}
+
+            row: dict[str, object] = {"sample_id": sample_id}
+            for field in metadata_fields:
+                row[field] = metadata.get(field, "")
+            score_by_name = {
+                score.metric_name: score.value for score in eval_record.scores
+            }
+            for name in metric_names:
+                row[f"metric:{name}"] = score_by_name.get(name, "")
+            if include_failures:
+                row["failures"] = "; ".join(eval_record.failures)
+            writer.writerow(row)
+    return path
+
+
+def export_html_report(
+    report: orchestrator.ExperimentReport,
+    path: str | Path,
+    *,
+    charts: Sequence[ChartLike] | None = None,
+    title: str = "Experiment report",
+    sample_limit: int = 100,
+) -> Path:
+    """Render the experiment report as an HTML document."""
+
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    html_content = render_html_report(
+        report,
+        charts=charts,
+        title=title,
+        sample_limit=sample_limit,
+    )
+    path.write_text(html_content, encoding="utf-8")
+    return path
+
+
+def export_report_json(
+    report: orchestrator.ExperimentReport,
+    path: str | Path,
+    *,
+    charts: Sequence[ChartLike] | None = None,
+    title: str = "Experiment report",
+    sample_limit: int | None = None,
+    indent: int = 2,
+) -> Path:
+    """Serialize the report details to JSON for downstream tooling."""
+
+    payload = build_json_report(
+        report,
+        charts=charts,
+        title=title,
+        sample_limit=sample_limit,
+    )
+    path = Path(path)
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(payload, indent=indent), encoding="utf-8")
+    return path
+
+
+def export_report_bundle(
+    report: orchestrator.ExperimentReport,
+    *,
+    csv_path: str | Path | None = None,
+    html_path: str | Path | None = None,
+    json_path: str | Path | None = None,
+    charts: Sequence[ChartLike] | None = None,
+    title: str = "Experiment report",
+    sample_limit: int = 100,
+    indent: int = 2,
+) -> OrderedDict[str, Path]:
+    """Convenience helper that writes multiple export formats at once."""
+
+    outputs: OrderedDict[str, Path] = OrderedDict()
+    if csv_path is not None:
+        outputs["csv"] = export_report_csv(report, csv_path)
+    if html_path is not None:
+        outputs["html"] = export_html_report(
+            report,
+            html_path,
+            charts=charts,
+            title=title,
+            sample_limit=sample_limit,
+        )
+    if json_path is not None:
+        outputs["json"] = export_report_json(
+            report,
+            json_path,
+            charts=charts,
+            title=title,
+            sample_limit=sample_limit,
+            indent=indent,
+        )
+    return outputs
+
+
+def render_html_report(
+    report: orchestrator.ExperimentReport,
+    *,
+    charts: Sequence[ChartLike] | None = None,
+    title: str = "Experiment report",
+    sample_limit: int = 100,
+) -> str:
+    """Return an HTML string summarizing the experiment results."""
+
+    metadata_by_sample, metadata_fields = _collect_sample_metadata(
+        report.generation_results
+    )
+    metric_names = sorted(report.evaluation_report.metrics.keys())
+    summary_section = _render_summary(report)
+    cost_section = _render_cost_section(report)
+    metrics_table = _render_metric_table(report)
+    samples_table = _render_sample_table(
+        report,
+        metadata_by_sample,
+        metadata_fields,
+        metric_names,
+        limit=sample_limit,
+    )
+    chart_sections = "\n".join(_render_chart_section(chart) for chart in charts or ())
+    html_doc = f"""<!DOCTYPE html>
+<html lang=\"en\">
+<head>
+<meta charset=\"utf-8\" />
+<title>{html.escape(title)}</title>
+<style>
+body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; margin: 32px; background: #f6f8fb; color: #1f2933; }}
+h1 {{ font-size: 1.8rem; margin-bottom: 0.5rem; }}
+section {{ margin-bottom: 2rem; }}
+table {{ border-collapse: collapse; width: 100%; background: white; box-shadow: 0 1px 2px rgba(15,23,42,0.08); }}
+th, td {{ padding: 0.5rem 0.75rem; border-bottom: 1px solid #e5e7eb; font-size: 0.95rem; text-align: left; }}
+th {{ background: #f0f2f8; font-weight: 600; }}
+tbody tr:nth-child(odd) {{ background: #fafbff; }}
+.summary-list {{ list-style: none; padding: 0; margin: 0; display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 0.75rem; }}
+.summary-item {{ background: white; padding: 0.75rem 1rem; border-radius: 8px; box-shadow: inset 0 0 0 1px #e5e7eb; }}
+.chart-section {{ background: white; padding: 1rem; border-radius: 10px; box-shadow: 0 1px 2px rgba(15,23,42,0.08); margin-bottom: 1.5rem; }}
+.chart-title {{ margin: 0 0 0.5rem 0; font-size: 1.1rem; }}
+.chart-svg {{ width: 100%; height: 320px; }}
+.chart-table {{ margin-top: 0.75rem; }}
+.subtle {{ color: #6b7280; font-size: 0.9rem; }}
+.cost-highlight {{ color: #059669; font-size: 1.2rem; font-weight: 600; }}
+.cost-section {{ background: white; padding: 1rem; border-radius: 10px; box-shadow: 0 1px 2px rgba(15,23,42,0.08); margin-bottom: 1.5rem; }}
+.cost-section h2 {{ font-size: 1.3rem; margin-top: 0; margin-bottom: 1rem; }}
+.cost-section h3 {{ font-size: 1.1rem; margin-top: 1.5rem; margin-bottom: 0.75rem; }}
+</style>
+</head>
+<body>
+<h1>{html.escape(title)}</h1>
+{summary_section}
+{cost_section}
+{metrics_table}
+{chart_sections}
+{samples_table}
+</body>
+</html>"""
+    return html_doc
+
+
+def build_json_report(
+    report: orchestrator.ExperimentReport,
+    *,
+    charts: Sequence[ChartLike] | None = None,
+    title: str = "Experiment report",
+    sample_limit: int | None = None,
+) -> dict[str, object]:
+    metadata_by_sample, metadata_fields = _collect_sample_metadata(
+        report.generation_results
+    )
+    metric_names = sorted(report.evaluation_report.metrics.keys())
+    samples = []
+    limit = (
+        sample_limit
+        if sample_limit is not None
+        else len(report.evaluation_report.records)
+    )
+
+    # Build mapping from sample_id to generation records to get task info
+    gen_records_by_sample: dict[str, core_entities.GenerationRecord] = {}
+    for gen_record in report.generation_results:
+        sid = _extract_sample_id(gen_record.task.metadata)
+        if sid:
+            # Use first generation record for each sample (may have multiple with different conditions)
+            if sid not in gen_records_by_sample:
+                gen_records_by_sample[sid] = gen_record
+
+    for index, record in enumerate(report.evaluation_report.records):
+        if index >= limit:
+            break
+        sample_id = record.sample_id or ""
+
+        # Try to find corresponding generation record for this evaluation record
+        gen_record = gen_records_by_sample.get(sample_id)
+
+        # Build condition_id if we have the generation record
+        sample_metadata = {}
+        if gen_record is not None:
+            prompt_template = gen_record.task.prompt.spec.name
+            model_identifier = gen_record.task.model.identifier
+            sampling_temp = gen_record.task.sampling.temperature
+            sampling_max_tokens = gen_record.task.sampling.max_tokens
+            condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
+            sample_metadata = dict(metadata_by_sample.get(condition_id, {}))
+
+        scores = [
+            {
+                "metric": score.metric_name,
+                "value": score.value,
+                "details": score.details,
+                "metadata": score.metadata,
+            }
+            for score in record.scores
+        ]
+        samples.append(
+            {
+                "sample_id": sample_id,
+                "metadata": sample_metadata,
+                "scores": scores,
+                "failures": list(record.failures),
+            }
+        )
+
+    payload = {
+        "title": title,
+        "summary": {
+            **report.metadata,
+            "run_failures": len(report.failures),
+            "evaluation_failures": len(report.evaluation_report.failures),
+        },
+        "metrics": [
+            {
+                "name": name,
+                "count": metric.count,
+                "mean": metric.mean,
+            }
+            for name, metric in sorted(
+                report.evaluation_report.metrics.items(), key=lambda item: item[0]
+            )
+        ],
+        "samples": samples,
+        "rendered_sample_limit": limit,
+        "total_samples": len(report.evaluation_report.records),
+        "charts": [
+            chart.as_dict() if hasattr(chart, "as_dict") else _chart_to_dict(chart)
+            for chart in charts or ()
+        ],
+        "run_failures": [
+            {"sample_id": failure.sample_id, "message": failure.message}
+            for failure in report.failures
+        ],
+        "evaluation_failures": [
+            {"sample_id": failure.sample_id, "message": failure.message}
+            for failure in report.evaluation_report.failures
+        ],
+        "metrics_rendered": metric_names,
+    }
+    return payload
+
+
+def _row_from_evaluation_record(
+    record: core_entities.EvaluationRecord,
+    *,
+    metadata_by_sample: Mapping[str, MutableMapping[str, object]],
+    metadata_fields: Sequence[str],
+    metric_names: Sequence[str],
+    include_failures: bool,
+) -> dict[str, object]:
+    sample_id = record.sample_id or ""
+
+    # Generate the same condition ID used in _collect_sample_metadata
+    # We need to map back to the GenerationRecord that created this EvaluationRecord
+    # This is a workaround since we need access to the original task details
+
+    # Create a mapping function to find the corresponding generation record
+    # For now, we'll use a simple heuristic based on the available data
+    # In a real implementation, this mapping would need to be passed in
+
+    # Try to extract condition info from the record's metadata
+    # This is a hack - ideally we'd pass the original task or generation record
+    condition_metadata = {}
+    for score in record.scores:
+        if hasattr(score, "metadata") and score.metadata:
+            condition_metadata.update(score.metadata)
+
+    prompt_template = condition_metadata.get("prompt_template", "unknown")
+    model_identifier = condition_metadata.get("model_identifier", "unknown")
+    sampling_temp = condition_metadata.get("sampling_temperature", 0.0)
+    sampling_max_tokens = condition_metadata.get("sampling_max_tokens", 100)
+
+    condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
+
+    metadata = metadata_by_sample.get(condition_id, {})
+    row: dict[str, object] = {"sample_id": sample_id}
+    for field in metadata_fields:
+        row[field] = metadata.get(field, "")
+    score_by_name = {score.metric_name: score.value for score in record.scores}
+    for name in metric_names:
+        row[f"metric:{name}"] = score_by_name.get(name, "")
+    if include_failures:
+        row["failures"] = "; ".join(record.failures)
+    return row
+
+
+def _collect_sample_metadata(
+    records: Sequence[core_entities.GenerationRecord],
+) -> tuple[dict[str, MutableMapping[str, object]], list[str]]:
+    metadata: dict[str, MutableMapping[str, object]] = {}
+    for index, record in enumerate(records):
+        sample_id = _extract_sample_id(record.task.metadata)
+        if sample_id is None:
+            sample_id = f"sample-{index}"
+
+        # Create unique identifier for each experimental condition
+        # Include prompt template, model, and sampling to distinguish conditions
+        prompt_template = record.task.prompt.spec.name
+        model_identifier = record.task.model.identifier
+        sampling_temp = record.task.sampling.temperature
+        sampling_max_tokens = record.task.sampling.max_tokens
+
+        # Create unique condition key
+        condition_id = f"{sample_id}_{prompt_template}_{model_identifier}_{sampling_temp}_{sampling_max_tokens}"
+
+        # Store metadata with unique condition ID
+        condition_metadata = _metadata_from_task(record)
+        metadata[condition_id] = condition_metadata
+
+    # Collect all field names from all conditions
+    fields = sorted({field for meta in metadata.values() for field in meta.keys()})
+
+    return metadata, fields
+
+
+def _extract_sample_id(metadata: Mapping[str, object]) -> str | None:
+    value = metadata.get("dataset_id") or metadata.get("sample_id")
+    if value is None:
+        return None
+    return str(value)
+
+
+def _metadata_from_task(record: core_entities.GenerationRecord) -> dict[str, object]:
+    metadata = dict(record.task.metadata)
+    metadata.setdefault("model_identifier", record.task.model.identifier)
+    metadata.setdefault("model_provider", record.task.model.provider)
+    metadata.setdefault("prompt_template", record.task.prompt.spec.name)
+    metadata.setdefault("sampling_temperature", record.task.sampling.temperature)
+    metadata.setdefault("sampling_top_p", record.task.sampling.top_p)
+    metadata.setdefault("sampling_max_tokens", record.task.sampling.max_tokens)
+    return metadata
+
+
+def _render_summary(report: orchestrator.ExperimentReport) -> str:
+    # Filter out cost from main summary (we'll show it separately)
+    metadata_items = sorted(
+        (k, v) for k, v in report.metadata.items() if k != "cost"
+    )
+    failures = len(report.failures)
+    metadata_html = "\n".join(
+        f'<li class="summary-item"><strong>{html.escape(str(key))}</strong><br /><span class="subtle">{html.escape(str(value))}</span></li>'
+        for key, value in metadata_items
+    )
+    failure_block = f'<li class="summary-item"><strong>Run failures</strong><br /><span class="subtle">{failures}</span></li>'
+    return f'<section><h2>Summary</h2><ul class="summary-list">{metadata_html}{failure_block}</ul></section>'
+
+
+def _render_cost_section(report: orchestrator.ExperimentReport) -> str:
+    """Render cost breakdown section if cost data is available."""
+    cost_data = report.metadata.get("cost")
+    if not cost_data or not isinstance(cost_data, dict):
+        return ""
+
+    total_cost = cost_data.get("total_cost", 0.0)
+    generation_cost = cost_data.get("generation_cost", 0.0)
+    evaluation_cost = cost_data.get("evaluation_cost", 0.0)
+    currency = cost_data.get("currency", "USD")
+    token_counts = cost_data.get("token_counts", {})
+    per_model_costs = cost_data.get("per_model_costs", {})
+    api_calls = cost_data.get("api_calls", 0)
+
+    # Main cost summary
+    cost_items = [
+        f'<li class="summary-item"><strong>Total Cost</strong><br /><span class="cost-highlight">${total_cost:.4f} {currency}</span></li>',
+        f'<li class="summary-item"><strong>Generation</strong><br /><span class="subtle">${generation_cost:.4f}</span></li>',
+        f'<li class="summary-item"><strong>Evaluation</strong><br /><span class="subtle">${evaluation_cost:.4f}</span></li>',
+        f'<li class="summary-item"><strong>API Calls</strong><br /><span class="subtle">{api_calls}</span></li>',
+    ]
+
+    # Token counts
+    if token_counts:
+        prompt_tokens = token_counts.get("prompt_tokens", 0)
+        completion_tokens = token_counts.get("completion_tokens", 0)
+        total_tokens = token_counts.get("total_tokens", 0)
+        cost_items.append(
+            f'<li class="summary-item"><strong>Tokens</strong><br />'
+            f'<span class="subtle">{total_tokens:,} total ({prompt_tokens:,} prompt + {completion_tokens:,} completion)</span></li>'
+        )
+
+    cost_summary = "\n".join(cost_items)
+
+    # Per-model breakdown if available
+    model_breakdown = ""
+    if per_model_costs:
+        model_rows = []
+        for model, cost in sorted(
+            per_model_costs.items(), key=lambda x: x[1], reverse=True
+        ):
+            percentage = (cost / total_cost * 100) if total_cost > 0 else 0
+            model_rows.append(
+                f"<tr><td>{html.escape(model)}</td><td>${cost:.4f}</td><td>{percentage:.1f}%</td></tr>"
+            )
+        model_table = "\n".join(model_rows)
+        model_breakdown = f"""
+<h3>Cost by Model</h3>
+<table>
+<thead>
+<tr><th>Model</th><th>Cost</th><th>% of Total</th></tr>
+</thead>
+<tbody>
+{model_table}
+</tbody>
+</table>
+"""
+
+    return f"""
+<section>
+<h2>💰 Cost Breakdown</h2>
+<ul class="summary-list">
+{cost_summary}
+</ul>
+{model_breakdown}
+</section>
+"""
+
+
+def _render_metric_table(report: orchestrator.ExperimentReport) -> str:
+    rows = []
+    for name in sorted(report.evaluation_report.metrics.keys()):
+        metric = report.evaluation_report.metrics[name]
+        rows.append(
+            f"<tr><td>{html.escape(name)}</td><td>{metric.count}</td><td>{metric.mean:.4f}</td></tr>"
+        )
+    table_body = "\n".join(rows) or '<tr><td colspan="3">No metrics recorded</td></tr>'
+    return (
+        "<section><h2>Metrics</h2><table><thead><tr><th>Metric</th><th>Count"
+        "</th><th>Mean</th></tr></thead><tbody>"
+        + table_body
+        + "</tbody></table></section>"
+    )
+
+
+def _render_sample_table(
+    report: orchestrator.ExperimentReport,
+    metadata_by_sample: Mapping[str, MutableMapping[str, object]],
+    metadata_fields: Sequence[str],
+    metric_names: Sequence[str],
+    *,
+    limit: int,
+) -> str:
+    head_cells = [
+        "sample_id",
+        *metadata_fields,
+        *[f"metric:{name}" for name in metric_names],
+    ]
+    head_html = "".join(f"<th>{html.escape(label)}</th>" for label in head_cells)
+    body_rows: list[str] = []
+    for index, record in enumerate(report.evaluation_report.records):
+        if index >= limit:
+            break
+        row = _row_from_evaluation_record(
+            record,
+            metadata_by_sample=metadata_by_sample,
+            metadata_fields=metadata_fields,
+            metric_names=metric_names,
+            include_failures=True,
+        )
+        cells = [html.escape(str(row.get(label, ""))) for label in head_cells]
+        cells.append(html.escape(row.get("failures", "")))
+        body_rows.append(
+            "<tr>" + "".join(f"<td>{cell}</td>" for cell in cells) + "</tr>"
+        )
+    if not body_rows:
+        body_rows.append(
+            f'<tr><td colspan="{len(head_cells) + 1}">No evaluation records</td></tr>'
+        )
+    footer = ""
+    if len(report.evaluation_report.records) > limit:
+        remaining = len(report.evaluation_report.records) - limit
+        footer = f'<p class="subtle">Showing first {limit} rows ({remaining} more not rendered).</p>'
+    return (
+        "<section><h2>Sample breakdown</h2><table><thead><tr>"
+        + head_html
+        + "<th>failures</th></tr></thead><tbody>"
+        + "\n".join(body_rows)
+        + "</tbody></table>"
+        + footer
+        + "</section>"
+    )
+
+
+def _render_chart_section(chart: ChartLike) -> str:
+    if not chart.points:
+        return (
+            f'<section class="chart-section"><h3 class="chart-title">{html.escape(chart.title)}</h3>'
+            '<p class="subtle">No data points</p></section>'
+        )
+    svg_markup = _chart_to_svg(chart)
+    rows = "\n".join(
+        f"<tr><td>{html.escape(point.label)}</td><td>{html.escape(str(point.x_value))}</td>"
+        f"<td>{point.metric_value:.4f}</td><td>{point.count}</td></tr>"
+        for point in chart.points
+    )
+    table = (
+        '<table class="chart-table"><thead><tr><th>Label</th><th>X value</th><th>Metric'
+        "</th><th>Count</th></tr></thead><tbody>" + rows + "</tbody></table>"
+    )
+    return (
+        f'<section class="chart-section"><h3 class="chart-title">{html.escape(chart.title)}</h3>'
+        + svg_markup
+        + table
+        + "</section>"
+    )
+
+
+def _chart_to_svg(chart: ChartLike) -> str:
+    width, height, margin = 640, 320, 42
+    plot_width = width - 2 * margin
+    plot_height = height - 2 * margin
+    values = [point.metric_value for point in chart.points]
+    min_value = min(values)
+    max_value = max(values)
+    if min_value == max_value:
+        min_value -= 0.5
+        max_value += 0.5
+    count = len(chart.points)
+    if count == 1:
+        x_positions = [margin + plot_width / 2]
+    else:
+        step = plot_width / (count - 1)
+        x_positions = [margin + index * step for index in range(count)]
+
+    def scale_y(value: float) -> float:
+        ratio = (value - min_value) / (max_value - min_value)
+        return margin + (plot_height * (1 - ratio))
+
+    y_positions = [scale_y(point.metric_value) for point in chart.points]
+    polyline = " ".join(f"{x:.2f},{y:.2f}" for x, y in zip(x_positions, y_positions))
+    circles = "\n".join(
+        f'<circle cx="{x:.2f}" cy="{y:.2f}" r="5" fill="#2563eb"></circle>'
+        for x, y in zip(x_positions, y_positions)
+    )
+    labels = "\n".join(
+        f'<text x="{x:.2f}" y="{height - margin / 4:.2f}" text-anchor="middle" font-size="12">{html.escape(point.label)}</text>'
+        for x, point in zip(x_positions, chart.points)
+    )
+    y_labels = (
+        f'<text x="{margin / 2:.2f}" y="{height - margin:.2f}" font-size="12">{min_value:.2f}</text>'
+        f'<text x="{margin / 2:.2f}" y="{margin:.2f}" font-size="12">{max_value:.2f}</text>'
+    )
+    axis_lines = (
+        f'<line x1="{margin}" y1="{height - margin}" x2="{width - margin}" y2="{height - margin}" stroke="#94a3b8" />'
+        f'<line x1="{margin}" y1="{margin}" x2="{margin}" y2="{height - margin}" stroke="#94a3b8" />'
+    )
+    polyline_markup = (
+        f'<polyline fill="none" stroke="#2563eb" stroke-width="2" points="{polyline}"></polyline>'
+        if count > 1
+        else ""
+    )
+    return (
+        f'<svg class="chart-svg" viewBox="0 0 {width} {height}" role="img" aria-label="{html.escape(chart.metric_name)} vs {html.escape(chart.x_label)}">'
+        + axis_lines
+        + polyline_markup
+        + circles
+        + labels
+        + y_labels
+        + "</svg>"
+    )
+
+
+def _chart_to_dict(chart: ChartLike) -> dict[str, object]:
+    return {
+        "title": chart.title,
+        "x_label": chart.x_label,
+        "y_label": chart.y_label,
+        "metric": chart.metric_name,
+        "points": [
+            {
+                "label": point.label,
+                "x": getattr(point, "x_value", getattr(point, "x", None)),
+                "value": point.metric_value,
+                "count": point.count,
+            }
+            for point in chart.points
+        ],
+    }
+
+
+__all__ = [
+    "export_report_csv",
+    "export_html_report",
+    "export_report_json",
+    "export_report_bundle",
+    "render_html_report",
+    "build_json_report",
+]
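
For context on how the new export module fits together, here is a minimal usage sketch (not part of the diff): export_report_bundle fans out to the CSV, HTML, and JSON writers and returns the written paths. The helper name write_all_exports and the "results" output directory are illustrative only, and the ExperimentReport is assumed to come from an orchestrator run elsewhere in themis.

from pathlib import Path

from themis.experiment import orchestrator
from themis.experiment.export import export_report_bundle


def write_all_exports(report: orchestrator.ExperimentReport, out_dir: str = "results") -> None:
    # Write every export format in one call; each returned value is the written Path.
    outputs = export_report_bundle(
        report,
        csv_path=Path(out_dir) / "report.csv",    # per-sample metric rows
        html_path=Path(out_dir) / "report.html",  # summary, cost section, charts, sample table
        json_path=Path(out_dir) / "report.json",  # payload produced by build_json_report
        title="Experiment report",
        sample_limit=100,                         # rows rendered in the HTML sample table
    )
    for fmt, written in outputs.items():
        print(f"{fmt}: {written}")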