themis-eval 0.1.0-py3-none-any.whl → 0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158)
  1. themis/__init__.py +12 -1
  2. themis/_version.py +2 -2
  3. themis/api.py +343 -0
  4. themis/backends/__init__.py +17 -0
  5. themis/backends/execution.py +197 -0
  6. themis/backends/storage.py +260 -0
  7. themis/cli/__init__.py +5 -0
  8. themis/cli/__main__.py +6 -0
  9. themis/cli/commands/__init__.py +19 -0
  10. themis/cli/commands/benchmarks.py +221 -0
  11. themis/cli/commands/comparison.py +394 -0
  12. themis/cli/commands/config_commands.py +244 -0
  13. themis/cli/commands/cost.py +214 -0
  14. themis/cli/commands/demo.py +68 -0
  15. themis/cli/commands/info.py +90 -0
  16. themis/cli/commands/leaderboard.py +362 -0
  17. themis/cli/commands/math_benchmarks.py +318 -0
  18. themis/cli/commands/mcq_benchmarks.py +207 -0
  19. themis/cli/commands/results.py +252 -0
  20. themis/cli/commands/sample_run.py +244 -0
  21. themis/cli/commands/visualize.py +299 -0
  22. themis/cli/main.py +463 -0
  23. themis/cli/new_project.py +33 -0
  24. themis/cli/utils.py +51 -0
  25. themis/comparison/__init__.py +25 -0
  26. themis/comparison/engine.py +348 -0
  27. themis/comparison/reports.py +283 -0
  28. themis/comparison/statistics.py +402 -0
  29. themis/config/__init__.py +19 -0
  30. themis/config/loader.py +27 -0
  31. themis/config/registry.py +34 -0
  32. themis/config/runtime.py +214 -0
  33. themis/config/schema.py +112 -0
  34. themis/core/__init__.py +5 -0
  35. themis/core/conversation.py +354 -0
  36. themis/core/entities.py +184 -0
  37. themis/core/serialization.py +231 -0
  38. themis/core/tools.py +393 -0
  39. themis/core/types.py +141 -0
  40. themis/datasets/__init__.py +273 -0
  41. themis/datasets/base.py +264 -0
  42. themis/datasets/commonsense_qa.py +174 -0
  43. themis/datasets/competition_math.py +265 -0
  44. themis/datasets/coqa.py +133 -0
  45. themis/datasets/gpqa.py +190 -0
  46. themis/datasets/gsm8k.py +123 -0
  47. themis/datasets/gsm_symbolic.py +124 -0
  48. themis/datasets/math500.py +122 -0
  49. themis/datasets/med_qa.py +179 -0
  50. themis/datasets/medmcqa.py +169 -0
  51. themis/datasets/mmlu_pro.py +262 -0
  52. themis/datasets/piqa.py +146 -0
  53. themis/datasets/registry.py +201 -0
  54. themis/datasets/schema.py +245 -0
  55. themis/datasets/sciq.py +150 -0
  56. themis/datasets/social_i_qa.py +151 -0
  57. themis/datasets/super_gpqa.py +263 -0
  58. themis/evaluation/__init__.py +1 -0
  59. themis/evaluation/conditional.py +410 -0
  60. themis/evaluation/extractors/__init__.py +19 -0
  61. themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
  62. themis/evaluation/extractors/exceptions.py +7 -0
  63. themis/evaluation/extractors/identity_extractor.py +29 -0
  64. themis/evaluation/extractors/json_field_extractor.py +45 -0
  65. themis/evaluation/extractors/math_verify_extractor.py +37 -0
  66. themis/evaluation/extractors/regex_extractor.py +43 -0
  67. themis/evaluation/math_verify_utils.py +87 -0
  68. themis/evaluation/metrics/__init__.py +21 -0
  69. themis/evaluation/metrics/code/__init__.py +19 -0
  70. themis/evaluation/metrics/code/codebleu.py +144 -0
  71. themis/evaluation/metrics/code/execution.py +280 -0
  72. themis/evaluation/metrics/code/pass_at_k.py +181 -0
  73. themis/evaluation/metrics/composite_metric.py +47 -0
  74. themis/evaluation/metrics/consistency_metric.py +80 -0
  75. themis/evaluation/metrics/exact_match.py +51 -0
  76. themis/evaluation/metrics/length_difference_tolerance.py +33 -0
  77. themis/evaluation/metrics/math_verify_accuracy.py +40 -0
  78. themis/evaluation/metrics/nlp/__init__.py +21 -0
  79. themis/evaluation/metrics/nlp/bertscore.py +138 -0
  80. themis/evaluation/metrics/nlp/bleu.py +129 -0
  81. themis/evaluation/metrics/nlp/meteor.py +153 -0
  82. themis/evaluation/metrics/nlp/rouge.py +136 -0
  83. themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
  84. themis/evaluation/metrics/response_length.py +33 -0
  85. themis/evaluation/metrics/rubric_judge_metric.py +134 -0
  86. themis/evaluation/pipeline.py +49 -0
  87. themis/evaluation/pipelines/__init__.py +15 -0
  88. themis/evaluation/pipelines/composable_pipeline.py +357 -0
  89. themis/evaluation/pipelines/standard_pipeline.py +348 -0
  90. themis/evaluation/reports.py +293 -0
  91. themis/evaluation/statistics/__init__.py +53 -0
  92. themis/evaluation/statistics/bootstrap.py +79 -0
  93. themis/evaluation/statistics/confidence_intervals.py +121 -0
  94. themis/evaluation/statistics/distributions.py +207 -0
  95. themis/evaluation/statistics/effect_sizes.py +124 -0
  96. themis/evaluation/statistics/hypothesis_tests.py +305 -0
  97. themis/evaluation/statistics/types.py +139 -0
  98. themis/evaluation/strategies/__init__.py +13 -0
  99. themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
  100. themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
  101. themis/evaluation/strategies/evaluation_strategy.py +24 -0
  102. themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
  103. themis/experiment/__init__.py +5 -0
  104. themis/experiment/builder.py +151 -0
  105. themis/experiment/cache_manager.py +134 -0
  106. themis/experiment/comparison.py +631 -0
  107. themis/experiment/cost.py +310 -0
  108. themis/experiment/definitions.py +62 -0
  109. themis/experiment/export.py +798 -0
  110. themis/experiment/export_csv.py +159 -0
  111. themis/experiment/integration_manager.py +104 -0
  112. themis/experiment/math.py +192 -0
  113. themis/experiment/mcq.py +169 -0
  114. themis/experiment/orchestrator.py +415 -0
  115. themis/experiment/pricing.py +317 -0
  116. themis/experiment/storage.py +1458 -0
  117. themis/experiment/visualization.py +588 -0
  118. themis/generation/__init__.py +1 -0
  119. themis/generation/agentic_runner.py +420 -0
  120. themis/generation/batching.py +254 -0
  121. themis/generation/clients.py +143 -0
  122. themis/generation/conversation_runner.py +236 -0
  123. themis/generation/plan.py +456 -0
  124. themis/generation/providers/litellm_provider.py +221 -0
  125. themis/generation/providers/vllm_provider.py +135 -0
  126. themis/generation/router.py +34 -0
  127. themis/generation/runner.py +207 -0
  128. themis/generation/strategies.py +98 -0
  129. themis/generation/templates.py +71 -0
  130. themis/generation/turn_strategies.py +393 -0
  131. themis/generation/types.py +9 -0
  132. themis/integrations/__init__.py +0 -0
  133. themis/integrations/huggingface.py +72 -0
  134. themis/integrations/wandb.py +77 -0
  135. themis/interfaces/__init__.py +169 -0
  136. themis/presets/__init__.py +10 -0
  137. themis/presets/benchmarks.py +354 -0
  138. themis/presets/models.py +190 -0
  139. themis/project/__init__.py +20 -0
  140. themis/project/definitions.py +98 -0
  141. themis/project/patterns.py +230 -0
  142. themis/providers/__init__.py +5 -0
  143. themis/providers/registry.py +39 -0
  144. themis/server/__init__.py +28 -0
  145. themis/server/app.py +337 -0
  146. themis/utils/api_generator.py +379 -0
  147. themis/utils/cost_tracking.py +376 -0
  148. themis/utils/dashboard.py +452 -0
  149. themis/utils/logging_utils.py +41 -0
  150. themis/utils/progress.py +58 -0
  151. themis/utils/tracing.py +320 -0
  152. themis_eval-0.2.0.dist-info/METADATA +596 -0
  153. themis_eval-0.2.0.dist-info/RECORD +157 -0
  154. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
  155. themis_eval-0.1.0.dist-info/METADATA +0 -758
  156. themis_eval-0.1.0.dist-info/RECORD +0 -8
  157. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
  158. {themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0

themis/cli/commands/leaderboard.py
@@ -0,0 +1,362 @@
+"""Leaderboard generation for benchmarks."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Annotated
+
+from cyclopts import Parameter
+
+from themis.experiment.comparison import compare_experiments
+
+
+def leaderboard_command(
+    *,
+    run_ids: Annotated[list[str], Parameter(help="Run IDs to include in leaderboard")],
+    storage: Annotated[Path, Parameter(help="Storage directory")] = Path(".cache/runs"),
+    metric: Annotated[str, Parameter(help="Primary metric for ranking")] = "accuracy",
+    format: Annotated[
+        str, Parameter(help="Output format: markdown, latex, csv")
+    ] = "markdown",
+    output: Annotated[
+        Path | None, Parameter(help="Output file path (optional)")
+    ] = None,
+    title: Annotated[str, Parameter(help="Leaderboard title")] = "Leaderboard",
+    ascending: Annotated[
+        bool, Parameter(help="Rank in ascending order (lower is better)")
+    ] = False,
+    include_cost: Annotated[bool, Parameter(help="Include cost column")] = True,
+    include_metadata: Annotated[
+        list[str] | None,
+        Parameter(help="Metadata fields to include (e.g., model, temperature)"),
+    ] = None,
+) -> int:
+    """Generate benchmark leaderboard from experiment runs.
+
+    Creates a ranked table of experiments based on a primary metric.
+    Perfect for README files, documentation, and benchmark tracking.
+
+    Examples:
+        # Basic leaderboard
+        uv run python -m themis.cli leaderboard \\
+            --run-ids run-1 run-2 run-3 \\
+            --metric accuracy \\
+            --output LEADERBOARD.md
+
+        # With custom metadata
+        uv run python -m themis.cli leaderboard \\
+            --run-ids run-gpt4 run-claude run-gemini \\
+            --metric accuracy \\
+            --include-metadata model \\
+            --include-metadata temperature \\
+            --output results.md
+
+        # LaTeX for papers
+        uv run python -m themis.cli leaderboard \\
+            --run-ids run-1 run-2 run-3 \\
+            --metric accuracy \\
+            --format latex \\
+            --title "Math500 Benchmark Results" \\
+            --output leaderboard.tex
+
+        # Cost-optimized ranking (lower is better)
+        uv run python -m themis.cli leaderboard \\
+            --run-ids run-1 run-2 run-3 \\
+            --metric cost \\
+            --ascending true \\
+            --output cost_leaderboard.md
+    """
+    try:
+        # Load experiments
+        print(f"Loading experiments from {storage}...")
+        comparison = compare_experiments(
+            run_ids=run_ids,
+            storage_dir=storage,
+            include_metadata=True,
+        )
+
+        print(f"✓ Loaded {len(comparison.experiments)} experiments")
+
+        # Rank by metric
+        ranked = comparison.rank_by_metric(metric, ascending=ascending)
+
+        print(f"✓ Ranked by {metric} ({'ascending' if ascending else 'descending'})")
+
+        # Generate leaderboard
+        if format == "markdown":
+            content = _generate_markdown_leaderboard(
+                ranked=ranked,
+                metric=metric,
+                title=title,
+                include_cost=include_cost,
+                include_metadata=include_metadata,
+                comparison=comparison,
+            )
+        elif format == "latex":
+            content = _generate_latex_leaderboard(
+                ranked=ranked,
+                metric=metric,
+                title=title,
+                include_cost=include_cost,
+                include_metadata=include_metadata,
+                comparison=comparison,
+            )
+        elif format == "csv":
+            content = _generate_csv_leaderboard(
+                ranked=ranked,
+                metric=metric,
+                include_cost=include_cost,
+                include_metadata=include_metadata,
+                comparison=comparison,
+            )
+        else:
+            print(f"Error: Unknown format '{format}'")
+            print("Available formats: markdown, latex, csv")
+            return 1
+
+        # Output
+        if output:
+            output = Path(output)
+            output.write_text(content, encoding="utf-8")
+            print(f"\n✓ Leaderboard saved to {output}")
+        else:
+            print("\n" + "=" * 80)
+            print(content)
+            print("=" * 80)
+
+        # Show top 3
+        print(f"\n🏆 Top 3 by {metric}:")
+        for i, exp in enumerate(ranked[:3], 1):
+            value = exp.get_metric(metric)
+            emoji = ["🥇", "🥈", "🥉"][i - 1]
+            print(f" {emoji} {i}. {exp.run_id}: {value:.4f}")
+
+        return 0
+
+    except Exception as e:
+        print(f"Error: {e}")
+        import traceback
+
+        traceback.print_exc()
+        return 1
+
+
+def _generate_markdown_leaderboard(
+    ranked,
+    metric: str,
+    title: str,
+    include_cost: bool,
+    include_metadata: list[str] | None,
+    comparison,
+) -> str:
+    """Generate markdown leaderboard."""
+    lines = [f"# {title}\n"]
+
+    # Date
+    from datetime import datetime
+
+    lines.append(f"*Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n")
+
+    # Build header
+    headers = ["Rank", "Run ID", metric.capitalize()]
+
+    # Add metadata columns
+    if include_metadata:
+        headers.extend(include_metadata)
+
+    # Add cost if requested
+    has_cost = include_cost and any(exp.get_cost() is not None for exp in ranked)
+    if has_cost:
+        headers.append("Cost ($)")
+
+    # Add other metrics
+    other_metrics = [m for m in comparison.metrics if m != metric]
+    headers.extend(other_metrics)
+
+    headers.extend(["Samples", "Failures"])
+
+    lines.append("| " + " | ".join(headers) + " |")
+    lines.append("| " + " | ".join(["---"] * len(headers)) + " |")
+
+    # Build rows
+    for rank, exp in enumerate(ranked, 1):
+        values = [str(rank), exp.run_id]
+
+        # Primary metric
+        val = exp.get_metric(metric)
+        values.append(f"**{val:.4f}**" if val is not None else "N/A")
+
+        # Metadata
+        if include_metadata:
+            for field in include_metadata:
+                val = exp.metadata.get(field, "—")
+                values.append(str(val))
+
+        # Cost
+        if has_cost:
+            cost = exp.get_cost()
+            values.append(f"{cost:.4f}" if cost is not None else "—")
+
+        # Other metrics
+        for m in other_metrics:
+            val = exp.get_metric(m)
+            values.append(f"{val:.4f}" if val is not None else "N/A")
+
+        values.append(str(exp.sample_count))
+        values.append(str(exp.failure_count))
+
+        lines.append("| " + " | ".join(values) + " |")
+
+    return "\n".join(lines)
+
+
+def _generate_latex_leaderboard(
+    ranked,
+    metric: str,
+    title: str,
+    include_cost: bool,
+    include_metadata: list[str] | None,
+    comparison,
+) -> str:
+    """Generate LaTeX leaderboard."""
+    lines = []
+
+    # Calculate columns
+    n_cols = 2  # Rank + Run ID
+    n_cols += 1  # Primary metric
+
+    if include_metadata:
+        n_cols += len(include_metadata)
+
+    has_cost = include_cost and any(exp.get_cost() is not None for exp in ranked)
+    if has_cost:
+        n_cols += 1
+
+    other_metrics = [m for m in comparison.metrics if m != metric]
+    n_cols += len(other_metrics)
+    n_cols += 2  # Samples + Failures
+
+    # Table preamble
+    lines.append("\\begin{table}[htbp]")
+    lines.append("\\centering")
+    lines.append(f"\\caption{{{title}}}")
+    lines.append("\\label{tab:leaderboard}")
+
+    col_spec = "c" + "l" + "r" * (n_cols - 2)  # Center rank, left run_id, right numbers
+    lines.append(f"\\begin{{tabular}}{{{col_spec}}}")
+    lines.append("\\toprule")
+
+    # Header
+    headers = ["Rank", "Run ID", f"\\textbf{{{metric}}}"]
+
+    if include_metadata:
+        headers.extend(include_metadata)
+
+    if has_cost:
+        headers.append("Cost (\\$)")
+
+    headers.extend(other_metrics)
+    headers.extend(["Samples", "Failures"])
+
+    lines.append(" & ".join(headers) + " \\\\")
+    lines.append("\\midrule")
+
+    # Rows
+    for rank, exp in enumerate(ranked, 1):
+        values = [str(rank), exp.run_id.replace("_", "\\_")]
+
+        # Primary metric (bold)
+        val = exp.get_metric(metric)
+        values.append(f"\\textbf{{{val:.4f}}}" if val is not None else "---")
+
+        # Metadata
+        if include_metadata:
+            for field in include_metadata:
+                val = exp.metadata.get(field, "---")
+                val_str = str(val).replace("_", "\\_")
+                values.append(val_str)
+
+        # Cost
+        if has_cost:
+            cost = exp.get_cost()
+            values.append(f"{cost:.4f}" if cost is not None else "---")
+
+        # Other metrics
+        for m in other_metrics:
+            val = exp.get_metric(m)
+            values.append(f"{val:.4f}" if val is not None else "---")
+
+        values.append(str(exp.sample_count))
+        values.append(str(exp.failure_count))
+
+        lines.append(" & ".join(values) + " \\\\")
+
+    lines.append("\\bottomrule")
+    lines.append("\\end{tabular}")
+    lines.append("\\end{table}")
+
+    return "\n".join(lines)
+
+
+def _generate_csv_leaderboard(
+    ranked,
+    metric: str,
+    include_cost: bool,
+    include_metadata: list[str] | None,
+    comparison,
+) -> str:
+    """Generate CSV leaderboard."""
+    import csv
+    import io
+
+    output = io.StringIO()
+
+    # Build header
+    headers = ["rank", "run_id", metric]
+
+    if include_metadata:
+        headers.extend(include_metadata)
+
+    has_cost = include_cost and any(exp.get_cost() is not None for exp in ranked)
+    if has_cost:
+        headers.append("cost")
+
+    other_metrics = [m for m in comparison.metrics if m != metric]
+    headers.extend(other_metrics)
+    headers.extend(["sample_count", "failure_count"])
+
+    writer = csv.DictWriter(output, fieldnames=headers)
+    writer.writeheader()
+
+    # Write rows
+    for rank, exp in enumerate(ranked, 1):
+        row = {"rank": rank, "run_id": exp.run_id}
+
+        # Primary metric
+        val = exp.get_metric(metric)
+        row[metric] = val if val is not None else ""
+
+        # Metadata
+        if include_metadata:
+            for field in include_metadata:
+                row[field] = exp.metadata.get(field, "")
+
+        # Cost
+        if has_cost:
+            cost = exp.get_cost()
+            row["cost"] = cost if cost is not None else ""
+
+        # Other metrics
+        for m in other_metrics:
+            val = exp.get_metric(m)
+            row[m] = val if val is not None else ""
+
+        row["sample_count"] = exp.sample_count
+        row["failure_count"] = exp.failure_count
+
+        writer.writerow(row)
+
+    return output.getvalue()
+
+
+__all__ = ["leaderboard_command"]
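
The docstring above covers CLI usage; because leaderboard_command is an ordinary keyword-only function, it can also be called directly from Python. A minimal sketch, assuming the listed runs already exist under the default .cache/runs storage directory (the run IDs and output path here are illustrative, not part of the package):

    from pathlib import Path

    from themis.cli.commands.leaderboard import leaderboard_command

    # Rank two previously cached runs by accuracy and write a Markdown table.
    exit_code = leaderboard_command(
        run_ids=["run-baseline", "run-candidate"],  # hypothetical cached run IDs
        storage=Path(".cache/runs"),                # default storage location
        metric="accuracy",
        format="markdown",
        output=Path("LEADERBOARD.md"),              # omit to print to stdout instead
    )
    assert exit_code == 0

The same call with format="latex" or format="csv" switches the table generator, mirroring the branches in the command body.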

themis/cli/commands/math_benchmarks.py
@@ -0,0 +1,318 @@
+"""Math benchmark command implementations."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Annotated, Callable, Literal, Sequence
+
+from cyclopts import Parameter
+
+from themis.cli.utils import effective_total, export_outputs
+from themis.datasets import (
+    competition_math as competition_math_dataset,
+)
+from themis.datasets import (
+    math500 as math500_dataset,
+)
+from themis.experiment import math as math_experiment
+from themis.experiment import storage as experiment_storage
+from themis.utils.logging_utils import configure_logging
+from themis.utils.progress import ProgressReporter
+
+
+def load_math_dataset(
+    *,
+    source: Literal["huggingface", "local"],
+    data_dir: Path | None,
+    limit: int | None,
+    subjects: Sequence[str] | None,
+    split: str = "test",
+):
+    """Load MATH-500 dataset.
+
+    Args:
+        source: Dataset source (huggingface or local)
+        data_dir: Directory containing local dataset
+        limit: Max rows to load
+        subjects: Subjects to filter
+        split: Dataset split
+
+    Returns:
+        List of generation examples
+    """
+    if source == "local":
+        if data_dir is None:
+            raise ValueError(
+                "The --data-dir option is required when --source=local so Themis "
+                "knows where to read the dataset."
+            )
+        samples = math500_dataset.load_math500(
+            source="local",
+            data_dir=data_dir,
+            split=split,
+            limit=limit,
+            subjects=subjects,
+        )
+    else:
+        samples = math500_dataset.load_math500(
+            source="huggingface",
+            split=split,
+            limit=limit,
+            subjects=subjects,
+        )
+    return [sample.to_generation_example() for sample in samples]
+
+
+def load_competition_math_dataset(
+    *,
+    dataset: str,
+    subset: str | None,
+    source: Literal["huggingface", "local"],
+    data_dir: Path | None,
+    split: str,
+    limit: int | None,
+    subjects: Sequence[str] | None,
+):
+    """Load competition math dataset.
+
+    Args:
+        dataset: Dataset name
+        subset: Dataset subset
+        source: Dataset source
+        data_dir: Directory containing local dataset
+        split: Dataset split
+        limit: Max rows to load
+        subjects: Subjects to filter
+
+    Returns:
+        List of generation examples
+    """
+    if source == "local" and data_dir is None:
+        raise ValueError(
+            "The --data-dir option is required when --source=local so Themis "
+            "knows where to read the dataset."
+        )
+    samples = competition_math_dataset.load_competition_math(
+        dataset=dataset,
+        subset=subset,
+        source=source,
+        data_dir=data_dir,
+        split=split,
+        limit=limit,
+        subjects=subjects,
+    )
+    return [sample.to_generation_example() for sample in samples]
+
+
+def run_math_benchmark(
+    rows: Sequence[dict[str, object]],
+    *,
+    max_samples: int | None,
+    storage: Path | None,
+    run_id: str | None,
+    resume: bool,
+    temperature: float,
+    csv_output: Path | None,
+    html_output: Path | None,
+    json_output: Path | None,
+    title: str,
+    task_name: str,
+) -> int:
+    """Run math benchmark experiment.
+
+    Args:
+        rows: Dataset rows
+        max_samples: Maximum samples to run
+        storage: Cache directory
+        run_id: Run identifier
+        resume: Whether to resume from cache
+        temperature: Sampling temperature
+        csv_output: CSV export path
+        html_output: HTML export path
+        json_output: JSON export path
+        title: Experiment title
+        task_name: Task name
+
+    Returns:
+        Exit code
+    """
+    storage_impl = experiment_storage.ExperimentStorage(storage) if storage else None
+    experiment = math_experiment.build_math500_zero_shot_experiment(
+        temperature=temperature,
+        storage=storage_impl,
+        task_name=task_name,
+    )
+
+    total = effective_total(len(rows), max_samples)
+    with ProgressReporter(total=total, description="Generating") as progress:
+        report = experiment.run(
+            rows,
+            max_samples=max_samples,
+            run_id=run_id,
+            resume=resume,
+            on_result=progress.on_result,
+        )
+    print(math_experiment.summarize_report(report))
+    export_outputs(
+        report,
+        csv_output=csv_output,
+        html_output=html_output,
+        json_output=json_output,
+        title=f"{title} experiment",
+    )
+    return 0
+
+
+def math500_command(
+    *,
+    source: Annotated[
+        Literal["huggingface", "local"], Parameter(help="Dataset source")
+    ] = "huggingface",
+    data_dir: Annotated[
+        Path | None, Parameter(help="Directory containing local dataset")
+    ] = None,
+    limit: Annotated[int | None, Parameter(help="Max rows to load")] = None,
+    subjects: Annotated[tuple[str, ...], Parameter(help="Subjects to filter")] = (),
+    max_samples: Annotated[int | None, Parameter(help="Maximum samples to run")] = None,
+    storage: Annotated[
+        Path | None, Parameter(help="Cache directory for datasets/results")
+    ] = None,
+    run_id: Annotated[str | None, Parameter(help="Identifier for cached run")] = None,
+    resume: Annotated[
+        bool, Parameter(help="Reuse cached generations when storage is set")
+    ] = True,
+    temperature: Annotated[float, Parameter(help="Sampling temperature")] = 0.0,
+    log_level: Annotated[
+        str, Parameter(help="Logging level (critical/error/warning/info/debug/trace)")
+    ] = "info",
+    csv_output: Annotated[
+        Path | None, Parameter(help="Write CSV export to this path")
+    ] = None,
+    html_output: Annotated[
+        Path | None, Parameter(help="Write HTML summary to this path")
+    ] = None,
+    json_output: Annotated[
+        Path | None, Parameter(help="Write JSON export to this path")
+    ] = None,
+) -> int:
+    """Run the zero-shot MATH-500 evaluation."""
+    configure_logging(log_level)
+    subject_filter = list(subjects) if subjects else None
+    rows = load_math_dataset(
+        source=source,
+        data_dir=data_dir,
+        limit=limit,
+        subjects=subject_filter,
+    )
+    return run_math_benchmark(
+        rows,
+        max_samples=max_samples,
+        storage=storage,
+        run_id=run_id,
+        resume=resume,
+        temperature=temperature,
+        csv_output=csv_output,
+        html_output=html_output,
+        json_output=json_output,
+        title="math500",
+        task_name="math500",
+    )
+
+
+def _create_competition_math_command(
+    dataset_name: str,
+    dataset_id: str,
+    subset: str | None = None,
+) -> Callable:
+    """Create a competition math command function.
+
+    Args:
+        dataset_name: Display name for the dataset
+        dataset_id: HuggingFace dataset ID
+        subset: Optional dataset subset
+
+    Returns:
+        Command function
+    """
+
+    def command(
+        *,
+        source: Annotated[
+            Literal["huggingface", "local"], Parameter(help="Dataset source")
+        ] = "huggingface",
+        split: Annotated[str, Parameter(help="Dataset split to load")] = "test",
+        data_dir: Annotated[
+            Path | None, Parameter(help="Directory containing local dataset")
+        ] = None,
+        limit: Annotated[int | None, Parameter(help="Max rows to load")] = None,
+        subjects: Annotated[
+            tuple[str, ...], Parameter(help="Optional subject filters")
+        ] = (),
+        max_samples: Annotated[
+            int | None, Parameter(help="Maximum samples to run")
+        ] = None,
+        storage: Annotated[
+            Path | None, Parameter(help="Cache directory for datasets/results")
+        ] = None,
+        run_id: Annotated[
+            str | None, Parameter(help="Identifier for cached run")
+        ] = None,
+        resume: Annotated[
+            bool, Parameter(help="Reuse cached generations when storage is set")
+        ] = True,
+        temperature: Annotated[float, Parameter(help="Sampling temperature")] = 0.0,
+        log_level: Annotated[
+            str,
+            Parameter(help="Logging level (critical/error/warning/info/debug/trace)"),
+        ] = "info",
+        csv_output: Annotated[
+            Path | None, Parameter(help="Write CSV export to this path")
+        ] = None,
+        html_output: Annotated[
+            Path | None, Parameter(help="Write HTML summary to this path")
+        ] = None,
+        json_output: Annotated[
+            Path | None, Parameter(help="Write JSON export to this path")
+        ] = None,
+    ) -> int:
+        f"""Run the {dataset_name} benchmark."""
+        configure_logging(log_level)
+        subject_filter = list(subjects) if subjects else None
+        rows = load_competition_math_dataset(
+            dataset=dataset_id,
+            subset=subset,
+            source=source,
+            data_dir=data_dir,
+            split=split,
+            limit=limit,
+            subjects=subject_filter,
+        )
+
+        return run_math_benchmark(
+            rows,
+            max_samples=max_samples,
+            storage=storage,
+            run_id=run_id,
+            resume=resume,
+            temperature=temperature,
+            csv_output=csv_output,
+            html_output=html_output,
+            json_output=json_output,
+            title=dataset_name,
+            task_name=dataset_name,
+        )
+
+    command.__doc__ = f"Run the {dataset_name} benchmark."
+    return command
+
+
+# Create specific competition math commands
+aime24_command = _create_competition_math_command("aime24", "math-ai/aime24")
+aime25_command = _create_competition_math_command("aime25", "math-ai/aime25")
+amc23_command = _create_competition_math_command("amc23", "math-ai/amc23")
+olympiadbench_command = _create_competition_math_command(
+    "olympiadbench", "math-ai/olympiadbench"
+)
+beyond_aime_command = _create_competition_math_command(
+    "beyondaime", "ByteDance-Seed/BeyondAIME"
+)
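
Like the leaderboard command, the math benchmark entry points are plain keyword-only functions, so they can be driven from Python as well as from the CLI. A minimal sketch following the math500_command signature above (the paths, run ID, and sample limits are illustrative, and the model provider is still configured through Themis's usual settings, not shown here):

    from pathlib import Path

    from themis.cli.commands.math_benchmarks import math500_command

    # Quick smoke test: 50 MATH-500 problems, cached so the run can be resumed.
    exit_code = math500_command(
        source="huggingface",          # or source="local" together with data_dir=Path(...)
        limit=50,                      # cap dataset rows for a short run
        max_samples=50,
        storage=Path(".cache/runs"),   # hypothetical cache directory
        run_id="math500-demo",         # hypothetical run identifier
        temperature=0.0,
        csv_output=Path("math500_results.csv"),
    )
    assert exit_code == 0

The generated commands at the bottom of the module (aime24_command, amc23_command, and so on) take the same keyword arguments minus the dataset identity, which the factory bakes in.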