themis-eval 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/cli/__init__.py +5 -0
- themis/cli/__main__.py +6 -0
- themis/cli/commands/__init__.py +19 -0
- themis/cli/commands/benchmarks.py +221 -0
- themis/cli/commands/comparison.py +394 -0
- themis/cli/commands/config_commands.py +244 -0
- themis/cli/commands/cost.py +214 -0
- themis/cli/commands/demo.py +68 -0
- themis/cli/commands/info.py +90 -0
- themis/cli/commands/leaderboard.py +362 -0
- themis/cli/commands/math_benchmarks.py +318 -0
- themis/cli/commands/mcq_benchmarks.py +207 -0
- themis/cli/commands/sample_run.py +244 -0
- themis/cli/commands/visualize.py +299 -0
- themis/cli/main.py +93 -0
- themis/cli/new_project.py +33 -0
- themis/cli/utils.py +51 -0
- themis/config/__init__.py +19 -0
- themis/config/loader.py +27 -0
- themis/config/registry.py +34 -0
- themis/config/runtime.py +214 -0
- themis/config/schema.py +112 -0
- themis/core/__init__.py +5 -0
- themis/core/conversation.py +354 -0
- themis/core/entities.py +164 -0
- themis/core/serialization.py +231 -0
- themis/core/tools.py +393 -0
- themis/core/types.py +141 -0
- themis/datasets/__init__.py +273 -0
- themis/datasets/base.py +264 -0
- themis/datasets/commonsense_qa.py +174 -0
- themis/datasets/competition_math.py +265 -0
- themis/datasets/coqa.py +133 -0
- themis/datasets/gpqa.py +190 -0
- themis/datasets/gsm8k.py +123 -0
- themis/datasets/gsm_symbolic.py +124 -0
- themis/datasets/math500.py +122 -0
- themis/datasets/med_qa.py +179 -0
- themis/datasets/medmcqa.py +169 -0
- themis/datasets/mmlu_pro.py +262 -0
- themis/datasets/piqa.py +146 -0
- themis/datasets/registry.py +201 -0
- themis/datasets/schema.py +245 -0
- themis/datasets/sciq.py +150 -0
- themis/datasets/social_i_qa.py +151 -0
- themis/datasets/super_gpqa.py +263 -0
- themis/evaluation/__init__.py +1 -0
- themis/evaluation/conditional.py +410 -0
- themis/evaluation/extractors/__init__.py +19 -0
- themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
- themis/evaluation/extractors/exceptions.py +7 -0
- themis/evaluation/extractors/identity_extractor.py +29 -0
- themis/evaluation/extractors/json_field_extractor.py +45 -0
- themis/evaluation/extractors/math_verify_extractor.py +37 -0
- themis/evaluation/extractors/regex_extractor.py +43 -0
- themis/evaluation/math_verify_utils.py +87 -0
- themis/evaluation/metrics/__init__.py +21 -0
- themis/evaluation/metrics/composite_metric.py +47 -0
- themis/evaluation/metrics/consistency_metric.py +80 -0
- themis/evaluation/metrics/exact_match.py +51 -0
- themis/evaluation/metrics/length_difference_tolerance.py +33 -0
- themis/evaluation/metrics/math_verify_accuracy.py +40 -0
- themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
- themis/evaluation/metrics/response_length.py +33 -0
- themis/evaluation/metrics/rubric_judge_metric.py +134 -0
- themis/evaluation/pipeline.py +49 -0
- themis/evaluation/pipelines/__init__.py +15 -0
- themis/evaluation/pipelines/composable_pipeline.py +357 -0
- themis/evaluation/pipelines/standard_pipeline.py +288 -0
- themis/evaluation/reports.py +293 -0
- themis/evaluation/statistics/__init__.py +53 -0
- themis/evaluation/statistics/bootstrap.py +79 -0
- themis/evaluation/statistics/confidence_intervals.py +121 -0
- themis/evaluation/statistics/distributions.py +207 -0
- themis/evaluation/statistics/effect_sizes.py +124 -0
- themis/evaluation/statistics/hypothesis_tests.py +305 -0
- themis/evaluation/statistics/types.py +139 -0
- themis/evaluation/strategies/__init__.py +13 -0
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
- themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
- themis/evaluation/strategies/evaluation_strategy.py +24 -0
- themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
- themis/experiment/__init__.py +5 -0
- themis/experiment/builder.py +151 -0
- themis/experiment/cache_manager.py +129 -0
- themis/experiment/comparison.py +631 -0
- themis/experiment/cost.py +310 -0
- themis/experiment/definitions.py +62 -0
- themis/experiment/export.py +690 -0
- themis/experiment/export_csv.py +159 -0
- themis/experiment/integration_manager.py +104 -0
- themis/experiment/math.py +192 -0
- themis/experiment/mcq.py +169 -0
- themis/experiment/orchestrator.py +373 -0
- themis/experiment/pricing.py +317 -0
- themis/experiment/storage.py +255 -0
- themis/experiment/visualization.py +588 -0
- themis/generation/__init__.py +1 -0
- themis/generation/agentic_runner.py +420 -0
- themis/generation/batching.py +254 -0
- themis/generation/clients.py +143 -0
- themis/generation/conversation_runner.py +236 -0
- themis/generation/plan.py +456 -0
- themis/generation/providers/litellm_provider.py +221 -0
- themis/generation/providers/vllm_provider.py +135 -0
- themis/generation/router.py +34 -0
- themis/generation/runner.py +207 -0
- themis/generation/strategies.py +98 -0
- themis/generation/templates.py +71 -0
- themis/generation/turn_strategies.py +393 -0
- themis/generation/types.py +9 -0
- themis/integrations/__init__.py +0 -0
- themis/integrations/huggingface.py +61 -0
- themis/integrations/wandb.py +65 -0
- themis/interfaces/__init__.py +83 -0
- themis/project/__init__.py +20 -0
- themis/project/definitions.py +98 -0
- themis/project/patterns.py +230 -0
- themis/providers/__init__.py +5 -0
- themis/providers/registry.py +39 -0
- themis/utils/api_generator.py +379 -0
- themis/utils/cost_tracking.py +376 -0
- themis/utils/dashboard.py +452 -0
- themis/utils/logging_utils.py +41 -0
- themis/utils/progress.py +58 -0
- themis/utils/tracing.py +320 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/METADATA +1 -1
- themis_eval-0.1.1.dist-info/RECORD +134 -0
- themis_eval-0.1.0.dist-info/RECORD +0 -8
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/WHEEL +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.1.0.dist-info → themis_eval-0.1.1.dist-info}/top_level.txt +0 -0

themis/cli/commands/leaderboard.py (new file)
@@ -0,0 +1,362 @@
"""Leaderboard generation for benchmarks."""

from __future__ import annotations

from pathlib import Path
from typing import Annotated

from cyclopts import Parameter

from themis.experiment.comparison import compare_experiments


def leaderboard_command(
    *,
    run_ids: Annotated[list[str], Parameter(help="Run IDs to include in leaderboard")],
    storage: Annotated[Path, Parameter(help="Storage directory")] = Path(".cache/runs"),
    metric: Annotated[str, Parameter(help="Primary metric for ranking")] = "accuracy",
    format: Annotated[
        str, Parameter(help="Output format: markdown, latex, csv")
    ] = "markdown",
    output: Annotated[
        Path | None, Parameter(help="Output file path (optional)")
    ] = None,
    title: Annotated[str, Parameter(help="Leaderboard title")] = "Leaderboard",
    ascending: Annotated[
        bool, Parameter(help="Rank in ascending order (lower is better)")
    ] = False,
    include_cost: Annotated[bool, Parameter(help="Include cost column")] = True,
    include_metadata: Annotated[
        list[str] | None,
        Parameter(help="Metadata fields to include (e.g., model, temperature)"),
    ] = None,
) -> int:
    """Generate benchmark leaderboard from experiment runs.

    Creates a ranked table of experiments based on a primary metric.
    Perfect for README files, documentation, and benchmark tracking.

    Examples:
        # Basic leaderboard
        uv run python -m themis.cli leaderboard \\
            --run-ids run-1 run-2 run-3 \\
            --metric accuracy \\
            --output LEADERBOARD.md

        # With custom metadata
        uv run python -m themis.cli leaderboard \\
            --run-ids run-gpt4 run-claude run-gemini \\
            --metric accuracy \\
            --include-metadata model \\
            --include-metadata temperature \\
            --output results.md

        # LaTeX for papers
        uv run python -m themis.cli leaderboard \\
            --run-ids run-1 run-2 run-3 \\
            --metric accuracy \\
            --format latex \\
            --title "Math500 Benchmark Results" \\
            --output leaderboard.tex

        # Cost-optimized ranking (lower is better)
        uv run python -m themis.cli leaderboard \\
            --run-ids run-1 run-2 run-3 \\
            --metric cost \\
            --ascending true \\
            --output cost_leaderboard.md
    """
    try:
        # Load experiments
        print(f"Loading experiments from {storage}...")
        comparison = compare_experiments(
            run_ids=run_ids,
            storage_dir=storage,
            include_metadata=True,
        )

        print(f"✓ Loaded {len(comparison.experiments)} experiments")

        # Rank by metric
        ranked = comparison.rank_by_metric(metric, ascending=ascending)

        print(f"✓ Ranked by {metric} ({'ascending' if ascending else 'descending'})")

        # Generate leaderboard
        if format == "markdown":
            content = _generate_markdown_leaderboard(
                ranked=ranked,
                metric=metric,
                title=title,
                include_cost=include_cost,
                include_metadata=include_metadata,
                comparison=comparison,
            )
        elif format == "latex":
            content = _generate_latex_leaderboard(
                ranked=ranked,
                metric=metric,
                title=title,
                include_cost=include_cost,
                include_metadata=include_metadata,
                comparison=comparison,
            )
        elif format == "csv":
            content = _generate_csv_leaderboard(
                ranked=ranked,
                metric=metric,
                include_cost=include_cost,
                include_metadata=include_metadata,
                comparison=comparison,
            )
        else:
            print(f"Error: Unknown format '{format}'")
            print("Available formats: markdown, latex, csv")
            return 1

        # Output
        if output:
            output = Path(output)
            output.write_text(content, encoding="utf-8")
            print(f"\n✓ Leaderboard saved to {output}")
        else:
            print("\n" + "=" * 80)
            print(content)
            print("=" * 80)

        # Show top 3
        print(f"\n🏆 Top 3 by {metric}:")
        for i, exp in enumerate(ranked[:3], 1):
            value = exp.get_metric(metric)
            emoji = ["🥇", "🥈", "🥉"][i - 1]
            print(f" {emoji} {i}. {exp.run_id}: {value:.4f}")

        return 0

    except Exception as e:
        print(f"Error: {e}")
        import traceback

        traceback.print_exc()
        return 1


def _generate_markdown_leaderboard(
    ranked,
    metric: str,
    title: str,
    include_cost: bool,
    include_metadata: list[str] | None,
    comparison,
) -> str:
    """Generate markdown leaderboard."""
    lines = [f"# {title}\n"]

    # Date
    from datetime import datetime

    lines.append(f"*Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*\n")

    # Build header
    headers = ["Rank", "Run ID", metric.capitalize()]

    # Add metadata columns
    if include_metadata:
        headers.extend(include_metadata)

    # Add cost if requested
    has_cost = include_cost and any(exp.get_cost() is not None for exp in ranked)
    if has_cost:
        headers.append("Cost ($)")

    # Add other metrics
    other_metrics = [m for m in comparison.metrics if m != metric]
    headers.extend(other_metrics)

    headers.extend(["Samples", "Failures"])

    lines.append("| " + " | ".join(headers) + " |")
    lines.append("| " + " | ".join(["---"] * len(headers)) + " |")

    # Build rows
    for rank, exp in enumerate(ranked, 1):
        values = [str(rank), exp.run_id]

        # Primary metric
        val = exp.get_metric(metric)
        values.append(f"**{val:.4f}**" if val is not None else "N/A")

        # Metadata
        if include_metadata:
            for field in include_metadata:
                val = exp.metadata.get(field, "—")
                values.append(str(val))

        # Cost
        if has_cost:
            cost = exp.get_cost()
            values.append(f"{cost:.4f}" if cost is not None else "—")

        # Other metrics
        for m in other_metrics:
            val = exp.get_metric(m)
            values.append(f"{val:.4f}" if val is not None else "N/A")

        values.append(str(exp.sample_count))
        values.append(str(exp.failure_count))

        lines.append("| " + " | ".join(values) + " |")

    return "\n".join(lines)


def _generate_latex_leaderboard(
    ranked,
    metric: str,
    title: str,
    include_cost: bool,
    include_metadata: list[str] | None,
    comparison,
) -> str:
    """Generate LaTeX leaderboard."""
    lines = []

    # Calculate columns
    n_cols = 2  # Rank + Run ID
    n_cols += 1  # Primary metric

    if include_metadata:
        n_cols += len(include_metadata)

    has_cost = include_cost and any(exp.get_cost() is not None for exp in ranked)
    if has_cost:
        n_cols += 1

    other_metrics = [m for m in comparison.metrics if m != metric]
    n_cols += len(other_metrics)
    n_cols += 2  # Samples + Failures

    # Table preamble
    lines.append("\\begin{table}[htbp]")
    lines.append("\\centering")
    lines.append(f"\\caption{{{title}}}")
    lines.append("\\label{tab:leaderboard}")

    col_spec = "c" + "l" + "r" * (n_cols - 2)  # Center rank, left run_id, right numbers
    lines.append(f"\\begin{{tabular}}{{{col_spec}}}")
    lines.append("\\toprule")

    # Header
    headers = ["Rank", "Run ID", f"\\textbf{{{metric}}}"]

    if include_metadata:
        headers.extend(include_metadata)

    if has_cost:
        headers.append("Cost (\\$)")

    headers.extend(other_metrics)
    headers.extend(["Samples", "Failures"])

    lines.append(" & ".join(headers) + " \\\\")
    lines.append("\\midrule")

    # Rows
    for rank, exp in enumerate(ranked, 1):
        values = [str(rank), exp.run_id.replace("_", "\\_")]

        # Primary metric (bold)
        val = exp.get_metric(metric)
        values.append(f"\\textbf{{{val:.4f}}}" if val is not None else "---")

        # Metadata
        if include_metadata:
            for field in include_metadata:
                val = exp.metadata.get(field, "---")
                val_str = str(val).replace("_", "\\_")
                values.append(val_str)

        # Cost
        if has_cost:
            cost = exp.get_cost()
            values.append(f"{cost:.4f}" if cost is not None else "---")

        # Other metrics
        for m in other_metrics:
            val = exp.get_metric(m)
            values.append(f"{val:.4f}" if val is not None else "---")

        values.append(str(exp.sample_count))
        values.append(str(exp.failure_count))

        lines.append(" & ".join(values) + " \\\\")

    lines.append("\\bottomrule")
    lines.append("\\end{tabular}")
    lines.append("\\end{table}")

    return "\n".join(lines)


def _generate_csv_leaderboard(
    ranked,
    metric: str,
    include_cost: bool,
    include_metadata: list[str] | None,
    comparison,
) -> str:
    """Generate CSV leaderboard."""
    import csv
    import io

    output = io.StringIO()

    # Build header
    headers = ["rank", "run_id", metric]

    if include_metadata:
        headers.extend(include_metadata)

    has_cost = include_cost and any(exp.get_cost() is not None for exp in ranked)
    if has_cost:
        headers.append("cost")

    other_metrics = [m for m in comparison.metrics if m != metric]
    headers.extend(other_metrics)
    headers.extend(["sample_count", "failure_count"])

    writer = csv.DictWriter(output, fieldnames=headers)
    writer.writeheader()

    # Write rows
    for rank, exp in enumerate(ranked, 1):
        row = {"rank": rank, "run_id": exp.run_id}

        # Primary metric
        val = exp.get_metric(metric)
        row[metric] = val if val is not None else ""

        # Metadata
        if include_metadata:
            for field in include_metadata:
                row[field] = exp.metadata.get(field, "")

        # Cost
        if has_cost:
            cost = exp.get_cost()
            row["cost"] = cost if cost is not None else ""

        # Other metrics
        for m in other_metrics:
            val = exp.get_metric(m)
            row[m] = val if val is not None else ""

        row["sample_count"] = exp.sample_count
        row["failure_count"] = exp.failure_count

        writer.writerow(row)

    return output.getvalue()


__all__ = ["leaderboard_command"]

themis/cli/commands/math_benchmarks.py (new file)
@@ -0,0 +1,318 @@
"""Math benchmark command implementations."""

from __future__ import annotations

from pathlib import Path
from typing import Annotated, Callable, Literal, Sequence

from cyclopts import Parameter

from themis.cli.utils import effective_total, export_outputs
from themis.datasets import (
    competition_math as competition_math_dataset,
)
from themis.datasets import (
    math500 as math500_dataset,
)
from themis.experiment import math as math_experiment
from themis.experiment import storage as experiment_storage
from themis.utils.logging_utils import configure_logging
from themis.utils.progress import ProgressReporter


def load_math_dataset(
    *,
    source: Literal["huggingface", "local"],
    data_dir: Path | None,
    limit: int | None,
    subjects: Sequence[str] | None,
    split: str = "test",
):
    """Load MATH-500 dataset.

    Args:
        source: Dataset source (huggingface or local)
        data_dir: Directory containing local dataset
        limit: Max rows to load
        subjects: Subjects to filter
        split: Dataset split

    Returns:
        List of generation examples
    """
    if source == "local":
        if data_dir is None:
            raise ValueError(
                "The --data-dir option is required when --source=local so Themis "
                "knows where to read the dataset."
            )
        samples = math500_dataset.load_math500(
            source="local",
            data_dir=data_dir,
            split=split,
            limit=limit,
            subjects=subjects,
        )
    else:
        samples = math500_dataset.load_math500(
            source="huggingface",
            split=split,
            limit=limit,
            subjects=subjects,
        )
    return [sample.to_generation_example() for sample in samples]


def load_competition_math_dataset(
    *,
    dataset: str,
    subset: str | None,
    source: Literal["huggingface", "local"],
    data_dir: Path | None,
    split: str,
    limit: int | None,
    subjects: Sequence[str] | None,
):
    """Load competition math dataset.

    Args:
        dataset: Dataset name
        subset: Dataset subset
        source: Dataset source
        data_dir: Directory containing local dataset
        split: Dataset split
        limit: Max rows to load
        subjects: Subjects to filter

    Returns:
        List of generation examples
    """
    if source == "local" and data_dir is None:
        raise ValueError(
            "The --data-dir option is required when --source=local so Themis "
            "knows where to read the dataset."
        )
    samples = competition_math_dataset.load_competition_math(
        dataset=dataset,
        subset=subset,
        source=source,
        data_dir=data_dir,
        split=split,
        limit=limit,
        subjects=subjects,
    )
    return [sample.to_generation_example() for sample in samples]


def run_math_benchmark(
    rows: Sequence[dict[str, object]],
    *,
    max_samples: int | None,
    storage: Path | None,
    run_id: str | None,
    resume: bool,
    temperature: float,
    csv_output: Path | None,
    html_output: Path | None,
    json_output: Path | None,
    title: str,
    task_name: str,
) -> int:
    """Run math benchmark experiment.

    Args:
        rows: Dataset rows
        max_samples: Maximum samples to run
        storage: Cache directory
        run_id: Run identifier
        resume: Whether to resume from cache
        temperature: Sampling temperature
        csv_output: CSV export path
        html_output: HTML export path
        json_output: JSON export path
        title: Experiment title
        task_name: Task name

    Returns:
        Exit code
    """
    storage_impl = experiment_storage.ExperimentStorage(storage) if storage else None
    experiment = math_experiment.build_math500_zero_shot_experiment(
        temperature=temperature,
        storage=storage_impl,
        task_name=task_name,
    )

    total = effective_total(len(rows), max_samples)
    with ProgressReporter(total=total, description="Generating") as progress:
        report = experiment.run(
            rows,
            max_samples=max_samples,
            run_id=run_id,
            resume=resume,
            on_result=progress.on_result,
        )
    print(math_experiment.summarize_report(report))
    export_outputs(
        report,
        csv_output=csv_output,
        html_output=html_output,
        json_output=json_output,
        title=f"{title} experiment",
    )
    return 0


def math500_command(
    *,
    source: Annotated[
        Literal["huggingface", "local"], Parameter(help="Dataset source")
    ] = "huggingface",
    data_dir: Annotated[
        Path | None, Parameter(help="Directory containing local dataset")
    ] = None,
    limit: Annotated[int | None, Parameter(help="Max rows to load")] = None,
    subjects: Annotated[tuple[str, ...], Parameter(help="Subjects to filter")] = (),
    max_samples: Annotated[int | None, Parameter(help="Maximum samples to run")] = None,
    storage: Annotated[
        Path | None, Parameter(help="Cache directory for datasets/results")
    ] = None,
    run_id: Annotated[str | None, Parameter(help="Identifier for cached run")] = None,
    resume: Annotated[
        bool, Parameter(help="Reuse cached generations when storage is set")
    ] = True,
    temperature: Annotated[float, Parameter(help="Sampling temperature")] = 0.0,
    log_level: Annotated[
        str, Parameter(help="Logging level (critical/error/warning/info/debug/trace)")
    ] = "info",
    csv_output: Annotated[
        Path | None, Parameter(help="Write CSV export to this path")
    ] = None,
    html_output: Annotated[
        Path | None, Parameter(help="Write HTML summary to this path")
    ] = None,
    json_output: Annotated[
        Path | None, Parameter(help="Write JSON export to this path")
    ] = None,
) -> int:
    """Run the zero-shot MATH-500 evaluation."""
    configure_logging(log_level)
    subject_filter = list(subjects) if subjects else None
    rows = load_math_dataset(
        source=source,
        data_dir=data_dir,
        limit=limit,
        subjects=subject_filter,
    )
    return run_math_benchmark(
        rows,
        max_samples=max_samples,
        storage=storage,
        run_id=run_id,
        resume=resume,
        temperature=temperature,
        csv_output=csv_output,
        html_output=html_output,
        json_output=json_output,
        title="math500",
        task_name="math500",
    )


def _create_competition_math_command(
    dataset_name: str,
    dataset_id: str,
    subset: str | None = None,
) -> Callable:
    """Create a competition math command function.

    Args:
        dataset_name: Display name for the dataset
        dataset_id: HuggingFace dataset ID
        subset: Optional dataset subset

    Returns:
        Command function
    """

    def command(
        *,
        source: Annotated[
            Literal["huggingface", "local"], Parameter(help="Dataset source")
        ] = "huggingface",
        split: Annotated[str, Parameter(help="Dataset split to load")] = "test",
        data_dir: Annotated[
            Path | None, Parameter(help="Directory containing local dataset")
        ] = None,
        limit: Annotated[int | None, Parameter(help="Max rows to load")] = None,
        subjects: Annotated[
            tuple[str, ...], Parameter(help="Optional subject filters")
        ] = (),
        max_samples: Annotated[
            int | None, Parameter(help="Maximum samples to run")
        ] = None,
        storage: Annotated[
            Path | None, Parameter(help="Cache directory for datasets/results")
        ] = None,
        run_id: Annotated[
            str | None, Parameter(help="Identifier for cached run")
        ] = None,
        resume: Annotated[
            bool, Parameter(help="Reuse cached generations when storage is set")
        ] = True,
        temperature: Annotated[float, Parameter(help="Sampling temperature")] = 0.0,
        log_level: Annotated[
            str,
            Parameter(help="Logging level (critical/error/warning/info/debug/trace)"),
        ] = "info",
        csv_output: Annotated[
            Path | None, Parameter(help="Write CSV export to this path")
        ] = None,
        html_output: Annotated[
            Path | None, Parameter(help="Write HTML summary to this path")
        ] = None,
        json_output: Annotated[
            Path | None, Parameter(help="Write JSON export to this path")
        ] = None,
    ) -> int:
        f"""Run the {dataset_name} benchmark."""
        configure_logging(log_level)
        subject_filter = list(subjects) if subjects else None
        rows = load_competition_math_dataset(
            dataset=dataset_id,
            subset=subset,
            source=source,
            data_dir=data_dir,
            split=split,
            limit=limit,
            subjects=subject_filter,
        )

        return run_math_benchmark(
            rows,
            max_samples=max_samples,
            storage=storage,
            run_id=run_id,
            resume=resume,
            temperature=temperature,
            csv_output=csv_output,
            html_output=html_output,
            json_output=json_output,
            title=dataset_name,
            task_name=dataset_name,
        )

    command.__doc__ = f"Run the {dataset_name} benchmark."
    return command


# Create specific competition math commands
aime24_command = _create_competition_math_command("aime24", "math-ai/aime24")
aime25_command = _create_competition_math_command("aime25", "math-ai/aime25")
amc23_command = _create_competition_math_command("amc23", "math-ai/amc23")
olympiadbench_command = _create_competition_math_command(
    "olympiadbench", "math-ai/olympiadbench"
)
beyond_aime_command = _create_competition_math_command(
    "beyondaime", "ByteDance-Seed/BeyondAIME"
)