themis-eval 0.2.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +5 -2
- themis/_version.py +14 -1
- themis/api.py +83 -145
- themis/backends/storage.py +5 -0
- themis/cli/commands/info.py +2 -11
- themis/cli/main.py +231 -40
- themis/comparison/engine.py +7 -13
- themis/core/entities.py +4 -0
- themis/evaluation/metric_pipeline.py +12 -0
- themis/evaluation/pipeline.py +22 -0
- themis/evaluation/pipelines/__init__.py +4 -0
- themis/evaluation/pipelines/composable_pipeline.py +55 -0
- themis/evaluation/pipelines/standard_pipeline.py +18 -1
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +5 -2
- themis/evaluation/strategies/judge_evaluation_strategy.py +6 -1
- themis/experiment/__init__.py +2 -2
- themis/experiment/cache_manager.py +15 -1
- themis/experiment/definitions.py +1 -1
- themis/experiment/orchestrator.py +21 -11
- themis/experiment/share.py +264 -0
- themis/experiment/storage.py +345 -298
- themis/generation/plan.py +28 -6
- themis/generation/router.py +22 -4
- themis/generation/runner.py +16 -1
- themis/presets/benchmarks.py +602 -17
- themis/server/app.py +38 -26
- themis/session.py +125 -0
- themis/specs/__init__.py +7 -0
- themis/specs/execution.py +26 -0
- themis/specs/experiment.py +33 -0
- themis/specs/storage.py +18 -0
- themis/storage/__init__.py +6 -0
- themis/storage/experiment_storage.py +7 -0
- {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
- {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/RECORD +38 -31
- {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
- themis/experiment/builder.py +0 -151
- themis/experiment/export_csv.py +0 -159
- {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
themis/presets/benchmarks.py
CHANGED
@@ -6,6 +6,7 @@ including prompts, metrics, extractors, and data loaders.
 
 from __future__ import annotations
 
+import string
 from dataclasses import dataclass, field
 from typing import Any, Callable, Sequence
 
@@ -56,6 +57,85 @@ _BENCHMARK_REGISTRY: dict[str, BenchmarkPreset] = {}
 _REGISTRY_INITIALIZED = False
 
 
+def _to_dict_samples(samples: Sequence[Any]) -> list[dict[str, Any]]:
+    return [
+        sample.to_generation_example()
+        if hasattr(sample, "to_generation_example")
+        else dict(sample)
+        for sample in samples
+    ]
+
+
+def _format_mcq_options(choices: Sequence[str], labels: Sequence[str]) -> str:
+    return "\n".join(
+        f"{label}. {choice}" for label, choice in zip(labels, choices)
+    )
+
+
+def _normalize_mcq_answer(
+    answer: Any,
+    choices: Sequence[str],
+    labels: Sequence[str],
+) -> str:
+    if answer is None:
+        return ""
+    if isinstance(answer, bool):
+        return str(answer)
+    if isinstance(answer, (int, float)):
+        index = int(answer)
+        if 1 <= index <= len(choices):
+            return labels[index - 1]
+        if 0 <= index < len(choices):
+            return labels[index]
+    text = str(answer).strip()
+    if not text:
+        return ""
+    lowered = text.lower()
+    if lowered.startswith("option "):
+        text = text.split(" ", 1)[-1].strip()
+    if lowered.startswith("choice "):
+        text = text.split(" ", 1)[-1].strip()
+    if len(text) >= 2 and text[1] in {".", ")", ":", "-"}:
+        text = text[0]
+    if len(text) == 1 and text.isalpha():
+        letter = text.upper()
+        if letter in labels:
+            return letter
+    for idx, choice in enumerate(choices):
+        if text == str(choice).strip():
+            return labels[idx]
+    for idx, choice in enumerate(choices):
+        if text.lower() == str(choice).strip().lower():
+            return labels[idx]
+    return text
+
+
+def _normalize_mcq_samples(samples: Sequence[dict[str, Any]]) -> list[dict[str, Any]]:
+    normalized: list[dict[str, Any]] = []
+    for sample in samples:
+        row = dict(sample)
+        choices = row.get("choices") or row.get("options")
+        if not isinstance(choices, (list, tuple)):
+            normalized.append(row)
+            continue
+        choices_list = [str(choice) for choice in choices]
+        labels = row.get("choice_labels")
+        if isinstance(labels, (list, tuple)) and labels:
+            labels_list = [str(label) for label in labels][: len(choices_list)]
+        else:
+            labels_list = list(string.ascii_uppercase[: len(choices_list)])
+        row["choices"] = choices_list
+        row["choice_labels"] = labels_list
+        row["options"] = _format_mcq_options(choices_list, labels_list)
+        row["answer"] = _normalize_mcq_answer(
+            row.get("answer"),
+            choices_list,
+            labels_list,
+        )
+        normalized.append(row)
+    return normalized
+
+
 def _ensure_registry_initialized() -> None:
     """Initialize benchmark registry on first use (lazy loading)."""
     global _REGISTRY_INITIALIZED
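The hunk above introduces the MCQ helpers that the new multiple-choice presets below rely on. A minimal behavioral sketch, assuming these module-private names remain importable from `themis.presets.benchmarks` (they are not documented API):

```python
# Illustration only; _normalize_mcq_answer and _normalize_mcq_samples are
# private helpers shown in the hunk above, not public API.
from themis.presets.benchmarks import _normalize_mcq_answer, _normalize_mcq_samples

choices = ["Paris", "Rome", "Madrid"]
labels = ["A", "B", "C"]

# Numeric answers are read as 1-based indices first, falling back to 0-based.
assert _normalize_mcq_answer(2, choices, labels) == "B"
# Letter-style answers ("b)", "Option C") are stripped down to a label.
assert _normalize_mcq_answer("b)", choices, labels) == "B"
assert _normalize_mcq_answer("Option C", choices, labels) == "C"
# Full answer text is matched against the choices, case-insensitively.
assert _normalize_mcq_answer("rome", choices, labels) == "B"

# _normalize_mcq_samples adds choice_labels, a rendered options block,
# and a letter-form answer to any row carrying a choices/options list.
[row] = _normalize_mcq_samples(
    [{"question": "Capital of Italy?", "choices": choices, "answer": 2}]
)
assert row["answer"] == "B"
assert row["options"] == "A. Paris\nB. Rome\nC. Madrid"
```

Rows without a `choices`/`options` list pass through `_normalize_mcq_samples` unchanged.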
@@ -119,8 +199,7 @@ def _create_math500_preset() -> BenchmarkPreset:
 
     def load_math500(limit: int | None = None) -> Sequence[dict[str, Any]]:
         samples = load_math500_dataset(source="huggingface", limit=limit)
-
-        return [s.to_generation_example() if hasattr(s, 'to_generation_example') else dict(s) for s in samples]
+        return _to_dict_samples(samples)
 
     prompt_template = PromptTemplate(
         name="math500-zero-shot",
@@ -153,8 +232,7 @@ def _create_gsm8k_preset() -> BenchmarkPreset:
 
     def load_gsm8k(limit: int | None = None) -> Sequence[dict[str, Any]]:
         samples = load_gsm8k_dataset(source="huggingface", split="test", limit=limit)
-
-        return [dict(s) if not isinstance(s, dict) else s for s in samples]
+        return _to_dict_samples(samples)
 
     prompt_template = PromptTemplate(
         name="gsm8k-zero-shot",
@@ -173,7 +251,7 @@ def _create_gsm8k_preset() -> BenchmarkPreset:
         dataset_loader=load_gsm8k,
         metadata_fields=(),
         reference_field="answer",
-        dataset_id_field="
+        dataset_id_field="unique_id",
         description="GSM8K dataset with grade school math word problems",
     )
 
@@ -186,12 +264,12 @@ def _create_aime24_preset() -> BenchmarkPreset:
 
     def load_aime24(limit: int | None = None) -> Sequence[dict[str, Any]]:
         samples = load_competition_math(
-
+            dataset="math-ai/aime24",
             source="huggingface",
             split="test",
             limit=limit,
         )
-        return
+        return _to_dict_samples(samples)
 
     prompt_template = PromptTemplate(
         name="aime24-zero-shot",
@@ -211,11 +289,201 @@ def _create_aime24_preset() -> BenchmarkPreset:
         dataset_loader=load_aime24,
         metadata_fields=("subject",),
         reference_field="answer",
-        dataset_id_field="
+        dataset_id_field="unique_id",
         description="AIME 2024 competition math problems",
     )
 
 
+def _create_gsm_symbolic_preset() -> BenchmarkPreset:
+    """Create GSM-Symbolic benchmark preset."""
+    from themis.datasets.gsm_symbolic import (
+        load_gsm_symbolic as load_gsm_symbolic_dataset,
+    )
+    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+    def load_gsm_symbolic(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_gsm_symbolic_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="gsm-symbolic-zero-shot",
+        template=(
+            "Solve this math problem step by step.\n\n"
+            "Q: {question}\n"
+            "A:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="gsm-symbolic",
+        prompt_template=prompt_template,
+        metrics=[MathVerifyAccuracy()],
+        extractor=MathVerifyExtractor(),
+        dataset_loader=load_gsm_symbolic,
+        metadata_fields=(),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="GSM-Symbolic dataset for algebraic word problems",
+    )
+
+
+def _create_aime25_preset() -> BenchmarkPreset:
+    """Create AIME 2025 benchmark preset."""
+    from themis.datasets.competition_math import load_competition_math
+    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+    def load_aime25(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_competition_math(
+            dataset="math-ai/aime25",
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="aime25-zero-shot",
+        template=(
+            "Solve the following AIME problem. "
+            "Your answer should be a number between 000 and 999.\n\n"
+            "Problem: {problem}\n\n"
+            "Solution:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="aime25",
+        prompt_template=prompt_template,
+        metrics=[MathVerifyAccuracy()],
+        extractor=MathVerifyExtractor(),
+        dataset_loader=load_aime25,
+        metadata_fields=("subject", "level"),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="AIME 2025 competition math problems",
+    )
+
+
+def _create_amc23_preset() -> BenchmarkPreset:
+    """Create AMC 2023 benchmark preset."""
+    from themis.datasets.competition_math import load_competition_math
+    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+    def load_amc23(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_competition_math(
+            dataset="math-ai/amc23",
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="amc23-zero-shot",
+        template=(
+            "Solve the following AMC problem. "
+            "Give only the final answer.\n\n"
+            "Problem: {problem}\n\n"
+            "Answer:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="amc23",
+        prompt_template=prompt_template,
+        metrics=[MathVerifyAccuracy()],
+        extractor=MathVerifyExtractor(),
+        dataset_loader=load_amc23,
+        metadata_fields=("subject", "level"),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="AMC 2023 competition math problems",
+    )
+
+
+def _create_olympiadbench_preset() -> BenchmarkPreset:
+    """Create OlympiadBench benchmark preset."""
+    from themis.datasets.competition_math import load_competition_math
+    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+    def load_olympiadbench(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_competition_math(
+            dataset="math-ai/olympiadbench",
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="olympiadbench-zero-shot",
+        template=(
+            "Solve the following olympiad-style math problem. "
+            "Show reasoning briefly, then give the final answer.\n\n"
+            "Problem: {problem}\n\n"
+            "Solution:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="olympiadbench",
+        prompt_template=prompt_template,
+        metrics=[MathVerifyAccuracy()],
+        extractor=MathVerifyExtractor(),
+        dataset_loader=load_olympiadbench,
+        metadata_fields=("subject", "level"),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="OlympiadBench competition math benchmark",
+    )
+
+
+def _create_beyondaime_preset() -> BenchmarkPreset:
+    """Create BeyondAIME benchmark preset."""
+    from themis.datasets.competition_math import load_competition_math
+    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+    def load_beyondaime(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_competition_math(
+            dataset="ByteDance-Seed/BeyondAIME",
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="beyondaime-zero-shot",
+        template=(
+            "Solve the following advanced contest math problem. "
+            "Provide the final answer clearly.\n\n"
+            "Problem: {problem}\n\n"
+            "Answer:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="beyondaime",
+        prompt_template=prompt_template,
+        metrics=[MathVerifyAccuracy()],
+        extractor=MathVerifyExtractor(),
+        dataset_loader=load_beyondaime,
+        metadata_fields=("subject", "level"),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="BeyondAIME advanced competition math problems",
+    )
+
+
 # ============================================================================
 # MCQ Benchmarks
 # ============================================================================
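The five presets added above all follow the same construction pattern. A hedged sketch of that pattern for a hypothetical extra contest dataset; the dataset id `my-org/my-contest` and the preset name are invented, and the imports assume these symbols remain importable from the modules this hunk uses:

```python
# Sketch only: mirrors the pattern in the hunk above with hypothetical names.
from __future__ import annotations

from typing import Any, Sequence

from themis.datasets.competition_math import load_competition_math
from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
from themis.presets.benchmarks import (
    BenchmarkPreset,
    PromptTemplate,
    _to_dict_samples,
    register_benchmark,
)


def _create_my_contest_preset() -> BenchmarkPreset:
    """Hypothetical preset following the aime25/amc23/olympiadbench pattern."""

    def load_my_contest(limit: int | None = None) -> Sequence[dict[str, Any]]:
        samples = load_competition_math(
            dataset="my-org/my-contest",  # hypothetical Hugging Face dataset id
            source="huggingface",
            split="test",
            limit=limit,
        )
        return _to_dict_samples(samples)

    prompt_template = PromptTemplate(
        name="my-contest-zero-shot",
        template="Solve the problem.\n\nProblem: {problem}\n\nAnswer:",
    )

    return BenchmarkPreset(
        name="my-contest",
        prompt_template=prompt_template,
        metrics=[MathVerifyAccuracy()],
        extractor=MathVerifyExtractor(),
        dataset_loader=load_my_contest,
        metadata_fields=("subject", "level"),
        reference_field="answer",
        dataset_id_field="unique_id",
        description="Hypothetical contest-math preset (illustration only)",
    )


register_benchmark(_create_my_contest_preset())
```

In the package itself each new preset is wired up by a single extra `register_benchmark` call in `_register_all_benchmarks` (last hunk below).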
@@ -228,7 +496,7 @@ def _create_mmlu_pro_preset() -> BenchmarkPreset:
 
     def load_mmlu_pro(limit: int | None = None) -> Sequence[dict[str, Any]]:
         samples = load_mmlu_pro_dataset(source="huggingface", split="test", limit=limit)
-        return
+        return _normalize_mcq_samples(_to_dict_samples(samples))
 
     prompt_template = PromptTemplate(
         name="mmlu-pro-zero-shot",
@@ -236,7 +504,7 @@ def _create_mmlu_pro_preset() -> BenchmarkPreset:
             "Answer the following multiple choice question.\n\n"
             "Question: {question}\n\n"
             "Options:\n{options}\n\n"
-            "Answer:"
+            "Answer (letter):"
         ),
     )
 
@@ -246,9 +514,9 @@ def _create_mmlu_pro_preset() -> BenchmarkPreset:
         metrics=[ExactMatch()],
         extractor=IdentityExtractor(),
         dataset_loader=load_mmlu_pro,
-        metadata_fields=("
+        metadata_fields=("subject",),
         reference_field="answer",
-        dataset_id_field="
+        dataset_id_field="unique_id",
         description="MMLU-Pro professional-level multiple choice questions",
     )
 
@@ -260,16 +528,20 @@ def _create_supergpqa_preset() -> BenchmarkPreset:
     from themis.evaluation.metrics.exact_match import ExactMatch
 
     def load_supergpqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
-        samples = load_supergpqa_dataset(
-
+        samples = load_supergpqa_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
 
     prompt_template = PromptTemplate(
         name="supergpqa-zero-shot",
         template=(
             "Answer the following science question.\n\n"
             "Question: {question}\n\n"
-            "Choices:\n{
-            "Answer:"
+            "Choices:\n{options}\n\n"
+            "Answer (letter):"
         ),
     )
 
@@ -281,11 +553,311 @@ def _create_supergpqa_preset() -> BenchmarkPreset:
         dataset_loader=load_supergpqa,
         metadata_fields=("subject",),
         reference_field="answer",
-        dataset_id_field="
+        dataset_id_field="unique_id",
         description="SuperGPQA graduate-level science questions",
     )
 
 
+def _create_gpqa_preset() -> BenchmarkPreset:
+    """Create GPQA benchmark preset."""
+    from themis.datasets.gpqa import load_gpqa as load_gpqa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_gpqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_gpqa_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+            subset="default",
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="gpqa-zero-shot",
+        template=(
+            "Answer the following question.\n\n"
+            "Question: {question}\n\n"
+            "Choices:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="gpqa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_gpqa,
+        metadata_fields=("subject",),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="GPQA graduate-level science questions",
+    )
+
+
+def _create_medmcqa_preset() -> BenchmarkPreset:
+    """Create MedMCQA benchmark preset."""
+    from themis.datasets.medmcqa import load_medmcqa as load_medmcqa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_medmcqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_medmcqa_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="medmcqa-zero-shot",
+        template=(
+            "Answer the following medical multiple choice question.\n\n"
+            "Question: {question}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="medmcqa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_medmcqa,
+        metadata_fields=("subject",),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="MedMCQA medical entrance exam questions",
+    )
+
+
+def _create_med_qa_preset() -> BenchmarkPreset:
+    """Create MedQA benchmark preset."""
+    from themis.datasets.med_qa import load_med_qa as load_med_qa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_med_qa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_med_qa_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="med-qa-zero-shot",
+        template=(
+            "Answer the following medical multiple choice question.\n\n"
+            "Question: {question}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="med_qa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_med_qa,
+        metadata_fields=("subject",),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="MedQA multiple choice medical QA benchmark",
+    )
+
+
+def _create_sciq_preset() -> BenchmarkPreset:
+    """Create SciQ benchmark preset."""
+    from themis.datasets.sciq import load_sciq as load_sciq_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_sciq(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_sciq_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="sciq-zero-shot",
+        template=(
+            "Answer the following science question.\n\n"
+            "Question: {question}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="sciq",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_sciq,
+        metadata_fields=(),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="SciQ science multiple choice questions",
+    )
+
+
+def _create_commonsense_qa_preset() -> BenchmarkPreset:
+    """Create CommonsenseQA benchmark preset."""
+    from themis.datasets.commonsense_qa import (
+        load_commonsense_qa as load_commonsense_qa_dataset,
+    )
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_commonsense_qa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_commonsense_qa_dataset(
+            source="huggingface",
+            split="validation",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="commonsense-qa-zero-shot",
+        template=(
+            "Answer the following commonsense question.\n\n"
+            "Question: {question}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="commonsense_qa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_commonsense_qa,
+        metadata_fields=("concept",),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="CommonsenseQA multiple choice reasoning benchmark",
+    )
+
+
+def _create_piqa_preset() -> BenchmarkPreset:
+    """Create PIQA benchmark preset."""
+    from themis.datasets.piqa import load_piqa as load_piqa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_piqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_piqa_dataset(
+            source="huggingface",
+            split="validation",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="piqa-zero-shot",
+        template=(
+            "Choose the best answer for the goal.\n\n"
+            "Goal: {goal}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="piqa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_piqa,
+        metadata_fields=(),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="PIQA physical commonsense reasoning benchmark",
+    )
+
+
+def _create_social_i_qa_preset() -> BenchmarkPreset:
+    """Create Social IQA benchmark preset."""
+    from themis.datasets.social_i_qa import load_social_i_qa as load_social_i_qa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_social_i_qa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_social_i_qa_dataset(
+            source="huggingface",
+            split="validation",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="social-iqa-zero-shot",
+        template=(
+            "Answer the question based on the social context.\n\n"
+            "Context: {context}\n"
+            "Question: {question}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="social_i_qa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_social_i_qa,
+        metadata_fields=(),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="Social IQA commonsense reasoning benchmark",
+    )
+
+
+def _create_coqa_preset() -> BenchmarkPreset:
+    """Create CoQA benchmark preset."""
+    from themis.datasets.coqa import load_coqa as load_coqa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_coqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_coqa_dataset(
+            source="huggingface",
+            split="validation",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="coqa-zero-shot",
+        template=(
+            "Answer the question based on the passage.\n\n"
+            "Passage: {story}\n\n"
+            "Question: {question}\n"
+            "Answer:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="coqa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_coqa,
+        metadata_fields=("turn",),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="CoQA conversational question answering benchmark",
+    )
+
+
 # ============================================================================
 # Demo/Test Benchmarks
 # ============================================================================
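Each MCQ preset above renders its prompt from a row normalized by `_normalize_mcq_samples` and scores the model's letter with `ExactMatch`. A small illustration of that flow, using plain `str.format` on the MedMCQA-style template (how `PromptTemplate` itself renders is not shown in this diff):

```python
# Illustration only; _format_mcq_options is the private helper shown earlier.
from themis.presets.benchmarks import _format_mcq_options

template = (
    "Answer the following medical multiple choice question.\n\n"
    "Question: {question}\n\n"
    "Options:\n{options}\n\n"
    "Answer (letter):"
)

row = {
    "question": "Which vitamin is synthesized in the skin?",
    "choices": ["Vitamin A", "Vitamin C", "Vitamin D"],
    "choice_labels": ["A", "B", "C"],
}
row["options"] = _format_mcq_options(row["choices"], row["choice_labels"])

# Fill the zero-shot template the same way the preset's prompt does.
print(template.format(question=row["question"], options=row["options"]))
# The reference answer is normalized to the same letter form ("C" here),
# so ExactMatch can compare the extracted model answer against it directly.
```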
@@ -337,10 +909,23 @@ def _register_all_benchmarks() -> None:
     register_benchmark(_create_math500_preset())
     register_benchmark(_create_gsm8k_preset())
     register_benchmark(_create_aime24_preset())
+    register_benchmark(_create_aime25_preset())
+    register_benchmark(_create_amc23_preset())
+    register_benchmark(_create_olympiadbench_preset())
+    register_benchmark(_create_beyondaime_preset())
+    register_benchmark(_create_gsm_symbolic_preset())
 
     # MCQ benchmarks
     register_benchmark(_create_mmlu_pro_preset())
     register_benchmark(_create_supergpqa_preset())
+    register_benchmark(_create_gpqa_preset())
+    register_benchmark(_create_medmcqa_preset())
+    register_benchmark(_create_med_qa_preset())
+    register_benchmark(_create_sciq_preset())
+    register_benchmark(_create_commonsense_qa_preset())
+    register_benchmark(_create_piqa_preset())
+    register_benchmark(_create_social_i_qa_preset())
+    register_benchmark(_create_coqa_preset())
 
     # Demo
     register_benchmark(_create_demo_preset())
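In total, 1.0.0 registers thirteen additional benchmark presets through this lazy registry. A rough lookup sketch against the module internals visible in this diff (any public lookup API lives outside this file and may differ):

```python
# Rough sketch using private module state shown in this diff; not a
# documented API. _ensure_registry_initialized is assumed to trigger
# _register_all_benchmarks on first use, per its docstring above.
from themis.presets import benchmarks

benchmarks._ensure_registry_initialized()
preset = benchmarks._BENCHMARK_REGISTRY["aime25"]
print(preset.name, "-", preset.description)

new_in_1_0_0 = [
    "gsm-symbolic", "aime25", "amc23", "olympiadbench", "beyondaime",
    "gpqa", "medmcqa", "med_qa", "sciq",
    "commonsense_qa", "piqa", "social_i_qa", "coqa",
]
assert all(name in benchmarks._BENCHMARK_REGISTRY for name in new_in_1_0_0)
```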