themis-eval 0.2.3__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
Files changed (37)
  1. themis/__init__.py +5 -2
  2. themis/_version.py +14 -1
  3. themis/api.py +83 -145
  4. themis/backends/storage.py +5 -0
  5. themis/cli/commands/info.py +2 -11
  6. themis/cli/main.py +231 -40
  7. themis/comparison/engine.py +7 -13
  8. themis/core/entities.py +4 -0
  9. themis/evaluation/metric_pipeline.py +12 -0
  10. themis/evaluation/pipeline.py +22 -0
  11. themis/evaluation/pipelines/__init__.py +4 -0
  12. themis/evaluation/pipelines/composable_pipeline.py +55 -0
  13. themis/evaluation/pipelines/standard_pipeline.py +16 -0
  14. themis/experiment/__init__.py +2 -2
  15. themis/experiment/cache_manager.py +15 -1
  16. themis/experiment/definitions.py +1 -1
  17. themis/experiment/orchestrator.py +21 -11
  18. themis/experiment/share.py +264 -0
  19. themis/experiment/storage.py +345 -298
  20. themis/generation/router.py +22 -4
  21. themis/generation/runner.py +16 -1
  22. themis/presets/benchmarks.py +602 -17
  23. themis/server/app.py +38 -26
  24. themis/session.py +125 -0
  25. themis/specs/__init__.py +7 -0
  26. themis/specs/execution.py +26 -0
  27. themis/specs/experiment.py +33 -0
  28. themis/specs/storage.py +18 -0
  29. themis/storage/__init__.py +6 -0
  30. themis/storage/experiment_storage.py +7 -0
  31. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
  32. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/RECORD +35 -28
  33. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
  34. themis/experiment/builder.py +0 -151
  35. themis/experiment/export_csv.py +0 -159
  36. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
  37. {themis_eval-0.2.3.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
@@ -6,6 +6,7 @@ including prompts, metrics, extractors, and data loaders.
 
 from __future__ import annotations
 
+import string
 from dataclasses import dataclass, field
 from typing import Any, Callable, Sequence
 
@@ -56,6 +57,85 @@ _BENCHMARK_REGISTRY: dict[str, BenchmarkPreset] = {}
 _REGISTRY_INITIALIZED = False
 
 
+def _to_dict_samples(samples: Sequence[Any]) -> list[dict[str, Any]]:
+    return [
+        sample.to_generation_example()
+        if hasattr(sample, "to_generation_example")
+        else dict(sample)
+        for sample in samples
+    ]
+
+
+def _format_mcq_options(choices: Sequence[str], labels: Sequence[str]) -> str:
+    return "\n".join(
+        f"{label}. {choice}" for label, choice in zip(labels, choices)
+    )
+
+
+def _normalize_mcq_answer(
+    answer: Any,
+    choices: Sequence[str],
+    labels: Sequence[str],
+) -> str:
+    if answer is None:
+        return ""
+    if isinstance(answer, bool):
+        return str(answer)
+    if isinstance(answer, (int, float)):
+        index = int(answer)
+        if 1 <= index <= len(choices):
+            return labels[index - 1]
+        if 0 <= index < len(choices):
+            return labels[index]
+    text = str(answer).strip()
+    if not text:
+        return ""
+    lowered = text.lower()
+    if lowered.startswith("option "):
+        text = text.split(" ", 1)[-1].strip()
+    if lowered.startswith("choice "):
+        text = text.split(" ", 1)[-1].strip()
+    if len(text) >= 2 and text[1] in {".", ")", ":", "-"}:
+        text = text[0]
+    if len(text) == 1 and text.isalpha():
+        letter = text.upper()
+        if letter in labels:
+            return letter
+    for idx, choice in enumerate(choices):
+        if text == str(choice).strip():
+            return labels[idx]
+    for idx, choice in enumerate(choices):
+        if text.lower() == str(choice).strip().lower():
+            return labels[idx]
+    return text
+
+
+def _normalize_mcq_samples(samples: Sequence[dict[str, Any]]) -> list[dict[str, Any]]:
+    normalized: list[dict[str, Any]] = []
+    for sample in samples:
+        row = dict(sample)
+        choices = row.get("choices") or row.get("options")
+        if not isinstance(choices, (list, tuple)):
+            normalized.append(row)
+            continue
+        choices_list = [str(choice) for choice in choices]
+        labels = row.get("choice_labels")
+        if isinstance(labels, (list, tuple)) and labels:
+            labels_list = [str(label) for label in labels][: len(choices_list)]
+        else:
+            labels_list = list(string.ascii_uppercase[: len(choices_list)])
+        row["choices"] = choices_list
+        row["choice_labels"] = labels_list
+        row["options"] = _format_mcq_options(choices_list, labels_list)
+        row["answer"] = _normalize_mcq_answer(
+            row.get("answer"),
+            choices_list,
+            labels_list,
+        )
+        normalized.append(row)
+    return normalized
+
+
 def _ensure_registry_initialized() -> None:
     """Initialize benchmark registry on first use (lazy loading)."""
     global _REGISTRY_INITIALIZED
@@ -119,8 +199,7 @@ def _create_math500_preset() -> BenchmarkPreset:
 
     def load_math500(limit: int | None = None) -> Sequence[dict[str, Any]]:
         samples = load_math500_dataset(source="huggingface", limit=limit)
-        # Convert MathSample objects to dicts
-        return [s.to_generation_example() if hasattr(s, 'to_generation_example') else dict(s) for s in samples]
+        return _to_dict_samples(samples)
 
     prompt_template = PromptTemplate(
         name="math500-zero-shot",
@@ -153,8 +232,7 @@ def _create_gsm8k_preset() -> BenchmarkPreset:
 
     def load_gsm8k(limit: int | None = None) -> Sequence[dict[str, Any]]:
         samples = load_gsm8k_dataset(source="huggingface", split="test", limit=limit)
-        # Convert sample objects to dicts if needed
-        return [dict(s) if not isinstance(s, dict) else s for s in samples]
+        return _to_dict_samples(samples)
 
     prompt_template = PromptTemplate(
         name="gsm8k-zero-shot",
@@ -173,7 +251,7 @@ def _create_gsm8k_preset() -> BenchmarkPreset:
         dataset_loader=load_gsm8k,
         metadata_fields=(),
         reference_field="answer",
-        dataset_id_field="id",
+        dataset_id_field="unique_id",
         description="GSM8K dataset with grade school math word problems",
     )
 
@@ -186,12 +264,12 @@ def _create_aime24_preset() -> BenchmarkPreset:
 
     def load_aime24(limit: int | None = None) -> Sequence[dict[str, Any]]:
         samples = load_competition_math(
-            dataset_id="aime24",
+            dataset="math-ai/aime24",
             source="huggingface",
             split="test",
             limit=limit,
         )
-        return [dict(s) if not isinstance(s, dict) else s for s in samples]
+        return _to_dict_samples(samples)
 
     prompt_template = PromptTemplate(
         name="aime24-zero-shot",
@@ -211,11 +289,201 @@ def _create_aime24_preset() -> BenchmarkPreset:
         dataset_loader=load_aime24,
         metadata_fields=("subject",),
         reference_field="answer",
-        dataset_id_field="id",
+        dataset_id_field="unique_id",
         description="AIME 2024 competition math problems",
     )
 
 
+def _create_gsm_symbolic_preset() -> BenchmarkPreset:
+    """Create GSM-Symbolic benchmark preset."""
+    from themis.datasets.gsm_symbolic import (
+        load_gsm_symbolic as load_gsm_symbolic_dataset,
+    )
+    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+    def load_gsm_symbolic(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_gsm_symbolic_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="gsm-symbolic-zero-shot",
+        template=(
+            "Solve this math problem step by step.\n\n"
+            "Q: {question}\n"
+            "A:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="gsm-symbolic",
+        prompt_template=prompt_template,
+        metrics=[MathVerifyAccuracy()],
+        extractor=MathVerifyExtractor(),
+        dataset_loader=load_gsm_symbolic,
+        metadata_fields=(),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="GSM-Symbolic dataset for algebraic word problems",
+    )
+
+
+def _create_aime25_preset() -> BenchmarkPreset:
+    """Create AIME 2025 benchmark preset."""
+    from themis.datasets.competition_math import load_competition_math
+    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+    def load_aime25(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_competition_math(
+            dataset="math-ai/aime25",
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="aime25-zero-shot",
+        template=(
+            "Solve the following AIME problem. "
+            "Your answer should be a number between 000 and 999.\n\n"
+            "Problem: {problem}\n\n"
+            "Solution:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="aime25",
+        prompt_template=prompt_template,
+        metrics=[MathVerifyAccuracy()],
+        extractor=MathVerifyExtractor(),
+        dataset_loader=load_aime25,
+        metadata_fields=("subject", "level"),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="AIME 2025 competition math problems",
+    )
+
+
+def _create_amc23_preset() -> BenchmarkPreset:
+    """Create AMC 2023 benchmark preset."""
+    from themis.datasets.competition_math import load_competition_math
+    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+    def load_amc23(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_competition_math(
+            dataset="math-ai/amc23",
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="amc23-zero-shot",
+        template=(
+            "Solve the following AMC problem. "
+            "Give only the final answer.\n\n"
+            "Problem: {problem}\n\n"
+            "Answer:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="amc23",
+        prompt_template=prompt_template,
+        metrics=[MathVerifyAccuracy()],
+        extractor=MathVerifyExtractor(),
+        dataset_loader=load_amc23,
+        metadata_fields=("subject", "level"),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="AMC 2023 competition math problems",
+    )
+
+
+def _create_olympiadbench_preset() -> BenchmarkPreset:
+    """Create OlympiadBench benchmark preset."""
+    from themis.datasets.competition_math import load_competition_math
+    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+    def load_olympiadbench(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_competition_math(
+            dataset="math-ai/olympiadbench",
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="olympiadbench-zero-shot",
+        template=(
+            "Solve the following olympiad-style math problem. "
+            "Show reasoning briefly, then give the final answer.\n\n"
+            "Problem: {problem}\n\n"
+            "Solution:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="olympiadbench",
+        prompt_template=prompt_template,
+        metrics=[MathVerifyAccuracy()],
+        extractor=MathVerifyExtractor(),
+        dataset_loader=load_olympiadbench,
+        metadata_fields=("subject", "level"),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="OlympiadBench competition math benchmark",
+    )
+
+
+def _create_beyondaime_preset() -> BenchmarkPreset:
+    """Create BeyondAIME benchmark preset."""
+    from themis.datasets.competition_math import load_competition_math
+    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+    def load_beyondaime(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_competition_math(
+            dataset="ByteDance-Seed/BeyondAIME",
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="beyondaime-zero-shot",
+        template=(
+            "Solve the following advanced contest math problem. "
+            "Provide the final answer clearly.\n\n"
+            "Problem: {problem}\n\n"
+            "Answer:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="beyondaime",
+        prompt_template=prompt_template,
+        metrics=[MathVerifyAccuracy()],
+        extractor=MathVerifyExtractor(),
+        dataset_loader=load_beyondaime,
+        metadata_fields=("subject", "level"),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="BeyondAIME advanced competition math problems",
+    )
+
+
 # ============================================================================
 # MCQ Benchmarks
 # ============================================================================
@@ -228,7 +496,7 @@ def _create_mmlu_pro_preset() -> BenchmarkPreset:
 
     def load_mmlu_pro(limit: int | None = None) -> Sequence[dict[str, Any]]:
         samples = load_mmlu_pro_dataset(source="huggingface", split="test", limit=limit)
-        return [dict(s) if not isinstance(s, dict) else s for s in samples]
+        return _normalize_mcq_samples(_to_dict_samples(samples))
 
     prompt_template = PromptTemplate(
         name="mmlu-pro-zero-shot",
@@ -236,7 +504,7 @@ def _create_mmlu_pro_preset() -> BenchmarkPreset:
             "Answer the following multiple choice question.\n\n"
            "Question: {question}\n\n"
             "Options:\n{options}\n\n"
-            "Answer:"
+            "Answer (letter):"
         ),
     )
 
@@ -246,9 +514,9 @@ def _create_mmlu_pro_preset() -> BenchmarkPreset:
         metrics=[ExactMatch()],
         extractor=IdentityExtractor(),
         dataset_loader=load_mmlu_pro,
-        metadata_fields=("category",),
+        metadata_fields=("subject",),
         reference_field="answer",
-        dataset_id_field="id",
+        dataset_id_field="unique_id",
         description="MMLU-Pro professional-level multiple choice questions",
     )
 
@@ -260,16 +528,20 @@ def _create_supergpqa_preset() -> BenchmarkPreset:
     from themis.evaluation.metrics.exact_match import ExactMatch
 
     def load_supergpqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
-        samples = load_supergpqa_dataset(source="huggingface", split="test", limit=limit)
-        return [dict(s) if not isinstance(s, dict) else s for s in samples]
+        samples = load_supergpqa_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
 
     prompt_template = PromptTemplate(
        name="supergpqa-zero-shot",
         template=(
             "Answer the following science question.\n\n"
             "Question: {question}\n\n"
-            "Choices:\n{choices}\n\n"
-            "Answer:"
+            "Choices:\n{options}\n\n"
+            "Answer (letter):"
         ),
     )
 
@@ -281,11 +553,311 @@ def _create_supergpqa_preset() -> BenchmarkPreset:
         dataset_loader=load_supergpqa,
         metadata_fields=("subject",),
         reference_field="answer",
-        dataset_id_field="id",
+        dataset_id_field="unique_id",
         description="SuperGPQA graduate-level science questions",
     )
 
 
+def _create_gpqa_preset() -> BenchmarkPreset:
+    """Create GPQA benchmark preset."""
+    from themis.datasets.gpqa import load_gpqa as load_gpqa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_gpqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_gpqa_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+            subset="default",
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="gpqa-zero-shot",
+        template=(
+            "Answer the following question.\n\n"
+            "Question: {question}\n\n"
+            "Choices:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="gpqa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_gpqa,
+        metadata_fields=("subject",),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="GPQA graduate-level science questions",
+    )
+
+
+def _create_medmcqa_preset() -> BenchmarkPreset:
+    """Create MedMCQA benchmark preset."""
+    from themis.datasets.medmcqa import load_medmcqa as load_medmcqa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_medmcqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_medmcqa_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="medmcqa-zero-shot",
+        template=(
+            "Answer the following medical multiple choice question.\n\n"
+            "Question: {question}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="medmcqa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_medmcqa,
+        metadata_fields=("subject",),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="MedMCQA medical entrance exam questions",
+    )
+
+
+def _create_med_qa_preset() -> BenchmarkPreset:
+    """Create MedQA benchmark preset."""
+    from themis.datasets.med_qa import load_med_qa as load_med_qa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_med_qa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_med_qa_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="med-qa-zero-shot",
+        template=(
+            "Answer the following medical multiple choice question.\n\n"
+            "Question: {question}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="med_qa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_med_qa,
+        metadata_fields=("subject",),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="MedQA multiple choice medical QA benchmark",
+    )
+
+
+def _create_sciq_preset() -> BenchmarkPreset:
+    """Create SciQ benchmark preset."""
+    from themis.datasets.sciq import load_sciq as load_sciq_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_sciq(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_sciq_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="sciq-zero-shot",
+        template=(
+            "Answer the following science question.\n\n"
+            "Question: {question}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="sciq",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_sciq,
+        metadata_fields=(),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="SciQ science multiple choice questions",
+    )
+
+
+def _create_commonsense_qa_preset() -> BenchmarkPreset:
+    """Create CommonsenseQA benchmark preset."""
+    from themis.datasets.commonsense_qa import (
+        load_commonsense_qa as load_commonsense_qa_dataset,
+    )
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_commonsense_qa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_commonsense_qa_dataset(
+            source="huggingface",
+            split="validation",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="commonsense-qa-zero-shot",
+        template=(
+            "Answer the following commonsense question.\n\n"
+            "Question: {question}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="commonsense_qa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_commonsense_qa,
+        metadata_fields=("concept",),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="CommonsenseQA multiple choice reasoning benchmark",
+    )
+
+
+def _create_piqa_preset() -> BenchmarkPreset:
+    """Create PIQA benchmark preset."""
+    from themis.datasets.piqa import load_piqa as load_piqa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_piqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_piqa_dataset(
+            source="huggingface",
+            split="validation",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="piqa-zero-shot",
+        template=(
+            "Choose the best answer for the goal.\n\n"
+            "Goal: {goal}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="piqa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_piqa,
+        metadata_fields=(),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="PIQA physical commonsense reasoning benchmark",
+    )
+
+
+def _create_social_i_qa_preset() -> BenchmarkPreset:
+    """Create Social IQA benchmark preset."""
+    from themis.datasets.social_i_qa import load_social_i_qa as load_social_i_qa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_social_i_qa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_social_i_qa_dataset(
+            source="huggingface",
+            split="validation",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="social-iqa-zero-shot",
+        template=(
+            "Answer the question based on the social context.\n\n"
+            "Context: {context}\n"
+            "Question: {question}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="social_i_qa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_social_i_qa,
+        metadata_fields=(),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="Social IQA commonsense reasoning benchmark",
+    )
+
+
+def _create_coqa_preset() -> BenchmarkPreset:
+    """Create CoQA benchmark preset."""
+    from themis.datasets.coqa import load_coqa as load_coqa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_coqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_coqa_dataset(
+            source="huggingface",
+            split="validation",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="coqa-zero-shot",
+        template=(
+            "Answer the question based on the passage.\n\n"
+            "Passage: {story}\n\n"
+            "Question: {question}\n"
+            "Answer:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="coqa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_coqa,
+        metadata_fields=("turn",),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="CoQA conversational question answering benchmark",
+    )
+
+
 # ============================================================================
 # Demo/Test Benchmarks
 # ============================================================================
@@ -337,10 +909,23 @@ def _register_all_benchmarks() -> None:
     register_benchmark(_create_math500_preset())
     register_benchmark(_create_gsm8k_preset())
     register_benchmark(_create_aime24_preset())
+    register_benchmark(_create_aime25_preset())
+    register_benchmark(_create_amc23_preset())
+    register_benchmark(_create_olympiadbench_preset())
+    register_benchmark(_create_beyondaime_preset())
+    register_benchmark(_create_gsm_symbolic_preset())
 
     # MCQ benchmarks
     register_benchmark(_create_mmlu_pro_preset())
     register_benchmark(_create_supergpqa_preset())
+    register_benchmark(_create_gpqa_preset())
+    register_benchmark(_create_medmcqa_preset())
+    register_benchmark(_create_med_qa_preset())
+    register_benchmark(_create_sciq_preset())
+    register_benchmark(_create_commonsense_qa_preset())
+    register_benchmark(_create_piqa_preset())
+    register_benchmark(_create_social_i_qa_preset())
+    register_benchmark(_create_coqa_preset())
 
     # Demo
     register_benchmark(_create_demo_preset())
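
Illustrative usage (a minimal sketch, not part of the diff): the MCQ normalization introduced in benchmarks.py fills the {options} placeholder and maps raw answers onto letter labels. The snippet assumes the private helper _normalize_mcq_samples remains importable with the signature shown above; the sample row itself is hypothetical.

    from themis.presets.benchmarks import _normalize_mcq_samples  # private helper shown in this diff

    # Hypothetical MCQ row without explicit choice labels.
    raw = [{
        "question": "Which gas do plants absorb during photosynthesis?",
        "choices": ["Oxygen", "Carbon dioxide", "Nitrogen", "Helium"],
        "answer": "Carbon dioxide",  # free-text answer, matched against the choice text
    }]

    rows = _normalize_mcq_samples(raw)
    print(rows[0]["options"])  # "A. Oxygen\nB. Carbon dioxide\nC. Nitrogen\nD. Helium"
    print(rows[0]["answer"])   # "B"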