themis-eval 0.2.2__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- themis/__init__.py +5 -2
- themis/_version.py +14 -1
- themis/api.py +83 -145
- themis/backends/storage.py +5 -0
- themis/cli/commands/info.py +2 -11
- themis/cli/main.py +231 -40
- themis/comparison/engine.py +7 -13
- themis/core/entities.py +4 -0
- themis/evaluation/metric_pipeline.py +12 -0
- themis/evaluation/pipeline.py +22 -0
- themis/evaluation/pipelines/__init__.py +4 -0
- themis/evaluation/pipelines/composable_pipeline.py +55 -0
- themis/evaluation/pipelines/standard_pipeline.py +18 -1
- themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +5 -2
- themis/evaluation/strategies/judge_evaluation_strategy.py +6 -1
- themis/experiment/__init__.py +2 -2
- themis/experiment/cache_manager.py +15 -1
- themis/experiment/definitions.py +1 -1
- themis/experiment/orchestrator.py +21 -11
- themis/experiment/share.py +264 -0
- themis/experiment/storage.py +345 -298
- themis/generation/plan.py +28 -6
- themis/generation/router.py +22 -4
- themis/generation/runner.py +16 -1
- themis/presets/benchmarks.py +602 -17
- themis/server/app.py +38 -26
- themis/session.py +125 -0
- themis/specs/__init__.py +7 -0
- themis/specs/execution.py +26 -0
- themis/specs/experiment.py +33 -0
- themis/specs/storage.py +18 -0
- themis/storage/__init__.py +6 -0
- themis/storage/experiment_storage.py +7 -0
- {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/METADATA +47 -34
- {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/RECORD +38 -31
- {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/WHEEL +1 -1
- themis/experiment/builder.py +0 -151
- themis/experiment/export_csv.py +0 -159
- {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/licenses/LICENSE +0 -0
- {themis_eval-0.2.2.dist-info → themis_eval-1.0.0.dist-info}/top_level.txt +0 -0
themis/presets/benchmarks.py
CHANGED
@@ -6,6 +6,7 @@ including prompts, metrics, extractors, and data loaders.
 
 from __future__ import annotations
 
+import string
 from dataclasses import dataclass, field
 from typing import Any, Callable, Sequence
 
@@ -56,6 +57,85 @@ _BENCHMARK_REGISTRY: dict[str, BenchmarkPreset] = {}
 _REGISTRY_INITIALIZED = False
 
 
+def _to_dict_samples(samples: Sequence[Any]) -> list[dict[str, Any]]:
+    return [
+        sample.to_generation_example()
+        if hasattr(sample, "to_generation_example")
+        else dict(sample)
+        for sample in samples
+    ]
+
+
+def _format_mcq_options(choices: Sequence[str], labels: Sequence[str]) -> str:
+    return "\n".join(
+        f"{label}. {choice}" for label, choice in zip(labels, choices)
+    )
+
+
+def _normalize_mcq_answer(
+    answer: Any,
+    choices: Sequence[str],
+    labels: Sequence[str],
+) -> str:
+    if answer is None:
+        return ""
+    if isinstance(answer, bool):
+        return str(answer)
+    if isinstance(answer, (int, float)):
+        index = int(answer)
+        if 1 <= index <= len(choices):
+            return labels[index - 1]
+        if 0 <= index < len(choices):
+            return labels[index]
+    text = str(answer).strip()
+    if not text:
+        return ""
+    lowered = text.lower()
+    if lowered.startswith("option "):
+        text = text.split(" ", 1)[-1].strip()
+    if lowered.startswith("choice "):
+        text = text.split(" ", 1)[-1].strip()
+    if len(text) >= 2 and text[1] in {".", ")", ":", "-"}:
+        text = text[0]
+    if len(text) == 1 and text.isalpha():
+        letter = text.upper()
+        if letter in labels:
+            return letter
+    for idx, choice in enumerate(choices):
+        if text == str(choice).strip():
+            return labels[idx]
+    for idx, choice in enumerate(choices):
+        if text.lower() == str(choice).strip().lower():
+            return labels[idx]
+    return text
+
+
+def _normalize_mcq_samples(samples: Sequence[dict[str, Any]]) -> list[dict[str, Any]]:
+    normalized: list[dict[str, Any]] = []
+    for sample in samples:
+        row = dict(sample)
+        choices = row.get("choices") or row.get("options")
+        if not isinstance(choices, (list, tuple)):
+            normalized.append(row)
+            continue
+        choices_list = [str(choice) for choice in choices]
+        labels = row.get("choice_labels")
+        if isinstance(labels, (list, tuple)) and labels:
+            labels_list = [str(label) for label in labels][: len(choices_list)]
+        else:
+            labels_list = list(string.ascii_uppercase[: len(choices_list)])
+        row["choices"] = choices_list
+        row["choice_labels"] = labels_list
+        row["options"] = _format_mcq_options(choices_list, labels_list)
+        row["answer"] = _normalize_mcq_answer(
+            row.get("answer"),
+            choices_list,
+            labels_list,
+        )
+        normalized.append(row)
+    return normalized
+
+
 def _ensure_registry_initialized() -> None:
     """Initialize benchmark registry on first use (lazy loading)."""
     global _REGISTRY_INITIALIZED
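The hunk above introduces the MCQ helpers that the new multiple-choice presets below rely on. A minimal behavioral sketch, assuming these module-private names remain importable from `themis.presets.benchmarks` (they are not documented API):

```python
# Illustration only; _normalize_mcq_answer and _normalize_mcq_samples are
# private helpers shown in the hunk above, not public API.
from themis.presets.benchmarks import _normalize_mcq_answer, _normalize_mcq_samples

choices = ["Paris", "Rome", "Madrid"]
labels = ["A", "B", "C"]

# Numeric answers are read as 1-based indices first, falling back to 0-based.
assert _normalize_mcq_answer(2, choices, labels) == "B"
# Letter-style answers ("b)", "Option C") are stripped down to a label.
assert _normalize_mcq_answer("b)", choices, labels) == "B"
assert _normalize_mcq_answer("Option C", choices, labels) == "C"
# Full answer text is matched against the choices, case-insensitively.
assert _normalize_mcq_answer("rome", choices, labels) == "B"

# _normalize_mcq_samples adds choice_labels, a rendered options block,
# and a letter-form answer to any row carrying a choices/options list.
[row] = _normalize_mcq_samples(
    [{"question": "Capital of Italy?", "choices": choices, "answer": 2}]
)
assert row["answer"] == "B"
assert row["options"] == "A. Paris\nB. Rome\nC. Madrid"
```

Rows without a `choices`/`options` list pass through `_normalize_mcq_samples` unchanged.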
@@ -119,8 +199,7 @@ def _create_math500_preset() -> BenchmarkPreset:
 
     def load_math500(limit: int | None = None) -> Sequence[dict[str, Any]]:
         samples = load_math500_dataset(source="huggingface", limit=limit)
-
-        return [s.to_generation_example() if hasattr(s, 'to_generation_example') else dict(s) for s in samples]
+        return _to_dict_samples(samples)
 
     prompt_template = PromptTemplate(
         name="math500-zero-shot",
@@ -153,8 +232,7 @@ def _create_gsm8k_preset() -> BenchmarkPreset:
 
     def load_gsm8k(limit: int | None = None) -> Sequence[dict[str, Any]]:
         samples = load_gsm8k_dataset(source="huggingface", split="test", limit=limit)
-
-        return [dict(s) if not isinstance(s, dict) else s for s in samples]
+        return _to_dict_samples(samples)
 
     prompt_template = PromptTemplate(
         name="gsm8k-zero-shot",
@@ -173,7 +251,7 @@ def _create_gsm8k_preset() -> BenchmarkPreset:
         dataset_loader=load_gsm8k,
         metadata_fields=(),
         reference_field="answer",
-        dataset_id_field="
+        dataset_id_field="unique_id",
         description="GSM8K dataset with grade school math word problems",
     )
 
@@ -186,12 +264,12 @@ def _create_aime24_preset() -> BenchmarkPreset:
 
     def load_aime24(limit: int | None = None) -> Sequence[dict[str, Any]]:
         samples = load_competition_math(
-
+            dataset="math-ai/aime24",
             source="huggingface",
             split="test",
             limit=limit,
         )
-        return
+        return _to_dict_samples(samples)
 
     prompt_template = PromptTemplate(
         name="aime24-zero-shot",
@@ -211,11 +289,201 @@ def _create_aime24_preset() -> BenchmarkPreset:
         dataset_loader=load_aime24,
         metadata_fields=("subject",),
         reference_field="answer",
-        dataset_id_field="
+        dataset_id_field="unique_id",
         description="AIME 2024 competition math problems",
     )
 
 
+def _create_gsm_symbolic_preset() -> BenchmarkPreset:
+    """Create GSM-Symbolic benchmark preset."""
+    from themis.datasets.gsm_symbolic import (
+        load_gsm_symbolic as load_gsm_symbolic_dataset,
+    )
+    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+    def load_gsm_symbolic(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_gsm_symbolic_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="gsm-symbolic-zero-shot",
+        template=(
+            "Solve this math problem step by step.\n\n"
+            "Q: {question}\n"
+            "A:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="gsm-symbolic",
+        prompt_template=prompt_template,
+        metrics=[MathVerifyAccuracy()],
+        extractor=MathVerifyExtractor(),
+        dataset_loader=load_gsm_symbolic,
+        metadata_fields=(),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="GSM-Symbolic dataset for algebraic word problems",
+    )
+
+
+def _create_aime25_preset() -> BenchmarkPreset:
+    """Create AIME 2025 benchmark preset."""
+    from themis.datasets.competition_math import load_competition_math
+    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+    def load_aime25(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_competition_math(
+            dataset="math-ai/aime25",
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="aime25-zero-shot",
+        template=(
+            "Solve the following AIME problem. "
+            "Your answer should be a number between 000 and 999.\n\n"
+            "Problem: {problem}\n\n"
+            "Solution:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="aime25",
+        prompt_template=prompt_template,
+        metrics=[MathVerifyAccuracy()],
+        extractor=MathVerifyExtractor(),
+        dataset_loader=load_aime25,
+        metadata_fields=("subject", "level"),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="AIME 2025 competition math problems",
+    )
+
+
+def _create_amc23_preset() -> BenchmarkPreset:
+    """Create AMC 2023 benchmark preset."""
+    from themis.datasets.competition_math import load_competition_math
+    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+    def load_amc23(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_competition_math(
+            dataset="math-ai/amc23",
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="amc23-zero-shot",
+        template=(
+            "Solve the following AMC problem. "
+            "Give only the final answer.\n\n"
+            "Problem: {problem}\n\n"
+            "Answer:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="amc23",
+        prompt_template=prompt_template,
+        metrics=[MathVerifyAccuracy()],
+        extractor=MathVerifyExtractor(),
+        dataset_loader=load_amc23,
+        metadata_fields=("subject", "level"),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="AMC 2023 competition math problems",
+    )
+
+
+def _create_olympiadbench_preset() -> BenchmarkPreset:
+    """Create OlympiadBench benchmark preset."""
+    from themis.datasets.competition_math import load_competition_math
+    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+    def load_olympiadbench(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_competition_math(
+            dataset="math-ai/olympiadbench",
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="olympiadbench-zero-shot",
+        template=(
+            "Solve the following olympiad-style math problem. "
+            "Show reasoning briefly, then give the final answer.\n\n"
+            "Problem: {problem}\n\n"
+            "Solution:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="olympiadbench",
+        prompt_template=prompt_template,
+        metrics=[MathVerifyAccuracy()],
+        extractor=MathVerifyExtractor(),
+        dataset_loader=load_olympiadbench,
+        metadata_fields=("subject", "level"),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="OlympiadBench competition math benchmark",
+    )
+
+
+def _create_beyondaime_preset() -> BenchmarkPreset:
+    """Create BeyondAIME benchmark preset."""
+    from themis.datasets.competition_math import load_competition_math
+    from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
+    from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
+
+    def load_beyondaime(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_competition_math(
+            dataset="ByteDance-Seed/BeyondAIME",
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="beyondaime-zero-shot",
+        template=(
+            "Solve the following advanced contest math problem. "
+            "Provide the final answer clearly.\n\n"
+            "Problem: {problem}\n\n"
+            "Answer:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="beyondaime",
+        prompt_template=prompt_template,
+        metrics=[MathVerifyAccuracy()],
+        extractor=MathVerifyExtractor(),
+        dataset_loader=load_beyondaime,
+        metadata_fields=("subject", "level"),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="BeyondAIME advanced competition math problems",
+    )
+
+
 # ============================================================================
 # MCQ Benchmarks
 # ============================================================================
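The five presets added above all follow the same construction pattern. A hedged sketch of that pattern for a hypothetical extra contest dataset; the dataset id `my-org/my-contest` and the preset name are invented, and the imports assume these symbols remain importable from the modules this hunk uses:

```python
# Sketch only: mirrors the pattern in the hunk above with hypothetical names.
from __future__ import annotations

from typing import Any, Sequence

from themis.datasets.competition_math import load_competition_math
from themis.evaluation.extractors.math_verify_extractor import MathVerifyExtractor
from themis.evaluation.metrics.math_verify_accuracy import MathVerifyAccuracy
from themis.presets.benchmarks import (
    BenchmarkPreset,
    PromptTemplate,
    _to_dict_samples,
    register_benchmark,
)


def _create_my_contest_preset() -> BenchmarkPreset:
    """Hypothetical preset following the aime25/amc23/olympiadbench pattern."""

    def load_my_contest(limit: int | None = None) -> Sequence[dict[str, Any]]:
        samples = load_competition_math(
            dataset="my-org/my-contest",  # hypothetical Hugging Face dataset id
            source="huggingface",
            split="test",
            limit=limit,
        )
        return _to_dict_samples(samples)

    prompt_template = PromptTemplate(
        name="my-contest-zero-shot",
        template="Solve the problem.\n\nProblem: {problem}\n\nAnswer:",
    )

    return BenchmarkPreset(
        name="my-contest",
        prompt_template=prompt_template,
        metrics=[MathVerifyAccuracy()],
        extractor=MathVerifyExtractor(),
        dataset_loader=load_my_contest,
        metadata_fields=("subject", "level"),
        reference_field="answer",
        dataset_id_field="unique_id",
        description="Hypothetical contest-math preset (illustration only)",
    )


register_benchmark(_create_my_contest_preset())
```

In the package itself each new preset is wired up by a single extra `register_benchmark` call in `_register_all_benchmarks` (last hunk below).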
@@ -228,7 +496,7 @@ def _create_mmlu_pro_preset() -> BenchmarkPreset:
 
     def load_mmlu_pro(limit: int | None = None) -> Sequence[dict[str, Any]]:
         samples = load_mmlu_pro_dataset(source="huggingface", split="test", limit=limit)
-        return
+        return _normalize_mcq_samples(_to_dict_samples(samples))
 
     prompt_template = PromptTemplate(
         name="mmlu-pro-zero-shot",
@@ -236,7 +504,7 @@ def _create_mmlu_pro_preset() -> BenchmarkPreset:
             "Answer the following multiple choice question.\n\n"
             "Question: {question}\n\n"
             "Options:\n{options}\n\n"
-            "Answer:"
+            "Answer (letter):"
         ),
     )
 
@@ -246,9 +514,9 @@ def _create_mmlu_pro_preset() -> BenchmarkPreset:
         metrics=[ExactMatch()],
         extractor=IdentityExtractor(),
         dataset_loader=load_mmlu_pro,
-        metadata_fields=("
+        metadata_fields=("subject",),
         reference_field="answer",
-        dataset_id_field="
+        dataset_id_field="unique_id",
         description="MMLU-Pro professional-level multiple choice questions",
     )
 
@@ -260,16 +528,20 @@ def _create_supergpqa_preset() -> BenchmarkPreset:
     from themis.evaluation.metrics.exact_match import ExactMatch
 
     def load_supergpqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
-        samples = load_supergpqa_dataset(
-
+        samples = load_supergpqa_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
 
     prompt_template = PromptTemplate(
         name="supergpqa-zero-shot",
         template=(
             "Answer the following science question.\n\n"
             "Question: {question}\n\n"
-            "Choices:\n{
-            "Answer:"
+            "Choices:\n{options}\n\n"
+            "Answer (letter):"
         ),
     )
 
@@ -281,11 +553,311 @@ def _create_supergpqa_preset() -> BenchmarkPreset:
         dataset_loader=load_supergpqa,
         metadata_fields=("subject",),
         reference_field="answer",
-        dataset_id_field="
+        dataset_id_field="unique_id",
         description="SuperGPQA graduate-level science questions",
     )
 
 
+def _create_gpqa_preset() -> BenchmarkPreset:
+    """Create GPQA benchmark preset."""
+    from themis.datasets.gpqa import load_gpqa as load_gpqa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_gpqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_gpqa_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+            subset="default",
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="gpqa-zero-shot",
+        template=(
+            "Answer the following question.\n\n"
+            "Question: {question}\n\n"
+            "Choices:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="gpqa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_gpqa,
+        metadata_fields=("subject",),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="GPQA graduate-level science questions",
+    )
+
+
+def _create_medmcqa_preset() -> BenchmarkPreset:
+    """Create MedMCQA benchmark preset."""
+    from themis.datasets.medmcqa import load_medmcqa as load_medmcqa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_medmcqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_medmcqa_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="medmcqa-zero-shot",
+        template=(
+            "Answer the following medical multiple choice question.\n\n"
+            "Question: {question}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="medmcqa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_medmcqa,
+        metadata_fields=("subject",),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="MedMCQA medical entrance exam questions",
+    )
+
+
+def _create_med_qa_preset() -> BenchmarkPreset:
+    """Create MedQA benchmark preset."""
+    from themis.datasets.med_qa import load_med_qa as load_med_qa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_med_qa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_med_qa_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="med-qa-zero-shot",
+        template=(
+            "Answer the following medical multiple choice question.\n\n"
+            "Question: {question}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="med_qa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_med_qa,
+        metadata_fields=("subject",),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="MedQA multiple choice medical QA benchmark",
+    )
+
+
+def _create_sciq_preset() -> BenchmarkPreset:
+    """Create SciQ benchmark preset."""
+    from themis.datasets.sciq import load_sciq as load_sciq_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_sciq(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_sciq_dataset(
+            source="huggingface",
+            split="test",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="sciq-zero-shot",
+        template=(
+            "Answer the following science question.\n\n"
+            "Question: {question}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="sciq",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_sciq,
+        metadata_fields=(),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="SciQ science multiple choice questions",
+    )
+
+
+def _create_commonsense_qa_preset() -> BenchmarkPreset:
+    """Create CommonsenseQA benchmark preset."""
+    from themis.datasets.commonsense_qa import (
+        load_commonsense_qa as load_commonsense_qa_dataset,
+    )
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_commonsense_qa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_commonsense_qa_dataset(
+            source="huggingface",
+            split="validation",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="commonsense-qa-zero-shot",
+        template=(
+            "Answer the following commonsense question.\n\n"
+            "Question: {question}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="commonsense_qa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_commonsense_qa,
+        metadata_fields=("concept",),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="CommonsenseQA multiple choice reasoning benchmark",
+    )
+
+
+def _create_piqa_preset() -> BenchmarkPreset:
+    """Create PIQA benchmark preset."""
+    from themis.datasets.piqa import load_piqa as load_piqa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_piqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_piqa_dataset(
+            source="huggingface",
+            split="validation",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="piqa-zero-shot",
+        template=(
+            "Choose the best answer for the goal.\n\n"
+            "Goal: {goal}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="piqa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_piqa,
+        metadata_fields=(),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="PIQA physical commonsense reasoning benchmark",
+    )
+
+
+def _create_social_i_qa_preset() -> BenchmarkPreset:
+    """Create Social IQA benchmark preset."""
+    from themis.datasets.social_i_qa import load_social_i_qa as load_social_i_qa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_social_i_qa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_social_i_qa_dataset(
+            source="huggingface",
+            split="validation",
+            limit=limit,
+        )
+        return _normalize_mcq_samples(_to_dict_samples(samples))
+
+    prompt_template = PromptTemplate(
+        name="social-iqa-zero-shot",
+        template=(
+            "Answer the question based on the social context.\n\n"
+            "Context: {context}\n"
+            "Question: {question}\n\n"
+            "Options:\n{options}\n\n"
+            "Answer (letter):"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="social_i_qa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_social_i_qa,
+        metadata_fields=(),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="Social IQA commonsense reasoning benchmark",
+    )
+
+
+def _create_coqa_preset() -> BenchmarkPreset:
+    """Create CoQA benchmark preset."""
+    from themis.datasets.coqa import load_coqa as load_coqa_dataset
+    from themis.evaluation.extractors.identity_extractor import IdentityExtractor
+    from themis.evaluation.metrics.exact_match import ExactMatch
+
+    def load_coqa(limit: int | None = None) -> Sequence[dict[str, Any]]:
+        samples = load_coqa_dataset(
+            source="huggingface",
+            split="validation",
+            limit=limit,
+        )
+        return _to_dict_samples(samples)
+
+    prompt_template = PromptTemplate(
+        name="coqa-zero-shot",
+        template=(
+            "Answer the question based on the passage.\n\n"
+            "Passage: {story}\n\n"
+            "Question: {question}\n"
+            "Answer:"
+        ),
+    )
+
+    return BenchmarkPreset(
+        name="coqa",
+        prompt_template=prompt_template,
+        metrics=[ExactMatch()],
+        extractor=IdentityExtractor(),
+        dataset_loader=load_coqa,
+        metadata_fields=("turn",),
+        reference_field="answer",
+        dataset_id_field="unique_id",
+        description="CoQA conversational question answering benchmark",
+    )
+
+
 # ============================================================================
 # Demo/Test Benchmarks
 # ============================================================================
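Each MCQ preset above renders its prompt from a row normalized by `_normalize_mcq_samples` and scores the model's letter with `ExactMatch`. A small illustration of that flow, using plain `str.format` on the MedMCQA-style template (how `PromptTemplate` itself renders is not shown in this diff):

```python
# Illustration only; _format_mcq_options is the private helper shown earlier.
from themis.presets.benchmarks import _format_mcq_options

template = (
    "Answer the following medical multiple choice question.\n\n"
    "Question: {question}\n\n"
    "Options:\n{options}\n\n"
    "Answer (letter):"
)

row = {
    "question": "Which vitamin is synthesized in the skin?",
    "choices": ["Vitamin A", "Vitamin C", "Vitamin D"],
    "choice_labels": ["A", "B", "C"],
}
row["options"] = _format_mcq_options(row["choices"], row["choice_labels"])

# Fill the zero-shot template the same way the preset's prompt does.
print(template.format(question=row["question"], options=row["options"]))
# The reference answer is normalized to the same letter form ("C" here),
# so ExactMatch can compare the extracted model answer against it directly.
```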
@@ -337,10 +909,23 @@ def _register_all_benchmarks() -> None:
     register_benchmark(_create_math500_preset())
     register_benchmark(_create_gsm8k_preset())
     register_benchmark(_create_aime24_preset())
+    register_benchmark(_create_aime25_preset())
+    register_benchmark(_create_amc23_preset())
+    register_benchmark(_create_olympiadbench_preset())
+    register_benchmark(_create_beyondaime_preset())
+    register_benchmark(_create_gsm_symbolic_preset())
 
     # MCQ benchmarks
     register_benchmark(_create_mmlu_pro_preset())
     register_benchmark(_create_supergpqa_preset())
+    register_benchmark(_create_gpqa_preset())
+    register_benchmark(_create_medmcqa_preset())
+    register_benchmark(_create_med_qa_preset())
+    register_benchmark(_create_sciq_preset())
+    register_benchmark(_create_commonsense_qa_preset())
+    register_benchmark(_create_piqa_preset())
+    register_benchmark(_create_social_i_qa_preset())
+    register_benchmark(_create_coqa_preset())
 
     # Demo
     register_benchmark(_create_demo_preset())
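In total, 1.0.0 registers thirteen additional benchmark presets through this lazy registry. A rough lookup sketch against the module internals visible in this diff (any public lookup API lives outside this file and may differ):

```python
# Rough sketch using private module state shown in this diff; not a
# documented API. _ensure_registry_initialized is assumed to trigger
# _register_all_benchmarks on first use, per its docstring above.
from themis.presets import benchmarks

benchmarks._ensure_registry_initialized()
preset = benchmarks._BENCHMARK_REGISTRY["aime25"]
print(preset.name, "-", preset.description)

new_in_1_0_0 = [
    "gsm-symbolic", "aime25", "amc23", "olympiadbench", "beyondaime",
    "gpqa", "medmcqa", "med_qa", "sciq",
    "commonsense_qa", "piqa", "social_i_qa", "coqa",
]
assert all(name in benchmarks._BENCHMARK_REGISTRY for name in new_in_1_0_0)
```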