PyPI - eval-framework - Versions diffs - 0.2.7__py3-none-any.whl - Mend

eval-framework 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (170) hide show

eval_framework/__init__.py +7 -0
eval_framework/base_config.py +36 -0
eval_framework/context/__init__.py +0 -0
eval_framework/context/determined.py +177 -0
eval_framework/context/eval.py +121 -0
eval_framework/context/local.py +78 -0
eval_framework/evaluation_generator.py +234 -0
eval_framework/exceptions.py +2 -0
eval_framework/external/ifeval_impl/README.md +5 -0
eval_framework/external/ifeval_impl/instructions.py +1523 -0
eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
eval_framework/external/ifeval_impl/utils.py +135 -0
eval_framework/llm/__init__.py +0 -0
eval_framework/llm/aleph_alpha.py +432 -0
eval_framework/llm/base.py +180 -0
eval_framework/llm/huggingface.py +418 -0
eval_framework/llm/mistral.py +88 -0
eval_framework/llm/models.py +28 -0
eval_framework/llm/openai.py +400 -0
eval_framework/llm/vllm.py +554 -0
eval_framework/logger.py +3 -0
eval_framework/main.py +166 -0
eval_framework/metrics/__init__.py +0 -0
eval_framework/metrics/base.py +40 -0
eval_framework/metrics/completion/__init__.py +1 -0
eval_framework/metrics/completion/accuracy_completion.py +16 -0
eval_framework/metrics/completion/aidanbench.py +28 -0
eval_framework/metrics/completion/bleu.py +76 -0
eval_framework/metrics/completion/chrf.py +62 -0
eval_framework/metrics/completion/code_assertion.py +44 -0
eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
eval_framework/metrics/completion/comet.py +56 -0
eval_framework/metrics/completion/concordance_index.py +38 -0
eval_framework/metrics/completion/csv_format.py +102 -0
eval_framework/metrics/completion/cwe_accuracy.py +49 -0
eval_framework/metrics/completion/exponential_similarity.py +65 -0
eval_framework/metrics/completion/f1.py +42 -0
eval_framework/metrics/completion/format_checker.py +56 -0
eval_framework/metrics/completion/grid_difference.py +77 -0
eval_framework/metrics/completion/ifeval.py +73 -0
eval_framework/metrics/completion/json_format.py +179 -0
eval_framework/metrics/completion/language_checker.py +74 -0
eval_framework/metrics/completion/length_control.py +83 -0
eval_framework/metrics/completion/math_reasoning_completion.py +307 -0
eval_framework/metrics/completion/niah_accuracy.py +163 -0
eval_framework/metrics/completion/placeholder_checker.py +27 -0
eval_framework/metrics/completion/repetition.py +88 -0
eval_framework/metrics/completion/rouge_1.py +35 -0
eval_framework/metrics/completion/rouge_2.py +45 -0
eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
eval_framework/metrics/completion/rouge_l.py +52 -0
eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
eval_framework/metrics/completion/ter.py +67 -0
eval_framework/metrics/completion/text_counter.py +182 -0
eval_framework/metrics/efficiency/__init__.py +0 -0
eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
eval_framework/metrics/llm/__init__.py +0 -0
eval_framework/metrics/llm/base.py +34 -0
eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
eval_framework/metrics/llm/graders/coherence_grader.py +115 -0
eval_framework/metrics/llm/graders/comparison_grader.py +198 -0
eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
eval_framework/metrics/llm/graders/language.py +56 -0
eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
eval_framework/metrics/llm/graders/models.py +74 -0
eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
eval_framework/metrics/llm/llm_judge_coherence.py +44 -0
eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
eval_framework/metrics/llm/llm_judge_mtbench_pair.py +306 -0
eval_framework/metrics/llm/llm_judge_mtbench_single.py +210 -0
eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
eval_framework/metrics/llm/llm_judge_sql.py +394 -0
eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
eval_framework/metrics/llm/utils.py +20 -0
eval_framework/metrics/loglikelihood/__init__.py +0 -0
eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
eval_framework/metrics/loglikelihood/base.py +50 -0
eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +25 -0
eval_framework/metrics/loglikelihood/dcs.py +43 -0
eval_framework/metrics/loglikelihood/probability_mass.py +53 -0
eval_framework/metrics/loglikelihood/ternary.py +42 -0
eval_framework/py.typed +0 -0
eval_framework/response_generator.py +351 -0
eval_framework/result_processors/__init__.py +0 -0
eval_framework/result_processors/base.py +88 -0
eval_framework/result_processors/hf_uploader.py +75 -0
eval_framework/result_processors/result_processor.py +129 -0
eval_framework/result_processors/wandb_uploader.py +137 -0
eval_framework/run.py +369 -0
eval_framework/run_direct.py +42 -0
eval_framework/shared/types.py +227 -0
eval_framework/tasks/__init__.py +6 -0
eval_framework/tasks/base.py +392 -0
eval_framework/tasks/benchmarks/__init__.py +0 -0
eval_framework/tasks/benchmarks/aidanbench.py +211 -0
eval_framework/tasks/benchmarks/arc.py +70 -0
eval_framework/tasks/benchmarks/arc_de.py +46 -0
eval_framework/tasks/benchmarks/arc_fi.py +46 -0
eval_framework/tasks/benchmarks/belebele.py +60 -0
eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
eval_framework/tasks/benchmarks/casehold.py +47 -0
eval_framework/tasks/benchmarks/chembench.py +85 -0
eval_framework/tasks/benchmarks/copa.py +64 -0
eval_framework/tasks/benchmarks/duc.py +91 -0
eval_framework/tasks/benchmarks/flores200.py +133 -0
eval_framework/tasks/benchmarks/flores_plus.py +84 -0
eval_framework/tasks/benchmarks/gpqa.py +201 -0
eval_framework/tasks/benchmarks/gsm8k.py +150 -0
eval_framework/tasks/benchmarks/hellaswag.py +69 -0
eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
eval_framework/tasks/benchmarks/humaneval.py +97 -0
eval_framework/tasks/benchmarks/ifeval.py +78 -0
eval_framework/tasks/benchmarks/include.py +119 -0
eval_framework/tasks/benchmarks/infinitebench.py +302 -0
eval_framework/tasks/benchmarks/math_reasoning.py +580 -0
eval_framework/tasks/benchmarks/mbpp.py +192 -0
eval_framework/tasks/benchmarks/mmlu.py +215 -0
eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
eval_framework/tasks/benchmarks/mmlu_pro.py +164 -0
eval_framework/tasks/benchmarks/mmmlu.py +529 -0
eval_framework/tasks/benchmarks/openbookqa.py +85 -0
eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
eval_framework/tasks/benchmarks/pawsx.py +65 -0
eval_framework/tasks/benchmarks/piqa.py +64 -0
eval_framework/tasks/benchmarks/quality.py +56 -0
eval_framework/tasks/benchmarks/sciq.py +110 -0
eval_framework/tasks/benchmarks/sphyr.py +79 -0
eval_framework/tasks/benchmarks/squad.py +211 -0
eval_framework/tasks/benchmarks/struct_eval.py +116 -0
eval_framework/tasks/benchmarks/tablebench.py +117 -0
eval_framework/tasks/benchmarks/triviaqa.py +42 -0
eval_framework/tasks/benchmarks/truthfulqa.py +119 -0
eval_framework/tasks/benchmarks/winogender.py +64 -0
eval_framework/tasks/benchmarks/winogrande.py +69 -0
eval_framework/tasks/benchmarks/winox.py +57 -0
eval_framework/tasks/benchmarks/wmt.py +160 -0
eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
eval_framework/tasks/eval_config.py +136 -0
eval_framework/tasks/perturbation.py +83 -0
eval_framework/tasks/registry.py +186 -0
eval_framework/tasks/task_loader.py +81 -0
eval_framework/tasks/task_names.py +324 -0
eval_framework/tasks/utils.py +584 -0
eval_framework/utils/constants.py +9 -0
eval_framework/utils/file_ops.py +245 -0
eval_framework/utils/generate_task_docs.py +244 -0
eval_framework/utils/helpers.py +32 -0
eval_framework/utils/logging.py +62 -0
eval_framework/utils/packaging.py +52 -0
eval_framework/utils/tqdm_handler.py +14 -0
eval_framework-0.2.7.dist-info/METADATA +548 -0
eval_framework-0.2.7.dist-info/RECORD +170 -0
eval_framework-0.2.7.dist-info/WHEEL +4 -0
eval_framework-0.2.7.dist-info/entry_points.txt +3 -0
template_formatting/README.md +83 -0
template_formatting/__init__.py +0 -0
template_formatting/formatter.py +537 -0
template_formatting/mistral_formatter.py +159 -0
template_formatting/py.typed +0 -0

eval_framework/tasks/benchmarks/chembench.py ADDED Viewed

@@ -0,0 +1,85 @@
+import json
+from typing import Any
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.tasks.base import BaseTask, Language, ResponseType
+from eval_framework.tasks.utils import get_n_letters
+# Subject names exposed by the ChemBench task (assigned to ChemBench.SUBJECTS below).
+CHEMBENCH_SUBJECTS = [
+    "analytical_chemistry",
+    "chemical_preference",
+    "general_chemistry",
+    "inorganic_chemistry",
+    "materials_science",
+    "organic_chemistry",
+    "physical_chemistry",
+    "technical_chemistry",
+    "toxicity_and_safety",
+]
+class ChemBench(BaseTask[str]):
+    """ChemBench dataset: https://huggingface.co/datasets/jablonkagroup/ChemBench
+
+    Multiple-choice chemistry questions scored through log-likelihoods over the
+    answer letters. After loading, only items whose ``metrics`` field equals
+    ``["multiple_choice_grade"]`` and that have exactly one option scored 1.0
+    are kept.
+    """
+    NAME = "ChemBench"
+    DATASET_PATH = "jablonkagroup/ChemBench"
+    SAMPLE_SPLIT = "train"  # Only has train split
+    FEWSHOT_SPLIT = "train"  # Only has train split
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+    SUBJECTS = CHEMBENCH_SUBJECTS
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Create the task; only the zero-shot setting (``num_fewshot == 0``) is accepted."""
+        assert num_fewshot == 0, "Fewshot is not supported for ChemBench"
+        super().__init__(num_fewshot)
+        # Answer labels paired with the options (presumably the first 16 letters; confirm in get_n_letters).
+        self.keys = get_n_letters(16)
+    @staticmethod
+    def _parse_target_scores(item: dict[str, Any]) -> dict[str, Any]:
+        """Decode the JSON-encoded option -> score mapping of the item's first example."""
+        return json.loads(item["examples"][0]["target_scores"])
+    @classmethod
+    def _correct_answer_indices(cls, item: dict[str, Any]) -> list[int]:
+        """Return the positions of all options whose score equals 1.0."""
+        scores = cls._parse_target_scores(item).values()
+        return [index for index, score in enumerate(scores) if score == 1.0]
+    def _load_dataset(self, subject: str) -> None:
+        """Load ``subject`` via the base class, then keep only single-answer multiple-choice items."""
+        super()._load_dataset(subject)
+        # Re-assigning existing keys does not resize the dict, so iterating it directly is safe.
+        for split in self.dataset:
+            self.dataset[split] = [
+                item
+                for item in self.dataset[split]
+                if item.get("metrics") == ["multiple_choice_grade"] and len(self._correct_answer_indices(item)) == 1
+            ]
+    def _get_subject_name(self, item: dict[str, Any]) -> str:
+        """Return the item's subject with underscores replaced by spaces."""
+        return " ".join(item["subject"].split("_"))
+    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+        """Return the fixed task instruction shown before the question."""
+        return (
+            "The following is a question about chemistry. Please answer by responding with the letter of the correct "
+            "answer."
+        )
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Format the question followed by one ``<label>. <option>`` line per answer option."""
+        question = item["examples"][0]["input"].strip()
+        options = self._parse_target_scores(item)
+        # zip stops at the shorter input, so options beyond the available labels are not listed.
+        choices = "".join(f"{key}. {choice}\n" for key, choice in zip(self.keys, options))
+        return f"Question: {question}\n{choices}"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        """Return the cue text immediately followed by the ground-truth label."""
+        ground_truth = self._get_ground_truth(item)
+        return f"{self._get_cue_text(item)}{ground_truth}"
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        """Return the cue that precedes the answer label."""
+        return "Answer:"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        """Return the label of the single correct option, prefixed with a space.
+
+        Raises:
+            AssertionError: If the item does not have exactly one option scored 1.0.
+        """
+        correct_answers = self._correct_answer_indices(item)
+        assert len(correct_answers) == 1, f"Expected exactly one correct answer, but got {len(correct_answers)}"
+        return f" {self.keys[correct_answers[0]]}"
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        """Return one space-prefixed label per answer option of the item."""
+        option_count = len(self._parse_target_scores(item))
+        return [f" {key}" for key in self.keys[:option_count]]

eval_framework/tasks/benchmarks/copa.py ADDED Viewed

@@ -0,0 +1,64 @@
+from typing import Any
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.metrics.loglikelihood.confidence_weighted_accuracy import ConfidenceWeightedAccuracy
+from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore
+from eval_framework.metrics.loglikelihood.ternary import TernaryScore
+from eval_framework.tasks.base import BaseTask, Language, ResponseType
+class COPA(BaseTask[str]):
+    """COPA dataset: https://huggingface.co/datasets/aps/super_glue
+
+    The premise is turned into an unfinished sentence ending in "because" or
+    "therefore", and the two choices are scored as its possible continuations.
+    """
+    NAME = "COPA"
+    DATASET_PATH = "aps/super_glue"
+    SAMPLE_SPLIT = "validation"  # 100 examples (same split as lm-eval)
+    FEWSHOT_SPLIT = "test"  # 500 examples
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+    SUBJECTS = ["copa"]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["because", "therefore"]
+    LANGUAGE = Language.ENG
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Return the premise without its final character, followed by the connector word.
+
+        Raises:
+            KeyError: If ``item["question"]`` is neither ``"cause"`` nor ``"effect"``.
+        """
+        connector = {
+            "cause": "because",
+            "effect": "therefore",
+        }[item["question"]]
+        # [:-1] drops the premise's last character (presumably its closing punctuation; confirm on data).
+        return item["premise"].strip()[:-1] + f" {connector} "
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        """Return the converted correct choice: ``choice1`` when ``label`` is 0, otherwise ``choice2``."""
+        correct_choice = item["choice1"] if item["label"] == 0 else item["choice2"]
+        return self.convert_choice(correct_choice)
+    def convert_choice(self, choice: str) -> str:
+        """Lower-case the first character of ``choice`` and leave the rest untouched.
+
+        An empty string is returned unchanged instead of raising ``IndexError``.
+        """
+        # Slicing (rather than indexing) keeps this safe for an empty choice.
+        return choice[:1].lower() + choice[1:]
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        """Return both converted choices, ``choice1`` first."""
+        return [self.convert_choice(item["choice1"]), self.convert_choice(item["choice2"])]
+class COPA_IDK(COPA):
+    """COPA variant that adds an "I do not know." completion and three extra metrics."""
+    NAME = "COPA_IDK"
+    METRICS = [
+        AccuracyLoglikelihood,
+        AccuracyNormLoglikelihood,
+        ConfidenceWeightedAccuracy,
+        DistributionalCorrectnessScore,
+        TernaryScore,
+    ]
+    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+        """Return the instruction telling the model that abstaining is allowed."""
+        prompt = (
+            "Complete the sentence only if you are confident, since mistakes may be penalised, while correct "
+            "answers receive points. It is acceptable to answer with 'I do not know' if you are unsure, and "
+            "you will receive 0 points."
+        )
+        return prompt
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        """Return the parent's completions followed by the abstention option."""
+        inherited = super()._get_possible_completions(item)
+        # The parent may return None; treat that as "no completions".
+        if not inherited:
+            inherited = []
+        return inherited + ["I do not know."]

eval_framework/tasks/benchmarks/duc.py ADDED Viewed

@@ -0,0 +1,91 @@
+import random
+import re
+from abc import ABC
+from typing import Any
+from eval_framework.metrics.base import BaseMetric
+from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
+from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample
+class DUC(BaseTask[str], ABC):
+    """Common base of the DUC-2001 keyphrase tasks: https://huggingface.co/datasets/midas/duc2001
+
+    Builds the ``Text: ... Keyphrase:`` prompt and cleans generated completions.
+    """
+    DATASET_PATH: str = "midas/duc2001"
+    SAMPLE_SPLIT: str = "test"
+    FEWSHOT_SPLIT: str = "test"
+    RESPONSE_TYPE: ResponseType = ResponseType.COMPLETION
+    METRICS: list[type[BaseMetric]] = [AccuracyCompletion]
+    SUBJECTS: list[str] = ["raw"]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Text", "Keyphrase"]
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Initialise the task and set its generation limits."""
+        super().__init__(num_fewshot)
+        self.max_tokens = 50  # longest keyphrase is less than 50 characters long
+        self.stop_sequences: list[str] = ["Text:"]
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        """Cut the completion at the first occurrence of each stop sequence, then strip whitespace."""
+        cleaned = completion_text
+        for marker in self.stop_sequences:
+            # partition leaves the text untouched when the marker does not occur
+            cleaned = cleaned.partition(marker)[0]
+        return cleaned.strip()
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Join the document pieces with spaces and wrap them in the ``Text:``/``Keyphrase:`` prompt."""
+        joined = " ".join(item["document"])
+        # drop the whitespace that ends up in front of punctuation marks
+        without_gaps = re.sub(r"\s+([.,!?;:])", r"\1", joined)
+        return f"Text: {without_gaps}\nKeyphrase:"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        """Return the first ground-truth keyphrase, prefixed with a space."""
+        keyphrases = self._get_ground_truth(item)
+        assert keyphrases is not None
+        assert isinstance(keyphrases, list)
+        first_keyphrase = keyphrases[0]
+        return f" {first_keyphrase}"
+class DUC_EXTRACTIVE(DUC):
+    """DUC task whose ground truth is the item's ``extractive_keyphrases`` field."""
+    NAME = "DUC Extractive"
+    SUBJECTS: list[str] = ["raw"]
+    def _get_system_prompt_text(self, item: dict[str, Any]) -> str:
+        """Return the system prompt describing the extraction task."""
+        role = "You are an AI model tasked with extracting keyphrases from a text document. "
+        guidance = "Keyphrases should capture main ideas or significant topics exactly as worded in the text."
+        return role + guidance
+    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
+        """Return the item's extractive keyphrases."""
+        keyphrases = item["extractive_keyphrases"]
+        return keyphrases
+class DUC_ABSTRACTIVE(DUC):
+    """DUC task whose ground truth is the item's ``abstractive_keyphrases`` field."""
+    NAME = "DUC Abstractive"
+    SUBJECTS: list[str] = ["raw"]
+    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
+        """Return the item's abstractive keyphrases."""
+        keyphrases = item["abstractive_keyphrases"]
+        return keyphrases
+    def _load_dataset(self, subject: str) -> None:
+        """Load the dataset, keeping only items with abstractive keyphrases; shuffle the sample split."""
+        # not all samples have abstractive keyphrases
+        raw_splits = self._load_hf_dataset(path=self.DATASET_PATH, name=subject)
+        self.dataset = {}
+        wanted_splits = [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]
+        for split_name, rows in raw_splits.items():
+            kept_rows = [row for row in rows if len(row["abstractive_keyphrases"]) > 0]
+            if split_name == self.SAMPLE_SPLIT:
+                # fixed seed keeps the sample order reproducible
+                self.rnd = random.Random(RANDOM_SEED)
+                self.rnd.shuffle(kept_rows)
+            if split_name not in wanted_splits:
+                continue
+            self.dataset[split_name] = kept_rows
+    def _get_system_prompt_text(self, item: dict[str, Any]) -> str:
+        """Return the system prompt describing the abstractive keyphrase task."""
+        role = "You are an AI model tasked with generating abstractive keyphrases "
+        guidance = "that capture the main ideas of the text without using exact wording."
+        return role + guidance
+    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+        """Return the fixed instruction placed before the examples."""
+        instruction = "Paraphrase the following texts to improve clarity and relevance."
+        return instruction

eval_framework/tasks/benchmarks/flores200.py ADDED Viewed

@@ -0,0 +1,133 @@
+import os
+import random
+from pathlib import Path
+from typing import Any
+import pycountry
+from datasets import DownloadConfig, load_dataset
+from huggingface_hub import HfApi
+from huggingface_hub.errors import RevisionNotFoundError
+from eval_framework.metrics.completion.bleu import BLEU
+from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType
+# Language codes from which Flores200.SUBJECTS builds every ordered source-target pair.
+FLORES_LANGUAGES = [
+    "deu_Latn",
+    "eng_Latn",
+    "fin_Latn",
+    "fra_Latn",
+    "nld_Latn",
+]  # Note: there are many more languages in the dataset, but we only consider these for now
+class Flores200(BaseTask[str]):
+    """FLORES-200 dataset: https://huggingface.co/datasets/facebook/flores
+
+    Translation task; each subject is a ``<source>-<target>`` pair of language
+    codes and completions are scored with BLEU.
+    """
+    NAME = "FLoRes-200"
+    DATASET_PATH = "facebook/flores"
+    SAMPLE_SPLIT = "devtest"
+    FEWSHOT_SPLIT = "dev"
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [BLEU]
+    SUBJECTS = [f"{s}-{t}" for s in FLORES_LANGUAGES for t in FLORES_LANGUAGES if s != t]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["sentence"]
+    LANGUAGE = {
+        "deu_Latn": Language.DEU,
+        "eng_Latn": Language.ENG,
+        "fin_Latn": Language.FIN,
+        "fra_Latn": Language.FRA,
+        "nld_Latn": Language.NLD,
+    }
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Initialise the task; generation stops at the first newline."""
+        super().__init__(num_fewshot)
+        self.stop_sequences = ["\n"]
+    def _load_hf_dataset(self, **kwargs: Any) -> Any:
+        """Override to handle FLORES-200 encoding issues by using parquet files.
+
+        Args:
+            **kwargs: ``load_dataset`` arguments. ``path`` defaults to ``DATASET_PATH``;
+                ``name`` and ``split`` are forwarded when present.
+
+        Returns:
+            Whatever ``datasets.load_dataset`` returns for the request.
+
+        Raises:
+            RevisionNotFoundError: Re-raised when the ``HfApi.dataset_info`` check for
+                ``HF_REVISION`` reports it.
+        """
+        path = kwargs.get("path", self.DATASET_PATH)
+        # Check if the HF_REVISION is valid before loading the dataset
+        if self.HF_REVISION:
+            try:
+                HfApi().dataset_info(repo_id=path, revision=self.HF_REVISION, timeout=100.0)
+            except RevisionNotFoundError:
+                raise
+            except Exception:
+                # Best-effort check: any other failure is ignored here and left for the load below to surface.
+                pass
+        cache_dir: str = os.environ.get("HF_DATASET_CACHE_DIR", f"{Path.home()}/.cache/huggingface/datasets")
+        download_config = DownloadConfig(cache_dir=cache_dir, max_retries=5)
+        # First, try to load using parquet files to bypass the problematic loading script
+        try:
+            # Try loading without the loading script by using data_files
+            # This forces the dataset library to use the parquet files directly
+            return load_dataset(
+                path,
+                name=kwargs.get("name"),
+                split=kwargs.get("split"),
+                data_files=None,  # Let it auto-discover parquet files
+                revision=self.HF_REVISION,
+                trust_remote_code=False,  # Disable the loading script!
+                cache_dir=cache_dir,
+                download_config=download_config,
+            )
+        except Exception:
+            # If parquet loading fails, fall back to the original method that uses the loading script.
+            # NOTE(review): trust_remote_code=True executes code shipped with the dataset repository.
+            return load_dataset(
+                **{"path": path, **kwargs},
+                revision=self.HF_REVISION,
+                trust_remote_code=True,
+                cache_dir=cache_dir,
+                download_config=download_config,
+            )
+    def _load_dataset(self, subject: SubjectType) -> None:
+        """Load the ``all`` configuration and tag every item with ``subject``.
+
+        Only the sample and few-shot splits are kept; the sample split is shuffled
+        with a generator seeded by ``RANDOM_SEED``.
+        """
+        # Store the subject (language pair) for use in other methods
+        self.subject = subject
+        # For FLORES, we need to load the dataset once with all languages
+        # The subject (e.g., "eng_Latn-deu_Latn") determines which fields we use
+        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name="all")
+        self.dataset = {}
+        self.rnd = random.Random(RANDOM_SEED)
+        for split, data in hf_dataset.items():
+            data_list = list(data)
+            # Add the subject to each item so _get_instruction_text can use it
+            for item in data_list:
+                item["subject"] = subject
+            if split == self.SAMPLE_SPLIT:
+                self.rnd.shuffle(data_list)
+                self.dataset[split] = data_list
+            elif split == self.FEWSHOT_SPLIT:
+                self.dataset[split] = data_list
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Build the ``<source language> sentence: ...`` prompt ending in the target-language cue."""
+        language_keys = item["subject"].split("-")
+        source_key, target_key = language_keys[0], language_keys[1]
+        # The part before "_" is the alpha-3 code that pycountry resolves to a language name.
+        source_language = pycountry.languages.get(alpha_3=source_key.split("_")[0]).name
+        target_language = pycountry.languages.get(alpha_3=target_key.split("_")[0]).name
+        source = item[f"sentence_{source_key}"]
+        return f"{source_language} sentence: {source}\n{target_language} sentence:"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        """Return the sentence stored under the subject's target-language key."""
+        target_key = item["subject"].split("-")[1]
+        return item[f"sentence_{target_key}"]
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        """Return the ground-truth translation, prefixed with a space.
+
+        Raises:
+            AssertionError: If the ground truth is missing or not a string.
+        """
+        # Validate before formatting: an f-string is always a non-None str, so checks made
+        # on the formatted value could never fail and would let " None" through.
+        target = self._get_ground_truth(item)
+        assert target is not None
+        assert isinstance(target, str)
+        return f" {target}"
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        """Strip surrounding whitespace from the completion."""
+        return completion_text.strip()

eval_framework/tasks/benchmarks/flores_plus.py ADDED Viewed

@@ -0,0 +1,84 @@
+import random
+from itertools import product
+from typing import Any
+from eval_framework.metrics.completion.bleu import BLEU
+from eval_framework.metrics.completion.chrf import CHRF
+from eval_framework.metrics.completion.comet import COMET
+from eval_framework.shared.types import BaseMetricContext, UntemplatedPrompt
+from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
+# Maps a "<iso_639_3>_<iso_15924>" language code to the language name used in the translation prompt;
+# its keys also define the language pairs in FloresPlus.SUBJECTS.
+LANG_MAP = {
+    "deu_Latn": "German",
+    "eng_Latn": "English",
+    "fra_Latn": "French",
+    "ita_Latn": "Italian",
+    "nld_Latn": "Dutch",
+    "pol_Latn": "Polish",
+    "rus_Cyrl": "Russian",
+    "spa_Latn": "Spanish",
+    "ukr_Cyrl": "Ukrainian",
+}
+class FloresPlus(BaseTask[str]):
+    """Flores-Plus dataset: https://huggingface.co/datasets/openlanguagedata/flores_plus
+
+    Translation task; each subject is a ``<source>-<target>`` pair of language
+    codes. Source and target sentences come from two separately loaded
+    language configurations that are paired row by row.
+    """
+    NAME = "Flores-Plus"
+    DATASET_PATH = "openlanguagedata/flores_plus"
+    SAMPLE_SPLIT = "dev"
+    FEWSHOT_SPLIT = "devtest"
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [BLEU, CHRF, COMET]
+    SUBJECTS = [f"{s}-{t}" for s, t in product(LANG_MAP, LANG_MAP) if s != t]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["sentence"]
+    LANGUAGE = {
+        "deu_Latn": Language.DEU,
+        "eng_Latn": Language.ENG,
+        "fra_Latn": Language.FRA,
+        "ita_Latn": Language.ITA,
+        "nld_Latn": Language.NLD,
+        "pol_Latn": Language.POL,
+        "rus_Cyrl": Language.RUS,
+        "spa_Latn": Language.SPA,
+        "ukr_Cyrl": Language.UKR,
+    }
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Initialise the task; generation stops at the first newline."""
+        super().__init__(num_fewshot)
+        self.stop_sequences = ["\n"]
+    def _load_dataset(self, subject: str) -> None:
+        """Load source and target languages of ``subject`` and pair their rows.
+
+        Raises:
+            ValueError: If a split has a different number of rows in the two languages.
+            AssertionError: If paired rows do not share the same ``id``.
+        """
+        language_keys = subject.split("-")
+        hf_dataset_src = self._load_hf_dataset(path=self.DATASET_PATH, name=language_keys[0])
+        hf_dataset_tgt = self._load_hf_dataset(path=self.DATASET_PATH, name=language_keys[1])
+        self.dataset = {}
+        self.rnd = random.Random(42)
+        for split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
+            data_list = []
+            # strict=True: a length mismatch must fail loudly instead of silently dropping trailing rows.
+            for item_src, item_tgt in zip(hf_dataset_src[split], hf_dataset_tgt[split], strict=True):
+                assert item_src["id"] == item_tgt["id"]
+                data_list.append(
+                    {
+                        "iso_source": f"{item_src['iso_639_3']}_{item_src['iso_15924']}",
+                        "iso_target": f"{item_tgt['iso_639_3']}_{item_tgt['iso_15924']}",
+                        "source": item_src["text"],
+                        "target": item_tgt["text"],
+                    }
+                )
+            if split == self.SAMPLE_SPLIT:
+                self.rnd.shuffle(data_list)
+            self.dataset[split] = data_list
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Return the translation instruction naming the target language, followed by the source text."""
+        target_language = LANG_MAP[item["iso_target"]]
+        return f"Translate the following text into {target_language}:\n{item['source']}"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        """Return the target-language text of the item."""
+        return item["target"]
+    def _get_context(self, item: dict[str, Any]) -> BaseMetricContext | list[BaseMetricContext] | None:
+        """Return the source text wrapped in an ``UntemplatedPrompt`` metric context."""
+        return UntemplatedPrompt(untemplated_prompt=item["source"])
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        """Strip surrounding whitespace from the completion."""
+        return completion_text.strip()

eval_framework/tasks/benchmarks/gpqa.py ADDED Viewed

@@ -0,0 +1,201 @@
+import hashlib
+import logging
+import random
+import re
+from typing import Any
+from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.metrics.loglikelihood.confidence_weighted_accuracy import ConfidenceWeightedAccuracy
+from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore
+from eval_framework.metrics.loglikelihood.ternary import TernaryScore
+from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample, SubjectType
+from eval_framework.tasks.utils import get_n_letters
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+class GPQA(BaseTask[str]):
+    """GPQA dataset: https://huggingface.co/datasets/Idavidrein/gpqa"""
+    NAME = "GPQA"
+    DATASET_PATH = "Idavidrein/gpqa"
+    SAMPLE_SPLIT = "train"
+    FEWSHOT_SPLIT = "train"
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+    SUBJECTS = ["gpqa_extended"]  # ["gpqa_diamond", "gpqa_extended", "gpqa_main", "gpqa_experts"]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + get_n_letters(4)
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot)
+        self.stop_sequences = ["Question:"]
+        self.keys = get_n_letters(4)
+        self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
+        self.rnd_choice_shuffle = random.Random(RANDOM_SEED)
+    def _load_dataset(self, subject: SubjectType) -> None:
+        name = subject if subject != NO_SUBJECT else None
+        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=name)
+        self.dataset = {}
+        self.rnd = random.Random(RANDOM_SEED)
+        for split, data in hf_dataset.items():
+            data_list = list(data)
+            if split == self.SAMPLE_SPLIT:
+                self.rnd.shuffle(data_list)
+            if split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
+                # exclude in the GPQA dataset one of the sample that has an too long prompt (DNA sequence)
+                data_list_filtered = [
+                    item
+                    for item in data_list
+                    if item["Question"]
+                    != "Hello, you are embarking on a new project. You need to produce the HP1alpha protein in E. coli. Which of these plasmids will you choose?"  # noqa: E501
+                ]
+                if len(data_list) - len(data_list_filtered) > 0:
+                    logger.info(f"Excluded {len(data_list) - len(data_list_filtered)} samples from {split} split.")
+                assert len(data_list) - len(data_list_filtered) < 2, "we expect to remove max one item"
+                self.dataset[split] = data_list_filtered
+    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+        system_prompt_text = (
+            "Here are some example questions from experts. "
+            "An explanation is given before the final answer. "
+            "Answer the final question yourself, giving your reasoning beforehand."
+        )
+        return system_prompt_text
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        choices, _ = self._get_possible_completions_marked(item)
+        prompt = f"Question: {item['Question'].strip()}\n"
+        prompt += "\n".join(choices) + "\n"
+        return prompt
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        ground_truth = self._get_ground_truth(item)
+        assert ground_truth is not None
+        return f"{self._get_cue_text(item)}{ground_truth}"
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        return "Answer:"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        choices, correct_answer_position = self._get_possible_completions_marked(item)
+        answer_key = choices[correct_answer_position][:3]
+        return f" {answer_key}"
+    def _get_possible_completions_marked(self, item: dict[str, Any]) -> tuple[list[str], int]:
+        choices = [self._preprocess(item[f"Incorrect Answer {x}"]) for x in range(1, 4)]
+        correct_answer = self._preprocess(item["Correct Answer"])
+        # we want to be random, but always the same for the same input
+        # so we hash the string, which always give you the same seed
+        hash_object = hashlib.sha256(f"{choices} {correct_answer}".encode())
+        self.rnd_choice_shuffle.seed(int(hash_object.hexdigest(), 16))
+        self.rnd_choice_shuffle.shuffle(choices)
+        correct_answer_position = self.rnd_choice_shuffle.randint(0, 3)
+        choices.insert(correct_answer_position, correct_answer)
+        choices = [f"({self.keys[i]}) {choice}" for i, choice in enumerate(choices)]
+        return choices, correct_answer_position
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        return [f" ({x})" for x in self.keys]
+    @staticmethod
+    def _preprocess(text: str | None) -> str:
+        if text is None:
+            return " "
+        text = text.strip()
+        text = text.replace(" [title]", ". ")
+        text = re.sub("\\[.*?\\]", "", text)
+        text = text.replace("  ", " ")
+        return text
+class GPQA_IDK(GPQA):
+    NAME = "GPQA_IDK"
+    METRICS = [
+        AccuracyLoglikelihood,
+        AccuracyNormLoglikelihood,
+        ConfidenceWeightedAccuracy,
+        DistributionalCorrectnessScore,
+        TernaryScore,
+    ]
+    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+        return (
+            "Answer only if you are confident, since mistakes may be penalised, while correct answers receive points. "
+            "It is acceptable to answer with '?' if you are unsure, and you will receive 0 points."
+        )
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        completions = super()._get_possible_completions(item)
+        return (completions or []) + [" (?)"]
+class GPQA_COT(GPQA):
+    NAME = "GPQA_COT"
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [AccuracyCompletion]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Therefore", "the", "answer", "is", "ANSWER_LETTER"] + get_n_letters(
+        4
+    )
+    ANS_RE = re.compile(r"Therefore, the answer is \(([ABCDEFGHIJ])\)")
+    def __init__(self, num_fewshot: int = 0) -> None:
+        assert num_fewshot == 0, "Fewshot is not supported for GPQA_COT"
+        super().__init__(num_fewshot)
+        self.stop_sequences: list[str] = ["Question:"]
+        self.keys = get_n_letters(4)
+        self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
+        self.rnd_choice_shuffle = random.Random(RANDOM_SEED)
+    def _extract_answer(self, completion: str) -> str:
+        match = self.ANS_RE.search(completion)
+        if match:
+            match_str = match.group(1)
+            return match_str
+        else:
+            return "[invalid]"
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        for stop_sequence in self.stop_sequences:
+            if stop_sequence in completion_text:
+                completion_text = completion_text.split(stop_sequence)[0]
+        return self._extract_answer(completion_text)
+    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+        return ""
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        # using the reasoning prompt from "Figure 44 of Tülu 3 paper: https://arxiv.org/pdf/2411.15124"
+        choices, _ = self._get_possible_completions_marked(item)
+        instruction_text = (
+            "Answer the following multiple-choice question by giving the correct answer letter in parentheses. "
+            "Provide CONCISE reasoning for the answer, and make sure to finish the response with "
+            '"Therefore, the answer is (ANSWER_LETTER)" where (ANSWER_LETTER) is one of (A), (B), (C), (D), (E), etc.'
+        )
+        instruction_text += f"\n\nQuestion: {item['Question'].strip()}\n"
+        instruction_text += "\n".join(choices)
+        instruction_text += (
+            "\n\nAnswer the above question and REMEMBER to finish your response with the exact phrase "
+            '"Therefore, the answer is (ANSWER_LETTER)" where (ANSWER_LETTER) is one of (A), (B), (C), (D), (E), etc.'
+        )
+        return instruction_text
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        return ""
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        choices, correct_answer_position = self._get_possible_completions_marked(item)
+        # index 1 selects the letter
+        answer_key = choices[correct_answer_position][1]
+        return answer_key