PyPI - eval-framework - Versions diffs - 0.2.0__py3-none-any.whl - Mend

eval-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (161) hide show

eval_framework/__init__.py +7 -0
eval_framework/base_config.py +36 -0
eval_framework/context/__init__.py +0 -0
eval_framework/context/determined.py +170 -0
eval_framework/context/eval.py +114 -0
eval_framework/context/local.py +52 -0
eval_framework/evaluation_generator.py +231 -0
eval_framework/exceptions.py +2 -0
eval_framework/external/ifeval_impl/README.md +5 -0
eval_framework/external/ifeval_impl/instructions.py +1523 -0
eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
eval_framework/external/ifeval_impl/utils.py +135 -0
eval_framework/llm/__init__.py +0 -0
eval_framework/llm/aleph_alpha.py +323 -0
eval_framework/llm/base.py +58 -0
eval_framework/llm/huggingface.py +332 -0
eval_framework/llm/mistral.py +73 -0
eval_framework/llm/models.py +16 -0
eval_framework/llm/openai.py +205 -0
eval_framework/llm/vllm.py +438 -0
eval_framework/logger.py +3 -0
eval_framework/main.py +187 -0
eval_framework/metrics/__init__.py +0 -0
eval_framework/metrics/base.py +40 -0
eval_framework/metrics/completion/__init__.py +1 -0
eval_framework/metrics/completion/accuracy_completion.py +16 -0
eval_framework/metrics/completion/bleu.py +76 -0
eval_framework/metrics/completion/chrf.py +62 -0
eval_framework/metrics/completion/code_assertion.py +44 -0
eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
eval_framework/metrics/completion/comet.py +56 -0
eval_framework/metrics/completion/concordance_index.py +38 -0
eval_framework/metrics/completion/csv_format.py +102 -0
eval_framework/metrics/completion/cwe_accuracy.py +49 -0
eval_framework/metrics/completion/exponential_similarity.py +65 -0
eval_framework/metrics/completion/f1.py +42 -0
eval_framework/metrics/completion/format_checker.py +56 -0
eval_framework/metrics/completion/grid_difference.py +77 -0
eval_framework/metrics/completion/ifeval.py +73 -0
eval_framework/metrics/completion/json_format.py +171 -0
eval_framework/metrics/completion/language_checker.py +74 -0
eval_framework/metrics/completion/length_control.py +83 -0
eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
eval_framework/metrics/completion/niah_accuracy.py +163 -0
eval_framework/metrics/completion/placeholder_checker.py +27 -0
eval_framework/metrics/completion/repetition.py +88 -0
eval_framework/metrics/completion/rouge_1.py +35 -0
eval_framework/metrics/completion/rouge_2.py +45 -0
eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
eval_framework/metrics/completion/rouge_l.py +52 -0
eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
eval_framework/metrics/completion/ter.py +67 -0
eval_framework/metrics/completion/text_counter.py +182 -0
eval_framework/metrics/efficiency/__init__.py +0 -0
eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
eval_framework/metrics/llm/__init__.py +0 -0
eval_framework/metrics/llm/base.py +8 -0
eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
eval_framework/metrics/llm/graders/language.py +56 -0
eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
eval_framework/metrics/llm/graders/models.py +74 -0
eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
eval_framework/metrics/llm/llm_judge_sql.py +394 -0
eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
eval_framework/metrics/loglikelihood/__init__.py +0 -0
eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
eval_framework/py.typed +0 -0
eval_framework/response_generator.py +416 -0
eval_framework/result_processors/__init__.py +0 -0
eval_framework/result_processors/base.py +74 -0
eval_framework/result_processors/hf_processor.py +87 -0
eval_framework/result_processors/result_processor.py +129 -0
eval_framework/run.py +314 -0
eval_framework/run_direct.py +42 -0
eval_framework/shared/types.py +227 -0
eval_framework/tasks/__init__.py +6 -0
eval_framework/tasks/base.py +314 -0
eval_framework/tasks/benchmarks/__init__.py +0 -0
eval_framework/tasks/benchmarks/arc.py +46 -0
eval_framework/tasks/benchmarks/arc_de.py +46 -0
eval_framework/tasks/benchmarks/arc_fi.py +46 -0
eval_framework/tasks/benchmarks/belebele.py +60 -0
eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
eval_framework/tasks/benchmarks/casehold.py +47 -0
eval_framework/tasks/benchmarks/chembench.py +85 -0
eval_framework/tasks/benchmarks/copa.py +39 -0
eval_framework/tasks/benchmarks/duc.py +91 -0
eval_framework/tasks/benchmarks/flores200.py +62 -0
eval_framework/tasks/benchmarks/flores_plus.py +84 -0
eval_framework/tasks/benchmarks/gpqa.py +177 -0
eval_framework/tasks/benchmarks/gsm8k.py +148 -0
eval_framework/tasks/benchmarks/hellaswag.py +44 -0
eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
eval_framework/tasks/benchmarks/humaneval.py +97 -0
eval_framework/tasks/benchmarks/ifeval.py +78 -0
eval_framework/tasks/benchmarks/include.py +119 -0
eval_framework/tasks/benchmarks/infinitebench.py +302 -0
eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
eval_framework/tasks/benchmarks/mbpp.py +192 -0
eval_framework/tasks/benchmarks/mmlu.py +190 -0
eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
eval_framework/tasks/benchmarks/mmmlu.py +529 -0
eval_framework/tasks/benchmarks/openbookqa.py +37 -0
eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
eval_framework/tasks/benchmarks/pawsx.py +65 -0
eval_framework/tasks/benchmarks/piqa.py +39 -0
eval_framework/tasks/benchmarks/quality.py +56 -0
eval_framework/tasks/benchmarks/sciq.py +44 -0
eval_framework/tasks/benchmarks/sphyr.py +75 -0
eval_framework/tasks/benchmarks/squad.py +89 -0
eval_framework/tasks/benchmarks/struct_eval.py +110 -0
eval_framework/tasks/benchmarks/tablebench.py +117 -0
eval_framework/tasks/benchmarks/triviaqa.py +42 -0
eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
eval_framework/tasks/benchmarks/winogender.py +39 -0
eval_framework/tasks/benchmarks/winogrande.py +44 -0
eval_framework/tasks/benchmarks/winox.py +57 -0
eval_framework/tasks/benchmarks/wmt.py +160 -0
eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
eval_framework/tasks/eval_config.py +112 -0
eval_framework/tasks/perturbation.py +83 -0
eval_framework/tasks/registry.py +186 -0
eval_framework/tasks/task_loader.py +80 -0
eval_framework/tasks/task_names.py +138 -0
eval_framework/tasks/utils.py +578 -0
eval_framework/utils/constants.py +9 -0
eval_framework/utils/generate_task_docs.py +229 -0
eval_framework/utils/helpers.py +3 -0
eval_framework/utils/logging.py +50 -0
eval_framework/utils/packaging.py +52 -0
eval_framework-0.2.0.dist-info/METADATA +514 -0
eval_framework-0.2.0.dist-info/RECORD +161 -0
eval_framework-0.2.0.dist-info/WHEEL +4 -0
eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
template_formatting/README.md +83 -0
template_formatting/__init__.py +0 -0
template_formatting/formatter.py +536 -0
template_formatting/mistral_formatter.py +159 -0
template_formatting/py.typed +0 -0
template_formatting/tests/test_formatter_eval.py +408 -0
template_formatting/tests/test_formatter_scaling.py +253 -0
template_formatting/tests/test_mistral_formatter.py +136 -0

eval_framework/tasks/benchmarks/mbpp.py ADDED Viewed

@@ -0,0 +1,192 @@
+import ast
+import logging
+import re
+from typing import Any
+from eval_framework.metrics.completion.code_assertion import (
+    CodeCompletionAssertion,
+)
+from eval_framework.shared.types import BaseMetricContext
+from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
+# Module-level logger named after this module.
+logger = logging.getLogger(__name__)
+# Opening markdown fence of a Python code block; MBPP returns it as the cue text.
+BEGIN = "```python"
+# Closing markdown fence of a code block; MBPP uses it as the stop sequence.
+END = "```"
+class MBPPMetricContext(BaseMetricContext):
+    """Metric context attached to MBPP samples."""
+    # Test statements of a sample as one string (MBPP._get_context joins item["test_list"] with newlines).
+    tests_code: str
+class MBPP(BaseTask[str]):
+    """
+    MBPP provides both the problem statement and the test cases upfront. It says, "Here's the problem and here are the
+    tests; write code that passes them.". Note that LLMs can cheat and only write code that passes the tests without
+    solving the given problem.
+    MBPP_PROMPT_WITHOUT_TESTS, on the other hand, only gives you the problem statement and function signature
+    initially. It says, "Here's the problem and function signature; write code, then we'll run tests later."
+    """
+    NAME = "MBPP"
+    DATASET_PATH = "google-research-datasets/mbpp"
+    SAMPLE_SPLIT = "test"
+    FEWSHOT_SPLIT = "train"
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [CodeCompletionAssertion]
+    SUBJECTS = ["full"]  # , "sanitized"]  # these are HF dataset SUBSETS!
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot)
+        self.stop_sequences = [END]
+    @staticmethod
+    def _code_expander(code: str, gt_asserts: str) -> str:
+        """
+        code variable carries the LLM-generated code snippet. We append the asserts for code testing
+        here. If no valid code is found in the LLM output, this function is not called.
+        Important: gt_asserts come as a stringiied list of assert strings. We safely reconvert this string
+        back to the list of of individual assert statements (also strings) by ast.literal_eval
+        """
+        if not gt_asserts:  # no ground truth (data asserts) are given, we return the original code
+            return code
+        gt_asserts = ast.literal_eval(gt_asserts)  # never use eval!
+        if not isinstance(gt_asserts, list):
+            logger.info("*** WARNING, we expect a list of ground truth asserts here! Sample can not be finalized")
+            return code
+        postfix = ""
+        stacked_asserts = ""
+        for gt_assert in gt_asserts:
+            stacked_asserts += "    " + gt_assert + "\n"
+        postfix = "try:\n" + stacked_asserts + "    score = True\nexcept:\n    score = False\nprint(score)"
+        return code + postfix
+    @staticmethod
+    def _get_function_name(line: str) -> str:
+        match = re.search(r"def\s+(\w+)\s*\(", line)
+        function_name = ""
+        if match:
+            function_name = match.group(1)
+        return function_name
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """
+        Passing selected task and tests depending on zero or few-shot setting
+        """
+        tests = "\n".join(item["test_list"])
+        text = item["text"] if "text" in item else item["prompt"]
+        instruction_text = f"You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n{tests}\n"  # noqa E501
+        return instruction_text
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        return BEGIN
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        """
+        asserts are being passed as ground_truth, this is expected by CodeCompletionAssertion metrics
+        """
+        return f"{item['test_list']}"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        target = item["code"]
+        assert target is not None
+        assert isinstance(target, str)
+        return f"{BEGIN}\n" + target + f"\n{END}"
+    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
+        fewshot_examples = self.rnd.sample(self.dataset[self.FEWSHOT_SPLIT], self.num_fewshot)
+        return fewshot_examples
+    def _get_context(self, item: dict[str, Any]) -> MBPPMetricContext:
+        return MBPPMetricContext(tests_code="\n".join(item["test_list"]))
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        assert sample is not None
+        if BEGIN in completion_text:
+            completion_text = completion_text.split(f"{BEGIN}\n")[1]
+        if END in completion_text:
+            completion_text = completion_text.split(END)[0]
+        extracted_code = completion_text + "\n"
+        mbpp_ground_truth = str(sample.ground_truth)
+        code = self._code_expander(extracted_code, mbpp_ground_truth)
+        return code
+class MBPP_SANITIZED(MBPP):
+    """MBPP variant that selects the "sanitized" subject instead of "full"."""
+    # NOTE(review): spelled "SANITZED" (missing "I"), unlike the class name. Confirm whether existing
+    # configurations rely on this spelling before correcting it.
+    NAME = "MBPP_SANITZED"
+    SUBJECTS = ["sanitized"]
+class MBPP_PROMPT_WITHOUT_TESTS(MBPP):
+    """
+    MBPP provides both the problem statement and the test cases upfront. It says, "Here's the problem and here are the
+    tests; write code that passes them.". Note that LLMs can cheat and only write code that passes the tests without
+    solving the given problem.
+    MBPP_PROMPT_WITHOUT_TESTS, on the other hand, only gives you the problem statement and function signature
+    initially. It says, "Here's the problem and function signature; write code, then we'll run tests later."
+    """
+    NAME = "MBPP_PROMPT_WITHOUT_TESTS"
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """
+        Passing selected task and tests depending on zero or few-shot setting
+        """
+        text = item["text"] if "text" in item else item["prompt"]
+        instruction_text = f"You are an expert Python programmer, and here is your task: {text}\n\n"  # noqa E501
+        return instruction_text
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        function_header = self._get_function_header(item["code"])
+        return f"{BEGIN}\n{function_header}"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        target = item["code"]
+        assert target is not None
+        assert isinstance(target, str)
+        return f"{BEGIN}\n" + target + f"\n{END}"
+    @staticmethod
+    def _get_function_header(line: str) -> str:
+        match = re.search(r"^\s*def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(.*?\)\s*:", line, re.MULTILINE)
+        postfix = ""
+        if match is not None:  # extract up to next open parenthesis in the found substring
+            postfix = line[match.start() :]
+            match = re.search(r"\)", postfix)
+            if match is not None:
+                end = match.start()
+                postfix = postfix[: end + 1]
+            else:
+                postfix = ""
+        if postfix == "":
+            return postfix
+        return f"{postfix.strip()}:"
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        assert sample is not None
+        if BEGIN in completion_text:
+            completion_text = completion_text.split(BEGIN)[1]
+        if END in completion_text:
+            completion_text = completion_text.split(END)[0]
+        extracted_code = completion_text + "\n"
+        mbpp_ground_truth = str(sample.ground_truth)
+        function_header = self._get_function_header(sample.messages[-1].content)
+        code = self._code_expander(extracted_code, mbpp_ground_truth)
+        return function_header + code
+class MBPP_PROMPT_WITHOUT_TESTS_SANITIZED(MBPP_PROMPT_WITHOUT_TESTS):
+    """MBPP_PROMPT_WITHOUT_TESTS variant that selects the "sanitized" subject."""
+    NAME = "MBPP_PROMPT_WITHOUT_TESTS_SANITIZED"
+    SUBJECTS = ["sanitized"]

eval_framework/tasks/benchmarks/mmlu.py ADDED Viewed

@@ -0,0 +1,190 @@
+import re
+from typing import Any
+from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.utils import get_n_letters
+# Subject identifiers of the MMLU task; assigned to MMLU.SUBJECTS below.
+MMLU_SUBJECTS = [
+    "abstract_algebra",
+    "anatomy",
+    "astronomy",
+    "business_ethics",
+    "clinical_knowledge",
+    "college_biology",
+    "college_chemistry",
+    "college_computer_science",
+    "college_mathematics",
+    "college_medicine",
+    "college_physics",
+    "computer_security",
+    "conceptual_physics",
+    "econometrics",
+    "electrical_engineering",
+    "elementary_mathematics",
+    "formal_logic",
+    "global_facts",
+    "high_school_biology",
+    "high_school_chemistry",
+    "high_school_computer_science",
+    "high_school_european_history",
+    "high_school_geography",
+    "high_school_government_and_politics",
+    "high_school_macroeconomics",
+    "high_school_mathematics",
+    "high_school_microeconomics",
+    "high_school_physics",
+    "high_school_psychology",
+    "high_school_statistics",
+    "high_school_us_history",
+    "high_school_world_history",
+    "human_aging",
+    "human_sexuality",
+    "international_law",
+    "jurisprudence",
+    "logical_fallacies",
+    "machine_learning",
+    "management",
+    "marketing",
+    "medical_genetics",
+    "miscellaneous",
+    "moral_disputes",
+    "moral_scenarios",
+    "nutrition",
+    "philosophy",
+    "prehistory",
+    "professional_accounting",
+    "professional_law",
+    "professional_medicine",
+    "professional_psychology",
+    "public_relations",
+    "security_studies",
+    "sociology",
+    "us_foreign_policy",
+    "virology",
+    "world_religions",
+]
+class MMLU(BaseTask[str]):
+    """MMLU dataset: https://huggingface.co/datasets/cais/mmlu"""
+    NAME = "MMLU"
+    DATASET_PATH = "cais/mmlu"
+    SAMPLE_SPLIT = "test"
+    FEWSHOT_SPLIT = "dev"
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+    SUBJECTS = MMLU_SUBJECTS
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"] + get_n_letters(4)
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot)
+        self.keys = get_n_letters(4)
+    def _get_subject_name(self, item: dict[str, Any]) -> str:
+        return " ".join(item["subject"].split("_"))
+    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+        return f"The following are multiple choice questions (with answers) about {self._get_subject_name(item)}."
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        question = item["question"].strip()
+        choices = "".join([f"{key}. {choice}\n" for key, choice in zip(self.keys, item["choices"])])
+        return f"Question: {question}\n{choices}"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        ground_truth = self._get_ground_truth(item)
+        assert ground_truth is not None
+        return f"{self._get_cue_text(item)}{ground_truth}"
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        return "Answer:"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        return f" {self.keys[item['answer']]}"
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        return [f" {key}" for key in self.keys]
+class FullTextMMLU(MMLU):
+    """MMLU dataset but where the model is expected to replicate choice text, rather than just the key."""
+    NAME = "Full Text MMLU"
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "answers"] + get_n_letters(4)
+    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+        subject_name = self._get_subject_name(item)
+        return f"""The following are multiple choice questions (with possible answers) about {subject_name}.
+Answer with the full text of the correct answer."""
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        question = item["question"].strip()
+        choices = "".join([f"- {choice}\n" for choice in item["choices"]])
+        return f"Question: {question}\nPossible answers:\n{choices}"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        return f" {item['choices'][item['answer']]}"
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        return [f" {choice}" for choice in item["choices"]]
+class MMLU_COT(MMLU):
+    """
+    MMLU dataset with instruction to summarize reasoning and conclude with answer.
+    Inspired by https://arxiv.org/pdf/2411.15124 (Table 44)
+    """
+    NAME = "MMLU_COT"
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [AccuracyCompletion]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Therefore", "the", "answer", "is", "ANSWER_LETTER"] + get_n_letters(
+        4
+    )
+    ANS_RE = re.compile(r"Therefore, the answer is: ([ABCD])")
+    def __init__(self, num_fewshot: int = 0) -> None:
+        assert num_fewshot == 0, "Fewshot is not supported for MMLU_COT"
+        super().__init__(num_fewshot)
+        self.stop_sequences: list[str] = ["Question:"]
+    def _extract_answer(self, completion: str) -> str:
+        match = self.ANS_RE.search(completion)
+        if match:
+            match_str = match.group(1)
+            return match_str
+        else:
+            return "[invalid]"
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        return ""
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        return self.keys[item["answer"]]
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        for stop_sequence in self.stop_sequences:
+            if stop_sequence in completion_text:
+                completion_text = completion_text.split(stop_sequence)[0]
+        return self._extract_answer(completion_text)
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        question = item["question"].strip()
+        choices = "\n".join([f"{key}. {choice}" for key, choice in zip(self.keys, item["choices"])])
+        return f"Question: {question}\n{choices}"
+    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+        return (
+            f"The following are multiple choice questions about {self._get_subject_name(item)}. "
+            'Summarize your reasoning concisely, then conclude with "Therefore, the answer is: X", where X is '
+            "one of A, B, C, or D."
+        )

eval_framework/tasks/benchmarks/mmlu_de.py ADDED Viewed

@@ -0,0 +1,109 @@
+from typing import Any
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.tasks.base import BaseTask, Language, ResponseType
+from eval_framework.tasks.utils import get_n_letters
+# Maps each MMLU subject identifier to the German subject name inserted into the MMLU_DE prompt.
+# The keys also define MMLU_DE.SUBJECTS.
+MMLU_SUBJECTS_TRANSLATION = {
+    "abstract_algebra": "Abstrakte Algebra",
+    "anatomy": "Anatomie",
+    "astronomy": "Astronomie",
+    "business_ethics": "Wirtschaftsethik",
+    "clinical_knowledge": "Klinisches Wissen",
+    "college_biology": "Hochschulbiologie",
+    "college_chemistry": "Hochschulchemie",
+    "college_computer_science": "Hochschulinformatik",
+    "college_mathematics": "Hochschulmathematik",
+    "college_medicine": "Hochschulmedizin",
+    "college_physics": "Hochschulphysik",
+    "computer_security": "Computersicherheit",
+    "conceptual_physics": "Konzeptuelle Physik",
+    "econometrics": "Ökonometrie",
+    "electrical_engineering": "Elektrotechnik",
+    "elementary_mathematics": "Elementarmathematik",
+    "formal_logic": "Formale Logik",
+    "global_facts": "Globale Fakten",
+    "high_school_biology": "Gymnasialbiologie",
+    "high_school_chemistry": "Gymnasialchemie",
+    "high_school_computer_science": "Gymnasiale Informatik",
+    "high_school_european_history": "Gymnasiale Europäische Geschichte",
+    "high_school_geography": "Gymnasiale Geographie",
+    "high_school_government_and_politics": "Gymnasiale Regierung und Politik",
+    "high_school_macroeconomics": "Gymnasiale Makroökonomie",
+    "high_school_mathematics": "Gymnasialmathematik",
+    "high_school_microeconomics": "Gymnasiale Mikroökonomie",
+    "high_school_physics": "Gymnasialphysik",
+    "high_school_psychology": "Gymnasialpsychologie",
+    "high_school_statistics": "Gymnasialstatistik",
+    "high_school_us_history": "Gymnasiale US-Geschichte",
+    "high_school_world_history": "Gymnasiale Weltgeschichte",
+    "human_aging": "Menschliches Altern",
+    "human_sexuality": "Menschliche Sexualität",
+    "international_law": "Internationales Recht",
+    "jurisprudence": "Rechtswissenschaft",
+    "logical_fallacies": "Logische Fehlschlüsse",
+    "machine_learning": "Maschinelles Lernen",
+    "management": "Management",
+    "marketing": "Marketing",
+    "medical_genetics": "Medizinische Genetik",
+    "miscellaneous": "Verschiedenes",
+    "moral_disputes": "Moralische Streitfragen",
+    "moral_scenarios": "Moralische Szenarien",
+    "nutrition": "Ernährung",
+    "philosophy": "Philosophie",
+    "prehistory": "Urgeschichte",
+    "professional_accounting": "Berufliche Buchhaltung",
+    "professional_law": "Berufliches Recht",
+    "professional_medicine": "Berufliche Medizin",
+    "professional_psychology": "Berufliche Psychologie",
+    "public_relations": "Öffentlichkeitsarbeit",
+    "security_studies": "Sicherheitsstudien",
+    "sociology": "Soziologie",
+    "us_foreign_policy": "US-Außenpolitik",
+    "virology": "Virologie",
+    "world_religions": "Weltreligionen",
+}
+class MMLU_DE(BaseTask[str]):
+    """MMLU DE dataset: https://huggingface.co/datasets/LeoLM/MMLU_de"""
+    NAME = "MMLU_DE"
+    DATASET_PATH = "LeoLM/MMLU_de"
+    SAMPLE_SPLIT = "test"
+    FEWSHOT_SPLIT = "validation"
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+    SUBJECTS = list(MMLU_SUBJECTS_TRANSLATION.keys())
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Frage"] + get_n_letters(4)
+    LANGUAGE = Language.DEU
+    def __init__(self, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot)
+        self.keys = get_n_letters(4)
+    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+        return f"Die folgenden sind Multiple Choice Fragen (mit Antworten) über {MMLU_SUBJECTS_TRANSLATION[item['subject']]}."  # noqa: E501
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        question = item["question_de"].strip()
+        choices = "".join([f"{key}. {choice}\n" for key, choice in zip(self.keys, item["choices_de"])])
+        return f"Frage: {question}\n{choices}"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        ground_truth = self._get_ground_truth(item)
+        assert ground_truth is not None
+        return f"{self._get_cue_text(item)}{ground_truth}"
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        return "Antwort:"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        return f" {self.keys[item['answer']]}"
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        return [f" {key}" for key in self.keys]

eval_framework/tasks/benchmarks/mmlu_pro.py ADDED Viewed

@@ -0,0 +1,139 @@
+import random
+import re
+from typing import Any
+from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.utils import get_n_letters
+# Subjects of the MMLU Pro task; MMLU_PRO._load_dataset compares them against the dataset's "category" field.
+MMLU_PRO_SUBJECTS = [
+    "engineering",
+    "physics",
+    "psychology",
+    "chemistry",
+    "biology",
+    "law",
+    "philosophy",
+    "computer science",
+    "other",
+    "economics",
+    "business",
+    "history",
+    "math",
+    "health",
+]
+class MMLU_PRO(BaseTask[str]):
+    """MMLU_PRO dataset: https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro"""
+    NAME = "MMLU Pro"
+    DATASET_PATH = "TIGER-Lab/MMLU-Pro"
+    SAMPLE_SPLIT = "test"
+    FEWSHOT_SPLIT = "test"
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+    SUBJECTS = MMLU_PRO_SUBJECTS
+    PERTURBATION_UNMODIFIABLE_WORDS = get_n_letters(10)
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot)
+        self.keys = get_n_letters(10)
+    def _load_dataset(self, subject: str) -> None:
+        name = subject if subject != NO_SUBJECT else None
+        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH)
+        hf_dataset = hf_dataset.filter(lambda example: example["category"] == name)
+        self.dataset = {}
+        for split, data in hf_dataset.items():
+            data_list = list(data)
+            assert len(data_list) > 0
+            if split == self.SAMPLE_SPLIT:
+                self.rnd = random.Random(RANDOM_SEED)
+                self.rnd.shuffle(data_list)
+            if split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
+                self.dataset[split] = data_list
+    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+        return f"The following are multiple choice questions (with answers) about {item['subject']}."
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        instruction_text = item["question"].strip() + "\n"
+        instruction_text += "".join([f"{key}. {choice}\n" for key, choice in zip(self.keys, item["options"])])
+        return instruction_text
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        ground_truth = self._get_ground_truth(item)
+        assert ground_truth is not None
+        return f"{self._get_cue_text(item)}{ground_truth}"
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        return "Answer:"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        return f" {self.keys[item['answer_index']]}"
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        return [f" {key}" for key in self.keys]
+class MMLU_PRO_COT(MMLU_PRO):
+    NAME = "MMLU_PRO_COT"
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [AccuracyCompletion]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Therefore", "the", "answer", "is", "ANSWER_LETTER"] + get_n_letters(
+        4
+    )
+    ANS_RE = re.compile(r"Therefore, the answer is \(([ABCDEFGHIJ])\)")
+    def __init__(self, num_fewshot: int = 0) -> None:
+        assert num_fewshot == 0, "Fewshot is not supported for MMLU_PRO_COT"
+        super().__init__(num_fewshot)
+        self.stop_sequences: list[str] = ["Question:"]
+    def _extract_answer(self, completion: str) -> str:
+        match = self.ANS_RE.search(completion)
+        if match:
+            match_str = match.group(1)
+            return match_str
+        else:
+            return "[invalid]"
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        return ""
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        return self.keys[item["answer_index"]]
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        for stop_sequence in self.stop_sequences:
+            if stop_sequence in completion_text:
+                completion_text = completion_text.split(stop_sequence)[0]
+        return self._extract_answer(completion_text)
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        # using the reasoning prompt from "Figure 44 of Tülu 3 paper: https://arxiv.org/pdf/2411.15124"
+        instruction_text = (
+            "Answer the following multiple-choice question by giving the correct answer letter in parentheses. "
+            "Provide CONCISE reasoning for the answer, and make sure to finish the response with "
+            '"Therefore, the answer is (ANSWER_LETTER)" where (ANSWER_LETTER) is one of (A), (B), (C), (D), (E), etc.'
+        )
+        instruction_text += f"\n\nQuestion: {item['question'].strip()}\n"
+        instruction_text += "\n".join([f"({key}) {choice}" for key, choice in zip(self.keys, item["options"])])
+        instruction_text += (
+            "\n\nAnswer the above question and REMEMBER to finish your response with the exact phrase "
+            '"Therefore, the answer is (ANSWER_LETTER)" where (ANSWER_LETTER) is one of (A), (B), (C), (D), (E), etc.'
+        )
+        return instruction_text