PyPI - eval-framework - Versions diffs - 0.2.0__py3-none-any.whl - Mend

eval-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (161) hide show

eval_framework/__init__.py +7 -0
eval_framework/base_config.py +36 -0
eval_framework/context/__init__.py +0 -0
eval_framework/context/determined.py +170 -0
eval_framework/context/eval.py +114 -0
eval_framework/context/local.py +52 -0
eval_framework/evaluation_generator.py +231 -0
eval_framework/exceptions.py +2 -0
eval_framework/external/ifeval_impl/README.md +5 -0
eval_framework/external/ifeval_impl/instructions.py +1523 -0
eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
eval_framework/external/ifeval_impl/utils.py +135 -0
eval_framework/llm/__init__.py +0 -0
eval_framework/llm/aleph_alpha.py +323 -0
eval_framework/llm/base.py +58 -0
eval_framework/llm/huggingface.py +332 -0
eval_framework/llm/mistral.py +73 -0
eval_framework/llm/models.py +16 -0
eval_framework/llm/openai.py +205 -0
eval_framework/llm/vllm.py +438 -0
eval_framework/logger.py +3 -0
eval_framework/main.py +187 -0
eval_framework/metrics/__init__.py +0 -0
eval_framework/metrics/base.py +40 -0
eval_framework/metrics/completion/__init__.py +1 -0
eval_framework/metrics/completion/accuracy_completion.py +16 -0
eval_framework/metrics/completion/bleu.py +76 -0
eval_framework/metrics/completion/chrf.py +62 -0
eval_framework/metrics/completion/code_assertion.py +44 -0
eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
eval_framework/metrics/completion/comet.py +56 -0
eval_framework/metrics/completion/concordance_index.py +38 -0
eval_framework/metrics/completion/csv_format.py +102 -0
eval_framework/metrics/completion/cwe_accuracy.py +49 -0
eval_framework/metrics/completion/exponential_similarity.py +65 -0
eval_framework/metrics/completion/f1.py +42 -0
eval_framework/metrics/completion/format_checker.py +56 -0
eval_framework/metrics/completion/grid_difference.py +77 -0
eval_framework/metrics/completion/ifeval.py +73 -0
eval_framework/metrics/completion/json_format.py +171 -0
eval_framework/metrics/completion/language_checker.py +74 -0
eval_framework/metrics/completion/length_control.py +83 -0
eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
eval_framework/metrics/completion/niah_accuracy.py +163 -0
eval_framework/metrics/completion/placeholder_checker.py +27 -0
eval_framework/metrics/completion/repetition.py +88 -0
eval_framework/metrics/completion/rouge_1.py +35 -0
eval_framework/metrics/completion/rouge_2.py +45 -0
eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
eval_framework/metrics/completion/rouge_l.py +52 -0
eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
eval_framework/metrics/completion/ter.py +67 -0
eval_framework/metrics/completion/text_counter.py +182 -0
eval_framework/metrics/efficiency/__init__.py +0 -0
eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
eval_framework/metrics/llm/__init__.py +0 -0
eval_framework/metrics/llm/base.py +8 -0
eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
eval_framework/metrics/llm/graders/language.py +56 -0
eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
eval_framework/metrics/llm/graders/models.py +74 -0
eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
eval_framework/metrics/llm/llm_judge_sql.py +394 -0
eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
eval_framework/metrics/loglikelihood/__init__.py +0 -0
eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
eval_framework/py.typed +0 -0
eval_framework/response_generator.py +416 -0
eval_framework/result_processors/__init__.py +0 -0
eval_framework/result_processors/base.py +74 -0
eval_framework/result_processors/hf_processor.py +87 -0
eval_framework/result_processors/result_processor.py +129 -0
eval_framework/run.py +314 -0
eval_framework/run_direct.py +42 -0
eval_framework/shared/types.py +227 -0
eval_framework/tasks/__init__.py +6 -0
eval_framework/tasks/base.py +314 -0
eval_framework/tasks/benchmarks/__init__.py +0 -0
eval_framework/tasks/benchmarks/arc.py +46 -0
eval_framework/tasks/benchmarks/arc_de.py +46 -0
eval_framework/tasks/benchmarks/arc_fi.py +46 -0
eval_framework/tasks/benchmarks/belebele.py +60 -0
eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
eval_framework/tasks/benchmarks/casehold.py +47 -0
eval_framework/tasks/benchmarks/chembench.py +85 -0
eval_framework/tasks/benchmarks/copa.py +39 -0
eval_framework/tasks/benchmarks/duc.py +91 -0
eval_framework/tasks/benchmarks/flores200.py +62 -0
eval_framework/tasks/benchmarks/flores_plus.py +84 -0
eval_framework/tasks/benchmarks/gpqa.py +177 -0
eval_framework/tasks/benchmarks/gsm8k.py +148 -0
eval_framework/tasks/benchmarks/hellaswag.py +44 -0
eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
eval_framework/tasks/benchmarks/humaneval.py +97 -0
eval_framework/tasks/benchmarks/ifeval.py +78 -0
eval_framework/tasks/benchmarks/include.py +119 -0
eval_framework/tasks/benchmarks/infinitebench.py +302 -0
eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
eval_framework/tasks/benchmarks/mbpp.py +192 -0
eval_framework/tasks/benchmarks/mmlu.py +190 -0
eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
eval_framework/tasks/benchmarks/mmmlu.py +529 -0
eval_framework/tasks/benchmarks/openbookqa.py +37 -0
eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
eval_framework/tasks/benchmarks/pawsx.py +65 -0
eval_framework/tasks/benchmarks/piqa.py +39 -0
eval_framework/tasks/benchmarks/quality.py +56 -0
eval_framework/tasks/benchmarks/sciq.py +44 -0
eval_framework/tasks/benchmarks/sphyr.py +75 -0
eval_framework/tasks/benchmarks/squad.py +89 -0
eval_framework/tasks/benchmarks/struct_eval.py +110 -0
eval_framework/tasks/benchmarks/tablebench.py +117 -0
eval_framework/tasks/benchmarks/triviaqa.py +42 -0
eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
eval_framework/tasks/benchmarks/winogender.py +39 -0
eval_framework/tasks/benchmarks/winogrande.py +44 -0
eval_framework/tasks/benchmarks/winox.py +57 -0
eval_framework/tasks/benchmarks/wmt.py +160 -0
eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
eval_framework/tasks/eval_config.py +112 -0
eval_framework/tasks/perturbation.py +83 -0
eval_framework/tasks/registry.py +186 -0
eval_framework/tasks/task_loader.py +80 -0
eval_framework/tasks/task_names.py +138 -0
eval_framework/tasks/utils.py +578 -0
eval_framework/utils/constants.py +9 -0
eval_framework/utils/generate_task_docs.py +229 -0
eval_framework/utils/helpers.py +3 -0
eval_framework/utils/logging.py +50 -0
eval_framework/utils/packaging.py +52 -0
eval_framework-0.2.0.dist-info/METADATA +514 -0
eval_framework-0.2.0.dist-info/RECORD +161 -0
eval_framework-0.2.0.dist-info/WHEEL +4 -0
eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
template_formatting/README.md +83 -0
template_formatting/__init__.py +0 -0
template_formatting/formatter.py +536 -0
template_formatting/mistral_formatter.py +159 -0
template_formatting/py.typed +0 -0
template_formatting/tests/test_formatter_eval.py +408 -0
template_formatting/tests/test_formatter_scaling.py +253 -0
template_formatting/tests/test_mistral_formatter.py +136 -0

eval_framework/tasks/benchmarks/belebele.py ADDED Viewed

@@ -0,0 +1,60 @@
+from typing import Any
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.tasks.base import BaseTask, Language, ResponseType
+from eval_framework.tasks.utils import get_n_letters
+class BELEBELE(BaseTask[str]):
+    """BELEBELE dataset: https://huggingface.co/datasets/facebook/belebele
+
+    Each item is rendered as a passage, a question and four lettered options;
+    the candidate completions are the option letters, scored by loglikelihood.
+    """
+    NAME = "BELEBELE"
+    DATASET_PATH = "facebook/belebele"
+    SAMPLE_SPLIT = "test"
+    FEWSHOT_SPLIT = "test"
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+    SUBJECTS = ["eng_Latn"]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"] + get_n_letters(4)
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Set up the option labels used for prompts and completions."""
+        super().__init__(num_fewshot)
+        self.keys = get_n_letters(4)
+        # Lookup from the 1-based answer number (as a string) to its option label.
+        self.num_to_letter = {}
+        for position, letter in enumerate(self.keys, start=1):
+            self.num_to_letter[str(position)] = letter
+    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+        """Return the fixed preamble that opens every prompt."""
+        return "The following are multiple choice questions (with answers)."
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Render passage, question and the labelled options (one per line)."""
+        passage = item["flores_passage"].strip()
+        question_text = item["question"].strip()
+        answers = [item["mc_answer1"], item["mc_answer2"], item["mc_answer3"], item["mc_answer4"]]
+        option_lines = []
+        for label, answer in zip(self.keys, answers):
+            option_lines.append(f"{label}. {answer}\n")
+        options_block = "".join(option_lines)
+        return f"{passage}\n\nQuestion: {question_text}\n{options_block}"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        """Return the cue followed by the correct label, as shown in few-shot examples."""
+        answer = self._get_ground_truth(item)
+        assert answer is not None
+        cue = self._get_cue_text(item)
+        return f"{cue}{answer}"
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        """Return the text after which the model is expected to answer."""
+        return "Answer:"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        """Return the correct option label with a leading space."""
+        # `correct_answer_num` is 1-based, `self.keys` is 0-based.
+        zero_based = int(item["correct_answer_num"]) - 1
+        return f" {self.keys[zero_based]}"
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        """Return every option label with a leading space."""
+        completions = []
+        for label in self.keys:
+            completions.append(f" {label}")
+        return completions

eval_framework/tasks/benchmarks/bigcodebench.py ADDED Viewed

@@ -0,0 +1,155 @@
+import random
+import re
+from typing import Any
+from eval_framework.metrics.completion.code_execution_pass_at_one import (
+    CodeExecutionPassAtOne,
+    CodeExecutionPassAtOneContext,
+)
+from eval_framework.tasks.base import (
+    RANDOM_SEED,
+    BaseTask,
+    Language,
+    ResponseType,
+    Sample,
+    SubjectType,
+)
+from eval_framework.tasks.utils import (
+    BIG_CODE_BENCH_PACKAGE_MAPPING,
+    CallableSerializer,
+    _parse_unittest_output,
+    unittest_merge_snippets,
+)
+# Text prepended to the dataset prompt by the `_get_instruction_text` methods of the BigCodeBench tasks.
+PROMPT_INSTRUCTION = (
+    "Please provide a self-contained Python script, without tests or example usage, that solves the following "
+    "problem in a markdown code block:\n"
+)  # from https://arxiv.org/pdf/2406.15877 - Figure 14
+# Text the cue starts with (see `BigCodeBench._get_cue_text`).
+RESPONSE_PREFIX = (
+    "Below is a Python script with a self-contained function that solves the problem and passes "
+    "corresponding tests:\n"
+)  # from https://github.com/bigcode-project/bigcodebench/blob/main/bigcodebench/generate.py#L149
+class BigCodeBench(BaseTask[str]):
+    """BigCodeBench dataset: https://huggingface.co/datasets/bigcode/bigcodebench
+
+    Completion task whose generated code is judged by executing the dataset's
+    unit tests (`CodeExecutionPassAtOne`). Few-shot prompting is rejected.
+    """
+    NAME = "BigCodeBench"
+    DATASET_PATH = "bigcode/bigcodebench"
+    SAMPLE_SPLIT = "v0.1.4"
+    FEWSHOT_SPLIT = "v0.1.4"  # (there is no dedicated split, few-shot is not expected for this dataset)
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [CodeExecutionPassAtOne]
+    SUBJECTS = ["original", "calibrated"]
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Reject few-shot use and create the serializer before the base initialiser runs."""
+        assert num_fewshot == 0, "Fewshot is not supported for BigCodeBench"
+        # NOTE : this serializer should be the same class as initialized in the metric
+        self.serializer = CallableSerializer()
+        super().__init__(num_fewshot)
+    def _load_dataset(self, subject: SubjectType) -> None:
+        """Load the dataset once (`subject` is not used as a config name) and keep the relevant splits.
+
+        The sample split is shuffled with a generator seeded by `RANDOM_SEED`.
+        """
+        raw_splits = self._load_hf_dataset(path=self.DATASET_PATH, name=None)
+        self.dataset = {}
+        self.rnd = random.Random(RANDOM_SEED)
+        kept_splits = [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]
+        for split_name, rows in raw_splits.items():
+            materialised = list(rows)
+            if split_name == self.SAMPLE_SPLIT:
+                self.rnd.shuffle(materialised)
+            if split_name in kept_splits:
+                self.dataset[split_name] = materialised
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Return the shared instruction followed by the item's `complete_prompt`."""
+        task_prompt = item["complete_prompt"]
+        return PROMPT_INSTRUCTION + task_prompt
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        """Return the response prefix; for the "calibrated" subject the code prompt is appended."""
+        if item["subject"] == "calibrated":
+            return RESPONSE_PREFIX + item["code_prompt"]
+        return RESPONSE_PREFIX
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        """Return the reference solution (evaluation itself is test based, not reference based)."""
+        return item["canonical_solution"]
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        """No candidate completions exist for this task."""
+        return None
+    def _get_context(self, item: dict[str, Any]) -> CodeExecutionPassAtOneContext:
+        """Bundle everything the execution metric needs for this item."""
+        merge_fn = self.serializer.encode(unittest_merge_snippets)
+        parse_fn = self.serializer.encode(_parse_unittest_output)
+        return CodeExecutionPassAtOneContext(
+            run_env="python:3.12",  # os.environ.get("DOCKER_CODE_EXECUTION"),
+            code_prompt=item["code_prompt"],
+            test_code=item["test"],
+            snippet_merge_fn=merge_fn,
+            output_parse_fn=parse_fn,
+            package_downloads=BIG_CODE_BENCH_PACKAGE_MAPPING,
+        )
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        """Turn the raw completion into code to execute.
+
+        For "calibrated" samples that carry a context, the completion continues
+        the code prompt, so the two are concatenated. Otherwise the code is
+        extracted from the completion with `extract_executable_code`.
+        """
+        if sample is None or sample.context is None or sample.subject != "calibrated":
+            return extract_executable_code(completion_text)
+        assert isinstance(sample.context, CodeExecutionPassAtOneContext), "Expected CodeExecutionPassAtOneContext"
+        return sample.context.code_prompt + completion_text
+class BigCodeBenchInstruct(BigCodeBench):
+    """BigCodeBench dataset: https://huggingface.co/datasets/bigcode/bigcodebench
+
+    Variant that prompts with the item's `instruct_prompt` field.
+    """
+    NAME = "BigCodeBenchInstruct"
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Return the shared instruction followed by the item's `instruct_prompt`."""
+        task_prompt = item["instruct_prompt"]
+        return PROMPT_INSTRUCTION + task_prompt
+class BigCodeBenchHard(BigCodeBench):
+    """BigCodeBench dataset: https://huggingface.co/datasets/bigcode/bigcodebench-hard
+
+    Same task definition as `BigCodeBench`, loaded from the "-hard" dataset path.
+    """
+    NAME = "BigCodeBenchHard"
+    DATASET_PATH = "bigcode/bigcodebench-hard"
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Return the shared instruction followed by the item's `complete_prompt`."""
+        task_prompt = item["complete_prompt"]
+        return PROMPT_INSTRUCTION + task_prompt
+class BigCodeBenchHardInstruct(BigCodeBenchHard):
+    """BigCodeBench dataset: https://huggingface.co/datasets/bigcode/bigcodebench-hard
+
+    Variant of `BigCodeBenchHard` that prompts with the item's `instruct_prompt` field.
+    """
+    NAME = "BigCodeBenchHardInstruct"
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Return the shared instruction followed by the item's `instruct_prompt`."""
+        task_prompt = item["instruct_prompt"]
+        return PROMPT_INSTRUCTION + task_prompt
+def extract_executable_code(llm_response: str) -> str:
+    """Extract the code to execute from a model response.
+
+    Fence styles are tried in order of preference and the first match of the
+    first style that occurs wins:
+
+    1. a ```python block nested inside a ```markdown block,
+    2. a ```python block,
+    3. a ```markdown block,
+    4. any other fenced block.
+
+    For the generic fallback, an info string on the opening fence line (for
+    example ```py or ```Python) is dropped, so the language tag does not end
+    up as the first line of the extracted code.
+
+    Args:
+        llm_response: Raw text produced by the model.
+
+    Returns:
+        The stripped content of the selected code block, or `llm_response`
+        unchanged when it contains no fenced block.
+    """
+    fence_patterns = (
+        # Nested markdown+python pattern
+        r"```markdown.*?```python\s*(.*?)\s*```",
+        # Python code blocks
+        r"```python\s*(.*?)\s*```",
+        # Markdown-only code blocks
+        r"```markdown\s*(.*?)\s*```",
+        # Generic code blocks as fallback; the optional group consumes a language
+        # tag only when it is directly followed by a newline, so inline fences
+        # such as ```print(1)``` keep their content.
+        r"```(?:[\w+#.-]*\n)?\s*(.*?)\s*```",
+    )
+    for fence_pattern in fence_patterns:
+        found = re.search(fence_pattern, llm_response, re.DOTALL)
+        if found is not None:
+            return found.group(1).strip()
+    # If no code blocks found, return original response
+    return llm_response

eval_framework/tasks/benchmarks/casehold.py ADDED Viewed

@@ -0,0 +1,47 @@
+import random
+from typing import Any
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType
+class CASEHOLD(BaseTask[str]):
+    """CaseHOLD task loaded from the `lex_glue` dataset (config `case_hold`).
+
+    The context contains a "(<HOLDING>)" placeholder: the text before it is the
+    prompt, and every candidate is an ending followed by the text after it.
+    """
+    NAME = "CaseHold"
+    DATASET_PATH = "lex_glue"
+    SAMPLE_SPLIT = "test"
+    FEWSHOT_SPLIT = "train"
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+    SUBJECTS = ["case_hold"]
+    LANGUAGE = Language.ENG
+    def _load_dataset(self, subject: str) -> None:
+        """Load the splits, shuffle the sample split and drop items without exactly one placeholder."""
+        config_name = None if subject == NO_SUBJECT else subject
+        raw_splits = self._load_hf_dataset(path=self.DATASET_PATH, name=config_name)
+        self.dataset = {}
+        self.rnd = random.Random(RANDOM_SEED)
+        kept_splits = [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]
+        for split_name, rows in raw_splits.items():
+            materialised = list(rows)
+            if split_name == self.SAMPLE_SPLIT:
+                self.rnd.shuffle(materialised)
+            if split_name in kept_splits:
+                usable = []
+                for row in materialised:
+                    # The prompt/completion split below relies on a single placeholder.
+                    if row["context"].count("(<HOLDING>)") == 1:
+                        usable.append(row)
+                self.dataset[split_name] = usable
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Return the context up to (not including) the placeholder."""
+        pieces = item["context"].split("(<HOLDING>)", maxsplit=1)
+        return pieces[0]
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        """Return the labelled ending followed by the context after the placeholder."""
+        trailing = item["context"].split("(<HOLDING>)", maxsplit=1)[1]
+        correct_ending = item["endings"][item["label"]]
+        return f"{correct_ending}{trailing}"
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        """Return every ending followed by the context after the placeholder."""
+        trailing = item["context"].split("(<HOLDING>)", maxsplit=1)[1]
+        completions = []
+        for ending in item["endings"]:
+            completions.append(f"{ending}{trailing}")
+        return completions

eval_framework/tasks/benchmarks/chembench.py ADDED Viewed

@@ -0,0 +1,85 @@
+import json
+from typing import Any
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.tasks.base import BaseTask, Language, ResponseType
+from eval_framework.tasks.utils import get_n_letters
+# Subject identifiers evaluated by the ChemBench task (used as `ChemBench.SUBJECTS`).
+CHEMBENCH_SUBJECTS = [
+    "analytical_chemistry",
+    "chemical_preference",
+    "general_chemistry",
+    "inorganic_chemistry",
+    "materials_science",
+    "organic_chemistry",
+    "physical_chemistry",
+    "technical_chemistry",
+    "toxicity_and_safety",
+]
+class ChemBench(BaseTask[str]):
+    """ChemBench dataset: https://huggingface.co/datasets/jablonkagroup/ChemBench
+
+    Only multiple-choice items with exactly one correct option are kept; the
+    candidate completions are the option letters, scored by loglikelihood.
+    """
+    NAME = "ChemBench"
+    DATASET_PATH = "jablonkagroup/ChemBench"
+    SAMPLE_SPLIT = "train"  # Only has train split
+    FEWSHOT_SPLIT = "train"  # Only has train split
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+    SUBJECTS = CHEMBENCH_SUBJECTS
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Reject few-shot use and prepare the option labels."""
+        assert num_fewshot == 0, "Fewshot is not supported for ChemBench"
+        super().__init__(num_fewshot)
+        self.keys = get_n_letters(16)
+    @staticmethod
+    def _target_scores(item: dict[str, Any]) -> dict[str, Any]:
+        """Decode the JSON-encoded option -> score mapping of the item's first example."""
+        return json.loads(item["examples"][0]["target_scores"])
+    @classmethod
+    def _correct_positions(cls, item: dict[str, Any]) -> list[int]:
+        """Return the positions of all options whose score equals 1.0."""
+        return [position for position, score in enumerate(cls._target_scores(item).values()) if score == 1.0]
+    def _load_dataset(self, subject: str) -> None:
+        """Load the subject, then keep only the multiple-choice options with 1 correct answer."""
+        super()._load_dataset(subject)
+        for split_name in list(self.dataset):
+            self.dataset[split_name] = [
+                row
+                for row in self.dataset[split_name]
+                # The metrics check comes first so other item kinds are never decoded.
+                if row.get("metrics") == ["multiple_choice_grade"] and len(self._correct_positions(row)) == 1
+            ]
+    def _get_subject_name(self, item: dict[str, Any]) -> str:
+        """Return the subject with underscores replaced by spaces."""
+        return item["subject"].replace("_", " ")
+    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+        """Return the fixed preamble that opens every prompt."""
+        return (
+            "The following is a question about chemistry. Please answer by responding with the letter of the correct "
+            "answer."
+        )
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Render the question followed by the labelled options (one per line)."""
+        question_text = item["examples"][0]["input"].strip()
+        option_texts = self._target_scores(item).keys()
+        option_lines = []
+        for label, option in zip(self.keys, option_texts):
+            option_lines.append(f"{label}. {option}\n")
+        options_block = "".join(option_lines)
+        return f"Question: {question_text}\n{options_block}"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        """Return the cue followed by the correct label."""
+        answer = self._get_ground_truth(item)
+        cue = self._get_cue_text(item)
+        return f"{cue}{answer}"
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        """Return the text after which the model is expected to answer."""
+        return "Answer:"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        """Return the label of the single correct option with a leading space."""
+        correct_answers = self._correct_positions(item)
+        assert len(correct_answers) == 1, f"Expected exactly one correct answer, but got {len(correct_answers)}"
+        (position,) = correct_answers
+        return f" {self.keys[position]}"
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        """Return one label (with a leading space) per option the item offers."""
+        option_count = len(self._target_scores(item))
+        completions = []
+        for label in self.keys[:option_count]:
+            completions.append(f" {label}")
+        return completions

eval_framework/tasks/benchmarks/copa.py ADDED Viewed

@@ -0,0 +1,39 @@
+from typing import Any
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.tasks.base import BaseTask, Language, ResponseType
+class COPA(BaseTask[str]):
+    """COPA dataset: https://huggingface.co/datasets/aps/super_glue
+
+    The premise is joined to each choice with "because" (cause questions) or
+    "therefore" (effect questions); the two continuations are scored by
+    loglikelihood.
+    """
+    NAME = "COPA"
+    DATASET_PATH = "aps/super_glue"
+    SAMPLE_SPLIT = "validation"  # 100 examples (same split as lm-eval)
+    FEWSHOT_SPLIT = "test"  # 500 examples
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+    SUBJECTS = ["copa"]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["because", "therefore"]
+    LANGUAGE = Language.ENG
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Return the premise without its last character, followed by the connector."""
+        connectors = {"cause": "because", "effect": "therefore"}
+        connector = connectors[item["question"]]
+        premise = item["premise"].strip()
+        # The final character of the premise is dropped so the sentence can continue.
+        return premise[:-1] + f" {connector} "
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        """Return the labelled choice (label 0 -> `choice1`, otherwise `choice2`), converted."""
+        if item["label"] == 0:
+            correct_choice = item["choice1"]
+        else:
+            correct_choice = item["choice2"]
+        return f"{self.convert_choice(correct_choice)}"
+    def convert_choice(self, choice: str) -> str:
+        """Lower-case the first character so the choice reads as a continuation."""
+        first, remainder = choice[0], choice[1:]
+        return first.lower() + remainder
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        """Return both converted choices, `choice1` first."""
+        first_choice = self.convert_choice(item["choice1"])
+        second_choice = self.convert_choice(item["choice2"])
+        return [first_choice, second_choice]

eval_framework/tasks/benchmarks/duc.py ADDED Viewed

@@ -0,0 +1,91 @@
+import random
+import re
+from abc import ABC
+from typing import Any
+from eval_framework.metrics.base import BaseMetric
+from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
+from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample
+class DUC(BaseTask[str], ABC):
+    """https://huggingface.co/datasets/midas/duc2001
+
+    Shared base of the DUC keyphrase tasks in this module; the subclasses
+    provide `NAME` and `_get_ground_truth`.
+    """
+    DATASET_PATH: str = "midas/duc2001"
+    SAMPLE_SPLIT: str = "test"
+    FEWSHOT_SPLIT: str = "test"
+    RESPONSE_TYPE: ResponseType = ResponseType.COMPLETION
+    METRICS: list[type[BaseMetric]] = [AccuracyCompletion]
+    SUBJECTS: list[str] = ["raw"]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Text", "Keyphrase"]
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Configure the stop sequence and the generation length limit."""
+        super().__init__(num_fewshot)
+        self.stop_sequences: list[str] = ["Text:"]
+        self.max_tokens = 50  # longest keyphrase is less than 50 characters long
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        """Cut the completion at the first occurrence of each stop sequence and strip whitespace."""
+        trimmed = completion_text
+        for marker in self.stop_sequences:
+            if marker in trimmed:
+                trimmed, _, _ = trimmed.partition(marker)
+        return trimmed.strip()
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Join the document tokens into text and append the keyphrase cue."""
+        joined = " ".join(item["document"])
+        # Joining with spaces puts a space before punctuation; remove it.
+        detokenised = re.sub(r"\s+([.,!?;:])", r"\1", joined)
+        return f"Text: {detokenised}\nKeyphrase:"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        """Return the first ground-truth keyphrase with a leading space."""
+        keyphrases = self._get_ground_truth(item)
+        assert keyphrases is not None
+        assert isinstance(keyphrases, list)
+        first_keyphrase = keyphrases[0]
+        return f" {first_keyphrase}"
+class DUC_EXTRACTIVE(DUC):
+    """DUC variant whose references are the item's `extractive_keyphrases`."""
+    NAME = "DUC Extractive"
+    SUBJECTS: list[str] = ["raw"]
+    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
+        """Return the item's extractive keyphrases."""
+        keyphrases = item["extractive_keyphrases"]
+        return keyphrases
+    def _get_system_prompt_text(self, item: dict[str, Any]) -> str:
+        """Return the system prompt describing the extraction task."""
+        role = "You are an AI model tasked with extracting keyphrases from a text document. "
+        guidance = "Keyphrases should capture main ideas or significant topics exactly as worded in the text."
+        return role + guidance
+class DUC_ABSTRACTIVE(DUC):
+    """DUC variant whose references are the item's `abstractive_keyphrases`."""
+    NAME = "DUC Abstractive"
+    SUBJECTS: list[str] = ["raw"]
+    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
+        """Return the item's abstractive keyphrases."""
+        keyphrases = item["abstractive_keyphrases"]
+        return keyphrases
+    def _load_dataset(self, subject: str) -> None:
+        """Load the splits, keeping only items that have abstractive keyphrases.
+
+        The sample split is shuffled with a generator seeded by `RANDOM_SEED`.
+        """
+        # not all samples have abstractive keyphrases
+        raw_splits = self._load_hf_dataset(path=self.DATASET_PATH, name=subject)
+        self.dataset = {}
+        kept_splits = [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]
+        for split_name, rows in raw_splits.items():
+            with_keyphrases = [row for row in rows if len(row["abstractive_keyphrases"]) > 0]
+            if split_name == self.SAMPLE_SPLIT:
+                self.rnd = random.Random(RANDOM_SEED)
+                self.rnd.shuffle(with_keyphrases)
+            if split_name in kept_splits:
+                self.dataset[split_name] = with_keyphrases
+    def _get_system_prompt_text(self, item: dict[str, Any]) -> str:
+        """Return the system prompt describing the abstractive task."""
+        role = "You are an AI model tasked with generating abstractive keyphrases "
+        guidance = "that capture the main ideas of the text without using exact wording."
+        return role + guidance
+    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+        """Return the fixed preamble that opens every prompt."""
+        return "Paraphrase the following texts to improve clarity and relevance."

eval_framework/tasks/benchmarks/flores200.py ADDED Viewed

@@ -0,0 +1,62 @@
+from typing import Any
+import pycountry
+from eval_framework.metrics.completion.bleu import BLEU
+from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
+# Language codes of the form "<ISO 639-3>_<script>"; every ordered pair of distinct entries
+# becomes one "<source>-<target>" subject of the Flores200 task.
+FLORES_LANGUAGES = [
+    "deu_Latn",
+    "eng_Latn",
+    "fin_Latn",
+    "fra_Latn",
+    "nld_Latn",
+]  # Note: there are many more languages in the dataset, but we only consider these for now
+class Flores200(BaseTask[str]):
+    """FLORES-200 dataset: https://huggingface.co/datasets/facebook/flores
+
+    Translation task scored with BLEU. Each subject is a "<source>-<target>"
+    pair of language codes from `FLORES_LANGUAGES`; the sentences are read from
+    the item's `sentence_<code>` fields.
+    """
+    NAME = "FLoRes-200"
+    DATASET_PATH = "facebook/flores"
+    SAMPLE_SPLIT = "devtest"
+    FEWSHOT_SPLIT = "dev"
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [BLEU]
+    SUBJECTS = [f"{s}-{t}" for s in FLORES_LANGUAGES for t in FLORES_LANGUAGES if s != t]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["sentence"]
+    LANGUAGE = {
+        "deu_Latn": Language.DEU,
+        "eng_Latn": Language.ENG,
+        "fin_Latn": Language.FIN,
+        "fra_Latn": Language.FRA,
+        "nld_Latn": Language.NLD,
+    }
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Stop generation at the first newline, since each target is a single line."""
+        super().__init__(num_fewshot)
+        self.stop_sequences = ["\n"]
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Render "<Source language> sentence: ...", then the "<Target language> sentence:" cue.
+
+        Language names are looked up with `pycountry` from the ISO 639-3 part of
+        each code (the text before the underscore).
+        """
+        source_key = item["subject"].split("-")[0]
+        source_language = pycountry.languages.get(alpha_3=source_key.split("_")[0]).name
+        source = item[f"sentence_{source_key}"]
+        instruction = f"{source_language} sentence: {source}\n"
+        target_key = item["subject"].split("-")[1]
+        target_language = pycountry.languages.get(alpha_3=target_key.split("_")[0]).name
+        return f"{instruction}{target_language} sentence:"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        """Return the reference sentence in the target language of the item's subject."""
+        target_key = item["subject"].split("-")[1]
+        return item[f"sentence_{target_key}"]
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        """Return the reference translation with a leading space.
+
+        Raises:
+            AssertionError: If the ground truth is missing or not a string.
+        """
+        ground_truth = self._get_ground_truth(item)
+        # Validate before formatting: an f-string is always a non-None str, so checking the
+        # formatted value could never fail and a missing reference would become " None".
+        assert ground_truth is not None
+        assert isinstance(ground_truth, str)
+        return f" {ground_truth}"
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        """Strip surrounding whitespace from the generated translation."""
+        return completion_text.strip()

eval_framework/tasks/benchmarks/flores_plus.py ADDED Viewed

@@ -0,0 +1,84 @@
+import random
+from itertools import product
+from typing import Any
+from eval_framework.metrics.completion.bleu import BLEU
+from eval_framework.metrics.completion.chrf import CHRF
+from eval_framework.metrics.completion.comet import COMET
+from eval_framework.shared.types import BaseMetricContext, UntemplatedPrompt
+from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
+# Maps "<ISO 639-3>_<ISO 15924>" codes to the English language name used in the translation
+# instruction; its keys also define the subject pairs of the FloresPlus task.
+LANG_MAP = {
+    "deu_Latn": "German",
+    "eng_Latn": "English",
+    "fra_Latn": "French",
+    "ita_Latn": "Italian",
+    "nld_Latn": "Dutch",
+    "pol_Latn": "Polish",
+    "rus_Cyrl": "Russian",
+    "spa_Latn": "Spanish",
+    "ukr_Cyrl": "Ukrainian",
+}
+class FloresPlus(BaseTask[str]):
+    """Flores-Plus dataset: https://huggingface.co/datasets/openlanguagedata/flores_plus
+
+    Translation task scored with BLEU, chrF and COMET. Each subject is a
+    "<source>-<target>" pair of `LANG_MAP` codes; the two language configs are
+    loaded separately and paired item by item.
+    """
+    NAME = "Flores-Plus"
+    DATASET_PATH = "openlanguagedata/flores_plus"
+    SAMPLE_SPLIT = "dev"
+    FEWSHOT_SPLIT = "devtest"
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [BLEU, CHRF, COMET]
+    SUBJECTS = [f"{s}-{t}" for s, t in product(LANG_MAP, LANG_MAP) if s != t]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["sentence"]
+    LANGUAGE = {
+        "deu_Latn": Language.DEU,
+        "eng_Latn": Language.ENG,
+        "fra_Latn": Language.FRA,
+        "ita_Latn": Language.ITA,
+        "nld_Latn": Language.NLD,
+        "pol_Latn": Language.POL,
+        "rus_Cyrl": Language.RUS,
+        "spa_Latn": Language.SPA,
+        "ukr_Cyrl": Language.UKR,
+    }
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Stop generation at the first newline."""
+        super().__init__(num_fewshot)
+        self.stop_sequences = ["\n"]
+    def _load_dataset(self, subject: str) -> None:
+        """Load the source and target language configs and pair them into translation items.
+
+        Args:
+            subject: A "<source>-<target>" pair of language codes.
+
+        Raises:
+            ValueError: If a split has a different number of items in the two languages.
+            AssertionError: If paired items do not share the same `id`.
+        """
+        source_code, target_code = subject.split("-")[0], subject.split("-")[1]
+        hf_dataset_src = self._load_hf_dataset(path=self.DATASET_PATH, name=source_code)
+        hf_dataset_tgt = self._load_hf_dataset(path=self.DATASET_PATH, name=target_code)
+        self.dataset = {}
+        self.rnd = random.Random(42)
+        for split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
+            data_src = hf_dataset_src[split]
+            data_tgt = hf_dataset_tgt[split]
+            data_list = []
+            # strict=True: a length mismatch must fail loudly instead of silently dropping
+            # the unpaired tail of the longer split.
+            for item_src, item_tgt in zip(data_src, data_tgt, strict=True):
+                assert item_src["id"] == item_tgt["id"], (
+                    f"Misaligned items in split {split!r}: source id {item_src['id']!r} "
+                    f"!= target id {item_tgt['id']!r}"
+                )
+                iso_src = f"{item_src['iso_639_3']}_{item_src['iso_15924']}"
+                iso_tgt = f"{item_tgt['iso_639_3']}_{item_tgt['iso_15924']}"
+                text_src = item_src["text"]
+                text_tgt = item_tgt["text"]
+                data_list.append({"iso_source": iso_src, "iso_target": iso_tgt, "source": text_src, "target": text_tgt})
+            if split == self.SAMPLE_SPLIT:
+                self.rnd.shuffle(data_list)
+            self.dataset[split] = data_list
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Return the translation instruction naming the target language, followed by the source text."""
+        target_language = LANG_MAP[item["iso_target"]]
+        return f"Translate the following text into {target_language}:\n{item['source']}"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        """Return the reference translation."""
+        return item["target"]
+    def _get_context(self, item: dict[str, Any]) -> BaseMetricContext | list[BaseMetricContext] | None:
+        """Expose the raw source text to the metrics as an `UntemplatedPrompt`."""
+        return UntemplatedPrompt(untemplated_prompt=item["source"])
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        """Strip surrounding whitespace from the generated translation."""
+        return completion_text.strip()