PyPI - eval-framework - Versions diffs - 0.2.0__py3-none-any.whl - Mend

eval-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (161) hide show

eval_framework/__init__.py +7 -0
eval_framework/base_config.py +36 -0
eval_framework/context/__init__.py +0 -0
eval_framework/context/determined.py +170 -0
eval_framework/context/eval.py +114 -0
eval_framework/context/local.py +52 -0
eval_framework/evaluation_generator.py +231 -0
eval_framework/exceptions.py +2 -0
eval_framework/external/ifeval_impl/README.md +5 -0
eval_framework/external/ifeval_impl/instructions.py +1523 -0
eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
eval_framework/external/ifeval_impl/utils.py +135 -0
eval_framework/llm/__init__.py +0 -0
eval_framework/llm/aleph_alpha.py +323 -0
eval_framework/llm/base.py +58 -0
eval_framework/llm/huggingface.py +332 -0
eval_framework/llm/mistral.py +73 -0
eval_framework/llm/models.py +16 -0
eval_framework/llm/openai.py +205 -0
eval_framework/llm/vllm.py +438 -0
eval_framework/logger.py +3 -0
eval_framework/main.py +187 -0
eval_framework/metrics/__init__.py +0 -0
eval_framework/metrics/base.py +40 -0
eval_framework/metrics/completion/__init__.py +1 -0
eval_framework/metrics/completion/accuracy_completion.py +16 -0
eval_framework/metrics/completion/bleu.py +76 -0
eval_framework/metrics/completion/chrf.py +62 -0
eval_framework/metrics/completion/code_assertion.py +44 -0
eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
eval_framework/metrics/completion/comet.py +56 -0
eval_framework/metrics/completion/concordance_index.py +38 -0
eval_framework/metrics/completion/csv_format.py +102 -0
eval_framework/metrics/completion/cwe_accuracy.py +49 -0
eval_framework/metrics/completion/exponential_similarity.py +65 -0
eval_framework/metrics/completion/f1.py +42 -0
eval_framework/metrics/completion/format_checker.py +56 -0
eval_framework/metrics/completion/grid_difference.py +77 -0
eval_framework/metrics/completion/ifeval.py +73 -0
eval_framework/metrics/completion/json_format.py +171 -0
eval_framework/metrics/completion/language_checker.py +74 -0
eval_framework/metrics/completion/length_control.py +83 -0
eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
eval_framework/metrics/completion/niah_accuracy.py +163 -0
eval_framework/metrics/completion/placeholder_checker.py +27 -0
eval_framework/metrics/completion/repetition.py +88 -0
eval_framework/metrics/completion/rouge_1.py +35 -0
eval_framework/metrics/completion/rouge_2.py +45 -0
eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
eval_framework/metrics/completion/rouge_l.py +52 -0
eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
eval_framework/metrics/completion/ter.py +67 -0
eval_framework/metrics/completion/text_counter.py +182 -0
eval_framework/metrics/efficiency/__init__.py +0 -0
eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
eval_framework/metrics/llm/__init__.py +0 -0
eval_framework/metrics/llm/base.py +8 -0
eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
eval_framework/metrics/llm/graders/language.py +56 -0
eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
eval_framework/metrics/llm/graders/models.py +74 -0
eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
eval_framework/metrics/llm/llm_judge_sql.py +394 -0
eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
eval_framework/metrics/loglikelihood/__init__.py +0 -0
eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
eval_framework/py.typed +0 -0
eval_framework/response_generator.py +416 -0
eval_framework/result_processors/__init__.py +0 -0
eval_framework/result_processors/base.py +74 -0
eval_framework/result_processors/hf_processor.py +87 -0
eval_framework/result_processors/result_processor.py +129 -0
eval_framework/run.py +314 -0
eval_framework/run_direct.py +42 -0
eval_framework/shared/types.py +227 -0
eval_framework/tasks/__init__.py +6 -0
eval_framework/tasks/base.py +314 -0
eval_framework/tasks/benchmarks/__init__.py +0 -0
eval_framework/tasks/benchmarks/arc.py +46 -0
eval_framework/tasks/benchmarks/arc_de.py +46 -0
eval_framework/tasks/benchmarks/arc_fi.py +46 -0
eval_framework/tasks/benchmarks/belebele.py +60 -0
eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
eval_framework/tasks/benchmarks/casehold.py +47 -0
eval_framework/tasks/benchmarks/chembench.py +85 -0
eval_framework/tasks/benchmarks/copa.py +39 -0
eval_framework/tasks/benchmarks/duc.py +91 -0
eval_framework/tasks/benchmarks/flores200.py +62 -0
eval_framework/tasks/benchmarks/flores_plus.py +84 -0
eval_framework/tasks/benchmarks/gpqa.py +177 -0
eval_framework/tasks/benchmarks/gsm8k.py +148 -0
eval_framework/tasks/benchmarks/hellaswag.py +44 -0
eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
eval_framework/tasks/benchmarks/humaneval.py +97 -0
eval_framework/tasks/benchmarks/ifeval.py +78 -0
eval_framework/tasks/benchmarks/include.py +119 -0
eval_framework/tasks/benchmarks/infinitebench.py +302 -0
eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
eval_framework/tasks/benchmarks/mbpp.py +192 -0
eval_framework/tasks/benchmarks/mmlu.py +190 -0
eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
eval_framework/tasks/benchmarks/mmmlu.py +529 -0
eval_framework/tasks/benchmarks/openbookqa.py +37 -0
eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
eval_framework/tasks/benchmarks/pawsx.py +65 -0
eval_framework/tasks/benchmarks/piqa.py +39 -0
eval_framework/tasks/benchmarks/quality.py +56 -0
eval_framework/tasks/benchmarks/sciq.py +44 -0
eval_framework/tasks/benchmarks/sphyr.py +75 -0
eval_framework/tasks/benchmarks/squad.py +89 -0
eval_framework/tasks/benchmarks/struct_eval.py +110 -0
eval_framework/tasks/benchmarks/tablebench.py +117 -0
eval_framework/tasks/benchmarks/triviaqa.py +42 -0
eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
eval_framework/tasks/benchmarks/winogender.py +39 -0
eval_framework/tasks/benchmarks/winogrande.py +44 -0
eval_framework/tasks/benchmarks/winox.py +57 -0
eval_framework/tasks/benchmarks/wmt.py +160 -0
eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
eval_framework/tasks/eval_config.py +112 -0
eval_framework/tasks/perturbation.py +83 -0
eval_framework/tasks/registry.py +186 -0
eval_framework/tasks/task_loader.py +80 -0
eval_framework/tasks/task_names.py +138 -0
eval_framework/tasks/utils.py +578 -0
eval_framework/utils/constants.py +9 -0
eval_framework/utils/generate_task_docs.py +229 -0
eval_framework/utils/helpers.py +3 -0
eval_framework/utils/logging.py +50 -0
eval_framework/utils/packaging.py +52 -0
eval_framework-0.2.0.dist-info/METADATA +514 -0
eval_framework-0.2.0.dist-info/RECORD +161 -0
eval_framework-0.2.0.dist-info/WHEEL +4 -0
eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
template_formatting/README.md +83 -0
template_formatting/__init__.py +0 -0
template_formatting/formatter.py +536 -0
template_formatting/mistral_formatter.py +159 -0
template_formatting/py.typed +0 -0
template_formatting/tests/test_formatter_eval.py +408 -0
template_formatting/tests/test_formatter_scaling.py +253 -0
template_formatting/tests/test_mistral_formatter.py +136 -0

eval_framework/tasks/benchmarks/squad.py ADDED Viewed

@@ -0,0 +1,89 @@
+import random
+from typing import Any
+from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
+from eval_framework.metrics.completion.f1 import F1
+from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, SubjectType
+class SQUAD2(BaseTask[str]):
+    """Squad v2 dataset: https://huggingface.co/datasets/rajpurkar/squad_v2"""
+    NAME = "SQuAD2"
+    DATASET_PATH = "rajpurkar/squad_v2"
+    SAMPLE_SPLIT = "validation"
+    FEWSHOT_SPLIT = "train"
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [AccuracyCompletion, F1]
+    SUBJECTS = [NO_SUBJECT]
+    UNANSWERABLE_STR = "unanswerable"
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer", "Context", "unanswerable"]
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot)
+        self.stop_sequences = [".\n"]
+        self.max_tokens = 300  # the max length of the ground truth is 160 characters while the average is ~19
+        self.rnd_choice_shuffle = random.Random()
+    def _load_dataset(self, subject: SubjectType) -> None:
+        name = subject if subject != NO_SUBJECT else None
+        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=name)
+        self.dataset = {}
+        self.rnd = random.Random(RANDOM_SEED)
+        for split, data in hf_dataset.items():
+            data_list = list(data)
+            if split == self.SAMPLE_SPLIT:
+                self.rnd.shuffle(data_list)
+            if split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
+                self.dataset[split] = data_list
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        prompt = (
+            "Given the following context, answer the question. If the question cannot be answered based "
+            f"on the context alone, respond with '{self.UNANSWERABLE_STR}'.\n\n"
+            "Context:\n"
+            f"{item['context']}\n\n"
+            f"Question:\n{item['question']}\nAnswer:"
+        )
+        return prompt
+    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
+        text_ = item["answers"]["text"]
+        ground_truth_for_unanswerable = [
+            self.UNANSWERABLE_STR,
+            self.UNANSWERABLE_STR + " ",
+            self.UNANSWERABLE_STR.capitalize(),
+        ]
+        ground_truths = text_ if text_ else ground_truth_for_unanswerable
+        return [f" {ground_truth}" for ground_truth in ground_truths]
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        target = self._get_ground_truth(item)[0]
+        assert target is not None
+        assert isinstance(target, str)
+        return target
+class SQUAD(SQUAD2):
+    """Squad dataset: https://huggingface.co/datasets/rajpurkar/squad"""
+    NAME = "SQuAD"
+    DATASET_PATH = "rajpurkar/squad"
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        prompt = (
+            "Given the following context, answer the question.\n\n"
+            "Context:\n"
+            f"{item['context']}\n\n"
+            f"Question:\n{item['question']}\n"
+        )
+        return prompt
+    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
+        return item["answers"]["text"]

eval_framework/tasks/benchmarks/struct_eval.py ADDED Viewed

@@ -0,0 +1,110 @@
+import os
+import random
+import re
+from typing import Any
+from datasets import DatasetDict
+from eval_framework.metrics.completion.struct_eval_metrics import RenderableStructMetric, StructMetric
+from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample
+# Subjects of the StructEval task. `StructEval._load_dataset` keeps only the rows whose
+# "task_name" column equals the selected subject, so the spelling here must match the dataset.
+StructEvalSubjects = [
+    "CSV to YAML",
+    "JSON to XML",
+    "JSON to CSV",
+    "XML to JSON",
+    "XML to YAML",
+    "Text to XML",
+    "Text to YAML",
+    "Text to TOML",
+    "YAML to JSON",
+    "TOML to JSON",
+    "Text to CSV",
+    "YAML to XML",
+    "JSON to YAML",
+    "TOML to YAML",
+    "YAML to CSV",
+    "CSV to JSON",
+    "CSV to XML",
+    "Text to JSON",
+    "XML to CSV",
+]
+class StructEval(BaseTask[str]):
+    """StructEval task: https://tiger-ai-lab.github.io/StructEval/"""
+    NAME = "StructEval"
+    DATASET_PATH = "TIGER-Lab/StructEval"
+    SAMPLE_SPLIT = "train"
+    FEWSHOT_SPLIT = "train"  # Only has train split
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [StructMetric]  # Define appropriate metrics for StructEval
+    SUBJECTS = StructEvalSubjects
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        if num_fewshot > 0:
+            raise ValueError("StructEval only supports zero-shot evaluation.")
+        super().__init__(num_fewshot)
+    def _load_dataset(self, subject: str) -> None:
+        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH)
+        assert isinstance(hf_dataset, DatasetDict), "Expected a Hugging Face Dataset object."
+        hf_dataset = hf_dataset.filter(lambda item: item["task_name"] == subject, num_proc=os.cpu_count())
+        self.dataset = {}
+        self.rnd = random.Random(RANDOM_SEED)
+        for split, data in hf_dataset.items():
+            if split not in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
+                continue
+            data_list = list(data)
+            if split == self.SAMPLE_SPLIT:
+                self.rnd.shuffle(data_list)
+            self.dataset[split] = data_list
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        return (
+            f"{item['query']}\n\nIMPORTANT: Only output the required output format. "
+            "You must start the format/code with <|BEGIN_CODE|> and end the format/code with  <|END_CODE|>. "
+            "No other text output (explanation, comments, etc.) are allowed.  Do not use markdown code fences.\n"
+        )
+    def _get_eval_kwargs(self, item: dict[str, Any]) -> dict[str, Any] | None:
+        return {
+            "output_type": item["output_type"],
+            "paths": item["raw_output_metric"],
+        }
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        return "<|BEGIN_CODE|>"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
+        return None
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        m = re.search(r"(?:<\|BEGIN_CODE\|>|```[\w+-]*)(.*?)(?:<\|END_CODE\|>|```*)", completion_text, re.DOTALL)
+        return m.group(1).strip() if m else completion_text.strip()
+# Subjects used by RenderableStructEval; all of them produce HTML output.
+# There are more subjects in the StructEval dataset, but currently only the HTML output metric is implemented.
+RENDERABLE_STRUCTEVAL_SUBJECTS = [
+    "Convert Markdown to HTML",
+    "Convert React to HTML",
+    "Convert Vue to HTML",
+    "Text to HTML",
+]
+class RenderableStructEval(StructEval):
+    """Renderable StructEval task for tasks that can be rendered visually."""
+    NAME = "RenderableStructEval"
+    SUBJECTS = RENDERABLE_STRUCTEVAL_SUBJECTS
+    METRICS = [RenderableStructMetric]  # Define appropriate metrics for StructEval
+    def _get_eval_kwargs(self, item: dict[str, Any]) -> dict[str, Any] | None:
+        return {
+            "output_type": item["output_type"],
+            "keywords": item["raw_output_metric"],
+        }

eval_framework/tasks/benchmarks/tablebench.py ADDED Viewed

@@ -0,0 +1,117 @@
+import csv
+import json
+import random
+import re
+import tempfile
+from itertools import product
+from typing import Any
+from eval_framework.exceptions import LogicError
+from eval_framework.metrics.completion.rouge_l import ROUGE_L
+from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.utils import run_python_code
+from template_formatting.formatter import Role
+# Question types ("qtype" column) that are evaluated.
+TABLE_BENCH_SUBJECTS = [
+    "NumericalReasoning",
+    "DataAnalysis",
+    "FactChecking",
+    # "Visualization" task is complex to re-implement, of small relevance and of small size (5.6% of dataset, Language)
+    # see https://github.com/TableBench/TableBench/blob/main/eval/batch_parse_response_script.py#L56
+]
+# Prompting styles ("instruction_type" column) that are evaluated; TableBench.SUBJECTS is the
+# cartesian product of these with TABLE_BENCH_SUBJECTS.
+TABLE_BENCH_INSTRUCTION_TYPES = [
+    # "DP",  # Direct Prompting, has been deleted: https://huggingface.co/datasets/Multilingual-Multimodal-NLP/TableBench-Instructions/commit/534a6d859494c370f2aa6ee0e6076103d9707560 # noqa: E501
+    "PoT",  # Program-of-thought
+    "SCoT",  # Symbolic chain-of-thought
+    "TCoT",  # Textual chain-of-thought
+]
+class TableBench(BaseTask[tuple[str, str]]):
+    """TableBench dataset: https://huggingface.co/datasets/Multilingual-Multimodal-NLP/TableBench"""
+    NAME = "TableBench"
+    DATASET_PATH = "Multilingual-Multimodal-NLP/TableBench"
+    HF_REVISION = "81b551c744b7f49cfa0ad69cb7a1465d865c206e"  # latest version of the dataset is corrupted
+    SAMPLE_SPLIT = "test"
+    FEWSHOT_SPLIT = "test"  # (there is no dedicated split, few-shot is not expected for this dataset)
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [ROUGE_L]
+    SUBJECTS = list(product(TABLE_BENCH_INSTRUCTION_TYPES, TABLE_BENCH_SUBJECTS))
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        assert num_fewshot == 0, "Fewshot is not supported for TableBench"
+        super().__init__(num_fewshot)
+    def _load_dataset(self, subject: tuple[str, str]) -> None:
+        instruction_type, qtype = subject
+        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=None)
+        self.dataset = {}
+        self.rnd = random.Random(RANDOM_SEED)
+        for split, data in hf_dataset.items():
+            data = data.filter(lambda x: x["qtype"] == qtype and x["instruction_type"] == instruction_type)
+            data_list = list(data)
+            if split == self.SAMPLE_SPLIT:
+                self.rnd.shuffle(data_list)
+            if split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
+                self.dataset[split] = data_list
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        return item["instruction"]
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        return item["answer"]
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        assert sample is not None
+        if "PoT" in sample.subject:
+            # Extract the (last) generated code snippet or fail otherwise
+            try:
+                matches = re.findall(r"```python\n(.*?)```", completion_text, flags=re.S)
+                if not matches:
+                    return ""
+                code = matches[-1]
+            except Exception:
+                return ""
+            # Extract the table given in the prompt and prepare it as a file
+            instruction = [m.content for m in sample.messages if m.role == Role.USER][-1]
+            tables = re.findall(r"\[TABLE\] (.*?) Let's get start!", instruction, flags=re.S)
+            if not tables:
+                return ""
+            # Check if the tables is a list or a string
+            if isinstance(tables, str):
+                table_dict = json.loads(tables.strip())
+            elif isinstance(tables, list):
+                table_dict = json.loads(tables[0].strip())
+            else:
+                raise LogicError(f"TableBench: {instruction} does not seem to contain one table.")
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                filename = f"{tmpdirname}/table.csv"
+                with open(filename, "w") as f:
+                    writer = csv.writer(f)
+                    writer.writerow(table_dict["columns"])
+                    writer.writerows(table_dict["data"])
+                # Run the code in a Docker image, providing the table from the prompt
+                completion_text = run_python_code(
+                    code, image="amancevice/pandas:slim", input_files=[(filename, "/var/lib/pandas/table.csv")]
+                )
+                if "Error" in completion_text:
+                    return ""
+        # Extract the answer, be it directly from the model or be it the result of the generated code
+        try:
+            match = re.search(r"Final Answer: (.+)", completion_text)
+            return match.group(1).strip() if match else ""
+        except Exception:
+            return ""

eval_framework/tasks/benchmarks/triviaqa.py ADDED Viewed

@@ -0,0 +1,42 @@
+import random
+from typing import Any
+from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
+from eval_framework.metrics.completion.f1 import F1
+from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
+class TRIVIAQA(BaseTask[str]):
+    """Trivia QA dataset: https://huggingface.co/datasets/mandarjoshi/trivia_qa"""
+    NAME = "TriviaQA"
+    DATASET_PATH = "mandarjoshi/trivia_qa"
+    SAMPLE_SPLIT = "validation"
+    FEWSHOT_SPLIT = "train"
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [AccuracyCompletion, F1]
+    SUBJECTS = ["rc.wikipedia.nocontext"]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot)
+        self.stop_sequences = ["\n"]
+        self.max_tokens = 400  # the max length of the ground truth is 282 characters while the average is ~16
+        self.rnd_choice_shuffle = random.Random()
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        prompt = f"Question: {item['question'].strip()}\nAnswer:"
+        return prompt
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        target = self._get_ground_truth(item)[0]
+        assert target is not None
+        assert isinstance(target, str)
+        return f" {target}"
+    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
+        return item["answer"]["aliases"]
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        return completion_text.strip().rstrip(".")

eval_framework/tasks/benchmarks/truthfulqa.py ADDED Viewed

@@ -0,0 +1,95 @@
+import random
+from typing import Any
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.metrics.loglikelihood.probability_mass import ProbabilityMass, ProbabilityMassNorm
+from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, SubjectType
+# fewshot examples from Appendix E in https://arxiv.org/pdf/2109.07958
+# TRUTHFULQA uses the first `num_fewshot` entries in this order, so the order is significant
+# (e.g. the third question refers back to the second one).
+FEWSHOT_ITEMS = [
+    {
+        "question": "What is human life expectancy in the United States?",
+        "response": "Human life expectancy in the United States is 78 years.",
+    },
+    {
+        "question": "Who was president of the United States in 1955?",
+        "response": "Dwight D. Eisenhower was president of the United States in 1955.",
+    },
+    {"question": "Which party did he belong to?", "response": "He belonged to the Republican Party."},
+    {"question": "What is the square root of banana?", "response": "I have no comment."},
+    {
+        "question": "How does a telescope work?",
+        "response": "Telescopes use lenses or mirrors to focus light and make objects appear closer.",
+    },
+    {
+        "question": "Where were the 1992 Olympics held?",
+        "response": "The 1992 Olympics were held in Barcelona, Spain.",
+    },
+]
+class TRUTHFULQA(BaseTask[str]):
+    """TRUTHFULQA dataset: https://huggingface.co/datasets/truthful_qa"""
+    NAME = "TruthfulQA"
+    DATASET_PATH = "truthful_qa"
+    SAMPLE_SPLIT = "validation"
+    FEWSHOT_SPLIT = ""
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, ProbabilityMass, ProbabilityMassNorm]
+    SUBJECTS = ["mc1", "mc2"]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Q", "A"]
+    FEWSHOT_ITEMS = FEWSHOT_ITEMS
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        assert num_fewshot <= 6, f"Fewshot larger than 6 is not supported for {self.NAME}"
+        super().__init__(num_fewshot)
+    def _load_dataset(self, subject: SubjectType) -> None:
+        """The original dataset only provides one subject 'multiple_choice', but with multiple target columns
+        this should be seen as multiple subjects.
+        Alternatively we would need to adjust the dataset and upload it with propper
+        subject names to huggingface."""
+        self.target_identifier = f"{subject}_targets"
+        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name="multiple_choice")
+        self.dataset = {}
+        self.rnd = random.Random(RANDOM_SEED)
+        for split, data in hf_dataset.items():
+            if split not in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
+                continue
+            data_list = list(data)
+            if split == self.SAMPLE_SPLIT:
+                self.rnd.shuffle(data_list)
+            self.dataset[split] = data_list
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        question = item["question"]
+        return f"Q: {question}\n"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        cue_text = self._get_cue_text(item)
+        return f"{cue_text} {item['response']}"
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        return "A:"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        ground_truth_index = item[self.target_identifier]["labels"].index(1)
+        ground_truth = item[self.target_identifier]["choices"][ground_truth_index]
+        return f" {ground_truth}"
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        choices = item[self.target_identifier]["choices"]
+        return [f" {choice}" for choice in choices]
+    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
+        return self.FEWSHOT_ITEMS[: self.num_fewshot]

eval_framework/tasks/benchmarks/winogender.py ADDED Viewed

@@ -0,0 +1,39 @@
+from typing import Any
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.tasks.base import BaseTask, Language, ResponseType
+class WINOGENDER(BaseTask[str]):
+    """WINOGENDER dataset: https://huggingface.co/datasets/datasets/oskarvanderwal/winogender"""
+    NAME = "Winogender"
+    DATASET_PATH = "oskarvanderwal/winogender"
+    SAMPLE_SPLIT = "test"
+    FEWSHOT_SPLIT = "test"
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+    SUBJECTS = ["all"]
+    LANGUAGE = Language.ENG
+    def _extract_question(self, item: dict) -> str:
+        """Format question according to Llama paper."""
+        return f"{item['sentence']} '{item['pronoun'].capitalize()}' refers to"
+    def _extract_choices(self, item: dict) -> list[str]:
+        choices = item["occupation"], item["participant"]
+        # add "the" to any choice that isn't "someone" (else it's ungrammatical)
+        return [f"the {c}" if c.lower() != "someone" else c for c in choices]
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        return self._extract_question(item)
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        choices = self._extract_choices(item)
+        return f" {choices[item['label']]}"
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        return [f" {choice}" for choice in self._extract_choices(item)]

eval_framework/tasks/benchmarks/winogrande.py ADDED Viewed

@@ -0,0 +1,44 @@
+from typing import Any
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.tasks.base import BaseTask, Language, ResponseType
+# Maps the dataset's "answer" value ("1" or "2") to the zero-based index of the matching option.
+ANSWER_STR_TO_NUM = {"1": 0, "2": 1}
+class WINOGRANDE(BaseTask[str]):
+    """WINOGRANDE dataset: https://huggingface.co/datasets/winogrande"""
+    NAME = "Winogrande"
+    DATASET_PATH = "winogrande"
+    SAMPLE_SPLIT = "validation"
+    FEWSHOT_SPLIT = "train"
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+    SUBJECTS = ["winogrande_xl"]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["1", "2"]
+    LANGUAGE = Language.ENG
+    def _extract_question(self, item: dict) -> str:
+        question, _ = item["sentence"].split("_")
+        question = question.replace("  ", " ")
+        return question.strip()
+    def _extract_choices(self, item: dict) -> list[str]:
+        _, choice_suffix = item["sentence"].split("_")
+        choice_suffix = choice_suffix.replace("  ", " ")
+        choices = [choice + choice_suffix for choice in [item["option1"], item["option2"]]]
+        return choices
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        return f"{self._extract_question(item)}"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        choices = self._extract_choices(item)
+        return f" {choices[ANSWER_STR_TO_NUM[item['answer']]]}"
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        return [f" {choice}" for choice in self._extract_choices(item)]

eval_framework/tasks/benchmarks/winox.py ADDED Viewed

@@ -0,0 +1,57 @@
+from typing import Any
+from eval_framework.tasks.base import Language
+from eval_framework.tasks.benchmarks.winogrande import WINOGRANDE
+# Maps the answer value, after conversion to str, to the zero-based index of the matching option.
+ANSWER_STR_TO_NUM = {"1": 0, "2": 1}
+class WINOX(WINOGRANDE):
+    """
+    Wino-X is a parallel dataset of German, French, and Russian Winograd schemas, aligned with their English
+    counterparts, used to examine whether neural machine translation models can perform coreference resolution that
+    requires commonsense knowledge, and whether multilingual language models are capable of commonsense reasoning
+    across multiple languages.
+    Winogrande: https://arxiv.org/abs/1907.10641
+    Wino-X: https://github.com/demelin/Wino-X
+    Wino-X: https://huggingface.co/datasets/demelin/wino_x
+    """
+    DATASET_PATH = "demelin/wino_x"
+    SAMPLE_SPLIT = "test"
+    FEWSHOT_SPLIT = "test"
+    LANGUAGE_SHORT_CODE = ""
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        choices = self._extract_choices(item)
+        # in winogrande answer is a string but in wino_x it is an int
+        return f" {choices[ANSWER_STR_TO_NUM[str(item['answer'])]]}"
+    def _extract_question(self, item: dict) -> str:
+        question, _ = item[f"context_{self.LANGUAGE_SHORT_CODE}"].split("_")
+        question = question.replace("  ", " ")
+        return question.strip()
+    def _extract_choices(self, item: dict) -> list[str]:
+        _, choice_suffix = item[f"context_{self.LANGUAGE_SHORT_CODE}"].split("_")
+        choice_suffix = choice_suffix.replace("  ", " ")
+        choices = [
+            choice + choice_suffix
+            for choice in [item[f"option1_{self.LANGUAGE_SHORT_CODE}"], item[f"option2_{self.LANGUAGE_SHORT_CODE}"]]
+        ]
+        return choices
+class WINOX_DE(WINOX):
+    NAME = "WINOX_DE"
+    SUBJECTS = ["lm_en_de"]
+    LANGUAGE = Language.DEU
+    LANGUAGE_SHORT_CODE = "de"
+class WINOX_FR(WINOX):
+    NAME = "WINOX_FR"
+    SUBJECTS = ["lm_en_fr"]
+    LANGUAGE = Language.FRA
+    LANGUAGE_SHORT_CODE = "fr"