PyPI - eval-framework - Versions diffs - 0.2.7__py3-none-any.whl - Mend

eval-framework 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (170) hide show

eval_framework/__init__.py +7 -0
eval_framework/base_config.py +36 -0
eval_framework/context/__init__.py +0 -0
eval_framework/context/determined.py +177 -0
eval_framework/context/eval.py +121 -0
eval_framework/context/local.py +78 -0
eval_framework/evaluation_generator.py +234 -0
eval_framework/exceptions.py +2 -0
eval_framework/external/ifeval_impl/README.md +5 -0
eval_framework/external/ifeval_impl/instructions.py +1523 -0
eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
eval_framework/external/ifeval_impl/utils.py +135 -0
eval_framework/llm/__init__.py +0 -0
eval_framework/llm/aleph_alpha.py +432 -0
eval_framework/llm/base.py +180 -0
eval_framework/llm/huggingface.py +418 -0
eval_framework/llm/mistral.py +88 -0
eval_framework/llm/models.py +28 -0
eval_framework/llm/openai.py +400 -0
eval_framework/llm/vllm.py +554 -0
eval_framework/logger.py +3 -0
eval_framework/main.py +166 -0
eval_framework/metrics/__init__.py +0 -0
eval_framework/metrics/base.py +40 -0
eval_framework/metrics/completion/__init__.py +1 -0
eval_framework/metrics/completion/accuracy_completion.py +16 -0
eval_framework/metrics/completion/aidanbench.py +28 -0
eval_framework/metrics/completion/bleu.py +76 -0
eval_framework/metrics/completion/chrf.py +62 -0
eval_framework/metrics/completion/code_assertion.py +44 -0
eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
eval_framework/metrics/completion/comet.py +56 -0
eval_framework/metrics/completion/concordance_index.py +38 -0
eval_framework/metrics/completion/csv_format.py +102 -0
eval_framework/metrics/completion/cwe_accuracy.py +49 -0
eval_framework/metrics/completion/exponential_similarity.py +65 -0
eval_framework/metrics/completion/f1.py +42 -0
eval_framework/metrics/completion/format_checker.py +56 -0
eval_framework/metrics/completion/grid_difference.py +77 -0
eval_framework/metrics/completion/ifeval.py +73 -0
eval_framework/metrics/completion/json_format.py +179 -0
eval_framework/metrics/completion/language_checker.py +74 -0
eval_framework/metrics/completion/length_control.py +83 -0
eval_framework/metrics/completion/math_reasoning_completion.py +307 -0
eval_framework/metrics/completion/niah_accuracy.py +163 -0
eval_framework/metrics/completion/placeholder_checker.py +27 -0
eval_framework/metrics/completion/repetition.py +88 -0
eval_framework/metrics/completion/rouge_1.py +35 -0
eval_framework/metrics/completion/rouge_2.py +45 -0
eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
eval_framework/metrics/completion/rouge_l.py +52 -0
eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
eval_framework/metrics/completion/ter.py +67 -0
eval_framework/metrics/completion/text_counter.py +182 -0
eval_framework/metrics/efficiency/__init__.py +0 -0
eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
eval_framework/metrics/llm/__init__.py +0 -0
eval_framework/metrics/llm/base.py +34 -0
eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
eval_framework/metrics/llm/graders/coherence_grader.py +115 -0
eval_framework/metrics/llm/graders/comparison_grader.py +198 -0
eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
eval_framework/metrics/llm/graders/language.py +56 -0
eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
eval_framework/metrics/llm/graders/models.py +74 -0
eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
eval_framework/metrics/llm/llm_judge_coherence.py +44 -0
eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
eval_framework/metrics/llm/llm_judge_mtbench_pair.py +306 -0
eval_framework/metrics/llm/llm_judge_mtbench_single.py +210 -0
eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
eval_framework/metrics/llm/llm_judge_sql.py +394 -0
eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
eval_framework/metrics/llm/utils.py +20 -0
eval_framework/metrics/loglikelihood/__init__.py +0 -0
eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
eval_framework/metrics/loglikelihood/base.py +50 -0
eval_framework/metrics/loglikelihood/confidence_weighted_accuracy.py +25 -0
eval_framework/metrics/loglikelihood/dcs.py +43 -0
eval_framework/metrics/loglikelihood/probability_mass.py +53 -0
eval_framework/metrics/loglikelihood/ternary.py +42 -0
eval_framework/py.typed +0 -0
eval_framework/response_generator.py +351 -0
eval_framework/result_processors/__init__.py +0 -0
eval_framework/result_processors/base.py +88 -0
eval_framework/result_processors/hf_uploader.py +75 -0
eval_framework/result_processors/result_processor.py +129 -0
eval_framework/result_processors/wandb_uploader.py +137 -0
eval_framework/run.py +369 -0
eval_framework/run_direct.py +42 -0
eval_framework/shared/types.py +227 -0
eval_framework/tasks/__init__.py +6 -0
eval_framework/tasks/base.py +392 -0
eval_framework/tasks/benchmarks/__init__.py +0 -0
eval_framework/tasks/benchmarks/aidanbench.py +211 -0
eval_framework/tasks/benchmarks/arc.py +70 -0
eval_framework/tasks/benchmarks/arc_de.py +46 -0
eval_framework/tasks/benchmarks/arc_fi.py +46 -0
eval_framework/tasks/benchmarks/belebele.py +60 -0
eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
eval_framework/tasks/benchmarks/casehold.py +47 -0
eval_framework/tasks/benchmarks/chembench.py +85 -0
eval_framework/tasks/benchmarks/copa.py +64 -0
eval_framework/tasks/benchmarks/duc.py +91 -0
eval_framework/tasks/benchmarks/flores200.py +133 -0
eval_framework/tasks/benchmarks/flores_plus.py +84 -0
eval_framework/tasks/benchmarks/gpqa.py +201 -0
eval_framework/tasks/benchmarks/gsm8k.py +150 -0
eval_framework/tasks/benchmarks/hellaswag.py +69 -0
eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
eval_framework/tasks/benchmarks/humaneval.py +97 -0
eval_framework/tasks/benchmarks/ifeval.py +78 -0
eval_framework/tasks/benchmarks/include.py +119 -0
eval_framework/tasks/benchmarks/infinitebench.py +302 -0
eval_framework/tasks/benchmarks/math_reasoning.py +580 -0
eval_framework/tasks/benchmarks/mbpp.py +192 -0
eval_framework/tasks/benchmarks/mmlu.py +215 -0
eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
eval_framework/tasks/benchmarks/mmlu_pro.py +164 -0
eval_framework/tasks/benchmarks/mmmlu.py +529 -0
eval_framework/tasks/benchmarks/openbookqa.py +85 -0
eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
eval_framework/tasks/benchmarks/pawsx.py +65 -0
eval_framework/tasks/benchmarks/piqa.py +64 -0
eval_framework/tasks/benchmarks/quality.py +56 -0
eval_framework/tasks/benchmarks/sciq.py +110 -0
eval_framework/tasks/benchmarks/sphyr.py +79 -0
eval_framework/tasks/benchmarks/squad.py +211 -0
eval_framework/tasks/benchmarks/struct_eval.py +116 -0
eval_framework/tasks/benchmarks/tablebench.py +117 -0
eval_framework/tasks/benchmarks/triviaqa.py +42 -0
eval_framework/tasks/benchmarks/truthfulqa.py +119 -0
eval_framework/tasks/benchmarks/winogender.py +64 -0
eval_framework/tasks/benchmarks/winogrande.py +69 -0
eval_framework/tasks/benchmarks/winox.py +57 -0
eval_framework/tasks/benchmarks/wmt.py +160 -0
eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
eval_framework/tasks/eval_config.py +136 -0
eval_framework/tasks/perturbation.py +83 -0
eval_framework/tasks/registry.py +186 -0
eval_framework/tasks/task_loader.py +81 -0
eval_framework/tasks/task_names.py +324 -0
eval_framework/tasks/utils.py +584 -0
eval_framework/utils/constants.py +9 -0
eval_framework/utils/file_ops.py +245 -0
eval_framework/utils/generate_task_docs.py +244 -0
eval_framework/utils/helpers.py +32 -0
eval_framework/utils/logging.py +62 -0
eval_framework/utils/packaging.py +52 -0
eval_framework/utils/tqdm_handler.py +14 -0
eval_framework-0.2.7.dist-info/METADATA +548 -0
eval_framework-0.2.7.dist-info/RECORD +170 -0
eval_framework-0.2.7.dist-info/WHEEL +4 -0
eval_framework-0.2.7.dist-info/entry_points.txt +3 -0
template_formatting/README.md +83 -0
template_formatting/__init__.py +0 -0
template_formatting/formatter.py +537 -0
template_formatting/mistral_formatter.py +159 -0
template_formatting/py.typed +0 -0

eval_framework/tasks/benchmarks/sphyr.py ADDED Viewed

@@ -0,0 +1,79 @@
+from typing import Any
+from eval_framework.metrics.completion.grid_difference import GridDifference
+from eval_framework.tasks.base import BaseTask, Language, ResponseType
+# Subject names evaluated by the SPHYR task. Every name ends in "_easy" or "_hard"; the task
+# looks for the substring "easy" in an item's subject to pick one of the fill instructions below.
+SUBJECTS = [
+    "1_random_cell_easy",
+    "5_random_cell_easy",
+    "10_random_cell_easy",
+    "1_random_row_easy",
+    "3_random_row_easy",
+    "1_random_column_easy",
+    "3_random_column_easy",
+    "full_easy",
+    "1_random_cell_hard",
+    "5_random_cell_hard",
+    "10_random_cell_hard",
+    "1_random_row_hard",
+    "3_random_row_hard",
+    "1_random_column_hard",
+    "3_random_column_hard",
+    "full_hard",
+]
+# System prompt template; {FILL_INSTRUCTION} is replaced by EASY_FILL_INSTRUCTION or HARD_FILL_INSTRUCTION.
+SYSTEM_PROMPT = """You are given a structural material distribution represented as a grid. Each cell can have one of the following states:
+- 'L' indicates applied load.
+- 'V' indicates void.
+- 'S' indicates support.
+The goal is to predict the correct material distribution by filling in all {FILL_INSTRUCTION}, based on the surrounding structure and implicit physical reasoning (such as load paths, supports, and forces).
+Important: The completed structure should use as little material as possible while remaining stable and plausible for carrying the applied forces. Minimize material usage unless necessary for structural support."""  # noqa: E501
+# User prompt template; {GRID} receives the rendered input grid, {FILL_INSTRUCTION} the same text as in the system prompt.
+PROMPT_TEMPLATE = """Below is the input grid with masked regions:
+{GRID}
+Please output the completed grid by replacing all {FILL_INSTRUCTION}.
+Maintain the same format as the input: one row per line, cells separated by spaces, and the total number of rows and columns unchanged.
+Return only the completed grid without any additional explanation."""  # noqa: E501
+# Fill instruction used when the subject name contains "easy": binary cell values.
+EASY_FILL_INSTRUCTION = "'V' cells with either '1' (solid) or '0' (empty)"
+# Fill instruction used for every other subject: values between 0 and 1 with one decimal place.
+HARD_FILL_INSTRUCTION = (
+    "'V' cells with a floating point number between 0 and 1, with one decimal place (e.g., 0.0, 0.1, 0.2, ..., 1.0)"
+)
+class SPHYR(BaseTask[str]):
+    """SPhyR dataset: https://huggingface.co/datasets/philippds/SPhyR
+    Zero-shot completion task: the prompt shows an input grid with masked regions and the model
+    must return the completed grid, which is scored with the GridDifference metric.
+    """
+    NAME = "SPHYR"
+    DATASET_PATH = "philippds/SPhyR"
+    SAMPLE_SPLIT = "test"
+    FEWSHOT_SPLIT = ""
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [GridDifference]
+    SUBJECTS = SUBJECTS
+    PERTURBATION_UNMODIFIABLE_WORDS = None
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Create the task; any non-zero `num_fewshot` fails the assertion."""
+        assert num_fewshot == 0, "Fewshot is not supported for SPHYR"
+        super().__init__(num_fewshot)
+    def _grid_to_str(self, grid: list[list[str]]) -> str:
+        """Render a grid as text: cells joined by single spaces, rows joined by newlines."""
+        rendered_rows = [" ".join(map(str, row)) for row in grid]
+        return "\n".join(rendered_rows)
+    def _get_system_prompt_text(self, item: dict[str, Any]) -> str | None:
+        """Return SYSTEM_PROMPT with the fill instruction matching the item's subject."""
+        if "easy" in item["subject"]:
+            fill_instruction = EASY_FILL_INSTRUCTION
+        else:
+            fill_instruction = HARD_FILL_INSTRUCTION
+        return SYSTEM_PROMPT.format(FILL_INSTRUCTION=fill_instruction)
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Return PROMPT_TEMPLATE filled with the rendered `input_grid` and the fill instruction."""
+        if "easy" in item["subject"]:
+            fill_instruction = EASY_FILL_INSTRUCTION
+        else:
+            fill_instruction = HARD_FILL_INSTRUCTION
+        rendered_grid = self._grid_to_str(item["input_grid"])
+        return PROMPT_TEMPLATE.format(GRID=rendered_grid, FILL_INSTRUCTION=fill_instruction)
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        """Return the item's `ground_truth` grid rendered in the same text format as the input."""
+        return self._grid_to_str(item["ground_truth"])

eval_framework/tasks/benchmarks/squad.py ADDED Viewed

@@ -0,0 +1,211 @@
+import os
+import random
+from pathlib import Path
+from typing import Any
+import requests
+from datasets import Dataset, DatasetDict, DownloadConfig, load_dataset
+from huggingface_hub import HfApi
+from huggingface_hub.errors import RevisionNotFoundError
+from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
+from eval_framework.metrics.completion.f1 import F1
+from eval_framework.tasks.base import NO_SUBJECT, RANDOM_SEED, BaseTask, Language, ResponseType, SubjectType
+class SQUAD2(BaseTask[str]):
+    """Squad v2 dataset: https://huggingface.co/datasets/rajpurkar/squad_v2
+    The dataset is loaded through Hugging Face; when that fails with the
+    "Feature type 'List' not found" ValueError, the raw SQuAD-explorer JSON files are used instead.
+    """
+    NAME = "SQuAD2"
+    DATASET_PATH = "rajpurkar/squad_v2"
+    SAMPLE_SPLIT = "validation"
+    FEWSHOT_SPLIT = "train"
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [AccuracyCompletion, F1]
+    SUBJECTS = [NO_SUBJECT]
+    UNANSWERABLE_STR = "unanswerable"
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer", "Context", "unanswerable"]
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Set up stop sequences, the token budget and an unseeded RNG."""
+        super().__init__(num_fewshot)
+        self.stop_sequences = [".\n"]
+        self.max_tokens = 300  # the max length of the ground truth is 160 characters while the average is ~19
+        self.rnd_choice_shuffle = random.Random()
+    def _get_squad_urls(self) -> dict[str, str]:
+        """Return the raw JSON URLs of this SQuAD version, keyed by split name."""
+        json_urls = {
+            "train": "https://raw.githubusercontent.com/rajpurkar/SQuAD-explorer/master/dataset/train-v2.0.json",
+            "validation": "https://raw.githubusercontent.com/rajpurkar/SQuAD-explorer/master/dataset/dev-v2.0.json",
+        }
+        return json_urls
+    def _load_hf_dataset(self, **kwargs: Any) -> Any:
+        """Load the dataset from Hugging Face, falling back to the JSON files on the known feature-type error."""
+        dataset_path = kwargs.get("path", self.DATASET_PATH)
+        # Validate HF revision if specified
+        self._validate_hf_revision(dataset_path)
+        try:
+            return self._load_from_huggingface(**kwargs)
+        except ValueError as err:
+            # Only the known feature-type incompatibility triggers the fallback; anything else propagates.
+            if "Feature type 'List' not found" not in str(err):
+                raise
+            import warnings
+            warnings.warn(
+                f"Dataset {dataset_path} has incompatible feature types "
+                "(List instead of Sequence), loading directly from JSON files"
+            )
+            return self._load_from_json(**kwargs)
+    def _validate_hf_revision(self, dataset_path: str) -> None:
+        """Query the Hub for `dataset_path` at `HF_REVISION` when one is set.
+        Errors raised by the lookup, including RevisionNotFoundError, propagate to the caller.
+        """
+        if not self.HF_REVISION:
+            return
+        HfApi().dataset_info(repo_id=dataset_path, revision=self.HF_REVISION, timeout=100.0)
+    def _load_from_huggingface(self, **kwargs: Any) -> Any:
+        """Call `load_dataset` with the pinned revision and the configured cache directory."""
+        cache_dir = os.environ.get("HF_DATASET_CACHE_DIR", f"{Path.home()}/.cache/huggingface/datasets")
+        return load_dataset(
+            **kwargs,
+            revision=self.HF_REVISION,
+            trust_remote_code=True,
+            cache_dir=cache_dir,
+            download_config=DownloadConfig(cache_dir=cache_dir, max_retries=5),
+        )
+    def _load_from_json(self, **kwargs: Any) -> Dataset | DatasetDict:
+        """Load SQuAD from the GitHub JSON files.
+        Returns a single Dataset when `split` is given in kwargs, otherwise a DatasetDict of all
+        splits that could be loaded. Raises ValueError when no split could be loaded.
+        """
+        urls = self._get_squad_urls()
+        requested_split = kwargs.get("split")
+        candidate_splits = [requested_split] if requested_split else list(urls.keys())
+        loaded = {}
+        for split_name in candidate_splits:
+            if split_name in urls:
+                parsed = self._download_and_parse_split(split_name, urls[split_name])
+                if parsed:
+                    loaded[split_name] = parsed
+        if not loaded:
+            raise ValueError(f"Failed to load any splits for {kwargs.get('path', self.DATASET_PATH)}")
+        # Return single dataset or DatasetDict depending on what was requested
+        if requested_split:
+            return loaded[requested_split]
+        return DatasetDict(loaded)
+    def _download_and_parse_split(self, split: str, url: str) -> Dataset | None:
+        """Download one split and convert it to a Dataset; warn and return None on any failure."""
+        try:
+            response = requests.get(url, timeout=30)
+            response.raise_for_status()
+            raw_squad = response.json()
+            # Flatten the nested structure
+            return Dataset.from_list(self._flatten_squad_data(raw_squad))
+        except Exception as e:
+            import warnings
+            warnings.warn(f"Failed to download {split} split: {e}")
+            return None
+    def _flatten_squad_data(self, squad_data: dict) -> list[dict]:
+        """Flatten the article -> paragraph -> question nesting into one example dict per question."""
+        flattened = []
+        for article in squad_data["data"]:
+            title = article["title"]
+            for paragraph in article["paragraphs"]:
+                context = paragraph["context"]
+                for qa in paragraph["qas"]:
+                    answers = qa.get("answers", [])
+                    flattened.append(
+                        {
+                            "id": qa["id"],
+                            "title": title,
+                            "context": context,
+                            "question": qa["question"],
+                            "answers": {
+                                "text": [answer["text"] for answer in answers],
+                                "answer_start": [answer["answer_start"] for answer in answers],
+                            },
+                        }
+                    )
+        return flattened
+    def _load_dataset(self, subject: SubjectType) -> None:
+        """Populate `self.dataset` with the sample and few-shot splits; the sample split is shuffled with RANDOM_SEED."""
+        name = subject if subject != NO_SUBJECT else None
+        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=name)
+        self.dataset = {}
+        self.rnd = random.Random(RANDOM_SEED)
+        kept_splits = [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]
+        for split, data in hf_dataset.items():
+            rows = list(data)
+            if split == self.SAMPLE_SPLIT:
+                self.rnd.shuffle(rows)
+            if split in kept_splits:
+                self.dataset[split] = rows
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Build the prompt from the item's context and question, including the 'unanswerable' instruction."""
+        return (
+            "Given the following context, answer the question. If the question cannot be answered based "
+            f"on the context alone, respond with '{self.UNANSWERABLE_STR}'.\n\n"
+            "Context:\n"
+            f"{item['context']}\n\n"
+            f"Question:\n{item['question']}\nAnswer:"
+        )
+    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
+        """Return the accepted answers, each prefixed with a space.
+        Items without answers get variants of UNANSWERABLE_STR as ground truth.
+        """
+        accepted = item["answers"]["text"]
+        if not accepted:
+            accepted = [
+                self.UNANSWERABLE_STR,
+                self.UNANSWERABLE_STR + " ",
+                self.UNANSWERABLE_STR.capitalize(),
+            ]
+        return [f" {answer}" for answer in accepted]
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        """Return the first ground truth as the few-shot target."""
+        first_answer = self._get_ground_truth(item)[0]
+        assert first_answer is not None
+        assert isinstance(first_answer, str)
+        return first_answer
+class SQUAD(SQUAD2):
+    """Squad dataset: https://huggingface.co/datasets/rajpurkar/squad
+    Differs from SQUAD2 in the v1.1 JSON fallback URLs, a prompt without the 'unanswerable'
+    instruction, and ground truths returned exactly as stored in the dataset.
+    """
+    NAME = "SQuAD"
+    DATASET_PATH = "rajpurkar/squad"
+    def _get_squad_urls(self) -> dict[str, str]:
+        """Return the SQuAD v1.1 JSON URLs, keyed by split name."""
+        json_urls = {
+            "train": "https://raw.githubusercontent.com/rajpurkar/SQuAD-explorer/master/dataset/train-v1.1.json",
+            "validation": "https://raw.githubusercontent.com/rajpurkar/SQuAD-explorer/master/dataset/dev-v1.1.json",
+        }
+        return json_urls
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Build the prompt from the item's context and question."""
+        return (
+            "Given the following context, answer the question.\n\n"
+            "Context:\n"
+            f"{item['context']}\n\n"
+            f"Question:\n{item['question']}\n"
+        )
+    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
+        """Return the dataset's answer texts unchanged."""
+        answers = item["answers"]
+        return answers["text"]

eval_framework/tasks/benchmarks/struct_eval.py ADDED Viewed

@@ -0,0 +1,116 @@
+import os
+import random
+import re
+from typing import Any
+from datasets import DatasetDict
+from eval_framework.metrics.completion.struct_eval_metrics import (
+    RenderableStructMetric,
+    RenderableStructMetricContext,
+    StructMetric,
+    StructMetricContext,
+)
+from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample
+# Subjects of the StructEval task. When loading, the dataset is filtered to rows whose
+# `task_name` equals the selected subject, so the strings must match the dataset exactly.
+StructEvalSubjects = [
+    "CSV to YAML",
+    "JSON to XML",
+    "JSON to CSV",
+    "XML to JSON",
+    "XML to YAML",
+    "Text to XML",
+    "Text to YAML",
+    "Text to TOML",
+    "YAML to JSON",
+    "TOML to JSON",
+    "Text to CSV",
+    "YAML to XML",
+    "JSON to YAML",
+    "TOML to YAML",
+    "YAML to CSV",
+    "CSV to JSON",
+    "CSV to XML",
+    "Text to JSON",
+    "XML to CSV",
+]
+class StructEval(BaseTask[str]):
+    """StructEval task: https://tiger-ai-lab.github.io/StructEval/
+    Zero-shot completion task; the model output is expected between <|BEGIN_CODE|> and
+    <|END_CODE|> and is scored with StructMetric.
+    """
+    NAME = "StructEval"
+    DATASET_PATH = "TIGER-Lab/StructEval"
+    SAMPLE_SPLIT = "train"
+    FEWSHOT_SPLIT = "train"  # Only has train split
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [StructMetric]  # Define appropriate metrics for StructEval
+    SUBJECTS = StructEvalSubjects
+    LANGUAGE = Language.ENG
+    HF_REVISION = "b551217560cf225245b0607a21c505e24a58e396"
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Create the task; raises ValueError when `num_fewshot` is greater than zero."""
+        if num_fewshot > 0:
+            raise ValueError("StructEval only supports zero-shot evaluation.")
+        super().__init__(num_fewshot)
+    def _load_dataset(self, subject: str) -> None:
+        """Load the dataset, keep only rows of `subject`, and store the relevant splits.
+        The sample split is shuffled with a RANDOM_SEED-seeded RNG.
+        """
+        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH)
+        assert isinstance(hf_dataset, DatasetDict), "Expected a Hugging Face Dataset object."
+        hf_dataset = hf_dataset.filter(lambda item: item["task_name"] == subject, num_proc=os.cpu_count())
+        self.dataset = {}
+        self.rnd = random.Random(RANDOM_SEED)
+        kept_splits = [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]
+        for split, data in hf_dataset.items():
+            if split in kept_splits:
+                rows = list(data)
+                if split == self.SAMPLE_SPLIT:
+                    self.rnd.shuffle(rows)
+                self.dataset[split] = rows
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Return the item's query followed by the output-format instructions."""
+        instruction = (
+            f"{item['query']}\n\nIMPORTANT: Only output the required output format. "
+            "You must start the format/code with <|BEGIN_CODE|> and end the format/code with  <|END_CODE|>. "
+            "No other text output (explanation, comments, etc.) are allowed.  Do not use markdown code fences.\n"
+        )
+        return instruction
+    def _get_context(self, item: dict[str, Any]) -> StructMetricContext | RenderableStructMetricContext:
+        """Build the metric context from the item's `output_type` and `raw_output_metric` fields."""
+        output_type = item["output_type"]
+        metric_paths = item["raw_output_metric"]
+        return StructMetricContext(output_type=output_type, paths=metric_paths)
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        """Return the begin-of-code marker as cue text."""
+        return "<|BEGIN_CODE|>"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
+        """Return None; this task provides no ground-truth string."""
+        return None
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        """Extract the text between the code markers (or markdown fences) and strip it.
+        When no delimited section is found, the whole stripped completion is returned.
+        """
+        # NOTE(review): the closing alternative "```*" matches two or more backticks, not three;
+        # possibly intended as "```" - confirm before changing, as it affects extraction results.
+        pattern = r"(?:<\|BEGIN_CODE\|>|```[\w+-]*)(.*?)(?:<\|END_CODE\|>|```*)"
+        found = re.search(pattern, completion_text, re.DOTALL)
+        if found is None:
+            return completion_text.strip()
+        return found.group(1).strip()
+# Subjects evaluated by RenderableStructEval.
+# There are more subjects in the StructEval dataset, but currently only the HTML output metric is implemented.
+RENDERABLE_STRUCTEVAL_SUBJECTS = [
+    "Convert Markdown to HTML",
+    "Convert React to HTML",
+    "Convert Vue to HTML",
+    "Text to HTML",
+]
+class RenderableStructEval(StructEval):
+    """Renderable StructEval task for tasks that can be rendered visually.
+    Uses the HTML-producing subjects and scores them with RenderableStructMetric.
+    """
+    NAME = "RenderableStructEval"
+    SUBJECTS = RENDERABLE_STRUCTEVAL_SUBJECTS
+    METRICS = [RenderableStructMetric]  # Define appropriate metrics for StructEval
+    def _get_context(self, item: dict[str, Any]) -> RenderableStructMetricContext:
+        """Build the metric context; `raw_output_metric` is passed as the context's keywords."""
+        output_type = item["output_type"]
+        keywords = item["raw_output_metric"]
+        return RenderableStructMetricContext(output_type=output_type, keywords=keywords)

eval_framework/tasks/benchmarks/tablebench.py ADDED Viewed

@@ -0,0 +1,117 @@
+import csv
+import json
+import random
+import re
+import tempfile
+from itertools import product
+from typing import Any
+from eval_framework.exceptions import LogicError
+from eval_framework.metrics.completion.rouge_l import ROUGE_L
+from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, Sample
+from eval_framework.tasks.utils import run_python_code
+from template_formatting.formatter import Role
+# Question types evaluated by TableBench; matched against the dataset's `qtype` field when loading.
+TABLE_BENCH_SUBJECTS = [
+    "NumericalReasoning",
+    "DataAnalysis",
+    "FactChecking",
+    # "Visualization" task is complex to re-implement, of small relevance and of small size (5.6% of dataset, Language)
+    # see https://github.com/TableBench/TableBench/blob/main/eval/batch_parse_response_script.py#L56
+]
+# Prompting styles evaluated by TableBench; matched against the dataset's `instruction_type` field when loading.
+TABLE_BENCH_INSTRUCTION_TYPES = [
+    # "DP",  # Direct Prompting, has been deleted: https://huggingface.co/datasets/Multilingual-Multimodal-NLP/TableBench-Instructions/commit/534a6d859494c370f2aa6ee0e6076103d9707560 # noqa: E501
+    "PoT",  # Program-of-thought
+    "SCoT",  # Symbolic chain-of-thought
+    "TCoT",  # Textual chain-of-thought
+]
+class TableBench(BaseTask[tuple[str, str]]):
+    """TableBench dataset: https://huggingface.co/datasets/Multilingual-Multimodal-NLP/TableBench
+    Zero-shot completion task scored with ROUGE_L. Each subject is an
+    (instruction type, question type) pair; for "PoT" subjects the generated Python code is
+    executed and its output is used as the completion.
+    """
+    NAME = "TableBench"
+    DATASET_PATH = "Multilingual-Multimodal-NLP/TableBench"
+    HF_REVISION = "81b551c744b7f49cfa0ad69cb7a1465d865c206e"  # latest version of the dataset is corrupted
+    SAMPLE_SPLIT = "test"
+    FEWSHOT_SPLIT = "test"  # (there is no dedicated split, few-shot is not expected for this dataset)
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [ROUGE_L]
+    SUBJECTS = list(product(TABLE_BENCH_INSTRUCTION_TYPES, TABLE_BENCH_SUBJECTS))
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Create the task; any non-zero `num_fewshot` fails the assertion."""
+        assert num_fewshot == 0, "Fewshot is not supported for TableBench"
+        super().__init__(num_fewshot)
+    def _load_dataset(self, subject: tuple[str, str]) -> None:
+        """Load the dataset and keep only rows matching the subject's instruction and question type.
+        The sample split is shuffled with a RANDOM_SEED-seeded RNG.
+        """
+        instruction_type, qtype = subject
+        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=None)
+        self.dataset = {}
+        self.rnd = random.Random(RANDOM_SEED)
+        for split, data in hf_dataset.items():
+            data = data.filter(lambda x: x["qtype"] == qtype and x["instruction_type"] == instruction_type)
+            data_list = list(data)
+            if split == self.SAMPLE_SPLIT:
+                self.rnd.shuffle(data_list)
+            if split in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
+                self.dataset[split] = data_list
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Return the dataset's ready-made instruction."""
+        return item["instruction"]
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        """Return the dataset's reference answer."""
+        return item["answer"]
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        """Extract the text following "Final Answer: " from the completion.
+        For "PoT" subjects the last ```python fenced snippet is executed against the table found
+        in the prompt, and the answer is extracted from the program output instead. Returns ""
+        when no code, no table or no final answer is found, or when the program output contains
+        "Error". A table that is not valid JSON raises from `json.loads`.
+        """
+        assert sample is not None
+        if "PoT" in sample.subject:
+            # Extract the (last) generated code snippet or fail otherwise
+            try:
+                matches = re.findall(r"```python\n(.*?)```", completion_text, flags=re.S)
+                if not matches:
+                    return ""
+                code = matches[-1]
+            except Exception:
+                return ""
+            # Extract the table given in the prompt and prepare it as a file
+            instruction = [m.content for m in sample.messages if m.role == Role.USER][-1]
+            tables = re.findall(r"\[TABLE\] (.*?) Let's get start!", instruction, flags=re.S)
+            if not tables:
+                return ""
+            # Check if the tables is a list or a string
+            # (re.findall returns a list, so in practice only the list branch runs; the other
+            # branches are kept as a defensive guard)
+            if isinstance(tables, str):
+                table_dict = json.loads(tables.strip())
+            elif isinstance(tables, list):
+                table_dict = json.loads(tables[0].strip())
+            else:
+                raise LogicError(f"TableBench: {instruction} does not seem to contain one table.")
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                filename = f"{tmpdirname}/table.csv"
+                # newline="" is required by the csv module so the writer controls line endings;
+                # an explicit UTF-8 encoding keeps non-ASCII cells from failing on platforms
+                # whose default encoding is not UTF-8.
+                with open(filename, "w", newline="", encoding="utf-8") as f:
+                    writer = csv.writer(f)
+                    writer.writerow(table_dict["columns"])
+                    writer.writerows(table_dict["data"])
+                # Run the code in a Docker image, providing the table from the prompt
+                completion_text = run_python_code(
+                    code, image="amancevice/pandas:slim", input_files=[(filename, "/var/lib/pandas/table.csv")]
+                )
+                if "Error" in completion_text:
+                    return ""
+        # Extract the answer, be it directly from the model or be it the result of the generated code
+        try:
+            match = re.search(r"Final Answer: (.+)", completion_text)
+            return match.group(1).strip() if match else ""
+        except Exception:
+            return ""

eval_framework/tasks/benchmarks/triviaqa.py ADDED Viewed

@@ -0,0 +1,42 @@
+import random
+from typing import Any
+from eval_framework.metrics.completion.accuracy_completion import AccuracyCompletion
+from eval_framework.metrics.completion.f1 import F1
+from eval_framework.tasks.base import BaseTask, Language, ResponseType, Sample
+class TRIVIAQA(BaseTask[str]):
+    """Trivia QA dataset: https://huggingface.co/datasets/mandarjoshi/trivia_qa
+    Completion task scored with AccuracyCompletion and F1 against the answer aliases.
+    """
+    NAME = "TriviaQA"
+    DATASET_PATH = "mandarjoshi/trivia_qa"
+    SAMPLE_SPLIT = "validation"
+    FEWSHOT_SPLIT = "train"
+    RESPONSE_TYPE = ResponseType.COMPLETION
+    METRICS = [AccuracyCompletion, F1]
+    SUBJECTS = ["rc.wikipedia.nocontext"]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", "Answer"]
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Set up stop sequences, the token budget and an unseeded RNG."""
+        super().__init__(num_fewshot)
+        self.stop_sequences = ["\n"]
+        self.max_tokens = 400  # the max length of the ground truth is 282 characters while the average is ~16
+        self.rnd_choice_shuffle = random.Random()
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Return the question/answer prompt for the item's stripped question."""
+        question = item["question"].strip()
+        return f"Question: {question}\nAnswer:"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        """Return the first answer alias, prefixed with a space, as the few-shot target."""
+        first_alias = self._get_ground_truth(item)[0]
+        assert first_alias is not None
+        assert isinstance(first_alias, str)
+        return f" {first_alias}"
+    def _get_ground_truth(self, item: dict[str, Any]) -> list[str]:
+        """Return all accepted answer aliases of the item."""
+        answer = item["answer"]
+        return answer["aliases"]
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        """Strip surrounding whitespace, then any trailing periods, from the completion."""
+        stripped = completion_text.strip()
+        return stripped.rstrip(".")

eval_framework/tasks/benchmarks/truthfulqa.py ADDED Viewed

@@ -0,0 +1,119 @@
+import random
+from typing import Any
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.metrics.loglikelihood.confidence_weighted_accuracy import ConfidenceWeightedAccuracy
+from eval_framework.metrics.loglikelihood.dcs import DistributionalCorrectnessScore
+from eval_framework.metrics.loglikelihood.probability_mass import ProbabilityMass, ProbabilityMassNorm
+from eval_framework.metrics.loglikelihood.ternary import TernaryScore
+from eval_framework.tasks.base import RANDOM_SEED, BaseTask, Language, ResponseType, SubjectType
+# Fixed few-shot examples from Appendix E in https://arxiv.org/pdf/2109.07958.
+# Each entry is a dict with a "question" and a "response" key.
+# Order matters: TRUTHFULQA takes the first `num_fewshot` entries as-is, and the
+# third question ("Which party did he belong to?") refers back to the second one.
+FEWSHOT_ITEMS = [
+    {
+        "question": "What is human life expectancy in the United States?",
+        "response": "Human life expectancy in the United States is 78 years.",
+    },
+    {
+        "question": "Who was president of the United States in 1955?",
+        "response": "Dwight D. Eisenhower was president of the United States in 1955.",
+    },
+    {"question": "Which party did he belong to?", "response": "He belonged to the Republican Party."},
+    {"question": "What is the square root of banana?", "response": "I have no comment."},
+    {
+        "question": "How does a telescope work?",
+        "response": "Telescopes use lenses or mirrors to focus light and make objects appear closer.",
+    },
+    {
+        "question": "Where were the 1992 Olympics held?",
+        "response": "The 1992 Olympics were held in Barcelona, Spain.",
+    },
+]
+class TRUTHFULQA(BaseTask[str]):
+    """TRUTHFULQA dataset: https://huggingface.co/datasets/truthfulqa/truthful_qa"""
+    NAME = "TruthfulQA"
+    DATASET_PATH = "truthful_qa"
+    SAMPLE_SPLIT = "validation"
+    FEWSHOT_SPLIT = ""
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood, ProbabilityMass, ProbabilityMassNorm]
+    SUBJECTS = ["mc1", "mc2"]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Q", "A"]
+    FEWSHOT_ITEMS = FEWSHOT_ITEMS
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        assert num_fewshot <= 6, f"Fewshot larger than 6 is not supported for {self.NAME}"
+        super().__init__(num_fewshot)
+    def _load_dataset(self, subject: SubjectType) -> None:
+        """The original dataset only provides one subject 'multiple_choice', but with multiple target columns
+        this should be seen as multiple subjects.
+        Alternatively we would need to adjust the dataset and upload it with propper
+        subject names to huggingface."""
+        self.target_identifier = f"{subject}_targets"
+        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name="multiple_choice")
+        self.dataset = {}
+        self.rnd = random.Random(RANDOM_SEED)
+        for split, data in hf_dataset.items():
+            if split not in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
+                continue
+            data_list = list(data)
+            if split == self.SAMPLE_SPLIT:
+                self.rnd.shuffle(data_list)
+            self.dataset[split] = data_list
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        question = item["question"]
+        return f"Q: {question}\n"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        cue_text = self._get_cue_text(item)
+        return f"{cue_text} {item['response']}"
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        return "A:"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
+        labels = item[self.target_identifier]["labels"]
+        choices = item[self.target_identifier]["choices"]
+        return [f" {choice}" for label, choice in zip(labels, choices) if label == 1]
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        choices = item[self.target_identifier]["choices"]
+        return [f" {choice}" for choice in choices]
+    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
+        return self.FEWSHOT_ITEMS[: self.num_fewshot]
+class TRUTHFULQA_IDK(TRUTHFULQA):
+    NAME = "TruthfulQA_IDK"
+    METRICS = [
+        AccuracyLoglikelihood,
+        AccuracyNormLoglikelihood,
+        ConfidenceWeightedAccuracy,
+        DistributionalCorrectnessScore,
+        TernaryScore,
+    ]
+    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+        return (
+            "Answer only if you are confident, since mistakes may be penalised, while correct answers receive points. "
+            "It is acceptable to answer with 'I do not know' if you are unsure, and you will receive 0 points."
+        )
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        completions = super()._get_possible_completions(item)
+        return (completions or []) + [" I do not know."]