PyPI - eval-framework - Versions diffs - 0.2.0__py3-none-any.whl - Mend

eval-framework 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (161) hide show

eval_framework/__init__.py +7 -0
eval_framework/base_config.py +36 -0
eval_framework/context/__init__.py +0 -0
eval_framework/context/determined.py +170 -0
eval_framework/context/eval.py +114 -0
eval_framework/context/local.py +52 -0
eval_framework/evaluation_generator.py +231 -0
eval_framework/exceptions.py +2 -0
eval_framework/external/ifeval_impl/README.md +5 -0
eval_framework/external/ifeval_impl/instructions.py +1523 -0
eval_framework/external/ifeval_impl/instructions_registry.py +161 -0
eval_framework/external/ifeval_impl/instructions_util.py +1689 -0
eval_framework/external/ifeval_impl/utils.py +135 -0
eval_framework/llm/__init__.py +0 -0
eval_framework/llm/aleph_alpha.py +323 -0
eval_framework/llm/base.py +58 -0
eval_framework/llm/huggingface.py +332 -0
eval_framework/llm/mistral.py +73 -0
eval_framework/llm/models.py +16 -0
eval_framework/llm/openai.py +205 -0
eval_framework/llm/vllm.py +438 -0
eval_framework/logger.py +3 -0
eval_framework/main.py +187 -0
eval_framework/metrics/__init__.py +0 -0
eval_framework/metrics/base.py +40 -0
eval_framework/metrics/completion/__init__.py +1 -0
eval_framework/metrics/completion/accuracy_completion.py +16 -0
eval_framework/metrics/completion/bleu.py +76 -0
eval_framework/metrics/completion/chrf.py +62 -0
eval_framework/metrics/completion/code_assertion.py +44 -0
eval_framework/metrics/completion/code_execution_pass_at_one.py +126 -0
eval_framework/metrics/completion/comet.py +56 -0
eval_framework/metrics/completion/concordance_index.py +38 -0
eval_framework/metrics/completion/csv_format.py +102 -0
eval_framework/metrics/completion/cwe_accuracy.py +49 -0
eval_framework/metrics/completion/exponential_similarity.py +65 -0
eval_framework/metrics/completion/f1.py +42 -0
eval_framework/metrics/completion/format_checker.py +56 -0
eval_framework/metrics/completion/grid_difference.py +77 -0
eval_framework/metrics/completion/ifeval.py +73 -0
eval_framework/metrics/completion/json_format.py +171 -0
eval_framework/metrics/completion/language_checker.py +74 -0
eval_framework/metrics/completion/length_control.py +83 -0
eval_framework/metrics/completion/math_reasoning_completion.py +303 -0
eval_framework/metrics/completion/niah_accuracy.py +163 -0
eval_framework/metrics/completion/placeholder_checker.py +27 -0
eval_framework/metrics/completion/repetition.py +88 -0
eval_framework/metrics/completion/rouge_1.py +35 -0
eval_framework/metrics/completion/rouge_2.py +45 -0
eval_framework/metrics/completion/rouge_geometric_mean.py +36 -0
eval_framework/metrics/completion/rouge_l.py +52 -0
eval_framework/metrics/completion/struct_eval_metrics.py +248 -0
eval_framework/metrics/completion/ter.py +67 -0
eval_framework/metrics/completion/text_counter.py +182 -0
eval_framework/metrics/efficiency/__init__.py +0 -0
eval_framework/metrics/efficiency/bytes_per_sequence_position.py +48 -0
eval_framework/metrics/llm/__init__.py +0 -0
eval_framework/metrics/llm/base.py +8 -0
eval_framework/metrics/llm/graders/chatbot_style_grader.py +92 -0
eval_framework/metrics/llm/graders/comparison_grader.py +146 -0
eval_framework/metrics/llm/graders/conciseness_grader.py +93 -0
eval_framework/metrics/llm/graders/contains_names_grader.py +71 -0
eval_framework/metrics/llm/graders/format_correctness_grader.py +109 -0
eval_framework/metrics/llm/graders/instruction_grader.py +177 -0
eval_framework/metrics/llm/graders/language.py +56 -0
eval_framework/metrics/llm/graders/long_context_grader.py +72 -0
eval_framework/metrics/llm/graders/models.py +74 -0
eval_framework/metrics/llm/graders/refusal_grader.py +57 -0
eval_framework/metrics/llm/graders/sql_quality_grader.py +145 -0
eval_framework/metrics/llm/graders/summary_world_knowledge_grader.py +103 -0
eval_framework/metrics/llm/llm_judge_chatbot_style.py +36 -0
eval_framework/metrics/llm/llm_judge_completion_accuracy.py +39 -0
eval_framework/metrics/llm/llm_judge_conciseness.py +37 -0
eval_framework/metrics/llm/llm_judge_contains_names.py +36 -0
eval_framework/metrics/llm/llm_judge_format_correctness.py +43 -0
eval_framework/metrics/llm/llm_judge_instruction.py +58 -0
eval_framework/metrics/llm/llm_judge_mtbench_pair.py +205 -0
eval_framework/metrics/llm/llm_judge_mtbench_single.py +188 -0
eval_framework/metrics/llm/llm_judge_refusal.py +35 -0
eval_framework/metrics/llm/llm_judge_sql.py +394 -0
eval_framework/metrics/llm/llm_judge_world_knowledge.py +37 -0
eval_framework/metrics/loglikelihood/__init__.py +0 -0
eval_framework/metrics/loglikelihood/accuracy_loglikelihood.py +51 -0
eval_framework/metrics/loglikelihood/probability_mass.py +56 -0
eval_framework/py.typed +0 -0
eval_framework/response_generator.py +416 -0
eval_framework/result_processors/__init__.py +0 -0
eval_framework/result_processors/base.py +74 -0
eval_framework/result_processors/hf_processor.py +87 -0
eval_framework/result_processors/result_processor.py +129 -0
eval_framework/run.py +314 -0
eval_framework/run_direct.py +42 -0
eval_framework/shared/types.py +227 -0
eval_framework/tasks/__init__.py +6 -0
eval_framework/tasks/base.py +314 -0
eval_framework/tasks/benchmarks/__init__.py +0 -0
eval_framework/tasks/benchmarks/arc.py +46 -0
eval_framework/tasks/benchmarks/arc_de.py +46 -0
eval_framework/tasks/benchmarks/arc_fi.py +46 -0
eval_framework/tasks/benchmarks/belebele.py +60 -0
eval_framework/tasks/benchmarks/bigcodebench.py +155 -0
eval_framework/tasks/benchmarks/casehold.py +47 -0
eval_framework/tasks/benchmarks/chembench.py +85 -0
eval_framework/tasks/benchmarks/copa.py +39 -0
eval_framework/tasks/benchmarks/duc.py +91 -0
eval_framework/tasks/benchmarks/flores200.py +62 -0
eval_framework/tasks/benchmarks/flores_plus.py +84 -0
eval_framework/tasks/benchmarks/gpqa.py +177 -0
eval_framework/tasks/benchmarks/gsm8k.py +148 -0
eval_framework/tasks/benchmarks/hellaswag.py +44 -0
eval_framework/tasks/benchmarks/hellaswag_de.py +52 -0
eval_framework/tasks/benchmarks/humaneval.py +97 -0
eval_framework/tasks/benchmarks/ifeval.py +78 -0
eval_framework/tasks/benchmarks/include.py +119 -0
eval_framework/tasks/benchmarks/infinitebench.py +302 -0
eval_framework/tasks/benchmarks/math_reasoning.py +569 -0
eval_framework/tasks/benchmarks/mbpp.py +192 -0
eval_framework/tasks/benchmarks/mmlu.py +190 -0
eval_framework/tasks/benchmarks/mmlu_de.py +109 -0
eval_framework/tasks/benchmarks/mmlu_pro.py +139 -0
eval_framework/tasks/benchmarks/mmmlu.py +529 -0
eval_framework/tasks/benchmarks/openbookqa.py +37 -0
eval_framework/tasks/benchmarks/opengptx_eu20.py +363 -0
eval_framework/tasks/benchmarks/pawsx.py +65 -0
eval_framework/tasks/benchmarks/piqa.py +39 -0
eval_framework/tasks/benchmarks/quality.py +56 -0
eval_framework/tasks/benchmarks/sciq.py +44 -0
eval_framework/tasks/benchmarks/sphyr.py +75 -0
eval_framework/tasks/benchmarks/squad.py +89 -0
eval_framework/tasks/benchmarks/struct_eval.py +110 -0
eval_framework/tasks/benchmarks/tablebench.py +117 -0
eval_framework/tasks/benchmarks/triviaqa.py +42 -0
eval_framework/tasks/benchmarks/truthfulqa.py +95 -0
eval_framework/tasks/benchmarks/winogender.py +39 -0
eval_framework/tasks/benchmarks/winogrande.py +44 -0
eval_framework/tasks/benchmarks/winox.py +57 -0
eval_framework/tasks/benchmarks/wmt.py +160 -0
eval_framework/tasks/benchmarks/zero_scrolls.py +197 -0
eval_framework/tasks/eval_config.py +112 -0
eval_framework/tasks/perturbation.py +83 -0
eval_framework/tasks/registry.py +186 -0
eval_framework/tasks/task_loader.py +80 -0
eval_framework/tasks/task_names.py +138 -0
eval_framework/tasks/utils.py +578 -0
eval_framework/utils/constants.py +9 -0
eval_framework/utils/generate_task_docs.py +229 -0
eval_framework/utils/helpers.py +3 -0
eval_framework/utils/logging.py +50 -0
eval_framework/utils/packaging.py +52 -0
eval_framework-0.2.0.dist-info/METADATA +514 -0
eval_framework-0.2.0.dist-info/RECORD +161 -0
eval_framework-0.2.0.dist-info/WHEEL +4 -0
eval_framework-0.2.0.dist-info/entry_points.txt +3 -0
template_formatting/README.md +83 -0
template_formatting/__init__.py +0 -0
template_formatting/formatter.py +536 -0
template_formatting/mistral_formatter.py +159 -0
template_formatting/py.typed +0 -0
template_formatting/tests/test_formatter_eval.py +408 -0
template_formatting/tests/test_formatter_scaling.py +253 -0
template_formatting/tests/test_mistral_formatter.py +136 -0

eval_framework/shared/types.py ADDED Viewed

@@ -0,0 +1,227 @@
+import re
+from collections.abc import Callable, Sequence
+from typing import Annotated, NamedTuple, Self, TypeVar, cast
+from pydantic import BaseModel, ConfigDict
+from eval_framework.metrics.llm.graders.language import detect_language_of
+from eval_framework.utils.helpers import count_bytes
+from template_formatting.formatter import ConcatFormatter, Message, Role
+class ConcatCompression(NamedTuple):
+    """Helper class for storing compression info for the concat formatter.
+    The concat formatter is used to avoid bias towards special tokens.
+    """
+    # Total number of bytes of the concatenated text (as reported by count_bytes).
+    num_bytes: int
+    # Total number of tokens of the concatenated text (as reported by the supplied count_tokens callable).
+    num_tokens: int
+    @classmethod
+    def calculate(
+        cls,
+        messages: Sequence[Message],
+        count_tokens: Callable[[str], int],
+        choices: list[str] | None = None,
+        completion: str | None = None,
+    ) -> Self | None:
+        """Calculate the compression info for the given messages and token counting function.
+        Exactly one of `choices` and `completion` must be provided.
+        With `choices`, the counts of the formatted messages and of every choice are summed up.
+        With `completion`, the completion is appended to the formatted messages and counted as one string.
+        Returns None if any choice is None or if the byte or token count is not positive.
+        Raises ValueError if both or neither of `choices` and `completion` are given.
+        """
+        if (choices is None) == (completion is None):
+            raise ValueError("Either choices or completion must be provided, but not both.")
+        concat_str = ConcatFormatter().format(messages, output_mode="string")
+        if choices is not None:
+            if any(c is None for c in choices):
+                return None
+            num_bytes = count_bytes(concat_str) + sum(count_bytes(c) for c in choices)
+            num_tokens = count_tokens(concat_str) + sum(count_tokens(c) for c in choices)
+        else:
+            # The exclusivity check above guarantees that `completion` is not None in this branch.
+            concat_str = f"{concat_str}{completion}"
+            num_bytes = count_bytes(concat_str)
+            num_tokens = count_tokens(concat_str)
+        # Only strictly positive counts are reported; anything else is treated as "no info".
+        if num_bytes > 0 and num_tokens > 0:
+            return cls(num_bytes=num_bytes, num_tokens=num_tokens)
+        return None
+class BaseMetricContext(BaseModel):
+    """Base class for metric context"""
+    # extra="allow": fields that are not declared on the model are accepted instead of rejected.
+    model_config = ConfigDict(extra="allow")
+class LanguageMetricContext(BaseMetricContext):
+    """Metric context that carries a language string."""
+    # Returned as-is by the Completion.get_*_language() helpers when this context is attached to a completion.
+    language: str
+class UntemplatedPrompt(BaseMetricContext):
+    """Metric context that carries a prompt string."""
+    # NOTE(review): presumably the prompt text before any chat template is applied - confirm against the producers.
+    untemplated_prompt: str
+class Error(BaseModel):
+    """Record of an error as plain strings: class name, message and traceback."""
+    # extra="forbid": undeclared fields are rejected.
+    model_config = ConfigDict(extra="forbid")
+    error_class: str
+    message: str
+    traceback: str
+class PromptTooLongException(Exception):
+    """Exception type for prompts that are too long (the raise sites are outside this module)."""
+class BaseCompletion(BaseModel):
+    """Fields shared by RawCompletion and Completion: the prompt, the completion and optional size info."""
+    # extra="forbid": undeclared fields are rejected.
+    model_config = ConfigDict(extra="forbid")
+    prompt: Annotated[str, "prompt as passed to the llm"]
+    prompt_sequence_positions: Annotated[
+        int | None,
+        "number of sequence positions that the prompt occupies in the llm architecture (e.g. token count) "
+        "or None if the info is not available",
+    ]
+    completion: Annotated[str, "completion as generated by the llm"]
+    concat_compression: Annotated[ConcatCompression | None, "Compression info for the concat formatter."] = None
+class RawCompletion(BaseCompletion):
+    """BaseCompletion extended with the completion's sequence-position count and an optional error record."""
+    completion_sequence_positions: Annotated[
+        int | None,
+        "number of sequence positions that the completion occupies in the llm architecture "
+        "(e.g. token count) or None if the info is not available",
+    ]
+    # Defaults to None, i.e. no error recorded.
+    raw_completion_error: Error | None = None
+class Completion(BaseCompletion):
+    """Completion for a single sample together with its ground truth, messages and optional metric context."""
+    id: int
+    subject: str
+    ground_truth: str | None | list[str]
+    messages: list[Message] | None  # needed for LLM as a judge
+    raw_completion: Annotated[str, "raw completion as generated by the llm"]
+    raw_completion_sequence_positions: Annotated[
+        int | None,
+        "number of sequence positions that the completion occupies in the llm architecture or None "
+        "if the info is not available",
+    ]
+    context: list[BaseMetricContext] | BaseMetricContext | None = None
+    error: Error | None = None
+    @property
+    def ground_truth_list(self) -> list[str] | list[None]:
+        """Return the ground truth as a list; a single (possibly None) value is wrapped in a one-element list."""
+        if isinstance(self.ground_truth, list):
+            return self.ground_truth
+        return [self.ground_truth]  # type: ignore[return-value]
+    # Use just the raw messages for instructions to LLM judges, not the original prompt with its special formatting.
+    # (see https://x.com/karpathy/status/1823418177197646104 for a motivation).
+    @property
+    def system_user_instruction(self) -> str:
+        """Contents of all system and user messages, joined by blank lines. Requires `messages` to be set."""
+        assert self.messages is not None
+        return "\n\n".join([m.content for m in self.messages if m.role in (Role.SYSTEM, Role.USER)])
+    @property
+    def user_instruction(self) -> str:
+        """Contents of all user messages, joined by blank lines. Requires `messages` to be set."""
+        assert self.messages is not None
+        return "\n\n".join([m.content for m in self.messages if m.role == Role.USER])
+    @property
+    def first_user_instruction(self) -> str:
+        """Content of the first user message, or "" if there is none. Requires `messages` to be set."""
+        assert self.messages is not None
+        user_messages = [m.content for m in self.messages if m.role == Role.USER]
+        return user_messages[0] if user_messages else ""
+    @property
+    def all_but_first_user_instruction(self) -> str:
+        """Contents of all user messages after the first, joined by blank lines, or "" if there are fewer than two."""
+        assert self.messages is not None
+        user_messages = [m.content for m in self.messages if m.role == Role.USER]
+        return "\n\n".join(user_messages[1:]) if len(user_messages) > 1 else ""
+    @property
+    def last_user_instruction(self) -> str:
+        """Content of the last user message, or "" if there is none. Requires `messages` to be set."""
+        assert self.messages is not None
+        user_messages = [m.content for m in self.messages if m.role == Role.USER]
+        return user_messages[-1] if user_messages else ""
+    @property
+    def sanitized_completion(self) -> str:
+        """The completion with every `<|xyz|>` pattern rewritten to `<| xyz |>`."""
+        # Make sure the completion doesn't contain any obvious special chars either by "breaking" any <|xyz|> pattern.
+        return re.sub(r"<\|(\S+)\|>", r"<| \1 |>", self.completion)
+    def _language_from_context(self) -> str | None:
+        """Return the language stored in the context, or None if the context is not a LanguageMetricContext."""
+        if self.context and isinstance(self.context, LanguageMetricContext):
+            return self.context.language
+        return None
+    @staticmethod
+    def _detect_language(text: str) -> str:
+        """Detect the language of `text`; returns the lower-cased ISO 639-1 name or "" if nothing was detected."""
+        detected_language_object = detect_language_of(text)
+        return detected_language_object.iso_code_639_1.name.lower() if detected_language_object else ""
+    def get_completion_language(self) -> str:
+        """Language from the context if available, otherwise the detected language of `completion`."""
+        language = self._language_from_context()
+        return language if language is not None else self._detect_language(self.completion)
+    def get_raw_completion_language(self) -> str:
+        """Language from the context if available, otherwise the detected language of `raw_completion`."""
+        language = self._language_from_context()
+        return language if language is not None else self._detect_language(self.raw_completion)
+    def get_instruction_language(self) -> str:
+        """Language from the context if available, otherwise the detected language of `user_instruction`."""
+        language = self._language_from_context()
+        # `user_instruction` is only evaluated when needed because it asserts that `messages` is set.
+        return language if language is not None else self._detect_language(self.user_instruction)
+class BaseLoglikelihood(BaseModel):
+    """Fields shared by RawLoglikelihood and Loglikelihood: the prompt and per-string loglikelihoods."""
+    # extra="forbid": undeclared fields are rejected.
+    model_config = ConfigDict(extra="forbid")
+    prompt: str
+    prompt_sequence_positions: int | None
+    # NOTE(review): keys are presumably the candidate completion strings - confirm against the producers.
+    loglikelihoods: dict[str, float]
+    loglikelihoods_sequence_positions: dict[str, int]  # Is empty if the model does not provide sequence positions
+    concat_compression: Annotated[ConcatCompression | None, "Compression info for the concat formatter"] = None
+class RawLoglikelihood(BaseLoglikelihood):
+    """BaseLoglikelihood extended with an optional error record."""
+    # Defaults to None, i.e. no error recorded.
+    raw_loglikelihood_error: Error | None = None
+class Loglikelihood(BaseLoglikelihood):
+    """Loglikelihood result for a single sample together with its id, subject and ground truth."""
+    id: int
+    subject: str
+    ground_truth: str | list[str]
+    error: Error | None = None
+    @property
+    def ground_truth_list(self) -> list[str] | list[None]:
+        """Return the ground truth as a list; a single value is wrapped in a one-element list."""
+        truth = self.ground_truth
+        return truth if isinstance(truth, list) else [truth]  # type: ignore[return-value]
+# Module-level TypeVar; the function below declares its own type parameter of the same name.
+MetricContext = TypeVar("MetricContext", bound=BaseMetricContext)
+def extract_context_metric[MetricContext: BaseMetricContext](
+    response: Completion, metric_context_class: type[MetricContext]
+) -> MetricContext:
+    """Return the metric context of `response`, typed as `metric_context_class`.
+    If the context is a single object it is returned as-is (any BaseMetricContext is accepted).
+    If the context is a list, the first element that is an instance of `metric_context_class` is returned.
+    Raises AssertionError if the context is missing, is an empty list, or contains no matching element.
+    """
+    assert response.context is not None, "Expected context to be provided in the response"
+    if not isinstance(response.context, list):
+        assert isinstance(response.context, metric_context_class) or isinstance(response.context, BaseMetricContext), (
+            f"Expected context to be of type {metric_context_class.__name__}, got {type(response.context).__name__}"
+        )
+        return cast(MetricContext, response.context)
+    assert len(response.context) > 0, "Expected context to be provided in the response"
+    context = next(
+        (metric_context for metric_context in response.context if isinstance(metric_context, metric_context_class)),
+        None,
+    )
+    if context is None:
+        # Raised explicitly so that a missing context is reported with a clear message even under `python -O`.
+        raise AssertionError(f"Expected {metric_context_class.__name__} to be provided in the response context")
+    return cast(MetricContext, context)

eval_framework/tasks/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+# Register all tasks on import
+from .task_names import register_all_tasks
+# Importing this package triggers the registration as a side effect.
+register_all_tasks()
+# Remove the helper from the package namespace again so it is not exposed as eval_framework.tasks.register_all_tasks.
+del register_all_tasks

eval_framework/tasks/base.py ADDED Viewed

@@ -0,0 +1,314 @@
+import logging
+import os
+import random
+from abc import ABC, abstractmethod
+from collections.abc import Iterable
+from enum import Enum
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Self, TypeVar
+import iso639
+from datasets import DatasetDict, DownloadConfig, load_dataset
+from huggingface_hub import HfApi
+from huggingface_hub.errors import RevisionNotFoundError
+from pydantic import BaseModel, ConfigDict
+from eval_framework.shared.types import BaseMetricContext
+from template_formatting.formatter import Message, Role
+if TYPE_CHECKING:
+    from eval_framework.metrics.base import BaseMetric
+# Seed for the random.Random instance that BaseTask._shuffle_splits creates.
+RANDOM_SEED = 42
+# Placeholder subject; BaseTask._load_dataset passes name=None to the dataset loader for it.
+NO_SUBJECT = "no_subject"
+class ResponseType(Enum):
+    """Kind of model response a task declares via its RESPONSE_TYPE attribute."""
+    COMPLETION = "completion"
+    LOGLIKELIHOODS = "loglikelihoods"
+class Language(Enum):
+    """Languages of benchmarks: member names are upper-case three-letter codes, values are English language names."""
+    ENG = "English"
+    DEU = "German"
+    FRA = "French"
+    ITA = "Italian"
+    SPA = "Spanish"
+    POR = "Portuguese"
+    NLD = "Dutch"
+    FIN = "Finnish"
+    SWE = "Swedish"
+    ARB = "Arabic"
+    POL = "Polish"
+    RUS = "Russian"
+    UKR = "Ukrainian"
+    HRV = "Croatian"
+    SRP = "Serbian"
+    @classmethod
+    def add_members(cls, new_members: dict[str, Any]) -> type["Language"]:
+        """Build a new Enum named like this class that holds the current members plus `new_members`.
+        Existing member names win: an entry of `new_members` whose name is already taken is ignored.
+        """
+        combined = {existing.name: existing.value for existing in cls}
+        for member_name, member_value in new_members.items():
+            # setdefault keeps the value of a name that is already present
+            combined.setdefault(member_name, member_value)
+        return Enum(cls.__name__, combined)  # type: ignore[return-value]
+# Map the upper-cased `part3` attribute of every iso639 language to its name ...
+languages: dict[str, str] = {}
+for language in iso639.ALL_LANGUAGES:
+    enum_name = language.part3.upper()
+    languages[enum_name] = language.name
+# ... and rebind `Language` to an extended Enum; names already defined on the class above keep their values.
+Language: type[Enum] = Language.add_members(languages)  # type: ignore[no-redef]
+class Sample(BaseModel):
+    """One evaluation sample as created by BaseTask._create_samples."""
+    # extra="forbid": undeclared fields are rejected.
+    model_config = ConfigDict(extra="forbid")
+    id: int
+    subject: str
+    messages: list[Message]
+    ground_truth: str | list[str] | None
+    possible_completions: list[str] | None
+    context: BaseMetricContext | list[BaseMetricContext] | None = None
+# Module-level TypeVar; BaseTask below declares its own type parameter of the same name.
+SubjectType = TypeVar("SubjectType")
+# Module logger, used by BaseTask.with_overwrite.
+logger = logging.getLogger(__name__)
+class BaseTask[SubjectType](ABC):
+    """Abstract base class of all tasks.
+    For every subject a task loads a dataset, shuffles the sample split with a fixed seed and turns each item into
+    one or more `Sample` objects. Subclasses have to implement `_get_instruction_text` and `_get_ground_truth` and
+    usually set the class-level attributes below.
+    """
+    NAME: str
+    DATASET_PATH: str
+    SAMPLE_SPLIT: str
+    FEWSHOT_SPLIT: str
+    RESPONSE_TYPE: ResponseType
+    METRICS: list[type["BaseMetric"]]
+    SUBJECTS: list[SubjectType]
+    HF_REVISION: str | None = None  # tag name, or branch name, or commit hash to ensure reproducibility
+    # Words in _get_instruction_text() not to be perturbed. List of words is case insensitive. No special characters
+    # or whitespace should be included.
+    PERTURBATION_UNMODIFIABLE_WORDS: list[str] | None
+    # The language (or languages) tested by the benchmark. Accepts a single string, a dictionary specifying
+    # language by subtopic, or `None` (for tasks not specific to a single language).
+    LANGUAGE: Language | dict[str, Language] | dict[str, tuple[Language, Language]] | None
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Store the number of few-shot examples; `stop_sequences` and `max_tokens` start out as None."""
+        self.num_fewshot = num_fewshot
+        self.stop_sequences: list[str] | None = None
+        self.max_tokens: int | None = None
+    @classmethod
+    def with_overwrite(
+        cls, num_fewshot: int, *, custom_subjects: list[str] | None, custom_hf_revision: str | None
+    ) -> Self:
+        """Create an instance and optionally override SUBJECTS and HF_REVISION on that instance."""
+        instance = cls(num_fewshot=num_fewshot)
+        # If custom subjects were provided during initialization, they take precedence over the class-level SUBJECTS.
+        filtered_subjects = instance._filter_task_subjects(custom_subjects=custom_subjects)
+        if filtered_subjects:
+            logger.info(f"Setting SUBJECTS to `{filtered_subjects}` for the task {instance.__class__.__name__}")
+            instance.SUBJECTS = filtered_subjects  # type: ignore[assignment]
+        # If a custom revision was provided during initialization, it takes precedence over the class-level HF_REVISION.
+        if custom_hf_revision:
+            logger.info(f"Setting HF revision to `{custom_hf_revision}` for the task {instance.__class__.__name__}")
+            instance.HF_REVISION = custom_hf_revision
+        return instance
+    def _filter_task_subjects(self, custom_subjects: list[str] | None) -> list[str] | list[tuple] | None:
+        """Process custom subjects passed from EvalConfig. Check and returns restricted task subjects if specified.
+        Returns None if no custom subjects are given. For tuple subjects every custom subject is a comma-separated
+        string with one part per tuple item, where `*` matches any value. Raises AssertionError for unknown
+        subjects or for a custom subject with the wrong number of parts.
+        """
+        if not custom_subjects:
+            return None
+        assert hasattr(self, "SUBJECTS") and len(self.SUBJECTS) > 0
+        if isinstance(self.SUBJECTS[0], tuple):
+            # subjects are specified as strings but we need tuples
+            filters = [tuple(item.strip() for item in subject.split(",")) for subject in custom_subjects]
+            # check if all parts of custom subjects exists (* is a wildcard)
+            num_items = len(self.SUBJECTS[0])
+            legal_values = [
+                {s[i] for s in self.SUBJECTS if isinstance(s, tuple)} | {"*"} for i in range(num_items)
+            ]
+            for tpl in filters:
+                # fail with a clear message instead of an IndexError further down
+                assert len(tpl) == num_items, (
+                    f"Subject {tpl} must consist of {num_items} comma-separated parts "
+                    f"in task {self.__class__.__name__}"
+                )
+                for i, v in enumerate(tpl):
+                    assert v in legal_values[i], f"Subject part {v} not found in task {self.__class__.__name__}"
+            # filter task subjects. * is a supported wildcard for a specific item in a tuple, e.g. "DE_DE, *"
+            chosen_subjects = []
+            for subject in self.SUBJECTS:
+                subject_tuple = subject if isinstance(subject, tuple) else tuple(str(subject).split(","))
+                for subject_filter in filters:
+                    if all(
+                        subject_filter[i] == "*" or subject_filter[i] == subject_tuple[i] for i in range(num_items)
+                    ):
+                        chosen_subjects.append(subject_tuple)
+                        break
+            return chosen_subjects  # type: ignore[return-value]
+        else:
+            for cs in custom_subjects:
+                assert cs in self.SUBJECTS, f"Subject {cs} not found in task {self.__class__.__name__}"
+            return custom_subjects  # type: ignore[return-value]
+    def _load_hf_dataset(self, **kwargs: Any) -> Any:
+        """Load a dataset via `load_dataset(**kwargs)` at HF_REVISION.
+        If the first attempt fails, loading is retried once with a fallback cache directory.
+        Raises RevisionNotFoundError if the revision pre-check reports that HF_REVISION does not exist.
+        """
+        # Check if the HF_REVISION is valid before loading the dataset
+        if self.HF_REVISION:
+            try:
+                _ = HfApi().dataset_info(repo_id=kwargs["path"], revision=self.HF_REVISION, timeout=100.0)
+            except RevisionNotFoundError:
+                raise
+            except Exception as e:
+                # The pre-check is best effort: any other failure is only logged and loading is attempted anyway.
+                logger.warning(
+                    f"Could not verify HF revision `{self.HF_REVISION}` for `{kwargs.get('path')}`: {e!r}"
+                )
+        cache_dir: str = os.environ.get("HF_DATASET_CACHE_DIR", f"{Path.home()}/.cache/huggingface/datasets")
+        download_config = DownloadConfig(cache_dir=cache_dir, max_retries=5)
+        try:
+            return load_dataset(
+                **kwargs,
+                revision=self.HF_REVISION,
+                trust_remote_code=True,
+                cache_dir=cache_dir,
+                download_config=download_config,
+            )
+        except Exception as e:
+            logger.warning(f"Loading the dataset with cache dir `{cache_dir}` failed ({e!r}), retrying with fallback")
+            return load_dataset(
+                **kwargs,
+                revision=self.HF_REVISION,
+                trust_remote_code=True,
+                cache_dir=f"{Path.home()}/.cache/eval-framework",
+            )
+    def _shuffle_splits(self, hf_dataset: DatasetDict) -> dict[str, list[dict[str, Any]]]:
+        """Keep only the sample and few-shot splits as lists; the sample split is shuffled with a fixed seed.
+        Also (re)creates `self.rnd`, which `_sample_fewshot_examples` uses afterwards.
+        """
+        dataset = {}
+        self.rnd = random.Random(RANDOM_SEED)
+        for split, data in hf_dataset.items():
+            if split not in [self.SAMPLE_SPLIT, self.FEWSHOT_SPLIT]:
+                continue
+            data_list = list(data)
+            if split == self.SAMPLE_SPLIT:
+                self.rnd.shuffle(data_list)
+            dataset[split] = data_list
+        return dataset
+    def _load_dataset(self, subject: SubjectType) -> None:
+        """Load and shuffle the dataset of `subject` into `self.dataset`; NO_SUBJECT is loaded with name=None."""
+        name = subject if subject != NO_SUBJECT else None
+        hf_dataset = self._load_hf_dataset(path=self.DATASET_PATH, name=name)
+        self.dataset = self._shuffle_splits(hf_dataset=hf_dataset)
+    def post_process_generated_completion(self, completion_text: str, sample: Sample | None = None) -> str:
+        """Hook for post-processing a generated completion; the default returns the text unchanged."""
+        return completion_text
+    def _get_example_messages(self, item: dict[str, Any]) -> list[Message]:
+        """Build the few-shot messages (instruction messages plus an assistant answer per example) for `item`."""
+        fewshot_examples = self._sample_fewshot_examples(item) if self.num_fewshot > 0 else []
+        example_messages = []
+        for fewshot_example in fewshot_examples:
+            # the few-shot example gets the subject of the item it is shown for
+            fewshot_example["subject"] = item["subject"]
+            example_messages.extend(self._get_instruction_messages(fewshot_example))
+            example_messages.append(
+                Message(role=Role.ASSISTANT, content=self._get_fewshot_target_text(fewshot_example))
+            )
+        return example_messages
+    def _get_messages(self, item: dict[str, Any]) -> list[Message]:
+        """Assemble all messages for `item`: optional system prompt, few-shot examples, instruction and cue."""
+        example_messages = self._get_example_messages(item)
+        instruction_message = self._get_instruction_messages(item)
+        cue_text = self._get_cue_text(item)
+        cue_message = [Message(role=Role.ASSISTANT, content=cue_text)] if cue_text else []
+        messages = example_messages + instruction_message + cue_message
+        if initial_prompt_text := self._get_initial_prompt_text(item):
+            # the initial prompt is prepended to the very first message, which has to be a user message
+            first_message = messages[0]
+            assert first_message.role == Role.USER
+            first_message.content = f"{initial_prompt_text}\n\n{first_message.content}"
+        if system_prompt_text := self._get_system_prompt_text(item):
+            return [Message(role=Role.SYSTEM, content=system_prompt_text)] + messages
+        return messages
+    def _get_instruction_messages(self, item: dict[str, Any]) -> list[Message]:
+        """Return the instruction of `item` as a single user message."""
+        return [Message(role=Role.USER, content=self._get_instruction_text(item))]
+    def iterate_samples(self, num_samples: int | None = None) -> Iterable[Sample]:
+        """Yield the samples of all subjects; `num_samples` limits the number of samples per subject."""
+        for subject in self.SUBJECTS:
+            self._load_dataset(subject)
+            assert len(self.dataset[self.SAMPLE_SPLIT]) > 0
+            done = False
+            # the index (and with it the sample id) restarts at 0 for every subject
+            index = 0
+            for item in self.dataset[self.SAMPLE_SPLIT]:
+                if done:
+                    break
+                item["subject"] = subject
+                for sample in self._create_samples(item, index, str(subject)):
+                    yield sample
+                    index += 1
+                    if index == num_samples:
+                        done = True
+                        break
+    def _create_samples(self, item: dict[str, Any], index: int, subject: str) -> list[Sample]:
+        """Creates one or more samples from a single dataset item. Default implementation returns single sample."""
+        return [
+            Sample(
+                id=index,
+                subject=str(subject),
+                messages=self._get_messages(item),
+                ground_truth=self._get_ground_truth(item),
+                possible_completions=self._get_possible_completions(item),
+                context=self._get_context(item),
+            )
+        ]
+    def _get_initial_prompt_text(self, item: dict[str, Any]) -> str:
+        """Text to prepend to the first message; the default is empty (nothing is prepended)."""
+        return ""
+    def _get_system_prompt_text(self, item: dict[str, Any]) -> str | None:
+        """System prompt for `item`; the default is None (no system message)."""
+        return None
+    @abstractmethod
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Return the instruction text of `item`; has to be implemented by subclasses."""
+        raise NotImplementedError
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        """Answer text of a few-shot example; the default is the ground truth, which has to be a string."""
+        target = self._get_ground_truth(item)
+        assert target is not None
+        assert isinstance(target, str)
+        return target
+    @abstractmethod
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None | list[str]:
+        """Return the ground truth of `item`; has to be implemented by subclasses."""
+        raise NotImplementedError
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        """Text of a trailing assistant message; the default is empty (no such message is added)."""
+        return ""
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        """Possible completions of `item`; the default is None."""
+        return None
+    def _sample_fewshot_examples(self, item: dict[str, Any]) -> list[dict]:
+        """Draw `num_fewshot` few-shot examples with `self.rnd` (created by `_shuffle_splits`)."""
+        if self.FEWSHOT_SPLIT == self.SAMPLE_SPLIT:
+            # draw one extra example so that `item` itself can be dropped if it was drawn
+            fewshot_examples = self.rnd.sample(self.dataset[self.FEWSHOT_SPLIT], self.num_fewshot + 1)
+            fewshot_examples = [example for example in fewshot_examples if example != item]
+            fewshot_examples = fewshot_examples[: self.num_fewshot]
+            return fewshot_examples
+        else:
+            return self.rnd.sample(self.dataset[self.FEWSHOT_SPLIT], self.num_fewshot)
+    def _get_context(self, item: dict[str, Any]) -> BaseMetricContext | list[BaseMetricContext] | None:
+        """Metric context of `item`; the default is None."""
+        return None
+    def get_metadata(self) -> dict[str, str | list[str]]:
+        """Return dataset path, splits, response type, metric names and subjects of the task as a dict."""
+        return {
+            "dataset_path": self.DATASET_PATH,
+            "sample_split": self.SAMPLE_SPLIT,
+            "fewshot_split": self.FEWSHOT_SPLIT,
+            "response_type": self.RESPONSE_TYPE.value,
+            "metrics": [m.NAME for m in self.METRICS],
+            "subjects": [str(s) for s in self.SUBJECTS],
+        }

eval_framework/tasks/benchmarks/__init__.py ADDED Viewed

File without changes

eval_framework/tasks/benchmarks/arc.py ADDED Viewed

@@ -0,0 +1,46 @@
+from typing import Any
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.tasks.base import BaseTask, Language, ResponseType
+from eval_framework.tasks.utils import get_n_letters
+class ARC(BaseTask[str]):
+    """ARC dataset: https://huggingface.co/datasets/allenai/ai2_arc"""
+    NAME = "ARC"
+    DATASET_PATH = "ai2_arc"
+    SAMPLE_SPLIT = "test"
+    FEWSHOT_SPLIT = "train"
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+    SUBJECTS = ["ARC-Easy", "ARC-Challenge"]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Question", *get_n_letters(5)]
+    LANGUAGE = Language.ENG
+    def __init__(self, num_fewshot: int = 0) -> None:
+        """Set up the answer keys and the lookup from "1", "2", ... to those keys."""
+        super().__init__(num_fewshot)
+        # needs to be 5 because there is one sample with 5 answer possibilities
+        self.keys = get_n_letters(5)
+        self.num_to_letter = {str(position): key for position, key in enumerate(self.keys, start=1)}
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        """Return the question of `item` prefixed with "Question: " and terminated by a newline."""
+        question = item["question"]
+        return f"Question: {question}\n"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        """Return the cue text directly followed by the ground truth."""
+        target = self._get_ground_truth(item)
+        assert target is not None
+        cue = self._get_cue_text(item)
+        return f"{cue}{target}"
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        """Return the fixed cue that precedes every answer."""
+        return "Answer:"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        """Return the text of the correct choice with a leading space."""
+        raw_key = item["answerKey"]
+        # keys found in num_to_letter are translated, all other keys are used as they are
+        answer_key = self.num_to_letter.get(raw_key, raw_key)
+        choice_texts = item["choices"]["text"]
+        return f" {choice_texts[self.keys.index(answer_key)]}"
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        """Return every choice text with a leading space."""
+        completions = []
+        for choice in item["choices"]["text"]:
+            completions.append(f" {choice}")
+        return completions

eval_framework/tasks/benchmarks/arc_de.py ADDED Viewed

@@ -0,0 +1,46 @@
+from typing import Any
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.tasks.base import NO_SUBJECT, BaseTask, Language, ResponseType
+from eval_framework.tasks.utils import get_n_letters
+class ARC_DE(BaseTask[str]):
+    """ARC-DE dataset: https://huggingface.co/datasets/LeoLM/ArcChallenge_de"""
+    NAME = "ARC German"
+    DATASET_PATH = "LeoLM/ArcChallenge_de"
+    SAMPLE_SPLIT = "test"
+    FEWSHOT_SPLIT = "validation"
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+    SUBJECTS = [NO_SUBJECT]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Frage"] + get_n_letters(5)
+    LANGUAGE = Language.DEU
+    def __init__(self, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot)
+        self.keys = get_n_letters(5)  # needs to be 5 because there is one sample with 5 answer possibilities
+        self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        return f"Frage: {item['question_de']}\n"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        ground_truth = self._get_ground_truth(item)
+        assert ground_truth is not None
+        return f"{self._get_cue_text(item)}{ground_truth}"
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        return "Antwort:"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        answer_key = self.num_to_letter.get(item["answerKey"], item["answerKey"])
+        return f" {item['choices_de']['text'][self.keys.index(answer_key)]}"
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        return [f" {choice}" for choice in item["choices_de"]["text"]]

eval_framework/tasks/benchmarks/arc_fi.py ADDED Viewed

@@ -0,0 +1,46 @@
+from typing import Any
+from eval_framework.metrics.loglikelihood.accuracy_loglikelihood import (
+    AccuracyLoglikelihood,
+    AccuracyNormLoglikelihood,
+)
+from eval_framework.tasks.base import BaseTask, Language, ResponseType
+from eval_framework.tasks.utils import get_n_letters
+class ARC_FI(BaseTask[str]):
+    """ARC-FI dataset: https://huggingface.co/datasets/LumiOpen/arc_challenge_mt"""
+    NAME = "ARC Finnish"
+    DATASET_PATH = "LumiOpen/arc_challenge_mt"
+    SAMPLE_SPLIT = "test"
+    FEWSHOT_SPLIT = "validation"
+    RESPONSE_TYPE = ResponseType.LOGLIKELIHOODS
+    METRICS = [AccuracyLoglikelihood, AccuracyNormLoglikelihood]
+    SUBJECTS = ["fi"]
+    PERTURBATION_UNMODIFIABLE_WORDS = ["Question"] + get_n_letters(5)
+    LANGUAGE = Language.FIN
+    def __init__(self, num_fewshot: int = 0) -> None:
+        super().__init__(num_fewshot)
+        self.keys = get_n_letters(5)  # needs to be 5 because there is one sample with 5 answer possibilities
+        self.num_to_letter = {str(i): letter for i, letter in enumerate(self.keys, start=1)}
+    def _get_instruction_text(self, item: dict[str, Any]) -> str:
+        return f"Question: {item['question']}\n"
+    def _get_fewshot_target_text(self, item: dict[str, Any]) -> str:
+        ground_truth = self._get_ground_truth(item)
+        assert ground_truth is not None
+        return f"{self._get_cue_text(item)}{ground_truth}"
+    def _get_cue_text(self, item: dict[str, Any]) -> str:
+        return "Answer:"
+    def _get_ground_truth(self, item: dict[str, Any]) -> str | None:
+        answer_key = self.num_to_letter.get(item["answerKey"], item["answerKey"])
+        return f" {item['choices']['text'][self.keys.index(answer_key)]}"
+    def _get_possible_completions(self, item: dict[str, Any]) -> list[str] | None:
+        return [f" {choice}" for choice in item["choices"]["text"]]