EuroEval 15.12.0-py3-none-any.whl → 16.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +32 -14
- euroeval/benchmark_config_factory.py +92 -180
- euroeval/benchmark_modules/base.py +49 -39
- euroeval/benchmark_modules/fresh.py +35 -21
- euroeval/benchmark_modules/hf.py +280 -244
- euroeval/benchmark_modules/litellm.py +752 -312
- euroeval/benchmark_modules/vllm.py +570 -268
- euroeval/benchmarker.py +651 -528
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +49 -38
- euroeval/constants.py +44 -25
- euroeval/data_loading.py +111 -55
- euroeval/data_models.py +490 -323
- euroeval/dataset_configs/__init__.py +26 -4
- euroeval/dataset_configs/bosnian.py +39 -0
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/croatian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +78 -50
- euroeval/dataset_configs/dutch.py +74 -44
- euroeval/dataset_configs/english.py +71 -36
- euroeval/dataset_configs/estonian.py +111 -0
- euroeval/dataset_configs/faroese.py +25 -18
- euroeval/dataset_configs/finnish.py +63 -26
- euroeval/dataset_configs/french.py +65 -32
- euroeval/dataset_configs/german.py +77 -36
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +68 -57
- euroeval/dataset_configs/italian.py +68 -36
- euroeval/dataset_configs/latvian.py +87 -0
- euroeval/dataset_configs/lithuanian.py +64 -0
- euroeval/dataset_configs/norwegian.py +98 -72
- euroeval/dataset_configs/polish.py +96 -0
- euroeval/dataset_configs/portuguese.py +63 -40
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/slovene.py +56 -0
- euroeval/dataset_configs/spanish.py +68 -34
- euroeval/dataset_configs/swedish.py +82 -41
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/enums.py +12 -6
- euroeval/exceptions.py +21 -1
- euroeval/finetuning.py +34 -26
- euroeval/generation.py +76 -41
- euroeval/generation_utils.py +169 -34
- euroeval/languages.py +1020 -188
- euroeval/logging_utils.py +268 -0
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +85 -0
- euroeval/metrics/huggingface.py +216 -0
- euroeval/metrics/llm_as_a_judge.py +260 -0
- euroeval/metrics/pipeline.py +289 -0
- euroeval/metrics/speed.py +48 -0
- euroeval/model_cache.py +40 -21
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +157 -22
- euroeval/prompt_templates/multiple_choice.py +159 -17
- euroeval/prompt_templates/named_entity_recognition.py +318 -21
- euroeval/prompt_templates/reading_comprehension.py +207 -16
- euroeval/prompt_templates/sentiment_classification.py +205 -22
- euroeval/prompt_templates/summarization.py +122 -22
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +20 -9
- euroeval/speed_benchmark.py +11 -12
- euroeval/task_group_utils/multiple_choice_classification.py +21 -12
- euroeval/task_group_utils/question_answering.py +101 -73
- euroeval/task_group_utils/sequence_classification.py +144 -61
- euroeval/task_group_utils/text_to_text.py +33 -12
- euroeval/task_group_utils/token_classification.py +86 -89
- euroeval/tasks.py +75 -16
- euroeval/tokenisation_utils.py +603 -0
- euroeval/types.py +17 -11
- euroeval/utils.py +332 -137
- euroeval-16.7.1.dist-info/METADATA +623 -0
- euroeval-16.7.1.dist-info/RECORD +84 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -737
- euroeval/metrics.py +0 -452
- euroeval/tokenization_utils.py +0 -498
- euroeval-15.12.0.dist-info/METADATA +0 -285
- euroeval-15.12.0.dist-info/RECORD +0 -63
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/hf.py
CHANGED
@@ -2,7 +2,7 @@
 
 import collections.abc as c
 import logging
-import
+import re
 import typing as t
 from functools import cached_property, partial
 from json import JSONDecodeError
@@ -15,6 +15,7 @@ from huggingface_hub import HfApi
 from huggingface_hub import whoami as hf_whoami
 from huggingface_hub.errors import (
     GatedRepoError,
+    HfHubHTTPError,
     HFValidationError,
     LocalTokenNotFoundError,
     RepositoryNotFoundError,
@@ -35,6 +36,7 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer
 from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError
 
+from ..caching_utils import cache_arguments
 from ..constants import (
     DUMMY_FILL_VALUE,
     GENERATIVE_PIPELINE_TAGS,
@@ -42,7 +44,7 @@ from ..constants import (
     MAX_CONTEXT_LENGTH,
     MERGE_TAGS,
 )
-from ..data_models import HFModelInfo, ModelConfig
+from ..data_models import HashableDict, HFModelInfo, ModelConfig
 from ..enums import (
     BatchingPreference,
     GenerativeType,
@@ -57,19 +59,21 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
+from ..generation_utils import raise_if_wrong_params
 from ..languages import get_all_languages
+from ..logging_utils import block_terminal_output, log, log_once
 from ..task_group_utils import (
     multiple_choice_classification,
     question_answering,
     token_classification,
 )
-from ..
+from ..tokenisation_utils import get_bos_token, get_eos_token
 from ..utils import (
-    block_terminal_output,
     create_model_cache_dir,
     get_class_by_name,
+    get_hf_token,
     internet_connection_available,
-
+    split_model_id,
 )
 from .base import BenchmarkModule
 
@@ -81,8 +85,6 @@ if t.TYPE_CHECKING:
     from ..data_models import BenchmarkConfig, DatasetConfig, Task
     from ..types import ExtractLabelsFunction
 
-logger = logging.getLogger("euroeval")
-
 
 class HuggingFaceEncoderModel(BenchmarkModule):
     """An encoder model from the Hugging Face Hub."""
@@ -90,12 +92,14 @@ class HuggingFaceEncoderModel(BenchmarkModule):
     fresh_model = False
     batching_preference = BatchingPreference.NO_PREFERENCE
     high_priority = True
+    allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}
 
     def __init__(
         self,
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the model.
 
@@ -106,18 +110,24 @@ class HuggingFaceEncoderModel(BenchmarkModule):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log the model metadata.
         """
-
+        raise_if_wrong_params(
+            model_config=model_config, allowed_params=self.allowed_params
+        )
+
+        model, tokeniser = load_model_and_tokeniser(
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
         )
         self._model: "PreTrainedModel" = model
-        self.
+        self._tokeniser: "PreTrainedTokenizer" = tokeniser
 
-        self._model, self.
+        self._model, self._tokeniser = align_model_and_tokeniser(
             model=self._model,
-
+            tokeniser=self._tokeniser,
             model_max_length=self.model_max_length,
             raise_errors=benchmark_config.raise_errors,
         )
@@ -126,6 +136,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
         )
 
     @cached_property
@@ -135,23 +146,25 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         Returns:
             The number of parameters in the model.
         """
-
-
-        )
-        hf_api = HfApi(token=token)
-        try:
-            repo_info = hf_api.model_info(
-                repo_id=self.model_config.adapter_base_model_id
-                or self.model_config.model_id,
-                revision=self.model_config.revision,
-            )
-        except (
-            RepositoryNotFoundError,
-            RevisionNotFoundError,
-            RequestException,
-            HFValidationError,
-        ):
+        # No need to try to use the API if we have no internet.
+        if not internet_connection_available():
             repo_info = None
+        else:
+            token = get_hf_token(api_key=self.benchmark_config.api_key)
+            hf_api = HfApi(token=token)
+            try:
+                repo_info = hf_api.model_info(
+                    repo_id=self.model_config.adapter_base_model_id
+                    or self.model_config.model_id,
+                    revision=self.model_config.revision,
+                )
+            except (
+                RepositoryNotFoundError,
+                RevisionNotFoundError,
+                RequestException,
+                HFValidationError,
+            ):
+                repo_info = None
 
         if (
             repo_info is not None
@@ -168,12 +181,13 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         elif hasattr(self._model, "parameters"):
             num_params = sum(p.numel() for p in self._model.parameters())
         else:
-
+            log(
                 "The number of parameters could not be determined for the model, since "
                 "the model is not stored in the safetensors format. If this is your "
                 "own model, then you can use this Hugging Face Space to convert your "
                 "model to the safetensors format: "
-                "https://huggingface.co/spaces/safetensors/convert."
+                "https://huggingface.co/spaces/safetensors/convert.",
+                level=logging.WARNING,
             )
             num_params = -1
         return num_params
@@ -191,10 +205,10 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         ):
             vocab_size = self._model.config.vocab_size
         elif (
-            hasattr(self.
-            and self.
+            hasattr(self._tokeniser, "vocab_size")
+            and self._tokeniser.vocab_size is not None
         ):
-            vocab_size = self.
+            vocab_size = self._tokeniser.vocab_size
         else:
             vocab_size = -1
         return vocab_size
@@ -208,18 +222,18 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         """
         all_max_lengths: list[int] = list()
 
-        # Add the registered max length of the
+        # Add the registered max length of the tokeniser
         if hasattr(
-            self.
-        ) and self.
-            all_max_lengths.append(self.
+            self._tokeniser, "model_max_length"
+        ) and self._tokeniser.model_max_length < int(1e30):
+            all_max_lengths.append(self._tokeniser.model_max_length)
 
         # Add the max length derived from the model's input sizes
-        if hasattr(self.
+        if hasattr(self._tokeniser, "max_model_input_sizes"):
             all_max_lengths.extend(
                 [
                     size
-                    for size in self.
+                    for size in self._tokeniser.max_model_input_sizes.values()
                     if size is not None
                 ]
             )
@@ -245,15 +259,6 @@ class HuggingFaceEncoderModel(BenchmarkModule):
             max_length for max_length in all_max_lengths if max_length >= 128
         ]
 
-        # We remove the upper cap of maximum context length for the model, as it is
-        # highly unlikely that this is the model's actual maximum context length - we
-        # would rather not report a value than report an incorrect one.
-        all_max_lengths = [
-            max_length
-            for max_length in all_max_lengths
-            if max_length != MAX_CONTEXT_LENGTH
-        ]
-
         if len(list(all_max_lengths)) > 0:
             model_max_length = min(list(all_max_lengths))
         else:
@@ -262,7 +267,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         return model_max_length
 
     @property
-    def data_collator(self) -> c.Callable[[
+    def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.
 
         Returns:
@@ -275,10 +280,10 @@ class HuggingFaceEncoderModel(BenchmarkModule):
                 | TaskGroup.QUESTION_ANSWERING
                 | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
             ):
-                return DataCollatorWithPadding(self.
+                return DataCollatorWithPadding(self._tokeniser, padding="longest")
             case TaskGroup.TOKEN_CLASSIFICATION:
                 return DataCollatorForTokenClassification(
-                    tokenizer=self.
+                    tokenizer=self._tokeniser, label_pad_token_id=-100
                 )
             case _:
                 raise NotImplementedError(
@@ -357,16 +362,16 @@ class HuggingFaceEncoderModel(BenchmarkModule):
                     self._model.config.label2id[lbl.lower()]
                     for lbl in examples["label"]
                 ]
-            except KeyError:
+            except KeyError as e:
                 raise InvalidBenchmark(
                     f"One of the labels in the dataset, "
                     f"{examples['label'].lower()}, does not occur in the "
                     f"label2id dictionary {self._model.config.label2id}."
-                )
+                ) from e
             return examples
 
         def tokenise(examples: dict) -> "BatchEncoding":
-            return self.
+            return self._tokeniser(text=examples["text"], truncation=True, padding=True)
 
         match task.task_group:
            case TaskGroup.SEQUENCE_CLASSIFICATION:
@@ -376,39 +381,20 @@ class HuggingFaceEncoderModel(BenchmarkModule):
 
             case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
                 dataset = DatasetDict(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                            tokenizer=self._tokenizer,
-                        ),
-                        batched=True,
-                        batch_size=10,
-                        remove_columns=dataset["val"].column_names,
-                        load_from_cache_file=False,
-                        keep_in_memory=True,
-                    ),
-                    test=dataset["test"].map(
-                        partial(
-                            multiple_choice_classification.prepare_examples,
-                            tokenizer=self._tokenizer,
-                        ),
-                        batched=True,
-                        batch_size=10,
-                        remove_columns=dataset["test"].column_names,
-                        load_from_cache_file=False,
-                        keep_in_memory=True,
-                    ),
+                    {
+                        split_name: split.map(
+                            partial(
+                                multiple_choice_classification.prepare_examples,
+                                tokeniser=self._tokeniser,
+                            ),
+                            batched=True,
+                            batch_size=10,
+                            remove_columns=split.column_names,
+                            load_from_cache_file=False,
+                            keep_in_memory=True,
+                        )
+                        for split_name, split in dataset.items()
+                    }
                 )
 
             case TaskGroup.TEXT_TO_TEXT:
@@ -423,7 +409,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
                 dataset = dataset.map(
                     partial(
                         token_classification.tokenize_and_align_labels,
-
+                        tokeniser=self._tokeniser,
                        label2id=self._model.config.label2id,
                     ),
                     batched=True,
@@ -432,43 +418,44 @@ class HuggingFaceEncoderModel(BenchmarkModule):
                 )
 
             case TaskGroup.QUESTION_ANSWERING:
-
-
-
-
-
-
-                        ),
-                        batched=True,
-                        batch_size=10,
-                        remove_columns=dataset["test"].column_names,
-                        load_from_cache_file=False,
-                        keep_in_memory=True,
+                data_dict = dict()
+                if "train" in dataset:
+                    data_dict["train"] = dataset["train"].map(
+                        partial(
+                            question_answering.prepare_train_examples,
+                            tokeniser=self._tokeniser,
                         ),
-
-
-
-
-
-
-
-
-
-
+                        batched=True,
+                        batch_size=10,
+                        remove_columns=dataset["test"].column_names,
+                        load_from_cache_file=False,
+                        keep_in_memory=True,
+                    )
+                if "val" in dataset:
+                    data_dict["val"] = dataset["val"].map(
+                        partial(
+                            question_answering.prepare_train_examples,
+                            tokeniser=self._tokeniser,
                         ),
-
-
-
-
-
-
-
-
-
-
+                        batched=True,
+                        batch_size=10,
+                        remove_columns=dataset["test"].column_names,
+                        load_from_cache_file=False,
+                        keep_in_memory=True,
+                    )
+                if "test" in dataset:
+                    data_dict["test"] = dataset["test"].map(
+                        partial(
+                            question_answering.prepare_test_examples,
+                            tokeniser=self._tokeniser,
                         ),
+                        batched=True,
+                        batch_size=10,
+                        remove_columns=dataset["test"].column_names,
+                        load_from_cache_file=False,
+                        keep_in_memory=True,
                     )
-                )
+                dataset = DatasetDict(data_dict)
 
                 # The Trainer hides the columns that are not used by the model (here
                 # `id` and `offset_mapping` which we will need for our post-processing),
@@ -499,11 +486,15 @@ class HuggingFaceEncoderModel(BenchmarkModule):
             Whether the model exists, or an error describing why we cannot check
             whether the model exists.
         """
-
-            model_id.split("@") if "@" in model_id else (model_id, "main")
-        )
+        model_id_components = split_model_id(model_id=model_id)
         model_info = get_model_repo_info(
-            model_id=model_id,
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            api_key=benchmark_config.api_key,
+            cache_dir=benchmark_config.cache_dir,
+            trust_remote_code=benchmark_config.trust_remote_code,
+            requires_safetensors=benchmark_config.requires_safetensors,
+            run_with_cli=benchmark_config.run_with_cli,
        )
         return (
             model_info is not None
@@ -525,11 +516,15 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         Returns:
             The model configuration.
         """
-
-            model_id.split("@") if "@" in model_id else (model_id, "main")
-        )
+        model_id_components = split_model_id(model_id=model_id)
         model_info = get_model_repo_info(
-            model_id=model_id,
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            api_key=benchmark_config.api_key,
+            cache_dir=benchmark_config.cache_dir,
+            trust_remote_code=benchmark_config.trust_remote_code,
+            requires_safetensors=benchmark_config.requires_safetensors,
+            run_with_cli=benchmark_config.run_with_cli,
         )
         if model_info is None:
             raise InvalidModel(f"The model {model_id!r} could not be found.")
@@ -538,8 +533,9 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         language_codes = list(language_mapping.keys())
 
         model_config = ModelConfig(
-            model_id=model_id,
-            revision=revision,
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            param=model_id_components.param,
             task=model_info.pipeline_tag,
             languages=[
                 language_mapping[tag]
@@ -559,12 +555,12 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         return model_config
 
 
-def
+def load_model_and_tokeniser(
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
 ) -> tuple["PreTrainedModel", "PreTrainedTokenizer"]:
-    """Load the model and
+    """Load the model and tokeniser.
 
     Args:
         model_config:
@@ -575,7 +571,7 @@ def load_model_and_tokenizer(
             The benchmark configuration
 
     Returns:
-
+        A pair (model, tokeniser), with the loaded model and tokeniser
     """
     config: "PretrainedConfig"
     block_terminal_output()
@@ -594,8 +590,8 @@ def load_model_and_tokenizer(
     config = load_hf_model_config(
         model_id=model_id,
         num_labels=len(id2label),
-        id2label=id2label,
-        label2id={label: idx for idx, label in id2label.items()},
+        id2label=HashableDict(id2label),
+        label2id=HashableDict({label: idx for idx, label in id2label.items()}),
         revision=model_config.revision,
         model_cache_dir=model_config.model_cache_dir,
         api_key=benchmark_config.api_key,
@@ -607,23 +603,20 @@ def load_model_and_tokenizer(
         config=config,
         ignore_mismatched_sizes=ignore_mismatched_sizes,
         revision=model_config.revision,
-        token=benchmark_config.api_key
+        token=get_hf_token(api_key=benchmark_config.api_key),
         cache_dir=model_config.model_cache_dir,
         trust_remote_code=benchmark_config.trust_remote_code,
-
+        dtype=get_dtype(
            device=benchmark_config.device,
-
+            dtype_is_set=config.to_dict().get("dtype") is not None,
             bf16_available=(
                 torch.cuda.is_available() and torch.cuda.is_bf16_supported()
             ),
         ),
     )
 
-    # These are used when a timeout occurs
-    attempts_left = 5
-
     model: "PreTrainedModel | None" = None
-
+    for _ in range(num_attempts := 5):
         # Get the model class associated with the task group
         model_cls_or_none: t.Type["PreTrainedModel"] | None = get_class_by_name(
             class_name=task_group_to_class_name(task_group=task_group),
@@ -650,36 +643,41 @@ def load_model_and_tokenizer(
             break
         except (KeyError, RuntimeError) as e:
             if not model_kwargs["ignore_mismatched_sizes"]:
-
+                log(
                     f"{type(e).__name__} occurred during the loading "
                     f"of the {model_id!r} model. Retrying with "
-                    "`ignore_mismatched_sizes` set to True."
+                    "`ignore_mismatched_sizes` set to True.",
+                    level=logging.DEBUG,
                 )
                 model_kwargs["ignore_mismatched_sizes"] = True
                 continue
             else:
-                raise InvalidModel(str(e))
+                raise InvalidModel(str(e)) from e
         except (TimeoutError, RequestError):
-
-
-
-
+            log(
+                f"Couldn't load the model {model_id!r}. Retrying.",
+                level=logging.WARNING,
+            )
             sleep(5)
             continue
         except (OSError, ValueError) as e:
             if "checkpoint seems to be incorrect" in str(e):
                 raise InvalidModel(
                     f"The model {model_id!r} has an incorrect checkpoint."
-                )
+                ) from e
             if "trust_remote_code" in str(e):
                 raise InvalidModel(
                     f"Loading the model {model_id!r} needs to trust remote code. "
                     "If you trust the suppliers of this model, then you can enable "
                     "this by setting the `--trust-remote-code` flag."
-                )
+                ) from e
             raise InvalidModel(
                 f"The model {model_id!r} could not be loaded. The error was {e!r}."
-            )
+            ) from e
+    else:
+        raise InvalidModel(
+            f"Could not load the model {model_id!r} after {num_attempts} attempts."
+        )
 
     if isinstance(model_or_tuple, tuple):
         model = model_or_tuple[0]
@@ -697,17 +695,25 @@ def load_model_and_tokenizer(
     ):
         model = setup_model_for_question_answering(model=model)
 
-
+    tokeniser = load_tokeniser(
         model=model,
         model_id=model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
+        model_config=model_config,
     )
 
-    return model,
+    return model, tokeniser
 
 
+@cache_arguments("model_id", "revision")
 def get_model_repo_info(
-    model_id: str,
+    model_id: str,
+    revision: str,
+    api_key: str | None,
+    cache_dir: str,
+    trust_remote_code: bool,
+    requires_safetensors: bool,
+    run_with_cli: bool,
 ) -> "HFModelInfo | None":
     """Get the information about the model from the HF Hub or a local directory.
 
@@ -716,28 +722,30 @@ def get_model_repo_info(
             The model ID.
         revision:
             The revision of the model.
-        benchmark_config:
-            The benchmark configuration.
 
     Returns:
         The information about the model, or None if the model could not be found.
     """
-    token =
+    token = get_hf_token(api_key=api_key)
     hf_api = HfApi(token=token)
-    model_id, revision = model_id.split("@") if "@" in model_id else (model_id, "main")
 
     # Get information on the model.
     # The first case is when the model is a local model, in which case we create a dummy
     # model info object.
     model_info: HfApiModelInfo | None = None
     if Path(model_id).is_dir():
-
+        log(f"Checking for local model in {model_id}.", level=logging.DEBUG)
         if all(
             (Path(model_id) / required_file).exists()
             for required_file in LOCAL_MODELS_REQUIRED_FILES
         ):
             model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
 
+    # If we have not internet, and the model_id is not a directory for a local model
+    # we also just create a dummy model info object.
+    elif not internet_connection_available():
+        model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
+
     # If the model does not exist locally, then we get the model info from the Hugging
     # Face Hub, if possible
     if model_info is None:
@@ -752,35 +760,39 @@ def get_model_repo_info(
         except (GatedRepoError, LocalTokenNotFoundError) as e:
             try:
                 hf_whoami(token=token)
-
+                log(
                     f"Could not access the model {model_id} with the revision "
-                    f"{revision}. The error was {str(e)!r}."
+                    f"{revision}. The error was {str(e)!r}.",
+                    level=logging.DEBUG,
                 )
                 return None
             except LocalTokenNotFoundError:
-
+                log(
                     f"Could not access the model {model_id} with the revision "
                     f"{revision}. The error was {str(e)!r}. Please set the "
                     "`HUGGINGFACE_API_KEY` environment variable or use the "
-                    "`--api-key` argument."
+                    "`--api-key` argument.",
+                    level=logging.DEBUG,
                 )
                 return None
-        except (RepositoryNotFoundError, HFValidationError):
+        except (RepositoryNotFoundError, HFValidationError, HfHubHTTPError):
             return None
         except (OSError, RequestException) as e:
             if internet_connection_available():
                 errors.append(e)
                 continue
-
+            log(
                 "Could not access the Hugging Face Hub. Please check your internet "
-                "connection."
+                "connection.",
+                level=logging.DEBUG,
             )
             return None
     else:
-
+        log(
             f"Could not access model info for the model {model_id!r} from the "
             f"Hugging Face Hub, after {num_attempts} attempts. The errors "
-            f"encountered were {errors!r}."
+            f"encountered were {errors!r}.",
+            level=logging.DEBUG,
         )
         return None
 
@@ -800,12 +812,7 @@ def get_model_repo_info(
             level=logging.DEBUG,
         )
         if base_model_id is not None:
-            base_model_info = hf_api.model_info(
-                repo_id=base_model_id,
-                token=benchmark_config.api_key
-                or os.getenv("HUGGINGFACE_API_KEY")
-                or True,
-            )
+            base_model_info = hf_api.model_info(repo_id=base_model_id, token=token)
             tags += base_model_info.tags or list()
             tags = list(set(tags))
 
@@ -816,15 +823,15 @@ def get_model_repo_info(
     hf_config = load_hf_model_config(
         model_id=base_model_id or model_id,
         num_labels=0,
-        id2label=
-        label2id=
+        id2label=HashableDict(),
+        label2id=HashableDict(),
         revision=revision,
         model_cache_dir=create_model_cache_dir(
-            cache_dir=
+            cache_dir=cache_dir, model_id=model_id
         ),
-        api_key=
-        trust_remote_code=
-        run_with_cli=
+        api_key=api_key,
+        trust_remote_code=trust_remote_code,
+        run_with_cli=run_with_cli,
     )
     class_names = hf_config.architectures
     generative_class_names = [
@@ -839,19 +846,19 @@ def get_model_repo_info(
     else:
         pipeline_tag = "fill-mask"
 
-    if
+    if requires_safetensors:
         repo_files = hf_api.list_repo_files(repo_id=model_id, revision=revision)
         has_safetensors = any(f.endswith(".safetensors") for f in repo_files)
         if not has_safetensors:
             msg = f"Model {model_id} does not have safetensors weights available. "
-            if
+            if run_with_cli:
                 msg += "Skipping since the `--only-allow-safetensors` flag is set."
             else:
                 msg += (
-                    "Skipping since the `
+                    "Skipping since the `requires_safetensors` argument is set "
                     "to `True`."
                 )
-
+            log(msg, level=logging.WARNING)
             return None
 
     # Also check base model if we are evaluating an adapter
@@ -865,11 +872,11 @@ def get_model_repo_info(
                 f"Base model {base_model_id} does not have safetensors weights "
                 "available."
             )
-            if
+            if run_with_cli:
                 msg += " Skipping since the `--only-allow-safetensors` flag is set."
             else:
                 msg += (
-                    " Skipping since the `
+                    " Skipping since the `requires_safetensors` argument is set "
                     "to `True`."
                 )
             logging.warning(msg)
@@ -880,10 +887,13 @@ def get_model_repo_info(
     )
 
 
-def
-    model: "PreTrainedModel | None",
+def load_tokeniser(
+    model: "PreTrainedModel | None",
+    model_id: str,
+    trust_remote_code: bool,
+    model_config: "ModelConfig",
 ) -> "PreTrainedTokenizer":
-    """Load the
+    """Load the tokeniser.
 
     Args:
         model:
@@ -893,16 +903,19 @@ def load_tokenizer(
             The model identifier. Used for logging.
         trust_remote_code:
             Whether to trust remote code.
+        model_config:
+            The model configuration.
 
     Returns:
-        The loaded
+        The loaded tokeniser.
     """
     loading_kwargs: dict[str, bool | str] = dict(
-        use_fast=True,
+        use_fast=False if model_config.param == "slow-tokenizer" else True,
         verbose=False,
         trust_remote_code=trust_remote_code,
         padding_side="right",
         truncation_side="right",
+        cache_dir=model_config.model_cache_dir,
     )
 
     # If the model is a subclass of a certain model types then we have to add a prefix
@@ -918,45 +931,51 @@ def load_tokenizer(
     num_retries = 5
     for _ in range(num_retries):
         try:
-
+            tokeniser = AutoTokenizer.from_pretrained(model_id, **loading_kwargs)
             break
-        except (JSONDecodeError, OSError, TypeError):
-            raise InvalidModel(
+        except (JSONDecodeError, OSError, TypeError) as e:
+            raise InvalidModel(
+                f"Could not load tokeniser for model {model_id!r}."
+            ) from e
         except (TimeoutError, RequestError):
-
+            log(
+                f"Couldn't load tokeniser for {model_id!r}. Retrying.",
+                level=logging.WARNING,
+            )
             sleep(5)
             continue
     else:
         raise InvalidModel(
-            f"Could not load
+            f"Could not load tokeniser for model {model_id!r} after {num_retries} "
            "attempts."
         )
 
     # Ensure that BOS, EOS and PAD tokens are set
-
-
+    tokeniser.bos_token, tokeniser.bos_token_id = get_bos_token(tokeniser=tokeniser)
+    tokeniser.eos_token, tokeniser.eos_token_id = get_eos_token(tokeniser=tokeniser)
 
-    return
+    return tokeniser
 
 
-
-
+@cache_arguments()
+def get_dtype(
+    device: torch.device, dtype_is_set: bool, bf16_available: bool
 ) -> str | torch.dtype:
     """Get the torch dtype, used for loading the model.
 
     Args:
         device:
             The device to use.
-
-            Whether the
+        dtype_is_set:
+            Whether the data type is set in the model configuration.
         bf16_available:
             Whether bfloat16 is available.
 
     Returns:
-        The
+        The dtype.
     """
     using_cuda = device == torch.device("cuda")
-    if using_cuda and
+    if using_cuda and dtype_is_set:
         return "auto"
     elif using_cuda and bf16_available:
         return torch.bfloat16
@@ -965,6 +984,7 @@ def get_torch_dtype(
         return torch.float32
 
 
+@cache_arguments("model_id", "revision", "num_labels", "id2label", "label2id")
 def load_hf_model_config(
     model_id: str,
     num_labels: int,
@@ -1001,7 +1021,7 @@ def load_hf_model_config(
     Returns:
         The Hugging Face model configuration.
     """
-
+    for _ in range(num_attempts := 5):
         try:
             config = AutoConfig.from_pretrained(
                 model_id,
@@ -1009,35 +1029,36 @@ def load_hf_model_config(
                 id2label=id2label,
                 label2id=label2id,
                 revision=revision,
-                token=api_key
+                token=get_hf_token(api_key=api_key),
                 trust_remote_code=trust_remote_code,
                 cache_dir=model_cache_dir,
+                local_files_only=not internet_connection_available(),
             )
-
-            if isinstance(config.eos_token_id, list):
-                config.pad_token_id = config.eos_token_id[0]
-            else:
-                config.pad_token_id = config.eos_token_id
-            return config
+            break
         except KeyError as e:
             key = e.args[0]
             raise InvalidModel(
                 f"The model config for the model {model_id!r} could not be "
                 f"loaded, as the key {key!r} was not found in the config."
-            )
+            ) from e
         except (OSError, GatedRepoError) as e:
-
-
-
-
-
-
+            if isinstance(e, GatedRepoError) or "gated repo" in str(e).lower():
+                raise InvalidModel(
+                    f"The model {model_id!r} is a gated repository. Please ensure "
+                    "that you are logged in with `hf auth login` or have provided a "
+                    "valid Hugging Face access token with the `HUGGINGFACE_API_KEY` "
+                    "environment variable or the `--api-key` argument. Also check that "
+                    "your account has access to this model."
+                ) from e
             raise InvalidModel(
                 f"Couldn't load model config for {model_id!r}. The error was "
                 f"{e!r}. Skipping"
-            )
+            ) from e
         except (TimeoutError, RequestError):
-
+            log(
+                f"Couldn't load model config for {model_id!r}. Retrying.",
+                level=logging.WARNING,
+            )
             sleep(5)
             continue
         except ValueError as e:
@@ -1045,17 +1066,31 @@ def load_hf_model_config(
                 raise InvalidModel(
                     f"The model {model_id!r} is awaiting a review from the repository "
                     "authors. Please try again later."
-                )
+                ) from e
            if "trust_remote_code" in str(e):
                 raise NeedsAdditionalArgument(
                     cli_argument="--trust-remote-code",
                     script_argument="trust_remote_code=True",
                     run_with_cli=run_with_cli,
-                )
+                ) from e
             raise InvalidModel(
                 f"The config for the model {model_id!r} could not be loaded. The "
                 f"error was {e!r}."
-            )
+            ) from e
+    else:
+        raise InvalidModel(
+            f"Couldn't load model config for {model_id!r} after {num_attempts} "
+            "attempts."
+        )
+
+    # Ensure that the PAD token ID is set
+    if config.eos_token_id is not None and config.pad_token_id is None:
+        if isinstance(config.eos_token_id, list):
+            config.pad_token_id = config.eos_token_id[0]
+        else:
+            config.pad_token_id = config.eos_token_id
+
+    return config
 
 
 def setup_model_for_question_answering(model: "PreTrainedModel") -> "PreTrainedModel":
@@ -1140,33 +1175,33 @@ def get_children_of_module(
     return submodules
 
 
-def
+def align_model_and_tokeniser(
     model: "PreTrainedModel",
-
+    tokeniser: "PreTrainedTokenizer",
     model_max_length: int,
     raise_errors: bool = False,
 ) -> tuple["PreTrainedModel", "PreTrainedTokenizer"]:
-    """Aligns the model and the
+    """Aligns the model and the tokeniser.
 
     Args:
         model:
             The model to fix.
-
-            The
+        tokeniser:
+            The tokeniser to fix.
         model_max_length:
             The maximum length of the model.
         raise_errors:
             Whether to raise errors instead of trying to fix them silently.
 
     Returns:
-        The fixed model and
+        The fixed model and tokeniser.
     """
     model_max_length = min(model_max_length, MAX_CONTEXT_LENGTH)
 
     if model_max_length > 0:
-
+        tokeniser.model_max_length = model_max_length
     else:
-
+        tokeniser.model_max_length = 512
 
     # Move the model to the CPU, since otherwise we can't catch the IndexErrors when
     # finding the maximum sequence length of the model
@@ -1175,9 +1210,9 @@ def align_model_and_tokenizer(
 
     # Manually check that this model max length is valid for the model, and adjust
     # otherwise
-    initial_max_length =
+    initial_max_length = tokeniser.model_max_length
     for max_length in range(initial_max_length, 0, -1):
-
+        tokeniser.model_max_length = max_length
         dummy_inputs = torch.full(
             size=(1, max_length),
             fill_value=DUMMY_FILL_VALUE,
@@ -1204,26 +1239,27 @@ def align_model_and_tokenizer(
     # Move the model back to the original device
     model.to(model_device) # type: ignore[arg-type]
 
-    # If there is a mismatch between the vocab size according to the
+    # If there is a mismatch between the vocab size according to the tokeniser and
     # the vocab size according to the model, we raise an error
     if hasattr(model.config, "vocab_size"):
-        if model.config.vocab_size < len(
+        if model.config.vocab_size < len(tokeniser):
             if raise_errors:
                 raise InvalidModel(
-                    "The vocab size of the
+                    "The vocab size of the tokeniser is larger than the vocab size of "
                     "the model. As the --raise-errors option was specified, the "
                     "embeddings of the model will not be automatically adjusted."
                 )
             if hasattr(model, "resize_token_embeddings"):
-                model.resize_token_embeddings(new_num_tokens=
+                model.resize_token_embeddings(new_num_tokens=tokeniser.vocab_size + 1)
 
-    if
-
-
+    if tokeniser.bos_token is None and tokeniser.eos_token is not None:
+        tokeniser.bos_token = tokeniser.eos_token
+        tokeniser.bos_token_id = tokeniser.eos_token_id
 
-    return model,
+    return model, tokeniser
 
 
+@cache_arguments()
 def task_group_to_class_name(task_group: TaskGroup) -> str:
     """Convert a task group to a class name.
 