PyPI - EuroEval - Versions diffs - 16.3.0__py3-none-any.whl → 16.4.0__py3-none-any.whl - Mend

EuroEval 16.3.0__py3-none-any.whl → 16.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of EuroEval might be problematic. Click here for more details.

Files changed (64)

euroeval/__init__.py +3 -2
euroeval/benchmark_config_factory.py +0 -4
euroeval/benchmark_modules/base.py +3 -16
euroeval/benchmark_modules/fresh.py +2 -1
euroeval/benchmark_modules/hf.py +99 -62
euroeval/benchmark_modules/litellm.py +101 -41
euroeval/benchmark_modules/vllm.py +91 -83
euroeval/benchmarker.py +84 -78
euroeval/caching_utils.py +79 -0
euroeval/callbacks.py +5 -7
euroeval/constants.py +6 -0
euroeval/data_loading.py +14 -11
euroeval/data_models.py +12 -4
euroeval/dataset_configs/__init__.py +2 -0
euroeval/dataset_configs/czech.py +79 -0
euroeval/dataset_configs/danish.py +10 -11
euroeval/dataset_configs/dutch.py +0 -1
euroeval/dataset_configs/english.py +0 -1
euroeval/dataset_configs/estonian.py +11 -1
euroeval/dataset_configs/finnish.py +0 -1
euroeval/dataset_configs/french.py +0 -1
euroeval/dataset_configs/german.py +0 -1
euroeval/dataset_configs/italian.py +0 -1
euroeval/dataset_configs/latvian.py +0 -1
euroeval/dataset_configs/lithuanian.py +9 -3
euroeval/dataset_configs/norwegian.py +0 -1
euroeval/dataset_configs/polish.py +0 -1
euroeval/dataset_configs/portuguese.py +0 -1
euroeval/dataset_configs/slovak.py +60 -0
euroeval/dataset_configs/spanish.py +0 -1
euroeval/dataset_configs/swedish.py +10 -12
euroeval/finetuning.py +21 -15
euroeval/generation.py +10 -10
euroeval/generation_utils.py +2 -3
euroeval/logging_utils.py +250 -0
euroeval/metrics/base.py +0 -3
euroeval/metrics/huggingface.py +9 -5
euroeval/metrics/llm_as_a_judge.py +5 -3
euroeval/metrics/pipeline.py +17 -9
euroeval/metrics/speed.py +0 -3
euroeval/model_cache.py +11 -14
euroeval/model_config.py +4 -5
euroeval/model_loading.py +3 -0
euroeval/prompt_templates/linguistic_acceptability.py +21 -3
euroeval/prompt_templates/multiple_choice.py +25 -1
euroeval/prompt_templates/named_entity_recognition.py +51 -11
euroeval/prompt_templates/reading_comprehension.py +31 -3
euroeval/prompt_templates/sentiment_classification.py +23 -1
euroeval/prompt_templates/summarization.py +26 -6
euroeval/scores.py +7 -7
euroeval/speed_benchmark.py +3 -5
euroeval/task_group_utils/multiple_choice_classification.py +0 -3
euroeval/task_group_utils/question_answering.py +0 -3
euroeval/task_group_utils/sequence_classification.py +43 -31
euroeval/task_group_utils/text_to_text.py +17 -8
euroeval/task_group_utils/token_classification.py +10 -9
euroeval/tokenisation_utils.py +14 -12
euroeval/utils.py +29 -146
{euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/METADATA +4 -4
euroeval-16.4.0.dist-info/RECORD +75 -0
euroeval-16.3.0.dist-info/RECORD +0 -71
{euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
{euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
{euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0

euroeval/__init__.py CHANGED Viewed

@@ -21,7 +21,8 @@ if os.getenv("FULL_LOG") != "1":
     os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
 # Set up logging
-fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
+# fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
+fmt = colored("%(message)s", "light_yellow")
 logging.basicConfig(
     level=logging.CRITICAL if hasattr(sys, "_called_from_test") else logging.INFO,
     format=fmt,
@@ -50,7 +51,7 @@ import importlib.metadata  # noqa: E402
 from dotenv import load_dotenv  # noqa: E402
 from .benchmarker import Benchmarker  # noqa: E402
-from .utils import block_terminal_output  # noqa: E402
+from .logging_utils import block_terminal_output  # noqa: E402
 # Block unwanted terminal outputs. This blocks way more than the above, but since it
 # relies on importing from the `utils` module, external modules are already imported

euroeval/benchmark_config_factory.py CHANGED Viewed

@@ -1,6 +1,5 @@
 """Factory class for creating dataset configurations."""
-import logging
 import sys
 import typing as t
@@ -17,9 +16,6 @@ if t.TYPE_CHECKING:
     from .data_models import Language, Task
-logger = logging.getLogger("euroeval")
 def build_benchmark_config(
     benchmark_config_params: BenchmarkConfigParams,
 ) -> BenchmarkConfig:

euroeval/benchmark_modules/base.py CHANGED Viewed

@@ -3,24 +3,22 @@
 import collections.abc as c
 import logging
 import re
-import sys
 import typing as t
 from abc import ABC, abstractmethod
 from functools import cached_property, partial
 from datasets import Dataset, DatasetDict
 from torch import nn
-from tqdm.auto import tqdm
 from ..enums import TaskGroup
 from ..exceptions import InvalidBenchmark, NeedsEnvironmentVariable, NeedsExtraInstalled
+from ..logging_utils import get_pbar, log_once
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,
     token_classification,
 )
-from ..utils import log_once
 if t.TYPE_CHECKING:
     from transformers.tokenization_utils import PreTrainedTokenizer
@@ -36,8 +34,6 @@ if t.TYPE_CHECKING:
     from ..enums import BatchingPreference, GenerativeType
     from ..types import ComputeMetricsFunction, ExtractLabelsFunction
-logger = logging.getLogger("euroeval")
 class BenchmarkModule(ABC):
     """Abstract class for a benchmark module.
@@ -87,16 +83,7 @@ class BenchmarkModule(ABC):
     def _log_metadata(self) -> None:
         """Log the metadata of the model."""
-        # Set logging level based on verbosity
-        if hasattr(sys, "_called_from_test"):
-            logging_level = logging.CRITICAL
-        elif self.benchmark_config.verbose:
-            logging_level = logging.DEBUG
-        else:
-            logging_level = logging.INFO
-        logger.setLevel(logging_level)
-        logging_msg: str = ""
+        logging_msg: str = "    ↳ "
         if self.num_params < 0:
             logging_msg += "The model has an unknown number of parameters, "
         else:
@@ -273,7 +260,7 @@ class BenchmarkModule(ABC):
                 tasks.
         """
         for idx, dataset in enumerate(
-            tqdm(iterable=datasets, desc="Preparing datasets")
+            get_pbar(iterable=datasets, desc="Preparing datasets")
         ):
             prepared_dataset = self.prepare_dataset(
                 dataset=dataset, task=task, itr_idx=idx

euroeval/benchmark_modules/fresh.py CHANGED Viewed

@@ -27,7 +27,8 @@ from ..exceptions import (
     NeedsExtraInstalled,
 )
 from ..generation_utils import raise_if_wrong_params
-from ..utils import block_terminal_output, create_model_cache_dir, get_hf_token
+from ..logging_utils import block_terminal_output
+from ..utils import create_model_cache_dir, get_hf_token
 from .hf import (
     HuggingFaceEncoderModel,
     align_model_and_tokeniser,

euroeval/benchmark_modules/hf.py CHANGED Viewed

@@ -36,6 +36,7 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer
 from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError
+from ..caching_utils import cache_arguments
 from ..constants import (
     DUMMY_FILL_VALUE,
     GENERATIVE_PIPELINE_TAGS,
@@ -43,7 +44,7 @@ from ..constants import (
     MAX_CONTEXT_LENGTH,
     MERGE_TAGS,
 )
-from ..data_models import HFModelInfo, ModelConfig
+from ..data_models import HashableDict, HFModelInfo, ModelConfig
 from ..enums import (
     BatchingPreference,
     GenerativeType,
@@ -60,6 +61,7 @@ from ..exceptions import (
 )
 from ..generation_utils import raise_if_wrong_params
 from ..languages import get_all_languages
+from ..logging_utils import block_terminal_output, log, log_once
 from ..task_group_utils import (
     multiple_choice_classification,
     question_answering,
@@ -67,12 +69,10 @@ from ..task_group_utils import (
 )
 from ..tokenisation_utils import get_bos_token, get_eos_token
 from ..utils import (
-    block_terminal_output,
     create_model_cache_dir,
     get_class_by_name,
     get_hf_token,
     internet_connection_available,
-    log_once,
     split_model_id,
 )
 from .base import BenchmarkModule
@@ -85,8 +85,6 @@ if t.TYPE_CHECKING:
     from ..data_models import BenchmarkConfig, DatasetConfig, Task
     from ..types import ExtractLabelsFunction
-logger = logging.getLogger("euroeval")
 class HuggingFaceEncoderModel(BenchmarkModule):
     """An encoder model from the Hugging Face Hub."""
@@ -183,12 +181,13 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         elif hasattr(self._model, "parameters"):
             num_params = sum(p.numel() for p in self._model.parameters())
         else:
-            logger.warning(
+            log(
                 "The number of parameters could not be determined for the model, since "
                 "the model is not stored in the safetensors format. If this is your "
                 "own model, then you can use this Hugging Face Space to convert your "
                 "model to the safetensors format: "
-                "https://huggingface.co/spaces/safetensors/convert."
+                "https://huggingface.co/spaces/safetensors/convert.",
+                level=logging.WARNING,
             )
             num_params = -1
         return num_params
@@ -491,7 +490,11 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         model_info = get_model_repo_info(
             model_id=model_id_components.model_id,
             revision=model_id_components.revision,
-            benchmark_config=benchmark_config,
+            api_key=benchmark_config.api_key,
+            cache_dir=benchmark_config.cache_dir,
+            trust_remote_code=benchmark_config.trust_remote_code,
+            requires_safetensors=benchmark_config.requires_safetensors,
+            run_with_cli=benchmark_config.run_with_cli,
         )
         return (
             model_info is not None
@@ -517,7 +520,11 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         model_info = get_model_repo_info(
             model_id=model_id_components.model_id,
             revision=model_id_components.revision,
-            benchmark_config=benchmark_config,
+            api_key=benchmark_config.api_key,
+            cache_dir=benchmark_config.cache_dir,
+            trust_remote_code=benchmark_config.trust_remote_code,
+            requires_safetensors=benchmark_config.requires_safetensors,
+            run_with_cli=benchmark_config.run_with_cli,
         )
         if model_info is None:
             raise InvalidModel(f"The model {model_id!r} could not be found.")
@@ -583,8 +590,8 @@ def load_model_and_tokeniser(
     config = load_hf_model_config(
         model_id=model_id,
         num_labels=len(id2label),
-        id2label=id2label,
-        label2id={label: idx for idx, label in id2label.items()},
+        id2label=HashableDict(id2label),
+        label2id=HashableDict({label: idx for idx, label in id2label.items()}),
         revision=model_config.revision,
         model_cache_dir=model_config.model_cache_dir,
         api_key=benchmark_config.api_key,
@@ -608,11 +615,8 @@ def load_model_and_tokeniser(
         ),
     )
-    # These are used when a timeout occurs
-    attempts_left = 5
     model: "PreTrainedModel | None" = None
-    while True:
+    for _ in range(num_attempts := 5):
         # Get the model class associated with the task group
         model_cls_or_none: t.Type["PreTrainedModel"] | None = get_class_by_name(
             class_name=task_group_to_class_name(task_group=task_group),
@@ -639,22 +643,21 @@ def load_model_and_tokeniser(
             break
         except (KeyError, RuntimeError) as e:
             if not model_kwargs["ignore_mismatched_sizes"]:
-                logger.debug(
+                log(
                     f"{type(e).__name__} occurred during the loading "
                     f"of the {model_id!r} model. Retrying with "
-                    "`ignore_mismatched_sizes` set to True."
+                    "`ignore_mismatched_sizes` set to True.",
+                    level=logging.DEBUG,
                 )
                 model_kwargs["ignore_mismatched_sizes"] = True
                 continue
             else:
                 raise InvalidModel(str(e)) from e
-        except (TimeoutError, RequestError) as e:
-            attempts_left -= 1
-            if attempts_left == 0:
-                raise InvalidModel(
-                    "The model could not be loaded after 5 attempts."
-                ) from e
-            logger.info(f"Couldn't load the model {model_id!r}. Retrying.")
+        except (TimeoutError, RequestError):
+            log(
+                f"Couldn't load the model {model_id!r}. Retrying.",
+                level=logging.WARNING,
+            )
             sleep(5)
             continue
         except (OSError, ValueError) as e:
@@ -671,6 +674,10 @@ def load_model_and_tokeniser(
             raise InvalidModel(
                 f"The model {model_id!r} could not be loaded. The error was {e!r}."
             ) from e
+    else:
+        raise InvalidModel(
+            f"Could not load the model {model_id!r} after {num_attempts} attempts."
+        )
     if isinstance(model_or_tuple, tuple):
         model = model_or_tuple[0]
@@ -698,8 +705,15 @@ def load_model_and_tokeniser(
     return model, tokeniser
+@cache_arguments("model_id", "revision")
 def get_model_repo_info(
-    model_id: str, revision: str, benchmark_config: "BenchmarkConfig"
+    model_id: str,
+    revision: str,
+    api_key: str | None,
+    cache_dir: str,
+    trust_remote_code: bool,
+    requires_safetensors: bool,
+    run_with_cli: bool,
 ) -> "HFModelInfo | None":
     """Get the information about the model from the HF Hub or a local directory.
@@ -708,13 +722,11 @@ def get_model_repo_info(
             The model ID.
         revision:
             The revision of the model.
-        benchmark_config:
-            The benchmark configuration.
     Returns:
         The information about the model, or None if the model could not be found.
     """
-    token = get_hf_token(api_key=benchmark_config.api_key)
+    token = get_hf_token(api_key=api_key)
     hf_api = HfApi(token=token)
     # Get information on the model.
@@ -722,7 +734,7 @@ def get_model_repo_info(
     # model info object.
     model_info: HfApiModelInfo | None = None
     if Path(model_id).is_dir():
-        logger.debug(f"Checking for local model in {model_id}.")
+        log(f"Checking for local model in {model_id}.", level=logging.DEBUG)
         if all(
             (Path(model_id) / required_file).exists()
             for required_file in LOCAL_MODELS_REQUIRED_FILES
@@ -748,17 +760,19 @@ def get_model_repo_info(
             except (GatedRepoError, LocalTokenNotFoundError) as e:
                 try:
                     hf_whoami(token=token)
-                    logger.debug(
+                    log(
                         f"Could not access the model {model_id} with the revision "
-                        f"{revision}. The error was {str(e)!r}."
+                        f"{revision}. The error was {str(e)!r}.",
+                        level=logging.DEBUG,
                     )
                     return None
                 except LocalTokenNotFoundError:
-                    logger.debug(
+                    log(
                         f"Could not access the model {model_id} with the revision "
                         f"{revision}. The error was {str(e)!r}. Please set the "
                         "`HUGGINGFACE_API_KEY` environment variable or use the "
-                        "`--api-key` argument."
+                        "`--api-key` argument.",
+                        level=logging.DEBUG,
                     )
                     return None
             except (RepositoryNotFoundError, HFValidationError):
@@ -774,16 +788,18 @@ def get_model_repo_info(
                 if internet_connection_available():
                     errors.append(e)
                     continue
-                logger.debug(
+                log(
                     "Could not access the Hugging Face Hub. Please check your internet "
-                    "connection."
+                    "connection.",
+                    level=logging.DEBUG,
                 )
                 return None
         else:
-            logger.debug(
+            log(
                 f"Could not access model info for the model {model_id!r} from the "
                 f"Hugging Face Hub, after {num_attempts} attempts. The errors "
-                f"encountered were {errors!r}."
+                f"encountered were {errors!r}.",
+                level=logging.DEBUG,
             )
             return None
@@ -814,15 +830,15 @@ def get_model_repo_info(
         hf_config = load_hf_model_config(
             model_id=base_model_id or model_id,
             num_labels=0,
-            id2label=dict(),
-            label2id=dict(),
+            id2label=HashableDict(),
+            label2id=HashableDict(),
             revision=revision,
             model_cache_dir=create_model_cache_dir(
-                cache_dir=benchmark_config.cache_dir, model_id=model_id
+                cache_dir=cache_dir, model_id=model_id
             ),
-            api_key=benchmark_config.api_key,
-            trust_remote_code=benchmark_config.trust_remote_code,
-            run_with_cli=benchmark_config.run_with_cli,
+            api_key=api_key,
+            trust_remote_code=trust_remote_code,
+            run_with_cli=run_with_cli,
         )
         class_names = hf_config.architectures
         generative_class_names = [
@@ -837,19 +853,19 @@ def get_model_repo_info(
         else:
             pipeline_tag = "fill-mask"
-    if benchmark_config.requires_safetensors:
+    if requires_safetensors:
         repo_files = hf_api.list_repo_files(repo_id=model_id, revision=revision)
         has_safetensors = any(f.endswith(".safetensors") for f in repo_files)
         if not has_safetensors:
             msg = f"Model {model_id} does not have safetensors weights available. "
-            if benchmark_config.run_with_cli:
+            if run_with_cli:
                 msg += "Skipping since the `--only-allow-safetensors` flag is set."
             else:
                 msg += (
                     "Skipping since the `requires_safetensors` argument is set "
                     "to `True`."
                 )
-            logger.warning(msg)
+            log(msg, level=logging.WARNING)
             return None
         # Also check base model if we are evaluating an adapter
@@ -863,7 +879,7 @@ def get_model_repo_info(
                     f"Base model {base_model_id} does not have safetensors weights "
                     "available."
                 )
-                if benchmark_config.run_with_cli:
+                if run_with_cli:
                     msg += " Skipping since the `--only-allow-safetensors` flag is set."
                 else:
                     msg += (
@@ -929,7 +945,10 @@ def load_tokeniser(
                 f"Could not load tokeniser for model {model_id!r}."
             ) from e
         except (TimeoutError, RequestError):
-            logger.info(f"Couldn't load tokeniser for {model_id!r}. Retrying.")
+            log(
+                f"Couldn't load tokeniser for {model_id!r}. Retrying.",
+                level=logging.WARNING,
+            )
             sleep(5)
             continue
     else:
@@ -945,6 +964,7 @@ def load_tokeniser(
     return tokeniser
+@cache_arguments()
 def get_dtype(
     device: torch.device, dtype_is_set: bool, bf16_available: bool
 ) -> str | torch.dtype:
@@ -953,6 +973,7 @@ def get_dtype(
     Args:
         device:
             The device to use.
+        dtype_is_set:
             Whether the data type is set in the model configuration.
         bf16_available:
             Whether bfloat16 is available.
@@ -970,6 +991,7 @@ def get_dtype(
     return torch.float32
+@cache_arguments("model_id", "revision", "num_labels", "id2label", "label2id")
 def load_hf_model_config(
     model_id: str,
     num_labels: int,
@@ -1006,7 +1028,7 @@ def load_hf_model_config(
     Returns:
         The Hugging Face model configuration.
     """
-    while True:
+    for _ in range(num_attempts := 5):
         try:
             config = AutoConfig.from_pretrained(
                 model_id,
@@ -1019,12 +1041,7 @@ def load_hf_model_config(
                 cache_dir=model_cache_dir,
                 local_files_only=not internet_connection_available(),
             )
-            if config.eos_token_id is not None and config.pad_token_id is None:
-                if isinstance(config.eos_token_id, list):
-                    config.pad_token_id = config.eos_token_id[0]
-                else:
-                    config.pad_token_id = config.eos_token_id
-            return config
+            break
         except KeyError as e:
             key = e.args[0]
             raise InvalidModel(
@@ -1032,18 +1049,23 @@ def load_hf_model_config(
                 f"loaded, as the key {key!r} was not found in the config."
             ) from e
         except (OSError, GatedRepoError) as e:
-            # TEMP: When the model is gated then we cannot set cache dir, for some
-            # reason (since transformers v4.38.2, still a problem in v4.48.0). This
-            # should be included back in when this is fixed.
-            if "gated repo" in str(e):
-                model_cache_dir = None
-                continue
+            if isinstance(e, GatedRepoError) or "gated repo" in str(e).lower():
+                raise InvalidModel(
+                    f"The model {model_id!r} is a gated repository. Please ensure "
+                    "that you are logged in with `hf auth login` or have provided a "
+                    "valid Hugging Face access token with the `HUGGINGFACE_API_KEY` "
+                    "environment variable or the `--api-key` argument. Also check that "
+                    "your account has access to this model."
+                ) from e
             raise InvalidModel(
                 f"Couldn't load model config for {model_id!r}. The error was "
                 f"{e!r}. Skipping"
             ) from e
         except (TimeoutError, RequestError):
-            logger.info(f"Couldn't load model config for {model_id!r}. Retrying.")
+            log(
+                f"Couldn't load model config for {model_id!r}. Retrying.",
+                level=logging.WARNING,
+            )
             sleep(5)
             continue
         except ValueError as e:
@@ -1062,6 +1084,20 @@ def load_hf_model_config(
                 f"The config for the model {model_id!r} could not be loaded. The "
                 f"error was {e!r}."
             ) from e
+    else:
+        raise InvalidModel(
+            f"Couldn't load model config for {model_id!r} after {num_attempts} "
+            "attempts."
+        )
+    # Ensure that the PAD token ID is set
+    if config.eos_token_id is not None and config.pad_token_id is None:
+        if isinstance(config.eos_token_id, list):
+            config.pad_token_id = config.eos_token_id[0]
+        else:
+            config.pad_token_id = config.eos_token_id
+    return config
 def setup_model_for_question_answering(model: "PreTrainedModel") -> "PreTrainedModel":
@@ -1230,6 +1266,7 @@ def align_model_and_tokeniser(
     return model, tokeniser
+@cache_arguments()
 def task_group_to_class_name(task_group: TaskGroup) -> str:
     """Convert a task group to a class name.

EuroEval 16.3.0__py3-none-any.whl → 16.4.0__py3-none-any.whl

Potentially problematic release.

EuroEval 16.3.0__py3-none-any.whl → 16.4.0__py3-none-any.whl