EuroEval 15.4.2-py3-none-any.whl → 15.6.0-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Files changed (54)
  1. euroeval/__init__.py +2 -2
  2. euroeval/benchmark_modules/base.py +3 -2
  3. euroeval/benchmark_modules/fresh.py +8 -6
  4. euroeval/benchmark_modules/hf.py +44 -33
  5. euroeval/benchmark_modules/litellm.py +314 -120
  6. euroeval/benchmark_modules/vllm.py +99 -59
  7. euroeval/benchmarker.py +52 -21
  8. euroeval/callbacks.py +2 -2
  9. euroeval/constants.py +9 -2
  10. euroeval/data_models.py +258 -44
  11. euroeval/dataset_configs/__init__.py +61 -0
  12. euroeval/dataset_configs/danish.py +120 -0
  13. euroeval/dataset_configs/dutch.py +123 -0
  14. euroeval/dataset_configs/english.py +88 -0
  15. euroeval/dataset_configs/faroese.py +53 -0
  16. euroeval/dataset_configs/french.py +83 -0
  17. euroeval/dataset_configs/german.py +91 -0
  18. euroeval/dataset_configs/icelandic.py +148 -0
  19. euroeval/dataset_configs/italian.py +81 -0
  20. euroeval/dataset_configs/norwegian.py +178 -0
  21. euroeval/dataset_configs/spanish.py +78 -0
  22. euroeval/dataset_configs/swedish.py +100 -0
  23. euroeval/exceptions.py +10 -10
  24. euroeval/finetuning.py +6 -10
  25. euroeval/generation.py +1 -0
  26. euroeval/human_evaluation.py +2 -2
  27. euroeval/languages.py +20 -13
  28. euroeval/model_cache.py +1 -1
  29. euroeval/model_loading.py +1 -12
  30. euroeval/prompt_templates/__init__.py +8 -0
  31. euroeval/prompt_templates/linguistic_acceptability.py +112 -0
  32. euroeval/prompt_templates/multiple_choice.py +97 -0
  33. euroeval/prompt_templates/named_entity_recognition.py +257 -0
  34. euroeval/prompt_templates/reading_comprehension.py +118 -0
  35. euroeval/prompt_templates/sentiment_classification.py +137 -0
  36. euroeval/prompt_templates/summarization.py +97 -0
  37. euroeval/speed_benchmark.py +1 -1
  38. euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
  39. euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
  40. euroeval/{task_utils → task_group_utils}/sequence_classification.py +45 -10
  41. euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
  42. euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
  43. euroeval/tasks.py +54 -0
  44. euroeval/tokenization_utils.py +343 -0
  45. euroeval/types.py +3 -1
  46. euroeval/utils.py +5 -254
  47. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/METADATA +31 -9
  48. euroeval-15.6.0.dist-info/RECORD +59 -0
  49. euroeval/dataset_configs.py +0 -2408
  50. euroeval-15.4.2.dist-info/RECORD +0 -40
  51. /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
  52. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
  53. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
  54. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py CHANGED
@@ -4,6 +4,7 @@
  ### Block unwanted terminal output that happens on importing external modules ###
  
  import logging
+ import os
  import sys
  import warnings
  
@@ -14,7 +15,7 @@ warnings.filterwarnings("ignore", category=UserWarning)
  logging.getLogger("httpx").setLevel(logging.CRITICAL)
  logging.getLogger("datasets").setLevel(logging.CRITICAL)
  logging.getLogger("vllm").setLevel(logging.CRITICAL)
- logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
+ os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
  
  # Set up logging
  fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
@@ -29,7 +30,6 @@ logging.basicConfig(
  ### Set the rest up ###
  
  import importlib.metadata  # noqa: E402
- import os  # noqa: E402
  
  from dotenv import load_dotenv  # noqa: E402
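Note on the change above: VLLM_CONFIGURE_LOGGING is read by vLLM at import time, when it configures its loggers, which is why import os now happens in the output-blocking section at the top of the module rather than further down. A minimal sketch of the ordering constraint (illustrative, assuming vllm is installed; not EuroEval's exact code):

    import os

    # Must be set before the first `import vllm` anywhere in the process,
    # since vLLM configures its logging when the package is imported.
    os.environ["VLLM_CONFIGURE_LOGGING"] = "0"

    import vllm  # noqa: E402
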
euroeval/benchmark_modules/base.py CHANGED
@@ -10,7 +10,8 @@ from functools import cached_property, partial
  from datasets import DatasetDict
  from torch import nn
  from tqdm.auto import tqdm
- from transformers import PreTrainedTokenizer, Trainer
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.trainer import Trainer
  
  from ..data_models import (
      BenchmarkConfig,
@@ -21,7 +22,7 @@ from ..data_models import (
  )
  from ..enums import BatchingPreference, GenerativeType, TaskGroup
  from ..exceptions import NeedsEnvironmentVariable, NeedsExtraInstalled
- from ..task_utils import (
+ from ..task_group_utils import (
      question_answering,
      sequence_classification,
      text_to_text,
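
If any downstream code imported the old internal module path, the rename from euroeval.task_utils to euroeval.task_group_utils is a breaking change at the import level. A hedged compatibility shim (assuming the submodule contents are otherwise unchanged, as this diff suggests):

    try:
        # EuroEval 15.6.0 and later
        from euroeval.task_group_utils import question_answering
    except ImportError:
        # EuroEval 15.4.2 and earlier
        from euroeval.task_utils import question_answering
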
euroeval/benchmark_modules/fresh.py CHANGED
@@ -4,19 +4,21 @@ import os
  from functools import cached_property
  from json import JSONDecodeError
  
- from transformers import (
-     AutoConfig,
-     AutoTokenizer,
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.models.auto.configuration_auto import AutoConfig
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
+ from transformers.models.electra import (
      ElectraForQuestionAnswering,
      ElectraForSequenceClassification,
      ElectraForTokenClassification,
-     PretrainedConfig,
-     PreTrainedModel,
-     PreTrainedTokenizer,
+ )
+ from transformers.models.xlm_roberta import (
      XLMRobertaForQuestionAnswering,
      XLMRobertaForSequenceClassification,
      XLMRobertaForTokenClassification,
  )
+ from transformers.tokenization_utils import PreTrainedTokenizer
  
  from ..data_models import BenchmarkConfig, DatasetConfig, ModelConfig
  from ..enums import InferenceBackend, ModelType, TaskGroup
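
This hunk follows the same pattern as base.py above: names previously imported from the transformers top-level namespace are now imported from the submodules that define them. The top-level package resolves attributes lazily, so the two forms are equivalent at runtime; the submodule form is simply explicit about where each class lives. An illustrative before/after, using one of the names from this hunk:

    # Before: resolved through the lazy transformers top-level module
    from transformers import AutoTokenizer

    # After: imported from the defining submodule
    from transformers.models.auto.tokenization_auto import AutoTokenizer
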
euroeval/benchmark_modules/hf.py CHANGED
@@ -13,37 +13,36 @@ import torch
  from datasets import DatasetDict
  from huggingface_hub import HfApi
  from huggingface_hub import whoami as hf_whoami
- from huggingface_hub.hf_api import ModelInfo as HfApiModelInfo
- from huggingface_hub.hf_api import RepositoryNotFoundError, RevisionNotFoundError
- from huggingface_hub.utils import (
+ from huggingface_hub.errors import (
      GatedRepoError,
      HFValidationError,
      LocalTokenNotFoundError,
+     RepositoryNotFoundError,
+     RevisionNotFoundError,
  )
+ from huggingface_hub.hf_api import ModelInfo as HfApiModelInfo
  from peft import PeftConfig
  from requests.exceptions import RequestException
  from torch import nn
- from transformers import (
-     AutoConfig,
-     AutoTokenizer,
-     BatchEncoding,
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.data.data_collator import (
      DataCollatorForTokenClassification,
      DataCollatorWithPadding,
-     PretrainedConfig,
-     PreTrainedModel,
-     PreTrainedTokenizer,
-     Trainer,
  )
  from transformers.modelcard import TASK_MAPPING
- from transformers.models.auto.modeling_auto import (
-     MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
- )
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.models.auto.configuration_auto import AutoConfig
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.tokenization_utils_base import BatchEncoding
+ from transformers.trainer import Trainer
  from urllib3.exceptions import RequestError
  
  from ..constants import (
      DUMMY_FILL_VALUE,
      GENERATIVE_PIPELINE_TAGS,
      LOCAL_MODELS_REQUIRED_FILES,
+     MAX_CONTEXT_LENGTH,
      MERGE_TAGS,
  )
  from ..data_models import BenchmarkConfig, DatasetConfig, HFModelInfo, ModelConfig, Task
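
Both exception import paths in this hunk are real: newer huggingface_hub releases consolidate their exceptions in huggingface_hub.errors, while older ones exposed RepositoryNotFoundError and RevisionNotFoundError from huggingface_hub.hf_api. Code that has to straddle both library versions could use a shim along these lines (a sketch; the release itself simply requires the newer layout):

    try:
        # Newer huggingface_hub: consolidated errors module
        from huggingface_hub.errors import (
            RepositoryNotFoundError,
            RevisionNotFoundError,
        )
    except ImportError:
        # Older huggingface_hub: exceptions exposed from hf_api
        from huggingface_hub.hf_api import (
            RepositoryNotFoundError,
            RevisionNotFoundError,
        )
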
@@ -64,18 +63,17 @@ from ..exceptions import (
      NoInternetConnection,
  )
  from ..languages import get_all_languages
- from ..task_utils import (
+ from ..task_group_utils import (
      multiple_choice_classification,
      question_answering,
      token_classification,
  )
+ from ..tokenization_utils import get_bos_token, get_eos_token
  from ..types import ExtractLabelsFunction
  from ..utils import (
      block_terminal_output,
      create_model_cache_dir,
-     get_bos_token,
      get_class_by_name,
-     get_eos_token,
      internet_connection_available,
      log_once,
  )
@@ -245,6 +243,15 @@ class HuggingFaceEncoderModel(BenchmarkModule):
              max_length for max_length in all_max_lengths if max_length >= 128
          ]
  
+         # We remove the upper cap of maximum context length for the model, as it is
+         # highly unlikely that this is the model's actual maximum context length - we
+         # would rather not report a value than report an incorrect one.
+         all_max_lengths = [
+             max_length
+             for max_length in all_max_lengths
+             if max_length != MAX_CONTEXT_LENGTH
+         ]
+ 
          if len(list(all_max_lengths)) > 0:
              model_max_length = min(list(all_max_lengths))
          else:
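
Taken together with the existing >= 128 filter, a candidate length now only survives if it is plausibly real: at least 128 and not equal to the MAX_CONTEXT_LENGTH sentinel newly imported from euroeval/constants.py (presumably 5,000, matching the hard-coded cap that this release removes from align_model_and_tokenizer below). A worked toy example with made-up candidate values:

    MAX_CONTEXT_LENGTH = 5_000  # assumed value; see the constants.py change

    # Hypothetical values gathered from tokenizer and model config attributes
    all_max_lengths = [64, 512, 5_000]

    all_max_lengths = [m for m in all_max_lengths if m >= 128]
    # -> [512, 5_000]
    all_max_lengths = [m for m in all_max_lengths if m != MAX_CONTEXT_LENGTH]
    # -> [512]

    if all_max_lengths:
        model_max_length = min(all_max_lengths)  # -> 512
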
@@ -680,7 +687,7 @@ def load_model_and_tokenizer(
      assert model is not None, "The model should not be None."
  
      model.eval()
-     model.to(benchmark_config.device)
+     model.to(benchmark_config.device)  # type: ignore[arg-type]
  
      if (
          isinstance(model, PreTrainedModel)
@@ -787,12 +794,6 @@ def get_model_repo_info(
          tags += base_model_info.tags or list()
      tags = list(set(tags))
  
-     # TEMP: This extends the `TASK_MAPPING` dictionary to include the missing
-     # 'image-text-to-text' pipeline tag. This will be added as part of `TASK_MAPPING`
-     # when this PR has been merged in and published:
-     # https://github.com/huggingface/transformers/pull/37107
-     TASK_MAPPING["image-text-to-text"] = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
- 
      # Get the pipeline tag for the model. If it is not specified, then we determine it
      # by checking the model's architecture as written in the model's Hugging Face config
      pipeline_tag = model_info.pipeline_tag
@@ -814,7 +815,7 @@
      generative_class_names = [
          class_name
          for tag in GENERATIVE_PIPELINE_TAGS
-         for class_name in TASK_MAPPING.get(tag, dict()).values()
+         for class_name in TASK_MAPPING.get(tag, dict()).values()  # type: ignore[attr-defined]
      ]
      if class_names is not None and any(
          class_name in generative_class_names for class_name in class_names
@@ -1073,17 +1074,20 @@ def setup_model_for_question_answering(model: "PreTrainedModel") -> "PreTrainedModel":
      for attribute in attribute_list:
          token_type_embeddings = getattr(token_type_embeddings, attribute)
  
+     token_type_embedding_tensor = token_type_embeddings.weight.data
+     assert isinstance(token_type_embedding_tensor, torch.Tensor)
+ 
      # If the token type embeddings has shape (1, ...) then set the shape to
      # (2, ...) by randomly initializing the second token type embedding
-     if token_type_embeddings.weight.data.shape[0] == 1:
+     if token_type_embedding_tensor.shape[0] == 1:
          token_type_embeddings.weight.data = torch.cat(
              (
-                 token_type_embeddings.weight.data,
-                 torch.rand_like(token_type_embeddings.weight.data),
+                 token_type_embedding_tensor,
+                 torch.rand_like(token_type_embedding_tensor),
              ),
              dim=0,
          )
-         token_type_embeddings.num_embeddings = 2
+         token_type_embeddings.num_embeddings = 2  # type: ignore[assignment]
  
      # Set the model config to use the new type vocab size
      model.config.type_vocab_size = 2
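
The refactor above only introduces a local alias (plus an isinstance assertion) so the type checker knows the weights are a torch.Tensor; the behaviour is unchanged: a single-row token-type embedding is widened to two rows, with the second row randomly initialised. A self-contained toy version of the same manoeuvre:

    import torch
    from torch import nn

    # Toy single-type embedding, as in models trained without token type ids
    token_type_embeddings = nn.Embedding(num_embeddings=1, embedding_dim=8)

    tensor = token_type_embeddings.weight.data
    if tensor.shape[0] == 1:
        # Widen to two token types; the new second row is randomly initialised
        token_type_embeddings.weight.data = torch.cat(
            (tensor, torch.rand_like(tensor)), dim=0
        )
        token_type_embeddings.num_embeddings = 2

    assert token_type_embeddings.weight.shape == (2, 8)
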
@@ -1140,8 +1144,7 @@
      Returns:
          The fixed model and tokenizer.
      """
-     # Ensure that the model max length is at most 5,000, to avoid OOM errors
-     model_max_length = min(model_max_length, 5_000)
+     model_max_length = min(model_max_length, MAX_CONTEXT_LENGTH)
  
      if model_max_length > 0:
          tokenizer.model_max_length = model_max_length
@@ -1151,7 +1154,7 @@
      # Move the model to the CPU, since otherwise we can't catch the IndexErrors when
      # finding the maximum sequence length of the model
      model_device = model.device
-     model.to(torch.device("cpu"))
+     model.to(torch.device("cpu"))  # type: ignore[arg-type]
  
      # Manually check that this model max length is valid for the model, and adjust
      # otherwise
@@ -1173,8 +1176,16 @@
          except IndexError:
              continue
  
+         except ValueError as e:
+             # This happens when the model is using Triton, such as with ModernBERT,
+             # which doesn't work with CPU tensors at all
+             if "cpu tensor" in str(e):
+                 break
+             else:
+                 raise e
+ 
      # Move the model back to the original device
-     model.to(model_device)
+     model.to(model_device)  # type: ignore[arg-type]
  
      # If there is a mismatch between the vocab size according to the tokenizer and
      # the vocab size according to the model, we raise an error
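
For context: the enclosing loop probes candidate maximum lengths by pushing dummy inputs through the model on the CPU and treating an IndexError (position embeddings out of range) as "too long"; the new ValueError branch gives up on probing entirely for models whose Triton kernels reject CPU tensors, such as ModernBERT. A hedged sketch of the overall pattern — names and details are illustrative, not EuroEval's exact code:

    import torch

    def probe_max_length(model: torch.nn.Module, candidates: list[int]) -> int | None:
        # Illustrative only: find the largest candidate length the model accepts
        for max_length in sorted(candidates, reverse=True):
            dummy_input_ids = torch.ones(1, max_length, dtype=torch.long)
            try:
                with torch.inference_mode():
                    model(dummy_input_ids)
                return max_length
            except IndexError:
                # Position embeddings too short: try the next smaller length
                continue
            except ValueError as e:
                if "cpu tensor" in str(e):
                    break  # Triton-backed model: cannot probe on the CPU at all
                raise
        return None
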