EuroEval 15.4.2-py3-none-any.whl → 15.6.0-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Files changed (54)
  1. euroeval/__init__.py +2 -2
  2. euroeval/benchmark_modules/base.py +3 -2
  3. euroeval/benchmark_modules/fresh.py +8 -6
  4. euroeval/benchmark_modules/hf.py +44 -33
  5. euroeval/benchmark_modules/litellm.py +314 -120
  6. euroeval/benchmark_modules/vllm.py +99 -59
  7. euroeval/benchmarker.py +52 -21
  8. euroeval/callbacks.py +2 -2
  9. euroeval/constants.py +9 -2
  10. euroeval/data_models.py +258 -44
  11. euroeval/dataset_configs/__init__.py +61 -0
  12. euroeval/dataset_configs/danish.py +120 -0
  13. euroeval/dataset_configs/dutch.py +123 -0
  14. euroeval/dataset_configs/english.py +88 -0
  15. euroeval/dataset_configs/faroese.py +53 -0
  16. euroeval/dataset_configs/french.py +83 -0
  17. euroeval/dataset_configs/german.py +91 -0
  18. euroeval/dataset_configs/icelandic.py +148 -0
  19. euroeval/dataset_configs/italian.py +81 -0
  20. euroeval/dataset_configs/norwegian.py +178 -0
  21. euroeval/dataset_configs/spanish.py +78 -0
  22. euroeval/dataset_configs/swedish.py +100 -0
  23. euroeval/exceptions.py +10 -10
  24. euroeval/finetuning.py +6 -10
  25. euroeval/generation.py +1 -0
  26. euroeval/human_evaluation.py +2 -2
  27. euroeval/languages.py +20 -13
  28. euroeval/model_cache.py +1 -1
  29. euroeval/model_loading.py +1 -12
  30. euroeval/prompt_templates/__init__.py +8 -0
  31. euroeval/prompt_templates/linguistic_acceptability.py +112 -0
  32. euroeval/prompt_templates/multiple_choice.py +97 -0
  33. euroeval/prompt_templates/named_entity_recognition.py +257 -0
  34. euroeval/prompt_templates/reading_comprehension.py +118 -0
  35. euroeval/prompt_templates/sentiment_classification.py +137 -0
  36. euroeval/prompt_templates/summarization.py +97 -0
  37. euroeval/speed_benchmark.py +1 -1
  38. euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
  39. euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
  40. euroeval/{task_utils → task_group_utils}/sequence_classification.py +45 -10
  41. euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
  42. euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
  43. euroeval/tasks.py +54 -0
  44. euroeval/tokenization_utils.py +343 -0
  45. euroeval/types.py +3 -1
  46. euroeval/utils.py +5 -254
  47. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/METADATA +31 -9
  48. euroeval-15.6.0.dist-info/RECORD +59 -0
  49. euroeval/dataset_configs.py +0 -2408
  50. euroeval-15.4.2.dist-info/RECORD +0 -40
  51. /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
  52. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
  53. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
  54. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py CHANGED
@@ -4,6 +4,7 @@
  ### Block unwanted terminal output that happens on importing external modules ###
  
  import logging
+ import os
  import sys
  import warnings
  
@@ -14,7 +15,7 @@ warnings.filterwarnings("ignore", category=UserWarning)
  logging.getLogger("httpx").setLevel(logging.CRITICAL)
  logging.getLogger("datasets").setLevel(logging.CRITICAL)
  logging.getLogger("vllm").setLevel(logging.CRITICAL)
- logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
+ os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
  
  # Set up logging
  fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
@@ -29,7 +30,6 @@ logging.basicConfig(
  ### Set the rest up ###
  
  import importlib.metadata  # noqa: E402
- import os  # noqa: E402
  
  from dotenv import load_dotenv  # noqa: E402
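Note on the change above: VLLM_CONFIGURE_LOGGING is read by vLLM at import time, when it configures its loggers, which is why import os now happens in the output-blocking section at the top of the module rather than further down. A minimal sketch of the ordering constraint (illustrative, assuming vllm is installed; not EuroEval's exact code):

    import os

    # Must be set before the first `import vllm` anywhere in the process,
    # since vLLM configures its logging when the package is imported.
    os.environ["VLLM_CONFIGURE_LOGGING"] = "0"

    import vllm  # noqa: E402
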
euroeval/benchmark_modules/base.py CHANGED
@@ -10,7 +10,8 @@ from functools import cached_property, partial
  from datasets import DatasetDict
  from torch import nn
  from tqdm.auto import tqdm
- from transformers import PreTrainedTokenizer, Trainer
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.trainer import Trainer
  
  from ..data_models import (
      BenchmarkConfig,
@@ -21,7 +22,7 @@ from ..data_models import (
  )
  from ..enums import BatchingPreference, GenerativeType, TaskGroup
  from ..exceptions import NeedsEnvironmentVariable, NeedsExtraInstalled
- from ..task_utils import (
+ from ..task_group_utils import (
      question_answering,
      sequence_classification,
      text_to_text,
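
If any downstream code imported the old internal module path, the rename from euroeval.task_utils to euroeval.task_group_utils is a breaking change at the import level. A hedged compatibility shim (assuming the submodule contents are otherwise unchanged, as this diff suggests):

    try:
        # EuroEval 15.6.0 and later
        from euroeval.task_group_utils import question_answering
    except ImportError:
        # EuroEval 15.4.2 and earlier
        from euroeval.task_utils import question_answering
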
euroeval/benchmark_modules/fresh.py CHANGED
@@ -4,19 +4,21 @@ import os
  from functools import cached_property
  from json import JSONDecodeError
  
- from transformers import (
-     AutoConfig,
-     AutoTokenizer,
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.models.auto.configuration_auto import AutoConfig
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
+ from transformers.models.electra import (
      ElectraForQuestionAnswering,
      ElectraForSequenceClassification,
      ElectraForTokenClassification,
-     PretrainedConfig,
-     PreTrainedModel,
-     PreTrainedTokenizer,
+ )
+ from transformers.models.xlm_roberta import (
      XLMRobertaForQuestionAnswering,
      XLMRobertaForSequenceClassification,
      XLMRobertaForTokenClassification,
  )
+ from transformers.tokenization_utils import PreTrainedTokenizer
  
  from ..data_models import BenchmarkConfig, DatasetConfig, ModelConfig
  from ..enums import InferenceBackend, ModelType, TaskGroup
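
This hunk follows the same pattern as base.py above: names previously imported from the transformers top-level namespace are now imported from the submodules that define them. The top-level package resolves attributes lazily, so the two forms are equivalent at runtime; the submodule form is simply explicit about where each class lives. An illustrative before/after, using one of the names from this hunk:

    # Before: resolved through the lazy transformers top-level module
    from transformers import AutoTokenizer

    # After: imported from the defining submodule
    from transformers.models.auto.tokenization_auto import AutoTokenizer
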
euroeval/benchmark_modules/hf.py CHANGED
@@ -13,37 +13,36 @@ import torch
  from datasets import DatasetDict
  from huggingface_hub import HfApi
  from huggingface_hub import whoami as hf_whoami
- from huggingface_hub.hf_api import ModelInfo as HfApiModelInfo
- from huggingface_hub.hf_api import RepositoryNotFoundError, RevisionNotFoundError
- from huggingface_hub.utils import (
+ from huggingface_hub.errors import (
      GatedRepoError,
      HFValidationError,
      LocalTokenNotFoundError,
+     RepositoryNotFoundError,
+     RevisionNotFoundError,
  )
+ from huggingface_hub.hf_api import ModelInfo as HfApiModelInfo
  from peft import PeftConfig
  from requests.exceptions import RequestException
  from torch import nn
- from transformers import (
-     AutoConfig,
-     AutoTokenizer,
-     BatchEncoding,
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.data.data_collator import (
      DataCollatorForTokenClassification,
      DataCollatorWithPadding,
-     PretrainedConfig,
-     PreTrainedModel,
-     PreTrainedTokenizer,
-     Trainer,
  )
  from transformers.modelcard import TASK_MAPPING
- from transformers.models.auto.modeling_auto import (
-     MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
- )
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.models.auto.configuration_auto import AutoConfig
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.tokenization_utils_base import BatchEncoding
+ from transformers.trainer import Trainer
  from urllib3.exceptions import RequestError
  
  from ..constants import (
      DUMMY_FILL_VALUE,
      GENERATIVE_PIPELINE_TAGS,
      LOCAL_MODELS_REQUIRED_FILES,
+     MAX_CONTEXT_LENGTH,
      MERGE_TAGS,
  )
  from ..data_models import BenchmarkConfig, DatasetConfig, HFModelInfo, ModelConfig, Task
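
Both exception import paths in this hunk are real: newer huggingface_hub releases consolidate their exceptions in huggingface_hub.errors, while older ones exposed RepositoryNotFoundError and RevisionNotFoundError from huggingface_hub.hf_api. Code that has to straddle both library versions could use a shim along these lines (a sketch; the release itself simply requires the newer layout):

    try:
        # Newer huggingface_hub: consolidated errors module
        from huggingface_hub.errors import (
            RepositoryNotFoundError,
            RevisionNotFoundError,
        )
    except ImportError:
        # Older huggingface_hub: exceptions exposed from hf_api
        from huggingface_hub.hf_api import (
            RepositoryNotFoundError,
            RevisionNotFoundError,
        )
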
@@ -64,18 +63,17 @@ from ..exceptions import (
      NoInternetConnection,
  )
  from ..languages import get_all_languages
- from ..task_utils import (
+ from ..task_group_utils import (
      multiple_choice_classification,
      question_answering,
      token_classification,
  )
+ from ..tokenization_utils import get_bos_token, get_eos_token
  from ..types import ExtractLabelsFunction
  from ..utils import (
      block_terminal_output,
      create_model_cache_dir,
-     get_bos_token,
      get_class_by_name,
-     get_eos_token,
      internet_connection_available,
      log_once,
  )
@@ -245,6 +243,15 @@ class HuggingFaceEncoderModel(BenchmarkModule):
              max_length for max_length in all_max_lengths if max_length >= 128
          ]
  
+         # We remove the upper cap of maximum context length for the model, as it is
+         # highly unlikely that this is the model's actual maximum context length - we
+         # would rather not report a value than report an incorrect one.
+         all_max_lengths = [
+             max_length
+             for max_length in all_max_lengths
+             if max_length != MAX_CONTEXT_LENGTH
+         ]
+ 
          if len(list(all_max_lengths)) > 0:
              model_max_length = min(list(all_max_lengths))
          else:
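
Taken together with the existing >= 128 filter, a candidate length now only survives if it is plausibly real: at least 128 and not equal to the MAX_CONTEXT_LENGTH sentinel newly imported from euroeval/constants.py (presumably 5,000, matching the hard-coded cap that this release removes from align_model_and_tokenizer below). A worked toy example with made-up candidate values:

    MAX_CONTEXT_LENGTH = 5_000  # assumed value; see the constants.py change

    # Hypothetical values gathered from tokenizer and model config attributes
    all_max_lengths = [64, 512, 5_000]

    all_max_lengths = [m for m in all_max_lengths if m >= 128]
    # -> [512, 5_000]
    all_max_lengths = [m for m in all_max_lengths if m != MAX_CONTEXT_LENGTH]
    # -> [512]

    if all_max_lengths:
        model_max_length = min(all_max_lengths)  # -> 512
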
@@ -680,7 +687,7 @@ def load_model_and_tokenizer(
      assert model is not None, "The model should not be None."
  
      model.eval()
-     model.to(benchmark_config.device)
+     model.to(benchmark_config.device)  # type: ignore[arg-type]
  
      if (
          isinstance(model, PreTrainedModel)
@@ -787,12 +794,6 @@ def get_model_repo_info(
          tags += base_model_info.tags or list()
      tags = list(set(tags))
  
-     # TEMP: This extends the `TASK_MAPPING` dictionary to include the missing
-     # 'image-text-to-text' pipeline tag. This will be added as part of `TASK_MAPPING`
-     # when this PR has been merged in and published:
-     # https://github.com/huggingface/transformers/pull/37107
-     TASK_MAPPING["image-text-to-text"] = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
- 
      # Get the pipeline tag for the model. If it is not specified, then we determine it
      # by checking the model's architecture as written in the model's Hugging Face config
      pipeline_tag = model_info.pipeline_tag
@@ -814,7 +815,7 @@
      generative_class_names = [
          class_name
          for tag in GENERATIVE_PIPELINE_TAGS
-         for class_name in TASK_MAPPING.get(tag, dict()).values()
+         for class_name in TASK_MAPPING.get(tag, dict()).values()  # type: ignore[attr-defined]
      ]
      if class_names is not None and any(
          class_name in generative_class_names for class_name in class_names
@@ -1073,17 +1074,20 @@ def setup_model_for_question_answering(model: "PreTrainedModel") -> "PreTrainedModel":
      for attribute in attribute_list:
          token_type_embeddings = getattr(token_type_embeddings, attribute)
  
+     token_type_embedding_tensor = token_type_embeddings.weight.data
+     assert isinstance(token_type_embedding_tensor, torch.Tensor)
+ 
      # If the token type embeddings has shape (1, ...) then set the shape to
      # (2, ...) by randomly initializing the second token type embedding
-     if token_type_embeddings.weight.data.shape[0] == 1:
+     if token_type_embedding_tensor.shape[0] == 1:
          token_type_embeddings.weight.data = torch.cat(
              (
-                 token_type_embeddings.weight.data,
-                 torch.rand_like(token_type_embeddings.weight.data),
+                 token_type_embedding_tensor,
+                 torch.rand_like(token_type_embedding_tensor),
              ),
              dim=0,
          )
-         token_type_embeddings.num_embeddings = 2
+         token_type_embeddings.num_embeddings = 2  # type: ignore[assignment]
  
      # Set the model config to use the new type vocab size
      model.config.type_vocab_size = 2
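
The refactor above only introduces a local alias (plus an isinstance assertion) so the type checker knows the weights are a torch.Tensor; the behaviour is unchanged: a single-row token-type embedding is widened to two rows, with the second row randomly initialised. A self-contained toy version of the same manoeuvre:

    import torch
    from torch import nn

    # Toy single-type embedding, as in models trained without token type ids
    token_type_embeddings = nn.Embedding(num_embeddings=1, embedding_dim=8)

    tensor = token_type_embeddings.weight.data
    if tensor.shape[0] == 1:
        # Widen to two token types; the new second row is randomly initialised
        token_type_embeddings.weight.data = torch.cat(
            (tensor, torch.rand_like(tensor)), dim=0
        )
        token_type_embeddings.num_embeddings = 2

    assert token_type_embeddings.weight.shape == (2, 8)
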
@@ -1140,8 +1144,7 @@
      Returns:
          The fixed model and tokenizer.
      """
-     # Ensure that the model max length is at most 5,000, to avoid OOM errors
-     model_max_length = min(model_max_length, 5_000)
+     model_max_length = min(model_max_length, MAX_CONTEXT_LENGTH)
  
      if model_max_length > 0:
          tokenizer.model_max_length = model_max_length
@@ -1151,7 +1154,7 @@
      # Move the model to the CPU, since otherwise we can't catch the IndexErrors when
      # finding the maximum sequence length of the model
      model_device = model.device
-     model.to(torch.device("cpu"))
+     model.to(torch.device("cpu"))  # type: ignore[arg-type]
  
      # Manually check that this model max length is valid for the model, and adjust
      # otherwise
@@ -1173,8 +1176,16 @@
          except IndexError:
              continue
  
+         except ValueError as e:
+             # This happens when the model is using Triton, such as with ModernBERT,
+             # which doesn't work with CPU tensors at all
+             if "cpu tensor" in str(e):
+                 break
+             else:
+                 raise e
+ 
      # Move the model back to the original device
-     model.to(model_device)
+     model.to(model_device)  # type: ignore[arg-type]
  
      # If there is a mismatch between the vocab size according to the tokenizer and
      # the vocab size according to the model, we raise an error
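
For context: the enclosing loop probes candidate maximum lengths by pushing dummy inputs through the model on the CPU and treating an IndexError (position embeddings out of range) as "too long"; the new ValueError branch gives up on probing entirely for models whose Triton kernels reject CPU tensors, such as ModernBERT. A hedged sketch of the overall pattern — names and details are illustrative, not EuroEval's exact code:

    import torch

    def probe_max_length(model: torch.nn.Module, candidates: list[int]) -> int | None:
        # Illustrative only: find the largest candidate length the model accepts
        for max_length in sorted(candidates, reverse=True):
            dummy_input_ids = torch.ones(1, max_length, dtype=torch.long)
            try:
                with torch.inference_mode():
                    model(dummy_input_ids)
                return max_length
            except IndexError:
                # Position embeddings too short: try the next smaller length
                continue
            except ValueError as e:
                if "cpu tensor" in str(e):
                    break  # Triton-backed model: cannot probe on the CPU at all
                raise
        return None
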