EuroEval 15.4.2-py3-none-any.whl → 15.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +2 -2
- euroeval/benchmark_modules/base.py +3 -2
- euroeval/benchmark_modules/fresh.py +8 -6
- euroeval/benchmark_modules/hf.py +44 -33
- euroeval/benchmark_modules/litellm.py +314 -120
- euroeval/benchmark_modules/vllm.py +99 -59
- euroeval/benchmarker.py +52 -21
- euroeval/callbacks.py +2 -2
- euroeval/constants.py +9 -2
- euroeval/data_models.py +258 -44
- euroeval/dataset_configs/__init__.py +61 -0
- euroeval/dataset_configs/danish.py +120 -0
- euroeval/dataset_configs/dutch.py +123 -0
- euroeval/dataset_configs/english.py +88 -0
- euroeval/dataset_configs/faroese.py +53 -0
- euroeval/dataset_configs/french.py +83 -0
- euroeval/dataset_configs/german.py +91 -0
- euroeval/dataset_configs/icelandic.py +148 -0
- euroeval/dataset_configs/italian.py +81 -0
- euroeval/dataset_configs/norwegian.py +178 -0
- euroeval/dataset_configs/spanish.py +78 -0
- euroeval/dataset_configs/swedish.py +100 -0
- euroeval/exceptions.py +10 -10
- euroeval/finetuning.py +6 -10
- euroeval/generation.py +1 -0
- euroeval/human_evaluation.py +2 -2
- euroeval/languages.py +20 -13
- euroeval/model_cache.py +1 -1
- euroeval/model_loading.py +1 -12
- euroeval/prompt_templates/__init__.py +8 -0
- euroeval/prompt_templates/linguistic_acceptability.py +112 -0
- euroeval/prompt_templates/multiple_choice.py +97 -0
- euroeval/prompt_templates/named_entity_recognition.py +257 -0
- euroeval/prompt_templates/reading_comprehension.py +118 -0
- euroeval/prompt_templates/sentiment_classification.py +137 -0
- euroeval/prompt_templates/summarization.py +97 -0
- euroeval/speed_benchmark.py +1 -1
- euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
- euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
- euroeval/{task_utils → task_group_utils}/sequence_classification.py +45 -10
- euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
- euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
- euroeval/tasks.py +54 -0
- euroeval/tokenization_utils.py +343 -0
- euroeval/types.py +3 -1
- euroeval/utils.py +5 -254
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/METADATA +31 -9
- euroeval-15.6.0.dist-info/RECORD +59 -0
- euroeval/dataset_configs.py +0 -2408
- euroeval-15.4.2.dist-info/RECORD +0 -40
- /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
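The most visible structural change in this release is the removal of the 2,408-line monolithic euroeval/dataset_configs.py in favour of a euroeval/dataset_configs/ package with one module per language, together with a new euroeval/prompt_templates/ package, a new euroeval/tokenization_utils.py module, and the rename of task_utils to task_group_utils. A rough sketch of what the new layout looks like from the outside (only the module paths are confirmed by the file list above; the symbols each module exports are not visible in this diff):

# Hypothetical usage sketch - module paths taken from the file list, exports unverified.
from euroeval.dataset_configs import danish, swedish            # per-language dataset configs
from euroeval.prompt_templates import sentiment_classification  # per-task prompt templates
from euroeval.task_group_utils import question_answering        # formerly euroeval.task_utils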
euroeval/__init__.py
CHANGED
@@ -4,6 +4,7 @@
 ### Block unwanted terminal output that happens on importing external modules ###
 
 import logging
+import os
 import sys
 import warnings
 
@@ -14,7 +15,7 @@ warnings.filterwarnings("ignore", category=UserWarning)
 logging.getLogger("httpx").setLevel(logging.CRITICAL)
 logging.getLogger("datasets").setLevel(logging.CRITICAL)
 logging.getLogger("vllm").setLevel(logging.CRITICAL)
-
+os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
 
 # Set up logging
 fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
@@ -29,7 +30,6 @@ logging.basicConfig(
 ### Set the rest up ###
 
 import importlib.metadata  # noqa: E402
-import os  # noqa: E402
 
 from dotenv import load_dotenv  # noqa: E402
 
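The net effect of the __init__.py hunks is that import os now happens at the top of the module, so that VLLM_CONFIGURE_LOGGING can be exported next to the logger silencing rather than much later. A minimal sketch of the resulting ordering (the real module also sets up coloured log formatting and dotenv loading, omitted here):

import logging
import os

# Silence vLLM both through the logging module and through its environment
# switch; vLLM reads this variable when it is first imported, so it has to be
# set before any module that pulls in vLLM.
logging.getLogger("vllm").setLevel(logging.CRITICAL)
os.environ["VLLM_CONFIGURE_LOGGING"] = "0"

# Heavier imports only happen after the environment has been prepared.
import importlib.metadata  # noqa: E402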
euroeval/benchmark_modules/base.py
CHANGED
@@ -10,7 +10,8 @@ from functools import cached_property, partial
 from datasets import DatasetDict
 from torch import nn
 from tqdm.auto import tqdm
-from transformers import PreTrainedTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.trainer import Trainer
 
 from ..data_models import (
     BenchmarkConfig,
@@ -21,7 +22,7 @@ from ..data_models import (
 )
 from ..enums import BatchingPreference, GenerativeType, TaskGroup
 from ..exceptions import NeedsEnvironmentVariable, NeedsExtraInstalled
-from ..task_utils import (
+from ..task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,
euroeval/benchmark_modules/fresh.py
CHANGED
@@ -4,19 +4,21 @@ import os
 from functools import cached_property
 from json import JSONDecodeError
 
-from transformers import (
-    AutoConfig,
-    AutoTokenizer,
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.auto.configuration_auto import AutoConfig
+from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers.models.electra import (
     ElectraForQuestionAnswering,
     ElectraForSequenceClassification,
     ElectraForTokenClassification,
-    PretrainedConfig,
-    PreTrainedModel,
-    PreTrainedTokenizer,
+)
+from transformers.models.xlm_roberta import (
     XLMRobertaForQuestionAnswering,
     XLMRobertaForSequenceClassification,
     XLMRobertaForTokenClassification,
 )
+from transformers.tokenization_utils import PreTrainedTokenizer
 
 from ..data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 from ..enums import InferenceBackend, ModelType, TaskGroup
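The import changes in base.py, fresh.py and hf.py all follow one pattern: classes are imported from the concrete transformers and huggingface_hub submodules that define them rather than from the top-level packages. The diff does not state the motivation, but the # type: ignore comments added elsewhere in this release suggest it is aimed at static type checking. The pattern, shown with two imports that appear verbatim above:

# New style: import from the defining submodule.
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.trainer import Trainer

# Old style, now removed:
# from transformers import PreTrainedTokenizer, Trainer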
euroeval/benchmark_modules/hf.py
CHANGED
@@ -13,37 +13,36 @@ import torch
 from datasets import DatasetDict
 from huggingface_hub import HfApi
 from huggingface_hub import whoami as hf_whoami
-from huggingface_hub.
-from huggingface_hub.hf_api import RepositoryNotFoundError, RevisionNotFoundError
-from huggingface_hub.utils import (
+from huggingface_hub.errors import (
     GatedRepoError,
     HFValidationError,
     LocalTokenNotFoundError,
+    RepositoryNotFoundError,
+    RevisionNotFoundError,
 )
+from huggingface_hub.hf_api import ModelInfo as HfApiModelInfo
 from peft import PeftConfig
 from requests.exceptions import RequestException
 from torch import nn
-from transformers import (
-    AutoConfig,
-    AutoTokenizer,
-    BatchEncoding,
+from transformers.configuration_utils import PretrainedConfig
+from transformers.data.data_collator import (
     DataCollatorForTokenClassification,
     DataCollatorWithPadding,
-    PretrainedConfig,
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    Trainer,
 )
 from transformers.modelcard import TASK_MAPPING
-from transformers.models.auto.modeling_auto import (
-    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
-)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.auto.configuration_auto import AutoConfig
+from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import BatchEncoding
+from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError
 
 from ..constants import (
     DUMMY_FILL_VALUE,
     GENERATIVE_PIPELINE_TAGS,
     LOCAL_MODELS_REQUIRED_FILES,
+    MAX_CONTEXT_LENGTH,
     MERGE_TAGS,
 )
 from ..data_models import BenchmarkConfig, DatasetConfig, HFModelInfo, ModelConfig, Task
@@ -64,18 +63,17 @@ from ..exceptions import (
     NoInternetConnection,
 )
 from ..languages import get_all_languages
-from ..task_utils import (
+from ..task_group_utils import (
     multiple_choice_classification,
     question_answering,
     token_classification,
 )
+from ..tokenization_utils import get_bos_token, get_eos_token
 from ..types import ExtractLabelsFunction
 from ..utils import (
     block_terminal_output,
     create_model_cache_dir,
-    get_bos_token,
     get_class_by_name,
-    get_eos_token,
     internet_connection_available,
     log_once,
 )
@@ -245,6 +243,15 @@ class HuggingFaceEncoderModel(BenchmarkModule):
             max_length for max_length in all_max_lengths if max_length >= 128
         ]
 
+        # We remove the upper cap of maximum context length for the model, as it is
+        # highly unlikely that this is the model's actual maximum context length - we
+        # would rather not report a value than report an incorrect one.
+        all_max_lengths = [
+            max_length
+            for max_length in all_max_lengths
+            if max_length != MAX_CONTEXT_LENGTH
+        ]
+
         if len(list(all_max_lengths)) > 0:
             model_max_length = min(list(all_max_lengths))
         else:
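The block added in this hunk drops every candidate that equals the MAX_CONTEXT_LENGTH cap before the minimum is taken, so a value that merely hits the cap is left unreported rather than claimed as the model's context length. A self-contained sketch of the selection logic (the constant's value below is a placeholder; the real one lives in euroeval/constants.py and is not shown in this diff):

MAX_CONTEXT_LENGTH = 100_000  # placeholder value for illustration only


def pick_model_max_length(all_max_lengths: list[int]) -> int | None:
    # Discard implausibly small candidates, mirroring the existing >= 128 filter...
    candidates = [length for length in all_max_lengths if length >= 128]
    # ...and candidates that only reflect the upper cap rather than a real limit.
    candidates = [length for length in candidates if length != MAX_CONTEXT_LENGTH]
    return min(candidates) if candidates else None


print(pick_model_max_length([512, 8_192, 100_000]))  # -> 512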
@@ -680,7 +687,7 @@ def load_model_and_tokenizer(
     assert model is not None, "The model should not be None."
 
     model.eval()
-    model.to(benchmark_config.device)
+    model.to(benchmark_config.device)  # type: ignore[arg-type]
 
     if (
         isinstance(model, PreTrainedModel)
@@ -787,12 +794,6 @@ def get_model_repo_info(
         tags += base_model_info.tags or list()
         tags = list(set(tags))
 
-    # TEMP: This extends the `TASK_MAPPING` dictionary to include the missing
-    # 'image-text-to-text' pipeline tag. This will be added as part of `TASK_MAPPING`
-    # when this PR has been merged in and published:
-    # https://github.com/huggingface/transformers/pull/37107
-    TASK_MAPPING["image-text-to-text"] = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
-
     # Get the pipeline tag for the model. If it is not specified, then we determine it
     # by checking the model's architecture as written in the model's Hugging Face config
     pipeline_tag = model_info.pipeline_tag
@@ -814,7 +815,7 @@ def get_model_repo_info(
         generative_class_names = [
             class_name
             for tag in GENERATIVE_PIPELINE_TAGS
-            for class_name in TASK_MAPPING.get(tag, dict()).values()
+            for class_name in TASK_MAPPING.get(tag, dict()).values()  # type: ignore[attr-defined]
         ]
         if class_names is not None and any(
             class_name in generative_class_names for class_name in class_names
@@ -1073,17 +1074,20 @@ def setup_model_for_question_answering(model: "PreTrainedModel") -> "PreTrainedModel":
     for attribute in attribute_list:
         token_type_embeddings = getattr(token_type_embeddings, attribute)
 
+    token_type_embedding_tensor = token_type_embeddings.weight.data
+    assert isinstance(token_type_embedding_tensor, torch.Tensor)
+
     # If the token type embeddings has shape (1, ...) then set the shape to
     # (2, ...) by randomly initializing the second token type embedding
-    if token_type_embeddings.weight.data.shape[0] == 1:
+    if token_type_embedding_tensor.shape[0] == 1:
         token_type_embeddings.weight.data = torch.cat(
             (
-                token_type_embeddings.weight.data,
-                torch.rand_like(token_type_embeddings.weight.data),
+                token_type_embedding_tensor,
+                torch.rand_like(token_type_embedding_tensor),
             ),
             dim=0,
         )
-        token_type_embeddings.num_embeddings = 2
+        token_type_embeddings.num_embeddings = 2  # type: ignore[assignment]
 
     # Set the model config to use the new type vocab size
     model.config.type_vocab_size = 2
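The setup_model_for_question_answering hunk is mostly a typing refactor (the embedding tensor is hoisted into a local variable and asserted to be a torch.Tensor), but the underlying trick deserves a spelled-out example: encoder models that ship with a single token-type embedding row get a second, randomly initialised row, so that the question and context segments used in extractive QA can be told apart. A runnable sketch of just that expansion:

import torch
from torch import nn

# A toy token-type embedding with a single row, as found in some encoders.
token_type_embeddings = nn.Embedding(num_embeddings=1, embedding_dim=8)

tensor = token_type_embeddings.weight.data
if tensor.shape[0] == 1:
    # Append a randomly initialised second row, giving shape (2, embedding_dim).
    token_type_embeddings.weight.data = torch.cat(
        (tensor, torch.rand_like(tensor)), dim=0
    )
    token_type_embeddings.num_embeddings = 2

print(token_type_embeddings.weight.data.shape)  # torch.Size([2, 8])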
@@ -1140,8 +1144,7 @@ def align_model_and_tokenizer(
     Returns:
         The fixed model and tokenizer.
     """
-
-    model_max_length = min(model_max_length, 5_000)
+    model_max_length = min(model_max_length, MAX_CONTEXT_LENGTH)
 
     if model_max_length > 0:
         tokenizer.model_max_length = model_max_length
@@ -1151,7 +1154,7 @@ def align_model_and_tokenizer(
     # Move the model to the CPU, since otherwise we can't catch the IndexErrors when
     # finding the maximum sequence length of the model
     model_device = model.device
-    model.to(torch.device("cpu"))
+    model.to(torch.device("cpu"))  # type: ignore[arg-type]
 
     # Manually check that this model max length is valid for the model, and adjust
     # otherwise
@@ -1173,8 +1176,16 @@ def align_model_and_tokenizer(
         except IndexError:
             continue
 
+        except ValueError as e:
+            # This happens when the model is using Triton, such as with ModernBERT,
+            # which doesn't work with CPU tensors at all
+            if "cpu tensor" in str(e):
+                break
+            else:
+                raise e
+
     # Move the model back to the original device
-    model.to(model_device)
+    model.to(model_device)  # type: ignore[arg-type]
 
     # If there is a mismatch between the vocab size according to the tokenizer and
     # the vocab size according to the model, we raise an error