EuroEval 16.0.1__py3-none-any.whl → 16.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic. Click here for more details.
- euroeval/benchmark_config_factory.py +6 -1
- euroeval/benchmark_modules/base.py +2 -0
- euroeval/benchmark_modules/fresh.py +7 -1
- euroeval/benchmark_modules/hf.py +26 -21
- euroeval/benchmark_modules/litellm.py +258 -131
- euroeval/benchmark_modules/vllm.py +79 -40
- euroeval/benchmarker.py +11 -2
- euroeval/cli.py +14 -1
- euroeval/constants.py +1 -1
- euroeval/data_models.py +77 -6
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +14 -0
- euroeval/dataset_configs/dutch.py +14 -0
- euroeval/dataset_configs/english.py +22 -0
- euroeval/dataset_configs/estonian.py +15 -7
- euroeval/dataset_configs/finnish.py +14 -0
- euroeval/dataset_configs/french.py +14 -0
- euroeval/dataset_configs/german.py +23 -0
- euroeval/dataset_configs/italian.py +14 -0
- euroeval/dataset_configs/latvian.py +14 -0
- euroeval/dataset_configs/norwegian.py +14 -0
- euroeval/dataset_configs/polish.py +126 -0
- euroeval/dataset_configs/portuguese.py +14 -0
- euroeval/dataset_configs/spanish.py +14 -0
- euroeval/dataset_configs/swedish.py +25 -0
- euroeval/enums.py +12 -0
- euroeval/generation.py +17 -8
- euroeval/generation_utils.py +65 -11
- euroeval/metrics/pipeline.py +1 -1
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +27 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +11 -0
- euroeval/prompt_templates/sentiment_classification.py +15 -0
- euroeval/prompt_templates/summarization.py +27 -1
- euroeval/scores.py +5 -0
- euroeval/task_group_utils/question_answering.py +29 -29
- euroeval/task_group_utils/sequence_classification.py +11 -34
- euroeval/task_group_utils/token_classification.py +3 -3
- euroeval/tasks.py +4 -4
- euroeval/{tokenization_utils.py → tokenisation_utils.py} +50 -28
- euroeval/utils.py +36 -3
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/METADATA +1 -1
- euroeval-16.1.1.dist-info/RECORD +70 -0
- euroeval-16.0.1.dist-info/RECORD +0 -69
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/WHEEL +0 -0
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/entry_points.txt +0 -0
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -8,7 +8,7 @@ import torch
|
|
|
8
8
|
|
|
9
9
|
from .data_models import BenchmarkConfig
|
|
10
10
|
from .dataset_configs import get_all_dataset_configs
|
|
11
|
-
from .enums import Device
|
|
11
|
+
from .enums import Device, GenerativeType
|
|
12
12
|
from .exceptions import InvalidBenchmark
|
|
13
13
|
from .languages import get_all_languages
|
|
14
14
|
from .tasks import SPEED, get_all_tasks
|
|
@@ -43,6 +43,7 @@ def build_benchmark_config(
|
|
|
43
43
|
api_base: str | None,
|
|
44
44
|
api_version: str | None,
|
|
45
45
|
gpu_memory_utilization: float,
|
|
46
|
+
generative_type: GenerativeType | None,
|
|
46
47
|
debug: bool,
|
|
47
48
|
run_with_cli: bool,
|
|
48
49
|
requires_safetensors: bool,
|
|
@@ -107,6 +108,9 @@ def build_benchmark_config(
|
|
|
107
108
|
faster evaluation, but at the risk of running out of GPU memory. Only reduce
|
|
108
109
|
this if you are running out of GPU memory. Only relevant if the model is
|
|
109
110
|
generative.
|
|
111
|
+
generative_type:
|
|
112
|
+
The type of generative model. Only relevant if the model is generative. If
|
|
113
|
+
not specified, the type will be inferred automatically.
|
|
110
114
|
debug:
|
|
111
115
|
Whether to run the benchmark in debug mode.
|
|
112
116
|
run_with_cli:
|
|
@@ -157,6 +161,7 @@ def build_benchmark_config(
|
|
|
157
161
|
api_base=api_base,
|
|
158
162
|
api_version=api_version,
|
|
159
163
|
gpu_memory_utilization=gpu_memory_utilization,
|
|
164
|
+
generative_type=generative_type,
|
|
160
165
|
debug=debug,
|
|
161
166
|
run_with_cli=run_with_cli,
|
|
162
167
|
requires_safetensors=requires_safetensors,
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
import collections.abc as c
|
|
4
4
|
import logging
|
|
5
|
+
import re
|
|
5
6
|
import sys
|
|
6
7
|
import typing as t
|
|
7
8
|
from abc import ABC, abstractmethod
|
|
@@ -55,6 +56,7 @@ class BenchmarkModule(ABC):
|
|
|
55
56
|
fresh_model: bool
|
|
56
57
|
batching_preference: "BatchingPreference"
|
|
57
58
|
high_priority: bool
|
|
59
|
+
allowed_params: dict[re.Pattern, list[str]] = {re.compile(r".*"): []}
|
|
58
60
|
|
|
59
61
|
def __init__(
|
|
60
62
|
self,
|
|
@@ -25,6 +25,7 @@ from ..exceptions import (
|
|
|
25
25
|
NeedsEnvironmentVariable,
|
|
26
26
|
NeedsExtraInstalled,
|
|
27
27
|
)
|
|
28
|
+
from ..generation_utils import raise_if_wrong_params
|
|
28
29
|
from ..utils import block_terminal_output, create_model_cache_dir, get_hf_token
|
|
29
30
|
from .hf import (
|
|
30
31
|
HuggingFaceEncoderModel,
|
|
@@ -64,6 +65,10 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
|
|
|
64
65
|
log_metadata:
|
|
65
66
|
Whether to log metadata about the model and the benchmark.
|
|
66
67
|
"""
|
|
68
|
+
raise_if_wrong_params(
|
|
69
|
+
model_config=model_config, allowed_params=self.allowed_params
|
|
70
|
+
)
|
|
71
|
+
|
|
67
72
|
# This is already set when calling `super.__init__`, but we need it to get a
|
|
68
73
|
# value from `self.model_max_length`, so we set it here as well.
|
|
69
74
|
self.model_config = model_config
|
|
@@ -183,9 +188,10 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
|
|
|
183
188
|
"""
|
|
184
189
|
return ModelConfig(
|
|
185
190
|
model_id=model_id,
|
|
191
|
+
revision="main",
|
|
192
|
+
param=None,
|
|
186
193
|
task="fill-mask",
|
|
187
194
|
languages=list(),
|
|
188
|
-
revision="main",
|
|
189
195
|
merge=False,
|
|
190
196
|
inference_backend=InferenceBackend.TRANSFORMERS,
|
|
191
197
|
model_type=ModelType.ENCODER,
|
euroeval/benchmark_modules/hf.py
CHANGED
|
@@ -14,6 +14,7 @@ from huggingface_hub import HfApi
|
|
|
14
14
|
from huggingface_hub import whoami as hf_whoami
|
|
15
15
|
from huggingface_hub.errors import (
|
|
16
16
|
GatedRepoError,
|
|
17
|
+
HfHubHTTPError,
|
|
17
18
|
HFValidationError,
|
|
18
19
|
LocalTokenNotFoundError,
|
|
19
20
|
RepositoryNotFoundError,
|
|
@@ -56,13 +57,14 @@ from ..exceptions import (
|
|
|
56
57
|
NeedsEnvironmentVariable,
|
|
57
58
|
NeedsExtraInstalled,
|
|
58
59
|
)
|
|
60
|
+
from ..generation_utils import raise_if_wrong_params
|
|
59
61
|
from ..languages import get_all_languages
|
|
60
62
|
from ..task_group_utils import (
|
|
61
63
|
multiple_choice_classification,
|
|
62
64
|
question_answering,
|
|
63
65
|
token_classification,
|
|
64
66
|
)
|
|
65
|
-
from ..tokenization_utils import get_bos_token, get_eos_token
|
|
67
|
+
from ..tokenisation_utils import get_bos_token, get_eos_token
|
|
66
68
|
from ..utils import (
|
|
67
69
|
block_terminal_output,
|
|
68
70
|
create_model_cache_dir,
|
|
@@ -70,6 +72,7 @@ from ..utils import (
|
|
|
70
72
|
get_hf_token,
|
|
71
73
|
internet_connection_available,
|
|
72
74
|
log_once,
|
|
75
|
+
split_model_id,
|
|
73
76
|
)
|
|
74
77
|
from .base import BenchmarkModule
|
|
75
78
|
|
|
@@ -110,6 +113,10 @@ class HuggingFaceEncoderModel(BenchmarkModule):
|
|
|
110
113
|
log_metadata:
|
|
111
114
|
Whether to log the model metadata.
|
|
112
115
|
"""
|
|
116
|
+
raise_if_wrong_params(
|
|
117
|
+
model_config=model_config, allowed_params=self.allowed_params
|
|
118
|
+
)
|
|
119
|
+
|
|
113
120
|
model, tokeniser = load_model_and_tokeniser(
|
|
114
121
|
model_config=model_config,
|
|
115
122
|
dataset_config=dataset_config,
|
|
@@ -247,15 +254,6 @@ class HuggingFaceEncoderModel(BenchmarkModule):
|
|
|
247
254
|
max_length for max_length in all_max_lengths if max_length >= 128
|
|
248
255
|
]
|
|
249
256
|
|
|
250
|
-
# We remove the upper cap of maximum context length for the model, as it is
|
|
251
|
-
# highly unlikely that this is the model's actual maximum context length - we
|
|
252
|
-
# would rather not report a value than report an incorrect one.
|
|
253
|
-
all_max_lengths = [
|
|
254
|
-
max_length
|
|
255
|
-
for max_length in all_max_lengths
|
|
256
|
-
if max_length != MAX_CONTEXT_LENGTH
|
|
257
|
-
]
|
|
258
|
-
|
|
259
257
|
if len(list(all_max_lengths)) > 0:
|
|
260
258
|
model_max_length = min(list(all_max_lengths))
|
|
261
259
|
else:
|
|
@@ -483,11 +481,11 @@ class HuggingFaceEncoderModel(BenchmarkModule):
|
|
|
483
481
|
Whether the model exists, or an error describing why we cannot check
|
|
484
482
|
whether the model exists.
|
|
485
483
|
"""
|
|
486
|
-
model_id, revision = (
|
|
487
|
-
model_id.split("@") if "@" in model_id else (model_id, "main")
|
|
488
|
-
)
|
|
484
|
+
model_id_components = split_model_id(model_id=model_id)
|
|
489
485
|
model_info = get_model_repo_info(
|
|
490
|
-
model_id=model_id, revision=revision, benchmark_config=benchmark_config
|
|
486
|
+
model_id=model_id_components.model_id,
|
|
487
|
+
revision=model_id_components.revision,
|
|
488
|
+
benchmark_config=benchmark_config,
|
|
491
489
|
)
|
|
492
490
|
return (
|
|
493
491
|
model_info is not None
|
|
@@ -509,11 +507,11 @@ class HuggingFaceEncoderModel(BenchmarkModule):
|
|
|
509
507
|
Returns:
|
|
510
508
|
The model configuration.
|
|
511
509
|
"""
|
|
512
|
-
model_id, revision = (
|
|
513
|
-
model_id.split("@") if "@" in model_id else (model_id, "main")
|
|
514
|
-
)
|
|
510
|
+
model_id_components = split_model_id(model_id=model_id)
|
|
515
511
|
model_info = get_model_repo_info(
|
|
516
|
-
model_id=model_id, revision=revision, benchmark_config=benchmark_config
|
|
512
|
+
model_id=model_id_components.model_id,
|
|
513
|
+
revision=model_id_components.revision,
|
|
514
|
+
benchmark_config=benchmark_config,
|
|
517
515
|
)
|
|
518
516
|
if model_info is None:
|
|
519
517
|
raise InvalidModel(f"The model {model_id!r} could not be found.")
|
|
@@ -522,8 +520,9 @@ class HuggingFaceEncoderModel(BenchmarkModule):
|
|
|
522
520
|
language_codes = list(language_mapping.keys())
|
|
523
521
|
|
|
524
522
|
model_config = ModelConfig(
|
|
525
|
-
model_id=model_id,
|
|
526
|
-
revision=revision,
|
|
523
|
+
model_id=model_id_components.model_id,
|
|
524
|
+
revision=model_id_components.revision,
|
|
525
|
+
param=model_id_components.param,
|
|
527
526
|
task=model_info.pipeline_tag,
|
|
528
527
|
languages=[
|
|
529
528
|
language_mapping[tag]
|
|
@@ -710,7 +709,6 @@ def get_model_repo_info(
|
|
|
710
709
|
"""
|
|
711
710
|
token = get_hf_token(api_key=benchmark_config.api_key)
|
|
712
711
|
hf_api = HfApi(token=token)
|
|
713
|
-
model_id, revision = model_id.split("@") if "@" in model_id else (model_id, "main")
|
|
714
712
|
|
|
715
713
|
# Get information on the model.
|
|
716
714
|
# The first case is when the model is a local model, in which case we create a dummy
|
|
@@ -753,6 +751,13 @@ def get_model_repo_info(
|
|
|
753
751
|
return None
|
|
754
752
|
except (RepositoryNotFoundError, HFValidationError):
|
|
755
753
|
return None
|
|
754
|
+
except HfHubHTTPError as e:
|
|
755
|
+
if "unauthorized" in str(e).lower():
|
|
756
|
+
raise InvalidModel(
|
|
757
|
+
"It seems like your specified Hugging Face API key is invalid. "
|
|
758
|
+
"Please double-check your API key."
|
|
759
|
+
) from e
|
|
760
|
+
raise InvalidModel(str(e)) from e
|
|
756
761
|
except (OSError, RequestException) as e:
|
|
757
762
|
if internet_connection_available():
|
|
758
763
|
errors.append(e)
|