EuroEval: euroeval-15.12.0-py3-none-any.whl → euroeval-16.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +32 -14
- euroeval/benchmark_config_factory.py +92 -180
- euroeval/benchmark_modules/base.py +49 -39
- euroeval/benchmark_modules/fresh.py +35 -21
- euroeval/benchmark_modules/hf.py +280 -244
- euroeval/benchmark_modules/litellm.py +752 -312
- euroeval/benchmark_modules/vllm.py +570 -268
- euroeval/benchmarker.py +651 -528
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +49 -38
- euroeval/constants.py +44 -25
- euroeval/data_loading.py +111 -55
- euroeval/data_models.py +490 -323
- euroeval/dataset_configs/__init__.py +26 -4
- euroeval/dataset_configs/bosnian.py +39 -0
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/croatian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +78 -50
- euroeval/dataset_configs/dutch.py +74 -44
- euroeval/dataset_configs/english.py +71 -36
- euroeval/dataset_configs/estonian.py +111 -0
- euroeval/dataset_configs/faroese.py +25 -18
- euroeval/dataset_configs/finnish.py +63 -26
- euroeval/dataset_configs/french.py +65 -32
- euroeval/dataset_configs/german.py +77 -36
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +68 -57
- euroeval/dataset_configs/italian.py +68 -36
- euroeval/dataset_configs/latvian.py +87 -0
- euroeval/dataset_configs/lithuanian.py +64 -0
- euroeval/dataset_configs/norwegian.py +98 -72
- euroeval/dataset_configs/polish.py +96 -0
- euroeval/dataset_configs/portuguese.py +63 -40
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/slovene.py +56 -0
- euroeval/dataset_configs/spanish.py +68 -34
- euroeval/dataset_configs/swedish.py +82 -41
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/enums.py +12 -6
- euroeval/exceptions.py +21 -1
- euroeval/finetuning.py +34 -26
- euroeval/generation.py +76 -41
- euroeval/generation_utils.py +169 -34
- euroeval/languages.py +1020 -188
- euroeval/logging_utils.py +268 -0
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +85 -0
- euroeval/metrics/huggingface.py +216 -0
- euroeval/metrics/llm_as_a_judge.py +260 -0
- euroeval/metrics/pipeline.py +289 -0
- euroeval/metrics/speed.py +48 -0
- euroeval/model_cache.py +40 -21
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +157 -22
- euroeval/prompt_templates/multiple_choice.py +159 -17
- euroeval/prompt_templates/named_entity_recognition.py +318 -21
- euroeval/prompt_templates/reading_comprehension.py +207 -16
- euroeval/prompt_templates/sentiment_classification.py +205 -22
- euroeval/prompt_templates/summarization.py +122 -22
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +20 -9
- euroeval/speed_benchmark.py +11 -12
- euroeval/task_group_utils/multiple_choice_classification.py +21 -12
- euroeval/task_group_utils/question_answering.py +101 -73
- euroeval/task_group_utils/sequence_classification.py +144 -61
- euroeval/task_group_utils/text_to_text.py +33 -12
- euroeval/task_group_utils/token_classification.py +86 -89
- euroeval/tasks.py +75 -16
- euroeval/tokenisation_utils.py +603 -0
- euroeval/types.py +17 -11
- euroeval/utils.py +332 -137
- euroeval-16.7.1.dist-info/METADATA +623 -0
- euroeval-16.7.1.dist-info/RECORD +84 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -737
- euroeval/metrics.py +0 -452
- euroeval/tokenization_utils.py +0 -498
- euroeval-15.12.0.dist-info/METADATA +0 -285
- euroeval-15.12.0.dist-info/RECORD +0 -63
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/fresh.py

@@ -1,6 +1,6 @@
 """Freshly initialised encoder models."""
 
-import
+import re
 import typing as t
 from functools import cached_property
 from json import JSONDecodeError
@@ -26,10 +26,12 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
-from ..
+from ..generation_utils import raise_if_wrong_params
+from ..logging_utils import block_terminal_output
+from ..utils import create_model_cache_dir, get_hf_token
 from .hf import (
     HuggingFaceEncoderModel,
-
+    align_model_and_tokeniser,
     setup_model_for_question_answering,
 )
 
@@ -45,12 +47,14 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
     """A freshly initialised encoder model."""
 
     fresh_model = True
+    allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}
 
     def __init__(
         self,
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the model.
 
@@ -61,23 +65,29 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log metadata about the model and the benchmark.
         """
+        raise_if_wrong_params(
+            model_config=model_config, allowed_params=self.allowed_params
+        )
+
         # This is already set when calling `super.__init__`, but we need it to get a
         # value from `self.model_max_length`, so we set it here as well.
         self.model_config = model_config
 
-        model,
+        model, tokeniser = load_model_and_tokeniser(
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
             model_max_length=self.model_max_length,
         )
         self._model: "PreTrainedModel" = model
-        self.
+        self._tokeniser: "PreTrainedTokenizer" = tokeniser
 
-        self._model, self.
+        self._model, self._tokeniser = align_model_and_tokeniser(
             model=self._model,
-
+            tokeniser=self._tokeniser,
             model_max_length=self.model_max_length,
             raise_errors=benchmark_config.raise_errors,
         )
@@ -88,6 +98,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
         )
 
     @cached_property
@@ -180,9 +191,10 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
         """
         return ModelConfig(
             model_id=model_id,
+            revision="main",
+            param=None,
             task="fill-mask",
             languages=list(),
-            revision="main",
             merge=False,
             inference_backend=InferenceBackend.TRANSFORMERS,
             model_type=ModelType.ENCODER,
@@ -194,13 +206,13 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
         )
 
 
-def
+def load_model_and_tokeniser(
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
     model_max_length: int,
 ) -> "tuple[PreTrainedModel, PreTrainedTokenizer]":
-    """Load the model and
+    """Load the model and tokeniser.
 
     Args:
         model_config:
@@ -213,7 +225,7 @@ def load_model_and_tokenizer(
             The maximum context length of the model.
 
     Returns:
-        The loaded model and
+        The loaded model and tokeniser.
     """
     config: "PretrainedConfig"
     block_terminal_output()
@@ -262,7 +274,7 @@ def load_model_and_tokenizer(
 
     config = AutoConfig.from_pretrained(
         real_model_id,
-        token=benchmark_config.api_key
+        token=get_hf_token(api_key=benchmark_config.api_key),
        num_labels=len(id2label),
         id2label=id2label,
         label2id={label: id_ for id_, label in id2label.items()},
@@ -274,29 +286,31 @@ def load_model_and_tokenizer(
     if dataset_config.task.task_group == TaskGroup.QUESTION_ANSWERING:
         model = setup_model_for_question_answering(model=model)
 
-    # Load the
+    # Load the tokeniser. If the model is a subclass of a RoBERTa model then we
     # have to add a prefix space to the tokens, by the way the model is constructed
     prefix_models = ["Roberta", "GPT", "Deberta"]
     prefix = any(model_type in type(model).__name__ for model_type in prefix_models)
     try:
-
+        tokeniser: "PreTrainedTokenizer" = AutoTokenizer.from_pretrained(
             real_model_id,
             revision=model_config.revision,
-            token=benchmark_config.api_key
+            token=get_hf_token(api_key=benchmark_config.api_key),
             add_prefix_space=prefix,
             cache_dir=model_config.model_cache_dir,
-            use_fast=True,
+            use_fast=False if model_config.param == "slow-tokenizer" else True,
             verbose=False,
             trust_remote_code=benchmark_config.trust_remote_code,
         )
-    except (JSONDecodeError, OSError):
-        raise InvalidModel(
+    except (JSONDecodeError, OSError) as e:
+        raise InvalidModel(
+            f"Could not load tokeniser for model {real_model_id!r}."
+        ) from e
 
-    model,
+    model, tokeniser = align_model_and_tokeniser(
         model=model,
-
+        tokeniser=tokeniser,
         model_max_length=model_max_length,
         raise_errors=benchmark_config.raise_errors,
     )
 
-    return model,
+    return model, tokeniser
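
Context on the new `slow-tokenizer` model parameter seen above: `FreshEncoderModel` now declares `allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}`, and the tokeniser loader flips `use_fast` when that parameter is set. As a rough standalone sketch of what the flag changes in Hugging Face `transformers` (the model ID and the `param` variable are illustrative placeholders, not EuroEval API):

```python
# Minimal sketch of the `use_fast` toggle that the "slow-tokenizer" parameter
# appears to control; `param` and the checkpoint name are placeholders.
from transformers import AutoTokenizer

model_id = "bert-base-cased"  # placeholder checkpoint
param = "slow-tokenizer"      # stand-in for `model_config.param`

tokeniser = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=False if param == "slow-tokenizer" else True,
)
# With use_fast=False this yields the pure-Python BertTokenizer rather than
# the Rust-backed BertTokenizerFast.
print(type(tokeniser).__name__)
```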