EuroEval 15.15.0-py3-none-any.whl → 16.0.0-py3-none-any.whl
This diff shows the changes between publicly available package versions as released to their public registries, and is provided for informational purposes only.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +3 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +323 -193
- euroeval/benchmark_modules/vllm.py +166 -112
- euroeval/benchmarker.py +59 -33
- euroeval/cli.py +3 -3
- euroeval/constants.py +13 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +53 -7
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +38 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +8 -7
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +46 -14
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +234 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +17 -6
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +96 -23
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +47 -75
- euroeval/tasks.py +31 -6
- euroeval/tokenization_utils.py +295 -207
- euroeval/utils.py +118 -34
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +12 -14
- euroeval-16.0.0.dist-info/RECORD +69 -0
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -468
- euroeval-15.15.0.dist-info/RECORD +0 -63
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED

@@ -77,10 +77,6 @@ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 os.environ["OMP_NUM_THREADS"] = "1"


-# Disable a warning from Ray regarding the detection of the number of CPUs
-os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"
-
-
 # Avoid the "Cannot re-initialize CUDA in forked subprocess" error - see
 # https://github.com/vllm-project/vllm/issues/6152 for more
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

@@ -100,9 +96,9 @@ os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
 os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"


-#
-#
-os.environ["VLLM_USE_V1"] = "
+# Enable the newer vLLM V1 engine, which is faster and offers more compatibility with
+# newer models
+os.environ["VLLM_USE_V1"] = "1"


 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
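These flags only take reliable effect if they are set before the libraries that consult them are imported or initialised, which is why EuroEval sets them at the very top of the package's __init__.py. A minimal sketch of the pattern (illustrative, not EuroEval's actual code):

import os

# vLLM reads these flags from the environment, so set them before vLLM is
# imported anywhere in the process.
os.environ["VLLM_USE_V1"] = "1"  # opt in to the vLLM V1 engine
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"  # avoid CUDA re-init errors in forked workers

import vllm  # noqa: E402  # deliberately imported after the flags are set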
euroeval/benchmark_config_factory.py
CHANGED

@@ -45,8 +45,7 @@ def build_benchmark_config(
     gpu_memory_utilization: float,
     debug: bool,
     run_with_cli: bool,
-
-    first_time: bool = False,
+    requires_safetensors: bool,
 ) -> BenchmarkConfig:
     """Create a benchmark configuration.


@@ -112,11 +111,8 @@ def build_benchmark_config(
             Whether to run the benchmark in debug mode.
         run_with_cli:
             Whether the benchmark is being run with the CLI.
-
+        requires_safetensors:
             Whether to only allow evaluations of models stored as safetensors.
-        first_time:
-            Whether this is the first time the benchmark configuration is being created.
-            Defaults to False.

     Returns:
         The benchmark configuration.

@@ -163,7 +159,7 @@ def build_benchmark_config(
         gpu_memory_utilization=gpu_memory_utilization,
         debug=debug,
         run_with_cli=run_with_cli,
-
+        requires_safetensors=requires_safetensors,
     )
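The optional first_time flag is gone, and requires_safetensors is now a required keyword argument, so every call site must state it explicitly; a missed call site fails immediately with a TypeError instead of silently picking a default. An illustrative sketch of the shape of this change, using a stand-in dataclass rather than EuroEval's real BenchmarkConfig:

from dataclasses import dataclass


@dataclass
class BenchmarkConfig:
    """Stand-in for EuroEval's much larger BenchmarkConfig."""

    gpu_memory_utilization: float
    debug: bool
    run_with_cli: bool
    requires_safetensors: bool  # new required field in 16.0.0


def build_benchmark_config(
    *,
    gpu_memory_utilization: float,
    debug: bool,
    run_with_cli: bool,
    requires_safetensors: bool,  # required keyword: no default value
) -> BenchmarkConfig:
    return BenchmarkConfig(
        gpu_memory_utilization=gpu_memory_utilization,
        debug=debug,
        run_with_cli=run_with_cli,
        requires_safetensors=requires_safetensors,
    )


config = build_benchmark_config(
    gpu_memory_utilization=0.9,
    debug=False,
    run_with_cli=True,
    requires_safetensors=True,
)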
euroeval/benchmark_modules/base.py
CHANGED

@@ -7,12 +7,12 @@ import typing as t
 from abc import ABC, abstractmethod
 from functools import cached_property, partial

-from datasets import DatasetDict
+from datasets import Dataset, DatasetDict
 from torch import nn
 from tqdm.auto import tqdm

 from ..enums import TaskGroup
-from ..exceptions import NeedsEnvironmentVariable, NeedsExtraInstalled
+from ..exceptions import InvalidBenchmark, NeedsEnvironmentVariable, NeedsExtraInstalled
 from ..task_group_utils import (
     question_answering,
     sequence_classification,

@@ -61,6 +61,7 @@ class BenchmarkModule(ABC):
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the benchmark module.


@@ -71,12 +72,16 @@ class BenchmarkModule(ABC):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log the metadata of the model.
         """
         self.model_config = model_config
         self.dataset_config = dataset_config
         self.benchmark_config = benchmark_config
+        self.log_metadata = log_metadata
         self.buffer: dict[str, t.Any] = dict()
-        self._log_metadata()
+        if self.log_metadata:
+            self._log_metadata()

     def _log_metadata(self) -> None:
         """Log the metadata of the model."""
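Constructing a BenchmarkModule previously always logged model metadata; the new log_metadata flag makes that side effect opt-out, which matters when modules are instantiated internally and should stay quiet. The default preserves the 15.x behaviour. A simplified sketch of the guarded-side-effect pattern (a stand-in class, not the real one):

class Module:
    def __init__(self, log_metadata: bool = True) -> None:
        self.log_metadata = log_metadata
        if self.log_metadata:
            self._log_metadata()

    def _log_metadata(self) -> None:
        print("logging model metadata ...")  # placeholder for the real logging


Module()                    # logs, as in 15.x
Module(log_metadata=False)  # new: stays quiet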
@@ -117,16 +122,16 @@
             f"{self.__class__.__name__}."
         )

-    def get_tokenizer(self) -> "PreTrainedTokenizer":
-        """Get the underlying tokenizer.
+    def get_tokeniser(self) -> "PreTrainedTokenizer":
+        """Get the underlying tokeniser.

         Returns:
-            The tokenizer.
+            The tokeniser.
         """
-        if hasattr(self, "_tokenizer"):
-            return self._tokenizer
+        if hasattr(self, "_tokeniser"):
+            return self._tokeniser
         raise NotImplementedError(
-            "The `get_tokenizer` method has not been implemented for "
+            "The `get_tokeniser` method has not been implemented for "
             f"{self.__class__.__name__}."
         )
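Beyond the British-spelling rename (get_tokenizer to get_tokeniser, _tokenizer to _tokeniser, mirrored throughout this release), the accessor is a simple attribute guard: subclasses that hold a tokeniser just set self._tokeniser and inherit the method, while API-backed modules fall through to the error. Downstream code still calling get_tokenizer will now fail with an AttributeError. A self-contained sketch of that contract (stand-in classes):

class TokeniserAccessor:
    def get_tokeniser(self):
        if hasattr(self, "_tokeniser"):
            return self._tokeniser
        raise NotImplementedError(
            f"`get_tokeniser` is not implemented for {type(self).__name__}."
        )


class EncoderLike(TokeniserAccessor):
    def __init__(self, tokeniser: object) -> None:
        self._tokeniser = tokeniser  # subclasses opt in by setting the attribute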
@@ -192,11 +197,13 @@
                 return partial(
                     sequence_classification.compute_metrics,
                     dataset_config=self.dataset_config,
+                    benchmark_config=self.benchmark_config,
                 )
             case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
                 return partial(
                     sequence_classification.compute_metrics,
                     dataset_config=self.dataset_config,
+                    benchmark_config=self.benchmark_config,
                 )
             case TaskGroup.TEXT_TO_TEXT:
                 return partial(

@@ -209,11 +216,13 @@
                     token_classification.compute_metrics,
                     has_misc_tags=self.buffer.get("has_misc_tags", True),
                     dataset_config=self.dataset_config,
+                    benchmark_config=self.benchmark_config,
                 )
             case TaskGroup.QUESTION_ANSWERING:
                 return partial(
                     question_answering.compute_metrics,
                     dataset_config=self.dataset_config,
+                    benchmark_config=self.benchmark_config,
                 )
             case _:
                 raise NotImplementedError(
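Every task group's metric callback now receives the benchmark_config as well as the dataset_config. The callbacks are built with functools.partial, which pre-binds the config objects so that downstream code only supplies the model outputs. A runnable sketch of the binding pattern, with dictionaries standing in for the real config objects:

from functools import partial


def compute_metrics(predictions, labels, *, dataset_config, benchmark_config):
    """Stand-in for the task-group metric functions."""
    correct = sum(p == y for p, y in zip(predictions, labels))
    return {"accuracy": correct / len(labels)}


metrics_fn = partial(
    compute_metrics,
    dataset_config={"name": "demo-dataset"},  # stand-in config objects
    benchmark_config={"debug": False},
)
print(metrics_fn(predictions=[1, 0, 1], labels=[1, 1, 1]))  # {'accuracy': 0.666...}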
@@ -255,6 +264,11 @@

         Returns:
             The prepared datasets.
+
+        Raises:
+            InvalidBenchmark:
+                If the dataset does not have a 'train' split for token classification
+                tasks.
         """
         for idx, dataset in enumerate(
             tqdm(iterable=datasets, desc="Preparing datasets")

@@ -263,22 +277,24 @@
                 dataset=dataset, task=task, itr_idx=idx
             )
             if self.dataset_config.task.task_group == TaskGroup.TOKEN_CLASSIFICATION:
+                if "train" not in dataset:
+                    raise InvalidBenchmark(
+                        "The dataset does not have a 'train' split, which is required "
+                        "for token classification tasks."
+                    )
                 labels_in_train: set[str] = {
                     tag for tag_list in dataset["train"]["labels"] for tag in tag_list
                 }
                 self.buffer["has_misc_tags"] = (
                     "B-MISC" in labels_in_train or "I-MISC" in labels_in_train
                 )
-
-
-
-
-
-
-
-                    original_test=dataset["test"],
-                )
-            )
+
+            datasets_dict: dict[str, Dataset] = dict()
+            for split_name, split in prepared_dataset.items():
+                datasets_dict[split_name] = split
+            for split_name, split in dataset.items():
+                datasets_dict[f"original_{split_name}"] = split
+            datasets[idx] = DatasetDict(datasets_dict)
         return datasets

     @abstractmethod
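The old code assembled the combined DatasetDict by hand, naming splits such as original_test explicitly; the new code copies every prepared split and every original split (prefixed with "original_") generically, and raises InvalidBenchmark early when a token-classification dataset lacks a train split. A runnable sketch of the new split handling, using the Hugging Face datasets library as in the diff:

from datasets import Dataset, DatasetDict

dataset = DatasetDict({
    "train": Dataset.from_dict({"text": ["a"], "labels": [0]}),
    "test": Dataset.from_dict({"text": ["b"], "labels": [1]}),
})
prepared_dataset = dataset  # stand-in for the actual preparation step

# Keep every prepared split, plus every original split under an "original_" key
datasets_dict: dict[str, Dataset] = {}
for split_name, split in prepared_dataset.items():
    datasets_dict[split_name] = split
for split_name, split in dataset.items():
    datasets_dict[f"original_{split_name}"] = split

combined = DatasetDict(datasets_dict)
print(list(combined))  # ['train', 'test', 'original_train', 'original_test']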
euroeval/benchmark_modules/fresh.py
CHANGED

@@ -1,6 +1,5 @@
 """Freshly initialised encoder models."""

-import os
 import typing as t
 from functools import cached_property
 from json import JSONDecodeError

@@ -26,10 +25,10 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
-from ..utils import block_terminal_output, create_model_cache_dir
+from ..utils import block_terminal_output, create_model_cache_dir, get_hf_token
 from .hf import (
     HuggingFaceEncoderModel,
-    align_model_and_tokenizer,
+    align_model_and_tokeniser,
     setup_model_for_question_answering,
 )

@@ -51,6 +50,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the model.


@@ -61,23 +61,25 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log metadata about the model and the benchmark.
         """
         # This is already set when calling `super.__init__`, but we need it to get a
         # value from `self.model_max_length`, so we set it here as well.
         self.model_config = model_config

-        model, tokenizer = load_model_and_tokenizer(
+        model, tokeniser = load_model_and_tokeniser(
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
             model_max_length=self.model_max_length,
         )
         self._model: "PreTrainedModel" = model
-        self._tokenizer: "PreTrainedTokenizer" = tokenizer
+        self._tokeniser: "PreTrainedTokenizer" = tokeniser

-        self._model, self._tokenizer = align_model_and_tokenizer(
+        self._model, self._tokeniser = align_model_and_tokeniser(
             model=self._model,
-            tokenizer=self._tokenizer,
+            tokeniser=self._tokeniser,
             model_max_length=self.model_max_length,
             raise_errors=benchmark_config.raise_errors,
         )

@@ -88,6 +90,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
         )

     @cached_property

@@ -194,13 +197,13 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
     )


-def load_model_and_tokenizer(
+def load_model_and_tokeniser(
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
     model_max_length: int,
 ) -> "tuple[PreTrainedModel, PreTrainedTokenizer]":
-    """Load the model and tokenizer.
+    """Load the model and tokeniser.

     Args:
         model_config:

@@ -213,7 +216,7 @@ def load_model_and_tokenizer(
         The maximum context length of the model.

     Returns:
-        The loaded model and tokenizer.
+        The loaded model and tokeniser.
     """
     config: "PretrainedConfig"
     block_terminal_output()

@@ -262,7 +265,7 @@ def load_model_and_tokenizer(

     config = AutoConfig.from_pretrained(
         real_model_id,
-        token=benchmark_config.api_key
+        token=get_hf_token(api_key=benchmark_config.api_key),
         num_labels=len(id2label),
         id2label=id2label,
         label2id={label: id_ for id_, label in id2label.items()},

@@ -274,29 +277,31 @@ def load_model_and_tokenizer(
     if dataset_config.task.task_group == TaskGroup.QUESTION_ANSWERING:
         model = setup_model_for_question_answering(model=model)

-    # Load the tokenizer. If the model is a subclass of a RoBERTa model then we
+    # Load the tokeniser. If the model is a subclass of a RoBERTa model then we
     # have to add a prefix space to the tokens, by the way the model is constructed
     prefix_models = ["Roberta", "GPT", "Deberta"]
     prefix = any(model_type in type(model).__name__ for model_type in prefix_models)
     try:
-        tokenizer: "PreTrainedTokenizer" = AutoTokenizer.from_pretrained(
+        tokeniser: "PreTrainedTokenizer" = AutoTokenizer.from_pretrained(
             real_model_id,
             revision=model_config.revision,
-            token=benchmark_config.api_key
+            token=get_hf_token(api_key=benchmark_config.api_key),
             add_prefix_space=prefix,
             cache_dir=model_config.model_cache_dir,
             use_fast=True,
             verbose=False,
             trust_remote_code=benchmark_config.trust_remote_code,
         )
-    except (JSONDecodeError, OSError):
-        raise InvalidModel(
+    except (JSONDecodeError, OSError) as e:
+        raise InvalidModel(
+            f"Could not load tokeniser for model {real_model_id!r}."
+        ) from e

-    model, tokenizer = align_model_and_tokenizer(
+    model, tokeniser = align_model_and_tokeniser(
         model=model,
-        tokenizer=tokenizer,
+        tokeniser=tokeniser,
         model_max_length=model_max_length,
         raise_errors=benchmark_config.raise_errors,
     )

-    return model, tokenizer
+    return model, tokeniser
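Two error-handling details change in load_model_and_tokeniser: the InvalidModel error now carries a message naming the model, and it is raised with "from e", so the underlying JSONDecodeError or OSError stays chained in the traceback instead of being swallowed. A minimal sketch of the difference, with a simulated failure standing in for the real tokeniser download:

class InvalidModel(Exception):
    pass


def load_tokeniser(model_id: str) -> None:
    try:
        raise OSError(f"no tokeniser files found for {model_id}")  # simulated failure
    except OSError as e:
        # `from e` preserves the root cause as __cause__ in the traceback
        raise InvalidModel(f"Could not load tokeniser for model {model_id!r}.") from e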