EuroEval 16.3.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
- euroeval/__init__.py +9 -2
- euroeval/benchmark_config_factory.py +51 -50
- euroeval/benchmark_modules/base.py +9 -21
- euroeval/benchmark_modules/fresh.py +2 -1
- euroeval/benchmark_modules/hf.py +101 -71
- euroeval/benchmark_modules/litellm.py +115 -53
- euroeval/benchmark_modules/vllm.py +107 -92
- euroeval/benchmarker.py +144 -121
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +86 -8
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +80 -29
- euroeval/data_models.py +338 -330
- euroeval/dataset_configs/__init__.py +12 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +55 -93
- euroeval/dataset_configs/dutch.py +48 -87
- euroeval/dataset_configs/english.py +45 -77
- euroeval/dataset_configs/estonian.py +42 -34
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -69
- euroeval/dataset_configs/french.py +39 -75
- euroeval/dataset_configs/german.py +45 -82
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -79
- euroeval/dataset_configs/latvian.py +28 -35
- euroeval/dataset_configs/lithuanian.py +28 -26
- euroeval/dataset_configs/norwegian.py +72 -115
- euroeval/dataset_configs/polish.py +33 -61
- euroeval/dataset_configs/portuguese.py +33 -66
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/spanish.py +42 -77
- euroeval/dataset_configs/swedish.py +52 -90
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +24 -17
- euroeval/generation.py +15 -14
- euroeval/generation_utils.py +8 -8
- euroeval/languages.py +395 -323
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +21 -6
- euroeval/metrics/llm_as_a_judge.py +6 -4
- euroeval/metrics/pipeline.py +17 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +17 -19
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +99 -42
- euroeval/prompt_templates/multiple_choice.py +102 -38
- euroeval/prompt_templates/named_entity_recognition.py +172 -51
- euroeval/prompt_templates/reading_comprehension.py +119 -42
- euroeval/prompt_templates/sentiment_classification.py +110 -40
- euroeval/prompt_templates/summarization.py +85 -40
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +11 -10
- euroeval/speed_benchmark.py +5 -6
- euroeval/task_group_utils/multiple_choice_classification.py +2 -4
- euroeval/task_group_utils/question_answering.py +24 -16
- euroeval/task_group_utils/sequence_classification.py +48 -35
- euroeval/task_group_utils/text_to_text.py +19 -9
- euroeval/task_group_utils/token_classification.py +21 -17
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +33 -22
- euroeval/types.py +10 -9
- euroeval/utils.py +35 -149
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +196 -39
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.3.0.dist-info/RECORD +0 -71
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED
@@ -21,7 +21,8 @@ if os.getenv("FULL_LOG") != "1":
     os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
 
 # Set up logging
-fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
+# fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
+fmt = colored("%(message)s", "light_yellow")
 logging.basicConfig(
     level=logging.CRITICAL if hasattr(sys, "_called_from_test") else logging.INFO,
     format=fmt,
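The net effect of this hunk is that EuroEval's console output now carries only the message, coloured light yellow, dropping the old timestamp prefix. A minimal standalone sketch of the new setup, assuming only that `termcolor` is installed (the lines mirror the diff above; the logged message is an arbitrary example):

import logging
import sys

from termcolor import colored

# Message-only format from the diff; the old format also carried a
# light-blue "%(asctime)s" timestamp.
fmt = colored("%(message)s", "light_yellow")
logging.basicConfig(
    level=logging.CRITICAL if hasattr(sys, "_called_from_test") else logging.INFO,
    format=fmt,
)
logging.getLogger("euroeval").info("hello")  # prints just the coloured message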
@@ -50,7 +51,13 @@ import importlib.metadata  # noqa: E402
 from dotenv import load_dotenv  # noqa: E402
 
 from .benchmarker import Benchmarker  # noqa: E402
-from .
+from .data_models import DatasetConfig  # noqa: E402
+from .logging_utils import block_terminal_output  # noqa: E402
+from .tasks import (  # noqa: E402
+    MULTIPLE_CHOICE,
+    TEXT_CLASSIFICATION,
+    TOKEN_CLASSIFICATION,
+)
 
 # Block unwanted terminal outputs. This blocks way more than the above, but since it
 # relies on importing from the `utils` module, external modules are already imported
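The added imports re-export `DatasetConfig` and the task constants at the package root. A short sketch of what this enables; only the import line is confirmed by the diff, the comment describes the situation before this release:

# These names can now be imported directly from `euroeval`, where previously
# they had to be pulled from the internal euroeval.tasks and
# euroeval.data_models submodules.
from euroeval import (
    MULTIPLE_CHOICE,
    TEXT_CLASSIFICATION,
    TOKEN_CLASSIFICATION,
    Benchmarker,
    DatasetConfig,
)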
euroeval/benchmark_config_factory.py
CHANGED
@@ -1,23 +1,20 @@
 """Factory class for creating dataset configurations."""
 
-import
+import collections.abc as c
 import sys
 import typing as t
 
 import torch
 
-from .data_models import BenchmarkConfig, BenchmarkConfigParams
+from .data_models import BenchmarkConfig, BenchmarkConfigParams, DatasetConfig, Task
 from .dataset_configs import get_all_dataset_configs
 from .enums import Device
 from .exceptions import InvalidBenchmark
 from .languages import get_all_languages
-from .tasks import
+from .tasks import get_all_tasks
 
 if t.TYPE_CHECKING:
-    from .data_models import Language
-
-
-logger = logging.getLogger("euroeval")
+    from .data_models import Language
 
 
 def build_benchmark_config(
@@ -44,7 +41,7 @@ def build_benchmark_config(
         default_language_codes=language_codes,
     )
 
-
+    dataset_configs = prepare_dataset_configs(
         task=benchmark_config_params.task,
         dataset=benchmark_config_params.dataset,
         dataset_languages=dataset_languages,
@@ -53,8 +50,7 @@ def build_benchmark_config(
     return BenchmarkConfig(
         model_languages=model_languages,
         dataset_languages=dataset_languages,
-
-        datasets=datasets,
+        datasets=dataset_configs,
         batch_size=benchmark_config_params.batch_size,
         raise_errors=benchmark_config_params.raise_errors,
         cache_dir=benchmark_config_params.cache_dir,
@@ -84,7 +80,9 @@ def build_benchmark_config(
     )
 
 
-def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
+def get_correct_language_codes(
+    language_codes: str | c.Sequence[str],
+) -> c.Sequence[str]:
     """Get correct language code(s).
 
     Args:
@@ -105,7 +103,7 @@ def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
     elif isinstance(language_codes, str):
         languages = [language_codes]
     else:
-        languages = language_codes
+        languages = list(language_codes)
 
     # If `languages` contains 'no' then also include 'nb' and 'nn'. Conversely, if
     # either 'nb' or 'nn' are specified then also include 'no'.
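The comment at the end of this hunk describes the Norwegian special case: the macro code 'no' implies both written norms, and either norm implies 'no'. A minimal sketch of that expansion rule, for illustration only (`expand_norwegian_codes` is a hypothetical name, not a function in EuroEval):

def expand_norwegian_codes(languages: list[str]) -> list[str]:
    """Hypothetical helper: 'no' pulls in 'nb'/'nn', and vice versa."""
    expanded = set(languages)
    if "no" in expanded:
        expanded |= {"nb", "nn"}
    elif expanded & {"nb", "nn"}:
        expanded.add("no")
    return sorted(expanded)

assert expand_norwegian_codes(["no"]) == ["nb", "nn", "no"]
assert expand_norwegian_codes(["nn", "da"]) == ["da", "nn", "no"]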
@@ -118,8 +116,9 @@ def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
 
 
 def prepare_languages(
-    language_codes: str |
-
+    language_codes: str | c.Sequence[str] | None,
+    default_language_codes: c.Sequence[str],
+) -> c.Sequence["Language"]:
     """Prepare language(s) for benchmarking.
 
     Args:
@@ -137,7 +136,7 @@ def prepare_languages(
     language_mapping = get_all_languages()
 
     # Create the list `languages_str` of language codes to use for models or datasets
-    languages_str:
+    languages_str: c.Sequence[str]
     if language_codes is None:
         languages_str = default_language_codes
     elif isinstance(language_codes, str):
@@ -154,12 +153,12 @@ def prepare_languages(
     return prepared_languages
 
 
-def prepare_tasks_and_datasets(
-    task: str |
-    dataset_languages:
-    dataset: str |
-) ->
-    """Prepare
+def prepare_dataset_configs(
+    task: "str | Task | c.Sequence[str | Task] | None",
+    dataset_languages: c.Sequence["Language"],
+    dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None",
+) -> c.Sequence["DatasetConfig"]:
+    """Prepare dataset config(s) for benchmarking.
 
     Args:
         task:
@@ -172,56 +171,58 @@ def prepare_tasks_and_datasets(
         included, limited by the `task` and `dataset_languages` parameters.
 
     Returns:
-        The prepared
+        The prepared dataset configs.
 
     Raises:
         InvalidBenchmark:
             If the task or dataset is not found in the benchmark tasks or datasets.
     """
-    # Create a dictionary that maps benchmark tasks to their associated benchmark
-    # task objects, and a dictionary that maps dataset names to their associated
-    # dataset configuration objects
-    task_mapping = get_all_tasks()
-    all_dataset_configs = get_all_dataset_configs()
-
     # Create the list of dataset tasks
+    task_mapping = get_all_tasks()
     try:
         if task is None:
-            tasks =
+            tasks = None
         elif isinstance(task, str):
             tasks = [task_mapping[task]]
+        elif isinstance(task, Task):
+            tasks = [task]
         else:
-            tasks = [task_mapping[t] for t in task]
+            tasks = [task_mapping[t] if isinstance(t, str) else t for t in task]
     except KeyError as e:
         raise InvalidBenchmark(f"Task {e} not found in the benchmark tasks.") from e
 
-
-
-
+    # Create the list of dataset configs
+    all_dataset_configs = get_all_dataset_configs()
+    all_official_dataset_configs: c.Sequence[DatasetConfig] = [
+        dataset_config
+        for dataset_config in all_dataset_configs.values()
         if not dataset_config.unofficial
     ]
-
-    dataset
-
-    dataset
-
-
-
-
+    try:
+        if dataset is None:
+            datasets = all_official_dataset_configs
+        elif isinstance(dataset, str):
+            datasets = [all_dataset_configs[dataset]]
+        elif isinstance(dataset, DatasetConfig):
+            datasets = [dataset]
+        else:
+            datasets = [
+                all_dataset_configs[d] if isinstance(d, str) else d for d in dataset
+            ]
+    except KeyError as e:
         raise InvalidBenchmark(
-            f"Dataset
-
-        )
+            f"Dataset {e} not found in the benchmark datasets."
+        ) from e
 
+    # Filter the dataset configs based on the specified tasks and languages
     datasets = [
-
-        for
-        if
-        and
-        and set(dataset_config.languages).intersection(dataset_languages)
+        ds
+        for ds in datasets
+        if (tasks is None or ds.task in tasks)
+        and any(lang in dataset_languages for lang in ds.languages)
     ]
 
-    return
+    return datasets
 
 
 def prepare_device(device: Device | None) -> torch.device:
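The rewritten `prepare_dataset_configs` normalises every accepted input shape (name strings, `Task`/`DatasetConfig` objects, sequences of either, or `None`) into a plain list before filtering. A small illustration of the same normalisation pattern, with a hypothetical toy type standing in for EuroEval's own classes:

from dataclasses import dataclass

@dataclass(frozen=True)
class Item:  # stands in for Task / DatasetConfig
    name: str

REGISTRY = {"a": Item("a"), "b": Item("b")}

def normalise(spec: "str | Item | list[str | Item] | None") -> list[Item] | None:
    # None means "no restriction"; strings are looked up; objects pass through.
    if spec is None:
        return None
    if isinstance(spec, str):
        return [REGISTRY[spec]]
    if isinstance(spec, Item):
        return [spec]
    return [REGISTRY[s] if isinstance(s, str) else s for s in spec]

assert normalise("a") == [Item("a")]
assert normalise([Item("b"), "a"]) == [Item("b"), Item("a")]
assert normalise(None) is None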
euroeval/benchmark_modules/base.py
CHANGED
@@ -3,24 +3,22 @@
 import collections.abc as c
 import logging
 import re
-import sys
 import typing as t
 from abc import ABC, abstractmethod
 from functools import cached_property, partial
 
 from datasets import Dataset, DatasetDict
 from torch import nn
-from tqdm.auto import tqdm
 
 from ..enums import TaskGroup
 from ..exceptions import InvalidBenchmark, NeedsEnvironmentVariable, NeedsExtraInstalled
+from ..logging_utils import get_pbar, log_once
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,
     token_classification,
 )
-from ..utils import log_once
 
 if t.TYPE_CHECKING:
     from transformers.tokenization_utils import PreTrainedTokenizer
@@ -36,8 +34,6 @@ if t.TYPE_CHECKING:
     from ..enums import BatchingPreference, GenerativeType
     from ..types import ComputeMetricsFunction, ExtractLabelsFunction
 
-logger = logging.getLogger("euroeval")
-
 
 class BenchmarkModule(ABC):
     """Abstract class for a benchmark module.
@@ -56,7 +52,7 @@ class BenchmarkModule(ABC):
     fresh_model: bool
     batching_preference: "BatchingPreference"
     high_priority: bool
-    allowed_params: dict[re.Pattern,
+    allowed_params: dict[re.Pattern, c.Sequence[str]] = {re.compile(r".*"): []}
 
     def __init__(
         self,
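The new `allowed_params` default maps a regex over model IDs to the parameter names permitted for matching models, and the catch-all entry permits none. A hypothetical sketch of how such a mapping could be consulted; EuroEval's actual check lives in `generation_utils.raise_if_wrong_params`, which this diff does not show:

import re

allowed_params: dict[re.Pattern, list[str]] = {
    re.compile(r".*"): [],  # default: no extra parameters for any model
}

def check_params(model_id: str, params: list[str]) -> None:
    # Collect the allowed names from every pattern matching this model ID.
    allowed = {
        name
        for pattern, names in allowed_params.items()
        if pattern.fullmatch(model_id)
        for name in names
    }
    unknown = [p for p in params if p not in allowed]
    if unknown:
        raise ValueError(f"Parameters not allowed for {model_id}: {unknown}")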
@@ -87,20 +83,12 @@ class BenchmarkModule(ABC):
 
     def _log_metadata(self) -> None:
         """Log the metadata of the model."""
-
-
-            logging_level = logging.CRITICAL
-        elif self.benchmark_config.verbose:
-            logging_level = logging.DEBUG
-        else:
-            logging_level = logging.INFO
-        logger.setLevel(logging_level)
-
-        logging_msg: str = ""
+        model_id = self.model_config.model_id
+        logging_msg: str = " ↳ "
         if self.num_params < 0:
-            logging_msg += "The model has an unknown number of parameters, "
+            logging_msg += f"The model {model_id} has an unknown number of parameters, "
         else:
-            logging_msg += f"The model has {self.num_params:,} parameters, "
+            logging_msg += f"The model {model_id} has {self.num_params:,} parameters, "
         if self.vocab_size < 0:
             logging_msg += "an unknown vocabulary size, "
         else:
@@ -179,7 +167,7 @@ class BenchmarkModule(ABC):
 
     @property
     @abstractmethod
-    def data_collator(self) -> c.Callable[[
+    def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.
 
         Returns:
@@ -253,7 +241,7 @@ class BenchmarkModule(ABC):
 
     def prepare_datasets(
         self, datasets: list[DatasetDict], task: "Task"
-    ) ->
+    ) -> c.Sequence[DatasetDict]:
         """Prepare the datasets for the model.
 
         This includes things like tokenisation.
@@ -273,7 +261,7 @@ class BenchmarkModule(ABC):
             tasks.
         """
         for idx, dataset in enumerate(
-
+            get_pbar(iterable=datasets, desc="Preparing datasets")
         ):
             prepared_dataset = self.prepare_dataset(
                 dataset=dataset, task=task, itr_idx=idx
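Together with the removal of `from tqdm.auto import tqdm` in the first hunk, this suggests progress bars are now created centrally via `logging_utils.get_pbar`. Its implementation is not part of this diff; a plausible sketch, assuming it simply wraps tqdm with shared defaults:

import collections.abc as c
import typing as t

from tqdm.auto import tqdm

def get_pbar(iterable: c.Iterable[t.Any], desc: str) -> tqdm:
    """Hypothetical reconstruction: one place to configure all progress bars."""
    return tqdm(iterable=iterable, desc=desc, leave=False)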
euroeval/benchmark_modules/fresh.py
CHANGED
@@ -27,7 +27,8 @@ from ..exceptions import (
     NeedsExtraInstalled,
 )
 from ..generation_utils import raise_if_wrong_params
-from ..
+from ..logging_utils import block_terminal_output
+from ..utils import create_model_cache_dir, get_hf_token
 from .hf import (
     HuggingFaceEncoderModel,
     align_model_and_tokeniser,