EuroEval 16.4.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +6 -0
- euroeval/benchmark_config_factory.py +51 -46
- euroeval/benchmark_modules/base.py +6 -5
- euroeval/benchmark_modules/hf.py +2 -9
- euroeval/benchmark_modules/litellm.py +14 -12
- euroeval/benchmark_modules/vllm.py +17 -10
- euroeval/benchmarker.py +61 -44
- euroeval/caching_utils.py +1 -1
- euroeval/cli.py +86 -8
- euroeval/constants.py +3 -0
- euroeval/data_loading.py +78 -30
- euroeval/data_models.py +326 -326
- euroeval/dataset_configs/__init__.py +10 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +25 -29
- euroeval/dataset_configs/danish.py +51 -88
- euroeval/dataset_configs/dutch.py +48 -86
- euroeval/dataset_configs/english.py +45 -76
- euroeval/dataset_configs/estonian.py +36 -38
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -68
- euroeval/dataset_configs/french.py +39 -74
- euroeval/dataset_configs/german.py +45 -81
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -78
- euroeval/dataset_configs/latvian.py +28 -34
- euroeval/dataset_configs/lithuanian.py +22 -26
- euroeval/dataset_configs/norwegian.py +72 -114
- euroeval/dataset_configs/polish.py +33 -60
- euroeval/dataset_configs/portuguese.py +33 -65
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +19 -24
- euroeval/dataset_configs/spanish.py +42 -76
- euroeval/dataset_configs/swedish.py +48 -84
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +3 -2
- euroeval/generation.py +5 -4
- euroeval/generation_utils.py +6 -5
- euroeval/languages.py +395 -323
- euroeval/metrics/huggingface.py +14 -3
- euroeval/metrics/llm_as_a_judge.py +1 -1
- euroeval/model_cache.py +6 -5
- euroeval/model_loading.py +1 -1
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +82 -43
- euroeval/prompt_templates/multiple_choice.py +81 -41
- euroeval/prompt_templates/named_entity_recognition.py +125 -44
- euroeval/prompt_templates/reading_comprehension.py +92 -43
- euroeval/prompt_templates/sentiment_classification.py +91 -43
- euroeval/prompt_templates/summarization.py +64 -39
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +4 -3
- euroeval/speed_benchmark.py +2 -1
- euroeval/task_group_utils/multiple_choice_classification.py +2 -1
- euroeval/task_group_utils/question_answering.py +24 -13
- euroeval/task_group_utils/sequence_classification.py +5 -4
- euroeval/task_group_utils/text_to_text.py +2 -1
- euroeval/task_group_utils/token_classification.py +11 -8
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +19 -10
- euroeval/types.py +10 -9
- euroeval/utils.py +6 -3
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.4.0.dist-info/RECORD +0 -75
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED

@@ -51,7 +51,13 @@ import importlib.metadata  # noqa: E402
 from dotenv import load_dotenv  # noqa: E402

 from .benchmarker import Benchmarker  # noqa: E402
+from .data_models import DatasetConfig  # noqa: E402
 from .logging_utils import block_terminal_output  # noqa: E402
+from .tasks import (  # noqa: E402
+    MULTIPLE_CHOICE,
+    TEXT_CLASSIFICATION,
+    TOKEN_CLASSIFICATION,
+)

 # Block unwanted terminal outputs. This blocks way more than the above, but since it
 # relies on importing from the `utils` module, external modules are already imported
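The imports added above indicate that `DatasetConfig` and the task constants (`MULTIPLE_CHOICE`, `TEXT_CLASSIFICATION`, `TOKEN_CLASSIFICATION`) are now pulled into the package's top level. A minimal usage sketch, assuming these names are re-exported by `euroeval` (the model ID below is a placeholder, not taken from this diff):

# Minimal sketch: select benchmarks by task object rather than by task name string.
# Assumes the names imported in __init__.py above are re-exported by `euroeval`;
# the model ID is a placeholder.
from euroeval import Benchmarker, MULTIPLE_CHOICE

benchmarker = Benchmarker(task=MULTIPLE_CHOICE, language="da")
results = benchmarker.benchmark(model="example-org/example-model")

This matches the widened `Benchmarker` signatures further down in this diff, where `task` accepts `str | Task | c.Sequence[str | Task] | None`.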
euroeval/benchmark_config_factory.py
CHANGED

@@ -1,19 +1,20 @@
 """Factory class for creating dataset configurations."""

+import collections.abc as c
 import sys
 import typing as t

 import torch

-from .data_models import BenchmarkConfig, BenchmarkConfigParams
+from .data_models import BenchmarkConfig, BenchmarkConfigParams, DatasetConfig, Task
 from .dataset_configs import get_all_dataset_configs
 from .enums import Device
 from .exceptions import InvalidBenchmark
 from .languages import get_all_languages
-from .tasks import …
+from .tasks import get_all_tasks

 if t.TYPE_CHECKING:
-    from .data_models import Language
+    from .data_models import Language


 def build_benchmark_config(

@@ -40,7 +41,7 @@ def build_benchmark_config(
         default_language_codes=language_codes,
     )

-    …
+    dataset_configs = prepare_dataset_configs(
         task=benchmark_config_params.task,
         dataset=benchmark_config_params.dataset,
         dataset_languages=dataset_languages,

@@ -49,8 +50,7 @@ def build_benchmark_config(
     return BenchmarkConfig(
         model_languages=model_languages,
         dataset_languages=dataset_languages,
-        …
-        datasets=datasets,
+        datasets=dataset_configs,
         batch_size=benchmark_config_params.batch_size,
         raise_errors=benchmark_config_params.raise_errors,
         cache_dir=benchmark_config_params.cache_dir,

@@ -80,7 +80,9 @@ def build_benchmark_config(
     )


-def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
+def get_correct_language_codes(
+    language_codes: str | c.Sequence[str],
+) -> c.Sequence[str]:
     """Get correct language code(s).

     Args:

@@ -101,7 +103,7 @@ def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
     elif isinstance(language_codes, str):
         languages = [language_codes]
     else:
-        languages = language_codes
+        languages = list(language_codes)

     # If `languages` contains 'no' then also include 'nb' and 'nn'. Conversely, if
     # either 'nb' or 'nn' are specified then also include 'no'.

@@ -114,8 +116,9 @@ def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:


 def prepare_languages(
-    language_codes: str | …
-    …
+    language_codes: str | c.Sequence[str] | None,
+    default_language_codes: c.Sequence[str],
+) -> c.Sequence["Language"]:
     """Prepare language(s) for benchmarking.

     Args:

@@ -133,7 +136,7 @@ def prepare_languages(
     language_mapping = get_all_languages()

     # Create the list `languages_str` of language codes to use for models or datasets
-    languages_str: …
+    languages_str: c.Sequence[str]
     if language_codes is None:
         languages_str = default_language_codes
     elif isinstance(language_codes, str):

@@ -150,12 +153,12 @@ def prepare_languages(
     return prepared_languages


-def prepare_tasks_and_datasets(
-    task: str | …
-    dataset_languages: …
-    dataset: str | …
-) -> …
-    """Prepare …
+def prepare_dataset_configs(
+    task: "str | Task | c.Sequence[str | Task] | None",
+    dataset_languages: c.Sequence["Language"],
+    dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None",
+) -> c.Sequence["DatasetConfig"]:
+    """Prepare dataset config(s) for benchmarking.

     Args:
         task:

@@ -168,56 +171,58 @@ def prepare_tasks_and_datasets(
             included, limited by the `task` and `dataset_languages` parameters.

     Returns:
-        The prepared …
+        The prepared dataset configs.

     Raises:
         InvalidBenchmark:
             If the task or dataset is not found in the benchmark tasks or datasets.
     """
-    # Create a dictionary that maps benchmark tasks to their associated benchmark
-    # task objects, and a dictionary that maps dataset names to their associated
-    # dataset configuration objects
-    task_mapping = get_all_tasks()
-    all_dataset_configs = get_all_dataset_configs()
-
     # Create the list of dataset tasks
+    task_mapping = get_all_tasks()
     try:
         if task is None:
-            tasks = …
+            tasks = None
         elif isinstance(task, str):
             tasks = [task_mapping[task]]
+        elif isinstance(task, Task):
+            tasks = [task]
         else:
-            tasks = [task_mapping[t] for t in task]
+            tasks = [task_mapping[t] if isinstance(t, str) else t for t in task]
     except KeyError as e:
         raise InvalidBenchmark(f"Task {e} not found in the benchmark tasks.") from e

-    …
-    …
-    …
+    # Create the list of dataset configs
+    all_dataset_configs = get_all_dataset_configs()
+    all_official_dataset_configs: c.Sequence[DatasetConfig] = [
+        dataset_config
+        for dataset_config in all_dataset_configs.values()
        if not dataset_config.unofficial
    ]
-    …
-    dataset …
-    …
-    dataset …
-    …
-    …
-    …
-    …
+    try:
+        if dataset is None:
+            datasets = all_official_dataset_configs
+        elif isinstance(dataset, str):
+            datasets = [all_dataset_configs[dataset]]
+        elif isinstance(dataset, DatasetConfig):
+            datasets = [dataset]
+        else:
+            datasets = [
+                all_dataset_configs[d] if isinstance(d, str) else d for d in dataset
+            ]
+    except KeyError as e:
        raise InvalidBenchmark(
-            f"Dataset …
-            …
-        )
+            f"Dataset {e} not found in the benchmark datasets."
+        ) from e

+    # Filter the dataset configs based on the specified tasks and languages
    datasets = [
-        …
-        for …
-        if …
-        and …
-        and set(dataset_config.languages).intersection(dataset_languages)
+        ds
+        for ds in datasets
+        if (tasks is None or ds.task in tasks)
+        and any(lang in dataset_languages for lang in ds.languages)
    ]

-    return …
+    return datasets


 def prepare_device(device: Device | None) -> torch.device:
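Read on its own, the new `prepare_dataset_configs` keeps a dataset when its task matches one of the requested tasks (or no task filter is given) and it shares at least one language with the requested dataset languages. A condensed, self-contained sketch of that filter follows; `SimpleConfig` is a stand-in for euroeval's `DatasetConfig`, and the example entries are illustrative rather than taken from the package:

# Standalone sketch of the filtering step above; SimpleConfig is a stand-in
# for euroeval's DatasetConfig, and the example entries are illustrative.
from dataclasses import dataclass


@dataclass(frozen=True)
class SimpleConfig:
    name: str
    task: str
    languages: tuple[str, ...]


def filter_configs(configs, tasks, dataset_languages):
    """Keep configs whose task is requested and that share a language."""
    return [
        cfg
        for cfg in configs
        if (tasks is None or cfg.task in tasks)
        and any(lang in dataset_languages for lang in cfg.languages)
    ]


configs = [
    SimpleConfig("dataset-a", "sentiment-classification", ("da",)),
    SimpleConfig("dataset-b", "linguistic-acceptability", ("sv",)),
]
print(filter_configs(configs, tasks=None, dataset_languages=["da"]))
# -> only the Danish config is kept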
euroeval/benchmark_modules/base.py
CHANGED

@@ -52,7 +52,7 @@ class BenchmarkModule(ABC):
     fresh_model: bool
     batching_preference: "BatchingPreference"
     high_priority: bool
-    allowed_params: dict[re.Pattern, …
+    allowed_params: dict[re.Pattern, c.Sequence[str]] = {re.compile(r".*"): []}

     def __init__(
         self,

@@ -83,11 +83,12 @@ class BenchmarkModule(ABC):

     def _log_metadata(self) -> None:
         """Log the metadata of the model."""
+        model_id = self.model_config.model_id
         logging_msg: str = " ↳ "
         if self.num_params < 0:
-            logging_msg += "The model has an unknown number of parameters, "
+            logging_msg += f"The model {model_id} has an unknown number of parameters, "
         else:
-            logging_msg += f"The model has {self.num_params:,} parameters, "
+            logging_msg += f"The model {model_id} has {self.num_params:,} parameters, "
         if self.vocab_size < 0:
             logging_msg += "an unknown vocabulary size, "
         else:

@@ -166,7 +167,7 @@ class BenchmarkModule(ABC):

     @property
     @abstractmethod
-    def data_collator(self) -> c.Callable[[…
+    def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.

         Returns:

@@ -240,7 +241,7 @@ class BenchmarkModule(ABC):

     def prepare_datasets(
         self, datasets: list[DatasetDict], task: "Task"
-    ) -> …
+    ) -> c.Sequence[DatasetDict]:
         """Prepare the datasets for the model.

         This includes things like tokenisation.
euroeval/benchmark_modules/hf.py
CHANGED

@@ -267,7 +267,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         return model_max_length

     @property
-    def data_collator(self) -> c.Callable[[…
+    def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.

         Returns:

@@ -775,15 +775,8 @@ def get_model_repo_info(
             level=logging.DEBUG,
         )
         return None
-    except (RepositoryNotFoundError, HFValidationError):
+    except (RepositoryNotFoundError, HFValidationError, HfHubHTTPError):
         return None
-    except HfHubHTTPError as e:
-        if "unauthorized" in str(e).lower():
-            raise InvalidModel(
-                "It seems like your specified Hugging Face API key is invalid. "
-                "Please double-check your API key."
-            ) from e
-        raise InvalidModel(str(e)) from e
     except (OSError, RequestException) as e:
         if internet_connection_available():
             errors.append(e)
euroeval/benchmark_modules/litellm.py
CHANGED

@@ -310,7 +310,7 @@ class LiteLLMModel(BenchmarkModule):
             InvalidBenchmark:
                 If the inputs do not contain either 'messages' or 'text' keys.
         """
-        model_inputs: …
+        model_inputs: c.Sequence[c.Sequence[litellm.AllMessageValues] | str]
         if "messages" in inputs:
             model_inputs = inputs["messages"]
         elif "text" in inputs:

@@ -331,9 +331,9 @@ class LiteLLMModel(BenchmarkModule):
            )

        all_responses: dict[int, "ModelResponse"] = {}
-        inputs_to_run: …
-            …
-        )
+        inputs_to_run: c.Sequence[
+            tuple[int, c.Sequence[litellm.AllMessageValues] | str]
+        ] = list(enumerate(model_inputs))
        for attempt in range(num_attempts := 10):
            if not inputs_to_run:
                break

@@ -540,7 +540,7 @@ class LiteLLMModel(BenchmarkModule):
            )
            ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
            keys_and_their_types = {
-                tag_name: (…
+                tag_name: (c.Sequence[str], ...) for tag_name in ner_tag_names
            }
            pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
            generation_kwargs["response_format"] = pydantic_class

@@ -686,9 +686,11 @@ class LiteLLMModel(BenchmarkModule):
    async def _generate_async(
        self,
        model_id: str,
-        inputs: …
+        inputs: c.Sequence[c.Sequence[litellm.AllMessageValues] | str],
        **generation_kwargs,
-    ) -> tuple[…
+    ) -> tuple[
+        c.Sequence[tuple[int, "ModelResponse"]], c.Sequence[tuple[int, Exception]]
+    ]:
        """Generate outputs from the model asynchronously.

        Args:

@@ -789,7 +791,7 @@ class LiteLLMModel(BenchmarkModule):

    @staticmethod
    def _create_model_output(
-        model_responses: …
+        model_responses: c.Sequence["ModelResponse"], model_id: str
    ) -> GenerativeModelOutput:
        """Create a GenerativeModelOutput object from a list of ModelResponse objects.

@@ -863,7 +865,7 @@ class LiteLLMModel(BenchmarkModule):
                )
                continue

-            logprobs_list: …
+            logprobs_list: c.Sequence[c.Sequence[tuple[str, float]]]
            if isinstance(logprobs_obj, ChoiceLogprobs):
                logprobs_list = [
                    [

@@ -1159,7 +1161,7 @@ class LiteLLMModel(BenchmarkModule):
        return -1

    @property
-    def data_collator(self) -> c.Callable[[…
+    def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
        """The data collator used to prepare samples during finetuning.

        Returns:

@@ -1545,7 +1547,7 @@ class LiteLLMModel(BenchmarkModule):
        # First attempt is a test run with a single conversation to handle errors
        # quickly. We repeat this multiple times to deal with different types of
        # errors, and stop if we get a successful response.
-        test_input: …
+        test_input: c.Sequence[litellm.AllMessageValues] | str
        if self.generative_type == GenerativeType.BASE:
            test_input = "Test message"
        else:

@@ -1604,7 +1606,7 @@ def try_download_ollama_model(model_id: str) -> bool:
        )

    try:
-        downloaded_ollama_models: …
+        downloaded_ollama_models: c.Sequence[str] = [
            model_obj.model
            for model_obj in ollama.list().models
            if model_obj.model is not None
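The NER hunk above (@@ -540) keeps the existing pattern of building the structured response format dynamically with pydantic's `create_model`, one required list-of-strings field per NER tag. A small, runnable sketch of that pattern follows; the tag names are illustrative, not necessarily the ones EuroEval uses for any given language:

# Sketch of the dynamic response-format pattern in the hunk above: pydantic's
# create_model builds a schema with one required list-of-strings field per
# NER tag. The tag names here are illustrative.
from pydantic import create_model

ner_tag_names = ["person", "location", "organisation", "miscellaneous"]
keys_and_their_types = {tag: (list[str], ...) for tag in ner_tag_names}
AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)

print(AnswerFormat(person=["Ada"], location=[], organisation=[], miscellaneous=[]))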
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -416,12 +416,18 @@ class VLLMModel(HuggingFaceEncoderModel):
                json=structured_generation_schema
            )
        elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
-            …
-            …
-            …
-            …
+            choice_labels = [
+                self.dataset_config.prompt_label_mapping[label]
+                for label in self.dataset_config.labels
+            ]
+            if "first_label_token_mapping" in self.buffer and isinstance(
+                self.buffer["first_label_token_mapping"], dict
+            ):
+                choice_labels = [
+                    self.buffer["first_label_token_mapping"][label]
+                    for label in choice_labels
                ]
-            )
+            structured_outputs = StructuredOutputsParams(choice=choice_labels)
            log_once(
                "Using structured generation with the choices: "
                f"{structured_outputs.choice!r}.",

@@ -452,7 +458,7 @@ class VLLMModel(HuggingFaceEncoderModel):

        # If any of the prompts are empty then we need to replace them with a BOS token
        # so that the vLLM model can generate from them
-        prompts: …
+        prompts: c.Sequence[str] = inputs["text"]
        if any(len(prompt) == 0 for prompt in prompts):
            log("Found empty prompts, replacing with BOS token.", level=logging.DEBUG)
            prompts = [

@@ -556,13 +562,14 @@ class VLLMModel(HuggingFaceEncoderModel):
        )

        # Parse the raw model outputs
-        completion_ids: …
+        completion_ids: c.Sequence[c.Sequence[int]] = [
            list(output.outputs[0].token_ids) for output in raw_outputs
        ]
        completions = self._tokeniser.batch_decode(
            sequences=[
                torch.LongTensor(completion_id) for completion_id in completion_ids
-            ]
+            ],
+            skip_special_tokens=True,
        )
        if (
            self.end_of_reasoning_token is not None

@@ -608,7 +615,7 @@ class VLLMModel(HuggingFaceEncoderModel):

        # Add logprobs scores to the output
        if self.buffer["first_label_token_mapping"]:
-            scores: …
+            scores: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]] = [
                [
                    [
                        (obj.decoded_token or "", obj.logprob)

@@ -719,7 +726,7 @@ class VLLMModel(HuggingFaceEncoderModel):
        return model_config

    @property
-    def data_collator(self) -> c.Callable[[…
+    def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
        """The data collator used to prepare samples during finetuning.

        Returns:
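The @@ -556 hunk above adds `skip_special_tokens=True` to the tokeniser's `batch_decode` call when decoding vLLM completions. A small sketch of the effect of that flag, using the standard Hugging Face tokenizer API (the model name is illustrative):

# Sketch of what skip_special_tokens=True does when decoding token IDs with a
# Hugging Face tokenizer (the "gpt2" tokenizer is only used for illustration).
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
ids = tok("Hello world").input_ids + [tok.eos_token_id]
print(tok.batch_decode([ids], skip_special_tokens=False))  # keeps e.g. <|endoftext|>
print(tok.batch_decode([ids], skip_special_tokens=True))   # drops special tokens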
euroeval/benchmarker.py
CHANGED

@@ -1,5 +1,6 @@
 """Class that benchmarks language models."""

+import collections.abc as c
 import contextlib
 import datetime as dt
 import json

@@ -38,7 +39,7 @@ from .utils import (

 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
-    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
+    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig, Task


 class Benchmarker:

@@ -62,11 +63,11 @@ class Benchmarker:
        self,
        progress_bar: bool = True,
        save_results: bool = True,
-        task: str | …
-        dataset: …
-        language: str | …
-        model_language: str | …
-        dataset_language: str | …
+        task: "str | Task | c.Sequence[str | Task] | None" = None,
+        dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None" = None,
+        language: str | c.Sequence[str] = "all",
+        model_language: str | c.Sequence[str] | None = None,
+        dataset_language: str | c.Sequence[str] | None = None,
        device: Device | None = None,
        batch_size: int = 32,
        raise_errors: bool = False,

@@ -176,6 +177,8 @@ class Benchmarker:
            ValueError:
                If both `task` and `dataset` are specified, or if `download_only`
                is True and we have no internet connection.
+            ImportError:
+                If `hf_transfer` is enabled but not installed.
        """
        if task is not None and dataset is not None:
            raise ValueError("Only one of `task` and `dataset` can be specified.")

@@ -236,13 +239,13 @@ class Benchmarker:
        )

        # Initialise variable storing model lists, so we only have to fetch it once
-        self._model_lists: dict[str, …
+        self._model_lists: dict[str, c.Sequence[str]] | None = None

        self.results_path = Path.cwd() / "euroeval_benchmark_results.jsonl"
        adjust_logging_level(verbose=self.benchmark_config.verbose)

    @property
-    def benchmark_results(self) -> …
+    def benchmark_results(self) -> c.Sequence[BenchmarkResult]:
        """The benchmark results.

        Returns:

@@ -320,14 +323,14 @@ class Benchmarker:

    def benchmark(
        self,
-        model: …
-        task: str | …
-        dataset: …
+        model: c.Sequence[str] | str,
+        task: "str | Task | c.Sequence[str | Task] | None" = None,
+        dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None" = None,
        progress_bar: bool | None = None,
        save_results: bool | None = None,
-        language: str | …
-        model_language: str | …
-        dataset_language: str | …
+        language: str | c.Sequence[str] | None = None,
+        model_language: str | c.Sequence[str] | None = None,
+        dataset_language: str | c.Sequence[str] | None = None,
        device: Device | None = None,
        batch_size: int | None = None,
        raise_errors: bool | None = None,

@@ -347,7 +350,7 @@ class Benchmarker:
        force: bool | None = None,
        verbose: bool | None = None,
        debug: bool | None = None,
-    ) -> …
+    ) -> c.Sequence[BenchmarkResult]:
        """Benchmarks models on datasets.

        Args:

@@ -605,9 +608,7 @@ class Benchmarker:
            clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)

        model_ids = self._prepare_model_ids(model_id=model)
-        dataset_configs = …
-            dataset_names=benchmark_config.datasets
-        )
+        dataset_configs = benchmark_config.datasets

        # Get all the model configs
        model_configs: list[ModelConfig] = list()

@@ -625,27 +626,40 @@ class Benchmarker:
                log(e.message, level=logging.ERROR)

        # Create a dictionary that takes each model config to the dataset configs that
-        # we need to benchmark the model on.
-        # …
-        …
-        …
+        # we need to benchmark the model on. We initially include all the relevant
+        # datasets for each model.
+        model_config_to_dataset_configs: dict[
+            ModelConfig, c.Sequence[DatasetConfig]
+        ] = {
            model_config: [
                dataset_config
                for dataset_config in dataset_configs
-                if …
-                    benchmark_config.force
-                    or not model_has_been_benchmarked(
-                        model_config=model_config,
-                        dataset_config=dataset_config,
-                        benchmark_config=benchmark_config,
-                        benchmark_results=self.benchmark_results,
-                    )
-                )
-                and model_config.model_type in dataset_config.allowed_model_types
+                if model_config.model_type in dataset_config.allowed_model_types
            ]
            for model_config in model_configs
        }

+        # Initialise the current benchmark results with all the ones that we have cached
+        # on disk already (can be none), and remove those datasets from the mapping
+        current_benchmark_results: list[BenchmarkResult] = list()
+        for (
+            model_config,
+            model_dataset_configs,
+        ) in model_config_to_dataset_configs.items():
+            new_model_dataset_configs: list[DatasetConfig] = list()
+            for dataset_config in model_dataset_configs:
+                benchmark_record = get_record(
+                    model_config=model_config,
+                    dataset_config=dataset_config,
+                    benchmark_config=benchmark_config,
+                    benchmark_results=self.benchmark_results,
+                )
+                if benchmark_record is not None and not benchmark_config.force:
+                    current_benchmark_results.append(benchmark_record)
+                else:
+                    new_model_dataset_configs.append(dataset_config)
+            model_config_to_dataset_configs[model_config] = new_model_dataset_configs
+
        total_benchmarks = sum(
            len(dataset_configs)
            for dataset_configs in model_config_to_dataset_configs.values()

@@ -656,10 +670,9 @@ class Benchmarker:
                "benchmarked on all the selected datasets.",
                level=logging.INFO,
            )
-            return …
+            return current_benchmark_results

        num_finished_benchmarks = 0
-        current_benchmark_results: list[BenchmarkResult] = list()
        benchmark_params_to_revert: dict[str, t.Any] = dict()
        for model_config in model_configs:
            if not model_config_to_dataset_configs[model_config]:

@@ -809,7 +822,9 @@ class Benchmarker:
        if benchmark_config.clear_model_cache:
            clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)

-        log(…
+        log(
+            f"\nCompleted {num_finished_benchmarks:,} benchmarks.\n", level=logging.INFO
+        )

        # This avoids the following warning at the end of the benchmarking:
        # Warning: WARNING: process group has NOT been destroyed before we destruct

@@ -823,7 +838,7 @@ class Benchmarker:
            destroy_process_group()
        return current_benchmark_results

-    def _prepare_model_ids(self, model_id: …
+    def _prepare_model_ids(self, model_id: c.Sequence[str] | str) -> c.Sequence[str]:
        """Prepare the model ID(s) to be benchmarked.

        Args:

@@ -1020,13 +1035,13 @@ class Benchmarker:
        return self.benchmark(*args, **kwds)


-def model_has_been_benchmarked(
+def get_record(
    model_config: "ModelConfig",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
-    benchmark_results: …
-) -> …
-    """…
+    benchmark_results: c.Sequence[BenchmarkResult],
+) -> BenchmarkResult | None:
+    """Get the benchmark record for a given model and dataset.

    Args:
        model_config:

@@ -1039,7 +1054,7 @@ def model_has_been_benchmarked(
            The benchmark results.

    Returns:
-        …
+        The benchmark record, or None if no such record exists.
    """
    for record in benchmark_results:
        model_id_components = split_model_id(model_id=record.model)

@@ -1064,8 +1079,8 @@ def model_has_been_benchmarked(
            and same_split
            and same_num_shots
        ):
-            return …
-    return …
+            return record
+    return None


 def clear_model_cache_fn(cache_dir: str) -> None:

@@ -1086,7 +1101,9 @@ def clear_model_cache_fn(cache_dir: str) -> None:
                rmtree(sub_model_dir)


-def prepare_dataset_configs(…
+def prepare_dataset_configs(
+    dataset_names: c.Sequence[str],
+) -> c.Sequence["DatasetConfig"]:
    """Prepare the dataset configuration(s) to be benchmarked.

    Args:
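With the widened signatures above, `Benchmarker.benchmark` accepts `Task` and `DatasetConfig` objects as well as name strings. A hedged sketch of the object-based call, looking configs up via `get_all_dataset_configs` as the factory module does; the dataset key and model ID are placeholders, not taken from this diff:

# Hedged sketch: pass DatasetConfig objects instead of dataset name strings.
# The dataset key and model ID below are placeholders.
from euroeval import Benchmarker
from euroeval.dataset_configs import get_all_dataset_configs

configs = get_all_dataset_configs()      # mapping of dataset name -> DatasetConfig
chosen = [configs["some-dataset-name"]]  # placeholder key

benchmarker = Benchmarker(language="da")
results = benchmarker.benchmark(model="example-org/example-model", dataset=chosen)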
euroeval/caching_utils.py
CHANGED

@@ -54,7 +54,7 @@ def cache_arguments(
            key = args + tuple(kwargs[k] for k in sorted(kwargs.keys()))
        else:
            func_params = func.__code__.co_varnames
-            key_items: list[t.Any] = …
+            key_items: list[t.Any] = list()
            for arg_name in arguments:
                if arg_name in kwargs:
                    key_items.append(kwargs[arg_name])