EuroEval 16.4.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval has been flagged as possibly problematic; see the release advisory for details.
- euroeval/__init__.py +6 -0
- euroeval/benchmark_config_factory.py +51 -46
- euroeval/benchmark_modules/base.py +6 -5
- euroeval/benchmark_modules/hf.py +2 -9
- euroeval/benchmark_modules/litellm.py +14 -12
- euroeval/benchmark_modules/vllm.py +17 -10
- euroeval/benchmarker.py +61 -44
- euroeval/caching_utils.py +1 -1
- euroeval/cli.py +86 -8
- euroeval/constants.py +3 -0
- euroeval/data_loading.py +78 -30
- euroeval/data_models.py +326 -326
- euroeval/dataset_configs/__init__.py +10 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +25 -29
- euroeval/dataset_configs/danish.py +51 -88
- euroeval/dataset_configs/dutch.py +48 -86
- euroeval/dataset_configs/english.py +45 -76
- euroeval/dataset_configs/estonian.py +36 -38
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -68
- euroeval/dataset_configs/french.py +39 -74
- euroeval/dataset_configs/german.py +45 -81
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -78
- euroeval/dataset_configs/latvian.py +28 -34
- euroeval/dataset_configs/lithuanian.py +22 -26
- euroeval/dataset_configs/norwegian.py +72 -114
- euroeval/dataset_configs/polish.py +33 -60
- euroeval/dataset_configs/portuguese.py +33 -65
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +19 -24
- euroeval/dataset_configs/spanish.py +42 -76
- euroeval/dataset_configs/swedish.py +48 -84
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +3 -2
- euroeval/generation.py +5 -4
- euroeval/generation_utils.py +6 -5
- euroeval/languages.py +395 -323
- euroeval/metrics/huggingface.py +14 -3
- euroeval/metrics/llm_as_a_judge.py +1 -1
- euroeval/model_cache.py +6 -5
- euroeval/model_loading.py +1 -1
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +82 -43
- euroeval/prompt_templates/multiple_choice.py +81 -41
- euroeval/prompt_templates/named_entity_recognition.py +125 -44
- euroeval/prompt_templates/reading_comprehension.py +92 -43
- euroeval/prompt_templates/sentiment_classification.py +91 -43
- euroeval/prompt_templates/summarization.py +64 -39
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +4 -3
- euroeval/speed_benchmark.py +2 -1
- euroeval/task_group_utils/multiple_choice_classification.py +2 -1
- euroeval/task_group_utils/question_answering.py +24 -13
- euroeval/task_group_utils/sequence_classification.py +5 -4
- euroeval/task_group_utils/text_to_text.py +2 -1
- euroeval/task_group_utils/token_classification.py +11 -8
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +19 -10
- euroeval/types.py +10 -9
- euroeval/utils.py +6 -3
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.4.0.dist-info/RECORD +0 -75
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/cli.py
CHANGED
@@ -1,12 +1,18 @@
 """Command-line interface for benchmarking."""
 
+import collections.abc as c
+import importlib.util
+import logging
+from pathlib import Path
+
 import click
 
 from .benchmarker import Benchmarker
+from .data_models import DatasetConfig, Task
 from .dataset_configs import get_all_dataset_configs
 from .enums import Device, GenerativeType
 from .languages import get_all_languages
-from .
+from .logging_utils import log
 
 
 @click.command()
@@ -23,7 +29,6 @@ from .tasks import get_all_tasks
     default=None,
     show_default=True,
     multiple=True,
-    type=click.Choice(list(get_all_tasks().keys())),
     help="The dataset tasks to benchmark the model(s) on.",
 )
 @click.option(
@@ -65,7 +70,6 @@ from .tasks import get_all_tasks
     default=None,
     show_default=True,
     multiple=True,
-    type=click.Choice(list(get_all_dataset_configs().keys())),
     help="""The name of the benchmark dataset. We recommend to use the `task` and
     `language` options instead of this option.""",
 )
@@ -222,9 +226,17 @@ from .tasks import get_all_tasks
     help="Only download the requested model weights and datasets, and exit.",
     default=False,
 )
+@click.option(
+    "--custom-datasets-file",
+    type=click.Path(exists=False, dir_okay=False, path_type=Path),
+    default="custom_datasets.py",
+    show_default=True,
+    help="A path to a Python file containing DatasetConfig definitions for custom "
+    "datasets.",
+)
 def benchmark(
     model: tuple[str],
-    dataset: tuple[str],
+    dataset: tuple[str | DatasetConfig],
     language: tuple[str],
     model_language: tuple[str],
     dataset_language: tuple[str],
@@ -250,26 +262,92 @@ def benchmark(
     requires_safetensors: bool,
     generative_type: str | None,
     download_only: bool,
+    custom_datasets_file: Path,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
     models = list(model)
-    datasets = None if len(dataset) == 0 else list(dataset)
+    datasets: c.Sequence[str | DatasetConfig] | None = (
+        None if len(dataset) == 0 else list(dataset)
+    )
     languages: list[str] = list(language)
     model_languages = None if len(model_language) == 0 else list(model_language)
     dataset_languages = None if len(dataset_language) == 0 else list(dataset_language)
-    tasks = None if len(task) == 0 else list(task)
+    tasks: c.Sequence[str | Task] | None = None if len(task) == 0 else list(task)
     batch_size_int = int(batch_size)
     device = Device[device.upper()] if device is not None else None
     generative_type_obj = (
         GenerativeType[generative_type.upper()] if generative_type else None
     )
 
+    # Load all defined DatasetConfig and Task objects from the custom datasets file
+    if custom_datasets_file.exists():
+        # Load the custom module
+        spec = importlib.util.spec_from_file_location(
+            name="custom_datasets_module", location=str(custom_datasets_file.resolve())
+        )
+        if spec is None:
+            raise RuntimeError(
+                "Could not load the spec for the custom datasets file from "
+                f"{custom_datasets_file.resolve()}."
+            )
+        module = importlib.util.module_from_spec(spec=spec)
+        if spec.loader is None:
+            raise RuntimeError(
+                "Could not load the module for the custom datasets file from "
+                f"{custom_datasets_file.resolve()}."
+            )
+        spec.loader.exec_module(module)
+
+        # Load all the custom dataset configurations from the module
+        custom_dataset_configs: list[DatasetConfig] = [
+            obj for obj in vars(module).values() if isinstance(obj, DatasetConfig)
+        ]
+
+        # If the user has not specified any datasets or tasks, we just use all the usual
+        # datasets as well as all the custom ones that we loaded
+        if datasets is None and tasks is None:
+            datasets = custom_dataset_configs + list(get_all_dataset_configs().values())
+            datasets = [ds for ds in datasets if not ds.unofficial]
+
+        # If the user has specified only datasets, then we replace the custom dataset
+        # names that the user specified (if any) with the corresponding dataset configs
+        # that we loaded
+        elif datasets is not None and tasks is None:
+            dataset_name_to_config = {
+                config.name: config for config in custom_dataset_configs
+            }
+            datasets = [
+                dataset_name_to_config.get(ds, ds) if isinstance(ds, str) else ds
+                for ds in datasets
+            ]
+
+        # If the user has specified only tasks, then we find all the official usual and
+        # custom datasets belonging to that task, and use those. We reset the `tasks`
+        # variable as we're using the `datasets` variable directly instead
+        elif datasets is None and tasks is not None:
+            datasets = custom_dataset_configs + list(get_all_dataset_configs().values())
+            datasets = [
+                ds for ds in datasets if not ds.unofficial and ds.task.name in tasks
+            ]
+            tasks = None
+
+        # Log the loaded custom datasets and tasks
+        dataset_str = (
+            "the custom dataset"
+            if len(custom_dataset_configs) == 1
+            else f"{len(custom_dataset_configs):,} custom datasets"
+        )
+        log(
+            f"Loaded {dataset_str} from {custom_datasets_file.as_posix()!r}.\n",
+            level=logging.INFO,
+        )
+
     benchmarker = Benchmarker(
         language=languages,
         model_language=model_languages,
         dataset_language=dataset_languages,
-        task=tasks,
-        dataset=datasets,
+        task=tasks,  # type: ignore[arg-type]
+        dataset=datasets,  # type: ignore[arg-type]
         batch_size=batch_size_int,
         progress_bar=progress_bar,
        save_results=save_results,
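The new `--custom-datasets-file` option imports the given Python file and collects every module-level `DatasetConfig` instance via `vars(module)`. A minimal sketch of such a file follows; only `name`, `source`, and the `task` object (with its `.name`) are evidenced by this diff, so the exact constructor keywords are assumptions to check against `euroeval/data_models.py`:

# custom_datasets.py: picked up automatically, since --custom-datasets-file
# defaults to this filename in the working directory.
from euroeval.data_models import DatasetConfig
from euroeval.tasks import get_all_tasks

# A hypothetical sentiment dataset backed by local CSV files. All splits must
# share one file extension, and only "csv" is currently allowed (see the
# SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS constant below).
my_dataset = DatasetConfig(
    name="my-sentiment-dataset",  # selectable via --dataset my-sentiment-dataset
    source={
        "train": "data/train.csv",
        "val": "data/val.csv",
        "test": "data/test.csv",
    },
    task=get_all_tasks()["sentiment-classification"],  # hypothetical task key
)

With such a file in place, plain `euroeval --model <model-id>` would include the custom dataset alongside the official ones, while `--dataset my-sentiment-dataset` selects it by name.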
euroeval/constants.py
CHANGED
@@ -90,3 +90,6 @@ JSON_STRIP_CHARACTERS = ' {}\n\r":'
 # tasks. We also use this to determine whether we should store logprobs in the model
 # outputs (and cache).
 NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
+
+# We only allow loading local datasets in these file formats
+SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS = ["csv"]
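This constant is what `load_raw_data` consults before reading local dataset files (see the `data_loading.py` diff below). A small illustration of the check; the helper function here is hypothetical, but the membership test mirrors the loader's:

from euroeval.constants import SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS

def is_supported_local_file(path: str) -> bool:
    # The loader treats everything after the last dot as the file extension
    return path.split(".")[-1] in SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS

assert is_supported_local_file("data/train.csv")        # "csv" is allowed
assert not is_supported_local_file("data/train.jsonl")  # anything else is rejected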
euroeval/data_loading.py
CHANGED
@@ -1,5 +1,6 @@
 """Functions related to the loading of the data."""
 
+import collections.abc as c
 import logging
 import sys
 import time
@@ -11,6 +12,7 @@ from datasets.exceptions import DatasetsError
 from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator
 
+from .constants import SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
 from .logging_utils import log, no_terminal_output
 from .tasks import EUROPEAN_VALUES
@@ -64,7 +66,7 @@ def load_data(
 
     # Bootstrap the splits, if applicable
     if dataset_config.bootstrap_samples:
-        bootstrapped_splits: dict[str,
+        bootstrapped_splits: dict[str, c.Sequence["Dataset"]] = dict()
         for split in dataset_config.splits:
             bootstrap_indices = rng.integers(
                 0,
@@ -102,38 +104,84 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
     Returns:
         The dataset.
     """
-    [old lines 105-112 not captured in this diff view]
+    # Case where the dataset source is a Hugging Face ID
+    if isinstance(dataset_config.source, str):
+        num_attempts = 5
+        for _ in range(num_attempts):
+            try:
+                with no_terminal_output():
+                    dataset = load_dataset(
+                        path=dataset_config.source.split("::")[0],
+                        name=(
+                            dataset_config.source.split("::")[1]
+                            if "::" in dataset_config.source
+                            else None
+                        ),
+                        cache_dir=cache_dir,
+                        token=unscramble("XbjeOLhwebEaSaDUMqqaPaPIhgOcyOfDpGnX_"),
+                    )
+                break
+            except (
+                FileNotFoundError,
+                ConnectionError,
+                DatasetsError,
+                requests.ConnectionError,
+                requests.ReadTimeout,
+            ) as e:
+                log(
+                    f"Failed to load dataset {dataset_config.source!r}, due to "
+                    f"the following error: {e}. Retrying...",
+                    level=logging.DEBUG,
                 )
-    [old lines 114-122 not captured in this diff view]
-                    f"Failed to load dataset {dataset_config.huggingface_id!r}, due to "
-                    f"the following error: {e}. Retrying...",
-                    level=logging.DEBUG,
+                time.sleep(1)
+                continue
+            except HfHubHTTPError:
+                raise HuggingFaceHubDown()
+        else:
+            raise InvalidBenchmark(
+                f"Failed to load dataset {dataset_config.source!r} after "
+                f"{num_attempts} attempts. Run with verbose mode to see the individual "
+                "errors."
             )
-    [old lines 127-129 not captured in this diff view]
-        raise HuggingFaceHubDown()
+
+    # Case where the dataset source is a dictionary with keys "train", "val" and "test",
+    # with the values pointing to local CSV files
     else:
-    [old lines 132-136 not captured in this diff view]
+        data_files = {
+            split: dataset_config.source[split]
+            for split in dataset_config.splits
+            if split in dataset_config.source
+        }
+
+        # Get the file extension and ensure that all files have the same extension
+        file_extensions = {
+            split: dataset_config.source[split].split(".")[-1]
+            for split in dataset_config.splits
+            if split in dataset_config.source
+        }
+        if len(set(file_extensions.values())) != 1:
+            raise InvalidBenchmark(
+                "All data files in a custom dataset must have the same file extension. "
+                f"Got the extensions {', '.join(file_extensions.values())} for the "
+                f"dataset {dataset_config.name!r}."
+            )
+        file_extension = list(file_extensions.values())[0]
+
+        # Check that the file extension is supported
+        if file_extension not in SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS:
+            raise InvalidBenchmark(
+                "Unsupported file extension for custom dataset. Supported file "
+                "extensions are "
+                f"{', '.join(SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS)}, but got "
+                f"{file_extension!r}."
+            )
+
+        # Load the dataset
+        with no_terminal_output():
+            dataset = load_dataset(
+                path=file_extension, data_files=data_files, cache_dir=cache_dir
+            )
+
     assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
     missing_keys = [key for key in dataset_config.splits if key not in dataset]
     if missing_keys:
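The rewritten `load_raw_data` dispatches on the type of `dataset_config.source`: a string is treated as a Hugging Face dataset ID, optionally carrying a `::`-separated subset that becomes the `name` argument of `datasets.load_dataset`, while a mapping is treated as local data files keyed by split name. A sketch of the two accepted shapes, with hypothetical IDs and paths:

# Hugging Face source: the part before "::" is the repo ID; the optional part
# after it selects the subset, i.e. load_dataset(path=repo_id, name=subset)
hf_source = "some-org/some-dataset::some-subset"
repo_id, _, subset = hf_source.partition("::")  # "some-org/some-dataset", "some-subset"

# Local source: split name -> file path. All files must share one extension,
# and that extension must be listed in SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS.
local_source = {
    "train": "data/train.csv",
    "val": "data/val.csv",
    "test": "data/test.csv",
}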