EuroEval 16.4.0__py3-none-any.whl → 16.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic. Click here for more details.

Files changed (71)
  1. euroeval/__init__.py +6 -0
  2. euroeval/benchmark_config_factory.py +51 -46
  3. euroeval/benchmark_modules/base.py +6 -5
  4. euroeval/benchmark_modules/hf.py +2 -9
  5. euroeval/benchmark_modules/litellm.py +14 -12
  6. euroeval/benchmark_modules/vllm.py +17 -10
  7. euroeval/benchmarker.py +61 -44
  8. euroeval/caching_utils.py +1 -1
  9. euroeval/cli.py +86 -8
  10. euroeval/constants.py +3 -0
  11. euroeval/data_loading.py +78 -30
  12. euroeval/data_models.py +326 -326
  13. euroeval/dataset_configs/__init__.py +10 -3
  14. euroeval/dataset_configs/bulgarian.py +56 -0
  15. euroeval/dataset_configs/czech.py +25 -29
  16. euroeval/dataset_configs/danish.py +51 -88
  17. euroeval/dataset_configs/dutch.py +48 -86
  18. euroeval/dataset_configs/english.py +45 -76
  19. euroeval/dataset_configs/estonian.py +36 -38
  20. euroeval/dataset_configs/faroese.py +19 -60
  21. euroeval/dataset_configs/finnish.py +36 -68
  22. euroeval/dataset_configs/french.py +39 -74
  23. euroeval/dataset_configs/german.py +45 -81
  24. euroeval/dataset_configs/greek.py +64 -0
  25. euroeval/dataset_configs/icelandic.py +54 -91
  26. euroeval/dataset_configs/italian.py +42 -78
  27. euroeval/dataset_configs/latvian.py +28 -34
  28. euroeval/dataset_configs/lithuanian.py +22 -26
  29. euroeval/dataset_configs/norwegian.py +72 -114
  30. euroeval/dataset_configs/polish.py +33 -60
  31. euroeval/dataset_configs/portuguese.py +33 -65
  32. euroeval/dataset_configs/serbian.py +64 -0
  33. euroeval/dataset_configs/slovak.py +19 -24
  34. euroeval/dataset_configs/spanish.py +42 -76
  35. euroeval/dataset_configs/swedish.py +48 -84
  36. euroeval/dataset_configs/ukrainian.py +64 -0
  37. euroeval/exceptions.py +1 -1
  38. euroeval/finetuning.py +3 -2
  39. euroeval/generation.py +5 -4
  40. euroeval/generation_utils.py +6 -5
  41. euroeval/languages.py +395 -323
  42. euroeval/metrics/huggingface.py +14 -3
  43. euroeval/metrics/llm_as_a_judge.py +1 -1
  44. euroeval/model_cache.py +6 -5
  45. euroeval/model_loading.py +1 -1
  46. euroeval/prompt_templates/__init__.py +2 -0
  47. euroeval/prompt_templates/classification.py +206 -0
  48. euroeval/prompt_templates/linguistic_acceptability.py +82 -43
  49. euroeval/prompt_templates/multiple_choice.py +81 -41
  50. euroeval/prompt_templates/named_entity_recognition.py +125 -44
  51. euroeval/prompt_templates/reading_comprehension.py +92 -43
  52. euroeval/prompt_templates/sentiment_classification.py +91 -43
  53. euroeval/prompt_templates/summarization.py +64 -39
  54. euroeval/prompt_templates/token_classification.py +279 -0
  55. euroeval/scores.py +4 -3
  56. euroeval/speed_benchmark.py +2 -1
  57. euroeval/task_group_utils/multiple_choice_classification.py +2 -1
  58. euroeval/task_group_utils/question_answering.py +24 -13
  59. euroeval/task_group_utils/sequence_classification.py +5 -4
  60. euroeval/task_group_utils/text_to_text.py +2 -1
  61. euroeval/task_group_utils/token_classification.py +11 -8
  62. euroeval/tasks.py +44 -1
  63. euroeval/tokenisation_utils.py +19 -10
  64. euroeval/types.py +10 -9
  65. euroeval/utils.py +6 -3
  66. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
  67. euroeval-16.5.0.dist-info/RECORD +81 -0
  68. euroeval-16.4.0.dist-info/RECORD +0 -75
  69. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
  70. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
  71. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/cli.py CHANGED
@@ -1,12 +1,18 @@
1
1
  """Command-line interface for benchmarking."""
2
2
 
3
+ import collections.abc as c
4
+ import importlib.util
5
+ import logging
6
+ from pathlib import Path
7
+
3
8
  import click
4
9
 
5
10
  from .benchmarker import Benchmarker
11
+ from .data_models import DatasetConfig, Task
6
12
  from .dataset_configs import get_all_dataset_configs
7
13
  from .enums import Device, GenerativeType
8
14
  from .languages import get_all_languages
9
- from .tasks import get_all_tasks
15
+ from .logging_utils import log
10
16
 
11
17
 
12
18
  @click.command()
@@ -23,7 +29,6 @@ from .tasks import get_all_tasks
23
29
  default=None,
24
30
  show_default=True,
25
31
  multiple=True,
26
- type=click.Choice(list(get_all_tasks().keys())),
27
32
  help="The dataset tasks to benchmark the model(s) on.",
28
33
  )
29
34
  @click.option(
@@ -65,7 +70,6 @@ from .tasks import get_all_tasks
65
70
  default=None,
66
71
  show_default=True,
67
72
  multiple=True,
68
- type=click.Choice(list(get_all_dataset_configs().keys())),
69
73
  help="""The name of the benchmark dataset. We recommend to use the `task` and
70
74
  `language` options instead of this option.""",
71
75
  )
@@ -222,9 +226,17 @@ from .tasks import get_all_tasks
222
226
  help="Only download the requested model weights and datasets, and exit.",
223
227
  default=False,
224
228
  )
229
+ @click.option(
230
+ "--custom-datasets-file",
231
+ type=click.Path(exists=False, dir_okay=False, path_type=Path),
232
+ default="custom_datasets.py",
233
+ show_default=True,
234
+ help="A path to a Python file containing DatasetConfig definitions for custom "
235
+ "datasets.",
236
+ )
225
237
  def benchmark(
226
238
  model: tuple[str],
227
- dataset: tuple[str],
239
+ dataset: tuple[str | DatasetConfig],
228
240
  language: tuple[str],
229
241
  model_language: tuple[str],
230
242
  dataset_language: tuple[str],
@@ -250,26 +262,92 @@ def benchmark(
250
262
  requires_safetensors: bool,
251
263
  generative_type: str | None,
252
264
  download_only: bool,
265
+ custom_datasets_file: Path,
253
266
  ) -> None:
254
267
  """Benchmark pretrained language models on language tasks."""
255
268
  models = list(model)
256
- datasets = None if len(dataset) == 0 else list(dataset)
269
+ datasets: c.Sequence[str | DatasetConfig] | None = (
270
+ None if len(dataset) == 0 else list(dataset)
271
+ )
257
272
  languages: list[str] = list(language)
258
273
  model_languages = None if len(model_language) == 0 else list(model_language)
259
274
  dataset_languages = None if len(dataset_language) == 0 else list(dataset_language)
260
- tasks = None if len(task) == 0 else list(task)
275
+ tasks: c.Sequence[str | Task] | None = None if len(task) == 0 else list(task)
261
276
  batch_size_int = int(batch_size)
262
277
  device = Device[device.upper()] if device is not None else None
263
278
  generative_type_obj = (
264
279
  GenerativeType[generative_type.upper()] if generative_type else None
265
280
  )
266
281
 
282
+ # Load all defined DatasetConfig and Task objects from the custom datasets file
283
+ if custom_datasets_file.exists():
284
+ # Load the custom module
285
+ spec = importlib.util.spec_from_file_location(
286
+ name="custom_datasets_module", location=str(custom_datasets_file.resolve())
287
+ )
288
+ if spec is None:
289
+ raise RuntimeError(
290
+ "Could not load the spec for the custom datasets file from "
291
+ f"{custom_datasets_file.resolve()}."
292
+ )
293
+ module = importlib.util.module_from_spec(spec=spec)
294
+ if spec.loader is None:
295
+ raise RuntimeError(
296
+ "Could not load the module for the custom datasets file from "
297
+ f"{custom_datasets_file.resolve()}."
298
+ )
299
+ spec.loader.exec_module(module)
300
+
301
+ # Load all the custom dataset configurations from the module
302
+ custom_dataset_configs: list[DatasetConfig] = [
303
+ obj for obj in vars(module).values() if isinstance(obj, DatasetConfig)
304
+ ]
305
+
306
+ # If the user has not specified any datasets or tasks, we just use all the usual
307
+ # datasets as well as all the custom ones that we loaded
308
+ if datasets is None and tasks is None:
309
+ datasets = custom_dataset_configs + list(get_all_dataset_configs().values())
310
+ datasets = [ds for ds in datasets if not ds.unofficial]
311
+
312
+ # If the user has specified only datasets, then we replace the custom dataset
313
+ # names that the user specified (if any) with the corresponding dataset configs
314
+ # that we loaded
315
+ elif datasets is not None and tasks is None:
316
+ dataset_name_to_config = {
317
+ config.name: config for config in custom_dataset_configs
318
+ }
319
+ datasets = [
320
+ dataset_name_to_config.get(ds, ds) if isinstance(ds, str) else ds
321
+ for ds in datasets
322
+ ]
323
+
324
+ # If the user has specified only tasks, then we find all the official usual and
325
+ # custom datasets belonging to that task, and use those. We reset the `tasks`
326
+ # variable as we're using the `datasets` variable directly instead
327
+ elif datasets is None and tasks is not None:
328
+ datasets = custom_dataset_configs + list(get_all_dataset_configs().values())
329
+ datasets = [
330
+ ds for ds in datasets if not ds.unofficial and ds.task.name in tasks
331
+ ]
332
+ tasks = None
333
+
334
+ # Log the loaded custom datasets and tasks
335
+ dataset_str = (
336
+ "the custom dataset"
337
+ if len(custom_dataset_configs) == 1
338
+ else f"{len(custom_dataset_configs):,} custom datasets"
339
+ )
340
+ log(
341
+ f"Loaded {dataset_str} from {custom_datasets_file.as_posix()!r}.\n",
342
+ level=logging.INFO,
343
+ )
344
+
267
345
  benchmarker = Benchmarker(
268
346
  language=languages,
269
347
  model_language=model_languages,
270
348
  dataset_language=dataset_languages,
271
- task=tasks,
272
- dataset=datasets,
349
+ task=tasks, # type: ignore[arg-type]
350
+ dataset=datasets, # type: ignore[arg-type]
273
351
  batch_size=batch_size_int,
274
352
  progress_bar=progress_bar,
275
353
  save_results=save_results,
euroeval/constants.py CHANGED
@@ -90,3 +90,6 @@ JSON_STRIP_CHARACTERS = ' {}\n\r":'
90
90
  # tasks. We also use this to determine whether we should store logprobs in the model
91
91
  # outputs (and cache).
92
92
  NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
93
+
94
+ # We only allow loading local datasets in these file formats
95
+ SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS = ["csv"]
euroeval/data_loading.py CHANGED
@@ -1,5 +1,6 @@
1
1
  """Functions related to the loading of the data."""
2
2
 
3
+ import collections.abc as c
3
4
  import logging
4
5
  import sys
5
6
  import time
@@ -11,6 +12,7 @@ from datasets.exceptions import DatasetsError
11
12
  from huggingface_hub.errors import HfHubHTTPError
12
13
  from numpy.random import Generator
13
14
 
15
+ from .constants import SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS
14
16
  from .exceptions import HuggingFaceHubDown, InvalidBenchmark
15
17
  from .logging_utils import log, no_terminal_output
16
18
  from .tasks import EUROPEAN_VALUES
@@ -64,7 +66,7 @@ def load_data(
64
66
 
65
67
  # Bootstrap the splits, if applicable
66
68
  if dataset_config.bootstrap_samples:
67
- bootstrapped_splits: dict[str, list["Dataset"]] = dict()
69
+ bootstrapped_splits: dict[str, c.Sequence["Dataset"]] = dict()
68
70
  for split in dataset_config.splits:
69
71
  bootstrap_indices = rng.integers(
70
72
  0,
@@ -102,38 +104,84 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
102
104
  Returns:
103
105
  The dataset.
104
106
  """
105
- num_attempts = 5
106
- for _ in range(num_attempts):
107
- try:
108
- with no_terminal_output():
109
- dataset = load_dataset(
110
- path=dataset_config.huggingface_id,
111
- cache_dir=cache_dir,
112
- token=unscramble("XbjeOLhwebEaSaDUMqqaPaPIhgOcyOfDpGnX_"),
107
+ # Case where the dataset source is a Hugging Face ID
108
+ if isinstance(dataset_config.source, str):
109
+ num_attempts = 5
110
+ for _ in range(num_attempts):
111
+ try:
112
+ with no_terminal_output():
113
+ dataset = load_dataset(
114
+ path=dataset_config.source.split("::")[0],
115
+ name=(
116
+ dataset_config.source.split("::")[1]
117
+ if "::" in dataset_config.source
118
+ else None
119
+ ),
120
+ cache_dir=cache_dir,
121
+ token=unscramble("XbjeOLhwebEaSaDUMqqaPaPIhgOcyOfDpGnX_"),
122
+ )
123
+ break
124
+ except (
125
+ FileNotFoundError,
126
+ ConnectionError,
127
+ DatasetsError,
128
+ requests.ConnectionError,
129
+ requests.ReadTimeout,
130
+ ) as e:
131
+ log(
132
+ f"Failed to load dataset {dataset_config.source!r}, due to "
133
+ f"the following error: {e}. Retrying...",
134
+ level=logging.DEBUG,
113
135
  )
114
- break
115
- except (
116
- FileNotFoundError,
117
- ConnectionError,
118
- DatasetsError,
119
- requests.ConnectionError,
120
- requests.ReadTimeout,
121
- ) as e:
122
- log(
123
- f"Failed to load dataset {dataset_config.huggingface_id!r}, due to "
124
- f"the following error: {e}. Retrying...",
125
- level=logging.DEBUG,
136
+ time.sleep(1)
137
+ continue
138
+ except HfHubHTTPError:
139
+ raise HuggingFaceHubDown()
140
+ else:
141
+ raise InvalidBenchmark(
142
+ f"Failed to load dataset {dataset_config.source!r} after "
143
+ f"{num_attempts} attempts. Run with verbose mode to see the individual "
144
+ "errors."
126
145
  )
127
- time.sleep(1)
128
- continue
129
- except HfHubHTTPError:
130
- raise HuggingFaceHubDown()
146
+
147
+ # Case where the dataset source is a dictionary with keys "train", "val" and "test",
148
+ # with the values pointing to local CSV files
131
149
  else:
132
- raise InvalidBenchmark(
133
- f"Failed to load dataset {dataset_config.huggingface_id!r} after "
134
- f"{num_attempts} attempts. Run with verbose mode to see the individual "
135
- "errors."
136
- )
150
+ data_files = {
151
+ split: dataset_config.source[split]
152
+ for split in dataset_config.splits
153
+ if split in dataset_config.source
154
+ }
155
+
156
+ # Get the file extension and ensure that all files have the same extension
157
+ file_extensions = {
158
+ split: dataset_config.source[split].split(".")[-1]
159
+ for split in dataset_config.splits
160
+ if split in dataset_config.source
161
+ }
162
+ if len(set(file_extensions.values())) != 1:
163
+ raise InvalidBenchmark(
164
+ "All data files in a custom dataset must have the same file extension. "
165
+ f"Got the extensions {', '.join(file_extensions.values())} for the "
166
+ f"dataset {dataset_config.name!r}."
167
+ )
168
+ file_extension = list(file_extensions.values())[0]
169
+
170
+ # Check that the file extension is supported
171
+ if file_extension not in SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS:
172
+ raise InvalidBenchmark(
173
+ "Unsupported file extension for custom dataset. Supported file "
174
+ "extensions are "
175
+ f"{', '.join(SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS)}, but got "
176
+ f"{file_extension!r}."
177
+ )
178
+
179
+ # Load the dataset
180
+ with no_terminal_output():
181
+ dataset = load_dataset(
182
+ path=file_extension, data_files=data_files, cache_dir=cache_dir
183
+ )
184
+
137
185
  assert isinstance(dataset, DatasetDict) # type: ignore[used-before-def]
138
186
  missing_keys = [key for key in dataset_config.splits if key not in dataset]
139
187
  if missing_keys: