EuroEval 15.2.0 (euroeval-15.2.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (40)
  1. euroeval/__init__.py +72 -0
  2. euroeval/benchmark_config_factory.py +358 -0
  3. euroeval/benchmark_modules/__init__.py +7 -0
  4. euroeval/benchmark_modules/base.py +354 -0
  5. euroeval/benchmark_modules/fresh.py +286 -0
  6. euroeval/benchmark_modules/hf.py +1185 -0
  7. euroeval/benchmark_modules/litellm.py +905 -0
  8. euroeval/benchmark_modules/vllm.py +1171 -0
  9. euroeval/benchmarker.py +1074 -0
  10. euroeval/callbacks.py +72 -0
  11. euroeval/cli.py +281 -0
  12. euroeval/constants.py +50 -0
  13. euroeval/data_loading.py +96 -0
  14. euroeval/data_models.py +474 -0
  15. euroeval/dataset_configs.py +2001 -0
  16. euroeval/enums.py +144 -0
  17. euroeval/exceptions.py +191 -0
  18. euroeval/finetuning.py +324 -0
  19. euroeval/generation.py +296 -0
  20. euroeval/human_evaluation.py +737 -0
  21. euroeval/languages.py +200 -0
  22. euroeval/model_cache.py +253 -0
  23. euroeval/model_config.py +77 -0
  24. euroeval/model_loading.py +78 -0
  25. euroeval/scores.py +90 -0
  26. euroeval/speed_benchmark.py +124 -0
  27. euroeval/task_utils/__init__.py +1 -0
  28. euroeval/task_utils/multiple_choice_classification.py +176 -0
  29. euroeval/task_utils/question_answering.py +698 -0
  30. euroeval/task_utils/sequence_classification.py +237 -0
  31. euroeval/task_utils/text_to_text.py +150 -0
  32. euroeval/task_utils/token_classification.py +464 -0
  33. euroeval/tasks.py +202 -0
  34. euroeval/types.py +97 -0
  35. euroeval/utils.py +574 -0
  36. euroeval-15.2.0.dist-info/METADATA +234 -0
  37. euroeval-15.2.0.dist-info/RECORD +40 -0
  38. euroeval-15.2.0.dist-info/WHEEL +4 -0
  39. euroeval-15.2.0.dist-info/entry_points.txt +4 -0
  40. euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0
euroeval/__init__.py ADDED
@@ -0,0 +1,72 @@
+ """EuroEval - A benchmarking framework for language models."""
+
+ ### STAGE 1 ###
+ ### Block unwanted terminal output that happens on importing external modules ###
+
+ import logging
+ import sys
+ import warnings
+
+ from termcolor import colored
+
+ # Block specific warnings before importing anything else, as they can be noisy
+ warnings.filterwarnings("ignore", category=UserWarning)
+ logging.getLogger("httpx").setLevel(logging.CRITICAL)
+ logging.getLogger("datasets").setLevel(logging.CRITICAL)
+ logging.getLogger("vllm").setLevel(logging.CRITICAL)
+
+ # Set up logging
+ fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
+ logging.basicConfig(
+     level=logging.CRITICAL if hasattr(sys, "_called_from_test") else logging.INFO,
+     format=fmt,
+     datefmt="%Y-%m-%d %H:%M:%S",
+ )
+
+
+ ### STAGE 2 ###
+ ### Set the rest up ###
+
+ import importlib.metadata  # noqa: E402
+ import os  # noqa: E402
+
+ from dotenv import load_dotenv  # noqa: E402
+
+ from .benchmarker import Benchmarker  # noqa: E402
+ from .utils import block_terminal_output  # noqa: E402
+
+ # Block unwanted terminal outputs. This blocks way more than the above, but since it
+ # relies on importing from the `utils` module, external modules are already imported
+ # before this is run, necessitating the above block as well
+ block_terminal_output()
+
+
+ # Fetches the version of the package as defined in pyproject.toml
+ __version__ = importlib.metadata.version("euroeval")
+
+
+ # Loads environment variables
+ load_dotenv()
+
+
+ # Disable parallelisation when tokenizing, as that can lead to errors
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+ # Enable MPS fallback
+ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+
+
+ # Set amount of threads per GPU - this is the default and is only set to prevent a
+ # warning from showing
+ os.environ["OMP_NUM_THREADS"] = "1"
+
+
+ # Disable a warning from Ray regarding the detection of the number of CPUs
+ os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"
+
+
+ # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
+ # former and LiteLLM uses the latter
+ if os.getenv("HUGGINGFACE_API_KEY"):
+     os.environ["HF_TOKEN"] = os.environ["HUGGINGFACE_API_KEY"]
euroeval/benchmark_config_factory.py ADDED
@@ -0,0 +1,358 @@
+ """Factory class for creating dataset configurations."""
+
+ import importlib.util
+ import logging
+ import sys
+ import typing as t
+
+ import torch
+
+ from .data_models import BenchmarkConfig
+ from .dataset_configs import get_all_dataset_configs
+ from .enums import Device
+ from .exceptions import InvalidBenchmark
+ from .languages import get_all_languages
+ from .tasks import get_all_tasks
+ from .utils import log_once
+
+ if t.TYPE_CHECKING:
+     from .data_models import Language, Task
+
+
+ logger = logging.getLogger("euroeval")
+
+
+ def build_benchmark_config(
+     progress_bar: bool,
+     save_results: bool,
+     task: str | list[str] | None,
+     dataset: str | list[str] | None,
+     language: str | list[str],
+     model_language: str | list[str] | None,
+     dataset_language: str | list[str] | None,
+     device: Device | None,
+     batch_size: int,
+     raise_errors: bool,
+     cache_dir: str,
+     api_key: str | None,
+     force: bool,
+     verbose: bool,
+     trust_remote_code: bool,
+     use_flash_attention: bool | None,
+     clear_model_cache: bool,
+     evaluate_test_split: bool,
+     few_shot: bool,
+     num_iterations: int,
+     api_base: str | None,
+     api_version: str | None,
+     debug: bool,
+     run_with_cli: bool,
+     only_allow_safetensors: bool,
+     first_time: bool = False,
+ ) -> BenchmarkConfig:
+     """Create a benchmark configuration.
+
+     Args:
+         progress_bar:
+             Whether to show a progress bar when running the benchmark.
+         save_results:
+             Whether to save the benchmark results to a file.
+         task:
+             The tasks to include for dataset. If None then datasets will not be
+             filtered based on their task.
+         dataset:
+             The datasets to include for task. If None then all datasets will be
+             included, limited by the `task` parameter.
+         language:
+             The language codes of the languages to include, both for models and
+             datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
+             to 'all' if all languages should be considered.
+         model_language:
+             The language codes of the languages to include for models. If None then
+             the `language` parameter will be used.
+         dataset_language:
+             The language codes of the languages to include for datasets. If None then
+             the `language` parameter will be used.
+         device:
+             The device to use for running the models. If None then the device will be
+             set automatically.
+         batch_size:
+             The batch size to use for running the models.
+         raise_errors:
+             Whether to raise errors when running the benchmark.
+         cache_dir:
+             The directory to use for caching the models.
+         api_key:
+             The API key to use for a given inference server.
+         force:
+             Whether to force the benchmark to run even if the results are already
+             cached.
+         verbose:
+             Whether to print verbose output when running the benchmark. This is
+             automatically set if `debug` is True.
+         trust_remote_code:
+             Whether to trust remote code when running the benchmark.
+         use_flash_attention:
+             Whether to use Flash Attention for the models. If None then it will be used
+             if it is available.
+         clear_model_cache:
+             Whether to clear the model cache before running the benchmark.
+         evaluate_test_split:
+             Whether to use the test split for the datasets.
+         few_shot:
+             Whether to use few-shot learning for the models.
+         num_iterations:
+             The number of iterations each model should be evaluated for.
+         api_base:
+             The base URL for a given inference API. Only relevant if `model` refers to a
+             model on an inference API.
+         api_version:
+             The version of the API to use for a given inference API.
+         debug:
+             Whether to run the benchmark in debug mode.
+         run_with_cli:
+             Whether the benchmark is being run with the CLI.
+         only_allow_safetensors:
+             Whether to only allow evaluations of models stored as safetensors.
+         first_time:
+             Whether this is the first time the benchmark configuration is being created.
+             Defaults to False.
+
+     Returns:
+         The benchmark configuration.
+     """
+     language_codes = get_correct_language_codes(language_codes=language)
+     model_languages = prepare_languages(
+         language_codes=model_language, default_language_codes=language_codes
+     )
+     dataset_languages = prepare_languages(
+         language_codes=dataset_language, default_language_codes=language_codes
+     )
+
+     tasks, datasets = prepare_tasks_and_datasets(
+         task=task, dataset=dataset, dataset_languages=dataset_languages
+     )
+
+     torch_device = prepare_device(device=device)
+
+     if use_flash_attention is None:
+         if torch_device.type != "cuda":
+             use_flash_attention = False
+         elif (
+             importlib.util.find_spec("flash_attn") is None
+             and importlib.util.find_spec("vllm_flash_attn") is None
+         ):
+             use_flash_attention = False
+             if first_time and torch_device.type == "cuda":
+                 message = (
+                     "Flash attention has not been installed, so this will not be used. "
+                     "To install it, run `pip install -U wheel && "
+                     "FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn "
+                     "--no-build-isolation`. Alternatively, you can disable this "
+                     "message by setting "
+                 )
+                 if run_with_cli:
+                     message += "the flag `--no-use-flash-attention`."
+                 else:
+                     message += (
+                         "the argument `use_flash_attention=False` in the `Benchmarker`."
+                     )
+                 log_once(message=message, level=logging.INFO)
+
+     # Set variable with number of iterations
+     if hasattr(sys, "_called_from_test"):
+         num_iterations = 1
+
+     return BenchmarkConfig(
+         model_languages=model_languages,
+         dataset_languages=dataset_languages,
+         tasks=tasks,
+         datasets=datasets,
+         batch_size=batch_size,
+         raise_errors=raise_errors,
+         cache_dir=cache_dir,
+         api_key=api_key,
+         force=force,
+         progress_bar=progress_bar,
+         save_results=save_results,
+         verbose=verbose or debug,
+         device=torch_device,
+         trust_remote_code=trust_remote_code,
+         use_flash_attention=use_flash_attention,
+         clear_model_cache=clear_model_cache,
+         evaluate_test_split=evaluate_test_split,
+         few_shot=few_shot,
+         num_iterations=num_iterations,
+         api_base=api_base,
+         api_version=api_version,
+         debug=debug,
+         run_with_cli=run_with_cli,
+         only_allow_safetensors=only_allow_safetensors,
+     )
+
+
+ def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
+     """Get correct language code(s).
+
+     Args:
+         language_codes:
+             The language codes of the languages to include, both for models and
+             datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
+             to 'all' if all languages should be considered.
+
+     Returns:
+         The correct language codes.
+     """
+     # Create a dictionary that maps languages to their associated language objects
+     language_mapping = get_all_languages()
+
+     # Create the list `languages`
+     if "all" in language_codes:
+         languages = list(language_mapping.keys())
+     elif isinstance(language_codes, str):
+         languages = [language_codes]
+     else:
+         languages = language_codes
+
+     # If `languages` contains 'no' then also include 'nb' and 'nn'. Conversely, if
+     # either 'nb' or 'nn' are specified then also include 'no'.
+     if "no" in languages:
+         languages = list(set(languages) | {"nb", "nn"})
+     elif "nb" in languages or "nn" in languages:
+         languages = list(set(languages) | {"no"})
+
+     return languages
+
+
+ def prepare_languages(
+     language_codes: str | list[str] | None, default_language_codes: list[str]
+ ) -> list["Language"]:
+     """Prepare language(s) for benchmarking.
+
+     Args:
+         language_codes:
+             The language codes of the languages to include for models or datasets.
+             If specified then this overrides the `language` parameter for model or
+             dataset languages.
+         default_language_codes:
+             The default language codes of the languages to include.
+
+     Returns:
+         The prepared model or dataset languages.
+     """
+     # Create a dictionary that maps languages to their associated language objects
+     language_mapping = get_all_languages()
+
+     # Create the list `languages_str` of language codes to use for models or datasets
+     languages_str: list[str]
+     if language_codes is None:
+         languages_str = default_language_codes
+     elif isinstance(language_codes, str):
+         languages_str = [language_codes]
+     else:
+         languages_str = language_codes
+
+     # Convert the model languages to language objects
+     if "all" in languages_str:
+         prepared_languages = list(language_mapping.values())
+     else:
+         prepared_languages = [language_mapping[language] for language in languages_str]
+
+     return prepared_languages
+
+
+ def prepare_tasks_and_datasets(
+     task: str | list[str] | None,
+     dataset_languages: list["Language"],
+     dataset: str | list[str] | None,
+ ) -> tuple[list["Task"], list[str]]:
+     """Prepare task(s) and dataset(s) for benchmarking.
+
+     Args:
+         task:
+             The tasks to include for dataset. If None then datasets will not be
+             filtered based on their task.
+         dataset_languages:
+             The languages of the datasets in the benchmark.
+         dataset:
+             The datasets to include for task. If None then all datasets will be
+             included, limited by the `task` and `dataset_languages` parameters.
+
+     Returns:
+         The prepared tasks and datasets.
+
+     Raises:
+         InvalidBenchmark:
+             If the task or dataset is not found in the benchmark tasks or datasets.
+     """
+     # Create a dictionary that maps benchmark tasks to their associated benchmark
+     # task objects, and a dictionary that maps dataset names to their associated
+     # dataset configuration objects
+     task_mapping = get_all_tasks()
+     all_dataset_configs = get_all_dataset_configs()
+
+     # Create the list of dataset tasks
+     try:
+         if task is None:
+             tasks = list(task_mapping.values())
+         elif isinstance(task, str):
+             tasks = [task_mapping[task]]
+         else:
+             tasks = [task_mapping[t] for t in task]
+     except KeyError as e:
+         raise InvalidBenchmark(f"Task {e} not found in the benchmark tasks.") from e
+
+     all_official_datasets = [
+         dataset_name
+         for dataset_name, dataset_config in all_dataset_configs.items()
+         if not dataset_config.unofficial
+     ]
+     if dataset is None:
+         dataset = all_official_datasets
+     elif isinstance(dataset, str):
+         dataset = [dataset]
+
+     all_datasets = list(all_dataset_configs.keys())
+     invalid_datasets = set(dataset) - set(all_datasets)
+     if invalid_datasets:
+         raise InvalidBenchmark(
+             f"Dataset(s) {', '.join(invalid_datasets)} not found in the benchmark "
+             "datasets."
+         )
+
+     datasets = [
+         dataset_name
+         for dataset_name, dataset_config in all_dataset_configs.items()
+         if dataset_name in dataset
+         and dataset_config.task in tasks
+         and set(dataset_config.languages).intersection(dataset_languages)
+     ]
+
+     return tasks, datasets
+
+
+ def prepare_device(device: Device | None) -> torch.device:
+     """Prepare device for benchmarking.
+
+     Args:
+         device:
+             The device to use for running the models. If None then the device will be
+             set automatically.
+
+     Returns:
+         The prepared device.
+     """
+     device_mapping = {
+         Device.CPU: torch.device("cpu"),
+         Device.CUDA: torch.device("cuda"),
+         Device.MPS: torch.device("mps"),
+     }
+     if isinstance(device, Device):
+         return device_mapping[device]
+
+     if torch.cuda.is_available():
+         return torch.device("cuda")
+     elif torch.backends.mps.is_available():
+         return torch.device("mps")
+     else:
+         return torch.device("cpu")
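
Editor's note: the subtlest behaviour in this factory is the language handling, where 'no' expands to cover both Bokmål ('nb') and Nynorsk ('nn'), 'all' short-circuits to every registered language, and device selection falls back from CUDA to MPS to CPU. Below is a small sketch of how the helpers above compose; the concrete return values depend on the registries in euroeval/languages.py and euroeval/dataset_configs.py, which are not expanded in this diff.

# Sketch using only the functions defined in the file above.
from euroeval.benchmark_config_factory import (
    get_correct_language_codes,
    prepare_device,
    prepare_languages,
)

# 'no' is expanded to also include 'nb' and 'nn' (order is not guaranteed, since sets are used)
codes = get_correct_language_codes(language_codes=["no", "da"])

# With no explicit model languages, the default codes are converted to Language objects
model_languages = prepare_languages(language_codes=None, default_language_codes=codes)

# With device=None, resolution falls back to CUDA, then MPS, then CPU
device = prepare_device(device=None)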
euroeval/benchmark_modules/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """The different types of modules that can be benchmarked."""
+
+ from .base import BenchmarkModule
+ from .fresh import FreshEncoderModel
+ from .hf import HuggingFaceEncoderModel
+ from .litellm import LiteLLMModel
+ from .vllm import VLLMModel
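
Editor's note: these re-exports let downstream code address every benchmark backend through a single namespace. A one-line import sketch follows; the constructor signatures live in base.py, fresh.py, hf.py, litellm.py and vllm.py, which are listed further up in this diff but not expanded here.

# Import sketch based solely on the re-exports above.
from euroeval.benchmark_modules import (
    BenchmarkModule,
    FreshEncoderModel,
    HuggingFaceEncoderModel,
    LiteLLMModel,
    VLLMModel,
)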