EuroEval 15.2.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (40)
  1. euroeval/__init__.py +72 -0
  2. euroeval/benchmark_config_factory.py +358 -0
  3. euroeval/benchmark_modules/__init__.py +7 -0
  4. euroeval/benchmark_modules/base.py +354 -0
  5. euroeval/benchmark_modules/fresh.py +286 -0
  6. euroeval/benchmark_modules/hf.py +1185 -0
  7. euroeval/benchmark_modules/litellm.py +905 -0
  8. euroeval/benchmark_modules/vllm.py +1171 -0
  9. euroeval/benchmarker.py +1074 -0
  10. euroeval/callbacks.py +72 -0
  11. euroeval/cli.py +281 -0
  12. euroeval/constants.py +50 -0
  13. euroeval/data_loading.py +96 -0
  14. euroeval/data_models.py +474 -0
  15. euroeval/dataset_configs.py +2001 -0
  16. euroeval/enums.py +144 -0
  17. euroeval/exceptions.py +191 -0
  18. euroeval/finetuning.py +324 -0
  19. euroeval/generation.py +296 -0
  20. euroeval/human_evaluation.py +737 -0
  21. euroeval/languages.py +200 -0
  22. euroeval/model_cache.py +253 -0
  23. euroeval/model_config.py +77 -0
  24. euroeval/model_loading.py +78 -0
  25. euroeval/scores.py +90 -0
  26. euroeval/speed_benchmark.py +124 -0
  27. euroeval/task_utils/__init__.py +1 -0
  28. euroeval/task_utils/multiple_choice_classification.py +176 -0
  29. euroeval/task_utils/question_answering.py +698 -0
  30. euroeval/task_utils/sequence_classification.py +237 -0
  31. euroeval/task_utils/text_to_text.py +150 -0
  32. euroeval/task_utils/token_classification.py +464 -0
  33. euroeval/tasks.py +202 -0
  34. euroeval/types.py +97 -0
  35. euroeval/utils.py +574 -0
  36. euroeval-15.2.0.dist-info/METADATA +234 -0
  37. euroeval-15.2.0.dist-info/RECORD +40 -0
  38. euroeval-15.2.0.dist-info/WHEEL +4 -0
  39. euroeval-15.2.0.dist-info/entry_points.txt +4 -0
  40. euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0
euroeval/enums.py ADDED
@@ -0,0 +1,144 @@
+"""Enums used in the project."""
+
+from enum import Enum, auto
+
+
+class AutoStrEnum(str, Enum):
+    """StrEnum where auto() returns the field name in lower case."""
+
+    @staticmethod
+    def _generate_next_value_(
+        name: str, start: int, count: int, last_values: list
+    ) -> str:
+        return name.lower()
+
+
+class Device(AutoStrEnum):
+    """The compute device to use for the evaluation.
+
+    Attributes:
+        CPU:
+            CPU device.
+        MPS:
+            MPS GPU, used in M-series MacBooks.
+        CUDA:
+            CUDA GPU, used with NVIDIA GPUs.
+    """
+
+    CPU = auto()
+    MPS = auto()
+    CUDA = auto()
+
+
+class InferenceBackend(AutoStrEnum):
+    """The backend used for model inference.
+
+    Attributes:
+        TRANSFORMERS:
+            Hugging Face `transformers` library.
+        VLLM:
+            VLLM library.
+        LITELLM:
+            LiteLLM library.
+        NONE:
+            No inference backend used (e.g., for human evaluation).
+    """
+
+    TRANSFORMERS = auto()
+    VLLM = auto()
+    LITELLM = auto()
+    NONE = auto()
+
+
+class ModelType(AutoStrEnum):
+    """The type of a model.
+
+    Attributes:
+        ENCODER:
+            An encoder (i.e., BERT-style) model.
+        GENERATIVE:
+            A generative model. Can be either decoder or encoder-decoder (aka seq2seq).
+        HUMAN:
+            Human evaluator.
+    """
+
+    ENCODER = auto()
+    GENERATIVE = auto()
+    HUMAN = auto()
+
+
+class GenerativeType(AutoStrEnum):
+    """The type of a generative model.
+
+    Attributes:
+        BASE:
+            A base (i.e., pretrained) generative model.
+        INSTRUCTION_TUNED:
+            An instruction-tuned generative model.
+        REASONING:
+            A generative reasoning model.
+    """
+
+    BASE = auto()
+    INSTRUCTION_TUNED = auto()
+    REASONING = auto()
+
+
+class DataType(AutoStrEnum):
+    """The data type of the model weights.
+
+    Attributes:
+        FP32:
+            32-bit floating point.
+        FP16:
+            16-bit floating point.
+        BF16:
+            16-bit bfloat.
+    """
+
+    FP32 = auto()
+    FP16 = auto()
+    BF16 = auto()
+
+
+class BatchingPreference(AutoStrEnum):
+    """The preference for batching.
+
+    Attributes:
+        NO_PREFERENCE:
+            No preference for batching.
+        SINGLE_SAMPLE:
+            Single sample batching.
+        ALL_AT_ONCE:
+            All samples at once batching.
+    """
+
+    NO_PREFERENCE = auto()
+    SINGLE_SAMPLE = auto()
+    ALL_AT_ONCE = auto()
+
+
+class TaskGroup(AutoStrEnum):
+    """The overall task group of a task.
+
+    Attributes:
+        SEQUENCE_CLASSIFICATION:
+            Classification of documents.
+        MULTIPLE_CHOICE_CLASSIFICATION:
+            Classification of documents with multiple-choice options.
+        TOKEN_CLASSIFICATION:
+            Token-level classification.
+        QUESTION_ANSWERING:
+            Extractive question answering.
+        TEXT_TO_TEXT:
+            Text-to-text generation.
+        SPEED:
+            Speed benchmark.
+    """
+
+    SEQUENCE_CLASSIFICATION = auto()
+    MULTIPLE_CHOICE_CLASSIFICATION = auto()
+    TOKEN_CLASSIFICATION = auto()
+    QUESTION_ANSWERING = auto()
+    TEXT_TO_TEXT = auto()
+    SPEED = auto()
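The `AutoStrEnum` base class overrides `_generate_next_value_`, so every `auto()` member takes its lower-cased field name as its string value, and because the class also mixes in `str`, members compare equal to plain strings. A minimal sketch of that behaviour (illustrative only, not part of the package):

    from euroeval.enums import Device, TaskGroup

    # auto() resolves to the lower-cased member name
    assert Device.CUDA.value == "cuda"
    assert TaskGroup.QUESTION_ANSWERING.value == "question_answering"

    # The str mixin lets members stand in for plain strings
    assert Device.MPS == "mps"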
euroeval/exceptions.py ADDED
@@ -0,0 +1,191 @@
+"""Exceptions to be used by other functions."""
+
+
+class InvalidBenchmark(Exception):
+    """The (model, dataset) combination cannot be benchmarked."""
+
+    def __init__(
+        self, message: str = "This model cannot be benchmarked on the given dataset."
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message:
+                The message to display.
+        """
+        self.message = message
+        super().__init__(self.message)
+
+
+class InvalidModel(Exception):
+    """The model cannot be benchmarked on any datasets."""
+
+    def __init__(
+        self, message: str = "The model cannot be benchmarked on any datasets."
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message:
+                The message to display.
+        """
+        self.message = message
+        super().__init__(self.message)
+
+
+class HuggingFaceHubDown(Exception):
+    """The Hugging Face Hub seems to be down."""
+
+    def __init__(
+        self, message: str = "The Hugging Face Hub is currently down."
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message:
+                The message to display.
+        """
+        self.message = message
+        super().__init__(self.message)
+
+
+class NoInternetConnection(Exception):
+    """There seems to be no internet connection."""
+
+    def __init__(
+        self, message: str = "There is currently no internet connection."
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message:
+                The message to display.
+        """
+        self.message = message
+        super().__init__(self.message)
+
+
+class NaNValueInModelOutput(Exception):
+    """There is a NaN value in the model output."""
+
+    def __init__(
+        self, message: str = "There is a NaN value in the model output."
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message:
+                The message to display.
+        """
+        self.message = message
+        super().__init__(self.message)
+
+
+class FlashAttentionNotInstalled(Exception):
+    """The `flash-attn` package has not been installed."""
+
+    def __init__(
+        self,
+        message: str = (
+            "The model you are trying to load requires Flash Attention. To use Flash "
+            "Attention, please install the `flash-attn` package, which can be done by "
+            "running `pip install -U wheel && FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE "
+            "pip install flash-attn --no-build-isolation`."
+        ),
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            message:
+                The message to display.
+        """
+        self.message = message
+        super().__init__(self.message)
+
+
+class NeedsExtraInstalled(InvalidModel):
+    """The evaluation requires extra to be installed."""
+
+    def __init__(self, extra: str) -> None:
+        """Initialize the exception.
+
+        Args:
+            extra:
+                The extra that needs to be installed.
+        """
+        self.extra = extra
+        self.message = (
+            f"The model you are trying to load requires the `{extra}` extra to be "
+            f"installed. To install the `{extra}` extra, please run `pip install "
+            f"euroeval[{extra}]` or `pip install euroeval[all]`."
+        )
+        super().__init__(self.message)
+
+
+class NeedsManualDependency(InvalidModel):
+    """The evaluation requires a dependency to be manually installed."""
+
+    def __init__(self, package: str) -> None:
+        """Initialize the exception.
+
+        Args:
+            package:
+                The package that needs to be manually installed.
+        """
+        self.package = package
+        self.message = (
+            f"The model you are trying to load requires the `{package}` package to be "
+            f"installed - please run `pip install {package}` and try again."
+        )
+        super().__init__(self.message)
+
+
+class NeedsAdditionalArgument(InvalidModel):
+    """The evaluation requires additional arguments to the `euroeval` command."""
+
+    def __init__(
+        self, cli_argument: str, script_argument: str, run_with_cli: bool
+    ) -> None:
+        """Initialize the exception.
+
+        Args:
+            cli_argument:
+                The argument that needs to be passed to the `euroeval` command.
+            script_argument:
+                The argument that needs to be passed to the `Benchmarker` class.
+            run_with_cli:
+                Whether the benchmark is being run with the CLI.
+        """
+        self.cli_argument = cli_argument
+        self.script_argument = script_argument
+        if run_with_cli:
+            self.message = (
+                f"The model you are trying to load requires the `{cli_argument}` "
+                "argument to be passed to the `euroeval` command. Please pass the "
+                "argument and try again."
+            )
+        else:
+            self.message = (
+                f"The model you are trying to load requires the `{script_argument}` "
+                "argument to be passed to the `Benchmarker` class. Please pass the "
+                "argument and try again."
+            )
+        super().__init__(self.message)
+
+
+class NeedsEnvironmentVariable(InvalidModel):
+    """The evaluation requires an environment variable to be set."""
+
+    def __init__(self, env_var: str) -> None:
+        """Initialize the exception.
+
+        Args:
+            env_var:
+                The environment variable that needs to be set.
+        """
+        self.env_var = env_var
+        self.message = (
+            f"The model you are trying to load requires the `{env_var}` environment "
+            "variable to be set. Please set the environment variable and try again."
+        )
+        super().__init__(self.message)
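The four `Needs*` exceptions all subclass `InvalidModel`, so callers only need to distinguish two broad failure modes: `InvalidModel` (the model cannot be evaluated at all) and `InvalidBenchmark` (only this particular model/dataset pairing fails). A minimal sketch of that pattern; the `run_evaluation` stub below is hypothetical and the extra name is purely illustrative:

    from euroeval.exceptions import InvalidBenchmark, InvalidModel, NeedsExtraInstalled

    def run_evaluation(model_id: str, dataset: str) -> None:
        # Hypothetical stand-in for a real benchmarking call; here it just
        # simulates a model that needs an optional extra installed.
        raise NeedsExtraInstalled(extra="generative")

    try:
        run_evaluation("some-model", "some-dataset")
    except InvalidModel as e:
        # NeedsExtraInstalled, NeedsManualDependency, NeedsAdditionalArgument and
        # NeedsEnvironmentVariable all land here, since they subclass InvalidModel.
        print(e.message)
    except InvalidBenchmark as e:
        print(e.message)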
euroeval/finetuning.py ADDED
@@ -0,0 +1,324 @@
+"""Functions related to the finetuning of models."""
+
+import logging
+import sys
+import typing as t
+
+import torch
+from datasets import DatasetDict
+from tqdm.auto import tqdm
+from transformers import (
+    EarlyStoppingCallback,
+    IntervalStrategy,
+    PrinterCallback,
+    ProgressCallback,
+    TrainingArguments,
+)
+from transformers.trainer import OptimizerNames
+
+from .benchmark_modules import BenchmarkModule
+from .callbacks import NeverLeaveProgressCallback
+from .enums import DataType
+from .exceptions import InvalidBenchmark, NaNValueInModelOutput
+from .model_loading import load_model
+from .utils import (
+    block_terminal_output,
+    clear_memory,
+    enforce_reproducibility,
+    log_once,
+)
+
+if t.TYPE_CHECKING:
+    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
+
+logger = logging.getLogger("euroeval")
+
+
+def finetune(
+    model: BenchmarkModule,
+    datasets: list[DatasetDict],
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+) -> list[dict[str, float]]:
+    """Evaluate a model on a dataset through finetuning.
+
+    Args:
+        model:
+            The model to evaluate.
+        datasets:
+            The datasets to use for training and evaluation.
+        model_config:
+            The configuration of the model.
+        dataset_config:
+            The dataset configuration.
+        benchmark_config:
+            The benchmark configuration.
+
+    Returns:
+        A list of dicts containing the scores for each metric for each iteration.
+    """
+    # Set the data type to use for the model weights
+    using_cuda = benchmark_config.device == torch.device("cuda")
+    if using_cuda and torch.cuda.is_bf16_supported():
+        dtype = DataType.BF16
+    elif using_cuda:
+        dtype = DataType.FP16
+    else:
+        dtype = DataType.FP32
+
+    # TEMP
+    dtype = DataType.FP32
+
+    bs: int = benchmark_config.batch_size
+    scores: list[dict[str, float]] = list()
+    for idx in tqdm(
+        iterable=range(benchmark_config.num_iterations),
+        desc="Benchmarking",
+        disable=not benchmark_config.progress_bar,
+    ):
+        # Set variable that tracks whether we need to initialize new models in
+        # the single iteration call
+        model_already_initialized = idx == 0
+
+        # Run a loop here to deal with automatic reduction of batch size
+        while True:
+            # Clear GPU memory
+            if not model_already_initialized:
+                try:
+                    del model
+                except UnboundLocalError:
+                    pass
+                clear_memory()
+
+            try:
+                # Re-block terminal output, as it gets unblocked by the `transformers`
+                # package before training
+                block_terminal_output()
+
+                training_args = get_training_args(
+                    benchmark_config=benchmark_config,
+                    model_config=model_config,
+                    iteration_idx=idx,
+                    dtype=dtype,
+                    batch_size=bs,
+                )
+
+                itr_scores = finetune_single_iteration(
+                    model=model if model_already_initialized else None,
+                    dataset=datasets[idx],
+                    iteration_idx=idx,
+                    training_args=training_args,
+                    model_config=model_config,
+                    dataset_config=dataset_config,
+                    benchmark_config=benchmark_config,
+                )
+
+                scores.append(itr_scores)
+                logger.debug(f"Test scores for iteration {idx}: {itr_scores}")
+
+                break
+
+            # NaN values can appear in the model output when using mixed precision, as
+            # the hidden states get overflowed. In this case we try to disable mixed
+            # precision and try again.
+            except NaNValueInModelOutput:
+                if dtype != DataType.FP32:
+                    dtype = DataType.FP32
+                    model_already_initialized = False
+                    logger.debug(
+                        "NaN value detected in model outputs while using mixed "
+                        "precision. Retrying with full fp32 precision."
+                    )
+                else:
+                    raise InvalidBenchmark(
+                        "NaN value detected in model outputs, even with mixed "
+                        "precision disabled."
+                    )
+
+            except Exception as e:
+                if "CUDA" not in str(e) and "out of memory" not in str(e):
+                    raise InvalidBenchmark(str(e))
+
+                if bs <= 1:
+                    msg = "Could not benchmark the model, even with a batch size of 1!"
+                    if "MPS" in str(e):
+                        msg += (
+                            " As you are using MPS, you can try running the evaluation "
+                            "with the `PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0` "
+                            "environment variable set, as this removes the upper bound "
+                            "on the memory usage."
+                        )
+                    raise InvalidBenchmark(msg)
+
+                model_already_initialized = False
+
+                bs //= 2
+                logger.debug(f"Reduced batch size to {bs}")
+
+    return scores
+
+
+def finetune_single_iteration(
+    model: BenchmarkModule | None,
+    dataset: DatasetDict,
+    iteration_idx: int,
+    training_args: TrainingArguments,
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+) -> dict[str, float]:
+    """Run a single iteration of a benchmark.
+
+    Args:
+        model:
+            The model to use in the benchmark. If None then a new model will be loaded.
+        dataset:
+            The dataset to use for training and evaluation.
+        iteration_idx:
+            The index of the iteration.
+        training_args:
+            The training arguments.
+        model_config:
+            The model configuration.
+        dataset_config:
+            The dataset configuration.
+        benchmark_config:
+            The benchmark configuration.
+
+    Returns:
+        The scores for the test dataset.
+    """
+    # Set random seeds to enforce reproducibility of the randomly initialised weights
+    enforce_reproducibility(seed=training_args.seed)
+
+    if model is None:
+        model = load_model(
+            model_config=model_config,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
+        )
+
+    trainer = model.trainer_class(
+        model=model.get_pytorch_module(),
+        processing_class=model.get_tokenizer(),
+        args=training_args,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["val"],
+        compute_metrics=model.compute_metrics,
+        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
+        data_collator=model.data_collator,
+    )
+
+    if not benchmark_config.verbose:
+
+        def no_logging(logs: dict[str, float]) -> None:
+            return
+
+        trainer.log = no_logging
+
+    # Re-block terminal output, as it gets unblocked by the `transformers`
+    # package before training
+    block_terminal_output()
+
+    # Sort out callbacks. We remove the callbacks that are producing unnecessary
+    # output, to avoid cluttering the terminal output
+    if not benchmark_config.verbose:
+        trainer.remove_callback(PrinterCallback)
+        trainer.remove_callback(ProgressCallback)
+    if benchmark_config.progress_bar:
+        trainer.add_callback(NeverLeaveProgressCallback)
+
+    try:
+        trainer.train()
+        with torch.inference_mode():
+            try:
+                test_scores = trainer.evaluate(
+                    eval_dataset=dataset["test"],
+                    orig_eval_dataset=dataset["original_test"],
+                    metric_key_prefix="test",
+                )
+            except TypeError:
+                test_scores = trainer.evaluate(
+                    eval_dataset=dataset["test"], metric_key_prefix="test"
+                )
+        return test_scores
+
+    except NaNValueInModelOutput as e:
+        del trainer
+        del model
+        clear_memory()
+        raise e
+
+    except (RuntimeError, ValueError, IndexError) as e:
+        raise InvalidBenchmark(str(e))
+
+
+def get_training_args(
+    benchmark_config: "BenchmarkConfig",
+    model_config: "ModelConfig",
+    iteration_idx: int,
+    dtype: DataType,
+    batch_size: int | None = None,
+) -> TrainingArguments:
+    """Get the training arguments for the current iteration.
+
+    Args:
+        benchmark_config:
+            The benchmark configuration.
+        model_config:
+            The model configuration.
+        iteration_idx:
+            The index of the current iteration. This is only used to generate a
+            unique random seed for the current iteration.
+        dtype:
+            The data type to use for the model weights.
+        batch_size:
+            The batch size to use for the current iteration, or None if the batch size
+            in the benchmark config should be used.
+
+    Returns:
+        The training arguments for the current iteration.
+    """
+    log_once(message=f"Using {dtype} data type.", level=logging.DEBUG)
+
+    if benchmark_config.verbose:
+        logging_strategy = IntervalStrategy.STEPS
+    else:
+        logging_strategy = IntervalStrategy.NO
+
+    if batch_size is None:
+        batch_size = benchmark_config.batch_size
+
+    training_args = TrainingArguments(
+        output_dir=model_config.model_cache_dir,
+        evaluation_strategy=IntervalStrategy.STEPS,
+        logging_strategy=logging_strategy,
+        save_strategy=IntervalStrategy.STEPS,
+        eval_steps=30,
+        logging_steps=30,
+        save_steps=30,
+        max_steps=1 if hasattr(sys, "_called_from_test") else 10_000,
+        use_cpu=benchmark_config.device == torch.device("cpu"),
+        report_to=[],
+        save_total_limit=1,
+        per_device_train_batch_size=batch_size,
+        per_device_eval_batch_size=batch_size,
+        learning_rate=2e-5,
+        warmup_ratio=0.01,
+        gradient_accumulation_steps=32 // batch_size,
+        load_best_model_at_end=True,
+        optim=OptimizerNames.ADAMW_TORCH,
+        seed=4242 + iteration_idx,
+        fp16=dtype == DataType.FP16,
+        bf16=dtype == DataType.BF16,
+        disable_tqdm=not benchmark_config.progress_bar,
+        ddp_find_unused_parameters=False,
+        save_safetensors=False,
+    )
+
+    # TEMP: Use only 1 GPU for now for finetuning
+    if benchmark_config.device == torch.device("cuda"):
+        training_args._n_gpu = 1
+
+    return training_args
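Two details of the retry logic are easy to miss. On a CUDA or MPS out-of-memory error the per-device batch size is halved and the iteration retried, and because `get_training_args` sets `gradient_accumulation_steps=32 // batch_size`, the effective batch size stays at 32 for the power-of-two batch sizes the halving produces. A `NaNValueInModelOutput` raised under mixed precision would trigger one retry in full fp32 (although the `# TEMP` override near the top of `finetune` currently forces fp32 from the start). A minimal sketch of the batch-size arithmetic, independent of the package:

    # Halving the per-device batch size while scaling gradient accumulation keeps
    # the effective batch size constant at 32.
    batch_size = 32
    while batch_size >= 1:
        grad_accum = 32 // batch_size
        assert batch_size * grad_accum == 32
        print(f"per-device batch size {batch_size:>2} -> accumulation steps {grad_accum:>2}")
        batch_size //= 2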