euroeval-15.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (40)
  1. euroeval/__init__.py +72 -0
  2. euroeval/benchmark_config_factory.py +358 -0
  3. euroeval/benchmark_modules/__init__.py +7 -0
  4. euroeval/benchmark_modules/base.py +354 -0
  5. euroeval/benchmark_modules/fresh.py +286 -0
  6. euroeval/benchmark_modules/hf.py +1185 -0
  7. euroeval/benchmark_modules/litellm.py +905 -0
  8. euroeval/benchmark_modules/vllm.py +1171 -0
  9. euroeval/benchmarker.py +1074 -0
  10. euroeval/callbacks.py +72 -0
  11. euroeval/cli.py +281 -0
  12. euroeval/constants.py +50 -0
  13. euroeval/data_loading.py +96 -0
  14. euroeval/data_models.py +474 -0
  15. euroeval/dataset_configs.py +2001 -0
  16. euroeval/enums.py +144 -0
  17. euroeval/exceptions.py +191 -0
  18. euroeval/finetuning.py +324 -0
  19. euroeval/generation.py +296 -0
  20. euroeval/human_evaluation.py +737 -0
  21. euroeval/languages.py +200 -0
  22. euroeval/model_cache.py +253 -0
  23. euroeval/model_config.py +77 -0
  24. euroeval/model_loading.py +78 -0
  25. euroeval/scores.py +90 -0
  26. euroeval/speed_benchmark.py +124 -0
  27. euroeval/task_utils/__init__.py +1 -0
  28. euroeval/task_utils/multiple_choice_classification.py +176 -0
  29. euroeval/task_utils/question_answering.py +698 -0
  30. euroeval/task_utils/sequence_classification.py +237 -0
  31. euroeval/task_utils/text_to_text.py +150 -0
  32. euroeval/task_utils/token_classification.py +464 -0
  33. euroeval/tasks.py +202 -0
  34. euroeval/types.py +97 -0
  35. euroeval/utils.py +574 -0
  36. euroeval-15.2.0.dist-info/METADATA +234 -0
  37. euroeval-15.2.0.dist-info/RECORD +40 -0
  38. euroeval-15.2.0.dist-info/WHEEL +4 -0
  39. euroeval-15.2.0.dist-info/entry_points.txt +4 -0
  40. euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0
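
Taken together, the listing shows the package layout: euroeval/benchmarker.py drives an evaluation run, euroeval/benchmark_modules/ provides one backend per model type (fresh, Hugging Face, LiteLLM, vLLM), and euroeval/dataset_configs.py and euroeval/tasks.py define what gets evaluated. As a purely hypothetical orientation sketch (the real entry points live in euroeval/__init__.py and euroeval/cli.py and are not reproduced in this diff), programmatic use would look roughly like this:

    # Hypothetical sketch: assumes `Benchmarker` is re-exported from the package root
    # and that a run can be started by calling the instance with a model ID and a
    # dataset name; check the package's own README/METADATA for the actual API.
    from euroeval import Benchmarker

    benchmarker = Benchmarker()
    benchmarker(model="FacebookAI/xlm-roberta-base", dataset="angry-tweets")
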
euroeval/benchmark_modules/base.py
@@ -0,0 +1,354 @@
+ """Abstract benchmark module class that the model classes inherit from."""
+
+ import collections.abc as c
+ import logging
+ import sys
+ import typing as t
+ from abc import ABC, abstractmethod
+ from functools import cached_property, partial
+
+ from datasets import DatasetDict
+ from torch import nn
+ from tqdm.auto import tqdm
+ from transformers import PreTrainedTokenizer, Trainer
+
+ from ..data_models import (
+     BenchmarkConfig,
+     DatasetConfig,
+     GenerativeModelOutput,
+     ModelConfig,
+     Task,
+ )
+ from ..enums import BatchingPreference, GenerativeType, TaskGroup
+ from ..exceptions import NeedsEnvironmentVariable, NeedsExtraInstalled
+ from ..task_utils import (
+     question_answering,
+     sequence_classification,
+     text_to_text,
+     token_classification,
+ )
+ from ..types import ComputeMetricsFunction, ExtractLabelsFunction
+ from ..utils import log_once
+
+ logger = logging.getLogger("euroeval")
+
+
+ class BenchmarkModule(ABC):
+     """Abstract class for a benchmark module.
+
+     Attributes:
+         model_config:
+             The model configuration.
+         dataset_config:
+             The dataset configuration.
+         benchmark_config:
+             The benchmark configuration.
+         buffer:
+             A buffer to store temporary data.
+     """
+
+     fresh_model: bool
+     batching_preference: BatchingPreference
+     high_priority: bool
+
+     def __init__(
+         self,
+         model_config: ModelConfig,
+         dataset_config: DatasetConfig,
+         benchmark_config: BenchmarkConfig,
+     ) -> None:
+         """Initialise the benchmark module.
+
+         Args:
+             model_config:
+                 The model configuration.
+             dataset_config:
+                 The dataset configuration.
+             benchmark_config:
+                 The benchmark configuration.
+         """
+         self.model_config = model_config
+         self.dataset_config = dataset_config
+         self.benchmark_config = benchmark_config
+         self.buffer: dict[str, t.Any] = dict()
+         self._log_metadata()
+
+     def _log_metadata(self) -> None:
+         """Log the metadata of the model."""
+         # Set logging level based on verbosity
+         if hasattr(sys, "_called_from_test"):
+             logging_level = logging.CRITICAL
+         elif self.benchmark_config.verbose:
+             logging_level = logging.DEBUG
+         else:
+             logging_level = logging.INFO
+         logger.setLevel(logging_level)
+
+         logging_msg: str = ""
+         if self.num_params < 0:
+             logging_msg += "The model has an unknown number of parameters, "
+         else:
+             logging_msg += f"The model has {self.num_params:,} parameters, "
+         if self.vocab_size < 0:
+             logging_msg += "an unknown vocabulary size, "
+         else:
+             logging_msg += f"a vocabulary size of {self.vocab_size:,}, "
+         if self.model_max_length < 0:
+             logging_msg += "and an unknown maximum sequence length."
+         else:
+             logging_msg += f"and a maximum context length of {self.model_max_length:,}."
+         log_once(message=logging_msg, level=logging.INFO)
+
+     def get_pytorch_module(self) -> "nn.Module":
+         """Get the underlying PyTorch module.
+
+         Returns:
+             The PyTorch module.
+         """
+         if hasattr(self, "_model"):
+             return self._model
+         raise NotImplementedError(
+             "The `get_pytorch_module` method has not been implemented for "
+             f"{self.__class__.__name__}."
+         )
+
+     def get_tokenizer(self) -> "PreTrainedTokenizer":
+         """Get the underlying tokenizer.
+
+         Returns:
+             The tokenizer.
+         """
+         if hasattr(self, "_tokenizer"):
+             return self._tokenizer
+         raise NotImplementedError(
+             "The `get_tokenizer` method has not been implemented for "
+             f"{self.__class__.__name__}."
+         )
+
+     @cached_property
+     @abstractmethod
+     def num_params(self) -> int:
+         """The number of parameters in the model.
+
+         Returns:
+             The number of parameters in the model.
+         """
+         ...
+
+     @property
+     @abstractmethod
+     def generative_type(self) -> GenerativeType | None:
+         """Get the generative type of the model.
+
+         Returns:
+             The generative type of the model, or None if the model is not generative.
+         """
+         ...
+
+     @cached_property
+     @abstractmethod
+     def vocab_size(self) -> int:
+         """The vocabulary size of the model.
+
+         Returns:
+             The vocabulary size of the model.
+         """
+         ...
+
+     @cached_property
+     @abstractmethod
+     def model_max_length(self) -> int:
+         """The maximum length of the model.
+
+         Returns:
+             The maximum length of the model.
+         """
+         ...
+
+     @property
+     @abstractmethod
+     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
+         """The data collator used to prepare samples during finetuning.
+
+         Returns:
+             The data collator.
+         """
+         ...
+
+     @property
+     def compute_metrics(self) -> ComputeMetricsFunction:
+         """The function used to compute the metrics.
+
+         Returns:
+             The function used to compute the metrics.
+         """
+         match self.dataset_config.task.task_group:
+             case TaskGroup.SEQUENCE_CLASSIFICATION:
+                 return partial(
+                     sequence_classification.compute_metrics,
+                     dataset_config=self.dataset_config,
+                     benchmark_config=self.benchmark_config,
+                 )
+             case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+                 return partial(
+                     sequence_classification.compute_metrics,
+                     dataset_config=self.dataset_config,
+                     benchmark_config=self.benchmark_config,
+                 )
+             case TaskGroup.TEXT_TO_TEXT:
+                 return partial(
+                     text_to_text.compute_metrics,
+                     dataset_config=self.dataset_config,
+                     benchmark_config=self.benchmark_config,
+                 )
+             case TaskGroup.TOKEN_CLASSIFICATION:
+                 return partial(
+                     token_classification.compute_metrics,
+                     has_misc_tags=self.buffer.get("has_misc_tags", True),
+                     dataset_config=self.dataset_config,
+                     benchmark_config=self.benchmark_config,
+                 )
+             case TaskGroup.QUESTION_ANSWERING:
+                 return partial(
+                     question_answering.compute_metrics,
+                     dataset_config=self.dataset_config,
+                     benchmark_config=self.benchmark_config,
+                 )
+             case _:
+                 raise NotImplementedError(
+                     f"Unsupported task group: {self.dataset_config.task.task_group}."
+                 )
+
+     @property
+     @abstractmethod
+     def extract_labels_from_generation(self) -> ExtractLabelsFunction:
+         """The function used to extract the labels from the generated output.
+
+         Returns:
+             The function used to extract the labels from the generated output.
+         """
+         ...
+
+     @property
+     @abstractmethod
+     def trainer_class(self) -> t.Type["Trainer"]:
+         """The Trainer class to use for finetuning.
+
+         Returns:
+             The Trainer class.
+         """
+         ...
+
+     def prepare_datasets(
+         self, datasets: list[DatasetDict], task: Task
+     ) -> list[DatasetDict]:
+         """Prepare the datasets for the model.
+
+         This includes things like tokenisation.
+
+         Args:
+             datasets:
+                 The datasets to prepare.
+             task:
+                 The task to prepare the datasets for.
+
+         Returns:
+             The prepared datasets.
+         """
+         for idx, dataset in enumerate(
+             tqdm(iterable=datasets, desc="Preparing datasets")
+         ):
+             prepared_dataset = self.prepare_dataset(
+                 dataset=dataset, task=task, itr_idx=idx
+             )
+             if self.dataset_config.task.task_group == TaskGroup.TOKEN_CLASSIFICATION:
+                 labels_in_train: set[str] = {
+                     tag for tag_list in dataset["train"]["labels"] for tag in tag_list
+                 }
+                 self.buffer["has_misc_tags"] = (
+                     "B-MISC" in labels_in_train or "I-MISC" in labels_in_train
+                 )
+             datasets[idx] = DatasetDict(
+                 dict(
+                     train=prepared_dataset["train"],
+                     val=prepared_dataset["val"],
+                     test=prepared_dataset["test"],
+                     original_train=dataset["train"],
+                     original_val=dataset["val"],
+                     original_test=dataset["test"],
+                 )
+             )
+         return datasets
+
+     @abstractmethod
+     def prepare_dataset(
+         self, dataset: DatasetDict, task: Task, itr_idx: int
+     ) -> DatasetDict:
+         """Prepare the dataset for the model.
+
+         This includes things like tokenisation.
+
+         Args:
+             dataset:
+                 The dataset to prepare.
+             task:
+                 The task to prepare the dataset for.
+             itr_idx:
+                 The index of the dataset in the iterator.
+
+         Returns:
+             The prepared dataset.
+         """
+         ...
+
+     def generate(self, inputs: dict) -> GenerativeModelOutput:
+         """Generate outputs from the model.
+
+         Args:
+             inputs:
+                 A batch of inputs to pass through the model.
+
+         Returns:
+             The generated model outputs.
+         """
+         raise NotImplementedError(
+             "The `generate` method has not been implemented for "
+             f"{self.__class__.__name__}."
+         )
+
+     @classmethod
+     @abstractmethod
+     def model_exists(
+         cls, model_id: str, benchmark_config: BenchmarkConfig
+     ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
+         """Check if a model exists.
+
+         Args:
+             model_id:
+                 The model ID.
+             benchmark_config:
+                 The benchmark configuration.
+
+         Returns:
+             Whether the model exists, or an error describing why we cannot check
+             whether the model exists.
+         """
+         ...
+
+     @classmethod
+     @abstractmethod
+     def get_model_config(
+         cls, model_id: str, benchmark_config: BenchmarkConfig
+     ) -> ModelConfig:
+         """Fetch the model configuration.
+
+         Args:
+             model_id:
+                 The model ID.
+             benchmark_config:
+                 The benchmark configuration.
+
+         Returns:
+             The model configuration.
+         """
+         ...
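
Every concrete backend in euroeval/benchmark_modules/ has to fill in the abstract members above. The skeleton below is a minimal, hypothetical sketch of that contract, based only on the interface shown in this diff; the class name, the dummy numbers and the no-op bodies are illustrative and not part of the package, and `batching_preference` is left unset because the `BatchingPreference` members are not shown here.

    import collections.abc as c
    import typing as t

    from datasets import DatasetDict
    from transformers import Trainer

    from euroeval.benchmark_modules.base import BenchmarkModule
    from euroeval.data_models import BenchmarkConfig, ModelConfig, Task
    from euroeval.enums import GenerativeType
    from euroeval.types import ExtractLabelsFunction


    class DummyEncoderModule(BenchmarkModule):
        """Hypothetical subclass illustrating the abstract interface above."""

        fresh_model = False
        high_priority = False
        # `batching_preference` must also be set to a `BatchingPreference` member.

        @property
        def num_params(self) -> int:
            return 125_000_000  # illustrative value

        @property
        def generative_type(self) -> GenerativeType | None:
            return None  # not a generative model

        @property
        def vocab_size(self) -> int:
            return 50_000  # illustrative value

        @property
        def model_max_length(self) -> int:
            return 512  # illustrative value

        @property
        def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
            return lambda samples: {"samples": samples}  # placeholder collator

        @property
        def extract_labels_from_generation(self) -> ExtractLabelsFunction:
            raise NotImplementedError  # only relevant for generative models

        @property
        def trainer_class(self) -> t.Type[Trainer]:
            return Trainer

        def prepare_dataset(
            self, dataset: DatasetDict, task: Task, itr_idx: int
        ) -> DatasetDict:
            return dataset  # a real module would tokenise here

        @classmethod
        def model_exists(
            cls, model_id: str, benchmark_config: BenchmarkConfig
        ) -> bool:
            return model_id == "dummy-model"

        @classmethod
        def get_model_config(
            cls, model_id: str, benchmark_config: BenchmarkConfig
        ) -> ModelConfig:
            raise NotImplementedError  # a real module would build a ModelConfig here
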
euroeval/benchmark_modules/fresh.py
@@ -0,0 +1,286 @@
+ """Freshly initialised encoder models."""
+
+ import os
+ from functools import cached_property
+ from json import JSONDecodeError
+
+ from transformers import (
+     AutoConfig,
+     AutoTokenizer,
+     ElectraForQuestionAnswering,
+     ElectraForSequenceClassification,
+     ElectraForTokenClassification,
+     PretrainedConfig,
+     PreTrainedModel,
+     PreTrainedTokenizer,
+     XLMRobertaForQuestionAnswering,
+     XLMRobertaForSequenceClassification,
+     XLMRobertaForTokenClassification,
+ )
+
+ from ..data_models import BenchmarkConfig, DatasetConfig, ModelConfig
+ from ..enums import InferenceBackend, ModelType, TaskGroup
+ from ..exceptions import (
+     InvalidBenchmark,
+     InvalidModel,
+     NeedsEnvironmentVariable,
+     NeedsExtraInstalled,
+ )
+ from ..utils import block_terminal_output, create_model_cache_dir
+ from .hf import (
+     HuggingFaceEncoderModel,
+     align_model_and_tokenizer,
+     setup_model_for_question_answering,
+ )
+
+
+ class FreshEncoderModel(HuggingFaceEncoderModel):
+     """A freshly initialised encoder model."""
+
+     fresh_model = True
+
+     def __init__(
+         self,
+         model_config: ModelConfig,
+         dataset_config: DatasetConfig,
+         benchmark_config: BenchmarkConfig,
+     ) -> None:
+         """Initialise the model.
+
+         Args:
+             model_config:
+                 The model configuration.
+             dataset_config:
+                 The dataset configuration.
+             benchmark_config:
+                 The benchmark configuration.
+         """
+         # This is already set when calling `super.__init__`, but we need it to get a
+         # value from `self.model_max_length`, so we set it here as well.
+         self.model_config = model_config
+
+         model, tokenizer = load_model_and_tokenizer(
+             model_config=model_config,
+             dataset_config=dataset_config,
+             benchmark_config=benchmark_config,
+             model_max_length=self.model_max_length,
+         )
+         self._model: PreTrainedModel = model
+         self._tokenizer: PreTrainedTokenizer = tokenizer
+
+         self._model, self._tokenizer = align_model_and_tokenizer(
+             model=self._model,
+             tokenizer=self._tokenizer,
+             model_max_length=self.model_max_length,
+             raise_errors=benchmark_config.raise_errors,
+         )
+
+         # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
+         # to call the `__init__` method of the `BenchmarkModule` class.
+         super(HuggingFaceEncoderModel, self).__init__(
+             model_config=model_config,
+             dataset_config=dataset_config,
+             benchmark_config=benchmark_config,
+         )
+
+     @cached_property
+     def num_params(self) -> int:
+         """The number of parameters in the model.
+
+         Returns:
+             The number of parameters in the model.
+         """
+         match self.model_config.model_id:
+             case "fresh-xlm-roberta-base":
+                 return 278_885_778
+             case "fresh-electra-small":
+                 return 13_738_755
+             case _:
+                 raise NotImplementedError(
+                     f"Number of parameters for model {self.model_config.model_id} is "
+                     "not implemented."
+                 )
+
+     @cached_property
+     def vocab_size(self) -> int:
+         """The vocabulary size of the model.
+
+         Returns:
+             The vocabulary size of the model.
+         """
+         match self.model_config.model_id:
+             case "fresh-xlm-roberta-base":
+                 return 250_002
+             case "fresh-electra-small":
+                 return 32_000
+             case _:
+                 raise NotImplementedError(
+                     f"Vocabulary size for model {self.model_config.model_id} is not "
+                     "implemented."
+                 )
+
+     @cached_property
+     def model_max_length(self) -> int:
+         """The maximum context length of the model.
+
+         Returns:
+             The maximum context length of the model.
+         """
+         match self.model_config.model_id:
+             case "fresh-xlm-roberta-base":
+                 return 512
+             case "fresh-electra-small":
+                 return 128
+             case _:
+                 raise NotImplementedError(
+                     f"Maximum context length for model {self.model_config.model_id} is "
+                     "not implemented."
+                 )
+
+     @classmethod
+     def model_exists(
+         cls, model_id: str, benchmark_config: BenchmarkConfig
+     ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
+         """Check if a model exists.
+
+         Args:
+             model_id:
+                 The model ID.
+             benchmark_config:
+                 The benchmark configuration.
+
+         Returns:
+             Whether the model exists, or an error describing why we cannot check
+             whether the model exists.
+         """
+         valid_models = ["fresh-electra-small", "fresh-xlm-roberta-base"]
+         return model_id in valid_models
+
+     @classmethod
+     def get_model_config(
+         cls, model_id: str, benchmark_config: BenchmarkConfig
+     ) -> ModelConfig:
+         """Fetch the model configuration.
+
+         Args:
+             model_id:
+                 The model ID.
+             benchmark_config:
+                 The benchmark configuration.
+
+         Returns:
+             The model configuration.
+         """
+         return ModelConfig(
+             model_id=model_id,
+             task="fill-mask",
+             languages=list(),
+             revision="main",
+             merge=False,
+             inference_backend=InferenceBackend.TRANSFORMERS,
+             model_type=ModelType.ENCODER,
+             fresh=True,
+             model_cache_dir=create_model_cache_dir(
+                 cache_dir=benchmark_config.cache_dir, model_id=model_id
+             ),
+             adapter_base_model_id=None,
+         )
+
+
+ def load_model_and_tokenizer(
+     model_config: ModelConfig,
+     dataset_config: DatasetConfig,
+     benchmark_config: BenchmarkConfig,
+     model_max_length: int,
+ ) -> tuple[PreTrainedModel, PreTrainedTokenizer]:
+     """Load the model and tokenizer.
+
+     Args:
+         model_config:
+             The model configuration.
+         dataset_config:
+             The dataset configuration.
+         benchmark_config:
+             The benchmark configuration.
+         model_max_length:
+             The maximum context length of the model.
+
+     Returns:
+         The loaded model and tokenizer.
+     """
+     config: "PretrainedConfig"
+     block_terminal_output()
+
+     # Get the fresh model ID and the corresponding real model ID
+     model_id = model_config.model_id.replace("-", "_")
+     fresh_to_real_model_id_mapping = dict(
+         fresh_xlm_roberta_base="FacebookAI/xlm-roberta-base",
+         fresh_electra_small="google/electra-small-discriminator",
+     )
+     real_model_id = fresh_to_real_model_id_mapping[model_id]
+
+     match dataset_config.task.task_group:
+         case (
+             TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+         ):
+             model_cls_mapping = dict(
+                 fresh_xlm_roberta_base=XLMRobertaForSequenceClassification,
+                 fresh_electra_small=ElectraForSequenceClassification,
+             )
+         case TaskGroup.TOKEN_CLASSIFICATION:
+             model_cls_mapping = dict(
+                 fresh_xlm_roberta_base=XLMRobertaForTokenClassification,
+                 fresh_electra_small=ElectraForTokenClassification,
+             )
+         case TaskGroup.QUESTION_ANSWERING:
+             model_cls_mapping = dict(
+                 fresh_xlm_roberta_base=XLMRobertaForQuestionAnswering,
+                 fresh_electra_small=ElectraForQuestionAnswering,
+             )
+         case _:
+             raise InvalidBenchmark(
+                 f"Task group {dataset_config.task.task_group} is not "
+                 f"supported for model {model_config.model_id}."
+             )
+     model_cls = model_cls_mapping[model_id]
+
+     config = AutoConfig.from_pretrained(
+         real_model_id,
+         token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+         num_labels=dataset_config.num_labels,
+         id2label=dataset_config.id2label,
+         label2id=dataset_config.label2id,
+         cache_dir=model_config.model_cache_dir,
+         trust_remote_code=benchmark_config.trust_remote_code,
+     )
+     model = model_cls(config)
+
+     if dataset_config.task.task_group == TaskGroup.QUESTION_ANSWERING:
+         model = setup_model_for_question_answering(model=model)
+
+     # Load the tokenizer. If the model is a subclass of a RoBERTa model then we
+     # have to add a prefix space to the tokens, by the way the model is constructed
+     prefix_models = ["Roberta", "GPT", "Deberta"]
+     prefix = any(model_type in type(model).__name__ for model_type in prefix_models)
+     try:
+         tokenizer: "PreTrainedTokenizer" = AutoTokenizer.from_pretrained(
+             real_model_id,
+             revision=model_config.revision,
+             token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+             add_prefix_space=prefix,
+             cache_dir=model_config.model_cache_dir,
+             use_fast=True,
+             verbose=False,
+             trust_remote_code=benchmark_config.trust_remote_code,
+         )
+     except (JSONDecodeError, OSError):
+         raise InvalidModel(f"Could not load tokenizer for model {real_model_id!r}.")
+
+     model, tokenizer = align_model_and_tokenizer(
+         model=model,
+         tokenizer=tokenizer,
+         model_max_length=model_max_length,
+         raise_errors=benchmark_config.raise_errors,
+     )
+
+     return model, tokenizer
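
The point of load_model_and_tokenizer above is that only the configuration and tokenizer of the real backbone are downloaded; the weights stay randomly initialised because the model class is instantiated directly from the config instead of via from_pretrained. The standalone snippet below reproduces that pattern for the fresh-xlm-roberta-base case using plain transformers; the three sentiment labels are made up for the example and do not come from any EuroEval dataset config.

    from transformers import AutoConfig, AutoTokenizer, XLMRobertaForSequenceClassification

    # Resolve the fresh model ID to its real backbone, as in the mapping above.
    real_model_id = "FacebookAI/xlm-roberta-base"

    # Only the config is fetched; num_labels/id2label/label2id shape the task head.
    config = AutoConfig.from_pretrained(
        real_model_id,
        num_labels=3,
        id2label={0: "negative", 1: "neutral", 2: "positive"},
        label2id={"negative": 0, "neutral": 1, "positive": 2},
    )

    # Instantiating from the config (rather than `from_pretrained`) yields a model with
    # randomly initialised weights: a "fresh" encoder with a pretrained architecture.
    model = XLMRobertaForSequenceClassification(config)

    # The tokenizer is still loaded from the Hub, since even a fresh model needs the
    # real vocabulary (250,002 tokens for XLM-RoBERTa).
    tokenizer = AutoTokenizer.from_pretrained(real_model_id, use_fast=True)
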