EuroEval 15.2.0 (euroeval-15.2.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval has been flagged as potentially problematic.
Files changed (40)
  1. euroeval/__init__.py +72 -0
  2. euroeval/benchmark_config_factory.py +358 -0
  3. euroeval/benchmark_modules/__init__.py +7 -0
  4. euroeval/benchmark_modules/base.py +354 -0
  5. euroeval/benchmark_modules/fresh.py +286 -0
  6. euroeval/benchmark_modules/hf.py +1185 -0
  7. euroeval/benchmark_modules/litellm.py +905 -0
  8. euroeval/benchmark_modules/vllm.py +1171 -0
  9. euroeval/benchmarker.py +1074 -0
  10. euroeval/callbacks.py +72 -0
  11. euroeval/cli.py +281 -0
  12. euroeval/constants.py +50 -0
  13. euroeval/data_loading.py +96 -0
  14. euroeval/data_models.py +474 -0
  15. euroeval/dataset_configs.py +2001 -0
  16. euroeval/enums.py +144 -0
  17. euroeval/exceptions.py +191 -0
  18. euroeval/finetuning.py +324 -0
  19. euroeval/generation.py +296 -0
  20. euroeval/human_evaluation.py +737 -0
  21. euroeval/languages.py +200 -0
  22. euroeval/model_cache.py +253 -0
  23. euroeval/model_config.py +77 -0
  24. euroeval/model_loading.py +78 -0
  25. euroeval/scores.py +90 -0
  26. euroeval/speed_benchmark.py +124 -0
  27. euroeval/task_utils/__init__.py +1 -0
  28. euroeval/task_utils/multiple_choice_classification.py +176 -0
  29. euroeval/task_utils/question_answering.py +698 -0
  30. euroeval/task_utils/sequence_classification.py +237 -0
  31. euroeval/task_utils/text_to_text.py +150 -0
  32. euroeval/task_utils/token_classification.py +464 -0
  33. euroeval/tasks.py +202 -0
  34. euroeval/types.py +97 -0
  35. euroeval/utils.py +574 -0
  36. euroeval-15.2.0.dist-info/METADATA +234 -0
  37. euroeval-15.2.0.dist-info/RECORD +40 -0
  38. euroeval-15.2.0.dist-info/WHEEL +4 -0
  39. euroeval-15.2.0.dist-info/entry_points.txt +4 -0
  40. euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0
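For orientation before reading the hf.py diff below: the package's console script (entry_points.txt, backed by euroeval/cli.py) drives the same benchmarking machinery that the library exposes programmatically. A rough, unverified sketch of that programmatic use follows; the `Benchmarker` name and its arguments are assumptions inferred from the package layout and have not been checked against this release, so consult the README in the METADATA file for the actual API.

    from euroeval import Benchmarker

    # Hypothetical usage sketch, not taken from this wheel's documentation.
    benchmarker = Benchmarker()
    benchmarker.benchmark(model="intfloat/multilingual-e5-small")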
euroeval/benchmark_modules/hf.py
@@ -0,0 +1,1185 @@
+ """Encoder models from the Hugging Face Hub."""
+
+ import collections.abc as c
+ import logging
+ import os
+ import typing as t
+ from functools import cached_property, partial
+ from json import JSONDecodeError
+ from pathlib import Path
+ from time import sleep
+
+ import torch
+ from datasets import DatasetDict
+ from huggingface_hub import HfApi
+ from huggingface_hub import whoami as hf_whoami
+ from huggingface_hub.hf_api import ModelInfo as HfApiModelInfo
+ from huggingface_hub.hf_api import RepositoryNotFoundError, RevisionNotFoundError
+ from huggingface_hub.utils import (
+     GatedRepoError,
+     HFValidationError,
+     LocalTokenNotFoundError,
+ )
+ from requests.exceptions import RequestException
+ from torch import nn
+ from transformers import (
+     AutoConfig,
+     AutoTokenizer,
+     BatchEncoding,
+     DataCollatorForTokenClassification,
+     DataCollatorWithPadding,
+     PretrainedConfig,
+     PreTrainedModel,
+     PreTrainedTokenizer,
+     Trainer,
+ )
+ from transformers.modelcard import TASK_MAPPING
+ from urllib3.exceptions import RequestError
+
+ from ..constants import (
+     DUMMY_FILL_VALUE,
+     GENERATIVE_PIPELINE_TAGS,
+     LOCAL_MODELS_REQUIRED_FILES,
+     MERGE_TAGS,
+ )
+ from ..data_models import BenchmarkConfig, DatasetConfig, HFModelInfo, ModelConfig, Task
+ from ..enums import (
+     BatchingPreference,
+     GenerativeType,
+     InferenceBackend,
+     ModelType,
+     TaskGroup,
+ )
+ from ..exceptions import (
+     HuggingFaceHubDown,
+     InvalidBenchmark,
+     InvalidModel,
+     NeedsAdditionalArgument,
+     NeedsEnvironmentVariable,
+     NeedsExtraInstalled,
+     NoInternetConnection,
+ )
+ from ..languages import get_all_languages
+ from ..task_utils import (
+     multiple_choice_classification,
+     question_answering,
+     token_classification,
+ )
+ from ..types import ExtractLabelsFunction
+ from ..utils import (
+     block_terminal_output,
+     create_model_cache_dir,
+     get_bos_token,
+     get_class_by_name,
+     get_eos_token,
+     internet_connection_available,
+ )
+ from .base import BenchmarkModule
+
+ logger = logging.getLogger("euroeval")
+
+
+ class HuggingFaceEncoderModel(BenchmarkModule):
+     """An encoder model from the Hugging Face Hub."""
+
+     fresh_model = False
+     batching_preference = BatchingPreference.NO_PREFERENCE
+     high_priority = True
+
+     def __init__(
+         self,
+         model_config: ModelConfig,
+         dataset_config: DatasetConfig,
+         benchmark_config: BenchmarkConfig,
+     ) -> None:
+         """Initialise the model.
+
+         Args:
+             model_config:
+                 The model configuration.
+             dataset_config:
+                 The dataset configuration.
+             benchmark_config:
+                 The benchmark configuration.
+         """
+         model, tokenizer = load_model_and_tokenizer(
+             model_config=model_config,
+             dataset_config=dataset_config,
+             benchmark_config=benchmark_config,
+         )
+         self._model: PreTrainedModel = model
+         self._tokenizer: PreTrainedTokenizer = tokenizer
+
+         self._model, self._tokenizer = align_model_and_tokenizer(
+             model=self._model,
+             tokenizer=self._tokenizer,
+             model_max_length=self.model_max_length,
+             raise_errors=benchmark_config.raise_errors,
+         )
+
+         super().__init__(
+             model_config=model_config,
+             dataset_config=dataset_config,
+             benchmark_config=benchmark_config,
+         )
+
+     @cached_property
+     def num_params(self) -> int:
+         """The number of parameters in the model.
+
+         Returns:
+             The number of parameters in the model.
+         """
+         token = (
+             self.benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True
+         )
+         hf_api = HfApi(token=token)
+         try:
+             repo_info = hf_api.model_info(
+                 repo_id=self.model_config.adapter_base_model_id
+                 or self.model_config.model_id,
+                 revision=self.model_config.revision,
+             )
+         except (
+             RepositoryNotFoundError,
+             RevisionNotFoundError,
+             RequestException,
+             HFValidationError,
+         ):
+             repo_info = None
+
+         if (
+             repo_info is not None
+             and hasattr(repo_info, "safetensors")
+             and repo_info.safetensors is not None
+             and "total" in repo_info.safetensors
+         ):
+             num_params = repo_info.safetensors["total"]
+         elif (
+             hasattr(self._model.config, "num_params")
+             and self._model.config.num_params is not None
+         ):
+             num_params = self._model.config.num_params
+         elif hasattr(self._model, "parameters"):
+             num_params = sum(p.numel() for p in self._model.parameters())
+         else:
+             logger.warning(
+                 "The number of parameters could not be determined for the model, since "
+                 "the model is not stored in the safetensors format. If this is your "
+                 "own model, then you can use this Hugging Face Space to convert your "
+                 "model to the safetensors format: "
+                 "https://huggingface.co/spaces/safetensors/convert."
+             )
+             num_params = -1
+         return num_params
+
+     @cached_property
+     def vocab_size(self) -> int:
+         """The vocabulary size of the model.
+
+         Returns:
+             The vocabulary size of the model.
+         """
+         if (
+             hasattr(self._model.config, "vocab_size")
+             and self._model.config.vocab_size is not None
+         ):
+             vocab_size = self._model.config.vocab_size
+         elif (
+             hasattr(self._tokenizer, "vocab_size")
+             and self._tokenizer.vocab_size is not None
+         ):
+             vocab_size = self._tokenizer.vocab_size
+         else:
+             vocab_size = -1
+         return vocab_size
+
+     @cached_property
+     def model_max_length(self) -> int:
+         """The maximum context length of the model.
+
+         Returns:
+             The maximum context length of the model.
+         """
+         all_max_lengths: list[int] = list()
+
+         # Add the registered max length of the tokenizer
+         if hasattr(
+             self._tokenizer, "model_max_length"
+         ) and self._tokenizer.model_max_length < int(1e30):
+             all_max_lengths.append(self._tokenizer.model_max_length)
+
+         # Add the max length derived from the model's input sizes
+         if hasattr(self._tokenizer, "max_model_input_sizes"):
+             all_max_lengths.extend(
+                 [
+                     size
+                     for size in self._tokenizer.max_model_input_sizes.values()
+                     if size is not None
+                 ]
+             )
+
+         # Add max length candidates from the model's configuration
+         candidate_config_max_lengths = [
+             "max_position_embeddings",
+             "max_sequence_length",
+             "model_max_length",
+             "sliding_window",
+             "sliding_window_size",
+             "n_positions",
+         ]
+         for candidate_config_max_length in candidate_config_max_lengths:
+             if (
+                 hasattr(self._model.config, candidate_config_max_length)
+                 and (value := getattr(self._model.config, candidate_config_max_length))
+                 is not None
+             ):
+                 all_max_lengths.append(value)
+
+         # To avoid models having artificially low max lengths, we remove any max lengths
+         # that are less than 128
+         all_max_lengths = [
+             max_length for max_length in all_max_lengths if max_length >= 128
+         ]
+
+         if len(list(all_max_lengths)) > 0:
+             model_max_length = min(list(all_max_lengths))
+         else:
+             model_max_length = -1
+
+         return model_max_length
+
+     @property
+     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
+         """The data collator used to prepare samples during finetuning.
+
+         Returns:
+             The data collator.
+         """
+         match self.dataset_config.task.task_group:
+             case (
+                 TaskGroup.SEQUENCE_CLASSIFICATION
+                 | TaskGroup.TEXT_TO_TEXT
+                 | TaskGroup.QUESTION_ANSWERING
+                 | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+             ):
+                 return DataCollatorWithPadding(self._tokenizer, padding="longest")
+             case TaskGroup.TOKEN_CLASSIFICATION:
+                 return DataCollatorForTokenClassification(
+                     tokenizer=self._tokenizer, label_pad_token_id=-100
+                 )
+             case _:
+                 raise NotImplementedError(
+                     f"Unsupported task group: {self.dataset_config.task.task_group}."
+                 )
+
+     @property
+     def generative_type(self) -> GenerativeType | None:
+         """Get the generative type of the model.
+
+         Returns:
+             The generative type of the model, or None if it has not been set yet.
+         """
+         return None
+
+     @property
+     def extract_labels_from_generation(self) -> ExtractLabelsFunction:
+         """The function used to extract the labels from the generated output.
+
+         Returns:
+             The function used to extract the labels from the generated output.
+         """
+         raise NotImplementedError(
+             "The `extract_labels_from_generation` property has not been implemented "
+             "for Hugging Face Encoder models."
+         )
+
+     @property
+     def trainer_class(self) -> t.Type["Trainer"]:
+         """The Trainer class to use for finetuning.
+
+         Returns:
+             The Trainer class.
+         """
+         match self.dataset_config.task.task_group:
+             case (
+                 TaskGroup.SEQUENCE_CLASSIFICATION
+                 | TaskGroup.TEXT_TO_TEXT
+                 | TaskGroup.TOKEN_CLASSIFICATION
+             ):
+                 return Trainer
+             case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+                 return (
+                     multiple_choice_classification.MultipleChoiceClassificationTrainer
+                 )
+             case TaskGroup.QUESTION_ANSWERING:
+                 return question_answering.QuestionAnsweringTrainer
+             case _:
+                 raise NotImplementedError(
+                     f"Unsupported task group: {self.dataset_config.task.task_group}."
+                 )
+
+     def prepare_dataset(
+         self, dataset: DatasetDict, task: Task, itr_idx: int
+     ) -> DatasetDict:
+         """Prepare the dataset for the model.
+
+         This includes things like tokenisation.
+
+         Args:
+             dataset:
+                 The dataset to prepare.
+             task:
+                 The task to prepare the dataset for.
+             itr_idx:
+                 The index of the dataset in the iterator.
+
+         Returns:
+             The prepared dataset.
+         """
+
+         def numericalise_labels(examples: dict) -> dict:
+             if "label" in examples:
+                 try:
+                     examples["label"] = [
+                         self._model.config.label2id[lbl.lower()]
+                         for lbl in examples["label"]
+                     ]
+                 except KeyError:
+                     raise InvalidBenchmark(
+                         f"One of the labels in the dataset batch, "
+                         f"{examples['label']}, does not occur in the "
+                         f"label2id dictionary {self._model.config.label2id}."
+                     )
+             return examples
+
+         def tokenise(examples: dict) -> BatchEncoding:
+             return self._tokenizer(text=examples["text"], truncation=True, padding=True)
+
+         match task.task_group:
+             case TaskGroup.SEQUENCE_CLASSIFICATION:
+                 dataset = dataset.map(
+                     numericalise_labels, batched=True, load_from_cache_file=False
+                 ).map(tokenise, batched=True, load_from_cache_file=False)
+
+             case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+                 dataset = DatasetDict(
+                     train=dataset["train"].map(
+                         partial(
+                             multiple_choice_classification.prepare_examples,
+                             tokenizer=self._tokenizer,
+                         ),
+                         batched=True,
+                         batch_size=1,
+                         remove_columns=dataset["train"].column_names,
+                         load_from_cache_file=False,
+                         keep_in_memory=True,
+                     ),
+                     val=dataset["val"].map(
+                         partial(
+                             multiple_choice_classification.prepare_examples,
+                             tokenizer=self._tokenizer,
+                         ),
+                         batched=True,
+                         batch_size=1,
+                         remove_columns=dataset["val"].column_names,
+                         load_from_cache_file=False,
+                         keep_in_memory=True,
+                     ),
+                     test=dataset["test"].map(
+                         partial(
+                             multiple_choice_classification.prepare_examples,
+                             tokenizer=self._tokenizer,
+                         ),
+                         batched=True,
+                         batch_size=1,
+                         remove_columns=dataset["test"].column_names,
+                         load_from_cache_file=False,
+                         keep_in_memory=True,
+                     ),
+                 )
+
+             case TaskGroup.TEXT_TO_TEXT:
+                 dataset = dataset.map(
+                     tokenise,
+                     batched=True,
+                     load_from_cache_file=False,
+                     keep_in_memory=True,
+                 )
+
+             case TaskGroup.TOKEN_CLASSIFICATION:
+                 dataset = dataset.map(
+                     partial(
+                         token_classification.tokenize_and_align_labels,
+                         tokenizer=self._tokenizer,
+                         label2id=self._model.config.label2id,
+                     ),
+                     batched=True,
+                     load_from_cache_file=False,
+                     keep_in_memory=True,
+                 )
+
+             case TaskGroup.QUESTION_ANSWERING:
+                 dataset = DatasetDict(
+                     dict(
+                         train=dataset["train"].map(
+                             partial(
+                                 question_answering.prepare_train_examples,
+                                 tokenizer=self._tokenizer,
+                             ),
+                             batched=True,
+                             batch_size=10,
+                             remove_columns=dataset["test"].column_names,
+                             load_from_cache_file=False,
+                             keep_in_memory=True,
+                         ),
+                         val=dataset["val"].map(
+                             partial(
+                                 question_answering.prepare_train_examples,
+                                 tokenizer=self._tokenizer,
+                             ),
+                             batched=True,
+                             batch_size=10,
+                             remove_columns=dataset["test"].column_names,
+                             load_from_cache_file=False,
+                             keep_in_memory=True,
+                         ),
+                         test=dataset["test"].map(
+                             partial(
+                                 question_answering.prepare_test_examples,
+                                 tokenizer=self._tokenizer,
+                             ),
+                             batched=True,
+                             batch_size=10,
+                             remove_columns=dataset["test"].column_names,
+                             load_from_cache_file=False,
+                             keep_in_memory=True,
+                         ),
+                     )
+                 )
+
+                 # The Trainer hides the columns that are not used by the model (here
+                 # `id` and `offset_mapping` which we will need for our post-processing),
+                 # so we put them back
+                 for split_name, split in dataset.items():
+                     dataset[split_name].set_format(
+                         type=split.format["type"], columns=list(split.features.keys())
+                     )
+
+             case _:
+                 raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
+
+         return dataset
+
+     @classmethod
+     def model_exists(
+         cls, model_id: str, benchmark_config: BenchmarkConfig
+     ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
+         """Check if a model exists.
+
+         Args:
+             model_id:
+                 The model ID.
+             benchmark_config:
+                 The benchmark configuration.
+
+         Returns:
+             Whether the model exists, or an error describing why we cannot check
+             whether the model exists.
+         """
+         model_id, revision = (
+             model_id.split("@") if "@" in model_id else (model_id, "main")
+         )
+         model_info = get_model_repo_info(
+             model_id=model_id, revision=revision, benchmark_config=benchmark_config
+         )
+         return (
+             model_info is not None
+             and model_info.pipeline_tag not in GENERATIVE_PIPELINE_TAGS
+         )
+
+     @classmethod
+     def get_model_config(
+         cls, model_id: str, benchmark_config: BenchmarkConfig
+     ) -> ModelConfig:
+         """Fetch the model configuration.
+
+         Args:
+             model_id:
+                 The model ID.
+             benchmark_config:
+                 The benchmark configuration.
+
+         Returns:
+             The model configuration.
+         """
+         model_id, revision = (
+             model_id.split("@") if "@" in model_id else (model_id, "main")
+         )
+         model_info = get_model_repo_info(
+             model_id=model_id, revision=revision, benchmark_config=benchmark_config
+         )
+         if model_info is None:
+             raise InvalidModel(f"The model {model_id!r} could not be found.")
+
+         language_mapping = get_all_languages()
+         language_codes = list(language_mapping.keys())
+
+         model_config = ModelConfig(
+             model_id=model_id,
+             revision=revision,
+             task=model_info.pipeline_tag,
+             languages=[
+                 language_mapping[tag]
+                 for tag in model_info.tags
+                 if tag in language_codes
+             ],
+             merge=any(tag in model_info.tags for tag in MERGE_TAGS),
+             inference_backend=InferenceBackend.TRANSFORMERS,
+             model_type=ModelType.ENCODER,
+             fresh=False,
+             model_cache_dir=create_model_cache_dir(
+                 cache_dir=benchmark_config.cache_dir, model_id=model_id
+             ),
+             adapter_base_model_id=None,
+         )
+
+         return model_config
+
+
+ def load_model_and_tokenizer(
+     model_config: ModelConfig,
+     dataset_config: DatasetConfig,
+     benchmark_config: BenchmarkConfig,
+ ) -> tuple[PreTrainedModel, PreTrainedTokenizer]:
+     """Load the model and tokenizer.
+
+     Args:
+         model_config:
+             The model configuration.
+         dataset_config:
+             The dataset configuration.
+         benchmark_config:
+             The benchmark configuration
+
+     Returns:
+         The loaded model and tokenizer.
+     """
+     config: "PretrainedConfig"
+     block_terminal_output()
+
+     model_id = model_config.model_id
+     task_group = dataset_config.task.task_group
+     ignore_mismatched_sizes = False
+
+     # Special case where there is a mismatch between the labels during training and
+     # testing
+     if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+         id2label = {0: "0", 1: "1"}
+     else:
+         id2label = dataset_config.id2label
+
+     config = load_hf_model_config(
+         model_id=model_id,
+         num_labels=len(id2label),
+         id2label=id2label,
+         label2id={label: idx for idx, label in id2label.items()},
+         revision=model_config.revision,
+         model_cache_dir=model_config.model_cache_dir,
+         api_key=benchmark_config.api_key,
+         trust_remote_code=benchmark_config.trust_remote_code,
+         run_with_cli=benchmark_config.run_with_cli,
+     )
+
+     model_kwargs = dict(
+         config=config,
+         ignore_mismatched_sizes=ignore_mismatched_sizes,
+         revision=model_config.revision,
+         token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+         cache_dir=model_config.model_cache_dir,
+         trust_remote_code=benchmark_config.trust_remote_code,
+         torch_dtype=get_torch_dtype(
+             device=benchmark_config.device,
+             torch_dtype_is_set=config.to_dict().get("torch_dtype") is not None,
+             bf16_available=(
+                 torch.cuda.is_available() and torch.cuda.is_bf16_supported()
+             ),
+         ),
+     )
+
+     # These are used when a timeout occurs
+     attempts_left = 5
+
+     model: PreTrainedModel | None = None
+     while True:
+         # Get the model class associated with the task group
+         model_cls_or_none: t.Type["PreTrainedModel"] | None = get_class_by_name(
+             class_name=task_group_to_class_name(task_group=task_group),
+             module_name="transformers",
+         )
+
+         # If the model class could not be found then raise an error
+         if not model_cls_or_none:
+             raise InvalidBenchmark(
+                 f"The task group {task_group.value!r} does not correspond to a "
+                 "Hugging Face AutoModel type (such as "
+                 "`AutoModelForSequenceClassification`)."
+             )
+
+         # If the model is a DeBERTaV2 model then we ensure that
+         # `pooler_hidden_size` is the same size as `hidden_size`
+         if config.model_type == "deberta-v2":
+             config.pooler_hidden_size = config.hidden_size
+
+         try:
+             model_or_tuple = model_cls_or_none.from_pretrained(
+                 model_config.model_id, **model_kwargs
+             )
+             break
+         except (KeyError, RuntimeError) as e:
+             if not model_kwargs["ignore_mismatched_sizes"]:
+                 logger.debug(
+                     f"{type(e).__name__} occurred during the loading "
+                     f"of the {model_id!r} model. Retrying with "
+                     "`ignore_mismatched_sizes` set to True."
+                 )
+                 model_kwargs["ignore_mismatched_sizes"] = True
+                 continue
+             else:
+                 raise InvalidModel(str(e))
+         except (TimeoutError, RequestError):
+             attempts_left -= 1
+             if attempts_left == 0:
+                 raise InvalidModel("The model could not be loaded after 5 attempts.")
+             logger.info(f"Couldn't load the model {model_id!r}. Retrying.")
+             sleep(5)
+             continue
+         except (OSError, ValueError) as e:
+             if "checkpoint seems to be incorrect" in str(e):
+                 raise InvalidModel(
+                     f"The model {model_id!r} has an incorrect checkpoint."
+                 )
+             if "trust_remote_code" in str(e):
+                 raise InvalidModel(
+                     f"Loading the model {model_id!r} needs to trust remote code. "
+                     "If you trust the suppliers of this model, then you can enable "
+                     "this by setting the `--trust-remote-code` flag."
+                 )
+             raise InvalidModel(
+                 f"The model {model_id!r} could not be loaded. The error was {e!r}."
+             )
+
+     if isinstance(model_or_tuple, tuple):
+         model = model_or_tuple[0]
+     else:
+         model = model_or_tuple
+
+     assert model is not None, "The model should not be None."
+
+     model.eval()
+     model.to(benchmark_config.device)
+
+     if (
+         isinstance(model, PreTrainedModel)
+         and task_group == TaskGroup.QUESTION_ANSWERING
+     ):
+         model = setup_model_for_question_answering(model=model)
+
+     tokenizer = load_tokenizer(
+         model=model,
+         model_id=model_id,
+         trust_remote_code=benchmark_config.trust_remote_code,
+     )
+
+     return model, tokenizer
+
+
+ def get_model_repo_info(
+     model_id: str, revision: str, benchmark_config: BenchmarkConfig
+ ) -> HFModelInfo | None:
+     """Get the information about the model from the HF Hub or a local directory.
+
+     Args:
+         model_id:
+             The model ID.
+         revision:
+             The revision of the model.
+         benchmark_config:
+             The benchmark configuration.
+
+     Returns:
+         The information about the model, or None if the model could not be found.
+     """
+     token = benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True
+     hf_api = HfApi(token=token)
+     model_id, revision = model_id.split("@") if "@" in model_id else (model_id, "main")
+
+     # Get information on the model.
+     # The first case is when the model is a local model, in which case we create a dummy
+     # model info object.
+     model_info: HfApiModelInfo | None = None
+     if Path(model_id).is_dir():
+         logger.debug(f"Checking for local model in {model_id}.")
+         if all(
+             (Path(model_id) / required_file).exists()
+             for required_file in LOCAL_MODELS_REQUIRED_FILES
+         ):
+             model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
+
+     # If the model does not exist locally, then we get the model info from the Hugging
+     # Face Hub
+     if model_info is None:
+         try:
+             model_info = hf_api.model_info(
+                 repo_id=model_id, revision=revision, token=token
+             )
+         except (GatedRepoError, LocalTokenNotFoundError) as e:
+             try:
+                 hf_whoami(token=token)
+                 logger.warning(
+                     f"Could not access the model {model_id} with the revision "
+                     f"{revision}. The error was {str(e)!r}."
+                 )
+                 return None
+             except LocalTokenNotFoundError:
+                 raise NeedsAdditionalArgument(
+                     cli_argument="--api-key",
+                     script_argument="api_key=<your-api-key>",
+                     run_with_cli=benchmark_config.run_with_cli,
+                 )
+         except (RepositoryNotFoundError, HFValidationError):
+             return None
+         except (OSError, RequestException):
+             if internet_connection_available():
+                 raise HuggingFaceHubDown()
+             else:
+                 raise NoInternetConnection()
+
+     # Get all the Hugging Face repository tags for the model. If the model is an adapter
+     # model, then we also get the tags for the base model
+     tags = model_info.tags or list()
+     has_base_model_tag = any(
+         tag.startswith("base_model:") and tag.count(":") == 1 for tag in tags
+     )
+     base_model_id: str | None = None
+     if has_base_model_tag:
+         has_adapter_config = model_info.siblings is not None and any(
+             sibling.rfilename == "adapter_config.json"
+             for sibling in model_info.siblings
+         )
+         if has_adapter_config:
+             base_model_id = [
+                 tag.split(":")[1]
+                 for tag in tags
+                 if tag.startswith("base_model:") and tag.count(":") == 1
+             ][0]
+             base_model_info = hf_api.model_info(
+                 repo_id=base_model_id,
+                 revision=revision,
+                 token=benchmark_config.api_key
+                 or os.getenv("HUGGINGFACE_API_KEY")
+                 or True,
+             )
+             tags += base_model_info.tags or list()
+             tags = list(set(tags))
+
+     # Get the pipeline tag for the model. If it is not specified, then we determine it
+     # by checking the model's architecture as written in the model's Hugging Face config
+     pipeline_tag = model_info.pipeline_tag
+     if pipeline_tag is None:
+         hf_config = load_hf_model_config(
+             model_id=model_id,
+             num_labels=0,
+             id2label=dict(),
+             label2id=dict(),
+             revision=revision,
+             model_cache_dir=create_model_cache_dir(
+                 cache_dir=benchmark_config.cache_dir, model_id=model_id
+             ),
+             api_key=benchmark_config.api_key,
+             trust_remote_code=benchmark_config.trust_remote_code,
+             run_with_cli=benchmark_config.run_with_cli,
+         )
+         class_names = hf_config.architectures
+         generative_class_names = [
+             class_name
+             for tag in GENERATIVE_PIPELINE_TAGS
+             for class_name in TASK_MAPPING[tag].values()
+         ]
+         if class_names is not None and any(
+             class_name in generative_class_names for class_name in class_names
+         ):
+             pipeline_tag = "text-generation"
+         else:
+             pipeline_tag = "fill-mask"
+
+     if benchmark_config.only_allow_safetensors:
+         # Check if any file ends with .safetensors
+         repo_files = hf_api.list_repo_files(repo_id=model_id, revision=revision)
+         has_safetensors = any(f.endswith(".safetensors") for f in repo_files)
+         if not has_safetensors:
+             msg = f"Model {model_id} does not have safetensors weights available. "
+             if benchmark_config.run_with_cli:
+                 msg += "Skipping since the `--only-allow-safetensors` flag is set."
+             else:
+                 msg += (
+                     "Skipping since the `only_allow_safetensors` argument is set "
+                     "to `True`."
+                 )
+             raise InvalidModel(msg)
+
+     return HFModelInfo(
+         pipeline_tag=pipeline_tag, tags=tags, adapter_base_model_id=base_model_id
+     )
+
+
+ def load_tokenizer(
+     model: "PreTrainedModel | None", model_id: str, trust_remote_code: bool
+ ) -> "PreTrainedTokenizer":
+     """Load the tokenizer.
+
+     Args:
+         model:
+             The model, which is used to determine whether to add a prefix space to
+             the tokens. Can be None.
+         model_id:
+             The model identifier. Used for logging.
+         trust_remote_code:
+             Whether to trust remote code.
+
+     Returns:
+         The loaded tokenizer.
+     """
+     loading_kwargs: dict[str, bool | str] = dict(
+         use_fast=True,
+         verbose=False,
+         trust_remote_code=trust_remote_code,
+         padding_side="right",
+         truncation_side="right",
+     )
+
+     # If the model is a subclass of certain model types then we have to add a prefix
+     # space to the tokens, due to the way the model is constructed.
+     if model is not None:
+         prefix_models = ["Roberta", "GPT", "Deberta"]
+         add_prefix = any(
+             model_type in type(model).__name__ for model_type in prefix_models
+         )
+         if add_prefix:
+             loading_kwargs["add_prefix_space"] = True
+
+     num_retries = 5
+     for _ in range(num_retries):
+         try:
+             tokenizer = AutoTokenizer.from_pretrained(model_id, **loading_kwargs)
+             break
+         except (JSONDecodeError, OSError, TypeError):
+             raise InvalidModel(f"Could not load tokenizer for model {model_id!r}.")
+         except (TimeoutError, RequestError):
+             logger.info(f"Couldn't load tokenizer for {model_id!r}. Retrying.")
+             sleep(5)
+             continue
+     else:
+         raise InvalidModel(
+             f"Could not load tokenizer for model {model_id!r} after {num_retries} "
+             "attempts."
+         )
+
+     # Ensure that BOS, EOS and PAD tokens are set
+     tokenizer.bos_token, tokenizer.bos_token_id = get_bos_token(tokenizer=tokenizer)
+     tokenizer.eos_token, tokenizer.eos_token_id = get_eos_token(tokenizer=tokenizer)
+
+     return tokenizer
+
+
+ def get_torch_dtype(
+     device: torch.device, torch_dtype_is_set: bool, bf16_available: bool
+ ) -> str | torch.dtype:
+     """Get the torch dtype, used for loading the model.
+
+     Args:
+         device:
+             The device to use.
+         torch_dtype_is_set:
+             Whether the torch data type is set in the model configuration.
+         bf16_available:
+             Whether bfloat16 is available.
+
+     Returns:
+         The torch dtype.
+     """
+     using_cuda = device == torch.device("cuda")
+     if using_cuda and torch_dtype_is_set:
+         return "auto"
+     elif using_cuda and bf16_available:
+         return torch.bfloat16
+     elif using_cuda:
+         return torch.float16
+     return torch.float32
+
+
+ def load_hf_model_config(
+     model_id: str,
+     num_labels: int,
+     id2label: dict[int, str],
+     label2id: dict[str, int],
+     revision: str,
+     model_cache_dir: str | None,
+     api_key: str | None,
+     trust_remote_code: bool,
+     run_with_cli: bool,
+ ) -> "PretrainedConfig":
+     """Load the Hugging Face model configuration.
+
+     Args:
+         model_id:
+             The Hugging Face model ID.
+         num_labels:
+             The number of labels in the dataset.
+         id2label:
+             The mapping from label IDs to labels.
+         label2id:
+             The mapping from labels to label IDs.
+         revision:
+             The revision of the model.
+         model_cache_dir:
+             The directory to cache the model in.
+         api_key:
+             The Hugging Face API key.
+         trust_remote_code:
+             Whether to trust remote code.
+         run_with_cli:
+             Whether the script is being run with the CLI.
+
+     Returns:
+         The Hugging Face model configuration.
+     """
+     while True:
+         try:
+             config = AutoConfig.from_pretrained(
+                 model_id,
+                 num_labels=num_labels,
+                 id2label=id2label,
+                 label2id=label2id,
+                 revision=revision,
+                 token=api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+                 trust_remote_code=trust_remote_code,
+                 cache_dir=model_cache_dir,
+             )
+             if config.eos_token_id is not None and config.pad_token_id is None:
+                 if isinstance(config.eos_token_id, list):
+                     config.pad_token_id = config.eos_token_id[0]
+                 else:
+                     config.pad_token_id = config.eos_token_id
+             return config
+         except KeyError as e:
+             key = e.args[0]
+             raise InvalidModel(
+                 f"The model config for the model {model_id!r} could not be "
+                 f"loaded, as the key {key!r} was not found in the config."
+             )
+         except (OSError, GatedRepoError) as e:
+             # TEMP: When the model is gated then we cannot set cache dir, for some
+             # reason (since transformers v4.38.2, still a problem in v4.48.0). This
+             # should be included back in when this is fixed.
+             if "gated repo" in str(e):
+                 model_cache_dir = None
+                 continue
+             raise InvalidModel(
+                 f"Couldn't load model config for {model_id!r}. The error was "
+                 f"{e!r}. Skipping"
+             )
+         except (TimeoutError, RequestError):
+             logger.info(f"Couldn't load model config for {model_id!r}. Retrying.")
+             sleep(5)
+             continue
+         except ValueError as e:
+             if "awaiting a review from the repo authors" in str(e):
+                 raise InvalidModel(
+                     f"The model {model_id!r} is awaiting a review from the repository "
+                     "authors. Please try again later."
+                 )
+             if "trust_remote_code" in str(e):
+                 raise NeedsAdditionalArgument(
+                     cli_argument="--trust-remote-code",
+                     script_argument="trust_remote_code=True",
+                     run_with_cli=run_with_cli,
+                 )
+             raise InvalidModel(
+                 f"The config for the model {model_id!r} could not be loaded. The "
+                 f"error was {e!r}."
+             )
+
+
+ def setup_model_for_question_answering(model: "PreTrainedModel") -> "PreTrainedModel":
+     """Setup a model for question answering.
+
+     Args:
+         model:
+             The model to setup.
+
+     Returns:
+         The setup model.
+     """
+     # Get the models' token type embedding children, if they exist
+     children = get_children_of_module(name="model", module=model)
+
+     # If the model has token type embeddings then get them
+     if children:
+         # Get the list of attributes that are token type embeddings
+         attribute_list = list()
+         done = False
+         while not done:
+             for key, value in children.items():
+                 attribute_list.append(key)
+                 if isinstance(value, dict):
+                     children = value
+                 else:
+                     done = True
+                 break
+
+         # Get the token type embeddings
+         token_type_embeddings = model
+         for attribute in attribute_list:
+             token_type_embeddings = getattr(token_type_embeddings, attribute)
+
+         # If the token type embeddings has shape (1, ...) then set the shape to
+         # (2, ...) by randomly initializing the second token type embedding
+         if token_type_embeddings.weight.data.shape[0] == 1:
+             token_type_embeddings.weight.data = torch.cat(
+                 (
+                     token_type_embeddings.weight.data,
+                     torch.rand_like(token_type_embeddings.weight.data),
+                 ),
+                 dim=0,
+             )
+             token_type_embeddings.num_embeddings = 2
+
+             # Set the model config to use the new type vocab size
+             model.config.type_vocab_size = 2
+
+     return model
+
+
+ def get_children_of_module(
+     name: str, module: nn.Module
+ ) -> nn.Module | dict[str, t.Any] | None:
+     """Get the children of a module.
+
+     Args:
+         name:
+             The name of the module.
+         module:
+             The module to get the children of.
+
+     Returns:
+         The children of the module, or None if the module has no children.
+     """
+     if len(list(module.children())) == 0:
+         if name == "token_type_embeddings":
+             return module
+         else:
+             return None
+     else:
+         submodules = dict()
+         for subname, submodule in module.named_children():
+             children = get_children_of_module(name=subname, module=submodule)
+             if children:
+                 submodules[subname] = children
+         return submodules
+
+
+ def align_model_and_tokenizer(
+     model: "PreTrainedModel",
+     tokenizer: "PreTrainedTokenizer",
+     model_max_length: int,
+     raise_errors: bool = False,
+ ) -> tuple["PreTrainedModel", "PreTrainedTokenizer"]:
+     """Aligns the model and the tokenizer.
+
+     Args:
+         model:
+             The model to fix.
+         tokenizer:
+             The tokenizer to fix.
+         model_max_length:
+             The maximum length of the model.
+         raise_errors:
+             Whether to raise errors instead of trying to fix them silently.
+
+     Returns:
+         The fixed model and tokenizer.
+     """
+     # Ensure that the model max length is at most 5,000, to avoid OOM errors
+     model_max_length = min(model_max_length, 5_000)
+
+     if model_max_length > 0:
+         tokenizer.model_max_length = model_max_length
+     else:
+         tokenizer.model_max_length = 512
+
+     # Move the model to the CPU, since otherwise we can't catch the IndexErrors when
+     # finding the maximum sequence length of the model
+     model_device = model.device
+     model.to(torch.device("cpu"))
+
+     # Manually check that this model max length is valid for the model, and adjust
+     # otherwise
+     initial_max_length = tokenizer.model_max_length
+     for max_length in range(initial_max_length, 0, -1):
+         tokenizer.model_max_length = max_length
+         dummy_inputs = torch.full(
+             size=(1, max_length),
+             fill_value=DUMMY_FILL_VALUE,
+             dtype=torch.long,
+             device=model.device,
+         )
+         with torch.inference_mode():
+             try:
+                 model(dummy_inputs, attention_mask=torch.ones_like(dummy_inputs))
+                 break
+
+             # This happens if `max_length` is too large
+             except IndexError:
+                 continue
+
+     # Move the model back to the original device
+     model.to(model_device)
+
+     # If there is a mismatch between the vocab size according to the tokenizer and
+     # the vocab size according to the model, we either raise an error or resize the
+     # model's embeddings
+     if hasattr(model.config, "vocab_size"):
+         if model.config.vocab_size < len(tokenizer):
+             if raise_errors:
+                 raise InvalidModel(
+                     "The vocab size of the tokenizer is larger than the vocab size of "
+                     "the model. As the --raise-errors option was specified, the "
+                     "embeddings of the model will not be automatically adjusted."
+                 )
+             if hasattr(model, "resize_token_embeddings"):
+                 model.resize_token_embeddings(new_num_tokens=tokenizer.vocab_size + 1)
+
+     if tokenizer.bos_token is None and tokenizer.eos_token is not None:
+         tokenizer.bos_token = tokenizer.eos_token
+         tokenizer.bos_token_id = tokenizer.eos_token_id
+
+     return model, tokenizer
+
+
+ def task_group_to_class_name(task_group: TaskGroup) -> str:
+     """Convert a task group to a class name.
+
+     Args:
+         task_group:
+             The task group.
+
+     Returns:
+         The class name.
+     """
+     pascal_case = task_group.title().replace("_", "")
+     special_case_mapping = dict(
+         MultipleChoiceClassification="SequenceClassification",
+         Speed="SequenceClassification",
+     )
+     pascal_case = special_case_mapping.get(pascal_case, pascal_case)
+     return f"AutoModelFor{pascal_case}"