EuroEval 15.16.0__py3-none-any.whl → 16.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +8 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +190 -110
- euroeval/benchmark_modules/vllm.py +199 -139
- euroeval/benchmarker.py +49 -22
- euroeval/cli.py +3 -3
- euroeval/constants.py +19 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +73 -23
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +35 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +6 -6
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +90 -20
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +276 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/model_cache.py +13 -1
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +19 -8
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +128 -42
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +59 -73
- euroeval/tasks.py +33 -6
- euroeval/tokenization_utils.py +294 -207
- euroeval/utils.py +150 -35
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
- euroeval-16.0.1.dist-info/RECORD +69 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -470
- euroeval-15.16.0.dist-info/RECORD +0 -63
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/hf.py
CHANGED
@@ -2,7 +2,6 @@
 
 import collections.abc as c
 import logging
-import os
 import typing as t
 from functools import cached_property, partial
 from json import JSONDecodeError
@@ -68,6 +67,7 @@ from ..utils import (
     block_terminal_output,
     create_model_cache_dir,
     get_class_by_name,
+    get_hf_token,
     internet_connection_available,
     log_once,
 )
@@ -96,6 +96,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the model.
 
@@ -106,18 +107,20 @@ class HuggingFaceEncoderModel(BenchmarkModule):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log the model metadata.
         """
-        model, tokenizer = load_model_and_tokenizer(
+        model, tokeniser = load_model_and_tokeniser(
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
         )
         self._model: "PreTrainedModel" = model
-        self._tokenizer: "PreTrainedTokenizer" = tokenizer
+        self._tokeniser: "PreTrainedTokenizer" = tokeniser
 
-        self._model, self._tokenizer = align_model_and_tokenizer(
+        self._model, self._tokeniser = align_model_and_tokeniser(
             model=self._model,
-            tokenizer=self._tokenizer,
+            tokeniser=self._tokeniser,
             model_max_length=self.model_max_length,
             raise_errors=benchmark_config.raise_errors,
         )
@@ -126,6 +129,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
         )
 
     @cached_property
@@ -135,9 +139,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         Returns:
             The number of parameters in the model.
         """
-        token = (
-            self.benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True
-        )
+        token = get_hf_token(api_key=self.benchmark_config.api_key)
         hf_api = HfApi(token=token)
         try:
             repo_info = hf_api.model_info(
@@ -191,10 +193,10 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         ):
             vocab_size = self._model.config.vocab_size
         elif (
-            hasattr(self._tokenizer, "vocab_size")
-            and self._tokenizer.vocab_size is not None
+            hasattr(self._tokeniser, "vocab_size")
+            and self._tokeniser.vocab_size is not None
         ):
-            vocab_size = self._tokenizer.vocab_size
+            vocab_size = self._tokeniser.vocab_size
         else:
             vocab_size = -1
         return vocab_size
@@ -208,18 +210,18 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         """
         all_max_lengths: list[int] = list()
 
-        # Add the registered max length of the tokenizer
+        # Add the registered max length of the tokeniser
         if hasattr(
-            self._tokenizer, "model_max_length"
-        ) and self._tokenizer.model_max_length < int(1e30):
-            all_max_lengths.append(self._tokenizer.model_max_length)
+            self._tokeniser, "model_max_length"
+        ) and self._tokeniser.model_max_length < int(1e30):
+            all_max_lengths.append(self._tokeniser.model_max_length)
 
         # Add the max length derived from the model's input sizes
-        if hasattr(self._tokenizer, "max_model_input_sizes"):
+        if hasattr(self._tokeniser, "max_model_input_sizes"):
             all_max_lengths.extend(
                 [
                     size
-                    for size in self._tokenizer.max_model_input_sizes.values()
+                    for size in self._tokeniser.max_model_input_sizes.values()
                     if size is not None
                 ]
             )
@@ -275,10 +277,10 @@ class HuggingFaceEncoderModel(BenchmarkModule):
                 | TaskGroup.QUESTION_ANSWERING
                 | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
             ):
-                return DataCollatorWithPadding(self._tokenizer, padding="longest")
+                return DataCollatorWithPadding(self._tokeniser, padding="longest")
             case TaskGroup.TOKEN_CLASSIFICATION:
                 return DataCollatorForTokenClassification(
-                    tokenizer=self._tokenizer, label_pad_token_id=-100
+                    tokenizer=self._tokeniser, label_pad_token_id=-100
                 )
             case _:
                 raise NotImplementedError(
@@ -357,16 +359,16 @@ class HuggingFaceEncoderModel(BenchmarkModule):
                     self._model.config.label2id[lbl.lower()]
                     for lbl in examples["label"]
                 ]
-            except KeyError:
+            except KeyError as e:
                 raise InvalidBenchmark(
                     f"One of the labels in the dataset, "
                     f"{examples['label'].lower()}, does not occur in the "
                     f"label2id dictionary {self._model.config.label2id}."
-                )
+                ) from e
             return examples
 
         def tokenise(examples: dict) -> "BatchEncoding":
-            return self._tokenizer(text=examples["text"], truncation=True, padding=True)
+            return self._tokeniser(text=examples["text"], truncation=True, padding=True)
 
         match task.task_group:
             case TaskGroup.SEQUENCE_CLASSIFICATION:
@@ -376,39 +378,20 @@ class HuggingFaceEncoderModel(BenchmarkModule):
 
             case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
                 dataset = DatasetDict(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                            tokenizer=self._tokenizer,
-                        ),
-                        batched=True,
-                        batch_size=10,
-                        remove_columns=dataset["val"].column_names,
-                        load_from_cache_file=False,
-                        keep_in_memory=True,
-                    ),
-                    test=dataset["test"].map(
-                        partial(
-                            multiple_choice_classification.prepare_examples,
-                            tokenizer=self._tokenizer,
-                        ),
-                        batched=True,
-                        batch_size=10,
-                        remove_columns=dataset["test"].column_names,
-                        load_from_cache_file=False,
-                        keep_in_memory=True,
-                    ),
+                    {
+                        split_name: split.map(
+                            partial(
+                                multiple_choice_classification.prepare_examples,
+                                tokeniser=self._tokeniser,
+                            ),
+                            batched=True,
+                            batch_size=10,
+                            remove_columns=split.column_names,
+                            load_from_cache_file=False,
+                            keep_in_memory=True,
+                        )
+                        for split_name, split in dataset.items()
+                    }
                 )
 
             case TaskGroup.TEXT_TO_TEXT:
@@ -423,7 +406,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
                 dataset = dataset.map(
                     partial(
                         token_classification.tokenize_and_align_labels,
-                        tokenizer=self._tokenizer,
+                        tokeniser=self._tokeniser,
                         label2id=self._model.config.label2id,
                     ),
                     batched=True,
@@ -432,43 +415,44 @@ class HuggingFaceEncoderModel(BenchmarkModule):
                 )
 
             case TaskGroup.QUESTION_ANSWERING:
-
-
-
-
-
-
-                        ),
-                        batched=True,
-                        batch_size=10,
-                        remove_columns=dataset["test"].column_names,
-                        load_from_cache_file=False,
-                        keep_in_memory=True,
+                data_dict = dict()
+                if "train" in dataset:
+                    data_dict["train"] = dataset["train"].map(
+                        partial(
+                            question_answering.prepare_train_examples,
+                            tokeniser=self._tokeniser,
                         ),
-
-
-
-
-
-
-
-
-
-
+                        batched=True,
+                        batch_size=10,
+                        remove_columns=dataset["test"].column_names,
+                        load_from_cache_file=False,
+                        keep_in_memory=True,
+                    )
+                if "val" in dataset:
+                    data_dict["val"] = dataset["val"].map(
+                        partial(
+                            question_answering.prepare_train_examples,
+                            tokeniser=self._tokeniser,
                         ),
-
-
-
-
-
-
-
-
-
-
+                        batched=True,
+                        batch_size=10,
+                        remove_columns=dataset["test"].column_names,
+                        load_from_cache_file=False,
+                        keep_in_memory=True,
+                    )
+                if "test" in dataset:
+                    data_dict["test"] = dataset["test"].map(
+                        partial(
+                            question_answering.prepare_test_examples,
+                            tokeniser=self._tokeniser,
                         ),
+                        batched=True,
+                        batch_size=10,
+                        remove_columns=dataset["test"].column_names,
+                        load_from_cache_file=False,
+                        keep_in_memory=True,
                     )
-                )
+                dataset = DatasetDict(data_dict)
 
                 # The Trainer hides the columns that are not used by the model (here
                 # `id` and `offset_mapping` which we will need for our post-processing),
@@ -559,12 +543,12 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         return model_config
 
 
-def load_model_and_tokenizer(
+def load_model_and_tokeniser(
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
 ) -> tuple["PreTrainedModel", "PreTrainedTokenizer"]:
-    """Load the model and tokenizer.
+    """Load the model and tokeniser.
 
     Args:
         model_config:
@@ -575,7 +559,7 @@ def load_model_and_tokenizer(
         The benchmark configuration
 
     Returns:
-        The loaded model and tokenizer.
+        The loaded model and tokeniser.
     """
     config: "PretrainedConfig"
     block_terminal_output()
@@ -607,12 +591,12 @@ def load_model_and_tokenizer(
         config=config,
         ignore_mismatched_sizes=ignore_mismatched_sizes,
         revision=model_config.revision,
-        token=benchmark_config.api_key
+        token=get_hf_token(api_key=benchmark_config.api_key),
         cache_dir=model_config.model_cache_dir,
         trust_remote_code=benchmark_config.trust_remote_code,
-
+        dtype=get_dtype(
            device=benchmark_config.device,
-
+            dtype_is_set=config.to_dict().get("dtype") is not None,
            bf16_available=(
                torch.cuda.is_available() and torch.cuda.is_bf16_supported()
            ),
@@ -658,11 +642,13 @@ def load_model_and_tokenizer(
                 model_kwargs["ignore_mismatched_sizes"] = True
                 continue
             else:
-                raise InvalidModel(str(e))
-        except (TimeoutError, RequestError):
+                raise InvalidModel(str(e)) from e
+        except (TimeoutError, RequestError) as e:
             attempts_left -= 1
             if attempts_left == 0:
-                raise InvalidModel(
+                raise InvalidModel(
+                    "The model could not be loaded after 5 attempts."
+                ) from e
             logger.info(f"Couldn't load the model {model_id!r}. Retrying.")
             sleep(5)
             continue
@@ -670,16 +656,16 @@ def load_model_and_tokenizer(
        if "checkpoint seems to be incorrect" in str(e):
            raise InvalidModel(
                f"The model {model_id!r} has an incorrect checkpoint."
-            )
+            ) from e
        if "trust_remote_code" in str(e):
            raise InvalidModel(
                f"Loading the model {model_id!r} needs to trust remote code. "
                "If you trust the suppliers of this model, then you can enable "
                "this by setting the `--trust-remote-code` flag."
-            )
+            ) from e
        raise InvalidModel(
            f"The model {model_id!r} could not be loaded. The error was {e!r}."
-        )
+        ) from e
 
    if isinstance(model_or_tuple, tuple):
        model = model_or_tuple[0]
@@ -697,13 +683,13 @@ def load_model_and_tokenizer(
    ):
        model = setup_model_for_question_answering(model=model)
 
-    tokenizer = load_tokenizer(
+    tokeniser = load_tokeniser(
        model=model,
        model_id=model_id,
        trust_remote_code=benchmark_config.trust_remote_code,
    )
 
-    return model, tokenizer
+    return model, tokeniser
 
 
 def get_model_repo_info(
@@ -722,7 +708,7 @@ def get_model_repo_info(
    Returns:
        The information about the model, or None if the model could not be found.
    """
-    token = benchmark_config.api_key
+    token = get_hf_token(api_key=benchmark_config.api_key)
    hf_api = HfApi(token=token)
    model_id, revision = model_id.split("@") if "@" in model_id else (model_id, "main")
 
@@ -800,12 +786,7 @@ def get_model_repo_info(
            level=logging.DEBUG,
        )
        if base_model_id is not None:
-            base_model_info = hf_api.model_info(
-                repo_id=base_model_id,
-                token=benchmark_config.api_key
-                or os.getenv("HUGGINGFACE_API_KEY")
-                or True,
-            )
+            base_model_info = hf_api.model_info(repo_id=base_model_id, token=token)
            tags += base_model_info.tags or list()
        tags = list(set(tags))
 
@@ -839,7 +820,7 @@ def get_model_repo_info(
    else:
        pipeline_tag = "fill-mask"
 
-    if benchmark_config.
+    if benchmark_config.requires_safetensors:
        repo_files = hf_api.list_repo_files(repo_id=model_id, revision=revision)
        has_safetensors = any(f.endswith(".safetensors") for f in repo_files)
        if not has_safetensors:
@@ -848,7 +829,7 @@ def get_model_repo_info(
                msg += "Skipping since the `--only-allow-safetensors` flag is set."
            else:
                msg += (
-                    "Skipping since the `
+                    "Skipping since the `requires_safetensors` argument is set "
                    "to `True`."
                )
            logger.warning(msg)
@@ -869,7 +850,7 @@ def get_model_repo_info(
                msg += " Skipping since the `--only-allow-safetensors` flag is set."
            else:
                msg += (
-                    " Skipping since the `
+                    " Skipping since the `requires_safetensors` argument is set "
                    "to `True`."
                )
            logging.warning(msg)
@@ -880,10 +861,10 @@ def get_model_repo_info(
    )
 
 
-def load_tokenizer(
+def load_tokeniser(
    model: "PreTrainedModel | None", model_id: str, trust_remote_code: bool
 ) -> "PreTrainedTokenizer":
-    """Load the tokenizer.
+    """Load the tokeniser.
 
    Args:
        model:
@@ -895,7 +876,7 @@ def load_tokenizer(
            Whether to trust remote code.
 
    Returns:
-        The loaded tokenizer.
+        The loaded tokeniser.
    """
    loading_kwargs: dict[str, bool | str] = dict(
        use_fast=True,
@@ -918,45 +899,46 @@ def load_tokenizer(
    num_retries = 5
    for _ in range(num_retries):
        try:
-            tokenizer = AutoTokenizer.from_pretrained(model_id, **loading_kwargs)
+            tokeniser = AutoTokenizer.from_pretrained(model_id, **loading_kwargs)
            break
-        except (JSONDecodeError, OSError, TypeError):
-            raise InvalidModel(
+        except (JSONDecodeError, OSError, TypeError) as e:
+            raise InvalidModel(
+                f"Could not load tokeniser for model {model_id!r}."
+            ) from e
        except (TimeoutError, RequestError):
-            logger.info(f"Couldn't load tokenizer for {model_id!r}. Retrying.")
+            logger.info(f"Couldn't load tokeniser for {model_id!r}. Retrying.")
            sleep(5)
            continue
    else:
        raise InvalidModel(
-            f"Could not load tokenizer for model {model_id!r} after {num_retries} "
+            f"Could not load tokeniser for model {model_id!r} after {num_retries} "
            "attempts."
        )
 
    # Ensure that BOS, EOS and PAD tokens are set
-    tokenizer.bos_token, tokenizer.bos_token_id = get_bos_token(tokenizer=tokenizer)
-    tokenizer.eos_token, tokenizer.eos_token_id = get_eos_token(tokenizer=tokenizer)
+    tokeniser.bos_token, tokeniser.bos_token_id = get_bos_token(tokeniser=tokeniser)
+    tokeniser.eos_token, tokeniser.eos_token_id = get_eos_token(tokeniser=tokeniser)
 
-    return
+    return tokeniser
 
 
-def
-    device: torch.device,
+def get_dtype(
+    device: torch.device, dtype_is_set: bool, bf16_available: bool
 ) -> str | torch.dtype:
    """Get the torch dtype, used for loading the model.
 
    Args:
        device:
            The device to use.
-
-            Whether the torch data type is set in the model configuration.
+            Whether the data type is set in the model configuration.
        bf16_available:
            Whether bfloat16 is available.
 
    Returns:
-        The
+        The dtype.
    """
    using_cuda = device == torch.device("cuda")
-    if using_cuda and
+    if using_cuda and dtype_is_set:
        return "auto"
    elif using_cuda and bf16_available:
        return torch.bfloat16
@@ -1009,7 +991,7 @@ def load_hf_model_config(
            id2label=id2label,
            label2id=label2id,
            revision=revision,
-            token=api_key
+            token=get_hf_token(api_key=api_key),
            trust_remote_code=trust_remote_code,
            cache_dir=model_cache_dir,
        )
@@ -1024,7 +1006,7 @@ def load_hf_model_config(
            raise InvalidModel(
                f"The model config for the model {model_id!r} could not be "
                f"loaded, as the key {key!r} was not found in the config."
-            )
+            ) from e
        except (OSError, GatedRepoError) as e:
            # TEMP: When the model is gated then we cannot set cache dir, for some
            # reason (since transformers v4.38.2, still a problem in v4.48.0). This
@@ -1035,7 +1017,7 @@ def load_hf_model_config(
            raise InvalidModel(
                f"Couldn't load model config for {model_id!r}. The error was "
                f"{e!r}. Skipping"
-            )
+            ) from e
        except (TimeoutError, RequestError):
            logger.info(f"Couldn't load model config for {model_id!r}. Retrying.")
            sleep(5)
@@ -1045,17 +1027,17 @@ def load_hf_model_config(
                raise InvalidModel(
                    f"The model {model_id!r} is awaiting a review from the repository "
                    "authors. Please try again later."
-                )
+                ) from e
            if "trust_remote_code" in str(e):
                raise NeedsAdditionalArgument(
                    cli_argument="--trust-remote-code",
                    script_argument="trust_remote_code=True",
                    run_with_cli=run_with_cli,
-                )
+                ) from e
            raise InvalidModel(
                f"The config for the model {model_id!r} could not be loaded. The "
                f"error was {e!r}."
-            )
+            ) from e
 
 
 def setup_model_for_question_answering(model: "PreTrainedModel") -> "PreTrainedModel":
@@ -1140,33 +1122,33 @@ def get_children_of_module(
    return submodules
 
 
-def align_model_and_tokenizer(
+def align_model_and_tokeniser(
    model: "PreTrainedModel",
-    tokenizer: "PreTrainedTokenizer",
+    tokeniser: "PreTrainedTokenizer",
    model_max_length: int,
    raise_errors: bool = False,
 ) -> tuple["PreTrainedModel", "PreTrainedTokenizer"]:
-    """Aligns the model and the tokenizer.
+    """Aligns the model and the tokeniser.
 
    Args:
        model:
            The model to fix.
-        tokenizer:
-            The tokenizer to fix.
+        tokeniser:
+            The tokeniser to fix.
        model_max_length:
            The maximum length of the model.
        raise_errors:
            Whether to raise errors instead of trying to fix them silently.
 
    Returns:
-        The fixed model and tokenizer.
+        The fixed model and tokeniser.
    """
    model_max_length = min(model_max_length, MAX_CONTEXT_LENGTH)
 
    if model_max_length > 0:
-        tokenizer.model_max_length = model_max_length
+        tokeniser.model_max_length = model_max_length
    else:
-        tokenizer.model_max_length = 512
+        tokeniser.model_max_length = 512
 
    # Move the model to the CPU, since otherwise we can't catch the IndexErrors when
    # finding the maximum sequence length of the model
@@ -1175,9 +1157,9 @@ def align_model_and_tokenizer(
 
    # Manually check that this model max length is valid for the model, and adjust
    # otherwise
-    initial_max_length = tokenizer.model_max_length
+    initial_max_length = tokeniser.model_max_length
    for max_length in range(initial_max_length, 0, -1):
-        tokenizer.model_max_length = max_length
+        tokeniser.model_max_length = max_length
        dummy_inputs = torch.full(
            size=(1, max_length),
            fill_value=DUMMY_FILL_VALUE,
@@ -1204,24 +1186,24 @@ def align_model_and_tokenizer(
    # Move the model back to the original device
    model.to(model_device) # type: ignore[arg-type]
 
-    # If there is a mismatch between the vocab size according to the tokenizer and
+    # If there is a mismatch between the vocab size according to the tokeniser and
    # the vocab size according to the model, we raise an error
    if hasattr(model.config, "vocab_size"):
-        if model.config.vocab_size < len(tokenizer):
+        if model.config.vocab_size < len(tokeniser):
            if raise_errors:
                raise InvalidModel(
-                    "The vocab size of the tokenizer is larger than the vocab size of "
+                    "The vocab size of the tokeniser is larger than the vocab size of "
                    "the model. As the --raise-errors option was specified, the "
                    "embeddings of the model will not be automatically adjusted."
                )
            if hasattr(model, "resize_token_embeddings"):
-                model.resize_token_embeddings(new_num_tokens=tokenizer.vocab_size + 1)
+                model.resize_token_embeddings(new_num_tokens=tokeniser.vocab_size + 1)
 
-    if tokenizer.bos_token is None and tokenizer.eos_token is not None:
-        tokenizer.bos_token = tokenizer.eos_token
-        tokenizer.bos_token_id = tokenizer.eos_token_id
+    if tokeniser.bos_token is None and tokeniser.eos_token is not None:
+        tokeniser.bos_token = tokeniser.eos_token
+        tokeniser.bos_token_id = tokeniser.eos_token_id
 
-    return model, tokenizer
+    return model, tokeniser
 
 
 def task_group_to_class_name(task_group: TaskGroup) -> str:
|