EuroEval 15.9.1__py3-none-any.whl → 15.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- euroeval/benchmark_modules/hf.py +3 -3
- euroeval/benchmark_modules/litellm.py +158 -122
- euroeval/benchmark_modules/vllm.py +188 -235
- euroeval/constants.py +13 -0
- euroeval/data_loading.py +8 -2
- euroeval/finetuning.py +22 -0
- euroeval/task_group_utils/multiple_choice_classification.py +11 -1
- euroeval/task_group_utils/question_answering.py +14 -4
- euroeval/task_group_utils/sequence_classification.py +1 -1
- euroeval/tokenization_utils.py +121 -18
- euroeval/utils.py +13 -8
- {euroeval-15.9.1.dist-info → euroeval-15.10.0.dist-info}/METADATA +7 -8
- {euroeval-15.9.1.dist-info → euroeval-15.10.0.dist-info}/RECORD +16 -16
- {euroeval-15.9.1.dist-info → euroeval-15.10.0.dist-info}/WHEEL +0 -0
- {euroeval-15.9.1.dist-info → euroeval-15.10.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.9.1.dist-info → euroeval-15.10.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py
CHANGED
@@ -7,12 +7,10 @@ import json
 import logging
 import os
 import re
-import sys
 import typing as t
 from functools import partial
 from pathlib import Path
 from time import sleep
-from types import MethodType

 import torch
 from datasets import DatasetDict
@@ -26,11 +24,13 @@ from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError

 from ..constants import (
+    CUSTOM_STOP_TOKENS,
     GENERATIVE_PIPELINE_TAGS,
     MAX_CONTEXT_LENGTH,
     MAX_LOGPROBS,
     MERGE_TAGS,
     REASONING_MAX_TOKENS,
+    REASONING_TOKENS,
     TASKS_USING_JSON,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
@@ -67,6 +67,7 @@ from ..tokenization_utils import (
     get_end_of_chat_token_ids,
     get_eos_token,
     get_first_label_token_mapping,
+    get_pad_token,
     should_prompts_be_stripped,
 )
 from ..types import ExtractLabelsFunction
@@ -79,17 +80,12 @@ from ..utils import (
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config

 if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
-    from vllm import LLM, RequestOutput, SamplingParams
+    from vllm import LLM, SamplingParams
     from vllm.distributed.parallel_state import (
         destroy_distributed_environment,
         destroy_model_parallel,
     )
-    from vllm.inputs import PromptType
     from vllm.lora.request import LoRARequest
-    from vllm.model_executor.guided_decoding.guided_fields import GuidedDecodingRequest
-    from vllm.pooling_params import PoolingParams
-    from vllm.prompt_adapter.request import PromptAdapterRequest
-    from vllm.sampling_params import RequestOutputKind

 if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
     from outlines.models.vllm import adapt_tokenizer
@@ -135,9 +131,18 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         self._model: LLM = model
         self._tokenizer: PreTrainedTokenizer = tokenizer
-        self.end_of_reasoning_token_id = get_end_of_reasoning_token_id(
+        self.end_of_reasoning_token = get_end_of_reasoning_token(
             model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
         )
+        self.end_of_chat_token_ids = get_end_of_chat_token_ids(
+            tokenizer=self._tokenizer
+        )
+        self.custom_stop_tokens = get_custom_stop_tokens(
+            model=self._model,
+            tokenizer=self._tokenizer,
+            model_id=model_config.model_id,
+            is_reasoning_model=self.end_of_reasoning_token is not None,
+        )

         # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
         # to call the `__init__` method of the `BenchmarkModule` class.
@@ -183,9 +188,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         """
         if not hasattr(self, "_tokenizer"):
             return None
-        elif self.end_of_reasoning_token_id is not None:
+        elif self.end_of_reasoning_token is not None:
             return GenerativeType.REASONING
-        elif self._tokenizer.chat_template is not None:
+        elif (
+            self._tokenizer.chat_template is not None
+            or "instruct" in self.model_config.model_id.lower()
+        ):
             return GenerativeType.INSTRUCTION_TUNED
         else:
             return GenerativeType.BASE
@@ -295,55 +303,29 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The generated model outputs.
         """
-        #
-
-        # instruction tuned (since these separate the few-shot examples in the input in
-        # this case)
-        stop_tokens: list[str] = list()
+        # Get stopping tokens
+        stop_tokens: list[str] = self.custom_stop_tokens.copy()
         if self.buffer["instruction_model"] is False:
             stop_tokens.append("\n\n")
         if self._tokenizer.pad_token_id is not None:
+            assert isinstance(self._tokenizer.pad_token, str), (
+                f"The pad token for the model {self.model_config.model_id!r} "
+                f"is not a string, which is unexpected: {self._tokenizer.pad_token!r}."
+            )
             stop_tokens.append(self._tokenizer.pad_token)
         if self._tokenizer.eos_token_id is not None:
+            assert isinstance(self._tokenizer.eos_token, str), (
+                f"The EOS token for the model {self.model_config.model_id!r} "
+                f"is not a string, which is unexpected: {self._tokenizer.eos_token!r}."
+            )
             stop_tokens.append(self._tokenizer.eos_token)
             if self._tokenizer.pad_token_id is None:
                 self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
                 self._tokenizer.pad_token = self._tokenizer.eos_token
-        if (
-            self._tokenizer.bos_token_id is not None
-            and self._tokenizer.pad_token_id is None
-        ):
-            self._tokenizer.pad_token_id = self._tokenizer.bos_token_id
-            self._tokenizer.pad_token = self._tokenizer.bos_token
-        elif (
-            self._tokenizer.eos_token_id is not None
-            and self._tokenizer.pad_token_id is None
-        ):
-            self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
-            self._tokenizer.pad_token = self._tokenizer.eos_token
-        elif self._tokenizer.pad_token_id is None:
-            pad_token_candidates = ["<pad>", "[pad]", "<|endoftext|>", "<|im_end|>"]
-            pad_token_candidates.extend([c.upper() for c in pad_token_candidates])
-            for candidate in pad_token_candidates:
-                if candidate in self._tokenizer.get_vocab():
-                    pad_token_id = self._tokenizer.get_vocab()[candidate]
-                    self._tokenizer.pad_token = candidate
-                    self._tokenizer.pad_token_id = pad_token_id
-                    break
-            else:
-                raise InvalidModel(
-                    "Could not find a suitable token to use as a padding token, since "
-                    "the model does not have a BOS, EOS, or padding token, and does "
-                    f"not have any of the following tokens in its vocabulary: "
-                    f"{pad_token_candidates}."
-                )
-
-        assert self._tokenizer.pad_token_id is not None
-
-        # Add end of chat token as a stopping token, if it exists
-        end_of_chat_token_ids = get_end_of_chat_token_ids(tokenizer=self._tokenizer)
-        if end_of_chat_token_ids is not None:
-            end_of_chat_token = self._tokenizer.decode(end_of_chat_token_ids).strip()
+        if self.end_of_chat_token_ids is not None:
+            end_of_chat_token = self._tokenizer.decode(
+                self.end_of_chat_token_ids
+            ).strip()
             if end_of_chat_token:
                 stop_tokens.append(end_of_chat_token)

@@ -430,7 +412,7 @@ class VLLMModel(HuggingFaceEncoderModel):
             raw_outputs = self._model.generate(
                 prompts=prompts,
                 sampling_params=sampling_params,
-                use_tqdm=
+                use_tqdm=False if input_is_a_test else get_pbar_without_leave,
                 lora_request=self.buffer.get("lora_request"),
             )
             break
@@ -497,30 +479,23 @@ class VLLMModel(HuggingFaceEncoderModel):
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
-        if self.end_of_reasoning_token_id in completion_ids[0]:
-            # Find the latest index of the end of reasoning token and slice
-            # the token IDs to only include the tokens after it
-            completion_ids = [
-                token_ids[
-                    max(
-                        [
-                            i
-                            for i, x in enumerate(token_ids)
-                            if x == self.end_of_reasoning_token_id
-                        ]
-                    )
-                    + 1 :
-                ]
-                if self.end_of_reasoning_token_id in token_ids
-                else token_ids
-                for token_ids in completion_ids
-            ]
         completions = self._tokenizer.batch_decode(
             sequences=[
                 torch.LongTensor(completion_id) for completion_id in completion_ids
-            ]
-
+            ]
+        )
+        if self.end_of_reasoning_token is not None:
+            completions = [
+                completion.split(self.end_of_reasoning_token)[-1]
+                for completion in completions
+            ]
+        stop_token_pattern = re.compile(
+            "|".join(re.escape(stop_token) for stop_token in stop_tokens)
         )
+        completions = [
+            re.split(pattern=stop_token_pattern, string=completion)[0]
+            for completion in completions
+        ]
         completions = [completion.strip() for completion in completions]

         # Sanity check
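For orientation, the new post-processing in this hunk first keeps only the text after the model's end-of-reasoning token and then cuts each completion at the first stop token. A minimal, self-contained sketch of that logic, using made-up example values rather than anything taken from the package:

```python
import re

# Hypothetical values for illustration; EuroEval derives these from the loaded model.
end_of_reasoning_token = "</think>"
stop_tokens = ["\n\n", "<|im_end|>", "<sep>"]

completions = ["<think>Let me think about it.</think>Positive<|im_end|>trailing text"]

# Keep only the text after the (last) end-of-reasoning token.
completions = [c.split(end_of_reasoning_token)[-1] for c in completions]

# Truncate each completion at the first occurrence of any stop token.
stop_token_pattern = re.compile("|".join(re.escape(tok) for tok in stop_tokens))
completions = [re.split(stop_token_pattern, c)[0].strip() for c in completions]

print(completions)  # ['Positive']
```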
@@ -541,17 +516,6 @@ class VLLMModel(HuggingFaceEncoderModel):
                 ]
                 for raw_output in raw_outputs
             ]
-            scores = [
-                score_list[
-                    raw_output.outputs[0].token_ids.index(
-                        self.end_of_reasoning_token_id
-                    )
-                    + 2 :
-                ]
-                if self.end_of_reasoning_token_id in raw_output.outputs[0].token_ids
-                else score_list
-                for raw_output, score_list in zip(raw_outputs, scores)
-            ]
             output = GenerativeModelOutput(sequences=completions, scores=scores)
         else:
             output = GenerativeModelOutput(sequences=completions)
@@ -831,10 +795,6 @@ def load_model_and_tokenizer(
             f"The model {model_id!r} could not be loaded. The error was {e!r}."
         )

-    model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
-    model._validate_and_add_requests = MethodType(
-        _validate_and_add_requests_with_fixed_progress_bars, model
-    )
     model.config = hf_model_config

     return model, tokenizer
@@ -918,90 +878,11 @@ def load_tokenizer(
     # Ensure that BOS, EOS and PAD tokens are set
     tokenizer.bos_token, tokenizer.bos_token_id = get_bos_token(tokenizer=tokenizer)
     tokenizer.eos_token, tokenizer.eos_token_id = get_eos_token(tokenizer=tokenizer)
-
-    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.pad_token, tokenizer.pad_token_id = get_pad_token(tokenizer=tokenizer)

     return tokenizer


-def _run_engine_with_fixed_progress_bars(
-    self: "LLM", use_tqdm: bool
-) -> list["RequestOutput"]:
-    if use_tqdm:
-        num_requests = self.llm_engine.get_num_unfinished_requests()
-        pbar = tqdm(
-            total=num_requests, leave=False, disable=hasattr(sys, "_called_from_test")
-        )
-    else:
-        pbar = None
-
-    # Run the engine.
-    outputs: list["RequestOutput"] = list()
-    while self.llm_engine.has_unfinished_requests():
-        step_outputs = self.llm_engine.step()
-        for output in step_outputs:
-            if output.finished:
-                outputs.append(output)
-                if pbar is not None:
-                    pbar.update(1)
-
-    if pbar is not None:
-        pbar.close()
-
-    # Sort the outputs by request ID. This is necessary because some requests may be
-    # finished earlier than its previous requests.
-    outputs = sorted(outputs, key=lambda x: int(x.request_id))
-
-    return outputs
-
-
-def _validate_and_add_requests_with_fixed_progress_bars(
-    self: "LLM",
-    prompts: "PromptType | c.Sequence[PromptType]",
-    params: "SamplingParams | c.Sequence[SamplingParams] | PoolingParams | c.Sequence[PoolingParams]",  # noqa: E501
-    *,
-    use_tqdm: bool,
-    lora_request: "c.Sequence[LoRARequest] | LoRARequest | None",
-    prompt_adapter_request: "PromptAdapterRequest | None",
-    tokenization_kwargs: dict[str, t.Any] | None = None,
-    guided_options: "GuidedDecodingRequest | None" = None,
-    priority: list[int] | None = None,
-) -> None:
-    if isinstance(prompts, (str, dict)):
-        # Convert a single prompt to a list.
-        prompts = [prompts]
-
-    num_requests = len(prompts)
-    if isinstance(params, list) and len(params) != num_requests:
-        raise ValueError("The lengths of prompts and params must be the same.")
-    if isinstance(lora_request, list) and len(lora_request) != num_requests:
-        raise ValueError("The lengths of prompts and lora_request must be the same.")
-
-    for sp in params if isinstance(params, list) else (params,):
-        if isinstance(sp, SamplingParams):
-            self._add_guided_params(sp, guided_options)
-
-            # We only care about the final output
-            sp.output_kind = RequestOutputKind.FINAL_ONLY
-
-    # Add requests to the engine.
-    it = prompts
-    if use_tqdm:
-        it = tqdm(it, desc="Adding requests", leave=False)
-
-    for i, prompt in enumerate(it):
-        self._add_request(
-            prompt,
-            params[i] if isinstance(params, c.Sequence) else params,
-            tokenization_kwargs=tokenization_kwargs,
-            lora_request=lora_request[i]
-            if isinstance(lora_request, c.Sequence)
-            else lora_request,
-            prompt_adapter_request=prompt_adapter_request,
-            priority=priority[i] if priority else 0,
-        )
-
-
 def clear_vllm() -> None:
     """Clear the GPU memory used by the vLLM model, enabling re-initialisation."""
     with contextlib.suppress(ValueError):
@@ -1016,14 +897,10 @@ def clear_vllm() -> None:
     clear_memory()


-def get_end_of_reasoning_token_id(
+def get_end_of_reasoning_token(
     model: "LLM", tokenizer: "PreTrainedTokenizer", model_id: str
-) -> int | None:
-    """Get the end of reasoning token ID for a generative model.
-
-    This assumes that the reasoning token is of the form <X> and that the end of
-    reasoning token is </X> (for X being any string without spaces). We disallow the
-    reasoning token to be the same as the beginning-of-sentence token.
+) -> str | None:
+    """Get the end-of-reasoning token for a generative model.

     Args:
         model:
@@ -1034,86 +911,162 @@ def get_end_of_reasoning_token_id(
             The model ID.

     Returns:
-        The end of reasoning token
+        The end of reasoning token, or None if it could not be found.
     """
-    if
-
-
+    # Create a prompt to check if the model uses the reasoning tokens
+    prompt = "What is your name?"
+    if tokenizer.chat_template is not None:
         templated_prompt = tokenizer.apply_chat_template(
-            conversation=[dict(role="user", content=
+            conversation=[dict(role="user", content=prompt)],
             add_generation_prompt=True,
             tokenize=False,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt

-    #
-
-
-
-
-
+    # Check that the beginning-of-reasoning token is actually used by the model
+    completion = (
+        model.generate(
+            prompts=[prompt],
+            sampling_params=SamplingParams(max_tokens=10),
+            use_tqdm=False,
+        )[0]
+        .outputs[0]
+        .text
     )
-
-
-
-    if
-
-
-    elif isinstance(tokenizer.bos_token, list):
-        for bos_token in tokenizer.bos_token:
-            prompt = prompt.replace(bos_token, "").strip()
-            completion = completion.replace(bos_token, "").strip()
-
-    # If it doesn't contain a reasoning token, we can't find the end of reasoning token
-    prompt_match = re.search(pattern=r"<\w+>", string=prompt)
-    completion_match = re.search(pattern=r"<\w+>", string=completion)
-    if completion_match is None and prompt_match is None:
+    bor_reasoning_matches = [
+        (bor_token, eor_token)
+        for bor_token, eor_token in REASONING_TOKENS
+        if bor_token in prompt or bor_token in completion
+    ]
+    if not bor_reasoning_matches:
         log_once(
-            f"
-            "the model is not
-
+            f"The model {model_id!r} did not generate any beginning-of-reasoning "
+            "tokens in the prompt or the completion. Assuming the model is not "
+            "a reasoning model.",
+            level=logging.INFO,
         )
         return None

-    # Check that the
-
-
-
-
-
-
-
-
-        decoder_token.content
-        for decoder_token in tokenizer.added_tokens_decoder.values()
-    ]
-    special_tokens.extend(
-        [encoder_token for encoder_token in tokenizer.added_tokens_encoder.keys()]
+    # Check that the beginning-of-reasoning token is actually used by the model
+    completion = (
+        model.generate(
+            prompts=[prompt],
+            sampling_params=SamplingParams(max_tokens=REASONING_MAX_TOKENS),
+            use_tqdm=False,
+        )[0]
+        .outputs[0]
+        .text
     )
-
-
-
-
-
+    eor_reasoning_matches = [
+        (bor_token, eor_token)
+        for bor_token, eor_token in bor_reasoning_matches
+        if eor_token in completion
+    ]
+    if not eor_reasoning_matches:
         log_once(
-            f"
-
-            "
-            "
-
+            f"The model {model_id!r} did not generate any end-of-reasoning "
+            "tokens in the prompt or the completion, even though it generated "
+            "the beginning-of-reasoning tokens "
+            f"{[bor_token for bor_token, _ in bor_reasoning_matches]!r}. "
+            "This is probably not correct, so please report this issue.",
+            level=logging.INFO,
        )
         return None

+    if len(eor_reasoning_matches) > 1:
+        log_once(
+            f"Found multiple reasoning tokens {eor_reasoning_matches} for "
+            f"model {model_id!r}. Using {eor_reasoning_matches[0]!r} as "
+            "the reasoning token. If this is not the correct reasoning token, "
+            "please report this issue.",
+            level=logging.INFO,
+        )
+
+    bor_token, eor_token = eor_reasoning_matches[0]
     log_once(
-        f"Detected reasoning token {
-        f"token {
-        level=logging.
+        f"Detected beginning-of-reasoning token {bor_token!r} and end-of-reasoning "
+        f"token {eor_token!r} for model {model_id!r}.",
+        level=logging.INFO,
+    )
+
+    return eor_token
+
+
+def get_custom_stop_tokens(
+    model: "LLM",
+    tokenizer: "PreTrainedTokenizer",
+    model_id: str,
+    is_reasoning_model: bool,
+) -> list[str]:
+    """Get the stop tokens for a generative model.
+
+    Args:
+        model:
+            The vLLM model.
+        tokenizer:
+            The tokenizer.
+        model_id:
+            The model ID.
+        is_reasoning_model:
+            Whether the model is a reasoning model. This is used to determine the number
+            of generated tokens to allow before stopping the generation.
+
+    Returns:
+        A list of stop tokens.
+    """
+    candidate_stop_tokens = CUSTOM_STOP_TOKENS
+
+    # Create a prompt to check if the model uses the reasoning tokens
+    prompt = "Hello"
+    if tokenizer.chat_template is not None:
+        templated_prompt = tokenizer.apply_chat_template(
+            conversation=[dict(role="user", content=prompt)],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        assert isinstance(templated_prompt, str)
+        prompt = templated_prompt
+
+    # Check that the beginning-of-reasoning token is actually used by the model
+    max_tokens = REASONING_MAX_TOKENS if is_reasoning_model else 10
+    completion = (
+        model.generate(
+            prompts=[prompt],
+            sampling_params=SamplingParams(max_tokens=max_tokens, temperature=0.0),
+            use_tqdm=False,
+        )[0]
+        .outputs[0]
+        .text
     )

-
-
-
-
+    stop_tokens = [
+        stop_token
+        for stop_token in candidate_stop_tokens
+        if stop_token in prompt or stop_token in completion
+    ]
+    if stop_tokens:
+        logger.debug(
+            f"Found the following custom stop tokens for model {model_id!r}: "
+            f"{stop_tokens}."
+        )
+    else:
+        logger.debug(f"Found no custom stop tokens for model {model_id!r}.")
+
+    return stop_tokens
+
+
+def get_pbar_without_leave(*tqdm_args, **tqdm_kwargs) -> tqdm:
+    """Get a progress bar for vLLM which disappears after completion.

-
+    Args:
+        *tqdm_args:
+            Positional arguments to pass to tqdm.
+        **tqdm_kwargs:
+            Additional keyword arguments to pass to tqdm.
+
+    Returns:
+        A tqdm progress bar.
+    """
+    tqdm_kwargs.pop("leave", None)  # Remove the 'leave' key if it exists
+    return tqdm(*tqdm_args, leave=False, **tqdm_kwargs)
euroeval/constants.py
CHANGED
@@ -64,3 +64,16 @@ MERGE_TAGS = ["merge", "mergekit"]

 # The minimum required CUDA compute capability for using bfloat16 in vLLM
 VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
+
+# Used to detect whether a model is a reasoning model
+REASONING_TOKENS = [
+    ("<think>", "</think>"),
+    ("<reason>", "</reason>"),
+    ("<reasoning>", "</reasoning>"),
+]
+
+# These tokens are sometimes used by models to indicate the end of a generated
+# response, but they do not use them as a proper EOS token, so we have to deal with them
+# manually. We only use them as stop tokens if they actually appear in the model's
+# output
+CUSTOM_STOP_TOKENS = ["<sep>"]
euroeval/data_loading.py
CHANGED
@@ -4,11 +4,11 @@ import logging
 import sys
 import time

+import requests
 from datasets import Dataset, DatasetDict, load_dataset
 from datasets.exceptions import DatasetsError
 from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator
-from requests import ReadTimeout

 from .data_models import BenchmarkConfig, DatasetConfig
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
@@ -101,7 +101,13 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> DatasetDic
                 token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
             )
             break
-        except (
+        except (
+            FileNotFoundError,
+            ConnectionError,
+            DatasetsError,
+            requests.ConnectionError,
+            requests.ReadTimeout,
+        ):
             logger.warning(
                 f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
             )
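The widened `except` clause above retries the download on more kinds of transient failures. A rough, illustrative sketch of such a retry loop (not the package's code; the `load` callable and `num_attempts` parameter are hypothetical):

```python
import time

import requests
from datasets.exceptions import DatasetsError


def load_with_retries(load, num_attempts: int = 5):
    """Call `load()` and retry on transient download errors."""
    for attempt in range(num_attempts):
        try:
            return load()
        except (
            FileNotFoundError,
            ConnectionError,
            DatasetsError,
            requests.ConnectionError,
            requests.ReadTimeout,
        ):
            time.sleep(2**attempt)  # back off a little before retrying
    raise RuntimeError("Could not load the dataset after several attempts.")
```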
euroeval/finetuning.py
CHANGED
@@ -200,6 +200,7 @@ def finetune_single_iteration(
         compute_metrics=model.compute_metrics,
         callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
         data_collator=model.data_collator,
+        preprocess_logits_for_metrics=remove_extra_tensors_from_logits,
     )

     if not benchmark_config.verbose:
@@ -316,3 +317,24 @@ def get_training_args(
     training_args._n_gpu = 1

     return training_args
+
+
+def remove_extra_tensors_from_logits(
+    logits: torch.Tensor | tuple[torch.Tensor, ...], labels: torch.Tensor
+) -> torch.Tensor | tuple[torch.Tensor, ...]:
+    """If the logits are a tuple, return only the first element.
+
+    Args:
+        logits:
+            The logits to process.
+        labels:
+            The labels to use for the processing.
+
+    Returns:
+        The processed logits.
+    """
+    if isinstance(logits, tuple):
+        logits = logits[:-1]
+        if len(logits) == 1:
+            logits = logits[0]
+    return logits
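The new `preprocess_logits_for_metrics` hook strips any auxiliary tensors a model returns alongside its logits before the `transformers.Trainer` accumulates them for metric computation. A small usage illustration with dummy tensors (everything outside the hook itself is made up for the example):

```python
import torch


def remove_extra_tensors_from_logits(logits, labels):
    """Keep only the logits tensor when the model also returns extra tensors."""
    if isinstance(logits, tuple):
        logits = logits[:-1]  # drop the trailing extra tensor
        if len(logits) == 1:
            logits = logits[0]  # unwrap a single remaining tensor
    return logits


# Some models return (logits, extra_tensor) instead of a bare logits tensor.
logits = (torch.randn(2, 4), torch.randn(2, 8))
labels = torch.tensor([1, 3])
print(remove_extra_tensors_from_logits(logits, labels).shape)  # torch.Size([2, 4])
```

In `transformers`, the Trainer calls this hook during evaluation before logits are gathered, so only the cleaned logits reach `compute_metrics`.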