EuroEval 15.9.1__py3-none-any.whl → 15.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

@@ -7,12 +7,10 @@ import json
 import logging
 import os
 import re
-import sys
 import typing as t
 from functools import partial
 from pathlib import Path
 from time import sleep
-from types import MethodType

 import torch
 from datasets import DatasetDict
@@ -26,11 +24,13 @@ from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError

 from ..constants import (
+    CUSTOM_STOP_TOKENS,
     GENERATIVE_PIPELINE_TAGS,
     MAX_CONTEXT_LENGTH,
     MAX_LOGPROBS,
     MERGE_TAGS,
     REASONING_MAX_TOKENS,
+    REASONING_TOKENS,
     TASKS_USING_JSON,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
@@ -67,6 +67,7 @@ from ..tokenization_utils import (
     get_end_of_chat_token_ids,
     get_eos_token,
     get_first_label_token_mapping,
+    get_pad_token,
     should_prompts_be_stripped,
 )
 from ..types import ExtractLabelsFunction
@@ -79,17 +80,12 @@ from ..utils import (
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config

 if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
-    from vllm import LLM, RequestOutput, SamplingParams
+    from vllm import LLM, SamplingParams
     from vllm.distributed.parallel_state import (
         destroy_distributed_environment,
         destroy_model_parallel,
     )
-    from vllm.inputs import PromptType
     from vllm.lora.request import LoRARequest
-    from vllm.model_executor.guided_decoding.guided_fields import GuidedDecodingRequest
-    from vllm.pooling_params import PoolingParams
-    from vllm.prompt_adapter.request import PromptAdapterRequest
-    from vllm.sampling_params import RequestOutputKind

 if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
     from outlines.models.vllm import adapt_tokenizer
@@ -135,9 +131,18 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         self._model: LLM = model
         self._tokenizer: PreTrainedTokenizer = tokenizer
-        self.end_of_reasoning_token_id = get_end_of_reasoning_token_id(
+        self.end_of_reasoning_token = get_end_of_reasoning_token(
             model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
         )
+        self.end_of_chat_token_ids = get_end_of_chat_token_ids(
+            tokenizer=self._tokenizer
+        )
+        self.custom_stop_tokens = get_custom_stop_tokens(
+            model=self._model,
+            tokenizer=self._tokenizer,
+            model_id=model_config.model_id,
+            is_reasoning_model=self.end_of_reasoning_token is not None,
+        )

         # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
         # to call the `__init__` method of the `BenchmarkModule` class.
@@ -183,9 +188,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         """
         if not hasattr(self, "_tokenizer"):
             return None
-        elif self.end_of_reasoning_token_id is not None:
+        elif self.end_of_reasoning_token is not None:
             return GenerativeType.REASONING
-        elif self._tokenizer.chat_template is not None:
+        elif (
+            self._tokenizer.chat_template is not None
+            or "instruct" in self.model_config.model_id.lower()
+        ):
             return GenerativeType.INSTRUCTION_TUNED
         else:
             return GenerativeType.BASE
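
The classification above now treats a model ID containing "instruct" as instruction-tuned even when no chat template is set. A standalone sketch of the same precedence order (reasoning, then instruction-tuned, then base), using a stand-in enum and made-up model IDs rather than EuroEval's own classes:

    from enum import Enum, auto

    class GenerativeType(Enum):
        REASONING = auto()
        INSTRUCTION_TUNED = auto()
        BASE = auto()

    def classify(
        end_of_reasoning_token: str | None, chat_template: str | None, model_id: str
    ) -> GenerativeType:
        # A detected end-of-reasoning token takes precedence over everything else
        if end_of_reasoning_token is not None:
            return GenerativeType.REASONING
        # A chat template or an "instruct"-style model ID marks instruction tuning
        if chat_template is not None or "instruct" in model_id.lower():
            return GenerativeType.INSTRUCTION_TUNED
        return GenerativeType.BASE

    assert classify(None, None, "org/model-7B-Instruct") is GenerativeType.INSTRUCTION_TUNED
    assert classify("</think>", None, "org/model-r1") is GenerativeType.REASONING
    assert classify(None, None, "org/model-7B") is GenerativeType.BASE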
@@ -295,55 +303,29 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The generated model outputs.
         """
-        # Define which tokens to use as stopping criteria. We want to use the padding
-        # token, end-of-sentence token, and a double newline if the model isn't
-        # instruction tuned (since these separate the few-shot examples in the input in
-        # this case)
-        stop_tokens: list[str] = list()
+        # Get stopping tokens
+        stop_tokens: list[str] = self.custom_stop_tokens.copy()
         if self.buffer["instruction_model"] is False:
             stop_tokens.append("\n\n")
         if self._tokenizer.pad_token_id is not None:
+            assert isinstance(self._tokenizer.pad_token, str), (
+                f"The pad token for the model {self.model_config.model_id!r} "
+                f"is not a string, which is unexpected: {self._tokenizer.pad_token!r}."
+            )
             stop_tokens.append(self._tokenizer.pad_token)
         if self._tokenizer.eos_token_id is not None:
+            assert isinstance(self._tokenizer.eos_token, str), (
+                f"The EOS token for the model {self.model_config.model_id!r} "
+                f"is not a string, which is unexpected: {self._tokenizer.eos_token!r}."
+            )
             stop_tokens.append(self._tokenizer.eos_token)
             if self._tokenizer.pad_token_id is None:
                 self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
                 self._tokenizer.pad_token = self._tokenizer.eos_token
-        if (
-            self._tokenizer.bos_token_id is not None
-            and self._tokenizer.pad_token_id is None
-        ):
-            self._tokenizer.pad_token_id = self._tokenizer.bos_token_id
-            self._tokenizer.pad_token = self._tokenizer.bos_token
-        elif (
-            self._tokenizer.eos_token_id is not None
-            and self._tokenizer.pad_token_id is None
-        ):
-            self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
-            self._tokenizer.pad_token = self._tokenizer.eos_token
-        elif self._tokenizer.pad_token_id is None:
-            pad_token_candidates = ["<pad>", "[pad]", "<|endoftext|>", "<|im_end|>"]
-            pad_token_candidates.extend([c.upper() for c in pad_token_candidates])
-            for candidate in pad_token_candidates:
-                if candidate in self._tokenizer.get_vocab():
-                    pad_token_id = self._tokenizer.get_vocab()[candidate]
-                    self._tokenizer.pad_token = candidate
-                    self._tokenizer.pad_token_id = pad_token_id
-                    break
-            else:
-                raise InvalidModel(
-                    "Could not find a suitable token to use as a padding token, since "
-                    "the model does not have a BOS, EOS, or padding token, and does "
-                    f"not have any of the following tokens in its vocabulary: "
-                    f"{pad_token_candidates}."
-                )
-
-        assert self._tokenizer.pad_token_id is not None
-
-        # Add end of chat token as a stopping token, if it exists
-        end_of_chat_token_ids = get_end_of_chat_token_ids(tokenizer=self._tokenizer)
-        if end_of_chat_token_ids is not None:
-            end_of_chat_token = self._tokenizer.decode(end_of_chat_token_ids).strip()
+        if self.end_of_chat_token_ids is not None:
+            end_of_chat_token = self._tokenizer.decode(
+                self.end_of_chat_token_ids
+            ).strip()
             if end_of_chat_token:
                 stop_tokens.append(end_of_chat_token)
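
The stop-token list is now seeded from the cached `custom_stop_tokens` and extended with a double newline (for non-instruction models), the pad token, and the EOS token, falling back to the EOS token as padding when no pad token is set. A minimal sketch of that assembly with a dummy tokenizer object standing in for a real `PreTrainedTokenizer`:

    from dataclasses import dataclass

    @dataclass
    class DummyTokenizer:
        pad_token: str | None = None
        pad_token_id: int | None = None
        eos_token: str | None = "</s>"
        eos_token_id: int | None = 2

    def build_stop_tokens(
        tok: DummyTokenizer, custom_stop_tokens: list[str], instruction_model: bool
    ) -> list[str]:
        stop_tokens = custom_stop_tokens.copy()
        if not instruction_model:
            # Double newlines separate few-shot examples for non-instruction models
            stop_tokens.append("\n\n")
        if tok.pad_token_id is not None and isinstance(tok.pad_token, str):
            stop_tokens.append(tok.pad_token)
        if tok.eos_token_id is not None and isinstance(tok.eos_token, str):
            stop_tokens.append(tok.eos_token)
            if tok.pad_token_id is None:
                # Fall back to the EOS token as the padding token
                tok.pad_token, tok.pad_token_id = tok.eos_token, tok.eos_token_id
        return stop_tokens

    print(build_stop_tokens(DummyTokenizer(), ["<sep>"], instruction_model=False))
    # ['<sep>', '\n\n', '</s>']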
 
@@ -430,7 +412,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 raw_outputs = self._model.generate(
                     prompts=prompts,
                     sampling_params=sampling_params,
-                    use_tqdm=(not input_is_a_test),
+                    use_tqdm=False if input_is_a_test else get_pbar_without_leave,
                     lora_request=self.buffer.get("lora_request"),
                 )
                 break
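
Instead of monkey-patching vLLM's internals, the call now hands `use_tqdm` a progress-bar factory. This relies on a vLLM version whose `LLM.generate` accepts a callable for `use_tqdm` (recent releases do, but whether your installed version does is worth checking). A sketch of the factory on its own, without vLLM:

    from tqdm.auto import tqdm

    def get_pbar_without_leave(*tqdm_args, **tqdm_kwargs) -> tqdm:
        tqdm_kwargs.pop("leave", None)  # always hide the bar once it finishes
        return tqdm(*tqdm_args, leave=False, **tqdm_kwargs)

    # Stand-in for the engine's internal progress loop
    for _ in get_pbar_without_leave(range(5), desc="Processed prompts"):
        pass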
@@ -497,30 +479,23 @@ class VLLMModel(HuggingFaceEncoderModel):
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
-        if self.end_of_reasoning_token_id in completion_ids[0]:
-            # Find the latest index of the end of reasoning token and slice
-            # the token IDs to only include the tokens after it
-            completion_ids = [
-                token_ids[
-                    max(
-                        [
-                            i
-                            for i, x in enumerate(token_ids)
-                            if x == self.end_of_reasoning_token_id
-                        ]
-                    )
-                    + 1 :
-                ]
-                if self.end_of_reasoning_token_id in token_ids
-                else token_ids
-                for token_ids in completion_ids
-            ]
         completions = self._tokenizer.batch_decode(
             sequences=[
                 torch.LongTensor(completion_id) for completion_id in completion_ids
-            ],
-            skip_special_tokens=True,
+            ]
+        )
+        if self.end_of_reasoning_token is not None:
+            completions = [
+                completion.split(self.end_of_reasoning_token)[-1]
+                for completion in completions
+            ]
+        stop_token_pattern = re.compile(
+            "|".join(re.escape(stop_token) for stop_token in stop_tokens)
         )
+        completions = [
+            re.split(pattern=stop_token_pattern, string=completion)[0]
+            for completion in completions
+        ]
         completions = [completion.strip() for completion in completions]

         # Sanity check
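
Reasoning-trace removal and stop-token truncation now happen on the decoded strings rather than on token IDs. The same post-processing, extracted into a standalone helper with made-up inputs:

    import re

    def postprocess(
        completion: str, end_of_reasoning_token: str | None, stop_tokens: list[str]
    ) -> str:
        if end_of_reasoning_token is not None:
            # Keep only the text after the last end-of-reasoning token
            completion = completion.split(end_of_reasoning_token)[-1]
        if stop_tokens:
            # Cut the completion at the first occurrence of any stop token
            pattern = re.compile("|".join(re.escape(tok) for tok in stop_tokens))
            completion = re.split(pattern=pattern, string=completion)[0]
        return completion.strip()

    raw = "<think>Weighing the sentiment...</think>Positive</s>ignored tail"
    print(postprocess(raw, "</think>", ["</s>", "<sep>"]))
    # 'Positive'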
@@ -541,17 +516,6 @@ class VLLMModel(HuggingFaceEncoderModel):
                 ]
                 for raw_output in raw_outputs
             ]
-            scores = [
-                score_list[
-                    raw_output.outputs[0].token_ids.index(
-                        self.end_of_reasoning_token_id
-                    )
-                    + 2 :
-                ]
-                if self.end_of_reasoning_token_id in raw_output.outputs[0].token_ids
-                else score_list
-                for raw_output, score_list in zip(raw_outputs, scores)
-            ]
             output = GenerativeModelOutput(sequences=completions, scores=scores)
         else:
             output = GenerativeModelOutput(sequences=completions)
@@ -831,10 +795,6 @@ def load_model_and_tokenizer(
             f"The model {model_id!r} could not be loaded. The error was {e!r}."
         )

-    model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
-    model._validate_and_add_requests = MethodType(
-        _validate_and_add_requests_with_fixed_progress_bars, model
-    )
     model.config = hf_model_config

     return model, tokenizer
@@ -918,90 +878,11 @@ def load_tokenizer(
     # Ensure that BOS, EOS and PAD tokens are set
     tokenizer.bos_token, tokenizer.bos_token_id = get_bos_token(tokenizer=tokenizer)
     tokenizer.eos_token, tokenizer.eos_token_id = get_eos_token(tokenizer=tokenizer)
-    if tokenizer.pad_token_id is None:
-        tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.pad_token, tokenizer.pad_token_id = get_pad_token(tokenizer=tokenizer)

     return tokenizer


-def _run_engine_with_fixed_progress_bars(
-    self: "LLM", use_tqdm: bool
-) -> list["RequestOutput"]:
-    if use_tqdm:
-        num_requests = self.llm_engine.get_num_unfinished_requests()
-        pbar = tqdm(
-            total=num_requests, leave=False, disable=hasattr(sys, "_called_from_test")
-        )
-    else:
-        pbar = None
-
-    # Run the engine.
-    outputs: list["RequestOutput"] = list()
-    while self.llm_engine.has_unfinished_requests():
-        step_outputs = self.llm_engine.step()
-        for output in step_outputs:
-            if output.finished:
-                outputs.append(output)
-                if pbar is not None:
-                    pbar.update(1)
-
-    if pbar is not None:
-        pbar.close()
-
-    # Sort the outputs by request ID. This is necessary because some requests may be
-    # finished earlier than its previous requests.
-    outputs = sorted(outputs, key=lambda x: int(x.request_id))
-
-    return outputs
-
-
-def _validate_and_add_requests_with_fixed_progress_bars(
-    self: "LLM",
-    prompts: "PromptType | c.Sequence[PromptType]",
-    params: "SamplingParams | c.Sequence[SamplingParams] | PoolingParams | c.Sequence[PoolingParams]",  # noqa: E501
-    *,
-    use_tqdm: bool,
-    lora_request: "c.Sequence[LoRARequest] | LoRARequest | None",
-    prompt_adapter_request: "PromptAdapterRequest | None",
-    tokenization_kwargs: dict[str, t.Any] | None = None,
-    guided_options: "GuidedDecodingRequest | None" = None,
-    priority: list[int] | None = None,
-) -> None:
-    if isinstance(prompts, (str, dict)):
-        # Convert a single prompt to a list.
-        prompts = [prompts]
-
-    num_requests = len(prompts)
-    if isinstance(params, list) and len(params) != num_requests:
-        raise ValueError("The lengths of prompts and params must be the same.")
-    if isinstance(lora_request, list) and len(lora_request) != num_requests:
-        raise ValueError("The lengths of prompts and lora_request must be the same.")
-
-    for sp in params if isinstance(params, list) else (params,):
-        if isinstance(sp, SamplingParams):
-            self._add_guided_params(sp, guided_options)
-
-            # We only care about the final output
-            sp.output_kind = RequestOutputKind.FINAL_ONLY
-
-    # Add requests to the engine.
-    it = prompts
-    if use_tqdm:
-        it = tqdm(it, desc="Adding requests", leave=False)
-
-    for i, prompt in enumerate(it):
-        self._add_request(
-            prompt,
-            params[i] if isinstance(params, c.Sequence) else params,
-            tokenization_kwargs=tokenization_kwargs,
-            lora_request=lora_request[i]
-            if isinstance(lora_request, c.Sequence)
-            else lora_request,
-            prompt_adapter_request=prompt_adapter_request,
-            priority=priority[i] if priority else 0,
-        )
-
-
 def clear_vllm() -> None:
     """Clear the GPU memory used by the vLLM model, enabling re-initialisation."""
     with contextlib.suppress(ValueError):
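
`get_pad_token` itself lives in euroeval/tokenization_utils.py and is not part of this diff. The sketch below is only a guess at its fallback order, pieced together from the pad-token logic that was removed from `_generate` above (existing pad token, then BOS, then EOS, then a short list of well-known candidates); both the name `pad_token_fallback` and the ordering are assumptions, not the actual implementation.

    def pad_token_fallback(tokenizer) -> tuple[str | None, int | None]:
        # Hypothetical reconstruction for a duck-typed tokenizer object
        if tokenizer.pad_token_id is not None:
            return tokenizer.pad_token, tokenizer.pad_token_id
        if tokenizer.bos_token_id is not None:
            return tokenizer.bos_token, tokenizer.bos_token_id
        if tokenizer.eos_token_id is not None:
            return tokenizer.eos_token, tokenizer.eos_token_id
        candidates = ["<pad>", "[pad]", "<|endoftext|>", "<|im_end|>"]
        candidates += [candidate.upper() for candidate in candidates]
        vocab = tokenizer.get_vocab()
        for candidate in candidates:
            if candidate in vocab:
                return candidate, vocab[candidate]
        return None, None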
@@ -1016,14 +897,10 @@ def clear_vllm() -> None:
     clear_memory()


-def get_end_of_reasoning_token_id(
+def get_end_of_reasoning_token(
     model: "LLM", tokenizer: "PreTrainedTokenizer", model_id: str
-) -> int | None:
-    """Get the end of reasoning token ID for a generative model.
-
-    This assumes that the reasoning token is of the form <X> and that the end of
-    reasoning token is </X> (for X being any string without spaces). We disallow the
-    reasoning token to be the same as the beginning-of-sentence token.
+) -> str | None:
+    """Get the end-of-reasoning token for a generative model.

     Args:
         model:
@@ -1034,86 +911,162 @@ def get_end_of_reasoning_token_id(
         The model ID.

     Returns:
-        The end of reasoning token ID, or None if it could not be found.
+        The end of reasoning token, or None if it could not be found.
     """
-    if tokenizer.chat_template is None:
-        prompt = "What is your name?"
-    else:
+    # Create a prompt to check if the model uses the reasoning tokens
+    prompt = "What is your name?"
+    if tokenizer.chat_template is not None:
         templated_prompt = tokenizer.apply_chat_template(
-            conversation=[dict(role="user", content="What is your name?")],
+            conversation=[dict(role="user", content=prompt)],
             add_generation_prompt=True,
             tokenize=False,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt

-    # Generate a completion and remove the BOS token from it, to not confuse it with the
-    # potential reasoning token
-    model_output = model.generate(
-        prompts=[prompt],
-        sampling_params=SamplingParams(max_tokens=3, temperature=0.0),
-        use_tqdm=False,
+    # Check that the beginning-of-reasoning token is actually used by the model
+    completion = (
+        model.generate(
+            prompts=[prompt],
+            sampling_params=SamplingParams(max_tokens=10),
+            use_tqdm=False,
+        )[0]
+        .outputs[0]
+        .text
     )
-    completion = model_output[0].outputs[0].text
-
-    if tokenizer.bos_token is not None:
-        if isinstance(tokenizer.bos_token, str):
-            prompt = prompt.replace(tokenizer.bos_token, "").strip()
-            completion = completion.replace(tokenizer.bos_token, "").strip()
-        elif isinstance(tokenizer.bos_token, list):
-            for bos_token in tokenizer.bos_token:
-                prompt = prompt.replace(bos_token, "").strip()
-                completion = completion.replace(bos_token, "").strip()
-
-    # If it doesn't contain a reasoning token, we can't find the end of reasoning token
-    prompt_match = re.search(pattern=r"<\w+>", string=prompt)
-    completion_match = re.search(pattern=r"<\w+>", string=completion)
-    if completion_match is None and prompt_match is None:
+    bor_reasoning_matches = [
+        (bor_token, eor_token)
+        for bor_token, eor_token in REASONING_TOKENS
+        if bor_token in prompt or bor_token in completion
+    ]
+    if not bor_reasoning_matches:
         log_once(
-            f"Could not find a reasoning token for model {model_id!r}, so assuming "
-            "the model is not a reasoning model.",
-            level=logging.DEBUG,
+            f"The model {model_id!r} did not generate any beginning-of-reasoning "
+            "tokens in the prompt or the completion. Assuming the model is not "
+            "a reasoning model.",
+            level=logging.INFO,
         )
         return None

-    # Check that the found reasoning token and its associated end-of-reasoning tokens
-    # are both special tokens
-    elif completion_match is not None:
-        reasoning_token = completion_match.group()
-    else:
-        assert prompt_match is not None
-        reasoning_token = prompt_match.group()
-    end_of_reasoning_token = f"</{reasoning_token[1:-1]}>"
-    special_tokens = [
-        decoder_token.content
-        for decoder_token in tokenizer.added_tokens_decoder.values()
-    ]
-    special_tokens.extend(
-        [encoder_token for encoder_token in tokenizer.added_tokens_encoder.keys()]
+    # Check that the beginning-of-reasoning token is actually used by the model
+    completion = (
+        model.generate(
+            prompts=[prompt],
+            sampling_params=SamplingParams(max_tokens=REASONING_MAX_TOKENS),
+            use_tqdm=False,
+        )[0]
+        .outputs[0]
+        .text
     )
-    special_tokens.extend(tokenizer.all_special_tokens)
-    if (
-        reasoning_token not in special_tokens
-        or end_of_reasoning_token not in special_tokens
-    ):
+    eor_reasoning_matches = [
+        (bor_token, eor_token)
+        for bor_token, eor_token in bor_reasoning_matches
+        if eor_token in completion
+    ]
+    if not eor_reasoning_matches:
         log_once(
-            f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
-            f"token {end_of_reasoning_token!r} for model {model_id!r}, but one of "
-            "them is not registered as a special token, so assuming it is not a "
-            "real reasoning token.",
-            level=logging.DEBUG,
+            f"The model {model_id!r} did not generate any end-of-reasoning "
+            "tokens in the prompt or the completion, even though it generated "
+            "the beginning-of-reasoning tokens "
+            f"{[bor_token for bor_token, _ in bor_reasoning_matches]!r}. "
+            "This is probably not correct, so please report this issue.",
+            level=logging.INFO,
        )
        return None

+    if len(eor_reasoning_matches) > 1:
+        log_once(
+            f"Found multiple reasoning tokens {eor_reasoning_matches} for "
+            f"model {model_id!r}. Using {eor_reasoning_matches[0]!r} as "
+            "the reasoning token. If this is not the correct reasoning token, "
+            "please report this issue.",
+            level=logging.INFO,
+        )
+
+    bor_token, eor_token = eor_reasoning_matches[0]
     log_once(
-        f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
-        f"token {end_of_reasoning_token!r} for model {model_id!r}.",
-        level=logging.DEBUG,
+        f"Detected beginning-of-reasoning token {bor_token!r} and end-of-reasoning "
+        f"token {eor_token!r} for model {model_id!r}.",
+        level=logging.INFO,
+    )
+
+    return eor_token
+
+
+def get_custom_stop_tokens(
+    model: "LLM",
+    tokenizer: "PreTrainedTokenizer",
+    model_id: str,
+    is_reasoning_model: bool,
+) -> list[str]:
+    """Get the stop tokens for a generative model.
+
+    Args:
+        model:
+            The vLLM model.
+        tokenizer:
+            The tokenizer.
+        model_id:
+            The model ID.
+        is_reasoning_model:
+            Whether the model is a reasoning model. This is used to determine the number
+            of generated tokens to allow before stopping the generation.
+
+    Returns:
+        A list of stop tokens.
+    """
+    candidate_stop_tokens = CUSTOM_STOP_TOKENS
+
+    # Create a prompt to check if the model uses the reasoning tokens
+    prompt = "Hello"
+    if tokenizer.chat_template is not None:
+        templated_prompt = tokenizer.apply_chat_template(
+            conversation=[dict(role="user", content=prompt)],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        assert isinstance(templated_prompt, str)
+        prompt = templated_prompt
+
+    # Check that the beginning-of-reasoning token is actually used by the model
+    max_tokens = REASONING_MAX_TOKENS if is_reasoning_model else 10
+    completion = (
+        model.generate(
+            prompts=[prompt],
+            sampling_params=SamplingParams(max_tokens=max_tokens, temperature=0.0),
+            use_tqdm=False,
+        )[0]
+        .outputs[0]
+        .text
     )

-    # Encode the end of reasoning token and return its ID
-    end_of_reasoning_token_id = tokenizer.encode(
-        text=end_of_reasoning_token, add_special_tokens=False
-    )[0]
+    stop_tokens = [
+        stop_token
+        for stop_token in candidate_stop_tokens
+        if stop_token in prompt or stop_token in completion
+    ]
+    if stop_tokens:
+        logger.debug(
+            f"Found the following custom stop tokens for model {model_id!r}: "
+            f"{stop_tokens}."
+        )
+    else:
+        logger.debug(f"Found no custom stop tokens for model {model_id!r}.")
+
+    return stop_tokens
+
+
+def get_pbar_without_leave(*tqdm_args, **tqdm_kwargs) -> tqdm:
+    """Get a progress bar for vLLM which disappears after completion.

-    return end_of_reasoning_token_id
+    Args:
+        *tqdm_args:
+            Positional arguments to pass to tqdm.
+        **tqdm_kwargs:
+            Additional keyword arguments to pass to tqdm.
+
+    Returns:
+        A tqdm progress bar.
+    """
+    tqdm_kwargs.pop("leave", None)  # Remove the 'leave' key if it exists
+    return tqdm(*tqdm_args, leave=False, **tqdm_kwargs)
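
Both new helpers work by probing the model: render a short prompt through the chat template, generate a completion, and check which known tokens actually appear. A toy, dependency-free version of that matching step, with hard-coded strings standing in for real model output:

    # Reasoning-token pairs copied from the new REASONING_TOKENS constant
    REASONING_TOKENS = [
        ("<think>", "</think>"),
        ("<reason>", "</reason>"),
        ("<reasoning>", "</reasoning>"),
    ]

    def detect_end_of_reasoning(prompt: str, completion: str) -> str | None:
        # First pass: does any beginning-of-reasoning token occur at all?
        matches = [
            (bor, eor)
            for bor, eor in REASONING_TOKENS
            if bor in prompt or bor in completion
        ]
        # Second pass: is the matching end-of-reasoning token ever generated?
        matches = [(bor, eor) for bor, eor in matches if eor in completion]
        return matches[0][1] if matches else None

    print(detect_end_of_reasoning("<think>", "Okay...</think>My name is Aria."))
    # '</think>'
    print(detect_end_of_reasoning("User: Hi", "Hello! How can I help?"))
    # None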
euroeval/constants.py CHANGED
@@ -64,3 +64,16 @@ MERGE_TAGS = ["merge", "mergekit"]

 # The minimum required CUDA compute capability for using bfloat16 in vLLM
 VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
+
+# Used to detect whether a model is a reasoning model
+REASONING_TOKENS = [
+    ("<think>", "</think>"),
+    ("<reason>", "</reason>"),
+    ("<reasoning>", "</reasoning>"),
+]
+
+# These tokens are sometimes used by models to indicate the end of a generated
+# response, but they do not use them as a proper EOS token, so we have to deal with them
+# manually. We only use them as stop tokens if they actually appear in the model's
+# output
+CUSTOM_STOP_TOKENS = ["<sep>"]
euroeval/data_loading.py CHANGED
@@ -4,11 +4,11 @@ import logging
 import sys
 import time

+import requests
 from datasets import Dataset, DatasetDict, load_dataset
 from datasets.exceptions import DatasetsError
 from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator
-from requests import ReadTimeout

 from .data_models import BenchmarkConfig, DatasetConfig
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
@@ -101,7 +101,13 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> DatasetDic
                 token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
             )
             break
-        except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
+        except (
+            FileNotFoundError,
+            ConnectionError,
+            DatasetsError,
+            requests.ConnectionError,
+            requests.ReadTimeout,
+        ):
             logger.warning(
                 f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
             )
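
The retry logic around `load_dataset` now also catches `requests.ConnectionError` and `requests.ReadTimeout`. A minimal sketch of such a retry loop; the `load` callable, attempt count, and back-off are placeholders, not EuroEval's actual values:

    import time

    import requests
    from datasets.exceptions import DatasetsError

    def load_with_retries(load, num_attempts: int = 5):
        for _ in range(num_attempts):
            try:
                return load()
            except (
                FileNotFoundError,
                ConnectionError,
                DatasetsError,
                requests.ConnectionError,
                requests.ReadTimeout,
            ):
                time.sleep(1)  # placeholder back-off before the next attempt
        raise RuntimeError("Could not load the dataset after retrying.")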
euroeval/finetuning.py CHANGED
@@ -200,6 +200,7 @@ def finetune_single_iteration(
         compute_metrics=model.compute_metrics,
         callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
         data_collator=model.data_collator,
+        preprocess_logits_for_metrics=remove_extra_tensors_from_logits,
     )

     if not benchmark_config.verbose:
@@ -316,3 +317,24 @@ def get_training_args(
     training_args._n_gpu = 1

     return training_args
+
+
+def remove_extra_tensors_from_logits(
+    logits: torch.Tensor | tuple[torch.Tensor, ...], labels: torch.Tensor
+) -> torch.Tensor | tuple[torch.Tensor, ...]:
+    """If the logits are a tuple, return only the first element.
+
+    Args:
+        logits:
+            The logits to process.
+        labels:
+            The labels to use for the processing.
+
+    Returns:
+        The processed logits.
+    """
+    if isinstance(logits, tuple):
+        logits = logits[:-1]
+        if len(logits) == 1:
+            logits = logits[0]
+    return logits
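
`remove_extra_tensors_from_logits` is wired into the Hugging Face `Trainer` via `preprocess_logits_for_metrics`, which the trainer calls on each evaluation step before predictions are accumulated. A quick standalone check of the tuple-unwrapping on dummy tensors (assuming EuroEval 15.10.0 is installed so the function can be imported; the shapes are made up):

    import torch
    from euroeval.finetuning import remove_extra_tensors_from_logits

    labels = torch.zeros(2, dtype=torch.long)

    # A tuple of (classification logits, some extra tensor) collapses to the logits
    logits = (torch.zeros(2, 3), torch.zeros(2, 7))
    processed = remove_extra_tensors_from_logits(logits, labels)
    assert isinstance(processed, torch.Tensor) and processed.shape == (2, 3)

    # A plain tensor passes through unchanged
    plain = torch.zeros(2, 3)
    assert remove_extra_tensors_from_logits(plain, labels) is plain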