EuroEval 15.16.0-py3-none-any.whl → 16.0.1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.

Files changed (64)
  1. euroeval/__init__.py +8 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +199 -139
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +19 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +73 -23
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +35 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +90 -20
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +276 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/model_cache.py +13 -1
  41. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  42. euroeval/prompt_templates/multiple_choice.py +23 -2
  43. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  44. euroeval/prompt_templates/reading_comprehension.py +42 -2
  45. euroeval/prompt_templates/sentiment_classification.py +46 -2
  46. euroeval/prompt_templates/summarization.py +24 -4
  47. euroeval/scores.py +7 -2
  48. euroeval/speed_benchmark.py +6 -6
  49. euroeval/task_group_utils/multiple_choice_classification.py +19 -8
  50. euroeval/task_group_utils/question_answering.py +35 -28
  51. euroeval/task_group_utils/sequence_classification.py +128 -42
  52. euroeval/task_group_utils/text_to_text.py +7 -3
  53. euroeval/task_group_utils/token_classification.py +59 -73
  54. euroeval/tasks.py +33 -6
  55. euroeval/tokenization_utils.py +294 -207
  56. euroeval/utils.py +150 -35
  57. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
  58. euroeval-16.0.1.dist-info/RECORD +69 -0
  59. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
  60. euroeval/human_evaluation.py +0 -738
  61. euroeval/metrics.py +0 -470
  62. euroeval-15.16.0.dist-info/RECORD +0 -63
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
  64. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
euroeval/tokenization_utils.py

@@ -5,9 +5,10 @@ import re
  import typing as t

  import torch
+ from transformers import MistralCommonTokenizer

- from .constants import TASK_GROUPS_USING_LOGPROBS
  from .enums import GenerativeType
+ from .exceptions import InvalidModel
  from .utils import log_once

  if t.TYPE_CHECKING:
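
A note on the import hunk above: MistralCommonTokenizer only ships with recent transformers releases, so any environment pinning an older transformers will now fail at import time. A minimal defensive sketch, purely illustrative and not part of this release:

    # Hypothetical guard for older transformers versions that lack the class.
    try:
        from transformers import MistralCommonTokenizer
    except ImportError:
        MistralCommonTokenizer = None

    def is_mistral_tokeniser(tokeniser: object) -> bool:
        # isinstance checks stay safe even when the class is unavailable.
        return MistralCommonTokenizer is not None and isinstance(
            tokeniser, MistralCommonTokenizer
        )
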
@@ -20,47 +21,47 @@ if t.TYPE_CHECKING:
  logger = logging.getLogger("euroeval")


- def get_special_token_metadata(tokenizer: "PreTrainedTokenizerBase") -> dict:
-     """Get the special token metadata for a tokenizer.
+ def get_special_token_metadata(tokeniser: "PreTrainedTokenizerBase") -> dict:
+     """Get the special token metadata for a tokeniser.

      Args:
-         tokenizer:
-             The tokenizer.
+         tokeniser:
+             The tokeniser.

      Returns:
          The special token metadata.
      """
-     # Create some test input IDs, to check if the tokenizer is adding special tokens
-     test_input_ids = tokenizer("Test").input_ids
+     # Create some test input IDs, to check if the tokeniser is adding special tokens
+     test_input_ids = tokeniser("Test").input_ids

-     # Extract the CLS token IDs from the tokenizer, if it's using them
+     # Extract the CLS token IDs from the tokeniser, if it's using them
      has_cls_token = True
-     if tokenizer.cls_token_id in test_input_ids:
-         cls_token_id = tokenizer.cls_token_id
-         cls_token = tokenizer.cls_token
-     elif tokenizer.bos_token_id in test_input_ids:
-         cls_token_id = tokenizer.bos_token_id
-         cls_token = tokenizer.bos_token
-     elif tokenizer.cls_token is not None:
-         cls_token_id = tokenizer.cls_token_id
-         cls_token = tokenizer.cls_token
+     if tokeniser.cls_token_id in test_input_ids:
+         cls_token_id = tokeniser.cls_token_id
+         cls_token = tokeniser.cls_token
+     elif tokeniser.bos_token_id in test_input_ids:
+         cls_token_id = tokeniser.bos_token_id
+         cls_token = tokeniser.bos_token
+     elif tokeniser.cls_token is not None:
+         cls_token_id = tokeniser.cls_token_id
+         cls_token = tokeniser.cls_token
          has_cls_token = False
      else:
-         cls_token_id = tokenizer.bos_token_id
-         cls_token = tokenizer.bos_token
+         cls_token_id = tokeniser.bos_token_id
+         cls_token = tokeniser.bos_token
          has_cls_token = False

-     # Extract the SEP token IDs from the tokenizer, if it's using them
+     # Extract the SEP token IDs from the tokeniser, if it's using them
      has_sep_token = True
-     if tokenizer.sep_token_id in test_input_ids:
-         sep_token = tokenizer.sep_token
-     elif tokenizer.eos_token_id in test_input_ids:
-         sep_token = tokenizer.eos_token
-     elif tokenizer.sep_token is not None:
-         sep_token = tokenizer.sep_token
+     if tokeniser.sep_token_id in test_input_ids:
+         sep_token = tokeniser.sep_token
+     elif tokeniser.eos_token_id in test_input_ids:
+         sep_token = tokeniser.eos_token
+     elif tokeniser.sep_token is not None:
+         sep_token = tokeniser.sep_token
          has_sep_token = False
      else:
-         sep_token = tokenizer.eos_token
+         sep_token = tokeniser.eos_token
          has_sep_token = False

      return dict(
@@ -73,29 +74,29 @@ def get_special_token_metadata(tokenizer: "PreTrainedTokenizerBase") -> dict:


  def should_prompts_be_stripped(
-     labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
+     labels_to_be_generated: list[str], tokeniser: "PreTrainedTokenizer"
  ) -> bool:
      """Determine if we should strip the prompts for few-shot evaluation.

-     This is the case if the tokenizer needs to include the space as part of the label
+     This is the case if the tokeniser needs to include the space as part of the label
      token. The strategy is thus to tokenize a label with a preceding colon (as in the
      prompts), i.e., ": positive", and check if the tokenization starts with the tokens
      of ": ". If this is the case, then we should not strip the prompts, since the
-     tokenizer produces the whitespace token separately.
+     tokeniser produces the whitespace token separately.

      Args:
          labels_to_be_generated:
              The labels that are to be generated.
-         tokenizer:
-             The tokenizer used to tokenize the labels.
+         tokeniser:
+             The tokeniser used to tokenize the labels.

      Returns:
          Whether we should strip the prompts.
      """
      strip_prompts = True
      for label in labels_to_be_generated:
-         colon_tokens = tokenizer(": ", add_special_tokens=False).input_ids
-         label_tokens = tokenizer(": " + label, add_special_tokens=False).input_ids
+         colon_tokens = tokeniser(": ", add_special_tokens=False).input_ids
+         label_tokens = tokeniser(": " + label, add_special_tokens=False).input_ids

          if isinstance(colon_tokens, torch.Tensor):
              colon_tokens = list(colon_tokens.squeeze(0))
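
The probing strategy described in the docstring can be reproduced in isolation. A sketch, assuming a GPT-2 checkpoint purely for illustration:

    # Does the encoding of ": positive" start with the tokens of ": "? If so,
    # the whitespace is emitted as its own token and prompts are not stripped.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    colon_tokens = tok(": ", add_special_tokens=False).input_ids
    label_tokens = tok(": positive", add_special_tokens=False).input_ids
    print(label_tokens[: len(colon_tokens)] == colon_tokens)
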
@@ -112,38 +113,38 @@ def should_prompts_be_stripped(


  def should_prefix_space_be_added_to_labels(
-     labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
+     labels_to_be_generated: list[str], tokeniser: "PreTrainedTokenizer"
  ) -> bool:
      """Determine if we should add a prefix space to the labels.

-     This is the case if the prompts are stripped and the tokenizer doesn't
+     This is the case if the prompts are stripped and the tokeniser doesn't
      automatically add prefix whitespaces to the labels.

      Args:
          labels_to_be_generated:
              The labels that are to be generated.
-         tokenizer:
-             The tokenizer used to tokenize the labels.
+         tokeniser:
+             The tokeniser used to tokenize the labels.

      Returns:
          Whether we should add a prefix space to the labels.
      """
      if not should_prompts_be_stripped(
-         labels_to_be_generated=labels_to_be_generated, tokenizer=tokenizer
+         labels_to_be_generated=labels_to_be_generated, tokeniser=tokeniser
      ):
          return False

-     whitespace_token = tokenizer.convert_ids_to_tokens(
-         ids=tokenizer(" ", add_special_tokens=False).input_ids[0]
+     whitespace_token = tokeniser.convert_ids_to_tokens(
+         ids=tokeniser(" ", add_special_tokens=False).input_ids[0]
      )[0]

      add_prefix_space = True
      for label in labels_to_be_generated:
-         label_tokens = tokenizer(label, add_special_tokens=False).input_ids
+         label_tokens = tokeniser(label, add_special_tokens=False).input_ids
          if isinstance(label_tokens, torch.Tensor):
              label_tokens = list(label_tokens.squeeze(0))
          first_label_token: int = int(label_tokens[0])
-         first_character_of_label = tokenizer.convert_ids_to_tokens(first_label_token)[0]
+         first_character_of_label = tokeniser.convert_ids_to_tokens(first_label_token)[0]
          has_prefix_space = first_character_of_label == whitespace_token
          if has_prefix_space:
              add_prefix_space = False
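
The companion check can be exercised the same way; a minimal sketch, again with an illustrative checkpoint:

    # Compare the first character of a label's first token against the
    # tokeniser's whitespace marker; if they differ, a prefix space is needed.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    whitespace_token = tok.convert_ids_to_tokens(
        tok(" ", add_special_tokens=False).input_ids[0]
    )[0]
    first_token_id = int(tok("positive", add_special_tokens=False).input_ids[0])
    first_character = tok.convert_ids_to_tokens(first_token_id)[0]
    print(first_character == whitespace_token)
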
@@ -153,22 +154,22 @@ def should_prefix_space_be_added_to_labels(


  def get_bos_token(
-     tokenizer: "PreTrainedTokenizer",
+     tokeniser: "PreTrainedTokenizer",
  ) -> tuple[str, int] | tuple[None, None]:
-     """Get the beginning-of-sequence token from a tokenizer.
+     """Get the beginning-of-sequence token from a tokeniser.

      Args:
-         tokenizer:
-             The tokenizer.
+         tokeniser:
+             The tokeniser.

      Returns:
          A pair (token, token_id) representing the beginning-of-sequence token and its
          token ID, or (None, None) if no BOS token is found.
      """
-     if isinstance(tokenizer.bos_token, str) and isinstance(tokenizer.bos_token_id, int):
-         return tokenizer.bos_token, tokenizer.bos_token_id
+     if isinstance(tokeniser.bos_token, str) and isinstance(tokeniser.bos_token_id, int):
+         return tokeniser.bos_token, tokeniser.bos_token_id

-     vocab: dict[str, int] = tokenizer.get_vocab()
+     vocab: dict[str, int] = tokeniser.get_vocab()

      candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "<|startoftext|>", "[CLS]"]
      for candidate_bos_token in candidate_bos_tokens:
@@ -179,7 +180,7 @@ def get_bos_token(
      else:
          log_once(
              "The model does not have a beginning-of-sequence token. Please ensure that "
-             "this has been set in the tokenizer's configuration. Using no BOS token."
+             "this has been set in the tokeniser's configuration. Using no BOS token."
              " This may lead to unexpected behavior in the model.",
              level=logging.INFO,
          )
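
A quick way to sanity-check the fallback chain is to call the helper on a loaded tokeniser; the expected GPT-2 output below is an assumption, not a guarantee:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    # GPT-2 sets bos_token directly, so the candidate scan should never run;
    # the expected result is roughly ("<|endoftext|>", 50256).
    print(get_bos_token(tok))
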
@@ -194,22 +195,22 @@ def get_bos_token(


  def get_eos_token(
-     tokenizer: "PreTrainedTokenizer",
+     tokeniser: "PreTrainedTokenizer",
  ) -> tuple[str, int] | tuple[None, None]:
-     """Get the end-of-sequence token from a tokenizer.
+     """Get the end-of-sequence token from a tokeniser.

      Args:
-         tokenizer:
-             The tokenizer.
+         tokeniser:
+             The tokeniser.

      Returns:
          A pair (token, token_id) representing the end-of-sequence token and its token
          ID, or (None, None) if no EOS token is found.
      """
-     if isinstance(tokenizer.eos_token, str) and isinstance(tokenizer.eos_token_id, int):
-         return tokenizer.eos_token, tokenizer.eos_token_id
+     if isinstance(tokeniser.eos_token, str) and isinstance(tokeniser.eos_token_id, int):
+         return tokeniser.eos_token, tokeniser.eos_token_id

-     vocab: dict[str, int] = tokenizer.get_vocab()
+     vocab: dict[str, int] = tokeniser.get_vocab()

      candidate_eos_tokens = ["</s>", "<|end_of_text|>", "<|endoftext|>", "[SEP]"]
      for candidate_eos_token in candidate_eos_tokens:
@@ -220,7 +221,7 @@ def get_eos_token(
      else:
          log_once(
              "The model does not have an end-of-sequence token. Please ensure that this "
-             "has been set in the tokenizer's configuration. Using no EOS token. This "
+             "has been set in the tokeniser's configuration. Using no EOS token. This "
              "may lead to unexpected behavior in the model.",
              level=logging.INFO,
          )
@@ -235,55 +236,55 @@ def get_eos_token(


  def get_pad_token(
-     tokenizer: "PreTrainedTokenizer",
+     tokeniser: "PreTrainedTokenizer",
  ) -> tuple[str, int] | tuple[None, None]:
-     """Get the padding token from a tokenizer.
+     """Get the padding token from a tokeniser.

      Args:
-         tokenizer:
-             The tokenizer.
+         tokeniser:
+             The tokeniser.

      Returns:
          A pair (token, token_id) representing the padding token and its token ID, or
          (None, None) if no padding token is found.
      """
-     # If the tokenizer already has a padding token, return it
-     if tokenizer.pad_token is not None and tokenizer.pad_token_id is not None:
-         assert isinstance(tokenizer.pad_token, str), (
-             "Expected tokenizer.pad_token to be a string, but got "
-             f"{type(tokenizer.pad_token)}."
+     # If the tokeniser already has a padding token, return it
+     if tokeniser.pad_token is not None and tokeniser.pad_token_id is not None:
+         assert isinstance(tokeniser.pad_token, str), (
+             "Expected tokeniser.pad_token to be a string, but got "
+             f"{type(tokeniser.pad_token)}."
          )
-         assert isinstance(tokenizer.pad_token_id, int), (
-             "Expected tokenizer.pad_token_id to be an integer, but got "
-             f"{type(tokenizer.pad_token_id)}."
+         assert isinstance(tokeniser.pad_token_id, int), (
+             "Expected tokeniser.pad_token_id to be an integer, but got "
+             f"{type(tokeniser.pad_token_id)}."
          )
-         return (tokenizer.pad_token, tokenizer.pad_token_id)
+         return (tokeniser.pad_token, tokeniser.pad_token_id)

-     # If the tokenizer has a BOS token, use it as the padding token
-     if tokenizer.bos_token is not None and tokenizer.bos_token_id is not None:
-         assert isinstance(tokenizer.bos_token, str), (
-             "Expected tokenizer.bos_token to be a string, but got "
-             f"{type(tokenizer.bos_token)}."
+     # If the tokeniser has a BOS token, use it as the padding token
+     if tokeniser.bos_token is not None and tokeniser.bos_token_id is not None:
+         assert isinstance(tokeniser.bos_token, str), (
+             "Expected tokeniser.bos_token to be a string, but got "
+             f"{type(tokeniser.bos_token)}."
          )
-         assert isinstance(tokenizer.bos_token_id, int), (
-             "Expected tokenizer.bos_token_id to be an integer, but got "
-             f"{type(tokenizer.bos_token_id)}."
+         assert isinstance(tokeniser.bos_token_id, int), (
+             "Expected tokeniser.bos_token_id to be an integer, but got "
+             f"{type(tokeniser.bos_token_id)}."
          )
-         pad_token = tokenizer.bos_token
-         pad_token_id = tokenizer.bos_token_id
-
-     # If the tokenizer has an EOS token, use it as the padding token
-     elif tokenizer.eos_token is not None and tokenizer.eos_token_id is not None:
-         assert isinstance(tokenizer.eos_token, str), (
-             "Expected tokenizer.eos_token to be a string, but got "
-             f"{type(tokenizer.eos_token)}."
+         pad_token = tokeniser.bos_token
+         pad_token_id = tokeniser.bos_token_id
+
+     # If the tokeniser has an EOS token, use it as the padding token
+     elif tokeniser.eos_token is not None and tokeniser.eos_token_id is not None:
+         assert isinstance(tokeniser.eos_token, str), (
+             "Expected tokeniser.eos_token to be a string, but got "
+             f"{type(tokeniser.eos_token)}."
          )
-         assert isinstance(tokenizer.eos_token_id, int), (
-             "Expected tokenizer.eos_token_id to be an integer, but got "
-             f"{type(tokenizer.eos_token_id)}."
+         assert isinstance(tokeniser.eos_token_id, int), (
+             "Expected tokeniser.eos_token_id to be an integer, but got "
+             f"{type(tokeniser.eos_token_id)}."
          )
-         pad_token = tokenizer.eos_token
-         pad_token_id = tokenizer.eos_token_id
+         pad_token = tokeniser.eos_token
+         pad_token_id = tokeniser.eos_token_id

      # Otherwise, try to find a candidate padding token in the vocabulary
      else:
@@ -296,14 +297,14 @@ def get_pad_token(
          ]
          pad_token_candidates.extend([c.upper() for c in pad_token_candidates])
          for candidate in pad_token_candidates:
-             if candidate in tokenizer.get_vocab():
+             if candidate in tokeniser.get_vocab():
                  pad_token = candidate
-                 pad_token_id = tokenizer.get_vocab()[candidate]
+                 pad_token_id = tokeniser.get_vocab()[candidate]
                  break
          else:
              log_once(
                  "Could not identify a padding token for the model. Please ensure that "
-                 "this has been set in the tokenizer's configuration. Using no padding "
+                 "this has been set in the tokeniser's configuration. Using no padding "
                  "token. This may lead to unexpected behavior in the model.",
                  level=logging.INFO,
              )
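
The resulting precedence is easy to lose in the assertion noise, so here it is as a condensed sketch (abbreviated candidate list, illustrative only):

    def pick_pad_token(tok) -> tuple[str, int] | tuple[None, None]:
        # Precedence: explicit pad token, then BOS, then EOS, then a scan over
        # common pad strings in the vocabulary, else no padding token at all.
        if tok.pad_token is not None and tok.pad_token_id is not None:
            return tok.pad_token, tok.pad_token_id
        if tok.bos_token is not None and tok.bos_token_id is not None:
            return tok.bos_token, tok.bos_token_id
        if tok.eos_token is not None and tok.eos_token_id is not None:
            return tok.eos_token, tok.eos_token_id
        vocab = tok.get_vocab()
        for candidate in ("<pad>", "[PAD]"):  # abbreviated candidate list
            if candidate in vocab:
                return candidate, vocab[candidate]
        return None, None
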
@@ -317,50 +318,58 @@ def get_pad_token(
      return pad_token, pad_token_id


- def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | None:
+ def get_end_of_chat_token_ids(tokeniser: "PreTrainedTokenizer") -> list[int] | None:
      """Get the end token ID for chat models.

-     This is only relevant for tokenizers with a chat template.
+     This is only relevant for tokenisers with a chat template.

      Args:
-         tokenizer:
-             The tokenizer.
+         tokeniser:
+             The tokeniser.

      Returns:
-         The token IDs used to end chats, or None if the tokenizer does not have a chat
-         template.
-
-     Raises:
-         ValueError:
-             If the end-of-chat token could not be located.
+         The token IDs used to end chats, or None if the tokeniser does not have a chat
+         template or if no end-of-chat token could be found.
      """
-     if tokenizer.chat_template is None:
+     if not has_chat_template(tokeniser=tokeniser):
          return None

      user_message: dict[str, str] = dict(role="user", content="X")
-     token_ids: list[int] = tokenizer.apply_chat_template(conversation=[user_message])  # type: ignore[assignment]
+     token_ids = apply_chat_template(
+         conversation=[user_message],
+         tokeniser=tokeniser,
+         tokenize=True,
+         add_generation_prompt=False,
+     )
+     assert isinstance(token_ids, list)

-     for idx, token in enumerate(tokenizer.convert_ids_to_tokens(token_ids)):
-         token_id = tokenizer.convert_tokens_to_ids(token)
-         assert isinstance(token_id, int)
-         token = tokenizer.decode([token_id])
+     for idx, token in enumerate(tokeniser.convert_ids_to_tokens(token_ids)):
          if "X" in token:
              x_token_index = idx
              break
      else:
-         raise ValueError("Could not locate the end-of-chat token for the model.")
+         logger.debug("Could not locate the end-of-chat token for the model.")
+         return None

      end_of_chat_tokens = token_ids[x_token_index + 1 :]
      if len(end_of_chat_tokens) == 0:
+         logger.debug("Could not locate the end-of-chat token for the model.")
          return None
+
+     log_once(
+         f"Detected end-of-chat token IDs as {end_of_chat_tokens}, corresponding to "
+         f"tokens {tokeniser.convert_ids_to_tokens(end_of_chat_tokens)}.",
+         level=logging.DEBUG,
+     )
      return end_of_chat_tokens


  def get_first_label_token_mapping(
      dataset_config: "DatasetConfig",
      model_config: "ModelConfig",
-     tokenizer: "PreTrainedTokenizer | None",
+     tokeniser: "PreTrainedTokenizer | None",
      generative_type: "GenerativeType | None",
+     log_metadata: bool,
  ) -> dict[str, str] | bool:
      """Check if the model should output scores.

@@ -369,130 +378,208 @@ def get_first_label_token_mapping(
              The dataset configuration.
          model_config:
              The model configuration.
-         tokenizer:
-             The tokenizer, or None if not available.
+         tokeniser:
+             The tokeniser, or None if not available.
          generative_type:
              The generative type, or None if not available.
+         log_metadata:
+             Whether to log metadata.

      Returns:
          A mapping from labels to the first token in each label, or alternatively a
          Boolean value indicating whether the model should output scores (if the mapping
          is outputted then the model will always output scores).
      """
-     if generative_type == GenerativeType.REASONING:
-         log_once(
-             f"The model {model_config.model_id!r} is a reasoning model and "
-             "thus does not support logprobs, so we do not enable it.",
-             level=logging.DEBUG,
-         )
+     if not (dataset_config.task.uses_logprobs and dataset_config.labels):
+         if log_metadata:
+             log_once(
+                 "We will not use logprobs with the model, since the dataset does not "
+                 "have labels.",
+                 level=logging.DEBUG,
+             )
          return False
-
-     # If we do not have any tokenizer, then we cannot check if the model should output
-     # scores and we just assume it should if the dataset supports it
-     output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
-     if tokenizer is None:
-         if output_scores:
+     elif generative_type == GenerativeType.REASONING:
+         if log_metadata:
              log_once(
-                 f"We will use logprobs with the model {model_config.model_id!r} "
-                 "since the dataset supports it and no tokenizer is available.",
+                 f"The model {model_config.model_id!r} is a reasoning model and "
+                 "thus does not support logprobs, so we do not enable it.",
                  level=logging.DEBUG,
              )
-         else:
+         return False
+     elif tokeniser is None:
+         if log_metadata:
              log_once(
-                 f"We will not use logprobs with the model {model_config.model_id!r} "
-                 "since the dataset does not support it and no tokenizer is available.",
+                 f"We will use logprobs with the model {model_config.model_id!r} "
+                 "since the dataset supports it and no tokeniser is available.",
                  level=logging.DEBUG,
              )
-         return output_scores
-
-     # If there are labels associated with the dataset, and that the first token of each
-     # label is distinct, then we can safely use the logprobs
-     if output_scores and dataset_config.labels:
-         local_labels = [
-             dataset_config.prompt_label_mapping[label].strip()
-             for label in dataset_config.labels
-         ]
+         return True
+
+     local_labels = [
+         dataset_config.prompt_label_mapping[label].strip()
+         for label in dataset_config.labels
+     ]

-         # Tokenize some text containing each label, which we will use to extract the
-         # first token of each label
-         all_tokens: list[list[str]]
-         if tokenizer.chat_template is None:
-             add_prefix_space = should_prefix_space_be_added_to_labels(
-                 labels_to_be_generated=local_labels, tokenizer=tokenizer
-             )
-             all_tokens = [
-                 tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)
-                 for label in local_labels
-             ]
-         else:
-             all_tokens = [
-                 tokenizer.convert_ids_to_tokens(
-                     ids=tokenizer.apply_chat_template(
-                         conversation=[
-                             dict(role="user", content=""),
-                             dict(role="assistant", content=label),
-                         ],
-                         add_generation_prompt=True,
-                         tokenize=True,
-                     )
-                 )
-                 for label in local_labels
-             ]
-
-         # Remove any non-alphabetic characters from the tokens
-         all_tokens = [
-             [
-                 re.sub(
-                     pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
-                     repl="",
-                     string=token.lower(),
-                 )
-                 for token in token_list
-             ]
-             for token_list in all_tokens
-         ]
+     # Tokenize some text containing each label, which we will use to extract the
+     # first token of each label
+     all_tokens: list[list[str]]
+     if not has_chat_template(tokeniser=tokeniser):
+         add_prefix_space = should_prefix_space_be_added_to_labels(
+             labels_to_be_generated=local_labels, tokeniser=tokeniser
+         )
+         all_tokens = [
+             tokeniser.tokenize(text=f" {label}" if add_prefix_space else label)
+             for label in local_labels
+         ]
+     else:
+         all_tokens = [
+             tokeniser.convert_ids_to_tokens(
+                 ids=apply_chat_template(
+                     conversation=[
+                         dict(role="user", content=""),
+                         dict(role="assistant", content=label),
+                         # Adding extra user message as Mistral tokenisers require
+                         # conversations to end with a user message
+                         dict(role="user", content=""),
+                     ],
+                     tokeniser=tokeniser,
+                     tokenize=True,
+                 )
+             )
+             for label in local_labels
+         ]
+
+     # Remove any non-alphabetic characters from the tokens
+     all_tokens = [
+         [
+             re.sub(
+                 pattern=r"^[^a-zæøåüöä0-9]+|[^a-zæøåüöä0-9]+$",
+                 repl="",
+                 string=token.lower(),
+             )
+             for token in token_list
+         ]
+         for token_list in all_tokens
+     ]

-         # Extract the first token of each label
-         first_tokens: list[str] = list()
-         for token_list, label in zip(all_tokens, local_labels):
-             matching_tokens = [
-                 tok for tok in token_list if tok and label.startswith(tok)
-             ]
-             if not matching_tokens:
+     # Extract the first token of each label
+     first_tokens: list[str] = list()
+     for token_list, label in zip(all_tokens, local_labels):
+         matching_tokens = [tok for tok in token_list if tok and label.startswith(tok)]
+         if not matching_tokens:
+             if log_metadata:
                  log_once(
                      f"No matching token found in token_list for label '{label}', so "
                      "we will not use logprobs with the model.",
                      level=logging.DEBUG,
                  )
-                 return False
-             first_tokens.append(matching_tokens[0])
-
-         # Build a mapping from labels to the first token in each label if the first
-         # tokens are distinct
-         if len(first_tokens) == len(set(first_tokens)):
-             log_once(
-                 "We will use logprobs with the model since the first tokens of the "
-                 "labels are distinct.",
-                 level=logging.DEBUG,
-             )
-             return {
-                 label: first_token
-                 for label, first_token in zip(local_labels, first_tokens)
-             }
-         else:
+             return False
+         first_tokens.append(matching_tokens[0])
+
+     # Build a mapping from labels to the first token in each label if the first
+     # tokens are distinct
+     if len(first_tokens) == len(set(first_tokens)):
+         mapping = {
+             label: first_token for label, first_token in zip(local_labels, first_tokens)
+         }
+         if log_metadata:
+             log_once(
+                 "Using logprobs as evaluation strategy for the model, with the "
+                 f"following mapping from labels to their first token: {mapping}.",
+                 level=logging.DEBUG,
+             )
+         return mapping
+     else:
+         if log_metadata:
              log_once(
                  "We will not use logprobs with the model since the first tokens of the "
                  "labels are not distinct. The first tokens for the labels "
                  f"{local_labels} are {first_tokens}"
              )
-             return False
-
-     # Otherwise, we assume that the model should not output scores, to avoid potential
-     # evaluation errors. This will force the label extraction to rely on word edit
-     # distance instead of logprobs.
-     log_once(
-         "We will not use logprobs with the model, since the dataset does not have "
-         "labels.",
-         level=logging.DEBUG,
-     )
-     return False
+         return False
+
+
+ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
+     """Check if a tokeniser has a chat template.
+
+     Args:
+         tokeniser:
+             The tokeniser.
+
+     Returns:
+         Whether the tokeniser has a chat template.
+     """
+     if hasattr(tokeniser, "chat_template"):
+         has_template = tokeniser.chat_template is not None
+         if has_template:
+             log_once(
+                 "The tokeniser has a chat template, so assuming that the model is "
+                 "instruction tuned.",
+                 level=logging.DEBUG,
+             )
+         return has_template
+     elif isinstance(tokeniser, MistralCommonTokenizer):
+         log_once(
+             "The tokeniser is a Mistral tokeniser, so assuming that the model is "
+             "instruction tuned.",
+             level=logging.DEBUG,
+         )
+         return True
+     else:
+         log_once(
+             "We cannot find a chat template for the tokeniser, so assuming that the "
+             "model isn't instruction tuned.",
+             level=logging.DEBUG,
+         )
+         return False
+
+
+ def apply_chat_template(
+     conversation: list[dict[str, str]],
+     tokeniser: "PreTrainedTokenizer",
+     tokenize: bool = False,
+     add_generation_prompt: bool = True,
+     **transformers_tokeniser_kwargs,
+ ) -> str | list[int]:
+     """Apply the chat template to a prompt.
+
+     Args:
+         conversation:
+             The conversation to apply the chat template to.
+         tokeniser:
+             The tokeniser.
+         tokenize:
+             Whether to tokenize the resulting prompt, returning a list of token IDs
+             instead of a string.
+         add_generation_prompt:
+             Whether to add a generation prompt at the end of the conversation. This is
+             only relevant for regular Hugging Face tokenisers, as Mistral tokenisers
+             always add a generation prompt.
+         **transformers_tokeniser_kwargs:
+             Additional keyword arguments to pass to the tokeniser, in case the tokeniser
+             is a regular Hugging Face tokeniser.
+
+     Returns:
+         The prompt with the chat template applied, either as a string or a list of
+         token IDs, depending on the value of `tokenize`.
+
+     Raises:
+         InvalidModel:
+             If the tokeniser does not have a chat template.
+     """
+     if not has_chat_template(tokeniser=tokeniser):
+         raise InvalidModel(
+             "The tokeniser does not have a chat template, so cannot apply it."
+         )
+     elif isinstance(tokeniser, MistralCommonTokenizer):
+         templated_prompt = tokeniser.apply_chat_template(
+             conversation=conversation, tokenize=tokenize
+         )
+     else:
+         templated_prompt = tokeniser.apply_chat_template(
+             conversation=conversation,
+             add_generation_prompt=add_generation_prompt,
+             tokenize=tokenize,
+             **transformers_tokeniser_kwargs,
+         )
+     return templated_prompt
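
Together, the two new helpers give callers a single entry point that works for both regular Hugging Face tokenisers and Mistral's. A usage sketch, with an illustrative instruction-tuned checkpoint:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
    prompt = apply_chat_template(
        conversation=[dict(role="user", content="Classify: 'great movie!'")],
        tokeniser=tok,
        tokenize=False,
        add_generation_prompt=True,
    )
    print(prompt)  # chat-formatted string, ready for generation
    # With a MistralCommonTokenizer the same call routes to Mistral's own
    # apply_chat_template, which always adds the generation prompt itself.
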