EuroEval 15.16.0-py3-none-any.whl → 16.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (63)
  1. euroeval/__init__.py +3 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +161 -114
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +13 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +53 -7
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +38 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +46 -14
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +234 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  41. euroeval/prompt_templates/multiple_choice.py +23 -2
  42. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  43. euroeval/prompt_templates/reading_comprehension.py +42 -2
  44. euroeval/prompt_templates/sentiment_classification.py +46 -2
  45. euroeval/prompt_templates/summarization.py +24 -4
  46. euroeval/scores.py +7 -2
  47. euroeval/speed_benchmark.py +6 -6
  48. euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  49. euroeval/task_group_utils/question_answering.py +35 -28
  50. euroeval/task_group_utils/sequence_classification.py +96 -23
  51. euroeval/task_group_utils/text_to_text.py +7 -3
  52. euroeval/task_group_utils/token_classification.py +47 -75
  53. euroeval/tasks.py +31 -6
  54. euroeval/tokenization_utils.py +295 -207
  55. euroeval/utils.py +118 -34
  56. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +11 -14
  57. euroeval-16.0.0.dist-info/RECORD +69 -0
  58. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
  59. euroeval/human_evaluation.py +0 -738
  60. euroeval/metrics.py +0 -470
  61. euroeval-15.16.0.dist-info/RECORD +0 -63
  62. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/tokenization_utils.py
@@ -5,8 +5,10 @@ import re
 import typing as t

 import torch
+from transformers import MistralCommonTokenizer
+
+from euroeval.exceptions import InvalidModel

-from .constants import TASK_GROUPS_USING_LOGPROBS
 from .enums import GenerativeType
 from .utils import log_once

@@ -20,47 +22,47 @@ if t.TYPE_CHECKING:
 logger = logging.getLogger("euroeval")


-def get_special_token_metadata(tokenizer: "PreTrainedTokenizerBase") -> dict:
-    """Get the special token metadata for a tokenizer.
+def get_special_token_metadata(tokeniser: "PreTrainedTokenizerBase") -> dict:
+    """Get the special token metadata for a tokeniser.

     Args:
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.

     Returns:
         The special token metadata.
     """
-    # Create some test input IDs, to check if the tokenizer is adding special tokens
-    test_input_ids = tokenizer("Test").input_ids
+    # Create some test input IDs, to check if the tokeniser is adding special tokens
+    test_input_ids = tokeniser("Test").input_ids

-    # Extract the CLS token IDs from the tokenizer, if it's using them
+    # Extract the CLS token IDs from the tokeniser, if it's using them
     has_cls_token = True
-    if tokenizer.cls_token_id in test_input_ids:
-        cls_token_id = tokenizer.cls_token_id
-        cls_token = tokenizer.cls_token
-    elif tokenizer.bos_token_id in test_input_ids:
-        cls_token_id = tokenizer.bos_token_id
-        cls_token = tokenizer.bos_token
-    elif tokenizer.cls_token is not None:
-        cls_token_id = tokenizer.cls_token_id
-        cls_token = tokenizer.cls_token
+    if tokeniser.cls_token_id in test_input_ids:
+        cls_token_id = tokeniser.cls_token_id
+        cls_token = tokeniser.cls_token
+    elif tokeniser.bos_token_id in test_input_ids:
+        cls_token_id = tokeniser.bos_token_id
+        cls_token = tokeniser.bos_token
+    elif tokeniser.cls_token is not None:
+        cls_token_id = tokeniser.cls_token_id
+        cls_token = tokeniser.cls_token
         has_cls_token = False
     else:
-        cls_token_id = tokenizer.bos_token_id
-        cls_token = tokenizer.bos_token
+        cls_token_id = tokeniser.bos_token_id
+        cls_token = tokeniser.bos_token
         has_cls_token = False

-    # Extract the SEP token IDs from the tokenizer, if it's using them
+    # Extract the SEP token IDs from the tokeniser, if it's using them
     has_sep_token = True
-    if tokenizer.sep_token_id in test_input_ids:
-        sep_token = tokenizer.sep_token
-    elif tokenizer.eos_token_id in test_input_ids:
-        sep_token = tokenizer.eos_token
-    elif tokenizer.sep_token is not None:
-        sep_token = tokenizer.sep_token
+    if tokeniser.sep_token_id in test_input_ids:
+        sep_token = tokeniser.sep_token
+    elif tokeniser.eos_token_id in test_input_ids:
+        sep_token = tokeniser.eos_token
+    elif tokeniser.sep_token is not None:
+        sep_token = tokeniser.sep_token
         has_sep_token = False
     else:
-        sep_token = tokenizer.eos_token
+        sep_token = tokeniser.eos_token
         has_sep_token = False

     return dict(
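
A minimal usage sketch of this helper, assuming EuroEval 16.0.0 and transformers are installed; the exact keys of the returned dict are inferred from the visible code and may differ slightly:

from transformers import AutoTokenizer

from euroeval.tokenization_utils import get_special_token_metadata

# A BERT-style tokeniser adds [CLS]/[SEP] to the test encoding, so the helper
# should detect both special tokens.
tokeniser = AutoTokenizer.from_pretrained("bert-base-uncased")
metadata = get_special_token_metadata(tokeniser)
print(metadata)  # expected to include has_cls_token=True and has_sep_token=True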
@@ -73,29 +75,29 @@ def get_special_token_metadata(tokenizer: "PreTrainedTokenizerBase") -> dict:


 def should_prompts_be_stripped(
-    labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
+    labels_to_be_generated: list[str], tokeniser: "PreTrainedTokenizer"
 ) -> bool:
     """Determine if we should strip the prompts for few-shot evaluation.

-    This is the case if the tokenizer needs to include the space as part of the label
+    This is the case if the tokeniser needs to include the space as part of the label
     token. The strategy is thus to tokenize a label with a preceding colon (as in the
     prompts), i.e., ": positive", and check if the tokenization starts with the tokens
     of ": ". If this is the case, then we should not strip the prompts, since the
-    tokenizer produces the whitespace token separately.
+    tokeniser produces the whitespace token separately.

     Args:
         labels_to_be_generated:
             The labels that are to be generated.
-        tokenizer:
-            The tokenizer used to tokenize the labels.
+        tokeniser:
+            The tokeniser used to tokenize the labels.

     Returns:
         Whether we should strip the prompts.
     """
     strip_prompts = True
     for label in labels_to_be_generated:
-        colon_tokens = tokenizer(": ", add_special_tokens=False).input_ids
-        label_tokens = tokenizer(": " + label, add_special_tokens=False).input_ids
+        colon_tokens = tokeniser(": ", add_special_tokens=False).input_ids
+        label_tokens = tokeniser(": " + label, add_special_tokens=False).input_ids

         if isinstance(colon_tokens, torch.Tensor):
             colon_tokens = list(colon_tokens.squeeze(0))
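
The heuristic described in the docstring can be reproduced directly with a tokeniser; a sketch, assuming transformers is installed and using "gpt2" purely as an example model:

from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("gpt2")

# Tokenise ": " on its own and as a prefix of a label, as in the docstring.
colon_tokens = tokeniser(": ", add_special_tokens=False).input_ids
label_tokens = tokeniser(": positive", add_special_tokens=False).input_ids

# If the label encoding starts with the ": " tokens, the whitespace is emitted
# as a separate token and the prompts should not be stripped.
starts_with_colon = label_tokens[: len(colon_tokens)] == colon_tokens
print(f"strip prompts: {not starts_with_colon}")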
@@ -112,38 +114,38 @@ def should_prompts_be_stripped(


 def should_prefix_space_be_added_to_labels(
-    labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
+    labels_to_be_generated: list[str], tokeniser: "PreTrainedTokenizer"
 ) -> bool:
     """Determine if we should add a prefix space to the labels.

-    This is the case if the prompts are stripped and the tokenizer doesn't
+    This is the case if the prompts are stripped and the tokeniser doesn't
     automatically add prefix whitespaces to the labels.

     Args:
         labels_to_be_generated:
             The labels that are to be generated.
-        tokenizer:
-            The tokenizer used to tokenize the labels.
+        tokeniser:
+            The tokeniser used to tokenize the labels.

     Returns:
         Whether we should add a prefix space to the labels.
     """
     if not should_prompts_be_stripped(
-        labels_to_be_generated=labels_to_be_generated, tokenizer=tokenizer
+        labels_to_be_generated=labels_to_be_generated, tokeniser=tokeniser
     ):
         return False

-    whitespace_token = tokenizer.convert_ids_to_tokens(
-        ids=tokenizer(" ", add_special_tokens=False).input_ids[0]
+    whitespace_token = tokeniser.convert_ids_to_tokens(
+        ids=tokeniser(" ", add_special_tokens=False).input_ids[0]
     )[0]

     add_prefix_space = True
     for label in labels_to_be_generated:
-        label_tokens = tokenizer(label, add_special_tokens=False).input_ids
+        label_tokens = tokeniser(label, add_special_tokens=False).input_ids
         if isinstance(label_tokens, torch.Tensor):
             label_tokens = list(label_tokens.squeeze(0))
         first_label_token: int = int(label_tokens[0])
-        first_character_of_label = tokenizer.convert_ids_to_tokens(first_label_token)[0]
+        first_character_of_label = tokeniser.convert_ids_to_tokens(first_label_token)[0]
         has_prefix_space = first_character_of_label == whitespace_token
         if has_prefix_space:
             add_prefix_space = False
@@ -153,22 +155,22 @@ def should_prefix_space_be_added_to_labels(


 def get_bos_token(
-    tokenizer: "PreTrainedTokenizer",
+    tokeniser: "PreTrainedTokenizer",
 ) -> tuple[str, int] | tuple[None, None]:
-    """Get the beginning-of-sequence token from a tokenizer.
+    """Get the beginning-of-sequence token from a tokeniser.

     Args:
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.

     Returns:
         A pair (token, token_id) representing the beginning-of-sequence token and its
         token ID, or (None, None) if no BOS token is found.
     """
-    if isinstance(tokenizer.bos_token, str) and isinstance(tokenizer.bos_token_id, int):
-        return tokenizer.bos_token, tokenizer.bos_token_id
+    if isinstance(tokeniser.bos_token, str) and isinstance(tokeniser.bos_token_id, int):
+        return tokeniser.bos_token, tokeniser.bos_token_id

-    vocab: dict[str, int] = tokenizer.get_vocab()
+    vocab: dict[str, int] = tokeniser.get_vocab()

     candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "<|startoftext|>", "[CLS]"]
     for candidate_bos_token in candidate_bos_tokens:
@@ -179,7 +181,7 @@ def get_bos_token(
     else:
         log_once(
             "The model does not have a beginning-of-sequence token. Please ensure that "
-            "this has been set in the tokenizer's configuration. Using no BOS token."
+            "this has been set in the tokeniser's configuration. Using no BOS token."
             " This may lead to unexpected behavior in the model.",
             level=logging.INFO,
         )
@@ -194,22 +196,22 @@ def get_bos_token(


 def get_eos_token(
-    tokenizer: "PreTrainedTokenizer",
+    tokeniser: "PreTrainedTokenizer",
 ) -> tuple[str, int] | tuple[None, None]:
-    """Get the end-of-sequence token from a tokenizer.
+    """Get the end-of-sequence token from a tokeniser.

     Args:
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.

     Returns:
         A pair (token, token_id) representing the end-of-sequence token and its token
         ID, or (None, None) if no EOS token is found.
     """
-    if isinstance(tokenizer.eos_token, str) and isinstance(tokenizer.eos_token_id, int):
-        return tokenizer.eos_token, tokenizer.eos_token_id
+    if isinstance(tokeniser.eos_token, str) and isinstance(tokeniser.eos_token_id, int):
+        return tokeniser.eos_token, tokeniser.eos_token_id

-    vocab: dict[str, int] = tokenizer.get_vocab()
+    vocab: dict[str, int] = tokeniser.get_vocab()

     candidate_eos_tokens = ["</s>", "<|end_of_text|>", "<|endoftext|>", "[SEP]"]
     for candidate_eos_token in candidate_eos_tokens:
@@ -220,7 +222,7 @@ def get_eos_token(
     else:
         log_once(
             "The model does not have an end-of-sequence token. Please ensure that this "
-            "has been set in the tokenizer's configuration. Using no EOS token. This "
+            "has been set in the tokeniser's configuration. Using no EOS token. This "
            "may lead to unexpected behavior in the model.",
             level=logging.INFO,
         )
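
A hedged sketch of the BOS/EOS fallback behaviour, assuming EuroEval 16.0.0 is installed; when a tokeniser lacks an explicit token, the helpers scan the vocabulary for common candidates such as "<s>" or "</s>":

from transformers import AutoTokenizer

from euroeval.tokenization_utils import get_bos_token, get_eos_token

# GPT-2 is used purely as an example; it defines "<|endoftext|>" at both ends.
tokeniser = AutoTokenizer.from_pretrained("gpt2")
bos_token, bos_token_id = get_bos_token(tokeniser)
eos_token, eos_token_id = get_eos_token(tokeniser)
print(bos_token, bos_token_id)
print(eos_token, eos_token_id)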
@@ -235,55 +237,55 @@ def get_eos_token(


 def get_pad_token(
-    tokenizer: "PreTrainedTokenizer",
+    tokeniser: "PreTrainedTokenizer",
 ) -> tuple[str, int] | tuple[None, None]:
-    """Get the padding token from a tokenizer.
+    """Get the padding token from a tokeniser.

     Args:
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.

     Returns:
         A pair (token, token_id) representing the padding token and its token ID, or
         (None, None) if no padding token is found.
     """
-    # If the tokenizer already has a padding token, return it
-    if tokenizer.pad_token is not None and tokenizer.pad_token_id is not None:
-        assert isinstance(tokenizer.pad_token, str), (
-            "Expected tokenizer.pad_token to be a string, but got "
-            f"{type(tokenizer.pad_token)}."
+    # If the tokeniser already has a padding token, return it
+    if tokeniser.pad_token is not None and tokeniser.pad_token_id is not None:
+        assert isinstance(tokeniser.pad_token, str), (
+            "Expected tokeniser.pad_token to be a string, but got "
+            f"{type(tokeniser.pad_token)}."
         )
-        assert isinstance(tokenizer.pad_token_id, int), (
-            "Expected tokenizer.pad_token_id to be an integer, but got "
-            f"{type(tokenizer.pad_token_id)}."
+        assert isinstance(tokeniser.pad_token_id, int), (
+            "Expected tokeniser.pad_token_id to be an integer, but got "
+            f"{type(tokeniser.pad_token_id)}."
         )
-        return (tokenizer.pad_token, tokenizer.pad_token_id)
+        return (tokeniser.pad_token, tokeniser.pad_token_id)

-    # If the tokenizer has a BOS token, use it as the padding token
-    if tokenizer.bos_token is not None and tokenizer.bos_token_id is not None:
-        assert isinstance(tokenizer.bos_token, str), (
-            "Expected tokenizer.bos_token to be a string, but got "
-            f"{type(tokenizer.bos_token)}."
+    # If the tokeniser has a BOS token, use it as the padding token
+    if tokeniser.bos_token is not None and tokeniser.bos_token_id is not None:
+        assert isinstance(tokeniser.bos_token, str), (
+            "Expected tokeniser.bos_token to be a string, but got "
+            f"{type(tokeniser.bos_token)}."
         )
-        assert isinstance(tokenizer.bos_token_id, int), (
-            "Expected tokenizer.bos_token_id to be an integer, but got "
-            f"{type(tokenizer.bos_token_id)}."
+        assert isinstance(tokeniser.bos_token_id, int), (
+            "Expected tokeniser.bos_token_id to be an integer, but got "
+            f"{type(tokeniser.bos_token_id)}."
         )
-        pad_token = tokenizer.bos_token
-        pad_token_id = tokenizer.bos_token_id
-
-    # If the tokenizer has an EOS token, use it as the padding token
-    elif tokenizer.eos_token is not None and tokenizer.eos_token_id is not None:
-        assert isinstance(tokenizer.eos_token, str), (
-            "Expected tokenizer.eos_token to be a string, but got "
-            f"{type(tokenizer.eos_token)}."
+        pad_token = tokeniser.bos_token
+        pad_token_id = tokeniser.bos_token_id
+
+    # If the tokeniser has an EOS token, use it as the padding token
+    elif tokeniser.eos_token is not None and tokeniser.eos_token_id is not None:
+        assert isinstance(tokeniser.eos_token, str), (
+            "Expected tokeniser.eos_token to be a string, but got "
+            f"{type(tokeniser.eos_token)}."
         )
-        assert isinstance(tokenizer.eos_token_id, int), (
-            "Expected tokenizer.eos_token_id to be an integer, but got "
-            f"{type(tokenizer.eos_token_id)}."
+        assert isinstance(tokeniser.eos_token_id, int), (
+            "Expected tokeniser.eos_token_id to be an integer, but got "
+            f"{type(tokeniser.eos_token_id)}."
         )
-        pad_token = tokenizer.eos_token
-        pad_token_id = tokenizer.eos_token_id
+        pad_token = tokeniser.eos_token
+        pad_token_id = tokeniser.eos_token_id

     # Otherwise, try to find a candidate padding token in the vocabulary
     else:
@@ -296,14 +298,14 @@ def get_pad_token(
         ]
         pad_token_candidates.extend([c.upper() for c in pad_token_candidates])
         for candidate in pad_token_candidates:
-            if candidate in tokenizer.get_vocab():
+            if candidate in tokeniser.get_vocab():
                 pad_token = candidate
-                pad_token_id = tokenizer.get_vocab()[candidate]
+                pad_token_id = tokeniser.get_vocab()[candidate]
                 break
         else:
             log_once(
                 "Could not identify a padding token for the model. Please ensure that "
-                "this has been set in the tokenizer's configuration. Using no padding "
+                "this has been set in the tokeniser's configuration. Using no padding "
                 "token. This may lead to unexpected behavior in the model.",
                 level=logging.INFO,
             )
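
The padding fallback chain (explicit pad token, then BOS, then EOS, then a vocabulary scan) can be exercised like this; a sketch assuming EuroEval 16.0.0 is installed:

from transformers import AutoTokenizer

from euroeval.tokenization_utils import get_pad_token

# GPT-2 ships without a pad token, so the helper falls back to another token
# (its BOS/EOS token, "<|endoftext|>").
tokeniser = AutoTokenizer.from_pretrained("gpt2")
pad_token, pad_token_id = get_pad_token(tokeniser)
print(pad_token, pad_token_id)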
@@ -317,50 +319,58 @@ def get_pad_token(
     return pad_token, pad_token_id


-def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | None:
+def get_end_of_chat_token_ids(tokeniser: "PreTrainedTokenizer") -> list[int] | None:
     """Get the end token ID for chat models.

-    This is only relevant for tokenizers with a chat template.
+    This is only relevant for tokenisers with a chat template.

     Args:
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.

     Returns:
-        The token IDs used to end chats, or None if the tokenizer does not have a chat
-        template.
-
-    Raises:
-        ValueError:
-            If the end-of-chat token could not be located.
+        The token IDs used to end chats, or None if the tokeniser does not have a chat
+        template or if no end-of-chat token could be found.
     """
-    if tokenizer.chat_template is None:
+    if not has_chat_template(tokeniser=tokeniser):
         return None

     user_message: dict[str, str] = dict(role="user", content="X")
-    token_ids: list[int] = tokenizer.apply_chat_template(conversation=[user_message])  # type: ignore[assignment]
+    token_ids = apply_chat_template(
+        conversation=[user_message],
+        tokeniser=tokeniser,
+        tokenize=True,
+        add_generation_prompt=False,
+    )
+    assert isinstance(token_ids, list)

-    for idx, token in enumerate(tokenizer.convert_ids_to_tokens(token_ids)):
-        token_id = tokenizer.convert_tokens_to_ids(token)
-        assert isinstance(token_id, int)
-        token = tokenizer.decode([token_id])
+    for idx, token in enumerate(tokeniser.convert_ids_to_tokens(token_ids)):
         if "X" in token:
             x_token_index = idx
             break
     else:
-        raise ValueError("Could not locate the end-of-chat token for the model.")
+        logger.debug("Could not locate the end-of-chat token for the model.")
+        return None

     end_of_chat_tokens = token_ids[x_token_index + 1 :]
     if len(end_of_chat_tokens) == 0:
+        logger.debug("Could not locate the end-of-chat token for the model.")
         return None
+
+    log_once(
+        f"Detected end-of-chat token IDs as {end_of_chat_tokens}, corresponding to "
+        f"tokens {tokeniser.convert_ids_to_tokens(end_of_chat_tokens)}.",
+        level=logging.DEBUG,
+    )
     return end_of_chat_tokens


 def get_first_label_token_mapping(
     dataset_config: "DatasetConfig",
     model_config: "ModelConfig",
-    tokenizer: "PreTrainedTokenizer | None",
+    tokeniser: "PreTrainedTokenizer | None",
     generative_type: "GenerativeType | None",
+    log_metadata: bool,
 ) -> dict[str, str] | bool:
     """Check if the model should output scores.

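The end-of-chat detection above can be replicated outside EuroEval for inspection; a sketch assuming transformers is installed, with the model ID chosen purely as an example of a tokeniser that has a chat template:

from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

# Render a single-character user message and keep the tokens after the "X".
token_ids = tokeniser.apply_chat_template(
    conversation=[dict(role="user", content="X")], add_generation_prompt=False
)
tokens = tokeniser.convert_ids_to_tokens(token_ids)
x_token_index = next(idx for idx, tok in enumerate(tokens) if "X" in tok)
print(token_ids[x_token_index + 1 :])  # the end-of-chat token IDs
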
@@ -369,130 +379,208 @@ def get_first_label_token_mapping(
             The dataset configuration.
         model_config:
             The model configuration.
-        tokenizer:
-            The tokenizer, or None if not available.
+        tokeniser:
+            The tokeniser, or None if not available.
         generative_type:
             The generative type, or None if not available.
+        log_metadata:
+            Whether to log metadata.

     Returns:
         A mapping from labels to the first token in each label, or alternatively a
         Boolean value indicating whether the model should output scores (if the mapping
         is outputted then the model will always output scores).
     """
-    if generative_type == GenerativeType.REASONING:
-        log_once(
-            f"The model {model_config.model_id!r} is a reasoning model and "
-            "thus does not support logprobs, so we do not enable it.",
-            level=logging.DEBUG,
-        )
+    if not (dataset_config.task.uses_logprobs and dataset_config.labels):
+        if log_metadata:
+            log_once(
+                "We will not use logprobs with the model, since the dataset does not "
+                "have labels.",
+                level=logging.DEBUG,
+            )
         return False
-
-    # If we do not have any tokenizer, then we cannot check if the model should output
-    # scores and we just assume it should if the dataset supports it
-    output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
-    if tokenizer is None:
-        if output_scores:
+    elif generative_type == GenerativeType.REASONING:
+        if log_metadata:
             log_once(
-                f"We will use logprobs with the model {model_config.model_id!r} "
-                "since the dataset supports it and no tokenizer is available.",
+                f"The model {model_config.model_id!r} is a reasoning model and "
+                "thus does not support logprobs, so we do not enable it.",
                 level=logging.DEBUG,
             )
-        else:
+        return False
+    elif tokeniser is None:
+        if log_metadata:
             log_once(
-                f"We will not use logprobs with the model {model_config.model_id!r} "
-                "since the dataset does not support it and no tokenizer is available.",
+                f"We will use logprobs with the model {model_config.model_id!r} "
+                "since the dataset supports it and no tokeniser is available.",
                 level=logging.DEBUG,
             )
-        return output_scores
-
-    # If there are labels associated with the dataset, and that the first token of each
-    # label is distinct, then we can safely use the logprobs
-    if output_scores and dataset_config.labels:
-        local_labels = [
-            dataset_config.prompt_label_mapping[label].strip()
-            for label in dataset_config.labels
+        return True
+
+    local_labels = [
+        dataset_config.prompt_label_mapping[label].strip()
+        for label in dataset_config.labels
+    ]
+
+    # Tokenize some text containing each label, which we will use to extract the
+    # first token of each label
+    all_tokens: list[list[str]]
+    if not has_chat_template(tokeniser=tokeniser):
+        add_prefix_space = should_prefix_space_be_added_to_labels(
+            labels_to_be_generated=local_labels, tokeniser=tokeniser
+        )
+        all_tokens = [
+            tokeniser.tokenize(text=f" {label}" if add_prefix_space else label)
+            for label in local_labels
         ]
-
-        # Tokenize some text containing each label, which we will use to extract the
-        # first token of each label
-        all_tokens: list[list[str]]
-        if tokenizer.chat_template is None:
-            add_prefix_space = should_prefix_space_be_added_to_labels(
-                labels_to_be_generated=local_labels, tokenizer=tokenizer
-            )
-            all_tokens = [
-                tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)
-                for label in local_labels
-            ]
-        else:
-            all_tokens = [
-                tokenizer.convert_ids_to_tokens(
-                    ids=tokenizer.apply_chat_template(
-                        conversation=[
-                            dict(role="user", content=""),
-                            dict(role="assistant", content=label),
-                        ],
-                        add_generation_prompt=True,
-                        tokenize=True,
-                    )
-                )
-                for label in local_labels
-            ]
-
-        # Remove any non-alphabetic characters from the tokens
+    else:
         all_tokens = [
-            [
-                re.sub(
-                    pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
-                    repl="",
-                    string=token.lower(),
+            tokeniser.convert_ids_to_tokens(
+                ids=apply_chat_template(
+                    conversation=[
+                        dict(role="user", content=""),
+                        dict(role="assistant", content=label),
+                        # Adding extra user message as Mistral tokenisers require
+                        # conversations to end with a user message
+                        dict(role="user", content=""),
+                    ],
+                    tokeniser=tokeniser,
+                    tokenize=True,
                 )
-                for token in token_list
-            ]
-            for token_list in all_tokens
+            )
+            for label in local_labels
         ]

-        # Extract the first token of each label
-        first_tokens: list[str] = list()
-        for token_list, label in zip(all_tokens, local_labels):
-            matching_tokens = [
-                tok for tok in token_list if tok and label.startswith(tok)
-            ]
-            if not matching_tokens:
+    # Remove any non-alphabetic characters from the tokens
+    all_tokens = [
+        [
+            re.sub(
+                pattern=r"^[^a-zæøåüöä0-9]+|[^a-zæøåüöä0-9]+$",
+                repl="",
+                string=token.lower(),
+            )
+            for token in token_list
+        ]
+        for token_list in all_tokens
+    ]
+
+    # Extract the first token of each label
+    first_tokens: list[str] = list()
+    for token_list, label in zip(all_tokens, local_labels):
+        matching_tokens = [tok for tok in token_list if tok and label.startswith(tok)]
+        if not matching_tokens:
+            if log_metadata:
                 log_once(
                     f"No matching token found in token_list for label '{label}', so "
                     "we will not use logprobs with the model.",
                     level=logging.DEBUG,
                 )
-                return False
-            first_tokens.append(matching_tokens[0])
-
-        # Build a mapping from labels to the first token in each label if the first
-        # tokens are distinct
-        if len(first_tokens) == len(set(first_tokens)):
+            return False
+        first_tokens.append(matching_tokens[0])
+
+    # Build a mapping from labels to the first token in each label if the first
+    # tokens are distinct
+    if len(first_tokens) == len(set(first_tokens)):
+        mapping = {
+            label: first_token for label, first_token in zip(local_labels, first_tokens)
+        }
+        if log_metadata:
             log_once(
-                "We will use logprobs with the model since the first tokens of the "
-                "labels are distinct.",
+                "Using logprobs as evaluation strategy for the model, with the "
+                f"following mapping from labels to their first token: {mapping}.",
                 level=logging.DEBUG,
             )
-            return {
-                label: first_token
-                for label, first_token in zip(local_labels, first_tokens)
-            }
-        else:
+        return mapping
+    else:
+        if log_metadata:
             log_once(
                 "We will not use logprobs with the model since the first tokens of the "
                 "labels are not distinct. The first tokens for the labels "
                 f"{local_labels} are {first_tokens}"
             )
-            return False
+        return False

-    # Otherwise, we assume that the model should not output scores, to avoid potential
-    # evaluation errors. This will force the label extraction to rely on word edit
-    # distance instead of logprobs.
-    log_once(
-        "We will not use logprobs with the model, since the dataset does not have "
-        "labels.",
-        level=logging.DEBUG,
-    )
-    return False
+
+def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
+    """Check if a tokeniser has a chat template.
+
+    Args:
+        tokeniser:
+            The tokeniser.
+
+    Returns:
+        Whether the tokeniser has a chat template.
+    """
+    if hasattr(tokeniser, "chat_template"):
+        has_template = tokeniser.chat_template is not None
+        if has_template:
+            log_once(
+                "The tokeniser has a chat template, so assuming that the model is "
+                "instruction tuned.",
+                level=logging.DEBUG,
+            )
+        return has_template
+    elif isinstance(tokeniser, MistralCommonTokenizer):
+        log_once(
+            "The tokeniser is a Mistral tokeniser, so assuming that the model is "
+            "instruction tuned.",
+            level=logging.DEBUG,
+        )
+        return True
+    else:
+        log_once(
+            "We cannot find a chat template for the tokeniser, so assuming that the "
+            "model isn't instruction tuned.",
+            level=logging.DEBUG,
+        )
+        return False
+
+
+def apply_chat_template(
+    conversation: list[dict[str, str]],
+    tokeniser: "PreTrainedTokenizer",
+    tokenize: bool = False,
+    add_generation_prompt: bool = True,
+    **transformers_tokeniser_kwargs,
+) -> str | list[int]:
+    """Apply the chat template to a prompt.
+
+    Args:
+        conversation:
+            The conversation to apply the chat template to.
+        tokeniser:
+            The tokeniser.
+        tokenize:
+            Whether to tokenize the resulting prompt, returning a list of token IDs
+            instead of a string.
+        add_generation_prompt:
+            Whether to add a generation prompt at the end of the conversation. This is
+            only relevant for regular Hugging Face tokenisers, as Mistral tokenisers
+            always add a generation prompt.
+        **transformers_tokeniser_kwargs:
+            Additional keyword arguments to pass to the tokeniser, in case the
+            tokeniser is a regular Hugging Face tokeniser.
+
+    Returns:
+        The prompt with the chat template applied, either as a string or a list of
+        token IDs, depending on the value of `tokenize`.
+
+    Raises:
+        InvalidModel:
+            If the tokeniser does not have a chat template.
+    """
+    if not has_chat_template(tokeniser=tokeniser):
+        raise InvalidModel(
+            "The tokeniser does not have a chat template, so cannot apply it."
+        )
+    elif isinstance(tokeniser, MistralCommonTokenizer):
+        templated_prompt = tokeniser.apply_chat_template(
+            conversation=conversation, tokenize=tokenize
+        )
+    else:
+        templated_prompt = tokeniser.apply_chat_template(
+            conversation=conversation,
+            add_generation_prompt=add_generation_prompt,
+            tokenize=tokenize,
+            **transformers_tokeniser_kwargs,
+        )
+    return templated_prompt
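
A usage sketch of the two new helpers, assuming EuroEval 16.0.0 is installed; the model ID is an example only. The wrapper dispatches on the tokeniser type, so callers no longer branch on chat_template themselves:

from transformers import AutoTokenizer

from euroeval.tokenization_utils import apply_chat_template, has_chat_template

tokeniser = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

# Guard with has_chat_template, since apply_chat_template raises InvalidModel
# for tokenisers without a chat template.
if has_chat_template(tokeniser=tokeniser):
    prompt = apply_chat_template(
        conversation=[dict(role="user", content="Hello!")],
        tokeniser=tokeniser,
        tokenize=False,
        add_generation_prompt=True,
    )
    print(prompt)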