EuroEval 15.5.0-py3-none-any.whl → 15.6.0-py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (53)
  1. euroeval/benchmark_modules/base.py +3 -2
  2. euroeval/benchmark_modules/fresh.py +8 -6
  3. euroeval/benchmark_modules/hf.py +33 -31
  4. euroeval/benchmark_modules/litellm.py +120 -56
  5. euroeval/benchmark_modules/vllm.py +41 -26
  6. euroeval/benchmarker.py +23 -21
  7. euroeval/callbacks.py +2 -2
  8. euroeval/constants.py +1 -1
  9. euroeval/data_models.py +257 -42
  10. euroeval/dataset_configs/__init__.py +61 -0
  11. euroeval/dataset_configs/danish.py +120 -0
  12. euroeval/dataset_configs/dutch.py +123 -0
  13. euroeval/dataset_configs/english.py +88 -0
  14. euroeval/dataset_configs/faroese.py +53 -0
  15. euroeval/dataset_configs/french.py +83 -0
  16. euroeval/dataset_configs/german.py +91 -0
  17. euroeval/dataset_configs/icelandic.py +148 -0
  18. euroeval/dataset_configs/italian.py +81 -0
  19. euroeval/dataset_configs/norwegian.py +178 -0
  20. euroeval/dataset_configs/spanish.py +78 -0
  21. euroeval/dataset_configs/swedish.py +100 -0
  22. euroeval/exceptions.py +10 -10
  23. euroeval/finetuning.py +6 -10
  24. euroeval/generation.py +1 -0
  25. euroeval/human_evaluation.py +2 -2
  26. euroeval/languages.py +20 -13
  27. euroeval/model_cache.py +1 -1
  28. euroeval/model_loading.py +1 -12
  29. euroeval/prompt_templates/__init__.py +8 -0
  30. euroeval/prompt_templates/linguistic_acceptability.py +112 -0
  31. euroeval/prompt_templates/multiple_choice.py +97 -0
  32. euroeval/prompt_templates/named_entity_recognition.py +257 -0
  33. euroeval/prompt_templates/reading_comprehension.py +118 -0
  34. euroeval/prompt_templates/sentiment_classification.py +137 -0
  35. euroeval/prompt_templates/summarization.py +97 -0
  36. euroeval/speed_benchmark.py +1 -1
  37. euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
  38. euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
  39. euroeval/{task_utils → task_group_utils}/sequence_classification.py +1 -1
  40. euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
  41. euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
  42. euroeval/tasks.py +54 -0
  43. euroeval/tokenization_utils.py +343 -0
  44. euroeval/types.py +3 -1
  45. euroeval/utils.py +2 -347
  46. {euroeval-15.5.0.dist-info → euroeval-15.6.0.dist-info}/METADATA +30 -9
  47. euroeval-15.6.0.dist-info/RECORD +59 -0
  48. euroeval/dataset_configs.py +0 -2408
  49. euroeval-15.5.0.dist-info/RECORD +0 -40
  50. /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
  51. {euroeval-15.5.0.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
  52. {euroeval-15.5.0.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
  53. {euroeval-15.5.0.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
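
Two structural changes dominate this release: the monolithic euroeval/dataset_configs.py (item 48) is split into per-language modules under euroeval/dataset_configs/ (items 10-21), and the euroeval/task_utils package is renamed to euroeval/task_group_utils (items 37-41 and 50). Downstream code importing from the old package path would need to follow the rename; a minimal, hypothetical sketch of the adjustment (the usage is illustrative, only the module paths are taken from this diff):

# Before (15.5.0) -- hypothetical downstream import of the old package path:
# from euroeval.task_utils import question_answering
# After (15.6.0) -- the subpackage is now called task_group_utils:
from euroeval.task_group_utils import question_answering
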
euroeval/tokenization_utils.py ADDED
@@ -0,0 +1,343 @@
+"""Utility functions related to tokenization."""
+
+import logging
+import re
+import typing as t
+
+import torch
+
+from .constants import TASK_GROUPS_USING_LOGPROBS
+from .exceptions import InvalidModel
+from .utils import log_once
+
+if t.TYPE_CHECKING:
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+
+    from .data_models import DatasetConfig
+
+
+logger = logging.getLogger("euroeval")
+
+
+def get_special_token_metadata(tokenizer: "PreTrainedTokenizerBase") -> dict:
+    """Get the special token metadata for a tokenizer.
+
+    Args:
+        tokenizer:
+            The tokenizer.
+
+    Returns:
+        The special token metadata.
+    """
+    # Create some test input IDs, to check if the tokenizer is adding special tokens
+    test_input_ids = tokenizer("Test").input_ids
+
+    # Extract the CLS token IDs from the tokenizer, if it's using them
+    has_cls_token = True
+    if tokenizer.cls_token_id in test_input_ids:
+        cls_token_id = tokenizer.cls_token_id
+        cls_token = tokenizer.cls_token
+    elif tokenizer.bos_token_id in test_input_ids:
+        cls_token_id = tokenizer.bos_token_id
+        cls_token = tokenizer.bos_token
+    elif tokenizer.cls_token is not None:
+        cls_token_id = tokenizer.cls_token_id
+        cls_token = tokenizer.cls_token
+        has_cls_token = False
+    else:
+        cls_token_id = tokenizer.bos_token_id
+        cls_token = tokenizer.bos_token
+        has_cls_token = False
+
+    # Extract the SEP token IDs from the tokenizer, if it's using them
+    has_sep_token = True
+    if tokenizer.sep_token_id in test_input_ids:
+        sep_token = tokenizer.sep_token
+    elif tokenizer.eos_token_id in test_input_ids:
+        sep_token = tokenizer.eos_token
+    elif tokenizer.sep_token is not None:
+        sep_token = tokenizer.sep_token
+        has_sep_token = False
+    else:
+        sep_token = tokenizer.eos_token
+        has_sep_token = False
+
+    return dict(
+        cls_token_id=cls_token_id,
+        cls_token=cls_token,
+        sep_token=sep_token,
+        has_cls_token=has_cls_token,
+        has_sep_token=has_sep_token,
+    )
+
+
+def should_prompts_be_stripped(
+    labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
+) -> bool:
+    """Determine if we should strip the prompts for few-shot evaluation.
+
+    This is the case if the tokenizer needs to include the space as part of the label
+    token. The strategy is thus to tokenize a label with a preceding colon (as in the
+    prompts), i.e., ": positive", and check if the tokenization starts with the tokens
+    of ": ". If this is the case, then we should not strip the prompts, since the
+    tokenizer produces the whitespace token separately.
+
+    Args:
+        labels_to_be_generated:
+            The labels that are to be generated.
+        tokenizer:
+            The tokenizer used to tokenize the labels.
+
+    Returns:
+        Whether we should strip the prompts.
+    """
+    strip_prompts = True
+    for label in labels_to_be_generated:
+        colon_tokens = tokenizer(": ", add_special_tokens=False).input_ids
+        label_tokens = tokenizer(": " + label, add_special_tokens=False).input_ids
+
+        if isinstance(colon_tokens, torch.Tensor):
+            colon_tokens = list(colon_tokens.squeeze(0))
+        if isinstance(label_tokens, torch.Tensor):
+            label_tokens = list(label_tokens.squeeze(0))
+
+        label_tokens_start_with_colon_tokens = (
+            label_tokens[: len(colon_tokens)] == colon_tokens
+        )
+        if label_tokens_start_with_colon_tokens:
+            strip_prompts = False
+
+    return strip_prompts
+
+
+def should_prefix_space_be_added_to_labels(
+    labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
+) -> bool:
+    """Determine if we should add a prefix space to the labels.
+
+    This is the case if the prompts are stripped and the tokenizer doesn't
+    automatically add prefix whitespaces to the labels.
+
+    Args:
+        labels_to_be_generated:
+            The labels that are to be generated.
+        tokenizer:
+            The tokenizer used to tokenize the labels.
+
+    Returns:
+        Whether we should add a prefix space to the labels.
+    """
+    if not should_prompts_be_stripped(
+        labels_to_be_generated=labels_to_be_generated, tokenizer=tokenizer
+    ):
+        return False
+
+    whitespace_token = tokenizer.convert_ids_to_tokens(
+        ids=tokenizer(" ", add_special_tokens=False).input_ids[0]
+    )[0]
+
+    add_prefix_space = True
+    for label in labels_to_be_generated:
+        label_tokens = tokenizer(label, add_special_tokens=False).input_ids
+        if isinstance(label_tokens, torch.Tensor):
+            label_tokens = list(label_tokens.squeeze(0))
+        first_label_token: int = int(label_tokens[0])
+        first_character_of_label = tokenizer.convert_ids_to_tokens(first_label_token)[0]
+        has_prefix_space = first_character_of_label == whitespace_token
+        if has_prefix_space:
+            add_prefix_space = False
+            break
+
+    return add_prefix_space
+
+
+def get_bos_token(tokenizer: "PreTrainedTokenizer") -> tuple[str, int]:
+    """Get the beginning-of-sequence token from a tokenizer.
+
+    Args:
+        tokenizer:
+            The tokenizer.
+
+    Returns:
+        A pair (token, token_id) representing the beginning-of-sequence token and its
+        token ID.
+    """
+    if isinstance(tokenizer.bos_token, str) and isinstance(tokenizer.bos_token_id, int):
+        return tokenizer.bos_token, tokenizer.bos_token_id
+
+    vocab: dict[str, int] = tokenizer.get_vocab()
+
+    candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "[CLS]"]
+    for candidate_bos_token in candidate_bos_tokens:
+        if candidate_bos_token in vocab:
+            bos_token = candidate_bos_token
+            bos_token_id = vocab[bos_token]
+            break
+    else:
+        raise InvalidModel(
+            "The model does not have a beginning-of-sequence token. Please ensure that "
+            "this has been set in the tokenizer's configuration."
+        )
+
+    return bos_token, bos_token_id
+
+
+def get_eos_token(tokenizer: "PreTrainedTokenizer") -> tuple[str, int]:
+    """Get the end-of-sequence token from a tokenizer.
+
+    Args:
+        tokenizer:
+            The tokenizer.
+
+    Returns:
+        A pair (token, token_id) representing the end-of-sequence token and its token
+        ID.
+    """
+    if isinstance(tokenizer.eos_token, str) and isinstance(tokenizer.eos_token_id, int):
+        return tokenizer.eos_token, tokenizer.eos_token_id
+
+    vocab: dict[str, int] = tokenizer.get_vocab()
+
+    candidate_eos_tokens = ["</s>", "<|end_of_text|>", "[SEP]"]
+    for candidate_eos_token in candidate_eos_tokens:
+        if candidate_eos_token in vocab:
+            eos_token = candidate_eos_token
+            eos_token_id = vocab[eos_token]
+            break
+    else:
+        raise InvalidModel(
+            "The model does not have an end-of-sequence token. Please ensure that this "
+            "has been set in the tokenizer's configuration."
+        )
+
+    return eos_token, eos_token_id
+
+
+def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | None:
+    """Get the end-of-chat token IDs for chat models.
+
+    This is only relevant for tokenizers with a chat template.
+
+    Args:
+        tokenizer:
+            The tokenizer.
+
+    Returns:
+        The token IDs used to end chats, or None if the tokenizer does not have a chat
+        template.
+
+    Raises:
+        ValueError:
+            If the end-of-chat token could not be located.
+    """
+    if tokenizer.chat_template is None:
+        return None
+
+    user_message: dict[str, str] = dict(role="user", content="X")
+    token_ids: list[int] = tokenizer.apply_chat_template(conversation=[user_message])  # type: ignore[assignment]
+
+    for idx, token in enumerate(tokenizer.convert_ids_to_tokens(token_ids)):
+        token_id = tokenizer.convert_tokens_to_ids(token)
+        assert isinstance(token_id, int)
+        token = tokenizer.decode([token_id])
+        if "X" in token:
+            x_token_index = idx
+            break
+    else:
+        raise ValueError("Could not locate the end-of-chat token for the model.")
+
+    end_of_chat_tokens = token_ids[x_token_index + 1 :]
+    if len(end_of_chat_tokens) == 0:
+        return None
+    return end_of_chat_tokens
+
+
+def get_first_label_token_mapping(
+    dataset_config: "DatasetConfig", tokenizer: "PreTrainedTokenizer | None"
+) -> dict[str, str] | bool:
+    """Check if the model should output scores.
+
+    Args:
+        dataset_config:
+            The dataset configuration.
+        tokenizer:
+            The tokenizer, or None if not available.
+
+    Returns:
+        A mapping from labels to the first token in each label, or alternatively a
+        Boolean value indicating whether the model should output scores (if the
+        mapping is returned then the model will always output scores).
+    """
+    # If we do not have any tokenizer, then we cannot check if the model should output
+    # scores, and we just assume it should if the dataset supports it
+    output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
+    if tokenizer is None:
+        if output_scores:
+            log_once(
+                "The model will output scores, since the dataset supports it and no "
+                "tokenizer is available.",
+                level=logging.DEBUG,
+            )
+        else:
+            log_once(
+                "The model will not output scores, since the dataset does not support "
+                "it and no tokenizer is available.",
+                level=logging.DEBUG,
+            )
+        return output_scores
+
+    # If there are labels associated with the dataset, and the first token of each
+    # label is distinct, then we can safely use the logprobs
+    if output_scores and dataset_config.labels:
+        local_labels = [
+            dataset_config.prompt_label_mapping[label].strip()
+            for label in dataset_config.labels
+        ]
+
+        # Get the first token of each label, where we add a prefix space if needed
+        add_prefix_space = (
+            should_prefix_space_be_added_to_labels(
+                labels_to_be_generated=local_labels, tokenizer=tokenizer
+            )
+            and tokenizer.chat_template is None
+        )
+        first_tokens = [
+            tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)[0]
+            for label in local_labels
+        ]
+        first_tokens = [
+            re.sub(
+                pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", repl="", string=token.lower()
+            )
+            for token in first_tokens
+        ]
+
+        # Build a mapping from labels to the first token in each label if the first
+        # tokens are distinct
+        if len(first_tokens) == len(set(first_tokens)):
+            log_once(
+                "The model will output scores, since the first tokens of the labels "
+                "are distinct.",
+                level=logging.DEBUG,
+            )
+            return {
+                label: first_token
+                for label, first_token in zip(local_labels, first_tokens)
+            }
+        else:
+            log_once(
+                "The model will not output scores, since the first tokens of the "
+                "labels are not distinct. The first tokens for the labels "
+                f"{local_labels} are {first_tokens}"
+            )
+            return False
+
+    # Otherwise, we assume that the model should not output scores, to avoid potential
+    # evaluation errors. This will force the label extraction to rely on word edit
+    # distance instead of logprobs.
+    log_once(
+        "The model will not output scores, since the dataset does not have labels.",
+        level=logging.DEBUG,
+    )
+    return False
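
Since euroeval/tokenization_utils.py is new in this release, a minimal usage sketch may help orient readers. The snippet is illustrative only and not part of the diff; the model name is an arbitrary assumption, while the function names and signatures come from the file above:

from transformers import AutoTokenizer

from euroeval.tokenization_utils import (
    get_bos_token,
    get_eos_token,
    should_prompts_be_stripped,
)

# Any Hugging Face tokenizer works; "gpt2" is an arbitrary choice for illustration.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# BOS/EOS lookup returns the configured tokens directly, falling back to common
# candidates ("<s>", "<|begin_of_text|>", "[CLS]", ...) and raising InvalidModel
# if none exist in the vocabulary.
bos_token, bos_token_id = get_bos_token(tokenizer)
eos_token, eos_token_id = get_eos_token(tokenizer)

# Decide whether few-shot prompts should be stripped of trailing whitespace,
# based on whether the tokenizer emits the ": " prefix as separate tokens.
strip = should_prompts_be_stripped(
    labels_to_be_generated=["positive", "negative"], tokenizer=tokenizer
)
print(bos_token, bos_token_id, eos_token, eos_token_id, strip)
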
euroeval/types.py CHANGED
@@ -3,6 +3,7 @@
 import typing as t
 
 from numpy.typing import NDArray
+from transformers.trainer_utils import EvalPrediction
 
 if t.TYPE_CHECKING:
     from .data_models import GenerativeModelOutput
@@ -18,7 +19,8 @@ class ComputeMetricsFunction(t.Protocol):
 
     def __call__(
         self,
-        model_outputs_and_labels: tuple[
+        model_outputs_and_labels: EvalPrediction
+        | tuple[
            NDArray | list[str] | list[list[str]], NDArray | list[str] | list[list[str]]
        ],
    ) -> dict[str, float]:
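
The euroeval/types.py change above widens the ComputeMetricsFunction protocol so that metric callables accept Hugging Face's EvalPrediction objects in addition to plain (outputs, labels) tuples. A hypothetical conforming implementation, not taken from the package, could normalize the two cases like this:

import numpy as np
from transformers.trainer_utils import EvalPrediction

def exact_match(
    model_outputs_and_labels: EvalPrediction | tuple[list[str], list[str]],
) -> dict[str, float]:
    # EvalPrediction bundles predictions and label_ids; tuples carry them directly.
    if isinstance(model_outputs_and_labels, EvalPrediction):
        outputs = model_outputs_and_labels.predictions
        labels = model_outputs_and_labels.label_ids
    else:
        outputs, labels = model_outputs_and_labels
    # Fraction of predictions matching their labels exactly.
    score = float(np.mean([pred == gold for pred, gold in zip(outputs, labels)]))
    return {"exact_match": score}

# Both call styles satisfy the protocol:
print(exact_match((["a", "b"], ["a", "c"])))  # {'exact_match': 0.5}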