euroeval-15.12.0-py3-none-any.whl → euroeval-16.7.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. euroeval/__init__.py +32 -14
  2. euroeval/benchmark_config_factory.py +92 -180
  3. euroeval/benchmark_modules/base.py +49 -39
  4. euroeval/benchmark_modules/fresh.py +35 -21
  5. euroeval/benchmark_modules/hf.py +280 -244
  6. euroeval/benchmark_modules/litellm.py +752 -312
  7. euroeval/benchmark_modules/vllm.py +570 -268
  8. euroeval/benchmarker.py +651 -528
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +49 -38
  12. euroeval/constants.py +44 -25
  13. euroeval/data_loading.py +111 -55
  14. euroeval/data_models.py +490 -323
  15. euroeval/dataset_configs/__init__.py +26 -4
  16. euroeval/dataset_configs/bosnian.py +39 -0
  17. euroeval/dataset_configs/bulgarian.py +56 -0
  18. euroeval/dataset_configs/croatian.py +56 -0
  19. euroeval/dataset_configs/czech.py +75 -0
  20. euroeval/dataset_configs/danish.py +78 -50
  21. euroeval/dataset_configs/dutch.py +74 -44
  22. euroeval/dataset_configs/english.py +71 -36
  23. euroeval/dataset_configs/estonian.py +111 -0
  24. euroeval/dataset_configs/faroese.py +25 -18
  25. euroeval/dataset_configs/finnish.py +63 -26
  26. euroeval/dataset_configs/french.py +65 -32
  27. euroeval/dataset_configs/german.py +77 -36
  28. euroeval/dataset_configs/greek.py +64 -0
  29. euroeval/dataset_configs/icelandic.py +68 -57
  30. euroeval/dataset_configs/italian.py +68 -36
  31. euroeval/dataset_configs/latvian.py +87 -0
  32. euroeval/dataset_configs/lithuanian.py +64 -0
  33. euroeval/dataset_configs/norwegian.py +98 -72
  34. euroeval/dataset_configs/polish.py +96 -0
  35. euroeval/dataset_configs/portuguese.py +63 -40
  36. euroeval/dataset_configs/serbian.py +64 -0
  37. euroeval/dataset_configs/slovak.py +55 -0
  38. euroeval/dataset_configs/slovene.py +56 -0
  39. euroeval/dataset_configs/spanish.py +68 -34
  40. euroeval/dataset_configs/swedish.py +82 -41
  41. euroeval/dataset_configs/ukrainian.py +64 -0
  42. euroeval/enums.py +12 -6
  43. euroeval/exceptions.py +21 -1
  44. euroeval/finetuning.py +34 -26
  45. euroeval/generation.py +76 -41
  46. euroeval/generation_utils.py +169 -34
  47. euroeval/languages.py +1020 -188
  48. euroeval/logging_utils.py +268 -0
  49. euroeval/metrics/__init__.py +6 -0
  50. euroeval/metrics/base.py +85 -0
  51. euroeval/metrics/huggingface.py +216 -0
  52. euroeval/metrics/llm_as_a_judge.py +260 -0
  53. euroeval/metrics/pipeline.py +289 -0
  54. euroeval/metrics/speed.py +48 -0
  55. euroeval/model_cache.py +40 -21
  56. euroeval/model_config.py +4 -5
  57. euroeval/model_loading.py +3 -0
  58. euroeval/prompt_templates/__init__.py +2 -0
  59. euroeval/prompt_templates/classification.py +206 -0
  60. euroeval/prompt_templates/linguistic_acceptability.py +157 -22
  61. euroeval/prompt_templates/multiple_choice.py +159 -17
  62. euroeval/prompt_templates/named_entity_recognition.py +318 -21
  63. euroeval/prompt_templates/reading_comprehension.py +207 -16
  64. euroeval/prompt_templates/sentiment_classification.py +205 -22
  65. euroeval/prompt_templates/summarization.py +122 -22
  66. euroeval/prompt_templates/token_classification.py +279 -0
  67. euroeval/scores.py +20 -9
  68. euroeval/speed_benchmark.py +11 -12
  69. euroeval/task_group_utils/multiple_choice_classification.py +21 -12
  70. euroeval/task_group_utils/question_answering.py +101 -73
  71. euroeval/task_group_utils/sequence_classification.py +144 -61
  72. euroeval/task_group_utils/text_to_text.py +33 -12
  73. euroeval/task_group_utils/token_classification.py +86 -89
  74. euroeval/tasks.py +75 -16
  75. euroeval/tokenisation_utils.py +603 -0
  76. euroeval/types.py +17 -11
  77. euroeval/utils.py +332 -137
  78. euroeval-16.7.1.dist-info/METADATA +623 -0
  79. euroeval-16.7.1.dist-info/RECORD +84 -0
  80. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
  81. euroeval/human_evaluation.py +0 -737
  82. euroeval/metrics.py +0 -452
  83. euroeval/tokenization_utils.py +0 -498
  84. euroeval-15.12.0.dist-info/METADATA +0 -285
  85. euroeval-15.12.0.dist-info/RECORD +0 -63
  86. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
  87. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/tokenization_utils.py (deleted; a new euroeval/tokenisation_utils.py is added in 16.7.1)
@@ -1,498 +0,0 @@
- """Utility functions related to tokenization."""
-
- import logging
- import re
- import typing as t
-
- import torch
-
- from .constants import TASK_GROUPS_USING_LOGPROBS
- from .enums import GenerativeType
- from .utils import log_once
-
- if t.TYPE_CHECKING:
-     from transformers.tokenization_utils import PreTrainedTokenizer
-     from transformers.tokenization_utils_base import PreTrainedTokenizerBase
-
-     from .data_models import DatasetConfig, ModelConfig
-
-
- logger = logging.getLogger("euroeval")
-
-
- def get_special_token_metadata(tokenizer: "PreTrainedTokenizerBase") -> dict:
-     """Get the special token metadata for a tokenizer.
-
-     Args:
-         tokenizer:
-             The tokenizer.
-
-     Returns:
-         The special token metadata.
-     """
-     # Create some test input IDs, to check if the tokenizer is adding special tokens
-     test_input_ids = tokenizer("Test").input_ids
-
-     # Extract the CLS token IDs from the tokenizer, if it's using them
-     has_cls_token = True
-     if tokenizer.cls_token_id in test_input_ids:
-         cls_token_id = tokenizer.cls_token_id
-         cls_token = tokenizer.cls_token
-     elif tokenizer.bos_token_id in test_input_ids:
-         cls_token_id = tokenizer.bos_token_id
-         cls_token = tokenizer.bos_token
-     elif tokenizer.cls_token is not None:
-         cls_token_id = tokenizer.cls_token_id
-         cls_token = tokenizer.cls_token
-         has_cls_token = False
-     else:
-         cls_token_id = tokenizer.bos_token_id
-         cls_token = tokenizer.bos_token
-         has_cls_token = False
-
-     # Extract the SEP token IDs from the tokenizer, if it's using them
-     has_sep_token = True
-     if tokenizer.sep_token_id in test_input_ids:
-         sep_token = tokenizer.sep_token
-     elif tokenizer.eos_token_id in test_input_ids:
-         sep_token = tokenizer.eos_token
-     elif tokenizer.sep_token is not None:
-         sep_token = tokenizer.sep_token
-         has_sep_token = False
-     else:
-         sep_token = tokenizer.eos_token
-         has_sep_token = False
-
-     return dict(
-         cls_token_id=cls_token_id,
-         cls_token=cls_token,
-         sep_token=sep_token,
-         has_cls_token=has_cls_token,
-         has_sep_token=has_sep_token,
-     )
-
-
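Illustrative sketch (not part of the diff): calling the removed helper against EuroEval 15.12.0, where euroeval.tokenization_utils still exists. The BERT checkpoint is only an example.

from transformers import AutoTokenizer

from euroeval.tokenization_utils import get_special_token_metadata

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
metadata = get_special_token_metadata(tokenizer)

# BERT-style tokenizers wrap inputs in [CLS] ... [SEP], so both flags are True
print(metadata["has_cls_token"], metadata["cls_token"])  # True [CLS]
print(metadata["has_sep_token"], metadata["sep_token"])  # True [SEP]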
- def should_prompts_be_stripped(
-     labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
- ) -> bool:
-     """Determine if we should strip the prompts for few-shot evaluation.
-
-     This is the case if the tokenizer needs to include the space as part of the label
-     token. The strategy is thus to tokenize a label with a preceding colon (as in the
-     prompts), i.e., ": positive", and check if the tokenization starts with the tokens
-     of ": ". If this is the case, then we should not strip the prompts, since the
-     tokenizer produces the whitespace token separately.
-
-     Args:
-         labels_to_be_generated:
-             The labels that are to be generated.
-         tokenizer:
-             The tokenizer used to tokenize the labels.
-
-     Returns:
-         Whether we should strip the prompts.
-     """
-     strip_prompts = True
-     for label in labels_to_be_generated:
-         colon_tokens = tokenizer(": ", add_special_tokens=False).input_ids
-         label_tokens = tokenizer(": " + label, add_special_tokens=False).input_ids
-
-         if isinstance(colon_tokens, torch.Tensor):
-             colon_tokens = list(colon_tokens.squeeze(0))
-         if isinstance(label_tokens, torch.Tensor):
-             label_tokens = list(label_tokens.squeeze(0))
-
-         label_tokens_start_with_colon_tokens = (
-             label_tokens[: len(colon_tokens)] == colon_tokens
-         )
-         if label_tokens_start_with_colon_tokens:
-             strip_prompts = False
-
-     return strip_prompts
-
-
- def should_prefix_space_be_added_to_labels(
-     labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
- ) -> bool:
-     """Determine if we should add a prefix space to the labels.
-
-     This is the case if the prompts are stripped and the tokenizer doesn't
-     automatically add prefix whitespaces to the labels.
-
-     Args:
-         labels_to_be_generated:
-             The labels that are to be generated.
-         tokenizer:
-             The tokenizer used to tokenize the labels.
-
-     Returns:
-         Whether we should add a prefix space to the labels.
-     """
-     if not should_prompts_be_stripped(
-         labels_to_be_generated=labels_to_be_generated, tokenizer=tokenizer
-     ):
-         return False
-
-     whitespace_token = tokenizer.convert_ids_to_tokens(
-         ids=tokenizer(" ", add_special_tokens=False).input_ids[0]
-     )[0]
-
-     add_prefix_space = True
-     for label in labels_to_be_generated:
-         label_tokens = tokenizer(label, add_special_tokens=False).input_ids
-         if isinstance(label_tokens, torch.Tensor):
-             label_tokens = list(label_tokens.squeeze(0))
-         first_label_token: int = int(label_tokens[0])
-         first_character_of_label = tokenizer.convert_ids_to_tokens(first_label_token)[0]
-         has_prefix_space = first_character_of_label == whitespace_token
-         if has_prefix_space:
-             add_prefix_space = False
-             break
-
-     return add_prefix_space
-
-
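Illustrative sketch (not part of the diff) of the two whitespace heuristics above, against 15.12.0. GPT-2's BPE folds the leading space into the label token, so both checks are expected to return True; the model ID is only an example.

from transformers import AutoTokenizer

from euroeval.tokenization_utils import (
    should_prefix_space_be_added_to_labels,
    should_prompts_be_stripped,
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
labels = ["positive", "negative"]

# GPT-2 tokenizes ": positive" as [":", " positive"] rather than [":", " ", ...],
# so the trailing space should be stripped from the prompt...
print(should_prompts_be_stripped(labels, tokenizer))  # True
# ...and re-attached to each label as a prefix space instead
print(should_prefix_space_be_added_to_labels(labels, tokenizer))  # True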
- def get_bos_token(
-     tokenizer: "PreTrainedTokenizer",
- ) -> tuple[str, int] | tuple[None, None]:
-     """Get the beginning-of-sequence token from a tokenizer.
-
-     Args:
-         tokenizer:
-             The tokenizer.
-
-     Returns:
-         A pair (token, token_id) representing the beginning-of-sequence token and its
-         token ID, or (None, None) if no BOS token is found.
-     """
-     if isinstance(tokenizer.bos_token, str) and isinstance(tokenizer.bos_token_id, int):
-         return tokenizer.bos_token, tokenizer.bos_token_id
-
-     vocab: dict[str, int] = tokenizer.get_vocab()
-
-     candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "<|startoftext|>", "[CLS]"]
-     for candidate_bos_token in candidate_bos_tokens:
-         if candidate_bos_token in vocab:
-             bos_token = candidate_bos_token
-             bos_token_id = vocab[bos_token]
-             break
-     else:
-         log_once(
-             "The model does not have a beginning-of-sequence token. Please ensure that "
-             "this has been set in the tokenizer's configuration. Using no BOS token."
-             " This may lead to unexpected behavior in the model.",
-             level=logging.INFO,
-         )
-         return None, None
-
-     log_once(
-         f"Beginning-of-sequence token was not set, but detected it as {bos_token!r} "
-         f"with ID {bos_token_id}.",
-         level=logging.DEBUG,
-     )
-     return bos_token, bos_token_id
-
-
- def get_eos_token(
-     tokenizer: "PreTrainedTokenizer",
- ) -> tuple[str, int] | tuple[None, None]:
-     """Get the end-of-sequence token from a tokenizer.
-
-     Args:
-         tokenizer:
-             The tokenizer.
-
-     Returns:
-         A pair (token, token_id) representing the end-of-sequence token and its token
-         ID, or (None, None) if no EOS token is found.
-     """
-     if isinstance(tokenizer.eos_token, str) and isinstance(tokenizer.eos_token_id, int):
-         return tokenizer.eos_token, tokenizer.eos_token_id
-
-     vocab: dict[str, int] = tokenizer.get_vocab()
-
-     candidate_eos_tokens = ["</s>", "<|end_of_text|>", "<|endoftext|>", "[SEP]"]
-     for candidate_eos_token in candidate_eos_tokens:
-         if candidate_eos_token in vocab:
-             eos_token = candidate_eos_token
-             eos_token_id = vocab[eos_token]
-             break
-     else:
-         log_once(
-             "The model does not have an end-of-sequence token. Please ensure that this "
-             "has been set in the tokenizer's configuration. Using no EOS token. This "
-             "may lead to unexpected behavior in the model.",
-             level=logging.INFO,
-         )
-         return None, None
-
-     log_once(
-         f"End-of-sequence token was not set, but detected it as {eos_token!r} with "
-         f"ID {eos_token_id}.",
-         level=logging.DEBUG,
-     )
-     return eos_token, eos_token_id
-
-
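Illustrative sketch (not part of the diff) of the BOS/EOS lookups, against 15.12.0. GPT-2 sets both attributes, so the vocabulary fallback is never reached; the model ID is only an example.

from transformers import AutoTokenizer

from euroeval.tokenization_utils import get_bos_token, get_eos_token

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# GPT-2 uses <|endoftext|> (ID 50256) for both roles, so both calls return
# straight from the tokenizer's own attributes
print(get_bos_token(tokenizer))  # ('<|endoftext|>', 50256)
print(get_eos_token(tokenizer))  # ('<|endoftext|>', 50256)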
- def get_pad_token(
-     tokenizer: "PreTrainedTokenizer",
- ) -> tuple[str, int] | tuple[None, None]:
-     """Get the padding token from a tokenizer.
-
-     Args:
-         tokenizer:
-             The tokenizer.
-
-     Returns:
-         A pair (token, token_id) representing the padding token and its token ID, or
-         (None, None) if no padding token is found.
-     """
-     # If the tokenizer already has a padding token, return it
-     if tokenizer.pad_token is not None and tokenizer.pad_token_id is not None:
-         assert isinstance(tokenizer.pad_token, str), (
-             "Expected tokenizer.pad_token to be a string, but got "
-             f"{type(tokenizer.pad_token)}."
-         )
-         assert isinstance(tokenizer.pad_token_id, int), (
-             "Expected tokenizer.pad_token_id to be an integer, but got "
-             f"{type(tokenizer.pad_token_id)}."
-         )
-         return (tokenizer.pad_token, tokenizer.pad_token_id)
-
-     # If the tokenizer has a BOS token, use it as the padding token
-     if tokenizer.bos_token is not None and tokenizer.bos_token_id is not None:
-         assert isinstance(tokenizer.bos_token, str), (
-             "Expected tokenizer.bos_token to be a string, but got "
-             f"{type(tokenizer.bos_token)}."
-         )
-         assert isinstance(tokenizer.bos_token_id, int), (
-             "Expected tokenizer.bos_token_id to be an integer, but got "
-             f"{type(tokenizer.bos_token_id)}."
-         )
-         pad_token = tokenizer.bos_token
-         pad_token_id = tokenizer.bos_token_id
-
-     # If the tokenizer has an EOS token, use it as the padding token
-     elif tokenizer.eos_token is not None and tokenizer.eos_token_id is not None:
-         assert isinstance(tokenizer.eos_token, str), (
-             "Expected tokenizer.eos_token to be a string, but got "
-             f"{type(tokenizer.eos_token)}."
-         )
-         assert isinstance(tokenizer.eos_token_id, int), (
-             "Expected tokenizer.eos_token_id to be an integer, but got "
-             f"{type(tokenizer.eos_token_id)}."
-         )
-         pad_token = tokenizer.eos_token
-         pad_token_id = tokenizer.eos_token_id
-
-     # Otherwise, try to find a candidate padding token in the vocabulary
-     else:
-         pad_token_candidates = [
-             "<pad>",
-             "[pad]",
-             "<|endoftext|>",
-             "<|end▁of▁sentence|>",
-             "<|im_end|>",
-         ]
-         pad_token_candidates.extend([c.upper() for c in pad_token_candidates])
-         for candidate in pad_token_candidates:
-             if candidate in tokenizer.get_vocab():
-                 pad_token = candidate
-                 pad_token_id = tokenizer.get_vocab()[candidate]
-                 break
-         else:
-             log_once(
-                 "Could not identify a padding token for the model. Please ensure that "
-                 "this has been set in the tokenizer's configuration. Using no padding "
-                 "token. This may lead to unexpected behavior in the model.",
-                 level=logging.INFO,
-             )
-             return None, None
-
-     log_once(
-         f"Padding token was not set, but detected it as {pad_token!r} with ID "
-         f"{pad_token_id}.",
-         level=logging.DEBUG,
-     )
-     return pad_token, pad_token_id
-
-
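Illustrative sketch (not part of the diff) of the padding fallback chain, against 15.12.0: an explicit pad token wins, then BOS, then EOS, then a scan over well-known vocabulary entries. The model ID is only an example.

from transformers import AutoTokenizer

from euroeval.tokenization_utils import get_pad_token

tokenizer = AutoTokenizer.from_pretrained("gpt2")
assert tokenizer.pad_token is None  # GPT-2 ships without a padding token

# Falls through to the BOS branch and reuses <|endoftext|> for padding
pad_token, pad_token_id = get_pad_token(tokenizer)
print(pad_token, pad_token_id)  # <|endoftext|> 50256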
- def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | None:
-     """Get the end token IDs for chat models.
-
-     This is only relevant for tokenizers with a chat template.
-
-     Args:
-         tokenizer:
-             The tokenizer.
-
-     Returns:
-         The token IDs used to end chats, or None if the tokenizer does not have a chat
-         template.
-
-     Raises:
-         ValueError:
-             If the end-of-chat token could not be located.
-     """
-     if tokenizer.chat_template is None:
-         return None
-
-     user_message: dict[str, str] = dict(role="user", content="X")
-     token_ids: list[int] = tokenizer.apply_chat_template(conversation=[user_message])  # type: ignore[assignment]
-
-     for idx, token in enumerate(tokenizer.convert_ids_to_tokens(token_ids)):
-         token_id = tokenizer.convert_tokens_to_ids(token)
-         assert isinstance(token_id, int)
-         token = tokenizer.decode([token_id])
-         if "X" in token:
-             x_token_index = idx
-             break
-     else:
-         raise ValueError("Could not locate the end-of-chat token for the model.")
-
-     end_of_chat_tokens = token_ids[x_token_index + 1 :]
-     if len(end_of_chat_tokens) == 0:
-         return None
-     return end_of_chat_tokens
-
-
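Illustrative sketch (not part of the diff) of the end-of-chat extraction, against 15.12.0. It requires a tokenizer that ships a chat template; the Zephyr checkpoint is only an example, and the returned IDs depend entirely on that template.

from transformers import AutoTokenizer

from euroeval.tokenization_utils import get_end_of_chat_token_ids

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

# The helper renders a one-message chat containing "X" and returns every token
# the template appends after it (typically an end-of-turn marker plus newline)
end_ids = get_end_of_chat_token_ids(tokenizer)
if end_ids is not None:
    print(end_ids, repr(tokenizer.decode(end_ids)))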
- def get_first_label_token_mapping(
-     dataset_config: "DatasetConfig",
-     model_config: "ModelConfig",
-     tokenizer: "PreTrainedTokenizer | None",
-     generative_type: "GenerativeType | None",
- ) -> dict[str, str] | bool:
-     """Check if the model should output scores.
-
-     Args:
-         dataset_config:
-             The dataset configuration.
-         model_config:
-             The model configuration.
-         tokenizer:
-             The tokenizer, or None if not available.
-         generative_type:
-             The generative type, or None if not available.
-
-     Returns:
-         A mapping from labels to the first token in each label, or alternatively a
-         Boolean value indicating whether the model should output scores (if the mapping
-         is outputted then the model will always output scores).
-     """
-     if generative_type == GenerativeType.REASONING:
-         log_once(
-             f"The model {model_config.model_id!r} is a reasoning model and "
-             "thus does not support logprobs, so we do not enable it.",
-             level=logging.DEBUG,
-         )
-         return False
-
-     # If we do not have any tokenizer, then we cannot check if the model should output
-     # scores and we just assume it should if the dataset supports it
-     output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
-     if tokenizer is None:
-         if output_scores:
-             log_once(
-                 f"We will use logprobs with the model {model_config.model_id!r} "
-                 "since the dataset supports it and no tokenizer is available.",
-                 level=logging.DEBUG,
-             )
-         else:
-             log_once(
-                 f"We will not use logprobs with the model {model_config.model_id!r} "
-                 "since the dataset does not support it and no tokenizer is available.",
-                 level=logging.DEBUG,
-             )
-         return output_scores
-
-     # If there are labels associated with the dataset and the first token of each
-     # label is distinct, then we can safely use the logprobs
-     if output_scores and dataset_config.labels:
-         local_labels = [
-             dataset_config.prompt_label_mapping[label].strip()
-             for label in dataset_config.labels
-         ]
-
-         # Tokenize some text containing each label, which we will use to extract the
-         # first token of each label
-         all_tokens: list[list[str]]
-         if tokenizer.chat_template is None:
-             add_prefix_space = should_prefix_space_be_added_to_labels(
-                 labels_to_be_generated=local_labels, tokenizer=tokenizer
-             )
-             all_tokens = [
-                 tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)
-                 for label in local_labels
-             ]
-         else:
-             all_tokens = [
-                 tokenizer.convert_ids_to_tokens(
-                     ids=tokenizer.apply_chat_template(
-                         conversation=[
-                             dict(role="user", content=""),
-                             dict(role="assistant", content=label),
-                         ],
-                         add_generation_prompt=True,
-                         tokenize=True,
-                     )
-                 )
-                 for label in local_labels
-             ]
-
-         # Remove any non-alphabetic characters from the tokens
-         all_tokens = [
-             [
-                 re.sub(
-                     pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
-                     repl="",
-                     string=token.lower(),
-                 )
-                 for token in token_list
-             ]
-             for token_list in all_tokens
-         ]
-
-         # Extract the first token of each label
-         first_tokens: list[str] = list()
-         for token_list, label in zip(all_tokens, local_labels):
-             matching_tokens = [
-                 tok for tok in token_list if tok and label.startswith(tok)
-             ]
-             if not matching_tokens:
-                 log_once(
-                     f"No matching token found in token_list for label '{label}', so "
-                     "we will not use logprobs with the model.",
-                     level=logging.DEBUG,
-                 )
-                 return False
-             first_tokens.append(matching_tokens[0])
-
-         # Build a mapping from labels to the first token in each label if the first
-         # tokens are distinct
-         if len(first_tokens) == len(set(first_tokens)):
-             log_once(
-                 "We will use logprobs with the model since the first tokens of the "
-                 "labels are distinct.",
-                 level=logging.DEBUG,
-             )
-             return {
-                 label: first_token
-                 for label, first_token in zip(local_labels, first_tokens)
-             }
-         else:
-             log_once(
-                 "We will not use logprobs with the model since the first tokens of the "
-                 "labels are not distinct. The first tokens for the labels "
-                 f"{local_labels} are {first_tokens}"
-             )
-             return False
-
-     # Otherwise, we assume that the model should not output scores, to avoid potential
-     # evaluation errors. This will force the label extraction to rely on word edit
-     # distance instead of logprobs.
-     log_once(
-         "We will not use logprobs with the model, since the dataset does not have "
-         "labels.",
-         level=logging.DEBUG,
-     )
-     return False
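Illustrative sketch (not part of the diff) of the logprob decision, against 15.12.0. Real callers pass EuroEval's DatasetConfig and ModelConfig; the SimpleNamespace stubs below are hypothetical stand-ins carrying only the attributes the helper reads.

from types import SimpleNamespace

from transformers import AutoTokenizer

from euroeval.constants import TASK_GROUPS_USING_LOGPROBS
from euroeval.tokenization_utils import get_first_label_token_mapping

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Hypothetical stand-ins mimicking the real config objects: a task group that
# supports logprobs plus an identity label-to-prompt-word mapping
labels = ["positive", "negative", "neutral"]
dataset_config = SimpleNamespace(
    task=SimpleNamespace(task_group=next(iter(TASK_GROUPS_USING_LOGPROBS))),
    labels=labels,
    prompt_label_mapping={label: label for label in labels},
)
model_config = SimpleNamespace(model_id="example/model")

result = get_first_label_token_mapping(
    dataset_config=dataset_config,  # type: ignore[arg-type]
    model_config=model_config,  # type: ignore[arg-type]
    tokenizer=tokenizer,
    generative_type=None,
)
print(result)  # a {label: first_token} dict if the first tokens are distinct, else False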