EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +32 -14
- euroeval/benchmark_config_factory.py +92 -180
- euroeval/benchmark_modules/base.py +49 -39
- euroeval/benchmark_modules/fresh.py +35 -21
- euroeval/benchmark_modules/hf.py +280 -244
- euroeval/benchmark_modules/litellm.py +752 -312
- euroeval/benchmark_modules/vllm.py +570 -268
- euroeval/benchmarker.py +651 -528
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +49 -38
- euroeval/constants.py +44 -25
- euroeval/data_loading.py +111 -55
- euroeval/data_models.py +490 -323
- euroeval/dataset_configs/__init__.py +26 -4
- euroeval/dataset_configs/bosnian.py +39 -0
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/croatian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +78 -50
- euroeval/dataset_configs/dutch.py +74 -44
- euroeval/dataset_configs/english.py +71 -36
- euroeval/dataset_configs/estonian.py +111 -0
- euroeval/dataset_configs/faroese.py +25 -18
- euroeval/dataset_configs/finnish.py +63 -26
- euroeval/dataset_configs/french.py +65 -32
- euroeval/dataset_configs/german.py +77 -36
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +68 -57
- euroeval/dataset_configs/italian.py +68 -36
- euroeval/dataset_configs/latvian.py +87 -0
- euroeval/dataset_configs/lithuanian.py +64 -0
- euroeval/dataset_configs/norwegian.py +98 -72
- euroeval/dataset_configs/polish.py +96 -0
- euroeval/dataset_configs/portuguese.py +63 -40
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/slovene.py +56 -0
- euroeval/dataset_configs/spanish.py +68 -34
- euroeval/dataset_configs/swedish.py +82 -41
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/enums.py +12 -6
- euroeval/exceptions.py +21 -1
- euroeval/finetuning.py +34 -26
- euroeval/generation.py +76 -41
- euroeval/generation_utils.py +169 -34
- euroeval/languages.py +1020 -188
- euroeval/logging_utils.py +268 -0
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +85 -0
- euroeval/metrics/huggingface.py +216 -0
- euroeval/metrics/llm_as_a_judge.py +260 -0
- euroeval/metrics/pipeline.py +289 -0
- euroeval/metrics/speed.py +48 -0
- euroeval/model_cache.py +40 -21
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +157 -22
- euroeval/prompt_templates/multiple_choice.py +159 -17
- euroeval/prompt_templates/named_entity_recognition.py +318 -21
- euroeval/prompt_templates/reading_comprehension.py +207 -16
- euroeval/prompt_templates/sentiment_classification.py +205 -22
- euroeval/prompt_templates/summarization.py +122 -22
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +20 -9
- euroeval/speed_benchmark.py +11 -12
- euroeval/task_group_utils/multiple_choice_classification.py +21 -12
- euroeval/task_group_utils/question_answering.py +101 -73
- euroeval/task_group_utils/sequence_classification.py +144 -61
- euroeval/task_group_utils/text_to_text.py +33 -12
- euroeval/task_group_utils/token_classification.py +86 -89
- euroeval/tasks.py +75 -16
- euroeval/tokenisation_utils.py +603 -0
- euroeval/types.py +17 -11
- euroeval/utils.py +332 -137
- euroeval-16.7.1.dist-info/METADATA +623 -0
- euroeval-16.7.1.dist-info/RECORD +84 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -737
- euroeval/metrics.py +0 -452
- euroeval/tokenization_utils.py +0 -498
- euroeval-15.12.0.dist-info/METADATA +0 -285
- euroeval-15.12.0.dist-info/RECORD +0 -63
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/tokenisation_utils.py
CHANGED
@@ -0,0 +1,603 @@
+"""Utility functions related to tokenisation."""
+
+import collections.abc as c
+import logging
+import re
+import typing as t
+
+import torch
+from transformers.tokenization_mistral_common import MistralCommonTokenizer
+
+from .constants import BOS_TOKENS, EOS_TOKENS, PAD_TOKENS
+from .enums import GenerativeType
+from .exceptions import InvalidModel
+from .logging_utils import log, log_once
+
+if t.TYPE_CHECKING:
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+
+    from .data_models import DatasetConfig, ModelConfig
+
+
+def get_special_token_metadata(tokeniser: "PreTrainedTokenizerBase") -> dict:
+    """Get the special token metadata for a tokeniser.
+
+    Args:
+        tokeniser:
+            The tokeniser.
+
+    Returns:
+        The special token metadata.
+    """
+    # Create some test input IDs, to check if the tokeniser is adding special tokens
+    test_input_ids = tokeniser("Test").input_ids
+
+    # Extract the CLS token IDs from the tokeniser, if it's using them
+    has_cls_token = True
+    if tokeniser.cls_token_id in test_input_ids:
+        cls_token_id = tokeniser.cls_token_id
+        cls_token = tokeniser.cls_token
+    elif tokeniser.bos_token_id in test_input_ids:
+        cls_token_id = tokeniser.bos_token_id
+        cls_token = tokeniser.bos_token
+    elif tokeniser.cls_token is not None:
+        cls_token_id = tokeniser.cls_token_id
+        cls_token = tokeniser.cls_token
+        has_cls_token = False
+    else:
+        cls_token_id = tokeniser.bos_token_id
+        cls_token = tokeniser.bos_token
+        has_cls_token = False
+
+    # Extract the SEP token IDs from the tokeniser, if it's using them
+    has_sep_token = True
+    if tokeniser.sep_token_id in test_input_ids:
+        sep_token = tokeniser.sep_token
+    elif tokeniser.eos_token_id in test_input_ids:
+        sep_token = tokeniser.eos_token
+    elif tokeniser.sep_token is not None:
+        sep_token = tokeniser.sep_token
+        has_sep_token = False
+    else:
+        sep_token = tokeniser.eos_token
+        has_sep_token = False
+
+    return dict(
+        cls_token_id=cls_token_id,
+        cls_token=cls_token,
+        sep_token=sep_token,
+        has_cls_token=has_cls_token,
+        has_sep_token=has_sep_token,
+    )
+
+
+def should_prompts_be_stripped(
+    labels_to_be_generated: c.Sequence[str], tokeniser: "PreTrainedTokenizer"
+) -> bool:
+    """Determine if we should strip the prompts for few-shot evaluation.
+
+    This is the case if the tokeniser needs to include the space as part of the label
+    token. The strategy is thus to tokenise a label with a preceeding colon (as in the
+    prompts), i.e., ": positive", and check if the tokenisation starts with the tokens
+    of ": ". If this is the case, then we should not strip the prompts, since the
+    tokeniser produces the whitespace token separately.
+
+    Args:
+        labels_to_be_generated:
+            The labels that are to be generated.
+        tokeniser:
+            The tokeniser used to tokenise the labels.
+
+    Returns:
+        Whether we should strip the prompts.
+    """
+    strip_prompts = True
+    for label in labels_to_be_generated:
+        colon_tokens = tokeniser(": ", add_special_tokens=False).input_ids
+        label_tokens = tokeniser(": " + label, add_special_tokens=False).input_ids
+
+        if isinstance(colon_tokens, torch.Tensor):
+            colon_tokens = list(colon_tokens.squeeze(0))
+        if isinstance(label_tokens, torch.Tensor):
+            label_tokens = list(label_tokens.squeeze(0))
+
+        label_tokens_start_with_colon_tokens = (
+            label_tokens[: len(colon_tokens)] == colon_tokens
+        )
+        if label_tokens_start_with_colon_tokens:
+            strip_prompts = False
+
+    return strip_prompts
+
+
+def should_prefix_space_be_added_to_labels(
+    labels_to_be_generated: c.Sequence[str], tokeniser: "PreTrainedTokenizer"
+) -> bool:
+    """Determine if we should add a prefix space to the labels.
+
+    This is the case if the prompts are stripped and the tokeniser doesn't
+    automatically add prefix whitespaces to the labels.
+
+    Args:
+        labels_to_be_generated:
+            The labels that are to be generated.
+        tokeniser:
+            The tokeniser used to tokenise the labels.
+
+    Returns:
+        Whether we should add a prefix space to the labels.
+    """
+    if not should_prompts_be_stripped(
+        labels_to_be_generated=labels_to_be_generated, tokeniser=tokeniser
+    ):
+        return False
+
+    whitespace_token = tokeniser.convert_ids_to_tokens(
+        ids=tokeniser(" ", add_special_tokens=False).input_ids[0]
+    )[0]
+
+    add_prefix_space = True
+    for label in labels_to_be_generated:
+        label_tokens = tokeniser(label, add_special_tokens=False).input_ids
+        if isinstance(label_tokens, torch.Tensor):
+            label_tokens = list(label_tokens.squeeze(0))
+        first_label_token: int = int(label_tokens[0])
+        first_character_of_label = tokeniser.convert_ids_to_tokens(first_label_token)[0]
+        has_prefix_space = first_character_of_label == whitespace_token
+        if has_prefix_space:
+            add_prefix_space = False
+            break
+
+    return add_prefix_space
+
+
+def get_bos_token(
+    tokeniser: "PreTrainedTokenizer",
+) -> tuple[str, int] | tuple[None, None]:
+    """Get the beginning-of-sequence token from a tokeniser.
+
+    Args:
+        tokeniser:
+            The tokeniser.
+
+    Returns:
+        A pair (token, token_id) representing the beginning-of-sequence token and its
+        token ID, or (None, None) if no BOS token is found.
+    """
+    if isinstance(tokeniser.bos_token, str) and isinstance(tokeniser.bos_token_id, int):
+        return tokeniser.bos_token, tokeniser.bos_token_id
+
+    vocab: dict[str, int] = tokeniser.get_vocab()
+
+    for candidate_bos_token in BOS_TOKENS:
+        if candidate_bos_token in vocab:
+            bos_token = candidate_bos_token
+            bos_token_id = vocab[bos_token]
+            break
+    else:
+        log_once(
+            "The model does not have a beginning-of-sequence token. Please ensure that "
+            "this has been set in the tokeniser's configuration. Using no BOS token."
+            " This may lead to unexpected behavior in the model.",
+            level=logging.WARNING,
+        )
+        return None, None
+
+    log_once(
+        f"Beginning-of-sequence token was not set, but detected it as {bos_token!r} "
+        f"with ID {bos_token_id}.",
+        level=logging.DEBUG,
+    )
+    return bos_token, bos_token_id
+
+
+def get_eos_token(
+    tokeniser: "PreTrainedTokenizer",
+) -> tuple[str, int] | tuple[None, None]:
+    """Get the end-of-sequence token from a tokeniser.
+
+    Args:
+        tokeniser:
+            The tokeniser.
+
+    Returns:
+        A pair (token, token_id) representing the end-of-sequence token and its token
+        ID, or (None, None) if no EOS token is found.
+    """
+    if isinstance(tokeniser.eos_token, str) and isinstance(tokeniser.eos_token_id, int):
+        return tokeniser.eos_token, tokeniser.eos_token_id
+
+    vocab: dict[str, int] = tokeniser.get_vocab()
+
+    for candidate_eos_token in EOS_TOKENS:
+        if candidate_eos_token in vocab:
+            eos_token = candidate_eos_token
+            eos_token_id = vocab[eos_token]
+            break
+    else:
+        log_once(
+            "The model does not have an end-of-sequence token. Please ensure that this "
+            "has been set in the tokeniser's configuration. Using no EOS token. This "
+            "may lead to unexpected behavior in the model.",
+            level=logging.WARNING,
+        )
+        return None, None
+
+    log_once(
+        f"End-of-sequence token was not set, but detected it as {eos_token!r} with "
+        f"ID {eos_token_id}.",
+        level=logging.WARNING,
+    )
+    return eos_token, eos_token_id
+
+
+def get_pad_token(
+    tokeniser: "PreTrainedTokenizer",
+) -> tuple[str, int] | tuple[None, None]:
+    """Get the padding token from a tokeniser.
+
+    Args:
+        tokeniser:
+            The tokeniser.
+
+    Returns:
+        A pair (token, token_id) representing the padding token and its token ID, or
+        (None, None) if no padding token is found.
+    """
+    # If the tokeniser already has a padding token, return it
+    if tokeniser.pad_token is not None and tokeniser.pad_token_id is not None:
+        assert isinstance(tokeniser.pad_token, str), (
+            "Expected tokeniser.pad_token to be a string, but got "
+            f"{type(tokeniser.pad_token)}."
+        )
+        assert isinstance(tokeniser.pad_token_id, int), (
+            "Expected tokeniser.pad_token_id to be an integer, but got "
+            f"{type(tokeniser.pad_token_id)}."
+        )
+        return (tokeniser.pad_token, tokeniser.pad_token_id)
+
+    # If the tokeniser has a BOS token, use it as the padding token
+    if tokeniser.bos_token is not None and tokeniser.bos_token_id is not None:
+        assert isinstance(tokeniser.bos_token, str), (
+            "Expected tokeniser.bos_token to be a string, but got "
+            f"{type(tokeniser.bos_token)}."
+        )
+        assert isinstance(tokeniser.bos_token_id, int), (
+            "Expected tokeniser.bos_token_id to be an integer, but got "
+            f"{type(tokeniser.bos_token_id)}."
+        )
+        pad_token = tokeniser.bos_token
+        pad_token_id = tokeniser.bos_token_id
+
+    # If the tokeniser has an EOS token, use it as the padding token
+    elif tokeniser.eos_token is not None and tokeniser.eos_token_id is not None:
+        assert isinstance(tokeniser.eos_token, str), (
+            "Expected tokeniser.eos_token to be a string, but got "
+            f"{type(tokeniser.eos_token)}."
+        )
+        assert isinstance(tokeniser.eos_token_id, int), (
+            "Expected tokeniser.eos_token_id to be an integer, but got "
+            f"{type(tokeniser.eos_token_id)}."
+        )
+        pad_token = tokeniser.eos_token
+        pad_token_id = tokeniser.eos_token_id
+
+    # Otherwise, try to find a candidate padding token in the vocabulary
+    else:
+        for candidate in PAD_TOKENS:
+            if candidate in tokeniser.get_vocab():
+                pad_token = candidate
+                pad_token_id = tokeniser.get_vocab()[candidate]
+                break
+        else:
+            log_once(
+                "Could not identify a padding token for the model. Please ensure that "
+                "this has been set in the tokeniser's configuration. Using no padding "
+                "token. This may lead to unexpected behavior in the model.",
+                level=logging.WARNING,
+            )
+            return None, None
+
+    log_once(
+        f"Padding token was not set, but detected it as {pad_token!r} with ID "
+        f"{pad_token_id}.",
+        level=logging.DEBUG,
+    )
+    return pad_token, pad_token_id
+
+
+def get_end_of_chat_token_ids(
+    tokeniser: "PreTrainedTokenizer", generative_type: GenerativeType | None
+) -> c.Sequence[int] | None:
+    """Get the end token ID for chat models.
+
+    This is only relevant for tokenisers with a chat template.
+
+    Args:
+        tokeniser:
+            The tokeniser.
+        generative_type:
+            The generative type, or None if not available.
+
+    Returns:
+        The token IDs used to end chats, or None if the tokeniser does not have a chat
+        template or if no end-of-chat token could be found.
+    """
+    if generative_type == GenerativeType.BASE:
+        return None
+
+    user_message: dict[str, str] = dict(role="user", content="X")
+    try:
+        token_ids = apply_chat_template(
+            conversation=[user_message],
+            tokeniser=tokeniser,
+            tokenise=True,
+            add_generation_prompt=False,
+            enable_thinking=generative_type == GenerativeType.REASONING,
+        )
+    except InvalidModel as e:
+        if "does not have a chat template" in str(e):
+            return None
+        raise e
+    assert isinstance(token_ids, list)
+
+    for idx, token in enumerate(tokeniser.convert_ids_to_tokens(token_ids)):
+        if "X" in token:
+            x_token_index = idx
+            break
+    else:
+        log(
+            "Could not locate the end-of-chat token for the model.", level=logging.DEBUG
+        )
+        return None
+
+    end_of_chat_tokens = token_ids[x_token_index + 1 :]
+    if len(end_of_chat_tokens) == 0:
+        log(
+            "Could not locate the end-of-chat token for the model.", level=logging.DEBUG
+        )
+        return None
+
+    log_once(
+        f"Detected end-of-chat token IDs as {end_of_chat_tokens}, corresponding to "
+        f"tokens {tokeniser.convert_ids_to_tokens(end_of_chat_tokens)}.",
+        level=logging.DEBUG,
+    )
+    return end_of_chat_tokens
+
+
+def get_first_label_token_mapping(
+    dataset_config: "DatasetConfig",
+    model_config: "ModelConfig",
+    tokeniser: "PreTrainedTokenizer | None",
+    generative_type: "GenerativeType | None",
+    log_metadata: bool,
+) -> dict[str, str] | bool:
+    """Check if the model should output scores.
+
+    Args:
+        dataset_config:
+            The dataset configuration.
+        model_config:
+            The model configuration.
+        tokeniser:
+            The tokeniser, or None if not available.
+        generative_type:
+            The generative type, or None if not available.
+        log_metadata:
+            Whether to log metadata.
+
+    Returns:
+        A mapping from labels to the first token in each label, or alternatively a
+        Boolean value indicating whether the model should output scores (if the mapping
+        is outputted then the model will always output scores).
+    """
+    if not (dataset_config.task.uses_logprobs and dataset_config.labels):
+        if log_metadata:
+            log_once(
+                "We will not use logprobs with the model, since the dataset does not "
+                "have labels.",
+                level=logging.DEBUG,
+            )
+        return False
+    elif generative_type == GenerativeType.REASONING:
+        if log_metadata:
+            log_once(
+                f"The model {model_config.model_id!r} is a reasoning model and "
+                "thus does not support logprobs, so we do not enable it.",
+                level=logging.DEBUG,
+            )
+        return False
+    elif tokeniser is None:
+        if log_metadata:
+            log_once(
+                f"We will use logprobs with the model {model_config.model_id!r} "
+                "since the dataset supports it and no tokeniser is available.",
+                level=logging.DEBUG,
+            )
+        return True
+
+    local_labels = [
+        dataset_config.prompt_label_mapping[label].strip()
+        for label in dataset_config.labels
+    ]
+
+    # Tokenise some text containing each label, which we will use to extract the
+    # first token of each label
+    all_tokens: c.Sequence[c.Sequence[str]]
+    if not has_chat_template(tokeniser=tokeniser):
+        add_prefix_space = should_prefix_space_be_added_to_labels(
+            labels_to_be_generated=local_labels, tokeniser=tokeniser
+        )
+        all_tokens = [
+            [
+                tokeniser.decode(token_id)
+                for token_id in tokeniser.encode(
+                    text=f" {label}" if add_prefix_space else label,
+                    add_special_tokens=False,
+                )
+            ]
+            for label in local_labels
+        ]
+    else:
+        all_tokens = [
+            tokeniser.convert_ids_to_tokens(
+                ids=apply_chat_template(
+                    conversation=[
+                        dict(role="user", content=""),
+                        dict(role="assistant", content=label),
+                        # Adding extra user message as Mistral tokenisers require
+                        # conversations to end with a user message
+                        dict(role="user", content=""),
+                    ],
+                    tokeniser=tokeniser,
+                    tokenise=True,
+                    add_generation_prompt=True,
+                    enable_thinking=generative_type == GenerativeType.REASONING,
+                )
+            )
+            for label in local_labels
+        ]
+
+    # Remove any non-alphabetic characters from the tokens
+    all_tokens = [
+        [
+            re.sub(
+                pattern=r"^[^a-zæøåüöä0-9 ]+|[^a-zæøåüöä0-9 ]+$",
+                repl="",
+                string=token.lower(),
+            )
+            for token in token_list
+        ]
+        for token_list in all_tokens
+    ]
+
+    # Extract the first token of each label
+    first_tokens: list[str] = list()
+    for token_list, label in zip(all_tokens, local_labels):
+        matching_tokens = [
+            tok for tok in token_list if tok and label.startswith(tok.strip())
+        ]
+        if not matching_tokens:
+            if log_metadata:
+                log_once(
+                    f"No matching token found in token_list for label {label!r}, so "
+                    "we will not use logprobs with the model.",
+                    level=logging.DEBUG,
+                )
+            return False
+        first_tokens.append(matching_tokens[0])
+
+    # Build a mapping from labels to the first token in each label if the first
+    # tokens are distinct
+    if len(first_tokens) == len(set(first_tokens)):
+        mapping = {
+            label: first_token for label, first_token in zip(local_labels, first_tokens)
+        }
+        if log_metadata:
+            log_once(
+                "Using logprobs as evaluation strategy for the model, with the "
+                f"following mapping from labels to their first token: {mapping}.",
+                level=logging.DEBUG,
+            )
+        return mapping
+    else:
+        if log_metadata:
+            log_once(
+                "We will not use logprobs with the model since the first tokens of the "
+                "labels are not distinct. The first tokens for the labels "
+                f"{local_labels} are {first_tokens}",
+                level=logging.DEBUG,
+            )
+        return False
+
+
+def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
+    """Check if a tokeniser has a chat template.
+
+    Args:
+        tokeniser:
+            The tokeniser.
+
+    Returns:
+        Whether the tokeniser has a chat template.
+    """
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        log_once(
+            "The tokeniser is a Mistral tokeniser, so assuming that the model is "
+            "instruction tuned.",
+            level=logging.DEBUG,
+        )
+        return True
+    elif hasattr(tokeniser, "chat_template"):
+        has_template = tokeniser.chat_template is not None
+        if has_template:
+            log_once(
+                "The tokeniser has a chat template, so assuming that the model is "
+                "instruction tuned.",
+                level=logging.DEBUG,
+            )
+        return has_template
+    else:
+        log_once(
+            "We cannot find a chat template for the tokeniser, so assuming that the "
+            "model isn't instruction tuned.",
+            level=logging.DEBUG,
+        )
+        return False
+
+
+def apply_chat_template(
+    conversation: c.Sequence[dict[str, str]],
+    tokeniser: "PreTrainedTokenizer",
+    tokenise: bool,
+    add_generation_prompt: bool,
+    **extra_kwargs,
+) -> str | c.Sequence[int]:
+    """Apply the chat template to a prompt.
+
+    Args:
+        conversation:
+            The conversation to apply the chat template to.
+        tokeniser:
+            The tokeniser.
+        tokenise:
+            Whether to tokenise the resulting prompt, returning a list of token IDs
+            instead of a string.
+        add_generation_prompt:
+            Whether to add a generation prompt at the end of the conversation. This is
+            only relevant for regular Hugging Face tokenisers, as Mistral tokenisers
+            always add a generation prompt.
+        **extra_kwargs:
+            Extra keyword arguments to pass to the tokeniser's `apply_chat_template`
+            method. Only relevant for regular Hugging Face tokenisers.
+
+    Returns:
+        The prompt with the chat template applied, either as a string or a list of
+        token IDs, depending on the value of `tokenise`.
+
+    Raises:
+        InvalidModel:
+            If the tokeniser does not have a chat template.
+    """
+    # Ensure that the first user message is not empty, as this can cause issues with
+    # Jinja2
+    conversation[0]["content"] = conversation[0]["content"] or " "
+
+    if not has_chat_template(tokeniser=tokeniser):
+        raise InvalidModel(
+            "The tokeniser does not have a chat template, so cannot apply it."
+        )
+    elif isinstance(tokeniser, MistralCommonTokenizer):
+        templated_prompt = tokeniser.apply_chat_template(
+            conversation=conversation, tokenize=tokenise
+        )
+    else:
+        templated_prompt = tokeniser.apply_chat_template(
+            conversation=conversation,
+            add_generation_prompt=add_generation_prompt,
+            tokenize=tokenise,
+            **extra_kwargs,
+        )
+    return templated_prompt
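
Aside, not part of the diff: a minimal sketch of how a couple of the new helpers might be exercised with an off-the-shelf Hugging Face tokeniser. The gpt2 checkpoint and the example labels are arbitrary illustrative choices, not anything the package prescribes.

from transformers import AutoTokenizer

from euroeval.tokenisation_utils import (
    get_special_token_metadata,
    should_prompts_be_stripped,
)

# Any Hugging Face tokeniser works here; gpt2 is just a small, familiar example.
tokeniser = AutoTokenizer.from_pretrained("gpt2")

# Which CLS/SEP-style special tokens does this tokeniser actually emit?
metadata = get_special_token_metadata(tokeniser)
print(metadata["has_cls_token"], metadata["has_sep_token"])

# Check whether few-shot prompts should keep their trailing whitespace or leave
# it to the label tokens, which is what EuroEval uses this helper for.
print(
    should_prompts_be_stripped(
        labels_to_be_generated=["positive", "negative"], tokeniser=tokeniser
    )
)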
euroeval/types.py
CHANGED
@@ -1,18 +1,19 @@
 """Types used throughout the project."""
 
+import collections.abc as c
 import typing as t
 
 from transformers.trainer_utils import EvalPrediction
 
 if t.TYPE_CHECKING:
+    from datasets.arrow_dataset import Dataset
     from numpy.typing import NDArray
 
-    from .data_models import GenerativeModelOutput
+    from .data_models import BenchmarkConfig, GenerativeModelOutput
 
-
-
-Labels: t.TypeAlias = "NDArray | list[str] | list[list[str]]"
+ScoreDict: t.TypeAlias = dict[str, dict[str, float] | c.Sequence[dict[str, float]]]
+Predictions: t.TypeAlias = "NDArray | c.Sequence[str] | c.Sequence[c.Sequence[str]]"
+Labels: t.TypeAlias = "NDArray | c.Sequence[str] | c.Sequence[c.Sequence[str]]"
 
 
 class ComputeMetricsFunction(t.Protocol):
@@ -22,15 +23,20 @@ class ComputeMetricsFunction(t.Protocol):
         self,
         model_outputs_and_labels: EvalPrediction
         | tuple[
-            "NDArray |
-            "NDArray |
+            "NDArray | c.Sequence[str] | c.Sequence[c.Sequence[str]]",
+            "NDArray | c.Sequence[str] | c.Sequence[c.Sequence[str]]",
         ],
+        dataset: "Dataset",
+        benchmark_config: "BenchmarkConfig",
     ) -> dict[str, float]:
         """Compute the metrics.
 
         Args:
             model_outputs_and_labels:
                 The model outputs and labels.
+            dataset:
+                The dataset used for evaluation. This is only used in case any
+                additional metadata is used to compute the metrics.
 
         Returns:
             The computed metrics.
@@ -43,7 +49,7 @@ class ExtractLabelsFunction(t.Protocol):
 
     def __call__(
         self, input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-    ) ->
+    ) -> c.Sequence[str]:
         """Extract the labels from the generated output.
 
         Args:
@@ -58,7 +64,7 @@ class ExtractLabelsFunction(t.Protocol):
         ...
 
 
-def is_list_of_int(x: object) -> t.TypeGuard[
+def is_list_of_int(x: object) -> t.TypeGuard[c.Sequence[int]]:
    """Check if an object is a list of integers.
 
    Args:
@@ -71,7 +77,7 @@ def is_list_of_int(x: object) -> t.TypeGuard[list[int]]:
    return isinstance(x, list) and all(isinstance(i, int) for i in x)
 
 
-def is_list_of_list_of_int(x: object) -> t.TypeGuard[
+def is_list_of_list_of_int(x: object) -> t.TypeGuard[c.Sequence[c.Sequence[int]]]:
    """Check if an object is a list of list of integers.
 
    Args:
@@ -88,7 +94,7 @@ def is_list_of_list_of_int(x: object) -> t.TypeGuard[list[list[int]]]:
    )
 
 
-def is_list_of_str(x: object) -> t.TypeGuard[
+def is_list_of_str(x: object) -> t.TypeGuard[c.Sequence[str]]:
    """Check if an object is a list of integers.
 
    Args:
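
Aside, not part of the diff: the updated ComputeMetricsFunction protocol above now hands metric callables the evaluation dataset and the benchmark configuration in addition to the model outputs and labels. A hypothetical conforming callable (the metric itself and the (predictions, labels) unpacking are assumptions for illustration, not code from the package) could look like this:

import typing as t

if t.TYPE_CHECKING:
    from datasets.arrow_dataset import Dataset

    from euroeval.data_models import BenchmarkConfig


def exact_match_metric(
    model_outputs_and_labels: tuple[list[str], list[str]],
    dataset: "Dataset",
    benchmark_config: "BenchmarkConfig",
) -> dict[str, float]:
    # Assumes the outputs and labels arrive as a (predictions, labels) pair of strings.
    predictions, labels = model_outputs_and_labels
    n_correct = sum(pred == label for pred, label in zip(predictions, labels))
    return {"exact_match": n_correct / max(len(labels), 1)}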