EuroEval 15.16.0__py3-none-any.whl → 16.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic. Click here for more details.
- euroeval/__init__.py +8 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +190 -110
- euroeval/benchmark_modules/vllm.py +199 -139
- euroeval/benchmarker.py +49 -22
- euroeval/cli.py +3 -3
- euroeval/constants.py +19 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +73 -23
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +35 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +6 -6
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +90 -20
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +276 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/model_cache.py +13 -1
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +19 -8
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +128 -42
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +59 -73
- euroeval/tasks.py +33 -6
- euroeval/tokenization_utils.py +294 -207
- euroeval/utils.py +150 -35
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
- euroeval-16.0.1.dist-info/RECORD +69 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -470
- euroeval-15.16.0.dist-info/RECORD +0 -63
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
euroeval/tokenization_utils.py
CHANGED
|
@@ -5,9 +5,10 @@ import re
|
|
|
5
5
|
import typing as t
|
|
6
6
|
|
|
7
7
|
import torch
|
|
8
|
+
from transformers import MistralCommonTokenizer
|
|
8
9
|
|
|
9
|
-
from .constants import TASK_GROUPS_USING_LOGPROBS
|
|
10
10
|
from .enums import GenerativeType
|
|
11
|
+
from .exceptions import InvalidModel
|
|
11
12
|
from .utils import log_once
|
|
12
13
|
|
|
13
14
|
if t.TYPE_CHECKING:
|
|
@@ -20,47 +21,47 @@ if t.TYPE_CHECKING:
|
|
|
20
21
|
logger = logging.getLogger("euroeval")
|
|
21
22
|
|
|
22
23
|
|
|
23
|
-
def get_special_token_metadata(
|
|
24
|
-
"""Get the special token metadata for a
|
|
24
|
+
def get_special_token_metadata(tokeniser: "PreTrainedTokenizerBase") -> dict:
|
|
25
|
+
"""Get the special token metadata for a tokeniser.
|
|
25
26
|
|
|
26
27
|
Args:
|
|
27
|
-
|
|
28
|
-
The
|
|
28
|
+
tokeniser:
|
|
29
|
+
The tokeniser.
|
|
29
30
|
|
|
30
31
|
Returns:
|
|
31
32
|
The special token metadata.
|
|
32
33
|
"""
|
|
33
|
-
# Create some test input IDs, to check if the
|
|
34
|
-
test_input_ids =
|
|
34
|
+
# Create some test input IDs, to check if the tokeniser is adding special tokens
|
|
35
|
+
test_input_ids = tokeniser("Test").input_ids
|
|
35
36
|
|
|
36
|
-
# Extract the CLS token IDs from the
|
|
37
|
+
# Extract the CLS token IDs from the tokeniser, if it's using them
|
|
37
38
|
has_cls_token = True
|
|
38
|
-
if
|
|
39
|
-
cls_token_id =
|
|
40
|
-
cls_token =
|
|
41
|
-
elif
|
|
42
|
-
cls_token_id =
|
|
43
|
-
cls_token =
|
|
44
|
-
elif
|
|
45
|
-
cls_token_id =
|
|
46
|
-
cls_token =
|
|
39
|
+
if tokeniser.cls_token_id in test_input_ids:
|
|
40
|
+
cls_token_id = tokeniser.cls_token_id
|
|
41
|
+
cls_token = tokeniser.cls_token
|
|
42
|
+
elif tokeniser.bos_token_id in test_input_ids:
|
|
43
|
+
cls_token_id = tokeniser.bos_token_id
|
|
44
|
+
cls_token = tokeniser.bos_token
|
|
45
|
+
elif tokeniser.cls_token is not None:
|
|
46
|
+
cls_token_id = tokeniser.cls_token_id
|
|
47
|
+
cls_token = tokeniser.cls_token
|
|
47
48
|
has_cls_token = False
|
|
48
49
|
else:
|
|
49
|
-
cls_token_id =
|
|
50
|
-
cls_token =
|
|
50
|
+
cls_token_id = tokeniser.bos_token_id
|
|
51
|
+
cls_token = tokeniser.bos_token
|
|
51
52
|
has_cls_token = False
|
|
52
53
|
|
|
53
|
-
# Extract the SEP token IDs from the
|
|
54
|
+
# Extract the SEP token IDs from the tokeniser, if it's using them
|
|
54
55
|
has_sep_token = True
|
|
55
|
-
if
|
|
56
|
-
sep_token =
|
|
57
|
-
elif
|
|
58
|
-
sep_token =
|
|
59
|
-
elif
|
|
60
|
-
sep_token =
|
|
56
|
+
if tokeniser.sep_token_id in test_input_ids:
|
|
57
|
+
sep_token = tokeniser.sep_token
|
|
58
|
+
elif tokeniser.eos_token_id in test_input_ids:
|
|
59
|
+
sep_token = tokeniser.eos_token
|
|
60
|
+
elif tokeniser.sep_token is not None:
|
|
61
|
+
sep_token = tokeniser.sep_token
|
|
61
62
|
has_sep_token = False
|
|
62
63
|
else:
|
|
63
|
-
sep_token =
|
|
64
|
+
sep_token = tokeniser.eos_token
|
|
64
65
|
has_sep_token = False
|
|
65
66
|
|
|
66
67
|
return dict(
|
|
@@ -73,29 +74,29 @@ def get_special_token_metadata(tokenizer: "PreTrainedTokenizerBase") -> dict:
|
|
|
73
74
|
|
|
74
75
|
|
|
75
76
|
def should_prompts_be_stripped(
|
|
76
|
-
labels_to_be_generated: list[str],
|
|
77
|
+
labels_to_be_generated: list[str], tokeniser: "PreTrainedTokenizer"
|
|
77
78
|
) -> bool:
|
|
78
79
|
"""Determine if we should strip the prompts for few-shot evaluation.
|
|
79
80
|
|
|
80
|
-
This is the case if the
|
|
81
|
+
This is the case if the tokeniser needs to include the space as part of the label
|
|
81
82
|
token. The strategy is thus to tokenize a label with a preceding colon (as in the
|
|
82
83
|
prompts), i.e., ": positive", and check if the tokenization starts with the tokens
|
|
83
84
|
of ": ". If this is the case, then we should not strip the prompts, since the
|
|
84
|
-
|
|
85
|
+
tokeniser produces the whitespace token separately.
|
|
85
86
|
|
|
86
87
|
Args:
|
|
87
88
|
labels_to_be_generated:
|
|
88
89
|
The labels that are to be generated.
|
|
89
|
-
|
|
90
|
-
The
|
|
90
|
+
tokeniser:
|
|
91
|
+
The tokeniser used to tokenize the labels.
|
|
91
92
|
|
|
92
93
|
Returns:
|
|
93
94
|
Whether we should strip the prompts.
|
|
94
95
|
"""
|
|
95
96
|
strip_prompts = True
|
|
96
97
|
for label in labels_to_be_generated:
|
|
97
|
-
colon_tokens =
|
|
98
|
-
label_tokens =
|
|
98
|
+
colon_tokens = tokeniser(": ", add_special_tokens=False).input_ids
|
|
99
|
+
label_tokens = tokeniser(": " + label, add_special_tokens=False).input_ids
|
|
99
100
|
|
|
100
101
|
if isinstance(colon_tokens, torch.Tensor):
|
|
101
102
|
colon_tokens = list(colon_tokens.squeeze(0))
|
|
@@ -112,38 +113,38 @@ def should_prompts_be_stripped(
|
|
|
112
113
|
|
|
113
114
|
|
|
114
115
|
def should_prefix_space_be_added_to_labels(
|
|
115
|
-
labels_to_be_generated: list[str],
|
|
116
|
+
labels_to_be_generated: list[str], tokeniser: "PreTrainedTokenizer"
|
|
116
117
|
) -> bool:
|
|
117
118
|
"""Determine if we should add a prefix space to the labels.
|
|
118
119
|
|
|
119
|
-
This is the case if the prompts are stripped and the
|
|
120
|
+
This is the case if the prompts are stripped and the tokeniser doesn't
|
|
120
121
|
automatically add prefix whitespaces to the labels.
|
|
121
122
|
|
|
122
123
|
Args:
|
|
123
124
|
labels_to_be_generated:
|
|
124
125
|
The labels that are to be generated.
|
|
125
|
-
|
|
126
|
-
The
|
|
126
|
+
tokeniser:
|
|
127
|
+
The tokeniser used to tokenize the labels.
|
|
127
128
|
|
|
128
129
|
Returns:
|
|
129
130
|
Whether we should add a prefix space to the labels.
|
|
130
131
|
"""
|
|
131
132
|
if not should_prompts_be_stripped(
|
|
132
|
-
labels_to_be_generated=labels_to_be_generated,
|
|
133
|
+
labels_to_be_generated=labels_to_be_generated, tokeniser=tokeniser
|
|
133
134
|
):
|
|
134
135
|
return False
|
|
135
136
|
|
|
136
|
-
whitespace_token =
|
|
137
|
-
ids=
|
|
137
|
+
whitespace_token = tokeniser.convert_ids_to_tokens(
|
|
138
|
+
ids=tokeniser(" ", add_special_tokens=False).input_ids[0]
|
|
138
139
|
)[0]
|
|
139
140
|
|
|
140
141
|
add_prefix_space = True
|
|
141
142
|
for label in labels_to_be_generated:
|
|
142
|
-
label_tokens =
|
|
143
|
+
label_tokens = tokeniser(label, add_special_tokens=False).input_ids
|
|
143
144
|
if isinstance(label_tokens, torch.Tensor):
|
|
144
145
|
label_tokens = list(label_tokens.squeeze(0))
|
|
145
146
|
first_label_token: int = int(label_tokens[0])
|
|
146
|
-
first_character_of_label =
|
|
147
|
+
first_character_of_label = tokeniser.convert_ids_to_tokens(first_label_token)[0]
|
|
147
148
|
has_prefix_space = first_character_of_label == whitespace_token
|
|
148
149
|
if has_prefix_space:
|
|
149
150
|
add_prefix_space = False
|
|
@@ -153,22 +154,22 @@ def should_prefix_space_be_added_to_labels(
|
|
|
153
154
|
|
|
154
155
|
|
|
155
156
|
def get_bos_token(
|
|
156
|
-
|
|
157
|
+
tokeniser: "PreTrainedTokenizer",
|
|
157
158
|
) -> tuple[str, int] | tuple[None, None]:
|
|
158
|
-
"""Get the beginning-of-sequence token from a
|
|
159
|
+
"""Get the beginning-of-sequence token from a tokeniser.
|
|
159
160
|
|
|
160
161
|
Args:
|
|
161
|
-
|
|
162
|
-
The
|
|
162
|
+
tokeniser:
|
|
163
|
+
The tokeniser.
|
|
163
164
|
|
|
164
165
|
Returns:
|
|
165
166
|
A pair (token, token_id) representing the beginning-of-sequence token and its
|
|
166
167
|
token ID, or (None, None) if no BOS token is found.
|
|
167
168
|
"""
|
|
168
|
-
if isinstance(
|
|
169
|
-
return
|
|
169
|
+
if isinstance(tokeniser.bos_token, str) and isinstance(tokeniser.bos_token_id, int):
|
|
170
|
+
return tokeniser.bos_token, tokeniser.bos_token_id
|
|
170
171
|
|
|
171
|
-
vocab: dict[str, int] =
|
|
172
|
+
vocab: dict[str, int] = tokeniser.get_vocab()
|
|
172
173
|
|
|
173
174
|
candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "<|startoftext|>", "[CLS]"]
|
|
174
175
|
for candidate_bos_token in candidate_bos_tokens:
|
|
@@ -179,7 +180,7 @@ def get_bos_token(
|
|
|
179
180
|
else:
|
|
180
181
|
log_once(
|
|
181
182
|
"The model does not have a beginning-of-sequence token. Please ensure that "
|
|
182
|
-
"this has been set in the
|
|
183
|
+
"this has been set in the tokeniser's configuration. Using no BOS token."
|
|
183
184
|
" This may lead to unexpected behavior in the model.",
|
|
184
185
|
level=logging.INFO,
|
|
185
186
|
)
|
|
@@ -194,22 +195,22 @@ def get_bos_token(
|
|
|
194
195
|
|
|
195
196
|
|
|
196
197
|
def get_eos_token(
|
|
197
|
-
|
|
198
|
+
tokeniser: "PreTrainedTokenizer",
|
|
198
199
|
) -> tuple[str, int] | tuple[None, None]:
|
|
199
|
-
"""Get the end-of-sequence token from a
|
|
200
|
+
"""Get the end-of-sequence token from a tokeniser.
|
|
200
201
|
|
|
201
202
|
Args:
|
|
202
|
-
|
|
203
|
-
The
|
|
203
|
+
tokeniser:
|
|
204
|
+
The tokeniser.
|
|
204
205
|
|
|
205
206
|
Returns:
|
|
206
207
|
A pair (token, token_id) representing the end-of-sequence token and its token
|
|
207
208
|
ID, or (None, None) if no EOS token is found.
|
|
208
209
|
"""
|
|
209
|
-
if isinstance(
|
|
210
|
-
return
|
|
210
|
+
if isinstance(tokeniser.eos_token, str) and isinstance(tokeniser.eos_token_id, int):
|
|
211
|
+
return tokeniser.eos_token, tokeniser.eos_token_id
|
|
211
212
|
|
|
212
|
-
vocab: dict[str, int] =
|
|
213
|
+
vocab: dict[str, int] = tokeniser.get_vocab()
|
|
213
214
|
|
|
214
215
|
candidate_eos_tokens = ["</s>", "<|end_of_text|>", "<|endoftext|>", "[SEP]"]
|
|
215
216
|
for candidate_eos_token in candidate_eos_tokens:
|
|
@@ -220,7 +221,7 @@ def get_eos_token(
|
|
|
220
221
|
else:
|
|
221
222
|
log_once(
|
|
222
223
|
"The model does not have an end-of-sequence token. Please ensure that this "
|
|
223
|
-
"has been set in the
|
|
224
|
+
"has been set in the tokeniser's configuration. Using no EOS token. This "
|
|
224
225
|
"may lead to unexpected behavior in the model.",
|
|
225
226
|
level=logging.INFO,
|
|
226
227
|
)
|
|
@@ -235,55 +236,55 @@ def get_eos_token(
|
|
|
235
236
|
|
|
236
237
|
|
|
237
238
|
def get_pad_token(
|
|
238
|
-
|
|
239
|
+
tokeniser: "PreTrainedTokenizer",
|
|
239
240
|
) -> tuple[str, int] | tuple[None, None]:
|
|
240
|
-
"""Get the padding token from a
|
|
241
|
+
"""Get the padding token from a tokeniser.
|
|
241
242
|
|
|
242
243
|
Args:
|
|
243
|
-
|
|
244
|
-
The
|
|
244
|
+
tokeniser:
|
|
245
|
+
The tokeniser.
|
|
245
246
|
|
|
246
247
|
Returns:
|
|
247
248
|
A pair (token, token_id) representing the padding token and its token ID, or
|
|
248
249
|
(None, None) if no padding token is found.
|
|
249
250
|
"""
|
|
250
|
-
# If the
|
|
251
|
-
if
|
|
252
|
-
assert isinstance(
|
|
253
|
-
"Expected
|
|
254
|
-
f"{type(
|
|
251
|
+
# If the tokeniser already has a padding token, return it
|
|
252
|
+
if tokeniser.pad_token is not None and tokeniser.pad_token_id is not None:
|
|
253
|
+
assert isinstance(tokeniser.pad_token, str), (
|
|
254
|
+
"Expected tokeniser.pad_token to be a string, but got "
|
|
255
|
+
f"{type(tokeniser.pad_token)}."
|
|
255
256
|
)
|
|
256
|
-
assert isinstance(
|
|
257
|
-
"Expected
|
|
258
|
-
f"{type(
|
|
257
|
+
assert isinstance(tokeniser.pad_token_id, int), (
|
|
258
|
+
"Expected tokeniser.pad_token_id to be an integer, but got "
|
|
259
|
+
f"{type(tokeniser.pad_token_id)}."
|
|
259
260
|
)
|
|
260
|
-
return (
|
|
261
|
+
return (tokeniser.pad_token, tokeniser.pad_token_id)
|
|
261
262
|
|
|
262
|
-
# If the
|
|
263
|
-
if
|
|
264
|
-
assert isinstance(
|
|
265
|
-
"Expected
|
|
266
|
-
f"{type(
|
|
263
|
+
# If the tokeniser has a BOS token, use it as the padding token
|
|
264
|
+
if tokeniser.bos_token is not None and tokeniser.bos_token_id is not None:
|
|
265
|
+
assert isinstance(tokeniser.bos_token, str), (
|
|
266
|
+
"Expected tokeniser.bos_token to be a string, but got "
|
|
267
|
+
f"{type(tokeniser.bos_token)}."
|
|
267
268
|
)
|
|
268
|
-
assert isinstance(
|
|
269
|
-
"Expected
|
|
270
|
-
f"{type(
|
|
269
|
+
assert isinstance(tokeniser.bos_token_id, int), (
|
|
270
|
+
"Expected tokeniser.bos_token_id to be an integer, but got "
|
|
271
|
+
f"{type(tokeniser.bos_token_id)}."
|
|
271
272
|
)
|
|
272
|
-
pad_token =
|
|
273
|
-
pad_token_id =
|
|
274
|
-
|
|
275
|
-
# If the
|
|
276
|
-
elif
|
|
277
|
-
assert isinstance(
|
|
278
|
-
"Expected
|
|
279
|
-
f"{type(
|
|
273
|
+
pad_token = tokeniser.bos_token
|
|
274
|
+
pad_token_id = tokeniser.bos_token_id
|
|
275
|
+
|
|
276
|
+
# If the tokeniser has an EOS token, use it as the padding token
|
|
277
|
+
elif tokeniser.eos_token is not None and tokeniser.eos_token_id is not None:
|
|
278
|
+
assert isinstance(tokeniser.eos_token, str), (
|
|
279
|
+
"Expected tokeniser.eos_token to be a string, but got "
|
|
280
|
+
f"{type(tokeniser.eos_token)}."
|
|
280
281
|
)
|
|
281
|
-
assert isinstance(
|
|
282
|
-
"Expected
|
|
283
|
-
f"{type(
|
|
282
|
+
assert isinstance(tokeniser.eos_token_id, int), (
|
|
283
|
+
"Expected tokeniser.eos_token_id to be an integer, but got "
|
|
284
|
+
f"{type(tokeniser.eos_token_id)}."
|
|
284
285
|
)
|
|
285
|
-
pad_token =
|
|
286
|
-
pad_token_id =
|
|
286
|
+
pad_token = tokeniser.eos_token
|
|
287
|
+
pad_token_id = tokeniser.eos_token_id
|
|
287
288
|
|
|
288
289
|
# Otherwise, try to find a candidate padding token in the vocabulary
|
|
289
290
|
else:
|
|
@@ -296,14 +297,14 @@ def get_pad_token(
|
|
|
296
297
|
]
|
|
297
298
|
pad_token_candidates.extend([c.upper() for c in pad_token_candidates])
|
|
298
299
|
for candidate in pad_token_candidates:
|
|
299
|
-
if candidate in
|
|
300
|
+
if candidate in tokeniser.get_vocab():
|
|
300
301
|
pad_token = candidate
|
|
301
|
-
pad_token_id =
|
|
302
|
+
pad_token_id = tokeniser.get_vocab()[candidate]
|
|
302
303
|
break
|
|
303
304
|
else:
|
|
304
305
|
log_once(
|
|
305
306
|
"Could not identify a padding token for the model. Please ensure that "
|
|
306
|
-
"this has been set in the
|
|
307
|
+
"this has been set in the tokeniser's configuration. Using no padding "
|
|
307
308
|
"token. This may lead to unexpected behavior in the model.",
|
|
308
309
|
level=logging.INFO,
|
|
309
310
|
)
|
|
@@ -317,50 +318,58 @@ def get_pad_token(
|
|
|
317
318
|
return pad_token, pad_token_id
|
|
318
319
|
|
|
319
320
|
|
|
320
|
-
def get_end_of_chat_token_ids(
|
|
321
|
+
def get_end_of_chat_token_ids(tokeniser: "PreTrainedTokenizer") -> list[int] | None:
|
|
321
322
|
"""Get the end token ID for chat models.
|
|
322
323
|
|
|
323
|
-
This is only relevant for
|
|
324
|
+
This is only relevant for tokenisers with a chat template.
|
|
324
325
|
|
|
325
326
|
Args:
|
|
326
|
-
|
|
327
|
-
The
|
|
327
|
+
tokeniser:
|
|
328
|
+
The tokeniser.
|
|
328
329
|
|
|
329
330
|
Returns:
|
|
330
|
-
The token IDs used to end chats, or None if the
|
|
331
|
-
template.
|
|
332
|
-
|
|
333
|
-
Raises:
|
|
334
|
-
ValueError:
|
|
335
|
-
If the end-of-chat token could not be located.
|
|
331
|
+
The token IDs used to end chats, or None if the tokeniser does not have a chat
|
|
332
|
+
template or if no end-of-chat token could be found.
|
|
336
333
|
"""
|
|
337
|
-
if
|
|
334
|
+
if not has_chat_template(tokeniser=tokeniser):
|
|
338
335
|
return None
|
|
339
336
|
|
|
340
337
|
user_message: dict[str, str] = dict(role="user", content="X")
|
|
341
|
-
token_ids
|
|
338
|
+
token_ids = apply_chat_template(
|
|
339
|
+
conversation=[user_message],
|
|
340
|
+
tokeniser=tokeniser,
|
|
341
|
+
tokenize=True,
|
|
342
|
+
add_generation_prompt=False,
|
|
343
|
+
)
|
|
344
|
+
assert isinstance(token_ids, list)
|
|
342
345
|
|
|
343
|
-
for idx, token in enumerate(
|
|
344
|
-
token_id = tokenizer.convert_tokens_to_ids(token)
|
|
345
|
-
assert isinstance(token_id, int)
|
|
346
|
-
token = tokenizer.decode([token_id])
|
|
346
|
+
for idx, token in enumerate(tokeniser.convert_ids_to_tokens(token_ids)):
|
|
347
347
|
if "X" in token:
|
|
348
348
|
x_token_index = idx
|
|
349
349
|
break
|
|
350
350
|
else:
|
|
351
|
-
|
|
351
|
+
logger.debug("Could not locate the end-of-chat token for the model.")
|
|
352
|
+
return None
|
|
352
353
|
|
|
353
354
|
end_of_chat_tokens = token_ids[x_token_index + 1 :]
|
|
354
355
|
if len(end_of_chat_tokens) == 0:
|
|
356
|
+
logger.debug("Could not locate the end-of-chat token for the model.")
|
|
355
357
|
return None
|
|
358
|
+
|
|
359
|
+
log_once(
|
|
360
|
+
f"Detected end-of-chat token IDs as {end_of_chat_tokens}, corresponding to "
|
|
361
|
+
f"tokens {tokeniser.convert_ids_to_tokens(end_of_chat_tokens)}.",
|
|
362
|
+
level=logging.DEBUG,
|
|
363
|
+
)
|
|
356
364
|
return end_of_chat_tokens
|
|
357
365
|
|
|
358
366
|
|
|
359
367
|
def get_first_label_token_mapping(
|
|
360
368
|
dataset_config: "DatasetConfig",
|
|
361
369
|
model_config: "ModelConfig",
|
|
362
|
-
|
|
370
|
+
tokeniser: "PreTrainedTokenizer | None",
|
|
363
371
|
generative_type: "GenerativeType | None",
|
|
372
|
+
log_metadata: bool,
|
|
364
373
|
) -> dict[str, str] | bool:
|
|
365
374
|
"""Check if the model should output scores.
|
|
366
375
|
|
|
@@ -369,130 +378,208 @@ def get_first_label_token_mapping(
|
|
|
369
378
|
The dataset configuration.
|
|
370
379
|
model_config:
|
|
371
380
|
The model configuration.
|
|
372
|
-
|
|
373
|
-
The
|
|
381
|
+
tokeniser:
|
|
382
|
+
The tokeniser, or None if not available.
|
|
374
383
|
generative_type:
|
|
375
384
|
The generative type, or None if not available.
|
|
385
|
+
log_metadata:
|
|
386
|
+
Whether to log metadata.
|
|
376
387
|
|
|
377
388
|
Returns:
|
|
378
389
|
A mapping from labels to the first token in each label, or alternatively a
|
|
379
390
|
Boolean value indicating whether the model should output scores (if the mapping
|
|
380
391
|
is outputted then the model will always output scores).
|
|
381
392
|
"""
|
|
382
|
-
if
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
393
|
+
if not (dataset_config.task.uses_logprobs and dataset_config.labels):
|
|
394
|
+
if log_metadata:
|
|
395
|
+
log_once(
|
|
396
|
+
"We will not use logprobs with the model, since the dataset does not "
|
|
397
|
+
"have labels.",
|
|
398
|
+
level=logging.DEBUG,
|
|
399
|
+
)
|
|
388
400
|
return False
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
# scores and we just assume it should if the dataset supports it
|
|
392
|
-
output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
|
|
393
|
-
if tokenizer is None:
|
|
394
|
-
if output_scores:
|
|
401
|
+
elif generative_type == GenerativeType.REASONING:
|
|
402
|
+
if log_metadata:
|
|
395
403
|
log_once(
|
|
396
|
-
f"
|
|
397
|
-
"
|
|
404
|
+
f"The model {model_config.model_id!r} is a reasoning model and "
|
|
405
|
+
"thus does not support logprobs, so we do not enable it.",
|
|
398
406
|
level=logging.DEBUG,
|
|
399
407
|
)
|
|
400
|
-
|
|
408
|
+
return False
|
|
409
|
+
elif tokeniser is None:
|
|
410
|
+
if log_metadata:
|
|
401
411
|
log_once(
|
|
402
|
-
f"We will
|
|
403
|
-
"since the dataset
|
|
412
|
+
f"We will use logprobs with the model {model_config.model_id!r} "
|
|
413
|
+
"since the dataset supports it and no tokeniser is available.",
|
|
404
414
|
level=logging.DEBUG,
|
|
405
415
|
)
|
|
406
|
-
return
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
416
|
+
return True
|
|
417
|
+
|
|
418
|
+
local_labels = [
|
|
419
|
+
dataset_config.prompt_label_mapping[label].strip()
|
|
420
|
+
for label in dataset_config.labels
|
|
421
|
+
]
|
|
422
|
+
|
|
423
|
+
# Tokenize some text containing each label, which we will use to extract the
|
|
424
|
+
# first token of each label
|
|
425
|
+
all_tokens: list[list[str]]
|
|
426
|
+
if not has_chat_template(tokeniser=tokeniser):
|
|
427
|
+
add_prefix_space = should_prefix_space_be_added_to_labels(
|
|
428
|
+
labels_to_be_generated=local_labels, tokeniser=tokeniser
|
|
429
|
+
)
|
|
430
|
+
all_tokens = [
|
|
431
|
+
tokeniser.tokenize(text=f" {label}" if add_prefix_space else label)
|
|
432
|
+
for label in local_labels
|
|
414
433
|
]
|
|
415
|
-
|
|
416
|
-
# Tokenize some text containing each label, which we will use to extract the
|
|
417
|
-
# first token of each label
|
|
418
|
-
all_tokens: list[list[str]]
|
|
419
|
-
if tokenizer.chat_template is None:
|
|
420
|
-
add_prefix_space = should_prefix_space_be_added_to_labels(
|
|
421
|
-
labels_to_be_generated=local_labels, tokenizer=tokenizer
|
|
422
|
-
)
|
|
423
|
-
all_tokens = [
|
|
424
|
-
tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)
|
|
425
|
-
for label in local_labels
|
|
426
|
-
]
|
|
427
|
-
else:
|
|
428
|
-
all_tokens = [
|
|
429
|
-
tokenizer.convert_ids_to_tokens(
|
|
430
|
-
ids=tokenizer.apply_chat_template(
|
|
431
|
-
conversation=[
|
|
432
|
-
dict(role="user", content=""),
|
|
433
|
-
dict(role="assistant", content=label),
|
|
434
|
-
],
|
|
435
|
-
add_generation_prompt=True,
|
|
436
|
-
tokenize=True,
|
|
437
|
-
)
|
|
438
|
-
)
|
|
439
|
-
for label in local_labels
|
|
440
|
-
]
|
|
441
|
-
|
|
442
|
-
# Remove any non-alphabetic characters from the tokens
|
|
434
|
+
else:
|
|
443
435
|
all_tokens = [
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
436
|
+
tokeniser.convert_ids_to_tokens(
|
|
437
|
+
ids=apply_chat_template(
|
|
438
|
+
conversation=[
|
|
439
|
+
dict(role="user", content=""),
|
|
440
|
+
dict(role="assistant", content=label),
|
|
441
|
+
# Adding extra user message as Mistral tokenisers require
|
|
442
|
+
# conversations to end with a user message
|
|
443
|
+
dict(role="user", content=""),
|
|
444
|
+
],
|
|
445
|
+
tokeniser=tokeniser,
|
|
446
|
+
tokenize=True,
|
|
449
447
|
)
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
for token_list in all_tokens
|
|
448
|
+
)
|
|
449
|
+
for label in local_labels
|
|
453
450
|
]
|
|
454
451
|
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
452
|
+
# Remove any non-alphabetic characters from the tokens
|
|
453
|
+
all_tokens = [
|
|
454
|
+
[
|
|
455
|
+
re.sub(
|
|
456
|
+
pattern=r"^[^a-zæøåüöä0-9]+|[^a-zæøåüöä0-9]+$",
|
|
457
|
+
repl="",
|
|
458
|
+
string=token.lower(),
|
|
459
|
+
)
|
|
460
|
+
for token in token_list
|
|
461
|
+
]
|
|
462
|
+
for token_list in all_tokens
|
|
463
|
+
]
|
|
464
|
+
|
|
465
|
+
# Extract the first token of each label
|
|
466
|
+
first_tokens: list[str] = list()
|
|
467
|
+
for token_list, label in zip(all_tokens, local_labels):
|
|
468
|
+
matching_tokens = [tok for tok in token_list if tok and label.startswith(tok)]
|
|
469
|
+
if not matching_tokens:
|
|
470
|
+
if log_metadata:
|
|
462
471
|
log_once(
|
|
463
472
|
f"No matching token found in token_list for label '{label}', so "
|
|
464
473
|
"we will not use logprobs with the model.",
|
|
465
474
|
level=logging.DEBUG,
|
|
466
475
|
)
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
476
|
+
return False
|
|
477
|
+
first_tokens.append(matching_tokens[0])
|
|
478
|
+
|
|
479
|
+
# Build a mapping from labels to the first token in each label if the first
|
|
480
|
+
# tokens are distinct
|
|
481
|
+
if len(first_tokens) == len(set(first_tokens)):
|
|
482
|
+
mapping = {
|
|
483
|
+
label: first_token for label, first_token in zip(local_labels, first_tokens)
|
|
484
|
+
}
|
|
485
|
+
if log_metadata:
|
|
473
486
|
log_once(
|
|
474
|
-
"
|
|
475
|
-
"labels
|
|
487
|
+
"Using logprobs as evaluation strategy for the model, with the "
|
|
488
|
+
f"following mapping from labels to their first token: {mapping}.",
|
|
476
489
|
level=logging.DEBUG,
|
|
477
490
|
)
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
}
|
|
482
|
-
else:
|
|
491
|
+
return mapping
|
|
492
|
+
else:
|
|
493
|
+
if log_metadata:
|
|
483
494
|
log_once(
|
|
484
495
|
"We will not use logprobs with the model since the first tokens of the "
|
|
485
496
|
"labels are not distinct. The first tokens for the labels "
|
|
486
497
|
f"{local_labels} are {first_tokens}"
|
|
487
498
|
)
|
|
488
|
-
|
|
499
|
+
return False
|
|
489
500
|
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
501
|
+
|
|
502
|
+
def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
|
|
503
|
+
"""Check if a tokeniser has a chat template.
|
|
504
|
+
|
|
505
|
+
Args:
|
|
506
|
+
tokeniser:
|
|
507
|
+
The tokeniser.
|
|
508
|
+
|
|
509
|
+
Returns:
|
|
510
|
+
Whether the tokeniser has a chat template.
|
|
511
|
+
"""
|
|
512
|
+
if hasattr(tokeniser, "chat_template"):
|
|
513
|
+
has_template = tokeniser.chat_template is not None
|
|
514
|
+
if has_template:
|
|
515
|
+
log_once(
|
|
516
|
+
"The tokeniser has a chat template, so assuming that the model is "
|
|
517
|
+
"instruction tuned.",
|
|
518
|
+
level=logging.DEBUG,
|
|
519
|
+
)
|
|
520
|
+
return has_template
|
|
521
|
+
elif isinstance(tokeniser, MistralCommonTokenizer):
|
|
522
|
+
log_once(
|
|
523
|
+
"The tokeniser is a Mistral tokeniser, so assuming that the model is "
|
|
524
|
+
"instruction tuned.",
|
|
525
|
+
level=logging.DEBUG,
|
|
526
|
+
)
|
|
527
|
+
return True
|
|
528
|
+
else:
|
|
529
|
+
log_once(
|
|
530
|
+
"We cannot find a chat template for the tokeniser, so assuming that the "
|
|
531
|
+
"model isn't instruction tuned.",
|
|
532
|
+
level=logging.DEBUG,
|
|
533
|
+
)
|
|
534
|
+
return False
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def apply_chat_template(
|
|
538
|
+
conversation: list[dict[str, str]],
|
|
539
|
+
tokeniser: "PreTrainedTokenizer",
|
|
540
|
+
tokenize: bool = False,
|
|
541
|
+
add_generation_prompt: bool = True,
|
|
542
|
+
**transformers_tokeniser_kwargs,
|
|
543
|
+
) -> str | list[int]:
|
|
544
|
+
"""Apply the chat template to a prompt.
|
|
545
|
+
|
|
546
|
+
Args:
|
|
547
|
+
conversation:
|
|
548
|
+
The conversation to apply the chat template to.
|
|
549
|
+
tokeniser:
|
|
550
|
+
The tokeniser.
|
|
551
|
+
tokenize:
|
|
552
|
+
Whether to tokenize the resulting prompt, returning a list of token IDs
|
|
553
|
+
instead of a string.
|
|
554
|
+
add_generation_prompt:
|
|
555
|
+
Whether to add a generation prompt at the end of the conversation. This is
|
|
556
|
+
only relevant for regular Hugging Face tokenisers, as Mistral tokenisers
|
|
557
|
+
always add a generation prompt.
|
|
558
|
+
**transformers_tokeniser_kwargs:
|
|
559
|
+
Additional keyword arguments to pass to the tokeniser, in case the tokeniser
|
|
560
|
+
is a regular Hugging Face tokeniser.
|
|
561
|
+
|
|
562
|
+
Returns:
|
|
563
|
+
The prompt with the chat template applied, either as a string or a list of
|
|
564
|
+
token IDs, depending on the value of `tokenize`.
|
|
565
|
+
|
|
566
|
+
Raises:
|
|
567
|
+
InvalidModel:
|
|
568
|
+
If the tokeniser does not have a chat template.
|
|
569
|
+
"""
|
|
570
|
+
if not has_chat_template(tokeniser=tokeniser):
|
|
571
|
+
raise InvalidModel(
|
|
572
|
+
"The tokeniser does not have a chat template, so cannot apply it."
|
|
573
|
+
)
|
|
574
|
+
elif isinstance(tokeniser, MistralCommonTokenizer):
|
|
575
|
+
templated_prompt = tokeniser.apply_chat_template(
|
|
576
|
+
conversation=conversation, tokenize=tokenize
|
|
577
|
+
)
|
|
578
|
+
else:
|
|
579
|
+
templated_prompt = tokeniser.apply_chat_template(
|
|
580
|
+
conversation=conversation,
|
|
581
|
+
add_generation_prompt=add_generation_prompt,
|
|
582
|
+
tokenize=tokenize,
|
|
583
|
+
**transformers_tokeniser_kwargs,
|
|
584
|
+
)
|
|
585
|
+
return templated_prompt
|