EuroEval 16.4.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registries. It is provided for informational purposes only.
- euroeval/__init__.py +6 -0
- euroeval/benchmark_config_factory.py +51 -46
- euroeval/benchmark_modules/base.py +6 -5
- euroeval/benchmark_modules/hf.py +2 -9
- euroeval/benchmark_modules/litellm.py +14 -12
- euroeval/benchmark_modules/vllm.py +17 -10
- euroeval/benchmarker.py +61 -44
- euroeval/caching_utils.py +1 -1
- euroeval/cli.py +86 -8
- euroeval/constants.py +3 -0
- euroeval/data_loading.py +78 -30
- euroeval/data_models.py +326 -326
- euroeval/dataset_configs/__init__.py +10 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +25 -29
- euroeval/dataset_configs/danish.py +51 -88
- euroeval/dataset_configs/dutch.py +48 -86
- euroeval/dataset_configs/english.py +45 -76
- euroeval/dataset_configs/estonian.py +36 -38
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -68
- euroeval/dataset_configs/french.py +39 -74
- euroeval/dataset_configs/german.py +45 -81
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -78
- euroeval/dataset_configs/latvian.py +28 -34
- euroeval/dataset_configs/lithuanian.py +22 -26
- euroeval/dataset_configs/norwegian.py +72 -114
- euroeval/dataset_configs/polish.py +33 -60
- euroeval/dataset_configs/portuguese.py +33 -65
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +19 -24
- euroeval/dataset_configs/spanish.py +42 -76
- euroeval/dataset_configs/swedish.py +48 -84
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +3 -2
- euroeval/generation.py +5 -4
- euroeval/generation_utils.py +6 -5
- euroeval/languages.py +395 -323
- euroeval/metrics/huggingface.py +14 -3
- euroeval/metrics/llm_as_a_judge.py +1 -1
- euroeval/model_cache.py +6 -5
- euroeval/model_loading.py +1 -1
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +82 -43
- euroeval/prompt_templates/multiple_choice.py +81 -41
- euroeval/prompt_templates/named_entity_recognition.py +125 -44
- euroeval/prompt_templates/reading_comprehension.py +92 -43
- euroeval/prompt_templates/sentiment_classification.py +91 -43
- euroeval/prompt_templates/summarization.py +64 -39
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +4 -3
- euroeval/speed_benchmark.py +2 -1
- euroeval/task_group_utils/multiple_choice_classification.py +2 -1
- euroeval/task_group_utils/question_answering.py +24 -13
- euroeval/task_group_utils/sequence_classification.py +5 -4
- euroeval/task_group_utils/text_to_text.py +2 -1
- euroeval/task_group_utils/token_classification.py +11 -8
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +19 -10
- euroeval/types.py +10 -9
- euroeval/utils.py +6 -3
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.4.0.dist-info/RECORD +0 -75
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/task_group_utils/question_answering.py
CHANGED

@@ -37,7 +37,7 @@ class QuestionAnsweringTrainer(Trainer):
         train_dataset: "Dataset",
         eval_dataset: "Dataset",
         compute_metrics: "c.Callable[[EvalPrediction], dict[str, float]]",
-        callbacks: "
+        callbacks: "c.Sequence[TrainerCallback]",
         data_collator: "c.Callable",
         **kwargs,
     ) -> None:

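The edit above is the pattern repeated throughout this release: concrete list[...] annotations are widened to collections.abc.Sequence (imported as c). A minimal sketch, using an illustrative helper that is not part of EuroEval, of what the wider parameter type buys:

import collections.abc as c


def first_label(labels: c.Sequence[str]) -> str:
    # Sequence covers any ordered, indexable container, so both lists and
    # tuples type-check, where list[str] would reject the tuple.
    return labels[0]


print(first_label(["positive", "negative"]))  # list: OK
print(first_label(("positive", "negative")))  # tuple: also OK
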
@@ -67,7 +67,7 @@ class QuestionAnsweringTrainer(Trainer):
         self,
         eval_dataset: "Dataset | None" = None,
         orig_eval_dataset: "Dataset | None" = None,
-        ignore_keys:
+        ignore_keys: c.Sequence[str] | None = None,
         metric_key_prefix: str = "eval",
     ) -> dict[str, float]:
         """Evaluate the model on the given dataset.

@@ -203,7 +203,7 @@ def compute_metrics(

 def extract_labels_from_generation(
     input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-) ->
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.

     Args:

@@ -265,8 +265,11 @@ def prepare_train_examples(
     max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
     stride = tokeniser.model_max_length // 4
-
-
+    stride = min(
+        stride,
+        tokeniser.model_max_length - stride - max_question_tokens - num_special_tokens,
+    )
+    stride = max(stride, 0)
     max_length = tokeniser.model_max_length - stride

     # Tokenise our examples with truncation and padding, but keep the overflows using a

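The new clamp guards against questions so long that the default quarter-window stride leaves no room for the context. A worked example with illustrative numbers (not taken from the package):

# Assume a 512-token model window, a 400-token question and 2 special tokens.
model_max_length = 512
max_question_tokens = 400
num_special_tokens = 2

stride = model_max_length // 4  # 128
# Clamp the stride so that stride + question + special tokens still fit;
# the min() can go negative for very long questions, hence the max() below.
stride = min(
    stride,
    model_max_length - stride - max_question_tokens - num_special_tokens,  # -18
)
stride = max(stride, 0)
print(stride)  # 0: overlapping windows are disabled rather than overflowing
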
@@ -335,9 +338,17 @@ def prepare_train_examples(
             end_char = start_char + len(answers["text"][0])

             # Start token index of the current span in the text.
-
-
-            token_start_index
+            try:
+                token_start_index = 0
+                while sequence_ids[token_start_index] != 1:
+                    token_start_index += 1
+
+            # If it turns out that we cannot find the context in the span, then we
+            # treat this as an impossible case
+            except IndexError:
+                tokenised_examples.start_positions.append(cls_index)
+                tokenised_examples.end_positions.append(cls_index)
+                continue

             # End token index of the current span in the text.
             token_end_index = len(input_ids) - 1

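The sequence_ids lookup that the new try/except wraps distinguishes question tokens from context tokens. A small sketch of the structure, assuming a fast Hugging Face tokeniser; the exact token split will vary by model:

from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("bert-base-cased")
enc = tokeniser("Who wrote it?", "The book was written by Jane.")
print(enc.sequence_ids())
# Something like [None, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, None]:
# None marks special tokens, 0 the question, 1 the context. If truncation
# removes every context token, the while-loop above runs off the end of the
# list, and the new IndexError handler records an impossible answer instead
# of crashing.
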
@@ -469,7 +480,7 @@ def postprocess_predictions_and_labels(
     dataset: "Dataset",
     prepared_dataset: "Dataset",
     cls_token_index: int,
-) -> tuple[
+) -> tuple[c.Sequence[dict], c.Sequence[dict]]:
     """Postprocess the predictions and labels, to allow easier metric computation.

     Args:

@@ -550,7 +561,7 @@ def find_best_answer(
     all_start_logits: np.ndarray,
     all_end_logits: np.ndarray,
     prepared_dataset: "Dataset",
-    feature_indices:
+    feature_indices: c.Sequence[int],
    context: str,
     max_answer_length: int,
     num_best_logits: int,

@@ -583,7 +594,7 @@ def find_best_answer(
         The best answer for the example.
     """
     # Loop through all the features associated to the current example
-    valid_answers = list()
+    valid_answers: list[dict] = list()
     for feature_index in feature_indices:
         # Get the features associated with the current example
         features = prepared_dataset[feature_index]

@@ -624,12 +635,12 @@ def find_valid_answers(
 def find_valid_answers(
     start_logits: np.ndarray,
     end_logits: np.ndarray,
-    offset_mapping:
+    offset_mapping: c.Sequence[tuple[int, int]],
     context: str,
     max_answer_length: int,
     num_best_logits: int,
     min_null_score: float,
-) ->
+) -> c.Sequence[dict]:
     """Find the valid answers from the start and end indexes.

     Args:

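The c.Sequence[tuple[int, int]] annotation on offset_mapping describes the per-token character spans that fast Hugging Face tokenisers can return. A quick illustration (the exact spans depend on the tokeniser):

from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("bert-base-cased")
enc = tokeniser("Jane wrote it.", return_offsets_mapping=True)
print(enc["offset_mapping"])
# Something like [(0, 0), (0, 4), (5, 10), (11, 13), (13, 14), (0, 0)]:
# each tuple is a (start, end) character span in the original text, and
# special tokens map to (0, 0). find_valid_answers uses these spans to turn
# start/end token indices back into answer substrings.
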
euroeval/task_group_utils/sequence_classification.py
CHANGED

@@ -1,5 +1,6 @@
 """Utility functions related to the sequence-classification task group."""

+import collections.abc as c
 import logging
 import re
 import typing as t

@@ -110,7 +111,7 @@ def extract_labels_from_generation(
     dataset_config: "DatasetConfig",
     model_config: "ModelConfig",
     first_label_token_mapping: dict[str, str] | bool,
-) ->
+) -> c.Sequence[str]:
     """Extract the predicted labels from the generated output.

     Args:

@@ -243,10 +244,10 @@ def extract_labels_from_generation(


 def get_closest_logprobs_labels(
-    generation_logprobs:
+    generation_logprobs: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]],
     first_label_token_mapping: dict[str, str] | t.Literal[True],
-    candidate_labels:
-) ->
+    candidate_labels: c.Sequence[c.Sequence[str]],
+) -> c.Sequence[str] | None:
     """Get the labels with the highest predicted logprob value.

     In case a candidate label is split into multiple tokens, we only use the first

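The triply nested annotation on generation_logprobs reads more easily with a toy value in front of you; the shape is documents -> generated token positions -> top-k (token, logprob) pairs:

# Illustrative structure only; real values come from the model backend.
generation_logprobs = [
    [  # first document
        [("pos", -0.1), ("neg", -2.3)],  # candidates for token position 0
    ],
]
token, logprob = generation_logprobs[0][0][0]
print(token, logprob)  # pos -0.1
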
euroeval/task_group_utils/text_to_text.py
CHANGED

@@ -1,5 +1,6 @@
 """Utility functions related to the text-to-text task group."""

+import collections.abc as c
 import logging
 import typing as t

@@ -131,7 +132,7 @@ def compute_metrics(

 def extract_labels_from_generation(
     input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-) ->
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.

     Args:

euroeval/task_group_utils/token_classification.py
CHANGED

@@ -1,5 +1,6 @@
 """Utility functions related to the token-classification task group."""

+import collections.abc as c
 import logging
 import typing as t
 from copy import deepcopy

@@ -59,7 +60,9 @@ def compute_metrics(

     predictions: list[list[str]]
     if not isinstance(model_outputs[0][0], str):
-        raw_predictions:
+        raw_predictions: c.Sequence[c.Sequence[int]] = np.argmax(
+            model_outputs, axis=-1
+        ).tolist()

         # Remove ignored index (special tokens)
         predictions = [

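The new annotation spells out what the argmax produces: per-token label IDs from a (batch, sequence_length, num_labels) logit array. A toy check:

import numpy as np

# One document, two tokens, two candidate labels.
model_outputs = np.array([[[0.1, 0.9], [0.8, 0.2]]])
raw_predictions = np.argmax(model_outputs, axis=-1).tolist()
print(raw_predictions)  # [[1, 0]]: label 1 for the first token, label 0 for the second
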
@@ -189,7 +192,7 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
-) ->
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.

     Args:

@@ -284,8 +287,8 @@ def tokenize_and_align_labels(
     # tokeniser is of a "fast" variant then this can be accessed through the
     # `word_ids` method. Otherwise, we have to extract it manually.
     all_labels: list[list[int]] = list()
-    labels:
-    word_ids:
+    labels: c.Sequence[str]
+    word_ids: c.Sequence[int | None]
     for i, labels in enumerate(examples["labels"]):
         # Try to get the word IDs from the tokeniser
         try:

@@ -295,10 +298,10 @@ def tokenize_and_align_labels(
         # IDs manually
         except ValueError:
             # Get the list of words in the document
-            words:
+            words: c.Sequence[str] = examples["tokens"][i]

             # Get the list of token IDs in the document
-            tok_ids:
+            tok_ids: c.Sequence[int] = tokenized_inputs.input_ids[i]

             # Decode the token IDs
             tokens = tokeniser.convert_ids_to_tokens(tok_ids)

@@ -391,8 +394,8 @@ def tokenize_and_align_labels(


 def handle_unk_tokens(
-    tokeniser: "PreTrainedTokenizer", tokens: list[str], words:
-) ->
+    tokeniser: "PreTrainedTokenizer", tokens: list[str], words: c.Sequence[str]
+) -> c.Sequence[str]:
     """Replace unknown tokens in the tokens with the corresponding word.

     Args:

euroeval/tasks.py
CHANGED
@@ -5,12 +5,14 @@ from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import Task
 from .enums import GenerativeType, ModelType, TaskGroup
 from .prompt_templates import (
+    CLASSIFICATION_TEMPLATES,
     LA_TEMPLATES,
     MULTIPLE_CHOICE_TEMPLATES,
     NER_TEMPLATES,
     RC_TEMPLATES,
     SENT_TEMPLATES,
     SUMM_TEMPLATES,
+    TOKEN_CLASSIFICATION_TEMPLATES,
 )

@@ -20,7 +22,11 @@ def get_all_tasks() -> dict[str, Task]:
     Returns:
         A mapping between names of dataset tasks and their configurations.
     """
-    return {
+    return {
+        cfg.name: cfg
+        for cfg in globals().values()
+        if isinstance(cfg, Task) and cfg != SPEED
+    }


 LA = Task(

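The rewritten get_all_tasks scans the module's own globals rather than maintaining a hand-written mapping, so any Task assigned at module level registers itself. A self-contained sketch of the pattern (toy module, not EuroEval):

import dataclasses


@dataclasses.dataclass(frozen=True)
class Task:
    name: str


SPEED = Task(name="speed")
SENT = Task(name="sentiment-classification")


def get_all_tasks() -> dict[str, Task]:
    # Pick up every module-level Task, excluding the special SPEED task.
    return {
        cfg.name: cfg
        for cfg in globals().values()
        if isinstance(cfg, Task) and cfg != SPEED
    }


print(get_all_tasks())  # {'sentiment-classification': Task(name='sentiment-classification')}

This is why the three new tasks added below need no further wiring.
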
@@ -159,3 +165,40 @@ SPEED = Task(
     default_max_generated_tokens=5,
     default_labels=[],
 )
+
+
+# Used for custom datasets
+
+TEXT_CLASSIFICATION = Task(
+    name="classification",
+    task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
+    template_dict=CLASSIFICATION_TEMPLATES,
+    metrics=[m.mcc_metric, m.macro_f1_metric],
+    default_num_few_shot_examples=12,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=None,
+    uses_logprobs=True,
+)
+
+TOKEN_CLASSIFICATION = Task(
+    name="token-classification",
+    task_group=TaskGroup.TOKEN_CLASSIFICATION,
+    template_dict=TOKEN_CLASSIFICATION_TEMPLATES,
+    metrics=[m.micro_f1_metric],
+    default_num_few_shot_examples=8,
+    default_max_generated_tokens=128,
+    default_labels=None,
+    uses_structured_output=True,
+)
+
+MULTIPLE_CHOICE = Task(
+    name="multiple-choice",
+    task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
+    metrics=[m.mcc_metric, m.accuracy_metric],
+    default_num_few_shot_examples=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=None,
+    default_allowed_model_types=[ModelType.GENERATIVE],
+    uses_logprobs=True,
+)

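Because of the globals() scan above, the three new custom-dataset tasks should be discoverable without any registration step. A quick check, assuming EuroEval 16.5.0 is installed:

from euroeval.tasks import get_all_tasks

print(sorted(get_all_tasks()))
# Expected to include "classification", "multiple-choice" and
# "token-classification" alongside the existing task names.
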
euroeval/tokenisation_utils.py
CHANGED
@@ -1,5 +1,6 @@
 """Utility functions related to tokenisation."""

+import collections.abc as c
 import logging
 import re
 import typing as t

@@ -71,7 +72,7 @@ def get_special_token_metadata(tokeniser: "PreTrainedTokenizerBase") -> dict:


 def should_prompts_be_stripped(
-    labels_to_be_generated:
+    labels_to_be_generated: c.Sequence[str], tokeniser: "PreTrainedTokenizer"
 ) -> bool:
     """Determine if we should strip the prompts for few-shot evaluation.

@@ -110,7 +111,7 @@ def should_prompts_be_stripped(


 def should_prefix_space_be_added_to_labels(
-    labels_to_be_generated:
+    labels_to_be_generated: c.Sequence[str], tokeniser: "PreTrainedTokenizer"
 ) -> bool:
     """Determine if we should add a prefix space to the labels.

@@ -317,7 +318,7 @@ def get_pad_token(

 def get_end_of_chat_token_ids(
     tokeniser: "PreTrainedTokenizer", generative_type: GenerativeType | None
-) ->
+) -> c.Sequence[int] | None:
     """Get the end token ID for chat models.

     This is only relevant for tokenisers with a chat template.

@@ -433,13 +434,19 @@ def get_first_label_token_mapping(

     # Tokenise some text containing each label, which we will use to extract the
     # first token of each label
-    all_tokens:
+    all_tokens: c.Sequence[c.Sequence[str]]
     if not has_chat_template(tokeniser=tokeniser):
         add_prefix_space = should_prefix_space_be_added_to_labels(
             labels_to_be_generated=local_labels, tokeniser=tokeniser
         )
         all_tokens = [
-
+            [
+                tokeniser.decode(token_id)
+                for token_id in tokeniser.encode(
+                    text=f" {label}" if add_prefix_space else label,
+                    add_special_tokens=False,
+                )
+            ]
             for label in local_labels
         ]
     else:

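The inlined list comprehension decodes each label back into token strings one ID at a time. A sketch of the round trip, assuming a Hugging Face tokeniser (GPT-2 here; the label "positive" is illustrative):

from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("gpt2")
label, add_prefix_space = "positive", True
tokens = [
    tokeniser.decode(token_id)
    for token_id in tokeniser.encode(
        f" {label}" if add_prefix_space else label, add_special_tokens=False
    )
]
print(tokens)  # e.g. [' positive']; the first entry feeds the label-token mapping
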
@@ -466,7 +473,7 @@ def get_first_label_token_mapping(
         all_tokens = [
             [
                 re.sub(
-                    pattern=r"^[^a-zæøåüöä0-9]+|[^a-zæøåüöä0-9]+$",
+                    pattern=r"^[^a-zæøåüöä0-9 ]+|[^a-zæøåüöä0-9 ]+$",
                     repl="",
                     string=token.lower(),
                 )

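The only change to the pattern is a space inside both character classes, so leading or trailing spaces in a decoded token now survive the strip. A before/after check:

import re

token = " positive"
old = re.sub(r"^[^a-zæøåüöä0-9]+|[^a-zæøåüöä0-9]+$", "", token.lower())
new = re.sub(r"^[^a-zæøåüöä0-9 ]+|[^a-zæøåüöä0-9 ]+$", "", token.lower())
print(repr(old))  # 'positive': the leading space was stripped
print(repr(new))  # ' positive': the leading space is kept
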
@@ -478,11 +485,13 @@ def get_first_label_token_mapping(
     # Extract the first token of each label
     first_tokens: list[str] = list()
     for token_list, label in zip(all_tokens, local_labels):
-        matching_tokens = [
+        matching_tokens = [
+            tok for tok in token_list if tok and label.startswith(tok.strip())
+        ]
         if not matching_tokens:
             if log_metadata:
                 log_once(
-                    f"No matching token found in token_list for label
+                    f"No matching token found in token_list for label {label!r}, so "
                     "we will not use logprobs with the model.",
                     level=logging.DEBUG,
                 )

@@ -549,12 +558,12 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:


 def apply_chat_template(
-    conversation:
+    conversation: c.Sequence[dict[str, str]],
     tokeniser: "PreTrainedTokenizer",
     tokenise: bool,
     add_generation_prompt: bool,
     **extra_kwargs,
-) -> str |
+) -> str | c.Sequence[int]:
     """Apply the chat template to a prompt.

     Args:

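The new str | c.Sequence[int] return type mirrors the underlying Hugging Face method, which returns a string when tokenize=False and token IDs when tokenize=True. A usage sketch of that underlying method (the model name is only an example of a tokeniser that ships a chat template):

from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
conversation = [{"role": "user", "content": "What is the capital of Denmark?"}]
text = tokeniser.apply_chat_template(
    conversation, tokenize=False, add_generation_prompt=True
)
print(type(text))  # <class 'str'>; with tokenize=True it would be a list of IDs
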
euroeval/types.py
CHANGED
@@ -1,5 +1,6 @@
 """Types used throughout the project."""

+import collections.abc as c
 import typing as t

 from transformers.trainer_utils import EvalPrediction

@@ -10,9 +11,9 @@ if t.TYPE_CHECKING:

     from .data_models import BenchmarkConfig, GenerativeModelOutput

-ScoreDict: t.TypeAlias = dict[str, dict[str, float] |
-Predictions: t.TypeAlias = "NDArray |
-Labels: t.TypeAlias = "NDArray |
+ScoreDict: t.TypeAlias = dict[str, dict[str, float] | c.Sequence[dict[str, float]]]
+Predictions: t.TypeAlias = "NDArray | c.Sequence[str] | c.Sequence[c.Sequence[str]]"
+Labels: t.TypeAlias = "NDArray | c.Sequence[str] | c.Sequence[c.Sequence[str]]"


 class ComputeMetricsFunction(t.Protocol):

@@ -22,8 +23,8 @@ class ComputeMetricsFunction(t.Protocol):
         self,
         model_outputs_and_labels: EvalPrediction
         | tuple[
-            "NDArray |
-            "NDArray |
+            "NDArray | c.Sequence[str] | c.Sequence[c.Sequence[str]]",
+            "NDArray | c.Sequence[str] | c.Sequence[c.Sequence[str]]",
         ],
         dataset: "Dataset",
         benchmark_config: "BenchmarkConfig",

@@ -48,7 +49,7 @@ class ExtractLabelsFunction(t.Protocol):

     def __call__(
         self, input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-    ) ->
+    ) -> c.Sequence[str]:
        """Extract the labels from the generated output.

         Args:

@@ -63,7 +64,7 @@ class ExtractLabelsFunction(t.Protocol):
         ...


-def is_list_of_int(x: object) -> t.TypeGuard[list[int]]:
+def is_list_of_int(x: object) -> t.TypeGuard[c.Sequence[int]]:
     """Check if an object is a list of integers.

     Args:

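TypeGuard means a type checker narrows the argument after a successful call. A minimal self-contained sketch of the narrowing, mirroring the signature above:

import collections.abc as c
import typing as t


def is_list_of_int(x: object) -> t.TypeGuard[c.Sequence[int]]:
    return isinstance(x, list) and all(isinstance(i, int) for i in x)


def total(x: object) -> int:
    if is_list_of_int(x):
        return sum(x)  # x is narrowed to c.Sequence[int] here
    raise TypeError("expected a list of integers")


print(total([1, 2, 3]))  # 6
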
@@ -76,7 +77,7 @@ def is_list_of_int(x: object) -> t.TypeGuard[list[int]]:
     return isinstance(x, list) and all(isinstance(i, int) for i in x)


-def is_list_of_list_of_int(x: object) -> t.TypeGuard[list[list[int]]]:
+def is_list_of_list_of_int(x: object) -> t.TypeGuard[c.Sequence[c.Sequence[int]]]:
     """Check if an object is a list of list of integers.

     Args:

@@ -93,7 +94,7 @@ def is_list_of_list_of_int(x: object) -> t.TypeGuard[list[list[int]]]:
     )


-def is_list_of_str(x: object) -> t.TypeGuard[
+def is_list_of_str(x: object) -> t.TypeGuard[c.Sequence[str]]:
     """Check if an object is a list of integers.

     Args:

euroeval/utils.py
CHANGED
@@ -1,6 +1,7 @@
 """Utility functions to be used in other scripts."""

 import asyncio
+import collections.abc as c
 import gc
 import importlib
 import importlib.metadata

@@ -142,7 +143,9 @@ def enforce_reproducibility(seed: int = 4242) -> np.random.Generator:
     return rng


-def get_class_by_name(
+def get_class_by_name(
+    class_name: str | c.Sequence[str], module_name: str
+) -> t.Type | None:
     """Get a class by its name.

     Args:

@@ -421,8 +424,8 @@ def get_hf_token(api_key: str | None) -> str | bool:


 def extract_multiple_choice_labels(
-    prompt: str, candidate_labels:
-) ->
+    prompt: str, candidate_labels: c.Sequence[str]
+) -> c.Sequence[str]:
     """Extract multiple choice labels from a prompt.

     Args: