EuroEval 15.7.0__py3-none-any.whl → 15.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/benchmark_modules/litellm.py +12 -253
- euroeval/benchmark_modules/vllm.py +13 -303
- euroeval/benchmarker.py +1 -0
- euroeval/data_models.py +3 -1
- euroeval/dataset_configs/dutch.py +5 -16
- euroeval/generation_utils.py +346 -0
- euroeval/scores.py +7 -1
- {euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/METADATA +1 -1
- {euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/RECORD +12 -11
- {euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/entry_points.txt +0 -0
- {euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/litellm.py
CHANGED

@@ -1,11 +1,8 @@
 """Generative models from an inference API, using the LiteLLM framework."""

 import collections.abc as c
-import itertools as it
-import json
 import logging
 import os
-import random
 import re
 import typing as t
 from functools import cached_property, partial
@@ -60,6 +57,7 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
+from ..generation_utils import apply_prompt, extract_few_shot_examples
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -943,14 +941,22 @@ class LiteLLMModel(BenchmarkModule):
         )

         if self.benchmark_config.few_shot:
-            few_shot_examples =
-                dataset=dataset,
+            few_shot_examples = extract_few_shot_examples(
+                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
             )
         else:
             few_shot_examples = list()

         dataset["test"] = dataset["test"].map(
-            partial(
+            partial(
+                apply_prompt,
+                few_shot_examples=few_shot_examples,
+                model_config=self.model_config,
+                dataset_config=self.dataset_config,
+                instruction_model=True,
+                always_populate_text_field=False,
+                tokenizer=None,
+            ),
             batched=True,
             load_from_cache_file=False,
             keep_in_memory=True,
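For orientation, the refactored call boils down to mapping the shared `apply_prompt` helper over the test split with `functools.partial`. A minimal sketch of that pattern is shown below; the wrapper function name `build_prompted_dataset` and the argument values are illustrative, not taken from the benchmark code.

from functools import partial

from datasets import Dataset

from euroeval.generation_utils import apply_prompt


def build_prompted_dataset(dataset: Dataset, model_config, dataset_config) -> Dataset:
    """Map the shared prompt helper over a test split (illustrative sketch only)."""
    return dataset.map(
        partial(
            apply_prompt,
            few_shot_examples=[],              # no few-shot examples in this sketch
            model_config=model_config,
            dataset_config=dataset_config,
            instruction_model=True,            # API models are treated as instruction-tuned
            always_populate_text_field=False,  # keep raw chat messages
            tokenizer=None,                    # no tokenizer needed when sending messages
        ),
        batched=True,
        load_from_cache_file=False,
    )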
@@ -958,253 +964,6 @@ class LiteLLMModel(BenchmarkModule):

         return dataset

-    def _extract_few_shot_examples(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> list[dict[str, t.Any]]:
-        """Extract few-shot examples from a dataset.
-
-        This will always extract the examples from the training split.
-
-        We ensure that the few-shot examples are unique by picking them one at a time.
-
-        Args:
-            dataset:
-                The dataset to extract the few-shot examples from.
-            task:
-                The task that is being benchmarked.
-            itr_idx:
-                The index of the dataset in the iterator.
-
-        Returns:
-            The few-shot examples.
-        """
-        random_seed = 4242 + itr_idx
-        num_few_shots = self.dataset_config.num_few_shot_examples
-        few_shot_examples: list[dict[str, t.Any]] = list()
-        shuffled_train = dataset["train"].shuffle(seed=random_seed)
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                labels = it.cycle(self.dataset_config.labels)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: x["label"].lower() == label.lower()
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TEXT_TO_TEXT:
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-                labels = it.cycle(
-                    [
-                        label.lower()
-                        for label in self.dataset_config.labels
-                        if label.lower().startswith("b-")
-                    ]
-                )
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: label in [tag.lower() for tag in x["labels"]]
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["tokens"] != example["tokens"]
-                    )
-
-            case TaskGroup.QUESTION_ANSWERING:
-                # Locate the maximum number of tokens that constitutes a short example
-                for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
-                    train_with_short_examples = dataset["train"].filter(
-                        lambda example: len(example["context"]) < max_num_tokens
-                    )
-                    num_short_examples = len(train_with_short_examples)
-                    if num_short_examples >= self.dataset_config.num_few_shot_examples:
-                        break
-                else:
-                    raise InvalidBenchmark(
-                        "Could not find enough short examples for few-shot learning."
-                    )
-
-                shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["context"] != example["context"]
-                    )
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        random.seed(random_seed)
-        random.shuffle(few_shot_examples)
-        return few_shot_examples
-
-    def _apply_prompt(
-        self,
-        examples: dict[str, t.Any],
-        few_shot_examples: list[dict[str, t.Any]],
-        task: Task,
-    ) -> dict[str, t.Any]:
-        """Apply prompt template to an example, potentially with few-shot examples.
-
-        Args:
-            examples:
-                The examples to apply the few-shot examples to.
-            few_shot_examples:
-                The few-shot examples to apply.
-            task:
-                The task that is being benchmarked.
-
-        Returns:
-            The example with the few-shot examples applied.
-        """
-
-        def create_prompt(**kwargs: str) -> tuple[str, str]:
-            """Create a prompt from the given keyword arguments.
-
-            Args:
-                kwargs:
-                    The keyword arguments to use in the prompt.
-
-            Returns:
-                A pair (prompt, label), where "label" is an empty string if the model is
-                not instruction tuned (as in this case it is included in the prompt).
-            """
-            label_key = "label" if "label" in kwargs else "target_text"
-            label = kwargs.pop(label_key)
-            label_mapping = self.dataset_config.prompt_label_mapping
-            label = label_mapping.get(label, label)
-            prompt = self.dataset_config.instruction_prompt.format(**kwargs)
-            return prompt, label
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        label=example["label"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), label="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TEXT_TO_TEXT:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        target_text=example["target_text"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), target_text="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-
-                def create_label(example: dict) -> str:
-                    prompt_labels = self.dataset_config.prompt_label_mapping.values()
-                    labels: dict[str, list[str]] = {
-                        prompt_label: list() for prompt_label in prompt_labels
-                    }
-                    for token, label in zip(example["tokens"], example["labels"]):
-                        label = label.lower()
-                        if label == "o":
-                            continue
-                        prompt_label = self.dataset_config.prompt_label_mapping[label]
-                        if label.startswith("b-"):
-                            labels[prompt_label].append(token)
-                        elif label.startswith("i-"):
-                            labels[prompt_label][-1] += " " + token
-                    return json.dumps(labels, ensure_ascii=False)
-
-                few_shot_sections = [
-                    create_prompt(
-                        text=" ".join(example["tokens"]).replace("\n", " ").strip(),
-                        label=create_label(example=example),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=" ".join(tokens).replace("\n", " ").strip(), label=""
-                    )
-                    for tokens in examples["tokens"]
-                ]
-
-            case TaskGroup.QUESTION_ANSWERING:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["context"].replace("\n", " ").strip(),
-                        question=example["question"].replace("\n", " ").strip(),
-                        label=example["answers"]["text"][0].replace("\n", " "),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=context.replace("\n", " ").strip(),
-                        question=question.replace("\n", " ").strip(),
-                        label="",
-                    )
-                    for context, question in zip(
-                        examples["context"], examples["question"]
-                    )
-                ]
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        few_shot_messages = [
-            dict(role=role, content=content)
-            for prompt, label in few_shot_sections
-            for role, content in [("user", prompt), ("assistant", label)]
-        ]
-
-        messages_list = [
-            few_shot_messages + [dict(role="user", content=prompt)]
-            for prompt, _ in new_sections
-        ]
-
-        examples["messages"] = messages_list
-        return examples
-

 def raise_if_wrong_params(
     model_config: ModelConfig, allowed_params: dict[str, list[str]]
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -3,11 +3,9 @@
 import collections.abc as c
 import contextlib
 import importlib.util
-import itertools as it
 import json
 import logging
 import os
-import random
 import re
 import sys
 import typing as t
@@ -56,6 +54,7 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
+from ..generation_utils import apply_prompt, extract_few_shot_examples
 from ..languages import get_all_languages
 from ..task_group_utils import (
     question_answering,
@@ -258,14 +257,22 @@ class VLLMModel(HuggingFaceEncoderModel):
         )

         if self.benchmark_config.few_shot:
-            few_shot_examples =
-                dataset=dataset,
+            few_shot_examples = extract_few_shot_examples(
+                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
             )
         else:
             few_shot_examples = list()

         dataset["test"] = dataset["test"].map(
-            partial(
+            partial(
+                apply_prompt,
+                few_shot_examples=few_shot_examples,
+                model_config=self.model_config,
+                dataset_config=self.dataset_config,
+                instruction_model=self.buffer["instruction_model"],
+                always_populate_text_field=True,
+                tokenizer=self._tokenizer,
+            ),
             batched=True,
             load_from_cache_file=False,
             keep_in_memory=True,
@@ -439,6 +446,7 @@ class VLLMModel(HuggingFaceEncoderModel):
             logger.info(
                 "Prompts are too long, so truncating them and trying again..."
             )
+            logger.debug(f"The error message was: {str(e)}")
             tokenized_prompts = self._tokenizer(
                 text=prompts,
                 truncation=True,
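The retry path shown here re-tokenizes the prompts with truncation enabled before generating again. A rough, self-contained sketch of what such a tokenizer call does; the model id and maximum length are assumptions for the sketch, not the module's actual configuration:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # hypothetical model choice
prompts = ["a very long prompt " * 500, "a short prompt"]

# Truncate each prompt to an assumed maximum context length; the truncated
# input_ids can then be fed back into generation.
tokenized_prompts = tokenizer(
    text=prompts,
    truncation=True,
    max_length=1024,  # assumed limit for the sketch
)
print(len(tokenized_prompts["input_ids"][0]))  # at most 1024 tokens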
@@ -499,7 +507,6 @@ class VLLMModel(HuggingFaceEncoderModel):

         # Sanity check
         if len(completions) != len(prompts):
-            breakpoint()
             raise InvalidBenchmark(
                 f"Expected {len(prompts):,} completions, but got {len(completions):,}."
             )
@@ -615,303 +622,6 @@ class VLLMModel(HuggingFaceEncoderModel):

         return model_config

-    def _extract_few_shot_examples(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> list[dict[str, t.Any]]:
-        """Extract few-shot examples from a dataset.
-
-        This will always extract the examples from the training split.
-
-        We ensure that the few-shot examples are unique by picking them one at a time.
-
-        Args:
-            dataset:
-                The dataset to extract the few-shot examples from.
-            task:
-                The task that is being benchmarked.
-            itr_idx:
-                The index of the dataset in the iterator.
-
-        Returns:
-            The few-shot examples.
-        """
-        random_seed = 4242 + itr_idx
-        num_few_shots = self.dataset_config.num_few_shot_examples
-        few_shot_examples: list[dict[str, t.Any]] = list()
-        shuffled_train = dataset["train"].shuffle(seed=random_seed)
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                labels = it.cycle(self.dataset_config.labels)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: x["label"].lower() == label.lower()
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TEXT_TO_TEXT:
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-                labels = it.cycle(
-                    [
-                        label.lower()
-                        for label in self.dataset_config.labels
-                        if label.lower().startswith("b-")
-                    ]
-                )
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: label in [tag.lower() for tag in x["labels"]]
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["tokens"] != example["tokens"]
-                    )
-
-            case TaskGroup.QUESTION_ANSWERING:
-                # Locate the maximum number of tokens that constitutes a short example
-                for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
-                    train_with_short_examples = dataset["train"].filter(
-                        lambda example: len(example["context"]) < max_num_tokens
-                    )
-                    num_short_examples = len(train_with_short_examples)
-                    if num_short_examples >= self.dataset_config.num_few_shot_examples:
-                        break
-                else:
-                    raise InvalidBenchmark(
-                        "Could not find enough short examples for few-shot learning."
-                    )
-
-                shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["context"] != example["context"]
-                    )
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        random.seed(random_seed)
-        random.shuffle(few_shot_examples)
-        return few_shot_examples
-
-    def _apply_prompt(
-        self,
-        examples: dict[str, t.Any],
-        few_shot_examples: list[dict[str, t.Any]],
-        task: Task,
-    ) -> dict[str, t.Any]:
-        """Apply prompt template to an example, potentially with few-shot examples.
-
-        Args:
-            examples:
-                The examples to apply the few-shot examples to.
-            few_shot_examples:
-                The few-shot examples to apply.
-            task:
-                The task that is being benchmarked.
-
-        Returns:
-            The example with the few-shot examples applied.
-        """
-
-        def create_prompt(**kwargs: str) -> tuple[str, str]:
-            """Create a prompt from the given keyword arguments.
-
-            Args:
-                kwargs:
-                    The keyword arguments to use in the prompt.
-
-            Returns:
-                A pair (prompt, label), where "label" is an empty string if the model is
-                not instruction tuned (as in this case it is included in the prompt).
-            """
-            label_key = "label" if "label" in kwargs else "target_text"
-            label = kwargs.pop(label_key)
-            assert label is not None, (
-                f"Found a None label for the prompt: {kwargs}. This should not happen."
-            )
-            label_mapping = self.dataset_config.prompt_label_mapping
-            label = label_mapping.get(label, label)
-            if self.buffer["instruction_model"]:
-                prompt = self.dataset_config.instruction_prompt.format(**kwargs)
-                return prompt, label
-            else:
-                kwargs[label_key] = label
-                return self.dataset_config.prompt_template.format(**kwargs), ""
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        label=example["label"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), label="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TEXT_TO_TEXT:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        target_text=example["target_text"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), target_text="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-
-                def create_label(example: dict) -> str:
-                    prompt_labels = self.dataset_config.prompt_label_mapping.values()
-                    labels: dict[str, list[str]] = {
-                        prompt_label: list() for prompt_label in prompt_labels
-                    }
-                    for token, label in zip(example["tokens"], example["labels"]):
-                        label = label.lower()
-                        if label == "o":
-                            continue
-                        prompt_label = self.dataset_config.prompt_label_mapping[label]
-                        if label.startswith("b-"):
-                            labels[prompt_label].append(token)
-                        elif label.startswith("i-"):
-                            labels[prompt_label][-1] += " " + token
-                    return json.dumps(labels, ensure_ascii=False)
-
-                few_shot_sections = [
-                    create_prompt(
-                        text=" ".join(example["tokens"]).replace("\n", " ").strip(),
-                        label=create_label(example=example),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=" ".join(tokens).replace("\n", " ").strip(), label=""
-                    )
-                    for tokens in examples["tokens"]
-                ]
-
-            case TaskGroup.QUESTION_ANSWERING:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["context"].replace("\n", " ").strip(),
-                        question=example["question"].replace("\n", " ").strip(),
-                        label=example["answers"]["text"][0].replace("\n", " "),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=context.replace("\n", " ").strip(),
-                        question=question.replace("\n", " ").strip(),
-                        label="",
-                    )
-                    for context, question in zip(
-                        examples["context"], examples["question"]
-                    )
-                ]
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        if self.buffer["instruction_model"]:
-            few_shot_messages = [
-                dict(role=role, content=content)
-                for prompt, label in few_shot_sections
-                for role, content in [("user", prompt), ("assistant", label)]
-            ]
-
-            messages_list = [
-                few_shot_messages + [dict(role="user", content=prompt)]
-                for prompt, _ in new_sections
-            ]
-
-            # Pick the chat template that matches the language of the dataset, if such a
-            # template exists
-            chat_template: str | None = None
-            if isinstance(self._tokenizer.chat_template, dict):
-                language_codes = [
-                    language.code for language in self.dataset_config.languages
-                ]
-                for name, candidate_template in self._tokenizer.chat_template.items():
-                    if name.lower() in language_codes:
-                        chat_template = candidate_template
-                        log_once(
-                            f"Using the {name!r} chat template for the tokenizer for "
-                            f"model {self.model_config.model_id!r}.",
-                            level=logging.DEBUG,
-                        )
-                        break
-
-            texts = [
-                self._tokenizer.apply_chat_template(
-                    conversation=messages,
-                    tokenize=False,
-                    add_generation_prompt=True,
-                    chat_template=chat_template,
-                )
-                for messages in messages_list
-            ]
-
-            examples["text"] = texts
-
-        else:
-            prompt_prefix = ""
-            if self.dataset_config.prompt_prefix:
-                prompt_prefix = self.dataset_config.prompt_prefix + "\n\n"
-
-            few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
-            if few_shot_prompt:
-                few_shot_prompt += "\n\n"
-
-            examples["text"] = [
-                prompt_prefix + few_shot_prompt + new_prompt
-                for new_prompt, _ in new_sections
-            ]
-
-        return examples
-
     @property
     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.
euroeval/benchmarker.py
CHANGED
euroeval/data_models.py
CHANGED
@@ -531,7 +531,9 @@ class DatasetConfig:

         # Convert labels to single-quoted labels - and remove duplicates
         quoted_labels = [
-            f"'{label}'"
+            f"'{self.prompt_label_mapping[label]}'"
+            for label in set(self.labels)
+            if label in self.prompt_label_mapping
         ]

         if not quoted_labels:
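The effect is that the quoted label list used in prompts is now built from the localized prompt labels rather than the raw label names. A small self-contained illustration with made-up values (not an actual EuroEval config):

labels = ["positive", "negative", "positive"]
prompt_label_mapping = {"positive": "positief", "negative": "negatief"}

# Deduplicate the raw labels, map each through the prompt label mapping, and quote it.
quoted_labels = [
    f"'{prompt_label_mapping[label]}'"
    for label in set(labels)
    if label in prompt_label_mapping
]
print(quoted_labels)  # e.g. ["'positief'", "'negatief'"] (set order is not guaranteed)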
euroeval/dataset_configs/dutch.py
CHANGED

@@ -6,13 +6,14 @@ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM

 ### Official datasets ###

-
-    name="
+DBRD_CONFIG = DatasetConfig(
+    name="dbrd",
     pretty_name="the truncated version of the Dutch sentiment classification "
-    "dataset
-    huggingface_id="EuroEval/
+    "dataset DBRD",
+    huggingface_id="EuroEval/dbrd-mini",
     task=SENT,
     languages=[NL],
+    _labels=["negative", "positive"],
 )

 SCALA_NL_CONFIG = DatasetConfig(
@@ -71,18 +72,6 @@ HELLASWAG_NL_CONFIG = DatasetConfig(

 ### Unofficial datasets ###

-DBRD_CONFIG = DatasetConfig(
-    name="dbrd",
-    pretty_name="the truncated version of the Dutch sentiment classification "
-    "dataset DBRD",
-    huggingface_id="EuroEval/dbrd-mini",
-    task=SENT,
-    languages=[NL],
-    _labels=["negative", "positive"],
-    _prompt_label_mapping=dict(positive="positief", negative="negatief"),
-    unofficial=True,
-)
-
 DUTCH_COLA_CONFIG = DatasetConfig(
     name="dutch-cola",
     pretty_name="the truncated version of the Dutch linguistic acceptability dataset "
euroeval/generation_utils.py
ADDED

@@ -0,0 +1,346 @@
+"""Utility functions related to generative models."""
+
+import itertools as it
+import json
+import logging
+import random
+import typing as t
+
+from .enums import TaskGroup
+from .exceptions import InvalidBenchmark
+from .utils import log_once
+
+if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+    from transformers.tokenization_utils import PreTrainedTokenizer
+
+    from .data_models import DatasetConfig, ModelConfig
+
+logger = logging.getLogger("euroeval")
+
+
+def extract_few_shot_examples(
+    dataset: "DatasetDict", dataset_config: "DatasetConfig", itr_idx: int
+) -> list[dict[str, t.Any]]:
+    """Extract few-shot examples from a dataset.
+
+    This will always extract the examples from the training split.
+
+    We ensure that the few-shot examples are unique by picking them one at a time.
+
+    Args:
+        dataset:
+            The dataset to extract the few-shot examples from.
+        dataset_config:
+            The dataset configuration.
+        itr_idx:
+            The index of the dataset in the iterator.
+
+    Returns:
+        The few-shot examples.
+    """
+    random_seed = 4242 + itr_idx
+    num_few_shots = dataset_config.num_few_shot_examples
+    few_shot_examples: list[dict[str, t.Any]] = list()
+    shuffled_train = dataset["train"].shuffle(seed=random_seed)
+
+    match dataset_config.task.task_group:
+        case (
+            TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+        ):
+            # Locate the maximum number of tokens that constitutes a short example
+            for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
+                train_with_short_examples = dataset["train"].filter(
+                    lambda example: len(example["text"]) < max_num_tokens
+                )
+                num_short_examples = len(train_with_short_examples)
+                if num_short_examples >= dataset_config.num_few_shot_examples:
+                    break
+            else:
+                raise InvalidBenchmark(
+                    "Could not find enough short examples for few-shot learning."
+                )
+
+            shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
+            labels = it.cycle(dataset_config.labels)
+            while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                label = next(labels)
+                possible_examples = shuffled_train.filter(
+                    lambda x: x["label"].lower() == label.lower()
+                )
+                if len(possible_examples) == 0:
+                    continue
+                example = possible_examples.select(range(1))[0]
+                few_shot_examples.append(example)
+                shuffled_train = shuffled_train.filter(
+                    lambda x: x["text"] != example["text"]
+                )
+
+        case TaskGroup.TEXT_TO_TEXT:
+            while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                example = shuffled_train.select(range(1))[0]
+                few_shot_examples.append(example)
+                shuffled_train = shuffled_train.filter(
+                    lambda x: x["text"] != example["text"]
+                )
+
+        case TaskGroup.TOKEN_CLASSIFICATION:
+            labels = it.cycle(
+                [
+                    label.lower()
+                    for label in dataset_config.labels
+                    if label.lower().startswith("b-")
+                ]
+            )
+            while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                label = next(labels)
+                possible_examples = shuffled_train.filter(
+                    lambda x: label in [tag.lower() for tag in x["labels"]]
+                )
+                if len(possible_examples) == 0:
+                    continue
+                example = possible_examples.select(range(1))[0]
+                few_shot_examples.append(example)
+                shuffled_train = shuffled_train.filter(
+                    lambda x: x["tokens"] != example["tokens"]
+                )
+
+        case TaskGroup.QUESTION_ANSWERING:
+            # Locate the maximum number of tokens that constitutes a short example
+            for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
+                train_with_short_examples = dataset["train"].filter(
+                    lambda example: len(example["context"]) < max_num_tokens
+                )
+                num_short_examples = len(train_with_short_examples)
+                if num_short_examples >= dataset_config.num_few_shot_examples:
+                    break
+            else:
+                raise InvalidBenchmark(
+                    "Could not find enough short examples for few-shot learning."
+                )
+
+            shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
+            while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                example = shuffled_train.select(range(1))[0]
+                few_shot_examples.append(example)
+                shuffled_train = shuffled_train.filter(
+                    lambda x: x["context"] != example["context"]
+                )
+
+        case _:
+            raise NotImplementedError(
+                f"Unsupported task group: {dataset_config.task.task_group}."
+            )
+
+    random.seed(random_seed)
+    random.shuffle(few_shot_examples)
+    return few_shot_examples
+
+
+def apply_prompt(
+    examples: dict[str, t.Any],
+    few_shot_examples: list[dict[str, t.Any]],
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    instruction_model: bool,
+    always_populate_text_field: bool,
+    tokenizer: "PreTrainedTokenizer | None",
+) -> dict[str, t.Any]:
+    """Apply prompt template to an example, potentially with few-shot examples.
+
+    Args:
+        examples:
+            The examples to apply the few-shot examples to.
+        few_shot_examples:
+            The few-shot examples to apply.
+        dataset_config:
+            The dataset configuration.
+        instruction_model:
+            Whether the model is instruction-tuned.
+        always_populate_text_field:
+            Whether to always populate the 'text' field in the examples, as opposed to
+            the 'messages' field.
+        tokenizer:
+            The tokenizer to use for the model. If None, the tokenizer is not used.
+
+    Returns:
+        The example with the few-shot examples applied.
+    """
+    # Sanity check
+    if instruction_model and always_populate_text_field and tokenizer is None:
+        raise ValueError(
+            "The `tokenizer` argument must be provided when the model is instruction "
+            "tuned and when we are not just returning the raw messages."
+        )
+
+    def create_prompt(**kwargs: str) -> tuple[str, str]:
+        """Create a prompt from the given keyword arguments.
+
+        Args:
+            kwargs:
+                The keyword arguments to use in the prompt.
+
+        Returns:
+            A pair (prompt, label), where "label" is an empty string if the model is
+            not instruction tuned (as in this case it is included in the prompt).
+        """
+        label_key = "label" if "label" in kwargs else "target_text"
+        label = kwargs.pop(label_key)
+        assert label is not None, (
+            f"Found a None label for the prompt: {kwargs}. This should not happen."
+        )
+        label_mapping = dataset_config.prompt_label_mapping
+        label = label_mapping.get(label, label)
+        if instruction_model:
+            prompt = dataset_config.instruction_prompt.format(**kwargs)
+            return prompt, label
+        else:
+            kwargs[label_key] = label
+            return dataset_config.prompt_template.format(**kwargs), ""
+
+    match dataset_config.task.task_group:
+        case (
+            TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+        ):
+            few_shot_sections = [
+                create_prompt(
+                    text=example["text"].replace("\n", " ").strip(),
+                    label=example["label"].replace("\n", " ").strip(),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(text=text.replace("\n", " ").strip(), label="")
+                for text in examples["text"]
+            ]
+
+        case TaskGroup.TEXT_TO_TEXT:
+            few_shot_sections = [
+                create_prompt(
+                    text=example["text"].replace("\n", " ").strip(),
+                    target_text=example["target_text"].replace("\n", " ").strip(),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(text=text.replace("\n", " ").strip(), target_text="")
+                for text in examples["text"]
+            ]
+
+        case TaskGroup.TOKEN_CLASSIFICATION:
+
+            def create_label(example: dict) -> str:
+                prompt_labels = dataset_config.prompt_label_mapping.values()
+                labels: dict[str, list[str]] = {
+                    prompt_label: list() for prompt_label in prompt_labels
+                }
+                for token, label in zip(example["tokens"], example["labels"]):
+                    label = label.lower()
+                    if label == "o":
+                        continue
+                    prompt_label = dataset_config.prompt_label_mapping[label]
+                    if label.startswith("b-"):
+                        labels[prompt_label].append(token)
+                    elif label.startswith("i-"):
+                        labels[prompt_label][-1] += " " + token
+                return json.dumps(labels, ensure_ascii=False)
+
+            few_shot_sections = [
+                create_prompt(
+                    text=" ".join(example["tokens"]).replace("\n", " ").strip(),
+                    label=create_label(example=example),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(
+                    text=" ".join(tokens).replace("\n", " ").strip(), label=""
+                )
+                for tokens in examples["tokens"]
+            ]
+
+        case TaskGroup.QUESTION_ANSWERING:
+            few_shot_sections = [
+                create_prompt(
+                    text=example["context"].replace("\n", " ").strip(),
+                    question=example["question"].replace("\n", " ").strip(),
+                    label=example["answers"]["text"][0].replace("\n", " "),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(
+                    text=context.replace("\n", " ").strip(),
+                    question=question.replace("\n", " ").strip(),
+                    label="",
+                )
+                for context, question in zip(examples["context"], examples["question"])
+            ]
+
+        case _:
+            raise NotImplementedError(
+                f"Unsupported task group: {dataset_config.task.task_group}."
+            )
+
+    if instruction_model:
+        few_shot_messages = [
+            dict(role=role, content=content)
+            for prompt, label in few_shot_sections
+            for role, content in [("user", prompt), ("assistant", label)]
+        ]
+
+        messages_list = [
+            few_shot_messages + [dict(role="user", content=prompt)]
+            for prompt, _ in new_sections
+        ]
+
+        if not always_populate_text_field:
+            examples["messages"] = messages_list
+
+        else:
+            assert tokenizer is not None
+
+            # Pick the chat template that matches the language of the dataset, if such a
+            # template exists
+            chat_template: str | None = None
+            if isinstance(tokenizer.chat_template, dict):
+                language_codes = [
+                    language.code for language in dataset_config.languages
+                ]
+                for name, candidate_template in tokenizer.chat_template.items():
+                    if name.lower() in language_codes:
+                        chat_template = candidate_template
+                        log_once(
+                            f"Using the {name!r} chat template for the tokenizer for "
+                            f"model {model_config.model_id!r}.",
+                            level=logging.DEBUG,
+                        )
+                        break
+
+            texts = [
+                tokenizer.apply_chat_template(
+                    conversation=messages,
+                    tokenize=False,
+                    add_generation_prompt=True,
+                    chat_template=chat_template,
+                )
+                for messages in messages_list
+            ]
+
+            examples["text"] = texts
+
+    else:
+        prompt_prefix = ""
+        if dataset_config.prompt_prefix:
+            prompt_prefix = dataset_config.prompt_prefix + "\n\n"
+
+        few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
+        if few_shot_prompt:
+            few_shot_prompt += "\n\n"
+
+        examples["text"] = [
+            prompt_prefix + few_shot_prompt + new_prompt
+            for new_prompt, _ in new_sections
+        ]
+
+    return examples
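For instruction-tuned models, the key step in the new helper is flattening the few-shot (prompt, label) pairs into an alternating user/assistant message list, with the new example appended as a final user turn. A minimal, self-contained sketch of that flattening, using the same list-comprehension shape as `apply_prompt` above but with made-up data:

few_shot_sections = [("Classify: great film", "positive"), ("Classify: dull film", "negative")]
new_sections = [("Classify: decent film", "")]

# Each few-shot pair becomes a user turn followed by an assistant turn.
few_shot_messages = [
    dict(role=role, content=content)
    for prompt, label in few_shot_sections
    for role, content in [("user", prompt), ("assistant", label)]
]

# The new example is appended as the final user turn of each conversation.
messages_list = [
    few_shot_messages + [dict(role="user", content=prompt)]
    for prompt, _ in new_sections
]
print(messages_list[0])
# [{'role': 'user', 'content': 'Classify: great film'},
#  {'role': 'assistant', 'content': 'positive'},
#  {'role': 'user', 'content': 'Classify: dull film'},
#  {'role': 'assistant', 'content': 'negative'},
#  {'role': 'user', 'content': 'Classify: decent film'}]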
euroeval/scores.py
CHANGED
@@ -18,6 +18,7 @@ def log_scores(
     metric_configs: list["MetricConfig"],
     scores: list[dict[str, float]],
     model_id: str,
+    model_revision: str,
 ) -> "ScoreDict":
     """Log the scores.

@@ -30,13 +31,18 @@ def log_scores(
             The scores that are to be logged. This is a list of dictionaries full of
             scores.
         model_id:
-            The
+            The model ID of the model that was evaluated.
+        model_revision:
+            The revision of the model.

     Returns:
         A dictionary with keys 'raw_scores' and 'total', with 'raw_scores' being
         identical to `scores` and 'total' being a dictionary with the aggregated scores
         (means and standard errors).
     """
+    if model_revision and model_revision != "main":
+        model_id += f"@{model_revision}"
+
     logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")

     total_dict: dict[str, float] = dict()
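The new argument only changes how the model is named when the scores are logged: a non-default revision is appended to the model ID. A minimal sketch of that naming rule; the helper name `format_model_id` is just for the sketch, as the check is inlined in `log_scores`:

def format_model_id(model_id: str, model_revision: str) -> str:
    # Append the revision unless it is empty or the default "main" branch.
    if model_revision and model_revision != "main":
        model_id += f"@{model_revision}"
    return model_id

print(format_model_id("my-org/my-model", "main"))  # my-org/my-model
print(format_model_id("my-org/my-model", "v2"))    # my-org/my-model@v2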
{euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/RECORD
CHANGED

@@ -1,21 +1,22 @@
 euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
 euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
-euroeval/benchmarker.py,sha256=
+euroeval/benchmarker.py,sha256=OnjGVblWW20wSmA7Tr2c-qE3g8FIjxW6wTJySAcGxVk,48492
 euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
 euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
 euroeval/constants.py,sha256=p6kp_R6-Tq5LBvyXyT6Sa6N3SkjEElGS2LSZRBoQaYs,1985
 euroeval/data_loading.py,sha256=L_REtxefte5Ke4xE_Cz01zkfCyKlOYhSqT5ZXXulHPc,3992
-euroeval/data_models.py,sha256=
+euroeval/data_models.py,sha256=t5FwpGxiSIMe7iKae-tT7usUWki-ILzAFFm7dPJoFsk,22973
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=LRd7HoudupRp5-AX3L0X4hIAWCa6JVx-LViHPg7u7dg,5821
 euroeval/finetuning.py,sha256=IieAhgvxjeLHAHBief1Ay-STcCosQmrDHFTRTXFZX0Q,10743
 euroeval/generation.py,sha256=LSsskfLjIJ-c3gQxmr7eiAobPOm-5bU9vnR7uHQ7XmU,10745
+euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
 euroeval/human_evaluation.py,sha256=VGvw1X6Mkdf22r-THSNWXMIqyJP44yh4rW53vq-0huo,27681
 euroeval/languages.py,sha256=IQUbGMyn7pxAyM70M0FTO80m92Q4KgIU604MJhVia-Q,8513
 euroeval/model_cache.py,sha256=n39yFpZkudBCVwz1EQpZ-g5BQtlQemQ5nP3IiFKJZHg,8275
 euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
 euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
-euroeval/scores.py,sha256=
+euroeval/scores.py,sha256=TovjCZD8wmGrIjA4v5oAQp18P5KVcHvakkByDh0Hstk,3059
 euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
 euroeval/tasks.py,sha256=87gbe__K5KNIb1aBSuwGnMPmZgamJFecNNYmNgMxaVo,7069
 euroeval/tokenization_utils.py,sha256=fbMVAOkRdcpf9L2SVechPpmWYgDXgQcc-sDrYu21wFI,12487

@@ -25,11 +26,11 @@ euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwR
 euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
 euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
 euroeval/benchmark_modules/hf.py,sha256=yFApLL4_ia5Kw2iat5RSI8h5RhI4OP04HlzYidlhBCs,44012
-euroeval/benchmark_modules/litellm.py,sha256=
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/litellm.py,sha256=v_rbCm2FiTMqcUui_09k3E1-s5uOmbfAvSy2c7Mm0_E,42636
+euroeval/benchmark_modules/vllm.py,sha256=Q-3vtZz5XxQQImJxOiF0XDrQ4T_p0bkgdPw1Jobgu3s,39380
 euroeval/dataset_configs/__init__.py,sha256=fkD1hzW7szJLc1MdK-AY4EBFWBUX5Z8t4f9uBHQnRvU,1858
 euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
-euroeval/dataset_configs/dutch.py,sha256=
+euroeval/dataset_configs/dutch.py,sha256=r21nxEvMmBkKqPXVW082batPsxJ9d0RB4DzngOTMJSk,3185
 euroeval/dataset_configs/english.py,sha256=yHw7D0zSNVbiSBAjR1mWX4V5FSkhqy4y-o-pnyWCLxE,2323
 euroeval/dataset_configs/faroese.py,sha256=QQgLe5gv0f3AtXe5rV65xZ98gFgyITQPDr3UwO4Bnv4,1350
 euroeval/dataset_configs/finnish.py,sha256=BIfcxdgJu4CfevHAjzwH7cYd8Xd9DGrm49lcJZcGVQM,1730

@@ -53,8 +54,8 @@ euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iY
 euroeval/task_group_utils/sequence_classification.py,sha256=xPz1gJioK96iv2bNoDWiC2EJkhRvRd7QZNgY8bT237c,11703
 euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
 euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
-euroeval-15.7.
-euroeval-15.7.
-euroeval-15.7.
-euroeval-15.7.
-euroeval-15.7.
+euroeval-15.7.1.dist-info/METADATA,sha256=Fj6QejwQCK0zGuP_DHSQ7sul195ivUqOUCT5AVxgLSI,13669
+euroeval-15.7.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.7.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.7.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.7.1.dist-info/RECORD,,

{euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/WHEEL
File without changes

{euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/entry_points.txt
File without changes

{euroeval-15.7.0.dist-info → euroeval-15.7.1.dist-info}/licenses/LICENSE
File without changes