EuroEval 15.6.1-py3-none-any.whl → 15.7.1-py3-none-any.whl
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/benchmark_modules/litellm.py +148 -284
- euroeval/benchmark_modules/vllm.py +115 -338
- euroeval/benchmarker.py +13 -2
- euroeval/constants.py +1 -1
- euroeval/data_loading.py +48 -26
- euroeval/data_models.py +3 -9
- euroeval/dataset_configs/dutch.py +5 -16
- euroeval/dataset_configs/finnish.py +60 -0
- euroeval/generation_utils.py +346 -0
- euroeval/prompt_templates/linguistic_acceptability.py +9 -1
- euroeval/prompt_templates/multiple_choice.py +8 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -1
- euroeval/prompt_templates/reading_comprehension.py +11 -1
- euroeval/prompt_templates/sentiment_classification.py +11 -1
- euroeval/prompt_templates/summarization.py +9 -1
- euroeval/scores.py +7 -1
- euroeval/task_group_utils/sequence_classification.py +27 -32
- euroeval/task_group_utils/text_to_text.py +10 -27
- euroeval/tasks.py +1 -1
- euroeval/tokenization_utils.py +22 -6
- {euroeval-15.6.1.dist-info → euroeval-15.7.1.dist-info}/METADATA +14 -2
- {euroeval-15.6.1.dist-info → euroeval-15.7.1.dist-info}/RECORD +25 -23
- {euroeval-15.6.1.dist-info → euroeval-15.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.6.1.dist-info → euroeval-15.7.1.dist-info}/entry_points.txt +0 -0
- {euroeval-15.6.1.dist-info → euroeval-15.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/data_loading.py
CHANGED
@@ -39,32 +39,9 @@ def load_data(
         HuggingFaceHubDown:
             If the Hugging Face Hub is down.
     """
-    num_attempts = 5
-    for _ in range(num_attempts):
-        try:
-            dataset = load_dataset(
-                path=dataset_config.huggingface_id,
-                cache_dir=benchmark_config.cache_dir,
-                token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
-            )
-            break
-        except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
-            logger.warning(
-                f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
-            )
-            time.sleep(1)
-            continue
-        except HfHubHTTPError:
-            raise HuggingFaceHubDown()
-    else:
-        raise InvalidBenchmark(
-            f"Failed to load dataset {dataset_config.huggingface_id!r} after "
-            f"{num_attempts} attempts."
-        )
-
-    assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
-
-    dataset = DatasetDict({key: dataset[key] for key in ["train", "val", "test"]})
+    dataset = load_raw_data(
+        dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
+    )
 
     if not benchmark_config.evaluate_test_split:
         dataset["test"] = dataset["val"]
@@ -101,3 +78,48 @@ def load_data(
         for idx in range(benchmark_config.num_iterations)
     ]
     return datasets
+
+
+def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> DatasetDict:
+    """Load the raw dataset.
+
+    Args:
+        dataset_config:
+            The configuration for the dataset.
+        cache_dir:
+            The directory to cache the dataset.
+
+    Returns:
+        The dataset.
+    """
+    num_attempts = 5
+    for _ in range(num_attempts):
+        try:
+            dataset = load_dataset(
+                path=dataset_config.huggingface_id,
+                cache_dir=cache_dir,
+                token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
+            )
+            break
+        except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
+            logger.warning(
+                f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
+            )
+            time.sleep(1)
+            continue
+        except HfHubHTTPError:
+            raise HuggingFaceHubDown()
+    else:
+        raise InvalidBenchmark(
+            f"Failed to load dataset {dataset_config.huggingface_id!r} after "
+            f"{num_attempts} attempts."
+        )
+    assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
+    required_keys = ["train", "val", "test"]
+    missing_keys = [key for key in required_keys if key not in dataset]
+    if missing_keys:
+        raise InvalidBenchmark(
+            "The dataset is missing the following required splits: "
+            f"{', '.join(missing_keys)}"
+        )
+    return DatasetDict({key: dataset[key] for key in required_keys})
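The retry logic that moved into load_raw_data relies on Python's for/else: the else branch runs only when the loop finishes without hitting break, i.e. when every attempt failed. A minimal, self-contained sketch of that pattern (the fake loader and its failure count below are illustrative, not EuroEval code):

import time

def flaky_load(attempt: int) -> dict:
    # Pretend loader that fails on the first two attempts.
    if attempt < 2:
        raise ConnectionError("temporary outage")
    return {"train": [], "val": [], "test": []}

num_attempts = 5
for attempt in range(num_attempts):
    try:
        dataset = flaky_load(attempt)
        break  # success, so the else branch below is skipped
    except ConnectionError:
        time.sleep(0.1)
        continue
else:
    # Reached only if no break happened, i.e. all attempts failed.
    raise RuntimeError(f"Failed after {num_attempts} attempts.")

print(sorted(dataset))  # ['test', 'train', 'val']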
euroeval/data_models.py
CHANGED
@@ -521,14 +521,6 @@ class DatasetConfig:
 
         Returns:
             The natural string representation of the labels in specified language.
-
-        Raises:
-            NotImplementedError:
-                If `and_separator` or `or_separator` are `None`, see `Language`.
-
-        Example:
-            >>> get_labels_str(language=DA)
-            "'a', 'b', 'c' eller 'd'"
         """
         main_language = self.languages[0]
 
@@ -539,7 +531,9 @@ class DatasetConfig:
 
         # Convert labels to single-quoted labels - and remove duplicates
         quoted_labels = [
-            f"'{label}'"
+            f"'{self.prompt_label_mapping[label]}'"
+            for label in set(self.labels)
+            if label in self.prompt_label_mapping
         ]
 
         if not quoted_labels:
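The quoted_labels change above means the labels string is now built from the prompt-mapped (localised) labels instead of the raw label names. A small self-contained illustration of the new comprehension, using a made-up label mapping:

labels = ["positive", "negative"]
prompt_label_mapping = {"positive": "positiivinen", "negative": "negatiivinen"}

# Mirrors the new list comprehension: quote the mapped labels, skip unmapped ones.
quoted_labels = [
    f"'{prompt_label_mapping[label]}'"
    for label in set(labels)
    if label in prompt_label_mapping
]
print(sorted(quoted_labels))  # ["'negatiivinen'", "'positiivinen'"]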
euroeval/dataset_configs/dutch.py
CHANGED
@@ -6,13 +6,14 @@ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
 ### Official datasets ###
 
-
-    name="
+DBRD_CONFIG = DatasetConfig(
+    name="dbrd",
     pretty_name="the truncated version of the Dutch sentiment classification "
-    "dataset
-    huggingface_id="EuroEval/
+    "dataset DBRD",
+    huggingface_id="EuroEval/dbrd-mini",
     task=SENT,
     languages=[NL],
+    _labels=["negative", "positive"],
 )
 
 SCALA_NL_CONFIG = DatasetConfig(
@@ -71,18 +72,6 @@ HELLASWAG_NL_CONFIG = DatasetConfig(
 
 ### Unofficial datasets ###
 
-DBRD_CONFIG = DatasetConfig(
-    name="dbrd",
-    pretty_name="the truncated version of the Dutch sentiment classification "
-    "dataset DBRD",
-    huggingface_id="EuroEval/dbrd-mini",
-    task=SENT,
-    languages=[NL],
-    _labels=["negative", "positive"],
-    _prompt_label_mapping=dict(positive="positief", negative="negatief"),
-    unofficial=True,
-)
-
 DUTCH_COLA_CONFIG = DatasetConfig(
     name="dutch-cola",
     pretty_name="the truncated version of the Dutch linguistic acceptability dataset "
euroeval/dataset_configs/finnish.py
ADDED
@@ -0,0 +1,60 @@
+"""All Finnish dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import FI
+from ..tasks import COMMON_SENSE, LA, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+SCANDISENT_FI_CONFIG = DatasetConfig(
+    name="scandisent-fi",
+    pretty_name="the truncated version of the Finnish part of the binary sentiment "
+    "classification dataset ScandiSent",
+    huggingface_id="EuroEval/scandisent-fi-mini",
+    task=SENT,
+    languages=[FI],
+    _labels=["negative", "positive"],
+)
+
+TURKU_NER_FI_CONFIG = DatasetConfig(
+    name="turku-ner-fi",
+    pretty_name="the Finnish part of the named entity recognition dataset Turku NER",
+    huggingface_id="EuroEval/turku-ner-fi-mini",
+    task=NER,
+    languages=[FI],
+)
+
+TYDIQA_FI_CONFIG = DatasetConfig(
+    name="tydiqa-fi",
+    pretty_name="the Finnish part of the TydiQA reading comprehension dataset",
+    huggingface_id="EuroEval/tydiqa-fi-mini",
+    task=RC,
+    languages=[FI],
+)
+
+XLSUM_FI_CONFIG = DatasetConfig(
+    name="xlsum-fi",
+    pretty_name="the Finnish summarisation dataset XL-Sum",
+    huggingface_id="EuroEval/xlsum-fi-mini",
+    task=SUMM,
+    languages=[FI],
+)
+
+HELLASWAG_FI_CONFIG = DatasetConfig(
+    name="hellaswag-fi",
+    pretty_name="the truncated version of the Finnish common-sense reasoning dataset "
+    "HellaSwag-fi, translated from the English HellaSwag dataset",
+    huggingface_id="EuroEval/hellaswag-fi-mini",
+    task=COMMON_SENSE,
+    languages=[FI],
+)
+
+SCALA_FI_CONFIG = DatasetConfig(
+    name="scala-fi",
+    pretty_name="the Finnish part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-fi",
+    task=LA,
+    languages=[FI],
+)
+
+### Unofficial datasets ###
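The six configs above register Finnish datasets by name. Assuming the public Benchmarker API is unchanged from 15.6.x, one of them could be evaluated like this (sketch only; the model id is a placeholder):

from euroeval import Benchmarker

# Sketch: assumes the Benchmarker interface from earlier EuroEval releases.
benchmarker = Benchmarker()
benchmarker.benchmark(model="<model-id>", dataset="scandisent-fi")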
euroeval/generation_utils.py
ADDED
@@ -0,0 +1,346 @@
+"""Utility functions related to generative models."""
+
+import itertools as it
+import json
+import logging
+import random
+import typing as t
+
+from .enums import TaskGroup
+from .exceptions import InvalidBenchmark
+from .utils import log_once
+
+if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+    from transformers.tokenization_utils import PreTrainedTokenizer
+
+    from .data_models import DatasetConfig, ModelConfig
+
+logger = logging.getLogger("euroeval")
+
+
+def extract_few_shot_examples(
+    dataset: "DatasetDict", dataset_config: "DatasetConfig", itr_idx: int
+) -> list[dict[str, t.Any]]:
+    """Extract few-shot examples from a dataset.
+
+    This will always extract the examples from the training split.
+
+    We ensure that the few-shot examples are unique by picking them one at a time.
+
+    Args:
+        dataset:
+            The dataset to extract the few-shot examples from.
+        dataset_config:
+            The dataset configuration.
+        itr_idx:
+            The index of the dataset in the iterator.
+
+    Returns:
+        The few-shot examples.
+    """
+    random_seed = 4242 + itr_idx
+    num_few_shots = dataset_config.num_few_shot_examples
+    few_shot_examples: list[dict[str, t.Any]] = list()
+    shuffled_train = dataset["train"].shuffle(seed=random_seed)
+
+    match dataset_config.task.task_group:
+        case (
+            TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+        ):
+            # Locate the maximum number of tokens that constitutes a short example
+            for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
+                train_with_short_examples = dataset["train"].filter(
+                    lambda example: len(example["text"]) < max_num_tokens
+                )
+                num_short_examples = len(train_with_short_examples)
+                if num_short_examples >= dataset_config.num_few_shot_examples:
+                    break
+            else:
+                raise InvalidBenchmark(
+                    "Could not find enough short examples for few-shot learning."
+                )
+
+            shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
+            labels = it.cycle(dataset_config.labels)
+            while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                label = next(labels)
+                possible_examples = shuffled_train.filter(
+                    lambda x: x["label"].lower() == label.lower()
+                )
+                if len(possible_examples) == 0:
+                    continue
+                example = possible_examples.select(range(1))[0]
+                few_shot_examples.append(example)
+                shuffled_train = shuffled_train.filter(
+                    lambda x: x["text"] != example["text"]
+                )
+
+        case TaskGroup.TEXT_TO_TEXT:
+            while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                example = shuffled_train.select(range(1))[0]
+                few_shot_examples.append(example)
+                shuffled_train = shuffled_train.filter(
+                    lambda x: x["text"] != example["text"]
+                )
+
+        case TaskGroup.TOKEN_CLASSIFICATION:
+            labels = it.cycle(
+                [
+                    label.lower()
+                    for label in dataset_config.labels
+                    if label.lower().startswith("b-")
+                ]
+            )
+            while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                label = next(labels)
+                possible_examples = shuffled_train.filter(
+                    lambda x: label in [tag.lower() for tag in x["labels"]]
+                )
+                if len(possible_examples) == 0:
+                    continue
+                example = possible_examples.select(range(1))[0]
+                few_shot_examples.append(example)
+                shuffled_train = shuffled_train.filter(
+                    lambda x: x["tokens"] != example["tokens"]
+                )
+
+        case TaskGroup.QUESTION_ANSWERING:
+            # Locate the maximum number of tokens that constitutes a short example
+            for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
+                train_with_short_examples = dataset["train"].filter(
+                    lambda example: len(example["context"]) < max_num_tokens
+                )
+                num_short_examples = len(train_with_short_examples)
+                if num_short_examples >= dataset_config.num_few_shot_examples:
+                    break
+            else:
+                raise InvalidBenchmark(
+                    "Could not find enough short examples for few-shot learning."
+                )
+
+            shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
+            while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                example = shuffled_train.select(range(1))[0]
+                few_shot_examples.append(example)
+                shuffled_train = shuffled_train.filter(
+                    lambda x: x["context"] != example["context"]
+                )
+
+        case _:
+            raise NotImplementedError(
+                f"Unsupported task group: {dataset_config.task.task_group}."
+            )
+
+    random.seed(random_seed)
+    random.shuffle(few_shot_examples)
+    return few_shot_examples
+
+
+def apply_prompt(
+    examples: dict[str, t.Any],
+    few_shot_examples: list[dict[str, t.Any]],
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    instruction_model: bool,
+    always_populate_text_field: bool,
+    tokenizer: "PreTrainedTokenizer | None",
+) -> dict[str, t.Any]:
+    """Apply prompt template to an example, potentially with few-shot examples.
+
+    Args:
+        examples:
+            The examples to apply the few-shot examples to.
+        few_shot_examples:
+            The few-shot examples to apply.
+        dataset_config:
+            The dataset configuration.
+        instruction_model:
+            Whether the model is instruction-tuned.
+        always_populate_text_field:
+            Whether to always populate the 'text' field in the examples, as opposed to
+            the 'messages' field.
+        tokenizer:
+            The tokenizer to use for the model. If None, the tokenizer is not used.
+
+    Returns:
+        The example with the few-shot examples applied.
+    """
+    # Sanity check
+    if instruction_model and always_populate_text_field and tokenizer is None:
+        raise ValueError(
+            "The `tokenizer` argument must be provided when the model is instruction "
+            "tuned and when we are not just returning the raw messages."
+        )
+
+    def create_prompt(**kwargs: str) -> tuple[str, str]:
+        """Create a prompt from the given keyword arguments.
+
+        Args:
+            kwargs:
+                The keyword arguments to use in the prompt.
+
+        Returns:
+            A pair (prompt, label), where "label" is an empty string if the model is
+            not instruction tuned (as in this case it is included in the prompt).
+        """
+        label_key = "label" if "label" in kwargs else "target_text"
+        label = kwargs.pop(label_key)
+        assert label is not None, (
+            f"Found a None label for the prompt: {kwargs}. This should not happen."
+        )
+        label_mapping = dataset_config.prompt_label_mapping
+        label = label_mapping.get(label, label)
+        if instruction_model:
+            prompt = dataset_config.instruction_prompt.format(**kwargs)
+            return prompt, label
+        else:
+            kwargs[label_key] = label
+            return dataset_config.prompt_template.format(**kwargs), ""
+
+    match dataset_config.task.task_group:
+        case (
+            TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+        ):
+            few_shot_sections = [
+                create_prompt(
+                    text=example["text"].replace("\n", " ").strip(),
+                    label=example["label"].replace("\n", " ").strip(),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(text=text.replace("\n", " ").strip(), label="")
+                for text in examples["text"]
+            ]
+
+        case TaskGroup.TEXT_TO_TEXT:
+            few_shot_sections = [
+                create_prompt(
+                    text=example["text"].replace("\n", " ").strip(),
+                    target_text=example["target_text"].replace("\n", " ").strip(),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(text=text.replace("\n", " ").strip(), target_text="")
+                for text in examples["text"]
+            ]
+
+        case TaskGroup.TOKEN_CLASSIFICATION:
+
+            def create_label(example: dict) -> str:
+                prompt_labels = dataset_config.prompt_label_mapping.values()
+                labels: dict[str, list[str]] = {
+                    prompt_label: list() for prompt_label in prompt_labels
+                }
+                for token, label in zip(example["tokens"], example["labels"]):
+                    label = label.lower()
+                    if label == "o":
+                        continue
+                    prompt_label = dataset_config.prompt_label_mapping[label]
+                    if label.startswith("b-"):
+                        labels[prompt_label].append(token)
+                    elif label.startswith("i-"):
+                        labels[prompt_label][-1] += " " + token
+                return json.dumps(labels, ensure_ascii=False)
+
+            few_shot_sections = [
+                create_prompt(
+                    text=" ".join(example["tokens"]).replace("\n", " ").strip(),
+                    label=create_label(example=example),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(
+                    text=" ".join(tokens).replace("\n", " ").strip(), label=""
+                )
+                for tokens in examples["tokens"]
+            ]
+
+        case TaskGroup.QUESTION_ANSWERING:
+            few_shot_sections = [
+                create_prompt(
+                    text=example["context"].replace("\n", " ").strip(),
+                    question=example["question"].replace("\n", " ").strip(),
+                    label=example["answers"]["text"][0].replace("\n", " "),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(
+                    text=context.replace("\n", " ").strip(),
+                    question=question.replace("\n", " ").strip(),
+                    label="",
+                )
+                for context, question in zip(examples["context"], examples["question"])
+            ]
+
+        case _:
+            raise NotImplementedError(
+                f"Unsupported task group: {dataset_config.task.task_group}."
+            )
+
+    if instruction_model:
+        few_shot_messages = [
+            dict(role=role, content=content)
+            for prompt, label in few_shot_sections
+            for role, content in [("user", prompt), ("assistant", label)]
+        ]
+
+        messages_list = [
+            few_shot_messages + [dict(role="user", content=prompt)]
+            for prompt, _ in new_sections
+        ]
+
+        if not always_populate_text_field:
+            examples["messages"] = messages_list
+
+        else:
+            assert tokenizer is not None
+
+            # Pick the chat template that matches the language of the dataset, if such a
+            # template exists
+            chat_template: str | None = None
+            if isinstance(tokenizer.chat_template, dict):
+                language_codes = [
+                    language.code for language in dataset_config.languages
+                ]
+                for name, candidate_template in tokenizer.chat_template.items():
+                    if name.lower() in language_codes:
+                        chat_template = candidate_template
+                        log_once(
+                            f"Using the {name!r} chat template for the tokenizer for "
+                            f"model {model_config.model_id!r}.",
+                            level=logging.DEBUG,
+                        )
+                        break
+
+            texts = [
+                tokenizer.apply_chat_template(
+                    conversation=messages,
+                    tokenize=False,
+                    add_generation_prompt=True,
+                    chat_template=chat_template,
+                )
+                for messages in messages_list
+            ]
+
+            examples["text"] = texts
+
+    else:
+        prompt_prefix = ""
+        if dataset_config.prompt_prefix:
+            prompt_prefix = dataset_config.prompt_prefix + "\n\n"
+
+        few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
+        if few_shot_prompt:
+            few_shot_prompt += "\n\n"
+
+        examples["text"] = [
+            prompt_prefix + few_shot_prompt + new_prompt
+            for new_prompt, _ in new_sections
+        ]
+
+    return examples
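For instruction-tuned models, apply_prompt turns each few-shot example into a user/assistant message pair and appends the evaluation prompt as a final user turn. A self-contained illustration of that assembly step (the prompts below are made up):

few_shot_sections = [("Text: good movie", "positive"), ("Text: bad movie", "negative")]
new_sections = [("Text: great plot", "")]

# One user/assistant pair per few-shot example, in order.
few_shot_messages = [
    dict(role=role, content=content)
    for prompt, label in few_shot_sections
    for role, content in [("user", prompt), ("assistant", label)]
]
# Each new example gets the shared few-shot history plus its own user turn.
messages_list = [
    few_shot_messages + [dict(role="user", content=prompt)]
    for prompt, _ in new_sections
]
print(len(messages_list[0]))  # 5: two few-shot pairs plus the final user prompt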
euroeval/prompt_templates/linguistic_acceptability.py
CHANGED
@@ -1,7 +1,7 @@
 """Templates for the Linguistic Acceptability task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
 
 LA_TEMPLATES = {
     DA: PromptConfig(
@@ -36,6 +36,14 @@ LA_TEMPLATES = {
         default_instruction_prompt="Texto: {text}\n\nDetermina si el texto es "
         "gramaticalmente correcto o no. Responde con {labels_str}, y nada más.",
     ),
+    FI: PromptConfig(
+        default_prompt_label_mapping=dict(correct="kyllä", incorrect="ei"),
+        default_prompt_prefix="Seuraavat ovat lauseita ja ovatko ne "
+        "kieliopillisesti oikein.",
+        default_prompt_template="Lause: {text}\nKieliopillisesti oikein: {label}",
+        default_instruction_prompt="Lause: {text}\n\nMääritä onko lause "
+        "oikein vai ei. Vastaa {labels_str}, ja ei mitään muuta.",
+    ),
     FO: PromptConfig(
         default_prompt_label_mapping=dict(correct="ja", incorrect="nei"),
         default_prompt_prefix="Hetta eru nakrir setningar og um teir eru mállæruliga "
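For reference, the Finnish few-shot template added above renders like this (the example sentence is made up):

template = "Lause: {text}\nKieliopillisesti oikein: {label}"
print(template.format(text="Kissa istuu matolla.", label="kyllä"))
# Lause: Kissa istuu matolla.
# Kieliopillisesti oikein: kyllä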
euroeval/prompt_templates/multiple_choice.py
CHANGED
@@ -1,7 +1,7 @@
 """Templates for all multiple choice tasks."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV
 
 # TODO: Missing Faroese
 MULTIPLE_CHOICE_TEMPLATES = {
@@ -36,6 +36,13 @@ MULTIPLE_CHOICE_TEMPLATES = {
         "usando solo {labels_str}, y nada más.",
         default_prompt_label_mapping="auto",
     ),
+    FI: PromptConfig(
+        default_prompt_prefix="Seuraavat ovat monivalintakysymyksiä (vastauksineen).",
+        default_prompt_template="Kysymys: {text}\nVastaus: {label}",
+        default_instruction_prompt="Kysymys: {text}\n\nVastaa yllä olevaan kysymykseen "
+        "käyttämällä {labels_str}, äläkä mitään muuta.",
+        default_prompt_label_mapping="auto",
+    ),
     FR: PromptConfig(
         default_prompt_prefix="Les questions suivantes sont des questions à choix "
         "multiples (avec réponses).",
euroeval/prompt_templates/named_entity_recognition.py
CHANGED
@@ -1,7 +1,7 @@
 """Templates for the Named Entity Recognition task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
 
 NER_TEMPLATES = {
     DA: PromptConfig(
@@ -80,6 +80,25 @@ NER_TEMPLATES = {
         "claves {labels_str}. Los valores deben ser listas de las "
         "entidades nombradas de ese tipo, exactamente como aparecen en la oración.",
     ),
+    FI: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "henkilö",
+            "i-per": "henkilö",
+            "b-loc": "paikka",
+            "i-loc": "paikka",
+            "b-org": "organisaatio",
+            "i-org": "organisaatio",
+            "b-misc": "muut",
+            "i-misc": "muut",
+        },
+        default_prompt_prefix="Seuraavassa on lauseita ja JSON-sanakirjoja, jotka "
+        "sisältävät annetussa lauseessa esiintyvät nimetyt entiteetit.",
+        default_prompt_template="Lause: {text}\nNimetyt entiteetit: {label}",
+        default_instruction_prompt="Lause: {text}\n\nTunnista lauseessa olevat "
+        "entiteetit. Tulosta ne JSON-sanakirjana, jonka avaimet ovat {labels_str}. "
+        "Arvojen tulee olla listoja kyseisen tyypin nimetyistä entiteeteistä "
+        "täsmälleen siinä muodossa kuin ne esiintyvät lauseessa.",
+    ),
     FO: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "persónur",
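The Finnish label mapping above collapses the BIO tags to four entity types, and the expected answer is a JSON dictionary keyed by those names (the same format create_label in generation_utils.py produces). A made-up example of a well-formed answer:

import json

# Keys are the mapped Finnish entity types; the entities themselves are invented.
entities = {
    "henkilö": ["Sanna Marin"],
    "paikka": ["Helsinki"],
    "organisaatio": [],
    "muut": [],
}
print(json.dumps(entities, ensure_ascii=False))
# {"henkilö": ["Sanna Marin"], "paikka": ["Helsinki"], "organisaatio": [], "muut": []}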
euroeval/prompt_templates/reading_comprehension.py
CHANGED
@@ -1,7 +1,7 @@
 """Templates for the Reading Comprehension task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
 
 RC_TEMPLATES = {
     DA: PromptConfig(
@@ -39,6 +39,16 @@ RC_TEMPLATES = {
         "sobre el texto anterior en máximo 3 palabras.\n\nPregunta: {question}",
         default_prompt_label_mapping=dict(),
     ),
+    FI: PromptConfig(
+        default_prompt_prefix="Seuraavassa on tekstejä ja niihin liittyviä kysymyksiä "
+        "ja vastauksia.",
+        default_prompt_template="Teksti: {text}\nKysymys: {question} "
+        "\nVastaa enintään 3 sanalla: {label}",
+        default_instruction_prompt="Teksti: {text}\n\nVastaa seuraavaan "
+        "kysymykseen yllä olevasta tekstistä enintään 3 sanalla.\n\n"
+        "Kysymys: {question}",
+        default_prompt_label_mapping=dict(),
+    ),
     FO: PromptConfig(
         default_prompt_prefix="Hetta eru tekstir saman við spurningum og svar.",
         default_prompt_template="Tekstur: {text}\nSpurningur: {question}\nSvara við í "