EuroEval 15.12.0-py3-none-any.whl → 16.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +32 -14
- euroeval/benchmark_config_factory.py +92 -180
- euroeval/benchmark_modules/base.py +49 -39
- euroeval/benchmark_modules/fresh.py +35 -21
- euroeval/benchmark_modules/hf.py +280 -244
- euroeval/benchmark_modules/litellm.py +752 -312
- euroeval/benchmark_modules/vllm.py +570 -268
- euroeval/benchmarker.py +651 -528
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +49 -38
- euroeval/constants.py +44 -25
- euroeval/data_loading.py +111 -55
- euroeval/data_models.py +490 -323
- euroeval/dataset_configs/__init__.py +26 -4
- euroeval/dataset_configs/bosnian.py +39 -0
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/croatian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +78 -50
- euroeval/dataset_configs/dutch.py +74 -44
- euroeval/dataset_configs/english.py +71 -36
- euroeval/dataset_configs/estonian.py +111 -0
- euroeval/dataset_configs/faroese.py +25 -18
- euroeval/dataset_configs/finnish.py +63 -26
- euroeval/dataset_configs/french.py +65 -32
- euroeval/dataset_configs/german.py +77 -36
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +68 -57
- euroeval/dataset_configs/italian.py +68 -36
- euroeval/dataset_configs/latvian.py +87 -0
- euroeval/dataset_configs/lithuanian.py +64 -0
- euroeval/dataset_configs/norwegian.py +98 -72
- euroeval/dataset_configs/polish.py +96 -0
- euroeval/dataset_configs/portuguese.py +63 -40
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/slovene.py +56 -0
- euroeval/dataset_configs/spanish.py +68 -34
- euroeval/dataset_configs/swedish.py +82 -41
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/enums.py +12 -6
- euroeval/exceptions.py +21 -1
- euroeval/finetuning.py +34 -26
- euroeval/generation.py +76 -41
- euroeval/generation_utils.py +169 -34
- euroeval/languages.py +1020 -188
- euroeval/logging_utils.py +268 -0
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +85 -0
- euroeval/metrics/huggingface.py +216 -0
- euroeval/metrics/llm_as_a_judge.py +260 -0
- euroeval/metrics/pipeline.py +289 -0
- euroeval/metrics/speed.py +48 -0
- euroeval/model_cache.py +40 -21
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +157 -22
- euroeval/prompt_templates/multiple_choice.py +159 -17
- euroeval/prompt_templates/named_entity_recognition.py +318 -21
- euroeval/prompt_templates/reading_comprehension.py +207 -16
- euroeval/prompt_templates/sentiment_classification.py +205 -22
- euroeval/prompt_templates/summarization.py +122 -22
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +20 -9
- euroeval/speed_benchmark.py +11 -12
- euroeval/task_group_utils/multiple_choice_classification.py +21 -12
- euroeval/task_group_utils/question_answering.py +101 -73
- euroeval/task_group_utils/sequence_classification.py +144 -61
- euroeval/task_group_utils/text_to_text.py +33 -12
- euroeval/task_group_utils/token_classification.py +86 -89
- euroeval/tasks.py +75 -16
- euroeval/tokenisation_utils.py +603 -0
- euroeval/types.py +17 -11
- euroeval/utils.py +332 -137
- euroeval-16.7.1.dist-info/METADATA +623 -0
- euroeval-16.7.1.dist-info/RECORD +84 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -737
- euroeval/metrics.py +0 -452
- euroeval/tokenization_utils.py +0 -498
- euroeval-15.12.0.dist-info/METADATA +0 -285
- euroeval-15.12.0.dist-info/RECORD +0 -63
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/speed_benchmark.py
CHANGED
@@ -1,26 +1,25 @@
 """Benchmarking model inference speed."""

+import collections.abc as c
 import logging
 import typing as t

 import pyinfer
-from tqdm.auto import tqdm
 from transformers.models.auto.tokenization_auto import AutoTokenizer

 from .benchmark_modules import HuggingFaceEncoderModel, LiteLLMModel, VLLMModel
 from .exceptions import InvalidBenchmark
+from .logging_utils import get_pbar, log
 from .utils import clear_memory

 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig

-logger = logging.getLogger("euroeval")
-

 def benchmark_speed(
     model: "BenchmarkModule", benchmark_config: "BenchmarkConfig"
-) ->
+) -> c.Sequence[dict[str, float]]:
     """Benchmark model inference speed.

     Args:
@@ -33,7 +32,7 @@ def benchmark_speed(
         Dictionary of scores.
     """
     scores: list[dict[str, float]] = list()
-    for idx in
+    for idx in get_pbar(
         iterable=range(benchmark_config.num_iterations),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -41,7 +40,7 @@
         itr_scores = benchmark_speed_single_iteration(model=model, itr_idx=idx)
         clear_memory()
         scores.append(itr_scores)
-
+        log(f"Scores for iteration {idx}: {itr_scores}", level=logging.DEBUG)
     return scores


@@ -59,7 +58,7 @@ def benchmark_speed_single_iteration(
     Returns:
         A dictionary containing the scores for the current iteration.
     """
-
+    gpt2_tokeniser = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)

     base_doc = "Document which contains roughly 10 tokens. "
     multiplier = 10 * (1 + itr_idx)
@@ -74,11 +73,11 @@ def benchmark_speed_single_iteration(
         model.generate(inputs=dict(text=[doc]))

     def encoder_predict(doc: str) -> None:
-
+        tokeniser = model.get_tokeniser()
         pytorch_model = model.get_pytorch_module()
         inputs = {
             key: tensor.to(pytorch_model.device)
-            for key, tensor in
+            for key, tensor in tokeniser(
                 text=[doc], truncation=True, return_tensors="pt"
             ).items()
         }
@@ -102,21 +101,21 @@ def benchmark_speed_single_iteration(
         speed_scores = pyinfer.InferenceReport(
             model=predict, inputs=doc, n_seconds=3
         ).run(print_report=False)
-        num_gpt2_tokens = len(
+        num_gpt2_tokens = len(gpt2_tokeniser([doc], truncation=True)["input_ids"][0])
         gpt2_tokens_per_second = speed_scores["Infer(p/sec)"] * num_gpt2_tokens

         speed_scores_short = pyinfer.InferenceReport(
             model=predict, inputs=short_doc, n_seconds=3
         ).run(print_report=False)
         num_gpt2_tokens_short = len(
-
+            gpt2_tokeniser([short_doc], truncation=True)["input_ids"][0]
         )
         gpt2_tokens_per_second_short = (
             speed_scores_short["Infer(p/sec)"] * num_gpt2_tokens_short
         )

     except (RuntimeError, ValueError, IndexError) as e:
-        raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}")
+        raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}") from e

     return dict(
         test_speed=gpt2_tokens_per_second, test_speed_short=gpt2_tokens_per_second_short
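
For orientation, the speed score produced by this module is the number of model calls per second that `pyinfer` measures over a fixed wall-clock budget, scaled by the GPT-2 token count of the benchmarked document, as the hunks above show. A minimal, self-contained sketch of that calculation follows; the `predict` stub stands in for the real model call and is not EuroEval code:

```python
import pyinfer
from transformers import AutoTokenizer


def predict(doc: str) -> None:
    """Stub standing in for the model call being timed."""
    _ = len(doc)


doc = "Document which contains roughly 10 tokens. " * 10
gpt2_tokeniser = AutoTokenizer.from_pretrained("gpt2")

# Time the callable for ~3 seconds and read off the inferences-per-second figure.
report = pyinfer.InferenceReport(model=predict, inputs=doc, n_seconds=3).run(
    print_report=False
)

# Scale by the GPT-2 token count of the document to get a tokens-per-second score.
num_gpt2_tokens = len(gpt2_tokeniser([doc], truncation=True)["input_ids"][0])
print(report["Infer(p/sec)"] * num_gpt2_tokens)
```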

euroeval/task_group_utils/multiple_choice_classification.py
CHANGED

@@ -1,7 +1,7 @@
 """Utility functions related to the multiple-choice classification task group."""

+import collections.abc as c
 import hashlib
-import logging
 import re
 import typing as t
 from collections import defaultdict
@@ -18,8 +18,6 @@ if t.TYPE_CHECKING:

     from ..types import Labels, Predictions

-logger = logging.getLogger("euroeval")
-

 class MultipleChoiceClassificationTrainer(Trainer):
     """Trainer subclass for multiple-choice classification tasks."""
@@ -27,7 +25,7 @@ class MultipleChoiceClassificationTrainer(Trainer):
     def evaluate( # type: ignore[override]
         self,
         eval_dataset: "Dataset | None" = None,
-        ignore_keys:
+        ignore_keys: c.Sequence[str] | None = None,
         metric_key_prefix: str = "eval",
     ) -> dict[str, float]:
         """Evaluate the model on the given dataset.
@@ -94,15 +92,15 @@ class MultipleChoiceClassificationTrainer(Trainer):


 def prepare_examples(
-    examples: "BatchEncoding",
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare the features.

     Args:
         examples:
             The examples to prepare.
-
-            The
+        tokeniser:
+            The tokeniser to use to prepare the examples.

     Returns:
         The prepared examples.
@@ -110,12 +108,23 @@ def prepare_examples(
     doc: str = examples["text"][0]
     sections = doc.split("\n")

-
+    candidate_choice_idxs = [
         idx
         for idx, section in enumerate(sections)
-        if re.match(pattern=r"^[a-
+        if re.match(pattern=r"^[a-z0-9]+\. ", string=section) is not None
     ]
-
+
+    # Sometimes the question itself starts with a letter or number followed by a dot, We
+    # want to ignore these cases, and focus on the final contingent block of at least
+    # two choices.
+    choice_idxs: list[int] = list()
+    for idx in reversed(candidate_choice_idxs):
+        if len(choice_idxs) < 2 or (
+            len(choice_idxs) >= 2 and idx == choice_idxs[-1] - 1
+        ):
+            choice_idxs.append(idx)
+
+    choices = [sections[idx] for idx in reversed(choice_idxs)]

     # Check that the choices are present, and that all of them are at the end
     assert len(choices) > 0, "No choices found in the document."
@@ -127,7 +136,7 @@ def prepare_examples(
     question_idx = min(choice_idxs) - 2 # -2 to remove the 'Choices:' line
     context_and_question = "\n".join(sections[: question_idx + 1]).strip()

-    new_examples =
+    new_examples = tokeniser(
         text=[context_and_question] * len(choices),
         text_pair=[choice[3:] for choice in choices],
         padding=True,
@@ -135,7 +144,7 @@ def prepare_examples(
     )
     new_examples["label"] = [
         int(choice.startswith(f"{letter}. ") and letter == examples["label"][0])
-        for letter, choice in zip("
+        for letter, choice in zip("abcdefghijklmnopqrstuvwxyz", choices)
     ]
     new_examples["id"] = [hashlib.md5(string=doc.encode()).hexdigest()] * len(choices)
     return new_examples
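
The choice-detection change above is the most intricate part of this file: rather than taking every line that looks like `a. ...`, `prepare_examples` now keeps only the final contiguous block of such lines, so a question that itself starts with `1. ` is not mistaken for an option. A small stand-alone sketch of that selection logic (the helper name and example document are illustrative, not EuroEval code):

```python
import re


def final_choice_block(sections: list[str]) -> list[str]:
    """Return the trailing contiguous run of 'a. ...' / '1. ...' lines.

    Mirrors the selection added to prepare_examples: the last two candidates are
    always taken, and earlier candidates are only kept while their indices stay
    contiguous with the block, so a numbered question further up is ignored.
    """
    candidate_idxs = [
        idx
        for idx, section in enumerate(sections)
        if re.match(r"^[a-z0-9]+\. ", section) is not None
    ]
    choice_idxs: list[int] = []
    for idx in reversed(candidate_idxs):
        if len(choice_idxs) < 2 or idx == choice_idxs[-1] - 1:
            choice_idxs.append(idx)
    return [sections[idx] for idx in reversed(choice_idxs)]


doc = "1. What is the capital of Denmark?\nChoices:\na. Oslo\nb. Copenhagen\nc. Stockholm"
print(final_choice_block(doc.split("\n")))
# ['a. Oslo', 'b. Copenhagen', 'c. Stockholm'] -- the numbered question is ignored
```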

euroeval/task_group_utils/question_answering.py
CHANGED

@@ -1,16 +1,18 @@
 """Utility functions related to the question-answering task group."""

 import collections.abc as c
-import logging
 import typing as t
 from collections import defaultdict

 import numpy as np
-from transformers.tokenization_utils_base import
+from transformers.tokenization_utils_base import (
+    PreTrainedTokenizerBase,
+    TruncationStrategy,
+)
 from transformers.trainer import Trainer

 from ..exceptions import InvalidBenchmark
-from ..
+from ..tokenisation_utils import get_special_token_metadata
 from ..utils import raise_if_model_output_contains_nan_values

 if t.TYPE_CHECKING:
@@ -23,11 +25,9 @@ if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
     from transformers.training_args import TrainingArguments

-    from ..data_models import DatasetConfig, GenerativeModelOutput
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions

-logger = logging.getLogger("euroeval")
-

 class QuestionAnsweringTrainer(Trainer):
     """Trainer subclass for question answering tasks."""
@@ -40,7 +40,7 @@ class QuestionAnsweringTrainer(Trainer):
         train_dataset: "Dataset",
         eval_dataset: "Dataset",
         compute_metrics: "c.Callable[[EvalPrediction], dict[str, float]]",
-        callbacks: "
+        callbacks: "c.Sequence[TrainerCallback]",
         data_collator: "c.Callable",
         **kwargs,
     ) -> None:
@@ -57,7 +57,7 @@ class QuestionAnsweringTrainer(Trainer):
             **kwargs,
         )

-        # Get the CLS token id for the
+        # Get the CLS token id for the tokeniser
         if self.tokenizer is not None:
             assert isinstance(self.tokenizer, PreTrainedTokenizerBase)
             special_token_metadata = get_special_token_metadata(self.tokenizer)
@@ -70,7 +70,7 @@ class QuestionAnsweringTrainer(Trainer):
         self,
         eval_dataset: "Dataset | None" = None,
         orig_eval_dataset: "Dataset | None" = None,
-        ignore_keys:
+        ignore_keys: c.Sequence[str] | None = None,
         metric_key_prefix: str = "eval",
     ) -> dict[str, float]:
         """Evaluate the model on the given dataset.
@@ -149,6 +149,8 @@ class QuestionAnsweringTrainer(Trainer):
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+    dataset: "Dataset",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.

@@ -158,6 +160,11 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
+        benchmark_config:
+            The configuration of the benchmark.
+        dataset:
+            The dataset used for evaluation. This is only used in case any additional
+            metadata is used to compute the metrics.

     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
@@ -181,7 +188,13 @@ def compute_metrics(

     results: dict[str, float] = dict()
     for metric in dataset_config.task.metrics:
-        score: float | None = metric(
+        score: float | None = metric(
+            predictions=predictions,
+            references=labels,
+            dataset=dataset,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
+        )

         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
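
The `compute_metrics` change above passes considerably more context to each metric than before: a metric now receives the predictions and references together with the evaluation dataset and the dataset and benchmark configurations as keyword arguments. The real metric objects live in the new `euroeval/metrics/` package, which is not shown in this section; the toy function below only illustrates the calling convention and is not EuroEval code:

```python
import typing as t


def toy_accuracy_metric(
    predictions: t.Sequence[t.Any],
    references: t.Sequence[t.Any],
    dataset: t.Any = None,
    dataset_config: t.Any = None,
    benchmark_config: t.Any = None,
) -> float | None:
    """Accept the keyword arguments that compute_metrics now passes.

    A real metric may use the dataset or the configs for extra metadata; this
    stand-in ignores them and just computes exact-match accuracy.
    """
    if not references:
        return None
    correct = sum(int(p == r) for p, r in zip(predictions, references))
    return correct / len(references)


print(toy_accuracy_metric(predictions=["a", "b"], references=["a", "c"]))  # 0.5
```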

@@ -193,7 +206,7 @@ def compute_metrics(

 def extract_labels_from_generation(
     input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-) ->
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.

     Args:
@@ -215,15 +228,15 @@ def extract_labels_from_generation(


 def prepare_train_examples(
-    examples: "BatchEncoding",
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare the features for training.

     Args:
         examples:
             The examples to prepare.
-
-            The
+        tokeniser:
+            The tokeniser to use to prepare the examples.

     Returns:
         The prepared examples.
@@ -233,37 +246,40 @@ def prepare_train_examples(
     # take a lots of space). So we remove that left whitespace
     examples["question"] = [q.lstrip() for q in examples["question"]]

-    # Extract special token metadata from the
-    special_token_metadata = get_special_token_metadata(
+    # Extract special token metadata from the tokeniser
+    special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
     has_cls_token = special_token_metadata["has_cls_token"]
     has_sep_token = special_token_metadata["has_sep_token"]
     cls_token_id = special_token_metadata["cls_token_id"]
     cls_token = special_token_metadata["cls_token"]
     sep_token = special_token_metadata["sep_token"]

-    # If the
+    # If the tokeniser is not adding special tokens, then we add them manually
     if not has_cls_token and not has_sep_token:
         examples["question"] = [
             f"{cls_token}{q}{sep_token}" for q in examples["question"]
         ]
         examples["context"] = [f"{c}{sep_token}" for c in examples["context"]]

-    # Set the stride used during
+    # Set the stride used during tokenisation, when the context is long enough to be
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
-    max_question_tokens = max(len(
+    max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
-    stride =
-
-
-
+    stride = tokeniser.model_max_length // 4
+    stride = min(
+        stride,
+        tokeniser.model_max_length - stride - max_question_tokens - num_special_tokens,
+    )
+    stride = max(stride, 0)
+    max_length = tokeniser.model_max_length - stride

-    #
+    # Tokenise our examples with truncation and padding, but keep the overflows using a
     # stride. This results in one example possible giving several features when a
     # context is long, each of those features having a context that overlaps a bit the
     # context of the previous feature.
-
+    tokenised_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
         truncation="only_second",
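
The stride logic introduced above is easier to see with concrete numbers. The helper below is hypothetical (EuroEval computes this inline from the tokeniser's `model_max_length`), but it performs the same arithmetic:

```python
def compute_stride_and_max_length(
    model_max_length: int, max_question_tokens: int, num_special_tokens: int
) -> tuple[int, int]:
    """Hypothetical helper mirroring the stride arithmetic in the diff above.

    The stride starts at a quarter of the model's context window, is capped so
    that the question, the special tokens and the stride still fit, is clamped
    at zero, and whatever remains of the window becomes the max_length used for
    tokenisation.
    """
    stride = model_max_length // 4
    stride = min(
        stride, model_max_length - stride - max_question_tokens - num_special_tokens
    )
    stride = max(stride, 0)
    max_length = max(model_max_length - stride, 0)
    return stride, max_length


# A 512-token window with a 40-token question and 2 special tokens:
# stride = min(128, 512 - 128 - 40 - 2) = 128 and max_length = 512 - 128 = 384.
print(compute_stride_and_max_length(512, 40, 2))  # (128, 384)
```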

@@ -277,32 +293,32 @@ def prepare_train_examples(
     # Since one example might give us several features if it has a long context, we
     # need a map from a feature to its corresponding example. This key gives us just
     # that
-    sample_mapping =
+    sample_mapping = tokenised_examples.pop("overflow_to_sample_mapping")

     # The offset mappings will give us a map from token to character position in the
     # original context. This will help us compute the start_positions and
     # end_positions.
-    offset_mapping =
+    offset_mapping = tokenised_examples.pop("offset_mapping")

     # Initialise the start- and end positions of the answers
-
-
+    tokenised_examples["start_positions"] = list()
+    tokenised_examples["end_positions"] = list()

     for i, offsets in enumerate(offset_mapping):
         # Get the input IDs for the current example
-        input_ids =
+        input_ids = tokenised_examples.input_ids[i]

         # We will label impossible answers with the index of the CLS token
         cls_index = input_ids.index(cls_token_id)

         # Grab the sequence corresponding to that example (to know what is the context
         # and what is the question).
-        sequence_ids =
+        sequence_ids = tokenised_examples.sequence_ids(i)

         # Manually ensure that the special tokens are set to None in `sequence_ids`
-        for special_token in
-            if hasattr(
-                special_token_id = getattr(
+        for special_token in tokeniser.special_tokens_map.keys():
+            if hasattr(tokeniser, f"{special_token}_id"):
+                special_token_id = getattr(tokeniser, f"{special_token}_id")
                 if special_token_id is not None:
                     sequence_ids = [
                         None if token_id == special_token_id else seq_id
@@ -316,8 +332,8 @@ def prepare_train_examples(

         # If no answers are given, set the cls_index as answer.
         if len(answers["answer_start"]) == 0:
-
-
+            tokenised_examples.start_positions.append(cls_index)
+            tokenised_examples.end_positions.append(cls_index)

         else:
             # Start/end character index of the answer in the text.
@@ -325,9 +341,17 @@ def prepare_train_examples(
             end_char = start_char + len(answers["text"][0])

             # Start token index of the current span in the text.
-
-
-            token_start_index
+            try:
+                token_start_index = 0
+                while sequence_ids[token_start_index] != 1:
+                    token_start_index += 1
+
+            # If it turns out that we cannot find the context in the span, then we
+            # treat this as an impossible case
+            except IndexError:
+                tokenised_examples.start_positions.append(cls_index)
+                tokenised_examples.end_positions.append(cls_index)
+                continue

             # End token index of the current span in the text.
             token_end_index = len(input_ids) - 1
@@ -340,8 +364,8 @@ def prepare_train_examples(
                 offsets[token_start_index][0] <= start_char
                 and offsets[token_end_index][1] >= end_char
             ):
-
-
+                tokenised_examples.start_positions.append(cls_index)
+                tokenised_examples.end_positions.append(cls_index)

             # Otherwise move the token_start_index and token_end_index to the two ends
             # of the answer. Note: we could go after the last offset if the answer is
@@ -353,71 +377,75 @@ def prepare_train_examples(
                 ):
                     token_start_index += 1
                 token_start_index -= 1
-
+                tokenised_examples.start_positions.append(token_start_index)
                 while (
                     token_start_index <= token_end_index
                     and offsets[token_end_index][1] >= end_char
                 ):
                     token_end_index -= 1
                 token_end_index += 1
-
+                tokenised_examples.end_positions.append(token_end_index)
                 assert token_end_index >= token_start_index

-    return
+    return tokenised_examples


 def prepare_test_examples(
-    examples: "BatchEncoding",
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare test examples.

     Args:
         examples:
             Dictionary of test examples.
-
-            The
+        tokeniser:
+            The tokeniser used to preprocess the examples.

     Returns:
         The prepared test examples.
     """
     # Some of the questions have lots of whitespace on the left, which is not useful
-    # and will make the truncation of the context fail (the
+    # and will make the truncation of the context fail (the tokenised question will
     # take a lots of space). So we remove that left whitespace
     examples["question"] = [q.lstrip() for q in examples["question"]]

-    # Extract special token metadata from the
-    special_token_metadata = get_special_token_metadata(
+    # Extract special token metadata from the tokeniser
+    special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
     has_cls_token = special_token_metadata["has_cls_token"]
     has_sep_token = special_token_metadata["has_sep_token"]
     cls_token = special_token_metadata["cls_token"]
     sep_token = special_token_metadata["sep_token"]

-    # If the
+    # If the tokeniser is not adding special tokens, then we add them manually
     if not has_cls_token and not has_sep_token:
         examples["question"] = [
             f"{cls_token}{q}{sep_token}" for q in examples["question"]
         ]
         examples["context"] = [f"{c}{sep_token}" for c in examples["context"]]

-    # Set the stride used during
+    # Set the stride used during tokenisation, when the context is long enough to be
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
-    max_question_tokens = max(len(
+    max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
-    stride =
-
-
-
+    stride = tokeniser.model_max_length // 4
+    stride = min(
+        stride,
+        tokeniser.model_max_length - stride - max_question_tokens - num_special_tokens,
+    )
+    stride = max(stride, 0)
+    max_length = tokeniser.model_max_length - stride
+    max_length = max(max_length, 0)

-    #
+    # Tokenise our examples with truncation and maybe padding, but keep the overflows
     # using a stride. This results in one example possible giving several features when
     # a context is long, each of those features having a context that overlaps a bit
     # the context of the previous feature.
-
+    tokenised_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
-        truncation=
+        truncation=TruncationStrategy.LONGEST_FIRST,
         max_length=max_length,
         stride=stride,
         return_overflowing_tokens=True,
@@ -428,30 +456,30 @@ def prepare_test_examples(
     # Since one example might give us several features if it has a long context, we
     # need a map from a feature to its corresponding example. This key gives us just
     # that.
-    sample_mapping =
+    sample_mapping = tokenised_examples.pop("overflow_to_sample_mapping")

     # We keep the id that gave us this feature and we will store the offset mappings.
-
+    tokenised_examples["id"] = list()

-    for i in range(len(
+    for i in range(len(tokenised_examples.input_ids)):
         # Grab the sequence corresponding to that example (to know what is the context
         # and what is the question).
-        sequence_ids =
+        sequence_ids = tokenised_examples.sequence_ids(i)
         context_index = 1

         # One example can give several spans, this is the index of the example
         # containing this span of text.
         sample_index = sample_mapping[i]
-
+        tokenised_examples.id.append(examples["id"][sample_index])

         # Set to (-1, -1) the offset_mapping that are not part of the context so it's
         # easy to determine if a token position is part of the context or not.
-
+        tokenised_examples.offset_mapping[i] = [
             (o if sequence_ids[k] == context_index else (-1, -1))
-            for k, o in enumerate(
+            for k, o in enumerate(tokenised_examples.offset_mapping[i])
         ]

-    return
+    return tokenised_examples


 def postprocess_predictions_and_labels(
@@ -459,7 +487,7 @@ def postprocess_predictions_and_labels(
     dataset: "Dataset",
     prepared_dataset: "Dataset",
     cls_token_index: int,
-) -> tuple[
+) -> tuple[c.Sequence[dict], c.Sequence[dict]]:
     """Postprocess the predictions and labels, to allow easier metric computation.

     Args:
@@ -540,7 +568,7 @@ def find_best_answer(
     all_start_logits: np.ndarray,
     all_end_logits: np.ndarray,
     prepared_dataset: "Dataset",
-    feature_indices:
+    feature_indices: c.Sequence[int],
     context: str,
     max_answer_length: int,
     num_best_logits: int,
@@ -573,7 +601,7 @@ def find_best_answer(
         The best answer for the example.
     """
     # Loop through all the features associated to the current example
-    valid_answers = list()
+    valid_answers: list[dict] = list()
     for feature_index in feature_indices:
         # Get the features associated with the current example
         features = prepared_dataset[feature_index]
@@ -614,12 +642,12 @@
 def find_valid_answers(
     start_logits: np.ndarray,
     end_logits: np.ndarray,
-    offset_mapping:
+    offset_mapping: c.Sequence[tuple[int, int]],
     context: str,
     max_answer_length: int,
     num_best_logits: int,
     min_null_score: float,
-) ->
+) -> c.Sequence[dict]:
     """Find the valid answers from the start and end indexes.

     Args:
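
Both `prepare_train_examples` and `prepare_test_examples` rely on Hugging Face's overflow handling to split long contexts into overlapping windows. The snippet below is a generic illustration of that pattern with a stock fast tokeniser; the checkpoint name and example texts are placeholders, not anything EuroEval prescribes:

```python
from transformers import AutoTokenizer

# Any fast tokenizer works here; EuroEval uses the tokeniser of the model under test.
tokeniser = AutoTokenizer.from_pretrained("distilbert-base-uncased")

question = "Where is the Eiffel Tower?"
context = "The Eiffel Tower is in Paris. " * 60  # long enough to overflow one window

encoded = tokeniser(
    text=[question],
    text_pair=[context],
    truncation="only_second",  # only ever truncate the context, never the question
    max_length=128,
    stride=32,  # overlap between consecutive context windows
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding="max_length",
)

# One question/context pair can yield several features; this mapping records which
# original example each feature came from, just as the functions above pop it.
print(len(encoded["input_ids"]), encoded["overflow_to_sample_mapping"])
```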