EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. euroeval/__init__.py +32 -14
  2. euroeval/benchmark_config_factory.py +92 -180
  3. euroeval/benchmark_modules/base.py +49 -39
  4. euroeval/benchmark_modules/fresh.py +35 -21
  5. euroeval/benchmark_modules/hf.py +280 -244
  6. euroeval/benchmark_modules/litellm.py +752 -312
  7. euroeval/benchmark_modules/vllm.py +570 -268
  8. euroeval/benchmarker.py +651 -528
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +49 -38
  12. euroeval/constants.py +44 -25
  13. euroeval/data_loading.py +111 -55
  14. euroeval/data_models.py +490 -323
  15. euroeval/dataset_configs/__init__.py +26 -4
  16. euroeval/dataset_configs/bosnian.py +39 -0
  17. euroeval/dataset_configs/bulgarian.py +56 -0
  18. euroeval/dataset_configs/croatian.py +56 -0
  19. euroeval/dataset_configs/czech.py +75 -0
  20. euroeval/dataset_configs/danish.py +78 -50
  21. euroeval/dataset_configs/dutch.py +74 -44
  22. euroeval/dataset_configs/english.py +71 -36
  23. euroeval/dataset_configs/estonian.py +111 -0
  24. euroeval/dataset_configs/faroese.py +25 -18
  25. euroeval/dataset_configs/finnish.py +63 -26
  26. euroeval/dataset_configs/french.py +65 -32
  27. euroeval/dataset_configs/german.py +77 -36
  28. euroeval/dataset_configs/greek.py +64 -0
  29. euroeval/dataset_configs/icelandic.py +68 -57
  30. euroeval/dataset_configs/italian.py +68 -36
  31. euroeval/dataset_configs/latvian.py +87 -0
  32. euroeval/dataset_configs/lithuanian.py +64 -0
  33. euroeval/dataset_configs/norwegian.py +98 -72
  34. euroeval/dataset_configs/polish.py +96 -0
  35. euroeval/dataset_configs/portuguese.py +63 -40
  36. euroeval/dataset_configs/serbian.py +64 -0
  37. euroeval/dataset_configs/slovak.py +55 -0
  38. euroeval/dataset_configs/slovene.py +56 -0
  39. euroeval/dataset_configs/spanish.py +68 -34
  40. euroeval/dataset_configs/swedish.py +82 -41
  41. euroeval/dataset_configs/ukrainian.py +64 -0
  42. euroeval/enums.py +12 -6
  43. euroeval/exceptions.py +21 -1
  44. euroeval/finetuning.py +34 -26
  45. euroeval/generation.py +76 -41
  46. euroeval/generation_utils.py +169 -34
  47. euroeval/languages.py +1020 -188
  48. euroeval/logging_utils.py +268 -0
  49. euroeval/metrics/__init__.py +6 -0
  50. euroeval/metrics/base.py +85 -0
  51. euroeval/metrics/huggingface.py +216 -0
  52. euroeval/metrics/llm_as_a_judge.py +260 -0
  53. euroeval/metrics/pipeline.py +289 -0
  54. euroeval/metrics/speed.py +48 -0
  55. euroeval/model_cache.py +40 -21
  56. euroeval/model_config.py +4 -5
  57. euroeval/model_loading.py +3 -0
  58. euroeval/prompt_templates/__init__.py +2 -0
  59. euroeval/prompt_templates/classification.py +206 -0
  60. euroeval/prompt_templates/linguistic_acceptability.py +157 -22
  61. euroeval/prompt_templates/multiple_choice.py +159 -17
  62. euroeval/prompt_templates/named_entity_recognition.py +318 -21
  63. euroeval/prompt_templates/reading_comprehension.py +207 -16
  64. euroeval/prompt_templates/sentiment_classification.py +205 -22
  65. euroeval/prompt_templates/summarization.py +122 -22
  66. euroeval/prompt_templates/token_classification.py +279 -0
  67. euroeval/scores.py +20 -9
  68. euroeval/speed_benchmark.py +11 -12
  69. euroeval/task_group_utils/multiple_choice_classification.py +21 -12
  70. euroeval/task_group_utils/question_answering.py +101 -73
  71. euroeval/task_group_utils/sequence_classification.py +144 -61
  72. euroeval/task_group_utils/text_to_text.py +33 -12
  73. euroeval/task_group_utils/token_classification.py +86 -89
  74. euroeval/tasks.py +75 -16
  75. euroeval/tokenisation_utils.py +603 -0
  76. euroeval/types.py +17 -11
  77. euroeval/utils.py +332 -137
  78. euroeval-16.7.1.dist-info/METADATA +623 -0
  79. euroeval-16.7.1.dist-info/RECORD +84 -0
  80. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
  81. euroeval/human_evaluation.py +0 -737
  82. euroeval/metrics.py +0 -452
  83. euroeval/tokenization_utils.py +0 -498
  84. euroeval-15.12.0.dist-info/METADATA +0 -285
  85. euroeval-15.12.0.dist-info/RECORD +0 -63
  86. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
  87. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0

euroeval/speed_benchmark.py
@@ -1,26 +1,25 @@
 """Benchmarking model inference speed."""
 
+import collections.abc as c
 import logging
 import typing as t
 
 import pyinfer
-from tqdm.auto import tqdm
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
 from .benchmark_modules import HuggingFaceEncoderModel, LiteLLMModel, VLLMModel
 from .exceptions import InvalidBenchmark
+from .logging_utils import get_pbar, log
 from .utils import clear_memory
 
 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def benchmark_speed(
     model: "BenchmarkModule", benchmark_config: "BenchmarkConfig"
-) -> list[dict[str, float]]:
+) -> c.Sequence[dict[str, float]]:
     """Benchmark model inference speed.
 
     Args:
@@ -33,7 +32,7 @@ def benchmark_speed(
         Dictionary of scores.
     """
     scores: list[dict[str, float]] = list()
-    for idx in tqdm(
+    for idx in get_pbar(
         iterable=range(benchmark_config.num_iterations),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -41,7 +40,7 @@ def benchmark_speed(
         itr_scores = benchmark_speed_single_iteration(model=model, itr_idx=idx)
         clear_memory()
         scores.append(itr_scores)
-        logger.debug(f"Scores for iteration {idx}: {itr_scores}")
+        log(f"Scores for iteration {idx}: {itr_scores}", level=logging.DEBUG)
     return scores
 
 
@@ -59,7 +58,7 @@ def benchmark_speed_single_iteration(
     Returns:
        A dictionary containing the scores for the current iteration.
    """
-    gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)
+    gpt2_tokeniser = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)
 
    base_doc = "Document which contains roughly 10 tokens. "
    multiplier = 10 * (1 + itr_idx)
@@ -74,11 +73,11 @@ def benchmark_speed_single_iteration(
        model.generate(inputs=dict(text=[doc]))
 
    def encoder_predict(doc: str) -> None:
-        tokenizer = model.get_tokenizer()
+        tokeniser = model.get_tokeniser()
        pytorch_model = model.get_pytorch_module()
        inputs = {
            key: tensor.to(pytorch_model.device)
-            for key, tensor in tokenizer(
+            for key, tensor in tokeniser(
                text=[doc], truncation=True, return_tensors="pt"
            ).items()
        }
@@ -102,21 +101,21 @@ def benchmark_speed_single_iteration(
        speed_scores = pyinfer.InferenceReport(
            model=predict, inputs=doc, n_seconds=3
        ).run(print_report=False)
-        num_gpt2_tokens = len(gpt2_tokenizer([doc], truncation=True)["input_ids"][0])
+        num_gpt2_tokens = len(gpt2_tokeniser([doc], truncation=True)["input_ids"][0])
        gpt2_tokens_per_second = speed_scores["Infer(p/sec)"] * num_gpt2_tokens
 
        speed_scores_short = pyinfer.InferenceReport(
            model=predict, inputs=short_doc, n_seconds=3
        ).run(print_report=False)
        num_gpt2_tokens_short = len(
-            gpt2_tokenizer([short_doc], truncation=True)["input_ids"][0]
+            gpt2_tokeniser([short_doc], truncation=True)["input_ids"][0]
        )
        gpt2_tokens_per_second_short = (
            speed_scores_short["Infer(p/sec)"] * num_gpt2_tokens_short
        )
 
    except (RuntimeError, ValueError, IndexError) as e:
-        raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}")
+        raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}") from e
 
    return dict(
        test_speed=gpt2_tokens_per_second, test_speed_short=gpt2_tokens_per_second_short
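
Editor's note: the speed score above is simply pyinfer's raw inference rate scaled by a GPT-2 token count. A minimal sketch of that arithmetic, with made-up numbers rather than real benchmark output:

# Hypothetical values, only to illustrate how test_speed is derived above.
infer_per_sec = 4.2        # pyinfer's "Infer(p/sec)" from the 3-second run
num_gpt2_tokens = 110      # tokens in the document under the GPT-2 tokeniser
test_speed = infer_per_sec * num_gpt2_tokens  # 462.0 GPT-2 tokens per second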

euroeval/task_group_utils/multiple_choice_classification.py
@@ -1,7 +1,7 @@
 """Utility functions related to the multiple-choice classification task group."""
 
+import collections.abc as c
 import hashlib
-import logging
 import re
 import typing as t
 from collections import defaultdict
@@ -18,8 +18,6 @@ if t.TYPE_CHECKING:
 
     from ..types import Labels, Predictions
 
-logger = logging.getLogger("euroeval")
-
 
 class MultipleChoiceClassificationTrainer(Trainer):
     """Trainer subclass for multiple-choice classification tasks."""
@@ -27,7 +25,7 @@ class MultipleChoiceClassificationTrainer(Trainer):
    def evaluate(  # type: ignore[override]
        self,
        eval_dataset: "Dataset | None" = None,
-        ignore_keys: list[str] | None = None,
+        ignore_keys: c.Sequence[str] | None = None,
        metric_key_prefix: str = "eval",
    ) -> dict[str, float]:
        """Evaluate the model on the given dataset.
@@ -94,15 +92,15 @@ class MultipleChoiceClassificationTrainer(Trainer):
 
 
 def prepare_examples(
-    examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
    """Prepare the features.
 
    Args:
        examples:
            The examples to prepare.
-        tokenizer:
-            The tokenizer to use to prepare the examples.
+        tokeniser:
+            The tokeniser to use to prepare the examples.
 
    Returns:
        The prepared examples.
@@ -110,12 +108,23 @@ def prepare_examples(
    doc: str = examples["text"][0]
    sections = doc.split("\n")
 
-    choice_idxs = [
+    candidate_choice_idxs = [
        idx
        for idx, section in enumerate(sections)
-        if re.match(pattern=r"^[a-e]\. ", string=section) is not None
+        if re.match(pattern=r"^[a-z0-9]+\. ", string=section) is not None
    ]
-    choices = [sections[idx] for idx in choice_idxs]
+
+    # Sometimes the question itself starts with a letter or number followed by a dot, We
+    # want to ignore these cases, and focus on the final contingent block of at least
+    # two choices.
+    choice_idxs: list[int] = list()
+    for idx in reversed(candidate_choice_idxs):
+        if len(choice_idxs) < 2 or (
+            len(choice_idxs) >= 2 and idx == choice_idxs[-1] - 1
+        ):
+            choice_idxs.append(idx)
+
+    choices = [sections[idx] for idx in reversed(choice_idxs)]
 
    # Check that the choices are present, and that all of them are at the end
    assert len(choices) > 0, "No choices found in the document."
@@ -127,7 +136,7 @@ def prepare_examples(
    question_idx = min(choice_idxs) - 2  # -2 to remove the 'Choices:' line
    context_and_question = "\n".join(sections[: question_idx + 1]).strip()
 
-    new_examples = tokenizer(
+    new_examples = tokeniser(
        text=[context_and_question] * len(choices),
        text_pair=[choice[3:] for choice in choices],
        padding=True,
@@ -135,7 +144,7 @@ def prepare_examples(
    )
    new_examples["label"] = [
        int(choice.startswith(f"{letter}. ") and letter == examples["label"][0])
-        for letter, choice in zip("abcde", choices)
+        for letter, choice in zip("abcdefghijklmnopqrstuvwxyz", choices)
    ]
    new_examples["id"] = [hashlib.md5(string=doc.encode()).hexdigest()] * len(choices)
    return new_examples
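
Editor's note: to make the new choice-detection behaviour concrete, here is a small self-contained sketch of the same selection rule. The sample document is made up; the point is that the widened r"^[a-z0-9]+\. " pattern also matches a question that happens to start with "1. ", so only the final contiguous block of at least two matches is kept as the choices:

import re

sections = [
    "1. Which city is the capital of Denmark?",  # matches the pattern but is not a choice
    "Choices:",
    "a. Oslo",
    "b. Copenhagen",
    "c. Stockholm",
]

candidate_choice_idxs = [
    idx
    for idx, section in enumerate(sections)
    if re.match(pattern=r"^[a-z0-9]+\. ", string=section) is not None
]  # [0, 2, 3, 4]

# Walk backwards, keeping only the final contiguous block of at least two matches.
choice_idxs: list[int] = list()
for idx in reversed(candidate_choice_idxs):
    if len(choice_idxs) < 2 or (
        len(choice_idxs) >= 2 and idx == choice_idxs[-1] - 1
    ):
        choice_idxs.append(idx)

choices = [sections[idx] for idx in reversed(choice_idxs)]
print(choices)  # ['a. Oslo', 'b. Copenhagen', 'c. Stockholm']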

euroeval/task_group_utils/question_answering.py
@@ -1,16 +1,18 @@
 """Utility functions related to the question-answering task group."""
 
 import collections.abc as c
-import logging
 import typing as t
 from collections import defaultdict
 
 import numpy as np
-from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+from transformers.tokenization_utils_base import (
+    PreTrainedTokenizerBase,
+    TruncationStrategy,
+)
 from transformers.trainer import Trainer
 
 from ..exceptions import InvalidBenchmark
-from ..tokenization_utils import get_special_token_metadata
+from ..tokenisation_utils import get_special_token_metadata
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
@@ -23,11 +25,9 @@ if t.TYPE_CHECKING:
    from transformers.trainer_utils import EvalPrediction
    from transformers.training_args import TrainingArguments
 
-    from ..data_models import DatasetConfig, GenerativeModelOutput
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
    from ..types import Labels, Predictions
 
-logger = logging.getLogger("euroeval")
-
 
 class QuestionAnsweringTrainer(Trainer):
    """Trainer subclass for question answering tasks."""
@@ -40,7 +40,7 @@ class QuestionAnsweringTrainer(Trainer):
        train_dataset: "Dataset",
        eval_dataset: "Dataset",
        compute_metrics: "c.Callable[[EvalPrediction], dict[str, float]]",
-        callbacks: "list[TrainerCallback]",
+        callbacks: "c.Sequence[TrainerCallback]",
        data_collator: "c.Callable",
        **kwargs,
    ) -> None:
@@ -57,7 +57,7 @@ class QuestionAnsweringTrainer(Trainer):
            **kwargs,
        )
 
-        # Get the CLS token id for the tokenizer
+        # Get the CLS token id for the tokeniser
        if self.tokenizer is not None:
            assert isinstance(self.tokenizer, PreTrainedTokenizerBase)
            special_token_metadata = get_special_token_metadata(self.tokenizer)
@@ -70,7 +70,7 @@ class QuestionAnsweringTrainer(Trainer):
        self,
        eval_dataset: "Dataset | None" = None,
        orig_eval_dataset: "Dataset | None" = None,
-        ignore_keys: list[str] | None = None,
+        ignore_keys: c.Sequence[str] | None = None,
        metric_key_prefix: str = "eval",
    ) -> dict[str, float]:
        """Evaluate the model on the given dataset.
@@ -149,6 +149,8 @@
 def compute_metrics(
    model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+    dataset: "Dataset",
 ) -> dict[str, float]:
    """Compute the metrics needed for evaluation.
 
@@ -158,6 +160,11 @@ def compute_metrics(
            contains the true labels.
        dataset_config:
            The configuration of the dataset.
+        benchmark_config:
+            The configuration of the benchmark.
+        dataset:
+            The dataset used for evaluation. This is only used in case any additional
+            metadata is used to compute the metrics.
 
    Returns:
        A dictionary with the names of the metrics as keys and the metric values as
@@ -181,7 +188,13 @@
 
    results: dict[str, float] = dict()
    for metric in dataset_config.task.metrics:
-        score: float | None = metric(predictions=predictions, references=labels)
+        score: float | None = metric(
+            predictions=predictions,
+            references=labels,
+            dataset=dataset,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
+        )
 
        # The metric returns None if we are running on multi-GPU and the current
        # process is not the main process
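
Editor's note: as the hunk above shows, metric objects are now invoked with the evaluation dataset and both configuration objects, not just predictions and references, and they may return None when the current process is not the main one in a multi-GPU run. Purely as an illustration of that calling convention (this is not one of the real metric classes under euroeval/metrics/), a conforming callable could look like:

# Hypothetical stand-in for a EuroEval metric callable; names are illustrative only.
def toy_exact_match(
    predictions, references, dataset=None, dataset_config=None, benchmark_config=None
) -> float | None:
    # The extra keyword arguments are accepted so the call in compute_metrics works;
    # a real metric can use them for extra metadata, or return None on non-main ranks.
    hits = sum(pred == ref for pred, ref in zip(predictions, references))
    return hits / max(len(references), 1)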
@@ -193,7 +206,7 @@ def compute_metrics(
 
 def extract_labels_from_generation(
    input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-) -> list[t.Any]:
+) -> c.Sequence[t.Any]:
    """Extract the predicted labels from the generated output.
 
    Args:
@@ -215,15 +228,15 @@
 
 
 def prepare_train_examples(
-    examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
    """Prepare the features for training.
 
    Args:
        examples:
            The examples to prepare.
-        tokenizer:
-            The tokenizer to use to prepare the examples.
+        tokeniser:
+            The tokeniser to use to prepare the examples.
 
    Returns:
        The prepared examples.
@@ -233,37 +246,40 @@ def prepare_train_examples(
    # take a lots of space). So we remove that left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]
 
-    # Extract special token metadata from the tokenizer
-    special_token_metadata = get_special_token_metadata(tokenizer=tokenizer)
+    # Extract special token metadata from the tokeniser
+    special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
    has_cls_token = special_token_metadata["has_cls_token"]
    has_sep_token = special_token_metadata["has_sep_token"]
    cls_token_id = special_token_metadata["cls_token_id"]
    cls_token = special_token_metadata["cls_token"]
    sep_token = special_token_metadata["sep_token"]
 
-    # If the tokenizer is not adding special tokens, then we add them manually
+    # If the tokeniser is not adding special tokens, then we add them manually
    if not has_cls_token and not has_sep_token:
        examples["question"] = [
            f"{cls_token}{q}{sep_token}" for q in examples["question"]
        ]
        examples["context"] = [f"{c}{sep_token}" for c in examples["context"]]
 
-    # Set the stride used during tokenization, when the context is long enough to be
+    # Set the stride used during tokenisation, when the context is long enough to be
    # split into several features. Since we are always keeping the question tokens, we
    # need to make sure that the stride does not exceed the resulting maximum context
    # length.
-    max_question_tokens = max(len(tokenizer(q).input_ids) for q in examples["question"])
+    max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
    num_special_tokens = int(has_cls_token) + int(has_sep_token)
-    stride = tokenizer.model_max_length // 4
-    max_length = tokenizer.model_max_length - stride
-    stride = min(stride, max_length - max_question_tokens - num_special_tokens)
-    max_length = tokenizer.model_max_length - stride
+    stride = tokeniser.model_max_length // 4
+    stride = min(
+        stride,
+        tokeniser.model_max_length - stride - max_question_tokens - num_special_tokens,
+    )
+    stride = max(stride, 0)
+    max_length = tokeniser.model_max_length - stride
 
-    # Tokenize our examples with truncation and padding, but keep the overflows using a
+    # Tokenise our examples with truncation and padding, but keep the overflows using a
    # stride. This results in one example possible giving several features when a
    # context is long, each of those features having a context that overlaps a bit the
    # context of the previous feature.
-    tokenized_examples = tokenizer(
+    tokenised_examples = tokeniser(
        text=examples["question"],
        text_pair=examples["context"],
        truncation="only_second",
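
Editor's note: the reworked stride logic above clamps the stride so it never exceeds the room left for the question and special tokens, and never goes negative. A quick trace with made-up numbers (not taken from any specific model) shows the effect:

# Hypothetical numbers, only to trace the stride arithmetic introduced above.
model_max_length = 512
max_question_tokens = 40
num_special_tokens = 2

stride = model_max_length // 4  # 128
stride = min(
    stride, model_max_length - stride - max_question_tokens - num_special_tokens
)  # min(128, 512 - 128 - 40 - 2) = min(128, 342) = 128
stride = max(stride, 0)  # unchanged here; the guard matters for tiny context windows
max_length = model_max_length - stride  # 384 tokens per feature

# With a small window, e.g. model_max_length = 128 and a 120-token question, the
# second term is negative (128 - 32 - 120 - 2 = -26), so the new max(stride, 0)
# guard keeps the stride at 0 instead of passing a negative stride to the tokeniser.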
@@ -277,32 +293,32 @@ def prepare_train_examples(
    # Since one example might give us several features if it has a long context, we
    # need a map from a feature to its corresponding example. This key gives us just
    # that
-    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+    sample_mapping = tokenised_examples.pop("overflow_to_sample_mapping")
 
    # The offset mappings will give us a map from token to character position in the
    # original context. This will help us compute the start_positions and
    # end_positions.
-    offset_mapping = tokenized_examples.pop("offset_mapping")
+    offset_mapping = tokenised_examples.pop("offset_mapping")
 
    # Initialise the start- and end positions of the answers
-    tokenized_examples["start_positions"] = list()
-    tokenized_examples["end_positions"] = list()
+    tokenised_examples["start_positions"] = list()
+    tokenised_examples["end_positions"] = list()
 
    for i, offsets in enumerate(offset_mapping):
        # Get the input IDs for the current example
-        input_ids = tokenized_examples.input_ids[i]
+        input_ids = tokenised_examples.input_ids[i]
 
        # We will label impossible answers with the index of the CLS token
        cls_index = input_ids.index(cls_token_id)
 
        # Grab the sequence corresponding to that example (to know what is the context
        # and what is the question).
-        sequence_ids = tokenized_examples.sequence_ids(i)
+        sequence_ids = tokenised_examples.sequence_ids(i)
 
        # Manually ensure that the special tokens are set to None in `sequence_ids`
-        for special_token in tokenizer.special_tokens_map.keys():
-            if hasattr(tokenizer, f"{special_token}_id"):
-                special_token_id = getattr(tokenizer, f"{special_token}_id")
+        for special_token in tokeniser.special_tokens_map.keys():
+            if hasattr(tokeniser, f"{special_token}_id"):
+                special_token_id = getattr(tokeniser, f"{special_token}_id")
                if special_token_id is not None:
                    sequence_ids = [
                        None if token_id == special_token_id else seq_id
@@ -316,8 +332,8 @@ def prepare_train_examples(
 
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
-            tokenized_examples.start_positions.append(cls_index)
-            tokenized_examples.end_positions.append(cls_index)
+            tokenised_examples.start_positions.append(cls_index)
+            tokenised_examples.end_positions.append(cls_index)
 
        else:
            # Start/end character index of the answer in the text.
@@ -325,9 +341,17 @@ def prepare_train_examples(
            end_char = start_char + len(answers["text"][0])
 
            # Start token index of the current span in the text.
-            token_start_index = 0
-            while sequence_ids[token_start_index] != 1:
-                token_start_index += 1
+            try:
+                token_start_index = 0
+                while sequence_ids[token_start_index] != 1:
+                    token_start_index += 1
+
+            # If it turns out that we cannot find the context in the span, then we
+            # treat this as an impossible case
+            except IndexError:
+                tokenised_examples.start_positions.append(cls_index)
+                tokenised_examples.end_positions.append(cls_index)
+                continue
 
            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
@@ -340,8 +364,8 @@ def prepare_train_examples(
                offsets[token_start_index][0] <= start_char
                and offsets[token_end_index][1] >= end_char
            ):
-                tokenized_examples.start_positions.append(cls_index)
-                tokenized_examples.end_positions.append(cls_index)
+                tokenised_examples.start_positions.append(cls_index)
+                tokenised_examples.end_positions.append(cls_index)
 
            # Otherwise move the token_start_index and token_end_index to the two ends
            # of the answer. Note: we could go after the last offset if the answer is
@@ -353,71 +377,75 @@ def prepare_train_examples(
                ):
                    token_start_index += 1
                token_start_index -= 1
-                tokenized_examples.start_positions.append(token_start_index)
+                tokenised_examples.start_positions.append(token_start_index)
                while (
                    token_start_index <= token_end_index
                    and offsets[token_end_index][1] >= end_char
                ):
                    token_end_index -= 1
                token_end_index += 1
-                tokenized_examples.end_positions.append(token_end_index)
+                tokenised_examples.end_positions.append(token_end_index)
                assert token_end_index >= token_start_index
 
-    return tokenized_examples
+    return tokenised_examples
 
 
 def prepare_test_examples(
-    examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
    """Prepare test examples.
 
    Args:
        examples:
            Dictionary of test examples.
-        tokenizer:
-            The tokenizer used to preprocess the examples.
+        tokeniser:
+            The tokeniser used to preprocess the examples.
 
    Returns:
        The prepared test examples.
    """
    # Some of the questions have lots of whitespace on the left, which is not useful
-    # and will make the truncation of the context fail (the tokenized question will
+    # and will make the truncation of the context fail (the tokenised question will
    # take a lots of space). So we remove that left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]
 
-    # Extract special token metadata from the tokenizer
-    special_token_metadata = get_special_token_metadata(tokenizer=tokenizer)
+    # Extract special token metadata from the tokeniser
+    special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
    has_cls_token = special_token_metadata["has_cls_token"]
    has_sep_token = special_token_metadata["has_sep_token"]
    cls_token = special_token_metadata["cls_token"]
    sep_token = special_token_metadata["sep_token"]
 
-    # If the tokenizer is not adding special tokens, then we add them manually
+    # If the tokeniser is not adding special tokens, then we add them manually
    if not has_cls_token and not has_sep_token:
        examples["question"] = [
            f"{cls_token}{q}{sep_token}" for q in examples["question"]
        ]
        examples["context"] = [f"{c}{sep_token}" for c in examples["context"]]
 
-    # Set the stride used during tokenization, when the context is long enough to be
+    # Set the stride used during tokenisation, when the context is long enough to be
    # split into several features. Since we are always keeping the question tokens, we
    # need to make sure that the stride does not exceed the resulting maximum context
    # length.
-    max_question_tokens = max(len(tokenizer(q).input_ids) for q in examples["question"])
+    max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
    num_special_tokens = int(has_cls_token) + int(has_sep_token)
-    stride = tokenizer.model_max_length // 4
-    max_length = tokenizer.model_max_length - stride
-    stride = min(stride, max_length - max_question_tokens - num_special_tokens)
-    max_length = tokenizer.model_max_length - stride
+    stride = tokeniser.model_max_length // 4
+    stride = min(
+        stride,
+        tokeniser.model_max_length - stride - max_question_tokens - num_special_tokens,
+    )
+    stride = max(stride, 0)
+    max_length = tokeniser.model_max_length - stride
+    max_length = max(max_length, 0)
 
-    # Tokenize our examples with truncation and maybe padding, but keep the overflows
+    # Tokenise our examples with truncation and maybe padding, but keep the overflows
    # using a stride. This results in one example possible giving several features when
    # a context is long, each of those features having a context that overlaps a bit
    # the context of the previous feature.
-    tokenized_examples = tokenizer(
+    tokenised_examples = tokeniser(
        text=examples["question"],
        text_pair=examples["context"],
-        truncation="only_second",
+        truncation=TruncationStrategy.LONGEST_FIRST,
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
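
Editor's note: the switch from truncation="only_second" to TruncationStrategy.LONGEST_FIRST at test time changes which sequence gets trimmed when question plus context exceed max_length. A small sketch of the difference using the Hugging Face tokeniser API directly (bert-base-cased is just an arbitrary fast tokeniser here, not something EuroEval prescribes):

from transformers import AutoTokenizer
from transformers.tokenization_utils_base import TruncationStrategy

tok = AutoTokenizer.from_pretrained("bert-base-cased")
question = "Who wrote the report?"
context = "A very long context sentence. " * 200

# "only_second" always keeps the full question and trims tokens from the context only.
only_second = tok(question, context, truncation="only_second", max_length=64)

# LONGEST_FIRST trims whichever sequence is currently longer, one token at a time, so
# an unusually long question can no longer push the encoding past max_length on its own.
longest_first = tok(
    question, context, truncation=TruncationStrategy.LONGEST_FIRST, max_length=64
)

print(len(only_second["input_ids"]), len(longest_first["input_ids"]))  # both 64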
@@ -428,30 +456,30 @@ def prepare_test_examples(
    # Since one example might give us several features if it has a long context, we
    # need a map from a feature to its corresponding example. This key gives us just
    # that.
-    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+    sample_mapping = tokenised_examples.pop("overflow_to_sample_mapping")
 
    # We keep the id that gave us this feature and we will store the offset mappings.
-    tokenized_examples["id"] = list()
+    tokenised_examples["id"] = list()
 
-    for i in range(len(tokenized_examples.input_ids)):
+    for i in range(len(tokenised_examples.input_ids)):
        # Grab the sequence corresponding to that example (to know what is the context
        # and what is the question).
-        sequence_ids = tokenized_examples.sequence_ids(i)
+        sequence_ids = tokenised_examples.sequence_ids(i)
        context_index = 1
 
        # One example can give several spans, this is the index of the example
        # containing this span of text.
        sample_index = sample_mapping[i]
-        tokenized_examples.id.append(examples["id"][sample_index])
+        tokenised_examples.id.append(examples["id"][sample_index])
 
        # Set to (-1, -1) the offset_mapping that are not part of the context so it's
        # easy to determine if a token position is part of the context or not.
-        tokenized_examples.offset_mapping[i] = [
+        tokenised_examples.offset_mapping[i] = [
            (o if sequence_ids[k] == context_index else (-1, -1))
-            for k, o in enumerate(tokenized_examples.offset_mapping[i])
+            for k, o in enumerate(tokenised_examples.offset_mapping[i])
        ]
 
-    return tokenized_examples
+    return tokenised_examples
 
 
 def postprocess_predictions_and_labels(
@@ -459,7 +487,7 @@ def postprocess_predictions_and_labels(
    dataset: "Dataset",
    prepared_dataset: "Dataset",
    cls_token_index: int,
-) -> tuple[list[dict], list[dict]]:
+) -> tuple[c.Sequence[dict], c.Sequence[dict]]:
    """Postprocess the predictions and labels, to allow easier metric computation.
 
    Args:
@@ -540,7 +568,7 @@ def find_best_answer(
    all_start_logits: np.ndarray,
    all_end_logits: np.ndarray,
    prepared_dataset: "Dataset",
-    feature_indices: list[int],
+    feature_indices: c.Sequence[int],
    context: str,
    max_answer_length: int,
    num_best_logits: int,
@@ -573,7 +601,7 @@ def find_best_answer(
        The best answer for the example.
    """
    # Loop through all the features associated to the current example
-    valid_answers = list()
+    valid_answers: list[dict] = list()
    for feature_index in feature_indices:
        # Get the features associated with the current example
        features = prepared_dataset[feature_index]
@@ -614,12 +642,12 @@
 def find_valid_answers(
    start_logits: np.ndarray,
    end_logits: np.ndarray,
-    offset_mapping: list[tuple[int, int]],
+    offset_mapping: c.Sequence[tuple[int, int]],
    context: str,
    max_answer_length: int,
    num_best_logits: int,
    min_null_score: float,
-) -> list[dict]:
+) -> c.Sequence[dict]:
    """Find the valid answers from the start and end indexes.
 
    Args: