EuroEval 15.12.0-py3-none-any.whl → 16.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +32 -14
- euroeval/benchmark_config_factory.py +92 -180
- euroeval/benchmark_modules/base.py +49 -39
- euroeval/benchmark_modules/fresh.py +35 -21
- euroeval/benchmark_modules/hf.py +280 -244
- euroeval/benchmark_modules/litellm.py +752 -312
- euroeval/benchmark_modules/vllm.py +570 -268
- euroeval/benchmarker.py +651 -528
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +49 -38
- euroeval/constants.py +44 -25
- euroeval/data_loading.py +111 -55
- euroeval/data_models.py +490 -323
- euroeval/dataset_configs/__init__.py +26 -4
- euroeval/dataset_configs/bosnian.py +39 -0
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/croatian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +78 -50
- euroeval/dataset_configs/dutch.py +74 -44
- euroeval/dataset_configs/english.py +71 -36
- euroeval/dataset_configs/estonian.py +111 -0
- euroeval/dataset_configs/faroese.py +25 -18
- euroeval/dataset_configs/finnish.py +63 -26
- euroeval/dataset_configs/french.py +65 -32
- euroeval/dataset_configs/german.py +77 -36
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +68 -57
- euroeval/dataset_configs/italian.py +68 -36
- euroeval/dataset_configs/latvian.py +87 -0
- euroeval/dataset_configs/lithuanian.py +64 -0
- euroeval/dataset_configs/norwegian.py +98 -72
- euroeval/dataset_configs/polish.py +96 -0
- euroeval/dataset_configs/portuguese.py +63 -40
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/slovene.py +56 -0
- euroeval/dataset_configs/spanish.py +68 -34
- euroeval/dataset_configs/swedish.py +82 -41
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/enums.py +12 -6
- euroeval/exceptions.py +21 -1
- euroeval/finetuning.py +34 -26
- euroeval/generation.py +76 -41
- euroeval/generation_utils.py +169 -34
- euroeval/languages.py +1020 -188
- euroeval/logging_utils.py +268 -0
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +85 -0
- euroeval/metrics/huggingface.py +216 -0
- euroeval/metrics/llm_as_a_judge.py +260 -0
- euroeval/metrics/pipeline.py +289 -0
- euroeval/metrics/speed.py +48 -0
- euroeval/model_cache.py +40 -21
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +157 -22
- euroeval/prompt_templates/multiple_choice.py +159 -17
- euroeval/prompt_templates/named_entity_recognition.py +318 -21
- euroeval/prompt_templates/reading_comprehension.py +207 -16
- euroeval/prompt_templates/sentiment_classification.py +205 -22
- euroeval/prompt_templates/summarization.py +122 -22
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +20 -9
- euroeval/speed_benchmark.py +11 -12
- euroeval/task_group_utils/multiple_choice_classification.py +21 -12
- euroeval/task_group_utils/question_answering.py +101 -73
- euroeval/task_group_utils/sequence_classification.py +144 -61
- euroeval/task_group_utils/text_to_text.py +33 -12
- euroeval/task_group_utils/token_classification.py +86 -89
- euroeval/tasks.py +75 -16
- euroeval/tokenisation_utils.py +603 -0
- euroeval/types.py +17 -11
- euroeval/utils.py +332 -137
- euroeval-16.7.1.dist-info/METADATA +623 -0
- euroeval-16.7.1.dist-info/RECORD +84 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -737
- euroeval/metrics.py +0 -452
- euroeval/tokenization_utils.py +0 -498
- euroeval-15.12.0.dist-info/METADATA +0 -285
- euroeval-15.12.0.dist-info/RECORD +0 -63
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/speed_benchmark.py
CHANGED
@@ -1,26 +1,25 @@
 """Benchmarking model inference speed."""

+import collections.abc as c
 import logging
 import typing as t

 import pyinfer
-from tqdm.auto import tqdm
 from transformers.models.auto.tokenization_auto import AutoTokenizer

 from .benchmark_modules import HuggingFaceEncoderModel, LiteLLMModel, VLLMModel
 from .exceptions import InvalidBenchmark
+from .logging_utils import get_pbar, log
 from .utils import clear_memory

 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig

-logger = logging.getLogger("euroeval")
-

 def benchmark_speed(
     model: "BenchmarkModule", benchmark_config: "BenchmarkConfig"
-) ->
+) -> c.Sequence[dict[str, float]]:
     """Benchmark model inference speed.

     Args:
@@ -33,7 +32,7 @@ def benchmark_speed(
         Dictionary of scores.
     """
     scores: list[dict[str, float]] = list()
-    for idx in
+    for idx in get_pbar(
         iterable=range(benchmark_config.num_iterations),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -41,7 +40,7 @@
         itr_scores = benchmark_speed_single_iteration(model=model, itr_idx=idx)
         clear_memory()
         scores.append(itr_scores)
-
+        log(f"Scores for iteration {idx}: {itr_scores}", level=logging.DEBUG)
     return scores


@@ -59,7 +58,7 @@ def benchmark_speed_single_iteration(
     Returns:
         A dictionary containing the scores for the current iteration.
     """
-
+    gpt2_tokeniser = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)

     base_doc = "Document which contains roughly 10 tokens. "
     multiplier = 10 * (1 + itr_idx)
@@ -74,11 +73,11 @@ def benchmark_speed_single_iteration(
         model.generate(inputs=dict(text=[doc]))

     def encoder_predict(doc: str) -> None:
-
+        tokeniser = model.get_tokeniser()
         pytorch_model = model.get_pytorch_module()
         inputs = {
             key: tensor.to(pytorch_model.device)
-            for key, tensor in
+            for key, tensor in tokeniser(
                 text=[doc], truncation=True, return_tensors="pt"
             ).items()
         }
@@ -102,21 +101,21 @@ def benchmark_speed_single_iteration(
         speed_scores = pyinfer.InferenceReport(
             model=predict, inputs=doc, n_seconds=3
         ).run(print_report=False)
-        num_gpt2_tokens = len(
+        num_gpt2_tokens = len(gpt2_tokeniser([doc], truncation=True)["input_ids"][0])
         gpt2_tokens_per_second = speed_scores["Infer(p/sec)"] * num_gpt2_tokens

         speed_scores_short = pyinfer.InferenceReport(
             model=predict, inputs=short_doc, n_seconds=3
         ).run(print_report=False)
         num_gpt2_tokens_short = len(
-
+            gpt2_tokeniser([short_doc], truncation=True)["input_ids"][0]
         )
         gpt2_tokens_per_second_short = (
             speed_scores_short["Infer(p/sec)"] * num_gpt2_tokens_short
         )

     except (RuntimeError, ValueError, IndexError) as e:
-        raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}")
+        raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}") from e

     return dict(
         test_speed=gpt2_tokens_per_second, test_speed_short=gpt2_tokens_per_second_short
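
For orientation, the speed score produced by this module is the number of model calls per second that `pyinfer` measures over a fixed wall-clock budget, scaled by the GPT-2 token count of the benchmarked document, as the hunks above show. A minimal, self-contained sketch of that calculation follows; the `predict` stub stands in for the real model call and is not EuroEval code:

```python
import pyinfer
from transformers import AutoTokenizer


def predict(doc: str) -> None:
    """Stub standing in for the model call being timed."""
    _ = len(doc)


doc = "Document which contains roughly 10 tokens. " * 10
gpt2_tokeniser = AutoTokenizer.from_pretrained("gpt2")

# Time the callable for ~3 seconds and read off the inferences-per-second figure.
report = pyinfer.InferenceReport(model=predict, inputs=doc, n_seconds=3).run(
    print_report=False
)

# Scale by the GPT-2 token count of the document to get a tokens-per-second score.
num_gpt2_tokens = len(gpt2_tokeniser([doc], truncation=True)["input_ids"][0])
print(report["Infer(p/sec)"] * num_gpt2_tokens)
```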

euroeval/task_group_utils/multiple_choice_classification.py
CHANGED

@@ -1,7 +1,7 @@
 """Utility functions related to the multiple-choice classification task group."""

+import collections.abc as c
 import hashlib
-import logging
 import re
 import typing as t
 from collections import defaultdict
@@ -18,8 +18,6 @@ if t.TYPE_CHECKING:

     from ..types import Labels, Predictions

-logger = logging.getLogger("euroeval")
-

 class MultipleChoiceClassificationTrainer(Trainer):
     """Trainer subclass for multiple-choice classification tasks."""
@@ -27,7 +25,7 @@ class MultipleChoiceClassificationTrainer(Trainer):
     def evaluate( # type: ignore[override]
         self,
         eval_dataset: "Dataset | None" = None,
-        ignore_keys:
+        ignore_keys: c.Sequence[str] | None = None,
         metric_key_prefix: str = "eval",
     ) -> dict[str, float]:
         """Evaluate the model on the given dataset.
@@ -94,15 +92,15 @@ class MultipleChoiceClassificationTrainer(Trainer):


 def prepare_examples(
-    examples: "BatchEncoding",
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare the features.

     Args:
         examples:
             The examples to prepare.
-
-            The
+        tokeniser:
+            The tokeniser to use to prepare the examples.

     Returns:
         The prepared examples.
@@ -110,12 +108,23 @@ def prepare_examples(
     doc: str = examples["text"][0]
     sections = doc.split("\n")

-
+    candidate_choice_idxs = [
         idx
         for idx, section in enumerate(sections)
-        if re.match(pattern=r"^[a-
+        if re.match(pattern=r"^[a-z0-9]+\. ", string=section) is not None
     ]
-
+
+    # Sometimes the question itself starts with a letter or number followed by a dot, We
+    # want to ignore these cases, and focus on the final contingent block of at least
+    # two choices.
+    choice_idxs: list[int] = list()
+    for idx in reversed(candidate_choice_idxs):
+        if len(choice_idxs) < 2 or (
+            len(choice_idxs) >= 2 and idx == choice_idxs[-1] - 1
+        ):
+            choice_idxs.append(idx)
+
+    choices = [sections[idx] for idx in reversed(choice_idxs)]

     # Check that the choices are present, and that all of them are at the end
     assert len(choices) > 0, "No choices found in the document."
@@ -127,7 +136,7 @@ def prepare_examples(
     question_idx = min(choice_idxs) - 2 # -2 to remove the 'Choices:' line
     context_and_question = "\n".join(sections[: question_idx + 1]).strip()

-    new_examples =
+    new_examples = tokeniser(
         text=[context_and_question] * len(choices),
         text_pair=[choice[3:] for choice in choices],
         padding=True,
@@ -135,7 +144,7 @@ def prepare_examples(
     )
     new_examples["label"] = [
         int(choice.startswith(f"{letter}. ") and letter == examples["label"][0])
-        for letter, choice in zip("
+        for letter, choice in zip("abcdefghijklmnopqrstuvwxyz", choices)
     ]
     new_examples["id"] = [hashlib.md5(string=doc.encode()).hexdigest()] * len(choices)
     return new_examples
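
The choice-detection change above is the most intricate part of this file: rather than taking every line that looks like `a. ...`, `prepare_examples` now keeps only the final contiguous block of such lines, so a question that itself starts with `1. ` is not mistaken for an option. A small stand-alone sketch of that selection logic (the helper name and example document are illustrative, not EuroEval code):

```python
import re


def final_choice_block(sections: list[str]) -> list[str]:
    """Return the trailing contiguous run of 'a. ...' / '1. ...' lines.

    Mirrors the selection added to prepare_examples: the last two candidates are
    always taken, and earlier candidates are only kept while their indices stay
    contiguous with the block, so a numbered question further up is ignored.
    """
    candidate_idxs = [
        idx
        for idx, section in enumerate(sections)
        if re.match(r"^[a-z0-9]+\. ", section) is not None
    ]
    choice_idxs: list[int] = []
    for idx in reversed(candidate_idxs):
        if len(choice_idxs) < 2 or idx == choice_idxs[-1] - 1:
            choice_idxs.append(idx)
    return [sections[idx] for idx in reversed(choice_idxs)]


doc = "1. What is the capital of Denmark?\nChoices:\na. Oslo\nb. Copenhagen\nc. Stockholm"
print(final_choice_block(doc.split("\n")))
# ['a. Oslo', 'b. Copenhagen', 'c. Stockholm'] -- the numbered question is ignored
```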

euroeval/task_group_utils/question_answering.py
CHANGED

@@ -1,16 +1,18 @@
 """Utility functions related to the question-answering task group."""

 import collections.abc as c
-import logging
 import typing as t
 from collections import defaultdict

 import numpy as np
-from transformers.tokenization_utils_base import
+from transformers.tokenization_utils_base import (
+    PreTrainedTokenizerBase,
+    TruncationStrategy,
+)
 from transformers.trainer import Trainer

 from ..exceptions import InvalidBenchmark
-from ..
+from ..tokenisation_utils import get_special_token_metadata
 from ..utils import raise_if_model_output_contains_nan_values

 if t.TYPE_CHECKING:
@@ -23,11 +25,9 @@ if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
     from transformers.training_args import TrainingArguments

-    from ..data_models import DatasetConfig, GenerativeModelOutput
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions

-logger = logging.getLogger("euroeval")
-

 class QuestionAnsweringTrainer(Trainer):
     """Trainer subclass for question answering tasks."""
@@ -40,7 +40,7 @@ class QuestionAnsweringTrainer(Trainer):
         train_dataset: "Dataset",
         eval_dataset: "Dataset",
         compute_metrics: "c.Callable[[EvalPrediction], dict[str, float]]",
-        callbacks: "
+        callbacks: "c.Sequence[TrainerCallback]",
         data_collator: "c.Callable",
         **kwargs,
     ) -> None:
@@ -57,7 +57,7 @@ class QuestionAnsweringTrainer(Trainer):
             **kwargs,
         )

-        # Get the CLS token id for the
+        # Get the CLS token id for the tokeniser
         if self.tokenizer is not None:
             assert isinstance(self.tokenizer, PreTrainedTokenizerBase)
             special_token_metadata = get_special_token_metadata(self.tokenizer)
@@ -70,7 +70,7 @@ class QuestionAnsweringTrainer(Trainer):
         self,
         eval_dataset: "Dataset | None" = None,
         orig_eval_dataset: "Dataset | None" = None,
-        ignore_keys:
+        ignore_keys: c.Sequence[str] | None = None,
         metric_key_prefix: str = "eval",
     ) -> dict[str, float]:
         """Evaluate the model on the given dataset.
@@ -149,6 +149,8 @@ class QuestionAnsweringTrainer(Trainer):
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+    dataset: "Dataset",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.

@@ -158,6 +160,11 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
+        benchmark_config:
+            The configuration of the benchmark.
+        dataset:
+            The dataset used for evaluation. This is only used in case any additional
+            metadata is used to compute the metrics.

     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
@@ -181,7 +188,13 @@ def compute_metrics(

     results: dict[str, float] = dict()
     for metric in dataset_config.task.metrics:
-        score: float | None = metric(
+        score: float | None = metric(
+            predictions=predictions,
+            references=labels,
+            dataset=dataset,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
+        )

         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
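
The `compute_metrics` change above passes considerably more context to each metric than before: a metric now receives the predictions and references together with the evaluation dataset and the dataset and benchmark configurations as keyword arguments. The real metric objects live in the new `euroeval/metrics/` package, which is not shown in this section; the toy function below only illustrates the calling convention and is not EuroEval code:

```python
import typing as t


def toy_accuracy_metric(
    predictions: t.Sequence[t.Any],
    references: t.Sequence[t.Any],
    dataset: t.Any = None,
    dataset_config: t.Any = None,
    benchmark_config: t.Any = None,
) -> float | None:
    """Accept the keyword arguments that compute_metrics now passes.

    A real metric may use the dataset or the configs for extra metadata; this
    stand-in ignores them and just computes exact-match accuracy.
    """
    if not references:
        return None
    correct = sum(int(p == r) for p, r in zip(predictions, references))
    return correct / len(references)


print(toy_accuracy_metric(predictions=["a", "b"], references=["a", "c"]))  # 0.5
```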

@@ -193,7 +206,7 @@ def compute_metrics(

 def extract_labels_from_generation(
     input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-) ->
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.

     Args:
@@ -215,15 +228,15 @@ def extract_labels_from_generation(


 def prepare_train_examples(
-    examples: "BatchEncoding",
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare the features for training.

     Args:
         examples:
             The examples to prepare.
-
-            The
+        tokeniser:
+            The tokeniser to use to prepare the examples.

     Returns:
         The prepared examples.
@@ -233,37 +246,40 @@ def prepare_train_examples(
     # take a lots of space). So we remove that left whitespace
     examples["question"] = [q.lstrip() for q in examples["question"]]

-    # Extract special token metadata from the
-    special_token_metadata = get_special_token_metadata(
+    # Extract special token metadata from the tokeniser
+    special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
     has_cls_token = special_token_metadata["has_cls_token"]
     has_sep_token = special_token_metadata["has_sep_token"]
     cls_token_id = special_token_metadata["cls_token_id"]
     cls_token = special_token_metadata["cls_token"]
     sep_token = special_token_metadata["sep_token"]

-    # If the
+    # If the tokeniser is not adding special tokens, then we add them manually
     if not has_cls_token and not has_sep_token:
         examples["question"] = [
             f"{cls_token}{q}{sep_token}" for q in examples["question"]
         ]
         examples["context"] = [f"{c}{sep_token}" for c in examples["context"]]

-    # Set the stride used during
+    # Set the stride used during tokenisation, when the context is long enough to be
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
-    max_question_tokens = max(len(
+    max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
-    stride =
-
-
-
+    stride = tokeniser.model_max_length // 4
+    stride = min(
+        stride,
+        tokeniser.model_max_length - stride - max_question_tokens - num_special_tokens,
+    )
+    stride = max(stride, 0)
+    max_length = tokeniser.model_max_length - stride

-    #
+    # Tokenise our examples with truncation and padding, but keep the overflows using a
     # stride. This results in one example possible giving several features when a
     # context is long, each of those features having a context that overlaps a bit the
     # context of the previous feature.
-
+    tokenised_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
         truncation="only_second",
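
The stride logic introduced above is easier to see with concrete numbers. The helper below is hypothetical (EuroEval computes this inline from the tokeniser's `model_max_length`), but it performs the same arithmetic:

```python
def compute_stride_and_max_length(
    model_max_length: int, max_question_tokens: int, num_special_tokens: int
) -> tuple[int, int]:
    """Hypothetical helper mirroring the stride arithmetic in the diff above.

    The stride starts at a quarter of the model's context window, is capped so
    that the question, the special tokens and the stride still fit, is clamped
    at zero, and whatever remains of the window becomes the max_length used for
    tokenisation.
    """
    stride = model_max_length // 4
    stride = min(
        stride, model_max_length - stride - max_question_tokens - num_special_tokens
    )
    stride = max(stride, 0)
    max_length = max(model_max_length - stride, 0)
    return stride, max_length


# A 512-token window with a 40-token question and 2 special tokens:
# stride = min(128, 512 - 128 - 40 - 2) = 128 and max_length = 512 - 128 = 384.
print(compute_stride_and_max_length(512, 40, 2))  # (128, 384)
```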

@@ -277,32 +293,32 @@ def prepare_train_examples(
     # Since one example might give us several features if it has a long context, we
     # need a map from a feature to its corresponding example. This key gives us just
     # that
-    sample_mapping =
+    sample_mapping = tokenised_examples.pop("overflow_to_sample_mapping")

     # The offset mappings will give us a map from token to character position in the
     # original context. This will help us compute the start_positions and
     # end_positions.
-    offset_mapping =
+    offset_mapping = tokenised_examples.pop("offset_mapping")

     # Initialise the start- and end positions of the answers
-
-
+    tokenised_examples["start_positions"] = list()
+    tokenised_examples["end_positions"] = list()

     for i, offsets in enumerate(offset_mapping):
         # Get the input IDs for the current example
-        input_ids =
+        input_ids = tokenised_examples.input_ids[i]

         # We will label impossible answers with the index of the CLS token
         cls_index = input_ids.index(cls_token_id)

         # Grab the sequence corresponding to that example (to know what is the context
         # and what is the question).
-        sequence_ids =
+        sequence_ids = tokenised_examples.sequence_ids(i)

         # Manually ensure that the special tokens are set to None in `sequence_ids`
-        for special_token in
-            if hasattr(
-                special_token_id = getattr(
+        for special_token in tokeniser.special_tokens_map.keys():
+            if hasattr(tokeniser, f"{special_token}_id"):
+                special_token_id = getattr(tokeniser, f"{special_token}_id")
                 if special_token_id is not None:
                     sequence_ids = [
                         None if token_id == special_token_id else seq_id
@@ -316,8 +332,8 @@ def prepare_train_examples(

         # If no answers are given, set the cls_index as answer.
         if len(answers["answer_start"]) == 0:
-
-
+            tokenised_examples.start_positions.append(cls_index)
+            tokenised_examples.end_positions.append(cls_index)

         else:
             # Start/end character index of the answer in the text.
@@ -325,9 +341,17 @@ def prepare_train_examples(
             end_char = start_char + len(answers["text"][0])

             # Start token index of the current span in the text.
-
-
-            token_start_index
+            try:
+                token_start_index = 0
+                while sequence_ids[token_start_index] != 1:
+                    token_start_index += 1
+
+            # If it turns out that we cannot find the context in the span, then we
+            # treat this as an impossible case
+            except IndexError:
+                tokenised_examples.start_positions.append(cls_index)
+                tokenised_examples.end_positions.append(cls_index)
+                continue

             # End token index of the current span in the text.
             token_end_index = len(input_ids) - 1
@@ -340,8 +364,8 @@ def prepare_train_examples(
                 offsets[token_start_index][0] <= start_char
                 and offsets[token_end_index][1] >= end_char
             ):
-
-
+                tokenised_examples.start_positions.append(cls_index)
+                tokenised_examples.end_positions.append(cls_index)

             # Otherwise move the token_start_index and token_end_index to the two ends
             # of the answer. Note: we could go after the last offset if the answer is
@@ -353,71 +377,75 @@ def prepare_train_examples(
                 ):
                     token_start_index += 1
                 token_start_index -= 1
-
+                tokenised_examples.start_positions.append(token_start_index)
                 while (
                     token_start_index <= token_end_index
                     and offsets[token_end_index][1] >= end_char
                 ):
                     token_end_index -= 1
                 token_end_index += 1
-
+                tokenised_examples.end_positions.append(token_end_index)
                 assert token_end_index >= token_start_index

-    return
+    return tokenised_examples


 def prepare_test_examples(
-    examples: "BatchEncoding",
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare test examples.

     Args:
         examples:
             Dictionary of test examples.
-
-            The
+        tokeniser:
+            The tokeniser used to preprocess the examples.

     Returns:
         The prepared test examples.
     """
     # Some of the questions have lots of whitespace on the left, which is not useful
-    # and will make the truncation of the context fail (the
+    # and will make the truncation of the context fail (the tokenised question will
     # take a lots of space). So we remove that left whitespace
     examples["question"] = [q.lstrip() for q in examples["question"]]

-    # Extract special token metadata from the
-    special_token_metadata = get_special_token_metadata(
+    # Extract special token metadata from the tokeniser
+    special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
     has_cls_token = special_token_metadata["has_cls_token"]
     has_sep_token = special_token_metadata["has_sep_token"]
     cls_token = special_token_metadata["cls_token"]
     sep_token = special_token_metadata["sep_token"]

-    # If the
+    # If the tokeniser is not adding special tokens, then we add them manually
     if not has_cls_token and not has_sep_token:
         examples["question"] = [
             f"{cls_token}{q}{sep_token}" for q in examples["question"]
         ]
         examples["context"] = [f"{c}{sep_token}" for c in examples["context"]]

-    # Set the stride used during
+    # Set the stride used during tokenisation, when the context is long enough to be
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
-    max_question_tokens = max(len(
+    max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
-    stride =
-
-
-
+    stride = tokeniser.model_max_length // 4
+    stride = min(
+        stride,
+        tokeniser.model_max_length - stride - max_question_tokens - num_special_tokens,
+    )
+    stride = max(stride, 0)
+    max_length = tokeniser.model_max_length - stride
+    max_length = max(max_length, 0)

-    #
+    # Tokenise our examples with truncation and maybe padding, but keep the overflows
     # using a stride. This results in one example possible giving several features when
     # a context is long, each of those features having a context that overlaps a bit
     # the context of the previous feature.
-
+    tokenised_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
-        truncation=
+        truncation=TruncationStrategy.LONGEST_FIRST,
         max_length=max_length,
         stride=stride,
         return_overflowing_tokens=True,
@@ -428,30 +456,30 @@ def prepare_test_examples(
     # Since one example might give us several features if it has a long context, we
     # need a map from a feature to its corresponding example. This key gives us just
     # that.
-    sample_mapping =
+    sample_mapping = tokenised_examples.pop("overflow_to_sample_mapping")

     # We keep the id that gave us this feature and we will store the offset mappings.
-
+    tokenised_examples["id"] = list()

-    for i in range(len(
+    for i in range(len(tokenised_examples.input_ids)):
         # Grab the sequence corresponding to that example (to know what is the context
         # and what is the question).
-        sequence_ids =
+        sequence_ids = tokenised_examples.sequence_ids(i)
         context_index = 1

         # One example can give several spans, this is the index of the example
         # containing this span of text.
         sample_index = sample_mapping[i]
-
+        tokenised_examples.id.append(examples["id"][sample_index])

         # Set to (-1, -1) the offset_mapping that are not part of the context so it's
         # easy to determine if a token position is part of the context or not.
-
+        tokenised_examples.offset_mapping[i] = [
             (o if sequence_ids[k] == context_index else (-1, -1))
-            for k, o in enumerate(
+            for k, o in enumerate(tokenised_examples.offset_mapping[i])
         ]

-    return
+    return tokenised_examples


 def postprocess_predictions_and_labels(
@@ -459,7 +487,7 @@ def postprocess_predictions_and_labels(
     dataset: "Dataset",
     prepared_dataset: "Dataset",
     cls_token_index: int,
-) -> tuple[
+) -> tuple[c.Sequence[dict], c.Sequence[dict]]:
     """Postprocess the predictions and labels, to allow easier metric computation.

     Args:
@@ -540,7 +568,7 @@ def find_best_answer(
     all_start_logits: np.ndarray,
     all_end_logits: np.ndarray,
     prepared_dataset: "Dataset",
-    feature_indices:
+    feature_indices: c.Sequence[int],
     context: str,
     max_answer_length: int,
     num_best_logits: int,
@@ -573,7 +601,7 @@ def find_best_answer(
         The best answer for the example.
     """
     # Loop through all the features associated to the current example
-    valid_answers = list()
+    valid_answers: list[dict] = list()
     for feature_index in feature_indices:
         # Get the features associated with the current example
         features = prepared_dataset[feature_index]
@@ -614,12 +642,12 @@
 def find_valid_answers(
     start_logits: np.ndarray,
     end_logits: np.ndarray,
-    offset_mapping:
+    offset_mapping: c.Sequence[tuple[int, int]],
     context: str,
     max_answer_length: int,
     num_best_logits: int,
     min_null_score: float,
-) ->
+) -> c.Sequence[dict]:
     """Find the valid answers from the start and end indexes.

     Args:
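
Both `prepare_train_examples` and `prepare_test_examples` rely on Hugging Face's overflow handling to split long contexts into overlapping windows. The snippet below is a generic illustration of that pattern with a stock fast tokeniser; the checkpoint name and example texts are placeholders, not anything EuroEval prescribes:

```python
from transformers import AutoTokenizer

# Any fast tokenizer works here; EuroEval uses the tokeniser of the model under test.
tokeniser = AutoTokenizer.from_pretrained("distilbert-base-uncased")

question = "Where is the Eiffel Tower?"
context = "The Eiffel Tower is in Paris. " * 60  # long enough to overflow one window

encoded = tokeniser(
    text=[question],
    text_pair=[context],
    truncation="only_second",  # only ever truncate the context, never the question
    max_length=128,
    stride=32,  # overlap between consecutive context windows
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding="max_length",
)

# One question/context pair can yield several features; this mapping records which
# original example each feature came from, just as the functions above pop it.
print(len(encoded["input_ids"]), encoded["overflow_to_sample_mapping"])
```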