EuroEval 16.4.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registries. It is provided for informational purposes only.
- euroeval/__init__.py +6 -0
- euroeval/benchmark_config_factory.py +51 -46
- euroeval/benchmark_modules/base.py +6 -5
- euroeval/benchmark_modules/hf.py +2 -9
- euroeval/benchmark_modules/litellm.py +14 -12
- euroeval/benchmark_modules/vllm.py +17 -10
- euroeval/benchmarker.py +61 -44
- euroeval/caching_utils.py +1 -1
- euroeval/cli.py +86 -8
- euroeval/constants.py +3 -0
- euroeval/data_loading.py +78 -30
- euroeval/data_models.py +326 -326
- euroeval/dataset_configs/__init__.py +10 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +25 -29
- euroeval/dataset_configs/danish.py +51 -88
- euroeval/dataset_configs/dutch.py +48 -86
- euroeval/dataset_configs/english.py +45 -76
- euroeval/dataset_configs/estonian.py +36 -38
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -68
- euroeval/dataset_configs/french.py +39 -74
- euroeval/dataset_configs/german.py +45 -81
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -78
- euroeval/dataset_configs/latvian.py +28 -34
- euroeval/dataset_configs/lithuanian.py +22 -26
- euroeval/dataset_configs/norwegian.py +72 -114
- euroeval/dataset_configs/polish.py +33 -60
- euroeval/dataset_configs/portuguese.py +33 -65
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +19 -24
- euroeval/dataset_configs/spanish.py +42 -76
- euroeval/dataset_configs/swedish.py +48 -84
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +3 -2
- euroeval/generation.py +5 -4
- euroeval/generation_utils.py +6 -5
- euroeval/languages.py +395 -323
- euroeval/metrics/huggingface.py +14 -3
- euroeval/metrics/llm_as_a_judge.py +1 -1
- euroeval/model_cache.py +6 -5
- euroeval/model_loading.py +1 -1
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +82 -43
- euroeval/prompt_templates/multiple_choice.py +81 -41
- euroeval/prompt_templates/named_entity_recognition.py +125 -44
- euroeval/prompt_templates/reading_comprehension.py +92 -43
- euroeval/prompt_templates/sentiment_classification.py +91 -43
- euroeval/prompt_templates/summarization.py +64 -39
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +4 -3
- euroeval/speed_benchmark.py +2 -1
- euroeval/task_group_utils/multiple_choice_classification.py +2 -1
- euroeval/task_group_utils/question_answering.py +24 -13
- euroeval/task_group_utils/sequence_classification.py +5 -4
- euroeval/task_group_utils/text_to_text.py +2 -1
- euroeval/task_group_utils/token_classification.py +11 -8
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +19 -10
- euroeval/types.py +10 -9
- euroeval/utils.py +6 -3
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.4.0.dist-info/RECORD +0 -75
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/task_group_utils/question_answering.py
CHANGED

@@ -37,7 +37,7 @@ class QuestionAnsweringTrainer(Trainer):
         train_dataset: "Dataset",
         eval_dataset: "Dataset",
         compute_metrics: "c.Callable[[EvalPrediction], dict[str, float]]",
-        callbacks: "
+        callbacks: "c.Sequence[TrainerCallback]",
         data_collator: "c.Callable",
         **kwargs,
     ) -> None:

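The edit above is the pattern repeated throughout this release: concrete list[...] annotations are widened to collections.abc.Sequence (imported as c). A minimal sketch, using an illustrative helper that is not part of EuroEval, of what the wider parameter type buys:

import collections.abc as c


def first_label(labels: c.Sequence[str]) -> str:
    # Sequence covers any ordered, indexable container, so both lists and
    # tuples type-check, where list[str] would reject the tuple.
    return labels[0]


print(first_label(["positive", "negative"]))  # list: OK
print(first_label(("positive", "negative")))  # tuple: also OK
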
@@ -67,7 +67,7 @@ class QuestionAnsweringTrainer(Trainer):
         self,
         eval_dataset: "Dataset | None" = None,
         orig_eval_dataset: "Dataset | None" = None,
-        ignore_keys:
+        ignore_keys: c.Sequence[str] | None = None,
         metric_key_prefix: str = "eval",
     ) -> dict[str, float]:
         """Evaluate the model on the given dataset.

@@ -203,7 +203,7 @@ def compute_metrics(

 def extract_labels_from_generation(
     input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-) ->
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.

     Args:

@@ -265,8 +265,11 @@ def prepare_train_examples(
     max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
     stride = tokeniser.model_max_length // 4
-
-
+    stride = min(
+        stride,
+        tokeniser.model_max_length - stride - max_question_tokens - num_special_tokens,
+    )
+    stride = max(stride, 0)
     max_length = tokeniser.model_max_length - stride

     # Tokenise our examples with truncation and padding, but keep the overflows using a

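The new clamp guards against questions so long that the default quarter-window stride leaves no room for the context. A worked example with illustrative numbers (not taken from the package):

# Assume a 512-token model window, a 400-token question and 2 special tokens.
model_max_length = 512
max_question_tokens = 400
num_special_tokens = 2

stride = model_max_length // 4  # 128
# Clamp the stride so that stride + question + special tokens still fit;
# the min() can go negative for very long questions, hence the max() below.
stride = min(
    stride,
    model_max_length - stride - max_question_tokens - num_special_tokens,  # -18
)
stride = max(stride, 0)
print(stride)  # 0: overlapping windows are disabled rather than overflowing
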
@@ -335,9 +338,17 @@ def prepare_train_examples(
             end_char = start_char + len(answers["text"][0])

             # Start token index of the current span in the text.
-
-
-            token_start_index
+            try:
+                token_start_index = 0
+                while sequence_ids[token_start_index] != 1:
+                    token_start_index += 1
+
+            # If it turns out that we cannot find the context in the span, then we
+            # treat this as an impossible case
+            except IndexError:
+                tokenised_examples.start_positions.append(cls_index)
+                tokenised_examples.end_positions.append(cls_index)
+                continue

             # End token index of the current span in the text.
             token_end_index = len(input_ids) - 1

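The sequence_ids lookup that the new try/except wraps distinguishes question tokens from context tokens. A small sketch of the structure, assuming a fast Hugging Face tokeniser; the exact token split will vary by model:

from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("bert-base-cased")
enc = tokeniser("Who wrote it?", "The book was written by Jane.")
print(enc.sequence_ids())
# Something like [None, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, None]:
# None marks special tokens, 0 the question, 1 the context. If truncation
# removes every context token, the while-loop above runs off the end of the
# list, and the new IndexError handler records an impossible answer instead
# of crashing.
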
@@ -469,7 +480,7 @@ def postprocess_predictions_and_labels(
     dataset: "Dataset",
     prepared_dataset: "Dataset",
     cls_token_index: int,
-) -> tuple[
+) -> tuple[c.Sequence[dict], c.Sequence[dict]]:
     """Postprocess the predictions and labels, to allow easier metric computation.

     Args:

@@ -550,7 +561,7 @@ def find_best_answer(
     all_start_logits: np.ndarray,
     all_end_logits: np.ndarray,
     prepared_dataset: "Dataset",
-    feature_indices:
+    feature_indices: c.Sequence[int],
    context: str,
     max_answer_length: int,
     num_best_logits: int,

@@ -583,7 +594,7 @@ def find_best_answer(
         The best answer for the example.
     """
     # Loop through all the features associated to the current example
-    valid_answers = list()
+    valid_answers: list[dict] = list()
     for feature_index in feature_indices:
         # Get the features associated with the current example
         features = prepared_dataset[feature_index]

@@ -624,12 +635,12 @@ def find_valid_answers(
 def find_valid_answers(
     start_logits: np.ndarray,
     end_logits: np.ndarray,
-    offset_mapping:
+    offset_mapping: c.Sequence[tuple[int, int]],
     context: str,
     max_answer_length: int,
     num_best_logits: int,
     min_null_score: float,
-) ->
+) -> c.Sequence[dict]:
     """Find the valid answers from the start and end indexes.

     Args:

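The c.Sequence[tuple[int, int]] annotation on offset_mapping describes the per-token character spans that fast Hugging Face tokenisers can return. A quick illustration (the exact spans depend on the tokeniser):

from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("bert-base-cased")
enc = tokeniser("Jane wrote it.", return_offsets_mapping=True)
print(enc["offset_mapping"])
# Something like [(0, 0), (0, 4), (5, 10), (11, 13), (13, 14), (0, 0)]:
# each tuple is a (start, end) character span in the original text, and
# special tokens map to (0, 0). find_valid_answers uses these spans to turn
# start/end token indices back into answer substrings.
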
euroeval/task_group_utils/sequence_classification.py
CHANGED

@@ -1,5 +1,6 @@
 """Utility functions related to the sequence-classification task group."""

+import collections.abc as c
 import logging
 import re
 import typing as t

@@ -110,7 +111,7 @@ def extract_labels_from_generation(
     dataset_config: "DatasetConfig",
     model_config: "ModelConfig",
     first_label_token_mapping: dict[str, str] | bool,
-) ->
+) -> c.Sequence[str]:
     """Extract the predicted labels from the generated output.

     Args:

@@ -243,10 +244,10 @@ def extract_labels_from_generation(


 def get_closest_logprobs_labels(
-    generation_logprobs:
+    generation_logprobs: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]],
     first_label_token_mapping: dict[str, str] | t.Literal[True],
-    candidate_labels:
-) ->
+    candidate_labels: c.Sequence[c.Sequence[str]],
+) -> c.Sequence[str] | None:
     """Get the labels with the highest predicted logprob value.

     In case a candidate label is split into multiple tokens, we only use the first

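The triply nested annotation on generation_logprobs reads more easily with a toy value in front of you; the shape is documents -> generated token positions -> top-k (token, logprob) pairs:

# Illustrative structure only; real values come from the model backend.
generation_logprobs = [
    [  # first document
        [("pos", -0.1), ("neg", -2.3)],  # candidates for token position 0
    ],
]
token, logprob = generation_logprobs[0][0][0]
print(token, logprob)  # pos -0.1
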
euroeval/task_group_utils/text_to_text.py
CHANGED

@@ -1,5 +1,6 @@
 """Utility functions related to the text-to-text task group."""

+import collections.abc as c
 import logging
 import typing as t

@@ -131,7 +132,7 @@ def compute_metrics(

 def extract_labels_from_generation(
     input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-) ->
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.

     Args:

euroeval/task_group_utils/token_classification.py
CHANGED

@@ -1,5 +1,6 @@
 """Utility functions related to the token-classification task group."""

+import collections.abc as c
 import logging
 import typing as t
 from copy import deepcopy

@@ -59,7 +60,9 @@ def compute_metrics(

     predictions: list[list[str]]
     if not isinstance(model_outputs[0][0], str):
-        raw_predictions:
+        raw_predictions: c.Sequence[c.Sequence[int]] = np.argmax(
+            model_outputs, axis=-1
+        ).tolist()

         # Remove ignored index (special tokens)
         predictions = [

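The new annotation spells out what the argmax produces: per-token label IDs from a (batch, sequence_length, num_labels) logit array. A toy check:

import numpy as np

# One document, two tokens, two candidate labels.
model_outputs = np.array([[[0.1, 0.9], [0.8, 0.2]]])
raw_predictions = np.argmax(model_outputs, axis=-1).tolist()
print(raw_predictions)  # [[1, 0]]: label 1 for the first token, label 0 for the second
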
@@ -189,7 +192,7 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
-) ->
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.

     Args:

@@ -284,8 +287,8 @@ def tokenize_and_align_labels(
     # tokeniser is of a "fast" variant then this can be accessed through the
     # `word_ids` method. Otherwise, we have to extract it manually.
     all_labels: list[list[int]] = list()
-    labels:
-    word_ids:
+    labels: c.Sequence[str]
+    word_ids: c.Sequence[int | None]
     for i, labels in enumerate(examples["labels"]):
         # Try to get the word IDs from the tokeniser
         try:

@@ -295,10 +298,10 @@ def tokenize_and_align_labels(
         # IDs manually
         except ValueError:
             # Get the list of words in the document
-            words:
+            words: c.Sequence[str] = examples["tokens"][i]

             # Get the list of token IDs in the document
-            tok_ids:
+            tok_ids: c.Sequence[int] = tokenized_inputs.input_ids[i]

             # Decode the token IDs
             tokens = tokeniser.convert_ids_to_tokens(tok_ids)

@@ -391,8 +394,8 @@ def tokenize_and_align_labels(


 def handle_unk_tokens(
-    tokeniser: "PreTrainedTokenizer", tokens: list[str], words:
-) ->
+    tokeniser: "PreTrainedTokenizer", tokens: list[str], words: c.Sequence[str]
+) -> c.Sequence[str]:
     """Replace unknown tokens in the tokens with the corresponding word.

     Args:

euroeval/tasks.py
CHANGED
@@ -5,12 +5,14 @@ from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import Task
 from .enums import GenerativeType, ModelType, TaskGroup
 from .prompt_templates import (
+    CLASSIFICATION_TEMPLATES,
     LA_TEMPLATES,
     MULTIPLE_CHOICE_TEMPLATES,
     NER_TEMPLATES,
     RC_TEMPLATES,
     SENT_TEMPLATES,
     SUMM_TEMPLATES,
+    TOKEN_CLASSIFICATION_TEMPLATES,
 )

@@ -20,7 +22,11 @@ def get_all_tasks() -> dict[str, Task]:
     Returns:
         A mapping between names of dataset tasks and their configurations.
     """
-    return {
+    return {
+        cfg.name: cfg
+        for cfg in globals().values()
+        if isinstance(cfg, Task) and cfg != SPEED
+    }


 LA = Task(

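The rewritten get_all_tasks scans the module's own globals rather than maintaining a hand-written mapping, so any Task assigned at module level registers itself. A self-contained sketch of the pattern (toy module, not EuroEval):

import dataclasses


@dataclasses.dataclass(frozen=True)
class Task:
    name: str


SPEED = Task(name="speed")
SENT = Task(name="sentiment-classification")


def get_all_tasks() -> dict[str, Task]:
    # Pick up every module-level Task, excluding the special SPEED task.
    return {
        cfg.name: cfg
        for cfg in globals().values()
        if isinstance(cfg, Task) and cfg != SPEED
    }


print(get_all_tasks())  # {'sentiment-classification': Task(name='sentiment-classification')}

This is why the three new tasks added below need no further wiring.
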
@@ -159,3 +165,40 @@ SPEED = Task(
     default_max_generated_tokens=5,
     default_labels=[],
 )
+
+
+# Used for custom datasets
+
+TEXT_CLASSIFICATION = Task(
+    name="classification",
+    task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
+    template_dict=CLASSIFICATION_TEMPLATES,
+    metrics=[m.mcc_metric, m.macro_f1_metric],
+    default_num_few_shot_examples=12,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=None,
+    uses_logprobs=True,
+)
+
+TOKEN_CLASSIFICATION = Task(
+    name="token-classification",
+    task_group=TaskGroup.TOKEN_CLASSIFICATION,
+    template_dict=TOKEN_CLASSIFICATION_TEMPLATES,
+    metrics=[m.micro_f1_metric],
+    default_num_few_shot_examples=8,
+    default_max_generated_tokens=128,
+    default_labels=None,
+    uses_structured_output=True,
+)
+
+MULTIPLE_CHOICE = Task(
+    name="multiple-choice",
+    task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
+    metrics=[m.mcc_metric, m.accuracy_metric],
+    default_num_few_shot_examples=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=None,
+    default_allowed_model_types=[ModelType.GENERATIVE],
+    uses_logprobs=True,
+)

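Because of the globals() scan above, the three new custom-dataset tasks should be discoverable without any registration step. A quick check, assuming EuroEval 16.5.0 is installed:

from euroeval.tasks import get_all_tasks

print(sorted(get_all_tasks()))
# Expected to include "classification", "multiple-choice" and
# "token-classification" alongside the existing task names.
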
euroeval/tokenisation_utils.py
CHANGED
@@ -1,5 +1,6 @@
 """Utility functions related to tokenisation."""

+import collections.abc as c
 import logging
 import re
 import typing as t

@@ -71,7 +72,7 @@ def get_special_token_metadata(tokeniser: "PreTrainedTokenizerBase") -> dict:


 def should_prompts_be_stripped(
-    labels_to_be_generated:
+    labels_to_be_generated: c.Sequence[str], tokeniser: "PreTrainedTokenizer"
 ) -> bool:
     """Determine if we should strip the prompts for few-shot evaluation.

@@ -110,7 +111,7 @@ def should_prompts_be_stripped(


 def should_prefix_space_be_added_to_labels(
-    labels_to_be_generated:
+    labels_to_be_generated: c.Sequence[str], tokeniser: "PreTrainedTokenizer"
 ) -> bool:
     """Determine if we should add a prefix space to the labels.

@@ -317,7 +318,7 @@ def get_pad_token(

 def get_end_of_chat_token_ids(
     tokeniser: "PreTrainedTokenizer", generative_type: GenerativeType | None
-) ->
+) -> c.Sequence[int] | None:
     """Get the end token ID for chat models.

     This is only relevant for tokenisers with a chat template.

@@ -433,13 +434,19 @@ def get_first_label_token_mapping(

     # Tokenise some text containing each label, which we will use to extract the
     # first token of each label
-    all_tokens:
+    all_tokens: c.Sequence[c.Sequence[str]]
     if not has_chat_template(tokeniser=tokeniser):
         add_prefix_space = should_prefix_space_be_added_to_labels(
             labels_to_be_generated=local_labels, tokeniser=tokeniser
         )
         all_tokens = [
-
+            [
+                tokeniser.decode(token_id)
+                for token_id in tokeniser.encode(
+                    text=f" {label}" if add_prefix_space else label,
+                    add_special_tokens=False,
+                )
+            ]
             for label in local_labels
         ]
     else:

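The inlined list comprehension decodes each label back into token strings one ID at a time. A sketch of the round trip, assuming a Hugging Face tokeniser (GPT-2 here; the label "positive" is illustrative):

from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("gpt2")
label, add_prefix_space = "positive", True
tokens = [
    tokeniser.decode(token_id)
    for token_id in tokeniser.encode(
        f" {label}" if add_prefix_space else label, add_special_tokens=False
    )
]
print(tokens)  # e.g. [' positive']; the first entry feeds the label-token mapping
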
@@ -466,7 +473,7 @@ def get_first_label_token_mapping(
         all_tokens = [
             [
                 re.sub(
-                    pattern=r"^[^a-zæøåüöä0-9]+|[^a-zæøåüöä0-9]+$",
+                    pattern=r"^[^a-zæøåüöä0-9 ]+|[^a-zæøåüöä0-9 ]+$",
                     repl="",
                     string=token.lower(),
                 )

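The only change to the pattern is a space inside both character classes, so leading or trailing spaces in a decoded token now survive the strip. A before/after check:

import re

token = " positive"
old = re.sub(r"^[^a-zæøåüöä0-9]+|[^a-zæøåüöä0-9]+$", "", token.lower())
new = re.sub(r"^[^a-zæøåüöä0-9 ]+|[^a-zæøåüöä0-9 ]+$", "", token.lower())
print(repr(old))  # 'positive': the leading space was stripped
print(repr(new))  # ' positive': the leading space is kept
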
@@ -478,11 +485,13 @@ def get_first_label_token_mapping(
     # Extract the first token of each label
     first_tokens: list[str] = list()
     for token_list, label in zip(all_tokens, local_labels):
-        matching_tokens = [
+        matching_tokens = [
+            tok for tok in token_list if tok and label.startswith(tok.strip())
+        ]
         if not matching_tokens:
             if log_metadata:
                 log_once(
-                    f"No matching token found in token_list for label
+                    f"No matching token found in token_list for label {label!r}, so "
                     "we will not use logprobs with the model.",
                     level=logging.DEBUG,
                 )

@@ -549,12 +558,12 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:


 def apply_chat_template(
-    conversation:
+    conversation: c.Sequence[dict[str, str]],
     tokeniser: "PreTrainedTokenizer",
     tokenise: bool,
     add_generation_prompt: bool,
     **extra_kwargs,
-) -> str |
+) -> str | c.Sequence[int]:
     """Apply the chat template to a prompt.

     Args:

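The new str | c.Sequence[int] return type mirrors the underlying Hugging Face method, which returns a string when tokenize=False and token IDs when tokenize=True. A usage sketch of that underlying method (the model name is only an example of a tokeniser that ships a chat template):

from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
conversation = [{"role": "user", "content": "What is the capital of Denmark?"}]
text = tokeniser.apply_chat_template(
    conversation, tokenize=False, add_generation_prompt=True
)
print(type(text))  # <class 'str'>; with tokenize=True it would be a list of IDs
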
euroeval/types.py
CHANGED
@@ -1,5 +1,6 @@
 """Types used throughout the project."""

+import collections.abc as c
 import typing as t

 from transformers.trainer_utils import EvalPrediction

@@ -10,9 +11,9 @@ if t.TYPE_CHECKING:

     from .data_models import BenchmarkConfig, GenerativeModelOutput

-ScoreDict: t.TypeAlias = dict[str, dict[str, float] |
-Predictions: t.TypeAlias = "NDArray |
-Labels: t.TypeAlias = "NDArray |
+ScoreDict: t.TypeAlias = dict[str, dict[str, float] | c.Sequence[dict[str, float]]]
+Predictions: t.TypeAlias = "NDArray | c.Sequence[str] | c.Sequence[c.Sequence[str]]"
+Labels: t.TypeAlias = "NDArray | c.Sequence[str] | c.Sequence[c.Sequence[str]]"


 class ComputeMetricsFunction(t.Protocol):

@@ -22,8 +23,8 @@ class ComputeMetricsFunction(t.Protocol):
         self,
         model_outputs_and_labels: EvalPrediction
         | tuple[
-            "NDArray |
-            "NDArray |
+            "NDArray | c.Sequence[str] | c.Sequence[c.Sequence[str]]",
+            "NDArray | c.Sequence[str] | c.Sequence[c.Sequence[str]]",
         ],
         dataset: "Dataset",
         benchmark_config: "BenchmarkConfig",

@@ -48,7 +49,7 @@ class ExtractLabelsFunction(t.Protocol):

     def __call__(
         self, input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-    ) ->
+    ) -> c.Sequence[str]:
        """Extract the labels from the generated output.

         Args:

@@ -63,7 +64,7 @@ class ExtractLabelsFunction(t.Protocol):
         ...


-def is_list_of_int(x: object) -> t.TypeGuard[list[int]]:
+def is_list_of_int(x: object) -> t.TypeGuard[c.Sequence[int]]:
     """Check if an object is a list of integers.

     Args:

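TypeGuard means a type checker narrows the argument after a successful call. A minimal self-contained sketch of the narrowing, mirroring the signature above:

import collections.abc as c
import typing as t


def is_list_of_int(x: object) -> t.TypeGuard[c.Sequence[int]]:
    return isinstance(x, list) and all(isinstance(i, int) for i in x)


def total(x: object) -> int:
    if is_list_of_int(x):
        return sum(x)  # x is narrowed to c.Sequence[int] here
    raise TypeError("expected a list of integers")


print(total([1, 2, 3]))  # 6
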
@@ -76,7 +77,7 @@ def is_list_of_int(x: object) -> t.TypeGuard[list[int]]:
     return isinstance(x, list) and all(isinstance(i, int) for i in x)


-def is_list_of_list_of_int(x: object) -> t.TypeGuard[list[list[int]]]:
+def is_list_of_list_of_int(x: object) -> t.TypeGuard[c.Sequence[c.Sequence[int]]]:
     """Check if an object is a list of list of integers.

     Args:

@@ -93,7 +94,7 @@ def is_list_of_list_of_int(x: object) -> t.TypeGuard[list[list[int]]]:
     )


-def is_list_of_str(x: object) -> t.TypeGuard[
+def is_list_of_str(x: object) -> t.TypeGuard[c.Sequence[str]]:
     """Check if an object is a list of integers.

     Args:

euroeval/utils.py
CHANGED
@@ -1,6 +1,7 @@
 """Utility functions to be used in other scripts."""

 import asyncio
+import collections.abc as c
 import gc
 import importlib
 import importlib.metadata

@@ -142,7 +143,9 @@ def enforce_reproducibility(seed: int = 4242) -> np.random.Generator:
     return rng


-def get_class_by_name(
+def get_class_by_name(
+    class_name: str | c.Sequence[str], module_name: str
+) -> t.Type | None:
     """Get a class by its name.

     Args:

@@ -421,8 +424,8 @@ def get_hf_token(api_key: str | None) -> str | bool:


 def extract_multiple_choice_labels(
-    prompt: str, candidate_labels:
-) ->
+    prompt: str, candidate_labels: c.Sequence[str]
+) -> c.Sequence[str]:
     """Extract multiple choice labels from a prompt.

     Args: