EuroEval 15.7.0-py3-none-any.whl → 15.7.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval has been flagged as possibly problematic by the registry diff tooling.
- euroeval/benchmark_config_factory.py +1 -1
- euroeval/benchmark_modules/litellm.py +27 -258
- euroeval/benchmark_modules/vllm.py +14 -304
- euroeval/benchmarker.py +14 -11
- euroeval/data_models.py +3 -1
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/dutch.py +5 -16
- euroeval/dataset_configs/finnish.py +11 -9
- euroeval/generation_utils.py +346 -0
- euroeval/languages.py +1 -1
- euroeval/scores.py +7 -1
- euroeval/task_group_utils/sequence_classification.py +46 -11
- euroeval/tokenization_utils.py +50 -14
- {euroeval-15.7.0.dist-info → euroeval-15.7.2.dist-info}/METADATA +1 -1
- {euroeval-15.7.0.dist-info → euroeval-15.7.2.dist-info}/RECORD +18 -17
- {euroeval-15.7.0.dist-info → euroeval-15.7.2.dist-info}/WHEEL +0 -0
- {euroeval-15.7.0.dist-info → euroeval-15.7.2.dist-info}/entry_points.txt +0 -0
- {euroeval-15.7.0.dist-info → euroeval-15.7.2.dist-info}/licenses/LICENSE +0 -0
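The per-file diffs below can be reproduced locally by unpacking both wheels (which are plain zip archives) and diffing their members with the standard library alone. The sketch below assumes both wheel files have already been downloaded; the local file names are illustrative.

import difflib
import zipfile


def wheel_text_files(wheel_path: str) -> dict[str, list[str]]:
    """Map each text member of a wheel to its lines."""
    contents: dict[str, list[str]] = {}
    with zipfile.ZipFile(wheel_path) as wheel:
        for name in wheel.namelist():
            try:
                contents[name] = wheel.read(name).decode("utf-8").splitlines(keepends=True)
            except UnicodeDecodeError:
                continue  # skip any non-text members
    return contents


# Illustrative local paths; download the two wheels from the registry first.
old = wheel_text_files("euroeval-15.7.0-py3-none-any.whl")
new = wheel_text_files("euroeval-15.7.2-py3-none-any.whl")

for name in sorted(set(old) | set(new)):
    for line in difflib.unified_diff(
        old.get(name, []),
        new.get(name, []),
        fromfile=f"15.7.0/{name}",
        tofile=f"15.7.2/{name}",
    ):
        print(line, end="")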
euroeval/generation_utils.py ADDED
@@ -0,0 +1,346 @@
+"""Utility functions related to generative models."""
+
+import itertools as it
+import json
+import logging
+import random
+import typing as t
+
+from .enums import TaskGroup
+from .exceptions import InvalidBenchmark
+from .utils import log_once
+
+if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+    from transformers.tokenization_utils import PreTrainedTokenizer
+
+    from .data_models import DatasetConfig, ModelConfig
+
+logger = logging.getLogger("euroeval")
+
+
+def extract_few_shot_examples(
+    dataset: "DatasetDict", dataset_config: "DatasetConfig", itr_idx: int
+) -> list[dict[str, t.Any]]:
+    """Extract few-shot examples from a dataset.
+
+    This will always extract the examples from the training split.
+
+    We ensure that the few-shot examples are unique by picking them one at a time.
+
+    Args:
+        dataset:
+            The dataset to extract the few-shot examples from.
+        dataset_config:
+            The dataset configuration.
+        itr_idx:
+            The index of the dataset in the iterator.
+
+    Returns:
+        The few-shot examples.
+    """
+    random_seed = 4242 + itr_idx
+    num_few_shots = dataset_config.num_few_shot_examples
+    few_shot_examples: list[dict[str, t.Any]] = list()
+    shuffled_train = dataset["train"].shuffle(seed=random_seed)
+
+    match dataset_config.task.task_group:
+        case (
+            TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+        ):
+            # Locate the maximum number of tokens that constitutes a short example
+            for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
+                train_with_short_examples = dataset["train"].filter(
+                    lambda example: len(example["text"]) < max_num_tokens
+                )
+                num_short_examples = len(train_with_short_examples)
+                if num_short_examples >= dataset_config.num_few_shot_examples:
+                    break
+            else:
+                raise InvalidBenchmark(
+                    "Could not find enough short examples for few-shot learning."
+                )
+
+            shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
+            labels = it.cycle(dataset_config.labels)
+            while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                label = next(labels)
+                possible_examples = shuffled_train.filter(
+                    lambda x: x["label"].lower() == label.lower()
+                )
+                if len(possible_examples) == 0:
+                    continue
+                example = possible_examples.select(range(1))[0]
+                few_shot_examples.append(example)
+                shuffled_train = shuffled_train.filter(
+                    lambda x: x["text"] != example["text"]
+                )
+
+        case TaskGroup.TEXT_TO_TEXT:
+            while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                example = shuffled_train.select(range(1))[0]
+                few_shot_examples.append(example)
+                shuffled_train = shuffled_train.filter(
+                    lambda x: x["text"] != example["text"]
+                )
+
+        case TaskGroup.TOKEN_CLASSIFICATION:
+            labels = it.cycle(
+                [
+                    label.lower()
+                    for label in dataset_config.labels
+                    if label.lower().startswith("b-")
+                ]
+            )
+            while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                label = next(labels)
+                possible_examples = shuffled_train.filter(
+                    lambda x: label in [tag.lower() for tag in x["labels"]]
+                )
+                if len(possible_examples) == 0:
+                    continue
+                example = possible_examples.select(range(1))[0]
+                few_shot_examples.append(example)
+                shuffled_train = shuffled_train.filter(
+                    lambda x: x["tokens"] != example["tokens"]
+                )
+
+        case TaskGroup.QUESTION_ANSWERING:
+            # Locate the maximum number of tokens that constitutes a short example
+            for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
+                train_with_short_examples = dataset["train"].filter(
+                    lambda example: len(example["context"]) < max_num_tokens
+                )
+                num_short_examples = len(train_with_short_examples)
+                if num_short_examples >= dataset_config.num_few_shot_examples:
+                    break
+            else:
+                raise InvalidBenchmark(
+                    "Could not find enough short examples for few-shot learning."
+                )
+
+            shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
+            while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                example = shuffled_train.select(range(1))[0]
+                few_shot_examples.append(example)
+                shuffled_train = shuffled_train.filter(
+                    lambda x: x["context"] != example["context"]
+                )
+
+        case _:
+            raise NotImplementedError(
+                f"Unsupported task group: {dataset_config.task.task_group}."
+            )
+
+    random.seed(random_seed)
+    random.shuffle(few_shot_examples)
+    return few_shot_examples
+
+
+def apply_prompt(
+    examples: dict[str, t.Any],
+    few_shot_examples: list[dict[str, t.Any]],
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    instruction_model: bool,
+    always_populate_text_field: bool,
+    tokenizer: "PreTrainedTokenizer | None",
+) -> dict[str, t.Any]:
+    """Apply prompt template to an example, potentially with few-shot examples.
+
+    Args:
+        examples:
+            The examples to apply the few-shot examples to.
+        few_shot_examples:
+            The few-shot examples to apply.
+        dataset_config:
+            The dataset configuration.
+        instruction_model:
+            Whether the model is instruction-tuned.
+        always_populate_text_field:
+            Whether to always populate the 'text' field in the examples, as opposed to
+            the 'messages' field.
+        tokenizer:
+            The tokenizer to use for the model. If None, the tokenizer is not used.
+
+    Returns:
+        The example with the few-shot examples applied.
+    """
+    # Sanity check
+    if instruction_model and always_populate_text_field and tokenizer is None:
+        raise ValueError(
+            "The `tokenizer` argument must be provided when the model is instruction "
+            "tuned and when we are not just returning the raw messages."
+        )
+
+    def create_prompt(**kwargs: str) -> tuple[str, str]:
+        """Create a prompt from the given keyword arguments.
+
+        Args:
+            kwargs:
+                The keyword arguments to use in the prompt.
+
+        Returns:
+            A pair (prompt, label), where "label" is an empty string if the model is
+            not instruction tuned (as in this case it is included in the prompt).
+        """
+        label_key = "label" if "label" in kwargs else "target_text"
+        label = kwargs.pop(label_key)
+        assert label is not None, (
+            f"Found a None label for the prompt: {kwargs}. This should not happen."
+        )
+        label_mapping = dataset_config.prompt_label_mapping
+        label = label_mapping.get(label, label)
+        if instruction_model:
+            prompt = dataset_config.instruction_prompt.format(**kwargs)
+            return prompt, label
+        else:
+            kwargs[label_key] = label
+            return dataset_config.prompt_template.format(**kwargs), ""
+
+    match dataset_config.task.task_group:
+        case (
+            TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
+        ):
+            few_shot_sections = [
+                create_prompt(
+                    text=example["text"].replace("\n", " ").strip(),
+                    label=example["label"].replace("\n", " ").strip(),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(text=text.replace("\n", " ").strip(), label="")
+                for text in examples["text"]
+            ]
+
+        case TaskGroup.TEXT_TO_TEXT:
+            few_shot_sections = [
+                create_prompt(
+                    text=example["text"].replace("\n", " ").strip(),
+                    target_text=example["target_text"].replace("\n", " ").strip(),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(text=text.replace("\n", " ").strip(), target_text="")
+                for text in examples["text"]
+            ]
+
+        case TaskGroup.TOKEN_CLASSIFICATION:
+
+            def create_label(example: dict) -> str:
+                prompt_labels = dataset_config.prompt_label_mapping.values()
+                labels: dict[str, list[str]] = {
+                    prompt_label: list() for prompt_label in prompt_labels
+                }
+                for token, label in zip(example["tokens"], example["labels"]):
+                    label = label.lower()
+                    if label == "o":
+                        continue
+                    prompt_label = dataset_config.prompt_label_mapping[label]
+                    if label.startswith("b-"):
+                        labels[prompt_label].append(token)
+                    elif label.startswith("i-"):
+                        labels[prompt_label][-1] += " " + token
+                return json.dumps(labels, ensure_ascii=False)
+
+            few_shot_sections = [
+                create_prompt(
+                    text=" ".join(example["tokens"]).replace("\n", " ").strip(),
+                    label=create_label(example=example),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(
+                    text=" ".join(tokens).replace("\n", " ").strip(), label=""
+                )
+                for tokens in examples["tokens"]
+            ]
+
+        case TaskGroup.QUESTION_ANSWERING:
+            few_shot_sections = [
+                create_prompt(
+                    text=example["context"].replace("\n", " ").strip(),
+                    question=example["question"].replace("\n", " ").strip(),
+                    label=example["answers"]["text"][0].replace("\n", " "),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(
+                    text=context.replace("\n", " ").strip(),
+                    question=question.replace("\n", " ").strip(),
+                    label="",
+                )
+                for context, question in zip(examples["context"], examples["question"])
+            ]
+
+        case _:
+            raise NotImplementedError(
+                f"Unsupported task group: {dataset_config.task.task_group}."
+            )
+
+    if instruction_model:
+        few_shot_messages = [
+            dict(role=role, content=content)
+            for prompt, label in few_shot_sections
+            for role, content in [("user", prompt), ("assistant", label)]
+        ]
+
+        messages_list = [
+            few_shot_messages + [dict(role="user", content=prompt)]
+            for prompt, _ in new_sections
+        ]
+
+        if not always_populate_text_field:
+            examples["messages"] = messages_list
+
+        else:
+            assert tokenizer is not None
+
+            # Pick the chat template that matches the language of the dataset, if such a
+            # template exists
+            chat_template: str | None = None
+            if isinstance(tokenizer.chat_template, dict):
+                language_codes = [
+                    language.code for language in dataset_config.languages
+                ]
+                for name, candidate_template in tokenizer.chat_template.items():
+                    if name.lower() in language_codes:
+                        chat_template = candidate_template
+                        log_once(
+                            f"Using the {name!r} chat template for the tokenizer for "
+                            f"model {model_config.model_id!r}.",
+                            level=logging.DEBUG,
+                        )
+                        break
+
+            texts = [
+                tokenizer.apply_chat_template(
+                    conversation=messages,
+                    tokenize=False,
+                    add_generation_prompt=True,
+                    chat_template=chat_template,
+                )
+                for messages in messages_list
+            ]
+
+            examples["text"] = texts
+
+    else:
+        prompt_prefix = ""
+        if dataset_config.prompt_prefix:
+            prompt_prefix = dataset_config.prompt_prefix + "\n\n"
+
+        few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
+        if few_shot_prompt:
+            few_shot_prompt += "\n\n"
+
+        examples["text"] = [
+            prompt_prefix + few_shot_prompt + new_prompt
+            for new_prompt, _ in new_sections
+        ]
+
+    return examples
euroeval/languages.py CHANGED
@@ -21,6 +21,7 @@ def get_all_languages() -> dict[str, Language]:
 DA = Language(code="da", name="Danish", _and_separator="og", _or_separator="eller")
 NL = Language(code="nl", name="Dutch", _and_separator="en", _or_separator="of")
 EN = Language(code="en", name="English", _and_separator="and", _or_separator="or")
+FI = Language(code="fi", name="Finnish", _and_separator="ja", _or_separator="tai")
 FO = Language(code="fo", name="Faroese", _and_separator="og", _or_separator="ella")
 FR = Language(code="fr", name="French", _and_separator="et", _or_separator="ou")
 DE = Language(code="de", name="German", _and_separator="und", _or_separator="oder")
@@ -78,7 +79,6 @@ EO = Language(code="eo", name="Esperanto")
 ET = Language(code="et", name="Estonian")
 EE = Language(code="ee", name="Ewe")
 FJ = Language(code="fj", name="Fijian")
-FI = Language(code="fi", name="Finnish")
 FY = Language(code="fy", name="Western Frisian")
 FF = Language(code="ff", name="Fulah")
 GD = Language(code="gd", name="Gaelic")
euroeval/scores.py CHANGED
@@ -18,6 +18,7 @@ def log_scores(
     metric_configs: list["MetricConfig"],
     scores: list[dict[str, float]],
     model_id: str,
+    model_revision: str,
 ) -> "ScoreDict":
     """Log the scores.

@@ -30,13 +31,18 @@ def log_scores(
             The scores that are to be logged. This is a list of dictionaries full of
             scores.
         model_id:
-            The
+            The model ID of the model that was evaluated.
+        model_revision:
+            The revision of the model.

     Returns:
         A dictionary with keys 'raw_scores' and 'total', with 'raw_scores' being
         identical to `scores` and 'total' being a dictionary with the aggregated scores
         (means and standard errors).
     """
+    if model_revision and model_revision != "main":
+        model_id += f"@{model_revision}"
+
     logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")

     total_dict: dict[str, float] = dict()
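The new model_revision argument only affects how the evaluated model is reported: a non-default revision is appended to the model ID with an @ separator. A minimal standalone sketch of that rule follows; the helper name is invented for illustration.

def logged_model_id(model_id: str, model_revision: str) -> str:
    """Mirror the new revision handling at the top of `log_scores`."""
    if model_revision and model_revision != "main":
        return f"{model_id}@{model_revision}"
    return model_id


assert logged_model_id("mistralai/Mistral-7B-v0.1", "main") == "mistralai/Mistral-7B-v0.1"
assert logged_model_id("mistralai/Mistral-7B-v0.1", "step-1000") == (
    "mistralai/Mistral-7B-v0.1@step-1000"
)
assert logged_model_id("gpt2", "") == "gpt2"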
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -132,6 +132,11 @@ def extract_labels_from_generation(
         The predicted labels.
     """
     if model_output.scores is not None:
+        if first_label_token_mapping is False:
+            raise InvalidBenchmark(
+                "The model outputted logprobs, but the first label token mapping is "
+                "not provided. This means that the model should not output logprobs."
+            )
         labels = get_closest_logprobs_labels(
             generation_logprobs=model_output.scores,
             dataset_config=dataset_config,
@@ -147,7 +152,7 @@ def extract_labels_from_generation(
 def get_closest_logprobs_labels(
     generation_logprobs: list[list[list[tuple[str, float]]]],
     dataset_config: "DatasetConfig",
-    first_label_token_mapping: dict[str, str] |
+    first_label_token_mapping: dict[str, str] | t.Literal[True],
 ) -> list[str] | None:
     """Get the labels with the highest predicted logprob value.

@@ -164,8 +169,7 @@ def get_closest_logprobs_labels(
             The configuration of the dataset.
         first_label_token_mapping:
             A mapping from labels to the first token in each label, or alternatively a
-
-            mapping is outputted then the model will always output scores).
+            `True` value indicating that the model should output logprobs.

     Returns:
         The predicted labels, or None if labels could not be extracted.
@@ -195,7 +199,9 @@ def get_closest_logprobs_labels(
         # label, as the output label
         output_label: str | None = None
         for generated_label in generated_labels:
-            # Get the candidate labels
+            # Get the candidate labels. If we have a first label token mapping, we
+            # use it to get the candidate labels. Otherwise, we check if any of the
+            # labels start with the generated label.
             if isinstance(first_label_token_mapping, dict):
                 if any(
                     candidate_label not in first_label_token_mapping
@@ -239,14 +245,43 @@ def get_closest_logprobs_labels(
                     )
                     return None

-            # If no candidate label is found, we
-            #
-            #
+            # If no candidate label is found, we first check if any of the labels
+            # start with the generated label. This could be the case if the labels
+            # in the first token mapping is inaccurate or incomplete, for instance
+            # if 'pos' is in the first label token mapping, but the model outputted
+            # 'posit'. If this is the case then we cannot trust the first label
+            # token mapping, and we fall back to using word edit distance.
+            # Otherwise, the generated label is just bad, and we skip to the next
+            # generated label.
             elif len(candidate_output_labels) == 0:
-
-
-
-
+                candidate_output_labels_starting_with_generated_label = [
+                    candidate_label
+                    for candidate_label in candidate_labels
+                    if candidate_label.startswith(generated_label)
+                ]
+                if candidate_output_labels_starting_with_generated_label:
+                    log_once(
+                        f"No candidate label found for the generated label "
+                        f"{generated_label!r}. This means that using logprobs to "
+                        "extract the labels is not reliable, and we will instead "
+                        "fall back to extracting the labels using word edit "
+                        "distance.",
+                        level=logging.DEBUG,
+                    )
+                    return None
+
+        # If we did not find any candidate label for any of the generated labels, we
+        # assume that something is wrong with the model output, and we fall back to
+        # using word edit distance to extract the labels
+        else:
+            log_once(
+                f"No candidate label found for any of the generated labels "
+                f"{generated_labels}. This means that using logprobs to extract "
+                "the labels is not reliable, and we will instead fall back to "
+                "extracting the labels using word edit distance.",
+                level=logging.DEBUG,
+            )
+            return None

         if output_label is not None:
             output_labels.append(output_label)
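The new branch distinguishes two failure modes when a generated token has no exact candidate: if some candidate label merely starts with the generated token, the first-token mapping is treated as unreliable and the caller falls back to word-edit-distance extraction; otherwise the token is treated as noise and skipped. A standalone illustration of that decision follows; the function name and its return values are invented for this sketch.

def decide(
    generated_label: str, candidate_labels: list[str], exact_candidates: list[str]
) -> str:
    """Return "use", "fallback" or "skip", mirroring the new branch (names invented)."""
    if exact_candidates:
        return "use"  # some candidate matched the generated token via the mapping
    if any(
        candidate_label.startswith(generated_label)
        for candidate_label in candidate_labels
    ):
        return "fallback"  # mapping looks incomplete ("pos" vs "posit"): use edit distance
    return "skip"  # the generated token is just noise; move on to the next one


assert decide("posit", ["positive", "negative"], exact_candidates=[]) == "fallback"
assert decide("xyz", ["positive", "negative"], exact_candidates=[]) == "skip"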
euroeval/tokenization_utils.py CHANGED
@@ -311,24 +311,60 @@ def get_first_label_token_mapping(
         for label in dataset_config.labels
     ]

-    #
-
-
+    # Tokenize some text containing each label, which we will use to extract the
+    # first token of each label
+    all_tokens: list[list[str]]
+    if tokenizer.chat_template is None:
+        add_prefix_space = should_prefix_space_be_added_to_labels(
             labels_to_be_generated=local_labels, tokenizer=tokenizer
         )
-
-
-
-
-
-
-
-
-
-
-
+        all_tokens = [
+            tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)
+            for label in local_labels
+        ]
+    else:
+        all_tokens = [
+            tokenizer.convert_ids_to_tokens(
+                ids=tokenizer.apply_chat_template(
+                    conversation=[
+                        dict(role="user", content=""),
+                        dict(role="assistant", content=label),
+                    ],
+                    add_generation_prompt=True,
+                    tokenize=True,
+                )
+            )
+            for label in local_labels
+        ]
+
+    # Remove any non-alphabetic characters from the tokens
+    all_tokens = [
+        [
+            re.sub(
+                pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
+                repl="",
+                string=token.lower(),
+            )
+            for token in token_list
+        ]
+        for token_list in all_tokens
     ]

+    # Extract the first token of each label
+    first_tokens: list[str] = list()
+    for token_list, label in zip(all_tokens, local_labels):
+        matching_tokens = [
+            tok for tok in token_list if tok and label.startswith(tok)
+        ]
+        if not matching_tokens:
+            log_once(
+                f"No matching token found in token_list for label '{label}', so "
+                "we will not output scores.",
+                level=logging.DEBUG,
+            )
+            return False
+        first_tokens.append(matching_tokens[0])
+
     # Build a mapping from labels to the first token in each label if the first
     # tokens are distinct
     if len(first_tokens) == len(set(first_tokens)):
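The new code cleans each label's tokens and keeps the first token that is a prefix of the label, giving up (and disabling logprob-based scoring) when no token qualifies. The snippet below replays those two steps on hand-written, SentencePiece-style token lists instead of a real tokenizer, so the inputs are illustrative only.

import re

local_labels = ["positiv", "negativ"]
all_tokens = [["▁pos", "itiv"], ["▁neg", "ativ"]]  # made-up tokenizer output

# Strip non-alphabetic characters from each token (same regex as in the diff)
all_tokens = [
    [re.sub(r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", "", tok.lower()) for tok in token_list]
    for token_list in all_tokens
]

# Keep the first cleaned token that is a prefix of its label; upstream, a missing
# match makes get_first_label_token_mapping return False (no logprob scoring)
first_tokens: list[str] = []
for token_list, label in zip(all_tokens, local_labels):
    matching = [tok for tok in token_list if tok and label.startswith(tok)]
    assert matching, f"no usable first token for {label!r}"
    first_tokens.append(matching[0])

print(first_tokens)  # ['pos', 'neg']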
{euroeval-15.7.0.dist-info → euroeval-15.7.2.dist-info}/RECORD
@@ -1,38 +1,39 @@
 euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
-euroeval/benchmark_config_factory.py,sha256=
-euroeval/benchmarker.py,sha256=
+euroeval/benchmark_config_factory.py,sha256=RDYotoLcfNr3xU8Cw-G-Y8wLe6RSlJD1Ok9C97lWfOs,12553
+euroeval/benchmarker.py,sha256=4tCrs0CvKvQcMpJRtaonxELEDXkmY95stCGwht6wTGE,48649
 euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
 euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
 euroeval/constants.py,sha256=p6kp_R6-Tq5LBvyXyT6Sa6N3SkjEElGS2LSZRBoQaYs,1985
 euroeval/data_loading.py,sha256=L_REtxefte5Ke4xE_Cz01zkfCyKlOYhSqT5ZXXulHPc,3992
-euroeval/data_models.py,sha256=
+euroeval/data_models.py,sha256=t5FwpGxiSIMe7iKae-tT7usUWki-ILzAFFm7dPJoFsk,22973
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=LRd7HoudupRp5-AX3L0X4hIAWCa6JVx-LViHPg7u7dg,5821
 euroeval/finetuning.py,sha256=IieAhgvxjeLHAHBief1Ay-STcCosQmrDHFTRTXFZX0Q,10743
 euroeval/generation.py,sha256=LSsskfLjIJ-c3gQxmr7eiAobPOm-5bU9vnR7uHQ7XmU,10745
+euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
 euroeval/human_evaluation.py,sha256=VGvw1X6Mkdf22r-THSNWXMIqyJP44yh4rW53vq-0huo,27681
-euroeval/languages.py,sha256=
+euroeval/languages.py,sha256=LerXuRBAUYkQL6qSV-F82itAE4EgBGFBtzaGnJJZvOE,8555
 euroeval/model_cache.py,sha256=n39yFpZkudBCVwz1EQpZ-g5BQtlQemQ5nP3IiFKJZHg,8275
 euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
 euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
-euroeval/scores.py,sha256=
+euroeval/scores.py,sha256=TovjCZD8wmGrIjA4v5oAQp18P5KVcHvakkByDh0Hstk,3059
 euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
 euroeval/tasks.py,sha256=87gbe__K5KNIb1aBSuwGnMPmZgamJFecNNYmNgMxaVo,7069
-euroeval/tokenization_utils.py,sha256=
+euroeval/tokenization_utils.py,sha256=RYTYbzCM9cryZ_w-_CzyN9Sbt47DbaGU5ukm-H38sHI,13871
 euroeval/types.py,sha256=E0JhLfg-ek5pdFcYJbnGRUSodHxkuR3o8XGuIrBcuRM,2485
 euroeval/utils.py,sha256=DyWhtdFlAM1TZuiYXWNPN8KxNrZGNa-J3WfS6DGwkvM,10467
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
 euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
 euroeval/benchmark_modules/hf.py,sha256=yFApLL4_ia5Kw2iat5RSI8h5RhI4OP04HlzYidlhBCs,44012
-euroeval/benchmark_modules/litellm.py,sha256=
-euroeval/benchmark_modules/vllm.py,sha256=
-euroeval/dataset_configs/__init__.py,sha256=
+euroeval/benchmark_modules/litellm.py,sha256=_32H-M1L_TfW-opyaMLJFPxx0iOG8A8Zfq7uVGFKZdA,43005
+euroeval/benchmark_modules/vllm.py,sha256=DJyla0jr-DVMPPs4RBguxq1Xn5YguvyuAnIlgIOfFaw,39394
+euroeval/dataset_configs/__init__.py,sha256=kWKtlSAOY-olOQL3UtFqL6I3Tki3G3waMZSd2YChjCg,1895
 euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
-euroeval/dataset_configs/dutch.py,sha256=
+euroeval/dataset_configs/dutch.py,sha256=r21nxEvMmBkKqPXVW082batPsxJ9d0RB4DzngOTMJSk,3185
 euroeval/dataset_configs/english.py,sha256=yHw7D0zSNVbiSBAjR1mWX4V5FSkhqy4y-o-pnyWCLxE,2323
 euroeval/dataset_configs/faroese.py,sha256=QQgLe5gv0f3AtXe5rV65xZ98gFgyITQPDr3UwO4Bnv4,1350
-euroeval/dataset_configs/finnish.py,sha256=
+euroeval/dataset_configs/finnish.py,sha256=lZA2bY_ul9qh3uGFrTNe7q15WyZ04EL9OYmrkcNjygY,1857
 euroeval/dataset_configs/french.py,sha256=ATsj8_9_GxFTQgmfrniPQFZ1R9hoQCI1_ieWTnscFHU,2382
 euroeval/dataset_configs/german.py,sha256=QO6PrBQY6kyZeQMU1vg6KrC_sKyj9U2ukS9nbKO19is,2560
 euroeval/dataset_configs/icelandic.py,sha256=mncl7X4yO9gBmYqXMBfm7FKU1jcKryerSgd0dqlIA_4,4198
@@ -50,11 +51,11 @@ euroeval/prompt_templates/summarization.py,sha256=mcWeKNhGWmp7IG_iY64T-VOSabQg5w
 euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_group_utils/multiple_choice_classification.py,sha256=nB78TzOgd0HBvTclmjOYJid9ZVAgu8IHZsqB_n1SAZU,6178
 euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iYvfoGt0EUObSaXRCGmk,27700
-euroeval/task_group_utils/sequence_classification.py,sha256=
+euroeval/task_group_utils/sequence_classification.py,sha256=MCdO5h3v_LWTkrvKAeefPq7rl1H5mFed50nAL4uZq0E,13837
 euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
 euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
-euroeval-15.7.
-euroeval-15.7.
-euroeval-15.7.
-euroeval-15.7.
-euroeval-15.7.
+euroeval-15.7.2.dist-info/METADATA,sha256=nCF9GI8kOoKP3Up_KgPSxe4pnomawC1rQqRGlYoEsIA,13669
+euroeval-15.7.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.7.2.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.7.2.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.7.2.dist-info/RECORD,,
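The RECORD entries follow the standard wheel format: path,sha256=<hash>,<size>, where the hash is the URL-safe base64 encoding of the raw SHA-256 digest with the trailing "=" padding stripped. The sketch below recomputes the entry for the new generation_utils.py module from a locally downloaded 15.7.2 wheel; the local path is illustrative.

import base64
import hashlib
import zipfile


def record_hash(data: bytes) -> str:
    """Hash bytes the way wheel RECORD files do (urlsafe base64, no '=' padding)."""
    digest = hashlib.sha256(data).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")


with zipfile.ZipFile("euroeval-15.7.2-py3-none-any.whl") as wheel:  # illustrative path
    data = wheel.read("euroeval/generation_utils.py")

print(record_hash(data))  # expected, per the RECORD diff: sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4
print(len(data))          # should match the recorded size, 13310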
{euroeval-15.7.0.dist-info → euroeval-15.7.2.dist-info}/WHEEL: file without changes
{euroeval-15.7.0.dist-info → euroeval-15.7.2.dist-info}/entry_points.txt: file without changes
{euroeval-15.7.0.dist-info → euroeval-15.7.2.dist-info}/licenses/LICENSE: file without changes