EuroEval 16.0.0__py3-none-any.whl → 16.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +5 -0
- euroeval/benchmark_modules/vllm.py +41 -28
- euroeval/constants.py +6 -0
- euroeval/data_models.py +20 -16
- euroeval/dataset_configs/danish.py +0 -3
- euroeval/generation_utils.py +44 -6
- euroeval/metrics/pipeline.py +50 -8
- euroeval/model_cache.py +13 -1
- euroeval/task_group_utils/multiple_choice_classification.py +2 -2
- euroeval/task_group_utils/sequence_classification.py +66 -53
- euroeval/task_group_utils/token_classification.py +14 -0
- euroeval/tasks.py +9 -7
- euroeval/tokenization_utils.py +1 -2
- euroeval/utils.py +32 -1
- {euroeval-16.0.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +3 -1
- {euroeval-16.0.0.dist-info → euroeval-16.0.1.dist-info}/RECORD +19 -19
- {euroeval-16.0.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
- {euroeval-16.0.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -0
- {euroeval-16.0.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED
@@ -13,6 +13,7 @@ from termcolor import colored
 
 # Block specific warnings before importing anything else, as they can be noisy
 warnings.filterwarnings("ignore", category=UserWarning)
+warnings.filterwarnings("ignore", category=FutureWarning)
 logging.getLogger("httpx").setLevel(logging.CRITICAL)
 logging.getLogger("datasets").setLevel(logging.CRITICAL)
 logging.getLogger("vllm").setLevel(logging.CRITICAL)
@@ -101,6 +102,10 @@ os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
 os.environ["VLLM_USE_V1"] = "1"
 
 
+# Use the FlashInfer flash-attention backend for vLLM
+os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
+
+
 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
 # former and LiteLLM uses the latter
 if os.getenv("HUGGINGFACE_API_KEY"):
euroeval/benchmark_modules/vllm.py
CHANGED
@@ -337,31 +337,6 @@ class VLLMModel(HuggingFaceEncoderModel):
 if end_of_chat_token:
     stop_tokens.append(end_of_chat_token)
 
-structured_generation_schema = None
-if self.dataset_config.task.uses_structured_output:
-    if self.generative_type == GenerativeType.REASONING:
-        log_once(
-            f"The model {self.model_config.model_id!r} is a reasoning model "
-            "and thus does not support structured generation, so we do not "
-            "enable it.",
-            level=logging.DEBUG,
-        )
-    else:
-        ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-        keys_and_their_types: dict[str, t.Any] = {
-            tag_name: (conlist(str, max_length=5), ...)
-            for tag_name in ner_tag_names
-        }
-        answer_format_class = create_model(
-            "AnswerFormat", **keys_and_their_types
-        )
-        structured_generation_schema = answer_format_class.model_json_schema()
-        log_once(
-            "Using structured generation with the JSON schema "
-            f"{structured_generation_schema}",
-            level=logging.DEBUG,
-        )
-
 # Get the mapping from labels to the first token in the label. We call this each
 # time we generate a new dataset since the dataset config can change
 self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
@@ -382,8 +357,29 @@ class VLLMModel(HuggingFaceEncoderModel):
         "error was. Skipping this evaluation."
     )
 
-
-if
+structured_generation_schema = None
+if (
+    self.dataset_config.task.uses_structured_output
+    or (self.dataset_config.task.uses_logprobs and self.dataset_config.labels)
+) and self.generative_type == GenerativeType.REASONING:
+    guided_decoding = None
+    logger.debug(
+        "The dataset uses structured output, but we are not using it as the "
+        "model is a reasoning model."
+    )
+elif self.dataset_config.task.uses_structured_output:
+    ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+    keys_and_their_types: dict[str, t.Any] = {
+        tag_name: (conlist(str, max_length=5), ...)
+        for tag_name in ner_tag_names
+    }
+    answer_format_class = create_model("AnswerFormat", **keys_and_their_types)
+    structured_generation_schema = answer_format_class.model_json_schema()
+    log_once(
+        "Using structured generation with the JSON schema: "
+        f"{json.dumps(structured_generation_schema)}",
+        level=logging.DEBUG,
+    )
     guided_decoding = GuidedDecodingParams(json=structured_generation_schema)
 elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
     guided_decoding = GuidedDecodingParams(
@@ -392,8 +388,17 @@ class VLLMModel(HuggingFaceEncoderModel):
             for label in self.dataset_config.labels
         ]
     )
+    log_once(
+        "Using structured generation with the choices: "
+        f"{guided_decoding.choice!r}.",
+        level=logging.DEBUG,
+    )
 else:
     guided_decoding = None
+    log_once(
+        "Not using structured generation as the dataset does not require it.",
+        level=logging.DEBUG,
+    )
 
 # Define the parameters used for vLLM generation
 max_tokens: int = (
@@ -439,6 +444,7 @@ class VLLMModel(HuggingFaceEncoderModel):
 # Generate sequences using vLLM
 input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
 num_attempts = 3
+truncation_attempts = 0
 for _ in range(num_attempts):
     try:
         raw_outputs = self._model.generate(
@@ -466,12 +472,19 @@ class VLLMModel(HuggingFaceEncoderModel):
     "Prompts are too long, so truncating them and trying again..."
 )
 logger.debug(f"The error message was: {str(e)}")
+
+# If we have already tried truncating the prompts a few times, then
+# we truncate a bit more aggressively
+extra_truncation = 50 * truncation_attempts
+truncation_attempts += 1
+
 tokenized_prompts = self._tokeniser(
     text=prompts,
     truncation=True,
     max_length=max(
         min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH)
-        - max_tokens
+        - max_tokens
+        - extra_truncation,
         0,
     ),
 )
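For orientation (not part of the diff): the hunks above reorder when structured generation is enabled. Reasoning models now skip guided decoding even for logprob-based classification tasks, structured-output tasks get a JSON schema built with Pydantic, and plain classification tasks fall back to a choice list. Below is a rough, standalone sketch of that decision order; the TaskInfo container and the plain-dict return value are illustrative stand-ins and not the EuroEval or vLLM API.

    from dataclasses import dataclass, field

    from pydantic import conlist, create_model


    @dataclass
    class TaskInfo:
        """Illustrative stand-in for the dataset/task configuration used above."""
        uses_structured_output: bool
        uses_logprobs: bool
        labels: list[str] = field(default_factory=list)
        ner_tag_names: list[str] = field(default_factory=list)
        is_reasoning_model: bool = False


    def choose_guided_decoding(task: TaskInfo) -> dict | None:
        """Mirror the decision order in the diff: reasoning models opt out first."""
        wants_guidance = task.uses_structured_output or (
            task.uses_logprobs and bool(task.labels)
        )
        if wants_guidance and task.is_reasoning_model:
            return None  # reasoning models skip structured generation entirely
        if task.uses_structured_output:
            # Build a JSON schema with one list-of-strings field per NER tag
            fields = {tag: (conlist(str, max_length=5), ...) for tag in task.ner_tag_names}
            schema = create_model("AnswerFormat", **fields).model_json_schema()
            return {"json": schema}
        if task.uses_logprobs and task.labels:
            return {"choice": task.labels}  # constrain output to the label set
        return None


    if __name__ == "__main__":
        ner_task = TaskInfo(
            uses_structured_output=True,
            uses_logprobs=False,
            ner_tag_names=["person", "location"],
        )
        print(choose_guided_decoding(ner_task))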
euroeval/constants.py
CHANGED
@@ -75,3 +75,9 @@ LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
 
 # These characters are stripped from JSON output when trying to identify the label
 JSON_STRIP_CHARACTERS = ' {}\n\r":'
+
+
+# The number of tokens we generate when evaluating generative models on classification
+# tasks. We also use this to determine whether we should store logprobs in the model
+# outputs (and cache).
+NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
euroeval/data_models.py
CHANGED
@@ -125,6 +125,12 @@ class Task:
         A list of generative model types that are allowed to be evaluated on this
         task. If None, all generative model types are allowed. Only relevant if
         `allowed_model_types` includes generative models.
+    allow_invalid_model_outputs (optional):
+        Whether to allow invalid model outputs. This is only relevant for generative
+        models on classification tasks, where the model may generate an output
+        which is not one of the allowed labels. If True, the model output will be
+        mapped to the closest valid label. If False, the model output will be
+        considered incorrect and the evaluation will be aborted. Defaults to True.
 """
 
 name: str
@@ -148,6 +154,7 @@ class Task:
         GenerativeType.REASONING,
     ]
 )
+allow_invalid_model_outputs: bool = True
 
 def __post_init__(self) -> None:
     """Post-initialisation checks."""
@@ -430,7 +437,6 @@ class DatasetConfig:
     if self._prompt_prefix is None
     else self._prompt_prefix
 )
-prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
 return prompt_prefix
 
 @property
@@ -443,7 +449,6 @@ class DatasetConfig:
     if self._prompt_template is None
     else self._prompt_template
 )
-prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
 return prompt_template
 
 @property
@@ -456,9 +461,6 @@ class DatasetConfig:
     if self._instruction_prompt is None
     else self._instruction_prompt
 )
-instruction_prompt = instruction_prompt.replace(
-    "{labels_str}", self._labels_str
-)
 return instruction_prompt
 
 @property
@@ -519,15 +521,16 @@ class DatasetConfig:
     """Return a hash of the dataset configuration."""
     return hash(self.name)
 
-
-def _labels_str(self) -> str:
+def get_labels_str(self, labels: list[str] | None = None) -> str:
     """Converts a set of labels to a natural string, in the specified language.
 
     If the task is NER, we separate using 'and' and use the mapped labels instead of
     the BIO NER labels.
 
     Args:
-
+        labels (optional):
+            The labels to convert to a natural string. If None, uses all the labels
+            in the dataset. Defaults to None.
 
     Returns:
         The natural string representation of the labels in specified language.
@@ -539,16 +542,17 @@ class DatasetConfig:
 else:
     sep_word = main_language.or_separator
 
-
-
-
-
-
-
-
+if labels is None:
+    labels = list()
+    for english_label in self.labels:
+        if english_label not in self.prompt_label_mapping:
+            continue
+        label = self.prompt_label_mapping[english_label]
+        if label not in labels:
+            labels.append(label)
 
 # Convert labels to single-quoted labels - and remove duplicates
-quoted_labels = [f"'{label}'" for label in
+quoted_labels = [f"'{label}'" for label in labels]
 
 if not quoted_labels:
     return ""
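For orientation (not part of the diff): the new get_labels_str method accepts an optional label subset and otherwise falls back to the deduplicated, prompt-mapped labels of the dataset. A simplified, self-contained sketch of that fallback follows; the separator word, the final joining format, and the example mapping are assumptions for illustration only.

    def labels_to_natural_string(
        labels: list[str] | None,
        all_english_labels: list[str],
        prompt_label_mapping: dict[str, str],
        sep_word: str = "or",  # illustrative; EuroEval picks this per language
    ) -> str:
        """Rough re-implementation of the get_labels_str fallback logic."""
        if labels is None:
            labels = []
            for english_label in all_english_labels:
                if english_label not in prompt_label_mapping:
                    continue
                mapped = prompt_label_mapping[english_label]
                if mapped not in labels:
                    labels.append(mapped)  # preserve order, drop duplicates
        quoted = [f"'{label}'" for label in labels]
        if not quoted:
            return ""
        if len(quoted) == 1:
            return quoted[0]
        return f"{', '.join(quoted[:-1])} {sep_word} {quoted[-1]}"


    # e.g. "'positiv', 'neutral' or 'negativ'" for a Danish-style sentiment mapping
    print(labels_to_natural_string(
        labels=None,
        all_english_labels=["positive", "neutral", "negative"],
        prompt_label_mapping={
            "positive": "positiv", "neutral": "neutral", "negative": "negativ"
        },
    ))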
euroeval/dataset_configs/danish.py
CHANGED
@@ -84,7 +84,6 @@ EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
 )
 
 
@@ -159,7 +158,6 @@ EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
     unofficial=True,
 )
 
@@ -172,6 +170,5 @@ EUROPEAN_VALUES_COMPLETIONS_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
     unofficial=True,
 )
euroeval/generation_utils.py
CHANGED
@@ -9,7 +9,7 @@ import typing as t
 from .enums import TaskGroup
 from .exceptions import InvalidBenchmark
 from .tokenization_utils import apply_chat_template
-from .utils import log_once
+from .utils import extract_multiple_choice_labels, log_once
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -230,18 +230,49 @@ def apply_prompt(
         return dataset_config.prompt_template.format(**kwargs), ""
 
     match dataset_config.task.task_group:
-        case
-
-
+        case TaskGroup.SEQUENCE_CLASSIFICATION:
+            labels_str = dataset_config.get_labels_str()
+            few_shot_sections = [
+                create_prompt(
+                    text=example["text"].replace("\n", " ").strip(),
+                    label=example["label"].replace("\n", " ").strip(),
+                    labels_str=labels_str,
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(
+                    text=text.replace("\n", " ").strip(),
+                    label="",
+                    labels_str=labels_str,
+                )
+                for text in examples["text"]
+            ]
+
+        case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
             few_shot_sections = [
                 create_prompt(
                     text=example["text"].replace("\n", " ").strip(),
                     label=example["label"].replace("\n", " ").strip(),
+                    labels_str=dataset_config.get_labels_str(
+                        labels=extract_multiple_choice_labels(
+                            prompt=example["text"],
+                            candidate_labels=dataset_config.labels,
+                        )
+                    ),
                 )
                 for example in few_shot_examples
             ]
             new_sections = [
-                create_prompt(
+                create_prompt(
+                    text=text.replace("\n", " ").strip(),
+                    label="",
+                    labels_str=dataset_config.get_labels_str(
+                        labels=extract_multiple_choice_labels(
+                            prompt=text, candidate_labels=dataset_config.labels
+                        )
+                    ),
+                )
                 for text in examples["text"]
             ]
 
@@ -259,6 +290,7 @@ def apply_prompt(
             ]
 
         case TaskGroup.TOKEN_CLASSIFICATION:
+            labels_str = dataset_config.get_labels_str()
 
             def create_label(example: dict) -> str:
                 prompt_labels = dataset_config.prompt_label_mapping.values()
@@ -280,12 +312,15 @@ def apply_prompt(
                 create_prompt(
                     text=" ".join(example["tokens"]).replace("\n", " ").strip(),
                     label=create_label(example=example),
+                    labels_str=labels_str,
                 )
                 for example in few_shot_examples
             ]
             new_sections = [
                 create_prompt(
-                    text=" ".join(tokens).replace("\n", " ").strip(),
+                    text=" ".join(tokens).replace("\n", " ").strip(),
+                    label="",
+                    labels_str=labels_str,
                 )
                 for tokens in examples["tokens"]
             ]
@@ -375,4 +410,7 @@ def apply_prompt(
         for new_prompt, _ in new_sections
     ]
 
+    # Always add the final prompts without few-shot examples, too, for analysis
+    examples["prompt"] = [new_prompt for new_prompt, _ in new_sections]
+
     return examples
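For orientation (not part of the diff): multiple-choice prompts now get a per-example labels_str computed from whichever option letters actually appear in that example's text. A hedged sketch of that per-sample filtering is below; create_prompt here is a toy formatter and not EuroEval's template-based helper.

    import re


    def extract_choice_letters(prompt: str, candidate_labels: list[str]) -> list[str]:
        """Keep only the option letters that occur as 'x. ' in the prompt."""
        return [
            label
            for label in candidate_labels
            if re.search(rf"\b{label}\. ", prompt, flags=re.IGNORECASE)
        ]


    def create_prompt(text: str, label: str, labels_str: str) -> str:
        """Toy stand-in for EuroEval's template-based prompt builder."""
        return f"Question: {text}\nAnswer with {labels_str}: {label}".rstrip()


    prompt_text = "What is 2+2?\na. 3\nb. 4\nc. 5"
    letters = extract_choice_letters(prompt_text, ["a", "b", "c", "d"])
    labels_str = ", ".join(f"'{letter}'" for letter in letters)  # 'a', 'b', 'c'
    print(create_prompt(text=prompt_text.replace("\n", " "), label="", labels_str=labels_str))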
euroeval/metrics/pipeline.py
CHANGED
@@ -26,6 +26,27 @@ logger: logging.Logger = logging.getLogger("euroeval")
 T = t.TypeVar("T", bound=int | float | str | bool)
 
 
+class PreprocessingFunction(t.Protocol):
+    """A protocol for a preprocessing function."""
+
+    def __call__(
+        self, predictions: c.Sequence[int], dataset: "Dataset"
+    ) -> c.Sequence[int]:
+        """Preprocess the model predictions before they are passed to the pipeline.
+
+        Args:
+            predictions:
+                The model predictions.
+            dataset:
+                The dataset used for evaluation. This is only used in case any
+                additional metadata is used to compute the metrics.
+
+        Returns:
+            The preprocessed model predictions.
+        """
+        ...
+
+
 class PipelineMetric(Metric):
     """Load a scikit-learn pipeline and use it to get scores from the predictions."""
 
@@ -36,7 +57,7 @@ class PipelineMetric(Metric):
         pipeline_repo: str,
         pipeline_scoring_function: c.Callable[["Pipeline", c.Sequence], float],
         pipeline_file_name: str = "pipeline.pkl",
-        preprocessing_fn:
+        preprocessing_fn: PreprocessingFunction | None = None,
         postprocessing_fn: c.Callable[[float], tuple[float, str]] | None = None,
     ) -> None:
         """Initialise the pipeline transform metric.
@@ -101,7 +122,10 @@ class PipelineMetric(Metric):
         """
         if self.pipeline is None:
             self.pipeline = self._download_pipeline()
-
+        if self.preprocessing_fn is not None:
+            predictions = self.preprocessing_fn(
+                predictions=predictions, dataset=dataset
+            )
         return self.pipeline_scoring_function(self.pipeline, predictions)
 
     def _download_pipeline(self) -> "Pipeline":
@@ -133,13 +157,18 @@ class PipelineMetric(Metric):
 ### European Values Metric ###
 
 
-def european_values_preprocessing_fn(
+def european_values_preprocessing_fn(
+    predictions: c.Sequence[int], dataset: "Dataset"
+) -> c.Sequence[int]:
     """Preprocess the model predictions for the European Values metric.
 
     Args:
         predictions:
             The model predictions, a sequence of integers representing the predicted
             choices for each question.
+        dataset:
+            The dataset used for evaluation. This is only used in case any additional
+            metadata is used to compute the metrics.
 
     Returns:
         The preprocessed model predictions, a sequence of integers representing the
@@ -154,6 +183,17 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
     num_questions = 53
     num_phrasings_per_question = 5
 
+    # Convert the predictions to integers
+    integer_predictions = []
+    for prediction, idx_to_choice in zip(predictions, dataset["idx_to_choice"]):
+        idx_to_choice = {
+            int(idx): int(choice)
+            for idx, choice in idx_to_choice.items()
+            if choice is not None
+        }
+        integer_prediction = idx_to_choice[prediction]
+        integer_predictions.append(integer_prediction)
+
     assert len(predictions) % num_questions == 0, (
         f"The number of predictions ({len(predictions)}) is not a multiple of "
         f"{num_questions}, which is required for the European Values metric."
@@ -171,7 +211,7 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
     # Shape: (num_questions, num_phrasings_per_question)
     arr = np.array(
         [
-
+            integer_predictions[i : i + num_phrasings_per_question]
             for i in range(0, len(predictions), num_phrasings_per_question)
         ]
     )
@@ -188,7 +228,7 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
     arr = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=1, arr=arr)
 
     # Convert the array to a list
-
+    integer_predictions = arr.tolist()
 
     # Some of the questions are categorical and we're only interested in whether the
     # model chooses a specific choice or not. This mapping takes the question index
@@ -208,11 +248,13 @@ def european_values_preprocessing_fn(predictions: c.Sequence[int]) -> c.Sequence
     }
 
     # Map the predictions to the choices we're interested in
-
+    integer_predictions = list(integer_predictions)
     for question_idx, choice in question_choices.items():
-
+        integer_predictions[question_idx] = (
+            1 if integer_predictions[question_idx] == choice else 0
+        )
 
-    return
+    return integer_predictions
 
 
 def european_values_scoring_function(
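For orientation (not part of the diff): the new PreprocessingFunction protocol means any callable that accepts predictions and dataset keyword arguments can be plugged into PipelineMetric. A minimal sketch of a conforming callable follows; the dataset type is simplified to a plain mapping here, and the drop_unanswerable function and its "is_answerable" column are hypothetical.

    import collections.abc as c
    import typing as t


    class PreprocessingFunction(t.Protocol):
        def __call__(
            self, predictions: c.Sequence[int], dataset: t.Mapping[str, list]
        ) -> c.Sequence[int]:
            ...


    def drop_unanswerable(
        predictions: c.Sequence[int], dataset: t.Mapping[str, list]
    ) -> c.Sequence[int]:
        """Hypothetical preprocessing: zero out predictions flagged in the dataset."""
        flags = dataset["is_answerable"]
        return [pred if flag else 0 for pred, flag in zip(predictions, flags)]


    fn: PreprocessingFunction = drop_unanswerable  # structurally matches the protocol
    print(fn(predictions=[2, 3, 1], dataset={"is_answerable": [True, False, True]}))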
euroeval/model_cache.py
CHANGED
@@ -10,7 +10,9 @@ from dataclasses import asdict
 
 from tqdm.auto import tqdm
 
+from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import GenerativeModelOutput, SingleGenerativeModelOutput
+from .utils import log_once
 
 if t.TYPE_CHECKING:
     from pathlib import Path
@@ -189,10 +191,20 @@ class ModelCache:
 # the indices of the top scores, to save space. Further, we only store
 # the scores if the generated sequence is shorter than the maximum
 # length
-if
+if (
+    model_output.scores is not None
+    and self.max_generated_tokens
+    <= NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
+):
     assert model_output.scores is not None
     scores = model_output.scores[sample_idx]
 else:
+    if model_output.scores is not None:
+        log_once(
+            "The generated sequence is longer than the maximum "
+            "length for classification. Not caching the scores.",
+            level=logging.DEBUG,
+        )
     scores = None
 self[model_input] = SingleGenerativeModelOutput(
     sequence=model_output.sequences[sample_idx], scores=scores
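For orientation (not part of the diff): the cache now stores logprob scores only when the task's generation budget is at or below NUM_GENERATION_TOKENS_FOR_CLASSIFICATION, keeping classification-style outputs cheap to cache while skipping long free-form generations. A rough sketch of that gate, under the assumption that scores are simply dropped otherwise:

    NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10  # mirrors the new constant


    def scores_to_cache(scores: list | None, max_generated_tokens: int) -> list | None:
        """Return the scores to store, or None when caching them would be wasteful."""
        if (
            scores is not None
            and max_generated_tokens <= NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
        ):
            return scores
        # Long generations: drop the scores before writing to the cache
        return None


    print(scores_to_cache(scores=[[("yes", -0.1)]], max_generated_tokens=10))   # kept
    print(scores_to_cache(scores=[[("yes", -0.1)]], max_generated_tokens=256))  # dropped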
euroeval/task_group_utils/multiple_choice_classification.py
CHANGED
@@ -126,7 +126,7 @@ def prepare_examples(
     ):
         choice_idxs.append(idx)
 
-    choices = [sections[idx] for idx in choice_idxs]
+    choices = [sections[idx] for idx in reversed(choice_idxs)]
 
     # Check that the choices are present, and that all of them are at the end
     assert len(choices) > 0, "No choices found in the document."
@@ -146,7 +146,7 @@ def prepare_examples(
     )
     new_examples["label"] = [
         int(choice.startswith(f"{letter}. ") and letter == examples["label"][0])
-        for letter, choice in zip("
+        for letter, choice in zip("abcdefghijklmnopqrstuvwxyz", choices)
     ]
     new_examples["id"] = [hashlib.md5(string=doc.encode()).hexdigest()] * len(choices)
     return new_examples
euroeval/task_group_utils/sequence_classification.py
CHANGED
@@ -9,7 +9,11 @@ import numpy as np
 
 from ..enums import TaskGroup
 from ..exceptions import InvalidBenchmark
-from ..utils import
+from ..utils import (
+    extract_multiple_choice_labels,
+    log_once,
+    raise_if_model_output_contains_nan_values,
+)
 
 if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
@@ -128,6 +132,21 @@ def extract_labels_from_generation(
             or if the model outputted log probabilities but the first label token
             mapping is not provided.
     """
+    # Get the candidate labels, which are the labels that the model can predict
+    default_labels = [
+        dataset_config.prompt_label_mapping[lbl]
+        for lbl in dataset_config.id2label.values()
+    ]
+    if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+        sample_candidate_labels = [
+            extract_multiple_choice_labels(
+                prompt=prompt, candidate_labels=default_labels
+            )
+            for prompt in input_batch["prompt"]
+        ]
+    else:
+        sample_candidate_labels = [default_labels] * len(input_batch["prompt"])
+
     if model_output.scores is not None:
         if first_label_token_mapping is False:
             raise InvalidBenchmark(
@@ -136,8 +155,8 @@ def extract_labels_from_generation(
             )
         labels = get_closest_logprobs_labels(
             generation_logprobs=model_output.scores,
-            dataset_config=dataset_config,
             first_label_token_mapping=first_label_token_mapping,
+            candidate_labels=sample_candidate_labels,
         )
         if labels is not None:
             return labels
@@ -147,31 +166,8 @@ def extract_labels_from_generation(
             "does not seem to be able to do that. Skipping the evaluation."
         )
 
-    # Get the candidate labels, which are the labels that the model can predict
-    candidate_labels = [
-        dataset_config.prompt_label_mapping[lbl]
-        for lbl in dataset_config.id2label.values()
-    ]
-
     new_predicted_labels: list[str] = list()
     for idx, predicted_label in enumerate(model_output.sequences):
-        # Special case if we are doing multiple choice classification: we in this case
-        # dynamically change the candidate labels to the labels mentioned in the prompt
-        if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
-            prompt = input_batch["text"][idx]
-            sample_candidate_labels = [
-                candidate_label
-                for candidate_label in candidate_labels
-                if re.search(
-                    pattern=rf"\b{candidate_label}. ",
-                    string=prompt,
-                    flags=re.IGNORECASE,
-                )
-                is not None
-            ]
-        else:
-            sample_candidate_labels = candidate_labels
-
         # If the prediction includes a boxed answer, use that instead of the full
         # generation
         if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
@@ -192,22 +188,43 @@ def extract_labels_from_generation(
                 s2=candidate_label.lower(),
                 weights=(insertion_weight, deletion_weight, substitution_weight),
             )
-            for candidate_label in sample_candidate_labels
+            for candidate_label in sample_candidate_labels[idx]
+        ]
+
+        best_candidate_label = sample_candidate_labels[idx][
+            np.argmin(edit_distances).item()
         ]
 
-        # If no candidate labels were found, we
-        #
+        # If no candidate labels were found, we either pick the label with the smallest
+        # word edit distance to the predicted label (if invalid model outputs are
+        # allowed), or we raise an error
         if min(edit_distances) > 100:
-
-
-
-
-
-
-
+            if dataset_config.task.allow_invalid_model_outputs:
+                logger.warning(
+                    "No candidate labels found for the predicted label "
+                    f"{predicted_label!r}, out of the candidate labels "
+                    f"{sample_candidate_labels[idx]}. This likely means that the model "
+                    "output is completely off, but since invalid model outputs are "
+                    "allowed for this task, we will use the closest candidate label "
+                    f"({best_candidate_label})) as the output label. If you see this "
+                    "warning very often, please report this issue to the EuroEval "
+                    "team at github.com/EuroEval/EuroEval/issues."
+                )
+                logger.debug(
+                    "The candidate labels were extracted from the prompt: "
+                    f"{input_batch['text'][idx]!r}."
+                )
+            else:
+                raise InvalidBenchmark(
+                    "No candidate labels found for the predicted label "
+                    f"{predicted_label!r}, out of the candidate labels "
+                    f"{sample_candidate_labels[idx]}. This likely means that the model "
+                    "output is completely off, and we cannot extract any labels from "
+                    "it. Please check the model output and the candidate labels. The "
+                    "candidate labels were extracted from the prompt: "
+                    f"{input_batch['text'][idx]!r}."
+                )
 
-        # Pick the label with the smallest word edit distance to the predicted label
-        best_candidate_label = sample_candidate_labels[np.argmin(edit_distances).item()]
         new_predicted_labels.append(best_candidate_label)
 
     return new_predicted_labels
@@ -215,8 +232,8 @@ def extract_labels_from_generation(
 
 def get_closest_logprobs_labels(
     generation_logprobs: list[list[list[tuple[str, float]]]],
-    dataset_config: "DatasetConfig",
     first_label_token_mapping: dict[str, str] | t.Literal[True],
+    candidate_labels: list[list[str]],
 ) -> list[str] | None:
     """Get the labels with the highest predicted logprob value.
 
@@ -229,11 +246,11 @@ def get_closest_logprobs_labels(
         generation_logprobs:
             The logprobs of the generated tokens, for all samples in the batch. Of shape
             (batch_size, num_tokens, num_logprobs).
-        dataset_config:
-            The configuration of the dataset.
         first_label_token_mapping:
             A mapping from labels to the first token in each label, or alternatively a
             `True` value indicating that the model should output logprobs.
+        candidate_labels:
+            The candidate labels for each sample in the batch.
 
     Returns:
         The predicted labels, or None if labels could not be extracted.
@@ -242,12 +259,8 @@ def get_closest_logprobs_labels(
         InvalidBenchmark:
             If no candidate label can be found for any of the generated labels.
     """
-    english_labels = list(dataset_config.id2label.values())
-    english2local = dataset_config.prompt_label_mapping
-    candidate_labels = [english2local[lbl].lower() for lbl in english_labels]
-
     output_labels: list[str] = list()
-    for sample in generation_logprobs:
+    for idx, sample in enumerate(generation_logprobs):
         for logprob_list in sample:
             generated_labels = [
                 re.sub(pattern=r"^[^a-zæøåüöä0-9]+$", repl="", string=label.lower())
@@ -265,7 +278,7 @@ def get_closest_logprobs_labels(
             if isinstance(first_label_token_mapping, dict):
                 if any(
                     candidate_label not in first_label_token_mapping
-                    for candidate_label in candidate_labels
+                    for candidate_label in candidate_labels[idx]
                 ):
                     raise InvalidBenchmark(
                         "There is a label not present in the first label token "
@@ -276,13 +289,13 @@ def get_closest_logprobs_labels(
 
                 candidate_output_labels = {
                     candidate_label
-                    for candidate_label in candidate_labels
+                    for candidate_label in candidate_labels[idx]
                     if generated_label == first_label_token_mapping[candidate_label]
                 }
             else:
                 candidate_output_labels = {
                     candidate_label
-                    for candidate_label in candidate_labels
+                    for candidate_label in candidate_labels[idx]
                    if candidate_label.startswith(generated_label)
                 }
 
@@ -328,7 +341,7 @@ def get_closest_logprobs_labels(
             elif len(candidate_output_labels) == 0:
                 candidate_output_labels_starting_with_generated_label = [
                     candidate_label
-                    for candidate_label in candidate_labels
+                    for candidate_label in candidate_labels[idx]
                     if candidate_label.startswith(generated_label)
                 ]
                 if candidate_output_labels_starting_with_generated_label:
@@ -364,18 +377,18 @@ def get_closest_logprobs_labels(
         if len(sample) == 0:
             log_once(
                 "The model outputted an empty string, so no candidate labels could "
-
-                "as the output label.",
+                "be determined. Using the first label, "
+                f"{candidate_labels[idx][0]!r}, as the output label.",
                 level=logging.INFO,
             )
         else:
             log_once(
                 "Could not find a candidate label for any of the generated "
                 f"labels in the sample {sample}. Using the first label, "
-                f"{candidate_labels[0]!r}, as the output label.",
+                f"{candidate_labels[idx][0]!r}, as the output label.",
                 level=logging.INFO,
             )
-        output_labels.append(candidate_labels[0])
+        output_labels.append(candidate_labels[idx][0])
 
     assert len(output_labels) == len(generation_logprobs)
     return output_labels
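For orientation (not part of the diff): when a generated answer is nowhere near any candidate label (edit distance above the threshold), the behaviour now depends on allow_invalid_model_outputs: warn and keep the closest label, or abort. A condensed sketch follows; the 100 threshold is taken from the diff, while the unweighted distance function and the surrounding helper are illustrative rather than EuroEval's weighted variant.

    import logging

    logger = logging.getLogger("example")


    def edit_distance(s1: str, s2: str) -> int:
        """Plain Levenshtein distance (unweighted, unlike EuroEval's weighted variant)."""
        prev = list(range(len(s2) + 1))
        for i, ch1 in enumerate(s1, start=1):
            curr = [i]
            for j, ch2 in enumerate(s2, start=1):
                curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + (ch1 != ch2)))
            prev = curr
        return prev[-1]


    def pick_label(
        predicted: str, candidates: list[str], allow_invalid: bool, threshold: int = 100
    ) -> str:
        """Closest-label fallback mirroring the allow_invalid_model_outputs switch."""
        distances = [edit_distance(predicted.lower(), c.lower()) for c in candidates]
        best = candidates[min(range(len(candidates)), key=distances.__getitem__)]
        if min(distances) > threshold:
            if allow_invalid:
                logger.warning("Output %r matches no candidate; using %r", predicted, best)
            else:
                raise ValueError(f"No candidate label found for {predicted!r}")
        return best


    print(pick_label("positiv", ["positive", "negative"], allow_invalid=True))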
euroeval/task_group_utils/token_classification.py
CHANGED
@@ -215,6 +215,20 @@ def extract_labels_from_generation(
 
     prompt_label_mapping = dataset_config.prompt_label_mapping
     for prompt_tag_name, named_entities in prediction_dict.items():
+        if not isinstance(named_entities, list):
+            logger.debug(
+                "The model produced an invalid format for the named entities. "
+                f"Expected a list but got {type(named_entities)}. Skipping."
+            )
+            continue
+        try:
+            named_entities = [str(ne) for ne in named_entities]
+        except Exception:
+            logger.debug(
+                "The model produced an invalid format for the named entities. "
+                f"Expected a list of strings but got {named_entities}. Skipping."
+            )
+            continue
         try:
             tag_name = [
                 tag[2:]
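For orientation (not part of the diff): the added guards skip malformed NER predictions instead of crashing, dropping values that are not lists and coercing list items to strings. A small sketch of that validation on a raw JSON-like prediction dict (the tag names are made up):

    import logging

    logger = logging.getLogger("example")


    def clean_prediction_dict(prediction_dict: dict) -> dict[str, list[str]]:
        """Keep only tag -> list-of-strings entries, mirroring the new guards."""
        cleaned: dict[str, list[str]] = {}
        for tag_name, named_entities in prediction_dict.items():
            if not isinstance(named_entities, list):
                logger.debug(
                    "Expected a list for %r, got %s. Skipping.", tag_name, type(named_entities)
                )
                continue
            try:
                cleaned[tag_name] = [str(ne) for ne in named_entities]
            except Exception:
                logger.debug("Could not convert entities for %r to strings. Skipping.", tag_name)
                continue
        return cleaned


    print(clean_prediction_dict({"person": ["Alice", 42], "location": "not-a-list"}))
    # -> {'person': ['Alice', '42']}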
euroeval/tasks.py
CHANGED
@@ -1,6 +1,7 @@
 """All benchmarks tasks used in EuroEval."""
 
 from . import metrics as m
+from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import Task
 from .enums import GenerativeType, ModelType, TaskGroup
 from .prompt_templates import (
@@ -28,7 +29,7 @@ LA = Task(
     template_dict=LA_TEMPLATES,
     metrics=[m.mcc_metric, m.macro_f1_metric],
     default_num_few_shot_examples=12,
-    default_max_generated_tokens=
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["correct", "incorrect"],
     uses_logprobs=True,
 )
@@ -73,7 +74,7 @@ SENT = Task(
     template_dict=SENT_TEMPLATES,
     metrics=[m.mcc_metric, m.macro_f1_metric],
     default_num_few_shot_examples=12,
-    default_max_generated_tokens=
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["positive", "neutral", "negative"],
     uses_logprobs=True,
 )
@@ -97,7 +98,7 @@ KNOW = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
     uses_logprobs=True,
 )
@@ -109,7 +110,7 @@ MCRC = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
     uses_logprobs=True,
 )
@@ -121,7 +122,7 @@ COMMON_SENSE = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
     uses_logprobs=True,
 )
@@ -133,8 +134,8 @@ EUROPEAN_VALUES = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.european_values_metric],
     default_num_few_shot_examples=0,
-    default_max_generated_tokens=
-    default_labels=["
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
     allowed_model_types=[ModelType.GENERATIVE],
     allowed_generative_types=[
         GenerativeType.INSTRUCTION_TUNED,
@@ -142,6 +143,7 @@ EUROPEAN_VALUES = Task(
     ],
     requires_zero_shot=True,
     uses_logprobs=True,
+    allow_invalid_model_outputs=False,
 )
 
 
euroeval/tokenization_utils.py
CHANGED
@@ -7,9 +7,8 @@ import typing as t
 import torch
 from transformers import MistralCommonTokenizer
 
-from euroeval.exceptions import InvalidModel
-
 from .enums import GenerativeType
+from .exceptions import InvalidModel
 from .utils import log_once
 
 if t.TYPE_CHECKING:
euroeval/utils.py
CHANGED
@@ -25,7 +25,7 @@ from datasets.utils import disable_progress_bar
 from requests.exceptions import RequestException
 from transformers import logging as tf_logging
 
-from .exceptions import NaNValueInModelOutput
+from .exceptions import InvalidBenchmark, NaNValueInModelOutput
 
 if t.TYPE_CHECKING:
     from types import TracebackType
@@ -457,3 +457,34 @@ def get_hf_token(api_key: str | None) -> str | bool:
             level=logging.DEBUG,
         )
         return False
+
+
+def extract_multiple_choice_labels(
+    prompt: str, candidate_labels: list[str]
+) -> list[str]:
+    """Extract multiple choice labels from a prompt.
+
+    Args:
+        prompt:
+            The prompt to extract the labels from.
+        candidate_labels:
+            The candidate labels to look for in the prompt.
+
+    Returns:
+        The extracted labels.
+    """
+    sample_candidate_labels: list[str] = list()
+    for candidate_label in candidate_labels:
+        candidate_label_match = re.search(
+            pattern=rf"\b{candidate_label}\. ", string=prompt, flags=re.IGNORECASE
+        )
+        if candidate_label_match is not None:
+            sample_candidate_labels.append(candidate_label)
+    if not sample_candidate_labels:
+        raise InvalidBenchmark(
+            "Could not extract any candidate labels from the prompt. Please ensure "
+            "that the candidate labels are present in the prompt, each followed by a "
+            "dot and a space (e.g., 'a. '). The candidate labels are: "
+            f"{', '.join(candidate_labels)}. Here is the prompt: {prompt!r}"
+        )
+    return sample_candidate_labels
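For orientation (not part of the diff): the new extract_multiple_choice_labels helper scans a prompt for option letters followed by a dot and a space, and raises InvalidBenchmark if none are found. A quick usage sketch, assuming EuroEval is installed; the prompt text is invented.

    from euroeval.utils import extract_multiple_choice_labels

    prompt = (
        "Which city is the capital of Denmark?\n"
        "a. Copenhagen\n"
        "b. Aarhus\n"
        "c. Odense\n"
        "Answer:"
    )
    print(extract_multiple_choice_labels(prompt=prompt, candidate_labels=["a", "b", "c", "d"]))
    # -> ['a', 'b', 'c']  ("d" never appears in the prompt, so it is filtered out)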
{euroeval-16.0.0.dist-info → euroeval-16.0.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 16.0.
+Version: 16.0.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,10 +61,12 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown
 
{euroeval-16.0.0.dist-info → euroeval-16.0.1.dist-info}/RECORD
CHANGED
@@ -1,34 +1,34 @@
-euroeval/__init__.py,sha256=
+euroeval/__init__.py,sha256=8jqSCcDWvwwNb1guPi8cLAekPSOX9V8DpRx_v3-c19E,3730
 euroeval/benchmark_config_factory.py,sha256=ZKzGkWr-Mr4wEMYNXUHsYkd2R-dxnNyETZJJ-Fq-my0,11386
 euroeval/benchmarker.py,sha256=YNqhl2QchqzbGMGu8QoJAG_mnYbcJ46ksfaS0x78fiw,49847
 euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
 euroeval/cli.py,sha256=RR45NiHMI9hphqBJ7Xopde-C18Be9JgJxgg6eYPFVMM,8594
-euroeval/constants.py,sha256=
+euroeval/constants.py,sha256=imy-YwofbAwTbjk_vgynYf3zaK5kKV349oXZl99DVyM,2742
 euroeval/data_loading.py,sha256=F3fHyR7FoS_a1dx_DyqtcxdB-jxWwE3RCNRvWcp5z1c,4527
-euroeval/data_models.py,sha256=
+euroeval/data_models.py,sha256=UGyqPAYFImrR1gi4ctQdCVb0rjVkEmyf4Lc1a7_6t6E,24663
 euroeval/enums.py,sha256=V73E8FTL1aRz74OKcxokTYLnO7Q8HGs2QI0JPZI4qQo,3032
 euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
 euroeval/finetuning.py,sha256=G86pxxjOAgtcEWpyYDwYOV9pM7WG2Uu9fu7GdDso8dI,11426
 euroeval/generation.py,sha256=wm2u8fDGDgtWxCReG3N6v4_lLvo0OHTpR88ThGSRH7A,12139
-euroeval/generation_utils.py,sha256=
+euroeval/generation_utils.py,sha256=w3hfiJfUPDjf2xSKdDrhlpfuxZlztF0_0h2sFPB2hT0,16212
 euroeval/languages.py,sha256=G2cJI8lDT7eOFHxNR9opJ6zWjdxFDwm8P8HY_4WKFI4,33815
-euroeval/model_cache.py,sha256=
+euroeval/model_cache.py,sha256=h61cL_fy2Sd1sqYZis5lAWqvQIfQXXt_v8QZeftKNkg,9226
 euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
 euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
 euroeval/scores.py,sha256=gJ7DSQVyE2_8qZxJPuUJcFk7Byj2D7nevE23kd4XMbA,3004
 euroeval/speed_benchmark.py,sha256=3iz_bfJgAoJ9K2HNjufyrBMjHVT8PAjuY_NocBGwKe0,4044
-euroeval/tasks.py,sha256=
-euroeval/tokenization_utils.py,sha256=
+euroeval/tasks.py,sha256=fwmDKnIexmWbm8HueLUilYzqdNRfo0rFxX-tjZ53Nbg,4503
+euroeval/tokenization_utils.py,sha256=66nip9llPw3XBEzGY0TE1DrejLV2WvdSA1p1euXC6Bg,20556
 euroeval/types.py,sha256=SCKOALV_-F1PAIwQ7qHNdSF1Uy29TSu9nIc1NYJGUUs,2754
-euroeval/utils.py,sha256=
+euroeval/utils.py,sha256=ITvT-JxXosrDuElNV7cbASfxzDWSBz9mJWAZHiTOiZY,15304
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=vYW97bnlzqxxcIq6lY-zd0o6zxyDRMhT85jOhdKnoYE,11482
 euroeval/benchmark_modules/fresh.py,sha256=_iRTHt9qUkq7jPOlgwx7IwZG48dK4mjMrh7KiEHeUjE,10462
 euroeval/benchmark_modules/hf.py,sha256=HDXuVwt0kZUyL9x3aG5pEjSdGCRfzegqT0xKZYprjU0,43843
 euroeval/benchmark_modules/litellm.py,sha256=M6ct5ppcYfO-Il5VMRm3PuyAeQ-rtS22UKyRStLnqfM,59210
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/vllm.py,sha256=ckWLA9maDP5TLAfLhEXzkOYJBngb5BQR7X7RLKPl64A,41824
 euroeval/dataset_configs/__init__.py,sha256=lEOr4kJzgtUymeNBVhd-VwdUK0YTUZ3GjUMlLz5fGWk,2010
-euroeval/dataset_configs/danish.py,sha256=
+euroeval/dataset_configs/danish.py,sha256=Pb43E-xfgQk9uaxq8ooznvf8okdX8KAYFEPHt1CG_TQ,5192
 euroeval/dataset_configs/dutch.py,sha256=tY7FDw7BmhXxNfI1hqfasxQXP0QbYTqknokTZ7gqdRY,5079
 euroeval/dataset_configs/english.py,sha256=Y4yc3AQu8WojqENj0sy4-rIlx1LhPnsCQ0DeonqDsVs,4128
 euroeval/dataset_configs/estonian.py,sha256=o13P_XkrdhLFCz9l8LJy-TSY3JIN7XmByxesEDiagnc,2879
@@ -47,7 +47,7 @@ euroeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,
 euroeval/metrics/base.py,sha256=4vnRIPfKUwTNe0ZVm5YC2jQNecwchGUpN6nAH5cX0PM,2288
 euroeval/metrics/huggingface.py,sha256=b_Z_FUELQcmK7HeJh0zlAZs3pim1uNHnFLu7nvlZ4_A,5824
 euroeval/metrics/llm_as_a_judge.py,sha256=YCUHWK3_bkMEYvL7Q79ZAK3V0M1m5rq5zJYdtMxa4fs,9686
-euroeval/metrics/pipeline.py,sha256=
+euroeval/metrics/pipeline.py,sha256=a09Um3tnNdyQhzyDa9k-seYQXriYiJRQ5vyHK2lrKcg,10276
 euroeval/metrics/speed.py,sha256=tLna031y0SVzAv6lvXBxf8IOSiw9dvLlonky2zM3MnE,1369
 euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
 euroeval/prompt_templates/linguistic_acceptability.py,sha256=9ZIyv_hfI2Aj20Uy9SY1izq5OBRV844PXPiZCNCOoEY,8207
@@ -57,13 +57,13 @@ euroeval/prompt_templates/reading_comprehension.py,sha256=eRMN-kCT3wuImbuFXzZYfo
 euroeval/prompt_templates/sentiment_classification.py,sha256=eIXn-aAY7LKeXqxzMKoqdVbihA2f1RaNQk7DhceuQdQ,8887
 euroeval/prompt_templates/summarization.py,sha256=GvnKuYJKbJ_2QkdtSWp_h4RhfOXdq-7_yYeClJSPaTY,6137
 euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
-euroeval/task_group_utils/multiple_choice_classification.py,sha256=
+euroeval/task_group_utils/multiple_choice_classification.py,sha256=i5sidJGAXnENRoB6pOelyaUeGP1qoxwPSzD-F9RLwWk,7106
 euroeval/task_group_utils/question_answering.py,sha256=vdEbcZy7BE6ICA7kWkPYmPW4eVuIiZ_4uJRLUexDhwY,27750
-euroeval/task_group_utils/sequence_classification.py,sha256=
+euroeval/task_group_utils/sequence_classification.py,sha256=ZIXcYo6ins9VUv8TT4aupWrfUQoWGBlgU8a1hYATOYM,17249
 euroeval/task_group_utils/text_to_text.py,sha256=7f4hGAs5WNJ9PmW1mLhjDMrPxrYAvw5axXsneiJop1w,4993
-euroeval/task_group_utils/token_classification.py,sha256=
-euroeval-16.0.
-euroeval-16.0.
-euroeval-16.0.
-euroeval-16.0.
-euroeval-16.0.
+euroeval/task_group_utils/token_classification.py,sha256=sNl0rhkXI9g5zKsJujrWX-9jWbYYK2iaKA1AcUg0xW4,17118
+euroeval-16.0.1.dist-info/METADATA,sha256=toyIiyjwyl4Oty2YsD-P6r95hN0Si3BkBNBMOfmiwBA,13729
+euroeval-16.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-16.0.1.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
+euroeval-16.0.1.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+euroeval-16.0.1.dist-info/RECORD,,
File without changes
File without changes
File without changes