EuroEval 15.7.1.tar.gz → 15.7.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (240)
  1. {euroeval-15.7.1 → euroeval-15.7.2}/.pre-commit-config.yaml +1 -1
  2. {euroeval-15.7.1 → euroeval-15.7.2}/CHANGELOG.md +16 -0
  3. {euroeval-15.7.1 → euroeval-15.7.2}/PKG-INFO +1 -1
  4. {euroeval-15.7.1 → euroeval-15.7.2}/pyproject.toml +1 -1
  5. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_config_factory.py +1 -1
  6. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_modules/litellm.py +15 -5
  7. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_modules/vllm.py +1 -1
  8. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmarker.py +13 -11
  9. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/__init__.py +1 -0
  10. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/finnish.py +11 -9
  11. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/languages.py +1 -1
  12. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/task_group_utils/sequence_classification.py +46 -11
  13. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/tokenization_utils.py +50 -14
  14. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_hellaswag_fi.py +18 -4
  15. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_scala.py +0 -6
  16. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_scandisent_fi.py +11 -1
  17. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_turku_ner_fi.py +10 -0
  18. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_tydiqa_fi.py +10 -0
  19. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_xlsum_fi.py +11 -1
  20. {euroeval-15.7.1 → euroeval-15.7.2}/uv.lock +2724 -2724
  21. {euroeval-15.7.1 → euroeval-15.7.2}/.github/ISSUE_TEMPLATE/benchmark_dataset_request.yaml +0 -0
  22. {euroeval-15.7.1 → euroeval-15.7.2}/.github/ISSUE_TEMPLATE/bug.yaml +0 -0
  23. {euroeval-15.7.1 → euroeval-15.7.2}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
  24. {euroeval-15.7.1 → euroeval-15.7.2}/.github/ISSUE_TEMPLATE/model_evaluation_request.yaml +0 -0
  25. {euroeval-15.7.1 → euroeval-15.7.2}/.github/workflows/ci.yaml +0 -0
  26. {euroeval-15.7.1 → euroeval-15.7.2}/.gitignore +0 -0
  27. {euroeval-15.7.1 → euroeval-15.7.2}/CITATION.cff +0 -0
  28. {euroeval-15.7.1 → euroeval-15.7.2}/CODE_OF_CONDUCT.md +0 -0
  29. {euroeval-15.7.1 → euroeval-15.7.2}/CONTRIBUTING.md +0 -0
  30. {euroeval-15.7.1 → euroeval-15.7.2}/Dockerfile.cuda +0 -0
  31. {euroeval-15.7.1 → euroeval-15.7.2}/LICENSE +0 -0
  32. {euroeval-15.7.1 → euroeval-15.7.2}/NEW_DATASET_GUIDE.md +0 -0
  33. {euroeval-15.7.1 → euroeval-15.7.2}/README.md +0 -0
  34. {euroeval-15.7.1 → euroeval-15.7.2}/docs/CNAME +0 -0
  35. {euroeval-15.7.1 → euroeval-15.7.2}/docs/README.md +0 -0
  36. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/README.md +0 -0
  37. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/danish.md +0 -0
  38. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/dutch.md +0 -0
  39. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/english.md +0 -0
  40. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/faroese.md +0 -0
  41. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/finnish.md +0 -0
  42. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/french.md +0 -0
  43. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/german.md +0 -0
  44. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/icelandic.md +0 -0
  45. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/italian.md +0 -0
  46. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/norwegian.md +0 -0
  47. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/spanish.md +0 -0
  48. {euroeval-15.7.1 → euroeval-15.7.2}/docs/datasets/swedish.md +0 -0
  49. {euroeval-15.7.1 → euroeval-15.7.2}/docs/extras/radial_plotter.md +0 -0
  50. {euroeval-15.7.1 → euroeval-15.7.2}/docs/faq.md +0 -0
  51. {euroeval-15.7.1 → euroeval-15.7.2}/docs/gfx/favicon.png +0 -0
  52. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/danish.md +0 -0
  53. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/dutch.md +0 -0
  54. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/english.md +0 -0
  55. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/faroese.md +0 -0
  56. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/french.md +0 -0
  57. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/german.md +0 -0
  58. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/icelandic.md +0 -0
  59. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/italian.md +0 -0
  60. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/norwegian.md +0 -0
  61. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/spanish.md +0 -0
  62. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Monolingual/swedish.md +0 -0
  63. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Multilingual/european.md +0 -0
  64. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Multilingual/germanic.md +0 -0
  65. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Multilingual/mainland-scandinavian.md +0 -0
  66. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/Multilingual/romance.md +0 -0
  67. {euroeval-15.7.1 → euroeval-15.7.2}/docs/leaderboards/README.md +0 -0
  68. {euroeval-15.7.1 → euroeval-15.7.2}/docs/methodology.md +0 -0
  69. {euroeval-15.7.1 → euroeval-15.7.2}/docs/python-package.md +0 -0
  70. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/README.md +0 -0
  71. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/common-sense-reasoning.md +0 -0
  72. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/knowledge.md +0 -0
  73. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/linguistic-acceptability.md +0 -0
  74. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/named-entity-recognition.md +0 -0
  75. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/reading-comprehension.md +0 -0
  76. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/sentiment-classification.md +0 -0
  77. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/speed.md +0 -0
  78. {euroeval-15.7.1 → euroeval-15.7.2}/docs/tasks/summarization.md +0 -0
  79. {euroeval-15.7.1 → euroeval-15.7.2}/gfx/euroeval.png +0 -0
  80. {euroeval-15.7.1 → euroeval-15.7.2}/gfx/euroeval.xcf +0 -0
  81. {euroeval-15.7.1 → euroeval-15.7.2}/gfx/scandeval.png +0 -0
  82. {euroeval-15.7.1 → euroeval-15.7.2}/makefile +0 -0
  83. {euroeval-15.7.1 → euroeval-15.7.2}/mkdocs.yaml +0 -0
  84. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/__init__.py +0 -0
  85. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_modules/__init__.py +0 -0
  86. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_modules/base.py +0 -0
  87. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_modules/fresh.py +0 -0
  88. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_modules/hf.py +0 -0
  89. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/callbacks.py +0 -0
  90. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/cli.py +0 -0
  91. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/constants.py +0 -0
  92. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/data_loading.py +0 -0
  93. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/data_models.py +0 -0
  94. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/danish.py +0 -0
  95. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/dutch.py +0 -0
  96. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/english.py +0 -0
  97. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/faroese.py +0 -0
  98. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/french.py +0 -0
  99. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/german.py +0 -0
  100. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/icelandic.py +0 -0
  101. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/italian.py +0 -0
  102. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/norwegian.py +0 -0
  103. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/spanish.py +0 -0
  104. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/swedish.py +0 -0
  105. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/enums.py +0 -0
  106. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/exceptions.py +0 -0
  107. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/finetuning.py +0 -0
  108. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/generation.py +0 -0
  109. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/generation_utils.py +0 -0
  110. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/human_evaluation.py +0 -0
  111. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/model_cache.py +0 -0
  112. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/model_config.py +0 -0
  113. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/model_loading.py +0 -0
  114. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/prompt_templates/__init__.py +0 -0
  115. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/prompt_templates/linguistic_acceptability.py +0 -0
  116. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/prompt_templates/multiple_choice.py +0 -0
  117. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/prompt_templates/named_entity_recognition.py +0 -0
  118. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/prompt_templates/reading_comprehension.py +0 -0
  119. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/prompt_templates/sentiment_classification.py +0 -0
  120. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/prompt_templates/summarization.py +0 -0
  121. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/scores.py +0 -0
  122. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/speed_benchmark.py +0 -0
  123. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/task_group_utils/__init__.py +0 -0
  124. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/task_group_utils/multiple_choice_classification.py +0 -0
  125. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/task_group_utils/question_answering.py +0 -0
  126. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/task_group_utils/text_to_text.py +0 -0
  127. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/task_group_utils/token_classification.py +0 -0
  128. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/tasks.py +0 -0
  129. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/types.py +0 -0
  130. {euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/utils.py +0 -0
  131. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/constants.py +0 -0
  132. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_allocine.py +0 -0
  133. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_angry_tweets.py +0 -0
  134. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_arc.py +0 -0
  135. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_arc_is.py +0 -0
  136. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_belebele.py +0 -0
  137. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_cnn_dailymail.py +0 -0
  138. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_conll_en.py +0 -0
  139. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_conll_es.py +0 -0
  140. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_conll_nl.py +0 -0
  141. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_dane.py +0 -0
  142. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_danish_citizen_tests.py +0 -0
  143. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_dansk.py +0 -0
  144. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_danske_talemaader.py +0 -0
  145. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_danske_talemaader_old.py +0 -0
  146. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_dbrd.py +0 -0
  147. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_dutch_cola.py +0 -0
  148. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_eltec.py +0 -0
  149. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_fone.py +0 -0
  150. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_foqa.py +0 -0
  151. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_fosent.py +0 -0
  152. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_fquad.py +0 -0
  153. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_germanquad.py +0 -0
  154. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_germeval.py +0 -0
  155. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_hellaswag.py +0 -0
  156. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_hotter_and_colder_sentiment.py +0 -0
  157. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_ice_linguistic.py +0 -0
  158. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_icelandic_error_corpus.py +0 -0
  159. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_icelandic_knowledge.py +0 -0
  160. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_icelandic_qa.py +0 -0
  161. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_icesum.py +0 -0
  162. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_ilpost_sum.py +0 -0
  163. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_jentoft.py +0 -0
  164. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_mim_gold_ner.py +0 -0
  165. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_mlqa_es.py +0 -0
  166. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_mlsum_de.py +0 -0
  167. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_mlsum_es.py +0 -0
  168. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_mmlu.py +0 -0
  169. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_multinerd-it.py +0 -0
  170. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_no_cola.py +0 -0
  171. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_no_sammendrag.py +0 -0
  172. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_nor_common_sense_qa.py +0 -0
  173. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_nordjylland_news.py +0 -0
  174. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_norec.py +0 -0
  175. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_norglm_multiqa.py +0 -0
  176. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_norglm_multisum.py +0 -0
  177. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_norne.py +0 -0
  178. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_norquad.py +0 -0
  179. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_nqii.py +0 -0
  180. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_nrk_quiz_qa.py +0 -0
  181. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_orange_sum.py +0 -0
  182. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_personal_sum.py +0 -0
  183. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_rrn.py +0 -0
  184. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_sb10k.py +0 -0
  185. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_scandiqa.py +0 -0
  186. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_schibsted.py +0 -0
  187. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_sentiment_headlines_es.py +0 -0
  188. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_sentipolc16.py +0 -0
  189. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_squad.py +0 -0
  190. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_squad_it.py +0 -0
  191. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_squad_nl.py +0 -0
  192. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_squad_nl_old.py +0 -0
  193. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_sst5.py +0 -0
  194. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_suc3.py +0 -0
  195. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_swedn.py +0 -0
  196. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_swerec.py +0 -0
  197. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_wiki_lingua_nl.py +0 -0
  198. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_wikiann_fo.py +0 -0
  199. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_wikineural-it.py +0 -0
  200. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_winogrande_is.py +0 -0
  201. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_xquad_es.py +0 -0
  202. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/fix_dot_env_file.py +0 -0
  203. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/load_ud_pos.py +0 -0
  204. {euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/versioning.py +0 -0
  205. {euroeval-15.7.1 → euroeval-15.7.2}/tests/__init__.py +0 -0
  206. {euroeval-15.7.1 → euroeval-15.7.2}/tests/conftest.py +0 -0
  207. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_benchmark_config_factory.py +0 -0
  208. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_benchmark_modules/__init__.py +0 -0
  209. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_benchmark_modules/test_base.py +0 -0
  210. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_benchmark_modules/test_fresh.py +0 -0
  211. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_benchmark_modules/test_hf.py +0 -0
  212. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_benchmark_modules/test_litellm.py +0 -0
  213. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_benchmark_modules/test_vllm.py +0 -0
  214. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_benchmarker.py +0 -0
  215. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_callbacks.py +0 -0
  216. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_cli.py +0 -0
  217. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_constants.py +0 -0
  218. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_data_loading.py +0 -0
  219. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_data_models.py +0 -0
  220. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_dataset_configs.py +0 -0
  221. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_enums.py +0 -0
  222. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_exceptions.py +0 -0
  223. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_finetuning.py +0 -0
  224. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_generation.py +0 -0
  225. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_human_evaluation.py +0 -0
  226. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_languages.py +0 -0
  227. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_model_cache.py +0 -0
  228. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_model_config.py +0 -0
  229. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_model_loading.py +0 -0
  230. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_scores.py +0 -0
  231. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_speed_benchmark.py +0 -0
  232. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_task_utils/__init__.py +0 -0
  233. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_task_utils/test_question_answering.py +0 -0
  234. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_task_utils/test_sequence_classification.py +0 -0
  235. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_task_utils/test_text_to_text.py +0 -0
  236. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_task_utils/test_token_classification.py +0 -0
  237. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_tasks.py +0 -0
  238. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_tokenization_utils.py +0 -0
  239. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_types.py +0 -0
  240. {euroeval-15.7.1 → euroeval-15.7.2}/tests/test_utils.py +0 -0
{euroeval-15.7.1 → euroeval-15.7.2}/.pre-commit-config.yaml

@@ -10,7 +10,7 @@ repos:
  - id: trailing-whitespace
  - id: debug-statements
  - repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.11.7
+ rev: v0.11.8
  hooks:
  - id: ruff
  args:
{euroeval-15.7.1 → euroeval-15.7.2}/CHANGELOG.md

@@ -10,6 +10,22 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.



+ ## [v15.7.2] - 2025-05-02
+ ### Fixed
+ - Now does not check if a model exists if it has already been evaluated. This is an
+ issue when evaluating Ollama models, if the Ollama server is not running.
+ - When evaluating instruction-tuned models on text classification tasks, the chat
+ template sometimes ends with special symbols, such as a newline, which can change the
+ tokenisation of the generated label. When we are evaluating the model using logprobs
+ we are thus looking for the wrong label in these cases. We now take this into account,
+ and log it to the user if the labels are not found, to avoid confusion.
+ - Finnish datasets were not included in the default "all" dataset list, which is the
+ default used when no datasets are specified. This has been fixed now.
+ - Temporarily disabled HellaSwag-fi, as there is an issue with the labels in the test
+ split, causing errors during evaluation. We will re-enable in a future release, when
+ this has been fixed.
+
+
  ## [v15.7.1] - 2025-04-29
  ### Changed
  - Marked the DBRD Dutch sentiment classification as official, as the quality is
{euroeval-15.7.1 → euroeval-15.7.2}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 15.7.1
+ Version: 15.7.2
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
{euroeval-15.7.1 → euroeval-15.7.2}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "EuroEval"
- version = "15.7.1"
+ version = "15.7.2"
  description = "The robust European language model benchmark."
  readme = "README.md"
  authors = [
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_config_factory.py

@@ -238,7 +238,7 @@ def prepare_languages(
  The default language codes of the languages to include.

  Returns:
- The prepared model or dataset languages.
+ The prepared dataset languages.
  """
  # Create a dictionary that maps languages to their associated language objects
  language_mapping = get_all_languages()
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_modules/litellm.py

@@ -1007,6 +1007,10 @@ def try_download_ollama_model(model_id: str) -> bool:

  Returns:
  Whether the model was downloaded successfully.
+
+ Raises:
+ InvalidModel:
+ If Ollama is not running or the model cannot be downloaded.
  """
  if not (model_id.startswith("ollama/") or model_id.startswith("ollama_chat/")):
  return False
@@ -1021,11 +1025,17 @@
  level=logging.WARNING,
  )

- downloaded_ollama_models: list[str] = [
- model_obj.model
- for model_obj in ollama.list().models
- if model_obj.model is not None
- ]
+ try:
+ downloaded_ollama_models: list[str] = [
+ model_obj.model
+ for model_obj in ollama.list().models
+ if model_obj.model is not None
+ ]
+ except ConnectionError:
+ raise InvalidModel(
+ "Ollama does not seem to be running, so we cannot evaluate the model "
+ f"{model_id!r}. Please make sure that Ollama is running and try again."
+ )

  ollama_model_id = "/".join(model_id.split("/")[1:])
  if ollama_model_id not in downloaded_ollama_models:
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmark_modules/vllm.py

@@ -797,7 +797,7 @@ def load_model_and_tokenizer(
  enable_lora=model_config.adapter_base_model_id is not None,
  max_lora_rank=256,
  )
- except (ValueError, OSError) as e:
+ except (RuntimeError, ValueError, OSError) as e:
  if "awaiting a review from the repo authors" in str(e):
  raise InvalidModel(
  f"The model {model_id!r} is awaiting a review from the repository "
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/benchmarker.py

@@ -372,15 +372,7 @@ class Benchmarker:

  current_benchmark_results: list[BenchmarkResult] = list()
  for model_id in model_ids:
- try:
- model_config = get_model_config(
- model_id=model_id, benchmark_config=benchmark_config
- )
- except InvalidModel as e:
- logger.info(e.message)
- num_finished_benchmarks += len(dataset_configs)
- continue
-
+ model_config: ModelConfig | None = None
  loaded_model: BenchmarkModule | None = None
  for dataset_config in dataset_configs:
  # Skip if we have already benchmarked this model on this dataset and
@@ -394,12 +386,22 @@ class Benchmarker:
  ):
  logger.debug(
  f"Skipping benchmarking {model_id} on "
- f"{dataset_config.pretty_name}, as it "
- "has already been benchmarked."
+ f"{dataset_config.pretty_name}, as it has already been "
+ "benchmarked."
  )
  num_finished_benchmarks += 1
  continue

+ if model_config is None:
+ try:
+ model_config = get_model_config(
+ model_id=model_id, benchmark_config=benchmark_config
+ )
+ except InvalidModel as e:
+ logger.info(e.message)
+ num_finished_benchmarks += len(dataset_configs)
+ continue
+
  # Skip if the model is an encoder model and the task is generative
  task_is_generative = (
  dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
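The two benchmarker hunks above implement the first CHANGELOG fix: the model config is now resolved lazily, only when a dataset still needs benchmarking, so a model whose results are all cached never triggers the existence check (and, for Ollama models, no call to a possibly stopped Ollama server). A self-contained sketch of the pattern, using stand-in names rather than EuroEval's actual API:

def benchmark_all(model_ids, dataset_names, already_benchmarked):
    """Illustrative only: resolve the model config lazily inside the dataset loop."""
    for model_id in model_ids:
        config = None  # not resolved until some dataset actually needs it
        for dataset_name in dataset_names:
            if (model_id, dataset_name) in already_benchmarked:
                continue  # cached result: no config lookup, no server contact
            if config is None:
                config = {"model_id": model_id}  # stand-in for a lookup that may fail
            print(f"benchmarking {config['model_id']} on {dataset_name}")


benchmark_all(
    model_ids=["ollama_chat/llama3"],
    dataset_names=["scala-fi", "scandisent-fi"],
    already_benchmarked={("ollama_chat/llama3", "scala-fi")},
)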
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/__init__.py

@@ -7,6 +7,7 @@ from .danish import * # noqa: F403
  from .dutch import * # noqa: F403
  from .english import * # noqa: F403
  from .faroese import * # noqa: F403
+ from .finnish import * # noqa: F403
  from .french import * # noqa: F403
  from .german import * # noqa: F403
  from .icelandic import * # noqa: F403
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/dataset_configs/finnish.py

@@ -2,7 +2,7 @@

  from ..data_models import DatasetConfig
  from ..languages import FI
- from ..tasks import COMMON_SENSE, LA, NER, RC, SENT, SUMM
+ from ..tasks import LA, NER, RC, SENT, SUMM

  ### Official datasets ###

@@ -40,14 +40,16 @@ XLSUM_FI_CONFIG = DatasetConfig(
  languages=[FI],
  )

- HELLASWAG_FI_CONFIG = DatasetConfig(
- name="hellaswag-fi",
- pretty_name="the truncated version of the Finnish common-sense reasoning dataset "
- "HellaSwag-fi, translated from the English HellaSwag dataset",
- huggingface_id="EuroEval/hellaswag-fi-mini",
- task=COMMON_SENSE,
- languages=[FI],
- )
+ # TODO: Include when this issue has been resolved:
+ # https://github.com/EuroEval/EuroEval/issues/158#issuecomment-2846664885
+ # HELLASWAG_FI_CONFIG = DatasetConfig(
+ # name="hellaswag-fi",
+ # pretty_name="the truncated version of the Finnish common-sense reasoning dataset "
+ # "HellaSwag-fi, translated from the English HellaSwag dataset",
+ # huggingface_id="EuroEval/hellaswag-fi-mini",
+ # task=COMMON_SENSE,
+ # languages=[FI],
+ # )

  SCALA_FI_CONFIG = DatasetConfig(
  name="scala-fi",
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/languages.py

@@ -21,6 +21,7 @@ def get_all_languages() -> dict[str, Language]:
  DA = Language(code="da", name="Danish", _and_separator="og", _or_separator="eller")
  NL = Language(code="nl", name="Dutch", _and_separator="en", _or_separator="of")
  EN = Language(code="en", name="English", _and_separator="and", _or_separator="or")
+ FI = Language(code="fi", name="Finnish", _and_separator="ja", _or_separator="tai")
  FO = Language(code="fo", name="Faroese", _and_separator="og", _or_separator="ella")
  FR = Language(code="fr", name="French", _and_separator="et", _or_separator="ou")
  DE = Language(code="de", name="German", _and_separator="und", _or_separator="oder")
@@ -78,7 +79,6 @@ EO = Language(code="eo", name="Esperanto")
  ET = Language(code="et", name="Estonian")
  EE = Language(code="ee", name="Ewe")
  FJ = Language(code="fj", name="Fijian")
- FI = Language(code="fi", name="Finnish")
  FY = Language(code="fy", name="Western Frisian")
  FF = Language(code="ff", name="Fulah")
  GD = Language(code="gd", name="Gaelic")
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/task_group_utils/sequence_classification.py

@@ -132,6 +132,11 @@ def extract_labels_from_generation(
  The predicted labels.
  """
  if model_output.scores is not None:
+ if first_label_token_mapping is False:
+ raise InvalidBenchmark(
+ "The model outputted logprobs, but the first label token mapping is "
+ "not provided. This means that the model should not output logprobs."
+ )
  labels = get_closest_logprobs_labels(
  generation_logprobs=model_output.scores,
  dataset_config=dataset_config,
@@ -147,7 +152,7 @@
  def get_closest_logprobs_labels(
  generation_logprobs: list[list[list[tuple[str, float]]]],
  dataset_config: "DatasetConfig",
- first_label_token_mapping: dict[str, str] | bool,
+ first_label_token_mapping: dict[str, str] | t.Literal[True],
  ) -> list[str] | None:
  """Get the labels with the highest predicted logprob value.

@@ -164,8 +169,7 @@
  The configuration of the dataset.
  first_label_token_mapping:
  A mapping from labels to the first token in each label, or alternatively a
- Boolean value indicating whether the model should output scores (if the
- mapping is outputted then the model will always output scores).
+ `True` value indicating that the model should output logprobs.

  Returns:
  The predicted labels, or None if labels could not be extracted.
@@ -195,7 +199,9 @@
  # label, as the output label
  output_label: str | None = None
  for generated_label in generated_labels:
- # Get the candidate labels that starts with the generated label
+ # Get the candidate labels. If we have a first label token mapping, we
+ # use it to get the candidate labels. Otherwise, we check if any of the
+ # labels start with the generated label.
  if isinstance(first_label_token_mapping, dict):
  if any(
  candidate_label not in first_label_token_mapping
@@ -239,14 +245,43 @@
  )
  return None

- # If no candidate label is found, we ignore the generated label, as it
- # basically means that the model is just really bad at generating
- # labels.
+ # If no candidate label is found, we first check if any of the labels
+ # start with the generated label. This could be the case if the labels
+ # in the first token mapping is inaccurate or incomplete, for instance
+ # if 'pos' is in the first label token mapping, but the model outputted
+ # 'posit'. If this is the case then we cannot trust the first label
+ # token mapping, and we fall back to using word edit distance.
+ # Otherwise, the generated label is just bad, and we skip to the next
+ # generated label.
  elif len(candidate_output_labels) == 0:
- logger.debug(
- f"No candidate label found for the generated label "
- f"{generated_label!r}. The generated label is thus ignored."
- )
+ candidate_output_labels_starting_with_generated_label = [
+ candidate_label
+ for candidate_label in candidate_labels
+ if candidate_label.startswith(generated_label)
+ ]
+ if candidate_output_labels_starting_with_generated_label:
+ log_once(
+ f"No candidate label found for the generated label "
+ f"{generated_label!r}. This means that using logprobs to "
+ "extract the labels is not reliable, and we will instead "
+ "fall back to extracting the labels using word edit "
+ "distance.",
+ level=logging.DEBUG,
+ )
+ return None
+
+ # If we did not find any candidate label for any of the generated labels, we
+ # assume that something is wrong with the model output, and we fall back to
+ # using word edit distance to extract the labels
+ else:
+ log_once(
+ f"No candidate label found for any of the generated labels "
+ f"{generated_labels}. This means that using logprobs to extract "
+ "the labels is not reliable, and we will instead fall back to "
+ "extracting the labels using word edit distance.",
+ level=logging.DEBUG,
+ )
+ return None

  if output_label is not None:
  output_labels.append(output_label)
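When `None` is returned here, the caller stops trusting the logprobs and instead matches the full generated text against the candidate labels by word edit distance. As a rough illustration of that kind of fallback (using the standard library's difflib similarity ratio rather than EuroEval's actual edit-distance code):

import difflib

def closest_label(generated: str, candidate_labels: list[str]) -> str:
    """Return the candidate label most similar to the generated text."""
    # cutoff=0.0 guarantees that the best-scoring label is always returned,
    # even when the generation is far from every candidate.
    return difflib.get_close_matches(
        generated.strip().lower(), candidate_labels, n=1, cutoff=0.0
    )[0]

print(closest_label("posit", ["positive", "negative", "neutral"]))  # positive
print(closest_label("Neutraali!", ["positive", "negative", "neutral"]))  # neutral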
{euroeval-15.7.1 → euroeval-15.7.2}/src/euroeval/tokenization_utils.py

@@ -311,24 +311,60 @@ def get_first_label_token_mapping(
  for label in dataset_config.labels
  ]

- # Get the first token of each label, where we add a prefix space if needed
- add_prefix_space = (
- should_prefix_space_be_added_to_labels(
+ # Tokenize some text containing each label, which we will use to extract the
+ # first token of each label
+ all_tokens: list[list[str]]
+ if tokenizer.chat_template is None:
+ add_prefix_space = should_prefix_space_be_added_to_labels(
  labels_to_be_generated=local_labels, tokenizer=tokenizer
  )
- and tokenizer.chat_template is None
- )
- first_tokens = [
- tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)[0]
- for label in local_labels
- ]
- first_tokens = [
- re.sub(
- pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", repl="", string=token.lower()
- )
- for token in first_tokens
+ all_tokens = [
+ tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)
+ for label in local_labels
+ ]
+ else:
+ all_tokens = [
+ tokenizer.convert_ids_to_tokens(
+ ids=tokenizer.apply_chat_template(
+ conversation=[
+ dict(role="user", content=""),
+ dict(role="assistant", content=label),
+ ],
+ add_generation_prompt=True,
+ tokenize=True,
+ )
+ )
+ for label in local_labels
+ ]
+
+ # Remove any non-alphabetic characters from the tokens
+ all_tokens = [
+ [
+ re.sub(
+ pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
+ repl="",
+ string=token.lower(),
+ )
+ for token in token_list
+ ]
+ for token_list in all_tokens
  ]

+ # Extract the first token of each label
+ first_tokens: list[str] = list()
+ for token_list, label in zip(all_tokens, local_labels):
+ matching_tokens = [
+ tok for tok in token_list if tok and label.startswith(tok)
+ ]
+ if not matching_tokens:
+ log_once(
+ f"No matching token found in token_list for label '{label}', so "
+ "we will not output scores.",
+ level=logging.DEBUG,
+ )
+ return False
+ first_tokens.append(matching_tokens[0])
+
  # Build a mapping from labels to the first token in each label if the first
  # tokens are distinct
  if len(first_tokens) == len(set(first_tokens)):
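The chat-template branch added above exists because, for instruction-tuned models, the first token of a label can differ between tokenising the label on its own and tokenising it as an assistant turn: the template may end in a newline or other special symbols that get merged into the label's first token. A hedged sketch of how one could observe the difference with a Hugging Face tokenizer, mirroring the calls used in the diff (the model id is only a placeholder; any model with a chat template will do):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # placeholder
label = "positive"

# The label tokenised on its own, with a leading space as for base models.
plain_tokens = tokenizer.tokenize(" " + label)

# The label tokenised as an assistant reply, i.e. through the chat template.
ids = tokenizer.apply_chat_template(
    conversation=[
        {"role": "user", "content": ""},
        {"role": "assistant", "content": label},
    ],
    add_generation_prompt=True,
    tokenize=True,
)
chat_tokens = tokenizer.convert_ids_to_tokens(ids)

# The label's first token inside chat_tokens may differ from plain_tokens[0],
# which is why the mapping is now built from the chat-template tokenisation
# whenever a chat template is present.
print(plain_tokens)
print(chat_tokens)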
{euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_hellaswag_fi.py

@@ -1,3 +1,13 @@
+ # /// script
+ # requires-python = ">=3.10,<4.0"
+ # dependencies = [
+ # "datasets==3.5.0",
+ # "huggingface-hub==0.24.0",
+ # "pandas==2.2.0",
+ # "requests==2.32.3",
+ # ]
+ # ///
+
  """Create the Finnish HellaSwag-mini dataset and upload it to the HF Hub."""

  from collections import Counter
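The comment block added at the top of this script (and of the other Finnish dataset scripts below) is inline script metadata as specified by PEP 723: tools that understand it, such as `uv run`, can typically resolve the listed dependencies on the fly when the script is executed. A minimal standalone example of the format (the pandas pin here is illustrative only):

# /// script
# requires-python = ">=3.10,<4.0"
# dependencies = [
#     "pandas==2.2.0",
# ]
# ///

"""Minimal PEP 723 example: the header above declares the script's own dependencies."""

import pandas as pd

print(pd.DataFrame({"label": ["positive", "negative"]}))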
@@ -30,12 +40,16 @@ def main() -> None:
  repo_id = "Finnish-NLP/hellaswag-fi-google-translate"

  dataset = load_dataset(path=repo_id, token=True)
- dfs = {}
- for split in ["train", "validation", "test"]:
- df = dataset[split].to_pandas()
+ assert isinstance(dataset, DatasetDict)

- df["endings"] = df["endings"].apply(process_endings)
+ splits = ["train", "validation", "test"]
+ assert list(dataset.keys()) == splits

+ dfs: dict[str, pd.DataFrame] = dict()
+ for split in splits:
+ df = dataset[split].to_pandas()
+ assert isinstance(df, pd.DataFrame)
+ df.endings = df.endings.apply(process_endings)
  df = process_split(df=df, split=split)
  dfs[split] = df

{euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_scala.py

@@ -2,7 +2,6 @@
  # requires-python = ">=3.10,<4.0"
  # dependencies = [
  # "datasets==3.5.0",
- # "euroeval",
  # "huggingface-hub==0.24.0",
  # "pandas==2.2.0",
  # "requests==2.32.3",
@@ -41,14 +40,9 @@ from pandas.errors import SettingWithCopyWarning
  from requests.exceptions import HTTPError
  from tqdm.auto import tqdm

- from euroeval.utils import block_terminal_output
-

  def main() -> None:
  """Create the ScaLA datasets and upload them to the HF Hub."""
- # Block terminal output
- block_terminal_output()
-
  # Set up the POS dataset loaders
  pos_datasets = {
  "da": load_dadt_pos,
{euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_scandisent_fi.py

@@ -1,3 +1,13 @@
+ # /// script
+ # requires-python = ">=3.10,<4.0"
+ # dependencies = [
+ # "datasets==3.5.0",
+ # "huggingface-hub==0.24.0",
+ # "pandas==2.2.0",
+ # "requests==2.32.3",
+ # ]
+ # ///
+
  """Create the Finnish part of the ScandiSent dataset and upload it to the HF Hub."""

  import pandas as pd
@@ -14,7 +24,7 @@ def main() -> None:

  # Download the dataset
  dataset = load_dataset(path=repo_id, token=True, split="train")
- assert isinstance(dataset, DatasetDict)
+ assert isinstance(dataset, Dataset)

  # Convert the dataset to a dataframe
  df = dataset.to_pandas()
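The corrected assertion reflects how `datasets.load_dataset` behaves: called without a `split` argument it returns a `DatasetDict` keyed by split name, while `split="train"` returns a single `Dataset`. A small sketch of the distinction (the dataset id is just a placeholder):

from datasets import Dataset, DatasetDict, load_dataset

dataset_id = "imdb"  # placeholder public dataset

all_splits = load_dataset(dataset_id)            # every split
assert isinstance(all_splits, DatasetDict)

train = load_dataset(dataset_id, split="train")  # a single split
assert isinstance(train, Dataset)

df = train.to_pandas()  # a single Dataset converts straight to a DataFrame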
{euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_turku_ner_fi.py

@@ -1,3 +1,13 @@
+ # /// script
+ # requires-python = ">=3.10,<4.0"
+ # dependencies = [
+ # "datasets==3.5.0",
+ # "huggingface-hub==0.24.0",
+ # "pandas==2.2.0",
+ # "requests==2.32.3",
+ # ]
+ # ///
+
  """Create the Finnish Turku NER dataset and upload it to the HF Hub."""

  import pandas as pd
{euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_tydiqa_fi.py

@@ -1,3 +1,13 @@
+ # /// script
+ # requires-python = ">=3.10,<4.0"
+ # dependencies = [
+ # "datasets==3.5.0",
+ # "huggingface-hub==0.24.0",
+ # "pandas==2.2.0",
+ # "requests==2.32.3",
+ # ]
+ # ///
+
  """Create the TydiQA-mini Finnish dataset and upload it to the HF Hub."""

  import pandas as pd
{euroeval-15.7.1 → euroeval-15.7.2}/src/scripts/create_xlsum_fi.py

@@ -1,3 +1,13 @@
+ # /// script
+ # requires-python = ">=3.10,<4.0"
+ # dependencies = [
+ # "datasets==3.5.0",
+ # "huggingface-hub==0.24.0",
+ # "pandas==2.2.0",
+ # "requests==2.32.3",
+ # ]
+ # ///
+
  """Create the Finnish version of the XLSum summarisation dataset."""

  import pandas as pd
@@ -11,7 +21,7 @@ def main() -> None:
  """Create the Finnish XL-Sum dataset and upload to HF Hub."""
  dataset_id = "TurkuNLP/xlsum-fi"

- dataset = load_dataset(dataset_id)
+ dataset = load_dataset(dataset_id, trust_remote_code=True, token=True)
  assert isinstance(dataset, DatasetDict)

  dataset = dataset.rename_columns(column_mapping=dict(summary="target_text"))