PyPI - EuroEval - Versions diffs - 16.0.1__py3-none-any.whl → 16.1.1__py3-none-any.whl - Mend

EuroEval 16.0.1__py3-none-any.whl → 16.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of EuroEval might be problematic. Click here for more details.

Files changed (48)

euroeval/benchmark_config_factory.py +6 -1
euroeval/benchmark_modules/base.py +2 -0
euroeval/benchmark_modules/fresh.py +7 -1
euroeval/benchmark_modules/hf.py +26 -21
euroeval/benchmark_modules/litellm.py +258 -131
euroeval/benchmark_modules/vllm.py +79 -40
euroeval/benchmarker.py +11 -2
euroeval/cli.py +14 -1
euroeval/constants.py +1 -1
euroeval/data_models.py +77 -6
euroeval/dataset_configs/__init__.py +1 -0
euroeval/dataset_configs/danish.py +14 -0
euroeval/dataset_configs/dutch.py +14 -0
euroeval/dataset_configs/english.py +22 -0
euroeval/dataset_configs/estonian.py +15 -7
euroeval/dataset_configs/finnish.py +14 -0
euroeval/dataset_configs/french.py +14 -0
euroeval/dataset_configs/german.py +23 -0
euroeval/dataset_configs/italian.py +14 -0
euroeval/dataset_configs/latvian.py +14 -0
euroeval/dataset_configs/norwegian.py +14 -0
euroeval/dataset_configs/polish.py +126 -0
euroeval/dataset_configs/portuguese.py +14 -0
euroeval/dataset_configs/spanish.py +14 -0
euroeval/dataset_configs/swedish.py +25 -0
euroeval/enums.py +12 -0
euroeval/generation.py +17 -8
euroeval/generation_utils.py +65 -11
euroeval/metrics/pipeline.py +1 -1
euroeval/prompt_templates/linguistic_acceptability.py +9 -0
euroeval/prompt_templates/multiple_choice.py +27 -1
euroeval/prompt_templates/named_entity_recognition.py +20 -0
euroeval/prompt_templates/reading_comprehension.py +11 -0
euroeval/prompt_templates/sentiment_classification.py +15 -0
euroeval/prompt_templates/summarization.py +27 -1
euroeval/scores.py +5 -0
euroeval/task_group_utils/question_answering.py +29 -29
euroeval/task_group_utils/sequence_classification.py +11 -34
euroeval/task_group_utils/token_classification.py +3 -3
euroeval/tasks.py +4 -4
euroeval/{tokenization_utils.py → tokenisation_utils.py} +50 -28
euroeval/utils.py +36 -3
{euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/METADATA +1 -1
euroeval-16.1.1.dist-info/RECORD +70 -0
euroeval-16.0.1.dist-info/RECORD +0 -69
{euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/WHEEL +0 -0
{euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/entry_points.txt +0 -0
{euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/licenses/LICENSE +0 -0

euroeval/benchmark_config_factory.py CHANGED Viewed

@@ -8,7 +8,7 @@ import torch
 from .data_models import BenchmarkConfig
 from .dataset_configs import get_all_dataset_configs
-from .enums import Device
+from .enums import Device, GenerativeType
 from .exceptions import InvalidBenchmark
 from .languages import get_all_languages
 from .tasks import SPEED, get_all_tasks
@@ -43,6 +43,7 @@ def build_benchmark_config(
     api_base: str | None,
     api_version: str | None,
     gpu_memory_utilization: float,
+    generative_type: GenerativeType | None,
     debug: bool,
     run_with_cli: bool,
     requires_safetensors: bool,
@@ -107,6 +108,9 @@ def build_benchmark_config(
             faster evaluation, but at the risk of running out of GPU memory. Only reduce
             this if you are running out of GPU memory. Only relevant if the model is
             generative.
+        generative_type:
+            The type of generative model. Only relevant if the model is generative. If
+            not specified, the type will be inferred automatically.
         debug:
             Whether to run the benchmark in debug mode.
         run_with_cli:
@@ -157,6 +161,7 @@ def build_benchmark_config(
         api_base=api_base,
         api_version=api_version,
         gpu_memory_utilization=gpu_memory_utilization,
+        generative_type=generative_type,
         debug=debug,
         run_with_cli=run_with_cli,
         requires_safetensors=requires_safetensors,

euroeval/benchmark_modules/base.py CHANGED Viewed

@@ -2,6 +2,7 @@
 import collections.abc as c
 import logging
+import re
 import sys
 import typing as t
 from abc import ABC, abstractmethod
@@ -55,6 +56,7 @@ class BenchmarkModule(ABC):
     fresh_model: bool
     batching_preference: "BatchingPreference"
     high_priority: bool
+    allowed_params: dict[re.Pattern, list[str]] = {re.compile(r".*"): []}
     def __init__(
         self,

euroeval/benchmark_modules/fresh.py CHANGED Viewed

@@ -25,6 +25,7 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
+from ..generation_utils import raise_if_wrong_params
 from ..utils import block_terminal_output, create_model_cache_dir, get_hf_token
 from .hf import (
     HuggingFaceEncoderModel,
@@ -64,6 +65,10 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
             log_metadata:
                 Whether to log metadata about the model and the benchmark.
         """
+        raise_if_wrong_params(
+            model_config=model_config, allowed_params=self.allowed_params
+        )
         # This is already set when calling `super.__init__`, but we need it to get a
         # value from `self.model_max_length`, so we set it here as well.
         self.model_config = model_config
@@ -183,9 +188,10 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
         """
         return ModelConfig(
             model_id=model_id,
+            revision="main",
+            param=None,
             task="fill-mask",
             languages=list(),
-            revision="main",
             merge=False,
             inference_backend=InferenceBackend.TRANSFORMERS,
             model_type=ModelType.ENCODER,

euroeval/benchmark_modules/hf.py CHANGED Viewed

@@ -14,6 +14,7 @@ from huggingface_hub import HfApi
 from huggingface_hub import whoami as hf_whoami
 from huggingface_hub.errors import (
     GatedRepoError,
+    HfHubHTTPError,
     HFValidationError,
     LocalTokenNotFoundError,
     RepositoryNotFoundError,
@@ -56,13 +57,14 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
+from ..generation_utils import raise_if_wrong_params
 from ..languages import get_all_languages
 from ..task_group_utils import (
     multiple_choice_classification,
     question_answering,
     token_classification,
 )
-from ..tokenization_utils import get_bos_token, get_eos_token
+from ..tokenisation_utils import get_bos_token, get_eos_token
 from ..utils import (
     block_terminal_output,
     create_model_cache_dir,
@@ -70,6 +72,7 @@ from ..utils import (
     get_hf_token,
     internet_connection_available,
     log_once,
+    split_model_id,
 )
 from .base import BenchmarkModule
@@ -110,6 +113,10 @@ class HuggingFaceEncoderModel(BenchmarkModule):
             log_metadata:
                 Whether to log the model metadata.
         """
+        raise_if_wrong_params(
+            model_config=model_config, allowed_params=self.allowed_params
+        )
         model, tokeniser = load_model_and_tokeniser(
             model_config=model_config,
             dataset_config=dataset_config,
@@ -247,15 +254,6 @@ class HuggingFaceEncoderModel(BenchmarkModule):
             max_length for max_length in all_max_lengths if max_length >= 128
         ]
-        # We remove the upper cap of maximum context length for the model, as it is
-        # highly unlikely that this is the model's actual maximum context length - we
-        # would rather not report a value than report an incorrect one.
-        all_max_lengths = [
-            max_length
-            for max_length in all_max_lengths
-            if max_length != MAX_CONTEXT_LENGTH
-        ]
         if len(list(all_max_lengths)) > 0:
             model_max_length = min(list(all_max_lengths))
         else:
@@ -483,11 +481,11 @@ class HuggingFaceEncoderModel(BenchmarkModule):
             Whether the model exists, or an error describing why we cannot check
             whether the model exists.
         """
-        model_id, revision = (
-            model_id.split("@") if "@" in model_id else (model_id, "main")
-        )
+        model_id_components = split_model_id(model_id=model_id)
         model_info = get_model_repo_info(
-            model_id=model_id, revision=revision, benchmark_config=benchmark_config
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            benchmark_config=benchmark_config,
         )
         return (
             model_info is not None
@@ -509,11 +507,11 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         Returns:
             The model configuration.
         """
-        model_id, revision = (
-            model_id.split("@") if "@" in model_id else (model_id, "main")
-        )
+        model_id_components = split_model_id(model_id=model_id)
         model_info = get_model_repo_info(
-            model_id=model_id, revision=revision, benchmark_config=benchmark_config
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            benchmark_config=benchmark_config,
         )
         if model_info is None:
             raise InvalidModel(f"The model {model_id!r} could not be found.")
@@ -522,8 +520,9 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         language_codes = list(language_mapping.keys())
         model_config = ModelConfig(
-            model_id=model_id,
-            revision=revision,
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            param=model_id_components.param,
             task=model_info.pipeline_tag,
             languages=[
                 language_mapping[tag]
@@ -710,7 +709,6 @@ def get_model_repo_info(
     """
     token = get_hf_token(api_key=benchmark_config.api_key)
     hf_api = HfApi(token=token)
-    model_id, revision = model_id.split("@") if "@" in model_id else (model_id, "main")
     # Get information on the model.
     # The first case is when the model is a local model, in which case we create a dummy
@@ -753,6 +751,13 @@ def get_model_repo_info(
                     return None
             except (RepositoryNotFoundError, HFValidationError):
                 return None
+            except HfHubHTTPError as e:
+                if "unauthorized" in str(e).lower():
+                    raise InvalidModel(
+                        "It seems like your specified Hugging Face API key is invalid. "
+                        "Please double-check your API key."
+                    ) from e
+                raise InvalidModel(str(e)) from e
             except (OSError, RequestException) as e:
                 if internet_connection_available():
                     errors.append(e)

EuroEval 16.0.1__py3-none-any.whl → 16.1.1__py3-none-any.whl

Potentially problematic release.

EuroEval 16.0.1__py3-none-any.whl → 16.1.1__py3-none-any.whl