EuroEval 15.15.0-py3-none-any.whl → 16.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (63)
  1. euroeval/__init__.py +3 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +323 -193
  7. euroeval/benchmark_modules/vllm.py +166 -112
  8. euroeval/benchmarker.py +59 -33
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +13 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +53 -7
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +38 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +8 -7
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +46 -14
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +234 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  41. euroeval/prompt_templates/multiple_choice.py +23 -2
  42. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  43. euroeval/prompt_templates/reading_comprehension.py +42 -2
  44. euroeval/prompt_templates/sentiment_classification.py +46 -2
  45. euroeval/prompt_templates/summarization.py +24 -4
  46. euroeval/scores.py +7 -2
  47. euroeval/speed_benchmark.py +6 -6
  48. euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  49. euroeval/task_group_utils/question_answering.py +35 -28
  50. euroeval/task_group_utils/sequence_classification.py +96 -23
  51. euroeval/task_group_utils/text_to_text.py +7 -3
  52. euroeval/task_group_utils/token_classification.py +47 -75
  53. euroeval/tasks.py +31 -6
  54. euroeval/tokenization_utils.py +295 -207
  55. euroeval/utils.py +118 -34
  56. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +12 -14
  57. euroeval-16.0.0.dist-info/RECORD +69 -0
  58. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
  59. euroeval/human_evaluation.py +0 -738
  60. euroeval/metrics.py +0 -468
  61. euroeval-15.15.0.dist-info/RECORD +0 -63
  62. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
  63. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py CHANGED
@@ -77,10 +77,6 @@ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 os.environ["OMP_NUM_THREADS"] = "1"
 
 
-# Disable a warning from Ray regarding the detection of the number of CPUs
-os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"
-
-
 # Avoid the "Cannot re-initialize CUDA in forked subprocess" error - see
 # https://github.com/vllm-project/vllm/issues/6152 for more
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@@ -100,9 +96,9 @@ os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
 os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
 
 
-# Use older version v0 of vLLM, as the newer one requires XGrammar as decoding backend,
-# but XGrammar does not support having a maximal amount of elements in lists
-os.environ["VLLM_USE_V1"] = "0"
+# Enable the newer vLLM V1 engine, which is faster and offers more compatibility with
+# newer models
+os.environ["VLLM_USE_V1"] = "1"
 
 
 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
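These defaults are ordinary environment variables applied at import time, so a calling script can inspect them, or override them before any vLLM engine is created. A minimal sketch, assuming EuroEval 16.0.0 is installed:

```python
import os

import euroeval  # noqa: F401  # applies the vLLM/HF environment defaults at import time

# The defaults are plain environment variables, so the calling process can
# inspect (or override) them before any model is loaded.
print(os.environ.get("VLLM_USE_V1"))  # "1" in 16.0.0 (was "0" in 15.x)
print(os.environ.get("VLLM_WORKER_MULTIPROC_METHOD"))  # "spawn"
```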
euroeval/benchmark_config_factory.py CHANGED
@@ -45,8 +45,7 @@ def build_benchmark_config(
     gpu_memory_utilization: float,
     debug: bool,
     run_with_cli: bool,
-    only_allow_safetensors: bool,
-    first_time: bool = False,
+    requires_safetensors: bool,
 ) -> BenchmarkConfig:
     """Create a benchmark configuration.
 
@@ -112,11 +111,8 @@ def build_benchmark_config(
            Whether to run the benchmark in debug mode.
        run_with_cli:
            Whether the benchmark is being run with the CLI.
-       only_allow_safetensors:
+       requires_safetensors:
            Whether to only allow evaluations of models stored as safetensors.
-       first_time:
-           Whether this is the first time the benchmark configuration is being created.
-           Defaults to False.
 
    Returns:
        The benchmark configuration.
@@ -163,7 +159,7 @@ def build_benchmark_config(
        gpu_memory_utilization=gpu_memory_utilization,
        debug=debug,
        run_with_cli=run_with_cli,
-       only_allow_safetensors=only_allow_safetensors,
+       requires_safetensors=requires_safetensors,
    )
 
 
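For downstream code that still passes the old keyword arguments, the change amounts to a rename plus one removed parameter. A hypothetical migration shim, not part of EuroEval, could look like this:

```python
import typing as t


def migrate_benchmark_kwargs(kwargs: dict[str, t.Any]) -> dict[str, t.Any]:
    """Hypothetical helper for callers upgrading from EuroEval 15.x to 16.0.0."""
    # `only_allow_safetensors` was renamed to `requires_safetensors`.
    if "only_allow_safetensors" in kwargs:
        kwargs["requires_safetensors"] = kwargs.pop("only_allow_safetensors")
    # `first_time` was removed from the signature altogether.
    kwargs.pop("first_time", None)
    return kwargs
```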
euroeval/benchmark_modules/base.py CHANGED
@@ -7,12 +7,12 @@ import typing as t
 from abc import ABC, abstractmethod
 from functools import cached_property, partial
 
-from datasets import DatasetDict
+from datasets import Dataset, DatasetDict
 from torch import nn
 from tqdm.auto import tqdm
 
 from ..enums import TaskGroup
-from ..exceptions import NeedsEnvironmentVariable, NeedsExtraInstalled
+from ..exceptions import InvalidBenchmark, NeedsEnvironmentVariable, NeedsExtraInstalled
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -61,6 +61,7 @@ class BenchmarkModule(ABC):
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the benchmark module.
 
@@ -71,12 +72,16 @@ class BenchmarkModule(ABC):
                The dataset configuration.
            benchmark_config:
                The benchmark configuration.
+           log_metadata:
+               Whether to log the metadata of the model.
        """
        self.model_config = model_config
        self.dataset_config = dataset_config
        self.benchmark_config = benchmark_config
+       self.log_metadata = log_metadata
        self.buffer: dict[str, t.Any] = dict()
-       self._log_metadata()
+       if self.log_metadata:
+           self._log_metadata()
 
    def _log_metadata(self) -> None:
        """Log the metadata of the model."""
@@ -117,16 +122,16 @@ class BenchmarkModule(ABC):
            f"{self.__class__.__name__}."
        )
 
-   def get_tokenizer(self) -> "PreTrainedTokenizer":
-       """Get the underlying tokenizer.
+   def get_tokeniser(self) -> "PreTrainedTokenizer":
+       """Get the underlying tokeniser.
 
        Returns:
-           The tokenizer.
+           The tokeniser.
        """
-       if hasattr(self, "_tokenizer"):
-           return self._tokenizer
+       if hasattr(self, "_tokeniser"):
+           return self._tokeniser
        raise NotImplementedError(
-           "The `get_tokenizer` method has not been implemented for "
+           "The `get_tokeniser` method has not been implemented for "
            f"{self.__class__.__name__}."
        )
 
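The base-class fallback means a concrete module does not have to override `get_tokeniser` at all; it only needs to store its tokeniser under the renamed `_tokeniser` attribute. A hypothetical subclass sketch, assuming the remaining abstract methods are implemented elsewhere and that `ModelConfig` exposes a `model_id` field:

```python
from transformers import AutoTokenizer, PreTrainedTokenizer

from euroeval.benchmark_modules.base import BenchmarkModule


class MyEncoderModule(BenchmarkModule):
    """Hypothetical module relying on the inherited `get_tokeniser` fallback."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Storing the tokeniser under `_tokeniser` is enough for the inherited
        # `get_tokeniser` to return it instead of raising NotImplementedError.
        self._tokeniser: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
            self.model_config.model_id
        )
```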
@@ -192,11 +197,13 @@ class BenchmarkModule(ABC):
                return partial(
                    sequence_classification.compute_metrics,
                    dataset_config=self.dataset_config,
+                   benchmark_config=self.benchmark_config,
                )
            case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
                return partial(
                    sequence_classification.compute_metrics,
                    dataset_config=self.dataset_config,
+                   benchmark_config=self.benchmark_config,
                )
            case TaskGroup.TEXT_TO_TEXT:
                return partial(
@@ -209,11 +216,13 @@ class BenchmarkModule(ABC):
                    token_classification.compute_metrics,
                    has_misc_tags=self.buffer.get("has_misc_tags", True),
                    dataset_config=self.dataset_config,
+                   benchmark_config=self.benchmark_config,
                )
            case TaskGroup.QUESTION_ANSWERING:
                return partial(
                    question_answering.compute_metrics,
                    dataset_config=self.dataset_config,
+                   benchmark_config=self.benchmark_config,
                )
            case _:
                raise NotImplementedError(
@@ -255,6 +264,11 @@ class BenchmarkModule(ABC):
 
        Returns:
            The prepared datasets.
+
+       Raises:
+           InvalidBenchmark:
+               If the dataset does not have a 'train' split for token classification
+               tasks.
        """
        for idx, dataset in enumerate(
            tqdm(iterable=datasets, desc="Preparing datasets")
@@ -263,22 +277,24 @@ class BenchmarkModule(ABC):
                dataset=dataset, task=task, itr_idx=idx
            )
            if self.dataset_config.task.task_group == TaskGroup.TOKEN_CLASSIFICATION:
+               if "train" not in dataset:
+                   raise InvalidBenchmark(
+                       "The dataset does not have a 'train' split, which is required "
+                       "for token classification tasks."
+                   )
                labels_in_train: set[str] = {
                    tag for tag_list in dataset["train"]["labels"] for tag in tag_list
                }
                self.buffer["has_misc_tags"] = (
                    "B-MISC" in labels_in_train or "I-MISC" in labels_in_train
                )
-           datasets[idx] = DatasetDict(
-               dict(
-                   train=prepared_dataset["train"],
-                   val=prepared_dataset["val"],
-                   test=prepared_dataset["test"],
-                   original_train=dataset["train"],
-                   original_val=dataset["val"],
-                   original_test=dataset["test"],
-               )
-           )
+
+           datasets_dict: dict[str, Dataset] = dict()
+           for split_name, split in prepared_dataset.items():
+               datasets_dict[split_name] = split
+           for split_name, split in dataset.items():
+               datasets_dict[f"original_{split_name}"] = split
+           datasets[idx] = DatasetDict(datasets_dict)
        return datasets
 
    @abstractmethod
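The net effect of the `_prepare_datasets` refactor is that split names are no longer hard-coded to train/val/test: whatever splits the dataset actually ships with are kept, and their unprocessed counterparts are stored under an `original_` prefix. A standalone illustration of the resulting structure (not EuroEval code):

```python
from datasets import Dataset, DatasetDict

prepared = DatasetDict(
    {
        "val": Dataset.from_dict({"text": ["processed a"]}),
        "test": Dataset.from_dict({"text": ["processed b"]}),
    }
)
original = DatasetDict(
    {
        "val": Dataset.from_dict({"text": ["raw a"]}),
        "test": Dataset.from_dict({"text": ["raw b"]}),
    }
)

# Mirror the new logic: copy every prepared split, then every original split
# under an `original_` prefix, without assuming a fixed set of split names.
combined: dict[str, Dataset] = {name: split for name, split in prepared.items()}
combined |= {f"original_{name}": split for name, split in original.items()}
result = DatasetDict(combined)

print(sorted(result))  # ['original_test', 'original_val', 'test', 'val']
```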
euroeval/benchmark_modules/fresh.py CHANGED
@@ -1,6 +1,5 @@
 """Freshly initialised encoder models."""
 
-import os
 import typing as t
 from functools import cached_property
 from json import JSONDecodeError
@@ -26,10 +25,10 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
-from ..utils import block_terminal_output, create_model_cache_dir
+from ..utils import block_terminal_output, create_model_cache_dir, get_hf_token
 from .hf import (
     HuggingFaceEncoderModel,
-    align_model_and_tokenizer,
+    align_model_and_tokeniser,
     setup_model_for_question_answering,
 )
 
@@ -51,6 +50,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the model.
 
@@ -61,23 +61,25 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
                The dataset configuration.
            benchmark_config:
                The benchmark configuration.
+           log_metadata:
+               Whether to log metadata about the model and the benchmark.
        """
        # This is already set when calling `super.__init__`, but we need it to get a
        # value from `self.model_max_length`, so we set it here as well.
        self.model_config = model_config
 
-       model, tokenizer = load_model_and_tokenizer(
+       model, tokeniser = load_model_and_tokeniser(
           model_config=model_config,
           dataset_config=dataset_config,
           benchmark_config=benchmark_config,
           model_max_length=self.model_max_length,
       )
       self._model: "PreTrainedModel" = model
-      self._tokenizer: "PreTrainedTokenizer" = tokenizer
+      self._tokeniser: "PreTrainedTokenizer" = tokeniser
 
-      self._model, self._tokenizer = align_model_and_tokenizer(
+      self._model, self._tokeniser = align_model_and_tokeniser(
          model=self._model,
-         tokenizer=self._tokenizer,
+         tokeniser=self._tokeniser,
          model_max_length=self.model_max_length,
          raise_errors=benchmark_config.raise_errors,
      )
@@ -88,6 +90,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
            model_config=model_config,
            dataset_config=dataset_config,
            benchmark_config=benchmark_config,
+           log_metadata=log_metadata,
        )
 
    @cached_property
@@ -194,13 +197,13 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
        )
 
 
-def load_model_and_tokenizer(
+def load_model_and_tokeniser(
    model_config: "ModelConfig",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
    model_max_length: int,
 ) -> "tuple[PreTrainedModel, PreTrainedTokenizer]":
-   """Load the model and tokenizer.
+   """Load the model and tokeniser.
 
    Args:
        model_config:
@@ -213,7 +216,7 @@ def load_model_and_tokenizer(
            The maximum context length of the model.
 
    Returns:
-       The loaded model and tokenizer.
+       The loaded model and tokeniser.
    """
    config: "PretrainedConfig"
    block_terminal_output()
@@ -262,7 +265,7 @@ def load_model_and_tokenizer(
 
    config = AutoConfig.from_pretrained(
        real_model_id,
-       token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+       token=get_hf_token(api_key=benchmark_config.api_key),
        num_labels=len(id2label),
        id2label=id2label,
        label2id={label: id_ for id_, label in id2label.items()},
@@ -274,29 +277,31 @@ def load_model_and_tokenizer(
    if dataset_config.task.task_group == TaskGroup.QUESTION_ANSWERING:
        model = setup_model_for_question_answering(model=model)
 
-   # Load the tokenizer. If the model is a subclass of a RoBERTa model then we
+   # Load the tokeniser. If the model is a subclass of a RoBERTa model then we
    # have to add a prefix space to the tokens, by the way the model is constructed
    prefix_models = ["Roberta", "GPT", "Deberta"]
    prefix = any(model_type in type(model).__name__ for model_type in prefix_models)
    try:
-       tokenizer: "PreTrainedTokenizer" = AutoTokenizer.from_pretrained(
+       tokeniser: "PreTrainedTokenizer" = AutoTokenizer.from_pretrained(
           real_model_id,
           revision=model_config.revision,
-          token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+          token=get_hf_token(api_key=benchmark_config.api_key),
          add_prefix_space=prefix,
          cache_dir=model_config.model_cache_dir,
          use_fast=True,
          verbose=False,
          trust_remote_code=benchmark_config.trust_remote_code,
      )
-   except (JSONDecodeError, OSError):
-       raise InvalidModel(f"Could not load tokenizer for model {real_model_id!r}.")
+   except (JSONDecodeError, OSError) as e:
+       raise InvalidModel(
+           f"Could not load tokeniser for model {real_model_id!r}."
+       ) from e
 
-   model, tokenizer = align_model_and_tokenizer(
+   model, tokeniser = align_model_and_tokeniser(
       model=model,
-      tokenizer=tokenizer,
+      tokeniser=tokeniser,
      model_max_length=model_max_length,
      raise_errors=benchmark_config.raise_errors,
  )
 
-   return model, tokenizer
+   return model, tokeniser
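The new `get_hf_token` helper centralises the token-resolution expression it replaces. Its actual implementation lives in `euroeval/utils.py` and is not shown in this excerpt; a plausible reconstruction, inferred purely from the removed inline expression, would be:

```python
import os


def get_hf_token(api_key: str | None = None) -> str | bool:
    """Plausible sketch of the helper (the real implementation is in euroeval/utils.py).

    Prefer an explicitly supplied API key, then the HUGGINGFACE_API_KEY environment
    variable, and otherwise return True so that huggingface_hub falls back to any
    locally cached credentials.
    """
    return api_key or os.getenv("HUGGINGFACE_API_KEY") or True
```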