EuroEval: euroeval-15.12.0-py3-none-any.whl → euroeval-16.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +32 -14
- euroeval/benchmark_config_factory.py +92 -180
- euroeval/benchmark_modules/base.py +49 -39
- euroeval/benchmark_modules/fresh.py +35 -21
- euroeval/benchmark_modules/hf.py +280 -244
- euroeval/benchmark_modules/litellm.py +752 -312
- euroeval/benchmark_modules/vllm.py +570 -268
- euroeval/benchmarker.py +651 -528
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +49 -38
- euroeval/constants.py +44 -25
- euroeval/data_loading.py +111 -55
- euroeval/data_models.py +490 -323
- euroeval/dataset_configs/__init__.py +26 -4
- euroeval/dataset_configs/bosnian.py +39 -0
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/croatian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +78 -50
- euroeval/dataset_configs/dutch.py +74 -44
- euroeval/dataset_configs/english.py +71 -36
- euroeval/dataset_configs/estonian.py +111 -0
- euroeval/dataset_configs/faroese.py +25 -18
- euroeval/dataset_configs/finnish.py +63 -26
- euroeval/dataset_configs/french.py +65 -32
- euroeval/dataset_configs/german.py +77 -36
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +68 -57
- euroeval/dataset_configs/italian.py +68 -36
- euroeval/dataset_configs/latvian.py +87 -0
- euroeval/dataset_configs/lithuanian.py +64 -0
- euroeval/dataset_configs/norwegian.py +98 -72
- euroeval/dataset_configs/polish.py +96 -0
- euroeval/dataset_configs/portuguese.py +63 -40
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/slovene.py +56 -0
- euroeval/dataset_configs/spanish.py +68 -34
- euroeval/dataset_configs/swedish.py +82 -41
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/enums.py +12 -6
- euroeval/exceptions.py +21 -1
- euroeval/finetuning.py +34 -26
- euroeval/generation.py +76 -41
- euroeval/generation_utils.py +169 -34
- euroeval/languages.py +1020 -188
- euroeval/logging_utils.py +268 -0
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +85 -0
- euroeval/metrics/huggingface.py +216 -0
- euroeval/metrics/llm_as_a_judge.py +260 -0
- euroeval/metrics/pipeline.py +289 -0
- euroeval/metrics/speed.py +48 -0
- euroeval/model_cache.py +40 -21
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +157 -22
- euroeval/prompt_templates/multiple_choice.py +159 -17
- euroeval/prompt_templates/named_entity_recognition.py +318 -21
- euroeval/prompt_templates/reading_comprehension.py +207 -16
- euroeval/prompt_templates/sentiment_classification.py +205 -22
- euroeval/prompt_templates/summarization.py +122 -22
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +20 -9
- euroeval/speed_benchmark.py +11 -12
- euroeval/task_group_utils/multiple_choice_classification.py +21 -12
- euroeval/task_group_utils/question_answering.py +101 -73
- euroeval/task_group_utils/sequence_classification.py +144 -61
- euroeval/task_group_utils/text_to_text.py +33 -12
- euroeval/task_group_utils/token_classification.py +86 -89
- euroeval/tasks.py +75 -16
- euroeval/tokenisation_utils.py +603 -0
- euroeval/types.py +17 -11
- euroeval/utils.py +332 -137
- euroeval-16.7.1.dist-info/METADATA +623 -0
- euroeval-16.7.1.dist-info/RECORD +84 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -737
- euroeval/metrics.py +0 -452
- euroeval/tokenization_utils.py +0 -498
- euroeval-15.12.0.dist-info/METADATA +0 -285
- euroeval-15.12.0.dist-info/RECORD +0 -63
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/fresh.py

@@ -1,6 +1,6 @@
 """Freshly initialised encoder models."""
 
-import
+import re
 import typing as t
 from functools import cached_property
 from json import JSONDecodeError
@@ -26,10 +26,12 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
-from ..
+from ..generation_utils import raise_if_wrong_params
+from ..logging_utils import block_terminal_output
+from ..utils import create_model_cache_dir, get_hf_token
 from .hf import (
     HuggingFaceEncoderModel,
-
+    align_model_and_tokeniser,
     setup_model_for_question_answering,
 )
 
@@ -45,12 +47,14 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
     """A freshly initialised encoder model."""
 
     fresh_model = True
+    allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}
 
     def __init__(
         self,
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the model.
 
@@ -61,23 +65,29 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log metadata about the model and the benchmark.
         """
+        raise_if_wrong_params(
+            model_config=model_config, allowed_params=self.allowed_params
+        )
+
         # This is already set when calling `super.__init__`, but we need it to get a
         # value from `self.model_max_length`, so we set it here as well.
         self.model_config = model_config
 
-        model,
+        model, tokeniser = load_model_and_tokeniser(
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
             model_max_length=self.model_max_length,
         )
         self._model: "PreTrainedModel" = model
-        self.
+        self._tokeniser: "PreTrainedTokenizer" = tokeniser
 
-        self._model, self.
+        self._model, self._tokeniser = align_model_and_tokeniser(
             model=self._model,
-
+            tokeniser=self._tokeniser,
             model_max_length=self.model_max_length,
             raise_errors=benchmark_config.raise_errors,
         )
@@ -88,6 +98,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
         )
 
     @cached_property
@@ -180,9 +191,10 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
         """
         return ModelConfig(
             model_id=model_id,
+            revision="main",
+            param=None,
             task="fill-mask",
             languages=list(),
-            revision="main",
             merge=False,
             inference_backend=InferenceBackend.TRANSFORMERS,
             model_type=ModelType.ENCODER,
@@ -194,13 +206,13 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
         )
 
 
-def
+def load_model_and_tokeniser(
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
     model_max_length: int,
 ) -> "tuple[PreTrainedModel, PreTrainedTokenizer]":
-    """Load the model and
+    """Load the model and tokeniser.
 
     Args:
         model_config:
@@ -213,7 +225,7 @@ def load_model_and_tokenizer(
             The maximum context length of the model.
 
     Returns:
-        The loaded model and
+        The loaded model and tokeniser.
     """
     config: "PretrainedConfig"
     block_terminal_output()
@@ -262,7 +274,7 @@ def load_model_and_tokenizer(
 
     config = AutoConfig.from_pretrained(
         real_model_id,
-        token=benchmark_config.api_key
+        token=get_hf_token(api_key=benchmark_config.api_key),
        num_labels=len(id2label),
         id2label=id2label,
         label2id={label: id_ for id_, label in id2label.items()},
@@ -274,29 +286,31 @@ def load_model_and_tokenizer(
     if dataset_config.task.task_group == TaskGroup.QUESTION_ANSWERING:
         model = setup_model_for_question_answering(model=model)
 
-    # Load the
+    # Load the tokeniser. If the model is a subclass of a RoBERTa model then we
     # have to add a prefix space to the tokens, by the way the model is constructed
     prefix_models = ["Roberta", "GPT", "Deberta"]
     prefix = any(model_type in type(model).__name__ for model_type in prefix_models)
     try:
-
+        tokeniser: "PreTrainedTokenizer" = AutoTokenizer.from_pretrained(
             real_model_id,
             revision=model_config.revision,
-            token=benchmark_config.api_key
+            token=get_hf_token(api_key=benchmark_config.api_key),
             add_prefix_space=prefix,
             cache_dir=model_config.model_cache_dir,
-            use_fast=True,
+            use_fast=False if model_config.param == "slow-tokenizer" else True,
             verbose=False,
             trust_remote_code=benchmark_config.trust_remote_code,
         )
-    except (JSONDecodeError, OSError):
-        raise InvalidModel(
+    except (JSONDecodeError, OSError) as e:
+        raise InvalidModel(
+            f"Could not load tokeniser for model {real_model_id!r}."
+        ) from e
 
-    model,
+    model, tokeniser = align_model_and_tokeniser(
         model=model,
-
+        tokeniser=tokeniser,
         model_max_length=model_max_length,
         raise_errors=benchmark_config.raise_errors,
     )
 
-    return model,
+    return model, tokeniser
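
Context on the new `slow-tokenizer` model parameter seen above: `FreshEncoderModel` now declares `allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}`, and the tokeniser loader flips `use_fast` when that parameter is set. As a rough standalone sketch of what the flag changes in Hugging Face `transformers` (the model ID and the `param` variable are illustrative placeholders, not EuroEval API):

```python
# Minimal sketch of the `use_fast` toggle that the "slow-tokenizer" parameter
# appears to control; `param` and the checkpoint name are placeholders.
from transformers import AutoTokenizer

model_id = "bert-base-cased"  # placeholder checkpoint
param = "slow-tokenizer"      # stand-in for `model_config.param`

tokeniser = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=False if param == "slow-tokenizer" else True,
)
# With use_fast=False this yields the pure-Python BertTokenizer rather than
# the Rust-backed BertTokenizerFast.
print(type(tokeniser).__name__)
```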