EuroEval 15.15.0__py3-none-any.whl → 16.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of EuroEval might be problematic.

Files changed (63)
  1. euroeval/__init__.py +3 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +323 -193
  7. euroeval/benchmark_modules/vllm.py +166 -112
  8. euroeval/benchmarker.py +59 -33
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +13 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +53 -7
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +38 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +8 -7
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +46 -14
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +234 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  41. euroeval/prompt_templates/multiple_choice.py +23 -2
  42. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  43. euroeval/prompt_templates/reading_comprehension.py +42 -2
  44. euroeval/prompt_templates/sentiment_classification.py +46 -2
  45. euroeval/prompt_templates/summarization.py +24 -4
  46. euroeval/scores.py +7 -2
  47. euroeval/speed_benchmark.py +6 -6
  48. euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  49. euroeval/task_group_utils/question_answering.py +35 -28
  50. euroeval/task_group_utils/sequence_classification.py +96 -23
  51. euroeval/task_group_utils/text_to_text.py +7 -3
  52. euroeval/task_group_utils/token_classification.py +47 -75
  53. euroeval/tasks.py +31 -6
  54. euroeval/tokenization_utils.py +295 -207
  55. euroeval/utils.py +118 -34
  56. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +12 -14
  57. euroeval-16.0.0.dist-info/RECORD +69 -0
  58. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
  59. euroeval/human_evaluation.py +0 -738
  60. euroeval/metrics.py +0 -468
  61. euroeval-15.15.0.dist-info/RECORD +0 -63
  62. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
  63. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/dataset_configs/spanish.py CHANGED
@@ -2,7 +2,7 @@
 
 from ..data_models import DatasetConfig
 from ..languages import ES
-from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
 ### Official datasets ###
 
@@ -66,6 +66,17 @@ HELLASWAG_ES_CONFIG = DatasetConfig(
     languages=[ES],
 )
 
+EUROPEAN_VALUES_ES_CONFIG = DatasetConfig(
+    name="european-values-es",
+    pretty_name="the Spanish version of the European values evaluation dataset",
+    huggingface_id="EuroEval/european-values-es",
+    task=EUROPEAN_VALUES,
+    languages=[ES],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+)
+
 
 ### Unofficial datasets ###
 
@@ -107,3 +118,29 @@ GOLDENSWAG_ES_CONFIG = DatasetConfig(
     languages=[ES],
     unofficial=True,
 )
+
+EUROPEAN_VALUES_SITUATIONAL_ES_CONFIG = DatasetConfig(
+    name="european-values-situational-es",
+    pretty_name="the Spanish version of the European values evaluation dataset, where "
+    "the questions are phrased in a situational way",
+    huggingface_id="EuroEval/european-values-situational-es",
+    task=EUROPEAN_VALUES,
+    languages=[ES],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_COMPLETIONS_ES_CONFIG = DatasetConfig(
+    name="european-values-completions-es",
+    pretty_name="the Spanish version of the European values evaluation dataset, where "
+    "the questions are phrased as sentence completions",
+    huggingface_id="EuroEval/european-values-completions-es",
+    task=EUROPEAN_VALUES,
+    languages=[ES],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+    unofficial=True,
+)
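
For orientation, the three new Spanish European-values datasets registered above should be runnable like any other EuroEval dataset. A minimal, hypothetical usage sketch, assuming the Benchmarker Python API from previous releases is unchanged; the model ID is a placeholder:

# Hypothetical sketch, not part of the diff: evaluating a model on the new
# Spanish European-values dataset. Assumes the Benchmarker API from earlier
# EuroEval releases; the model ID below is a placeholder.
from euroeval import Benchmarker

benchmarker = Benchmarker()
benchmarker.benchmark(
    model="example-org/example-model",  # placeholder model ID
    dataset="european-values-es",       # dataset name registered in the hunk above
)
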
euroeval/dataset_configs/swedish.py CHANGED
@@ -2,7 +2,7 @@
 
 from ..data_models import DatasetConfig
 from ..languages import SV
-from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
 ### Official datasets ###
 
@@ -67,6 +67,17 @@ HELLASWAG_SV_CONFIG = DatasetConfig(
     languages=[SV],
 )
 
+EUROPEAN_VALUES_SV_CONFIG = DatasetConfig(
+    name="european-values-sv",
+    pretty_name="the Swedish version of the European values evaluation dataset",
+    huggingface_id="EuroEval/european-values-sv",
+    task=EUROPEAN_VALUES,
+    languages=[SV],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+)
+
 
 ### Unofficial datasets ###
 
@@ -118,3 +129,29 @@ GOLDENSWAG_SV_CONFIG = DatasetConfig(
     languages=[SV],
     unofficial=True,
 )
+
+EUROPEAN_VALUES_SITUATIONAL_SV_CONFIG = DatasetConfig(
+    name="european-values-situational-sv",
+    pretty_name="the Swedish version of the European values evaluation dataset, where "
+    "the questions are phrased in a situational way",
+    huggingface_id="EuroEval/european-values-situational-sv",
+    task=EUROPEAN_VALUES,
+    languages=[SV],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_COMPLETIONS_SV_CONFIG = DatasetConfig(
+    name="european-values-completions-sv",
+    pretty_name="the Swedish version of the European values evaluation dataset, where "
+    "the questions are phrased as sentence completions",
+    huggingface_id="EuroEval/european-values-completions-sv",
+    task=EUROPEAN_VALUES,
+    languages=[SV],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+    unofficial=True,
+)
euroeval/enums.py CHANGED
@@ -40,14 +40,11 @@ class InferenceBackend(AutoStrEnum):
             VLLM library.
         LITELLM:
             LiteLLM library.
-        NONE:
-            No inference backend used (e.g., for human evaluation).
     """
 
     TRANSFORMERS = auto()
     VLLM = auto()
     LITELLM = auto()
-    NONE = auto()
 
 
 class ModelType(AutoStrEnum):
@@ -58,13 +55,10 @@ class ModelType(AutoStrEnum):
             An encoder (i.e., BERT-style) model.
         GENERATIVE:
             A generative model. Can be either decoder or encoder-decoder (aka seq2seq).
-        HUMAN:
-            Human evaluator.
     """
 
     ENCODER = auto()
     GENERATIVE = auto()
-    HUMAN = auto()
 
 
 class GenerativeType(AutoStrEnum):
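
The hunk headers above reference the AutoStrEnum base class, whose definition is not part of this diff. A minimal self-contained sketch of the pattern it presumably implements (a string enum whose auto() values are the lowercased member names), shown only to clarify what dropping the NONE and HUMAN members means for string lookups:

# Minimal sketch of an auto-valued string enum; the real AutoStrEnum in
# euroeval/enums.py is not shown in this diff, so this base class is assumed.
from enum import Enum, auto

class AutoStrEnum(str, Enum):
    """String enum whose members' values are their lowercased names."""

    @staticmethod
    def _generate_next_value_(name, start, count, last_values):
        return name.lower()

class InferenceBackend(AutoStrEnum):
    TRANSFORMERS = auto()
    VLLM = auto()
    LITELLM = auto()

assert InferenceBackend.VLLM.value == "vllm"
# After this release, InferenceBackend("none") raises ValueError, since the
# NONE member (previously used for human evaluation) no longer exists.
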
euroeval/finetuning.py CHANGED
@@ -3,6 +3,7 @@
 import logging
 import sys
 import typing as t
+from functools import partial
 
 import torch
 from tqdm.auto import tqdm
@@ -118,7 +119,7 @@
         # NaN values can appear in the model output when using mixed precision, as
         # the hidden states get overflowed. In this case we try to disable mixed
         # precision and try again.
-        except NaNValueInModelOutput:
+        except NaNValueInModelOutput as e:
             if dtype != DataType.FP32:
                 dtype = DataType.FP32
                 model_already_initialized = False
@@ -130,11 +131,11 @@
                 raise InvalidBenchmark(
                     "NaN value detected in model outputs, even with mixed "
                     "precision disabled."
-                )
+                ) from e
 
         except Exception as e:
             if "CUDA" not in str(e) and "out of memory" not in str(e):
-                raise InvalidBenchmark(str(e))
+                raise InvalidBenchmark(str(e)) from e
 
             if bs <= 1:
                 msg = "Could not benchmark the model, even with a batch size of 1!"
@@ -145,7 +146,7 @@
                     "environment variable set, as this removes the upper bound "
                     "on the memory usage."
                 )
-                raise InvalidBenchmark(msg)
+                raise InvalidBenchmark(msg) from e
 
             model_already_initialized = False
 
@@ -194,11 +195,11 @@
 
     trainer = model.trainer_class(
         model=model.get_pytorch_module(),
-        processing_class=model.get_tokenizer(),
+        processing_class=model.get_tokeniser(),
        args=training_args,
         train_dataset=dataset["train"],
         eval_dataset=dataset["val"],
-        compute_metrics=model.compute_metrics,
+        compute_metrics=partial(model.compute_metrics, dataset=None),
         callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
         data_collator=model.data_collator,
         preprocess_logits_for_metrics=remove_extra_tensors_from_logits,
@@ -244,7 +245,7 @@
             clear_memory()
             raise e
         except (RuntimeError, ValueError, IndexError) as e:
-            raise InvalidBenchmark(str(e))
+            raise InvalidBenchmark(str(e)) from e
 
     return test_scores
 
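One functional change above: compute_metrics is now wrapped in functools.partial with dataset=None. This appears to pre-bind the dataset keyword that compute_metrics takes in this release (it is called with dataset=dataset in the generation.py hunks below), so the Hugging Face Trainer can keep invoking it with a single eval-prediction argument. A standalone illustration with a hypothetical compute_metrics of the same shape:

# Standalone sketch (not EuroEval code): partial pre-binds the extra keyword
# so the callback still matches the single-argument interface the Trainer expects.
from functools import partial

def compute_metrics(eval_pred, dataset=None):
    # `dataset` is optional extra context; the Trainer only passes `eval_pred`.
    predictions, labels = eval_pred
    return {"num_examples": len(labels)}

trainer_compute_metrics = partial(compute_metrics, dataset=None)
print(trainer_compute_metrics(([1, 0, 1], [1, 1, 1])))  # {'num_examples': 3}
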
euroeval/generation.py CHANGED
@@ -6,6 +6,7 @@ import typing as t
 from pathlib import Path
 
 import more_itertools as mit
+from datasets import Dataset
 from tqdm.auto import tqdm
 
 from .enums import BatchingPreference, TaskGroup
@@ -15,10 +16,10 @@ from .model_cache import (
     load_cached_model_outputs,
     split_dataset_into_cached_and_non_cached,
 )
-from .utils import clear_memory
+from .utils import clear_memory, log_once
 
 if t.TYPE_CHECKING:
-    from datasets import Dataset, DatasetDict
+    from datasets import DatasetDict
 
     from .benchmark_modules import BenchmarkModule
     from .data_models import (
@@ -78,7 +79,7 @@
 
     scores: list[dict[str, float]] = list()
     for idx in tqdm(
-        iterable=range(benchmark_config.num_iterations),
+        iterable=range(len(datasets)),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
     ):
@@ -89,7 +90,6 @@
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
         )
-
         logger.debug(f"Test scores for iteration {idx}: {test_scores}")
         scores.append(test_scores)
         clear_memory()
@@ -126,10 +126,15 @@
     """
     cache.load()
 
-    # Split up the dataset into a cached and non-cached part
-    cached_dataset, non_cached_dataset = split_dataset_into_cached_and_non_cached(
-        dataset=dataset, cache=cache
-    )
+    # Split up the dataset into a cached and non-cached part, unless we are not
+    # bootstrapping the samples. In that case, we just use the dataset as is.
+    if dataset_config.bootstrap_samples:
+        cached_dataset, non_cached_dataset = split_dataset_into_cached_and_non_cached(
+            dataset=dataset, cache=cache
+        )
+    else:
+        cached_dataset = Dataset.from_dict({})
+        non_cached_dataset = dataset
 
 
     all_preds: list[str] = list()
@@ -230,9 +235,12 @@
         cached_labels = list(cached_labels)
         ground_truth = non_cached_labels + cached_labels
     else:
-        raise ValueError(
-            "The dataset must have either a 'label', 'labels', or 'target_text' column"
+        log_once(
+            "No labels found in the dataset. We assume that this is intentional, and "
+            "will not supply any ground truth labels for evaluation.",
+            level=logging.DEBUG,
         )
+        ground_truth = []
 
     itr_scores: dict[str, float] = model.compute_metrics(
         model_outputs_and_labels=(all_preds, ground_truth), dataset=dataset
@@ -293,10 +301,13 @@
         case (
            TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
         ):
-            labels = [
-                dataset_config.prompt_label_mapping.get(label, label).lower()
-                for label in batch["label"]
-            ]
+            if "label" in batch:
+                labels = [
+                    dataset_config.prompt_label_mapping.get(label, label).lower()
+                    for label in batch["label"]
+                ]
+            else:
+                labels = ["N/A"] * len(extracted_labels)
 
         case TaskGroup.QUESTION_ANSWERING:
             extracted_labels = [
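
When dataset_config.bootstrap_samples is False, the new code above skips the cache split entirely and uses an empty Dataset as the cached part. A standalone sketch of what that branch produces (the dataset contents are made up):

# Standalone sketch of the non-bootstrapping branch added above: an empty
# "cached" split plus the full dataset as the non-cached split.
from datasets import Dataset

dataset = Dataset.from_dict({"text": ["a", "b", "c"]})
cached_dataset = Dataset.from_dict({})  # nothing is served from the cache
non_cached_dataset = dataset            # everything is generated fresh
print(len(cached_dataset), len(non_cached_dataset))  # 0 3
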
euroeval/generation_utils.py CHANGED
@@ -8,19 +8,23 @@ import typing as t
 
 from .enums import TaskGroup
 from .exceptions import InvalidBenchmark
+from .tokenization_utils import apply_chat_template
 from .utils import log_once
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
     from transformers.tokenization_utils import PreTrainedTokenizer
 
-    from .data_models import DatasetConfig, ModelConfig
+    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
 logger = logging.getLogger("euroeval")
 
 
 def extract_few_shot_examples(
-    dataset: "DatasetDict", dataset_config: "DatasetConfig", itr_idx: int
+    dataset: "DatasetDict",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+    itr_idx: int,
 ) -> list[dict[str, t.Any]]:
     """Extract few-shot examples from a dataset.
 
@@ -33,12 +37,32 @@
             The dataset to extract the few-shot examples from.
         dataset_config:
             The dataset configuration.
+        benchmark_config:
+            The benchmark configuration.
         itr_idx:
             The index of the dataset in the iterator.
 
     Returns:
         The few-shot examples.
+
+    Raises:
+        InvalidBenchmark:
+            If there are not enough short examples for few-shot learning.
     """
+    if dataset_config.task.requires_zero_shot and benchmark_config.few_shot:
+        msg = (
+            "This task only allows zero-shot evaluation, so even though you have "
+            "requested few-shot evaluation "
+        )
+        if benchmark_config.run_with_cli:
+            msg += "(by not setting the --zero-shot flag), "
+        else:
+            msg += "(by setting the default `few_shot=True` argument), "
+        msg += "we will run the evaluation in zero-shot mode."
+        benchmark_config.few_shot = False
+        log_once(msg, level=logging.DEBUG)
+        return []
+
     random_seed = 4242 + itr_idx
     num_few_shots = dataset_config.num_few_shot_examples
     few_shot_examples: list[dict[str, t.Any]] = list()
@@ -63,12 +87,19 @@
 
         shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
         labels = it.cycle(dataset_config.labels)
+        labels_with_no_samples: set[str] = set()
         while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+            if len(labels_with_no_samples) == len(dataset_config.labels):
+                raise InvalidBenchmark(
+                    "Could not find enough examples for few-shot learning. "
+                    "Please check the dataset and the labels."
+                )
             label = next(labels)
             possible_examples = shuffled_train.filter(
                 lambda x: x["label"].lower() == label.lower()
            )
             if len(possible_examples) == 0:
+                labels_with_no_samples.add(label)
                 continue
             example = possible_examples.select(range(1))[0]
             few_shot_examples.append(example)
@@ -144,7 +175,7 @@
     dataset_config: "DatasetConfig",
     instruction_model: bool,
     always_populate_text_field: bool,
-    tokenizer: "PreTrainedTokenizer | None",
+    tokeniser: "PreTrainedTokenizer | None",
 ) -> dict[str, t.Any]:
     """Apply prompt template to an example, potentially with few-shot examples.
 
@@ -160,16 +191,16 @@
         always_populate_text_field:
             Whether to always populate the 'text' field in the examples, as opposed to
            the 'messages' field.
-        tokenizer:
-            The tokenizer to use for the model. If None, the tokenizer is not used.
+        tokeniser:
+            The tokeniser to use for the model. If None, the tokeniser is not used.
 
     Returns:
         The example with the few-shot examples applied.
     """
     # Sanity check
-    if instruction_model and always_populate_text_field and tokenizer is None:
+    if instruction_model and always_populate_text_field and tokeniser is None:
         raise ValueError(
-            "The `tokenizer` argument must be provided when the model is instruction "
+            "The `tokeniser` argument must be provided when the model is instruction "
             "tuned and when we are not just returning the raw messages."
         )
 
@@ -298,30 +329,31 @@
         examples["messages"] = messages_list
 
     else:
-        assert tokenizer is not None
+        assert tokeniser is not None
 
         # Pick the chat template that matches the language of the dataset, if such a
         # template exists
         chat_template: str | None = None
-        if isinstance(tokenizer.chat_template, dict):
+        if hasattr(tokeniser, "chat_template") and isinstance(
+            tokeniser.chat_template, dict
+        ):
            language_codes = [
                 language.code for language in dataset_config.languages
             ]
-            for name, candidate_template in tokenizer.chat_template.items():
+            for name, candidate_template in tokeniser.chat_template.items():
                 if name.lower() in language_codes:
                     chat_template = candidate_template
                     log_once(
-                        f"Using the {name!r} chat template for the tokenizer for "
+                        f"Using the {name!r} chat template for the tokeniser for "
                         f"model {model_config.model_id!r}.",
                         level=logging.DEBUG,
                     )
                     break
 
         texts = [
-            tokenizer.apply_chat_template(
+            apply_chat_template(
                 conversation=messages,
-                tokenize=False,
-                add_generation_prompt=True,
+                tokeniser=tokeniser,
                 chat_template=chat_template,
             )
             for messages in messages_list
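
The call site above replaces tokenizer.apply_chat_template(...) with the new apply_chat_template helper imported from euroeval/tokenization_utils.py, whose implementation is not included in this excerpt. A hypothetical sketch of what such a wrapper could look like, inferred only from the old and new call sites (the real helper likely adds extra fallback handling for tokenisers without chat templates):

# Hypothetical sketch of the apply_chat_template helper; the actual body in
# euroeval/tokenization_utils.py is not shown in this diff, so this is only
# an assumption reconstructed from the old inline call.
import typing as t

if t.TYPE_CHECKING:
    from transformers.tokenization_utils import PreTrainedTokenizer

def apply_chat_template(
    conversation: list[dict[str, str]],
    tokeniser: "PreTrainedTokenizer",
    chat_template: str | None = None,
) -> str:
    # Render the conversation to a prompt string with a generation prompt
    # appended, exactly as the previous inline call did.
    return tokeniser.apply_chat_template(
        conversation=conversation,
        tokenize=False,
        add_generation_prompt=True,
        chat_template=chat_template,
    )
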