EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. euroeval/__init__.py +32 -14
  2. euroeval/benchmark_config_factory.py +92 -180
  3. euroeval/benchmark_modules/base.py +49 -39
  4. euroeval/benchmark_modules/fresh.py +35 -21
  5. euroeval/benchmark_modules/hf.py +280 -244
  6. euroeval/benchmark_modules/litellm.py +752 -312
  7. euroeval/benchmark_modules/vllm.py +570 -268
  8. euroeval/benchmarker.py +651 -528
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +49 -38
  12. euroeval/constants.py +44 -25
  13. euroeval/data_loading.py +111 -55
  14. euroeval/data_models.py +490 -323
  15. euroeval/dataset_configs/__init__.py +26 -4
  16. euroeval/dataset_configs/bosnian.py +39 -0
  17. euroeval/dataset_configs/bulgarian.py +56 -0
  18. euroeval/dataset_configs/croatian.py +56 -0
  19. euroeval/dataset_configs/czech.py +75 -0
  20. euroeval/dataset_configs/danish.py +78 -50
  21. euroeval/dataset_configs/dutch.py +74 -44
  22. euroeval/dataset_configs/english.py +71 -36
  23. euroeval/dataset_configs/estonian.py +111 -0
  24. euroeval/dataset_configs/faroese.py +25 -18
  25. euroeval/dataset_configs/finnish.py +63 -26
  26. euroeval/dataset_configs/french.py +65 -32
  27. euroeval/dataset_configs/german.py +77 -36
  28. euroeval/dataset_configs/greek.py +64 -0
  29. euroeval/dataset_configs/icelandic.py +68 -57
  30. euroeval/dataset_configs/italian.py +68 -36
  31. euroeval/dataset_configs/latvian.py +87 -0
  32. euroeval/dataset_configs/lithuanian.py +64 -0
  33. euroeval/dataset_configs/norwegian.py +98 -72
  34. euroeval/dataset_configs/polish.py +96 -0
  35. euroeval/dataset_configs/portuguese.py +63 -40
  36. euroeval/dataset_configs/serbian.py +64 -0
  37. euroeval/dataset_configs/slovak.py +55 -0
  38. euroeval/dataset_configs/slovene.py +56 -0
  39. euroeval/dataset_configs/spanish.py +68 -34
  40. euroeval/dataset_configs/swedish.py +82 -41
  41. euroeval/dataset_configs/ukrainian.py +64 -0
  42. euroeval/enums.py +12 -6
  43. euroeval/exceptions.py +21 -1
  44. euroeval/finetuning.py +34 -26
  45. euroeval/generation.py +76 -41
  46. euroeval/generation_utils.py +169 -34
  47. euroeval/languages.py +1020 -188
  48. euroeval/logging_utils.py +268 -0
  49. euroeval/metrics/__init__.py +6 -0
  50. euroeval/metrics/base.py +85 -0
  51. euroeval/metrics/huggingface.py +216 -0
  52. euroeval/metrics/llm_as_a_judge.py +260 -0
  53. euroeval/metrics/pipeline.py +289 -0
  54. euroeval/metrics/speed.py +48 -0
  55. euroeval/model_cache.py +40 -21
  56. euroeval/model_config.py +4 -5
  57. euroeval/model_loading.py +3 -0
  58. euroeval/prompt_templates/__init__.py +2 -0
  59. euroeval/prompt_templates/classification.py +206 -0
  60. euroeval/prompt_templates/linguistic_acceptability.py +157 -22
  61. euroeval/prompt_templates/multiple_choice.py +159 -17
  62. euroeval/prompt_templates/named_entity_recognition.py +318 -21
  63. euroeval/prompt_templates/reading_comprehension.py +207 -16
  64. euroeval/prompt_templates/sentiment_classification.py +205 -22
  65. euroeval/prompt_templates/summarization.py +122 -22
  66. euroeval/prompt_templates/token_classification.py +279 -0
  67. euroeval/scores.py +20 -9
  68. euroeval/speed_benchmark.py +11 -12
  69. euroeval/task_group_utils/multiple_choice_classification.py +21 -12
  70. euroeval/task_group_utils/question_answering.py +101 -73
  71. euroeval/task_group_utils/sequence_classification.py +144 -61
  72. euroeval/task_group_utils/text_to_text.py +33 -12
  73. euroeval/task_group_utils/token_classification.py +86 -89
  74. euroeval/tasks.py +75 -16
  75. euroeval/tokenisation_utils.py +603 -0
  76. euroeval/types.py +17 -11
  77. euroeval/utils.py +332 -137
  78. euroeval-16.7.1.dist-info/METADATA +623 -0
  79. euroeval-16.7.1.dist-info/RECORD +84 -0
  80. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
  81. euroeval/human_evaluation.py +0 -737
  82. euroeval/metrics.py +0 -452
  83. euroeval/tokenization_utils.py +0 -498
  84. euroeval-15.12.0.dist-info/METADATA +0 -285
  85. euroeval-15.12.0.dist-info/RECORD +0 -63
  86. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
  87. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/finetuning.py CHANGED
@@ -1,11 +1,12 @@
 """Functions related to the finetuning of models."""
 
+import collections.abc as c
 import logging
 import sys
 import typing as t
+from functools import partial
 
 import torch
-from tqdm.auto import tqdm
 from transformers.trainer_callback import (
     EarlyStoppingCallback,
     PrinterCallback,
@@ -17,13 +18,9 @@ from transformers.training_args import OptimizerNames, TrainingArguments
 from .callbacks import NeverLeaveProgressCallback
 from .enums import DataType
 from .exceptions import InvalidBenchmark, NaNValueInModelOutput
+from .logging_utils import block_terminal_output, get_pbar, log, log_once
 from .model_loading import load_model
-from .utils import (
-    block_terminal_output,
-    clear_memory,
-    enforce_reproducibility,
-    log_once,
-)
+from .utils import clear_memory, enforce_reproducibility
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
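
Note: the logging helpers imported above come from the new euroeval/logging_utils.py module (added in this release, +268 lines; its source is not shown in this diff). For orientation, a minimal sketch of what such wrappers around the standard logging module and tqdm could look like is given below. The implementation details are assumptions; only the imported names log, log_once and get_pbar are taken from the diff.

    # Hypothetical sketch of the logging_utils helpers; not the actual EuroEval code.
    import logging
    import typing as t

    from tqdm.auto import tqdm

    logger = logging.getLogger("euroeval")
    _seen_messages: set[str] = set()

    def log(message: str, level: int = logging.INFO) -> None:
        """Log a message at the given level via the shared 'euroeval' logger."""
        logger.log(level=level, msg=message)

    def log_once(message: str, level: int = logging.INFO) -> None:
        """Log a message only the first time it is seen, to avoid spam inside loops."""
        if message not in _seen_messages:
            _seen_messages.add(message)
            log(message, level=level)

    def get_pbar(iterable: t.Iterable, **tqdm_kwargs: t.Any) -> tqdm:
        """Wrap an iterable in a tqdm progress bar with shared defaults."""
        return tqdm(iterable=iterable, **tqdm_kwargs)

Such a module would replace the per-module `logger = logging.getLogger("euroeval")` lines that the hunks below delete.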
@@ -31,16 +28,14 @@ if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def finetune(
     model: "BenchmarkModule",
-    datasets: list["DatasetDict"],
+    datasets: c.Sequence["DatasetDict"],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
-) -> list[dict[str, float]]:
+) -> c.Sequence[dict[str, float]]:
     """Evaluate a model on a dataset through finetuning.
 
     Args:
@@ -57,6 +52,10 @@ def finetune(
 
     Returns:
         A list of dicts containing the scores for each metric for each iteration.
+
+    Raises:
+        InvalidBenchmark:
+            If the benchmark could not be completed.
     """
     # Set the data type to use for the model weights
     using_cuda = benchmark_config.device == torch.device("cuda")
@@ -67,9 +66,9 @@ def finetune(
     else:
         dtype = DataType.FP32
 
-    bs: int = benchmark_config.batch_size
+    bs: int = benchmark_config.finetuning_batch_size
     scores: list[dict[str, float]] = list()
-    for idx in tqdm(
+    for idx in get_pbar(
         iterable=range(benchmark_config.num_iterations),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -79,7 +78,7 @@
         model_already_initialized = idx == 0
 
         # Run a loop here to deal with automatic reduction of batch size
-        while True:
+        for _ in range(num_attempts := 10):
             # Clear GPU memory
             if not model_already_initialized:
                 try:
@@ -111,30 +110,34 @@ def finetune(
                 )
 
                 scores.append(itr_scores)
-                logger.debug(f"Test scores for iteration {idx}: {itr_scores}")
+                log(
+                    f"Test scores for iteration {idx}: {itr_scores}",
+                    level=logging.DEBUG,
+                )
 
                 break
 
            # NaN values can appear in the model output when using mixed precision, as
            # the hidden states get overflowed. In this case we try to disable mixed
            # precision and try again.
-            except NaNValueInModelOutput:
+            except NaNValueInModelOutput as e:
                 if dtype != DataType.FP32:
                     dtype = DataType.FP32
                     model_already_initialized = False
-                    logger.debug(
+                    log(
                         "NaN value detected in model outputs while using mixed "
-                        "precision. Retrying with full fp32 precision."
+                        "precision. Retrying with full fp32 precision.",
+                        level=logging.DEBUG,
                     )
                 else:
                     raise InvalidBenchmark(
                         "NaN value detected in model outputs, even with mixed "
                         "precision disabled."
-                    )
+                    ) from e
 
             except Exception as e:
                 if "CUDA" not in str(e) and "out of memory" not in str(e):
-                    raise InvalidBenchmark(str(e))
+                    raise InvalidBenchmark(str(e)) from e
 
                 if bs <= 1:
                     msg = "Could not benchmark the model, even with a batch size of 1!"
@@ -145,12 +148,17 @@
                         "environment variable set, as this removes the upper bound "
                         "on the memory usage."
                     )
-                    raise InvalidBenchmark(msg)
+                    raise InvalidBenchmark(msg) from e
 
                 model_already_initialized = False
 
                 bs //= 2
-                logger.debug(f"Reduced batch size to {bs}")
+                log(f"Reduced batch size to {bs}", level=logging.DEBUG)
+
+        else:
+            raise InvalidBenchmark(
+                f"Could not benchmark the model after {num_attempts} attempts!"
+            )
 
     return scores
 
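
Note: the unbounded `while True:` retry loop is now a bounded `for` loop with an `else` clause: the body halves the batch size on CUDA out-of-memory errors, `break`s on success, and the `else` branch only runs when all `num_attempts` iterations finish without a `break`. A stripped-down sketch of the pattern follows (the `train_step` callable and plain `RuntimeError`s are stand-ins, not EuroEval's API):

    import collections.abc as c

    def train_with_fallback(
        train_step: c.Callable[[int], dict[str, float]],
        batch_size: int = 32,
        num_attempts: int = 10,
    ) -> dict[str, float]:
        scores: dict[str, float] | None = None
        for _ in range(num_attempts):
            try:
                scores = train_step(batch_size)  # stand-in for finetune_single_iteration
                break
            except RuntimeError as e:
                if "out of memory" not in str(e):
                    raise
                if batch_size <= 1:
                    raise RuntimeError("Failed even with a batch size of 1!") from e
                batch_size //= 2  # halve the batch size and retry
        else:
            # Only reached if the loop exhausted every attempt without a `break`.
            raise RuntimeError(f"Gave up after {num_attempts} attempts!")
        return scores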
@@ -194,11 +202,11 @@ def finetune_single_iteration(
 
     trainer = model.trainer_class(
         model=model.get_pytorch_module(),
-        processing_class=model.get_tokenizer(),
+        processing_class=model.get_tokeniser(),
         args=training_args,
         train_dataset=dataset["train"],
         eval_dataset=dataset["val"],
-        compute_metrics=model.compute_metrics,
+        compute_metrics=partial(model.compute_metrics, dataset=None),
         callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
         data_collator=model.data_collator,
         preprocess_logits_for_metrics=remove_extra_tensors_from_logits,
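
Note: `compute_metrics` now also takes a `dataset` argument (used by the generative code path), while the Hugging Face `Trainer` still calls it with a single argument. `functools.partial` pre-binds `dataset=None` so the callable handed to the trainer keeps the expected one-argument shape. A minimal illustration with a made-up metric function:

    from functools import partial

    # Hypothetical metric function; the Hugging Face Trainer only passes `eval_pred`.
    def compute_metrics(eval_pred, dataset=None) -> dict[str, float]:
        predictions, labels = eval_pred
        correct = sum(pred == label for pred, label in zip(predictions, labels))
        return {"accuracy": correct / len(labels)}

    # Pre-bind the extra argument so the resulting callable takes `eval_pred` alone.
    trainer_compute_metrics = partial(compute_metrics, dataset=None)
    print(trainer_compute_metrics(([1, 0, 1], [1, 1, 1])))  # {'accuracy': 0.666...}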
@@ -244,7 +252,7 @@ def finetune_single_iteration(
             clear_memory()
             raise e
     except (RuntimeError, ValueError, IndexError) as e:
-        raise InvalidBenchmark(str(e))
+        raise InvalidBenchmark(str(e)) from e
 
     return test_scores
 
@@ -283,7 +291,7 @@ def get_training_args(
         logging_strategy = IntervalStrategy.NO
 
     if batch_size is None:
-        batch_size = benchmark_config.batch_size
+        batch_size = benchmark_config.finetuning_batch_size
 
     training_args = TrainingArguments(
         output_dir=model_config.model_cache_dir,
euroeval/generation.py CHANGED
@@ -1,15 +1,17 @@
 """Functions related to text generation of models."""
 
+import collections.abc as c
 import logging
 import sys
 import typing as t
 from pathlib import Path
 
-import more_itertools as mit
+from datasets import Dataset
 from tqdm.auto import tqdm
 
 from .enums import BatchingPreference, TaskGroup
-from .exceptions import InvalidBenchmark
+from .exceptions import InvalidBenchmark, InvalidModel
+from .logging_utils import get_pbar, log, log_once
 from .model_cache import (
     ModelCache,
     load_cached_model_outputs,
@@ -18,7 +20,7 @@ from .model_cache import (
 from .utils import clear_memory
 
 if t.TYPE_CHECKING:
-    from datasets import Dataset, DatasetDict
+    from datasets import DatasetDict
 
     from .benchmark_modules import BenchmarkModule
     from .data_models import (
@@ -28,16 +30,14 @@ if t.TYPE_CHECKING:
         ModelConfig,
     )
 
-logger = logging.getLogger("euroeval")
-
 
 def generate(
     model: "BenchmarkModule",
-    datasets: list["DatasetDict"],
+    datasets: c.Sequence["DatasetDict"],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
-) -> list[dict[str, float]]:
+) -> c.Sequence[dict[str, float]]:
     """Evaluate a model on a dataset through generation.
 
     Args:
@@ -74,11 +74,12 @@ def generate(
         model_cache_dir=model_cache_dir,
         cache_name=cache_name,
         max_generated_tokens=dataset_config.max_generated_tokens,
+        progress_bar=benchmark_config.progress_bar,
     )
 
     scores: list[dict[str, float]] = list()
-    for idx in tqdm(
-        iterable=range(benchmark_config.num_iterations),
+    for idx in get_pbar(
+        iterable=range(len(datasets)),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
     ):
@@ -89,8 +90,7 @@
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
         )
-
-        logger.debug(f"Test scores for iteration {idx}: {test_scores}")
+        log(f"Test scores for iteration {idx}: {test_scores}", level=logging.DEBUG)
         scores.append(test_scores)
         clear_memory()
 
@@ -126,10 +126,15 @@ def generate_single_iteration(
     """
     cache.load()
 
-    # Split up the dataset into a cached and non-cached part
-    cached_dataset, non_cached_dataset = split_dataset_into_cached_and_non_cached(
-        dataset=dataset, cache=cache
-    )
+    # Split up the dataset into a cached and non-cached part, unless we are not
+    # bootstrapping the samples. In that case, we just use the dataset as is.
+    if dataset_config.bootstrap_samples:
+        cached_dataset, non_cached_dataset = split_dataset_into_cached_and_non_cached(
+            dataset=dataset, cache=cache
+        )
+    else:
+        cached_dataset = Dataset.from_dict({})
+        non_cached_dataset = dataset
 
     all_preds: list[str] = list()
 
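
Note: when bootstrapping is disabled, `Dataset.from_dict({})` serves as the "no cached samples" value: it creates an empty Hugging Face `Dataset` with zero rows and no columns, so downstream length checks and concatenations keep working without special-casing. A quick check of that behaviour using only the public `datasets` API:

    from datasets import Dataset

    empty = Dataset.from_dict({})
    full = Dataset.from_dict({"text": ["a", "b", "c"]})

    print(len(empty))          # 0
    print(empty.column_names)  # []
    print(len(full))           # 3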
@@ -137,19 +142,31 @@
     itr: t.Iterable
     match model.batching_preference:
         case BatchingPreference.SINGLE_SAMPLE:
-            itr = tqdm(iterable=non_cached_dataset, leave=False)
+            itr = get_pbar(
+                iterable=non_cached_dataset,
+                disable=not benchmark_config.progress_bar,
+            )
         case BatchingPreference.ALL_AT_ONCE:
             itr = [non_cached_dataset[:]]
         case _:
-            num_batches = len(non_cached_dataset) // benchmark_config.batch_size
-            if len(non_cached_dataset) % benchmark_config.batch_size != 0:
-                num_batches += 1
-            itr = tqdm(
-                iterable=mit.batched(
-                    iterable=non_cached_dataset, n=benchmark_config.batch_size
-                ),
-                total=len(non_cached_dataset) // benchmark_config.batch_size,
+            raise InvalidModel(
+                f"The batching preference {model.batching_preference!r} is "
+                "currently not supported."
             )
+            # NOTE: The code below can be used if we want to support batching for
+            # generative models. But in that case, we have to deal with the naming
+            # of the batch size variable, since it is currently
+            # `finetuning_batch_size`, as it is only used during finetuning of
+            # encoder models.
+            # num_batches = len(non_cached_dataset) // benchmark_config.batch_size
+            # if len(non_cached_dataset) % benchmark_config.batch_size != 0:
+            #     num_batches += 1
+            # itr = get_pbar(
+            #     iterable=mit.batched(
+            #         iterable=non_cached_dataset, n=benchmark_config.batch_size
+            #     ),
+            #     total=len(non_cached_dataset) // benchmark_config.batch_size,
+            # )
 
     # Generate the completions for the non-cached examples
     for batch in itr:
@@ -230,12 +247,17 @@ def generate_single_iteration(
         cached_labels = list(cached_labels)
         ground_truth = non_cached_labels + cached_labels
     else:
-        raise ValueError(
-            "The dataset must have either a 'label', 'labels', or 'target_text' column"
+        log_once(
+            "No labels found in the dataset. We assume that this is intentional, and "
+            "will not supply any ground truth labels for evaluation.",
+            level=logging.DEBUG,
         )
+        ground_truth = []
 
     itr_scores: dict[str, float] = model.compute_metrics(
-        model_outputs_and_labels=(all_preds, ground_truth)
+        model_outputs_and_labels=(all_preds, ground_truth),
+        dataset=dataset,
+        benchmark_config=benchmark_config,
     )
 
     return itr_scores
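
Note: a dataset without label columns is no longer a hard error; `ground_truth` falls back to an empty list, and `compute_metrics` now receives the full `dataset` and `benchmark_config`, presumably so that metrics which need no gold references (such as those in the new euroeval/metrics/llm_as_a_judge.py) can still score the predictions. The sketch below only illustrates that shape and is not the actual `BenchmarkModule.compute_metrics`:

    # Hypothetical metric dispatcher that tolerates an empty ground-truth list.
    import typing as t

    def compute_metrics(
        model_outputs_and_labels: tuple[list[str], list[str]],
        dataset: t.Any = None,
        benchmark_config: t.Any = None,
    ) -> dict[str, float]:
        predictions, ground_truth = model_outputs_and_labels
        if ground_truth:
            # Reference-based metric, e.g. exact match against the gold labels.
            correct = sum(
                pred.strip() == label.strip()
                for pred, label in zip(predictions, ground_truth)
            )
            return {"exact_match": correct / len(ground_truth)}
        # Reference-free fallback: score the predictions on their own, e.g. with an
        # external judge, using `dataset` for the original inputs if needed.
        non_empty = sum(bool(pred.strip()) for pred in predictions)
        return {"non_empty_output_rate": non_empty / max(len(predictions), 1)}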
@@ -244,7 +266,7 @@ def generate_single_iteration(
 def debug_log(
     batch: dict[str, t.Any],
     model_output: "GenerativeModelOutput",
-    extracted_labels: list[dict | str | list[str]],
+    extracted_labels: c.Sequence[dict | str | c.Sequence[str]],
     dataset_config: "DatasetConfig",
 ) -> None:
     """Log inputs and outputs for debugging purposes.
@@ -287,16 +309,19 @@ def debug_log(
                     + "\n"
                     + "\t".join(labels)
                 )
-            logger.info("\n\n".join(log_msgs))
+            log("\n\n".join(log_msgs), level=logging.DEBUG)
             return
 
         case (
             TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
         ):
-            labels = [
-                dataset_config.prompt_label_mapping.get(label, label).lower()
-                for label in batch["label"]
-            ]
+            if "label" in batch:
+                labels = [
+                    dataset_config.prompt_label_mapping.get(label, label).lower()
+                    for label in batch["label"]
+                ]
+            else:
+                labels = [None] * len(extracted_labels)
 
         case TaskGroup.QUESTION_ANSWERING:
             extracted_labels = [
@@ -319,12 +344,22 @@ def debug_log(
     else:
         input_texts = batch["text"]
 
-    for input_text, raw_output, prediction, label in zip(
-        input_texts, model_output.sequences, extracted_labels, labels
-    ):
-        logger.info(
-            f"Input: '{input_text}'\n"
-            f"Raw output: '{raw_output}'\n"
-            f"Prediction: '{prediction}'\n"
-            f"Label: '{label}'"
+    metadata_keys: c.Sequence[str] = [
+        key
+        for key in batch.keys()
+        if key not in ["text", "messages", "label", "labels", "target_text"]
+    ]
+
+    for idx in range(len(input_texts)):
+        data_to_log: dict[str, t.Any] = {
+            "Input": input_texts[idx],
+            "Raw output": model_output.sequences[idx],
+            "Prediction": extracted_labels[idx],
+        }
+        if labels[idx]:
+            data_to_log["Label"] = labels[idx]
+        data_to_log |= {key.capitalize(): batch[key][idx] for key in metadata_keys}
+        log(
+            "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items()),
+            level=logging.DEBUG,
         )
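
Note: the rewritten loop collects one `data_to_log` dict per example (input, raw output, prediction, an optional label, plus any extra metadata columns in the batch) and renders it as a single multi-line debug record. A toy example of how the final `"\n".join(...)` formats such a dict (values are made up):

    data_to_log = {
        "Input": "Translate to French: Hello",
        "Raw output": "Bonjour",
        "Prediction": "bonjour",
        "Label": "bonjour",
    }
    print("\n".join(f"{key}: {value!r}" for key, value in data_to_log.items()))
    # Input: 'Translate to French: Hello'
    # Raw output: 'Bonjour'
    # Prediction: 'bonjour'
    # Label: 'bonjour'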