EuroEval 16.2.2-py3-none-any.whl → 16.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +7 -4
- euroeval/benchmark_config_factory.py +0 -4
- euroeval/benchmark_modules/base.py +3 -16
- euroeval/benchmark_modules/fresh.py +5 -2
- euroeval/benchmark_modules/hf.py +107 -66
- euroeval/benchmark_modules/litellm.py +103 -55
- euroeval/benchmark_modules/vllm.py +155 -82
- euroeval/benchmarker.py +184 -129
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +1 -1
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +14 -11
- euroeval/data_models.py +12 -4
- euroeval/dataset_configs/__init__.py +3 -0
- euroeval/dataset_configs/czech.py +79 -0
- euroeval/dataset_configs/danish.py +10 -13
- euroeval/dataset_configs/dutch.py +0 -3
- euroeval/dataset_configs/english.py +0 -3
- euroeval/dataset_configs/estonian.py +11 -1
- euroeval/dataset_configs/finnish.py +0 -3
- euroeval/dataset_configs/french.py +0 -3
- euroeval/dataset_configs/german.py +0 -3
- euroeval/dataset_configs/italian.py +0 -3
- euroeval/dataset_configs/latvian.py +2 -4
- euroeval/dataset_configs/lithuanian.py +68 -0
- euroeval/dataset_configs/norwegian.py +0 -3
- euroeval/dataset_configs/polish.py +0 -3
- euroeval/dataset_configs/portuguese.py +0 -3
- euroeval/dataset_configs/slovak.py +60 -0
- euroeval/dataset_configs/spanish.py +0 -3
- euroeval/dataset_configs/swedish.py +10 -15
- euroeval/finetuning.py +21 -15
- euroeval/generation.py +10 -10
- euroeval/generation_utils.py +2 -3
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +10 -6
- euroeval/metrics/llm_as_a_judge.py +5 -3
- euroeval/metrics/pipeline.py +22 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +11 -14
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/linguistic_acceptability.py +30 -3
- euroeval/prompt_templates/multiple_choice.py +34 -1
- euroeval/prompt_templates/named_entity_recognition.py +71 -11
- euroeval/prompt_templates/reading_comprehension.py +41 -3
- euroeval/prompt_templates/sentiment_classification.py +34 -1
- euroeval/prompt_templates/summarization.py +26 -6
- euroeval/scores.py +7 -7
- euroeval/speed_benchmark.py +3 -5
- euroeval/task_group_utils/multiple_choice_classification.py +0 -3
- euroeval/task_group_utils/question_answering.py +0 -3
- euroeval/task_group_utils/sequence_classification.py +43 -31
- euroeval/task_group_utils/text_to_text.py +17 -8
- euroeval/task_group_utils/token_classification.py +10 -9
- euroeval/tokenisation_utils.py +22 -20
- euroeval/utils.py +30 -147
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA +182 -61
- euroeval-16.4.0.dist-info/RECORD +75 -0
- euroeval-16.2.2.dist-info/RECORD +0 -70
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/dataset_configs/swedish.py
CHANGED

@@ -1,7 +1,6 @@
 """All Swedish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import SV
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -33,11 +32,11 @@ SUC3_CONFIG = DatasetConfig(
     languages=[SV],
 )
 
-
-    name="
-    pretty_name="the
-    "dataset
-    huggingface_id="EuroEval/
+MULTI_WIKI_QA_SV_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-sv",
+    pretty_name="the truncated version of the Swedish part of the reading "
+    "comprehension dataset MultiWikiQA",
+    huggingface_id="EuroEval/multi-wiki-qa-sv-mini",
     task=RC,
     languages=[SV],
 )
@@ -111,11 +110,11 @@ BELEBELE_SV_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
-
-    name="
-    pretty_name="the
-    "
-    huggingface_id="EuroEval/
+SCANDIQA_SV_CONFIG = DatasetConfig(
+    name="scandiqa-sv",
+    pretty_name="the Swedish part of the truncated version of the question answering "
+    "dataset ScandiQA",
+    huggingface_id="EuroEval/scandiqa-sv-mini",
     task=RC,
     languages=[SV],
     unofficial=True,
@@ -138,9 +137,7 @@ WINOGRANDE_SV_CONFIG = DatasetConfig(
     huggingface_id="EuroEval/winogrande-sv",
     task=COMMON_SENSE,
     languages=[SV],
-    splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 
@@ -176,7 +173,5 @@ SKOLPROV_CONFIG = DatasetConfig(
     huggingface_id="EuroEval/skolprov",
     task=KNOW,
     languages=[SV],
-    splits=["train", "test"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )

euroeval/finetuning.py
CHANGED
@@ -6,7 +6,6 @@ import typing as t
 from functools import partial
 
 import torch
-from tqdm.auto import tqdm
 from transformers.trainer_callback import (
     EarlyStoppingCallback,
     PrinterCallback,
@@ -18,13 +17,9 @@ from transformers.training_args import OptimizerNames, TrainingArguments
 from .callbacks import NeverLeaveProgressCallback
 from .enums import DataType
 from .exceptions import InvalidBenchmark, NaNValueInModelOutput
+from .logging_utils import block_terminal_output, get_pbar, log, log_once
 from .model_loading import load_model
-from .utils import (
-    block_terminal_output,
-    clear_memory,
-    enforce_reproducibility,
-    log_once,
-)
+from .utils import clear_memory, enforce_reproducibility
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -32,8 +27,6 @@ if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def finetune(
     model: "BenchmarkModule",
@@ -58,6 +51,10 @@ def finetune(
 
     Returns:
         A list of dicts containing the scores for each metric for each iteration.
+
+    Raises:
+        InvalidBenchmark:
+            If the benchmark could not be completed.
     """
     # Set the data type to use for the model weights
     using_cuda = benchmark_config.device == torch.device("cuda")
@@ -70,7 +67,7 @@ def finetune(
 
     bs: int = benchmark_config.batch_size
     scores: list[dict[str, float]] = list()
-    for idx in tqdm(
+    for idx in get_pbar(
         iterable=range(benchmark_config.num_iterations),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -80,7 +77,7 @@ def finetune(
         model_already_initialized = idx == 0
 
         # Run a loop here to deal with automatic reduction of batch size
-
+        for _ in range(num_attempts := 10):
             # Clear GPU memory
             if not model_already_initialized:
                 try:
@@ -112,7 +109,10 @@ def finetune(
             )
 
             scores.append(itr_scores)
-
+            log(
+                f"Test scores for iteration {idx}: {itr_scores}",
+                level=logging.DEBUG,
+            )
 
             break
 
@@ -123,9 +123,10 @@ def finetune(
                 if dtype != DataType.FP32:
                     dtype = DataType.FP32
                     model_already_initialized = False
-
+                    log(
                         "NaN value detected in model outputs while using mixed "
-                        "precision. Retrying with full fp32 precision."
+                        "precision. Retrying with full fp32 precision.",
+                        level=logging.DEBUG,
                     )
                 else:
                     raise InvalidBenchmark(
@@ -151,7 +152,12 @@ def finetune(
                 model_already_initialized = False
 
                 bs //= 2
-
+                log(f"Reduced batch size to {bs}", level=logging.DEBUG)
+
+        else:
+            raise InvalidBenchmark(
+                f"Could not benchmark the model after {num_attempts} attempts!"
+            )
 
     return scores
 

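For illustration, here is a minimal, self-contained sketch (not taken from the package) of the batch-size back-off pattern that finetune() now uses: the batch size is halved on every failed attempt, and the loop gives up after a fixed number of attempts instead of retrying forever. The OutOfMemory exception and flaky_train callable are hypothetical stand-ins for the CUDA errors and training step in the real code.

# Hypothetical stand-in for the out-of-memory errors caught in the real code.
class OutOfMemory(Exception):
    pass


def run_with_backoff(train_once, batch_size: int, num_attempts: int = 10) -> dict:
    """Halve the batch size on failure, giving up after `num_attempts` attempts."""
    for _ in range(num_attempts):
        try:
            return train_once(batch_size)
        except OutOfMemory:
            batch_size //= 2
    raise RuntimeError(f"Could not benchmark the model after {num_attempts} attempts!")


def flaky_train(bs: int) -> dict:
    # Pretend that batches larger than 8 do not fit in GPU memory.
    if bs > 8:
        raise OutOfMemory
    return {"batch_size": bs, "accuracy": 0.5}


print(run_with_backoff(flaky_train, batch_size=32))  # {'batch_size': 8, 'accuracy': 0.5}
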
euroeval/generation.py
CHANGED
@@ -11,12 +11,13 @@ from tqdm.auto import tqdm
 
 from .enums import BatchingPreference, TaskGroup
 from .exceptions import InvalidBenchmark
+from .logging_utils import get_pbar, log, log_once
 from .model_cache import (
     ModelCache,
     load_cached_model_outputs,
     split_dataset_into_cached_and_non_cached,
 )
-from .utils import clear_memory
+from .utils import clear_memory
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -29,8 +30,6 @@ if t.TYPE_CHECKING:
         ModelConfig,
     )
 
-logger = logging.getLogger("euroeval")
-
 
 def generate(
     model: "BenchmarkModule",
@@ -78,7 +77,7 @@ def generate(
     )
 
     scores: list[dict[str, float]] = list()
-    for idx in tqdm(
+    for idx in get_pbar(
         iterable=range(len(datasets)),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -90,7 +89,7 @@ def generate(
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
         )
-
+        log(f"Test scores for iteration {idx}: {test_scores}", level=logging.DEBUG)
         scores.append(test_scores)
         clear_memory()
 
@@ -142,14 +141,14 @@ def generate_single_iteration(
     itr: t.Iterable
     match model.batching_preference:
         case BatchingPreference.SINGLE_SAMPLE:
-            itr = tqdm(iterable=non_cached_dataset)
+            itr = get_pbar(iterable=non_cached_dataset)
         case BatchingPreference.ALL_AT_ONCE:
             itr = [non_cached_dataset[:]]
         case _:
             num_batches = len(non_cached_dataset) // benchmark_config.batch_size
             if len(non_cached_dataset) % benchmark_config.batch_size != 0:
                 num_batches += 1
-            itr = tqdm(
+            itr = get_pbar(
                 iterable=mit.batched(
                     iterable=non_cached_dataset, n=benchmark_config.batch_size
                 ),
@@ -297,7 +296,7 @@ def debug_log(
                 + "\n"
                 + "\t".join(labels)
             )
-
+            log("\n\n".join(log_msgs), level=logging.DEBUG)
             return
 
         case (
@@ -347,6 +346,7 @@ def debug_log(
         if labels[idx]:
            data_to_log["Label"] = labels[idx]
         data_to_log |= {key.capitalize(): batch[key][idx] for key in metadata_keys}
-
-            "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items())
+        log(
+            "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items()),
+            level=logging.DEBUG,
+        )

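As a side note, the batching branch in generate_single_iteration above computes the number of batches with a remainder check and then iterates over fixed-size chunks via mit.batched. A short sketch (not from the package) of the same arithmetic:

import more_itertools as mit

samples = list(range(10))
batch_size = 4

# Floor division, plus one extra batch when the dataset size is not a multiple of
# the batch size.
num_batches = len(samples) // batch_size
if len(samples) % batch_size != 0:
    num_batches += 1

batches = [list(batch) for batch in mit.batched(iterable=samples, n=batch_size)]
assert len(batches) == num_batches == 3
print(batches)  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
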
euroeval/generation_utils.py
CHANGED
@@ -9,8 +9,9 @@ import typing as t
 
 from .enums import GenerativeType, TaskGroup
 from .exceptions import InvalidBenchmark, InvalidModel
+from .logging_utils import log_once
 from .tokenisation_utils import apply_chat_template
-from .utils import extract_multiple_choice_labels
+from .utils import extract_multiple_choice_labels
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -18,8 +19,6 @@ if t.TYPE_CHECKING:
 
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def extract_few_shot_examples(
     dataset: "DatasetDict",

euroeval/logging_utils.py
ADDED

@@ -0,0 +1,250 @@
+"""Utility functions related to logging."""
+
+import datetime as dt
+import logging
+import os
+import sys
+import warnings
+from io import TextIOWrapper
+
+import litellm
+from datasets.utils import disable_progress_bars as disable_datasets_progress_bars
+from evaluate import disable_progress_bar as disable_evaluate_progress_bar
+from huggingface_hub.utils.tqdm import (
+    disable_progress_bars as disable_hf_hub_progress_bars,
+)
+from termcolor import colored
+from tqdm.auto import tqdm
+from transformers import logging as tf_logging
+
+from .caching_utils import cache_arguments
+
+logger = logging.getLogger("euroeval")
+
+
+def get_pbar(*tqdm_args, **tqdm_kwargs) -> tqdm:
+    """Get a progress bar for vLLM with custom hard-coded arguments.
+
+    Args:
+        *tqdm_args:
+            Positional arguments to pass to tqdm.
+        **tqdm_kwargs:
+            Additional keyword arguments to pass to tqdm.
+
+    Returns:
+        A tqdm progress bar.
+    """
+    tqdm_kwargs = dict(colour="yellow", ascii="—▰", leave=False) | tqdm_kwargs
+    tqdm_kwargs["desc"] = colored(
+        text=tqdm_kwargs.get("desc", "Processing"), color="light_yellow"
+    )
+    return tqdm(*tqdm_args, **tqdm_kwargs)
+
+
+def log(message: str, level: int, colour: str | None = None) -> None:
+    """Log a message.
+
+    Args:
+        message:
+            The message to log.
+        level:
+            The logging level. Defaults to logging.INFO.
+        colour:
+            The colour to use for the message. If None, a default colour will be used
+            based on the logging level.
+
+    Raises:
+        ValueError:
+            If the logging level is invalid.
+    """
+    match level:
+        case logging.DEBUG:
+            message = colored(
+                text=(
+                    "[DEBUG] "
+                    + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                    + f" · {message}"
+                ),
+                color=colour or "light_blue",
+            )
+            logger.debug(message)
+        case logging.INFO:
+            if colour is not None:
+                message = colored(text=message, color=colour)
+            logger.info(message)
+        case logging.WARNING:
+            message = colored(text=message, color=colour or "light_red")
+            logger.warning(message)
+        case logging.ERROR:
+            message = colored(text=message, color=colour or "red")
+            logger.error(message)
+        case logging.CRITICAL:
+            message = colored(text=message, color=colour or "red")
+            logger.critical(message)
+        case _:
+            raise ValueError(f"Invalid logging level: {level}")
+
+
+@cache_arguments("message")
+def log_once(message: str, level: int = logging.INFO, prefix: str = "") -> None:
+    """Log a message once.
+
+    This is ensured by caching the "message" argument and only logging it the first time
+    this function is called with that message.
+
+    Args:
+        message:
+            The message to log.
+        level:
+            The logging level. Defaults to logging.INFO.
+        prefix:
+            A prefix to add to the message, which is not considered when determining if
+            the message has been logged before.
+    """
+    log(message=prefix + message, level=level)
+
+
+def block_terminal_output() -> None:
+    """Blocks libraries from writing output to the terminal.
+
+    This filters warnings from some libraries, sets the logging level to ERROR for some
+    libraries, disabled tokeniser progress bars when using Hugging Face tokenisers, and
+    disables most of the logging from the `transformers` library.
+    """
+    if os.getenv("FULL_LOG") == "1":
+        return
+
+    # Ignore miscellaneous warnings
+    warnings.filterwarnings("ignore", category=UserWarning)
+    warnings.filterwarnings("ignore", category=FutureWarning)
+    logging.getLogger("absl").setLevel(logging.CRITICAL)
+
+    # Disable matplotlib logging
+    logging.getLogger("matplotlib.font_manager").setLevel(logging.CRITICAL)
+
+    # Disable PyTorch logging
+    logging.getLogger("torch.utils.cpp_extension").setLevel(logging.CRITICAL)
+    warnings.filterwarnings(action="ignore", module="torch*")
+    os.environ["TORCH_LOGS"] = "-all"
+
+    # Disable huggingface_hub logging
+    logging.getLogger("huggingface_hub").setLevel(logging.CRITICAL)
+    disable_hf_hub_progress_bars()
+
+    # Disable LiteLLM logging
+    logging.getLogger("LiteLLM").setLevel(logging.CRITICAL)
+    logging.getLogger("LiteLLM Router").setLevel(logging.CRITICAL)
+    logging.getLogger("LiteLLM Proxy").setLevel(logging.CRITICAL)
+    logging.getLogger("openai").setLevel(logging.CRITICAL)
+    logging.getLogger("httpx").setLevel(logging.CRITICAL)
+    litellm.suppress_debug_info = True
+
+    # Disable vLLM logging
+    logging.getLogger("vllm").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm.engine.llm_engine").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm.transformers_utils.tokenizer").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm.core.scheduler").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm.model_executor.weight_utils").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
+    logging.getLogger("mistral_common.tokens.tokenizers.tekken").setLevel(
+        logging.CRITICAL
+    )
+    os.environ["LOG_LEVEL"] = "CRITICAL"
+    os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
+
+    # Disable flashinfer logging
+    os.environ["FLASHINFER_LOGGING_LEVEL"] = "CRITICAL"
+
+    # Disable datasets logging
+    logging.getLogger("datasets").setLevel(logging.CRITICAL)
+    logging.getLogger("filelock").setLevel(logging.CRITICAL)
+    disable_datasets_progress_bars()
+
+    # Disable evaluate logging
+    warnings.filterwarnings("ignore", module="seqeval*")
+    disable_evaluate_progress_bar()
+
+    # Disable most of the `transformers` logging
+    tf_logging._default_log_level = logging.CRITICAL
+    tf_logging.set_verbosity(logging.CRITICAL)
+    logging.getLogger("transformers.trainer").setLevel(logging.CRITICAL)
+    logging.getLogger("accelerate").setLevel(logging.CRITICAL)
+
+
+class no_terminal_output:
+    """Context manager that suppresses all terminal output."""
+
+    def __init__(self, disable: bool = False) -> None:
+        """Initialise the context manager.
+
+        Args:
+            disable:
+                If True, this context manager does nothing.
+        """
+        self.disable = disable
+        self.nothing_file: TextIOWrapper | None = None
+        self._cpp_stdout_file: int | None = None
+        self._cpp_stderr_file: int | None = None
+        try:
+            self._cpp_stdout_file = os.dup(sys.stdout.fileno())
+            self._cpp_stderr_file = os.dup(sys.stderr.fileno())
+        except OSError:
+            self._log_windows_warning()
+
+    def _log_windows_warning(self) -> None:
+        """Log a warning about Windows not supporting blocking terminal output."""
+        log_once(
+            "Your operating system (probably Windows) does not support blocking "
+            "terminal output, so expect more messy output - sorry!",
+            level=logging.WARNING,
+        )
+
+    def __enter__(self) -> None:
+        """Suppress all terminal output."""
+        if not self.disable:
+            self.nothing_file = open(os.devnull, "w")
+            try:
+                os.dup2(fd=self.nothing_file.fileno(), fd2=sys.stdout.fileno())
+                os.dup2(fd=self.nothing_file.fileno(), fd2=sys.stderr.fileno())
+            except OSError:
+                self._log_windows_warning()
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: type[BaseException] | None,
+    ) -> None:
+        """Re-enable terminal output."""
+        if not self.disable:
+            if self.nothing_file is not None:
+                self.nothing_file.close()
+            try:
+                if self._cpp_stdout_file is not None:
+                    os.dup2(fd=self._cpp_stdout_file, fd2=sys.stdout.fileno())
+                if self._cpp_stderr_file is not None:
+                    os.dup2(fd=self._cpp_stderr_file, fd2=sys.stderr.fileno())
+            except OSError:
+                self._log_windows_warning()
+
+
+def adjust_logging_level(verbose: bool, ignore_testing: bool = False) -> int:
+    """Adjust the logging level based on verbosity.
+
+    Args:
+        verbose:
+            Whether to output additional output.
+        ignore_testing:
+            Whether to ignore the testing flag.
+
+    Returns:
+        The logging level that was set.
+    """
+    if hasattr(sys, "_called_from_test") and not ignore_testing:
+        logging_level = logging.CRITICAL
+    elif verbose:
+        logging_level = logging.DEBUG
+    else:
+        logging_level = logging.INFO
+    logger.setLevel(logging_level)
+    return logging_level

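The new logging_utils module centralises EuroEval's progress bars and coloured logging. Below is a hypothetical usage sketch, assuming EuroEval 16.4.0 is installed; the names and signatures are taken from the added file above.

import logging

from euroeval.logging_utils import adjust_logging_level, get_pbar, log, log_once

adjust_logging_level(verbose=True)  # sets the "euroeval" logger to DEBUG

log("Starting a demo run", level=logging.INFO)
log_once("Cached message, only emitted the first time", level=logging.WARNING)
log_once("Cached message, only emitted the first time", level=logging.WARNING)  # skipped

for _ in get_pbar(iterable=range(3), desc="Demo"):
    pass  # yellow tqdm bar that is cleared on completion (leave=False)
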
euroeval/metrics/base.py
CHANGED
@@ -2,7 +2,6 @@
 
 import abc
 import collections.abc as c
-import logging
 import typing as t
 
 if t.TYPE_CHECKING:
@@ -10,8 +9,6 @@ if t.TYPE_CHECKING:
 
     from ..data_models import BenchmarkConfig, DatasetConfig
 
-logger: logging.Logger = logging.getLogger("euroeval")
-
 
 class Metric(abc.ABC):
     """Abstract base class for all metrics."""

euroeval/metrics/huggingface.py
CHANGED
@@ -1,7 +1,6 @@
 """All the Hugging Face metrics used in EuroEval."""
 
 import collections.abc as c
-import logging
 import typing as t
 from pathlib import Path
 
@@ -9,7 +8,7 @@ import evaluate
 import numpy as np
 from datasets import DownloadConfig
 
-from ..
+from ..logging_utils import no_terminal_output
 from .base import Metric
 
 if t.TYPE_CHECKING:
@@ -18,8 +17,6 @@ if t.TYPE_CHECKING:
 
     from ..data_models import BenchmarkConfig, DatasetConfig
 
-logger: logging.Logger = logging.getLogger("euroeval")
-
 
 class HuggingFaceMetric(Metric):
     """A metric which is implemented in the `evaluate` package.
@@ -126,7 +123,7 @@ class HuggingFaceMetric(Metric):
 
         assert self.metric is not None
 
-        with
+        with no_terminal_output(disable=benchmark_config.verbose):
             results = self.metric.compute(
                 predictions=predictions, references=references, **self.compute_kwargs
             )
@@ -145,6 +142,13 @@ class HuggingFaceMetric(Metric):
 
         return score
 
+    def __del__(self) -> None:
+        """Clean up the metric from memory."""
+        if self.metric is not None:
+            if self.metric.writer is not None:
+                self.metric.writer.close()
+            del self.metric
+
 
 mcc_metric = HuggingFaceMetric(
     name="mcc",
@@ -197,7 +201,7 @@ bert_score_metric = HuggingFaceMetric(
     huggingface_id="bertscore",
     results_key="f1",
     compute_kwargs=dict(
-        model_type="microsoft/mdeberta-v3-base", device="
+        model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
     ),
 )
 
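The updated HuggingFaceMetric wraps metric.compute in the new no_terminal_output context manager unless verbose mode is enabled. Below is a rough, standard-library-only sketch of the same idea (not the package's implementation): contextlib.redirect_stdout/redirect_stderr only capture Python-level writes, whereas the real context manager redirects the OS-level file descriptors and therefore also silences C/C++ extensions.

import contextlib
import io

import evaluate  # pip install evaluate

metric = evaluate.load("accuracy")
with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()):
    results = metric.compute(predictions=[0, 1, 1], references=[0, 1, 0])
print(results)  # {'accuracy': 0.666...}
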
euroeval/metrics/llm_as_a_judge.py
CHANGED

@@ -8,6 +8,7 @@ from pathlib import Path
 from pydantic import BaseModel, Field
 
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
 from ..model_cache import ModelCache
 from ..utils import extract_json_dict_from_string
 from .base import Metric
@@ -17,8 +18,6 @@ if t.TYPE_CHECKING:
 
     from ..data_models import BenchmarkConfig, DatasetConfig
 
-logger: logging.Logger = logging.getLogger("euroeval")
-
 
 class LLMAsAJudgeMetric(Metric):
     """Use an LLM to judge the quality of the predictions."""
@@ -190,7 +189,10 @@ class LLMAsAJudgeMetric(Metric):
         # Calculate the scores using the scoring function
         scores = [self.scoring_fn(output) for output in outputs]
         if not scores:
-
+            log(
+                f"No scores were calculated for {self.pretty_name}.",
+                level=logging.WARNING,
+            )
             return None
         return sum(scores) / len(scores)
 

euroeval/metrics/pipeline.py
CHANGED
@@ -11,6 +11,7 @@ import numpy as np
 from scipy.special import expit as sigmoid
 
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log, no_terminal_output
 from ..utils import unscramble
 from .base import Metric
 
@@ -20,8 +21,6 @@ if t.TYPE_CHECKING:
 
     from ..data_models import BenchmarkConfig, DatasetConfig
 
-logger: logging.Logger = logging.getLogger("euroeval")
-
 
 T = t.TypeVar("T", bound=int | float | str | bool)
 
@@ -121,16 +120,22 @@ class PipelineMetric(Metric):
             The calculated metric score, or None if the score should be ignored.
         """
         if self.pipeline is None:
-            self.pipeline = self._download_pipeline(
+            self.pipeline = self._download_pipeline(
+                cache_dir=benchmark_config.cache_dir
+            )
         if self.preprocessing_fn is not None:
             predictions = self.preprocessing_fn(
                 predictions=predictions, dataset=dataset
             )
         return self.pipeline_scoring_function(self.pipeline, predictions)
 
-    def _download_pipeline(self) -> "Pipeline":
+    def _download_pipeline(self, cache_dir: str) -> "Pipeline":
         """Download the scikit-learn pipeline from the given URL.
 
+        Args:
+            cache_dir:
+                The directory to use for caching the downloaded pipeline.
+
         Returns:
             The downloaded scikit-learn pipeline.
 
@@ -138,10 +143,13 @@ class PipelineMetric(Metric):
             InvalidBenchmark:
                 If the loading of the pipeline fails for any reason.
         """
-
-
-
-
+        log(f"Loading pipeline from {self.pipeline_repo}...", level=logging.DEBUG)
+        with no_terminal_output():
+            folder_path = hf_hub.HfApi(
+                token=unscramble("XbjeOLhwebEaSaDUMqqaPaPIhgOcyOfDpGnX_")
+            ).snapshot_download(
+                repo_id=self.pipeline_repo, repo_type="model", cache_dir=cache_dir
+            )
         model_path = Path(folder_path, self.pipeline_file_name)
         try:
             with model_path.open(mode="rb") as f:
@@ -150,7 +158,7 @@ class PipelineMetric(Metric):
             raise InvalidBenchmark(
                 f"Failed to load pipeline from {self.pipeline_repo!r}: {e}"
             ) from e
-
+        log(f"Successfully loaded pipeline: {pipeline}", level=logging.DEBUG)
         return pipeline
 
 
@@ -191,6 +199,11 @@ def european_values_preprocessing_fn(
             for idx, choice in idx_to_choice.items()
             if choice is not None
         }
+        if prediction not in idx_to_choice:
+            raise InvalidBenchmark(
+                f"The prediction {prediction} is not a valid index for the "
+                f"question with choices {idx_to_choice}."
+            )
         integer_prediction = idx_to_choice[prediction]
         integer_predictions.append(integer_prediction)
 
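The _download_pipeline change above threads the benchmark's cache_dir through to snapshot_download, so the scikit-learn pipeline is fetched once and reused from the cache afterwards. A hedged sketch of the same download-and-unpickle flow, using a hypothetical public repository and file name (the real code authenticates with an unscrambled token and uses the repository configured on the metric):

import pickle
from pathlib import Path

from huggingface_hub import snapshot_download

folder_path = snapshot_download(
    repo_id="some-org/some-sklearn-pipeline",  # hypothetical repository id
    repo_type="model",
    cache_dir=Path.home() / ".cache" / "euroeval",  # reused on subsequent runs
)
with Path(folder_path, "pipeline.pkl").open(mode="rb") as f:  # hypothetical file name
    pipeline = pickle.load(f)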