PyPI - EuroEval - Versions diffs - 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl - Mend

EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87) hide show

euroeval/__init__.py +32 -14
euroeval/benchmark_config_factory.py +92 -180
euroeval/benchmark_modules/base.py +49 -39
euroeval/benchmark_modules/fresh.py +35 -21
euroeval/benchmark_modules/hf.py +280 -244
euroeval/benchmark_modules/litellm.py +752 -312
euroeval/benchmark_modules/vllm.py +570 -268
euroeval/benchmarker.py +651 -528
euroeval/caching_utils.py +79 -0
euroeval/callbacks.py +5 -7
euroeval/cli.py +49 -38
euroeval/constants.py +44 -25
euroeval/data_loading.py +111 -55
euroeval/data_models.py +490 -323
euroeval/dataset_configs/__init__.py +26 -4
euroeval/dataset_configs/bosnian.py +39 -0
euroeval/dataset_configs/bulgarian.py +56 -0
euroeval/dataset_configs/croatian.py +56 -0
euroeval/dataset_configs/czech.py +75 -0
euroeval/dataset_configs/danish.py +78 -50
euroeval/dataset_configs/dutch.py +74 -44
euroeval/dataset_configs/english.py +71 -36
euroeval/dataset_configs/estonian.py +111 -0
euroeval/dataset_configs/faroese.py +25 -18
euroeval/dataset_configs/finnish.py +63 -26
euroeval/dataset_configs/french.py +65 -32
euroeval/dataset_configs/german.py +77 -36
euroeval/dataset_configs/greek.py +64 -0
euroeval/dataset_configs/icelandic.py +68 -57
euroeval/dataset_configs/italian.py +68 -36
euroeval/dataset_configs/latvian.py +87 -0
euroeval/dataset_configs/lithuanian.py +64 -0
euroeval/dataset_configs/norwegian.py +98 -72
euroeval/dataset_configs/polish.py +96 -0
euroeval/dataset_configs/portuguese.py +63 -40
euroeval/dataset_configs/serbian.py +64 -0
euroeval/dataset_configs/slovak.py +55 -0
euroeval/dataset_configs/slovene.py +56 -0
euroeval/dataset_configs/spanish.py +68 -34
euroeval/dataset_configs/swedish.py +82 -41
euroeval/dataset_configs/ukrainian.py +64 -0
euroeval/enums.py +12 -6
euroeval/exceptions.py +21 -1
euroeval/finetuning.py +34 -26
euroeval/generation.py +76 -41
euroeval/generation_utils.py +169 -34
euroeval/languages.py +1020 -188
euroeval/logging_utils.py +268 -0
euroeval/metrics/__init__.py +6 -0
euroeval/metrics/base.py +85 -0
euroeval/metrics/huggingface.py +216 -0
euroeval/metrics/llm_as_a_judge.py +260 -0
euroeval/metrics/pipeline.py +289 -0
euroeval/metrics/speed.py +48 -0
euroeval/model_cache.py +40 -21
euroeval/model_config.py +4 -5
euroeval/model_loading.py +3 -0
euroeval/prompt_templates/__init__.py +2 -0
euroeval/prompt_templates/classification.py +206 -0
euroeval/prompt_templates/linguistic_acceptability.py +157 -22
euroeval/prompt_templates/multiple_choice.py +159 -17
euroeval/prompt_templates/named_entity_recognition.py +318 -21
euroeval/prompt_templates/reading_comprehension.py +207 -16
euroeval/prompt_templates/sentiment_classification.py +205 -22
euroeval/prompt_templates/summarization.py +122 -22
euroeval/prompt_templates/token_classification.py +279 -0
euroeval/scores.py +20 -9
euroeval/speed_benchmark.py +11 -12
euroeval/task_group_utils/multiple_choice_classification.py +21 -12
euroeval/task_group_utils/question_answering.py +101 -73
euroeval/task_group_utils/sequence_classification.py +144 -61
euroeval/task_group_utils/text_to_text.py +33 -12
euroeval/task_group_utils/token_classification.py +86 -89
euroeval/tasks.py +75 -16
euroeval/tokenisation_utils.py +603 -0
euroeval/types.py +17 -11
euroeval/utils.py +332 -137
euroeval-16.7.1.dist-info/METADATA +623 -0
euroeval-16.7.1.dist-info/RECORD +84 -0
{euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
euroeval/human_evaluation.py +0 -737
euroeval/metrics.py +0 -452
euroeval/tokenization_utils.py +0 -498
euroeval-15.12.0.dist-info/METADATA +0 -285
euroeval-15.12.0.dist-info/RECORD +0 -63
{euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
{euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0

euroeval/utils.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """Utility functions to be used in other scripts."""
 import asyncio
+import collections.abc as c
 import gc
 import importlib
 import importlib.metadata
@@ -8,34 +9,28 @@ import importlib.util
 import logging
 import os
 import random
+import re
+import socket
 import sys
 import typing as t
-import warnings
-from functools import cache
 from pathlib import Path
+from types import ModuleType
-import litellm
+import demjson3
+import huggingface_hub as hf_hub
 import numpy as np
-import requests
 import torch
-from datasets.utils import disable_progress_bar
-from requests.exceptions import RequestException
-from transformers import logging as tf_logging
-from .exceptions import NaNValueInModelOutput
-if importlib.util.find_spec("ray") is not None:
-    import ray
+from .caching_utils import cache_arguments
+from .constants import T
+from .exceptions import InvalidBenchmark, InvalidModel, NaNValueInModelOutput
+from .logging_utils import log, log_once
 if t.TYPE_CHECKING:
-    from types import TracebackType
+    from .data_models import ModelIdComponents
     from .types import Predictions
-logger = logging.getLogger("euroeval")
 def create_model_cache_dir(cache_dir: str, model_id: str) -> str:
     """Create cache directory for a model.
@@ -54,6 +49,72 @@ def create_model_cache_dir(cache_dir: str, model_id: str) -> str:
     return str(cache_dir_path)
def resolve_model_path(download_dir: str) -> str:
    """Resolve the path to the directory containing the model config files and weights.

    The files are looked up under
    `<download_dir>/models--<last component of download_dir>/snapshots`, which is
    where the Hugging Face Hub `cache_dir` places them.

    Args:
        download_dir:
            The download directory. Its last path component is used as the 'path
            safe' version of the model ID.

    Returns:
        The path to the model. This is the directory containing `config.json`, unless
        the model files are spread over several directories, in which case it is a
        `model_files` directory (next to the snapshot directories) containing
        symlinks to all of the files.

    Raises:
        InvalidModel:
            If the model path is not valid, or if required files are missing.
    """
    download_path = Path(download_dir)

    # Get the 'path safe' version of the model id, which is the last dir in the path
    model_id_path = download_path.name

    # `removeprefix` removes the literal prefix only; `strip("models--")` would
    # remove any of the characters 'm', 'o', 'd', 'e', 'l', 's' and '-' from both
    # ends of the name and thus mangle it in the error messages
    model_name = model_id_path.removeprefix("models--")

    # Hf hub `cache_dir` puts the files in models--`model_id_path`/snapshots
    model_path = download_path / f"models--{model_id_path}" / "snapshots"
    if not model_path.exists():
        raise InvalidModel(
            f"Attempted to load models from the {model_path} directory, "
            "but it does not exist."
        )

    # Get all files in the model path
    found_files = [
        found_file for found_file in model_path.rglob("*") if found_file.is_file()
    ]
    if not found_files:
        raise InvalidModel(f"No model files found at {model_path}")

    # Make sure that there aren't multiples of the files found. The check must fire
    # when the number of unique paths DIFFERS from the number of paths; comparing
    # with `==` rejects every valid model, since the paths are normally all distinct
    if len(found_files) != len(set(found_files)):
        raise InvalidModel(
            f"Found multiple model config files for {model_name} at {model_path}"
        )

    # Check that found_files contains at least a 'config.json'
    config_file = next(
        (file for file in found_files if file.name == "config.json"), None
    )
    if config_file is None:
        raise InvalidModel(
            f"Missing required file 'config.json' for {model_name} at {model_path}"
        )
    model_path = config_file.parent

    # As a precaution we also check that all of the files are in the same directory
    # if not we create a new dir with symlinks to all of the files from all snapshots
    # this is especially useful for vllm where we can only specify one folder and e.g.,
    # the safetensors version of the weights was added in an unmerged PR
    first_parent = found_files[0].parent
    if any(found_file.parent != first_parent for found_file in found_files):
        new_model_path = model_path.parent / "model_files"
        new_model_path.mkdir(exist_ok=True)
        for found_file in found_files:
            link_path = new_model_path / found_file.name
            # The link can already exist, either from an earlier call or because
            # the same file name occurs in several directories. `symlink_to` would
            # raise `FileExistsError` in that case, so keep the existing link
            if link_path.exists() or link_path.is_symlink():
                continue
            link_path.symlink_to(found_file)
        model_path = new_model_path

    return str(model_path)
 def clear_memory() -> None:
     """Clears the memory of unused items."""
     for gc_generation in range(3):
@@ -84,67 +145,9 @@ def enforce_reproducibility(seed: int = 4242) -> np.random.Generator:
     return rng
-def block_terminal_output() -> None:
-    """Blocks libraries from writing output to the terminal.
-    This filters warnings from some libraries, sets the logging level to ERROR for some
-    libraries, disabled tokeniser progress bars when using Hugging Face tokenisers, and
-    disables most of the logging from the `transformers` library.
-    """
-    # Ignore miscellaneous warnings
-    warnings.filterwarnings("ignore", category=UserWarning)
-    warnings.filterwarnings("ignore", category=FutureWarning)
-    warnings.filterwarnings(
-        "ignore",
-        module="torch.nn.parallel*",
-        message="Was asked to gather along dimension 0, but all input tensors were "
-        "scalars; will instead unsqueeze and return a vector.",
-    )
-    warnings.filterwarnings("ignore", module="seqeval*")
-    # Up the logging level, to disable outputs
-    logging.getLogger("filelock").setLevel(logging.CRITICAL)
-    logging.getLogger("absl").setLevel(logging.CRITICAL)
-    logging.getLogger("datasets").setLevel(logging.CRITICAL)
-    logging.getLogger("openai").setLevel(logging.CRITICAL)
-    logging.getLogger("torch.distributed.distributed_c10d").setLevel(logging.CRITICAL)
-    logging.getLogger("torch.nn.parallel.distributed").setLevel(logging.CRITICAL)
-    logging.getLogger("vllm").setLevel(logging.CRITICAL)
-    logging.getLogger("vllm.engine.llm_engine").setLevel(logging.CRITICAL)
-    logging.getLogger("vllm.transformers_utils.tokenizer").setLevel(logging.CRITICAL)
-    logging.getLogger("vllm.core.scheduler").setLevel(logging.CRITICAL)
-    logging.getLogger("vllm.model_executor.weight_utils").setLevel(logging.CRITICAL)
-    logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
-    logging.getLogger("httpx").setLevel(logging.CRITICAL)
-    logging.getLogger("ray._private.worker").setLevel(logging.CRITICAL)
-    logging.getLogger("ray._private.services").setLevel(logging.CRITICAL)
-    logging.getLogger("matplotlib.font_manager").setLevel(logging.CRITICAL)
-    logging.getLogger("accelerate").setLevel(logging.CRITICAL)
-    logging.getLogger("LiteLLM").setLevel(logging.CRITICAL)
-    logging.getLogger("LiteLLM Router").setLevel(logging.CRITICAL)
-    logging.getLogger("LiteLLM Proxy").setLevel(logging.CRITICAL)
-    logging.getLogger("huggingface_hub").setLevel(logging.CRITICAL)
-    # This suppresses vLLM logging
-    os.environ["LOG_LEVEL"] = "CRITICAL"
-    os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
-    if importlib.util.find_spec("ray") is not None:
-        ray._private.worker._worker_logs_enabled = False
-    # Disable the tokeniser progress bars
-    disable_progress_bar()
-    # Disable most of the `transformers` logging
-    tf_logging._default_log_level = logging.CRITICAL
-    tf_logging.set_verbosity(logging.CRITICAL)
-    logging.getLogger("transformers.trainer").setLevel(logging.CRITICAL)
-    # Disable logging from `litellm`
-    litellm.suppress_debug_info = True
-def get_class_by_name(class_name: str | list[str], module_name: str) -> t.Type | None:
+def get_class_by_name(
+    class_name: str | c.Sequence[str], module_name: str
+) -> t.Type | None:
     """Get a class by its name.
     Args:
@@ -173,9 +176,10 @@ def get_class_by_name(class_name: str | list[str], module_name: str) -> t.Type |
     if error_messages:
         errors = "\n- " + "\n- ".join(error_messages)
-        logger.debug(
+        log(
             f"Could not find the class with the name(s) {', '.join(class_name)}. The "
-            f"following error messages were raised: {errors}"
+            f"following error messages were raised: {errors}",
+            level=logging.DEBUG,
         )
     # If the class could not be found, return None
@@ -197,40 +201,27 @@ def get_min_cuda_compute_capability() -> float | None:
     return float(f"{major}.{minor}")
+@cache_arguments(disable_condition=lambda: hasattr(sys, "_called_from_test"))
 def internet_connection_available() -> bool:
     """Checks if internet connection is available by pinging google.com.
     Returns:
         Whether or not internet connection is available.
     """
-    try:
-        requests.get("https://www.google.com")
-        return True
-    except RequestException:
-        return False
-class HiddenPrints:
-    """Context manager which removes all terminal output."""
+    internet_available: bool = False
-    def __enter__(self) -> None:
-        """Enter the context manager."""
-        self._original_stdout = sys.stdout
-        self._original_stderr = sys.stderr
-        sys.stdout = open(os.devnull, "w")
-        sys.stderr = open(os.devnull, "w")
+    try:
+        s = socket.create_connection(("1.1.1.1", 80))
+        s.close()
+        internet_available = True
+    except OSError:
+        pass
+    except Exception as e:
+        pytest_socket_errors = ["SocketConnectBlockedError", "SocketBlockedError"]
+        if type(e).__name__ not in pytest_socket_errors:
+            raise e
-    def __exit__(
-        self,
-        exc_type: t.Type[BaseException],
-        exc_val: BaseException,
-        exc_tb: "TracebackType",
-    ) -> None:
-        """Exit the context manager."""
-        sys.stdout.close()
-        sys.stderr.close()
-        sys.stdout = self._original_stdout
-        sys.stderr = self._original_stderr
+    return internet_available
 def raise_if_model_output_contains_nan_values(model_output: "Predictions") -> None:
@@ -288,34 +279,6 @@ def unscramble(scrambled_text: str) -> str:
     return unscrambled
-@cache
-def log_once(message: str, level: int = logging.INFO) -> None:
-    """Log a message once.
-    This is ensured by caching the input/output pairs of this function, using the
-    `functools.cache` decorator.
-    Args:
-        message:
-            The message to log.
-        level:
-            The logging level. Defaults to logging.INFO.
-    """
-    match level:
-        case logging.DEBUG:
-            logger.debug(message)
-        case logging.INFO:
-            logger.info(message)
-        case logging.WARNING:
-            logger.warning(message)
-        case logging.ERROR:
-            logger.error(message)
-        case logging.CRITICAL:
-            logger.critical(message)
-        case _:
-            raise ValueError(f"Invalid logging level: {level}")
 def get_package_version(package_name: str) -> str | None:
     """Get the version of a package.
@@ -332,9 +295,6 @@ def get_package_version(package_name: str) -> str | None:
         return None
-T = t.TypeVar("T", bound=object)
 def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
     """Run a coroutine, ensuring that the event loop is always closed when we're done.
@@ -348,7 +308,8 @@ def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
     loop = asyncio.new_event_loop()
     try:
         asyncio.set_event_loop(loop)
-        return loop.run_until_complete(coroutine)
+        response = loop.run_until_complete(coroutine)
+        return response
     finally:
         loop.close()
         asyncio.set_event_loop(None)
@@ -373,3 +334,237 @@ async def add_semaphore_and_catch_exception(
             return await coroutine
         except Exception as exc:
             return exc
def extract_json_dict_from_string(s: str) -> dict | None:
    """Extract a JSON dictionary from a string.

    The first brace-delimited span without nested braces is located and decoded
    with `demjson3`. Every failure is logged at debug level and results in None.

    Args:
        s:
            The string to extract the JSON dictionary from.

    Returns:
        The extracted JSON dictionary, or None if no JSON dictionary could be found.
    """
    # Only a flat `{...}` span is matched; DOTALL is passed so the flags stay
    # identical, although the character class already matches newlines
    dict_match = re.search(pattern=r"\{[^{}]*?\}", string=s, flags=re.DOTALL)
    if dict_match is None:
        log(
            "The model output does not contain any JSON dictionary, so cannot parse "
            f"it. Skipping. Here is the output: {s!r}",
            level=logging.DEBUG,
        )
        return None

    json_string = dict_match.group()
    try:
        json_output = demjson3.decode(txt=json_string)
    except demjson3.JSONDecodeError:
        log(
            "The model output is not valid JSON, so cannot parse it. Skipping. "
            f"Here is the output: {json_string!r}",
            level=logging.DEBUG,
        )
        return None

    if not isinstance(json_output, dict):
        log(
            "The model output is not a JSON dictionary, so cannot parse "
            f"it. Skipping. Here is the output: {json_string!r}",
            level=logging.DEBUG,
        )
        return None

    # Reject dictionaries that have at least one non-string key
    if any(not isinstance(key, str) for key in json_output):
        log(
            "The model output is not a JSON dictionary with string keys, "
            "so cannot parse it. Skipping. Here is the output: "
            f"{json_string!r}",
            level=logging.DEBUG,
        )
        return None

    return json_output
@cache_arguments()
def get_hf_token(api_key: str | None) -> str | bool:
    """Get the Hugging Face token.

    The sources are tried in order: the `api_key` argument, the
    `HUGGINGFACE_API_KEY` environment variable, and finally the local Hugging Face
    login.

    Args:
        api_key:
            The API key to use as the Hugging Face token. If None, we will try to
            extract it in other ways.

    Returns:
        The Hugging Face token, or True if no token is set but the user is logged in, or
        False if no token is set and the user is not logged in.
    """
    # 1. An explicitly passed key always wins
    if api_key is not None:
        log_once(
            "Using the Hugging Face API key passed to the function.",
            level=logging.DEBUG,
        )
        return api_key

    # 2. Fall back to the environment variable
    env_token = os.getenv("HUGGINGFACE_API_KEY")
    if env_token is not None:
        log_once(
            "Using the Hugging Face API key from the environment variable "
            "`HUGGINGFACE_API_KEY`.",
            level=logging.DEBUG,
        )
        return env_token

    # 3. Fall back to the local login, which `whoami` probes for us
    try:
        hf_hub.whoami()
    except hf_hub.errors.LocalTokenNotFoundError:
        log_once(
            "No Hugging Face API key was set and the user is not logged in to Hugging "
            "Face, so no token will be used.",
            level=logging.DEBUG,
        )
        return False

    log_once(
        "No Hugging Face API key was set, but the user is logged in to Hugging "
        "Face, so using the local token.",
        level=logging.DEBUG,
    )
    return True
def extract_multiple_choice_labels(
    prompt: str, candidate_labels: c.Sequence[str]
) -> c.Sequence[str]:
    """Extract multiple choice labels from a prompt.

    A candidate label counts as present if it occurs in the prompt at a word
    boundary, immediately followed by a dot and a space (e.g., 'a. '). Matching is
    case-insensitive.

    Args:
        prompt:
            The prompt to extract the labels from.
        candidate_labels:
            The candidate labels to look for in the prompt.

    Returns:
        The extracted labels, in the order they appear in `candidate_labels`.

    Raises:
        InvalidBenchmark:
            If none of the candidate labels could be found in the prompt.
    """
    # The label is escaped so that it is always matched literally; labels containing
    # regex metacharacters (e.g., '+', '(' or '.') would otherwise be interpreted as
    # part of the pattern and either match the wrong text or fail to compile
    sample_candidate_labels: list[str] = [
        candidate_label
        for candidate_label in candidate_labels
        if re.search(
            pattern=rf"\b{re.escape(candidate_label)}\. ",
            string=prompt,
            flags=re.IGNORECASE,
        )
        is not None
    ]

    if not sample_candidate_labels:
        raise InvalidBenchmark(
            "Could not extract any candidate labels from the prompt. Please ensure "
            "that the candidate labels are present in the prompt, each followed by a "
            "dot and a space (e.g., 'a. '). The candidate labels are: "
            f"{', '.join(candidate_labels)}. Here is the prompt: {prompt!r}"
        )
    return sample_candidate_labels
def split_model_id(model_id: str) -> "ModelIdComponents":
    """Split a model ID into its components.

    The ID has the form `<model_id>[@<revision>][#<param>]`, where neither part may
    contain '@' or '#'.

    Args:
        model_id:
            The model ID to split.

    Returns:
        The split model ID. The revision defaults to "main" and the param to None
        when they are not part of the ID.

    Raises:
        InvalidModel:
            If the model ID is not valid.
    """
    # Importing here to avoid circular imports
    from .data_models import ModelIdComponents

    # The base ID is everything up to the first '@' or '#'; without it the model ID
    # is not valid
    base_match = re.match(pattern=r"^[^@#]+", string=model_id)
    if base_match is None:
        raise InvalidModel(f"The model ID {model_id!r} is not valid.")

    # The revision follows an '@' and the param follows a '#'
    revision_match = re.search(pattern=r"@([^@#]+)", string=model_id)
    param_match = re.search(pattern=r"#([^@#]+)", string=model_id)

    revision = "main" if revision_match is None else revision_match.group(1)
    param = None if param_match is None else param_match.group(1)
    return ModelIdComponents(
        model_id=base_match.group(), revision=revision, param=param
    )
+def load_custom_datasets_module() -> ModuleType | None:
+    """Load the custom datasets module if it exists.
+    Raises:
+        RuntimeError:
+            If the custom datasets module cannot be loaded.
+    """
+    custom_datasets_file = Path("custom_datasets.py")
+    if custom_datasets_file.exists():
+        spec = importlib.util.spec_from_file_location(
+            name="custom_datasets_module", location=str(custom_datasets_file.resolve())
+        )
+        if spec is None:
+            log_once(
+                "Could not load the spec for the custom datasets file from "
+                f"{custom_datasets_file.resolve()}.",
+                level=logging.ERROR,
+            )
+            return None
+        module = importlib.util.module_from_spec(spec=spec)
+        if spec.loader is None:
+            log_once(
+                "Could not load the module for the custom datasets file from "
+                f"{custom_datasets_file.resolve()}.",
+                level=logging.ERROR,
+            )
+            return None
+        spec.loader.exec_module(module)
+        return module
+    return None
+class flash_attention_backend:
+    """Context manager to temporarily set the flash attention backend.
+    This sets the `VLLM_ATTENTION_BACKEND` environment variable to `FLASH_ATTN`
+    for the duration of the context manager, and restores the previous value afterwards.
+    """
+    def __init__(self, disabled: bool = False) -> None:
+        """Initialise the context manager.
+        Args:
+            disabled:
+                If True, this context manager does nothing.
+        """
+        self.disabled = (
+            True if disabled else os.environ["VLLM_ATTENTION_BACKEND"] != "FLASHINFER"
+        )
+        self.previous_value: str | None = None
+    def __enter__(self) -> None:
+        """Enter the context manager."""
+        if self.disabled:
+            return
+        self.previous_value = os.getenv("VLLM_ATTENTION_BACKEND")
+        os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"
+    def __exit__(
+        self,
+        exc_type: t.Type[BaseException] | None,
+        exc_value: BaseException | None,
+        traceback: type[BaseException] | None,
+    ) -> None:
+        """Exit the context manager.
+        Args:
+            exc_type:
+                The type of the exception.
+            exc_value:
+                The value of the exception.
+            exc_tb:
+                The traceback of the exception.
+        """
+        if self.disabled:
+            return
+        if self.previous_value is None:
+            os.environ.pop("VLLM_ATTENTION_BACKEND", None)
+        else:
+            os.environ["VLLM_ATTENTION_BACKEND"] = self.previous_value

EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl

EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl