ScandEval 16.11.0-py3-none-any.whl → 16.13.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. scandeval/__init__.py +0 -9
  2. scandeval/async_utils.py +46 -0
  3. scandeval/benchmark_config_factory.py +31 -2
  4. scandeval/benchmark_modules/fresh.py +2 -1
  5. scandeval/benchmark_modules/hf.py +76 -23
  6. scandeval/benchmark_modules/litellm.py +33 -15
  7. scandeval/benchmark_modules/vllm.py +97 -44
  8. scandeval/benchmarker.py +29 -33
  9. scandeval/cli.py +11 -0
  10. scandeval/constants.py +36 -2
  11. scandeval/custom_dataset_configs.py +152 -0
  12. scandeval/data_loading.py +87 -31
  13. scandeval/data_models.py +405 -224
  14. scandeval/dataset_configs/__init__.py +51 -25
  15. scandeval/dataset_configs/albanian.py +1 -1
  16. scandeval/dataset_configs/belarusian.py +47 -0
  17. scandeval/dataset_configs/bulgarian.py +1 -1
  18. scandeval/dataset_configs/catalan.py +1 -1
  19. scandeval/dataset_configs/croatian.py +1 -1
  20. scandeval/dataset_configs/danish.py +3 -2
  21. scandeval/dataset_configs/dutch.py +16 -5
  22. scandeval/dataset_configs/english.py +4 -3
  23. scandeval/dataset_configs/estonian.py +8 -7
  24. scandeval/dataset_configs/faroese.py +1 -1
  25. scandeval/dataset_configs/finnish.py +5 -4
  26. scandeval/dataset_configs/french.py +6 -5
  27. scandeval/dataset_configs/german.py +4 -3
  28. scandeval/dataset_configs/greek.py +1 -1
  29. scandeval/dataset_configs/hungarian.py +1 -1
  30. scandeval/dataset_configs/icelandic.py +4 -3
  31. scandeval/dataset_configs/italian.py +4 -3
  32. scandeval/dataset_configs/latvian.py +2 -2
  33. scandeval/dataset_configs/lithuanian.py +1 -1
  34. scandeval/dataset_configs/norwegian.py +6 -5
  35. scandeval/dataset_configs/polish.py +4 -3
  36. scandeval/dataset_configs/portuguese.py +5 -4
  37. scandeval/dataset_configs/romanian.py +2 -2
  38. scandeval/dataset_configs/serbian.py +1 -1
  39. scandeval/dataset_configs/slovene.py +1 -1
  40. scandeval/dataset_configs/spanish.py +4 -3
  41. scandeval/dataset_configs/swedish.py +4 -3
  42. scandeval/dataset_configs/ukrainian.py +1 -1
  43. scandeval/generation_utils.py +6 -6
  44. scandeval/metrics/__init__.py +1 -0
  45. scandeval/metrics/bias.py +237 -0
  46. scandeval/metrics/huggingface.py +2 -1
  47. scandeval/metrics/llm_as_a_judge.py +1 -1
  48. scandeval/metrics/pipeline.py +1 -1
  49. scandeval/model_cache.py +34 -4
  50. scandeval/prompt_templates/linguistic_acceptability.py +9 -0
  51. scandeval/prompt_templates/multiple_choice.py +9 -0
  52. scandeval/prompt_templates/named_entity_recognition.py +21 -0
  53. scandeval/prompt_templates/reading_comprehension.py +10 -0
  54. scandeval/prompt_templates/sentiment_classification.py +11 -0
  55. scandeval/string_utils.py +157 -0
  56. scandeval/task_group_utils/sequence_classification.py +2 -5
  57. scandeval/task_group_utils/token_classification.py +2 -4
  58. scandeval/tasks.py +22 -0
  59. scandeval/tokenisation_utils.py +12 -1
  60. scandeval/utils.py +13 -383
  61. scandeval-16.13.0.dist-info/METADATA +334 -0
  62. scandeval-16.13.0.dist-info/RECORD +94 -0
  63. scandeval-16.11.0.dist-info/METADATA +0 -649
  64. scandeval-16.11.0.dist-info/RECORD +0 -89
  65. {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
  66. {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
  67. {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/__init__.py CHANGED
@@ -110,15 +110,6 @@ os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
  os.environ["VLLM_USE_V1"] = "1"


- # Use the FlashInfer flash-attention backend for vLLM, unless the user has already
- # specified a different backend.
- if os.getenv("VLLM_ATTENTION_BACKEND") is None:
-     os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
-     os.environ["USER_HAS_SET_VLLM_ATTENTION_BACKEND"] = "0"
- else:
-     os.environ["USER_HAS_SET_VLLM_ATTENTION_BACKEND"] = "1"
-
-
  # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
  # former and LiteLLM uses the latter
  if os.getenv("HUGGINGFACE_API_KEY"):
scandeval/async_utils.py ADDED
@@ -0,0 +1,46 @@
+ """Utility functions for asyncronous tasks."""
+
+ import asyncio
+ import typing as t
+
+ from .constants import T
+
+
+ def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
+     """Run a coroutine, ensuring that the event loop is always closed when we're done.
+
+     Args:
+         coroutine:
+             The coroutine to run.
+
+     Returns:
+         The result of the coroutine.
+     """
+     try:
+         loop = asyncio.get_event_loop()
+     except RuntimeError:  # If the current event loop is closed
+         loop = asyncio.new_event_loop()
+         asyncio.set_event_loop(loop)
+     response = loop.run_until_complete(coroutine)
+     return response
+
+
+ async def add_semaphore_and_catch_exception(
+     coroutine: t.Coroutine[t.Any, t.Any, T], semaphore: asyncio.Semaphore
+ ) -> T | Exception:
+     """Run a coroutine with a semaphore.
+
+     Args:
+         coroutine:
+             The coroutine to run.
+         semaphore:
+             The semaphore to use.
+
+     Returns:
+         The result of the coroutine.
+     """
+     async with semaphore:
+         try:
+             return await coroutine
+         except Exception as exc:
+             return exc
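A minimal usage sketch of the two new helpers, with a hypothetical `fetch` coroutine standing in for a real API call; `safe_run` drives the event loop from synchronous code, while the semaphore wrapper bounds concurrency and returns per-task exceptions as values instead of raising:

```python
import asyncio

from scandeval.async_utils import add_semaphore_and_catch_exception, safe_run


async def fetch(i: int) -> int:
    """Hypothetical workload standing in for an API request."""
    await asyncio.sleep(0.01)
    return i * 2


async def run_all() -> list[int | Exception]:
    semaphore = asyncio.Semaphore(5)  # at most 5 requests in flight at once
    tasks = [
        add_semaphore_and_catch_exception(fetch(i), semaphore=semaphore)
        for i in range(20)
    ]
    return await asyncio.gather(*tasks)


results = safe_run(run_all())  # failures come back as Exception values
```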
scandeval/benchmark_config_factory.py CHANGED
@@ -1,6 +1,7 @@
  """Factory class for creating dataset configurations."""

  import collections.abc as c
+ import importlib.util
  import sys
  import typing as t
  from pathlib import Path
@@ -13,6 +14,9 @@ from .enums import Device
  from .exceptions import InvalidBenchmark
  from .languages import get_all_languages

+ if importlib.util.find_spec("vllm") is not None:
+     pass
+
  if t.TYPE_CHECKING:
      from .data_models import Language

@@ -42,6 +46,8 @@ def build_benchmark_config(
          dataset=benchmark_config_params.dataset,
          languages=languages,
          custom_datasets_file=benchmark_config_params.custom_datasets_file,
+         api_key=benchmark_config_params.api_key,
+         cache_dir=Path(benchmark_config_params.cache_dir),
      )

      return BenchmarkConfig(
@@ -68,6 +74,7 @@ def build_benchmark_config(
          api_base=benchmark_config_params.api_base,
          api_version=benchmark_config_params.api_version,
          gpu_memory_utilization=benchmark_config_params.gpu_memory_utilization,
+         attention_backend=benchmark_config_params.attention_backend,
          generative_type=benchmark_config_params.generative_type,
          debug=benchmark_config_params.debug,
          run_with_cli=benchmark_config_params.run_with_cli,
@@ -154,7 +161,9 @@ def prepare_dataset_configs(
      languages: c.Sequence["Language"],
      dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None",
      custom_datasets_file: Path,
- ) -> c.Sequence["DatasetConfig"]:
+     api_key: str | None,
+     cache_dir: Path,
+ ) -> list["DatasetConfig"]:
      """Prepare dataset config(s) for benchmarking.

      Args:
@@ -168,6 +177,10 @@ def prepare_dataset_configs(
              included, limited by the `task` and `languages` parameters.
          custom_datasets_file:
              A path to a Python file containing custom dataset configurations.
+         api_key:
+             The API key to use for accessing the Hugging Face Hub.
+         cache_dir:
+             The directory to store the cache in.

      Returns:
          The prepared dataset configs.
@@ -176,9 +189,25 @@
          InvalidBenchmark:
              If the task or dataset is not found in the benchmark tasks or datasets.
      """
+     # Extract the dataset IDs from the `dataset` argument
+     dataset_ids: list[str] = list()
+     if isinstance(dataset, str):
+         dataset_ids.append(dataset)
+     elif isinstance(dataset, DatasetConfig):
+         dataset_ids.append(dataset.name)
+     elif isinstance(dataset, list):
+         for d in dataset:
+             if isinstance(d, str):
+                 dataset_ids.append(d)
+             elif isinstance(d, DatasetConfig):
+                 dataset_ids.append(d.name)
+
      # Create the list of dataset configs
      all_dataset_configs = get_all_dataset_configs(
-         custom_datasets_file=custom_datasets_file
+         custom_datasets_file=custom_datasets_file,
+         dataset_ids=dataset_ids,
+         api_key=api_key,
+         cache_dir=cache_dir,
      )
      all_official_dataset_configs: c.Sequence[DatasetConfig] = [
          dataset_config
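The new ID-extraction block normalises the polymorphic `dataset` argument into a flat list of names before configs are resolved. A standalone sketch of the same branching, with a stub standing in for the real `DatasetConfig` data model (dataset names here are illustrative):

```python
from dataclasses import dataclass


@dataclass
class DatasetConfig:
    """Stub for scandeval.data_models.DatasetConfig; only `name` is needed here."""

    name: str


def extract_dataset_ids(dataset) -> list[str]:
    """Mirror of the branching in `prepare_dataset_configs` (sketch)."""
    dataset_ids: list[str] = []
    if isinstance(dataset, str):
        dataset_ids.append(dataset)
    elif isinstance(dataset, DatasetConfig):
        dataset_ids.append(dataset.name)
    elif isinstance(dataset, list):
        for d in dataset:
            if isinstance(d, str):
                dataset_ids.append(d)
            elif isinstance(d, DatasetConfig):
                dataset_ids.append(d.name)
    return dataset_ids


assert extract_dataset_ids("angry-tweets") == ["angry-tweets"]
assert extract_dataset_ids([DatasetConfig(name="scala-da"), "angry-tweets"]) == [
    "scala-da",
    "angry-tweets",
]
```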
scandeval/benchmark_modules/fresh.py CHANGED
@@ -28,8 +28,9 @@ from ..exceptions import (
  )
  from ..generation_utils import raise_if_wrong_params
  from ..logging_utils import block_terminal_output
+ from ..model_cache import create_model_cache_dir
  from ..types import Tokeniser
- from ..utils import create_model_cache_dir, get_hf_token
+ from ..utils import get_hf_token
  from .hf import (
      HuggingFaceEncoderModel,
      align_model_and_tokeniser,
scandeval/benchmark_modules/hf.py CHANGED
@@ -1,6 +1,7 @@
  """Encoder models from the Hugging Face Hub."""

  import collections.abc as c
+ import importlib
  import logging
  import re
  import typing as t
@@ -63,6 +64,8 @@ from ..exceptions import (
  from ..generation_utils import raise_if_wrong_params
  from ..languages import get_all_languages
  from ..logging_utils import block_terminal_output, log, log_once
+ from ..model_cache import create_model_cache_dir
+ from ..string_utils import split_model_id
  from ..task_group_utils import (
      multiple_choice_classification,
      question_answering,
@@ -70,13 +73,7 @@ from ..task_group_utils import (
  )
  from ..tokenisation_utils import get_bos_token, get_eos_token
  from ..types import Tokeniser
- from ..utils import (
-     create_model_cache_dir,
-     get_class_by_name,
-     get_hf_token,
-     internet_connection_available,
-     split_model_id,
- )
+ from ..utils import get_hf_token, internet_connection_available
  from .base import BenchmarkModule

  try:
@@ -381,7 +378,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
          if "label" in examples:
              try:
                  examples["label"] = [
-                     self._model.config.label2id[lbl.lower()]
+                     self._model.config.label2id[str(lbl).lower()]
                      if self._model.config.label2id is not None
                      else lbl
                      for lbl in examples["label"]
@@ -758,20 +755,30 @@ def get_model_repo_info(
      # model info object.
      model_info: HfApiModelInfo | None = None
      if Path(model_id).is_dir():
-         if all(
-             (Path(model_id) / required_file).exists()
-             for required_file in LOCAL_MODELS_REQUIRED_FILES
-         ):
+         if Path(model_id, "config.json").exists():
              log_once(
-                 f"The local model directory {model_id!r} has all the required model "
-                 f"files ({LOCAL_MODELS_REQUIRED_FILES}), so we're skipping looking up "
-                 "model information from the Hugging Face Hub.",
+                 f"The local model directory {model_id!r} has a 'config.json' file, so "
+                 "we're skipping looking up model information from the Hugging Face "
+                 "Hub.",
                  level=logging.DEBUG,
              )
              model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
+         elif Path(model_id, "adapter_config.json").exists():
+             log_once(
+                 f"The local model directory {model_id!r} has an 'adapter_config.json' "
+                 "file, so we're skipping looking up model information from the Hugging "
+                 "Face Hub.",
+                 level=logging.DEBUG,
+             )
+             model_info = HfApiModelInfo(
+                 id=model_id,
+                 tags=None,
+                 pipeline_tag=None,
+                 siblings=[dict(rfilename="adapter_config.json")],
+             )
          else:
              log_once(
-                 f"The local model directory {model_id} does not contain all the "
+                 f"The local model directory {model_id} does not contain any of the "
                  f"required files: {LOCAL_MODELS_REQUIRED_FILES}. Skipping this "
                  f"model.",
                  level=logging.WARNING,
@@ -807,8 +814,8 @@
              log(
                  f"Could not access the model {model_id} with the revision "
                  f"{revision}. The error was {str(e)!r}. Please set the "
-                 "`HUGGINGFACE_API_KEY` environment variable or use the "
-                 "`--api-key` argument.",
+                 "`HUGGINGFACE_API_KEY` or `HF_TOKEN` environment variable or "
+                 "use the `--api-key` argument.",
                  level=logging.DEBUG,
              )
              return None
@@ -876,8 +883,9 @@
          for tag in GENERATIVE_PIPELINE_TAGS
          for class_name in TASK_MAPPING.get(tag, dict()).values()  # type: ignore[attr-defined]
      ]
-     if class_names is not None and any(
-         class_name in generative_class_names for class_name in class_names
+     if class_names is not None and (
+         any(class_name in generative_class_names for class_name in class_names)
+         or any("ForCausalLM" in class_name for class_name in class_names)
      ):
          pipeline_tag = "text-generation"
      else:
@@ -1084,8 +1092,8 @@
              f"The model {model_id!r} is a gated repository. Please ensure "
              "that you are logged in with `hf auth login` or have provided a "
              "valid Hugging Face access token with the `HUGGINGFACE_API_KEY` "
-             "environment variable or the `--api-key` argument. Also check that "
-             "your account has access to this model."
+             "or `HF_TOKEN` environment variable or the `--api-key` argument. "
+             "Also check that your account has access to this model."
          ) from e
      raise InvalidModel(
          f"Couldn't load model config for {model_id!r}. The error was "
@@ -1121,7 +1129,11 @@
      )

      # Ensure that the PAD token ID is set
-     if config.eos_token_id is not None and config.pad_token_id is None:
+     if (
+         hasattr(config, "eos_token_id")
+         and config.eos_token_id is not None
+         and (not hasattr(config, "pad_token_id") or config.pad_token_id is None)
+     ):
          if isinstance(config.eos_token_id, list):
              config.pad_token_id = config.eos_token_id[0]
          else:
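The widened guard also covers configs that lack the token-ID attributes altogether, and the list branch picks the first EOS token as padding. A self-contained sketch of that fallback rule, using a plain namespace in place of a real Hugging Face config object:

```python
from types import SimpleNamespace


def ensure_pad_token_id(config) -> None:
    """Set `pad_token_id` from `eos_token_id` when missing (sketch of the new guard)."""
    if (
        hasattr(config, "eos_token_id")
        and config.eos_token_id is not None
        and (not hasattr(config, "pad_token_id") or config.pad_token_id is None)
    ):
        if isinstance(config.eos_token_id, list):
            config.pad_token_id = config.eos_token_id[0]
        else:
            config.pad_token_id = config.eos_token_id


config = SimpleNamespace(eos_token_id=[128001, 128009], pad_token_id=None)
ensure_pad_token_id(config)
assert config.pad_token_id == 128001  # first EOS token becomes the pad token
```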
@@ -1319,3 +1331,44 @@ def task_group_to_class_name(task_group: TaskGroup) -> str:
      )
      pascal_case = special_case_mapping.get(pascal_case, pascal_case)
      return f"AutoModelFor{pascal_case}"
+
+
+ def get_class_by_name(
+     class_name: str | c.Sequence[str], module_name: str
+ ) -> t.Type | None:
+     """Get a class by its name.
+
+     Args:
+         class_name:
+             The name of the class, written in kebab-case. The corresponding class name
+             must be the same, but written in PascalCase, and lying in a module with the
+             same name, but written in snake_case. If a list of strings is passed, the
+             first class that is found is returned.
+         module_name:
+             The name of the module where the class is located.
+
+     Returns:
+         The class. If the class is not found, None is returned.
+     """
+     if isinstance(class_name, str):
+         class_name = [class_name]
+
+     error_messages = list()
+     for name in class_name:
+         try:
+             module = importlib.import_module(name=module_name)
+             class_: t.Type = getattr(module, name)
+             return class_
+         except (ModuleNotFoundError, AttributeError) as e:
+             error_messages.append(str(e))
+
+     if error_messages:
+         errors = "\n- " + "\n- ".join(error_messages)
+         log(
+             f"Could not find the class with the name(s) {', '.join(class_name)}. The "
+             f"following error messages were raised: {errors}",
+             level=logging.DEBUG,
+         )
+
+     # If the class could not be found, return None
+     return None
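Note that despite the kebab-case wording in the docstring, the lookup is a plain `getattr` on the imported module, so candidates should be passed exactly as they appear there. A usage sketch against the standard library rather than ScandEval's own modules:

```python
# Resolves the first matching candidate, or returns None if none exist.
cls = get_class_by_name(
    class_name=["NoSuchClass", "OrderedDict"], module_name="collections"
)
assert cls is not None and cls.__name__ == "OrderedDict"

# A miss is logged at DEBUG level and surfaces as None rather than an exception.
assert get_class_by_name("AlsoMissing", module_name="collections") is None
```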
scandeval/benchmark_modules/litellm.py CHANGED
@@ -40,7 +40,7 @@ from pydantic import ValidationError, conlist, create_model
  from requests.exceptions import RequestException
  from tqdm.asyncio import tqdm as tqdm_async

- from ..caching_utils import cache_arguments
+ from ..async_utils import add_semaphore_and_catch_exception, safe_run
  from ..constants import (
      JSON_STRIP_CHARACTERS,
      LITELLM_CLASSIFICATION_OUTPUT_KEY,
@@ -74,6 +74,8 @@ from ..generation_utils import (
      raise_if_wrong_params,
  )
  from ..logging_utils import get_pbar, log, log_once
+ from ..model_cache import create_model_cache_dir
+ from ..string_utils import split_model_id
  from ..task_group_utils import (
      question_answering,
      sequence_classification,
@@ -83,13 +85,7 @@ from ..task_group_utils import (
  from ..tasks import NER
  from ..tokenisation_utils import get_first_label_token_mapping
  from ..types import ExtractLabelsFunction
- from ..utils import (
-     add_semaphore_and_catch_exception,
-     create_model_cache_dir,
-     get_hf_token,
-     safe_run,
-     split_model_id,
- )
+ from ..utils import get_hf_token
  from .base import BenchmarkModule
  from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokeniser

@@ -700,10 +696,10 @@
          elif isinstance(
              error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
          ):
-             log_once(
-                 f"Service temporarily unavailable. The error message was: {error}. "
-                 "Retrying in 10 seconds...",
-                 level=logging.DEBUG,
+             log(
+                 "Service temporarily unavailable during generation. The error "
+                 f"message was: {error}. Retrying in 10 seconds...",
+                 level=logging.INFO,
              )
              return generation_kwargs, 10
          elif isinstance(error, UnsupportedParamsError):
@@ -764,6 +760,20 @@
                  run_with_cli=self.benchmark_config.run_with_cli,
              ) from error

+         if (
+             isinstance(error, (BadRequestError, NotFoundError))
+             and self.benchmark_config.api_base is not None
+             and not self.benchmark_config.api_base.endswith("/v1")
+         ):
+             log_once(
+                 f"The API base {self.benchmark_config.api_base!r} is not valid. We "
+                 "will try appending '/v1' to it and try again.",
+                 level=logging.DEBUG,
+             )
+             self.benchmark_config.api_base += "/v1"
+             generation_kwargs["api_base"] = self.benchmark_config.api_base
+             return generation_kwargs, 0
+
          raise InvalidBenchmark(
              f"Failed to generate text. The error message was: {error}"
          ) from error
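This retry path fixes up the configured base URL once and returns a zero-second delay, so the request is replayed immediately; it helps with OpenAI-compatible servers that expose their routes under `/v1`. The heuristic in isolation, as a sketch:

```python
def normalise_api_base(api_base: str) -> str:
    """Append '/v1' when it is missing (sketch of the retry fix-up)."""
    return api_base if api_base.endswith("/v1") else api_base + "/v1"


assert normalise_api_base("http://localhost:8000") == "http://localhost:8000/v1"
assert normalise_api_base("http://localhost:8000/v1") == "http://localhost:8000/v1"
```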
@@ -1390,9 +1400,10 @@
              InternalServerError,
          ) as e:
              log(
-                 f"Service temporarily unavailable. The error message was: {e}. "
+                 "Service temporarily unavailable while checking for model "
+                 f"existence of the model {model_id!r}. The error message was: {e}. "
                  "Retrying in 10 seconds...",
-                 level=logging.DEBUG,
+                 level=logging.INFO,
              )
              sleep(10)
          except APIError as e:
@@ -1567,7 +1578,6 @@

          return dataset

-     @cache_arguments()
      def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
          """Get the generation arguments for the model.

@@ -1865,6 +1875,14 @@ def clean_model_id(model_id: str, benchmark_config: BenchmarkConfig) -> str:
          else:
              prefix = "openai/"
          model_id = prefix + model_id
+
+     # When we want to evaluate an OpenAI model on a custom inference server, such as HF
+     # inference endpoints, LiteLLM gets confused since it's already using the `openai/`
+     # prefix. We thus have to add it twice, and this hack here is to ensure that we
+     # don't store the results with model ID `openai/openai/...`.
+     elif benchmark_config.api_base is not None and model_id.startswith("openai/"):
+         model_id = "openai/openai/" + re.sub(r"(openai/)*", "", model_id)
+
      return model_id

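The `re.sub` strips however many `openai/` prefixes are already present before exactly two are re-applied, so the result is stable no matter how often the branch runs. A quick check of that behaviour, with the branch mirrored as a standalone sketch:

```python
import re


def double_openai_prefix(model_id: str) -> str:
    """Mirror of the new branch in `clean_model_id` (sketch)."""
    return "openai/openai/" + re.sub(r"(openai/)*", "", model_id)


assert double_openai_prefix("openai/gpt-4o-mini") == "openai/openai/gpt-4o-mini"
assert double_openai_prefix("openai/openai/gpt-4o-mini") == "openai/openai/gpt-4o-mini"
```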