ScandEval 16.11.0-py3-none-any.whl → 16.13.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scandeval/__init__.py +0 -9
- scandeval/async_utils.py +46 -0
- scandeval/benchmark_config_factory.py +31 -2
- scandeval/benchmark_modules/fresh.py +2 -1
- scandeval/benchmark_modules/hf.py +76 -23
- scandeval/benchmark_modules/litellm.py +33 -15
- scandeval/benchmark_modules/vllm.py +97 -44
- scandeval/benchmarker.py +29 -33
- scandeval/cli.py +11 -0
- scandeval/constants.py +36 -2
- scandeval/custom_dataset_configs.py +152 -0
- scandeval/data_loading.py +87 -31
- scandeval/data_models.py +405 -224
- scandeval/dataset_configs/__init__.py +51 -25
- scandeval/dataset_configs/albanian.py +1 -1
- scandeval/dataset_configs/belarusian.py +47 -0
- scandeval/dataset_configs/bulgarian.py +1 -1
- scandeval/dataset_configs/catalan.py +1 -1
- scandeval/dataset_configs/croatian.py +1 -1
- scandeval/dataset_configs/danish.py +3 -2
- scandeval/dataset_configs/dutch.py +16 -5
- scandeval/dataset_configs/english.py +4 -3
- scandeval/dataset_configs/estonian.py +8 -7
- scandeval/dataset_configs/faroese.py +1 -1
- scandeval/dataset_configs/finnish.py +5 -4
- scandeval/dataset_configs/french.py +6 -5
- scandeval/dataset_configs/german.py +4 -3
- scandeval/dataset_configs/greek.py +1 -1
- scandeval/dataset_configs/hungarian.py +1 -1
- scandeval/dataset_configs/icelandic.py +4 -3
- scandeval/dataset_configs/italian.py +4 -3
- scandeval/dataset_configs/latvian.py +2 -2
- scandeval/dataset_configs/lithuanian.py +1 -1
- scandeval/dataset_configs/norwegian.py +6 -5
- scandeval/dataset_configs/polish.py +4 -3
- scandeval/dataset_configs/portuguese.py +5 -4
- scandeval/dataset_configs/romanian.py +2 -2
- scandeval/dataset_configs/serbian.py +1 -1
- scandeval/dataset_configs/slovene.py +1 -1
- scandeval/dataset_configs/spanish.py +4 -3
- scandeval/dataset_configs/swedish.py +4 -3
- scandeval/dataset_configs/ukrainian.py +1 -1
- scandeval/generation_utils.py +6 -6
- scandeval/metrics/__init__.py +1 -0
- scandeval/metrics/bias.py +237 -0
- scandeval/metrics/huggingface.py +2 -1
- scandeval/metrics/llm_as_a_judge.py +1 -1
- scandeval/metrics/pipeline.py +1 -1
- scandeval/model_cache.py +34 -4
- scandeval/prompt_templates/linguistic_acceptability.py +9 -0
- scandeval/prompt_templates/multiple_choice.py +9 -0
- scandeval/prompt_templates/named_entity_recognition.py +21 -0
- scandeval/prompt_templates/reading_comprehension.py +10 -0
- scandeval/prompt_templates/sentiment_classification.py +11 -0
- scandeval/string_utils.py +157 -0
- scandeval/task_group_utils/sequence_classification.py +2 -5
- scandeval/task_group_utils/token_classification.py +2 -4
- scandeval/tasks.py +22 -0
- scandeval/tokenisation_utils.py +12 -1
- scandeval/utils.py +13 -383
- scandeval-16.13.0.dist-info/METADATA +334 -0
- scandeval-16.13.0.dist-info/RECORD +94 -0
- scandeval-16.11.0.dist-info/METADATA +0 -649
- scandeval-16.11.0.dist-info/RECORD +0 -89
- {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
- {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
- {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/benchmark_modules/vllm.py  CHANGED

@@ -21,6 +21,7 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer
 from urllib3.exceptions import RequestError
 
 from ..constants import (
+    ATTENTION_BACKENDS,
     CUSTOM_STOP_TOKENS,
     GENERATION_KWARGS,
     GENERATIVE_PIPELINE_TAGS,
@@ -53,6 +54,8 @@ from ..generation_utils import (
 )
 from ..languages import get_all_languages
 from ..logging_utils import get_pbar, log, log_once, no_terminal_output
+from ..model_cache import create_model_cache_dir
+from ..string_utils import split_model_id
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -71,14 +74,11 @@ from ..tokenisation_utils import (
 )
 from ..types import ExtractLabelsFunction, Tokeniser
 from ..utils import (
-    attention_backend,
     clear_memory,
-    create_model_cache_dir,
     get_hf_token,
     get_min_cuda_compute_capability,
     internet_connection_available,
     resolve_model_path,
-    split_model_id,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
 
@@ -90,18 +90,23 @@ except ImportError:
     )
 
 if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
-    from vllm import LLM, SamplingParams
-    from vllm.distributed.parallel_state import (
+    import vllm.config
+
+    # MacOS/CPU installs an older version of vLLM, which doesn't have the attention
+    # config
+    if hasattr(vllm.config, "attention"):
+        from vllm.config.attention import AttentionConfig
+
+    from vllm import LLM, SamplingParams
+    from vllm.distributed.parallel_state import (
         destroy_distributed_environment,
         destroy_model_parallel,
     )
-    from vllm.lora.request import LoRARequest
-    from vllm.sampling_params import (
-        StructuredOutputsParams,
-    )
+    from vllm.lora.request import LoRARequest
+    from vllm.sampling_params import StructuredOutputsParams
 
 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
-    import ray
+    import ray
@@ -111,7 +116,9 @@ if t.TYPE_CHECKING:
     from ..data_models import BenchmarkConfig, DatasetConfig, Task
 
 
-MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS: dict[
+MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS: dict[
+    re.Pattern, t.Literal[*ATTENTION_BACKENDS]  # pyrefly: ignore[invalid-literal]
+] = {
     re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "TRITON_ATTN",
     re.compile(r"google/gemma-3-1b.*", flags=re.IGNORECASE): "TRITON_ATTN",
     re.compile(r"google/gemma-3n.*", flags=re.IGNORECASE): "TRITON_ATTN",
@@ -153,7 +160,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")
 
-        if shutil.which("nvcc") is None:
+        if torch.cuda.is_available() and shutil.which("nvcc") is None:
             raise NeedsSystemDependency(
                 dependency="nvcc",
                 instructions=(
@@ -163,23 +170,43 @@ class VLLMModel(HuggingFaceEncoderModel):
                 ),
             )
 
+        if not torch.cuda.is_available() and (
+            dataset_config.task.task_group
+            in [
+                TaskGroup.SEQUENCE_CLASSIFICATION,
+                TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+            ]
+            or dataset_config.task.uses_structured_output
+        ):
+            raise InvalidBenchmark(
+                "We currently require CUDA to benchmark generative models on tasks "
+                "that uses structured generation, which includes the current task "
+                f"{dataset_config.task.name}. This is due to an xgrammar issue, which "
+                "will hopefully be fixed soon."
+            )
+
         raise_if_wrong_params(
             model_config=model_config, allowed_params=self.allowed_params
         )
 
-        #
-
-
-
-
-
+        # Determine the attention backend to use:
+        # Override for models that require a specific backend, otherwise use user's
+        # choice from CLI (defaults to FLASHINFER)
+        if hasattr(vllm.config, "attention"):
+            for pattern, backend in MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS.items():
+                if re.search(pattern=pattern, string=model_config.model_id):
+                    attention_backend = backend
+                    break
+            else:
+                attention_backend = benchmark_config.attention_backend
+        else:
+            attention_backend = benchmark_config.attention_backend
 
-        with (
-            no_terminal_output(disable=benchmark_config.verbose),
-            attention_backend(value=default_flash_attention_backend),
-        ):
+        with no_terminal_output(disable=benchmark_config.verbose):
             model, tokeniser = load_model_and_tokeniser(
-                model_config=model_config,
+                model_config=model_config,
+                benchmark_config=benchmark_config,
+                attention_backend=attention_backend,
             )
             self._model: "LLM" = model
             self._tokeniser: Tokeniser = tokeniser
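As context for the hunk above: the added block picks an attention backend per model by matching the model ID against MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS and otherwise falls back to the user's --attention-backend choice (default FLASHINFER). The standalone sketch below restates that for/else lookup; the OVERRIDES table and the pick_attention_backend name are illustrative only and are not part of the package.

    import re

    # Abridged mirror of MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS from the diff above.
    OVERRIDES: dict[re.Pattern, str] = {
        re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "TRITON_ATTN",
        re.compile(r"google/gemma-3-1b.*", flags=re.IGNORECASE): "TRITON_ATTN",
    }

    def pick_attention_backend(model_id: str, cli_choice: str = "FLASHINFER") -> str:
        """Return a per-model override if one matches, otherwise the user's choice."""
        for pattern, backend in OVERRIDES.items():
            if re.search(pattern=pattern, string=model_id):
                return backend
        return cli_choice

    assert pick_attention_backend("openai/gpt-oss-20b") == "TRITON_ATTN"
    assert pick_attention_backend("mistralai/Mistral-7B-v0.1") == "FLASHINFER"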
@@ -216,11 +243,14 @@ class VLLMModel(HuggingFaceEncoderModel):
             )
         )
         if self.model_config.adapter_base_model_id is not None:
-            adapter_path = snapshot_download(
-                repo_id=self.model_config.model_id,
-                revision=self.model_config.revision,
-                cache_dir=Path(self.model_config.model_cache_dir),
-            )
+            if Path(self.model_config.model_id).exists():
+                adapter_path = self.model_config.model_id
+            else:
+                adapter_path = snapshot_download(
+                    repo_id=self.model_config.model_id,
+                    revision=self.model_config.revision,
+                    cache_dir=Path(self.model_config.model_cache_dir),
+                )
             self.buffer["lora_request"] = LoRARequest(
                 lora_name="adapter", lora_int_id=1, lora_path=adapter_path
             )
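The adapter hunk above lets a LoRA adapter be given either as a local directory or as a Hub repository ID. Below is a minimal sketch of that resolution, assuming huggingface_hub.snapshot_download as in the diff itself; the resolve_adapter_path helper name is hypothetical.

    from pathlib import Path

    from huggingface_hub import snapshot_download

    def resolve_adapter_path(model_id: str, revision: str, cache_dir: str) -> str:
        """Use a local adapter directory as-is, otherwise fetch it from the Hub."""
        if Path(model_id).exists():
            return model_id
        return snapshot_download(
            repo_id=model_id, revision=revision, cache_dir=cache_dir
        )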
@@ -543,7 +573,7 @@ class VLLMModel(HuggingFaceEncoderModel):
             else None,
             temperature=generation_kwargs["temperature"],
             top_p=generation_kwargs["top_p"],
-            top_k=generation_kwargs["top_k"],
+            top_k=int(generation_kwargs["top_k"]),
             repetition_penalty=generation_kwargs["repetition_penalty"],
             stop=[stop_token for stop_token in stop_tokens if stop_token],
             structured_outputs=structured_outputs,
@@ -552,10 +582,12 @@
         # If any of the prompts are empty then we need to replace them with a BOS token
         # so that the vLLM model can generate from them
         prompts: c.Sequence[str] = inputs["text"]
-        if any(len(prompt) == 0 for prompt in prompts):
+        if any(len(prompt.strip()) == 0 for prompt in prompts):
             log("Found empty prompts, replacing with BOS token.", level=logging.DEBUG)
             prompts = [
-                prompt
+                prompt
+                if len(prompt.strip()) > 0
+                else str(self._tokeniser.bos_token or "x")
                 for prompt in prompts
             ]
 
@@ -583,7 +615,7 @@
                     text=prompts, max_length=max_tokens_per_prompt
                 )
                 if any(
-                    len(input_ids)
+                    len(input_ids) >= max_tokens_per_prompt
                     for input_ids in tokenized_prompts.input_ids
                 ):
                     log(
@@ -615,7 +647,7 @@
                         for prompt in prompts
                     ]
                    for num_few_shots_to_remove in range(
-
+                        1, self.dataset_config.num_few_shot_examples + 1
                    ):
                        new_prompts = [
                            end_of_chat_token.join(
@@ -627,7 +659,7 @@
                             text=new_prompts, max_length=max_tokens_per_prompt
                         )
                        if all(
-                            len(input_ids)
+                            len(input_ids) < max_tokens_per_prompt
                            for input_ids in tokenized_prompts.input_ids
                        ):
                            prompts = new_prompts
@@ -637,6 +669,8 @@
                             "Truncation of prompts failed, some prompts are still too "
                             "long."
                         )
+                case _:
+                    raise InvalidBenchmark("The model type is not set!")
         else:
             log(
                 f"Truncation of prompts for model {self.model_config.model_id!r} is "
@@ -939,7 +973,11 @@
 
 
 def load_model_and_tokeniser(
-    model_config: "ModelConfig",
+    model_config: "ModelConfig",
+    benchmark_config: "BenchmarkConfig",
+    attention_backend: t.Literal[
+        *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+    ],
 ) -> tuple["LLM", Tokeniser]:
     """Load the model and tokeniser.
 
@@ -948,6 +986,8 @@ def load_model_and_tokeniser(
             The model configuration.
         benchmark_config:
             The benchmark configuration.
+        attention_backend:
+            The attention backend to use.
 
     Returns:
         A pair (model, tokeniser), with the loaded model and tokeniser
@@ -1064,10 +1104,15 @@
         model_config=model_config,
         token=get_hf_token(api_key=benchmark_config.api_key),
     )
-
+    vllm_params = get_vllm_tokenisation_params(
         tokeniser=tokeniser, model_config=model_config
     )
 
+    # MacOS/CPU installs an older version of vLLM, which doesn't have the attention
+    # config
+    if hasattr(vllm.config, "attention"):
+        vllm_params["attention_config"] = AttentionConfig(backend=attention_backend)
+
     clear_vllm()
 
     distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size = (
@@ -1080,11 +1125,16 @@
             if internet_connection_available() or Path(model_id).is_dir()
             else resolve_model_path(download_dir=download_dir)
         )
+
+        max_model_len = min(
+            true_max_model_len, MAX_CONTEXT_LENGTH + REASONING_MAX_TOKENS
+        )
         model = LLM(
             model=model_location,
             tokenizer=model_location,
             gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
-            max_model_len=
+            max_model_len=max_model_len,
+            max_num_batched_tokens=max_model_len,
             download_dir=download_dir,
             trust_remote_code=benchmark_config.trust_remote_code,
             revision=revision,
@@ -1094,14 +1144,14 @@
             pipeline_parallel_size=pipeline_parallel_size,
             disable_custom_all_reduce=True,
             quantization=quantization,
-            dtype=dtype,
+            dtype=dtype,  # pyrefly: ignore[bad-argument-type]
             enforce_eager=True,
             # TEMP: Prefix caching isn't supported with sliding window in vLLM yet,
             # so we disable it for now
             enable_prefix_caching=False,
             enable_lora=model_config.adapter_base_model_id is not None,
             max_lora_rank=256,
-            **
+            **vllm_params,
         )
     except (RuntimeError, ValueError, OSError) as e:
         if "awaiting a review from the repo authors" in str(e):
@@ -1126,11 +1176,11 @@
             (
                 "Since you're running in verbose mode, you might see a descriptive "
                 "error above already. Note however that if the error message urges "
-                "you to
-                "
-                "as that often solves the
-                "doesn't. If you don't
-                "can try "
+                "you to use the attention backend 'FLEX_ATTENTION', please try "
+                "setting it to 'TRITON_ATTN' instead using the "
+                "`--attention-backend` CLI argument, as that often solves the "
+                "issue, whereas 'FLEX_ATTENTION' usually doesn't. If you don't "
+                "see any descriptive error above, then you can try "
             )
             if benchmark_config.verbose
             else "Try "
@@ -1505,6 +1555,9 @@ def select_backend_and_parallelism() -> tuple[str, int, int]:
         - tensor_parallel_size (int): Number of GPUs per node.
         - pipeline_parallel_size (int): Number of stages across nodes.
     """
+    if not torch.cuda.is_available():
+        return "mp", 1, 1
+
     if not ray.is_initialized():
         try:
             ray.init(address="auto", ignore_reinit_error=True)
scandeval/benchmarker.py  CHANGED
@@ -15,10 +15,9 @@ from time import sleep
 from torch.distributed import destroy_process_group
 
 from .benchmark_config_factory import build_benchmark_config
-from .constants import GENERATIVE_PIPELINE_TAGS
+from .constants import ATTENTION_BACKENDS, GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data, load_raw_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
-from .dataset_configs import get_all_dataset_configs
 from .enums import Device, GenerativeType, ModelType
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
@@ -28,12 +27,9 @@ from .model_config import get_model_config
 from .model_loading import load_model
 from .scores import log_scores
 from .speed_benchmark import benchmark_speed
+from .string_utils import split_model_id
 from .tasks import SPEED
-from .utils import (
-    enforce_reproducibility,
-    internet_connection_available,
-    split_model_id,
-)
+from .utils import enforce_reproducibility, internet_connection_available
 
 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
@@ -79,6 +75,9 @@ class Benchmarker:
         api_base: str | None = None,
         api_version: str | None = None,
         gpu_memory_utilization: float = 0.8,
+        attention_backend: t.Literal[
+            *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+        ] = "FLASHINFER",
         generative_type: GenerativeType | None = None,
         custom_datasets_file: Path | str = Path("custom_datasets.py"),
         debug: bool = False,
@@ -149,6 +148,9 @@
                 is generative. A larger value will result in faster evaluation, but at
                 the risk of running out of GPU memory. Only reduce this if you are
                 running out of GPU memory. Defaults to 0.9.
+            attention_backend:
+                The attention backend to use for vLLM. Defaults to FLASHINFER. Only
+                relevant if the model is generative.
             generative_type:
                 The type of generative model to benchmark. Only relevant if the model is
                 generative. If not specified, then the type will be inferred based on
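For Python API users, the new keyword is passed when constructing the Benchmarker (a later hunk also adds it as a per-call override in benchmark()). The sketch below is a hedged usage example: the top-level import follows the package's documented usage, and the model and dataset arguments are assumptions about the existing benchmark() signature rather than part of this diff.

    from scandeval import Benchmarker

    # Evaluate a generative model with an explicit vLLM attention backend.
    benchmarker = Benchmarker(attention_backend="TRITON_ATTN")
    benchmarker.benchmark(model="google/gemma-3-1b-it", dataset="angry-tweets")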
@@ -264,6 +266,7 @@
             requires_safetensors=requires_safetensors,
             download_only=download_only,
             gpu_memory_utilization=gpu_memory_utilization,
+            attention_backend=attention_backend,
             generative_type=generative_type,
             custom_datasets_file=Path(custom_datasets_file),
             verbose=verbose,
@@ -341,7 +344,9 @@
                 f"Loading data for {dataset_config.logging_string}", level=logging.INFO
             )
             dataset = load_raw_data(
-                dataset_config=dataset_config,
+                dataset_config=dataset_config,
+                cache_dir=benchmark_config.cache_dir,
+                api_key=benchmark_config.api_key,
             )
             del dataset
 
@@ -385,6 +390,10 @@
         download_only: bool | None = None,
         gpu_memory_utilization: float | None = None,
         generative_type: GenerativeType | None = None,
+        attention_backend: t.Literal[
+            *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+        ]
+        | None = None,
         custom_datasets_file: Path | str | None = None,
         force: bool | None = None,
         verbose: bool | None = None,
@@ -504,6 +513,11 @@
             ValueError:
                 If both `task` and `dataset` are specified.
         """
+        log(
+            "Started EuroEval run. Run with `--verbose` for more information.",
+            level=logging.INFO,
+        )
+
         if task is not None and dataset is not None:
             raise ValueError("Only one of `task` and `dataset` can be specified.")
 
@@ -638,6 +652,11 @@
                 if generative_type is not None
                 else self.benchmark_config_default_params.generative_type
             ),
+            attention_backend=(
+                attention_backend
+                if attention_backend is not None
+                else self.benchmark_config_default_params.attention_backend
+            ),
             custom_datasets_file=(
                 Path(custom_datasets_file)
                 if custom_datasets_file is not None
@@ -776,7 +795,7 @@
 
         # Update the benchmark config if the dataset requires it
         if (
-
+            dataset_config.val_split is None
             and not benchmark_config.evaluate_test_split
         ):
             log(
@@ -1052,7 +1071,7 @@
             ),
             validation_split=(
                 None
-                if
+                if dataset_config.val_split is None
                 else not benchmark_config.evaluate_test_split
             ),
         )
@@ -1167,29 +1186,6 @@ def clear_model_cache_fn(cache_dir: str) -> None:
             rmtree(sub_model_dir)
 
 
-def prepare_dataset_configs(
-    dataset_names: c.Sequence[str], custom_datasets_file: Path
-) -> c.Sequence["DatasetConfig"]:
-    """Prepare the dataset configuration(s) to be benchmarked.
-
-    Args:
-        dataset_names:
-            The dataset names to benchmark.
-        custom_datasets_file:
-            A path to a Python file containing custom dataset configurations.
-
-    Returns:
-        The prepared list of model IDs.
-    """
-    return [
-        cfg
-        for cfg in get_all_dataset_configs(
-            custom_datasets_file=custom_datasets_file
-        ).values()
-        if cfg.name in dataset_names
-    ]
-
-
 def initial_logging(
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
scandeval/cli.py  CHANGED
@@ -5,6 +5,7 @@ from pathlib import Path
 import click
 
 from .benchmarker import Benchmarker
+from .constants import ATTENTION_BACKENDS
 from .data_models import DatasetConfig
 from .enums import Device, GenerativeType
 from .languages import get_all_languages
@@ -170,6 +171,14 @@ from .languages import get_all_languages
     "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
     "if you are running out of GPU memory. Only relevant if the model is generative.",
 )
+@click.option(
+    "--attention-backend",
+    default="FLASHINFER",
+    show_default=True,
+    type=click.Choice(ATTENTION_BACKENDS, case_sensitive=True),
+    help="The attention backend to use for vLLM. Only relevant if the model is "
+    "generative.",
+)
 @click.option(
     "--requires-safetensors",
     is_flag=True,
@@ -254,6 +263,7 @@ def benchmark(
     api_base: str | None,
     api_version: str | None,
     gpu_memory_utilization: float,
+    attention_backend: str,
     requires_safetensors: bool,
     generative_type: str | None,
     custom_datasets_file: Path,
@@ -285,6 +295,7 @@
         api_base=api_base,
         api_version=api_version,
         gpu_memory_utilization=gpu_memory_utilization,
+        attention_backend=attention_backend,
         generative_type=GenerativeType[generative_type.upper()]
         if generative_type
         else None,
scandeval/constants.py  CHANGED
@@ -33,8 +33,8 @@ GENERATIVE_PIPELINE_TAGS = [
 # Used to disallow non-generative models to be evaluated on these task groups
 GENERATIVE_DATASET_TASK_GROUPS = [TaskGroup.TEXT_TO_TEXT]
 
-# Local models are required to have these files in their directory
-LOCAL_MODELS_REQUIRED_FILES = ["config.json"]
+# Local models are required to have one of these files in their directory
+LOCAL_MODELS_REQUIRED_FILES = ["config.json", "adapter_config.json"]
 
 # The number of top log probabilities to return for generative models. For several APIs
 # this is the maximum number of log probabilities that can be returned
@@ -105,3 +105,37 @@ GENERATION_KWARGS = {
     "top_k": 0,
     "repetition_penalty": 1.0,
 }
+
+# This is a mirror of `AttentionBackendEnum` in vLLM, but since we don't have access to
+# this when running on CPU/MacOS (as we can only run an old vLLM version), we have to
+# define it here
+ATTENTION_BACKENDS: list[str] = [
+    "FLASH_ATTN",
+    "FLASH_ATTN_DIFFKV",
+    "TRITON_ATTN",
+    "ROCM_ATTN",
+    "ROCM_AITER_MLA",
+    "ROCM_AITER_TRITON_MLA",
+    "ROCM_AITER_FA",
+    "ROCM_AITER_MLA_SPARSE",
+    "TORCH_SDPA",
+    "FLASHINFER",
+    "FLASHINFER_MLA",
+    "TRITON_MLA",
+    "CUTLASS_MLA",
+    "FLASHMLA",
+    "FLASHMLA_SPARSE",
+    "FLASH_ATTN_MLA",
+    "IPEX",
+    "NO_ATTENTION",
+    "FLEX_ATTENTION",
+    "TREE_ATTN",
+    "ROCM_AITER_UNIFIED_ATTN",
+    "CPU_ATTN",
+    "CUSTOM",
+]
+
+# If a dataset configuration has more than this number of languages, we won't log any of
+# the languages. This is for instance the case for the speed benchmark, which has all
+# the languages. The threshold of 5 is somewhat arbitrary.
+MAX_NUMBER_OF_LOGGING_LANGUAGES = 5