ScandEval 16.10.1 → 16.12.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,6 +21,7 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer
 from urllib3.exceptions import RequestError
 
 from ..constants import (
+    ATTENTION_BACKENDS,
     CUSTOM_STOP_TOKENS,
     GENERATION_KWARGS,
     GENERATIVE_PIPELINE_TAGS,
@@ -71,7 +72,6 @@ from ..tokenisation_utils import (
 )
 from ..types import ExtractLabelsFunction, Tokeniser
 from ..utils import (
-    attention_backend,
     clear_memory,
     create_model_cache_dir,
     get_hf_token,
@@ -90,18 +90,23 @@ except ImportError:
     )
 
 if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
-    from vllm import LLM, SamplingParams  # type: ignore[missing-import]
-    from vllm.distributed.parallel_state import (  # type: ignore[missing-import]
+    import vllm.config
+
+    # MacOS/CPU installs an older version of vLLM, which doesn't have the attention
+    # config
+    if hasattr(vllm.config, "attention"):
+        from vllm.config.attention import AttentionConfig
+
+    from vllm import LLM, SamplingParams
+    from vllm.distributed.parallel_state import (
         destroy_distributed_environment,
         destroy_model_parallel,
     )
-    from vllm.lora.request import LoRARequest  # type: ignore[missing-import]
-    from vllm.sampling_params import (  # type: ignore[missing-import]
-        StructuredOutputsParams,
-    )
+    from vllm.lora.request import LoRARequest
+    from vllm.sampling_params import StructuredOutputsParams
 
 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
-    import ray  # type: ignore[missing-import]
+    import ray
 
 
 if t.TYPE_CHECKING:
@@ -111,7 +116,9 @@ if t.TYPE_CHECKING:
     from ..data_models import BenchmarkConfig, DatasetConfig, Task
 
 
-MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS: dict[re.Pattern, str] = {
+MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS: dict[
+    re.Pattern, t.Literal[*ATTENTION_BACKENDS]  # pyrefly: ignore[invalid-literal]
+] = {
     re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "TRITON_ATTN",
     re.compile(r"google/gemma-3-1b.*", flags=re.IGNORECASE): "TRITON_ATTN",
     re.compile(r"google/gemma-3n.*", flags=re.IGNORECASE): "TRITON_ATTN",
@@ -153,7 +160,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")
 
-        if shutil.which("nvcc") is None:
+        if torch.cuda.is_available() and shutil.which("nvcc") is None:
            raise NeedsSystemDependency(
                dependency="nvcc",
                instructions=(
@@ -163,23 +170,43 @@ class VLLMModel(HuggingFaceEncoderModel):
                ),
            )
 
+        if not torch.cuda.is_available() and (
+            dataset_config.task.task_group
+            in [
+                TaskGroup.SEQUENCE_CLASSIFICATION,
+                TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+            ]
+            or dataset_config.task.uses_structured_output
+        ):
+            raise InvalidBenchmark(
+                "We currently require CUDA to benchmark generative models on tasks "
+                "that uses structured generation, which includes the current task "
+                f"{dataset_config.task.name}. This is due to an xgrammar issue, which "
+                "will hopefully be fixed soon."
+            )
+
        raise_if_wrong_params(
            model_config=model_config, allowed_params=self.allowed_params
        )
 
-        # See if the model requires a particular attention backend
-        default_flash_attention_backend = None
-        for pattern, backend in MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS.items():
-            if re.search(pattern=pattern, string=model_config.model_id):
-                default_flash_attention_backend = backend
-                break
+        # Determine the attention backend to use:
+        # Override for models that require a specific backend, otherwise use user's
+        # choice from CLI (defaults to FLASHINFER)
+        if hasattr(vllm.config, "attention"):
+            for pattern, backend in MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS.items():
+                if re.search(pattern=pattern, string=model_config.model_id):
+                    attention_backend = backend
+                    break
+            else:
+                attention_backend = benchmark_config.attention_backend
+        else:
+            attention_backend = benchmark_config.attention_backend
 
-        with (
-            no_terminal_output(disable=benchmark_config.verbose),
-            attention_backend(value=default_flash_attention_backend),
-        ):
+        with no_terminal_output(disable=benchmark_config.verbose):
            model, tokeniser = load_model_and_tokeniser(
-                model_config=model_config, benchmark_config=benchmark_config
+                model_config=model_config,
+                benchmark_config=benchmark_config,
+                attention_backend=attention_backend,
            )
        self._model: "LLM" = model
        self._tokeniser: Tokeniser = tokeniser
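
For orientation, the selection logic above reduces to a simple rule: a per-model regex override wins, otherwise the user's CLI choice (default FLASHINFER) is used. A minimal standalone sketch of that rule follows; the override table subset, helper name, and example model IDs are illustrative, not code from the package.

import re

# Illustrative subset of the override table from the diff above
OVERRIDES: dict[re.Pattern, str] = {
    re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "TRITON_ATTN",
    re.compile(r"google/gemma-3n.*", flags=re.IGNORECASE): "TRITON_ATTN",
}

def resolve_attention_backend(model_id: str, cli_choice: str = "FLASHINFER") -> str:
    """Return the per-model override if one matches, otherwise the CLI choice."""
    for pattern, backend in OVERRIDES.items():
        if pattern.search(model_id):
            return backend
    return cli_choice

assert resolve_attention_backend("openai/gpt-oss-20b") == "TRITON_ATTN"
assert resolve_attention_backend("some-org/some-model") == "FLASHINFER"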
@@ -216,11 +243,14 @@ class VLLMModel(HuggingFaceEncoderModel):
                )
            )
        if self.model_config.adapter_base_model_id is not None:
-            adapter_path = snapshot_download(
-                repo_id=self.model_config.model_id,
-                revision=self.model_config.revision,
-                cache_dir=Path(self.model_config.model_cache_dir),
-            )
+            if Path(self.model_config.model_id).exists():
+                adapter_path = self.model_config.model_id
+            else:
+                adapter_path = snapshot_download(
+                    repo_id=self.model_config.model_id,
+                    revision=self.model_config.revision,
+                    cache_dir=Path(self.model_config.model_cache_dir),
+                )
            self.buffer["lora_request"] = LoRARequest(
                lora_name="adapter", lora_int_id=1, lora_path=adapter_path
            )
@@ -500,7 +530,8 @@ class VLLMModel(HuggingFaceEncoderModel):
                log_once(
                    f"Using temperature={temperature} with the model "
                    f"{self.model_config.model_id!r} as specified in its "
-                    "generation configuration."
+                    "generation configuration.",
+                    level=logging.DEBUG,
                )
        if "top_p" in changed_params:
            top_p = changed_params["top_p"]
@@ -508,7 +539,8 @@ class VLLMModel(HuggingFaceEncoderModel):
                log_once(
                    f"Using top_p={top_p} with the model "
                    f"{self.model_config.model_id!r} as specified in its "
-                    "generation configuration."
+                    "generation configuration.",
+                    level=logging.DEBUG,
                )
        if "top_k" in changed_params:
            top_k = changed_params["top_k"]
@@ -516,7 +548,8 @@ class VLLMModel(HuggingFaceEncoderModel):
                log_once(
                    f"Using top_k={top_k} with the model "
                    f"{self.model_config.model_id!r} as specified in its "
-                    "generation configuration."
+                    "generation configuration.",
+                    level=logging.DEBUG,
                )
        if "repetition_penalty" in changed_params:
            repetition_penalty = changed_params["repetition_penalty"]
@@ -524,8 +557,10 @@ class VLLMModel(HuggingFaceEncoderModel):
                log_once(
                    f"Using repetition_penalty={repetition_penalty} with the model "
                    f"{self.model_config.model_id!r} as specified in its "
-                    "generation configuration."
+                    "generation configuration.",
+                    level=logging.DEBUG,
                )
+
        max_tokens: int = (
            REASONING_MAX_TOKENS
            if self.generative_type == GenerativeType.REASONING
@@ -538,7 +573,7 @@ class VLLMModel(HuggingFaceEncoderModel):
            else None,
            temperature=generation_kwargs["temperature"],
            top_p=generation_kwargs["top_p"],
-            top_k=generation_kwargs["top_k"],
+            top_k=int(generation_kwargs["top_k"]),
            repetition_penalty=generation_kwargs["repetition_penalty"],
            stop=[stop_token for stop_token in stop_tokens if stop_token],
            structured_outputs=structured_outputs,
@@ -547,10 +582,12 @@ class VLLMModel(HuggingFaceEncoderModel):
        # If any of the prompts are empty then we need to replace them with a BOS token
        # so that the vLLM model can generate from them
        prompts: c.Sequence[str] = inputs["text"]
-        if any(len(prompt) == 0 for prompt in prompts):
+        if any(len(prompt.strip()) == 0 for prompt in prompts):
            log("Found empty prompts, replacing with BOS token.", level=logging.DEBUG)
            prompts = [
-                prompt if len(prompt) > 0 else str(self._tokeniser.bos_token)
+                prompt
+                if len(prompt.strip()) > 0
+                else str(self._tokeniser.bos_token or "x")
                for prompt in prompts
            ]
 
@@ -567,16 +604,78 @@ class VLLMModel(HuggingFaceEncoderModel):
            )
        prompts = [prompt.strip() for prompt in prompts]
 
-        # Truncate the prompts if needed, but only if it's not a reasoning model
-        if self.generative_type != GenerativeType.REASONING:
-            max_tokens_per_prompt = (
-                min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH) - max_tokens
-            )
-            tokenized_prompts = self._tokeniser(
-                text=list(prompts), truncation=True, max_length=max_tokens_per_prompt
+        # Truncate the prompts if needed
+        max_tokens_per_prompt = min(
+            self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH
+        )
+        max_tokens_per_prompt -= min(
+            self.dataset_config.max_generated_tokens, max_tokens_per_prompt - 1
+        )
+        tokenized_prompts = self._tokeniser(
+            text=prompts, max_length=max_tokens_per_prompt
+        )
+        if any(
+            len(input_ids) >= max_tokens_per_prompt
+            for input_ids in tokenized_prompts.input_ids
+        ):
+            log(
+                f"Truncating prompts for the model {self.model_config.model_id!r} "
+                f"to a maximum of {max_tokens_per_prompt:,} tokens.",
+                level=logging.DEBUG,
            )
-            prompts = self._tokeniser.batch_decode(
-                sequences=tokenized_prompts.input_ids, skip_special_tokens=True
+            match self.generative_type:
+                case GenerativeType.BASE:
+                    truncated_tokenized_prompts = self._tokeniser(
+                        text=prompts, max_length=max_tokens_per_prompt, truncation=True
+                    )
+                    prompts = self._tokeniser.batch_decode(
+                        sequences=truncated_tokenized_prompts.input_ids,
+                        skip_special_tokens=True,
+                    )
+                case GenerativeType.INSTRUCTION_TUNED | GenerativeType.REASONING:
+                    assert self.end_of_chat_token_ids is not None, (
+                        "The end-of-chat token IDs should be set for instruction-tuned "
+                        "and reasoning models."
+                    )
+                    end_of_chat_token = self._tokeniser.decode(
+                        list(self.end_of_chat_token_ids)
+                    )
+                    prompt_segments: list[list[str]] = [
+                        prompt.replace(self._tokeniser.bos_token, "").split(
+                            end_of_chat_token
+                        )
+                        for prompt in prompts
+                    ]
+                    for num_few_shots_to_remove in range(
+                        1, self.dataset_config.num_few_shot_examples + 1
+                    ):
+                        new_prompts = [
+                            end_of_chat_token.join(
+                                prompt_segment[2 * num_few_shots_to_remove :]
+                            )
+                            for prompt_segment in prompt_segments
+                        ]
+                        tokenized_prompts = self._tokeniser(
+                            text=new_prompts, max_length=max_tokens_per_prompt
+                        )
+                        if all(
+                            len(input_ids) < max_tokens_per_prompt
+                            for input_ids in tokenized_prompts.input_ids
+                        ):
+                            prompts = new_prompts
+                            break
+                    else:
+                        raise InvalidBenchmark(
+                            "Truncation of prompts failed, some prompts are still too "
+                            "long."
+                        )
+                case _:
+                    raise InvalidBenchmark("The model type is not set!")
+        else:
+            log(
+                f"Truncation of prompts for model {self.model_config.model_id!r} is "
+                "not needed, so skipping truncation.",
+                level=logging.DEBUG,
            )
 
        # Generate sequences using vLLM
@@ -598,10 +697,11 @@ class VLLMModel(HuggingFaceEncoderModel):
                    level=logging.DEBUG,
                )
                sleep(1)
-            except ValueError as e:
+            except (ValueError, RuntimeError) as e:
                # Truncate the prompts if they are too long for the model
                truncate_error_messages = [
-                    r"prompt \(length [0-9]+\) is longer than the maximum model length"
+                    r"prompt \(length [0-9]+\) is longer than the maximum model length",
+                    "Sampled token IDs exceed the max model length",
                ]
                if any(
                    re.search(pattern, str(e), flags=re.IGNORECASE) is not None
@@ -873,7 +973,11 @@ class VLLMModel(HuggingFaceEncoderModel):
 
 
 def load_model_and_tokeniser(
-    model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
+    model_config: "ModelConfig",
+    benchmark_config: "BenchmarkConfig",
+    attention_backend: t.Literal[
+        *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+    ],
 ) -> tuple["LLM", Tokeniser]:
     """Load the model and tokeniser.
 
@@ -882,6 +986,8 @@ def load_model_and_tokeniser(
            The model configuration.
        benchmark_config:
            The benchmark configuration.
+        attention_backend:
+            The attention backend to use.
 
    Returns:
        A pair (model, tokeniser), with the loaded model and tokeniser
@@ -905,19 +1011,6 @@ def load_model_and_tokeniser(
        run_with_cli=benchmark_config.run_with_cli,
    )
 
-    quantization = None
-    if hasattr(hf_model_config, "quantization_config"):
-        quantization = hf_model_config.quantization_config.get("quant_method")
-
-    # The quantised models require extra dependencies
-    if quantization == "gptq" and (
-        importlib.util.find_spec("auto_gptq") is None
-        or importlib.util.find_spec("optimum") is None
-    ):
-        raise NeedsExtraInstalled(extra="quantization")
-    if quantization == "awq" and importlib.util.find_spec("awq") is None:
-        raise NeedsExtraInstalled(extra="quantization")
-
    # Start with dtype being the "auto" vLLM dtype
    dtype: str | torch.dtype = "auto"
 
@@ -940,23 +1033,6 @@ def load_model_and_tokeniser(
        )
        dtype = torch.float16
 
-    # If the model is a quantized model, we might need to change the dtype
-    if quantization == "mxfp4" and hf_model_config.dtype is None:
-        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
-        log(
-            "You are loading a quantized model where `dtype` has not been set. "
-            f"Setting dtype to {dtype!r}.",
-            level=logging.DEBUG,
-        )
-    elif quantization is not None and hf_model_config.dtype != torch.float16:
-        log(
-            "You are loading a quantized model with dtype "
-            f"{hf_model_config.dtype}, which vLLM does not support. Setting "
-            "dtype to float16 instead.",
-            level=logging.WARNING,
-        )
-        dtype = torch.float16
-
    # If the model is a bf16 model, we need to check the CUDA compute capability
    if hf_model_config.dtype == torch.bfloat16:
        min_cuda_compute_capability = get_min_cuda_compute_capability()
@@ -974,6 +1050,28 @@ def load_model_and_tokeniser(
        )
        dtype = torch.float16
 
+    quantization = None
+    if hasattr(hf_model_config, "quantization_config"):
+        quantization = hf_model_config.quantization_config.get("quant_method")
+
+    # The quantised models require extra dependencies
+    if quantization == "gptq" and (
+        importlib.util.find_spec("auto_gptq") is None
+        or importlib.util.find_spec("optimum") is None
+    ):
+        raise NeedsExtraInstalled(extra="quantization")
+    if quantization == "awq" and importlib.util.find_spec("awq") is None:
+        raise NeedsExtraInstalled(extra="quantization")
+
+    # If the model is a quantized model, let vLLM decide the dtype
+    if quantization is not None:
+        log(
+            f"You are loading a quantized model with quantization {quantization}. "
+            "Forcing the vLLM dtype to 'auto'",
+            level=logging.WARNING,
+        )
+        dtype = "auto"
+
    if model_config.adapter_base_model_id is not None:
        download_dir = str(Path(model_config.model_cache_dir) / "base_model")
    else:
@@ -1006,10 +1104,15 @@ def load_model_and_tokeniser(
        model_config=model_config,
        token=get_hf_token(api_key=benchmark_config.api_key),
    )
-    vllm_tokenisation_params = get_vllm_tokenisation_params(
+    vllm_params = get_vllm_tokenisation_params(
        tokeniser=tokeniser, model_config=model_config
    )
 
+    # MacOS/CPU installs an older version of vLLM, which doesn't have the attention
+    # config
+    if hasattr(vllm.config, "attention"):
+        vllm_params["attention_config"] = AttentionConfig(backend=attention_backend)
+
    clear_vllm()
 
    distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size = (
@@ -1017,19 +1120,21 @@ def load_model_and_tokeniser(
    )
 
    try:
+        model_location = (
+            model_id
+            if internet_connection_available() or Path(model_id).is_dir()
+            else resolve_model_path(download_dir=download_dir)
+        )
+
+        max_model_len = min(
+            true_max_model_len, MAX_CONTEXT_LENGTH + REASONING_MAX_TOKENS
+        )
        model = LLM(
-            model=(
-                model_id
-                if internet_connection_available()
-                else resolve_model_path(download_dir=download_dir)
-            ),
-            tokenizer=(
-                model_id
-                if internet_connection_available()
-                else resolve_model_path(download_dir=download_dir)
-            ),
+            model=model_location,
+            tokenizer=model_location,
            gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
-            max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
+            max_model_len=max_model_len,
+            max_num_batched_tokens=max_model_len,
            download_dir=download_dir,
            trust_remote_code=benchmark_config.trust_remote_code,
            revision=revision,
@@ -1046,7 +1151,7 @@ def load_model_and_tokeniser(
            enable_prefix_caching=False,
            enable_lora=model_config.adapter_base_model_id is not None,
            max_lora_rank=256,
-            **vllm_tokenisation_params,
+            **vllm_params,
        )
    except (RuntimeError, ValueError, OSError) as e:
        if "awaiting a review from the repo authors" in str(e):
@@ -1071,11 +1176,11 @@ def load_model_and_tokeniser(
                (
                    "Since you're running in verbose mode, you might see a descriptive "
                    "error above already. Note however that if the error message urges "
-                    "you to set the environment variable `VLLM_ATTENTION_BACKEND` to "
-                    "'FLEX_ATTENTION', please try setting it to 'TRITON_ATTN' first, "
-                    "as that often solves the issue, whereas 'FLEX_ATTENTION' usually "
-                    "doesn't. If you don't see any descriptive error above, then you "
-                    "can try "
+                    "you to use the attention backend 'FLEX_ATTENTION', please try "
+                    "setting it to 'TRITON_ATTN' instead using the "
+                    "`--attention-backend` CLI argument, as that often solves the "
+                    "issue, whereas 'FLEX_ATTENTION' usually doesn't. If you don't "
+                    "see any descriptive error above, then you can try "
                )
                if benchmark_config.verbose
                else "Try "
@@ -1450,6 +1555,9 @@ def select_backend_and_parallelism() -> tuple[str, int, int]:
        - tensor_parallel_size (int): Number of GPUs per node.
        - pipeline_parallel_size (int): Number of stages across nodes.
    """
+    if not torch.cuda.is_available():
+        return "mp", 1, 1
+
    if not ray.is_initialized():
        try:
            ray.init(address="auto", ignore_reinit_error=True)
@@ -1476,7 +1584,7 @@ def select_backend_and_parallelism() -> tuple[str, int, int]:
        pipeline_parallel_size = max(1, total_gpus // tensor_parallel_size)
        log_once(
            f"Detected a multi-node setup with {pipeline_parallel_size:,} nodes, each "
-            "with {tensor_parallel_size:,} GPUs, so using `ray` as the "
+            f"with {tensor_parallel_size:,} GPUs, so using `ray` as the "
            "distributed backend.",
            level=logging.DEBUG,
        )
scandeval/benchmarker.py CHANGED
@@ -15,7 +15,7 @@ from time import sleep
 from torch.distributed import destroy_process_group
 
 from .benchmark_config_factory import build_benchmark_config
-from .constants import GENERATIVE_PIPELINE_TAGS
+from .constants import ATTENTION_BACKENDS, GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data, load_raw_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
@@ -79,6 +79,7 @@ class Benchmarker:
        api_base: str | None = None,
        api_version: str | None = None,
        gpu_memory_utilization: float = 0.8,
+        attention_backend: str = "FLASHINFER",
        generative_type: GenerativeType | None = None,
        custom_datasets_file: Path | str = Path("custom_datasets.py"),
        debug: bool = False,
@@ -149,6 +150,9 @@ class Benchmarker:
                is generative. A larger value will result in faster evaluation, but at
                the risk of running out of GPU memory. Only reduce this if you are
                running out of GPU memory. Defaults to 0.9.
+            attention_backend:
+                The attention backend to use for vLLM. Defaults to FLASHINFER. Only
+                relevant if the model is generative.
            generative_type:
                The type of generative model to benchmark. Only relevant if the model is
                generative. If not specified, then the type will be inferred based on
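
The new argument is simply forwarded from the Benchmarker into the vLLM loader. A hypothetical call via the Python API (model and dataset names are placeholders) might look like this:

from scandeval import Benchmarker

# Select the Triton attention backend instead of the FLASHINFER default
benchmarker = Benchmarker(attention_backend="TRITON_ATTN", gpu_memory_utilization=0.8)
benchmarker.benchmark(model="google/gemma-3-1b-it", dataset="angry-tweets")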
@@ -264,6 +268,7 @@ class Benchmarker:
            requires_safetensors=requires_safetensors,
            download_only=download_only,
            gpu_memory_utilization=gpu_memory_utilization,
+            attention_backend=attention_backend,
            generative_type=generative_type,
            custom_datasets_file=Path(custom_datasets_file),
            verbose=verbose,
@@ -385,6 +390,10 @@ class Benchmarker:
        download_only: bool | None = None,
        gpu_memory_utilization: float | None = None,
        generative_type: GenerativeType | None = None,
+        attention_backend: t.Literal[
+            *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+        ]
+        | None = None,
        custom_datasets_file: Path | str | None = None,
        force: bool | None = None,
        verbose: bool | None = None,
@@ -638,6 +647,11 @@ class Benchmarker:
                if generative_type is not None
                else self.benchmark_config_default_params.generative_type
            ),
+            attention_backend=(
+                attention_backend
+                if attention_backend is not None
+                else self.benchmark_config_default_params.attention_backend
+            ),
            custom_datasets_file=(
                Path(custom_datasets_file)
                if custom_datasets_file is not None
@@ -1045,8 +1059,16 @@ class Benchmarker:
                if model.generative_type is not None
                else None
            ),
-            few_shot=benchmark_config.few_shot,
-            validation_split=not benchmark_config.evaluate_test_split,
+            few_shot=(
+                None
+                if dataset_config.task.requires_zero_shot
+                else benchmark_config.few_shot
+            ),
+            validation_split=(
+                None
+                if "val" not in dataset_config.splits
+                else not benchmark_config.evaluate_test_split
+            ),
        )
        log(f"Results:\n{results}", level=logging.DEBUG)
        return record
@@ -1122,12 +1144,10 @@ def get_record(
    same_revision = model_id_components.revision == model_config.revision
    same_param = model_id_components.param == model_config.param
    same_dataset = record.dataset == dataset_config.name
-    same_split = (
-        record.validation_split != benchmark_config.evaluate_test_split
-        or "val" not in dataset_config.splits
-    )
+    same_split = record.validation_split != benchmark_config.evaluate_test_split
    same_num_shots = (
        record.few_shot == benchmark_config.few_shot
+        or record.few_shot is None
        or not record.generative
        or dataset_config.task.requires_zero_shot
    )
@@ -1225,6 +1245,7 @@ def initial_logging(
        f"{dataset_config.logging_string} ({num_finished_benchmarks + 1}/"
        f"{num_total_benchmarks} benchmarks)...",
        prefix=f"\n[{dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]",
+        level=logging.INFO,
    )
 
    if dataset_config.unofficial:
scandeval/cli.py CHANGED
@@ -170,6 +170,17 @@ from .languages import get_all_languages
    "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
    "if you are running out of GPU memory. Only relevant if the model is generative.",
 )
+@click.option(
+    "--attention-backend",
+    default="FLASHINFER",
+    show_default=True,
+    type=click.Choice(
+        ["FLASHINFER", "FLASH_ATTN", "TRITON_ATTN", "FLEX_ATTENTION"],
+        case_sensitive=True,
+    ),
+    help="The attention backend to use for vLLM. Only relevant if the model is "
+    "generative.",
+)
 @click.option(
    "--requires-safetensors",
    is_flag=True,
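
On the command line, the same choice is made by appending the new flag to an existing invocation, e.g. `scandeval ... --attention-backend TRITON_ATTN`. Values outside the four listed choices are rejected by `click.Choice`, and omitting the flag keeps the FLASHINFER default.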
@@ -254,6 +265,7 @@ def benchmark(
    api_base: str | None,
    api_version: str | None,
    gpu_memory_utilization: float,
+    attention_backend: str,
    requires_safetensors: bool,
    generative_type: str | None,
    custom_datasets_file: Path,
@@ -285,6 +297,7 @@ def benchmark(
        api_base=api_base,
        api_version=api_version,
        gpu_memory_utilization=gpu_memory_utilization,
+        attention_backend=attention_backend,
        generative_type=GenerativeType[generative_type.upper()]
        if generative_type
        else None,
scandeval/constants.py CHANGED
@@ -33,8 +33,8 @@ GENERATIVE_PIPELINE_TAGS = [
 # Used to disallow non-generative models to be evaluated on these task groups
 GENERATIVE_DATASET_TASK_GROUPS = [TaskGroup.TEXT_TO_TEXT]
 
-# Local models are required to have these files in their directory
-LOCAL_MODELS_REQUIRED_FILES = ["config.json"]
+# Local models are required to have one of these files in their directory
+LOCAL_MODELS_REQUIRED_FILES = ["config.json", "adapter_config.json"]
 
 # The number of top log probabilities to return for generative models. For several APIs
 # this is the maximum number of log probabilities that can be returned
@@ -105,3 +105,32 @@ GENERATION_KWARGS = {
    "top_k": 0,
    "repetition_penalty": 1.0,
 }
+
+# This is a mirror of `AttentionBackendEnum` in vLLM, but since we don't have access to
+# this when running on CPU/MacOS (as we can only run an old vLLM version), we have to
+# define it here
+ATTENTION_BACKENDS: list[str] = [
+    "FLASH_ATTN",
+    "FLASH_ATTN_DIFFKV",
+    "TRITON_ATTN",
+    "ROCM_ATTN",
+    "ROCM_AITER_MLA",
+    "ROCM_AITER_TRITON_MLA",
+    "ROCM_AITER_FA",
+    "ROCM_AITER_MLA_SPARSE",
+    "TORCH_SDPA",
+    "FLASHINFER",
+    "FLASHINFER_MLA",
+    "TRITON_MLA",
+    "CUTLASS_MLA",
+    "FLASHMLA",
+    "FLASHMLA_SPARSE",
+    "FLASH_ATTN_MLA",
+    "IPEX",
+    "NO_ATTENTION",
+    "FLEX_ATTENTION",
+    "TREE_ATTN",
+    "ROCM_AITER_UNIFIED_ATTN",
+    "CPU_ATTN",
+    "CUSTOM",
+]
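
Mirroring the vLLM enum as plain strings lets the rest of the codebase type-hint and validate backend names without importing vLLM at all. A rough sketch of how the constant can be consumed (the helper below is illustrative, not part of the package; the starred `Literal` subscription needs Python 3.11+, hence the pyrefly ignores seen elsewhere in this diff):

import typing as t

from scandeval.constants import ATTENTION_BACKENDS

# Build a Literal type from the runtime list, as the package does for its annotations
AttentionBackend = t.Literal[*ATTENTION_BACKENDS]  # pyrefly: ignore[invalid-literal]

def validate_backend(value: str) -> str:
    """Reject backend names that the mirrored vLLM enum does not contain."""
    if value not in ATTENTION_BACKENDS:
        raise ValueError(f"Unknown attention backend: {value!r}")
    return value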