ScandEval 16.10.0__py3-none-any.whl → 16.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scandeval/benchmark_modules/hf.py +14 -1
- scandeval/benchmark_modules/litellm.py +111 -22
- scandeval/benchmark_modules/vllm.py +116 -60
- scandeval/benchmarker.py +13 -6
- scandeval/data_models.py +2 -2
- scandeval/dataset_configs/dutch.py +8 -9
- scandeval/dataset_configs/norwegian.py +3 -3
- scandeval/logging_utils.py +1 -1
- scandeval/metrics/huggingface.py +3 -2
- scandeval/metrics/llm_as_a_judge.py +79 -15
- scandeval/model_loading.py +2 -1
- scandeval/task_group_utils/sequence_classification.py +12 -3
- scandeval/types.py +39 -0
- scandeval/utils.py +29 -4
- {scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/METADATA +27 -19
- {scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/RECORD +19 -19
- {scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/licenses/LICENSE +1 -1
- {scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/WHEEL +0 -0
- {scandeval-16.10.0.dist-info → scandeval-16.11.0.dist-info}/entry_points.txt +0 -0
scandeval/benchmark_modules/hf.py
CHANGED

@@ -758,12 +758,25 @@ def get_model_repo_info(
     # model info object.
     model_info: HfApiModelInfo | None = None
     if Path(model_id).is_dir():
-        log(f"Checking for local model in {model_id}.", level=logging.DEBUG)
         if all(
             (Path(model_id) / required_file).exists()
             for required_file in LOCAL_MODELS_REQUIRED_FILES
         ):
+            log_once(
+                f"The local model directory {model_id!r} has all the required model "
+                f"files ({LOCAL_MODELS_REQUIRED_FILES}), so we're skipping looking up "
+                "model information from the Hugging Face Hub.",
+                level=logging.DEBUG,
+            )
             model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
+        else:
+            log_once(
+                f"The local model directory {model_id} does not contain all the "
+                f"required files: {LOCAL_MODELS_REQUIRED_FILES}. Skipping this "
+                f"model.",
+                level=logging.WARNING,
+            )
+            return None
 
     # If we have not internet, and the model_id is not a directory for a local model
     # we also just create a dummy model info object.
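The local-model branch now skips the Hugging Face Hub lookup entirely when the directory already contains every required file, and returns None with a warning otherwise. A minimal standalone sketch of that completeness check, with a hypothetical file list standing in for ScandEval's LOCAL_MODELS_REQUIRED_FILES constant:

    from pathlib import Path

    # Hypothetical stand-in for LOCAL_MODELS_REQUIRED_FILES.
    REQUIRED_FILES = ["config.json", "tokenizer.json"]

    def local_model_is_complete(model_dir: str) -> bool:
        """Return True if every required model file exists in the directory."""
        return all((Path(model_dir) / name).exists() for name in REQUIRED_FILES)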
scandeval/benchmark_modules/litellm.py
CHANGED

@@ -4,6 +4,7 @@ import asyncio
 import collections.abc as c
 import json
 import logging
+import os
 import re
 import typing as t
 from functools import cached_property, partial
@@ -32,9 +33,10 @@ from litellm.exceptions import (
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
+from litellm.types.router import RouterRateLimitError
 from litellm.types.utils import ChoiceLogprobs, Logprobs
 from litellm.utils import supports_reasoning, supports_response_schema
-from pydantic import conlist, create_model
+from pydantic import ValidationError, conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
 
@@ -99,12 +101,13 @@ if t.TYPE_CHECKING:
 
 VOCAB_SIZE_MAPPING = {
     # OpenAI models
+    r"gpt-5\.2.*": -1,
     r"gpt-5-.*": 100_256,
     r"gpt-4-(32k)?(-[0-9]{4})?": 100_256,
     r"gpt-4-[0-9]{4}-preview": 100_256,
     r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
     r"gpt-4-(vision|turbo)(-preview)?": 100_256,
-    r"gpt-3
+    r"gpt-3\.5-turbo-instruct(-[0-9]{4})?": 100_256,
     r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_019,
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
     # Anthropic models
@@ -113,23 +116,27 @@ VOCAB_SIZE_MAPPING = {
     r"(gemini/)?gemini-[1-9](\.[0-9])?-(flash|pro).*": 256_128,
     # xAI models
     r"(xai/)?grok.*": -1,
+    # Chat.dk models
+    r"(ordbogen/)?odin-medium.*": -1,
+    r"(ordbogen/)?odin-large.*": -1,
 }
 
 
 MODEL_MAX_LENGTH_MAPPING = {
     # OpenAI models
+    r"gpt-5\.2.*": 400_000,
     r"gpt-5-.*": 272_000,
     r"gpt-4(-[0-9]{4})?": 8_191,
     r"gpt-4-32k(-[0-9]{4})?": 32_767,
     r"gpt-4-[0-9]{4}-preview": 128_000,
     r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
     r"gpt-4-(vision|turbo)(-preview)?": 128_000,
-    r"gpt-3
+    r"gpt-3\.5-turbo-instruct(-[0-9]{4})?": 4_095,
     r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
     r"o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
     r"o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
     r"o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
-    r"gpt-4
+    r"gpt-4\.1.*": 1_047_576,
     # Anthropic models
     r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
     r"(anthropic/)?claude-(opus|sonnet|haiku)-[1-9](-[1-9])?-[0-9]{8}": 200_000,
@@ -139,12 +146,15 @@ MODEL_MAX_LENGTH_MAPPING = {
     r"(gemini/)?gemini-[23](\.[05])?.*": 1_048_576,
     # xAI models
     r"(xai/)?grok.*": 131_072,
+    # Chat.dk models
+    r"(ordbogen/)?odin-medium.*": 131_072,
+    r"(ordbogen/)?odin-large.*": 202_752,
 }
 
 
 NUM_PARAMS_MAPPING = {
     # OpenAI models
-    r"gpt-5
+    r"gpt-5.*": -1,
     r"gpt-4.*": -1,
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
     # Anthropic models
@@ -155,6 +165,9 @@ NUM_PARAMS_MAPPING = {
     r"(gemini/)?gemini-[23](.[05])?.*": -1,
     # xAI models
     r"(xai/)?grok.*": -1,
+    # Chat.dk models
+    r"(ordbogen/)?odin-medium.*": -1,
+    r"(ordbogen/)?odin-large.*": -1,
 }
 
 
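These three tables map regular expressions over model IDs to vocabulary sizes, maximum context lengths and parameter counts, with -1 meaning the value is unknown or not applicable. A minimal sketch of how such a table can be resolved for a concrete model ID; the lookup helper and the use of re.fullmatch are illustrative assumptions, since the module's own resolution code is not part of this diff:

    import re

    MODEL_MAX_LENGTH_MAPPING = {
        r"(ordbogen/)?odin-medium.*": 131_072,
        r"(ordbogen/)?odin-large.*": 202_752,
    }

    def lookup_max_length(model_id: str) -> int | None:
        """Return the value of the first pattern that fully matches the model ID."""
        for pattern, max_length in MODEL_MAX_LENGTH_MAPPING.items():
            if re.fullmatch(pattern, model_id):
                return max_length
        return None

    print(lookup_max_length("ordbogen/odin-large-20250101"))  # 202752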
@@ -164,6 +177,7 @@ REASONING_MODELS = [
     r"(gemini/)?gemini-2.5.*",
     r"(xai/)?grok-3-mini.*",
     r".*gpt-oss.*",
+    r"(ordbogen/)?odin-.*",
 ]
 
 BASE_DECODER_MODELS = [
@@ -186,6 +200,8 @@ CUSTOM_INFERENCE_API_PREFIXES = [
     "openai/",
 ]
 
+UNOFFICIAL_INFERENCE_API_PREFIXES = ["ordbogen/"]
+
 
 class LiteLLMModel(BenchmarkModule):
     """A generative model from LiteLLM."""
@@ -220,7 +236,7 @@ class LiteLLMModel(BenchmarkModule):
         dataset_config: DatasetConfig,
         benchmark_config: BenchmarkConfig,
         log_metadata: bool = True,
-        **generation_kwargs
+        **generation_kwargs,
     ) -> None:
         """Initialise the model.
 
@@ -241,6 +257,10 @@ class LiteLLMModel(BenchmarkModule):
             model_config=model_config, allowed_params=self.allowed_params
         )
 
+        set_up_benchmark_config_for_model(
+            benchmark_config=benchmark_config, model_id=model_config.model_id
+        )
+
         # Detect whether the model is an Ollama model, as we need to extract metadata
         # differently for these models
         self.is_ollama = model_config.model_id.startswith(
@@ -401,7 +421,7 @@ class LiteLLMModel(BenchmarkModule):
             http_429_errors = [
                 idx
                 for idx, (_, error) in enumerate(failures)
-                if isinstance(error, RateLimitError)
+                if isinstance(error, RateLimitError)
             ]
             if http_429_errors and self.buffer["max_concurrent_calls"] > 1:
                 failures = [
@@ -417,7 +437,6 @@ class LiteLLMModel(BenchmarkModule):
                     f"{self.buffer['max_concurrent_calls']:,} due to rate limiting.",
                     level=logging.DEBUG,
                 )
-                continue
 
             # Attempt to handle the exceptions, to improve the chance of getting
             # successful generations next time around
@@ -483,11 +502,13 @@ class LiteLLMModel(BenchmarkModule):
             "you've reached the maximum number of requests with logprobs",
             "logprobs is not supported",
             "logprobs is not enabled",
-            "Invalid value at 'generation_config.response_logprobs' (TYPE_BOOL)",
         ]
         logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'logprobs'.*\]"
         )
+        logprobs_argument_should_be_bool_messages = [
+            "Invalid value at 'generation_config.response_logprobs' (TYPE_BOOL)"
+        ]
         top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
         top_logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'top_logprobs'.*\]"
@@ -548,6 +569,17 @@ class LiteLLMModel(BenchmarkModule):
             generation_kwargs.pop("top_logprobs", None)
             generation_kwargs.pop("response_format", None)
             return generation_kwargs, 0
+        elif any(
+            msg.lower() in error_msg
+            for msg in logprobs_argument_should_be_bool_messages
+        ):
+            log_once(
+                f"The model {model_id!r} requires the `logprobs` argument to be a "
+                "Boolean, so setting it to True.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["logprobs"] = True
+            return generation_kwargs, 0
         elif (
             any(msg.lower() in error_msg for msg in top_logprobs_messages)
             or top_logprobs_pattern.search(string=error_msg) is not None
@@ -700,23 +732,25 @@ class LiteLLMModel(BenchmarkModule):
             ) from error
 
         if (
-            isinstance(error, (RateLimitError, BadRequestError))
+            isinstance(error, (RateLimitError, RouterRateLimitError, BadRequestError))
            and (
                retry_match := re.search(
-                    pattern=
+                    pattern=(
+                        r"\b(try( again)?|retry) in ([0-9]+(\.[0-9]+)?) ?(s|seconds?)\b"
+                    ),
                    string=error_msg,
                    flags=re.IGNORECASE,
                )
            )
            is not None
         ):
-            retry_seconds = float(retry_match.group(
+            retry_seconds = float(retry_match.group(3))
             log_once(
                 f"You have encountered your rate limit for model {model_id!r}.",
                 level=logging.DEBUG,
             )
             return generation_kwargs, int(retry_seconds)
-        elif isinstance(error, RateLimitError):
+        elif isinstance(error, (RateLimitError, RouterRateLimitError)):
             log_once(
                 f"You have encountered your rate limit for model {model_id!r}.",
                 level=logging.DEBUG,
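Rate-limit handling now also recognises LiteLLM's RouterRateLimitError, and the broadened regex accepts phrasings such as "retry in 30 seconds" or "try again in 2.5s", with the delay captured in group 3. A standalone check of that pattern (the sample error messages are made up):

    import re

    RETRY_PATTERN = re.compile(
        r"\b(try( again)?|retry) in ([0-9]+(\.[0-9]+)?) ?(s|seconds?)\b", re.IGNORECASE
    )

    for message in (
        "Rate limit reached, please retry in 30 seconds.",
        "Too many requests, try again in 2.5s.",
    ):
        match = RETRY_PATTERN.search(message)
        assert match is not None
        print(float(match.group(3)))  # 30.0, then 2.5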
@@ -919,12 +953,37 @@ class LiteLLMModel(BenchmarkModule):
             logprobs_obj = model_response_choices.logprobs
 
             if not isinstance(logprobs_obj, (Logprobs, ChoiceLogprobs)):
-
-                "The logprobs object is malformed, so we won't use logprobs
-                "determine the labels."
-
+                error_msg = (
+                    "The logprobs object is malformed, so we won't use logprobs "
+                    "to determine the labels."
+                )
+                if not isinstance(logprobs_obj, list):
+                    log_once(error_msg, level=logging.WARNING)
+                    continue
+
+                # Some APIs have implemented the logprobs differently, being a list
+                # of ChoiceLogprobs dictionaries rather than having that list being
+                # under the 'content' key, so we deal with that here.
+                # TODO: Maybe remove this in future if all APIs standardise this
+                try:
+                    choice_logprobs_list = [
+                        ChoiceLogprobs.model_validate(item) for item in logprobs_obj
+                    ]
+                except ValidationError:
+                    log_once(error_msg, level=logging.WARNING)
+                    continue
+                if not all(
+                    len(item.content or []) == 1 for item in choice_logprobs_list
+                ):
+                    log_once(error_msg, level=logging.WARNING)
+                    continue
+                logprobs_obj = ChoiceLogprobs(
+                    content=[
+                        item.content[0]
+                        for item in choice_logprobs_list
+                        if item.content
+                    ]
                 )
-                continue
 
             logprobs_list: c.Sequence[c.Sequence[tuple[str, float]]]
             if isinstance(logprobs_obj, ChoiceLogprobs):
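The added branch accepts providers that return logprobs as a bare list of single-token ChoiceLogprobs objects and folds them back into a single object whose content field lists all tokens. A shape-only sketch of that normalisation, with plain dictionaries standing in for the litellm objects:

    # Stand-ins for per-token ChoiceLogprobs objects returned by some APIs.
    per_token_objects = [
        {"content": [{"token": " positive", "logprob": -0.05}]},
        {"content": [{"token": " negative", "logprob": -3.2}]},
    ]

    # Only proceed when every entry wraps exactly one token, as the new code requires.
    assert all(len(obj.get("content") or []) == 1 for obj in per_token_objects)

    # Flatten into the standard shape: one object whose "content" holds all tokens.
    normalised = {"content": [obj["content"][0] for obj in per_token_objects]}
    print([tok["token"] for tok in normalised["content"]])  # [' positive', ' negative']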
@@ -964,10 +1023,9 @@ class LiteLLMModel(BenchmarkModule):
 
         if not sequences:
             log(
-                "No sequences were generated by the model "
-
-                "
-                "Returning an empty GenerativeModelOutput.",
+                f"No sequences were generated by the model {model_id!r}. This may be "
+                "due to the model running out of tokens or an issue with the input "
+                "data. Returning an empty GenerativeModelOutput.",
                 level=logging.WARNING,
             )
             return GenerativeModelOutput(sequences=[], scores=None)
@@ -1295,6 +1353,10 @@ class LiteLLMModel(BenchmarkModule):
         if model_id in litellm.model_list:
             return True
 
+        set_up_benchmark_config_for_model(
+            benchmark_config=benchmark_config, model_id=model_id
+        )
+
         # Separate check for Ollama models
         if model_id.startswith("ollama/") or model_id.startswith("ollama_chat/"):
             ollama_model_exists = try_download_ollama_model(
@@ -1596,6 +1658,11 @@ class LiteLLMModel(BenchmarkModule):
                 level=logging.DEBUG,
             )
 
+        # If the model is a Chat.dk model, we make sure reasoning traces are not
+        # included in the output
+        if self.model_config.model_id.startswith("ordbogen/"):
+            generation_kwargs["include_reasoning"] = False
+
         # Handle manually set parameters
         if self.buffer["first_label_token_mapping"]:
             generation_kwargs["logprobs"] = True
@@ -1784,6 +1851,12 @@ def clean_model_id(model_id: str, benchmark_config: BenchmarkConfig) -> str:
     Returns:
         The cleaned model ID.
     """
+    # Remove unofficial prefixes
+    for unofficial_prefix in UNOFFICIAL_INFERENCE_API_PREFIXES:
+        model_id = re.sub(
+            pattern=rf"^{re.escape(unofficial_prefix)}", repl="", string=model_id
+        )
+
     if benchmark_config.api_base is not None and not any(
         model_id.startswith(prefix) for prefix in CUSTOM_INFERENCE_API_PREFIXES
     ):
@@ -1793,3 +1866,19 @@ def clean_model_id(model_id: str, benchmark_config: BenchmarkConfig) -> str:
         prefix = "openai/"
         model_id = prefix + model_id
     return model_id
+
+
+def set_up_benchmark_config_for_model(
+    benchmark_config: BenchmarkConfig, model_id: str
+) -> None:
+    """Set up the benchmark configuration for the model.
+
+    Args:
+        benchmark_config:
+            The benchmark configuration to set up.
+        model_id:
+            The model ID.
+    """
+    if model_id.startswith("ordbogen/"):
+        benchmark_config.api_key = os.getenv("ORDBOGEN_API_KEY")
+        benchmark_config.api_base = "https://api.ordbogen.ai/v1"
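The new set_up_benchmark_config_for_model helper points 'ordbogen/' models at the Ordbogen API, taking the key from the ORDBOGEN_API_KEY environment variable, while clean_model_id strips the unofficial prefix before the ID is passed on to the backend. A standalone sketch of that prefix stripping:

    import os
    import re

    UNOFFICIAL_INFERENCE_API_PREFIXES = ["ordbogen/"]

    def strip_unofficial_prefix(model_id: str) -> str:
        """Remove any unofficial provider prefix from the start of a model ID."""
        for prefix in UNOFFICIAL_INFERENCE_API_PREFIXES:
            model_id = re.sub(rf"^{re.escape(prefix)}", "", model_id)
        return model_id

    print(strip_unofficial_prefix("ordbogen/odin-large"))  # odin-large
    api_key = os.getenv("ORDBOGEN_API_KEY")  # credentials come from the environment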
scandeval/benchmark_modules/vllm.py
CHANGED

@@ -500,7 +500,8 @@ class VLLMModel(HuggingFaceEncoderModel):
             log_once(
                 f"Using temperature={temperature} with the model "
                 f"{self.model_config.model_id!r} as specified in its "
-                "generation configuration."
+                "generation configuration.",
+                level=logging.DEBUG,
             )
         if "top_p" in changed_params:
             top_p = changed_params["top_p"]
@@ -508,7 +509,8 @@ class VLLMModel(HuggingFaceEncoderModel):
             log_once(
                 f"Using top_p={top_p} with the model "
                 f"{self.model_config.model_id!r} as specified in its "
-                "generation configuration."
+                "generation configuration.",
+                level=logging.DEBUG,
             )
         if "top_k" in changed_params:
             top_k = changed_params["top_k"]
@@ -516,7 +518,8 @@ class VLLMModel(HuggingFaceEncoderModel):
             log_once(
                 f"Using top_k={top_k} with the model "
                 f"{self.model_config.model_id!r} as specified in its "
-                "generation configuration."
+                "generation configuration.",
+                level=logging.DEBUG,
             )
         if "repetition_penalty" in changed_params:
             repetition_penalty = changed_params["repetition_penalty"]
@@ -524,8 +527,10 @@ class VLLMModel(HuggingFaceEncoderModel):
             log_once(
                 f"Using repetition_penalty={repetition_penalty} with the model "
                 f"{self.model_config.model_id!r} as specified in its "
-                "generation configuration."
+                "generation configuration.",
+                level=logging.DEBUG,
             )
+
         max_tokens: int = (
             REASONING_MAX_TOKENS
             if self.generative_type == GenerativeType.REASONING
@@ -567,16 +572,76 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         prompts = [prompt.strip() for prompt in prompts]
 
-        # Truncate the prompts if needed
-
-
-
-
-
-
+        # Truncate the prompts if needed
+        max_tokens_per_prompt = min(
+            self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH
+        )
+        max_tokens_per_prompt -= min(
+            self.dataset_config.max_generated_tokens, max_tokens_per_prompt - 1
+        )
+        tokenized_prompts = self._tokeniser(
+            text=prompts, max_length=max_tokens_per_prompt
+        )
+        if any(
+            len(input_ids) > max_tokens_per_prompt
+            for input_ids in tokenized_prompts.input_ids
+        ):
+            log(
+                f"Truncating prompts for the model {self.model_config.model_id!r} "
+                f"to a maximum of {max_tokens_per_prompt:,} tokens.",
+                level=logging.DEBUG,
             )
-
-
+            match self.generative_type:
+                case GenerativeType.BASE:
+                    truncated_tokenized_prompts = self._tokeniser(
+                        text=prompts, max_length=max_tokens_per_prompt, truncation=True
+                    )
+                    prompts = self._tokeniser.batch_decode(
+                        sequences=truncated_tokenized_prompts.input_ids,
+                        skip_special_tokens=True,
+                    )
+                case GenerativeType.INSTRUCTION_TUNED | GenerativeType.REASONING:
+                    assert self.end_of_chat_token_ids is not None, (
+                        "The end-of-chat token IDs should be set for instruction-tuned "
+                        "and reasoning models."
+                    )
+                    end_of_chat_token = self._tokeniser.decode(
+                        list(self.end_of_chat_token_ids)
+                    )
+                    prompt_segments: list[list[str]] = [
+                        prompt.replace(self._tokeniser.bos_token, "").split(
+                            end_of_chat_token
+                        )
+                        for prompt in prompts
+                    ]
+                    for num_few_shots_to_remove in range(
+                        0, self.dataset_config.num_few_shot_examples + 1
+                    ):
+                        new_prompts = [
+                            end_of_chat_token.join(
+                                prompt_segment[2 * num_few_shots_to_remove :]
+                            )
+                            for prompt_segment in prompt_segments
+                        ]
+                        tokenized_prompts = self._tokeniser(
+                            text=new_prompts, max_length=max_tokens_per_prompt
+                        )
+                        if all(
+                            len(input_ids) <= max_tokens_per_prompt
+                            for input_ids in tokenized_prompts.input_ids
+                        ):
+                            prompts = new_prompts
+                            break
+                    else:
+                        raise InvalidBenchmark(
+                            "Truncation of prompts failed, some prompts are still too "
+                            "long."
+                        )
+        else:
+            log(
+                f"Truncation of prompts for model {self.model_config.model_id!r} is "
+                "not needed, so skipping truncation.",
+                level=logging.DEBUG,
             )
 
         # Generate sequences using vLLM
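For instruction-tuned and reasoning models, the new truncation path removes whole few-shot exchanges (two chat segments each, a user turn and an assistant turn) from the front of the prompt until every prompt fits the token budget, and raises InvalidBenchmark if even the zero-shot prompt is too long. A simplified, self-contained sketch of that strategy, using a whitespace "tokeniser" and a made-up end-of-chat marker in place of the model's real tokeniser and chat template:

    END_OF_CHAT = "<|eot|>"  # hypothetical end-of-chat marker

    def drop_few_shots_until_fits(prompt: str, num_few_shots: int, max_tokens: int) -> str:
        """Drop leading few-shot pairs until the prompt fits the token budget."""
        segments = prompt.split(END_OF_CHAT)
        for num_to_remove in range(num_few_shots + 1):
            # Each few-shot example contributes two segments (user + assistant).
            candidate = END_OF_CHAT.join(segments[2 * num_to_remove:])
            if len(candidate.split()) <= max_tokens:  # whitespace token count
                return candidate
        raise ValueError("Prompt is still too long after removing all few-shot examples.")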
@@ -598,10 +663,11 @@ class VLLMModel(HuggingFaceEncoderModel):
                     level=logging.DEBUG,
                 )
                 sleep(1)
-            except ValueError as e:
+            except (ValueError, RuntimeError) as e:
                 # Truncate the prompts if they are too long for the model
                 truncate_error_messages = [
-                    r"prompt \(length [0-9]+\) is longer than the maximum model length"
+                    r"prompt \(length [0-9]+\) is longer than the maximum model length",
+                    "Sampled token IDs exceed the max model length",
                 ]
                 if any(
                     re.search(pattern, str(e), flags=re.IGNORECASE) is not None
@@ -905,19 +971,6 @@ def load_model_and_tokeniser(
         run_with_cli=benchmark_config.run_with_cli,
     )
 
-    quantization = None
-    if hasattr(hf_model_config, "quantization_config"):
-        quantization = hf_model_config.quantization_config.get("quant_method")
-
-    # The quantised models require extra dependencies
-    if quantization == "gptq" and (
-        importlib.util.find_spec("auto_gptq") is None
-        or importlib.util.find_spec("optimum") is None
-    ):
-        raise NeedsExtraInstalled(extra="quantization")
-    if quantization == "awq" and importlib.util.find_spec("awq") is None:
-        raise NeedsExtraInstalled(extra="quantization")
-
     # Start with dtype being the "auto" vLLM dtype
     dtype: str | torch.dtype = "auto"
 
@@ -940,23 +993,6 @@ def load_model_and_tokeniser(
         )
         dtype = torch.float16
 
-    # If the model is a quantized model, we might need to change the dtype
-    if quantization == "mxfp4" and hf_model_config.dtype is None:
-        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
-        log(
-            "You are loading a quantized model where `dtype` has not been set. "
-            f"Setting dtype to {dtype!r}.",
-            level=logging.DEBUG,
-        )
-    elif quantization is not None and hf_model_config.dtype != torch.float16:
-        log(
-            "You are loading a quantized model with dtype "
-            f"{hf_model_config.dtype}, which vLLM does not support. Setting "
-            "dtype to float16 instead.",
-            level=logging.WARNING,
-        )
-        dtype = torch.float16
-
     # If the model is a bf16 model, we need to check the CUDA compute capability
     if hf_model_config.dtype == torch.bfloat16:
         min_cuda_compute_capability = get_min_cuda_compute_capability()
@@ -974,6 +1010,28 @@ def load_model_and_tokeniser(
         )
         dtype = torch.float16
 
+    quantization = None
+    if hasattr(hf_model_config, "quantization_config"):
+        quantization = hf_model_config.quantization_config.get("quant_method")
+
+    # The quantised models require extra dependencies
+    if quantization == "gptq" and (
+        importlib.util.find_spec("auto_gptq") is None
+        or importlib.util.find_spec("optimum") is None
+    ):
+        raise NeedsExtraInstalled(extra="quantization")
+    if quantization == "awq" and importlib.util.find_spec("awq") is None:
+        raise NeedsExtraInstalled(extra="quantization")
+
+    # If the model is a quantized model, let vLLM decide the dtype
+    if quantization is not None:
+        log(
+            f"You are loading a quantized model with quantization {quantization}. "
+            "Forcing the vLLM dtype to 'auto'",
+            level=logging.WARNING,
+        )
+        dtype = "auto"
+
     if model_config.adapter_base_model_id is not None:
         download_dir = str(Path(model_config.model_cache_dir) / "base_model")
     else:
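The quantisation handling has moved to after the dtype decisions and now simply forces the vLLM dtype back to 'auto' for quantised checkpoints. The extra-dependency checks rely on importlib.util.find_spec, which reports whether a package is importable without actually importing it; a minimal standalone sketch of that kind of check (the helper name is illustrative):

    import importlib.util

    def extras_installed(*packages: str) -> bool:
        """Return True if every named package can be imported."""
        return all(importlib.util.find_spec(pkg) is not None for pkg in packages)

    if not extras_installed("auto_gptq", "optimum"):
        print("GPTQ-quantised models need the 'quantization' extra installed.")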
@@ -1017,17 +1075,14 @@ def load_model_and_tokeniser(
     )
 
     try:
+        model_location = (
+            model_id
+            if internet_connection_available() or Path(model_id).is_dir()
+            else resolve_model_path(download_dir=download_dir)
+        )
         model = LLM(
-            model=
-
-                if internet_connection_available()
-                else resolve_model_path(download_dir=download_dir)
-            ),
-            tokenizer=(
-                model_id
-                if internet_connection_available()
-                else resolve_model_path(download_dir=download_dir)
-            ),
+            model=model_location,
+            tokenizer=model_location,
             gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
             max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
             download_dir=download_dir,
@@ -1454,10 +1509,11 @@ def select_backend_and_parallelism() -> tuple[str, int, int]:
     try:
         ray.init(address="auto", ignore_reinit_error=True)
     except Exception as e:
-
-
-
-
+        if "could not find any running ray instance" not in str(e).lower():
+            log_once(
+                f"Ray initialisation failed with a {type(e)} exception: {e}",
+                level=logging.DEBUG,
+            )
 
     is_ray = ray.is_initialized()
     local_gpu_count = torch.cuda.device_count()
@@ -1475,7 +1531,7 @@ def select_backend_and_parallelism() -> tuple[str, int, int]:
         pipeline_parallel_size = max(1, total_gpus // tensor_parallel_size)
         log_once(
             f"Detected a multi-node setup with {pipeline_parallel_size:,} nodes, each "
-            "with {tensor_parallel_size:,} GPUs, so using `ray` as the "
+            f"with {tensor_parallel_size:,} GPUs, so using `ray` as the "
             "distributed backend.",
             level=logging.DEBUG,
         )
scandeval/benchmarker.py
CHANGED
@@ -1045,8 +1045,16 @@ class Benchmarker:
                 if model.generative_type is not None
                 else None
             ),
-            few_shot=
-
+            few_shot=(
+                None
+                if dataset_config.task.requires_zero_shot
+                else benchmark_config.few_shot
+            ),
+            validation_split=(
+                None
+                if "val" not in dataset_config.splits
+                else not benchmark_config.evaluate_test_split
+            ),
         )
         log(f"Results:\n{results}", level=logging.DEBUG)
         return record
@@ -1122,12 +1130,10 @@ def get_record(
     same_revision = model_id_components.revision == model_config.revision
     same_param = model_id_components.param == model_config.param
     same_dataset = record.dataset == dataset_config.name
-    same_split =
-        record.validation_split != benchmark_config.evaluate_test_split
-        or "val" not in dataset_config.splits
-    )
+    same_split = record.validation_split != benchmark_config.evaluate_test_split
     same_num_shots = (
         record.few_shot == benchmark_config.few_shot
+        or record.few_shot is None
         or not record.generative
         or dataset_config.task.requires_zero_shot
     )
@@ -1225,6 +1231,7 @@ def initial_logging(
         f"{dataset_config.logging_string} ({num_finished_benchmarks + 1}/"
         f"{num_total_benchmarks} benchmarks)...",
         prefix=f"\n[{dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]",
+        level=logging.INFO,
     )
 
     if dataset_config.unofficial:
scandeval/data_models.py
CHANGED
@@ -623,8 +623,8 @@ class BenchmarkResult(pydantic.BaseModel):
     merge: bool
     generative: bool
     generative_type: str | None
-    few_shot: bool
-    validation_split: bool
+    few_shot: bool | None
+    validation_split: bool | None
     euroeval_version: str | None = get_package_version("euroeval")
     transformers_version: str | None = get_package_version("transformers")
     torch_version: str | None = get_package_version("torch")