ScandEval 16.10.1__py3-none-any.whl → 16.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scandeval/__init__.py CHANGED
@@ -110,15 +110,6 @@ os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
 os.environ["VLLM_USE_V1"] = "1"
 
 
-# Use the FlashInfer flash-attention backend for vLLM, unless the user has already
-# specified a different backend.
-if os.getenv("VLLM_ATTENTION_BACKEND") is None:
-    os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
-    os.environ["USER_HAS_SET_VLLM_ATTENTION_BACKEND"] = "0"
-else:
-    os.environ["USER_HAS_SET_VLLM_ATTENTION_BACKEND"] = "1"
-
-
 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
 # former and LiteLLM uses the latter
 if os.getenv("HUGGINGFACE_API_KEY"):
@@ -1,6 +1,7 @@
 """Factory class for creating dataset configurations."""
 
 import collections.abc as c
+import importlib.util
 import sys
 import typing as t
 from pathlib import Path
@@ -13,6 +14,9 @@ from .enums import Device
 from .exceptions import InvalidBenchmark
 from .languages import get_all_languages
 
+if importlib.util.find_spec("vllm") is not None:
+    pass
+
 if t.TYPE_CHECKING:
     from .data_models import Language
 
@@ -68,6 +72,7 @@ def build_benchmark_config(
         api_base=benchmark_config_params.api_base,
         api_version=benchmark_config_params.api_version,
         gpu_memory_utilization=benchmark_config_params.gpu_memory_utilization,
+        attention_backend=benchmark_config_params.attention_backend,
         generative_type=benchmark_config_params.generative_type,
         debug=benchmark_config_params.debug,
         run_with_cli=benchmark_config_params.run_with_cli,
@@ -758,12 +758,35 @@ def get_model_repo_info(
     # model info object.
     model_info: HfApiModelInfo | None = None
     if Path(model_id).is_dir():
-        log(f"Checking for local model in {model_id}.", level=logging.DEBUG)
-        if all(
-            (Path(model_id) / required_file).exists()
-            for required_file in LOCAL_MODELS_REQUIRED_FILES
-        ):
+        if Path(model_id, "config.json").exists():
+            log_once(
+                f"The local model directory {model_id!r} has a 'config.json' file, so "
+                "we're skipping looking up model information from the Hugging Face "
+                "Hub.",
+                level=logging.DEBUG,
+            )
             model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
+        elif Path(model_id, "adapter_config.json").exists():
+            log_once(
+                f"The local model directory {model_id!r} has an 'adapter_config.json' "
+                "file, so we're skipping looking up model information from the Hugging "
+                "Face Hub.",
+                level=logging.DEBUG,
+            )
+            model_info = HfApiModelInfo(
+                id=model_id,
+                tags=None,
+                pipeline_tag=None,
+                siblings=[dict(rfilename="adapter_config.json")],
+            )
+        else:
+            log_once(
+                f"The local model directory {model_id} does not contain any of the "
+                f"required files: {LOCAL_MODELS_REQUIRED_FILES}. Skipping this "
+                f"model.",
+                level=logging.WARNING,
+            )
+            return None
 
     # If we have not internet, and the model_id is not a directory for a local model
     # we also just create a dummy model info object.
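
The rewritten check above decides how to treat a local model directory purely from which config file it contains. A standalone sketch of the same decision (the helper name is illustrative, not part of the package):

    from pathlib import Path

    def classify_local_model(model_dir: str) -> str | None:
        """Illustrative helper mirroring the new check."""
        if Path(model_dir, "config.json").exists():
            return "full model"          # treated as a regular local model
        if Path(model_dir, "adapter_config.json").exists():
            return "adapter"             # treated as an adapter (e.g. LoRA)
        return None                      # neither file found: skipped with a warning
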
@@ -863,8 +886,9 @@ def get_model_repo_info(
         for tag in GENERATIVE_PIPELINE_TAGS
         for class_name in TASK_MAPPING.get(tag, dict()).values()  # type: ignore[attr-defined]
     ]
-    if class_names is not None and any(
-        class_name in generative_class_names for class_name in class_names
+    if class_names is not None and (
+        any(class_name in generative_class_names for class_name in class_names)
+        or any("ForCausalLM" in class_name for class_name in class_names)
     ):
         pipeline_tag = "text-generation"
     else:
@@ -1108,7 +1132,11 @@ load_hf_model_config(
     )
 
     # Ensure that the PAD token ID is set
-    if config.eos_token_id is not None and config.pad_token_id is None:
+    if (
+        hasattr(config, "eos_token_id")
+        and config.eos_token_id is not None
+        and (not hasattr(config, "pad_token_id") or config.pad_token_id is None)
+    ):
         if isinstance(config.eos_token_id, list):
             config.pad_token_id = config.eos_token_id[0]
         else:
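
The extra `hasattr` guards above appear to protect against config objects that do not define `eos_token_id` or `pad_token_id` at all. A minimal sketch of the same fallback on a plain `transformers.PretrainedConfig` (purely illustrative; the token IDs are made up):

    from transformers import PretrainedConfig

    # A list-valued eos_token_id contributes its first entry as the pad token.
    config = PretrainedConfig(eos_token_id=[128001, 128009])
    if (
        hasattr(config, "eos_token_id")
        and config.eos_token_id is not None
        and (not hasattr(config, "pad_token_id") or config.pad_token_id is None)
    ):
        config.pad_token_id = (
            config.eos_token_id[0]
            if isinstance(config.eos_token_id, list)
            else config.eos_token_id
        )
    print(config.pad_token_id)  # 128001
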
@@ -4,6 +4,7 @@ import asyncio
 import collections.abc as c
 import json
 import logging
+import os
 import re
 import typing as t
 from functools import cached_property, partial
@@ -32,9 +33,10 @@ from litellm.exceptions import (
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
+from litellm.types.router import RouterRateLimitError
 from litellm.types.utils import ChoiceLogprobs, Logprobs
 from litellm.utils import supports_reasoning, supports_response_schema
-from pydantic import conlist, create_model
+from pydantic import ValidationError, conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
 
@@ -99,12 +101,13 @@ if t.TYPE_CHECKING:
 
 VOCAB_SIZE_MAPPING = {
     # OpenAI models
+    r"gpt-5\.2.*": -1,
     r"gpt-5-.*": 100_256,
     r"gpt-4-(32k)?(-[0-9]{4})?": 100_256,
     r"gpt-4-[0-9]{4}-preview": 100_256,
     r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
     r"gpt-4-(vision|turbo)(-preview)?": 100_256,
-    r"gpt-3.5-turbo-instruct(-[0-9]{4})?": 100_256,
+    r"gpt-3\.5-turbo-instruct(-[0-9]{4})?": 100_256,
     r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_019,
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
     # Anthropic models
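
The escaping fixes in this and the following hunk matter because an unescaped `.` in a regex matches any character, so the old patterns could match unintended model IDs. A quick illustration (the odd model ID is invented purely for the demonstration):

    import re

    loose = re.compile(r"gpt-3.5-turbo-instruct(-[0-9]{4})?")
    strict = re.compile(r"gpt-3\.5-turbo-instruct(-[0-9]{4})?")
    print(bool(loose.fullmatch("gpt-315-turbo-instruct")))   # True: '.' matches '1'
    print(bool(strict.fullmatch("gpt-315-turbo-instruct")))  # False
    print(bool(strict.fullmatch("gpt-3.5-turbo-instruct")))  # True
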
@@ -113,23 +116,27 @@ VOCAB_SIZE_MAPPING = {
     r"(gemini/)?gemini-[1-9](\.[0-9])?-(flash|pro).*": 256_128,
     # xAI models
     r"(xai/)?grok.*": -1,
+    # Chat.dk models
+    r"(ordbogen/)?odin-medium.*": -1,
+    r"(ordbogen/)?odin-large.*": -1,
 }
 
 
 MODEL_MAX_LENGTH_MAPPING = {
     # OpenAI models
+    r"gpt-5\.2.*": 400_000,
     r"gpt-5-.*": 272_000,
     r"gpt-4(-[0-9]{4})?": 8_191,
     r"gpt-4-32k(-[0-9]{4})?": 32_767,
     r"gpt-4-[0-9]{4}-preview": 128_000,
     r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
     r"gpt-4-(vision|turbo)(-preview)?": 128_000,
-    r"gpt-3.5-turbo-instruct(-[0-9]{4})?": 4_095,
+    r"gpt-3\.5-turbo-instruct(-[0-9]{4})?": 4_095,
     r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
     r"o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
     r"o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
     r"o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
-    r"gpt-4.1.*": 1_047_576,
+    r"gpt-4\.1.*": 1_047_576,
     # Anthropic models
     r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
     r"(anthropic/)?claude-(opus|sonnet|haiku)-[1-9](-[1-9])?-[0-9]{8}": 200_000,
@@ -139,12 +146,15 @@ MODEL_MAX_LENGTH_MAPPING = {
     r"(gemini/)?gemini-[23](\.[05])?.*": 1_048_576,
     # xAI models
     r"(xai/)?grok.*": 131_072,
+    # Chat.dk models
+    r"(ordbogen/)?odin-medium.*": 131_072,
+    r"(ordbogen/)?odin-large.*": 202_752,
 }
 
 
 NUM_PARAMS_MAPPING = {
     # OpenAI models
-    r"gpt-5-.*": -1,
+    r"gpt-5.*": -1,
     r"gpt-4.*": -1,
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
     # Anthropic models
@@ -155,6 +165,9 @@ NUM_PARAMS_MAPPING = {
     r"(gemini/)?gemini-[23](.[05])?.*": -1,
     # xAI models
     r"(xai/)?grok.*": -1,
+    # Chat.dk models
+    r"(ordbogen/)?odin-medium.*": -1,
+    r"(ordbogen/)?odin-large.*": -1,
 }
 
 
@@ -164,6 +177,7 @@ REASONING_MODELS = [
     r"(gemini/)?gemini-2.5.*",
     r"(xai/)?grok-3-mini.*",
     r".*gpt-oss.*",
+    r"(ordbogen/)?odin-.*",
 ]
 
 BASE_DECODER_MODELS = [
@@ -186,6 +200,8 @@ CUSTOM_INFERENCE_API_PREFIXES = [
     "openai/",
 ]
 
+UNOFFICIAL_INFERENCE_API_PREFIXES = ["ordbogen/"]
+
 
 class LiteLLMModel(BenchmarkModule):
     """A generative model from LiteLLM."""
@@ -220,7 +236,7 @@ class LiteLLMModel(BenchmarkModule):
         dataset_config: DatasetConfig,
         benchmark_config: BenchmarkConfig,
         log_metadata: bool = True,
-        **generation_kwargs: dict[str, t.Any],
+        **generation_kwargs,
     ) -> None:
         """Initialise the model.
 
@@ -241,6 +257,10 @@ class LiteLLMModel(BenchmarkModule):
             model_config=model_config, allowed_params=self.allowed_params
         )
 
+        set_up_benchmark_config_for_model(
+            benchmark_config=benchmark_config, model_id=model_config.model_id
+        )
+
         # Detect whether the model is an Ollama model, as we need to extract metadata
         # differently for these models
         self.is_ollama = model_config.model_id.startswith(
@@ -401,7 +421,7 @@ class LiteLLMModel(BenchmarkModule):
         http_429_errors = [
             idx
             for idx, (_, error) in enumerate(failures)
-            if isinstance(error, RateLimitError) and "Error code: 429" in str(error)
+            if isinstance(error, RateLimitError)
         ]
         if http_429_errors and self.buffer["max_concurrent_calls"] > 1:
             failures = [
@@ -417,7 +437,6 @@ class LiteLLMModel(BenchmarkModule):
                 f"{self.buffer['max_concurrent_calls']:,} due to rate limiting.",
                 level=logging.DEBUG,
             )
-            continue
 
         # Attempt to handle the exceptions, to improve the chance of getting
         # successful generations next time around
@@ -483,11 +502,13 @@ class LiteLLMModel(BenchmarkModule):
             "you've reached the maximum number of requests with logprobs",
             "logprobs is not supported",
             "logprobs is not enabled",
-            "Invalid value at 'generation_config.response_logprobs' (TYPE_BOOL)",
         ]
         logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'logprobs'.*\]"
         )
+        logprobs_argument_should_be_bool_messages = [
+            "Invalid value at 'generation_config.response_logprobs' (TYPE_BOOL)"
+        ]
         top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
         top_logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'top_logprobs'.*\]"
@@ -548,6 +569,17 @@ class LiteLLMModel(BenchmarkModule):
             generation_kwargs.pop("top_logprobs", None)
             generation_kwargs.pop("response_format", None)
             return generation_kwargs, 0
+        elif any(
+            msg.lower() in error_msg
+            for msg in logprobs_argument_should_be_bool_messages
+        ):
+            log_once(
+                f"The model {model_id!r} requires the `logprobs` argument to be a "
+                "Boolean, so setting it to True.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["logprobs"] = True
+            return generation_kwargs, 0
         elif (
             any(msg.lower() in error_msg for msg in top_logprobs_messages)
             or top_logprobs_pattern.search(string=error_msg) is not None
@@ -700,23 +732,25 @@ class LiteLLMModel(BenchmarkModule):
             ) from error
 
         if (
-            isinstance(error, (RateLimitError, BadRequestError))
+            isinstance(error, (RateLimitError, RouterRateLimitError, BadRequestError))
             and (
                 retry_match := re.search(
-                    pattern=r"\bretry in ([0-9]+(.[0-9]+)?) ?(s|seconds)\b",
+                    pattern=(
+                        r"\b(try( again)?|retry) in ([0-9]+(\.[0-9]+)?) ?(s|seconds?)\b"
+                    ),
                     string=error_msg,
                     flags=re.IGNORECASE,
                 )
             )
             is not None
         ):
-            retry_seconds = float(retry_match.group(1))
+            retry_seconds = float(retry_match.group(3))
             log_once(
                 f"You have encountered your rate limit for model {model_id!r}.",
                 level=logging.DEBUG,
             )
             return generation_kwargs, int(retry_seconds)
-        elif isinstance(error, RateLimitError):
+        elif isinstance(error, (RateLimitError, RouterRateLimitError)):
             log_once(
                 f"You have encountered your rate limit for model {model_id!r}.",
                 level=logging.DEBUG,
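
The broadened retry pattern above also matches phrasings such as "try again in 1.5s", and because of the added alternation the retry time moves from capture group 1 to group 3. A standalone check:

    import re

    pattern = r"\b(try( again)?|retry) in ([0-9]+(\.[0-9]+)?) ?(s|seconds?)\b"
    for msg in ("Please try again in 1.5s", "Rate limited, retry in 20 seconds"):
        match = re.search(pattern, msg, flags=re.IGNORECASE)
        if match is not None:
            print(float(match.group(3)))  # prints 1.5, then 20.0
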
@@ -919,12 +953,37 @@ class LiteLLMModel(BenchmarkModule):
                 logprobs_obj = model_response_choices.logprobs
 
                 if not isinstance(logprobs_obj, (Logprobs, ChoiceLogprobs)):
-                    log_once(
-                        "The logprobs object is malformed, so we won't use logprobs to "
-                        "determine the labels.",
-                        level=logging.WARNING,
+                    error_msg = (
+                        "The logprobs object is malformed, so we won't use logprobs "
+                        "to determine the labels."
+                    )
+                    if not isinstance(logprobs_obj, list):
+                        log_once(error_msg, level=logging.WARNING)
+                        continue
+
+                    # Some APIs have implemented the logprobs differently, being a list
+                    # of ChoiceLogprobs dictionaries rather than having that list being
+                    # under the 'content' key, so we deal with that here.
+                    # TODO: Maybe remove this in future if all APIs standardise this
+                    try:
+                        choice_logprobs_list = [
+                            ChoiceLogprobs.model_validate(item) for item in logprobs_obj
+                        ]
+                    except ValidationError:
+                        log_once(error_msg, level=logging.WARNING)
+                        continue
+                    if not all(
+                        len(item.content or []) == 1 for item in choice_logprobs_list
+                    ):
+                        log_once(error_msg, level=logging.WARNING)
+                        continue
+                    logprobs_obj = ChoiceLogprobs(
+                        content=[
+                            item.content[0]
+                            for item in choice_logprobs_list
+                            if item.content
+                        ]
                     )
-                    continue
 
                 logprobs_list: c.Sequence[c.Sequence[tuple[str, float]]]
                 if isinstance(logprobs_obj, ChoiceLogprobs):
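
The new branch above folds a list of per-choice logprob dictionaries back into a single object before the usual handling. A self-contained sketch of that normalisation, using hypothetical stand-in models rather than the actual litellm types:

    from pydantic import BaseModel, ValidationError

    # Stand-ins for the real litellm types, only to show the shape handling.
    class TokenLogprob(BaseModel):
        token: str
        logprob: float

    class PerChoiceLogprobs(BaseModel):
        content: list[TokenLogprob] | None = None

    # Shape returned by some APIs: one wrapper object per token, instead of a
    # single object whose 'content' holds the whole list.
    raw = [
        {"content": [{"token": "a", "logprob": -0.1}]},
        {"content": [{"token": "b", "logprob": -0.2}]},
    ]
    try:
        items = [PerChoiceLogprobs.model_validate(entry) for entry in raw]
    except ValidationError:
        items = []
    if items and all(len(item.content or []) == 1 for item in items):
        merged = PerChoiceLogprobs(
            content=[item.content[0] for item in items if item.content]
        )
        print([(t.token, t.logprob) for t in merged.content or []])
        # [('a', -0.1), ('b', -0.2)]
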
@@ -964,10 +1023,9 @@ class LiteLLMModel(BenchmarkModule):
 
         if not sequences:
             log(
-                "No sequences were generated by the model "
-                f"{model_id!r}. This may be due to the "
-                "model running out of tokens or an issue with the input data. "
-                "Returning an empty GenerativeModelOutput.",
+                f"No sequences were generated by the model {model_id!r}. This may be "
+                "due to the model running out of tokens or an issue with the input "
+                "data. Returning an empty GenerativeModelOutput.",
                 level=logging.WARNING,
             )
             return GenerativeModelOutput(sequences=[], scores=None)
@@ -1295,6 +1353,10 @@ class LiteLLMModel(BenchmarkModule):
         if model_id in litellm.model_list:
             return True
 
+        set_up_benchmark_config_for_model(
+            benchmark_config=benchmark_config, model_id=model_id
+        )
+
         # Separate check for Ollama models
         if model_id.startswith("ollama/") or model_id.startswith("ollama_chat/"):
             ollama_model_exists = try_download_ollama_model(
@@ -1596,6 +1658,11 @@ class LiteLLMModel(BenchmarkModule):
                 level=logging.DEBUG,
             )
 
+        # If the model is a Chat.dk model, we make sure reasoning traces are not
+        # included in the output
+        if self.model_config.model_id.startswith("ordbogen/"):
+            generation_kwargs["include_reasoning"] = False
+
         # Handle manually set parameters
         if self.buffer["first_label_token_mapping"]:
             generation_kwargs["logprobs"] = True
@@ -1784,6 +1851,12 @@ def clean_model_id(model_id: str, benchmark_config: BenchmarkConfig) -> str:
     Returns:
         The cleaned model ID.
     """
+    # Remove unofficial prefixes
+    for unofficial_prefix in UNOFFICIAL_INFERENCE_API_PREFIXES:
+        model_id = re.sub(
+            pattern=rf"^{re.escape(unofficial_prefix)}", repl="", string=model_id
+        )
+
     if benchmark_config.api_base is not None and not any(
         model_id.startswith(prefix) for prefix in CUSTOM_INFERENCE_API_PREFIXES
     ):
@@ -1792,4 +1865,28 @@ def clean_model_id(model_id: str, benchmark_config: BenchmarkConfig) -> str:
         else:
             prefix = "openai/"
         model_id = prefix + model_id
+
+    # When we want to evaluate an OpenAI model on a custom inference server, such as HF
+    # inference endpoints, LiteLLM gets confused since it's already using the `openai/`
+    # prefix. We thus have to add it twice, and this hack here is to ensure that we
+    # don't store the results with model ID `openai/openai/...`.
+    elif benchmark_config.api_base is not None and model_id.startswith("openai/"):
+        model_id = "openai/openai/" + re.sub(r"(openai/)*", "", model_id)
+
     return model_id
+
+
+def set_up_benchmark_config_for_model(
+    benchmark_config: BenchmarkConfig, model_id: str
+) -> None:
+    """Set up the benchmark configuration for the model.
+
+    Args:
+        benchmark_config:
+            The benchmark configuration to set up.
+        model_id:
+            The model ID.
+    """
+    if model_id.startswith("ordbogen/"):
+        benchmark_config.api_key = os.getenv("ORDBOGEN_API_KEY")
+        benchmark_config.api_base = "https://api.ordbogen.ai/v1"
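
For reference, the two regex manipulations in `clean_model_id` behave as follows (standalone illustration):

    import re

    # Stripping an unofficial prefix such as "ordbogen/", anchored at the start:
    print(re.sub(rf"^{re.escape('ordbogen/')}", "", "ordbogen/odin-large"))
    # -> odin-large

    # Collapsing any repeated "openai/" prefixes before re-adding exactly two,
    # so results are not stored under IDs like "openai/openai/openai/gpt-4o":
    model_id = "openai/openai/gpt-4o"
    print("openai/openai/" + re.sub(r"(openai/)*", "", model_id))
    # -> openai/openai/gpt-4o
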