ScandEval 16.8.0__py3-none-any.whl → 16.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. scandeval/benchmark_modules/hf.py +18 -3
  2. scandeval/benchmark_modules/litellm.py +14 -13
  3. scandeval/benchmark_modules/vllm.py +127 -9
  4. scandeval/benchmarker.py +0 -11
  5. scandeval/cli.py +39 -39
  6. scandeval/constants.py +9 -0
  7. scandeval/data_models.py +5 -0
  8. scandeval/dataset_configs/__init__.py +1 -0
  9. scandeval/dataset_configs/albanian.py +64 -0
  10. scandeval/dataset_configs/dutch.py +31 -1
  11. scandeval/dataset_configs/swedish.py +9 -0
  12. scandeval/logging_utils.py +1 -0
  13. scandeval/metrics/huggingface.py +82 -0
  14. scandeval/metrics/llm_as_a_judge.py +1 -3
  15. scandeval/model_config.py +2 -2
  16. scandeval/prompt_templates/__init__.py +1 -0
  17. scandeval/prompt_templates/linguistic_acceptability.py +9 -0
  18. scandeval/prompt_templates/multiple_choice.py +9 -0
  19. scandeval/prompt_templates/named_entity_recognition.py +20 -0
  20. scandeval/prompt_templates/reading_comprehension.py +9 -0
  21. scandeval/prompt_templates/sentiment_classification.py +11 -0
  22. scandeval/prompt_templates/simplification.py +23 -0
  23. scandeval/prompt_templates/summarization.py +11 -0
  24. scandeval/task_group_utils/question_answering.py +30 -19
  25. scandeval/task_group_utils/sequence_classification.py +4 -4
  26. scandeval/task_group_utils/text_to_text.py +3 -4
  27. scandeval/task_group_utils/token_classification.py +6 -8
  28. scandeval/tasks.py +11 -0
  29. scandeval/tokenisation_utils.py +7 -1
  30. scandeval/types.py +7 -1
  31. scandeval/utils.py +5 -6
  32. {scandeval-16.8.0.dist-info → scandeval-16.10.0.dist-info}/METADATA +21 -3
  33. {scandeval-16.8.0.dist-info → scandeval-16.10.0.dist-info}/RECORD +36 -34
  34. {scandeval-16.8.0.dist-info → scandeval-16.10.0.dist-info}/WHEEL +1 -1
  35. {scandeval-16.8.0.dist-info → scandeval-16.10.0.dist-info}/entry_points.txt +0 -0
  36. {scandeval-16.8.0.dist-info → scandeval-16.10.0.dist-info}/licenses/LICENSE +0 -0
scandeval/benchmark_modules/hf.py CHANGED
@@ -33,7 +33,6 @@ from transformers.modelcard import TASK_MAPPING
  from transformers.modeling_utils import PreTrainedModel
  from transformers.models.auto.configuration_auto import AutoConfig
  from transformers.models.auto.tokenization_auto import AutoTokenizer
- from transformers.tokenization_mistral_common import MistralCommonTokenizer
  from transformers.tokenization_utils_base import PreTrainedTokenizerBase
  from transformers.trainer import Trainer
  from urllib3.exceptions import RequestError
@@ -80,6 +79,13 @@ from ..utils import (
  )
  from .base import BenchmarkModule

+ try:
+     from transformers.tokenization_mistral_common import MistralCommonTokenizer
+ except ImportError:
+     from transformers.tokenization_mistral_common import (
+         MistralCommonBackend as MistralCommonTokenizer,
+     )
+
  if t.TYPE_CHECKING:
      from transformers.configuration_utils import PretrainedConfig
      from transformers.tokenization_utils import PreTrainedTokenizer
@@ -175,7 +181,16 @@ class HuggingFaceEncoderModel(BenchmarkModule):
      and repo_info.safetensors is not None
      and "total" in repo_info.safetensors
  ):
-     num_params = repo_info.safetensors["total"]
+     num_params_candidates: list[int] = [repo_info.safetensors["total"]]
+     if "parameters" in repo_info.safetensors and isinstance(
+         repo_info.safetensors["parameters"], dict
+     ):
+         num_params_candidates.extend(
+             int(v)
+             for v in repo_info.safetensors["parameters"].values()
+             if isinstance(v, int) or (isinstance(v, str) and v.isdigit())
+         )
+     num_params = max(num_params_candidates)
  elif (
      hasattr(self._model.config, "num_params")
      and self._model.config.num_params is not None
@@ -1146,7 +1161,7 @@ def setup_model_for_question_answering(model: "PreTrainedModel") -> "PreTrainedM
      "The token type embeddings of the model do not have a `data` "
      "attribute, which is needed to modify the embeddings."
  )
- token_type_embeddings.weight.data = torch.cat(  # type: ignore[missing-attribute]
+ token_type_embeddings.weight.data = torch.cat(
      (
          token_type_embedding_tensor,
          torch.rand_like(token_type_embedding_tensor),
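The parameter-count change above prefers the largest value reported in the safetensors metadata instead of always trusting "total". A minimal standalone sketch of that selection logic, assuming a made-up metadata dict shaped like the Hugging Face Hub safetensors info (not taken from the diff):

    # Hypothetical metadata where "total" undercounts and the per-dtype breakdown is larger.
    safetensors = {"total": 7_000_000_000, "parameters": {"BF16": "7241732096"}}

    candidates: list[int] = [safetensors["total"]]
    if isinstance(safetensors.get("parameters"), dict):
        candidates.extend(
            int(v)
            for v in safetensors["parameters"].values()
            if isinstance(v, int) or (isinstance(v, str) and v.isdigit())
        )
    num_params = max(candidates)  # -> 7_241_732_096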
scandeval/benchmark_modules/litellm.py CHANGED
@@ -110,7 +110,7 @@ VOCAB_SIZE_MAPPING = {
      # Anthropic models
      r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
      # Gemini models
-     r"(gemini/)?gemini-[1-9]\.[0-9]-(flash|pro).*": 256_128,
+     r"(gemini/)?gemini-[1-9](\.[0-9])?-(flash|pro).*": 256_128,
      # xAI models
      r"(xai/)?grok.*": -1,
  }
@@ -136,7 +136,7 @@ MODEL_MAX_LENGTH_MAPPING = {
      # Gemini models
      r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
      r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
-     r"(gemini/)?gemini-2\.(0|5).*": 1_048_576,
+     r"(gemini/)?gemini-[23](\.[05])?.*": 1_048_576,
      # xAI models
      r"(xai/)?grok.*": 131_072,
  }
@@ -152,7 +152,7 @@ NUM_PARAMS_MAPPING = {
      # Gemini models
      r"(gemini/)?gemini-1.5-flash-8b": 8_000_000_000,
      r"(gemini/)?gemini-1.5-flash-[0-9]+": -1,
-     r"(gemini/)?gemini-2.(0|5).*": -1,
+     r"(gemini/)?gemini-[23](.[05])?.*": -1,
      # xAI models
      r"(xai/)?grok.*": -1,
  }
@@ -208,8 +208,8 @@ class LiteLLMModel(BenchmarkModule):
          "thinking",
      ],
      # Gemini models
-     re.compile(r"(gemini/)?gemini-2.5-flash-lite.*"): ["no-thinking", "thinking"],
-     re.compile(r"(gemini/)?gemini-2.5-flash.*"): ["no-thinking", "thinking"],
+     re.compile(r"(gemini/)?gemini-2\.5-flash-lite.*"): ["no-thinking", "thinking"],
+     re.compile(r"(gemini/)?gemini-(2\.5|3)-flash.*"): ["no-thinking", "thinking"],
      # xAI models
      re.compile(r"(xai/)?grok-3-mini(-fast)?(-beta)?"): ["low", "medium", "high"],
  }
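The relaxed Gemini patterns above now also cover model IDs that carry only a major version. A minimal sketch of the effect on the vocabulary-size pattern, assuming illustrative model IDs:

    import re

    OLD = r"(gemini/)?gemini-[1-9]\.[0-9]-(flash|pro).*"
    NEW = r"(gemini/)?gemini-[1-9](\.[0-9])?-(flash|pro).*"

    for model_id in ["gemini-1.5-pro", "gemini-3-flash-preview"]:
        # The old pattern requires a ".<minor>" part; the new one makes it optional.
        print(model_id, bool(re.fullmatch(OLD, model_id)), bool(re.fullmatch(NEW, model_id)))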
@@ -517,6 +517,7 @@ class LiteLLMModel(BenchmarkModule):
      response_format_messages = [
          "got an unexpected keyword argument 'response_format'",
          "the model returned empty outputs",
+         "'maxitems' is not supported",
      ]

      if (
@@ -838,14 +839,14 @@ class LiteLLMModel(BenchmarkModule):
      ]

      # Close connections
-     for request in requests:
-         if hasattr(request, "close"):
-             try:
-                 request.close()
-             except RuntimeError as e:
-                 log(
-                     f"RuntimeError during request.close(): {e}", level=logging.DEBUG
-                 )
+     semaphore.release()
+     router.reset()
+     try:
+         loop = asyncio.get_event_loop()
+         if not loop.is_closed():
+             loop.close()
+     except RuntimeError:
+         pass  # Already closed

      return successes, failures

scandeval/benchmark_modules/vllm.py CHANGED
@@ -15,13 +15,14 @@ from time import sleep
  import torch
  from huggingface_hub import snapshot_download
  from pydantic import conlist, create_model
+ from transformers.generation.configuration_utils import GenerationConfig
  from transformers.models.auto.configuration_auto import AutoConfig
  from transformers.models.auto.tokenization_auto import AutoTokenizer
- from transformers.tokenization_mistral_common import MistralCommonTokenizer
  from urllib3.exceptions import RequestError

  from ..constants import (
      CUSTOM_STOP_TOKENS,
+     GENERATION_KWARGS,
      GENERATIVE_PIPELINE_TAGS,
      MAX_CONTEXT_LENGTH,
      MAX_VLLM_LOGPROBS,
@@ -81,6 +82,13 @@ from ..utils import (
  )
  from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config

+ try:
+     from transformers.tokenization_mistral_common import MistralCommonTokenizer
+ except ImportError:
+     from transformers.tokenization_mistral_common import (
+         MistralCommonBackend as MistralCommonTokenizer,
+     )
+
  if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
      from vllm import LLM, SamplingParams  # type: ignore[missing-import]
      from vllm.distributed.parallel_state import (  # type: ignore[missing-import]
@@ -92,6 +100,10 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
          StructuredOutputsParams,
      )

+ if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
+     import ray  # type: ignore[missing-import]
+
+
  if t.TYPE_CHECKING:
      from datasets import DatasetDict
      from transformers.trainer import Trainer
@@ -100,10 +112,11 @@ if t.TYPE_CHECKING:


  MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS: dict[re.Pattern, str] = {
-     re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "FLASH_ATTN",
-     re.compile(r"google/gemma-3-1b.*", flags=re.IGNORECASE): "FLASH_ATTN",
-     re.compile(r"google/gemma-3n.*", flags=re.IGNORECASE): "FLASH_ATTN",
+     re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "TRITON_ATTN",
+     re.compile(r"google/gemma-3-1b.*", flags=re.IGNORECASE): "TRITON_ATTN",
+     re.compile(r"google/gemma-3n.*", flags=re.IGNORECASE): "TRITON_ATTN",
      re.compile(r"google/gemma-3-(4|12|27)b.*", flags=re.IGNORECASE): "TRITON_ATTN",
+     re.compile(r"PleIAs/Pleias-3b-Preview", flags=re.IGNORECASE): "TRITON_ATTN",
  }

@@ -478,6 +491,41 @@ class VLLMModel(HuggingFaceEncoderModel):
      )

      # Define the parameters used for vLLM generation
+     generation_kwargs = GENERATION_KWARGS.copy()
+     if (generation_config := self.model_config.generation_config) is not None:
+         changed_params = generation_config.to_diff_dict()
+         if "temperature" in changed_params:
+             temperature = changed_params["temperature"]
+             generation_kwargs["temperature"] = temperature
+             log_once(
+                 f"Using temperature={temperature} with the model "
+                 f"{self.model_config.model_id!r} as specified in its "
+                 "generation configuration."
+             )
+         if "top_p" in changed_params:
+             top_p = changed_params["top_p"]
+             generation_kwargs["top_p"] = top_p
+             log_once(
+                 f"Using top_p={top_p} with the model "
+                 f"{self.model_config.model_id!r} as specified in its "
+                 "generation configuration."
+             )
+         if "top_k" in changed_params:
+             top_k = changed_params["top_k"]
+             generation_kwargs["top_k"] = top_k
+             log_once(
+                 f"Using top_k={top_k} with the model "
+                 f"{self.model_config.model_id!r} as specified in its "
+                 "generation configuration."
+             )
+         if "repetition_penalty" in changed_params:
+             repetition_penalty = changed_params["repetition_penalty"]
+             generation_kwargs["repetition_penalty"] = repetition_penalty
+             log_once(
+                 f"Using repetition_penalty={repetition_penalty} with the model "
+                 f"{self.model_config.model_id!r} as specified in its "
+                 "generation configuration."
+             )
      max_tokens: int = (
          REASONING_MAX_TOKENS
          if self.generative_type == GenerativeType.REASONING
@@ -488,7 +536,10 @@ class VLLMModel(HuggingFaceEncoderModel):
          logprobs=MAX_VLLM_LOGPROBS
          if self.buffer["first_label_token_mapping"]
          else None,
-         temperature=0.0,
+         temperature=generation_kwargs["temperature"],
+         top_p=generation_kwargs["top_p"],
+         top_k=generation_kwargs["top_k"],
+         repetition_penalty=generation_kwargs["repetition_penalty"],
          stop=[stop_token for stop_token in stop_tokens if stop_token],
          structured_outputs=structured_outputs,
      )
@@ -762,6 +813,16 @@ class VLLMModel(HuggingFaceEncoderModel):
      if model_info is None:
          raise InvalidModel(f"The model {model_id!r} could not be found.")

+     try:
+         generation_config = GenerationConfig.from_pretrained(
+             pretrained_model_name=model_id_components.model_id,
+             revision=model_id_components.revision,
+             cache_dir=benchmark_config.cache_dir,
+             token=benchmark_config.api_key,
+         )
+     except OSError:
+         generation_config = None
+
      language_mapping = get_all_languages()
      language_codes = list(language_mapping.keys())

@@ -783,6 +844,7 @@ class VLLMModel(HuggingFaceEncoderModel):
          cache_dir=benchmark_config.cache_dir, model_id=model_id
      ),
      adapter_base_model_id=model_info.adapter_base_model_id,
+     generation_config=generation_config,
  )

  return model_config
@@ -950,6 +1012,10 @@ def load_model_and_tokeniser(

  clear_vllm()

+ distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size = (
+     select_backend_and_parallelism()
+ )
+
  try:
      model = LLM(
          model=(
@@ -968,8 +1034,9 @@ def load_model_and_tokeniser(
          trust_remote_code=benchmark_config.trust_remote_code,
          revision=revision,
          seed=4242,
-         distributed_executor_backend="mp",
-         tensor_parallel_size=torch.cuda.device_count(),
+         distributed_executor_backend=distributed_executor_backend,
+         tensor_parallel_size=tensor_parallel_size,
+         pipeline_parallel_size=pipeline_parallel_size,
          disable_custom_all_reduce=True,
          quantization=quantization,
          dtype=dtype,
@@ -1005,8 +1072,8 @@ def load_model_and_tokeniser(
          "Since you're running in verbose mode, you might see a descriptive "
          "error above already. Note however that if the error message urges "
          "you to set the environment variable `VLLM_ATTENTION_BACKEND` to "
-         "'FLEX_ATTENTION', please try setting it to 'FLASH_ATTN' first, as "
-         "that often solves the issue, whereas 'FLEX_ATTENTION' usually "
+         "'FLEX_ATTENTION', please try setting it to 'TRITON_ATTN' first, "
+         "as that often solves the issue, whereas 'FLEX_ATTENTION' usually "
          "doesn't. If you don't see any descriptive error above, then you "
          "can try "
      )
@@ -1372,3 +1439,54 @@ def get_vllm_tokenisation_params(
      config_format=config_format,
      load_format=load_format,
  )
+
+
+ def select_backend_and_parallelism() -> tuple[str, int, int]:
+     """Determine the distributed backend and parallelism for vLLM.
+
+     Returns:
+         Tuple containing:
+         - backend (str): "ray" if multi-node Ray is available, else "mp".
+         - tensor_parallel_size (int): Number of GPUs per node.
+         - pipeline_parallel_size (int): Number of stages across nodes.
+     """
+     if not ray.is_initialized():
+         try:
+             ray.init(address="auto", ignore_reinit_error=True)
+         except Exception as e:
+             log_once(
+                 f"Ray initialisation failed with a {type(e)} exception: {e}",
+                 level=logging.DEBUG,
+             )
+
+     is_ray = ray.is_initialized()
+     local_gpu_count = torch.cuda.device_count()
+
+     if is_ray:
+         resources = ray.cluster_resources()
+         total_gpus = int(resources.get("GPU", 0))
+     else:
+         total_gpus = local_gpu_count
+
+     using_multiple_nodes = total_gpus > local_gpu_count
+     if is_ray and using_multiple_nodes:
+         distributed_executor_backend = "ray"
+         tensor_parallel_size = local_gpu_count if local_gpu_count > 0 else 1
+         pipeline_parallel_size = max(1, total_gpus // tensor_parallel_size)
+         log_once(
+             f"Detected a multi-node setup with {pipeline_parallel_size:,} nodes, each "
+             f"with {tensor_parallel_size:,} GPUs, so using `ray` as the "
+             "distributed backend.",
+             level=logging.DEBUG,
+         )
+     else:
+         distributed_executor_backend = "mp"
+         tensor_parallel_size = local_gpu_count if local_gpu_count > 0 else 1
+         pipeline_parallel_size = 1
+         log_once(
+             f"Detected a single-node setup with {tensor_parallel_size:,} GPUs, "
+             "so using the multiprocessing distributed backend.",
+             level=logging.DEBUG,
+         )
+
+     return distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size
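The tuple returned by select_backend_and_parallelism() is what load_model_and_tokeniser now forwards to the vLLM constructor (see the @@ -968,8 +1034,9 @@ hunk above). A minimal sketch of that wiring, with a placeholder model ID that is not taken from the diff:

    from vllm import LLM

    backend, tp_size, pp_size = select_backend_and_parallelism()
    model = LLM(
        model="my-org/my-model",                # placeholder model ID
        distributed_executor_backend=backend,   # "ray" on a multi-node Ray cluster, else "mp"
        tensor_parallel_size=tp_size,           # GPUs per node
        pipeline_parallel_size=pp_size,         # pipeline stages across nodes (1 on a single node)
    )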
scandeval/benchmarker.py CHANGED
@@ -12,7 +12,6 @@ from pathlib import Path
  from shutil import rmtree
  from time import sleep

- from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
  from torch.distributed import destroy_process_group

  from .benchmark_config_factory import build_benchmark_config
@@ -32,7 +31,6 @@ from .speed_benchmark import benchmark_speed
  from .tasks import SPEED
  from .utils import (
      enforce_reproducibility,
-     get_package_version,
      internet_connection_available,
      split_model_id,
  )
@@ -194,15 +192,6 @@ class Benchmarker:
      msg += "the argument `download_only` was set to True."
      raise ValueError(msg)

-     # Bail early if hf_transfer is enabled but not installed.
-     if HF_HUB_ENABLE_HF_TRANSFER and get_package_version("hf_transfer") is None:
-         raise ImportError(
-             "Fast download using 'hf_transfer' is enabled "
-             "(HF_HUB_ENABLE_HF_TRANSFER=1) but the 'hf_transfer' "
-             "package is not available in your environment. "
-             "Try installing it with `pip install hf_transfer`."
-         )
-
      # Deprecation warnings
      if batch_size is not None:
          if run_with_cli:
scandeval/cli.py CHANGED
@@ -37,26 +37,6 @@ from .languages import get_all_languages
      help="""The languages to benchmark, both for models and datasets. If "all" then all
  models will be benchmarked on all datasets.""",
  )
- @click.option(
-     "--model-language",
-     "-ml",
-     default=None,
-     show_default=True,
-     multiple=True,
-     metavar="ISO 639-1 LANGUAGE CODE",
-     type=click.Choice(["all"] + list(get_all_languages().keys())),
-     help="""This option is deprecated - please use --language instead.""",
- )
- @click.option(
-     "--dataset-language",
-     "-dl",
-     default=None,
-     show_default=True,
-     multiple=True,
-     metavar="ISO 639-1 LANGUAGE CODE",
-     type=click.Choice(["all"] + list(get_all_languages().keys())),
-     help="""This option is deprecated - please use --language instead.""",
- )
  @click.option(
      "--dataset",
      default=None,
@@ -65,13 +45,6 @@ from .languages import get_all_languages
      help="""The name of the benchmark dataset. We recommend to use the `task` and
  `language` options instead of this option.""",
  )
- @click.option(
-     "--batch-size",
-     default=None,
-     type=click.Choice(["1", "2", "4", "8", "16", "32"]),
-     help="This option is deprecated - please use --finetuning-batch-size instead.",
-     deprecated=True,
- )
  @click.option(
      "--finetuning-batch-size",
      default="32",
@@ -197,14 +170,6 @@ from .languages import get_all_languages
      "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
      "if you are running out of GPU memory. Only relevant if the model is generative.",
  )
- @click.option(
-     "--debug/--no-debug",
-     default=False,
-     show_default=True,
-     help="Whether to run the benchmark in debug mode. This prints out extra "
-     "information and stores all outputs to the current working directory. Only "
-     "relevant if the model is generative.",
- )
  @click.option(
      "--requires-safetensors",
      is_flag=True,
@@ -232,15 +197,47 @@ from .languages import get_all_languages
      help="Only download the requested model weights and datasets, and exit.",
      default=False,
  )
+ @click.option(
+     "--debug/--no-debug",
+     default=False,
+     show_default=True,
+     help="Whether to run the benchmark in debug mode. This prints out extra "
+     "information and stores all outputs to the current working directory. Only "
+     "relevant if the model is generative.",
+ )
+ @click.option(
+     "--model-language",
+     "-ml",
+     default=None,
+     show_default=True,
+     multiple=True,
+     metavar="ISO 639-1 LANGUAGE CODE",
+     type=click.Choice(["all"] + list(get_all_languages().keys())),
+     help="""This option is deprecated - please use --language instead.""",
+ )
+ @click.option(
+     "--dataset-language",
+     "-dl",
+     default=None,
+     show_default=True,
+     multiple=True,
+     metavar="ISO 639-1 LANGUAGE CODE",
+     type=click.Choice(["all"] + list(get_all_languages().keys())),
+     help="""This option is deprecated - please use --language instead.""",
+ )
+ @click.option(
+     "--batch-size",
+     default=None,
+     type=click.Choice(["1", "2", "4", "8", "16", "32"]),
+     help="This option is deprecated - please use --finetuning-batch-size instead.",
+     deprecated=True,
+ )
  def benchmark(
      model: tuple[str],
      dataset: tuple[str | DatasetConfig],
      language: tuple[str],
-     model_language: tuple[str],
-     dataset_language: tuple[str],
      raise_errors: bool,
      task: tuple[str],
-     batch_size: str | None,
      finetuning_batch_size: str,
      progress_bar: bool,
      save_results: bool,
@@ -257,11 +254,14 @@ def benchmark(
      api_base: str | None,
      api_version: str | None,
      gpu_memory_utilization: float,
-     debug: bool,
      requires_safetensors: bool,
      generative_type: str | None,
      custom_datasets_file: Path,
      download_only: bool,
+     debug: bool,
+     model_language: tuple[str],
+     dataset_language: tuple[str],
+     batch_size: str | None,
  ) -> None:
      """Benchmark pretrained language models on language tasks."""
      Benchmarker(
scandeval/constants.py CHANGED
@@ -96,3 +96,12 @@ NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10

  # We only allow loading local datasets in these file formats
  SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS = ["csv"]
+
+ # These are default generation parameters, and can be overridden if a generative model
+ # has a `generation_config.json` file in its repository
+ GENERATION_KWARGS = {
+     "temperature": 0.0,
+     "top_p": 1.0,
+     "top_k": 0,
+     "repetition_penalty": 1.0,
+ }
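These defaults only change when a model repository ships its own generation configuration; the vllm.py hunks above load it with GenerationConfig.from_pretrained and apply only the parameters it explicitly overrides. A minimal sketch of that flow, using a placeholder repository ID:

    from transformers import GenerationConfig

    generation_kwargs = GENERATION_KWARGS.copy()
    try:
        # Placeholder repo ID; only repos with a generation_config.json resolve here.
        generation_config = GenerationConfig.from_pretrained("my-org/my-model")
    except OSError:
        generation_config = None

    if generation_config is not None:
        # to_diff_dict() only reports values that differ from the library defaults.
        changed = generation_config.to_diff_dict()
        for param in ("temperature", "top_p", "top_k", "repetition_penalty"):
            if param in changed:
                generation_kwargs[param] = changed[param]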
scandeval/data_models.py CHANGED
@@ -10,6 +10,7 @@ from pathlib import Path

  import pydantic
  import torch
+ from transformers.generation.configuration_utils import GenerationConfig

  from .enums import Device, GenerativeType, ModelType, TaskGroup
  from .exceptions import InvalidBenchmark
@@ -709,6 +710,9 @@ class ModelConfig:
      adapter_base_model_id:
          The model ID of the base model if the model is an adapter model. Can be None
          if the model is not an adapter model.
+     generation_config (optional):
+         The generation configuration for generative models, if specified in the
+         model repository. Defaults to no generation configuration.
      """

      model_id: str
@@ -722,6 +726,7 @@ class ModelConfig:
      fresh: bool
      model_cache_dir: str
      adapter_base_model_id: str | None
+     generation_config: GenerationConfig | None = None

      def __hash__(self) -> int:
          """Return a hash of the model configuration."""
scandeval/dataset_configs/__init__.py CHANGED
@@ -6,6 +6,7 @@ from ..data_models import DatasetConfig
  from ..languages import get_all_languages
  from ..tasks import SPEED
  from ..utils import load_custom_datasets_module
+ from .albanian import *  # noqa: F403
  from .bosnian import *  # noqa: F403
  from .bulgarian import *  # noqa: F403
  from .catalan import *  # noqa: F403
scandeval/dataset_configs/albanian.py ADDED
@@ -0,0 +1,64 @@
+ """All Albanian dataset configurations used in EuroEval."""
+
+ from ..data_models import DatasetConfig
+ from ..languages import ALBANIAN
+ from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+
+ ### Official datasets ###
+
+ MMS_SQ_CONFIG = DatasetConfig(
+     name="mms-sq",
+     pretty_name="MMS-sq",
+     source="EuroEval/mms-sq-mini",
+     task=SENT,
+     languages=[ALBANIAN],
+ )
+
+ SCALA_SQ_CONFIG = DatasetConfig(
+     name="scala-sq",
+     pretty_name="ScaLA-sq",
+     source="EuroEval/scala-sq",
+     task=LA,
+     languages=[ALBANIAN],
+ )
+
+ WIKIANN_SQ_CONFIG = DatasetConfig(
+     name="wikiann-sq",
+     pretty_name="WikiANN-sq",
+     source="EuroEval/wikiann-sq-mini",
+     task=NER,
+     languages=[ALBANIAN],
+ )
+
+ MULTI_WIKI_QA_SQ_CONFIG = DatasetConfig(
+     name="multi-wiki-qa-sq",
+     pretty_name="MultiWikiQA-sq",
+     source="EuroEval/multi-wiki-qa-sq-mini",
+     task=RC,
+     languages=[ALBANIAN],
+ )
+
+ LR_SUM_SQ_CONFIG = DatasetConfig(
+     name="lr-sum-sq",
+     pretty_name="LRSum-sq",
+     source="EuroEval/lr-sum-sq-mini",
+     task=SUMM,
+     languages=[ALBANIAN],
+ )
+
+ GLOBAL_MMLU_LITE_SQ_CONFIG = DatasetConfig(
+     name="global-mmlu-lite-sq",
+     pretty_name="GlobalMMLULite-sq",
+     source="EuroEval/global-mmlu-lite-sq",
+     task=KNOW,
+     languages=[ALBANIAN],
+ )
+
+ WINOGRANDE_SQ_CONFIG = DatasetConfig(
+     name="winogrande-sq",
+     pretty_name="Winogrande-sq",
+     source="EuroEval/winogrande-sq",
+     task=COMMON_SENSE,
+     languages=[ALBANIAN],
+     _labels=["a", "b"],
+ )
scandeval/dataset_configs/dutch.py CHANGED
@@ -2,7 +2,18 @@

  from ..data_models import DatasetConfig
  from ..languages import DUTCH
- from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+ from ..tasks import (
+     COMMON_SENSE,
+     EUROPEAN_VALUES,
+     KNOW,
+     LA,
+     MCRC,
+     NER,
+     RC,
+     SENT,
+     SIMPL,
+     SUMM,
+ )

  ### Official datasets ###

@@ -122,6 +133,16 @@ MULTI_WIKI_QA_NL_CONFIG = DatasetConfig(
      unofficial=True,
  )

+ COPA_NL_CONFIG = DatasetConfig(
+     name="copa-nl",
+     pretty_name="COPA-nl",
+     source="EuroEval/copa-nl",
+     task=COMMON_SENSE,
+     languages=[DUTCH],
+     unofficial=True,
+     _labels=["a", "b"],
+ )
+
  GOLDENSWAG_NL_CONFIG = DatasetConfig(
      name="goldenswag-nl",
      pretty_name="GoldenSwag-nl",
@@ -140,3 +161,12 @@ WINOGRANDE_NL_CONFIG = DatasetConfig(
      _labels=["a", "b"],
      unofficial=True,
  )
+
+ DUIDELIJKE_TAAL_NL_CONFIG = DatasetConfig(
+     name="duidelijke-taal",
+     pretty_name="Duidelijke Taal",
+     source="EuroEval/duidelijke-taal",
+     task=SIMPL,
+     languages=[DUTCH],
+     unofficial=True,
+ )
scandeval/dataset_configs/swedish.py CHANGED
@@ -139,3 +139,12 @@ SKOLPROV_CONFIG = DatasetConfig(
      languages=[SWEDISH],
      unofficial=True,
  )
+
+ SWEDISH_FACTS_CONFIG = DatasetConfig(
+     name="swedish-facts",
+     pretty_name="Swedish Facts",
+     source="EuroEval/swedish-facts",
+     task=KNOW,
+     languages=[SWEDISH],
+     unofficial=True,
+ )
scandeval/logging_utils.py CHANGED
@@ -140,6 +140,7 @@ def block_terminal_output() -> None:
      logging.getLogger("openai").setLevel(logging.CRITICAL)
      logging.getLogger("httpx").setLevel(logging.CRITICAL)
      litellm.suppress_debug_info = True  # type: ignore[bad-assignment]
+     litellm.turn_off_message_logging = True

      # Disable vLLM logging
      logging.getLogger("vllm").setLevel(logging.CRITICAL)