PyPI - ScandEval - Versions diffs - 16.9.0__py3-none-any.whl → 16.10.1__py3-none-any.whl - Mend

ScandEval 16.9.0-py3-none-any.whl → 16.10.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

scandeval/benchmark_modules/litellm.py +14 -13
scandeval/benchmark_modules/vllm.py +115 -3
scandeval/cli.py +39 -39
scandeval/constants.py +9 -0
scandeval/data_models.py +5 -0
scandeval/dataset_configs/__init__.py +1 -0
scandeval/dataset_configs/albanian.py +64 -0
scandeval/dataset_configs/dutch.py +30 -1
scandeval/dataset_configs/norwegian.py +3 -3
scandeval/logging_utils.py +1 -0
scandeval/metrics/huggingface.py +82 -0
scandeval/prompt_templates/__init__.py +1 -0
scandeval/prompt_templates/linguistic_acceptability.py +9 -0
scandeval/prompt_templates/multiple_choice.py +9 -0
scandeval/prompt_templates/named_entity_recognition.py +20 -0
scandeval/prompt_templates/reading_comprehension.py +9 -0
scandeval/prompt_templates/sentiment_classification.py +11 -0
scandeval/prompt_templates/simplification.py +23 -0
scandeval/prompt_templates/summarization.py +11 -0
scandeval/tasks.py +11 -0
scandeval/utils.py +5 -6
{scandeval-16.9.0.dist-info → scandeval-16.10.1.dist-info}/METADATA +18 -1
{scandeval-16.9.0.dist-info → scandeval-16.10.1.dist-info}/RECORD +26 -24
{scandeval-16.9.0.dist-info → scandeval-16.10.1.dist-info}/WHEEL +0 -0
{scandeval-16.9.0.dist-info → scandeval-16.10.1.dist-info}/entry_points.txt +0 -0
{scandeval-16.9.0.dist-info → scandeval-16.10.1.dist-info}/licenses/LICENSE +0 -0

scandeval/benchmark_modules/litellm.py CHANGED Viewed

@@ -110,7 +110,7 @@ VOCAB_SIZE_MAPPING = {
     # Anthropic models
     r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
     # Gemini models
-    r"(gemini/)?gemini-[1-9]\.[0-9]-(flash|pro).*": 256_128,
+    r"(gemini/)?gemini-[1-9](\.[0-9])?-(flash|pro).*": 256_128,
     # xAI models
     r"(xai/)?grok.*": -1,
 }
@@ -136,7 +136,7 @@ MODEL_MAX_LENGTH_MAPPING = {
     # Gemini models
     r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
     r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
-    r"(gemini/)?gemini-2\.(0|5).*": 1_048_576,
+    r"(gemini/)?gemini-[23](\.[05])?.*": 1_048_576,
     # xAI models
     r"(xai/)?grok.*": 131_072,
 }
@@ -152,7 +152,7 @@ NUM_PARAMS_MAPPING = {
     # Gemini models
     r"(gemini/)?gemini-1.5-flash-8b": 8_000_000_000,
     r"(gemini/)?gemini-1.5-flash-[0-9]+": -1,
-    r"(gemini/)?gemini-2.(0|5).*": -1,
+    r"(gemini/)?gemini-[23](.[05])?.*": -1,
     # xAI models
     r"(xai/)?grok.*": -1,
 }
@@ -208,8 +208,8 @@ class LiteLLMModel(BenchmarkModule):
             "thinking",
         ],
         # Gemini models
-        re.compile(r"(gemini/)?gemini-2.5-flash-lite.*"): ["no-thinking", "thinking"],
-        re.compile(r"(gemini/)?gemini-2.5-flash.*"): ["no-thinking", "thinking"],
+        re.compile(r"(gemini/)?gemini-2\.5-flash-lite.*"): ["no-thinking", "thinking"],
+        re.compile(r"(gemini/)?gemini-(2\.5|3)-flash.*"): ["no-thinking", "thinking"],
         # xAI models
         re.compile(r"(xai/)?grok-3-mini(-fast)?(-beta)?"): ["low", "medium", "high"],
     }
@@ -517,6 +517,7 @@ class LiteLLMModel(BenchmarkModule):
         response_format_messages = [
             "got an unexpected keyword argument 'response_format'",
             "the model returned empty outputs",
+            "'maxitems' is not supported",
         ]
         if (
@@ -838,14 +839,14 @@ class LiteLLMModel(BenchmarkModule):
         ]
         # Close connections
-        for request in requests:
-            if hasattr(request, "close"):
-                try:
-                    request.close()
-                except RuntimeError as e:
-                    log(
-                        f"RuntimeError during request.close(): {e}", level=logging.DEBUG
-                    )
+        semaphore.release()
+        router.reset()
+        try:
+            loop = asyncio.get_event_loop()
+            if not loop.is_closed():
+                loop.close()
+        except RuntimeError:
+            pass  # Already closed
         return successes, failures

scandeval/benchmark_modules/vllm.py CHANGED Viewed

@@ -15,12 +15,14 @@ from time import sleep
 import torch
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
+from transformers.generation.configuration_utils import GenerationConfig
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 from urllib3.exceptions import RequestError
 from ..constants import (
     CUSTOM_STOP_TOKENS,
+    GENERATION_KWARGS,
     GENERATIVE_PIPELINE_TAGS,
     MAX_CONTEXT_LENGTH,
     MAX_VLLM_LOGPROBS,
@@ -98,6 +100,10 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
         StructuredOutputsParams,
     )
+if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
+    import ray  # type: ignore[missing-import]
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
     from transformers.trainer import Trainer
@@ -485,6 +491,41 @@ class VLLMModel(HuggingFaceEncoderModel):
             )
         # Define the parameters used for vLLM generation
+        generation_kwargs = GENERATION_KWARGS.copy()
+        if (generation_config := self.model_config.generation_config) is not None:
+            changed_params = generation_config.to_diff_dict()
+            if "temperature" in changed_params:
+                temperature = changed_params["temperature"]
+                generation_kwargs["temperature"] = temperature
+                log_once(
+                    f"Using temperature={temperature} with the model "
+                    f"{self.model_config.model_id!r} as specified in its "
+                    "generation configuration."
+                )
+            if "top_p" in changed_params:
+                top_p = changed_params["top_p"]
+                generation_kwargs["top_p"] = top_p
+                log_once(
+                    f"Using top_p={top_p} with the model "
+                    f"{self.model_config.model_id!r} as specified in its "
+                    "generation configuration."
+                )
+            if "top_k" in changed_params:
+                top_k = changed_params["top_k"]
+                generation_kwargs["top_k"] = top_k
+                log_once(
+                    f"Using top_k={top_k} with the model "
+                    f"{self.model_config.model_id!r} as specified in its "
+                    "generation configuration."
+                )
+            if "repetition_penalty" in changed_params:
+                repetition_penalty = changed_params["repetition_penalty"]
+                generation_kwargs["repetition_penalty"] = repetition_penalty
+                log_once(
+                    f"Using repetition_penalty={repetition_penalty} with the model "
+                    f"{self.model_config.model_id!r} as specified in its "
+                    "generation configuration."
+                )
         max_tokens: int = (
             REASONING_MAX_TOKENS
             if self.generative_type == GenerativeType.REASONING
@@ -495,7 +536,10 @@ class VLLMModel(HuggingFaceEncoderModel):
             logprobs=MAX_VLLM_LOGPROBS
             if self.buffer["first_label_token_mapping"]
             else None,
-            temperature=0.0,
+            temperature=generation_kwargs["temperature"],
+            top_p=generation_kwargs["top_p"],
+            top_k=generation_kwargs["top_k"],
+            repetition_penalty=generation_kwargs["repetition_penalty"],
             stop=[stop_token for stop_token in stop_tokens if stop_token],
             structured_outputs=structured_outputs,
         )
@@ -769,6 +813,16 @@ class VLLMModel(HuggingFaceEncoderModel):
         if model_info is None:
             raise InvalidModel(f"The model {model_id!r} could not be found.")
+        try:
+            generation_config = GenerationConfig.from_pretrained(
+                pretrained_model_name=model_id_components.model_id,
+                revision=model_id_components.revision,
+                cache_dir=benchmark_config.cache_dir,
+                token=benchmark_config.api_key,
+            )
+        except OSError:
+            generation_config = None
         language_mapping = get_all_languages()
         language_codes = list(language_mapping.keys())
@@ -790,6 +844,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 cache_dir=benchmark_config.cache_dir, model_id=model_id
             ),
             adapter_base_model_id=model_info.adapter_base_model_id,
+            generation_config=generation_config,
         )
         return model_config
@@ -957,6 +1012,10 @@ def load_model_and_tokeniser(
     clear_vllm()
+    distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size = (
+        select_backend_and_parallelism()
+    )
     try:
         model = LLM(
             model=(
@@ -975,8 +1034,9 @@ def load_model_and_tokeniser(
             trust_remote_code=benchmark_config.trust_remote_code,
             revision=revision,
             seed=4242,
-            distributed_executor_backend="mp",
-            tensor_parallel_size=torch.cuda.device_count(),
+            distributed_executor_backend=distributed_executor_backend,
+            tensor_parallel_size=tensor_parallel_size,
+            pipeline_parallel_size=pipeline_parallel_size,
             disable_custom_all_reduce=True,
             quantization=quantization,
             dtype=dtype,
@@ -1379,3 +1439,55 @@ def get_vllm_tokenisation_params(
         config_format=config_format,
         load_format=load_format,
     )
+def select_backend_and_parallelism() -> tuple[str, int, int]:
+    """Determine the distributed backend and parallelism for vLLM.
+    If the optional `ray` package is not installed, or no Ray cluster spanning more
+    GPUs than the local node can be reached, the multiprocessing backend is used.
+    Returns:
+        Tuple containing:
+        - backend (str): "ray" if multi-node Ray is available, else "mp".
+        - tensor_parallel_size (int): Number of GPUs per node (at least 1).
+        - pipeline_parallel_size (int): Number of stages across nodes.
+    """
+    # `ray` is only imported at module level when the package is installed, so it
+    # must not be referenced unconditionally (that would raise a NameError)
+    ray_available = importlib.util.find_spec("ray") is not None
+    if ray_available and not ray.is_initialized():
+        try:
+            ray.init(address="auto", ignore_reinit_error=True)
+        except Exception as e:
+            # Not finding a running cluster is the expected single-node case, so we
+            # only log other kinds of failures
+            if "could not find any running ray instance" not in str(e).lower():
+                log_once(
+                    f"Ray initialisation failed with a {type(e)} exception: {e}",
+                    level=logging.DEBUG,
+                )
+    is_ray = ray_available and ray.is_initialized()
+    local_gpu_count = torch.cuda.device_count()
+    if is_ray:
+        resources = ray.cluster_resources()
+        total_gpus = int(resources.get("GPU", 0))
+    else:
+        total_gpus = local_gpu_count
+    # Always use at least one worker per node, also on CPU-only machines
+    tensor_parallel_size = max(1, local_gpu_count)
+    using_multiple_nodes = total_gpus > local_gpu_count
+    if is_ray and using_multiple_nodes:
+        distributed_executor_backend = "ray"
+        pipeline_parallel_size = max(1, total_gpus // tensor_parallel_size)
+        log_once(
+            f"Detected a multi-node setup with {pipeline_parallel_size:,} nodes, each "
+            f"with {tensor_parallel_size:,} GPUs, so using `ray` as the "
+            "distributed backend.",
+            level=logging.DEBUG,
+        )
+    else:
+        distributed_executor_backend = "mp"
+        pipeline_parallel_size = 1
+        log_once(
+            f"Detected a single-node setup with {tensor_parallel_size:,} GPUs, "
+            "so using the multiprocessing distributed backend.",
+            level=logging.DEBUG,
+        )
+    return distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size

scandeval/cli.py CHANGED Viewed

@@ -37,26 +37,6 @@ from .languages import get_all_languages
     help="""The languages to benchmark, both for models and datasets. If "all" then all
     models will be benchmarked on all datasets.""",
 )
-@click.option(
-    "--model-language",
-    "-ml",
-    default=None,
-    show_default=True,
-    multiple=True,
-    metavar="ISO 639-1 LANGUAGE CODE",
-    type=click.Choice(["all"] + list(get_all_languages().keys())),
-    help="""This option is deprecated - please use --language instead.""",
-)
-@click.option(
-    "--dataset-language",
-    "-dl",
-    default=None,
-    show_default=True,
-    multiple=True,
-    metavar="ISO 639-1 LANGUAGE CODE",
-    type=click.Choice(["all"] + list(get_all_languages().keys())),
-    help="""This option is deprecated - please use --language instead.""",
-)
 @click.option(
     "--dataset",
     default=None,
@@ -65,13 +45,6 @@ from .languages import get_all_languages
     help="""The name of the benchmark dataset. We recommend to use the `task` and
     `language` options instead of this option.""",
 )
-@click.option(
-    "--batch-size",
-    default=None,
-    type=click.Choice(["1", "2", "4", "8", "16", "32"]),
-    help="This option is deprecated - please use --finetuning-batch-size instead.",
-    deprecated=True,
-)
 @click.option(
     "--finetuning-batch-size",
     default="32",
@@ -197,14 +170,6 @@ from .languages import get_all_languages
     "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
     "if you are running out of GPU memory. Only relevant if the model is generative.",
 )
-@click.option(
-    "--debug/--no-debug",
-    default=False,
-    show_default=True,
-    help="Whether to run the benchmark in debug mode. This prints out extra "
-    "information and stores all outputs to the current working directory. Only "
-    "relevant if the model is generative.",
-)
 @click.option(
     "--requires-safetensors",
     is_flag=True,
@@ -232,15 +197,47 @@ from .languages import get_all_languages
     help="Only download the requested model weights and datasets, and exit.",
     default=False,
 )
+@click.option(
+    "--debug/--no-debug",
+    default=False,
+    show_default=True,
+    help="Whether to run the benchmark in debug mode. This prints out extra "
+    "information and stores all outputs to the current working directory. Only "
+    "relevant if the model is generative.",
+)
+@click.option(
+    "--model-language",
+    "-ml",
+    default=None,
+    show_default=True,
+    multiple=True,
+    metavar="ISO 639-1 LANGUAGE CODE",
+    type=click.Choice(["all"] + list(get_all_languages().keys())),
+    help="""This option is deprecated - please use --language instead.""",
+)
+@click.option(
+    "--dataset-language",
+    "-dl",
+    default=None,
+    show_default=True,
+    multiple=True,
+    metavar="ISO 639-1 LANGUAGE CODE",
+    type=click.Choice(["all"] + list(get_all_languages().keys())),
+    help="""This option is deprecated - please use --language instead.""",
+)
+@click.option(
+    "--batch-size",
+    default=None,
+    type=click.Choice(["1", "2", "4", "8", "16", "32"]),
+    help="This option is deprecated - please use --finetuning-batch-size instead.",
+    deprecated=True,
+)
 def benchmark(
     model: tuple[str],
     dataset: tuple[str | DatasetConfig],
     language: tuple[str],
-    model_language: tuple[str],
-    dataset_language: tuple[str],
     raise_errors: bool,
     task: tuple[str],
-    batch_size: str | None,
     finetuning_batch_size: str,
     progress_bar: bool,
     save_results: bool,
@@ -257,11 +254,14 @@ def benchmark(
     api_base: str | None,
     api_version: str | None,
     gpu_memory_utilization: float,
-    debug: bool,
     requires_safetensors: bool,
     generative_type: str | None,
     custom_datasets_file: Path,
     download_only: bool,
+    debug: bool,
+    model_language: tuple[str],
+    dataset_language: tuple[str],
+    batch_size: str | None,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
     Benchmarker(

scandeval/constants.py CHANGED Viewed

@@ -96,3 +96,12 @@ NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
 # We only allow loading local datasets in these file formats
 SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS = ["csv"]
+# These are default generation parameters, and can be overridden if a generative model
+# has a `generation_config.json` file in its repository
+GENERATION_KWARGS = {
+    "temperature": 0.0,
+    "top_p": 1.0,
+    "top_k": 0,
+    "repetition_penalty": 1.0,
+}

scandeval/data_models.py CHANGED Viewed

@@ -10,6 +10,7 @@ from pathlib import Path
 import pydantic
 import torch
+from transformers.generation.configuration_utils import GenerationConfig
 from .enums import Device, GenerativeType, ModelType, TaskGroup
 from .exceptions import InvalidBenchmark
@@ -709,6 +710,9 @@ class ModelConfig:
         adapter_base_model_id:
             The model ID of the base model if the model is an adapter model. Can be None
             if the model is not an adapter model.
+        generation_config (optional):
+            The generation configuration for generative models, if specified in the
+            model repository. Defaults to no generation configuration.
     """
     model_id: str
@@ -722,6 +726,7 @@ class ModelConfig:
     fresh: bool
     model_cache_dir: str
     adapter_base_model_id: str | None
+    generation_config: GenerationConfig | None = None
     def __hash__(self) -> int:
         """Return a hash of the model configuration."""

scandeval/dataset_configs/__init__.py CHANGED Viewed

@@ -6,6 +6,7 @@ from ..data_models import DatasetConfig
 from ..languages import get_all_languages
 from ..tasks import SPEED
 from ..utils import load_custom_datasets_module
+from .albanian import *  # noqa: F403
 from .bosnian import *  # noqa: F403
 from .bulgarian import *  # noqa: F403
 from .catalan import *  # noqa: F403

scandeval/dataset_configs/albanian.py ADDED Viewed

@@ -0,0 +1,64 @@
+"""All Albanian dataset configurations used in EuroEval."""
+from ..data_models import DatasetConfig
+from ..languages import ALBANIAN
+from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+### Official datasets ###
+MMS_SQ_CONFIG = DatasetConfig(
+    name="mms-sq",
+    pretty_name="MMS-sq",
+    source="EuroEval/mms-sq-mini",
+    task=SENT,
+    languages=[ALBANIAN],
+)
+SCALA_SQ_CONFIG = DatasetConfig(
+    name="scala-sq",
+    pretty_name="ScaLA-sq",
+    source="EuroEval/scala-sq",
+    task=LA,
+    languages=[ALBANIAN],
+)
+WIKIANN_SQ_CONFIG = DatasetConfig(
+    name="wikiann-sq",
+    pretty_name="WikiANN-sq",
+    source="EuroEval/wikiann-sq-mini",
+    task=NER,
+    languages=[ALBANIAN],
+)
+MULTI_WIKI_QA_SQ_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-sq",
+    pretty_name="MultiWikiQA-sq",
+    source="EuroEval/multi-wiki-qa-sq-mini",
+    task=RC,
+    languages=[ALBANIAN],
+)
+LR_SUM_SQ_CONFIG = DatasetConfig(
+    name="lr-sum-sq",
+    pretty_name="LRSum-sq",
+    source="EuroEval/lr-sum-sq-mini",
+    task=SUMM,
+    languages=[ALBANIAN],
+)
+GLOBAL_MMLU_LITE_SQ_CONFIG = DatasetConfig(
+    name="global-mmlu-lite-sq",
+    pretty_name="GlobalMMLULite-sq",
+    source="EuroEval/global-mmlu-lite-sq",
+    task=KNOW,
+    languages=[ALBANIAN],
+)
+WINOGRANDE_SQ_CONFIG = DatasetConfig(
+    name="winogrande-sq",
+    pretty_name="Winogrande-sq",
+    source="EuroEval/winogrande-sq",
+    task=COMMON_SENSE,
+    languages=[ALBANIAN],
+    _labels=["a", "b"],
+)

scandeval/dataset_configs/dutch.py CHANGED Viewed

@@ -2,7 +2,18 @@
 from ..data_models import DatasetConfig
 from ..languages import DUTCH
-from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+from ..tasks import (
+    COMMON_SENSE,
+    EUROPEAN_VALUES,
+    KNOW,
+    LA,
+    MCRC,
+    NER,
+    RC,
+    SENT,
+    SIMPL,
+    SUMM,
+)
 ### Official datasets ###
@@ -63,6 +74,14 @@ HELLASWAG_NL_CONFIG = DatasetConfig(
     languages=[DUTCH],
 )
+DUIDELIJKE_TAAL_NL_CONFIG = DatasetConfig(
+    name="duidelijke-taal",
+    pretty_name="Duidelijke Taal",
+    source="EuroEval/duidelijke-taal",
+    task=SIMPL,
+    languages=[DUTCH],
+)
 VALEU_NL_CONFIG = DatasetConfig(
     name="valeu-nl",
     pretty_name="VaLEU-nl",
@@ -122,6 +141,16 @@ MULTI_WIKI_QA_NL_CONFIG = DatasetConfig(
     unofficial=True,
 )
+COPA_NL_CONFIG = DatasetConfig(
+    name="copa-nl",
+    pretty_name="COPA-nl",
+    source="EuroEval/copa-nl",
+    task=COMMON_SENSE,
+    languages=[DUTCH],
+    unofficial=True,
+    _labels=["a", "b"],
+)
 GOLDENSWAG_NL_CONFIG = DatasetConfig(
     name="goldenswag-nl",
     pretty_name="GoldenSwag-nl",

scandeval/dataset_configs/norwegian.py CHANGED Viewed

@@ -27,7 +27,7 @@ SCALA_NN_CONFIG = DatasetConfig(
     pretty_name="ScaLA-nn",
     source="EuroEval/scala-nn",
     task=LA,
-    languages=[NORWEGIAN_NYNORSK],
+    languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
 )
 NORNE_NB_CONFIG = DatasetConfig(
@@ -43,7 +43,7 @@ NORNE_NN_CONFIG = DatasetConfig(
     pretty_name="NorNE-nn",
     source="EuroEval/norne-nn-mini",
     task=NER,
-    languages=[NORWEGIAN_NYNORSK],
+    languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
 )
 NORQUAD_CONFIG = DatasetConfig(
@@ -197,7 +197,7 @@ MULTI_WIKI_QA_NN_CONFIG = DatasetConfig(
     pretty_name="MultiWikiQA-nn",
     source="EuroEval/multi-wiki-qa-nn-mini",
     task=RC,
-    languages=[NORWEGIAN_NYNORSK],
+    languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
     unofficial=True,
 )

scandeval/logging_utils.py CHANGED Viewed

@@ -140,6 +140,7 @@ def block_terminal_output() -> None:
     logging.getLogger("openai").setLevel(logging.CRITICAL)
     logging.getLogger("httpx").setLevel(logging.CRITICAL)
     litellm.suppress_debug_info = True  # type: ignore[bad-assignment]
+    litellm.turn_off_message_logging = True
     # Disable vLLM logging
     logging.getLogger("vllm").setLevel(logging.CRITICAL)

scandeval/metrics/huggingface.py CHANGED Viewed

@@ -8,6 +8,7 @@ import evaluate
 import numpy as np
 from datasets import DownloadConfig, DownloadMode
+from ..exceptions import InvalidBenchmark
 from ..logging_utils import no_terminal_output
 from .base import Metric
@@ -149,6 +150,75 @@ class HuggingFaceMetric(Metric):
         return score
+class SourceBasedMetric(HuggingFaceMetric):
+    """Subclass of HuggingfaceMetric for metrics also requiring source text as input."""
+    def __call__(
+        self,
+        predictions: c.Sequence,
+        references: c.Sequence,
+        dataset: "Dataset",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
+    ) -> float | None:
+        """Calculate metric score for metrics requiring original source text.
+        Passes the source text to the evaluate function via its `sources` param.
+        Args:
+            predictions:
+                The model predictions.
+            references:
+                The ground truth references.
+            dataset:
+                The dataset used for evaluation. This is used for collecting the source
+                text and in case any additional metadata is used to compute the metrics.
+            dataset_config:
+                The dataset configuration.
+            benchmark_config:
+                The benchmark configuration.
+        Returns:
+            The calculated metric score, or None if the score should be ignored.
+        Raises:
+            InvalidBenchmark:
+                If no dataset is passed, or if the number of source texts differs
+                from the number of predictions.
+        """
+        if dataset is None:
+            raise InvalidBenchmark("SourceBasedMetric requires `dataset` to be passed.")
+        # Load the metric lazily, the first time it is needed
+        if self.metric is None:
+            self.metric = evaluate.load(path=self.huggingface_id)
+        # The source texts are read from the `text` column of the dataset
+        sources = dataset["text"]
+        if len(sources) != len(predictions):
+            raise InvalidBenchmark(
+                "SourceBasedMetric expects same number of inputs as predictions. "
+                f"Got {len(sources)} sources and {len(predictions)} predictions "
+                "instead."
+            )
+        with no_terminal_output(disable=benchmark_config.verbose):
+            results = self.metric.compute(
+                sources=sources,
+                predictions=predictions,
+                # Each prediction gets a list of references, here with a single entry
+                references=[[r] for r in references],
+                **self.compute_kwargs,
+            )
+        # The metric returns None if we are running on multi-GPU and the current
+        # process is not the main process
+        if results is None:
+            return None
+        # Convert the results to a float score
+        score = results[self.results_key]
+        if isinstance(score, list):
+            score = sum(score) / len(score)
+        if isinstance(score, np.floating):
+            score = float(score)
+        return score
 mcc_metric = HuggingFaceMetric(
     name="mcc",
     pretty_name="Matthew's Correlation Coefficient",
@@ -214,3 +284,15 @@ accuracy_metric = HuggingFaceMetric(
     huggingface_id="accuracy",
     results_key="accuracy",
 )
+meteor_metric = HuggingFaceMetric(
+    name="meteor", pretty_name="METEOR", huggingface_id="meteor", results_key="meteor"
+)
+sari_metric = SourceBasedMetric(
+    name="sari",
+    pretty_name="SARI",
+    huggingface_id="sari",
+    results_key="sari",
+    postprocessing_fn=lambda x: (x, f"{x:.2f}%"),
+)

scandeval/prompt_templates/__init__.py CHANGED Viewed

@@ -6,5 +6,6 @@ from .multiple_choice import MULTIPLE_CHOICE_TEMPLATES
 from .named_entity_recognition import NER_TEMPLATES
 from .reading_comprehension import RC_TEMPLATES
 from .sentiment_classification import SENT_TEMPLATES
+from .simplification import SIMPL_TEMPLATES
 from .summarization import SUMM_TEMPLATES
 from .token_classification import TOKEN_CLASSIFICATION_TEMPLATES

scandeval/prompt_templates/linguistic_acceptability.py CHANGED Viewed

@@ -4,6 +4,7 @@ import typing as t
 from ..data_models import PromptConfig
 from ..languages import (
+    ALBANIAN,
     BULGARIAN,
     CATALAN,
     CROATIAN,
@@ -40,6 +41,14 @@ if t.TYPE_CHECKING:
     from ..languages import Language
 LA_TEMPLATES: dict["Language", PromptConfig] = {
+    ALBANIAN: PromptConfig(
+        default_prompt_label_mapping=dict(correct="po", incorrect="jo"),
+        default_prompt_prefix="Më poshtë janë fjali dhe nëse janë gramatikisht të "
+        "sakta.",
+        default_prompt_template="Fjali: {text}\nGramatikisht e saktë: {label}",
+        default_instruction_prompt="Fjali: {text}\n\nPërcaktoni nëse fjalia është "
+        "gramatikisht e saktë apo jo. Përgjigjuni me {labels_str}, dhe asgjë tjetër.",
+    ),
     BULGARIAN: PromptConfig(
         default_prompt_label_mapping=dict(correct="да", incorrect="не"),
         default_prompt_prefix="Следват изречения и дали са граматически правилни.",

scandeval/prompt_templates/multiple_choice.py CHANGED Viewed

@@ -4,6 +4,7 @@ import typing as t
 from ..data_models import PromptConfig
 from ..languages import (
+    ALBANIAN,
     BULGARIAN,
     CATALAN,
     CROATIAN,
@@ -40,6 +41,14 @@ if t.TYPE_CHECKING:
 # TODO: Missing Faroese
 MULTIPLE_CHOICE_TEMPLATES: dict["Language", PromptConfig] = {
+    ALBANIAN: PromptConfig(
+        default_prompt_prefix="Më poshtë janë pyetje me zgjedhje të shumëfishtë "
+        "(me përgjigje).",
+        default_prompt_template="Pyetje: {text}\nPërgjigje: {label}",
+        default_instruction_prompt="Pyetje: {text}\n\nPërgjigjuni pyetjes së "
+        "mësipërme duke u përgjigjur me {labels_str}, dhe asgjë tjetër.",
+        default_prompt_label_mapping="auto",
+    ),
     BULGARIAN: PromptConfig(
         default_prompt_prefix="Следват въпроси с множествен избор (с отговори).",
         default_prompt_template="Въпрос: {text}\nОтговор: {label}",

scandeval/prompt_templates/named_entity_recognition.py CHANGED Viewed

@@ -4,6 +4,7 @@ import typing as t
 from ..data_models import PromptConfig
 from ..languages import (
+    ALBANIAN,
     BOSNIAN,
     BULGARIAN,
     CATALAN,
@@ -42,6 +43,25 @@ if t.TYPE_CHECKING:
 NER_TEMPLATES: dict["Language", PromptConfig] = {
+    ALBANIAN: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "person",
+            "i-per": "person",
+            "b-loc": "vendndodhje",
+            "i-loc": "vendndodhje",
+            "b-org": "organizatë",
+            "i-org": "organizatë",
+            "b-misc": "të ndryshme",
+            "i-misc": "të ndryshme",
+        },
+        default_prompt_prefix="Më poshtë janë fjali dhe fjalorë JSON me entitetet e "
+        "emërtuara që shfaqen në fjalinë e dhënë.",
+        default_prompt_template="Fjali: {text}\nEntitete të emërtuara: {label}",
+        default_instruction_prompt="Fjali: {text}\n\nIdentifikoni entitetet e "
+        "emërtuara në fjali. Duhet t’i jepni ato si një fjalor JSON me çelësat "
+        "{labels_str}. Vlerat duhet të jenë lista të entiteteve të emërtuara të atij "
+        "lloji, saktësisht ashtu siç shfaqen në fjali.",
+    ),
     BOSNIAN: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "osoba",

scandeval/prompt_templates/reading_comprehension.py CHANGED Viewed

@@ -4,6 +4,7 @@ import typing as t
 from ..data_models import PromptConfig
 from ..languages import (
+    ALBANIAN,
     BOSNIAN,
     BULGARIAN,
     CATALAN,
@@ -41,6 +42,14 @@ if t.TYPE_CHECKING:
     from ..languages import Language
 RC_TEMPLATES: dict["Language", PromptConfig] = {
+    ALBANIAN: PromptConfig(
+        default_prompt_prefix="Më poshtë janë tekste me pyetje dhe përgjigje.",
+        default_prompt_template="Tekst: {text}\nPyetje: {question}\nPërgjigje me "
+        "maksimum 3 fjalë: {label}",
+        default_instruction_prompt="Tekst: {text}\n\nPërgjigjuni pyetjes së mëposhtme "
+        "rreth tekstit të mësipërm me maksimum 3 fjalë.\n\nPyetje: {question}",
+        default_prompt_label_mapping=dict(),
+    ),
     BOSNIAN: PromptConfig(
         default_prompt_prefix="Slijede tekstovi s pitanjima i odgovorima.",
         default_prompt_template="Tekst: {text}\nPitanje: {question}\nOdgovor s "

scandeval/prompt_templates/sentiment_classification.py CHANGED Viewed

@@ -4,6 +4,7 @@ import typing as t
 from ..data_models import PromptConfig
 from ..languages import (
+    ALBANIAN,
     BOSNIAN,
     BULGARIAN,
     CATALAN,
@@ -41,6 +42,16 @@ if t.TYPE_CHECKING:
     from ..languages import Language
 SENT_TEMPLATES: dict["Language", PromptConfig] = {
+    ALBANIAN: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="pozitive", neutral="neutrale", negative="negative"
+        ),
+        default_prompt_prefix="Më poshtë janë dokumentet dhe ndjenjat e tyre, të cilat "
+        "mund të jenë {labels_str}.",
+        default_prompt_template="Dokument: {text}\nNdjenja: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlasifikoni ndjenjën në "
+        "dokument. Përgjigjuni vetëm me {labels_str}, dhe asgjë tjetër.",
+    ),
     BOSNIAN: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="pozitivno", neutral="neutralno", negative="negativno"

scandeval/prompt_templates/simplification.py ADDED Viewed

@@ -0,0 +1,23 @@
+"""Templates for the Simplification task."""
+from ..data_models import PromptConfig
+from ..languages import DUTCH, ENGLISH
+SIMPL_TEMPLATES = {
+    ENGLISH: PromptConfig(
+        default_prompt_prefix="The following are documents with accompanying "
+        "simplifications.",
+        default_prompt_template="Document: {text}\nSimplification: {target_text}",
+        default_instruction_prompt="Document: {text}\n\nWrite a simplification "
+        "of the above document.",
+        default_prompt_label_mapping=dict(),
+    ),
+    DUTCH: PromptConfig(
+        default_prompt_prefix="Hieronder volgen documenten met bijbehorende "
+        "versimpelingen.",
+        default_prompt_template="Document: {text}\nVersimpeling: {target_text}",
+        default_instruction_prompt="Document: {text}\n\nVersimpel het "
+        "bovenstaande document.",
+        default_prompt_label_mapping=dict(),
+    ),
+}

scandeval/prompt_templates/summarization.py CHANGED Viewed

@@ -4,6 +4,7 @@ import typing as t
 from ..data_models import PromptConfig
 from ..languages import (
+    ALBANIAN,
     BOSNIAN,
     CATALAN,
     CZECH,
@@ -37,6 +38,16 @@ if t.TYPE_CHECKING:
 # TODO: Missing Faroese
 SUMM_TEMPLATES: dict["Language", PromptConfig] = {
+    ALBANIAN: PromptConfig(
+        default_prompt_prefix=(
+            "Më poshtë janë dokumente me përmbledhje të bashkëngjitura."
+        ),
+        default_prompt_template=("Dokument: {text}\nPërmbledhje: {target_text}"),
+        default_instruction_prompt=(
+            "Dokument: {text}\n\nShkruani një përmbledhje të dokumentit të mësipërm."
+        ),
+        default_prompt_label_mapping=dict(),
+    ),
     BOSNIAN: PromptConfig(
         default_prompt_prefix="Slijede dokumenti s priloženim sažecima.",
         default_prompt_template="Dokument: {text}\nSažetak: {target_text}",

scandeval/tasks.py CHANGED Viewed

@@ -11,6 +11,7 @@ from .prompt_templates import (
     NER_TEMPLATES,
     RC_TEMPLATES,
     SENT_TEMPLATES,
+    SIMPL_TEMPLATES,
     SUMM_TEMPLATES,
     TOKEN_CLASSIFICATION_TEMPLATES,
 )
@@ -71,6 +72,16 @@ SENT = Task(
     uses_logprobs=True,
 )
+SIMPL = Task(
+    name="simplification",
+    task_group=TaskGroup.TEXT_TO_TEXT,
+    template_dict=SIMPL_TEMPLATES,
+    metrics=[m.meteor_metric, m.sari_metric],
+    default_num_few_shot_examples=3,
+    default_max_generated_tokens=256,
+    default_labels=[],
+    default_allowed_model_types=[ModelType.GENERATIVE],
+)
 SUMM = Task(
     name="summarization",

scandeval/utils.py CHANGED Viewed

@@ -306,14 +306,13 @@ def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
     Returns:
         The result of the coroutine.
     """
-    loop = asyncio.new_event_loop()
     try:
+        loop = asyncio.get_event_loop()
+    except RuntimeError:  # No current event loop is set for this thread
+        loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
-        response = loop.run_until_complete(coroutine)
-        return response
-    finally:
-        loop.close()
-        asyncio.set_event_loop(None)
+    response = loop.run_until_complete(coroutine)
+    return response
 async def add_semaphore_and_catch_exception(

{scandeval-16.9.0.dist-info → scandeval-16.10.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ScandEval
-Version: 16.9.0
+Version: 16.10.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -50,6 +50,7 @@ Requires-Dist: pydantic>=2.6.0
 Requires-Dist: pyinfer>=0.0.3
 Requires-Dist: python-dotenv>=1.0.1
 Requires-Dist: rouge-score>=0.1.2
+Requires-Dist: sacrebleu>=2.5.1
 Requires-Dist: sacremoses>=0.1.1
 Requires-Dist: scikit-learn==1.6.1
 Requires-Dist: sentencepiece>=0.1.96
@@ -62,11 +63,13 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: ray>=2.53.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: timm>=1.0.19; extra == 'all'
 Requires-Dist: vllm[flashinfer]==0.11.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: ray>=2.53.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: timm>=1.0.19; extra == 'generative'
 Requires-Dist: vllm[flashinfer]==0.11.0; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown
@@ -575,6 +578,20 @@ A huge thank you to all the contributors who have helped make this project a suc
         alt="Contributor avatar for mrkowalski"
     />
 </a>
+<a href="https://github.com/simonevanbruggen">
+    <img
+        src="https://avatars.githubusercontent.com/u/24842609"
+        width=50
+        alt="Contributor avatar for simonevanbruggen"
+    />
+</a>
+<a href="https://github.com/tvosch">
+    <img
+        src="https://avatars.githubusercontent.com/u/110661769"
+        width=50
+        alt="Contributor avatar for tvosch"
+    />
+</a>
 ### Contribute to EuroEval

{scandeval-16.9.0.dist-info → scandeval-16.10.1.dist-info}/RECORD RENAMED Viewed

@@ -3,40 +3,41 @@ scandeval/benchmark_config_factory.py,sha256=2stmcqKwx0G9pAiA0atunqDchJ9eoezp1Wh
 scandeval/benchmarker.py,sha256=ARH1ATYAunKNRgIQTDvGqMN_M-ygG0SIQw-hfTOuC6U,53556
 scandeval/caching_utils.py,sha256=lLUbkpDdJZy4xodIpwIz5d-WNKGuszbr_d9dyiJ5kZc,2591
 scandeval/callbacks.py,sha256=l8f6Zr8EoHfVFsI1ZnMUK0Y8uZB00Nvaz_I6XDn6avE,2515
-scandeval/cli.py,sha256=QRpylEtrJ34WXrkrWBL8WPmhjvU_sjh9Z_czNuQt66w,9411
-scandeval/constants.py,sha256=1Ew9yBPNu2blYb3v4HD5V_RGZV_MJ9PXNiakDrwMiGs,3509
+scandeval/cli.py,sha256=zvPGomSdrcjxc4uhmh8SkB4s2d7U9JYhxBJ34vznqUI,9411
+scandeval/constants.py,sha256=wF7fQwaX8yZIypq_eh5RcaQFEhABR7dJxQaAX82b4P8,3766
 scandeval/data_loading.py,sha256=8ryYEmj6di1f9QefGfNajxObQ9iapIGuAsL8m9KzDyI,7050
-scandeval/data_models.py,sha256=FKJudSbSGfc6rRetk0hHrIQxWKlYxz6l5Xf8Tk5zcFU,30228
+scandeval/data_models.py,sha256=vRGKrYr1YFBcH4ngOHrESicbTaIcz-joKz58JN5YMFE,30548
 scandeval/enums.py,sha256=SeFek-Lre2Q5sxbP5svqjDZFZR2vlJhg9dkRH4JvU1g,3436
 scandeval/exceptions.py,sha256=4-N2OIo5PJ2aciLjagNAVhdHPxpq2QxywbBqJ8lkKj0,5780
 scandeval/finetuning.py,sha256=dTjchPHLFRD65ZrEmtj5TfMTPZ6PODn77t372fgTNwE,11983
 scandeval/generation.py,sha256=ccE-S0jxkM99XziIdeaBbk8yRGv4YBkzZkoabhFCSKA,13382
 scandeval/generation_utils.py,sha256=A6YCiiMrMEUHq5BcVEjsouIKMPGt0sCfPzsJY1GVyk0,20092
 scandeval/languages.py,sha256=gUSosFbvf1eEQHjVsKhXdJ4jiGXC-9lMkOL8AsBG33Q,37295
-scandeval/logging_utils.py,sha256=l7eafHBZrx66AGaxT3pngwXYXSlVbew7Ph-pg9zPSpk,9478
+scandeval/logging_utils.py,sha256=Pd6DyHTPHCUsjtriomJboiTB35UdXvzxwnNpGTuec-g,9522
 scandeval/model_cache.py,sha256=sjMYW0klnHt2yAFLavDTsp_InxPeSOuVEFo-Rh_31UM,10219
 scandeval/model_config.py,sha256=fxHfgpw-9vj3hwke28DguVGvG9TU06nkTXT0V6KAMpQ,2761
 scandeval/model_loading.py,sha256=bE51L4-AaVgo9h10UsKH_47CB4tOJGU988HxotQ5sYE,2342
 scandeval/scores.py,sha256=9a1XtppFbp8GJFc9JdThGxqBY0YUE7-92oyrlxScjNk,3281
 scandeval/speed_benchmark.py,sha256=VUOvauc9tuAegThNT2g1a-Z1l7DEmKq57dHI4t16o5A,4068
-scandeval/tasks.py,sha256=XaEI1IKpHU66DII-6D_8FishBur8kZ7Hx4aojqlmf48,5642
+scandeval/tasks.py,sha256=mgE6Vx_1WD9-aY-yeBxc_09Uyz-tqk69xISMWVYcrsY,5980
 scandeval/tokenisation_utils.py,sha256=Sa8V91J4NDFBF-qbConPsQvUkW_02cJp0gySz_Q3NDo,21191
 scandeval/types.py,sha256=-VNeeDEvlNwfemszpvuGb3Dr9Gu3Eqc6XRmR11HLRi4,3293
-scandeval/utils.py,sha256=FkCWe3Olj1Sf5EpDstoJdP7dWKY9Tww4xyrNIs7FDiM,18360
+scandeval/utils.py,sha256=BIAP9TWmY_xv6tuCUgmnYifoeodxlz8N2Q0We3frgLU,18389
 scandeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 scandeval/benchmark_modules/base.py,sha256=5YAsCMILKTRXFx_ylGQ7iS5AFKN25iFdkBjj8KzzElw,11445
 scandeval/benchmark_modules/fresh.py,sha256=sG5ae4p1J-GGmVNcVBIxY1xZIAlUwq_pu-9c4uAYU3Y,10734
 scandeval/benchmark_modules/hf.py,sha256=f89E7XoMqsBHhYnMYBgy7ZuXDsAQ7VaIqMfFrHyjg8g,47363
-scandeval/benchmark_modules/litellm.py,sha256=oHSOfugP_SO9k59UvFUPvbcANzEpfNL-hLD_PzOIkmY,71600
-scandeval/benchmark_modules/vllm.py,sha256=1A_ouFN8svoje6RiETwAl_M5TJnrciSb-oGpTbGyEgg,52450
-scandeval/dataset_configs/__init__.py,sha256=LT-6JXnQVgI9CekcoHLtumYMJrgaen9mQTUQy1Y-4CY,3185
+scandeval/benchmark_modules/litellm.py,sha256=TH35CQhoVinlmfHnAW-XJE21o96YfiIv993m0ASS80E,71590
+scandeval/benchmark_modules/vllm.py,sha256=pFCBuIp2m2KIlVMlqc7sGp1twiENvRHx3ppVs0bFvFo,57319
+scandeval/dataset_configs/__init__.py,sha256=GFI_W9GKd3OSDdhhJzHc8mwoP9b32IHIIyvPBI-hK6k,3223
+scandeval/dataset_configs/albanian.py,sha256=D__dli7JO3yeHzzdJ3FFyUGw-z20f1yI6QLnws-WB8I,1473
 scandeval/dataset_configs/bosnian.py,sha256=golIWqwW1pFwSkuBM1v0yhHDblB2FoJgK24aO7kKm7M,877
 scandeval/dataset_configs/bulgarian.py,sha256=OVoDPTRdU-lVq-xUka7-Ct20h2jbs8HV43KBxRQenIE,1284
 scandeval/dataset_configs/catalan.py,sha256=SXwRJjIcMMN7rVuhFRZSnCGDoMfabW5HFoZOkq0Jpg0,1427
 scandeval/dataset_configs/croatian.py,sha256=U5oBTjttpWTWonTEzZAf-G3nvQICRQmw6Kla-HWn_5k,1260
 scandeval/dataset_configs/czech.py,sha256=ghv2yNw839G-utll8PQRSjyKYbM5gfoQhFKy664GTCI,1562
 scandeval/dataset_configs/danish.py,sha256=LEKs04vK2KnV0CYheT7FeS-g3iHBvf2bQxyl0D_LbTg,3293
-scandeval/dataset_configs/dutch.py,sha256=HB1O7IxQUyOxLg7g0tqcCci1MHaKtZJiFlRJZo2jPr4,3107
+scandeval/dataset_configs/dutch.py,sha256=OZJmaqGguXY5D9hz0zFNrwGQPRXgxZonctSc8Gsy9sY,3550
 scandeval/dataset_configs/english.py,sha256=nc9nGwxf1tHVMUhQeND61yJbpTO4rJaAusPZlstqtq0,2817
 scandeval/dataset_configs/estonian.py,sha256=bWiKA_dJ7WUE8Z_1YZnSewhi4ZdCQBGJZ7pQxkCwMcU,2757
 scandeval/dataset_configs/faroese.py,sha256=13qYwXonDPWG9Av5MY_NBNTRDglPVKz5_mbz7ZCJ_mo,1247
@@ -49,7 +50,7 @@ scandeval/dataset_configs/icelandic.py,sha256=G2Ibe6oF1NknkQmHqLpoHlysW_8f-0G53D
 scandeval/dataset_configs/italian.py,sha256=qhjAQChnQanzs7EyN1DSAJ4OOU41HAlWqWntQOtbWCw,2761
 scandeval/dataset_configs/latvian.py,sha256=wbwIDieq5Lplng5Jzx9LEqq4d8b5LnNOyCUmT64b4bA,1928
 scandeval/dataset_configs/lithuanian.py,sha256=RPqKwsysO1TYeQuEEsbhzGcSFHDX94lk1hgl1CfQaMU,1724
-scandeval/dataset_configs/norwegian.py,sha256=skKKs4V4-zbd-1lpVUaxKXAjTMpBM6SAU5HZ8kcQ2mI,5454
+scandeval/dataset_configs/norwegian.py,sha256=k70T78rTY3pmmVRxG3i_J1j7td_boFHJetkyITskIL0,5487
 scandeval/dataset_configs/polish.py,sha256=nN_NT8cUK2iv1L_zO_aCYOk2R7ACSDZgvI7e0hIaFAM,2074
 scandeval/dataset_configs/portuguese.py,sha256=m9lEeVtI_yNvIdTIEOn3HFK_ilY2tn3-acC981hjZFM,2401
 scandeval/dataset_configs/romanian.py,sha256=AcDp0mqOHmmv3EodovGEcBmarxjLYsXOPr_X4IQoNTw,1472
@@ -61,18 +62,19 @@ scandeval/dataset_configs/swedish.py,sha256=kpEK29swY7iyUSzUvD9hNf2qwb3d7bHrFwbo
 scandeval/dataset_configs/ukrainian.py,sha256=spbCmCOU27jOfz6FZxqCIfVmDN5l8H-7VCl-k-8eAIo,1527
 scandeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,199
 scandeval/metrics/base.py,sha256=dUBby-ZzettMjdcjek6rw0JTZMuScX4cQ2Rd6untKHY,2525
-scandeval/metrics/huggingface.py,sha256=w0iTFIavi4Q4IGJCSFpcCX1ce28e8D6S1WjllNggi18,6735
+scandeval/metrics/huggingface.py,sha256=W1hPuIGBALOogGN2yTGTJUsylsMII3A66fEe9nB8N2k,9493
 scandeval/metrics/llm_as_a_judge.py,sha256=cZ7ZCuB3633T87MjBtAekrBQ_vYaNv1uTcqnI32gNpQ,9837
 scandeval/metrics/pipeline.py,sha256=GTIqaFkn-nTLU4xBi8-zP1J4Ytv3qeFVuRB4OcuwkOw,10876
 scandeval/metrics/speed.py,sha256=G5hEQcrtqxF070ZZwLDh61iZnq2CSW2o6ZM7zR4lOTY,1298
-scandeval/prompt_templates/__init__.py,sha256=HN6Qspqm10ik6RKoPBJsvM-Nng9sywQojZbtbCqj4Z8,475
+scandeval/prompt_templates/__init__.py,sha256=p3CUcSaJiiUm6EQyhceDUjotH7GdyHolMznAn2f44as,519
 scandeval/prompt_templates/classification.py,sha256=QuZh6hTMaqMYTsoruAhwjVP9381zzlQmDIwSeyGnav0,10121
-scandeval/prompt_templates/linguistic_acceptability.py,sha256=bOcmGYa8OgyHRsd5oTS6hPqUsaN_YqQ4hOfb3qo1vhg,14984
-scandeval/prompt_templates/multiple_choice.py,sha256=pbTUcU-n0Zu8NgX2tO-ArdlTJktT_k3onzdKbyFzCdk,12536
-scandeval/prompt_templates/named_entity_recognition.py,sha256=IEGMedQ8VJw1L_lU7JNGp7G9qlmgI3d_8xRB-R9YKPE,29264
-scandeval/prompt_templates/reading_comprehension.py,sha256=mcf8SzDuktmAaqV7gQbZU91cn90fzyFSg32TBkqrWxk,15844
-scandeval/prompt_templates/sentiment_classification.py,sha256=occxjsJuJ0SdqZxpWlsqN9VPE75wTCG8Ii83Pay1ju4,16860
-scandeval/prompt_templates/summarization.py,sha256=fmx3xzSho2LAz1xZe2wQp9DgSWdes-zUtbgEvC6pK5A,10331
+scandeval/prompt_templates/linguistic_acceptability.py,sha256=V31apMLPNhTeDJO6va_04SjuDSXMOJEFurIeSldDi7o,15474
+scandeval/prompt_templates/multiple_choice.py,sha256=pgz-Xb-vUthwJyjla56CxeeXPDkgtZ7Mi9z1J-PjepY,12977
+scandeval/prompt_templates/named_entity_recognition.py,sha256=U9KYr4eIbiMdHECc35CjkNUDoiRd6Jd8w0v35kRWGL4,30197
+scandeval/prompt_templates/reading_comprehension.py,sha256=4C16Mf1MGtEZG9x8PxrJmK1Cxfz9kzjrJLNS725_5oI,16319
+scandeval/prompt_templates/sentiment_classification.py,sha256=mLrhWh0rQTjiowzprv8S5CfLO_g7DvnSjWiw0CsaXpg,17401
+scandeval/prompt_templates/simplification.py,sha256=DF50F1JSxy00ZOO3OJJZOtoTlkGjE35krjjbDaW7RUk,900
+scandeval/prompt_templates/summarization.py,sha256=LKiz5fd6A0J5NyoLBeyrZ4ir1skDB2pytKCEeF4zbmw,10770
 scandeval/prompt_templates/token_classification.py,sha256=8Uw34mN2xQ_5es-nz7vCK-GgDg_oE-zsAzPJPzAxFrQ,15531
 scandeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 scandeval/task_group_utils/multiple_choice_classification.py,sha256=PWUXeGn-9RsXxdVRYHJASyBVQ8L5Jla981eot0GLooY,7316
@@ -80,8 +82,8 @@ scandeval/task_group_utils/question_answering.py,sha256=tuMwr-RnvJap5jkTrluxC1tf
 scandeval/task_group_utils/sequence_classification.py,sha256=VhiggNrB7Gi2x-99MPL0RR2VZRv-wpJerXulgQH6wcU,16556
 scandeval/task_group_utils/text_to_text.py,sha256=p6zzjob70qQUpfUOs0LToSzavE1ERqRAHu_727Jb2mM,5476
 scandeval/task_group_utils/token_classification.py,sha256=8dF32KQAYAFnnn7DPHX-yvJmRrMBmT2CyFREacyTwvQ,17321
-scandeval-16.9.0.dist-info/METADATA,sha256=9zkQ0iVpFbPt8IWSc7C6G3X5_fq6_SL3y3q5IfPAW-U,22858
-scandeval-16.9.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-scandeval-16.9.0.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
-scandeval-16.9.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
-scandeval-16.9.0.dist-info/RECORD,,
+scandeval-16.10.1.dist-info/METADATA,sha256=IYJza42KMRZdoc2-8z9NHaniGAH4K7hT1WHCyFT-Wow,23435
+scandeval-16.10.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+scandeval-16.10.1.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
+scandeval-16.10.1.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+scandeval-16.10.1.dist-info/RECORD,,

{scandeval-16.9.0.dist-info → scandeval-16.10.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{scandeval-16.9.0.dist-info → scandeval-16.10.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{scandeval-16.9.0.dist-info → scandeval-16.10.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

ScandEval 16.9.0__py3-none-any.whl → 16.10.1__py3-none-any.whl

ScandEval 16.9.0__py3-none-any.whl → 16.10.1__py3-none-any.whl