ScandEval 16.12.0-py3-none-any.whl → 16.13.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scandeval/async_utils.py +46 -0
- scandeval/benchmark_config_factory.py +26 -2
- scandeval/benchmark_modules/fresh.py +2 -1
- scandeval/benchmark_modules/hf.py +50 -12
- scandeval/benchmark_modules/litellm.py +25 -15
- scandeval/benchmark_modules/vllm.py +3 -3
- scandeval/benchmarker.py +15 -33
- scandeval/cli.py +2 -4
- scandeval/constants.py +5 -0
- scandeval/custom_dataset_configs.py +152 -0
- scandeval/data_loading.py +87 -31
- scandeval/data_models.py +396 -225
- scandeval/dataset_configs/__init__.py +51 -25
- scandeval/dataset_configs/albanian.py +1 -1
- scandeval/dataset_configs/belarusian.py +47 -0
- scandeval/dataset_configs/bulgarian.py +1 -1
- scandeval/dataset_configs/catalan.py +1 -1
- scandeval/dataset_configs/croatian.py +1 -1
- scandeval/dataset_configs/danish.py +3 -2
- scandeval/dataset_configs/dutch.py +7 -6
- scandeval/dataset_configs/english.py +4 -3
- scandeval/dataset_configs/estonian.py +8 -7
- scandeval/dataset_configs/faroese.py +1 -1
- scandeval/dataset_configs/finnish.py +5 -4
- scandeval/dataset_configs/french.py +6 -5
- scandeval/dataset_configs/german.py +4 -3
- scandeval/dataset_configs/greek.py +1 -1
- scandeval/dataset_configs/hungarian.py +1 -1
- scandeval/dataset_configs/icelandic.py +4 -3
- scandeval/dataset_configs/italian.py +4 -3
- scandeval/dataset_configs/latvian.py +2 -2
- scandeval/dataset_configs/lithuanian.py +1 -1
- scandeval/dataset_configs/norwegian.py +6 -5
- scandeval/dataset_configs/polish.py +4 -3
- scandeval/dataset_configs/portuguese.py +5 -4
- scandeval/dataset_configs/romanian.py +2 -2
- scandeval/dataset_configs/serbian.py +1 -1
- scandeval/dataset_configs/slovene.py +1 -1
- scandeval/dataset_configs/spanish.py +4 -3
- scandeval/dataset_configs/swedish.py +4 -3
- scandeval/dataset_configs/ukrainian.py +1 -1
- scandeval/generation_utils.py +6 -6
- scandeval/metrics/llm_as_a_judge.py +1 -1
- scandeval/metrics/pipeline.py +1 -1
- scandeval/model_cache.py +34 -4
- scandeval/prompt_templates/linguistic_acceptability.py +9 -0
- scandeval/prompt_templates/multiple_choice.py +9 -0
- scandeval/prompt_templates/named_entity_recognition.py +21 -0
- scandeval/prompt_templates/reading_comprehension.py +10 -0
- scandeval/prompt_templates/sentiment_classification.py +11 -0
- scandeval/string_utils.py +157 -0
- scandeval/task_group_utils/sequence_classification.py +2 -5
- scandeval/task_group_utils/token_classification.py +2 -4
- scandeval/utils.py +6 -323
- scandeval-16.13.0.dist-info/METADATA +334 -0
- scandeval-16.13.0.dist-info/RECORD +94 -0
- scandeval-16.12.0.dist-info/METADATA +0 -667
- scandeval-16.12.0.dist-info/RECORD +0 -90
- {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
- {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
- {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/dataset_configs/swedish.py
CHANGED

@@ -68,9 +68,10 @@ VALEU_SV_CONFIG = DatasetConfig(
     source="EuroEval/european-values-sv",
     task=EUROPEAN_VALUES,
     languages=[SWEDISH],
-    …
+    train_split=None,
+    val_split=None,
     bootstrap_samples=False,
-    …
+    instruction_prompt="{text}",
 )

@@ -127,7 +128,7 @@ WINOGRANDE_SV_CONFIG = DatasetConfig(
     source="EuroEval/winogrande-sv",
     task=COMMON_SENSE,
     languages=[SWEDISH],
-    …
+    labels=["a", "b"],
     unofficial=True,
 )
scandeval/generation_utils.py
CHANGED

@@ -13,8 +13,8 @@ from datasets import Dataset
 from .enums import GenerativeType, TaskGroup
 from .exceptions import InvalidBenchmark, InvalidModel
 from .logging_utils import log_once
+from .string_utils import extract_multiple_choice_labels
 from .tokenisation_utils import apply_chat_template
-from .utils import extract_multiple_choice_labels

 if t.TYPE_CHECKING:
     from datasets import DatasetDict

@@ -102,7 +102,7 @@ def extract_few_shot_examples(
         )
         label = next(labels)
         possible_examples = shuffled_train.filter(
-            lambda x: x["label"].lower() == label.lower()
+            lambda x: str(x["label"]).lower() == label.lower()
         )
         assert isinstance(possible_examples, Dataset), (
             f"Expected `possible_examples` to be a Dataset, but got "

@@ -142,7 +142,7 @@ def extract_few_shot_examples(
     while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
         label = next(labels)
         possible_examples = shuffled_train.filter(
-            lambda x: label in [tag.lower() for tag in x["labels"]]
+            lambda x: label in [str(tag).lower() for tag in x["labels"]]
         )
         assert isinstance(possible_examples, Dataset), (
             f"Expected `possible_examples` to be a Dataset, but got "

@@ -274,7 +274,7 @@ def apply_prompt(
     few_shot_sections = [
         create_prompt(
             text=example["text"].replace("\n", " ").strip(),
-            label=example["label"].replace("\n", " ").strip(),
+            label=str(example["label"]).replace("\n", " ").strip(),
             labels_str=labels_str,
         )
         for example in few_shot_examples

@@ -292,7 +292,7 @@ def apply_prompt(
     few_shot_sections = [
         create_prompt(
             text=example["text"].replace("\n", " ").strip(),
-            label=example["label"].replace("\n", " ").strip(),
+            label=str(example["label"]).replace("\n", " ").strip(),
             labels_str=dataset_config.get_labels_str(
                 labels=extract_multiple_choice_labels(
                     prompt=example["text"],

@@ -337,7 +337,7 @@ def apply_prompt(
             prompt_label: list() for prompt_label in prompt_labels
         }
         for token, label in zip(example["tokens"], example["labels"]):
-            label = label.lower()
+            label = str(label).lower()
            if label == "o":
                 continue
             prompt_label = dataset_config.prompt_label_mapping[label]
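Note: the `str(...)` casts added throughout `generation_utils.py` guard against label columns that are not strings (e.g. integer class labels in Hugging Face datasets), where calling `.lower()` or `.replace()` directly would raise. A minimal, self-contained illustration (not ScandEval code):

```python
# Integer labels have no .lower() method, so the bare call fails:
label = 1
try:
    label.lower()
except AttributeError as error:
    print(error)  # 'int' object has no attribute 'lower'

# Casting to str first handles string and non-string labels uniformly:
print(str(label).lower())       # "1"
print(str("Positive").lower())  # "positive"
```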
scandeval/metrics/llm_as_a_judge.py
CHANGED

@@ -9,7 +9,7 @@ from pydantic import BaseModel, Field, ValidationError

 from ..exceptions import InvalidBenchmark
 from ..logging_utils import log
-from ..utils import extract_json_dict_from_string
+from ..string_utils import extract_json_dict_from_string
 from .base import Metric

 if t.TYPE_CHECKING:
scandeval/metrics/pipeline.py
CHANGED

@@ -12,7 +12,7 @@ from scipy.special import expit as sigmoid

 from ..exceptions import InvalidBenchmark
 from ..logging_utils import log, no_terminal_output
-from ..utils import unscramble
+from ..string_utils import unscramble
 from .base import Metric

 if t.TYPE_CHECKING:
scandeval/model_cache.py
CHANGED

@@ -5,9 +5,9 @@ import hashlib
 import json
 import logging
 import sys
-import typing as t
 from collections import defaultdict
 from dataclasses import asdict
+from pathlib import Path

 from datasets import Dataset

@@ -15,9 +15,6 @@ from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import GenerativeModelOutput, SingleGenerativeModelOutput
 from .logging_utils import get_pbar, log, log_once

-if t.TYPE_CHECKING:
-    from pathlib import Path
-

 class ModelCache:
     """A cache for model outputs.

@@ -295,3 +292,36 @@ def load_cached_model_outputs(

     cached_scores = [model_output.scores or [] for model_output in cached_model_outputs]
     return GenerativeModelOutput(sequences=cached_sequences, scores=cached_scores)
+
+
+def create_model_cache_dir(cache_dir: str, model_id: str) -> str:
+    """Create cache directory for a model.
+
+    Args:
+        cache_dir:
+            The cache directory.
+        model_id:
+            The model ID.
+
+    Returns:
+        The path to the cache directory.
+    """
+    # If the model ID is a path, we just use that as the cache dir
+    if Path(model_id).is_dir():
+        log_once(
+            f"Since the model {model_id!r} is a local model, we will use the model "
+            "directory directly as the model cache directory.",
+            level=logging.DEBUG,
+        )
+        return model_id
+
+    # Otherwise, we create a cache dir based on the model ID
+    model_cache_dir = Path(
+        cache_dir, "model_cache", model_id.replace("/", "--")
+    ).as_posix()
+    log_once(
+        f"Using the model cache directory {model_cache_dir!r} for the model "
+        f"{model_id!r}.",
+        level=logging.DEBUG,
+    )
+    return model_cache_dir
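Note: the new `create_model_cache_dir` helper returns local model directories unchanged and maps hub-style model IDs onto filesystem-safe subdirectories by replacing `/` with `--`. A sketch of the resulting path mapping using only `pathlib` (`expected_cache_dir` and the model ID below are illustrative names, not ScandEval API):

```python
from pathlib import Path

def expected_cache_dir(cache_dir: str, model_id: str) -> str:
    # Mirrors the non-local branch of create_model_cache_dir:
    # <cache_dir>/model_cache/<model ID with "/" replaced by "--">
    return Path(cache_dir, "model_cache", model_id.replace("/", "--")).as_posix()

print(expected_cache_dir(".scandeval_cache", "mistralai/Mistral-7B-v0.1"))
# .scandeval_cache/model_cache/mistralai--Mistral-7B-v0.1
```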
scandeval/prompt_templates/linguistic_acceptability.py
CHANGED

@@ -5,6 +5,7 @@ import typing as t
 from ..data_models import PromptConfig
 from ..languages import (
     ALBANIAN,
+    BELARUSIAN,
     BULGARIAN,
     CATALAN,
     CROATIAN,

@@ -49,6 +50,14 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
         default_instruction_prompt="Fjali: {text}\n\nPërcaktoni nëse fjalia është "
         "gramatikisht e saktë apo jo. Përgjigjuni me {labels_str}, dhe asgjë tjetër.",
     ),
+    BELARUSIAN: PromptConfig(
+        default_prompt_label_mapping=dict(correct="так", incorrect="не"),
+        default_prompt_prefix="Ніжэй прыведзены сказы і ці з'яўляюцца яны "
+        "граматычна правільнымі.",
+        default_prompt_template="Сказ: {text}\nГраматычна правільны: {label}",
+        default_instruction_prompt="Сказ: {text}\n\nВызначце, ці сказ граматычна "
+        "правільны ці не. Адкажыце толькі {labels_str}, і нічога іншага.",
+    ),
     BULGARIAN: PromptConfig(
         default_prompt_label_mapping=dict(correct="да", incorrect="не"),
         default_prompt_prefix="Следват изречения и дали са граматически правилни.",
scandeval/prompt_templates/multiple_choice.py
CHANGED

@@ -5,6 +5,7 @@ import typing as t
 from ..data_models import PromptConfig
 from ..languages import (
     ALBANIAN,
+    BELARUSIAN,
     BULGARIAN,
     CATALAN,
     CROATIAN,

@@ -49,6 +50,14 @@ MULTIPLE_CHOICE_TEMPLATES: dict["Language", PromptConfig] = {
         "mësipërme duke u përgjigjur me {labels_str}, dhe asgjë tjetër.",
         default_prompt_label_mapping="auto",
     ),
+    BELARUSIAN: PromptConfig(
+        default_prompt_prefix="Ніжэй прыведзены пытанні з некалькімі варыянтамі "
+        "адказу (з адказамі).",
+        default_prompt_template="Пытанне: {text}\nАдказ: {label}",
+        default_instruction_prompt="Пытанне: {text}\n\nАдкажыце на пытанне вышэй, "
+        "адказаўшы {labels_str}, і нічога іншага.",
+        default_prompt_label_mapping="auto",
+    ),
     BULGARIAN: PromptConfig(
         default_prompt_prefix="Следват въпроси с множествен избор (с отговори).",
         default_prompt_template="Въпрос: {text}\nОтговор: {label}",
scandeval/prompt_templates/named_entity_recognition.py
CHANGED

@@ -5,6 +5,7 @@ import typing as t
 from ..data_models import PromptConfig
 from ..languages import (
     ALBANIAN,
+    BELARUSIAN,
     BOSNIAN,
     BULGARIAN,
     CATALAN,

@@ -62,6 +63,26 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
         "{labels_str}. Vlerat duhet të jenë lista të entiteteve të emërtuara të atij "
         "lloji, saktësisht ashtu siç shfaqen në fjali.",
     ),
+    BELARUSIAN: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "асоба",
+            "i-per": "асоба",
+            "b-loc": "месца",
+            "i-loc": "месца",
+            "b-org": "арганізацыя",
+            "i-org": "арганізацыя",
+            "b-misc": "рознае",
+            "i-misc": "рознае",
+        },
+        default_prompt_prefix="Ніжэй прыведзены сказы і JSON-слоўнікі з іменаванымі "
+        "сутнасцямі, якія прысутнічаюць у дадзеным сказе.",
+        default_prompt_template="Сказ: {text}\nІменаваныя сутнасці: {label}",
+        default_instruction_prompt="Сказ: {text}\n\n"
+        "Ідэнтыфікуйце іменаваныя сутнасці ў сказе. Вы павінны вывесці гэта як "
+        "JSON-слоўнік з ключамі {labels_str}. Значэнні павінны быць спісамі "
+        "іменаваных сутнасцей гэтага тыпу, дакладна такімі, як яны з'яўляюцца ў "
+        "сказе.",
+    ),
     BOSNIAN: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "osoba",
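Note: as in the other NER templates, the Belarusian `default_prompt_label_mapping` collapses the `b-*` and `i-*` BIO tags of each entity type onto the same surface word, and `o` (outside) tags are skipped when prompts are built (see the `if label == "o": continue` branch in `generation_utils.py` above). A quick self-contained check of that collapse:

```python
# Mapping copied from the Belarusian NER entry above.
mapping = {
    "b-per": "асоба", "i-per": "асоба",
    "b-loc": "месца", "i-loc": "месца",
    "b-org": "арганізацыя", "i-org": "арганізацыя",
    "b-misc": "рознае", "i-misc": "рознае",
}
tags = ["b-per", "i-per", "o", "b-loc"]
surface = [mapping[tag] for tag in tags if tag != "o"]
print(surface)  # ['асоба', 'асоба', 'месца']
```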
scandeval/prompt_templates/reading_comprehension.py
CHANGED

@@ -5,6 +5,7 @@ import typing as t
 from ..data_models import PromptConfig
 from ..languages import (
     ALBANIAN,
+    BELARUSIAN,
     BOSNIAN,
     BULGARIAN,
     CATALAN,

@@ -50,6 +51,15 @@ RC_TEMPLATES: dict["Language", PromptConfig] = {
         "rreth tekstit të mësipërm me maksimum 3 fjalë.\n\nPyetje: {question}",
         default_prompt_label_mapping=dict(),
     ),
+    BELARUSIAN: PromptConfig(
+        default_prompt_prefix="Ніжэй прыведзены тэксты з адпаведнымі пытаннямі і "
+        "адказамі.",
+        default_prompt_template="Тэкст: {text}\nПытанне: {question}\nАдказ "
+        "максімум 3 словамі: {label}",
+        default_instruction_prompt="Тэкст: {text}\n\nАдкажыце на наступнае пытанне "
+        "пра тэкст вышэй максімум 3 словамі.\n\nПытанне: {question}",
+        default_prompt_label_mapping=dict(),
+    ),
     BOSNIAN: PromptConfig(
         default_prompt_prefix="Slijede tekstovi s pitanjima i odgovorima.",
         default_prompt_template="Tekst: {text}\nPitanje: {question}\nOdgovor s "
scandeval/prompt_templates/sentiment_classification.py
CHANGED

@@ -5,6 +5,7 @@ import typing as t
 from ..data_models import PromptConfig
 from ..languages import (
     ALBANIAN,
+    BELARUSIAN,
     BOSNIAN,
     BULGARIAN,
     CATALAN,

@@ -52,6 +53,16 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
         default_instruction_prompt="Dokument: {text}\n\nKlasifikoni ndjenjën në "
         "dokument. Përgjigjuni vetëm me {labels_str}, dhe asgjë tjetër.",
     ),
+    BELARUSIAN: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="станоўчы", neutral="нейтральны", negative="адмоўны"
+        ),
+        default_prompt_prefix="Ніжэй прыведзены дакументы і іх сентымент, які можа "
+        "быць {labels_str}.",
+        default_prompt_template="Дакумент: {text}\nСентымент: {label}",
+        default_instruction_prompt="Дакумент: {text}\n\nКласіфікуйце сентымент у "
+        "дакуменце. Адкажыце толькі {labels_str}, і нічога іншага.",
+    ),
     BOSNIAN: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="pozitivno", neutral="neutralno", negative="negativno"
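Note: each `PromptConfig` above is a plain template bundle: few-shot prompts are built by filling `default_prompt_template` per example under `default_prompt_prefix`, while `default_instruction_prompt` serves chat-style models. A rough rendering sketch for the new Belarusian sentiment template (the real assembly lives in `generation_utils.apply_prompt` and differs in detail; the `labels_str` rendering here is assumed):

```python
prefix = "Ніжэй прыведзены дакументы і іх сентымент, які можа быць {labels_str}."
template = "Дакумент: {text}\nСентымент: {label}"
labels_str = "'станоўчы', 'нейтральны' ці 'адмоўны'"  # assumed label-list rendering

few_shot = template.format(text="Цудоўны фільм!", label="станоўчы")
target = template.format(text="Нудная кніга.", label="").rstrip()

print("\n\n".join([prefix.format(labels_str=labels_str), few_shot, target]))
```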
scandeval/string_utils.py
ADDED

@@ -0,0 +1,157 @@
+"""Utility functions related to string manipulation or structuring."""
+
+import collections.abc as c
+import logging
+import re
+import typing as t
+
+import demjson3
+import numpy as np
+
+from .exceptions import InvalidBenchmark, InvalidModel
+from .logging_utils import log
+
+if t.TYPE_CHECKING:
+    from .data_models import ModelIdComponents
+
+
+def scramble(text: str) -> str:
+    """Scramble a string in a bijective manner.
+
+    Args:
+        text:
+            The string to scramble.
+
+    Returns:
+        The scrambled string.
+    """
+    rng = np.random.default_rng(seed=4242)
+    permutation = rng.permutation(x=len(text))
+    scrambled = "".join(text[i] for i in permutation)
+    return scrambled
+
+
+def unscramble(scrambled_text: str) -> str:
+    """Unscramble a string in a bijective manner.
+
+    Args:
+        scrambled_text:
+            The scrambled string to unscramble.
+
+    Returns:
+        The unscrambled string.
+    """
+    rng = np.random.default_rng(seed=4242)
+    permutation = rng.permutation(x=len(scrambled_text))
+    inverse_permutation = np.argsort(permutation)
+    unscrambled = "".join(scrambled_text[i] for i in inverse_permutation)
+    return unscrambled
+
+
+def extract_json_dict_from_string(s: str) -> dict | None:
+    """Extract a JSON dictionary from a string.
+
+    Args:
+        s:
+            The string to extract the JSON dictionary from.
+
+    Returns:
+        The extracted JSON dictionary, or None if no JSON dictionary could be found.
+    """
+    json_regex = r"\{[^{}]*?\}"
+    if (json_match := re.search(pattern=json_regex, string=s, flags=re.DOTALL)) is None:
+        log(
+            "The model output does not contain any JSON dictionary, so cannot parse "
+            f"it. Skipping. Here is the output: {s!r}",
+            level=logging.DEBUG,
+        )
+        return None
+    json_string = json_match.group()
+    try:
+        json_output = demjson3.decode(txt=json_string)
+    except demjson3.JSONDecodeError:
+        log(
+            "The model output is not valid JSON, so cannot parse it. Skipping. "
+            f"Here is the output: {json_string!r}",
+            level=logging.DEBUG,
+        )
+        return None
+    if not isinstance(json_output, dict):
+        log(
+            "The model output is not a JSON dictionary, so cannot parse "
+            f"it. Skipping. Here is the output: {json_string!r}",
+            level=logging.DEBUG,
+        )
+        return None
+    elif not all(isinstance(key, str) for key in json_output.keys()):
+        log(
+            "The model output is not a JSON dictionary with string keys, "
+            "so cannot parse it. Skipping. Here is the output: "
+            f"{json_string!r}",
+            level=logging.DEBUG,
+        )
+        return None
+    return json_output
+
+
+def extract_multiple_choice_labels(
+    prompt: str, candidate_labels: c.Sequence[str]
+) -> c.Sequence[str]:
+    """Extract multiple choice labels from a prompt.
+
+    Args:
+        prompt:
+            The prompt to extract the labels from.
+        candidate_labels:
+            The candidate labels to look for in the prompt.
+
+    Returns:
+        The extracted labels.
+    """
+    sample_candidate_labels: list[str] = list()
+    for candidate_label in candidate_labels:
+        candidate_label_match = re.search(
+            pattern=rf"\b{candidate_label}\. ", string=prompt, flags=re.IGNORECASE
+        )
+        if candidate_label_match is not None:
+            sample_candidate_labels.append(candidate_label)
+    if not sample_candidate_labels:
+        raise InvalidBenchmark(
+            "Could not extract any candidate labels from the prompt. Please ensure "
+            "that the candidate labels are present in the prompt, each followed by a "
+            "dot and a space (e.g., 'a. '). The candidate labels are: "
+            f"{', '.join(candidate_labels)}. Here is the prompt: {prompt!r}"
+        )
+    return sample_candidate_labels
+
+
+def split_model_id(model_id: str) -> "ModelIdComponents":
+    """Split a model ID into its components.
+
+    Args:
+        model_id:
+            The model ID to split.
+
+    Returns:
+        The split model ID.
+
+    Raises:
+        If the model ID is not valid.
+    """
+    # Importing here to avoid circular imports
+    from .data_models import ModelIdComponents
+
+    # Attempt to extract the model ID, revision, and param using regex
+    model_id_match = re.match(pattern=r"^[^@#]+", string=model_id)
+    revision_match = re.search(pattern=r"@([^@#]+)", string=model_id)
+    param_match = re.search(pattern=r"#([^@#]+)", string=model_id)
+
+    # If we cannot extract the model ID, raise an error
+    if model_id_match is None:
+        raise InvalidModel(f"The model ID {model_id!r} is not valid.")
+    model_id = model_id_match.group()
+
+    # Extract the revision and param and return the result
+    revision = revision_match.group(1) if revision_match is not None else "main"
+    param = param_match.group(1) if param_match is not None else None
+    return ModelIdComponents(model_id=model_id, revision=revision, param=param)
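Note: two details of the new module are worth calling out. `scramble`/`unscramble` are exact inverses because both seed `np.random.default_rng` with the same constant and `np.argsort` of a permutation yields its inverse, and `split_model_id` parses the `<model>[@<revision>][#<param>]` ID syntax with three small regexes. A self-contained check of both (the model ID below is hypothetical):

```python
import re

import numpy as np

def scramble(text: str) -> str:
    rng = np.random.default_rng(seed=4242)
    permutation = rng.permutation(x=len(text))
    return "".join(text[i] for i in permutation)

def unscramble(scrambled_text: str) -> str:
    rng = np.random.default_rng(seed=4242)
    permutation = rng.permutation(x=len(scrambled_text))
    inverse_permutation = np.argsort(permutation)  # inverse of the permutation
    return "".join(scrambled_text[i] for i in inverse_permutation)

assert unscramble(scramble("hello ScandEval")) == "hello ScandEval"

# Model-ID parsing, as in split_model_id:
model_id = "org/model@v1.0#4bit"
print(re.match(r"^[^@#]+", model_id).group())      # org/model
print(re.search(r"@([^@#]+)", model_id).group(1))  # v1.0
print(re.search(r"#([^@#]+)", model_id).group(1))  # 4bit
```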
scandeval/task_group_utils/sequence_classification.py
CHANGED

@@ -10,12 +10,9 @@ import numpy as np

 from ..enums import TaskGroup
 from ..exceptions import InvalidBenchmark
+from ..string_utils import extract_multiple_choice_labels
 from ..types import Predictions
-from ..utils import (
-    extract_multiple_choice_labels,
-    log_once,
-    raise_if_model_output_contains_nan_values,
-)
+from ..utils import log_once, raise_if_model_output_contains_nan_values

 if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
scandeval/task_group_utils/token_classification.py
CHANGED

@@ -9,10 +9,8 @@ import numpy as np

 from ..exceptions import InvalidBenchmark
 from ..logging_utils import log
-from ..utils import (
-    extract_json_dict_from_string,
-    raise_if_model_output_contains_nan_values,
-)
+from ..string_utils import extract_json_dict_from_string
+from ..utils import raise_if_model_output_contains_nan_values

 if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset