EuroEval 16.3.0-py3-none-any.whl → 16.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +3 -2
- euroeval/benchmark_config_factory.py +0 -4
- euroeval/benchmark_modules/base.py +3 -16
- euroeval/benchmark_modules/fresh.py +2 -1
- euroeval/benchmark_modules/hf.py +99 -62
- euroeval/benchmark_modules/litellm.py +101 -41
- euroeval/benchmark_modules/vllm.py +91 -83
- euroeval/benchmarker.py +84 -78
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/constants.py +6 -0
- euroeval/data_loading.py +14 -11
- euroeval/data_models.py +12 -4
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/czech.py +79 -0
- euroeval/dataset_configs/danish.py +10 -11
- euroeval/dataset_configs/dutch.py +0 -1
- euroeval/dataset_configs/english.py +0 -1
- euroeval/dataset_configs/estonian.py +11 -1
- euroeval/dataset_configs/finnish.py +0 -1
- euroeval/dataset_configs/french.py +0 -1
- euroeval/dataset_configs/german.py +0 -1
- euroeval/dataset_configs/italian.py +0 -1
- euroeval/dataset_configs/latvian.py +0 -1
- euroeval/dataset_configs/lithuanian.py +9 -3
- euroeval/dataset_configs/norwegian.py +0 -1
- euroeval/dataset_configs/polish.py +0 -1
- euroeval/dataset_configs/portuguese.py +0 -1
- euroeval/dataset_configs/slovak.py +60 -0
- euroeval/dataset_configs/spanish.py +0 -1
- euroeval/dataset_configs/swedish.py +10 -12
- euroeval/finetuning.py +21 -15
- euroeval/generation.py +10 -10
- euroeval/generation_utils.py +2 -3
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +9 -5
- euroeval/metrics/llm_as_a_judge.py +5 -3
- euroeval/metrics/pipeline.py +17 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +11 -14
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/linguistic_acceptability.py +21 -3
- euroeval/prompt_templates/multiple_choice.py +25 -1
- euroeval/prompt_templates/named_entity_recognition.py +51 -11
- euroeval/prompt_templates/reading_comprehension.py +31 -3
- euroeval/prompt_templates/sentiment_classification.py +23 -1
- euroeval/prompt_templates/summarization.py +26 -6
- euroeval/scores.py +7 -7
- euroeval/speed_benchmark.py +3 -5
- euroeval/task_group_utils/multiple_choice_classification.py +0 -3
- euroeval/task_group_utils/question_answering.py +0 -3
- euroeval/task_group_utils/sequence_classification.py +43 -31
- euroeval/task_group_utils/text_to_text.py +17 -8
- euroeval/task_group_utils/token_classification.py +10 -9
- euroeval/tokenisation_utils.py +14 -12
- euroeval/utils.py +29 -146
- {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/METADATA +4 -4
- euroeval-16.4.0.dist-info/RECORD +75 -0
- euroeval-16.3.0.dist-info/RECORD +0 -71
- {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmarker.py
CHANGED
@@ -1,10 +1,11 @@
 """Class that benchmarks language models."""
 
 import contextlib
+import datetime as dt
 import json
 import logging
+import os
 import re
-import sys
 import typing as t
 from pathlib import Path
 from shutil import rmtree
@@ -12,7 +13,6 @@ from time import sleep
 
 from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
 from torch.distributed import destroy_process_group
-from tqdm.auto import tqdm
 
 from .benchmark_config_factory import build_benchmark_config
 from .constants import GENERATIVE_PIPELINE_TAGS
@@ -23,6 +23,7 @@ from .enums import Device, GenerativeType, ModelType
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
 from .generation import generate
+from .logging_utils import adjust_logging_level, get_pbar, log, log_once
 from .model_config import get_model_config
 from .model_loading import load_model
 from .scores import log_scores
@@ -32,7 +33,6 @@ from .utils import (
     enforce_reproducibility,
     get_package_version,
     internet_connection_available,
-    log_once,
     split_model_id,
 )
 
@@ -41,9 +41,6 @@ if t.TYPE_CHECKING:
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
 
-logger = logging.getLogger("euroeval")
-
-
 class Benchmarker:
     """Benchmarking all the language models.
 
@@ -200,6 +197,10 @@ class Benchmarker:
                 "Try installing it with `pip install hf_transfer`."
             )
 
+        # If FULL_LOG has been set, then force verbose mode
+        if os.getenv("FULL_LOG", "0") == "1":
+            verbose = True
+
         self.benchmark_config_default_params = BenchmarkConfigParams(
             task=task,
             dataset=dataset,
@@ -301,7 +302,6 @@ class Benchmarker:
         )
         del dataset
 
-        log_once(f"Loading model {model_config.model_id}", level=logging.INFO)
         model = load_model(
             model_config=model_config,
             dataset_config=dataset_config,
@@ -611,7 +611,7 @@ class Benchmarker:
 
         # Get all the model configs
         model_configs: list[ModelConfig] = list()
-        for model_id in tqdm(
+        for model_id in get_pbar(
             iterable=model_ids,
             desc="Fetching model configurations",
             disable=not benchmark_config.verbose or not benchmark_config.progress_bar,
@@ -622,7 +622,7 @@ class Benchmarker:
                 )
                 model_configs.append(model_config)
             except InvalidModel as e:
-
+                log(e.message, level=logging.ERROR)
 
         # Create a dictionary that takes each model config to the dataset configs that
         # we need to benchmark the model on. Here we remove the datasets that the model
@@ -651,21 +651,22 @@ class Benchmarker:
             for dataset_configs in model_config_to_dataset_configs.values()
         )
         if total_benchmarks == 0:
-
+            log(
                 "No benchmarks to run, as all the selected models have already been "
-                "benchmarked on all the selected datasets."
+                "benchmarked on all the selected datasets.",
+                level=logging.INFO,
             )
             return list()
 
-        logger.info(f"Initiated evaluation of {total_benchmarks:,} benchmarks.")
-
         num_finished_benchmarks = 0
         current_benchmark_results: list[BenchmarkResult] = list()
+        benchmark_params_to_revert: dict[str, t.Any] = dict()
         for model_config in model_configs:
             if not model_config_to_dataset_configs[model_config]:
-
+                log(
                     f"Skipping model {model_config.model_id!r} because it has "
-                    "already been benchmarked on all valid datasets."
+                    "already been benchmarked on all valid datasets.",
+                    level=logging.DEBUG,
                 )
                 continue
 
@@ -691,7 +692,6 @@ class Benchmarker:
             )
 
             loaded_model: BenchmarkModule | None = None
-            benchmark_params_to_revert: dict[str, t.Any] = dict()
             for dataset_config in model_config_to_dataset_configs[model_config]:
                 # Revert any changes to the benchmark configuration made for the
                 # previous dataset
@@ -704,18 +704,20 @@ class Benchmarker:
                     "val" not in dataset_config.splits
                     and not benchmark_config.evaluate_test_split
                 ):
-
+                    log(
                         "The dataset does not have a validation split, so even though "
                         "you requested evaluating the validation split (the default), "
-                        "we will evaluate on the test split."
+                        "we will evaluate on the test split.",
+                        level=logging.DEBUG,
                     )
                     benchmark_params_to_revert["evaluate_test_split"] = False
                     benchmark_config.evaluate_test_split = True
                 if dataset_config.task.requires_zero_shot and benchmark_config.few_shot:
-
+                    log(
                         "The task requires zero-shot evaluation, so even though you "
                         "requested few-shot evaluation (the default), we will evaluate "
-                        "zero-shot."
+                        "zero-shot.",
+                        level=logging.DEBUG,
                     )
                     benchmark_params_to_revert["few_shot"] = True
                     benchmark_config.few_shot = False
@@ -723,13 +725,7 @@ class Benchmarker:
                 # We do not re-initialise generative models as their architecture is not
                 # customised to specific datasets
                 if model_config.model_type == ModelType.GENERATIVE:
-                    initial_logging(
-                        model_config=model_config,
-                        dataset_config=dataset_config,
-                        benchmark_config=benchmark_config,
-                    )
                     if loaded_model is None:
-                        logger.info("Loading model...")
                         try:
                             loaded_model = load_model(
                                 model_config=model_config,
@@ -739,7 +735,7 @@ class Benchmarker:
                         except InvalidModel as e:
                             if benchmark_config.raise_errors:
                                 raise e
-
+                            log(e.message, level=logging.ERROR)
 
                             # Add the remaining number of benchmarks for the model to
                             # our benchmark counter, since we're skipping the rest of
@@ -759,12 +755,13 @@ class Benchmarker:
                     loaded_model.generative_type
                     not in dataset_config.allowed_generative_types
                 ):
-
+                    log(
                         f"Skipping the benchmark of model "
                         f"{model_config.model_id!r}on dataset "
                         f"{dataset_config.name!r} because the model has generative "
                         f"type {loaded_model.generative_type} and the dataset "
-                        f"only allows {dataset_config.allowed_generative_types}."
+                        f"only allows {dataset_config.allowed_generative_types}.",
+                        level=logging.DEBUG,
                     )
                     num_finished_benchmarks += 1
                     continue
@@ -775,6 +772,8 @@ class Benchmarker:
                     model_config=model_config,
                     dataset_config=dataset_config,
                     benchmark_config=benchmark_config,
+                    num_finished_benchmarks=num_finished_benchmarks,
+                    num_total_benchmarks=total_benchmarks,
                 )
 
                 if (
@@ -784,12 +783,12 @@ class Benchmarker:
                     raise benchmark_output_or_err
 
                 elif isinstance(benchmark_output_or_err, InvalidBenchmark):
-
+                    log(benchmark_output_or_err.message, level=logging.WARNING)
                     num_finished_benchmarks += 1
                     continue
 
                 elif isinstance(benchmark_output_or_err, InvalidModel):
-
+                    log(benchmark_output_or_err.message, level=logging.WARNING)
 
                     # Add the remaining number of benchmarks for the model to our
                     # benchmark counter, since we're skipping the rest of them
@@ -805,15 +804,13 @@ class Benchmarker:
                 record.append_to_results(results_path=self.results_path)
 
                 num_finished_benchmarks += 1
-                logger.info(
-                    f"Finished {num_finished_benchmarks} out of "
-                    f"{total_benchmarks} benchmarks."
-                )
 
             del loaded_model
             if benchmark_config.clear_model_cache:
                 clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
 
+        log(f"Completed {num_finished_benchmarks:,} benchmarks.\n", level=logging.INFO)
+
        # This avoids the following warning at the end of the benchmarking:
        # Warning: WARNING: process group has NOT been destroyed before we destruct
        # ProcessGroupNCCL. On normal program exit, the application should call
@@ -857,6 +854,8 @@ class Benchmarker:
        model_config: "ModelConfig",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
+        num_finished_benchmarks: int,
+        num_total_benchmarks: int,
    ) -> BenchmarkResult | InvalidBenchmark | InvalidModel:
        """Benchmark a single model on a single dataset.
 
@@ -869,25 +868,29 @@ class Benchmarker:
                The configuration of the dataset we are evaluating on.
            benchmark_config:
                The general benchmark configuration.
+            num_finished_benchmarks:
+                The number of benchmarks that have already been completed.
+            num_total_benchmarks:
+                The total number of benchmarks to be completed.
 
        Returns:
            The benchmark result, or an error if the benchmark was unsuccessful.
-        """
-        if model is None:
-            initial_logging(
-                model_config=model_config,
-                dataset_config=dataset_config,
-                benchmark_config=benchmark_config,
-            )
 
-
+        Raises:
+            RuntimeError:
+                If the MPS fallback is not enabled when required.
+            InvalidBenchmark:
+                If the benchmark was unsuccessful.
+            InvalidModel:
+                If the model is invalid.
+        """
+        for _ in range(num_attempts := 5):
            try:
                # Set random seeds to enforce reproducibility of the randomly
                # initialised weights
                rng = enforce_reproducibility()
 
                if model is None or model_config.model_type != ModelType.GENERATIVE:
-                    logger.info("Loading model...")
                    model = load_model(
                        model_config=model_config,
                        dataset_config=dataset_config,
@@ -895,6 +898,14 @@ class Benchmarker:
                    )
                    assert model is not None
 
+                initial_logging(
+                    model_config=model_config,
+                    dataset_config=dataset_config,
+                    benchmark_config=benchmark_config,
+                    num_finished_benchmarks=num_finished_benchmarks,
+                    num_total_benchmarks=num_total_benchmarks,
+                )
+
                if dataset_config.task == SPEED:
                    scores = benchmark_speed(
                        model=model, benchmark_config=benchmark_config
@@ -962,14 +973,15 @@ class Benchmarker:
                    few_shot=benchmark_config.few_shot,
                    validation_split=not benchmark_config.evaluate_test_split,
                )
-
+                log(f"Results:\n{results}", level=logging.DEBUG)
                return record
 
            except HuggingFaceHubDown:
                wait_time = 30
-
+                log(
                    f"The Hugging Face Hub seems to be down. Retrying in {wait_time} "
-                    "seconds."
+                    "seconds.",
+                    level=logging.DEBUG,
                )
                sleep(wait_time)
                continue
@@ -992,12 +1004,18 @@ class Benchmarker:
                elif benchmark_config.raise_errors:
                    raise e
                return e
+        else:
+            return InvalidBenchmark(
+                f"Failed to benchmark model {model_config.model_id!r} on dataset "
+                f"{dataset_config.name!r} after {num_attempts} attempts."
+            )
 
    def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any:  # noqa: ANN401
        """Alias for `self.benchmark()`."""
-
+        log(
            "Calling the `Benchmarker` class directly is deprecated. Please use the "
-            "`benchmark` function instead. This will be removed in a future version."
+            "`benchmark` function instead. This will be removed in a future version.",
+            level=logging.WARNING,
        )
        return self.benchmark(*args, **kwds)
 
@@ -1050,28 +1068,6 @@ def model_has_been_benchmarked(
    return False
 
 
-def adjust_logging_level(verbose: bool, ignore_testing: bool = False) -> int:
-    """Adjust the logging level based on verbosity.
-
-    Args:
-        verbose:
-            Whether to output additional output.
-        ignore_testing:
-            Whether to ignore the testing flag.
-
-    Returns:
-        The logging level that was set.
-    """
-    if hasattr(sys, "_called_from_test") and not ignore_testing:
-        logging_level = logging.CRITICAL
-    elif verbose:
-        logging_level = logging.DEBUG
-    else:
-        logging_level = logging.INFO
-    logger.setLevel(logging_level)
-    return logging_level
-
-
 def clear_model_cache_fn(cache_dir: str) -> None:
    """Clear the model cache.
 
@@ -1109,6 +1105,8 @@ def initial_logging(
    model_config: "ModelConfig",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
+    num_finished_benchmarks: int,
+    num_total_benchmarks: int,
 ) -> None:
    """Initial logging at the start of the benchmarking process.
 
@@ -1119,6 +1117,10 @@ def initial_logging(
            The configuration of the dataset we are evaluating on.
        benchmark_config:
            The general benchmark configuration.
+        num_finished_benchmarks:
+            The number of benchmarks that have already been finished.
+        num_total_benchmarks:
+            The total number of benchmarks to be run.
    """
    model_id = model_config.model_id
    if model_config.revision and model_config.revision != "main":
@@ -1135,21 +1137,25 @@ def initial_logging(
    else:
        eval_type = "Benchmarking"
 
-
-        f"{eval_type} {model_id} on the {split_type} split of "
-        f"{dataset_config.pretty_name}"
+    log_once(
+        f"\n{eval_type} {model_id} on the {split_type} split of "
+        f"{dataset_config.pretty_name} ({num_finished_benchmarks + 1}/"
+        f"{num_total_benchmarks} benchmarks)...",
+        prefix=f"\n[{dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]",
    )
 
    if dataset_config.unofficial:
-
+        log_once(
            f"Note that the {dataset_config.name!r} dataset is unofficial, "
            "meaning that the resulting evaluation will not be included in the "
-            "official leaderboard."
+            "official leaderboard.",
+            level=logging.WARNING,
        )
 
    if benchmark_config.debug:
-
+        log_once(
            "Running in debug mode. This will output additional information, as "
            "well as store the model outputs in the current directory after each "
-            "batch. For this reason, evaluation will be slower."
+            "batch. For this reason, evaluation will be slower.",
+            level=logging.WARNING,
        )
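The FULL_LOG override added to Benchmarker.__init__ above can be exercised from user code. A minimal sketch, assuming the usual top-level Benchmarker export and that the environment variable is set before the Benchmarker is constructed:

import os

os.environ["FULL_LOG"] = "1"  # read in Benchmarker.__init__ via os.getenv("FULL_LOG", "0")

from euroeval import Benchmarker  # assuming the usual top-level export

benchmarker = Benchmarker(verbose=False)  # verbose is forced to True by the override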
euroeval/caching_utils.py
ADDED
@@ -0,0 +1,79 @@
+"""Caching utility functions."""
+
+import typing as t
+from functools import wraps
+
+from .constants import T
+
+
+def cache_arguments(
+    *arguments: str, disable_condition: t.Callable[[], bool] = lambda: False
+) -> t.Callable[[t.Callable[..., T]], t.Callable[..., T]]:
+    """Cache specified arguments of a function.
+
+    Args:
+        arguments:
+            The list of argument names to cache. If empty, all arguments are cached.
+        disable_condition:
+            A function that checks if cache should be disabled.
+
+    Returns:
+        A decorator that caches the specified arguments of a function.
+    """
+
+    def caching_decorator(func: t.Callable[..., T]) -> t.Callable[..., T]:
+        """Decorator that caches the specified arguments of a function.
+
+        Args:
+            func:
+                The function to decorate.
+
+        Returns:
+            The decorated function.
+        """
+        cache: dict[tuple, T] = dict()
+
+        @wraps(func)
+        def wrapper(*args, **kwargs) -> T:
+            """Wrapper function that caches the specified arguments.
+
+            Args:
+                *args:
+                    The positional arguments to the function.
+                **kwargs:
+                    The keyword arguments to the function.
+
+            Returns:
+                The result of the function.
+
+            Raises:
+                ValueError:
+                    If an argument name is not found in the function parameters.
+            """
+            if not arguments:
+                key = args + tuple(kwargs[k] for k in sorted(kwargs.keys()))
+            else:
+                func_params = func.__code__.co_varnames
+                key_items: list[t.Any] = []
+                for arg_name in arguments:
+                    if arg_name in kwargs:
+                        key_items.append(kwargs[arg_name])
+                    else:
+                        try:
+                            arg_index = func_params.index(arg_name)
+                            key_items.append(args[arg_index])
+                        except (ValueError, IndexError):
+                            raise ValueError(
+                                f"Argument {arg_name} not found in function "
+                                f"{func.__name__} parameters."
+                            )
+                key = tuple(key_items)
+
+            # Do not cache if the condition is met
+            if key not in cache or disable_condition():
+                cache[key] = func(*args, **kwargs)
+            return cache[key]
+
+        return wrapper
+
+    return caching_decorator
euroeval/callbacks.py
CHANGED
@@ -7,6 +7,8 @@ from collections.abc import Sized
 from tqdm.auto import tqdm
 from transformers.trainer_callback import ProgressCallback
 
+from .logging_utils import get_pbar
+
 if t.TYPE_CHECKING:
     from torch.utils.data import DataLoader
     from transformers.trainer_callback import TrainerControl, TrainerState
@@ -32,11 +34,8 @@ class NeverLeaveProgressCallback(ProgressCallback):
        """Callback actions when training begins."""
        if state.is_local_process_zero:
            desc = "Finetuning model"
-            self.training_bar = tqdm(
-                total=None,
-                leave=False,
-                desc=desc,
-                disable=hasattr(sys, "_called_from_test"),
+            self.training_bar = get_pbar(
+                total=None, desc=desc, disable=hasattr(sys, "_called_from_test")
            )
            self.current_step = 0
 
@@ -67,9 +66,8 @@ class NeverLeaveProgressCallback(ProgressCallback):
        if state.is_local_process_zero and correct_dtype:
            if self.prediction_bar is None:
                desc = "Evaluating model"
-                self.prediction_bar = tqdm(
+                self.prediction_bar = get_pbar(
                    total=len(eval_dataloader),
-                    leave=False,
                    desc=desc,
                    disable=hasattr(sys, "_called_from_test"),
                )
euroeval/constants.py
CHANGED
@@ -1,7 +1,13 @@
 """Constants used throughout the project."""
 
+from typing import TypeVar
+
 from .enums import TaskGroup
 
+# Type variable used for generic typing
+T = TypeVar("T", bound=object)
+
+
 # This is used as input to generative models; it cannot be a special token
 DUMMY_FILL_VALUE = 100
 
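The new type variable T is what lets cache_arguments above preserve the return type of whatever it wraps. A minimal sketch of the same generic pattern; the first_or_default helper is hypothetical:

from euroeval.constants import T


def first_or_default(items: list[T], default: T) -> T:
    """Return the first element of `items`, falling back to `default` when empty."""
    return items[0] if items else default


assert first_or_default([1, 2, 3], default=0) == 1
assert first_or_default([], default="fallback") == "fallback"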
euroeval/data_loading.py
CHANGED
@@ -12,6 +12,7 @@ from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator
 
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
+from .logging_utils import log, no_terminal_output
 from .tasks import EUROPEAN_VALUES
 from .utils import unscramble
 
@@ -20,8 +21,6 @@ if t.TYPE_CHECKING:
 
     from .data_models import BenchmarkConfig, DatasetConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def load_data(
     rng: Generator, dataset_config: "DatasetConfig", benchmark_config: "BenchmarkConfig"
@@ -106,11 +105,12 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
     num_attempts = 5
     for _ in range(num_attempts):
         try:
-            dataset = load_dataset(
-                path=dataset_config.huggingface_id,
-                cache_dir=cache_dir,
-                token=unscramble("XbjeOLhwebEaSaDUMqqaPaPIhgOcyOfDpGnX_"),
-            )
+            with no_terminal_output():
+                dataset = load_dataset(
+                    path=dataset_config.huggingface_id,
+                    cache_dir=cache_dir,
+                    token=unscramble("XbjeOLhwebEaSaDUMqqaPaPIhgOcyOfDpGnX_"),
+                )
             break
         except (
             FileNotFoundError,
@@ -118,9 +118,11 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
             DatasetsError,
             requests.ConnectionError,
             requests.ReadTimeout,
-        ):
-
-            f"Failed to load dataset {dataset_config.huggingface_id!r}
+        ) as e:
+            log(
+                f"Failed to load dataset {dataset_config.huggingface_id!r}, due to "
+                f"the following error: {e}. Retrying...",
+                level=logging.DEBUG,
             )
             time.sleep(1)
             continue
@@ -129,7 +131,8 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
     else:
         raise InvalidBenchmark(
             f"Failed to load dataset {dataset_config.huggingface_id!r} after "
-            f"{num_attempts} attempts."
+            f"{num_attempts} attempts. Run with verbose mode to see the individual "
+            "errors."
         )
     assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
     missing_keys = [key for key in dataset_config.splits if key not in dataset]
euroeval/data_models.py
CHANGED
@@ -558,14 +558,14 @@ class DatasetConfig:
        )
 
    @property
-    def id2label(self) -> dict[int, str]:
+    def id2label(self) -> "HashableDict":
        """The mapping from ID to label."""
-        return {idx: label for idx, label in enumerate(self.labels)}
+        return HashableDict({idx: label for idx, label in enumerate(self.labels)})
 
    @property
-    def label2id(self) -> dict[str, int]:
+    def label2id(self) -> "HashableDict":
        """The mapping from label to ID."""
-        return {label: i for i, label in enumerate(self.labels)}
+        return HashableDict({label: i for i, label in enumerate(self.labels)})
 
    @property
    def num_labels(self) -> int:
@@ -783,3 +783,11 @@ class ModelIdComponents:
    model_id: str
    revision: str
    param: str | None
+
+
+class HashableDict(dict):
+    """A hashable dictionary."""
+
+    def __hash__(self) -> int:  # type: ignore[override]
+        """Return the hash of the dictionary."""
+        return hash(frozenset(self.items()))
euroeval/dataset_configs/__init__.py
CHANGED
@@ -3,6 +3,7 @@
 from ..data_models import DatasetConfig
 from ..languages import get_all_languages
 from ..tasks import SPEED
+from .czech import *  # noqa: F403
 from .danish import *  # noqa: F403
 from .dutch import *  # noqa: F403
 from .english import *  # noqa: F403
@@ -18,6 +19,7 @@ from .lithuanian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
 from .polish import *  # noqa: F403
 from .portuguese import *  # noqa: F403
+from .slovak import *  # noqa: F403
 from .spanish import *  # noqa: F403
 from .swedish import *  # noqa: F403
 