EuroEval 16.1.1-py3-none-any.whl → 16.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


euroeval/__init__.py CHANGED
@@ -12,12 +12,13 @@ import warnings
12
12
  from termcolor import colored
13
13
 
14
14
  # Block specific warnings before importing anything else, as they can be noisy
15
- warnings.filterwarnings("ignore", category=UserWarning)
16
- warnings.filterwarnings("ignore", category=FutureWarning)
17
- logging.getLogger("httpx").setLevel(logging.CRITICAL)
18
- logging.getLogger("datasets").setLevel(logging.CRITICAL)
19
- logging.getLogger("vllm").setLevel(logging.CRITICAL)
20
- os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
15
+ if os.getenv("FULL_LOG") != "1":
16
+ warnings.filterwarnings("ignore", category=UserWarning)
17
+ warnings.filterwarnings("ignore", category=FutureWarning)
18
+ logging.getLogger("httpx").setLevel(logging.CRITICAL)
19
+ logging.getLogger("datasets").setLevel(logging.CRITICAL)
20
+ logging.getLogger("vllm").setLevel(logging.CRITICAL)
21
+ os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
21
22
 
22
23
  # Set up logging
23
24
  fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
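The warning and log suppression is now opt-out rather than unconditional. A minimal sketch of how a user could restore full output, assuming the `FULL_LOG` variable is read at import time exactly as in the gate above:

```python
# Sketch: opt out of EuroEval's default warning/log suppression. The variable
# must be set before euroeval is imported, since the gate above runs at import time.
import os

os.environ["FULL_LOG"] = "1"

import euroeval  # noqa: E402  # warnings and third-party logs stay enabled
```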
@@ -6,9 +6,9 @@ import typing as t
6
6
 
7
7
  import torch
8
8
 
9
- from .data_models import BenchmarkConfig
9
+ from .data_models import BenchmarkConfig, BenchmarkConfigParams
10
10
  from .dataset_configs import get_all_dataset_configs
11
- from .enums import Device, GenerativeType
11
+ from .enums import Device
12
12
  from .exceptions import InvalidBenchmark
13
13
  from .languages import get_all_languages
14
14
  from .tasks import SPEED, get_all_tasks
@@ -21,150 +21,66 @@ logger = logging.getLogger("euroeval")
21
21
 
22
22
 
23
23
  def build_benchmark_config(
24
- progress_bar: bool,
25
- save_results: bool,
26
- task: str | list[str] | None,
27
- dataset: str | list[str] | None,
28
- language: str | list[str],
29
- model_language: str | list[str] | None,
30
- dataset_language: str | list[str] | None,
31
- device: Device | None,
32
- batch_size: int,
33
- raise_errors: bool,
34
- cache_dir: str,
35
- api_key: str | None,
36
- force: bool,
37
- verbose: bool,
38
- trust_remote_code: bool,
39
- clear_model_cache: bool,
40
- evaluate_test_split: bool,
41
- few_shot: bool,
42
- num_iterations: int,
43
- api_base: str | None,
44
- api_version: str | None,
45
- gpu_memory_utilization: float,
46
- generative_type: GenerativeType | None,
47
- debug: bool,
48
- run_with_cli: bool,
49
- requires_safetensors: bool,
24
+ benchmark_config_params: BenchmarkConfigParams,
50
25
  ) -> BenchmarkConfig:
51
26
  """Create a benchmark configuration.
52
27
 
53
28
  Args:
54
- progress_bar:
55
- Whether to show a progress bar when running the benchmark.
56
- save_results:
57
- Whether to save the benchmark results to a file.
58
- task:
59
- The tasks to include for dataset. If None then datasets will not be
60
- filtered based on their task.
61
- dataset:
62
- The datasets to include for task. If None then all datasets will be
63
- included, limited by the `task` parameter.
64
- language:
65
- The language codes of the languages to include, both for models and
66
- datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
67
- to 'all' if all languages should be considered.
68
- model_language:
69
- The language codes of the languages to include for models. If None then
70
- the `language` parameter will be used.
71
- dataset_language:
72
- The language codes of the languages to include for datasets. If None then
73
- the `language` parameter will be used.
74
- device:
75
- The device to use for running the models. If None then the device will be
76
- set automatically.
77
- batch_size:
78
- The batch size to use for running the models.
79
- raise_errors:
80
- Whether to raise errors when running the benchmark.
81
- cache_dir:
82
- The directory to use for caching the models.
83
- api_key:
84
- The API key to use for a given inference server.
85
- force:
86
- Whether to force the benchmark to run even if the results are already
87
- cached.
88
- verbose:
89
- Whether to print verbose output when running the benchmark. This is
90
- automatically set if `debug` is True.
91
- trust_remote_code:
92
- Whether to trust remote code when running the benchmark.
93
- clear_model_cache:
94
- Whether to clear the model cache before running the benchmark.
95
- evaluate_test_split:
96
- Whether to use the test split for the datasets.
97
- few_shot:
98
- Whether to use few-shot learning for the models.
99
- num_iterations:
100
- The number of iterations each model should be evaluated for.
101
- api_base:
102
- The base URL for a given inference API. Only relevant if `model` refers to a
103
- model on an inference API.
104
- api_version:
105
- The version of the API to use for a given inference API.
106
- gpu_memory_utilization:
107
- The GPU memory utilization to use for vLLM. A larger value will result in
108
- faster evaluation, but at the risk of running out of GPU memory. Only reduce
109
- this if you are running out of GPU memory. Only relevant if the model is
110
- generative.
111
- generative_type:
112
- The type of generative model. Only relevant if the model is generative. If
113
- not specified, the type will be inferred automatically.
114
- debug:
115
- Whether to run the benchmark in debug mode.
116
- run_with_cli:
117
- Whether the benchmark is being run with the CLI.
118
- requires_safetensors:
119
- Whether to only allow evaluations of models stored as safetensors.
29
+ benchmark_config_params:
30
+ The parameters for creating the benchmark configuration.
120
31
 
121
32
  Returns:
122
33
  The benchmark configuration.
123
34
  """
124
- language_codes = get_correct_language_codes(language_codes=language)
35
+ language_codes = get_correct_language_codes(
36
+ language_codes=benchmark_config_params.language
37
+ )
125
38
  model_languages = prepare_languages(
126
- language_codes=model_language, default_language_codes=language_codes
39
+ language_codes=benchmark_config_params.model_language,
40
+ default_language_codes=language_codes,
127
41
  )
128
42
  dataset_languages = prepare_languages(
129
- language_codes=dataset_language, default_language_codes=language_codes
43
+ language_codes=benchmark_config_params.dataset_language,
44
+ default_language_codes=language_codes,
130
45
  )
131
46
 
132
47
  tasks, datasets = prepare_tasks_and_datasets(
133
- task=task, dataset=dataset, dataset_languages=dataset_languages
48
+ task=benchmark_config_params.task,
49
+ dataset=benchmark_config_params.dataset,
50
+ dataset_languages=dataset_languages,
134
51
  )
135
52
 
136
- torch_device = prepare_device(device=device)
137
-
138
- # Set variable with number of iterations
139
- if hasattr(sys, "_called_from_test"):
140
- num_iterations = 1
141
-
142
53
  return BenchmarkConfig(
143
54
  model_languages=model_languages,
144
55
  dataset_languages=dataset_languages,
145
56
  tasks=tasks,
146
57
  datasets=datasets,
147
- batch_size=batch_size,
148
- raise_errors=raise_errors,
149
- cache_dir=cache_dir,
150
- api_key=api_key,
151
- force=force,
152
- progress_bar=progress_bar,
153
- save_results=save_results,
154
- verbose=verbose or debug,
155
- device=torch_device,
156
- trust_remote_code=trust_remote_code,
157
- clear_model_cache=clear_model_cache,
158
- evaluate_test_split=evaluate_test_split,
159
- few_shot=few_shot,
160
- num_iterations=num_iterations,
161
- api_base=api_base,
162
- api_version=api_version,
163
- gpu_memory_utilization=gpu_memory_utilization,
164
- generative_type=generative_type,
165
- debug=debug,
166
- run_with_cli=run_with_cli,
167
- requires_safetensors=requires_safetensors,
58
+ batch_size=benchmark_config_params.batch_size,
59
+ raise_errors=benchmark_config_params.raise_errors,
60
+ cache_dir=benchmark_config_params.cache_dir,
61
+ api_key=benchmark_config_params.api_key,
62
+ force=benchmark_config_params.force,
63
+ progress_bar=benchmark_config_params.progress_bar,
64
+ save_results=benchmark_config_params.save_results,
65
+ verbose=benchmark_config_params.verbose or benchmark_config_params.debug,
66
+ device=prepare_device(device=benchmark_config_params.device),
67
+ trust_remote_code=benchmark_config_params.trust_remote_code,
68
+ clear_model_cache=benchmark_config_params.clear_model_cache,
69
+ evaluate_test_split=benchmark_config_params.evaluate_test_split,
70
+ few_shot=benchmark_config_params.few_shot,
71
+ num_iterations=(
72
+ 1
73
+ if hasattr(sys, "_called_from_test")
74
+ else benchmark_config_params.num_iterations
75
+ ),
76
+ api_base=benchmark_config_params.api_base,
77
+ api_version=benchmark_config_params.api_version,
78
+ gpu_memory_utilization=benchmark_config_params.gpu_memory_utilization,
79
+ generative_type=benchmark_config_params.generative_type,
80
+ debug=benchmark_config_params.debug,
81
+ run_with_cli=benchmark_config_params.run_with_cli,
82
+ requires_safetensors=benchmark_config_params.requires_safetensors,
83
+ download_only=benchmark_config_params.download_only,
168
84
  )
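The factory now receives the whole parameter set as a single pydantic object instead of roughly twenty-five keyword arguments. An illustrative toy model of the new call shape; the field names below are only a small subset of the real `BenchmarkConfigParams`:

```python
# Toy stand-in for BenchmarkConfigParams, purely to illustrate the call shape.
import pydantic


class TinyParams(pydantic.BaseModel):
    language: str | list[str] = "da"
    batch_size: int = 32
    download_only: bool = False


params = TinyParams(language=["da", "sv"], download_only=True)

# Before: build_benchmark_config(**params.model_dump())
# After:  build_benchmark_config(benchmark_config_params=params)
```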
169
85
 
170
86
 
@@ -146,21 +146,25 @@ class HuggingFaceEncoderModel(BenchmarkModule):
146
146
  Returns:
147
147
  The number of parameters in the model.
148
148
  """
149
- token = get_hf_token(api_key=self.benchmark_config.api_key)
150
- hf_api = HfApi(token=token)
151
- try:
152
- repo_info = hf_api.model_info(
153
- repo_id=self.model_config.adapter_base_model_id
154
- or self.model_config.model_id,
155
- revision=self.model_config.revision,
156
- )
157
- except (
158
- RepositoryNotFoundError,
159
- RevisionNotFoundError,
160
- RequestException,
161
- HFValidationError,
162
- ):
149
+ # No need to try to use the API if we have no internet.
150
+ if not internet_connection_available():
163
151
  repo_info = None
152
+ else:
153
+ token = get_hf_token(api_key=self.benchmark_config.api_key)
154
+ hf_api = HfApi(token=token)
155
+ try:
156
+ repo_info = hf_api.model_info(
157
+ repo_id=self.model_config.adapter_base_model_id
158
+ or self.model_config.model_id,
159
+ revision=self.model_config.revision,
160
+ )
161
+ except (
162
+ RepositoryNotFoundError,
163
+ RevisionNotFoundError,
164
+ RequestException,
165
+ HFValidationError,
166
+ ):
167
+ repo_info = None
164
168
 
165
169
  if (
166
170
  repo_info is not None
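The lookup above skips the Hub entirely when there is no connectivity and otherwise falls back to `None` on any lookup failure. A standalone sketch of the same pattern; `connection_available` is a hypothetical stand-in for EuroEval's `internet_connection_available`:

```python
# Standalone sketch of the offline-first repo lookup shown above.
import socket

from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError


def connection_available(host: str = "1.1.1.1", port: int = 80) -> bool:
    """Cheap connectivity probe, mirroring the socket-based check in utils.py."""
    try:
        socket.create_connection((host, port), timeout=3).close()
        return True
    except OSError:
        return False


def fetch_repo_info(repo_id: str, revision: str = "main"):
    if not connection_available():
        return None  # offline: behave as if no repo metadata is available
    try:
        return HfApi().model_info(repo_id=repo_id, revision=revision)
    except (RepositoryNotFoundError, RevisionNotFoundError):
        return None
```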
@@ -558,7 +562,7 @@ def load_model_and_tokeniser(
558
562
  The benchmark configuration
559
563
 
560
564
  Returns:
561
- The loaded model and tokeniser.
565
+ A pair (model, tokeniser), with the loaded model and tokeniser
562
566
  """
563
567
  config: "PretrainedConfig"
564
568
  block_terminal_output()
@@ -686,6 +690,7 @@ def load_model_and_tokeniser(
686
690
  model=model,
687
691
  model_id=model_id,
688
692
  trust_remote_code=benchmark_config.trust_remote_code,
693
+ model_cache_dir=model_config.model_cache_dir,
689
694
  )
690
695
 
691
696
  return model, tokeniser
@@ -722,6 +727,11 @@ def get_model_repo_info(
722
727
  ):
723
728
  model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
724
729
 
730
+ # If we have no internet, and the model_id is not a directory for a local model
731
+ # we also just create a dummy model info object.
732
+ elif not internet_connection_available():
733
+ model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
734
+
725
735
  # If the model does not exist locally, then we get the model info from the Hugging
726
736
  # Face Hub, if possible
727
737
  if model_info is None:
@@ -867,7 +877,10 @@ def get_model_repo_info(
867
877
 
868
878
 
869
879
  def load_tokeniser(
870
- model: "PreTrainedModel | None", model_id: str, trust_remote_code: bool
880
+ model: "PreTrainedModel | None",
881
+ model_id: str,
882
+ trust_remote_code: bool,
883
+ model_cache_dir: str,
871
884
  ) -> "PreTrainedTokenizer":
872
885
  """Load the tokeniser.
873
886
 
@@ -889,6 +902,7 @@ def load_tokeniser(
889
902
  trust_remote_code=trust_remote_code,
890
903
  padding_side="right",
891
904
  truncation_side="right",
905
+ cache_dir=model_cache_dir,
892
906
  )
893
907
 
894
908
  # If the model is a subclass of a certain model types then we have to add a prefix
@@ -999,6 +1013,7 @@ def load_hf_model_config(
999
1013
  token=get_hf_token(api_key=api_key),
1000
1014
  trust_remote_code=trust_remote_code,
1001
1015
  cache_dir=model_cache_dir,
1016
+ local_files_only=not internet_connection_available(),
1002
1017
  )
1003
1018
  if config.eos_token_id is not None and config.pad_token_id is None:
1004
1019
  if isinstance(config.eos_token_id, list):
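Passing `cache_dir` together with `local_files_only` is what makes a later offline run work: everything is read from the per-model cache directory and no download is attempted. A hedged sketch with an illustrative model id and cache path:

```python
# Sketch of offline-aware loading with transformers; id and path are placeholders.
from transformers import AutoConfig, AutoTokenizer

offline = True  # e.g. `not internet_connection_available()`
model_id = "my-org/my-model"
cache_dir = "/data/euroeval_cache/model_cache/my-org--my-model"

config = AutoConfig.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    local_files_only=offline,  # fail fast instead of hitting the Hub
)
tokeniser = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    local_files_only=offline,
    padding_side="right",
    truncation_side="right",
)
```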
@@ -984,6 +984,7 @@ class LiteLLMModel(BenchmarkModule):
984
984
  model=None,
985
985
  model_id=model_id,
986
986
  trust_remote_code=self.benchmark_config.trust_remote_code,
987
+ model_cache_dir=self.model_config.model_cache_dir,
987
988
  )
988
989
 
989
990
  if (
@@ -1066,6 +1067,7 @@ class LiteLLMModel(BenchmarkModule):
1066
1067
  model=None,
1067
1068
  model_id=model_id,
1068
1069
  trust_remote_code=self.benchmark_config.trust_remote_code,
1070
+ model_cache_dir=self.model_config.model_cache_dir,
1069
1071
  )
1070
1072
 
1071
1073
  all_max_lengths: list[int] = list()
@@ -72,7 +72,9 @@ from ..utils import (
72
72
  create_model_cache_dir,
73
73
  get_hf_token,
74
74
  get_min_cuda_compute_capability,
75
+ internet_connection_available,
75
76
  log_once,
77
+ resolve_model_path,
76
78
  split_model_id,
77
79
  )
78
80
  from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
@@ -146,7 +148,7 @@ class VLLMModel(HuggingFaceEncoderModel):
146
148
  )
147
149
 
148
150
  self.end_of_reasoning_token = get_end_of_reasoning_token(
149
- model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
151
+ model=self._model, tokeniser=self._tokeniser, model_config=model_config
150
152
  )
151
153
  self.end_of_chat_token_ids = get_end_of_chat_token_ids(
152
154
  tokeniser=self._tokeniser, generative_type=self.generative_type
@@ -834,10 +836,15 @@ def load_model_and_tokeniser(
834
836
 
835
837
  clear_vllm()
836
838
 
839
+ # if we do not have an internet connection we need to give the path to the folder
840
+ # that contains the model weights and config files, otherwise vLLM will try to
841
+ # download them regardless if they are already present in the download_dir
842
+ model_path = resolve_model_path(download_dir)
843
+
837
844
  try:
838
845
  model = LLM(
839
- model=model_id,
840
- tokenizer=model_id,
846
+ model=model_id if internet_connection_available() else model_path,
847
+ tokenizer=model_id if internet_connection_available() else model_path,
841
848
  gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
842
849
  max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
843
850
  download_dir=download_dir,
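When offline, vLLM is given the resolved snapshot directory instead of the Hub id, so it loads the files that are already present under `download_dir`. A sketch with placeholder paths:

```python
# Sketch only; the id and paths are placeholders and vLLM must be installed.
from vllm import LLM

offline = True
model_id = "my-org/my-model"
download_dir = "/data/euroeval_cache/model_cache/my-org--my-model"
# e.g. what resolve_model_path(download_dir) would return:
local_path = download_dir + "/models--my-org--my-model/snapshots/<revision-hash>"

llm = LLM(
    model=model_id if not offline else local_path,
    tokenizer=model_id if not offline else local_path,
    download_dir=download_dir,
)
```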
@@ -925,6 +932,7 @@ def load_tokeniser(
925
932
  cache_dir=model_cache_dir,
926
933
  token=token,
927
934
  trust_remote_code=trust_remote_code,
935
+ local_files_only=not internet_connection_available(),
928
936
  )
929
937
  num_retries = 5
930
938
  for _ in range(num_retries):
@@ -937,8 +945,10 @@ def load_tokeniser(
937
945
  padding_side="left",
938
946
  truncation_side="left",
939
947
  model_max_length=model_max_length,
948
+ cache_dir=model_cache_dir,
940
949
  config=config,
941
950
  token=token,
951
+ local_files_only=not internet_connection_available(),
942
952
  )
943
953
  break
944
954
  except (json.JSONDecodeError, OSError, TypeError) as e:
@@ -996,7 +1006,7 @@ def clear_vllm() -> None:
996
1006
 
997
1007
 
998
1008
  def get_end_of_reasoning_token(
999
- model: "LLM", tokeniser: "PreTrainedTokenizer", model_id: str
1009
+ model: "LLM", tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
1000
1010
  ) -> str | None:
1001
1011
  """Get the end-of-reasoning token for a generative model.
1002
1012
 
@@ -1005,21 +1015,26 @@ def get_end_of_reasoning_token(
1005
1015
  The vLLM model.
1006
1016
  tokeniser:
1007
1017
  The tokeniser.
1008
- model_id:
1009
- The model ID.
1018
+ model_config:
1019
+ The model configuration.
1010
1020
 
1011
1021
  Returns:
1012
1022
  The end of reasoning token, or None if it could not be found.
1013
1023
  """
1024
+ model_id = model_config.model_id
1025
+
1014
1026
  # Create a prompt to check if the model uses the reasoning tokens
1015
1027
  prompt = "What is your name?"
1016
1028
  if has_chat_template(tokeniser=tokeniser):
1029
+ extra_kwargs = dict()
1030
+ if model_config.param in {"thinking", "no-thinking"}:
1031
+ extra_kwargs["enable_thinking"] = model_config.param == "thinking"
1017
1032
  templated_prompt = apply_chat_template(
1018
1033
  conversation=[dict(role="user", content=prompt)],
1019
1034
  tokeniser=tokeniser,
1020
1035
  tokenise=False,
1021
1036
  add_generation_prompt=True,
1022
- enable_thinking=True,
1037
+ **extra_kwargs,
1023
1038
  )
1024
1039
  assert isinstance(templated_prompt, str)
1025
1040
  prompt = templated_prompt
@@ -1042,8 +1057,8 @@ def get_end_of_reasoning_token(
1042
1057
  if not bor_reasoning_matches:
1043
1058
  log_once(
1044
1059
  f"The model {model_id!r} did not generate any beginning-of-reasoning "
1045
- "tokens in the prompt or the completion. Assuming the model is not "
1046
- "a reasoning model.",
1060
+ "tokens in the prompt or the completion. Assuming the model is not a "
1061
+ "reasoning model.",
1047
1062
  level=logging.DEBUG,
1048
1063
  )
1049
1064
  return None
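The `enable_thinking` flag is now only forwarded when the model was requested with a `#thinking` or `#no-thinking` parameter. A sketch of the same toggle against a plain Hugging Face tokeniser; whether the kwarg has any effect depends on the model's chat template (Qwen3-style templates honour it):

```python
# Sketch: conditionally forward `enable_thinking` to the chat template.
from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")  # illustrative model

model_param = "no-thinking"  # e.g. parsed from "my-org/my-model#no-thinking"
extra_kwargs = {}
if model_param in {"thinking", "no-thinking"}:
    extra_kwargs["enable_thinking"] = model_param == "thinking"

prompt = tokeniser.apply_chat_template(
    [{"role": "user", "content": "What is your name?"}],
    tokenize=False,
    add_generation_prompt=True,
    **extra_kwargs,
)
print(prompt)
```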
euroeval/benchmarker.py CHANGED
@@ -16,7 +16,7 @@ from torch.distributed import destroy_process_group
16
16
 
17
17
  from .benchmark_config_factory import build_benchmark_config
18
18
  from .constants import GENERATIVE_PIPELINE_TAGS
19
- from .data_loading import load_data
19
+ from .data_loading import load_data, load_raw_data
20
20
  from .data_models import BenchmarkConfigParams, BenchmarkResult
21
21
  from .dataset_configs import get_all_dataset_configs
22
22
  from .enums import Device, GenerativeType, ModelType
@@ -28,7 +28,12 @@ from .model_loading import load_model
28
28
  from .scores import log_scores
29
29
  from .speed_benchmark import benchmark_speed
30
30
  from .tasks import SPEED
31
- from .utils import enforce_reproducibility, get_package_version
31
+ from .utils import (
32
+ enforce_reproducibility,
33
+ get_package_version,
34
+ internet_connection_available,
35
+ log_once,
36
+ )
32
37
 
33
38
  if t.TYPE_CHECKING:
34
39
  from .benchmark_modules import BenchmarkModule
@@ -83,6 +88,7 @@ class Benchmarker:
83
88
  debug: bool = False,
84
89
  run_with_cli: bool = False,
85
90
  requires_safetensors: bool = False,
91
+ download_only: bool = False,
86
92
  ) -> None:
87
93
  """Initialise the benchmarker.
88
94
 
@@ -164,14 +170,26 @@ class Benchmarker:
164
170
  requires_safetensors:
165
171
  Whether to only allow models that use the safetensors format. Defaults
166
172
  to False.
173
+ download_only:
174
+ Whether to only download models and datasets without performing any
175
+ benchmarking. Defaults to False.
167
176
 
168
177
  Raises:
169
178
  ValueError:
170
- If both `task` and `dataset` are specified.
179
+ If both `task` and `dataset` are specified, or if `download_only`
180
+ is True and we have no internet connection.
171
181
  """
172
182
  if task is not None and dataset is not None:
173
183
  raise ValueError("Only one of `task` and `dataset` can be specified.")
174
184
 
185
+ if not internet_connection_available() and download_only:
186
+ msg = "It appears you do not have an internet connection, but "
187
+ if run_with_cli:
188
+ msg += "the --download-only flag was set."
189
+ else:
190
+ msg += "the argument `download_only` was set to True."
191
+ raise ValueError(msg)
192
+
175
193
  # Bail early if hf_transfer is enabled but not installed.
176
194
  if HF_HUB_ENABLE_HF_TRANSFER and get_package_version("hf_transfer") is None:
177
195
  raise ImportError(
@@ -205,13 +223,14 @@ class Benchmarker:
205
223
  api_version=api_version,
206
224
  gpu_memory_utilization=gpu_memory_utilization,
207
225
  generative_type=generative_type,
226
+ download_only=download_only,
208
227
  debug=debug,
209
228
  run_with_cli=run_with_cli,
210
229
  requires_safetensors=requires_safetensors,
211
230
  )
212
231
 
213
232
  self.benchmark_config = build_benchmark_config(
214
- **self.benchmark_config_default_params.model_dump()
233
+ benchmark_config_params=self.benchmark_config_default_params
215
234
  )
216
235
 
217
236
  # Initialise variable storing model lists, so we only have to fetch it once
@@ -222,17 +241,82 @@ class Benchmarker:
222
241
 
223
242
  @property
224
243
  def benchmark_results(self) -> list[BenchmarkResult]:
225
- """The benchmark results."""
244
+ """The benchmark results.
245
+
246
+ Returns:
247
+ A list of benchmark results.
248
+
249
+ Raises:
250
+ ValueError:
251
+ If there is an error decoding a line in the results file.
252
+ """
226
253
  if self.results_path.exists():
254
+ benchmark_results: list[BenchmarkResult] = list()
227
255
  with self.results_path.open() as f:
228
- return [
229
- BenchmarkResult.from_dict(json.loads(line))
230
- for line in f
231
- if line.strip()
232
- ]
256
+ for line in f:
257
+ if line.strip():
258
+ try:
259
+ result_dict = json.loads(line.strip())
260
+ except json.JSONDecodeError as e:
261
+ raise ValueError(
262
+ f"Error decoding JSON line: {line.strip()}"
263
+ ) from e
264
+
265
+ # Fix for older records
266
+ has_old_raw_results = (
267
+ "results" in result_dict
268
+ and isinstance(result_dict["results"], dict)
269
+ and "raw" in result_dict["results"]
270
+ and isinstance(result_dict["results"]["raw"], dict)
271
+ and "test" in result_dict["results"]["raw"]
272
+ )
273
+ if has_old_raw_results:
274
+ result_dict["results"]["raw"] = result_dict["results"][
275
+ "raw"
276
+ ]["test"]
277
+
278
+ result = BenchmarkResult.from_dict(result_dict)
279
+ benchmark_results.append(result)
280
+ return benchmark_results
233
281
  else:
234
282
  return list()
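The legacy-record handling above can be exercised on its own: older result lines keep raw scores under `results["raw"]["test"]`, newer ones store them directly under `results["raw"]`. A sketch reading such a JSON-lines file (the file name is illustrative):

```python
# Sketch of reading a results JSONL file with the old-format normalisation.
import json
from pathlib import Path

results_path = Path("euroeval_benchmark_results.jsonl")  # illustrative name

records: list[dict] = []
for line in results_path.read_text().splitlines():
    if not line.strip():
        continue
    record = json.loads(line)
    raw = record.get("results", {}).get("raw")
    if isinstance(raw, dict) and "test" in raw:
        record["results"]["raw"] = raw["test"]  # migrate an old-format record
    records.append(record)
```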
235
283
 
284
+ def _download(
285
+ self,
286
+ dataset_config: "DatasetConfig",
287
+ model_config: "ModelConfig",
288
+ benchmark_config: "BenchmarkConfig",
289
+ ) -> None:
290
+ """Download data, metrics, and model for the given dataset, and model.
291
+
292
+ Args:
293
+ dataset_config: The configuration for the dataset.
294
+ model_config: The configuration for the model.
295
+ benchmark_config: The configuration for the benchmark.
296
+ """
297
+ log_once(f"Loading data for {dataset_config.pretty_name}", level=logging.INFO)
298
+ dataset = load_raw_data(
299
+ dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
300
+ )
301
+ del dataset
302
+
303
+ log_once(f"Loading model {model_config.model_id}", level=logging.INFO)
304
+ model = load_model(
305
+ model_config=model_config,
306
+ dataset_config=dataset_config,
307
+ benchmark_config=benchmark_config,
308
+ )
309
+ del model
310
+
311
+ log_once(
312
+ f"Loading metrics for the '{dataset_config.task.name}' task",
313
+ level=logging.INFO,
314
+ )
315
+ for metric_name in dataset_config.task.metrics:
316
+ log_once(f"Loading metric {metric_name.name}", level=logging.DEBUG)
317
+ metric = metric_name.download(cache_dir=benchmark_config.cache_dir)
318
+ del metric
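Together with the new `download_only` flag, `_download` is what lets a later run execute without network access. A hypothetical end-to-end usage with a placeholder model id:

```python
# Hypothetical download-only pass: populate the cache now, evaluate offline later.
from euroeval import Benchmarker

benchmarker = Benchmarker(download_only=True)
benchmarker.benchmark(
    model="my-org/my-model",
    task="sentiment-classification",
    language="da",
)
```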
319
+
236
320
  def benchmark(
237
321
  self,
238
322
  model: list[str] | str,
@@ -256,6 +340,7 @@ class Benchmarker:
256
340
  few_shot: bool | None = None,
257
341
  num_iterations: int | None = None,
258
342
  requires_safetensors: bool | None = None,
343
+ download_only: bool | None = None,
259
344
  ) -> list[BenchmarkResult]:
260
345
  """Benchmarks models on datasets.
261
346
 
@@ -336,6 +421,9 @@ class Benchmarker:
336
421
  requires_safetensors:
337
422
  Whether to only allow models that use the safetensors format. Defaults
338
423
  to the value specified when initialising the benchmarker.
424
+ download_only:
425
+ Whether to only download the models without evaluating them. Defaults
426
+ to the value specified when initialising the benchmarker.
339
427
 
340
428
  Returns:
341
429
  A list of benchmark results.
@@ -368,6 +456,7 @@ class Benchmarker:
368
456
  few_shot=few_shot,
369
457
  num_iterations=num_iterations,
370
458
  requires_safetensors=requires_safetensors,
459
+ download_only=download_only,
371
460
  )
372
461
 
373
462
  adjust_logging_level(verbose=benchmark_config.verbose)
@@ -395,6 +484,28 @@ class Benchmarker:
395
484
  num_finished_benchmarks += len(dataset_configs)
396
485
  continue
397
486
 
487
+ if model_config.adapter_base_model_id:
488
+ open_issue_msg = (
489
+ "If offline support is important to you, please "
490
+ "consider opening an issue at https://github.com/EuroEval/EuroEval/issues."
491
+ )
492
+ if not internet_connection_available():
493
+ raise InvalidModel(
494
+ "Offline benchmarking of models with adapters is not currently "
495
+ "supported. "
496
+ f"An active internet connection is required. {open_issue_msg}"
497
+ )
498
+ elif benchmark_config.download_only:
499
+ log_once(
500
+ "You are using download only mode with a model that includes "
501
+ "an adapter. "
502
+ "Please note: Offline benchmarking of adapter models is not "
503
+ "currently supported. "
504
+ "An internet connection will be required during evaluation. "
505
+ f"{open_issue_msg}",
506
+ level=logging.WARNING,
507
+ )
508
+
398
509
  loaded_model: BenchmarkModule | None = None
399
510
  benchmark_params_to_revert: dict[str, t.Any] = dict()
400
511
  for dataset_config in dataset_configs:
@@ -569,6 +680,7 @@ class Benchmarker:
569
680
  debug: bool | None = None,
570
681
  run_with_cli: bool | None = None,
571
682
  requires_safetensors: bool | None = None,
683
+ download_only: bool | None = None,
572
684
  ) -> "BenchmarkConfig":
573
685
  """Get an updated benchmark configuration.
574
686
 
@@ -645,6 +757,12 @@ class Benchmarker:
645
757
  requires_safetensors:
646
758
  Whether to only allow models that use the safetensors format. If None,
647
759
  then this value will not be updated.
760
+ download_only:
761
+ Whether to only download the models without evaluating them. If None,
762
+ then this value will not be updated.
763
+ download_only:
764
+ Whether to only download models and datasets without performing any
765
+ benchmarking. If None, then this value will not be updated.
648
766
 
649
767
  Returns:
650
768
  The updated benchmark configuration.
@@ -701,8 +819,10 @@ class Benchmarker:
701
819
  benchmark_config_params.run_with_cli = run_with_cli
702
820
  if requires_safetensors is not None:
703
821
  benchmark_config_params.requires_safetensors = requires_safetensors
822
+ if download_only is not None:
823
+ benchmark_config_params.download_only = download_only
704
824
 
705
- return build_benchmark_config(**benchmark_config_params.model_dump())
825
+ return build_benchmark_config(benchmark_config_params=benchmark_config_params)
706
826
 
707
827
  def _prepare_model_ids(self, model_id: list[str] | str) -> list[str]:
708
828
  """Prepare the model ID(s) to be benchmarked.
@@ -813,17 +933,19 @@ class Benchmarker:
813
933
  model_param=model_config.param,
814
934
  )
815
935
 
936
+ model_id_to_be_stored = model_config.model_id
937
+ if model_config.revision != "main":
938
+ model_id_to_be_stored += f"@{model_config.revision}"
939
+ if model_config.param is not None:
940
+ model_id_to_be_stored += f"#{model_config.param}"
941
+
816
942
  record = BenchmarkResult(
817
943
  dataset=dataset_config.name,
818
944
  task=dataset_config.task.name,
819
945
  dataset_languages=[
820
946
  language.code for language in dataset_config.languages
821
947
  ],
822
- model=(
823
- f"{model_config.model_id}@{model_config.revision}"
824
- if model_config.revision and model_config.revision != "main"
825
- else model_config.model_id
826
- ),
948
+ model=model_id_to_be_stored,
827
949
  results=results,
828
950
  num_model_parameters=model.num_params,
829
951
  max_sequence_length=model.model_max_length,
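The stored model identifier now encodes both the revision and the model parameter. A small sketch of the resulting format:

```python
# Sketch of the identifier format written to the results: a non-default revision
# is appended with '@' and a model parameter with '#'.
def stored_model_id(model_id: str, revision: str = "main", param: str | None = None) -> str:
    out = model_id
    if revision != "main":
        out += f"@{revision}"
    if param is not None:
        out += f"#{param}"
    return out


print(stored_model_id("my-org/my-model", revision="v1.0", param="thinking"))
# -> my-org/my-model@v1.0#thinking
```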
euroeval/cli.py CHANGED
@@ -216,6 +216,12 @@ from .tasks import get_all_tasks
216
216
  help="The type of generative model. Only relevant if the model is generative. If "
217
217
  "not specified, the type will be inferred automatically.",
218
218
  )
219
+ @click.option(
220
+ "--download-only",
221
+ is_flag=True,
222
+ help="Only download the requested model weights and datasets, and exit.",
223
+ default=False,
224
+ )
219
225
  def benchmark(
220
226
  model: tuple[str],
221
227
  dataset: tuple[str],
@@ -243,6 +249,7 @@ def benchmark(
243
249
  debug: bool,
244
250
  requires_safetensors: bool,
245
251
  generative_type: str | None,
252
+ download_only: bool,
246
253
  ) -> None:
247
254
  """Benchmark pretrained language models on language tasks."""
248
255
  models = list(model)
@@ -284,6 +291,7 @@ def benchmark(
284
291
  debug=debug,
285
292
  run_with_cli=True,
286
293
  requires_safetensors=requires_safetensors,
294
+ download_only=download_only,
287
295
  )
288
296
 
289
297
  # Perform the benchmark evaluation
euroeval/data_models.py CHANGED
@@ -228,6 +228,9 @@ class BenchmarkConfig:
228
228
  generative_type:
229
229
  The type of generative model to benchmark. Only relevant if the model is
230
230
  generative.
231
+ download_only:
232
+ Whether to only download the models, metrics and datasets without
233
+ evaluating.
231
234
  """
232
235
 
233
236
  model_languages: list[Language]
@@ -255,6 +258,7 @@ class BenchmarkConfig:
255
258
  run_with_cli: bool
256
259
  requires_safetensors: bool
257
260
  generative_type: GenerativeType | None
261
+ download_only: bool
258
262
 
259
263
 
260
264
  class BenchmarkConfigParams(pydantic.BaseModel):
@@ -285,6 +289,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
285
289
  api_version: str | None
286
290
  gpu_memory_utilization: float
287
291
  generative_type: GenerativeType | None
292
+ download_only: bool
288
293
  debug: bool
289
294
  run_with_cli: bool
290
295
  requires_safetensors: bool
euroeval/generation.py CHANGED
@@ -243,7 +243,9 @@ def generate_single_iteration(
243
243
  ground_truth = []
244
244
 
245
245
  itr_scores: dict[str, float] = model.compute_metrics(
246
- model_outputs_and_labels=(all_preds, ground_truth), dataset=dataset
246
+ model_outputs_and_labels=(all_preds, ground_truth),
247
+ dataset=dataset,
248
+ benchmark_config=benchmark_config,
247
249
  )
248
250
 
249
251
  return itr_scores
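`generate_single_iteration` now threads the benchmark configuration into `compute_metrics`, matching the updated `ComputeMetricsFunction` protocol in `types.py`. An illustrative callable with the same shape (types loosened for brevity):

```python
# Illustrative callable matching the updated compute-metrics signature.
import typing as t


def compute_dummy_metrics(
    model_outputs_and_labels: tuple[t.Sequence, t.Sequence],
    dataset: t.Any,            # a datasets.Dataset in EuroEval
    benchmark_config: t.Any,   # the BenchmarkConfig, e.g. for its cache_dir
) -> dict[str, float]:
    predictions, labels = model_outputs_and_labels
    correct = sum(pred == label for pred, label in zip(predictions, labels))
    return {"accuracy": correct / max(len(labels), 1)}
```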
euroeval/metrics/base.py CHANGED
@@ -42,6 +42,18 @@ class Metric(abc.ABC):
42
42
  else lambda x: (100 * x, f"{x:.2%}")
43
43
  )
44
44
 
45
+ def download(self, cache_dir: str) -> "Metric":
46
+ """Initiates the download of the metric if needed.
47
+
48
+ Args:
49
+ cache_dir:
50
+ The directory where the metric will be downloaded to.
51
+
52
+ Returns:
53
+ The metric object itself.
54
+ """
55
+ return self
56
+
45
57
  @abc.abstractmethod
46
58
  def __call__(
47
59
  self,
@@ -3,9 +3,11 @@
3
3
  import collections.abc as c
4
4
  import logging
5
5
  import typing as t
6
+ from pathlib import Path
6
7
 
7
8
  import evaluate
8
9
  import numpy as np
10
+ from datasets import DownloadConfig
9
11
 
10
12
  from ..utils import HiddenPrints
11
13
  from .base import Metric
@@ -76,6 +78,23 @@ class HuggingFaceMetric(Metric):
76
78
  )
77
79
  self.metric: "EvaluationModule | None" = None
78
80
 
81
+ def download(self, cache_dir: str) -> "HuggingFaceMetric":
82
+ """Initiates the download of the metric if needed.
83
+
84
+ Args:
85
+ cache_dir:
86
+ The directory where the metric will be downloaded to.
87
+
88
+ Returns:
89
+ The metric object itself.
90
+ """
91
+ # Annoying but needed to make the metric download to a different cache dir
92
+ download_config = DownloadConfig(cache_dir=Path(cache_dir, "evaluate"))
93
+ self.metric = evaluate.load(
94
+ path=self.huggingface_id, download_config=download_config
95
+ )
96
+ return self
97
+
79
98
  def __call__(
80
99
  self,
81
100
  predictions: c.Sequence,
@@ -103,7 +122,9 @@ class HuggingFaceMetric(Metric):
103
122
  The calculated metric score, or None if the score should be ignored.
104
123
  """
105
124
  if self.metric is None:
106
- self.metric = evaluate.load(path=self.huggingface_id)
125
+ self.download(cache_dir=benchmark_config.cache_dir)
126
+
127
+ assert self.metric is not None
107
128
 
108
129
  with HiddenPrints():
109
130
  results = self.metric.compute(
@@ -176,7 +197,7 @@ bert_score_metric = HuggingFaceMetric(
176
197
  huggingface_id="bertscore",
177
198
  results_key="f1",
178
199
  compute_kwargs=dict(
179
- model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
200
+ model_type="microsoft/mdeberta-v3-base", device="cpu", batch_size=16
180
201
  ),
181
202
  )
182
203
 
@@ -97,7 +97,7 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
97
97
  default_prompt_prefix="Hetta eru nakrir setningar og um teir eru mállæruliga "
98
98
  "rættir.",
99
99
  default_prompt_template="Setningur: {text}\nMállæruliga rættur: {label}",
100
- default_instruction_prompt="Setningur: {text}\n\nGreinið hvort setningurin er "
100
+ default_instruction_prompt="Setningur: {text}\n\nGreindu hvort setningurin er "
101
101
  "mállæruliga rættur ella ikki. Svara við {labels_str}, og einki annað.",
102
102
  ),
103
103
  FR: PromptConfig(
@@ -111,11 +111,12 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
111
111
  ),
112
112
  IS: PromptConfig(
113
113
  default_prompt_label_mapping=dict(correct="já", incorrect="nei"),
114
- default_prompt_prefix="Eftirfarandi eru setningar og hvort þær eru "
115
- "málfræðilega réttar.",
114
+ default_prompt_prefix="Hér fyrir neðan eru setningar ásamt mati á því hvort "
115
+ "þær eru málfræðilega réttar.",
116
116
  default_prompt_template="Setning: {text}\nMálfræðilega rétt: {label}",
117
- default_instruction_prompt="Setning: {text}\n\nGreinið hvort setningin er "
118
- "málfræðilega rétt eða ekki. Svaraðu með {labels_str}, og ekkert annað.",
117
+ default_instruction_prompt="Setning: {text}\n\nGreindu hvort setningin er "
118
+ "málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún "
119
+ "er það ekki.",
119
120
  ),
120
121
  IT: PromptConfig(
121
122
  default_prompt_label_mapping=dict(correct="si", incorrect="no"),
@@ -176,7 +176,7 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
176
176
  default_prompt_prefix="Her eru nakrir setningar og nakrar JSON orðabøkur við "
177
177
  "nevndar eindir, sum eru í setningunum.",
178
178
  default_prompt_template="Setningur: {text}\nNevndar eindir: {label}",
179
- default_instruction_prompt="Setningur: {text}\n\nGreinið nevndu einingarnar í "
179
+ default_instruction_prompt="Setningur: {text}\n\nGreindu nevndu einingarnar í "
180
180
  "setningunni. Þú ættir að skila þessu sem JSON orðabók með lyklunum "
181
181
  "{labels_str}. Gildin ættu að vera listi yfir nevndu einingarnar af "
182
182
  "þeirri gerð, nákvæmlega eins og þær koma fram í setningunni.",
@@ -215,8 +215,8 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
215
215
  },
216
216
  default_prompt_prefix="Eftirfarandi eru setningar ásamt JSON lyklum með "
217
217
  "nefndum einingum sem koma fyrir í setningunum.",
218
- default_prompt_template="Setning: {text}\nNefndar einingar: {label}",
219
- default_instruction_prompt="Setning: {text}\n\nGreinið nefndu einingarnar í "
218
+ default_prompt_template="Setning: {text}\nNafneiningar: {label}",
219
+ default_instruction_prompt="Setning: {text}\n\nGreindu nefndu einingarnar í "
220
220
  "setningunni. Þú ættir að skila þessu sem JSON orðabók með lyklunum "
221
221
  "{labels_str}. Gildin ættu að vera listi yfir nefndu "
222
222
  "einingarnar af þeirri gerð, nákvæmlega eins og þær koma fram í "
@@ -137,11 +137,11 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
137
137
  default_prompt_label_mapping=dict(
138
138
  positive="jákvætt", neutral="hlutlaust", negative="neikvætt"
139
139
  ),
140
- default_prompt_prefix="Eftirfarandi eru skjöl og viðhorf þeirra, sem geta "
141
- "verið {labels_str}.",
142
- default_prompt_template="Skjal: {text}\nViðhorf: {label}",
143
- default_instruction_prompt="Skjal: {text}\n\nFlokkaðu viðhorfið í skjalinu. "
144
- "Svaraðu með {labels_str}, og ekkert annað.",
140
+ default_prompt_prefix="Hér fyrir neðan eru textabrot ásamt lyndisgildi þeirra "
141
+ "sem getur verið 'jákvætt', 'hlutlaust' eða 'neikvætt'.",
142
+ default_prompt_template="Textabrot: {text}\nViðhorf: {label}",
143
+ default_instruction_prompt="Textabrot: {text}\n\nGreindu lyndið í "
144
+ "textabrotinu. Svaraðu með {labels_str}, og ekkert annað.",
145
145
  ),
146
146
  IT: PromptConfig(
147
147
  default_prompt_label_mapping=dict(
euroeval/tasks.py CHANGED
@@ -100,6 +100,7 @@ KNOW = Task(
100
100
  default_num_few_shot_examples=5,
101
101
  default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
102
102
  default_labels=["a", "b", "c", "d"],
103
+ default_allowed_model_types=[ModelType.GENERATIVE],
103
104
  uses_logprobs=True,
104
105
  )
105
106
 
@@ -112,6 +113,7 @@ MCRC = Task(
112
113
  default_num_few_shot_examples=5,
113
114
  default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
114
115
  default_labels=["a", "b", "c", "d"],
116
+ default_allowed_model_types=[ModelType.GENERATIVE],
115
117
  uses_logprobs=True,
116
118
  )
117
119
 
@@ -124,6 +126,7 @@ COMMON_SENSE = Task(
124
126
  default_num_few_shot_examples=5,
125
127
  default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
126
128
  default_labels=["a", "b", "c", "d"],
129
+ default_allowed_model_types=[ModelType.GENERATIVE],
127
130
  uses_logprobs=True,
128
131
  )
129
132
 
@@ -551,7 +551,6 @@ def apply_chat_template(
551
551
  tokeniser: "PreTrainedTokenizer",
552
552
  tokenise: bool,
553
553
  add_generation_prompt: bool,
554
- enable_thinking: bool,
555
554
  **extra_kwargs,
556
555
  ) -> str | list[int]:
557
556
  """Apply the chat template to a prompt.
@@ -568,10 +567,6 @@ def apply_chat_template(
568
567
  Whether to add a generation prompt at the end of the conversation. This is
569
568
  only relevant for regular Hugging Face tokenisers, as Mistral tokenisers
570
569
  always add a generation prompt.
571
- enable_thinking:
572
- Whether to enable special handling for reasoning models, such as adding
573
- special tokens for thinking. This is only relevant for regular Hugging
574
- Face tokenisers, as Mistral tokenisers always handle reasoning models.
575
570
  **extra_kwargs:
576
571
  Extra keyword arguments to pass to the tokeniser's `apply_chat_template`
577
572
  method. Only relevant for regular Hugging Face tokenisers.
@@ -601,7 +596,6 @@ def apply_chat_template(
601
596
  conversation=conversation,
602
597
  add_generation_prompt=add_generation_prompt,
603
598
  tokenize=tokenise,
604
- enable_thinking=enable_thinking,
605
599
  **extra_kwargs,
606
600
  )
607
601
  return templated_prompt
euroeval/types.py CHANGED
@@ -8,8 +8,7 @@ if t.TYPE_CHECKING:
8
8
  from datasets.arrow_dataset import Dataset
9
9
  from numpy.typing import NDArray
10
10
 
11
- from .data_models import GenerativeModelOutput
12
-
11
+ from .data_models import BenchmarkConfig, GenerativeModelOutput
13
12
 
14
13
  ScoreDict: t.TypeAlias = dict[str, dict[str, float] | list[dict[str, float]]]
15
14
  Predictions: t.TypeAlias = "NDArray | list[str] | list[list[str]]"
@@ -27,6 +26,7 @@ class ComputeMetricsFunction(t.Protocol):
27
26
  "NDArray | list[str] | list[list[str]]",
28
27
  ],
29
28
  dataset: "Dataset",
29
+ benchmark_config: "BenchmarkConfig",
30
30
  ) -> dict[str, float]:
31
31
  """Compute the metrics.
32
32
 
euroeval/utils.py CHANGED
@@ -8,6 +8,7 @@ import logging
8
8
  import os
9
9
  import random
10
10
  import re
11
+ import socket
11
12
  import sys
12
13
  import typing as t
13
14
  import warnings
@@ -18,10 +19,8 @@ import demjson3
18
19
  import huggingface_hub as hf_hub
19
20
  import litellm
20
21
  import numpy as np
21
- import requests
22
22
  import torch
23
23
  from datasets.utils import disable_progress_bar
24
- from requests.exceptions import RequestException
25
24
  from transformers import logging as tf_logging
26
25
 
27
26
  from .exceptions import InvalidBenchmark, InvalidModel, NaNValueInModelOutput
@@ -54,6 +53,68 @@ def create_model_cache_dir(cache_dir: str, model_id: str) -> str:
54
53
  return str(cache_dir_path)
55
54
 
56
55
 
56
+ def resolve_model_path(download_dir: str) -> str:
57
+ """Resolve the path to the directory containing the model config files and weights.
58
+
59
+ Args:
60
+ download_dir:
61
+ The download directory
62
+
63
+ Returns:
64
+ The path to the model.
65
+ """
66
+ model_path = Path(download_dir)
67
+ # Get the 'path safe' version of the model id, which is the last dir in the path
68
+ model_id_path = model_path.name
69
+ # Hf hub `cache_dir` puts the files in models--`model_id_path`/snapshots
70
+ model_path = model_path / f"models--{model_id_path}" / "snapshots"
71
+ if not model_path.exists():
72
+ raise InvalidModel(
73
+ f"Attempted to load models from the {model_path} directory, "
74
+ "but it does not exist."
75
+ )
76
+
77
+ # Get all files in the model path
78
+ found_files = [
79
+ found_file for found_file in model_path.rglob("*") if found_file.is_file()
80
+ ]
81
+ if not found_files:
82
+ raise InvalidModel(f"No model files found at {model_path}")
83
+
84
+ # Make sure that there arent multiples of the files found
85
+ if len(found_files) == len(set(found_files)):
86
+ raise InvalidModel(
87
+ f"Found multiple model config files for {model_id_path.strip('models--')}"
88
+ f"at {model_path}"
89
+ )
90
+
91
+ # Check that found_files contains at least a 'config.json'
92
+ config_file = next(
93
+ (file for file in found_files if file.name == "config.json"), None
94
+ )
95
+ if config_file is None:
96
+ raise InvalidModel(
97
+ f"Missing required file 'config.json' for {model_id_path.strip('models--')}"
98
+ f"at {model_path}"
99
+ )
100
+ model_path = config_file.parent
101
+
102
+ # As a precaution we also check that all of the files are in the same directory
103
+ # if not we create a new dir with symlinks to all of the files from all snapshots
104
+ # this is especially useful for vllm where we can only specify one folder and e.g.,
105
+ # the safetensors version of the weights was added in an unmerged PR
106
+ if not all(
107
+ [found_file.parent == found_files[0].parent for found_file in found_files]
108
+ ):
109
+ new_model_path = model_path.parent / "model_files"
110
+ new_model_path.mkdir(exist_ok=True)
111
+ for found_file in found_files:
112
+ Path(new_model_path / found_file.name).symlink_to(found_file)
113
+ model_path = new_model_path
114
+
115
+ return str(model_path)
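`resolve_model_path` assumes the Hugging Face Hub cache layout inside the per-model download directory, i.e. `models--<safe-model-id>/snapshots/<revision>/…`, and returns the snapshot folder that contains `config.json`. A sketch of that layout with illustrative paths:

```python
# Sketch of the cache layout resolve_model_path expects (paths are illustrative).
from pathlib import Path

download_dir = Path("/data/euroeval_cache/model_cache/my-org--my-model")
snapshots = download_dir / f"models--{download_dir.name}" / "snapshots"
# e.g. snapshots / "0123abcd" / "config.json" is the file the function looks for;
# its parent directory is what gets handed to vLLM when running offline.
print(snapshots)
```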
116
+
117
+
57
118
  def clear_memory() -> None:
58
119
  """Clears the memory of unused items."""
59
120
  for gc_generation in range(3):
@@ -91,6 +152,9 @@ def block_terminal_output() -> None:
91
152
  libraries, disabled tokeniser progress bars when using Hugging Face tokenisers, and
92
153
  disables most of the logging from the `transformers` library.
93
154
  """
155
+ if os.getenv("FULL_LOG") == "1":
156
+ return
157
+
94
158
  # Ignore miscellaneous warnings
95
159
  warnings.filterwarnings("ignore", category=UserWarning)
96
160
  warnings.filterwarnings("ignore", category=FutureWarning)
@@ -196,6 +260,7 @@ def get_min_cuda_compute_capability() -> float | None:
196
260
  return float(f"{major}.{minor}")
197
261
 
198
262
 
263
+ @cache
199
264
  def internet_connection_available() -> bool:
200
265
  """Checks if internet connection is available by pinging google.com.
201
266
 
@@ -203,10 +268,17 @@ def internet_connection_available() -> bool:
203
268
  Whether or not internet connection is available.
204
269
  """
205
270
  try:
206
- requests.get("https://www.google.com")
271
+ s = socket.create_connection(("1.1.1.1", 80))
272
+ s.close()
207
273
  return True
208
- except RequestException:
209
- return False
274
+ # a bit ugly but we dont want to actually import the pytest-socket exceptions
275
+ # we catch all exceptions and check if the name matches any known errors
276
+ except Exception as e:
277
+ pytest_socket_errors = ["SocketConnectBlockedError", "SocketBlockedError"]
278
+ if type(e).__name__ in pytest_socket_errors or isinstance(e, OSError):
279
+ return False
280
+ else:
281
+ raise e
210
282
 
211
283
 
212
284
  class HiddenPrints:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: EuroEval
3
- Version: 16.1.1
3
+ Version: 16.2.1
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,13 +61,13 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
61
61
  Provides-Extra: all
62
62
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
63
63
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
64
- Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'all'
65
- Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
64
+ Requires-Dist: timm>=1.0.19; extra == 'all'
65
+ Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'all'
66
66
  Provides-Extra: generative
67
67
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
68
68
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
69
- Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'generative'
70
- Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
69
+ Requires-Dist: timm>=1.0.19; extra == 'generative'
70
+ Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
71
71
  Description-Content-Type: text/markdown
72
72
 
73
73
  <div align='center'>
@@ -152,13 +152,13 @@ model:
152
152
  ```
153
153
  >>> from euroeval import Benchmarker
154
154
  >>> benchmark = Benchmarker()
155
- >>> benchmark(model="<model>")
155
+ >>> benchmark(model="<model-id>")
156
156
  ```
157
157
 
158
158
  To benchmark on a specific task and/or language, you simply specify the `task` or
159
159
  `language` arguments, shown here with same example as above:
160
160
  ```
161
- >>> benchmark(model="<model>", task="sentiment-classification", language="da")
161
+ >>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
162
162
  ```
163
163
 
164
164
  If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
@@ -168,6 +168,30 @@ models on the Danish sentiment classification task:
168
168
  >>> benchmark(task="sentiment-classification", language="da")
169
169
  ```
170
170
 
171
+ ### Benchmarking in an Offline Environment
172
+ If you need to benchmark in an offline environment, you need to download the models,
173
+ datasets and metrics beforehand. This can be done by adding the `--download-only`
174
+ argument, from the command line, or the `download_only` argument, if benchmarking from a
175
+ script. For example to download the model you want and all of the Danish sentiment
176
+ classification datasets:
177
+ ```
178
+ $ euroeval --model <model-id> --task sentiment-classification --language da --download-only
179
+ ```
180
+
181
+ Or from a script:
182
+ ```
183
+ >>> benchmark(
184
+ ... model="<model-id>",
185
+ ... task="sentiment-classification",
186
+ ... language="da",
187
+ ... download_only=True,
188
+ ... )
189
+ ```
190
+
191
+ Please note: Offline benchmarking of adapter models is not currently supported. An
192
+ internet connection will be required during evaluation. If offline support is important
193
+ to you, please consider [opening an issue](https://github.com/EuroEval/EuroEval/issues).
194
+
171
195
  ### Benchmarking from Docker
172
196
  A Dockerfile is provided in the repo, which can be downloaded and run, without needing
173
197
  to clone the repo and installing from source. This can be fetched programmatically by
@@ -1,15 +1,15 @@
1
- euroeval/__init__.py,sha256=8jqSCcDWvwwNb1guPi8cLAekPSOX9V8DpRx_v3-c19E,3730
2
- euroeval/benchmark_config_factory.py,sha256=NzNSiqix4hlVXk3xnyzdg2WDxomkectf97UWdVS3POo,11667
3
- euroeval/benchmarker.py,sha256=JkhvYxhVpQPcWmDLzwnB8Yy6tTqj3yfDWTefklbI7RM,50355
1
+ euroeval/__init__.py,sha256=mXTjuGrEE-1fIS9x28oJKg-gNGt4q7y2E74l330KEmY,3787
2
+ euroeval/benchmark_config_factory.py,sha256=eOQsd9F4cJy8I7a3_lIKDZ5b5ukipIUqk0GZ3pyytwQ,8596
3
+ euroeval/benchmarker.py,sha256=5l4p1ncq4VJX_bDjv2f8oBq2GETPtJmduGOnLAbWjF8,55762
4
4
  euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
5
- euroeval/cli.py,sha256=wUGetj9Ld4wkS872ZOfYqHIJMh58o8L2MDi78wU5nxI,9099
5
+ euroeval/cli.py,sha256=GOAWzdtasJfOvTuVQszu-T1T9GfQ_un-blOICO-y7g4,9316
6
6
  euroeval/constants.py,sha256=NN7kcwQdlDyyGFSrLjsL_qKVRyoRqZ9sKO5SjlgtRwA,2741
7
7
  euroeval/data_loading.py,sha256=F3fHyR7FoS_a1dx_DyqtcxdB-jxWwE3RCNRvWcp5z1c,4527
8
- euroeval/data_models.py,sha256=S-PATp4F1wBwvra6wtjlJFXxZbZB_vEpJHXcdTTKA70,27593
8
+ euroeval/data_models.py,sha256=9Sgrq6Ktg1ETXRJ0v4VA_amAPowGuB7fZtL-8RlDQn0,27766
9
9
  euroeval/enums.py,sha256=SeFek-Lre2Q5sxbP5svqjDZFZR2vlJhg9dkRH4JvU1g,3436
10
10
  euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
11
11
  euroeval/finetuning.py,sha256=G86pxxjOAgtcEWpyYDwYOV9pM7WG2Uu9fu7GdDso8dI,11426
12
- euroeval/generation.py,sha256=MSrd0oIkoqwKsCOaIkY2CFF_urXLOfNR1OO5nMvcCpY,12476
12
+ euroeval/generation.py,sha256=Va3EOmFzOMBNfI4fh3nW5qhhrM3CBT8_4MaLwVtsF_E,12528
13
13
  euroeval/generation_utils.py,sha256=d2_vylWXIeH4xIXgbsI5rN6dMt0zKp0zXExD6aOKWaA,18299
14
14
  euroeval/languages.py,sha256=G2cJI8lDT7eOFHxNR9opJ6zWjdxFDwm8P8HY_4WKFI4,33815
15
15
  euroeval/model_cache.py,sha256=h61cL_fy2Sd1sqYZis5lAWqvQIfQXXt_v8QZeftKNkg,9226
@@ -17,16 +17,16 @@ euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
17
17
  euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
18
18
  euroeval/scores.py,sha256=HQQqyjdgm853FZ_ifIdnSltKfBhsY7pOITov6F3Et5o,3165
19
19
  euroeval/speed_benchmark.py,sha256=3iz_bfJgAoJ9K2HNjufyrBMjHVT8PAjuY_NocBGwKe0,4044
20
- euroeval/tasks.py,sha256=3qEOBAMmfeqgXqlGkCKzQ-s0Yw-0-jPRgFZ97EZCFng,4535
21
- euroeval/tokenisation_utils.py,sha256=e2H86vhSVfz5gx6GmzoBJwLZLG6sf3GEcoCGmvJBQLc,21505
22
- euroeval/types.py,sha256=SCKOALV_-F1PAIwQ7qHNdSF1Uy29TSu9nIc1NYJGUUs,2754
23
- euroeval/utils.py,sha256=c0tFw1IXZIqgLU4EfY_k28iJ1ZlCZ_oFoKZH2sGCKYg,16499
20
+ euroeval/tasks.py,sha256=EzEWFDo_0ffabBFiRu-mw80jENUioE8D_VEn_Dsv-F8,4703
21
+ euroeval/tokenisation_utils.py,sha256=nLeF2cdZSm5PZiAcDTtxY82nUJ-or8VU8YxYLa167EM,21158
22
+ euroeval/types.py,sha256=_iVy-RwiCGu9TNX2sfyJTdCvXy1akNGTCywAo-YpBqU,2815
23
+ euroeval/utils.py,sha256=DRJW6wtmNpRtuHt03diWo3S5m3rdxoPEQpd-KWi7aGY,19255
24
24
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
25
25
  euroeval/benchmark_modules/base.py,sha256=mHF8XS6GGUXV-sJtxmI5WJBWPLMHuh-4Z4OWjC25x9Y,11566
26
26
  euroeval/benchmark_modules/fresh.py,sha256=TveSQiFBi3xXgCEQBdHwkUQ685PDkKW0y3G5Yt5rkeM,10655
27
- euroeval/benchmark_modules/hf.py,sha256=oBjVumnSM9PW7ZocQwCGLKpbeGFWLN_71DBotxZo1aY,44038
28
- euroeval/benchmark_modules/litellm.py,sha256=6EKjHnUoPCpuupISZHXqZsXLG8tyiA1-G12a5C6L8MM,64629
29
- euroeval/benchmark_modules/vllm.py,sha256=sYFdVzB9CZX6_sGI4xghDyXoVn6I95_nbeFUWeSMXcc,43132
27
+ euroeval/benchmark_modules/hf.py,sha256=XmkoDFzaJqnd_5mmUkqCaOgAdRPFs3KZKZZ0cr83TlM,44742
28
+ euroeval/benchmark_modules/litellm.py,sha256=F3udd6NmhQOe3go_7rAcWg7mgZrNQpWWvLe-5U4E2RQ,64771
29
+ euroeval/benchmark_modules/vllm.py,sha256=yLy8TCTnodu4NdTiO7XSdxuHX60AJ1-7p6J3e5h7-iA,43994
30
30
  euroeval/dataset_configs/__init__.py,sha256=uuIZmElpJV8iupo5oDj3TeQhBDRANdWpLKYFASLirHA,2046
31
31
  euroeval/dataset_configs/danish.py,sha256=QABfgI7m-0-5AimDXegp5ssDSLcM2VrAI_RWsinSZP4,5631
32
32
  euroeval/dataset_configs/dutch.py,sha256=63Ro2yFym5MuIDXf5953vUYenw9B0kZSCmZbXjdy4Rs,5517
@@ -45,17 +45,17 @@ euroeval/dataset_configs/portuguese.py,sha256=gQ054SdLQ5fkm4IAP6Mdh5RcPDJPDITcuy
45
45
  euroeval/dataset_configs/spanish.py,sha256=DvJlMK6OQg4qmxKzQA2IficlBMB7BafvxqIVuTKiZyw,4902
46
46
  euroeval/dataset_configs/swedish.py,sha256=YWHp7hbJ25o36csSg9uXaQCEJK1BPb7u2RQZiCe0lNs,5445
47
47
  euroeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,199
48
- euroeval/metrics/base.py,sha256=4vnRIPfKUwTNe0ZVm5YC2jQNecwchGUpN6nAH5cX0PM,2288
49
- euroeval/metrics/huggingface.py,sha256=b_Z_FUELQcmK7HeJh0zlAZs3pim1uNHnFLu7nvlZ4_A,5824
48
+ euroeval/metrics/base.py,sha256=HST2XeZrUQZV_vTiieePiaznEov3CIGzuVNIITtLsQc,2596
49
+ euroeval/metrics/huggingface.py,sha256=iHKJnvOXRc_e8sxB2ff3WkfK64jXyn5KEnIxPyfD2fM,6522
50
50
  euroeval/metrics/llm_as_a_judge.py,sha256=YCUHWK3_bkMEYvL7Q79ZAK3V0M1m5rq5zJYdtMxa4fs,9686
51
51
  euroeval/metrics/pipeline.py,sha256=Wcan3eDWV7t4WRXMPWCCe_JsA-fZnIfZU2ESinbbL2I,10284
52
52
  euroeval/metrics/speed.py,sha256=tLna031y0SVzAv6lvXBxf8IOSiw9dvLlonky2zM3MnE,1369
53
53
  euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
54
- euroeval/prompt_templates/linguistic_acceptability.py,sha256=pRR1QBnYt5DnfxQp6dw1OYFZfIct-1R9pfdgPGpjoco,8667
54
+ euroeval/prompt_templates/linguistic_acceptability.py,sha256=m23LrckohdnToQDsexdsW_5YyBfGTf5DTjiMI643F9A,8717
55
55
  euroeval/prompt_templates/multiple_choice.py,sha256=Q-8-ETqG-RZeLzR8v8WUBIN7djiNSfNpmYnZRUWcd84,6905
56
- euroeval/prompt_templates/named_entity_recognition.py,sha256=LT7J6Y9rUCJFimpnwujBZq_V5buSmXHJteIXbTOoaCE,16442
56
+ euroeval/prompt_templates/named_entity_recognition.py,sha256=HIX9EBkSIBl5JXceFtiZTdvzWr9YHM9-55D6bcjIyQ4,16436
57
57
  euroeval/prompt_templates/reading_comprehension.py,sha256=ogzmhiSZO6egrdxxQiWz6a0XMdC0vws-lg5yRKQoYV0,8730
58
- euroeval/prompt_templates/sentiment_classification.py,sha256=BwnTpSdsAN_rL693ImgtKIRc5T_2G6ptWW0jCdC02NQ,9454
58
+ euroeval/prompt_templates/sentiment_classification.py,sha256=b3TvH26M77vwFfn577NlGVW881qfV7YSm-Xba_w98Fc,9504
59
59
  euroeval/prompt_templates/summarization.py,sha256=4Sqwj6C7yNfqj4FFFCseJMLDoSZ13aIOgY0SjIzzsNo,6593
60
60
  euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
61
61
  euroeval/task_group_utils/multiple_choice_classification.py,sha256=i5sidJGAXnENRoB6pOelyaUeGP1qoxwPSzD-F9RLwWk,7106
@@ -63,8 +63,8 @@ euroeval/task_group_utils/question_answering.py,sha256=eUczZntrC9lhCUQlwNQB49i-5
63
63
  euroeval/task_group_utils/sequence_classification.py,sha256=TAqZCoMQ9I-HFhMH35_J1mY2SQg95HUbXcgrBIyhgk0,16082
64
64
  euroeval/task_group_utils/text_to_text.py,sha256=7f4hGAs5WNJ9PmW1mLhjDMrPxrYAvw5axXsneiJop1w,4993
65
65
  euroeval/task_group_utils/token_classification.py,sha256=Yjai937ia1nZBMOWySqCXr_dA6WiVLGvmb4Hm_TU0Bg,17118
66
- euroeval-16.1.1.dist-info/METADATA,sha256=gyqd2PPeT0vv_ye9nnfqv-0DlpejquzqcftBwpwnH7Y,13729
67
- euroeval-16.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
68
- euroeval-16.1.1.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
69
- euroeval-16.1.1.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
70
- euroeval-16.1.1.dist-info/RECORD,,
66
+ euroeval-16.2.1.dist-info/METADATA,sha256=brIXZ3x3MUf-ggNpKKC_4Lvrqem0MfKPrJ8DZJ5T3Iw,14590
67
+ euroeval-16.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
68
+ euroeval-16.2.1.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
69
+ euroeval-16.2.1.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
70
+ euroeval-16.2.1.dist-info/RECORD,,