EuroEval 15.4.0__py3-none-any.whl → 15.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/benchmark_modules/hf.py +68 -37
- euroeval/benchmark_modules/vllm.py +47 -8
- euroeval/constants.py +3 -0
- euroeval/data_models.py +7 -2
- euroeval/dataset_configs.py +5 -5
- euroeval/generation.py +17 -3
- euroeval/task_utils/sequence_classification.py +35 -10
- euroeval/types.py +3 -3
- euroeval/utils.py +32 -29
- {euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/METADATA +4 -3
- {euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/RECORD +14 -14
- {euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/WHEEL +0 -0
- {euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/entry_points.txt +0 -0
- {euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/hf.py
CHANGED
@@ -20,6 +20,7 @@ from huggingface_hub.utils import (
     HFValidationError,
     LocalTokenNotFoundError,
 )
+from peft import PeftConfig
 from requests.exceptions import RequestException
 from torch import nn
 from transformers import (
@@ -34,6 +35,9 @@ from transformers import (
     Trainer,
 )
 from transformers.modelcard import TASK_MAPPING
+from transformers.models.auto.modeling_auto import (
+    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
+)
 from urllib3.exceptions import RequestError
 
 from ..constants import (
@@ -73,6 +77,7 @@ from ..utils import (
     get_class_by_name,
     get_eos_token,
     internet_connection_available,
+    log_once,
 )
 from .base import BenchmarkModule
 
@@ -727,53 +732,54 @@ def get_model_repo_info(
     # If the model does not exist locally, then we get the model info from the Hugging
     # Face Hub
     if model_info is None:
-        try:
-            model_info = hf_api.model_info(
-                repo_id=model_id, revision=revision, token=token
-            )
-        except (GatedRepoError, LocalTokenNotFoundError) as e:
+        num_attempts = 3
+        for _ in range(num_attempts):
             try:
-                hf_whoami(token=token)
-                logger.warning(
-                    f"Could not access the model {model_id} with the revision "
-                    f"{revision}. The error was {str(e)!r}."
+                model_info = hf_api.model_info(
+                    repo_id=model_id, revision=revision, token=token
                 )
+                break
+            except (GatedRepoError, LocalTokenNotFoundError) as e:
+                try:
+                    hf_whoami(token=token)
+                    logger.warning(
+                        f"Could not access the model {model_id} with the revision "
+                        f"{revision}. The error was {str(e)!r}."
+                    )
+                    return None
+                except LocalTokenNotFoundError:
+                    raise NeedsAdditionalArgument(
+                        cli_argument="--api-key",
+                        script_argument="api_key=<your-api-key>",
+                        run_with_cli=benchmark_config.run_with_cli,
+                    )
+            except (RepositoryNotFoundError, HFValidationError):
                 return None
-            except LocalTokenNotFoundError:
-                raise NeedsAdditionalArgument(
-                    cli_argument="--api-key",
-                    script_argument="api_key=<your-api-key>",
-                    run_with_cli=benchmark_config.run_with_cli,
-                )
-        except (RepositoryNotFoundError, HFValidationError):
-            return None
-        except (OSError, RequestException):
-            if internet_connection_available():
-                raise HuggingFaceHubDown()
-            else:
+            except (OSError, RequestException):
+                if internet_connection_available():
+                    continue
                 raise NoInternetConnection()
+        else:
+            raise HuggingFaceHubDown()
 
     # Get all the Hugging Face repository tags for the model. If the model is an adapter
     # model, then we also get the tags for the base model
     tags = model_info.tags or list()
-    has_base_model_tag = any(
-        tag.startswith("base_model:") and tag.count(":") == 1 for tag in tags
-    )
     base_model_id: str | None = None
-    …
-    …
-    …
-    …
+    has_adapter_config = model_info.siblings is not None and any(
+        sibling.rfilename == "adapter_config.json" for sibling in model_info.siblings
+    )
+    if has_adapter_config:
+        adapter_config = PeftConfig.from_pretrained(model_id, revision=revision)
+        base_model_id = adapter_config.base_model_name_or_path
+        log_once(
+            f"Model {model_id!r} identified as an adapter model, with base model "
+            f"{base_model_id!r}.",
+            level=logging.DEBUG,
        )
-    if …
-        base_model_id = [
-            tag.split(":")[1]
-            for tag in tags
-            if tag.startswith("base_model:") and tag.count(":") == 1
-        ][0]
+    if base_model_id is not None:
         base_model_info = hf_api.model_info(
             repo_id=base_model_id,
-            revision=revision,
             token=benchmark_config.api_key
             or os.getenv("HUGGINGFACE_API_KEY")
             or True,
@@ -781,12 +787,18 @@ get_model_repo_info(
         tags += base_model_info.tags or list()
     tags = list(set(tags))
 
+    # TEMP: This extends the `TASK_MAPPING` dictionary to include the missing
+    # 'image-text-to-text' pipeline tag. This will be added as part of `TASK_MAPPING`
+    # when this PR has been merged in and published:
+    # https://github.com/huggingface/transformers/pull/37107
+    TASK_MAPPING["image-text-to-text"] = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+
    # Get the pipeline tag for the model. If it is not specified, then we determine it
    # by checking the model's architecture as written in the model's Hugging Face config
    pipeline_tag = model_info.pipeline_tag
    if pipeline_tag is None:
        hf_config = load_hf_model_config(
-            model_id=model_id,
+            model_id=base_model_id or model_id,
            num_labels=0,
            id2label=dict(),
            label2id=dict(),
@@ -812,7 +824,6 @@ get_model_repo_info(
        pipeline_tag = "fill-mask"
 
    if benchmark_config.only_allow_safetensors:
-        # Check if any file ends with .safetensors
        repo_files = hf_api.list_repo_files(repo_id=model_id, revision=revision)
        has_safetensors = any(f.endswith(".safetensors") for f in repo_files)
        if not has_safetensors:
@@ -826,6 +837,26 @@ get_model_repo_info(
            )
            raise InvalidModel(msg)
 
+        # Also check base model if we are evaluating an adapter
+        if base_model_id is not None:
+            base_repo_files = hf_api.list_repo_files(repo_id=base_model_id)
+            base_has_safetensors = any(
+                f.endswith(".safetensors") for f in base_repo_files
+            )
+            if not base_has_safetensors:
+                msg = (
+                    f"Base model {base_model_id} does not have safetensors weights "
+                    "available."
+                )
+                if benchmark_config.run_with_cli:
+                    msg += " Skipping since the `--only-allow-safetensors` flag is set."
+                else:
+                    msg += (
+                        " Skipping since the `only_allow_safetensors` argument is set "
+                        "to `True`."
+                    )
+                raise InvalidModel(msg)
+
    return HFModelInfo(
        pipeline_tag=pipeline_tag, tags=tags, adapter_base_model_id=base_model_id
    )
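Note: the adapter handling above replaces the earlier `base_model:` tag heuristic with a check for an `adapter_config.json` file in the repository, reading the base model ID from the PEFT config. Below is a minimal standalone sketch of that detection idea; the repo ID in the usage comment is a hypothetical placeholder, not part of the package.

from huggingface_hub import HfApi
from peft import PeftConfig


def find_adapter_base_model(model_id: str, revision: str = "main") -> str | None:
    """Return the base model ID if `model_id` is a PEFT adapter repo, else None."""
    info = HfApi().model_info(repo_id=model_id, revision=revision)
    siblings = info.siblings or []
    if not any(s.rfilename == "adapter_config.json" for s in siblings):
        return None
    # Adapter repos record the model they were trained on in their PEFT config.
    adapter_config = PeftConfig.from_pretrained(model_id, revision=revision)
    return adapter_config.base_model_name_or_path


# Hypothetical usage:
# find_adapter_base_model("some-user/some-lora-adapter")  # -> a base model ID or None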
euroeval/benchmark_modules/vllm.py
CHANGED
@@ -30,6 +30,7 @@ from ..constants import (
     REASONING_MAX_TOKENS,
     TASK_GROUPS_USING_LOGPROBS,
     TASKS_USING_JSON,
+    VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
 from ..data_models import (
     BenchmarkConfig,
@@ -65,6 +66,7 @@ from ..utils import (
     get_bos_token,
     get_end_of_chat_token_ids,
     get_eos_token,
+    get_min_cuda_compute_capability,
     log_once,
     should_prompts_be_stripped,
 )
@@ -145,6 +147,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
                 repo_id=self.model_config.model_id,
+                revision=self.model_config.revision,
                 cache_dir=Path(self.model_config.model_cache_dir),
             )
             self.buffer["lora_request"] = LoRARequest(
@@ -373,12 +376,27 @@ class VLLMModel(HuggingFaceEncoderModel):
 
         # Generate sequences using vLLM
         input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
-        raw_outputs = self._model.generate(
-            prompts=prompts,
-            sampling_params=sampling_params,
-            use_tqdm=(not input_is_a_test),
-            lora_request=self.buffer.get("lora_request"),
-        )
+        num_attempts = 3
+        for _ in range(num_attempts):
+            try:
+                raw_outputs = self._model.generate(
+                    prompts=prompts,
+                    sampling_params=sampling_params,
+                    use_tqdm=(not input_is_a_test),
+                    lora_request=self.buffer.get("lora_request"),
+                )
+                break
+            except TypeError as e:
+                logger.debug(
+                    f"Encountered error during vLLM generation: {str(e)}. Retrying..."
+                )
+                sleep(1)
+        else:
+            raise InvalidBenchmark(
+                f"Could not generate sequences after {num_attempts} attempts."
+            )
+
+        # Parse the raw model outputs
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
@@ -846,13 +864,16 @@ def load_model_and_tokenizer(
     # Prefer base model ID if the model is an adapter - the adapter will be added on
     # during inference in this case
     model_id = model_config.adapter_base_model_id or model_config.model_id
+    revision = (
+        model_config.revision if model_config.adapter_base_model_id is None else "main"
+    )
 
     hf_model_config = load_hf_model_config(
         model_id=model_id,
         num_labels=0,
         id2label=dict(),
         label2id=dict(),
-        revision=model_config.revision,
+        revision=revision,
         model_cache_dir=model_config.model_cache_dir,
         api_key=benchmark_config.api_key,
         trust_remote_code=benchmark_config.trust_remote_code,
@@ -881,6 +902,23 @@ load_model_and_tokenizer(
     )
     dtype = torch.float16
 
+    if hf_model_config.torch_dtype == torch.bfloat16:
+        min_cuda_compute_capability = get_min_cuda_compute_capability()
+        required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
+
+        if min_cuda_compute_capability is not None:
+            if min_cuda_compute_capability < required_capability:
+                logger.info(
+                    "You are loading a model with "
+                    f"dtype {hf_model_config.torch_dtype}, "
+                    "which vLLM only supports for CUDA devices with"
+                    f"CUDA compute capability >={required_capability}. "
+                    "You are using one or more devices with "
+                    f"compute capability {min_cuda_compute_capability}. "
+                    "Setting dtype to float16 instead."
+                )
+                dtype = torch.float16
+
     if model_config.adapter_base_model_id is not None:
         download_dir = str(Path(model_config.model_cache_dir) / "base_model")
     else:
@@ -916,7 +954,7 @@ load_model_and_tokenizer(
         max_model_len=min(true_max_model_len, 5_000),
         download_dir=download_dir,
         trust_remote_code=benchmark_config.trust_remote_code,
-        revision=model_config.revision,
+        revision=revision,
         seed=4242,
         distributed_executor_backend=executor_backend,
         tensor_parallel_size=torch.cuda.device_count(),
@@ -994,6 +1032,7 @@ load_tokenizer(
     Returns:
         The loaded tokenizer.
     """
+    revision = revision if adapter_base_model_id is None else "main"
    config = AutoConfig.from_pretrained(
        adapter_base_model_id or model_id,
        revision=revision,
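Note: both retry loops introduced in this release (here and in `hf.py`) rely on Python's `for`/`else`: the `else` branch only runs when the loop finishes without hitting `break`, i.e. when every attempt failed. A small self-contained sketch of the pattern, with a placeholder callable standing in for the real vLLM generate call:

import logging
from time import sleep

logger = logging.getLogger("example")


def call_with_retries(fn, num_attempts: int = 3):
    """Retry a flaky callable; raise only if every attempt fails."""
    for _ in range(num_attempts):
        try:
            result = fn()  # placeholder for e.g. self._model.generate(...)
            break          # success: the for-else branch below is skipped
        except TypeError as e:
            logger.debug(f"Encountered error: {e}. Retrying...")
            sleep(1)
    else:
        # Reached only when the loop was never broken out of, i.e. all attempts failed.
        raise RuntimeError(f"Call failed after {num_attempts} attempts.")
    return result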
euroeval/constants.py
CHANGED
@@ -54,3 +54,6 @@ METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
 
 # Hugging Face Hub tags used to classify models as merge models
 MERGE_TAGS = ["merge", "mergekit"]
+
+# The minimum required CUDA compute capability for using bfloat16 in vLLM
+VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
euroeval/data_models.py
CHANGED
@@ -1,7 +1,6 @@
 """Data models used in EuroEval."""
 
 import collections.abc as c
-import importlib.metadata
 import json
 import pathlib
 import re
@@ -11,6 +10,8 @@ from dataclasses import dataclass, field
 import pydantic
 import torch
 
+from euroeval.utils import get_package_version
+
 from .enums import Device, InferenceBackend, ModelType, TaskGroup
 from .types import ScoreDict
 
@@ -228,7 +229,11 @@ class BenchmarkResult(pydantic.BaseModel):
     generative_type: str | None
     few_shot: bool
     validation_split: bool
-    euroeval_version: str = …
+    euroeval_version: str | None = get_package_version("euroeval")
+    transformers_version: str | None = get_package_version("transformers")
+    torch_version: str | None = get_package_version("torch")
+    vllm_version: str | None = get_package_version("vllm")
+    outlines_version: str | None = get_package_version("outlines")
 
     @classmethod
     def from_dict(cls, config: dict) -> "BenchmarkResult":
euroeval/dataset_configs.py
CHANGED
@@ -244,7 +244,7 @@ FOSENT_CONFIG = DatasetConfig(
 ALLOCINE_CONFIG = DatasetConfig(
     name="allocine",
     pretty_name="the truncated version of the French sentiment classification "
-    "dataset …
+    "dataset AlloCiné",
     huggingface_id="EuroEval/allocine-mini",
     task=SENT,
     languages=[FR],
@@ -1467,9 +1467,9 @@ NORDJYLLAND_NEWS_CONFIG = DatasetConfig(
     max_generated_tokens=256,
 )
 
-MLSUM_CONFIG = DatasetConfig(
-    name="mlsum",
-    pretty_name="the truncated version of the German summarisation dataset MLSum",
+MLSUM_DE_CONFIG = DatasetConfig(
+    name="mlsum-de",
+    pretty_name="the truncated version of the German summarisation dataset MLSum-de",
     huggingface_id="EuroEval/mlsum-mini",
     task=SUMM,
     languages=[DE],
@@ -1484,7 +1484,7 @@ MLSUM_CONFIG = DatasetConfig(
 
 MLSUM_ES_CONFIG = DatasetConfig(
     name="mlsum-es",
-    pretty_name="the truncated version of the Spanish summarisation dataset MLSum",
+    pretty_name="the truncated version of the Spanish summarisation dataset MLSum-es",
     huggingface_id="EuroEval/mlsum-es-mini",
     task=SUMM,
     languages=[ES],
euroeval/generation.py
CHANGED
@@ -20,7 +20,12 @@ from .model_cache import (
 from .utils import clear_memory
 
 if t.TYPE_CHECKING:
-    from .data_models import …
+    from .data_models import (
+        BenchmarkConfig,
+        DatasetConfig,
+        GenerativeModelOutput,
+        ModelConfig,
+    )
 
 logger = logging.getLogger("euroeval")
 
@@ -163,6 +168,7 @@ def generate_single_iteration(
     if benchmark_config.debug:
         debug_log(
             batch=batch,
+            model_output=model_output,
             extracted_labels=extracted_labels,  # type: ignore[arg-type]
             dataset_config=dataset_config,
         )
@@ -217,6 +223,7 @@
 
 def debug_log(
     batch: dict[str, t.Any],
+    model_output: "GenerativeModelOutput",
     extracted_labels: list[dict | str | list[str]],
     dataset_config: "DatasetConfig",
 ) -> None:
@@ -225,6 +232,8 @@ def debug_log(
     Args:
         batch:
             The batch of examples to evaluate on.
+        model_output:
+            The output of the model.
         extracted_labels:
             The extracted labels from the model output.
         dataset_config:
@@ -290,7 +299,12 @@
     else:
         input_texts = batch["text"]
 
-    for input_text, prediction, label in zip(…
+    for input_text, raw_output, prediction, label in zip(
+        input_texts, model_output.sequences, extracted_labels, labels
+    ):
         logger.info(
-            f"Input: '{input_text}'\…
+            f"Input: '{input_text}'\n"
+            f"Raw outout: '{raw_output}'\n"
+            f"Prediction: '{prediction}'\n"
+            f"Label: '{label}'"
         )
euroeval/task_utils/sequence_classification.py
CHANGED
@@ -162,9 +162,7 @@ def get_closest_logprobs_labels(
     """
     english_labels = list(dataset_config.id2label.values())
     english2local = dataset_config.prompt_label_mapping
-    candidate_labels = [
-        english2local[lbl].lower() for lbl in english_labels
-    ] + english_labels
+    candidate_labels = [english2local[lbl].lower() for lbl in english_labels]
 
     output_labels: list[str] = list()
     for sample in generation_logprobs:
@@ -179,21 +177,48 @@
         ]
         generated_labels = [label for label in generated_labels if label != ""]
 
-        # We want to use the first generated label which …
+        # We want to use the first generated label which contains a unique candidate
         # label, as the output label
         output_label: str | None = None
-        …
-        …
+        previously_generated_labels: list[str] = list()
+        for label_idx, generated_label in enumerate(generated_labels):
+            generated_label = "".join(previously_generated_labels) + generated_label
+
+            # Get the candidate labels that starts with the generated label
+            candidate_output_labels = {
                 candidate_label
                 for candidate_label in candidate_labels
                 if candidate_label.startswith(generated_label)
-            …
-            …
-            …
+            }
+
+            # If we can uniquely determine the output label, we break the loop. If
+            # there are multiple possible labels then we store the current one, and
+            # concatenate it with the next generated label. We can only do this if
+            # the current one is the first one, however, since we're using greedy
+            # sampling. In case this happens for a label that is not the first one,
+            # we warn the user.
+            if len(candidate_output_labels) == 1:
+                output_label = candidate_output_labels.pop()
                 break
+            elif len(candidate_output_labels) > 1:
+                if label_idx == 0:
+                    previously_generated_labels.append(generated_label)
+                else:
+                    output_label = candidate_output_labels.pop()
+                    candidate_output_labels.add(output_label)
+                    log_once(
+                        "Multiple candidate labels found for the generated label "
+                        f"{generated_label!r}: {candidate_output_labels}. Since "
+                        "this is not the first generated label, we cannot "
+                        "concatenate it with the next generated label. We are thus "
+                        f"forced to use the arbitrary {output_label!r} as the "
+                        "output label, potentially resulting in worse performance. "
+                        "Please report this issue to the EuroEval team at "
+                        "github.com/EuroEval/EuroEval/issues.",
+                        level=logging.WARNING,
+                    )
 
         if output_label is not None:
-            output_label = english2local.get(output_label, output_label)
             output_labels.append(output_label)
             break
         else:
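Note: the new loop above performs a greedy prefix match of the generated logprob tokens against the localised, lower-cased candidate labels, concatenating tokens until exactly one candidate matches. A rough illustration of that matching idea, using made-up labels and tokens rather than EuroEval's exact control flow:

def match_label(generated_tokens: list[str], candidate_labels: list[str]) -> str | None:
    """Concatenate tokens until exactly one candidate label starts with the prefix."""
    prefix = ""
    for token in generated_tokens:
        prefix += token
        matches = {label for label in candidate_labels if label.startswith(prefix)}
        if len(matches) == 1:
            return matches.pop()
        if not matches:
            return None  # no candidate matches the generated prefix
    return None  # still ambiguous after all tokens


# "ne" alone is ambiguous ("negative" vs "neutral"); adding "g" resolves it.
print(match_label(["ne", "g"], ["positive", "negative", "neutral"]))  # -> negative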
euroeval/types.py
CHANGED
@@ -8,9 +8,9 @@ if t.TYPE_CHECKING:
     from .data_models import GenerativeModelOutput
 
 
-ScoreDict = dict[str, dict[str, float] | list[dict[str, float]]]
-Predictions = NDArray | list[str] | list[list[str]]
-Labels = NDArray | list[str] | list[list[str]]
+ScoreDict: t.TypeAlias = dict[str, dict[str, float] | list[dict[str, float]]]
+Predictions: t.TypeAlias = NDArray | list[str] | list[list[str]]
+Labels: t.TypeAlias = NDArray | list[str] | list[list[str]]
 
 
 class ComputeMetricsFunction(t.Protocol):
euroeval/utils.py
CHANGED
@@ -2,11 +2,11 @@
 
 import gc
 import importlib
+import importlib.metadata
 import importlib.util
 import logging
 import os
 import random
-import re
 import sys
 import typing as t
 import warnings
@@ -16,7 +16,6 @@ from types import TracebackType
 
 import litellm
 import numpy as np
-import pkg_resources
 import requests
 import torch
 from datasets.utils import disable_progress_bar
@@ -84,33 +83,6 @@ def enforce_reproducibility(seed: int = 4242) -> np.random.Generator:
     return rng
 
 
-def is_module_installed(module: str) -> bool:
-    """Check if a module is installed.
-
-    This is used when dealing with spaCy models, as these are installed as separate
-    Python packages.
-
-    Args:
-        module:
-            The name of the module.
-
-    Returns:
-        Whether the module is installed or not.
-    """
-    # Get list of all modules, including their versions
-    installed_modules_with_versions = list(pkg_resources.working_set)
-
-    # Strip the module versions from the list of modules. Also make the modules lower
-    # case and replace dashes with underscores
-    installed_modules = [
-        re.sub("[0-9. ]", "", str(module)).lower().replace("-", "_")
-        for module in installed_modules_with_versions
-    ]
-
-    # Check if the module is installed by checking if the module name is in the list
-    return module.lower() in installed_modules
-
-
 def block_terminal_output() -> None:
     """Blocks libraries from writing output to the terminal.
 
@@ -206,6 +178,21 @@ def get_class_by_name(class_name: str | list[str], module_name: str) -> t.Type |
     return None
 
 
+def get_min_cuda_compute_capability() -> float | None:
+    """Gets the lowest cuda capability.
+
+    Returns:
+        Device capability as float, or None if CUDA is not available.
+    """
+    if not torch.cuda.is_available():
+        return None
+
+    device_range = range(torch.cuda.device_count())
+    capabilities = map(torch.cuda.get_device_capability, device_range)
+    major, minor = min(capabilities)
+    return float(f"{major}.{minor}")
+
+
 def kebab_to_pascal(kebab_string: str) -> str:
     """Converts a kebab-case string to PascalCase.
 
@@ -573,3 +560,19 @@ def log_once(message: str, level: int = logging.INFO) -> None:
             logger.critical(message)
         case _:
             raise ValueError(f"Invalid logging level: {level}")
+
+
+def get_package_version(package_name: str) -> str | None:
+    """Get the version of a package.
+
+    Args:
+        package_name:
+            The name of the package.
+
+    Returns:
+        The version of the package, or None if the package is not installed.
+    """
+    try:
+        return importlib.metadata.version(package_name)
+    except importlib.metadata.PackageNotFoundError:
+        return None
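Note: a short usage sketch of the two helpers added above; the printed values naturally depend on the environment the benchmark runs in:

from euroeval.utils import get_min_cuda_compute_capability, get_package_version

print(get_package_version("torch"))                     # e.g. "2.2.0", or None if missing
print(get_package_version("definitely-not-installed"))  # None
print(get_min_cuda_compute_capability())                # e.g. 8.0 on A100 GPUs, None without CUDA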
{euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.4.0
+Version: 15.4.2
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -42,6 +42,7 @@ Requires-Dist: more-itertools>=10.5.0
 Requires-Dist: numpy<2.0.0,>=1.23.0
 Requires-Dist: ollama>=0.4.7
 Requires-Dist: pandas>=2.2.0
+Requires-Dist: peft>=0.15.0
 Requires-Dist: protobuf~=3.20.0
 Requires-Dist: pydantic>=2.6.0
 Requires-Dist: pyinfer>=0.0.3
@@ -61,12 +62,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
 Requires-Dist: outlines>=0.1.11; extra == 'all'
-Requires-Dist: vllm…
+Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: outlines>=0.1.11; extra == 'generative'
-Requires-Dist: vllm…
+Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test
{euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/RECORD
CHANGED
@@ -3,14 +3,14 @@ euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHh
 euroeval/benchmarker.py,sha256=PIdqLPleLN3nml5Zb1g_dQaLzqxQhmgC8VuvD5yloV4,46524
 euroeval/callbacks.py,sha256=bThUUxOgkMuESUQ5rrFRoSumKV8vNw53CslIZTpkt54,2438
 euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
-euroeval/constants.py,sha256=…
+euroeval/constants.py,sha256=zL8dm7SEFpIgC2vaPhqzdKydVSWW-ZyMHenWPnNxWqQ,1681
 euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
-euroeval/data_models.py,sha256=…
-euroeval/dataset_configs.py,sha256=…
+euroeval/data_models.py,sha256=b4rOMdhoxkIPcnTQdwqq5iWaF6uia1OzAgdiOBvoGVM,14779
+euroeval/dataset_configs.py,sha256=C5Gnp95cBeCmmuRA8Rznt0c4gMOn8Pilk_kDCleDMjg,90640
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
 euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
-euroeval/generation.py,sha256=…
+euroeval/generation.py,sha256=dohSPYc4eASm5tJhNKfBlpJnellKG7nVeyx8yXXxMlE,10721
 euroeval/human_evaluation.py,sha256=5uOm8cZf5uy2jBPs-ih7g8ni-a3hUz8UiXVPh6PzUWw,27675
 euroeval/languages.py,sha256=d1SyG0KVtCAA_PYpFGZCgZcyVLIr7Q8uYKPxNw6WEBc,7909
 euroeval/model_cache.py,sha256=BhkyWrOhjskESbndy218LUv1ZiWRc48ScdH_42dKHtE,8275
@@ -19,22 +19,22 @@ euroeval/model_loading.py,sha256=ta07tMoSfK1kqjOynVXQA0vVrns6RzsCEE3g1_RGVVs,271
 euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
 euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
 euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
-euroeval/types.py,sha256=…
-euroeval/utils.py,sha256=…
+euroeval/types.py,sha256=5DIhaVyzH8RO9jdJfibX9pwbZviQwU35dMsfszD2Whs,2406
+euroeval/utils.py,sha256=CFjYMoKdcxLUEM-aF3pxf_3TnGWvGasjfb8pDMJVe9U,18772
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
 euroeval/benchmark_modules/fresh.py,sha256=k6bqDEnazRAX9ILVsRrzUTbkgNO4NcLCxHToCnLWV8M,9641
-euroeval/benchmark_modules/hf.py,sha256=…
+euroeval/benchmark_modules/hf.py,sha256=Typig7WDqOn_uGE24s_P_9PHvq-V0MrKGD7xbh0aYnk,43244
 euroeval/benchmark_modules/litellm.py,sha256=ZJ9dB683pXPHDf70OOJfmHn_y706xRYzstYLz2ytCKE,39784
-euroeval/benchmark_modules/vllm.py,sha256=…
+euroeval/benchmark_modules/vllm.py,sha256=O8-dcVkU2jgZer44EOeTC8E4d-xQjPDOXnoyzXxAToQ,46179
 euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
 euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
-euroeval/task_utils/sequence_classification.py,sha256=…
+euroeval/task_utils/sequence_classification.py,sha256=832iWpPR3CsnlBIYA976eN21WUFQLUmIlDxFIvOsROk,10266
 euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
 euroeval/task_utils/token_classification.py,sha256=aW2GGk-dqa7lioIsHirVgD8AMrQEAnVasmjEWQ4xu7w,17778
-euroeval-15.4.…
-euroeval-15.4.…
-euroeval-15.4.…
-euroeval-15.4.…
-euroeval-15.4.…
+euroeval-15.4.2.dist-info/METADATA,sha256=cvpyWIKPXNKn1Idv7w3C7z8MBVljmw50jBdskL_32oI,10752
+euroeval-15.4.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.4.2.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.4.2.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.4.2.dist-info/RECORD,,

{euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/WHEEL
File without changes

{euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/entry_points.txt
File without changes

{euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/licenses/LICENSE
File without changes