EuroEval 15.8.2-py3-none-any.whl → 15.9.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +14 -0
- euroeval/benchmark_config_factory.py +0 -31
- euroeval/benchmark_modules/hf.py +26 -13
- euroeval/benchmark_modules/vllm.py +70 -2
- euroeval/benchmarker.py +0 -21
- euroeval/cli.py +0 -10
- euroeval/data_models.py +0 -5
- euroeval/exceptions.py +0 -22
- euroeval/human_evaluation.py +0 -1
- {euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/METADATA +3 -5
- {euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/RECORD +14 -14
- {euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/WHEEL +0 -0
- {euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/entry_points.txt +0 -0
- {euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED

@@ -3,6 +3,7 @@
 ### STAGE 1 ###
 ### Block unwanted terminal output that happens on importing external modules ###
 
+import importlib.util
 import logging
 import os
 import sys
@@ -27,6 +28,19 @@ logging.basicConfig(
 
 
 ### STAGE 2 ###
+### Check for incompatible packages ###
+
+# Throw informative error if `flash_attn` is installed ###
+if importlib.util.find_spec("flash_attn") is not None:
+    logging.critical(
+        "The `flash_attn` package is not supported by EuroEval, as it is now built "
+        "into the other packages and it conflicts with the other implementations. "
+        "Please uninstall it using `pip uninstall flash_attn` and try again."
+    )
+    sys.exit(1)
+
+
+### STAGE 3 ###
 ### Set the rest up ###
 
 import importlib.metadata  # noqa: E402
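The new import-time guard relies on `importlib.util.find_spec`, which looks a package up on `sys.path` without importing it, so the check cannot itself crash on a broken install. A minimal standalone sketch of the same pattern (independent of EuroEval's actual startup code):

    import importlib.util
    import sys

    # Detect an incompatible package without importing it.
    if importlib.util.find_spec("flash_attn") is not None:
        print("flash_attn is installed; please run `pip uninstall flash_attn`.", file=sys.stderr)
        sys.exit(1)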
euroeval/benchmark_config_factory.py
CHANGED

@@ -1,6 +1,5 @@
 """Factory class for creating dataset configurations."""
 
-import importlib.util
 import logging
 import sys
 import typing as t
@@ -13,7 +12,6 @@ from .enums import Device
 from .exceptions import InvalidBenchmark
 from .languages import get_all_languages
 from .tasks import SPEED, get_all_tasks
-from .utils import log_once
 
 if t.TYPE_CHECKING:
     from .data_models import Language, Task
@@ -38,7 +36,6 @@ def build_benchmark_config(
     force: bool,
     verbose: bool,
     trust_remote_code: bool,
-    use_flash_attention: bool | None,
     clear_model_cache: bool,
     evaluate_test_split: bool,
     few_shot: bool,
@@ -92,9 +89,6 @@ def build_benchmark_config(
             automatically set if `debug` is True.
         trust_remote_code:
             Whether to trust remote code when running the benchmark.
-        use_flash_attention:
-            Whether to use Flash Attention for the models. If None then it will be used
-            if it is available.
         clear_model_cache:
             Whether to clear the model cache before running the benchmark.
         evaluate_test_split:
@@ -135,30 +129,6 @@ def build_benchmark_config(
 
     torch_device = prepare_device(device=device)
 
-    if use_flash_attention is None:
-        if torch_device.type != "cuda":
-            use_flash_attention = False
-        elif (
-            importlib.util.find_spec("flash_attn") is None
-            and importlib.util.find_spec("vllm_flash_attn") is None
-        ):
-            use_flash_attention = False
-            if first_time and torch_device.type == "cuda":
-                message = (
-                    "Flash attention has not been installed, so this will not be used. "
-                    "To install it, run `pip install -U wheel && "
-                    "FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn "
-                    "--no-build-isolation`. Alternatively, you can disable this "
-                    "message by setting "
-                )
-                if run_with_cli:
-                    message += "the flag `--no-use-flash-attention`."
-                else:
-                    message += (
-                        "the argument `use_flash_attention=False` in the `Benchmarker`."
-                    )
-                log_once(message=message, level=logging.INFO)
-
     # Set variable with number of iterations
     if hasattr(sys, "_called_from_test"):
         num_iterations = 1
@@ -178,7 +148,6 @@ def build_benchmark_config(
         verbose=verbose or debug,
         device=torch_device,
         trust_remote_code=trust_remote_code,
-        use_flash_attention=use_flash_attention,
        clear_model_cache=clear_model_cache,
        evaluate_test_split=evaluate_test_split,
        few_shot=few_shot,
euroeval/benchmark_modules/hf.py
CHANGED

@@ -54,13 +54,11 @@ from ..enums import (
     TaskGroup,
 )
 from ..exceptions import (
-    HuggingFaceHubDown,
     InvalidBenchmark,
     InvalidModel,
     NeedsAdditionalArgument,
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
-    NoInternetConnection,
 )
 from ..languages import get_all_languages
 from ..task_group_utils import (
@@ -737,9 +735,10 @@ def get_model_repo_info(
         model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
 
     # If the model does not exist locally, then we get the model info from the Hugging
-    # Face Hub
+    # Face Hub, if possible
     if model_info is None:
         num_attempts = 3
+        errors: list[Exception] = list()
         for _ in range(num_attempts):
             try:
                 model_info = hf_api.model_info(
@@ -749,25 +748,37 @@ def get_model_repo_info(
             except (GatedRepoError, LocalTokenNotFoundError) as e:
                 try:
                     hf_whoami(token=token)
-                    logger.
+                    logger.debug(
                         f"Could not access the model {model_id} with the revision "
                         f"{revision}. The error was {str(e)!r}."
                     )
                     return None
                 except LocalTokenNotFoundError:
-
-
-
-
+                    logger.debug(
+                        f"Could not access the model {model_id} with the revision "
+                        f"{revision}. The error was {str(e)!r}. Please set the "
+                        "`HUGGINGFACE_API_KEY` environment variable or use the "
+                        "`--api-key` argument."
                     )
+                    return None
             except (RepositoryNotFoundError, HFValidationError):
                 return None
-            except (OSError, RequestException):
+            except (OSError, RequestException) as e:
                 if internet_connection_available():
+                    errors.append(e)
                     continue
-
+                logger.debug(
+                    "Could not access the Hugging Face Hub. Please check your internet "
+                    "connection."
+                )
+                return None
         else:
-
+            logger.debug(
+                f"Could not access model info for the model {model_id!r} from the "
+                f"Hugging Face Hub, after {num_attempts} attempts. The errors "
+                f"encountered were {errors!r}."
+            )
+            return None
 
     # Get all the Hugging Face repository tags for the model. If the model is an adapter
     # model, then we also get the tags for the base model
@@ -836,7 +847,8 @@ def get_model_repo_info(
             "Skipping since the `only_allow_safetensors` argument is set "
             "to `True`."
         )
-
+        logger.warning(msg)
+        return None
 
     # Also check base model if we are evaluating an adapter
     if base_model_id is not None:
@@ -856,7 +868,8 @@ def get_model_repo_info(
             " Skipping since the `only_allow_safetensors` argument is set "
             "to `True`."
         )
-
+        logging.warning(msg)
+        return None
 
     return HFModelInfo(
         pipeline_tag=pipeline_tag, tags=tags, adapter_base_model_id=base_model_id
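The reworked retry loop in `get_model_repo_info` collects the exception from each failed attempt and uses the `for`/`else` clause to log them all once the attempts are exhausted. A minimal standalone sketch of that pattern, with a hypothetical `fetch` callable standing in for the Hub call:

    import logging

    logger = logging.getLogger(__name__)

    def fetch_with_retries(fetch, num_attempts: int = 3):
        errors: list[Exception] = []
        for _ in range(num_attempts):
            try:
                return fetch()  # success exits the function immediately
            except OSError as e:
                errors.append(e)  # remember this failure and try again
        else:
            # Only reached when every attempt raised, since no `break` occurred
            logger.debug("Giving up after %d attempts; errors: %r", num_attempts, errors)
            return None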
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -84,7 +84,12 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
         destroy_distributed_environment,
         destroy_model_parallel,
     )
+    from vllm.inputs import PromptType
     from vllm.lora.request import LoRARequest
+    from vllm.model_executor.guided_decoding.guided_fields import GuidedDecodingRequest
+    from vllm.pooling_params import PoolingParams
+    from vllm.prompt_adapter.request import PromptAdapterRequest
+    from vllm.sampling_params import RequestOutputKind
 
 if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
     from outlines.models.vllm import adapt_tokenizer
@@ -451,7 +456,9 @@ class VLLMModel(HuggingFaceEncoderModel):
             text=prompts,
             truncation=True,
             max_length=max(
-                self._tokenizer.model_max_length
+                min(self._tokenizer.model_max_length, MAX_CONTEXT_LENGTH)
+                - max_tokens,
+                0,
             ),
         )
         prompts = self._tokenizer.batch_decode(
@@ -491,8 +498,19 @@ class VLLMModel(HuggingFaceEncoderModel):
             output.outputs[0].token_ids for output in raw_outputs
         ]
         if self.end_of_reasoning_token_id in completion_ids[0]:
+            # Find the latest index of the end of reasoning token and slice
+            # the token IDs to only include the tokens after it
             completion_ids = [
-                token_ids[
+                token_ids[
+                    max(
+                        [
+                            i
+                            for i, x in enumerate(token_ids)
+                            if x == self.end_of_reasoning_token_id
+                        ]
+                    )
+                    + 1 :
+                ]
                 if self.end_of_reasoning_token_id in token_ids
                 else token_ids
                 for token_ids in completion_ids
@@ -814,6 +832,9 @@ def load_model_and_tokenizer(
     )
 
     model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
+    model._validate_and_add_requests = MethodType(
+        _validate_and_add_requests_with_fixed_progress_bars, model
+    )
     model.config = hf_model_config
 
     return model, tokenizer
@@ -934,6 +955,53 @@ def _run_engine_with_fixed_progress_bars(
     return outputs
 
 
+def _validate_and_add_requests_with_fixed_progress_bars(
+    self: "LLM",
+    prompts: "PromptType | c.Sequence[PromptType]",
+    params: "SamplingParams | c.Sequence[SamplingParams] | PoolingParams | c.Sequence[PoolingParams]",  # noqa: E501
+    *,
+    use_tqdm: bool,
+    lora_request: "c.Sequence[LoRARequest] | LoRARequest | None",
+    prompt_adapter_request: "PromptAdapterRequest | None",
+    tokenization_kwargs: dict[str, t.Any] | None = None,
+    guided_options: "GuidedDecodingRequest | None" = None,
+    priority: list[int] | None = None,
+) -> None:
+    if isinstance(prompts, (str, dict)):
+        # Convert a single prompt to a list.
+        prompts = [prompts]
+
+    num_requests = len(prompts)
+    if isinstance(params, list) and len(params) != num_requests:
+        raise ValueError("The lengths of prompts and params must be the same.")
+    if isinstance(lora_request, list) and len(lora_request) != num_requests:
+        raise ValueError("The lengths of prompts and lora_request must be the same.")
+
+    for sp in params if isinstance(params, list) else (params,):
+        if isinstance(sp, SamplingParams):
+            self._add_guided_params(sp, guided_options)
+
+            # We only care about the final output
+            sp.output_kind = RequestOutputKind.FINAL_ONLY
+
+    # Add requests to the engine.
+    it = prompts
+    if use_tqdm:
+        it = tqdm(it, desc="Adding requests", leave=False)
+
+    for i, prompt in enumerate(it):
+        self._add_request(
+            prompt,
+            params[i] if isinstance(params, c.Sequence) else params,
+            tokenization_kwargs=tokenization_kwargs,
+            lora_request=lora_request[i]
+            if isinstance(lora_request, c.Sequence)
+            else lora_request,
+            prompt_adapter_request=prompt_adapter_request,
+            priority=priority[i] if priority else 0,
+        )
+
+
 def clear_vllm() -> None:
     """Clear the GPU memory used by the vLLM model, enabling re-initialisation."""
     with contextlib.suppress(ValueError):
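Both progress-bar fixes are attached by rebinding methods on the already-constructed `LLM` object with `types.MethodType`, so only that one instance is patched rather than the class. A minimal standalone sketch of this instance-level monkey-patching pattern (the `Greeter` class is purely illustrative, not EuroEval code):

    from types import MethodType

    class Greeter:
        def greet(self) -> str:
            return "hello"

    def quiet_greet(self: Greeter) -> str:
        # Replacement behaviour, analogous to the fixed-progress-bar wrappers above.
        return "hi"

    g = Greeter()
    g.greet = MethodType(quiet_greet, g)  # patch only this instance, not the class
    assert g.greet() == "hi"
    assert Greeter().greet() == "hello"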
euroeval/benchmarker.py
CHANGED

@@ -72,7 +72,6 @@ class Benchmarker:
         force: bool = False,
         verbose: bool = False,
         trust_remote_code: bool = False,
-        use_flash_attention: bool | None = None,
         clear_model_cache: bool = False,
         evaluate_test_split: bool = False,
         few_shot: bool = True,
@@ -129,9 +128,6 @@ class Benchmarker:
                 `debug` is True. Defaults to False.
             trust_remote_code:
                 Whether to trust remote code when loading models. Defaults to False.
-            use_flash_attention:
-                Whether to use Flash Attention. If None then it will be used if it is
-                installed and the model is a decoder model. Defaults to None.
             clear_model_cache:
                 Whether to clear the model cache after benchmarking each model.
                 Defaults to False.
@@ -190,7 +186,6 @@ class Benchmarker:
            force=force,
            verbose=verbose,
            trust_remote_code=trust_remote_code,
-           use_flash_attention=use_flash_attention,
            clear_model_cache=clear_model_cache,
            evaluate_test_split=evaluate_test_split,
            few_shot=few_shot,
@@ -243,7 +238,6 @@ class Benchmarker:
        force: bool | None = None,
        verbose: bool | None = None,
        trust_remote_code: bool | None = None,
-       use_flash_attention: bool | None = None,
        clear_model_cache: bool | None = None,
        evaluate_test_split: bool | None = None,
        few_shot: bool | None = None,
@@ -311,9 +305,6 @@ class Benchmarker:
            trust_remote_code:
                Whether to trust remote code when loading models. Defaults to the value
                specified when initialising the benchmarker.
-           use_flash_attention:
-               Whether to use Flash Attention. Defaults to the value specified when
-               initialising the benchmarker.
            clear_model_cache:
                Whether to clear the model cache after benchmarking each model. Defaults
                to the value specified when initialising the benchmarker.
@@ -359,7 +350,6 @@ class Benchmarker:
            force=force,
            verbose=verbose,
            trust_remote_code=trust_remote_code,
-           use_flash_attention=use_flash_attention,
            clear_model_cache=clear_model_cache,
            evaluate_test_split=evaluate_test_split,
            few_shot=few_shot,
@@ -531,7 +521,6 @@ class Benchmarker:
        force: bool | None = None,
        verbose: bool | None = None,
        trust_remote_code: bool | None = None,
-       use_flash_attention: bool | None | None = None,
        clear_model_cache: bool | None = None,
        evaluate_test_split: bool | None = None,
        few_shot: bool | None = None,
@@ -590,9 +579,6 @@ class Benchmarker:
            trust_remote_code:
                Whether to trust remote code when loading models. If None, then this
                value will not be updated.
-           use_flash_attention:
-               Whether to use Flash Attention. If None, then this value will not be
-               updated.
            clear_model_cache:
                Whether to clear the model cache after benchmarking each model. If None,
                then this value will not be updated.
@@ -658,8 +644,6 @@ class Benchmarker:
            benchmark_config_params.verbose = verbose
        if trust_remote_code is not None:
            benchmark_config_params.trust_remote_code = trust_remote_code
-       if use_flash_attention is not None:
-           benchmark_config_params.use_flash_attention = use_flash_attention
        if clear_model_cache is not None:
            benchmark_config_params.clear_model_cache = clear_model_cache
        if evaluate_test_split is not None:
@@ -863,7 +847,6 @@ class Benchmarker:
        force: bool | None = None,
        verbose: bool | None = None,
        trust_remote_code: bool | None = None,
-       use_flash_attention: bool | None = None,
        clear_model_cache: bool | None = None,
        evaluate_test_split: bool | None = None,
        few_shot: bool | None = None,
@@ -931,9 +914,6 @@ class Benchmarker:
            trust_remote_code:
                Whether to trust remote code when loading models. Defaults to the value
                specified when initialising the benchmarker.
-           use_flash_attention:
-               Whether to use Flash Attention. Defaults to the value specified when
-               initialising the benchmarker.
            clear_model_cache:
                Whether to clear the model cache after benchmarking each model. Defaults
                to the value specified when initialising the benchmarker.
@@ -981,7 +961,6 @@ class Benchmarker:
            force=force,
            verbose=verbose,
            trust_remote_code=trust_remote_code,
-           use_flash_attention=use_flash_attention,
            clear_model_cache=clear_model_cache,
            evaluate_test_split=evaluate_test_split,
            few_shot=few_shot,
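For callers, the practical consequence is that the `use_flash_attention` keyword no longer exists on `Benchmarker`, so passing it will now raise a `TypeError`. A minimal sketch of constructing the benchmarker after this change, assuming the top-level `Benchmarker` export works as in earlier releases and using only keyword arguments visible in the diff:

    from euroeval import Benchmarker

    # `use_flash_attention=...` must be dropped from existing call sites in 15.9.1.
    benchmarker = Benchmarker(
        trust_remote_code=False,
        clear_model_cache=False,
        evaluate_test_split=False,
        few_shot=True,
    )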
euroeval/cli.py
CHANGED

@@ -141,14 +141,6 @@ from .tasks import get_all_tasks
     help="""Whether to trust remote code. Only set this flag if you trust the supplier
     of the model.""",
 )
-@click.option(
-    "--use-flash-attention/--no-use-flash-attention",
-    default=None,
-    show_default=True,
-    help="""Whether to use Flash Attention. If not specified then the model will use
-    Flash Attention for generative models if a CUDA GPU is available and `flash-attn`
-    or `vllm-flash-attn` are installed.""",
-)
 @click.option(
     "--clear-model-cache/--no-clear-model-cache",
     default=False,
@@ -225,7 +217,6 @@ def benchmark(
     verbose: bool,
     device: str | None,
     trust_remote_code: bool,
-    use_flash_attention: bool | None,
     clear_model_cache: bool,
     evaluate_test_split: bool,
     few_shot: bool,
@@ -261,7 +252,6 @@ def benchmark(
         cache_dir=cache_dir,
         device=device,
         trust_remote_code=trust_remote_code,
-        use_flash_attention=use_flash_attention,
         clear_model_cache=clear_model_cache,
         evaluate_test_split=evaluate_test_split,
         few_shot=few_shot,
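The removed option used click's paired boolean flag syntax, where a single `--x/--no-x` declaration registers both flags and passes one boolean parameter to the command. A minimal standalone sketch of that pattern (not EuroEval's actual command, just the same flag style kept for `--clear-model-cache`):

    import click

    @click.command()
    @click.option(
        "--clear-model-cache/--no-clear-model-cache",
        default=False,
        show_default=True,
        help="Whether to clear the model cache after benchmarking each model.",
    )
    def benchmark(clear_model_cache: bool) -> None:
        click.echo(f"clear_model_cache={clear_model_cache}")

    if __name__ == "__main__":
        benchmark()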
euroeval/data_models.py
CHANGED

@@ -191,9 +191,6 @@ class BenchmarkConfig:
         Whether to print verbose output.
     trust_remote_code:
         Whether to trust remote code when loading models from the Hugging Face Hub.
-    use_flash_attention:
-        Whether to use Flash Attention. If None then this will be used for
-        generative models.
     clear_model_cache:
         Whether to clear the model cache after benchmarking each model.
     evaluate_test_split:
@@ -231,7 +228,6 @@ class BenchmarkConfig:
     device: torch.device
     verbose: bool
     trust_remote_code: bool
-    use_flash_attention: bool | None
     clear_model_cache: bool
     evaluate_test_split: bool
     few_shot: bool
@@ -263,7 +259,6 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     force: bool
     verbose: bool
     trust_remote_code: bool
-    use_flash_attention: bool | None
     clear_model_cache: bool
     evaluate_test_split: bool
     few_shot: bool
euroeval/exceptions.py
CHANGED

@@ -81,28 +81,6 @@ class NaNValueInModelOutput(Exception):
         super().__init__(self.message)
 
 
-class FlashAttentionNotInstalled(Exception):
-    """The `flash-attn` package has not been installed."""
-
-    def __init__(
-        self,
-        message: str = (
-            "The model you are trying to load requires Flash Attention. To use Flash "
-            "Attention, please install the `flash-attn` package, which can be done by "
-            "running `pip install -U wheel && FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE "
-            "pip install flash-attn --no-build-isolation`."
-        ),
-    ) -> None:
-        """Initialise the exception.
-
-        Args:
-            message:
-                The message to display.
-        """
-        self.message = message
-        super().__init__(self.message)
-
-
 class NeedsExtraInstalled(InvalidModel):
     """The evaluation requires extra to be installed."""
 
euroeval/human_evaluation.py
CHANGED

{euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.8.2
+Version: 15.9.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -62,12 +62,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
 Requires-Dist: outlines>=0.1.11; extra == 'all'
-Requires-Dist: vllm
+Requires-Dist: vllm>=0.9.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: outlines>=0.1.11; extra == 'generative'
-Requires-Dist: vllm
+Requires-Dist: vllm>=0.9.0; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test
@@ -97,8 +97,6 @@ ______________________________________________________________________
 
 - Dan Saattrup Nielsen ([@saattrupdan](https://github.com/saattrupdan),
   dan.nielsen@alexandra.dk)
-- Kenneth Enevoldsen ([@KennethEnevoldsen](https://github.com/KennethEnevoldsen),
-  kenneth.enevoldsen@cas.au.dk)
 
 
 ## Installation
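The vLLM dependency is now pinned to `vllm>=0.9.0` and guarded by an environment marker, so it is only pulled in on Linux and only when the `all` or `generative` extra is requested. A small sketch of how such a marked requirement is evaluated, using the third-party `packaging` library (an assumption for illustration, not something EuroEval itself ships):

    from packaging.requirements import Requirement

    req = Requirement(
        "vllm>=0.9.0; (platform_system == 'Linux') and extra == 'generative'"
    )
    print(req.specifier)  # >=0.9.0
    # The marker only holds on Linux when the 'generative' extra was requested.
    print(req.marker.evaluate({"platform_system": "Linux", "extra": "generative"}))   # True
    print(req.marker.evaluate({"platform_system": "Darwin", "extra": "generative"}))  # False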
{euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/RECORD
CHANGED

@@ -1,17 +1,17 @@
-euroeval/__init__.py,sha256=
-euroeval/benchmark_config_factory.py,sha256=
-euroeval/benchmarker.py,sha256=
+euroeval/__init__.py,sha256=jjInLLkd5IrDrwqag3U35g7SgzITBlFYllgofc-uQFg,3067
+euroeval/benchmark_config_factory.py,sha256=icTeT5C-bNCJmvSWFlxKdEpRboZN8OjwaHGu7JM-2xI,11158
+euroeval/benchmarker.py,sha256=wmgrYVS31PMhhrVienjaVHHyfnZAy51kUvC6OjooiOw,48047
 euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
-euroeval/cli.py,sha256=
+euroeval/cli.py,sha256=d8JztMi_RbpUlEBXidd6DQ-xeC-xhozf_qU6Vkzye20,8161
 euroeval/constants.py,sha256=p6kp_R6-Tq5LBvyXyT6Sa6N3SkjEElGS2LSZRBoQaYs,1985
 euroeval/data_loading.py,sha256=L_REtxefte5Ke4xE_Cz01zkfCyKlOYhSqT5ZXXulHPc,3992
-euroeval/data_models.py,sha256=
+euroeval/data_models.py,sha256=7nAGDpN58Y35Lt9JZE_y0y5iOYesw2htcwHc68MkBZU,22953
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
-euroeval/exceptions.py,sha256=
+euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
 euroeval/finetuning.py,sha256=uuaUxNQJb7TivPQuI1OYQ_MIKbD-6-7mpkobLKsDefQ,10667
 euroeval/generation.py,sha256=LSsskfLjIJ-c3gQxmr7eiAobPOm-5bU9vnR7uHQ7XmU,10745
 euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
-euroeval/human_evaluation.py,sha256=
+euroeval/human_evaluation.py,sha256=zqbbJkqm2Uymf-88PxM3R9vVRR8SZJlq3QrqWEoiVeE,27643
 euroeval/languages.py,sha256=LerXuRBAUYkQL6qSV-F82itAE4EgBGFBtzaGnJJZvOE,8555
 euroeval/model_cache.py,sha256=HgXTgn4RMBqIjKaTmYzxu0f4NIwbXx1XJFbvbITqy4E,8686
 euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
@@ -25,9 +25,9 @@ euroeval/utils.py,sha256=e83OnWc0GJn0Tn_vP3tbqh1DAbLy2ky-LnIlTEOKzKU,11410
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
 euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
-euroeval/benchmark_modules/hf.py,sha256=
+euroeval/benchmark_modules/hf.py,sha256=CoiaNakjhg6gm_5IbUUeevXQZebg2VrRLuhzEi2Hhrk,44617
 euroeval/benchmark_modules/litellm.py,sha256=SxSr_0C6b_jVavR3y9QyhfkCOP5-va4zijGfghFTArY,48362
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/vllm.py,sha256=rz_Xau5TGiFeb2VkdVpW_fYOfRCCvYrH0q9BGzCwZlo,42156
 euroeval/dataset_configs/__init__.py,sha256=kWKtlSAOY-olOQL3UtFqL6I3Tki3G3waMZSd2YChjCg,1895
 euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
 euroeval/dataset_configs/dutch.py,sha256=r21nxEvMmBkKqPXVW082batPsxJ9d0RB4DzngOTMJSk,3185
@@ -54,8 +54,8 @@ euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iY
 euroeval/task_group_utils/sequence_classification.py,sha256=Yqx0pUhuHYmSkv1ZUfOndSLTvpr0lWCk19oYITfSjV4,13555
 euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
 euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
+euroeval-15.9.1.dist-info/METADATA,sha256=UkGmFcnarstFwD1J1eS6h3gbyxnucnaAVLnB5QhkdSo,13555
+euroeval-15.9.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.9.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.9.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.9.1.dist-info/RECORD,,
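Every module touched in this release gets a new row in RECORD, because each row stores the file's sha256 digest (urlsafe base64, padding stripped) and its size alongside the path, following the wheel RECORD convention. A small sketch of how such a row can be computed; the helper name is just for illustration:

    import base64
    import hashlib
    from pathlib import Path

    def record_row(path: Path) -> str:
        # "<path>,sha256=<urlsafe base64 digest without padding>,<size in bytes>"
        data = path.read_bytes()
        digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
        return f"{path.as_posix()},sha256={digest.decode()},{len(data)}"

    print(record_row(Path("euroeval/__init__.py")))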
{euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/WHEEL
File without changes

{euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/entry_points.txt
File without changes

{euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/licenses/LICENSE
File without changes