EuroEval 15.7.1__py3-none-any.whl → 15.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/benchmark_config_factory.py +1 -1
- euroeval/benchmark_modules/litellm.py +341 -150
- euroeval/benchmark_modules/vllm.py +1 -1
- euroeval/benchmarker.py +24 -12
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/english.py +1 -1
- euroeval/dataset_configs/finnish.py +11 -1
- euroeval/dataset_configs/italian.py +11 -1
- euroeval/dataset_configs/spanish.py +11 -1
- euroeval/finetuning.py +29 -31
- euroeval/languages.py +1 -1
- euroeval/task_group_utils/sequence_classification.py +46 -11
- euroeval/tokenization_utils.py +52 -16
- euroeval/utils.py +41 -0
- {euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/METADATA +1 -1
- {euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/RECORD +19 -19
- {euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/WHEEL +0 -0
- {euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_config_factory.py
CHANGED

@@ -238,7 +238,7 @@ def prepare_languages(
            The default language codes of the languages to include.

    Returns:
-        The prepared
+        The prepared dataset languages.
    """
    # Create a dictionary that maps languages to their associated language objects
    language_mapping = get_all_languages()
euroeval/benchmark_modules/litellm.py
CHANGED

@@ -32,6 +32,7 @@ from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.types.utils import ChoiceLogprobs, ModelResponse
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
+from tqdm.asyncio import tqdm as tqdm_async
 from tqdm.auto import tqdm
 from transformers.trainer import Trainer

@@ -66,7 +67,12 @@ from ..task_group_utils import (
 )
 from ..tokenization_utils import get_first_label_token_mapping
 from ..types import ExtractLabelsFunction
-from ..utils import
+from ..utils import (
+    catch_coroutine_exception,
+    create_model_cache_dir,
+    log_once,
+    safe_run,
+)
 from .base import BenchmarkModule
 from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer

@@ -159,9 +165,21 @@ class LiteLLMModel(BenchmarkModule):
     """A generative model from LiteLLM."""

     fresh_model = False
-    batching_preference = BatchingPreference.
+    batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = False

+    _handleable_exceptions = (
+        BadRequestError,
+        RateLimitError,
+        APIError,
+        APIConnectionError,
+        Timeout,
+        ServiceUnavailableError,
+        InternalServerError,
+        SystemError,
+        AuthenticationError,
+    )
+
     def __init__(
         self,
         model_config: ModelConfig,
@@ -233,10 +251,7 @@ class LiteLLMModel(BenchmarkModule):
             The generated model outputs.
         """
         assert "messages" in inputs, "The input must contain a 'messages' key."
-
-            "API models only support single-sample batching."
-        )
-        messages = inputs["messages"][0]
+        messages = inputs["messages"]

         generation_kwargs: dict[str, t.Any] = dict(
             model=self.model_config.model_id,
@@ -267,9 +282,20 @@ class LiteLLMModel(BenchmarkModule):
             generation_kwargs["top_logprobs"] = MAX_LOGPROBS

         if self.dataset_config.task in TASKS_USING_JSON:
-
-
-
+            for msg_list in messages:
+                # msg_list is a list of {'role':…, 'content':…} dicts
+                if not msg_list:
+                    raise InvalidBenchmark(
+                        "Encountered an empty message list in 'messages'."
+                    )
+                last = msg_list[-1]
+                assert isinstance(last, dict), (
+                    f"Expected dict message, got {type(last)}"
+                )
+                assert "json" in last["content"].lower(), (
+                    "Prompt must contain 'json' for JSON tasks."
+                )
+
         if self.generative_type == GenerativeType.REASONING:
             log_once(
                 f"The model {self.model_config.model_id!r} is a reasoning model "
@@ -321,6 +347,76 @@ class LiteLLMModel(BenchmarkModule):
         # This drops generation kwargs that are not supported by the model
         litellm.drop_params = True

+        # Extract the generated sequences from the model response. Some APIs cannot
+        # handle using newlines as stop sequences, so we try both.
+        num_attempts = 10
+
+        all_responses = {}
+        all_failures = []
+        to_run = list(enumerate(messages))
+
+        for attempt in range(num_attempts):
+            if not to_run:
+                break
+
+            batch_indices, batch_msgs = zip(*to_run)
+            model_response, failures = safe_run(
+                self._generate_async(
+                    messages=list(batch_msgs),
+                    generation_kwargs=generation_kwargs,
+                    max_retries=3,
+                    max_reruns=15,
+                )
+            )
+
+            for orig_idx, response in zip(batch_indices, model_response):
+                all_responses[orig_idx] = response
+
+            if not failures:
+                to_run = []
+                break
+
+            all_failures.extend(failures)
+            to_run = [(orig_idx, messages[orig_idx]) for orig_idx, _ in failures]
+            logger.debug(
+                f"Attempt {attempt + 1}/{num_attempts}: "
+                f"retrying {len(to_run)} failed message(s)"
+            )
+
+            for _, error in failures:
+                self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+        else:
+            raise InvalidBenchmark(
+                message=f"Failed to generate text, after {num_attempts} attempts."
+            )
+
+        if to_run:
+            raise InvalidBenchmark(
+                f"Failed to generate text after {num_attempts} attempts. "
+                f"Errors: {all_failures}"
+            )
+
+        ordered_responses = [all_responses[i] for i in range(len(messages))]
+        model_output = self._create_model_output(
+            model_responses=ordered_responses, model_id=self.model_config.model_id
+        )
+
+        return model_output
+
+    def _handle_exception(
+        self, error: Exception, generation_kwargs: dict[str, t.Any]
+    ) -> None:
+        """Handle an exception from the model.
+
+        Args:
+            error:
+                The exception to handle.
+            generation_kwargs:
+                The generation kwargs to pass to the model.
+        """
+        error_msg = str(error).lower()
+        model_id = self.model_config.model_id
+
         # Error messages that we want to catch and handle
         stop_messages = ["stop_sequences", "'stop' is not supported with this model"]
         logprobs_messages = [
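For orientation, the following is a minimal, self-contained sketch of the retry pattern the new generation path uses: gather all requests concurrently, keep the successes, and resubmit only the failed indices on the next attempt. The `flaky_request` coroutine and the attempt counts are hypothetical stand-ins, not part of EuroEval.

```python
import asyncio
import random


async def flaky_request(idx: int) -> str:
    """Hypothetical stand-in for a single litellm.acompletion call."""
    if random.random() < 0.3:
        raise RuntimeError(f"transient failure for prompt {idx}")
    return f"answer to prompt {idx}"


async def run_batch(indices: list[int]) -> dict[int, object]:
    # return_exceptions=True plays the role of catch_coroutine_exception:
    # failures come back as values instead of propagating out of gather.
    results = await asyncio.gather(
        *(flaky_request(i) for i in indices), return_exceptions=True
    )
    return dict(zip(indices, results))


def generate_with_retries(num_prompts: int, num_attempts: int = 5) -> dict[int, str]:
    answers: dict[int, str] = {}
    to_run = list(range(num_prompts))
    for _ in range(num_attempts):
        if not to_run:
            break
        outcome = asyncio.run(run_batch(to_run))
        # Keep successes, queue only the failed indices for the next attempt.
        answers.update(
            {i: res for i, res in outcome.items() if not isinstance(res, Exception)}
        )
        to_run = [i for i, res in outcome.items() if isinstance(res, Exception)]
    if to_run:
        raise RuntimeError(f"prompts {to_run} still failing after {num_attempts} attempts")
    return answers


if __name__ == "__main__":
    print(generate_with_retries(num_prompts=8))
```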
@@ -341,153 +437,238 @@ class LiteLLMModel(BenchmarkModule):
         max_items_messages = ["'maxItems' is not permitted."]
         no_json_schema_messages = ["Property keys should match pattern"]

[removed lines 344-405 of the old file: content not captured in the extracted diff]
-            )
-            generation_kwargs["response_format"] = pydantic_class
-        elif any(
-            msg.lower() in str(e).lower() for msg in no_json_schema_messages
-        ):
-            log_once(
-                f"The model {self.model_config.model_id!r} does not support "
-                "JSON schemas, so using the vanilla JSON format.",
-                level=logging.DEBUG,
-            )
-            generation_kwargs["response_format"] = dict(type="json_object")
-        elif isinstance(e, RateLimitError):
-            raise InvalidModel(
-                "You have encountered your rate limit for model "
-                f"{self.model_config.model_id!r}. Skipping."
-            )
-        else:
-            raise InvalidBenchmark(
-                f"Failed to generate text. The error message was: {e}"
-            )
-    except APIError as e:
-        raise InvalidBenchmark(
-            f"Failed to generate text. The error message was: {e}"
-        )
-    except (
+        if any(msg.lower() in error_msg for msg in stop_messages):
+            log_once(
+                f"The model {model_id!r} does not support "
+                "stop sequences, so disabling them.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["stop"] = None
+            return
+        elif (
+            any(msg.lower() in error_msg for msg in logprobs_messages)
+            # Special case for Vertex AI models, since they have strict rate
+            # limits on using logprobs. They also have a cap of 5 logprobs, but
+            # we ignore this since the rate limiting makes it unusable anyway.
+            or (isinstance(error, VertexAIError) and "logprobs" in error_msg)
+        ):
+            log_once(
+                f"The model {model_id!r} does not support logprobs, so disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs.pop("logprobs")
+            generation_kwargs.pop("top_logprobs")
+            return
+        elif any(msg.lower() in error_msg for msg in temperature_messages):
+            log_once(
+                f"The model {model_id!r} does not support "
+                "temperature, so disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs.pop("temperature")
+            return
+        elif any(msg.lower() in error_msg for msg in temperature_must_be_one_messages):
+            log_once(
+                f"The model {model_id!r} requires "
+                "temperature to be set to 1, so setting it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["temperature"] = 1.0
+            return
+        elif any(msg.lower() in error_msg for msg in max_items_messages):
+            log_once(
+                f"The model {model_id!r} does not support "
+                "maxItems in the JSON schema, so disabling it.",
+                level=logging.DEBUG,
+            )
+            ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+            keys_and_their_types = {
+                tag_name: (list[str], ...) for tag_name in ner_tag_names
+            }
+            pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+            generation_kwargs["response_format"] = pydantic_class
+            return
+        elif any(msg.lower() in error_msg for msg in no_json_schema_messages):
+            log_once(
+                f"The model {self.model_config.model_id!r} does not support "
+                "JSON schemas, so using the vanilla JSON format.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["response_format"] = dict(type="json_object")
+            return
+        elif isinstance(
+            error,
+            (
                 APIConnectionError,
                 Timeout,
                 ServiceUnavailableError,
                 InternalServerError,
                 SystemError,
-    )
-
-
-
+            ),
+        ):
+            logger.debug(
+                f"Service temporarily unavailable. The error message was: {error}. "
+                f"Retrying in 5 seconds..."
+            )
+            sleep(5)
+            return
+
+        if isinstance(error, RateLimitError):
+            raise InvalidModel(
+                f"You have encountered your rate limit for model {model_id!r}. "
+                "Skipping."
+            )
+
+        if isinstance(error, AuthenticationError):
+            raise NeedsAdditionalArgument(
+                cli_argument="--api-key",
+                script_argument="api_key=<your-api-key>",
+                run_with_cli=self.benchmark_config.run_with_cli,
+            )
+
+        raise InvalidBenchmark(
+            f"Failed to generate text. The error message was: {error}"
+        )
+
+    async def _generate_async(
+        self,
+        messages: list[dict[str, t.Any]],
+        generation_kwargs: dict[str, t.Any],
+        max_retries: int,
+        max_reruns: int,
+    ) -> tuple[list[ModelResponse], list[tuple[int, Exception]]]:
+        """Generate outputs from the model asynchronously.
+
+        Args:
+            messages:
+                The messages to pass to the model.
+            generation_kwargs:
+                The generation kwargs to pass to the model.
+            max_retries:
+                The maximum number of retries to make.
+            max_reruns:
+                The maximum number of reruns to make.
+
+        Returns:
+            A tuple containing the successful responses and the failed responses.
+        """
+        success = []
+        all_failures = {}
+        to_run = list(enumerate(messages))
+        prev_fail_count = len(to_run)
+        rerun_count = 0
+
+        while to_run and rerun_count < max_reruns and prev_fail_count > 0:
+            requests = [
+                litellm.acompletion(
+                    messages=msg, max_retries=max_retries, **generation_kwargs
                 )
[removed lines 441-446 of the old file: content not captured in the extracted diff]
+                for _, msg in to_run
+            ]
+            wrapped_requests = [
+                catch_coroutine_exception(request) for request in requests
+            ]
+            responses = await tqdm_async.gather(*wrapped_requests, leave=False)
+
+            next_to_run = []
+            current_fail_count = 0
+
+            for (orig_idx, _), response in zip(to_run, responses):
+                if isinstance(response, Exception):
+                    current_fail_count += 1
+                    all_failures[orig_idx] = response
+                    next_to_run.append((orig_idx, messages[orig_idx]))
+                else:
+                    success.append(response)
+
+            if current_fail_count >= prev_fail_count:
+                logger.warning(
+                    "Retry loop aborting due to no progress: "
+                    f"current_fail_count={current_fail_count}, "
+                    f"prev_fail_count={prev_fail_count}"
                 )
[removed lines 448-451 of the old file: content not captured in the extracted diff]
+                break
+
+            prev_fail_count = current_fail_count
+            to_run = next_to_run
+            rerun_count += 1
+
+        failures = [(orig_idx, all_failures[orig_idx]) for orig_idx, _ in to_run]
+        return success, failures

[removed lines 453-456 of the old file: content not captured in the extracted diff]
+    @staticmethod
+    def _create_model_output(
+        model_responses: list[ModelResponse], model_id: str
+    ) -> GenerativeModelOutput:
+        """Create a GenerativeModelOutput object from a list of ModelResponse objects.
+
+        Args:
+            model_responses:
+                The list of ModelResponse objects to create the GenerativeModelOutput
+                object from.
+            model_id:
+                The ID of the model.
+
+        Returns:
+            A GenerativeModelOutput object.
+        """
+        sequences = []
+        scores = []
+        for model_response in model_responses:
+            if not model_response.choices:
+                # This happens for reasoning models, when they don't finish thinking
+                # and run out of tokens. Happens quite rarely, but we need to handle it.
+                logger.warning(
+                    f"The model {model_id!r} did not end up "
+                    "generating any text. This is likely because the model ran "
+                    "out of tokens while reasoning. Returning an empty string."
+                )
+                continue
+
+            model_response_choices = model_response.choices[0]
+            assert isinstance(model_response_choices, litellm.Choices)
+            generated_message: litellm.Message = model_response_choices.message
+            generation_output = generated_message.content or ""
+            generation_output = generation_output.strip()
+
+            # Structure the model output as a GenerativeModelOutput object
+            sequences.append(generation_output)
+            if hasattr(model_response_choices, "logprobs"):
+                logprobs_obj = model_response_choices.logprobs
+                if isinstance(logprobs_obj, ChoiceLogprobs):
+                    logprobs_list: list[list[tuple[str, float]]] = [
+                        [
+                            (top_logprob.token, top_logprob.logprob)
+                            for top_logprob in content.top_logprobs
+                        ]
+                        for content in model_response_choices.logprobs.content or list()
+                    ]
+                    scores.append(logprobs_list)
+                else:
+                    log_once(
+                        "The logprobs object is malformed, so we won't use logprobs to "
+                        "determine the labels.",
+                        level=logging.WARNING,
+                    )
+
+        if not sequences:
             logger.warning(
-
-                "
-                "
+                "No sequences were generated by the model "
+                f"{model_id!r}. This may be due to the "
+                "model running out of tokens or an issue with the input data. "
+                "Returning an empty GenerativeModelOutput."
             )
-            return GenerativeModelOutput(sequences=[
-
-        model_response_choices = model_response.choices[0]
-        assert isinstance(model_response_choices, litellm.Choices)
-        generated_message: litellm.Message = model_response_choices.message
-        generation_output = generated_message.content or ""
-        generation_output = generation_output.strip()
-
-        # Structure the model output as a GenerativeModelOutput object
-        model_output = GenerativeModelOutput(sequences=[generation_output])
-        if hasattr(model_response_choices, "logprobs"):
-            logprobs_obj = model_response_choices.logprobs
-            if isinstance(logprobs_obj, ChoiceLogprobs):
-                logprobs_list: list[list[tuple[str, float]]] = [
-                    [
-                        (top_logprob.token, top_logprob.logprob)
-                        for top_logprob in content.top_logprobs
-                    ]
-                    for content in model_response_choices.logprobs.content or list()
-                ]
-                model_output.scores = [logprobs_list]
-            else:
-                log_once(
-                    "The logprobs object is malformed, so we won't use logprobs to "
-                    "determine the labels.",
-                    level=logging.WARNING,
-                )
+            return GenerativeModelOutput(sequences=[], scores=None)

-
+        if scores and len(sequences) != len(scores):
+            raise InvalidBenchmark(
+                "Sequences and scores must have the same length. "
+                f"Got {len(sequences)} sequences and {len(scores)} scores."
+            )
+
+        return GenerativeModelOutput(
+            sequences=sequences, scores=scores if scores else None
+        )

     @cached_property
     def num_params(self) -> int:
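As a reading aid for the `scores` handling above: each element of `scores` corresponds to one generated sequence and holds, for every generated token position, the list of (token, logprob) candidates. A small hedged illustration of that nesting, with made-up values:

```python
# Hypothetical illustration of the nesting used for `scores` above:
# scores[sequence_index][token_position] == [(candidate_token, logprob), ...]
scores: list[list[list[tuple[str, float]]]] = [
    [  # logprobs for the first (and only) generated sequence
        [("pos", -0.11), ("neg", -2.30)],  # candidates at token position 0
        [("itive", -0.05), ("itively", -3.10)],  # candidates at token position 1
    ]
]
sequences = ["positive"]
assert len(sequences) == len(scores)  # the invariant checked before returning
```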
@@ -1007,6 +1188,10 @@ def try_download_ollama_model(model_id: str) -> bool:

    Returns:
        Whether the model was downloaded successfully.
+
+    Raises:
+        InvalidModel:
+            If Ollama is not running or the model cannot be downloaded.
    """
    if not (model_id.startswith("ollama/") or model_id.startswith("ollama_chat/")):
        return False
@@ -1021,11 +1206,17 @@ def try_download_ollama_model(model_id: str) -> bool:
            level=logging.WARNING,
        )

-
-
-
-
-
+    try:
+        downloaded_ollama_models: list[str] = [
+            model_obj.model
+            for model_obj in ollama.list().models
+            if model_obj.model is not None
+        ]
+    except ConnectionError:
+        raise InvalidModel(
+            "Ollama does not seem to be running, so we cannot evaluate the model "
+            f"{model_id!r}. Please make sure that Ollama is running and try again."
+        )

    ollama_model_id = "/".join(model_id.split("/")[1:])
    if ollama_model_id not in downloaded_ollama_models:
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -797,7 +797,7 @@ def load_model_and_tokenizer(
            enable_lora=model_config.adapter_base_model_id is not None,
            max_lora_rank=256,
        )
-    except (ValueError, OSError) as e:
+    except (RuntimeError, ValueError, OSError) as e:
        if "awaiting a review from the repo authors" in str(e):
            raise InvalidModel(
                f"The model {model_id!r} is awaiting a review from the repository "
euroeval/benchmarker.py
CHANGED

@@ -11,6 +11,7 @@ from pathlib import Path
 from shutil import rmtree
 from time import sleep

+from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
 from torch.distributed import destroy_process_group

 from .benchmark_config_factory import build_benchmark_config

@@ -27,7 +28,7 @@ from .model_loading import load_model
 from .scores import log_scores
 from .speed_benchmark import benchmark_speed
 from .tasks import SPEED
-from .utils import enforce_reproducibility
+from .utils import enforce_reproducibility, get_package_version

 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
@@ -164,6 +165,15 @@ class Benchmarker:
        if task is not None and dataset is not None:
            raise ValueError("Only one of `task` and `dataset` can be specified.")

+        # Bail early if hf_transfer is enabled but not installed.
+        if HF_HUB_ENABLE_HF_TRANSFER and get_package_version("hf_transfer") is None:
+            raise ImportError(
+                "Fast download using 'hf_transfer' is enabled "
+                "(HF_HUB_ENABLE_HF_TRANSFER=1) but the 'hf_transfer' "
+                "package is not available in your environment. "
+                "Try installing it with `pip install hf_transfer`."
+            )
+
        self.benchmark_config_default_params = BenchmarkConfigParams(
            progress_bar=progress_bar,
            save_results=save_results,
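The same fail-fast idea can be expressed generically: if an environment flag promises an optional acceleration package, check for it up front instead of failing mid-download. A hedged sketch follows; the flag and package names mirror the diff, while `package_version` and `check_hf_transfer` are simplified, hypothetical stand-ins for EuroEval's own helpers.

```python
import importlib.metadata
import os


def package_version(name: str) -> str | None:
    """Simplified stand-in for euroeval.utils.get_package_version."""
    try:
        return importlib.metadata.version(name)
    except importlib.metadata.PackageNotFoundError:
        return None


def check_hf_transfer() -> None:
    # HF_HUB_ENABLE_HF_TRANSFER=1 tells huggingface_hub to use the hf_transfer
    # downloader; without the package installed, downloads would fail later.
    if os.environ.get("HF_HUB_ENABLE_HF_TRANSFER") == "1" and package_version("hf_transfer") is None:
        raise ImportError(
            "HF_HUB_ENABLE_HF_TRANSFER is set but 'hf_transfer' is not installed; "
            "install it with `pip install hf_transfer` or unset the variable."
        )
```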
@@ -372,15 +382,7 @@ class Benchmarker:

        current_benchmark_results: list[BenchmarkResult] = list()
        for model_id in model_ids:
-
-                model_config = get_model_config(
-                    model_id=model_id, benchmark_config=benchmark_config
-                )
-            except InvalidModel as e:
-                logger.info(e.message)
-                num_finished_benchmarks += len(dataset_configs)
-                continue
-
+            model_config: ModelConfig | None = None
            loaded_model: BenchmarkModule | None = None
            for dataset_config in dataset_configs:
                # Skip if we have already benchmarked this model on this dataset and
@@ -394,12 +396,22 @@ class Benchmarker:
                ):
                    logger.debug(
                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it "
-                        "
+                        f"{dataset_config.pretty_name}, as it has already been "
+                        "benchmarked."
                    )
                    num_finished_benchmarks += 1
                    continue

+                if model_config is None:
+                    try:
+                        model_config = get_model_config(
+                            model_id=model_id, benchmark_config=benchmark_config
+                        )
+                    except InvalidModel as e:
+                        logger.info(e.message)
+                        num_finished_benchmarks += len(dataset_configs)
+                        continue
+
                # Skip if the model is an encoder model and the task is generative
                task_is_generative = (
                    dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
euroeval/dataset_configs/__init__.py
CHANGED

@@ -7,6 +7,7 @@ from .danish import * # noqa: F403
 from .dutch import * # noqa: F403
 from .english import * # noqa: F403
 from .faroese import * # noqa: F403
+from .finnish import * # noqa: F403
 from .french import * # noqa: F403
 from .german import * # noqa: F403
 from .icelandic import * # noqa: F403
euroeval/dataset_configs/finnish.py
CHANGED

@@ -2,7 +2,7 @@

 from ..data_models import DatasetConfig
 from ..languages import FI
-from ..tasks import COMMON_SENSE, LA, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, LA, MCRC, NER, RC, SENT, SUMM

 ### Official datasets ###

@@ -58,3 +58,13 @@ SCALA_FI_CONFIG = DatasetConfig(
 )

 ### Unofficial datasets ###
+
+BELEBELE_FI_CONFIG = DatasetConfig(
+    name="belebele-fi",
+    pretty_name="the Finnish multiple choice reading comprehension dataset "
+    "BeleBele-fi, translated from the English BeleBele dataset",
+    huggingface_id="EuroEval/belebele-fi-mini",
+    task=MCRC,
+    languages=[FI],
+    unofficial=True,
+)
euroeval/dataset_configs/italian.py
CHANGED

@@ -2,7 +2,7 @@

 from ..data_models import DatasetConfig
 from ..languages import IT
-from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM

 ### Official datasets ###

@@ -79,3 +79,13 @@ WIKINEURAL_IT_CONFIG = DatasetConfig(
     languages=[IT],
     unofficial=True,
 )
+
+BELEBELE_IT_CONFIG = DatasetConfig(
+    name="belebele-it",
+    pretty_name="the Italian multiple choice reading comprehension dataset "
+    "BeleBele-it, translated from the English BeleBele dataset",
+    huggingface_id="EuroEval/belebele-it-mini",
+    task=MCRC,
+    languages=[IT],
+    unofficial=True,
+)
euroeval/dataset_configs/spanish.py
CHANGED

@@ -2,7 +2,7 @@

 from ..data_models import DatasetConfig
 from ..languages import ES
-from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM

 ### Official datasets ###

@@ -76,3 +76,13 @@ XQUAD_ES_CONFIG = DatasetConfig(
     languages=[ES],
     unofficial=True,
 )
+
+BELEBELE_ES_CONFIG = DatasetConfig(
+    name="belebele-es",
+    pretty_name="the Spanish multiple choice reading comprehension dataset "
+    "BeleBele-es, translated from the English BeleBele dataset",
+    huggingface_id="EuroEval/belebele-es-mini",
+    task=MCRC,
+    languages=[ES],
+    unofficial=True,
+)
euroeval/finetuning.py
CHANGED

@@ -103,7 +103,6 @@ def finetune(
        itr_scores = finetune_single_iteration(
            model=model if model_already_initialized else None,
            dataset=datasets[idx],
-            iteration_idx=idx,
            training_args=training_args,
            model_config=model_config,
            dataset_config=dataset_config,
@@ -158,7 +157,6 @@ def finetune(
 def finetune_single_iteration(
     model: BenchmarkModule | None,
     dataset: DatasetDict,
-    iteration_idx: int,
     training_args: TrainingArguments,
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
@@ -171,8 +169,6 @@ def finetune_single_iteration(
            The model to use in the benchmark. If None then a new model will be loaded.
        dataset:
            The dataset to use for training and evaluation.
-        iteration_idx:
-            The index of the iteration.
        training_args:
            The training arguments.
        model_config:
@@ -213,41 +209,42 @@ def finetune_single_iteration(

    trainer.log = no_logging

-    # Re-block terminal output, as it gets unblocked by the `transformers`
-    #
+    # Re-block terminal output, as it gets unblocked by the `transformers` package
+    # before training
    block_terminal_output()

-    # Sort out callbacks. We remove the callbacks that are producing unnecessary
-    #
+    # Sort out callbacks. We remove the callbacks that are producing unnecessary output,
+    # to avoid cluttering the terminal output
    if not benchmark_config.verbose:
        trainer.remove_callback(PrinterCallback)
        trainer.remove_callback(ProgressCallback)
    if benchmark_config.progress_bar:
        trainer.add_callback(NeverLeaveProgressCallback)

[removed lines 228-250 of the old file: content not captured in the extracted diff]
+    # Train the model
+    trainer.train()
+
+    # Evaluate the model
+    with torch.inference_mode():
+        try:
+            test_scores = trainer.evaluate(
+                eval_dataset=dataset["test"],
+                orig_eval_dataset=dataset["original_test"],
+                metric_key_prefix="test",
+            )
+        except TypeError:
+            test_scores = trainer.evaluate(
+                eval_dataset=dataset["test"], metric_key_prefix="test"
+            )
+        except NaNValueInModelOutput as e:
+            del trainer
+            del model
+            clear_memory()
+            raise e
+        except (RuntimeError, ValueError, IndexError) as e:
+            raise InvalidBenchmark(str(e))
+
+    return test_scores


 def get_training_args(
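The try/except TypeError pattern above is a general way to pass an optional keyword argument to an API that may or may not accept it. A minimal sketch of the idea, using a hypothetical evaluate function in place of the Trainer:

```python
from typing import Any


def evaluate(eval_dataset: list[int], metric_key_prefix: str = "test", **kwargs: Any) -> dict[str, float]:
    """Hypothetical stand-in for trainer.evaluate; it may or may not accept extra kwargs."""
    if kwargs:
        # Simulate an evaluate() implementation that rejects unknown keyword arguments.
        raise TypeError(f"unexpected keyword arguments: {sorted(kwargs)}")
    return {f"{metric_key_prefix}_accuracy": 1.0}


def evaluate_with_optional_kwarg(dataset: list[int], original_dataset: list[int]) -> dict[str, float]:
    try:
        # Preferred call: some custom trainers also accept the original, untokenised split.
        return evaluate(eval_dataset=dataset, orig_eval_dataset=original_dataset, metric_key_prefix="test")
    except TypeError:
        # A plain evaluate() does not know orig_eval_dataset, so retry without it.
        return evaluate(eval_dataset=dataset, metric_key_prefix="test")


print(evaluate_with_optional_kwarg([1, 2, 3], [4, 5, 6]))
```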
@@ -300,6 +297,7 @@ def get_training_args(
        save_total_limit=1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
+        eval_accumulation_steps=32,
        optim=OptimizerNames.ADAMW_TORCH,
        learning_rate=2e-5,
        warmup_ratio=0.01,
euroeval/languages.py
CHANGED

@@ -21,6 +21,7 @@ def get_all_languages() -> dict[str, Language]:
 DA = Language(code="da", name="Danish", _and_separator="og", _or_separator="eller")
 NL = Language(code="nl", name="Dutch", _and_separator="en", _or_separator="of")
 EN = Language(code="en", name="English", _and_separator="and", _or_separator="or")
+FI = Language(code="fi", name="Finnish", _and_separator="ja", _or_separator="tai")
 FO = Language(code="fo", name="Faroese", _and_separator="og", _or_separator="ella")
 FR = Language(code="fr", name="French", _and_separator="et", _or_separator="ou")
 DE = Language(code="de", name="German", _and_separator="und", _or_separator="oder")
@@ -78,7 +79,6 @@ EO = Language(code="eo", name="Esperanto")
 ET = Language(code="et", name="Estonian")
 EE = Language(code="ee", name="Ewe")
 FJ = Language(code="fj", name="Fijian")
-FI = Language(code="fi", name="Finnish")
 FY = Language(code="fy", name="Western Frisian")
 FF = Language(code="ff", name="Fulah")
 GD = Language(code="gd", name="Gaelic")
euroeval/task_group_utils/sequence_classification.py
CHANGED

@@ -132,6 +132,11 @@ def extract_labels_from_generation(
        The predicted labels.
    """
    if model_output.scores is not None:
+        if first_label_token_mapping is False:
+            raise InvalidBenchmark(
+                "The model outputted logprobs, but the first label token mapping is "
+                "not provided. This means that the model should not output logprobs."
+            )
        labels = get_closest_logprobs_labels(
            generation_logprobs=model_output.scores,
            dataset_config=dataset_config,
@@ -147,7 +152,7 @@ def extract_labels_from_generation(
 def get_closest_logprobs_labels(
     generation_logprobs: list[list[list[tuple[str, float]]]],
     dataset_config: "DatasetConfig",
-    first_label_token_mapping: dict[str, str] |
+    first_label_token_mapping: dict[str, str] | t.Literal[True],
 ) -> list[str] | None:
     """Get the labels with the highest predicted logprob value.

@@ -164,8 +169,7 @@ def get_closest_logprobs_labels(
            The configuration of the dataset.
        first_label_token_mapping:
            A mapping from labels to the first token in each label, or alternatively a
-
-            mapping is outputted then the model will always output scores).
+            `True` value indicating that the model should output logprobs.

    Returns:
        The predicted labels, or None if labels could not be extracted.
@@ -195,7 +199,9 @@ def get_closest_logprobs_labels(
        # label, as the output label
        output_label: str | None = None
        for generated_label in generated_labels:
-            # Get the candidate labels
+            # Get the candidate labels. If we have a first label token mapping, we
+            # use it to get the candidate labels. Otherwise, we check if any of the
+            # labels start with the generated label.
            if isinstance(first_label_token_mapping, dict):
                if any(
                    candidate_label not in first_label_token_mapping
@@ -239,14 +245,43 @@ def get_closest_logprobs_labels(
                    )
                    return None

-            # If no candidate label is found, we
-            #
-            #
+            # If no candidate label is found, we first check if any of the labels
+            # start with the generated label. This could be the case if the labels
+            # in the first token mapping is inaccurate or incomplete, for instance
+            # if 'pos' is in the first label token mapping, but the model outputted
+            # 'posit'. If this is the case then we cannot trust the first label
+            # token mapping, and we fall back to using word edit distance.
+            # Otherwise, the generated label is just bad, and we skip to the next
+            # generated label.
            elif len(candidate_output_labels) == 0:
-
-
-
-
+                candidate_output_labels_starting_with_generated_label = [
+                    candidate_label
+                    for candidate_label in candidate_labels
+                    if candidate_label.startswith(generated_label)
+                ]
+                if candidate_output_labels_starting_with_generated_label:
+                    log_once(
+                        f"No candidate label found for the generated label "
+                        f"{generated_label!r}. This means that using logprobs to "
+                        "extract the labels is not reliable, and we will instead "
+                        "fall back to extracting the labels using word edit "
+                        "distance.",
+                        level=logging.DEBUG,
+                    )
+                    return None
+
+        # If we did not find any candidate label for any of the generated labels, we
+        # assume that something is wrong with the model output, and we fall back to
+        # using word edit distance to extract the labels
+        else:
+            log_once(
+                f"No candidate label found for any of the generated labels "
+                f"{generated_labels}. This means that using logprobs to extract "
+                "the labels is not reliable, and we will instead fall back to "
+                "extracting the labels using word edit distance.",
+                level=logging.DEBUG,
+            )
+            return None

        if output_label is not None:
            output_labels.append(output_label)
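To make the fallback above concrete: label selection via logprobs only works when the generated first token can be traced back to exactly one candidate label. When it cannot (for instance the model emits 'posit' while the mapping only knows 'pos'), the safe move is to return None and let word-edit-distance matching take over. A hedged, self-contained sketch of that decision; `pick_label_from_logprobs` is a hypothetical helper, not EuroEval's actual function:

```python
def pick_label_from_logprobs(
    top_tokens: list[str],
    first_label_token_mapping: dict[str, str],
    candidate_labels: list[str],
) -> str | None:
    """Return the matched label, or None to signal 'fall back to edit distance'."""
    for generated in top_tokens:
        # Exact hit on the first-token mapping: trust the logprobs.
        matches = [
            label
            for label, first_token in first_label_token_mapping.items()
            if generated == first_token and label in candidate_labels
        ]
        if len(matches) == 1:
            return matches[0]
        # The generated token extends a known label ('posit' vs 'pos'): the
        # mapping is too coarse to be trusted, so give up on logprobs.
        if any(label.startswith(generated) for label in candidate_labels):
            return None
    return None


print(pick_label_from_logprobs(["pos"], {"positive": "pos", "negative": "neg"}, ["positive", "negative"]))
print(pick_label_from_logprobs(["posit"], {"positive": "pos", "negative": "neg"}, ["positive", "negative"]))
```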
euroeval/tokenization_utils.py
CHANGED

@@ -169,7 +169,7 @@ def get_bos_token(tokenizer: "PreTrainedTokenizer") -> tuple[str, int]:

    vocab: dict[str, int] = tokenizer.get_vocab()

-    candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "[CLS]"]
+    candidate_bos_tokens = ["<s>", "<|begin_of_text|>", "<|startoftext|>", "[CLS]"]
    for candidate_bos_token in candidate_bos_tokens:
        if candidate_bos_token in vocab:
            bos_token = candidate_bos_token
@@ -200,7 +200,7 @@ def get_eos_token(tokenizer: "PreTrainedTokenizer") -> tuple[str, int]:

    vocab: dict[str, int] = tokenizer.get_vocab()

-    candidate_eos_tokens = ["</s>", "<|end_of_text|>", "[SEP]"]
+    candidate_eos_tokens = ["</s>", "<|end_of_text|>", "<|endoftext|>", "[SEP]"]
    for candidate_eos_token in candidate_eos_tokens:
        if candidate_eos_token in vocab:
            eos_token = candidate_eos_token
@@ -311,24 +311,60 @@ def get_first_label_token_mapping(
        for label in dataset_config.labels
    ]

-    #
-
-
+    # Tokenize some text containing each label, which we will use to extract the
+    # first token of each label
+    all_tokens: list[list[str]]
+    if tokenizer.chat_template is None:
+        add_prefix_space = should_prefix_space_be_added_to_labels(
            labels_to_be_generated=local_labels, tokenizer=tokenizer
        )
[removed lines 319-329 of the old file: content not captured in the extracted diff]
+        all_tokens = [
+            tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)
+            for label in local_labels
+        ]
+    else:
+        all_tokens = [
+            tokenizer.convert_ids_to_tokens(
+                ids=tokenizer.apply_chat_template(
+                    conversation=[
+                        dict(role="user", content=""),
+                        dict(role="assistant", content=label),
+                    ],
+                    add_generation_prompt=True,
+                    tokenize=True,
+                )
+            )
+            for label in local_labels
+        ]
+
+    # Remove any non-alphabetic characters from the tokens
+    all_tokens = [
+        [
+            re.sub(
+                pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
+                repl="",
+                string=token.lower(),
+            )
+            for token in token_list
+        ]
+        for token_list in all_tokens
    ]

+    # Extract the first token of each label
+    first_tokens: list[str] = list()
+    for token_list, label in zip(all_tokens, local_labels):
+        matching_tokens = [
+            tok for tok in token_list if tok and label.startswith(tok)
+        ]
+        if not matching_tokens:
+            log_once(
+                f"No matching token found in token_list for label '{label}', so "
+                "we will not output scores.",
+                level=logging.DEBUG,
+            )
+            return False
+        first_tokens.append(matching_tokens[0])
+
    # Build a mapping from labels to the first token in each label if the first
    # tokens are distinct
    if len(first_tokens) == len(set(first_tokens)):
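A hedged sketch of the core idea behind get_first_label_token_mapping: tokenize each label, strip non-alphabetic noise, take the first sub-token, and only keep the mapping if those first tokens are pairwise distinct. The toy `tokenize` function below is a stand-in for a real Hugging Face tokenizer, and the return convention (dict, True, or False) is an assumption modelled on the diff.

```python
import re


def tokenize(text: str) -> list[str]:
    """Hypothetical tokenizer standing in for tokenizer.tokenize."""
    return re.findall(r"[A-Za-zæøåüöä]+|[^A-Za-zæøåüöä\s]+", text)


def first_label_token_mapping(labels: list[str]) -> dict[str, str] | bool:
    first_tokens: list[str] = []
    for label in labels:
        # Mirror the cleanup step in the diff: strip leading/trailing non-letters,
        # lowercase, and keep only tokens that are actual prefixes of the label.
        cleaned = [re.sub(r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", "", tok.lower()) for tok in tokenize(label)]
        matching = [tok for tok in cleaned if tok and label.startswith(tok)]
        if not matching:
            return False  # cannot trust logprob-based label extraction at all
        first_tokens.append(matching[0])
    if len(first_tokens) == len(set(first_tokens)):
        return dict(zip(labels, first_tokens))
    return True  # assumed convention: request logprobs, but without a usable mapping


print(first_label_token_mapping(["positive", "negative", "neutral"]))
```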
euroeval/utils.py
CHANGED

@@ -1,5 +1,6 @@
 """Utility functions to be used in other scripts."""

+import asyncio
 import gc
 import importlib
 import importlib.metadata
@@ -327,3 +328,43 @@ def get_package_version(package_name: str) -> str | None:
        return importlib.metadata.version(package_name)
    except importlib.metadata.PackageNotFoundError:
        return None
+
+
+T = t.TypeVar("T", bound=object)
+
+
+def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
+    """Run a coroutine, ensuring that the event loop is always closed when we're done.
+
+    Args:
+        coroutine:
+            The coroutine to run.
+
+    Returns:
+        The result of the coroutine.
+    """
+    loop = asyncio.new_event_loop()
+    try:
+        asyncio.set_event_loop(loop)
+        return loop.run_until_complete(coroutine)
+    finally:
+        loop.close()
+        asyncio.set_event_loop(None)
+
+
+async def catch_coroutine_exception(
+    coroutine: t.Coroutine[t.Any, t.Any, T],
+) -> T | Exception:
+    """Run a coroutine, catching any exceptions and returning them.
+
+    Args:
+        coroutine:
+            The coroutine to run.
+
+    Returns:
+        The result of the coroutine, or the exception if it was raised.
+    """
+    try:
+        return await coroutine
+    except Exception as exc:
+        return exc
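A small usage sketch of the two new helpers, showing the intended pairing: wrap each coroutine with catch_coroutine_exception so one failure cannot poison the whole gather, then drive the event loop from synchronous code with safe_run. It assumes EuroEval 15.8.0 is installed so the helpers are importable; the `might_fail` worker coroutine is hypothetical.

```python
import asyncio

from euroeval.utils import catch_coroutine_exception, safe_run


async def might_fail(n: int) -> int:
    """Hypothetical worker coroutine."""
    if n % 3 == 0:
        raise ValueError(f"bad input: {n}")
    await asyncio.sleep(0)
    return n * n


async def gather_all(values: list[int]) -> list[object]:
    # Each wrapped coroutine resolves to either a result or the caught exception.
    return await asyncio.gather(*(catch_coroutine_exception(might_fail(v)) for v in values))


results = safe_run(gather_all([1, 2, 3, 4]))
successes = [r for r in results if not isinstance(r, Exception)]
failures = [r for r in results if isinstance(r, Exception)]
print(successes, failures)
```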
{euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
 euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
-euroeval/benchmark_config_factory.py,sha256=
-euroeval/benchmarker.py,sha256=
+euroeval/benchmark_config_factory.py,sha256=RDYotoLcfNr3xU8Cw-G-Y8wLe6RSlJD1Ok9C97lWfOs,12553
+euroeval/benchmarker.py,sha256=EHoYilZ2Xx0-6_aEBlG84MsZbomJSiHNHc4wKOVVBB8,49199
 euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
 euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
 euroeval/constants.py,sha256=p6kp_R6-Tq5LBvyXyT6Sa6N3SkjEElGS2LSZRBoQaYs,1985

@@ -8,38 +8,38 @@ euroeval/data_loading.py,sha256=L_REtxefte5Ke4xE_Cz01zkfCyKlOYhSqT5ZXXulHPc,3992
 euroeval/data_models.py,sha256=t5FwpGxiSIMe7iKae-tT7usUWki-ILzAFFm7dPJoFsk,22973
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=LRd7HoudupRp5-AX3L0X4hIAWCa6JVx-LViHPg7u7dg,5821
-euroeval/finetuning.py,sha256=
+euroeval/finetuning.py,sha256=uuaUxNQJb7TivPQuI1OYQ_MIKbD-6-7mpkobLKsDefQ,10667
 euroeval/generation.py,sha256=LSsskfLjIJ-c3gQxmr7eiAobPOm-5bU9vnR7uHQ7XmU,10745
 euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
 euroeval/human_evaluation.py,sha256=VGvw1X6Mkdf22r-THSNWXMIqyJP44yh4rW53vq-0huo,27681
-euroeval/languages.py,sha256=
+euroeval/languages.py,sha256=LerXuRBAUYkQL6qSV-F82itAE4EgBGFBtzaGnJJZvOE,8555
 euroeval/model_cache.py,sha256=n39yFpZkudBCVwz1EQpZ-g5BQtlQemQ5nP3IiFKJZHg,8275
 euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
 euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
 euroeval/scores.py,sha256=TovjCZD8wmGrIjA4v5oAQp18P5KVcHvakkByDh0Hstk,3059
 euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
 euroeval/tasks.py,sha256=87gbe__K5KNIb1aBSuwGnMPmZgamJFecNNYmNgMxaVo,7069
-euroeval/tokenization_utils.py,sha256=
+euroeval/tokenization_utils.py,sha256=kghOIZMM3H0P9YDv0VBSNI7drzgJXlkRtMwt3Cgeev8,13907
 euroeval/types.py,sha256=E0JhLfg-ek5pdFcYJbnGRUSodHxkuR3o8XGuIrBcuRM,2485
-euroeval/utils.py,sha256=
+euroeval/utils.py,sha256=e83OnWc0GJn0Tn_vP3tbqh1DAbLy2ky-LnIlTEOKzKU,11410
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
 euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
 euroeval/benchmark_modules/hf.py,sha256=yFApLL4_ia5Kw2iat5RSI8h5RhI4OP04HlzYidlhBCs,44012
-euroeval/benchmark_modules/litellm.py,sha256=
-euroeval/benchmark_modules/vllm.py,sha256=
-euroeval/dataset_configs/__init__.py,sha256=
+euroeval/benchmark_modules/litellm.py,sha256=dd7OqBvWA75zNrsEHtC3cx3rNpNJ-1QOL2arV_CqYG0,48231
+euroeval/benchmark_modules/vllm.py,sha256=DJyla0jr-DVMPPs4RBguxq1Xn5YguvyuAnIlgIOfFaw,39394
+euroeval/dataset_configs/__init__.py,sha256=kWKtlSAOY-olOQL3UtFqL6I3Tki3G3waMZSd2YChjCg,1895
 euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
 euroeval/dataset_configs/dutch.py,sha256=r21nxEvMmBkKqPXVW082batPsxJ9d0RB4DzngOTMJSk,3185
-euroeval/dataset_configs/english.py,sha256
+euroeval/dataset_configs/english.py,sha256=-N85DiNVrZFqpahNUTfxaWy4vvdOWC8Bi0G4uAO4uDw,2326
 euroeval/dataset_configs/faroese.py,sha256=QQgLe5gv0f3AtXe5rV65xZ98gFgyITQPDr3UwO4Bnv4,1350
-euroeval/dataset_configs/finnish.py,sha256=
+euroeval/dataset_configs/finnish.py,sha256=_8YWIlZNpO8Qi233bH7cKwm3tq3WETLfC_6mzg7LLog,2045
 euroeval/dataset_configs/french.py,sha256=ATsj8_9_GxFTQgmfrniPQFZ1R9hoQCI1_ieWTnscFHU,2382
 euroeval/dataset_configs/german.py,sha256=QO6PrBQY6kyZeQMU1vg6KrC_sKyj9U2ukS9nbKO19is,2560
 euroeval/dataset_configs/icelandic.py,sha256=mncl7X4yO9gBmYqXMBfm7FKU1jcKryerSgd0dqlIA_4,4198
-euroeval/dataset_configs/italian.py,sha256=
+euroeval/dataset_configs/italian.py,sha256=KNjCvTzsEqH_EEk3At8slKqNwWWiIdbv_t5ke7n9nZI,2660
 euroeval/dataset_configs/norwegian.py,sha256=2SD5681gZFa1Ig-AEpnyStbivan_bq_Pada4qwE7tw0,5181
-euroeval/dataset_configs/spanish.py,sha256=
+euroeval/dataset_configs/spanish.py,sha256=NviL-FzJ5jq1bLTRvbtZBiGrAmZjxyijZNpKZFrnT-M,2527
 euroeval/dataset_configs/swedish.py,sha256=SOD2nKQTVwTpTvr362mDPHon42kr9vWs5C0mK02Fh-o,2811
 euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
 euroeval/prompt_templates/linguistic_acceptability.py,sha256=FAIJKS26EVRxlLHk1C3lN0GDtd5AM0MwvaMf-NNIxfU,6677

@@ -51,11 +51,11 @@ euroeval/prompt_templates/summarization.py,sha256=mcWeKNhGWmp7IG_iY64T-VOSabQg5w
 euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_group_utils/multiple_choice_classification.py,sha256=nB78TzOgd0HBvTclmjOYJid9ZVAgu8IHZsqB_n1SAZU,6178
 euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iYvfoGt0EUObSaXRCGmk,27700
-euroeval/task_group_utils/sequence_classification.py,sha256=
+euroeval/task_group_utils/sequence_classification.py,sha256=MCdO5h3v_LWTkrvKAeefPq7rl1H5mFed50nAL4uZq0E,13837
 euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
 euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
+euroeval-15.8.0.dist-info/METADATA,sha256=-GcGBuEnlAPmpT9ItDAmS0psT__jwbVoNkTYOiSeRzA,13669
+euroeval-15.8.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.8.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.8.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.8.0.dist-info/RECORD,,

{euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/WHEEL: file without changes
{euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/entry_points.txt: file without changes
{euroeval-15.7.1.dist-info → euroeval-15.8.0.dist-info}/licenses/LICENSE: file without changes