EuroEval 15.9.2-py3-none-any.whl → 15.10.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



euroeval/benchmark_modules/hf.py CHANGED
@@ -378,7 +378,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
378
378
  tokenizer=self._tokenizer,
379
379
  ),
380
380
  batched=True,
381
- batch_size=1,
381
+ batch_size=10,
382
382
  remove_columns=dataset["train"].column_names,
383
383
  load_from_cache_file=False,
384
384
  keep_in_memory=True,
@@ -389,7 +389,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
389
389
  tokenizer=self._tokenizer,
390
390
  ),
391
391
  batched=True,
392
- batch_size=1,
392
+ batch_size=10,
393
393
  remove_columns=dataset["val"].column_names,
394
394
  load_from_cache_file=False,
395
395
  keep_in_memory=True,
@@ -400,7 +400,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
400
400
  tokenizer=self._tokenizer,
401
401
  ),
402
402
  batched=True,
403
- batch_size=1,
403
+ batch_size=10,
404
404
  remove_columns=dataset["test"].column_names,
405
405
  load_from_cache_file=False,
406
406
  keep_in_memory=True,
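The three hunks above only bump the `batch_size` passed to `datasets.Dataset.map` from 1 to 10 when tokenising the train, validation and test splits. As a rough illustration of what such a call looks like (a minimal sketch with a placeholder tokeniser and column name, not EuroEval's exact code):

```python
# Minimal sketch of a batched tokenisation map, mirroring the change above.
# The checkpoint and column names are placeholders, not EuroEval's.
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_batch(examples: dict) -> dict:
    # With batched=True, examples["text"] is a list of up to `batch_size` strings
    return tokenizer(examples["text"], truncation=True, padding=True)

dataset = Dataset.from_dict({"text": ["first example", "second example"]})
tokenised = dataset.map(
    tokenize_batch,
    batched=True,
    batch_size=10,  # was 1 in 15.9.2
    remove_columns=dataset.column_names,
    load_from_cache_file=False,
    keep_in_memory=True,
)
```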
euroeval/benchmark_modules/litellm.py CHANGED
@@ -1,5 +1,6 @@
1
1
  """Generative models from an inference API, using the LiteLLM framework."""
2
2
 
3
+ import asyncio
3
4
  import collections.abc as c
4
5
  import logging
5
6
  import os
@@ -29,6 +30,7 @@ from litellm.exceptions import (
29
30
  Timeout,
30
31
  )
31
32
  from litellm.llms.vertex_ai.common_utils import VertexAIError
33
+ from litellm.router import Router
32
34
  from litellm.types.utils import ChoiceLogprobs, ModelResponse
33
35
  from pydantic import conlist, create_model
34
36
  from requests.exceptions import RequestException
@@ -68,7 +70,7 @@ from ..task_group_utils import (
68
70
  from ..tokenization_utils import get_first_label_token_mapping
69
71
  from ..types import ExtractLabelsFunction
70
72
  from ..utils import (
71
- catch_coroutine_exception,
73
+ add_semaphore_and_catch_exception,
72
74
  create_model_cache_dir,
73
75
  log_once,
74
76
  safe_run,
@@ -201,6 +203,11 @@ class LiteLLMModel(BenchmarkModule):
201
203
  self.is_ollama = model_config.model_id.startswith(
202
204
  "ollama/"
203
205
  ) or model_config.model_id.startswith("ollama_chat/")
206
+ self._ollama_show: ollama.ShowResponse = (
207
+ ollama.show("/".join(model_config.model_id.split("/")[1:]))
208
+ if self.is_ollama
209
+ else ollama.ShowResponse(model_info=None)
210
+ )
204
211
 
205
212
  raise_if_wrong_params(model_config=model_config, allowed_params=ALLOWED_PARAMS)
206
213
 
@@ -224,7 +231,14 @@ class LiteLLMModel(BenchmarkModule):
224
231
  Returns:
225
232
  The generative type of the model, or None if it has not been set yet.
226
233
  """
227
- if self.model_config.revision == "thinking":
234
+ if self.is_ollama:
235
+ reasoning_model = "thinking" in (self._ollama_show.capabilities or [])
236
+ type_ = (
237
+ GenerativeType.REASONING
238
+ if reasoning_model
239
+ else GenerativeType.INSTRUCTION_TUNED
240
+ )
241
+ elif self.model_config.revision in {"thinking"}:
228
242
  type_ = GenerativeType.REASONING
229
243
  elif re.fullmatch(
230
244
  pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
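The new `generative_type` branch above decides between reasoning and instruction-tuned behaviour for Ollama models by inspecting the capabilities reported by `ollama.show`. A rough sketch of that check, assuming a running local Ollama server and a hypothetical model ID:

```python
# Sketch of the capability check above; requires a running Ollama server and an
# ollama Python SDK that exposes `capabilities` on ShowResponse (>= 0.5).
import ollama

model_id = "ollama_chat/qwen3:4b"  # hypothetical model ID
ollama_name = "/".join(model_id.split("/")[1:])  # strip the "ollama_chat/" prefix

show_response = ollama.show(ollama_name)
is_reasoning = "thinking" in (show_response.capabilities or [])
print("reasoning model" if is_reasoning else "instruction-tuned model")
```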
@@ -251,8 +265,18 @@ class LiteLLMModel(BenchmarkModule):
251
265
  The generated model outputs.
252
266
  """
253
267
  assert "messages" in inputs, "The input must contain a 'messages' key."
254
- messages = inputs["messages"]
268
+ conversations: list[list[litellm.AllMessageValues]] = inputs["messages"]
255
269
 
270
+ # Get the mapping from labels to the first token in the label. We call this each
271
+ # time we generate a new dataset since the dataset config can change
272
+ self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
273
+ dataset_config=self.dataset_config,
274
+ model_config=self.model_config,
275
+ tokenizer=None,
276
+ generative_type=self.generative_type,
277
+ )
278
+
279
+ # Set the core generation arguments
256
280
  generation_kwargs: dict[str, t.Any] = dict(
257
281
  model=self.model_config.model_id,
258
282
  max_completion_tokens=(
@@ -266,33 +290,30 @@ class LiteLLMModel(BenchmarkModule):
266
290
  api_key=self.benchmark_config.api_key,
267
291
  api_base=self.benchmark_config.api_base,
268
292
  api_version=self.benchmark_config.api_version,
293
+ max_retries=3,
269
294
  )
270
295
 
271
- # Get the mapping from labels to the first token in the label. We call this each
272
- # time we generate a new dataset since the dataset config can change
273
- self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
274
- dataset_config=self.dataset_config,
275
- model_config=self.model_config,
276
- tokenizer=None,
277
- generative_type=self.generative_type,
278
- )
279
-
280
- if self.buffer["first_label_token_mapping"]:
281
- generation_kwargs["logprobs"] = True
282
- generation_kwargs["top_logprobs"] = MAX_LOGPROBS
283
-
296
+ # Set up the `response_format` generation argument if we are dealing with a task
297
+ # using structured generation
284
298
  if self.dataset_config.task in TASKS_USING_JSON:
285
- for msg_list in messages:
286
- # msg_list is a list of {'role':…, 'content':…} dicts
287
- if not msg_list:
299
+ # Sanity check that "JSON" is included in the prompt, as some models require
300
+ # this
301
+ for conversation in conversations:
302
+ if not conversation:
288
303
  raise InvalidBenchmark(
289
- "Encountered an empty message list in 'messages'."
304
+ "Encountered an empty conversation in 'messages'."
290
305
  )
291
- last = msg_list[-1]
292
- assert isinstance(last, dict), (
293
- f"Expected dict message, got {type(last)}"
306
+ last_message = conversation[-1]
307
+ assert isinstance(last_message, dict), (
308
+ f"Expected dict message, got {type(last_message)}"
294
309
  )
295
- assert "json" in last["content"].lower(), (
310
+ assert "content" in last_message, (
311
+ "Expected 'content' key in the last message of the conversation."
312
+ )
313
+ assert isinstance(last_message["content"], str), (
314
+ "Expected 'content' to be a string."
315
+ )
316
+ assert "json" in last_message["content"].lower(), (
296
317
  "Prompt must contain 'json' for JSON tasks."
297
318
  )
298
319
 
@@ -328,6 +349,19 @@ class LiteLLMModel(BenchmarkModule):
328
349
  level=logging.DEBUG,
329
350
  )
330
351
 
352
+ # If the model is an Ollama reasoning model, we ensure that thinking is enabled
353
+ if self.is_ollama and self.generative_type == GenerativeType.REASONING:
354
+ generation_kwargs["think"] = True
355
+ log_once(
356
+ "Enabling thinking mode for Ollama model "
357
+ f"{self.model_config.model_id!r}",
358
+ level=logging.DEBUG,
359
+ )
360
+
361
+ # Handle manually set parameters
362
+ if self.buffer["first_label_token_mapping"]:
363
+ generation_kwargs["logprobs"] = True
364
+ generation_kwargs["top_logprobs"] = MAX_LOGPROBS
331
365
  if self.model_config.revision == "thinking":
332
366
  generation_kwargs["thinking"] = dict(
333
367
  type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
@@ -344,66 +378,67 @@ class LiteLLMModel(BenchmarkModule):
344
378
  level=logging.DEBUG,
345
379
  )
346
380
 
347
- # This drops generation kwargs that are not supported by the model
381
+ # Drop generation kwargs that are not supported by the model
348
382
  litellm.drop_params = True
349
383
 
350
- # Extract the generated sequences from the model response. Some APIs cannot
351
- # handle using newlines as stop sequences, so we try both.
352
- num_attempts = 10
353
-
354
- all_responses = {}
355
- all_failures = []
356
- to_run = list(enumerate(messages))
357
-
358
- for attempt in range(num_attempts):
359
- if not to_run:
384
+ all_responses: dict[int, ModelResponse] = {}
385
+ conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
386
+ enumerate(conversations)
387
+ )
388
+ for attempt in range(num_attempts := 10):
389
+ if not conversations_to_run:
360
390
  break
361
391
 
362
- batch_indices, batch_msgs = zip(*to_run)
363
- model_response, failures = safe_run(
392
+ batch_indices, batch_conversations = zip(*conversations_to_run)
393
+ successes, failures = safe_run(
364
394
  self._generate_async(
365
- messages=list(batch_msgs),
366
- generation_kwargs=generation_kwargs,
367
- max_retries=3,
368
- max_reruns=15,
395
+ model_id=self.model_config.model_id,
396
+ conversations=list(batch_conversations),
397
+ **generation_kwargs,
369
398
  )
370
399
  )
371
400
 
372
- for orig_idx, response in zip(batch_indices, model_response):
401
+ # Store the successful model outputs
402
+ for idx, response in successes:
403
+ orig_idx = batch_indices[idx]
373
404
  all_responses[orig_idx] = response
374
405
 
406
+ # If all requests were successful, break
375
407
  if not failures:
376
- to_run = []
408
+ conversations_to_run = []
377
409
  break
378
410
 
379
- all_failures.extend(failures)
380
- to_run = [(orig_idx, messages[orig_idx]) for orig_idx, _ in failures]
411
+ # Put the failed requests back in the queue to try again
412
+ conversations_to_run = [
413
+ (batch_indices[idx], conversations[batch_indices[idx]])
414
+ for idx, _ in failures
415
+ ]
381
416
  logger.debug(
382
- f"Attempt {attempt + 1}/{num_attempts}: "
383
- f"retrying {len(to_run)} failed message(s)"
417
+ f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
418
+ f"{len(conversations_to_run):,} failed message(s)"
384
419
  )
385
420
 
421
+ # Attempt to handle the exceptions, to improve the chance of getting
422
+ # successful generations next time around
386
423
  for _, error in failures:
387
424
  self._handle_exception(error=error, generation_kwargs=generation_kwargs)
388
- else:
389
- raise InvalidBenchmark(
390
- message=f"Failed to generate text, after {num_attempts} attempts."
391
- )
392
425
 
393
- if to_run:
426
+ # Sleep for a second to avoid pinging the API server too quickly
427
+ sleep(1)
428
+ else:
394
429
  raise InvalidBenchmark(
395
- f"Failed to generate text after {num_attempts} attempts. "
396
- f"Errors: {all_failures}"
430
+ message=f"Failed to generate text, after {num_attempts:,} attempts."
397
431
  )
398
432
 
399
- ordered_responses = [all_responses[i] for i in range(len(messages))]
433
+ # Extract the generations from the model output
434
+ ordered_responses = [all_responses[i] for i in range(len(conversations))]
400
435
  model_output = self._create_model_output(
401
436
  model_responses=ordered_responses, model_id=self.model_config.model_id
402
437
  )
403
438
 
404
- if len(messages) != len(model_output.sequences):
439
+ if len(conversations) != len(model_output.sequences):
405
440
  raise InvalidBenchmark(
406
- f"Number of model inputs ({len(messages):,}) does not match the "
441
+ f"Number of model inputs ({len(conversations):,}) does not match the "
407
442
  f"number of model outputs ({len(model_output.sequences):,})."
408
443
  )
409
444
 
@@ -462,8 +497,8 @@ class LiteLLMModel(BenchmarkModule):
462
497
  f"The model {model_id!r} does not support logprobs, so disabling it.",
463
498
  level=logging.DEBUG,
464
499
  )
465
- generation_kwargs.pop("logprobs")
466
- generation_kwargs.pop("top_logprobs")
500
+ generation_kwargs.pop("logprobs", None)
501
+ generation_kwargs.pop("top_logprobs", None)
467
502
  return
468
503
  elif any(msg.lower() in error_msg for msg in temperature_messages):
469
504
  log_once(
@@ -471,7 +506,7 @@ class LiteLLMModel(BenchmarkModule):
471
506
  "temperature, so disabling it.",
472
507
  level=logging.DEBUG,
473
508
  )
474
- generation_kwargs.pop("temperature")
509
+ generation_kwargs.pop("temperature", None)
475
510
  return
476
511
  elif any(msg.lower() in error_msg for msg in temperature_must_be_one_messages):
477
512
  log_once(
@@ -503,14 +538,7 @@ class LiteLLMModel(BenchmarkModule):
503
538
  generation_kwargs["response_format"] = dict(type="json_object")
504
539
  return
505
540
  elif isinstance(
506
- error,
507
- (
508
- APIConnectionError,
509
- Timeout,
510
- ServiceUnavailableError,
511
- InternalServerError,
512
- SystemError,
513
- ),
541
+ error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
514
542
  ):
515
543
  logger.debug(
516
544
  f"Service temporarily unavailable. The error message was: {error}. "
@@ -518,6 +546,18 @@ class LiteLLMModel(BenchmarkModule):
518
546
  )
519
547
  sleep(5)
520
548
  return
549
+ elif isinstance(error, (APIConnectionError, OSError)):
550
+ # If there are too many I/O connections, we increase the number of allowed
551
+ # file descriptors
552
+ if "too many open files" in error_msg:
553
+ raise InvalidBenchmark(
554
+ "There are too many file descriptors running. See the current "
555
+ "value by running `ulimit -n`. Try increasing it by running "
556
+ "`ulimit -n <new-value>` and try again."
557
+ )
558
+ raise InvalidBenchmark(
559
+ f"Encountered {type(error)} during generation: {error}."
560
+ )
521
561
 
522
562
  if isinstance(error, RateLimitError):
523
563
  raise InvalidModel(
@@ -538,69 +578,66 @@ class LiteLLMModel(BenchmarkModule):
538
578
 
539
579
  async def _generate_async(
540
580
  self,
541
- messages: list[dict[str, t.Any]],
542
- generation_kwargs: dict[str, t.Any],
543
- max_retries: int,
544
- max_reruns: int,
545
- ) -> tuple[list[ModelResponse], list[tuple[int, Exception]]]:
581
+ model_id: str,
582
+ conversations: list[list[litellm.AllMessageValues]],
583
+ **generation_kwargs,
584
+ ) -> tuple[list[tuple[int, ModelResponse]], list[tuple[int, Exception]]]:
546
585
  """Generate outputs from the model asynchronously.
547
586
 
548
587
  Args:
549
- messages:
550
- The messages to pass to the model.
551
- generation_kwargs:
552
- The generation kwargs to pass to the model.
553
- max_retries:
554
- The maximum number of retries to make.
555
- max_reruns:
556
- The maximum number of reruns to make.
588
+ model_id:
589
+ The ID of the model to use for generation.
590
+ conversations:
591
+ The conversations to pass to the model.
592
+ **generation_kwargs:
593
+ Additional generation arguments to pass to the model.
557
594
 
558
595
  Returns:
559
- A tuple containing the successful responses and the failed responses.
596
+ A tuple (successes, failures), each being a list of tuples (idx, content),
597
+ where the `idx` corresponds to the index of `conversations`, and `content`
598
+ is either the model response or an Exception.
560
599
  """
561
- success = []
562
- all_failures = {}
563
- to_run = list(enumerate(messages))
564
- prev_fail_count = len(to_run)
565
- rerun_count = 0
566
-
567
- while to_run and rerun_count < max_reruns and prev_fail_count > 0:
568
- requests = [
569
- litellm.acompletion(
570
- messages=msg, max_retries=max_retries, **generation_kwargs
600
+ # Create a LiteLLM router, which will ensure that we only use a single client
601
+ # for all the requests, preventing "too many open files" errors
602
+ router = Router(
603
+ model_list=[
604
+ dict(
605
+ model_name=self.model_config.model_id,
606
+ litellm_params=generation_kwargs,
571
607
  )
572
- for _, msg in to_run
573
608
  ]
574
- wrapped_requests = [
575
- catch_coroutine_exception(request) for request in requests
576
- ]
577
- responses = await tqdm_async.gather(*wrapped_requests, leave=False)
578
-
579
- next_to_run = []
580
- current_fail_count = 0
609
+ )
581
610
 
582
- for (orig_idx, _), response in zip(to_run, responses):
583
- if isinstance(response, Exception):
584
- current_fail_count += 1
585
- all_failures[orig_idx] = response
586
- next_to_run.append((orig_idx, messages[orig_idx]))
587
- else:
588
- success.append(response)
611
+ # Get the LLM generations asynchronously
612
+ max_concurrent_calls = 20
613
+ semaphore = asyncio.Semaphore(max_concurrent_calls)
614
+ requests = [
615
+ add_semaphore_and_catch_exception(
616
+ router.acompletion(model=model_id, messages=conversation),
617
+ semaphore=semaphore,
618
+ )
619
+ for conversation in conversations
620
+ ]
621
+ responses = await tqdm_async.gather(*requests, leave=False)
589
622
 
590
- if current_fail_count >= prev_fail_count:
591
- logger.warning(
592
- "Retry loop aborting due to no progress: "
593
- f"current_fail_count={current_fail_count}, "
594
- f"prev_fail_count={prev_fail_count}"
595
- )
596
- break
623
+ # Separate the successful responses from the failed ones
624
+ successes = [
625
+ (idx, response)
626
+ for idx, response in enumerate(responses)
627
+ if not isinstance(response, Exception)
628
+ ]
629
+ failures = [
630
+ (idx, response)
631
+ for idx, response in enumerate(responses)
632
+ if isinstance(response, Exception)
633
+ ]
597
634
 
598
- prev_fail_count = current_fail_count
599
- to_run = next_to_run
600
- rerun_count += 1
635
+ # Close connections
636
+ for request in requests:
637
+ if hasattr(request, "close"):
638
+ request.close()
601
639
 
602
- failures = [(orig_idx, all_failures[orig_idx]) for orig_idx, _ in to_run]
603
- return success, failures
640
+ return successes, failures
604
641
 
605
642
  @staticmethod
606
643
  def _create_model_output(
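The router introduced above funnels every request through one shared client instead of opening a new connection per call. A minimal sketch of that setup, with a hypothetical model name:

```python
# Minimal sketch of routing completions through a single litellm Router client.
import asyncio
from litellm.router import Router

router = Router(
    model_list=[
        {
            "model_name": "gpt-4o-mini",  # hypothetical model
            "litellm_params": {"model": "gpt-4o-mini", "max_retries": 3},
        }
    ]
)

async def main() -> None:
    response = await router.acompletion(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": "Hello!"}],
    )
    print(response.choices[0].message.content)

asyncio.run(main())
```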
@@ -690,8 +727,7 @@ class LiteLLMModel(BenchmarkModule):
690
727
  # If it is an Ollama model then we can get the number of parameters from the
691
728
  # Ollama Python SDK
692
729
  if self.is_ollama:
693
- ollama_model_id = "/".join(self.model_config.model_id.split("/")[1:])
694
- model_info = ollama.show(ollama_model_id).modelinfo
730
+ model_info = self._ollama_show.modelinfo
695
731
  if model_info is not None:
696
732
  num_params = model_info.get("general.parameter_count")
697
733
  if num_params is not None:
@@ -819,7 +855,7 @@ class LiteLLMModel(BenchmarkModule):
819
855
  # Python SDK
820
856
  if self.is_ollama:
821
857
  ollama_model_id = "/".join(self.model_config.model_id.split("/")[1:])
822
- model_info = ollama.show(ollama_model_id).modelinfo
858
+ model_info = self._ollama_show.modelinfo
823
859
  if model_info is not None:
824
860
  context_length_keys = [
825
861
  key for key in model_info.keys() if "context_length" in key.lower()
euroeval/benchmark_modules/vllm.py CHANGED
@@ -7,12 +7,10 @@ import json
7
7
  import logging
8
8
  import os
9
9
  import re
10
- import sys
11
10
  import typing as t
12
11
  from functools import partial
13
12
  from pathlib import Path
14
13
  from time import sleep
15
- from types import MethodType
16
14
 
17
15
  import torch
18
16
  from datasets import DatasetDict
@@ -69,6 +67,7 @@ from ..tokenization_utils import (
69
67
  get_end_of_chat_token_ids,
70
68
  get_eos_token,
71
69
  get_first_label_token_mapping,
70
+ get_pad_token,
72
71
  should_prompts_be_stripped,
73
72
  )
74
73
  from ..types import ExtractLabelsFunction
@@ -81,17 +80,12 @@ from ..utils import (
81
80
  from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
82
81
 
83
82
  if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
84
- from vllm import LLM, RequestOutput, SamplingParams
83
+ from vllm import LLM, SamplingParams
85
84
  from vllm.distributed.parallel_state import (
86
85
  destroy_distributed_environment,
87
86
  destroy_model_parallel,
88
87
  )
89
- from vllm.inputs import PromptType
90
88
  from vllm.lora.request import LoRARequest
91
- from vllm.model_executor.guided_decoding.guided_fields import GuidedDecodingRequest
92
- from vllm.pooling_params import PoolingParams
93
- from vllm.prompt_adapter.request import PromptAdapterRequest
94
- from vllm.sampling_params import RequestOutputKind
95
89
 
96
90
  if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
97
91
  from outlines.models.vllm import adapt_tokenizer
@@ -140,6 +134,9 @@ class VLLMModel(HuggingFaceEncoderModel):
140
134
  self.end_of_reasoning_token = get_end_of_reasoning_token(
141
135
  model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
142
136
  )
137
+ self.end_of_chat_token_ids = get_end_of_chat_token_ids(
138
+ tokenizer=self._tokenizer
139
+ )
143
140
  self.custom_stop_tokens = get_custom_stop_tokens(
144
141
  model=self._model,
145
142
  tokenizer=self._tokenizer,
@@ -193,7 +190,10 @@ class VLLMModel(HuggingFaceEncoderModel):
193
190
  return None
194
191
  elif self.end_of_reasoning_token is not None:
195
192
  return GenerativeType.REASONING
196
- elif self._tokenizer.chat_template is not None:
193
+ elif (
194
+ self._tokenizer.chat_template is not None
195
+ or "instruct" in self.model_config.model_id.lower()
196
+ ):
197
197
  return GenerativeType.INSTRUCTION_TUNED
198
198
  else:
199
199
  return GenerativeType.BASE
@@ -303,55 +303,29 @@ class VLLMModel(HuggingFaceEncoderModel):
303
303
  Returns:
304
304
  The generated model outputs.
305
305
  """
306
- # Define which tokens to use as stopping criteria. We want to use the padding
307
- # token, end-of-sentence token, and a double newline if the model isn't
308
- # instruction tuned (since these separate the few-shot examples in the input in
309
- # this case)
306
+ # Get stopping tokens
310
307
  stop_tokens: list[str] = self.custom_stop_tokens.copy()
311
308
  if self.buffer["instruction_model"] is False:
312
309
  stop_tokens.append("\n\n")
313
310
  if self._tokenizer.pad_token_id is not None:
311
+ assert isinstance(self._tokenizer.pad_token, str), (
312
+ f"The pad token for the model {self.model_config.model_id!r} "
313
+ f"is not a string, which is unexpected: {self._tokenizer.pad_token!r}."
314
+ )
314
315
  stop_tokens.append(self._tokenizer.pad_token)
315
316
  if self._tokenizer.eos_token_id is not None:
317
+ assert isinstance(self._tokenizer.eos_token, str), (
318
+ f"The EOS token for the model {self.model_config.model_id!r} "
319
+ f"is not a string, which is unexpected: {self._tokenizer.eos_token!r}."
320
+ )
316
321
  stop_tokens.append(self._tokenizer.eos_token)
317
322
  if self._tokenizer.pad_token_id is None:
318
323
  self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
319
324
  self._tokenizer.pad_token = self._tokenizer.eos_token
320
- if (
321
- self._tokenizer.bos_token_id is not None
322
- and self._tokenizer.pad_token_id is None
323
- ):
324
- self._tokenizer.pad_token_id = self._tokenizer.bos_token_id
325
- self._tokenizer.pad_token = self._tokenizer.bos_token
326
- elif (
327
- self._tokenizer.eos_token_id is not None
328
- and self._tokenizer.pad_token_id is None
329
- ):
330
- self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
331
- self._tokenizer.pad_token = self._tokenizer.eos_token
332
- elif self._tokenizer.pad_token_id is None:
333
- pad_token_candidates = ["<pad>", "[pad]", "<|endoftext|>", "<|im_end|>"]
334
- pad_token_candidates.extend([c.upper() for c in pad_token_candidates])
335
- for candidate in pad_token_candidates:
336
- if candidate in self._tokenizer.get_vocab():
337
- pad_token_id = self._tokenizer.get_vocab()[candidate]
338
- self._tokenizer.pad_token = candidate
339
- self._tokenizer.pad_token_id = pad_token_id
340
- break
341
- else:
342
- raise InvalidModel(
343
- "Could not find a suitable token to use as a padding token, since "
344
- "the model does not have a BOS, EOS, or padding token, and does "
345
- f"not have any of the following tokens in its vocabulary: "
346
- f"{pad_token_candidates}."
347
- )
348
-
349
- assert self._tokenizer.pad_token_id is not None
350
-
351
- # Add end of chat token as a stopping token, if it exists
352
- end_of_chat_token_ids = get_end_of_chat_token_ids(tokenizer=self._tokenizer)
353
- if end_of_chat_token_ids is not None:
354
- end_of_chat_token = self._tokenizer.decode(end_of_chat_token_ids).strip()
325
+ if self.end_of_chat_token_ids is not None:
326
+ end_of_chat_token = self._tokenizer.decode(
327
+ self.end_of_chat_token_ids
328
+ ).strip()
355
329
  if end_of_chat_token:
356
330
  stop_tokens.append(end_of_chat_token)
357
331
 
@@ -438,7 +412,7 @@ class VLLMModel(HuggingFaceEncoderModel):
438
412
  raw_outputs = self._model.generate(
439
413
  prompts=prompts,
440
414
  sampling_params=sampling_params,
441
- use_tqdm=(not input_is_a_test),
415
+ use_tqdm=False if input_is_a_test else get_pbar_without_leave,
442
416
  lora_request=self.buffer.get("lora_request"),
443
417
  )
444
418
  break
@@ -515,16 +489,13 @@ class VLLMModel(HuggingFaceEncoderModel):
515
489
  completion.split(self.end_of_reasoning_token)[-1]
516
490
  for completion in completions
517
491
  ]
518
- if self.custom_stop_tokens:
519
- stop_token_pattern = re.compile(
520
- "|".join(
521
- re.escape(stop_token) for stop_token in self.custom_stop_tokens
522
- )
523
- )
524
- completions = [
525
- re.split(pattern=stop_token_pattern, string=completion)[0]
526
- for completion in completions
527
- ]
492
+ stop_token_pattern = re.compile(
493
+ "|".join(re.escape(stop_token) for stop_token in stop_tokens)
494
+ )
495
+ completions = [
496
+ re.split(pattern=stop_token_pattern, string=completion)[0]
497
+ for completion in completions
498
+ ]
528
499
  completions = [completion.strip() for completion in completions]
529
500
 
530
501
  # Sanity check
@@ -824,10 +795,6 @@ def load_model_and_tokenizer(
824
795
  f"The model {model_id!r} could not be loaded. The error was {e!r}."
825
796
  )
826
797
 
827
- model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
828
- model._validate_and_add_requests = MethodType(
829
- _validate_and_add_requests_with_fixed_progress_bars, model
830
- )
831
798
  model.config = hf_model_config
832
799
 
833
800
  return model, tokenizer
@@ -911,90 +878,11 @@ def load_tokenizer(
911
878
  # Ensure that BOS, EOS and PAD tokens are set
912
879
  tokenizer.bos_token, tokenizer.bos_token_id = get_bos_token(tokenizer=tokenizer)
913
880
  tokenizer.eos_token, tokenizer.eos_token_id = get_eos_token(tokenizer=tokenizer)
914
- if tokenizer.pad_token_id is None:
915
- tokenizer.pad_token = tokenizer.eos_token
881
+ tokenizer.pad_token, tokenizer.pad_token_id = get_pad_token(tokenizer=tokenizer)
916
882
 
917
883
  return tokenizer
918
884
 
919
885
 
920
- def _run_engine_with_fixed_progress_bars(
921
- self: "LLM", use_tqdm: bool
922
- ) -> list["RequestOutput"]:
923
- if use_tqdm:
924
- num_requests = self.llm_engine.get_num_unfinished_requests()
925
- pbar = tqdm(
926
- total=num_requests, leave=False, disable=hasattr(sys, "_called_from_test")
927
- )
928
- else:
929
- pbar = None
930
-
931
- # Run the engine.
932
- outputs: list["RequestOutput"] = list()
933
- while self.llm_engine.has_unfinished_requests():
934
- step_outputs = self.llm_engine.step()
935
- for output in step_outputs:
936
- if output.finished:
937
- outputs.append(output)
938
- if pbar is not None:
939
- pbar.update(1)
940
-
941
- if pbar is not None:
942
- pbar.close()
943
-
944
- # Sort the outputs by request ID. This is necessary because some requests may be
945
- # finished earlier than its previous requests.
946
- outputs = sorted(outputs, key=lambda x: int(x.request_id))
947
-
948
- return outputs
949
-
950
-
951
- def _validate_and_add_requests_with_fixed_progress_bars(
952
- self: "LLM",
953
- prompts: "PromptType | c.Sequence[PromptType]",
954
- params: "SamplingParams | c.Sequence[SamplingParams] | PoolingParams | c.Sequence[PoolingParams]", # noqa: E501
955
- *,
956
- use_tqdm: bool,
957
- lora_request: "c.Sequence[LoRARequest] | LoRARequest | None",
958
- prompt_adapter_request: "PromptAdapterRequest | None",
959
- tokenization_kwargs: dict[str, t.Any] | None = None,
960
- guided_options: "GuidedDecodingRequest | None" = None,
961
- priority: list[int] | None = None,
962
- ) -> None:
963
- if isinstance(prompts, (str, dict)):
964
- # Convert a single prompt to a list.
965
- prompts = [prompts]
966
-
967
- num_requests = len(prompts)
968
- if isinstance(params, list) and len(params) != num_requests:
969
- raise ValueError("The lengths of prompts and params must be the same.")
970
- if isinstance(lora_request, list) and len(lora_request) != num_requests:
971
- raise ValueError("The lengths of prompts and lora_request must be the same.")
972
-
973
- for sp in params if isinstance(params, list) else (params,):
974
- if isinstance(sp, SamplingParams):
975
- self._add_guided_params(sp, guided_options)
976
-
977
- # We only care about the final output
978
- sp.output_kind = RequestOutputKind.FINAL_ONLY
979
-
980
- # Add requests to the engine.
981
- it = prompts
982
- if use_tqdm:
983
- it = tqdm(it, desc="Adding requests", leave=False)
984
-
985
- for i, prompt in enumerate(it):
986
- self._add_request(
987
- prompt,
988
- params[i] if isinstance(params, c.Sequence) else params,
989
- tokenization_kwargs=tokenization_kwargs,
990
- lora_request=lora_request[i]
991
- if isinstance(lora_request, c.Sequence)
992
- else lora_request,
993
- prompt_adapter_request=prompt_adapter_request,
994
- priority=priority[i] if priority else 0,
995
- )
996
-
997
-
998
886
  def clear_vllm() -> None:
999
887
  """Clear the GPU memory used by the vLLM model, enabling re-initialisation."""
1000
888
  with contextlib.suppress(ValueError):
@@ -1166,3 +1054,19 @@ def get_custom_stop_tokens(
1166
1054
  logger.debug(f"Found no custom stop tokens for model {model_id!r}.")
1167
1055
 
1168
1056
  return stop_tokens
1057
+
1058
+
1059
+ def get_pbar_without_leave(*tqdm_args, **tqdm_kwargs) -> tqdm:
1060
+ """Get a progress bar for vLLM which disappears after completion.
1061
+
1062
+ Args:
1063
+ *tqdm_args:
1064
+ Positional arguments to pass to tqdm.
1065
+ **tqdm_kwargs:
1066
+ Additional keyword arguments to pass to tqdm.
1067
+
1068
+ Returns:
1069
+ A tqdm progress bar.
1070
+ """
1071
+ tqdm_kwargs.pop("leave", None) # Remove the 'leave' key if it exists
1072
+ return tqdm(*tqdm_args, leave=False, **tqdm_kwargs)
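As the earlier vllm.py hunk shows, this factory is passed as the `use_tqdm` argument of the vLLM `generate` call, so the progress bar is built through it. A small stand-alone sketch of the same behaviour:

```python
# Stand-alone sketch of a tqdm factory that forces leave=False, as above.
from tqdm.auto import tqdm

def pbar_without_leave(*args, **kwargs) -> tqdm:
    kwargs.pop("leave", None)  # drop any caller-supplied 'leave' value
    return tqdm(*args, leave=False, **kwargs)

for _ in pbar_without_leave(range(3), desc="processing", leave=True):
    pass  # the bar is removed from the terminal once the loop finishes
```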
euroeval/data_loading.py CHANGED
@@ -4,11 +4,11 @@ import logging
4
4
  import sys
5
5
  import time
6
6
 
7
+ import requests
7
8
  from datasets import Dataset, DatasetDict, load_dataset
8
9
  from datasets.exceptions import DatasetsError
9
10
  from huggingface_hub.errors import HfHubHTTPError
10
11
  from numpy.random import Generator
11
- from requests import ReadTimeout
12
12
 
13
13
  from .data_models import BenchmarkConfig, DatasetConfig
14
14
  from .exceptions import HuggingFaceHubDown, InvalidBenchmark
@@ -101,7 +101,13 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> DatasetDic
101
101
  token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
102
102
  )
103
103
  break
104
- except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
104
+ except (
105
+ FileNotFoundError,
106
+ ConnectionError,
107
+ DatasetsError,
108
+ requests.ConnectionError,
109
+ requests.ReadTimeout,
110
+ ):
105
111
  logger.warning(
106
112
  f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
107
113
  )
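The hunk above only widens the set of exceptions that trigger a retry when pulling a dataset from the Hugging Face Hub. The surrounding pattern looks roughly like this (a simplified sketch with a hypothetical dataset ID):

```python
# Simplified sketch of the retry-on-transient-error pattern above.
import time
import requests
from datasets import load_dataset
from datasets.exceptions import DatasetsError

for _ in range(5):
    try:
        dataset = load_dataset("EuroEval/some-dataset")  # hypothetical ID
        break
    except (
        FileNotFoundError,
        ConnectionError,
        DatasetsError,
        requests.ConnectionError,
        requests.ReadTimeout,
    ):
        time.sleep(1)  # wait briefly, then retry
else:
    raise RuntimeError("Could not load the dataset after 5 attempts.")
```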
euroeval/finetuning.py CHANGED
@@ -200,6 +200,7 @@ def finetune_single_iteration(
200
200
  compute_metrics=model.compute_metrics,
201
201
  callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
202
202
  data_collator=model.data_collator,
203
+ preprocess_logits_for_metrics=remove_extra_tensors_from_logits,
203
204
  )
204
205
 
205
206
  if not benchmark_config.verbose:
@@ -316,3 +317,24 @@ def get_training_args(
316
317
  training_args._n_gpu = 1
317
318
 
318
319
  return training_args
320
+
321
+
322
+ def remove_extra_tensors_from_logits(
323
+ logits: torch.Tensor | tuple[torch.Tensor, ...], labels: torch.Tensor
324
+ ) -> torch.Tensor | tuple[torch.Tensor, ...]:
325
+ """If the logits are a tuple, return only the first element.
326
+
327
+ Args:
328
+ logits:
329
+ The logits to process.
330
+ labels:
331
+ The labels to use for the processing.
332
+
333
+ Returns:
334
+ The processed logits.
335
+ """
336
+ if isinstance(logits, tuple):
337
+ logits = logits[:-1]
338
+ if len(logits) == 1:
339
+ logits = logits[0]
340
+ return logits
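This helper is wired into the `Trainer` through the `preprocess_logits_for_metrics` argument added in the first finetuning.py hunk, so extra tensors returned alongside the class logits are stripped before metrics are computed. A small self-contained demonstration of its effect:

```python
# Demonstration of remove_extra_tensors_from_logits on a (logits, extra) tuple,
# as some models return; the labels argument is unused by the helper itself.
import torch

def remove_extra_tensors_from_logits(logits, labels):
    if isinstance(logits, tuple):
        logits = logits[:-1]    # drop the trailing extra tensor
        if len(logits) == 1:
            logits = logits[0]  # unwrap if a single tensor remains
    return logits

logits = (torch.randn(4, 2), torch.randn(4, 8))
print(remove_extra_tensors_from_logits(logits, labels=torch.zeros(4)).shape)  # torch.Size([4, 2])
```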
euroeval/task_group_utils/multiple_choice_classification.py CHANGED
@@ -12,6 +12,8 @@ from transformers.tokenization_utils import PreTrainedTokenizer
12
12
  from transformers.tokenization_utils_base import BatchEncoding
13
13
  from transformers.trainer import Trainer
14
14
 
15
+ from ..exceptions import InvalidBenchmark
16
+
15
17
  if t.TYPE_CHECKING:
16
18
  from ..types import Labels, Predictions
17
19
 
@@ -19,7 +21,7 @@ logger = logging.getLogger("euroeval")
19
21
 
20
22
 
21
23
  class MultipleChoiceClassificationTrainer(Trainer):
22
- """Trainer subclass for question answering tasks."""
24
+ """Trainer subclass for multiple-choice classification tasks."""
23
25
 
24
26
  def evaluate( # type: ignore[override]
25
27
  self,
@@ -57,6 +59,8 @@ class MultipleChoiceClassificationTrainer(Trainer):
57
59
  )
58
60
 
59
61
  predictions = output.predictions
62
+ if isinstance(predictions, tuple):
63
+ predictions = predictions[0]
60
64
  assert isinstance(predictions, np.ndarray)
61
65
 
62
66
  metrics = output.metrics
@@ -150,6 +154,12 @@ def postprocess_predictions_and_labels(
150
154
  Returns:
151
155
  The postprocessed predictions and labels.
152
156
  """
157
+ if predictions.ndim != 2 or predictions.shape[1] != 2:
158
+ raise InvalidBenchmark(
159
+ "Predictions must be a 2D array with shape (num_examples, 2). Found "
160
+ f"shape {predictions.shape}."
161
+ )
162
+
153
163
  mapping = {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"}
154
164
 
155
165
  all_predictions: list[str] = list()
euroeval/task_group_utils/question_answering.py CHANGED
@@ -8,11 +8,11 @@ from collections import defaultdict
8
8
  import evaluate
9
9
  import numpy as np
10
10
  from evaluate import EvaluationModule
11
- from transformers.tokenization_utils import PreTrainedTokenizer
12
11
  from transformers.tokenization_utils_base import PreTrainedTokenizerBase
13
12
  from transformers.trainer import Trainer
14
13
 
15
14
  from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
15
+ from ..exceptions import InvalidBenchmark
16
16
  from ..tokenization_utils import get_special_token_metadata
17
17
  from ..utils import raise_if_model_output_contains_nan_values
18
18
 
@@ -20,6 +20,7 @@ if t.TYPE_CHECKING:
20
20
  import torch.nn as nn
21
21
  from datasets.arrow_dataset import Dataset
22
22
  from transformers.modeling_utils import PreTrainedModel
23
+ from transformers.tokenization_utils import PreTrainedTokenizer
23
24
  from transformers.tokenization_utils_base import BatchEncoding
24
25
  from transformers.trainer_callback import TrainerCallback
25
26
  from transformers.trainer_utils import EvalPrediction
@@ -43,6 +44,7 @@ class QuestionAnsweringTrainer(Trainer):
43
44
  compute_metrics: "c.Callable[[EvalPrediction], dict[str, float]]",
44
45
  callbacks: "list[TrainerCallback]",
45
46
  data_collator: "c.Callable",
47
+ **kwargs,
46
48
  ) -> None:
47
49
  """Initialise the trainer."""
48
50
  super().__init__(
@@ -54,6 +56,7 @@ class QuestionAnsweringTrainer(Trainer):
54
56
  compute_metrics=compute_metrics,
55
57
  callbacks=callbacks,
56
58
  data_collator=data_collator,
59
+ **kwargs,
57
60
  )
58
61
 
59
62
  # Get the CLS token id for the tokenizer
@@ -475,7 +478,7 @@ def prepare_test_examples(
475
478
 
476
479
 
477
480
  def postprocess_predictions_and_labels(
478
- predictions: tuple[np.ndarray, np.ndarray],
481
+ predictions: tuple[np.ndarray, ...],
479
482
  dataset: "Dataset",
480
483
  prepared_dataset: "Dataset",
481
484
  cls_token_index: int,
@@ -484,7 +487,7 @@ def postprocess_predictions_and_labels(
484
487
 
485
488
  Args:
486
489
  predictions:
487
- A pair of (start_logits, end_logits) predictions.
490
+ A tuple whose first two elements are (start_logits, end_logits).
488
491
  dataset:
489
492
  The dataset containing the examples.
490
493
  prepared_dataset:
@@ -495,7 +498,14 @@ def postprocess_predictions_and_labels(
495
498
  Returns:
496
499
  The postprocessed predictions and labels.
497
500
  """
498
- all_start_logits, all_end_logits = predictions
501
+ if len(predictions) < 2:
502
+ raise InvalidBenchmark(
503
+ "The predictions should be a tuple with the first two elements being "
504
+ "(start_logits, end_logits), but got {len(predictions)} elements instead: "
505
+ f"{predictions}."
506
+ )
507
+
508
+ all_start_logits, all_end_logits = predictions[:2]
499
509
 
500
510
  # Build a map from an example to its corresponding features, being the blocks of
501
511
  # text from the context that we're feeding into the model. An example can have
euroeval/tokenization_utils.py CHANGED
@@ -185,6 +185,11 @@ def get_bos_token(
185
185
  )
186
186
  return None, None
187
187
 
188
+ log_once(
189
+ f"Beginning-of-sequence token was not set, but detected it as {bos_token!r} "
190
+ f"with ID {bos_token_id}.",
191
+ level=logging.DEBUG,
192
+ )
188
193
  return bos_token, bos_token_id
189
194
 
190
195
 
@@ -221,9 +226,97 @@ def get_eos_token(
221
226
  )
222
227
  return None, None
223
228
 
229
+ log_once(
230
+ f"End-of-sequence token was not set, but detected it as {eos_token!r} with "
231
+ f"ID {eos_token_id}.",
232
+ level=logging.DEBUG,
233
+ )
224
234
  return eos_token, eos_token_id
225
235
 
226
236
 
237
+ def get_pad_token(
238
+ tokenizer: "PreTrainedTokenizer",
239
+ ) -> tuple[str, int] | tuple[None, None]:
240
+ """Get the padding token from a tokenizer.
241
+
242
+ Args:
243
+ tokenizer:
244
+ The tokenizer.
245
+
246
+ Returns:
247
+ A pair (token, token_id) representing the padding token and its token ID, or
248
+ (None, None) if no padding token is found.
249
+ """
250
+ # If the tokenizer already has a padding token, return it
251
+ if tokenizer.pad_token is not None and tokenizer.pad_token_id is not None:
252
+ assert isinstance(tokenizer.pad_token, str), (
253
+ "Expected tokenizer.pad_token to be a string, but got "
254
+ f"{type(tokenizer.pad_token)}."
255
+ )
256
+ assert isinstance(tokenizer.pad_token_id, int), (
257
+ "Expected tokenizer.pad_token_id to be an integer, but got "
258
+ f"{type(tokenizer.pad_token_id)}."
259
+ )
260
+ return (tokenizer.pad_token, tokenizer.pad_token_id)
261
+
262
+ # If the tokenizer has a BOS token, use it as the padding token
263
+ if tokenizer.bos_token is not None and tokenizer.bos_token_id is not None:
264
+ assert isinstance(tokenizer.bos_token, str), (
265
+ "Expected tokenizer.bos_token to be a string, but got "
266
+ f"{type(tokenizer.bos_token)}."
267
+ )
268
+ assert isinstance(tokenizer.bos_token_id, int), (
269
+ "Expected tokenizer.bos_token_id to be an integer, but got "
270
+ f"{type(tokenizer.bos_token_id)}."
271
+ )
272
+ pad_token = tokenizer.bos_token
273
+ pad_token_id = tokenizer.bos_token_id
274
+
275
+ # If the tokenizer has an EOS token, use it as the padding token
276
+ elif tokenizer.eos_token is not None and tokenizer.eos_token_id is not None:
277
+ assert isinstance(tokenizer.eos_token, str), (
278
+ "Expected tokenizer.eos_token to be a string, but got "
279
+ f"{type(tokenizer.eos_token)}."
280
+ )
281
+ assert isinstance(tokenizer.eos_token_id, int), (
282
+ "Expected tokenizer.eos_token_id to be an integer, but got "
283
+ f"{type(tokenizer.eos_token_id)}."
284
+ )
285
+ pad_token = tokenizer.eos_token
286
+ pad_token_id = tokenizer.eos_token_id
287
+
288
+ # Otherwise, try to find a candidate padding token in the vocabulary
289
+ else:
290
+ pad_token_candidates = [
291
+ "<pad>",
292
+ "[pad]",
293
+ "<|endoftext|>",
294
+ "<|end▁of▁sentence|>",
295
+ "<|im_end|>",
296
+ ]
297
+ pad_token_candidates.extend([c.upper() for c in pad_token_candidates])
298
+ for candidate in pad_token_candidates:
299
+ if candidate in tokenizer.get_vocab():
300
+ pad_token = candidate
301
+ pad_token_id = tokenizer.get_vocab()[candidate]
302
+ break
303
+ else:
304
+ log_once(
305
+ "Could not identify a padding token for the model. Please ensure that "
306
+ "this has been set in the tokenizer's configuration. Using no padding "
307
+ "token. This may lead to unexpected behavior in the model.",
308
+ level=logging.INFO,
309
+ )
310
+ return None, None
311
+
312
+ log_once(
313
+ f"Padding token was not set, but detected it as {pad_token!r} with ID "
314
+ f"{pad_token_id}.",
315
+ level=logging.DEBUG,
316
+ )
317
+ return pad_token, pad_token_id
318
+
319
+
227
320
  def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | None:
228
321
  """Get the end token ID for chat models.
229
322
 
@@ -300,14 +393,14 @@ def get_first_label_token_mapping(
300
393
  if tokenizer is None:
301
394
  if output_scores:
302
395
  log_once(
303
- f"The model {model_config.model_id!r} will output scores, since the "
304
- "dataset supports it and no tokenizer is available.",
396
+ f"We will use logprobs with the model {model_config.model_id!r} "
397
+ "since the dataset supports it and no tokenizer is available.",
305
398
  level=logging.DEBUG,
306
399
  )
307
400
  else:
308
401
  log_once(
309
- f"The model {model_config.model_id!r} will not output scores, since "
310
- "the dataset does not support it and no tokenizer is available.",
402
+ f"We will not use logprobs with the model {model_config.model_id!r} "
403
+ "since the dataset does not support it and no tokenizer is available.",
311
404
  level=logging.DEBUG,
312
405
  )
313
406
  return output_scores
@@ -368,7 +461,7 @@ def get_first_label_token_mapping(
368
461
  if not matching_tokens:
369
462
  log_once(
370
463
  f"No matching token found in token_list for label '{label}', so "
371
- "we will not output scores.",
464
+ "we will not use logprobs with the model.",
372
465
  level=logging.DEBUG,
373
466
  )
374
467
  return False
@@ -378,8 +471,8 @@ def get_first_label_token_mapping(
378
471
  # tokens are distinct
379
472
  if len(first_tokens) == len(set(first_tokens)):
380
473
  log_once(
381
- "The model will output scores, since the first tokens of the labels "
382
- "are distinct.",
474
+ "We will use logprobs with the model since the first tokens of the "
475
+ "labels are distinct.",
383
476
  level=logging.DEBUG,
384
477
  )
385
478
  return {
@@ -388,7 +481,7 @@ def get_first_label_token_mapping(
388
481
  }
389
482
  else:
390
483
  log_once(
391
- "The model will not output scores, since the first tokens of the "
484
+ "We will not use logprobs with the model since the first tokens of the "
392
485
  "labels are not distinct. The first tokens for the labels "
393
486
  f"{local_labels} are {first_tokens}"
394
487
  )
@@ -398,7 +491,8 @@ def get_first_label_token_mapping(
398
491
  # evaluation errors. This will force the label extraction to rely on word edit
399
492
  # distance instead of logprobs.
400
493
  log_once(
401
- "The model will not output scores, since the dataset does not have labels.",
494
+ "We will not use logprobs with the model, since the dataset does not have "
495
+ "labels.",
402
496
  level=logging.DEBUG,
403
497
  )
404
498
  return False
euroeval/utils.py CHANGED
@@ -121,6 +121,8 @@ def block_terminal_output() -> None:
121
121
  logging.getLogger("matplotlib.font_manager").setLevel(logging.CRITICAL)
122
122
  logging.getLogger("accelerate").setLevel(logging.CRITICAL)
123
123
  logging.getLogger("LiteLLM").setLevel(logging.CRITICAL)
124
+ logging.getLogger("LiteLLM Router").setLevel(logging.CRITICAL)
125
+ logging.getLogger("LiteLLM Proxy").setLevel(logging.CRITICAL)
124
126
  logging.getLogger("huggingface_hub").setLevel(logging.CRITICAL)
125
127
 
126
128
  # This suppresses vLLM logging
@@ -352,19 +354,22 @@ def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
352
354
  asyncio.set_event_loop(None)
353
355
 
354
356
 
355
- async def catch_coroutine_exception(
356
- coroutine: t.Coroutine[t.Any, t.Any, T],
357
+ async def add_semaphore_and_catch_exception(
358
+ coroutine: t.Coroutine[t.Any, t.Any, T], semaphore: asyncio.Semaphore
357
359
  ) -> T | Exception:
358
- """Run a coroutine, catching any exceptions and returning them.
360
+ """Run a coroutine with a semaphore.
359
361
 
360
362
  Args:
361
363
  coroutine:
362
364
  The coroutine to run.
365
+ semaphore:
366
+ The semaphore to use.
363
367
 
364
368
  Returns:
365
- The result of the coroutine, or the exception if it was raised.
369
+ The result of the coroutine.
366
370
  """
367
- try:
368
- return await coroutine
369
- except Exception as exc:
370
- return exc
371
+ async with semaphore:
372
+ try:
373
+ return await coroutine
374
+ except Exception as exc:
375
+ return exc
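In effect, each request coroutine is wrapped so that it waits for a semaphore slot before running and returns any exception instead of raising it, which lets `asyncio.gather` collect successes and failures together. A self-contained sketch of the same pattern:

```python
# Self-contained sketch of the semaphore-plus-exception-catching wrapper above.
import asyncio

async def add_semaphore_and_catch_exception(coroutine, semaphore: asyncio.Semaphore):
    async with semaphore:
        try:
            return await coroutine
        except Exception as exc:
            return exc

async def main() -> None:
    semaphore = asyncio.Semaphore(2)  # at most two coroutines run concurrently

    async def job(i: int) -> int:
        await asyncio.sleep(0.1)
        if i == 3:
            raise ValueError("boom")  # returned, not raised
        return i

    results = await asyncio.gather(
        *(add_semaphore_and_catch_exception(job(i), semaphore) for i in range(5))
    )
    print(results)  # [0, 1, 2, ValueError('boom'), 4]

asyncio.run(main())
```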
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: EuroEval
3
- Version: 15.9.2
3
+ Version: 15.10.0
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
7
7
  Author-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>
8
- Maintainer-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>, Kenneth Enevoldsen <kenneth.enevoldsen@cas.au.dk>
8
+ Maintainer-email: Dan Saattrup Nielsen <dan.nielsen@alexandra.dk>
9
9
  License: MIT License
10
10
 
11
11
  Copyright (c) 2022-2024 Dan Saattrup Nielsen
@@ -37,13 +37,12 @@ Requires-Dist: demjson3>=3.0.6
37
37
  Requires-Dist: evaluate>=0.4.1
38
38
  Requires-Dist: huggingface-hub>=0.30.1
39
39
  Requires-Dist: levenshtein>=0.24.0
40
- Requires-Dist: litellm>=1.63.0
40
+ Requires-Dist: litellm>=1.72.2
41
41
  Requires-Dist: more-itertools>=10.5.0
42
42
  Requires-Dist: numpy<2.0.0,>=1.23.0
43
- Requires-Dist: ollama>=0.4.7
43
+ Requires-Dist: ollama>=0.5.1
44
44
  Requires-Dist: pandas>=2.2.0
45
45
  Requires-Dist: peft>=0.15.0
46
- Requires-Dist: protobuf~=3.20.0
47
46
  Requires-Dist: pydantic>=2.6.0
48
47
  Requires-Dist: pyinfer>=0.0.3
49
48
  Requires-Dist: python-dotenv>=1.0.1
@@ -62,12 +61,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
62
61
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
63
62
  Requires-Dist: gradio>=4.26.0; extra == 'all'
64
63
  Requires-Dist: outlines>=0.1.11; extra == 'all'
65
- Requires-Dist: vllm>=0.9.0; (platform_system == 'Linux') and extra == 'all'
64
+ Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'all'
66
65
  Provides-Extra: generative
67
66
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
68
67
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
69
68
  Requires-Dist: outlines>=0.1.11; extra == 'generative'
70
- Requires-Dist: vllm>=0.9.0; (platform_system == 'Linux') and extra == 'generative'
69
+ Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'generative'
71
70
  Provides-Extra: human-evaluation
72
71
  Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
73
72
  Provides-Extra: test
@@ -93,7 +92,7 @@ ______________________________________________________________________
93
92
  [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
94
93
 
95
94
 
96
- ## Maintainers
95
+ ## Maintainer
97
96
 
98
97
  - Dan Saattrup Nielsen ([@saattrupdan](https://github.com/saattrupdan),
99
98
  dan.nielsen@alexandra.dk)
@@ -4,11 +4,11 @@ euroeval/benchmarker.py,sha256=wmgrYVS31PMhhrVienjaVHHyfnZAy51kUvC6OjooiOw,48047
4
4
  euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
5
5
  euroeval/cli.py,sha256=d8JztMi_RbpUlEBXidd6DQ-xeC-xhozf_qU6Vkzye20,8161
6
6
  euroeval/constants.py,sha256=0KHrH74zGM8vNF4uZG_a5qFJRZH5YgyQULYZtCKlo68,2452
7
- euroeval/data_loading.py,sha256=L_REtxefte5Ke4xE_Cz01zkfCyKlOYhSqT5ZXXulHPc,3992
7
+ euroeval/data_loading.py,sha256=2rMLSy8pbntlwmImizMtkTiUzj93mcv5kzYjZELWWfU,4081
8
8
  euroeval/data_models.py,sha256=7nAGDpN58Y35Lt9JZE_y0y5iOYesw2htcwHc68MkBZU,22953
9
9
  euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
10
10
  euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
11
- euroeval/finetuning.py,sha256=uuaUxNQJb7TivPQuI1OYQ_MIKbD-6-7mpkobLKsDefQ,10667
11
+ euroeval/finetuning.py,sha256=cx5SVgEsveMDNfoMxwLfAFsjZeKmYyHftaOZWZ-L9hA,11285
12
12
  euroeval/generation.py,sha256=LSsskfLjIJ-c3gQxmr7eiAobPOm-5bU9vnR7uHQ7XmU,10745
13
13
  euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
14
14
  euroeval/human_evaluation.py,sha256=zqbbJkqm2Uymf-88PxM3R9vVRR8SZJlq3QrqWEoiVeE,27643
@@ -19,15 +19,15 @@ euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,223
19
19
  euroeval/scores.py,sha256=TovjCZD8wmGrIjA4v5oAQp18P5KVcHvakkByDh0Hstk,3059
20
20
  euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
21
21
  euroeval/tasks.py,sha256=87gbe__K5KNIb1aBSuwGnMPmZgamJFecNNYmNgMxaVo,7069
22
- euroeval/tokenization_utils.py,sha256=_B4KN3ZcuvVr8y3LedtfxBJfmPKjfVMjpbtl8bbQAuc,14278
22
+ euroeval/tokenization_utils.py,sha256=LxgGs7juS5PuMYt5LL2X6eVXdtnpi-A2jFxqcWpF6NA,17931
23
23
  euroeval/types.py,sha256=E0JhLfg-ek5pdFcYJbnGRUSodHxkuR3o8XGuIrBcuRM,2485
24
- euroeval/utils.py,sha256=e83OnWc0GJn0Tn_vP3tbqh1DAbLy2ky-LnIlTEOKzKU,11410
24
+ euroeval/utils.py,sha256=5R7y67xe0ODaje7k8nOu2AFS3Ph2gcsiWpIq5rjSSuA,11613
25
25
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
26
26
  euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
27
27
  euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
28
- euroeval/benchmark_modules/hf.py,sha256=CoiaNakjhg6gm_5IbUUeevXQZebg2VrRLuhzEi2Hhrk,44617
29
- euroeval/benchmark_modules/litellm.py,sha256=SxSr_0C6b_jVavR3y9QyhfkCOP5-va4zijGfghFTArY,48362
30
- euroeval/benchmark_modules/vllm.py,sha256=SbQ_EYSwUFBVLsp9io1Q75A9S_H-iw6AzLOn3rlEhK0,43034
28
+ euroeval/benchmark_modules/hf.py,sha256=Nbtn5eZ4axbmL09M8dGZCBr07pn9-btbqGgQ6q7KbHg,44620
29
+ euroeval/benchmark_modules/litellm.py,sha256=LS4mBXXG6h4uJwySPc6SI6f0y_HuiKE7IprprqWpoCI,50601
30
+ euroeval/benchmark_modules/vllm.py,sha256=sgeltOVfZA9bu0AmXV7PtZvuRst0I8s6VOIp0CI6DO8,38880
31
31
  euroeval/dataset_configs/__init__.py,sha256=kWKtlSAOY-olOQL3UtFqL6I3Tki3G3waMZSd2YChjCg,1895
32
32
  euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
33
33
  euroeval/dataset_configs/dutch.py,sha256=r21nxEvMmBkKqPXVW082batPsxJ9d0RB4DzngOTMJSk,3185
@@ -49,13 +49,13 @@ euroeval/prompt_templates/reading_comprehension.py,sha256=yLqryWQAW04GULz_EyNDLO
49
49
  euroeval/prompt_templates/sentiment_classification.py,sha256=LDOwjGQ2kqhwgNyphPywQeolwNB09o-xYWc9RUbzc84,7136
50
50
  euroeval/prompt_templates/summarization.py,sha256=mcWeKNhGWmp7IG_iY64T-VOSabQg5wKddjSbJNYFDp8,4984
51
51
  euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
52
- euroeval/task_group_utils/multiple_choice_classification.py,sha256=nB78TzOgd0HBvTclmjOYJid9ZVAgu8IHZsqB_n1SAZU,6178
53
- euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iYvfoGt0EUObSaXRCGmk,27700
52
+ euroeval/task_group_utils/multiple_choice_classification.py,sha256=LQ6zD1UGi-jGCKI2xUJiQdAXoqb5QMpIJu41B2U0HPw,6543
53
+ euroeval/task_group_utils/question_answering.py,sha256=D4oJL2vQEjHghyxiiiq_vj1IQC6eryqNoLXuTiQEPmw,28071
54
54
  euroeval/task_group_utils/sequence_classification.py,sha256=zwRUgVHqLlREILwyg-yuDPkrIQOfqGVPsFBai-2D9a8,13525
55
55
  euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
56
56
  euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
57
- euroeval-15.9.2.dist-info/METADATA,sha256=LwHTlJ51OGVwcRTUPulH-gh8IFxu82CUFYHZ1uOUyT0,13555
58
- euroeval-15.9.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
59
- euroeval-15.9.2.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
60
- euroeval-15.9.2.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
61
- euroeval-15.9.2.dist-info/RECORD,,
57
+ euroeval-15.10.0.dist-info/METADATA,sha256=WUXtSfS6qvrlA25lazql3DvyS5chyMnBPKyu-l65A_I,13472
58
+ euroeval-15.10.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
59
+ euroeval-15.10.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
60
+ euroeval-15.10.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
61
+ euroeval-15.10.0.dist-info/RECORD,,