EuroEval 15.4.2__py3-none-any.whl → 15.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (54)
  1. euroeval/__init__.py +2 -2
  2. euroeval/benchmark_modules/base.py +3 -2
  3. euroeval/benchmark_modules/fresh.py +8 -6
  4. euroeval/benchmark_modules/hf.py +44 -33
  5. euroeval/benchmark_modules/litellm.py +314 -120
  6. euroeval/benchmark_modules/vllm.py +99 -59
  7. euroeval/benchmarker.py +52 -21
  8. euroeval/callbacks.py +2 -2
  9. euroeval/constants.py +9 -2
  10. euroeval/data_models.py +258 -44
  11. euroeval/dataset_configs/__init__.py +61 -0
  12. euroeval/dataset_configs/danish.py +120 -0
  13. euroeval/dataset_configs/dutch.py +123 -0
  14. euroeval/dataset_configs/english.py +88 -0
  15. euroeval/dataset_configs/faroese.py +53 -0
  16. euroeval/dataset_configs/french.py +83 -0
  17. euroeval/dataset_configs/german.py +91 -0
  18. euroeval/dataset_configs/icelandic.py +148 -0
  19. euroeval/dataset_configs/italian.py +81 -0
  20. euroeval/dataset_configs/norwegian.py +178 -0
  21. euroeval/dataset_configs/spanish.py +78 -0
  22. euroeval/dataset_configs/swedish.py +100 -0
  23. euroeval/exceptions.py +10 -10
  24. euroeval/finetuning.py +6 -10
  25. euroeval/generation.py +1 -0
  26. euroeval/human_evaluation.py +2 -2
  27. euroeval/languages.py +20 -13
  28. euroeval/model_cache.py +1 -1
  29. euroeval/model_loading.py +1 -12
  30. euroeval/prompt_templates/__init__.py +8 -0
  31. euroeval/prompt_templates/linguistic_acceptability.py +112 -0
  32. euroeval/prompt_templates/multiple_choice.py +97 -0
  33. euroeval/prompt_templates/named_entity_recognition.py +257 -0
  34. euroeval/prompt_templates/reading_comprehension.py +118 -0
  35. euroeval/prompt_templates/sentiment_classification.py +137 -0
  36. euroeval/prompt_templates/summarization.py +97 -0
  37. euroeval/speed_benchmark.py +1 -1
  38. euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
  39. euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
  40. euroeval/{task_utils → task_group_utils}/sequence_classification.py +45 -10
  41. euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
  42. euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
  43. euroeval/tasks.py +54 -0
  44. euroeval/tokenization_utils.py +343 -0
  45. euroeval/types.py +3 -1
  46. euroeval/utils.py +5 -254
  47. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/METADATA +31 -9
  48. euroeval-15.6.0.dist-info/RECORD +59 -0
  49. euroeval/dataset_configs.py +0 -2408
  50. euroeval-15.4.2.dist-info/RECORD +0 -40
  51. /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
  52. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
  53. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
  54. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/litellm.py

@@ -27,20 +27,17 @@ from litellm.exceptions import (
     BadRequestError,
     InternalServerError,
     NotFoundError,
+    RateLimitError,
     ServiceUnavailableError,
     Timeout,
 )
-from litellm.types.utils import ModelResponse
+from litellm.llms.vertex_ai.common_utils import VertexAIError
+from litellm.types.utils import ChoiceLogprobs, ModelResponse
 from requests.exceptions import RequestException
 from tqdm.auto import tqdm
-from transformers import Trainer
+from transformers.trainer import Trainer

-from ..constants import (
-    MAX_LOGPROBS,
-    REASONING_MAX_TOKENS,
-    TASK_GROUPS_USING_LOGPROBS,
-    TASKS_USING_JSON,
-)
+from ..constants import MAX_LOGPROBS, REASONING_MAX_TOKENS, TASKS_USING_JSON
 from ..data_models import (
     BenchmarkConfig,
     DatasetConfig,
@@ -62,12 +59,13 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
-from ..task_utils import (
+from ..task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,
     token_classification,
 )
+from ..tokenization_utils import get_first_label_token_mapping
 from ..types import ExtractLabelsFunction
 from ..utils import create_model_cache_dir, log_once
 from .base import BenchmarkModule
@@ -78,64 +76,80 @@ logger = logging.getLogger("euroeval")

 VOCAB_SIZE_MAPPING = {
     # OpenAI models
-    "(text-)?(ada|babbage|curie|davinci)(-001)?": 50_257,
-    "(code|text)-davinci-00[2-9]": 50_281,
-    "gpt-3.5-turbo(-16k)?(-[0-9]{4})?": 100_256,
-    "gpt-4-(32k)?(-[0-9]{4})?": 100_256,
-    "gpt-4-[0-9]{4}-preview": 100_256,
-    "gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
-    "gpt-4-(vision|turbo)(-preview)?": 100_256,
-    "gpt-3.5-turbo-instruct(-[0-9]{4})?": 100_256,
-    "gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_019,
-    "o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
+    r"gpt-4-(32k)?(-[0-9]{4})?": 100_256,
+    r"gpt-4-[0-9]{4}-preview": 100_256,
+    r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
+    r"gpt-4-(vision|turbo)(-preview)?": 100_256,
+    r"gpt-3.5-turbo-instruct(-[0-9]{4})?": 100_256,
+    r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_019,
+    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
     # Anthropic models
-    "claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
+    r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
+    # Gemini models
+    r"(gemini/)?gemini-[1-9]\.[0-9]-(flash|pro).*": 256_128,
+    # xAI models
+    r"(xai/)?grok.*": -1,
 }


 MODEL_MAX_LENGTH_MAPPING = {
     # OpenAI models
-    "(text-)?(ada|babbage|curie|davinci)(-001)?": 2_050,
-    "text-davinci-00[2-9]": 4_098,
-    "code-davinci-00[1-9]": 8_002,
-    "gpt-3.5-turbo-0613": 4_096,
-    "gpt-3.5-turbo(-[0-9]{4})?": 16_385,
-    "gpt-3.5-turbo-16k(-[0-9]{4})?": 16_384,
-    "gpt-4(-[0-9]{4})?": 8_191,
-    "gpt-4-32k(-[0-9]{4})?": 32_767,
-    "gpt-4-[0-9]{4}-preview": 128_000,
-    "gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
-    "gpt-4-(vision|turbo)(-preview)?": 128_000,
-    "gpt-3.5-turbo-instruct(-[0-9]{4})?": 4_095,
-    "gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
-    "o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
-    "o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
-    "o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
+    r"gpt-4(-[0-9]{4})?": 8_191,
+    r"gpt-4-32k(-[0-9]{4})?": 32_767,
+    r"gpt-4-[0-9]{4}-preview": 128_000,
+    r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
+    r"gpt-4-(vision|turbo)(-preview)?": 128_000,
+    r"gpt-3.5-turbo-instruct(-[0-9]{4})?": 4_095,
+    r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
+    r"o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
+    r"o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
+    r"o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
     # Anthropic models
-    "claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
+    r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
+    # Gemini models
+    r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
+    r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
+    r"(gemini/)?gemini-2\.(0|5).*": 1_048_576,
+    # xAI models
+    r"(xai/)?grok.*": 131_072,
 }


 NUM_PARAMS_MAPPING = {
     # OpenAI models
-    "(text-)?ada(-001)?": 350_000_000,
-    "(text-)?babbage(-001)?": 3_000_000_000,
-    "(text-)?curie(-001)?": 13_000_000_000,
-    "((text|code)-)?davinci(-00[1-9])?": 175_000_000_000,
-    "gpt-(3.5|4)-turbo-((16|32)k)?(-[0-9]{4})?": -1,
-    "gpt-4-[0-9]{4}-preview": -1,
-    "gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
-    "gpt-4-(vision|turbo)(-preview)?": -1,
-    "gpt-3.5-turbo-instruct(-[0-9]{4})?": -1,
-    "gpt-4o(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
-    "gpt-4o-mini(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
-    "o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
+    r"gpt-4.*": -1,
+    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
+    # Anthropic models
+    r"(anthropic/)?claude-*": -1,
+    # Gemini models
+    r"(gemini/)?gemini-1.5-flash-8b": 8_000_000_000,
+    r"(gemini/)?gemini-1.5-flash-[0-9]+": -1,
+    r"(gemini/)?gemini-2.(0|5).*": -1,
+    # xAI models
+    r"(xai/)?grok.*": -1,
+}
+
+
+ALLOWED_PARAMS = {
+    # OpenAI models
+    r"gpt-4.*": [],
+    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "high"],
     # Anthropic models
-    "claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
+    r"(anthropic/)?claude-3-.*": [],
+    r"(anthropic/)?claude-3.5-.*": [],
+    r"(anthropic/)?claude-3.7-sonnet.*": ["thinking"],
+    # Gemini models
+    r"(gemini/)?gemini-.*": [],
+    # xAI models
+    r"(xai/)?grok.*": [],
 }


-REASONING_MODELS = ["o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?"]
+REASONING_MODELS = [
+    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?",
+    r"(gemini/)?gemini.*thinking.*",
+    r"(gemini/)?gemini-2.5-pro.*",
+]


 class LiteLLMModel(BenchmarkModule):
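
The lookup tables above are keyed by regular expressions rather than literal model IDs, so resolving a model's vocabulary size, context length or parameter count means scanning the keys for a full match. A minimal sketch of that kind of lookup, using only the re module (the helper name is illustrative and not part of the diff):

import re

# Subset of MODEL_MAX_LENGTH_MAPPING, copied from the diff above.
MODEL_MAX_LENGTH_MAPPING = {
    r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
    r"(gemini/)?gemini-2\.(0|5).*": 1_048_576,
    r"(xai/)?grok.*": 131_072,
}


def lookup(model_id: str, mapping: dict[str, int]) -> int | None:
    """Return the value whose regex key fully matches the model ID, if any."""
    for pattern, value in mapping.items():
        if re.fullmatch(pattern=pattern, string=model_id):
            return value
    return None


print(lookup("gemini/gemini-2.5-flash", MODEL_MAX_LENGTH_MAPPING))  # 1048576
print(lookup("some-unknown-model", MODEL_MAX_LENGTH_MAPPING))       # None
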
@@ -167,12 +181,18 @@ class LiteLLMModel(BenchmarkModule):
             "ollama/"
         ) or model_config.model_id.startswith("ollama_chat/")

+        raise_if_wrong_params(model_config=model_config, allowed_params=ALLOWED_PARAMS)
+
         super().__init__(
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
         )

+        self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
+            dataset_config=self.dataset_config, tokenizer=None
+        )
+
     @property
     def generative_type(self) -> GenerativeType | None:
         """Get the generative type of the model.
@@ -180,7 +200,9 @@ class LiteLLMModel(BenchmarkModule):
         Returns:
             The generative type of the model, or None if it has not been set yet.
         """
-        if re.fullmatch(
+        if self.model_config.revision == "thinking":
+            return GenerativeType.REASONING
+        elif re.fullmatch(
             pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
         ):
             return GenerativeType.REASONING
@@ -218,7 +240,13 @@ class LiteLLMModel(BenchmarkModule):
             api_version=self.benchmark_config.api_version,
         )

-        if self.dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS:
+        # Get the mapping from labels to the first token in the label. We call this each
+        # time we generate a new dataset since the dataset config can change
+        self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
+            dataset_config=self.dataset_config, tokenizer=None
+        )
+
+        if self.buffer["first_label_token_mapping"]:
             generation_kwargs["logprobs"] = True
             generation_kwargs["top_logprobs"] = MAX_LOGPROBS

@@ -227,6 +255,27 @@ class LiteLLMModel(BenchmarkModule):
                 "Prompt must contain 'json' for JSON tasks."
             )
             generation_kwargs["response_format"] = dict(type="json_object")
+            log_once(
+                "Enabling JSON response format for model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+
+        if self.model_config.revision == "thinking":
+            generation_kwargs["thinking"] = dict(
+                type="enabled", budget_tokens=REASONING_MAX_TOKENS
+            )
+            log_once(
+                f"Enabling thinking mode for model {self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+        elif self.model_config.revision in {"low", "high"}:
+            generation_kwargs["reasoning_effort"] = self.model_config.revision
+            log_once(
+                f"Enabling reasoning effort {self.model_config.revision!r} for model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )

         # This drops generation kwargs that are not supported by the model
         litellm.drop_params = True
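
The new branches above only add keys to the kwargs that are later passed to litellm.completion. Roughly, the extra keys look like this for a model ID given with an '@' parameter (a sketch; the REASONING_MAX_TOKENS value below is a stand-in, the real constant lives in euroeval.constants):

# Stand-in value for the sketch; the real constant is defined in euroeval.constants.
REASONING_MAX_TOKENS = 8_192

# Benchmarking "<model-id>@thinking" sets revision == "thinking" and adds:
thinking_extras = dict(
    thinking=dict(type="enabled", budget_tokens=REASONING_MAX_TOKENS)
)

# Benchmarking "<model-id>@low" or "<model-id>@high" instead adds:
reasoning_effort_extras = dict(reasoning_effort="high")
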
@@ -235,39 +284,60 @@ class LiteLLMModel(BenchmarkModule):
         # handle using newlines as stop sequences, so we try both.
         num_attempts = 10
         for _ in range(num_attempts):
+            stop_messages = ["stop_sequences"]
+            logprobs_messages = [
+                "you are not allowed to request logprobs",
+                "you've reached the maximum number of requests with logprobs",
+                "logprobs is not supported",
+                "logprobs is not enabled",
+            ]
+            temperature_messages = [
+                "'temperature' is not supported with this model.",
+                "temperature is not supported with this model",
+            ]
             try:
                 model_response = litellm.completion(
                     messages=messages, max_retries=3, **generation_kwargs
                 )
                 break
-            except BadRequestError as e:
-                if "stop_sequences" in str(e).lower():
+            except (BadRequestError, RateLimitError) as e:
+                if any(msg.lower() in str(e).lower() for msg in stop_messages):
                     generation_kwargs["stop"] = None
-                elif "you are not allowed to request logprobs" in str(e).lower():
-                    generation_kwargs.pop("logprobs")
-                    generation_kwargs.pop("top_logprobs")
                 elif (
-                    "'temperature' is not supported with this model." in str(e).lower()
+                    any(msg.lower() in str(e).lower() for msg in logprobs_messages)
+                    # Special case for Vertex AI models, since they have strict rate
+                    # limits on using logprobs. They also have a cap of 5 logprobs, but
+                    # we ignore this since the rate limiting makes it unusable anyway.
+                    or (isinstance(e, VertexAIError) and "logprobs" in str(e).lower())
                 ):
+                    generation_kwargs.pop("logprobs")
+                    generation_kwargs.pop("top_logprobs")
+                elif any(msg.lower() in str(e).lower() for msg in temperature_messages):
                     generation_kwargs.pop("temperature")
+                elif isinstance(e, RateLimitError):
+                    raise InvalidModel(
+                        "You have encountered your rate limit for model "
+                        f"{self.model_config.model_id!r}. Skipping."
+                    )
                 else:
                     raise InvalidBenchmark(
                         f"Failed to generate text. The error message was: {e}"
                     )
+            except APIError as e:
+                raise InvalidBenchmark(
+                    f"Failed to generate text. The error message was: {e}"
+                )
             except (
+                APIConnectionError,
                 Timeout,
                 ServiceUnavailableError,
-                APIConnectionError,
                 InternalServerError,
-            ):
+            ) as e:
                 logger.debug(
-                    "Service temporarily unavailable. Retrying in 5 seconds..."
+                    f"Service temporarily unavailable. The error message was: {e}. "
+                    f"Retrying in 5 seconds..."
                 )
                 sleep(5)
-            except APIError as e:
-                raise InvalidBenchmark(
-                    f"Failed to generate text. The error message was: {e}"
-                )
             except AuthenticationError:
                 raise NeedsAdditionalArgument(
                     cli_argument="--api-key",
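
The loop above follows a retry-and-degrade pattern: a BadRequestError (or now a RateLimitError) whose message points at a specific kwarg causes that kwarg to be dropped before retrying, transient connection errors just sleep and retry, and anything else aborts the benchmark. A stripped-down sketch of the same pattern, with stand-in exception types in place of the litellm ones:

import time


def call_with_degradation(call_api, kwargs: dict, num_attempts: int = 10):
    """Retry an API call, dropping kwargs that the provider rejects."""
    for _ in range(num_attempts):
        try:
            return call_api(**kwargs)
        except ValueError as e:  # stand-in for BadRequestError / RateLimitError
            message = str(e).lower()
            if "stop" in message:
                kwargs["stop"] = None
            elif "logprobs" in message:
                kwargs.pop("logprobs", None)
                kwargs.pop("top_logprobs", None)
            elif "temperature" in message:
                kwargs.pop("temperature", None)
            else:
                raise
        except ConnectionError:  # stand-in for the transient litellm errors
            time.sleep(5)
    raise RuntimeError("Failed to generate text after repeated attempts.")
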
@@ -280,6 +350,15 @@ class LiteLLMModel(BenchmarkModule):
                 )

         assert isinstance(model_response, ModelResponse)
+        if not model_response.choices:
+            # This happens for reasoning models, when they don't finish thinking and run
+            # out of tokens. Happens quite rarely, but we need to handle it.
+            logger.warning(
+                f"The model {self.model_config.model_id!r} did not end up generating "
+                "any text. This is likely because the model ran out of tokens while "
+                "reasoning. Returning an empty string."
+            )
+            return GenerativeModelOutput(sequences=[""])
         model_response_choices = model_response.choices[0]
         assert isinstance(model_response_choices, litellm.Choices)
         generation_output = model_response_choices.message["content"] or ""
@@ -288,14 +367,22 @@ class LiteLLMModel(BenchmarkModule):
         # Structure the model output as a GenerativeModelOutput object
         model_output = GenerativeModelOutput(sequences=[generation_output])
         if hasattr(model_response_choices, "logprobs"):
-            logprobs_list: list[list[tuple[str, float]]] = [
-                [
-                    (top_logprob.token, top_logprob.logprob)
-                    for top_logprob in content.top_logprobs
+            logprobs_obj = model_response_choices.logprobs
+            if isinstance(logprobs_obj, ChoiceLogprobs):
+                logprobs_list: list[list[tuple[str, float]]] = [
+                    [
+                        (top_logprob.token, top_logprob.logprob)
+                        for top_logprob in content.top_logprobs
+                    ]
+                    for content in model_response_choices.logprobs.content or list()
                 ]
-                for content in model_response_choices.logprobs.content or list()
-            ]
-            model_output.scores = [logprobs_list]
+                model_output.scores = [logprobs_list]
+            else:
+                log_once(
+                    "The logprobs object is malformed, so we won't use logprobs to "
+                    "determine the labels.",
+                    level=logging.WARNING,
+                )

         return model_output
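
The comprehension above flattens litellm's ChoiceLogprobs payload into one list of (token, logprob) candidates per generated position, which is what the label-extraction code consumes. A made-up example of the resulting structure:

# Made-up values illustrating the shape produced by the comprehension above.
logprobs_list: list[list[tuple[str, float]]] = [
    [("pos", -0.02), ("neg", -3.91), ("neu", -5.10)],  # candidates for token 1
    [("itive", -0.01), ("ative", -4.73)],              # candidates for token 2
]

# One entry per generated sequence in the batch:
# model_output.scores = [logprobs_list]
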

@@ -314,7 +401,7 @@ class LiteLLMModel(BenchmarkModule):
         # If it is an Ollama model then we can get the number of parameters from the
         # Ollama Python SDK
         if self.is_ollama:
-            ollama_model_id = self.model_config.model_id.split("/")[-1]
+            ollama_model_id = "/".join(self.model_config.model_id.split("/")[1:])
             model_info = ollama.show(ollama_model_id).modelinfo
             if model_info is not None:
                 num_params = model_info.get("general.parameter_count")
@@ -325,7 +412,7 @@ class LiteLLMModel(BenchmarkModule):
         # get the number of parameters from the Hugging Face model configuration from
         # the Hugging Face Hub
         if self.model_config.model_id.startswith("huggingface/"):
-            model_id = self.model_config.model_id.split(sep="/", maxsplit=1)[-1]
+            model_id = "/".join(self.model_config.model_id.split(sep="/")[-2:])
             if HuggingFaceEncoderModel.model_exists(
                 model_id=model_id, benchmark_config=self.benchmark_config
             ):
@@ -334,7 +421,7 @@ class LiteLLMModel(BenchmarkModule):
                     num_labels=self.dataset_config.num_labels,
                     id2label=self.dataset_config.id2label,
                     label2id=self.dataset_config.label2id,
-                    revision=self.model_config.revision,
+                    revision="main",
                     model_cache_dir=self.model_config.model_cache_dir,
                     api_key=self.benchmark_config.api_key,
                     trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -345,7 +432,7 @@ class LiteLLMModel(BenchmarkModule):
             try:
                 repo_info = hf_api.model_info(
                     repo_id=model_id,
-                    revision=self.model_config.revision,
+                    revision="main",
                     token=os.getenv("HUGGINGFACE_API_KEY")
                     or self.benchmark_config.api_key
                     or True,
@@ -389,7 +476,7 @@ class LiteLLMModel(BenchmarkModule):
         # get the vocabulary size from the Hugging Face model configuration from the
         # Hugging Face Hub
         if self.model_config.model_id.startswith("huggingface/"):
-            model_id = self.model_config.model_id.split(sep="/", maxsplit=1)[-1]
+            model_id = "/".join(self.model_config.model_id.split(sep="/")[-2:])
             if HuggingFaceEncoderModel.model_exists(
                 model_id=model_id, benchmark_config=self.benchmark_config
             ):
@@ -398,7 +485,7 @@ class LiteLLMModel(BenchmarkModule):
                     num_labels=self.dataset_config.num_labels,
                     id2label=self.dataset_config.id2label,
                     label2id=self.dataset_config.label2id,
-                    revision=self.model_config.revision,
+                    revision="main",
                     model_cache_dir=self.model_config.model_cache_dir,
                     api_key=self.benchmark_config.api_key,
                     trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -442,7 +529,7 @@ class LiteLLMModel(BenchmarkModule):
         # If it is an Ollama model then we can get the maximum length from the Ollama
         # Python SDK
         if self.is_ollama:
-            ollama_model_id = self.model_config.model_id.split("/")[-1]
+            ollama_model_id = "/".join(self.model_config.model_id.split("/")[1:])
             model_info = ollama.show(ollama_model_id).modelinfo
             if model_info is not None:
                 context_length_keys = [
@@ -469,7 +556,7 @@ class LiteLLMModel(BenchmarkModule):
         # get the maximum length from the Hugging Face model configuration from the
         # Hugging Face Hub
         if self.model_config.model_id.startswith("huggingface/"):
-            model_id = self.model_config.model_id.split(sep="/", maxsplit=1)[-1]
+            model_id = "/".join(self.model_config.model_id.split(sep="/")[-2:])
             if HuggingFaceEncoderModel.model_exists(
                 model_id=model_id, benchmark_config=self.benchmark_config
             ):
@@ -478,7 +565,7 @@ class LiteLLMModel(BenchmarkModule):
                     num_labels=self.dataset_config.num_labels,
                     id2label=self.dataset_config.id2label,
                     label2id=self.dataset_config.label2id,
-                    revision=self.model_config.revision,
+                    revision="main",
                     model_cache_dir=self.model_config.model_cache_dir,
                     api_key=self.benchmark_config.api_key,
                     trust_remote_code=self.benchmark_config.trust_remote_code,
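
The repeated change from split("/")[-1] to "/".join(split("/")[1:]) (and to [-2:] for the huggingface/ prefix) matters for model IDs that contain more than one slash, which the old code silently truncated. A quick illustration:

ollama_id = "ollama_chat/hf.co/org/model:Q4_K_M"

# Old behaviour: keeps only the last path component and loses the repository.
print(ollama_id.split("/")[-1])            # model:Q4_K_M

# New behaviour: strips only the provider prefix.
print("/".join(ollama_id.split("/")[1:]))  # hf.co/org/model:Q4_K_M

hf_id = "huggingface/org/model"

# New behaviour for Hugging Face IDs: keep the final "org/model" part.
print("/".join(hf_id.split(sep="/")[-2:]))  # org/model
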
@@ -563,6 +650,7 @@ class LiteLLMModel(BenchmarkModule):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
                 return text_to_text.extract_labels_from_generation
@@ -605,45 +693,15 @@ class LiteLLMModel(BenchmarkModule):
             Whether the model exists, or an error describing why we cannot check
             whether the model exists.
         """
+        model_id, _ = model_id.split("@") if "@" in model_id else (model_id, "main")
         if model_id in litellm.model_list:
             return True

-        # If it is an Ollama model then try to download it
+        # Separate check for Ollama models
         if model_id.startswith("ollama/") or model_id.startswith("ollama_chat/"):
-            ollama_model_id = model_id.split("/")[-1]
-            downloaded_ollama_models: list[str] = [
-                model_obj.model
-                for model_obj in ollama.list().models
-                if model_obj.model is not None
-            ]
-            if ollama_model_id not in downloaded_ollama_models:
-                try:
-                    response = ollama.pull(model=ollama_model_id, stream=True)
-                    with tqdm(
-                        desc=f"Downloading {ollama_model_id}",
-                        unit_scale=True,
-                        unit="B",
-                        leave=False,
-                    ) as pbar:
-                        for status in response:
-                            if status.total is not None:
-                                pbar.total = status.total
-                            if status.completed is not None:
-                                pbar.update(status.completed - pbar.n)
-                except ollama.ResponseError as e:
-                    if "file does not exist" in str(e).lower():
-                        return False
-                    else:
-                        raise InvalidModel(
-                            f"Failed to download Ollama model {ollama_model_id}. The "
-                            f"error message was: {e}"
-                        )
-            else:
-                log_once(
-                    f"Ollama model {ollama_model_id!r} already downloaded, so skipping "
-                    "download.",
-                    level=logging.DEBUG,
-                )
+            ollama_model_exists = try_download_ollama_model(model_id=model_id)
+            if ollama_model_exists:
+                return ollama_model_exists

         num_attempts = 10
         for _ in range(num_attempts):
@@ -657,12 +715,27 @@ class LiteLLMModel(BenchmarkModule):
                     api_version=benchmark_config.api_version,
                 )
                 return True
+            # A rate limit indicates that the model *does* exist, but we are being rate
+            # limited.
+            except RateLimitError:
+                return True
+            except (
+                APIConnectionError,
+                Timeout,
+                ServiceUnavailableError,
+                InternalServerError,
+            ) as e:
+                logger.debug(
+                    f"Service temporarily unavailable. The error message was: {e}. "
+                    "Retrying in 10 seconds..."
+                )
+                sleep(5)
             except APIError as e:
                 if "'503 Service Unavailable" not in str(e):
                     raise e
                 logger.warning(
-                    f"Failed to check if model {model_id!r} exists. Retrying in "
-                    f"{num_attempts} seconds..."
+                    f"Failed to check if model {model_id!r} exists. Retrying in 10 "
+                    "seconds..."
                 )
                 sleep(10)
             except (BadRequestError, NotFoundError):
@@ -708,9 +781,10 @@ class LiteLLMModel(BenchmarkModule):
         Returns:
             The model configuration.
         """
+        model_id, revision = model_id.split("@") if "@" in model_id else (model_id, "")
         return ModelConfig(
             model_id=model_id,
-            revision="main",
+            revision=revision,
             task="text-generation",
             languages=list(),
             merge=False,
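
A model ID can now carry a parameter after an '@' (for example something like o3-mini@high, or a Claude model with @thinking), and that parameter is stored in the ModelConfig's revision field rather than being treated as a Git revision. A small sketch of the parsing done by the added line above:

def split_model_param(model_id: str) -> tuple[str, str]:
    """Mirror the parsing above: 'o3-mini@high' -> ('o3-mini', 'high')."""
    if "@" in model_id:
        base, param = model_id.split("@", maxsplit=1)
        return base, param
    return model_id, ""


print(split_model_param("o3-mini@high"))       # ('o3-mini', 'high')
print(split_model_param("gpt-4o-2024-08-06"))  # ('gpt-4o-2024-08-06', '')
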
@@ -1025,3 +1099,123 @@ class LiteLLMModel(BenchmarkModule):

         examples["messages"] = messages_list
         return examples
+
+
+def raise_if_wrong_params(
+    model_config: ModelConfig, allowed_params: dict[str, list[str]]
+) -> None:
+    """Raise an error if the model configuration has invalid parameters.
+
+    Args:
+        model_config:
+            The model configuration.
+        allowed_params:
+            The allowed parameters for the model.
+
+    Raises:
+        InvalidModel:
+            If the model configuration has invalid parameters.
+    """
+    param = model_config.revision
+    if param == "":
+        return
+    for model_regex, allowed_params_list in allowed_params.items():
+        if re.fullmatch(pattern=model_regex, string=model_config.model_id):
+            if param not in allowed_params_list:
+                msg = (
+                    f"Invalid parameter {param!r} for model {model_config.model_id!r}."
+                )
+                if allowed_params_list:
+                    msg += f" Allowed parameters are: {', '.join(allowed_params_list)}."
+                else:
+                    msg += " No parameters are allowed."
+                raise InvalidModel(msg)
+            return
+
+
+def try_download_ollama_model(model_id: str) -> bool:
+    """Try to download an Ollama model.
+
+    Args:
+        model_id:
+            The model ID. If the model does not start with "ollama/" or "ollama_chat/"
+            then this function will return False.
+
+    Returns:
+        Whether the model was downloaded successfully.
+    """
+    if not (model_id.startswith("ollama/") or model_id.startswith("ollama_chat/")):
+        return False
+
+    if model_id.startswith("ollama/"):
+        log_once(
+            "You're trying to benchmark a model with the old 'ollama/' prefix, which "
+            "probably results in bad performance, as it doesn't use the model's chat "
+            "template. If the model is not a chat model then just disregard this "
+            "warning, but if it is a chat model then please cancel this run and "
+            "use the 'ollama_chat/' prefix instead.",
+            level=logging.WARNING,
+        )
+
+    downloaded_ollama_models: list[str] = [
+        model_obj.model
+        for model_obj in ollama.list().models
+        if model_obj.model is not None
+    ]
+
+    ollama_model_id = "/".join(model_id.split("/")[1:])
+    if ollama_model_id not in downloaded_ollama_models:
+        # Try fetching the model info
+        try:
+            response = ollama.pull(model=ollama_model_id, stream=True)
+        except ollama.ResponseError as e:
+            if "file does not exist" in str(e).lower():
+                # Check if the model exists if we prepend "hf.co/"
+                try:
+                    ollama_model_id_with_prefix = f"hf.co/{ollama_model_id}"
+                    model_id_with_prefix = (
+                        f"{model_id.split('/')[0]}/{ollama_model_id_with_prefix}"
+                    )
+                    ollama.pull(model=ollama_model_id_with_prefix, stream=True)
+                    log_once(
+                        f"The model {model_id!r} cannot be found on Ollama, but the "
+                        f"model {model_id_with_prefix} *was* found, so we would "
+                        "recommend you cancelling this run and trying the evaluation "
+                        "with that model ID instead."
+                    )
+                    return False
+                except ollama.ResponseError as inner_e:
+                    if "file does not exist" in str(inner_e).lower():
+                        return False
+                    else:
+                        raise InvalidModel(
+                            f"Failed to download Ollama model {ollama_model_id}. "
+                            f"The error message was: {inner_e}"
+                        )
+            else:
+                raise InvalidModel(
+                    f"Failed to download Ollama model {ollama_model_id}. "
+                    f"The error message was: {e}"
+                )
+
+        # Download the model
+        with tqdm(
+            desc=f"Downloading {ollama_model_id}",
+            unit_scale=True,
+            unit="B",
+            leave=False,
+        ) as pbar:
+            for status in response:
+                if status.total is not None:
+                    pbar.total = status.total
+                if status.completed is not None:
+                    pbar.update(status.completed - pbar.n)
+        return True
+
+    else:
+        log_once(
+            f"Ollama model {ollama_model_id!r} already downloaded, so skipping "
+            "download.",
+            level=logging.DEBUG,
+        )
+    return True
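
raise_if_wrong_params ties the '@' parameter back to the ALLOWED_PARAMS table near the top of the file: the first regex that fully matches the model ID decides which parameters are accepted. A sketch of the expected outcomes, using a minimal stand-in for ModelConfig since its full constructor is not shown in this diff:

import re
from dataclasses import dataclass


@dataclass
class FakeModelConfig:  # stand-in for euroeval.data_models.ModelConfig
    model_id: str
    revision: str


# Subset of ALLOWED_PARAMS, copied from the diff above.
ALLOWED_PARAMS = {
    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "high"],
    r"(anthropic/)?claude-3.7-sonnet.*": ["thinking"],
    r"(gemini/)?gemini-.*": [],
}


def check(config: FakeModelConfig) -> str:
    """Re-run the same logic as raise_if_wrong_params, returning a verdict."""
    if config.revision == "":
        return "ok (no parameter given)"
    for pattern, allowed in ALLOWED_PARAMS.items():
        if re.fullmatch(pattern, config.model_id):
            return "ok" if config.revision in allowed else "InvalidModel"
    return "ok (no matching pattern)"


print(check(FakeModelConfig("o3-mini", "high")))                      # ok
print(check(FakeModelConfig("gemini/gemini-2.0-flash", "thinking")))  # InvalidModel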