EuroEval: euroeval-16.0.1-py3-none-any.whl → euroeval-16.1.0-py3-none-any.whl
- euroeval/benchmark_config_factory.py +6 -1
- euroeval/benchmark_modules/base.py +2 -0
- euroeval/benchmark_modules/fresh.py +7 -1
- euroeval/benchmark_modules/hf.py +26 -21
- euroeval/benchmark_modules/litellm.py +258 -131
- euroeval/benchmark_modules/vllm.py +79 -40
- euroeval/benchmarker.py +11 -2
- euroeval/cli.py +14 -1
- euroeval/constants.py +1 -1
- euroeval/data_models.py +77 -6
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +14 -0
- euroeval/dataset_configs/dutch.py +14 -0
- euroeval/dataset_configs/english.py +22 -0
- euroeval/dataset_configs/estonian.py +15 -7
- euroeval/dataset_configs/finnish.py +14 -0
- euroeval/dataset_configs/french.py +14 -0
- euroeval/dataset_configs/german.py +23 -0
- euroeval/dataset_configs/italian.py +14 -0
- euroeval/dataset_configs/latvian.py +14 -0
- euroeval/dataset_configs/norwegian.py +14 -0
- euroeval/dataset_configs/polish.py +126 -0
- euroeval/dataset_configs/portuguese.py +14 -0
- euroeval/dataset_configs/spanish.py +14 -0
- euroeval/dataset_configs/swedish.py +25 -0
- euroeval/enums.py +12 -0
- euroeval/generation.py +17 -8
- euroeval/generation_utils.py +58 -10
- euroeval/metrics/pipeline.py +1 -1
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +27 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +11 -0
- euroeval/prompt_templates/sentiment_classification.py +15 -0
- euroeval/prompt_templates/summarization.py +27 -1
- euroeval/scores.py +5 -0
- euroeval/task_group_utils/question_answering.py +29 -29
- euroeval/task_group_utils/sequence_classification.py +10 -33
- euroeval/task_group_utils/token_classification.py +3 -3
- euroeval/tasks.py +4 -4
- euroeval/{tokenization_utils.py → tokenisation_utils.py} +40 -23
- euroeval/utils.py +36 -3
- {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/METADATA +1 -1
- euroeval-16.1.0.dist-info/RECORD +70 -0
- euroeval-16.0.1.dist-info/RECORD +0 -69
- {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/WHEEL +0 -0
- {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/licenses/LICENSE +0 -0
--- a/euroeval/benchmark_modules/litellm.py
+++ b/euroeval/benchmark_modules/litellm.py
@@ -31,7 +31,7 @@ from litellm.exceptions import (
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
-from litellm.types.utils import ChoiceLogprobs
+from litellm.types.utils import ChoiceLogprobs, Logprobs
 from litellm.utils import supports_reasoning, supports_response_schema
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
@@ -65,7 +65,11 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
-from ..generation_utils import …
+from ..generation_utils import (
+    apply_prompt,
+    extract_few_shot_examples,
+    raise_if_wrong_params,
+)
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -73,7 +77,7 @@ from ..task_group_utils import (
     token_classification,
 )
 from ..tasks import NER
-from ..…
+from ..tokenisation_utils import get_first_label_token_mapping
 from ..types import ExtractLabelsFunction
 from ..utils import (
     add_semaphore_and_catch_exception,
@@ -81,6 +85,7 @@ from ..utils import (
     get_hf_token,
     log_once,
     safe_run,
+    split_model_id,
 )
 from .base import BenchmarkModule
 from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokeniser
@@ -153,21 +158,6 @@ NUM_PARAMS_MAPPING = {
 }
 
 
-ALLOWED_PARAMS = {
-    # OpenAI models
-    r"gpt-5-.*": ["minimal", "low", "medium", "high"],
-    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "medium", "high"],
-    # Anthropic models
-    r"(anthropic/)?claude-3-7-sonnet.*": ["no-thinking", "thinking"],
-    r"(anthropic/)?claude-(sonnet|opus)-4.*": ["no-thinking", "thinking"],
-    # Gemini models
-    r"(gemini/)?gemini-2.5-flash-lite.*": ["no-thinking", "thinking"],
-    r"(gemini/)?gemini-2.5-flash.*": ["no-thinking", "thinking"],
-    # xAI models
-    r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "medium", "high"],
-}
-
-
 REASONING_MODELS = [
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?",
     r"(gemini/)?gemini.*thinking.*",
@@ -175,6 +165,15 @@ REASONING_MODELS = [
     r"(xai/)?grok-3-mini.*",
 ]
 
+BASE_DECODER_MODELS = [
+    r"gpt-3.5-turbo-instruct.*",
+    r"ada-[0-9]{3}",
+    r"babbage-[0-9]{3}",
+    r"curie-[0-9]{3}",
+    r"davinci-[0-9]{3}",
+    r"text-davinci-[0-9]{3}",
+]
+
 
 class LiteLLMModel(BenchmarkModule):
     """A generative model from LiteLLM."""
@@ -182,6 +181,26 @@ class LiteLLMModel(BenchmarkModule):
     fresh_model = False
     batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = False
+    allowed_params = {
+        # OpenAI models
+        re.compile(r"gpt-5-.*"): ["minimal", "low", "medium", "high"],
+        re.compile(r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?"): [
+            "low",
+            "medium",
+            "high",
+        ],
+        # Anthropic models
+        re.compile(r"(anthropic/)?claude-3-7-sonnet.*"): ["no-thinking", "thinking"],
+        re.compile(r"(anthropic/)?claude-(sonnet|opus)-4.*"): [
+            "no-thinking",
+            "thinking",
+        ],
+        # Gemini models
+        re.compile(r"(gemini/)?gemini-2.5-flash-lite.*"): ["no-thinking", "thinking"],
+        re.compile(r"(gemini/)?gemini-2.5-flash.*"): ["no-thinking", "thinking"],
+        # xAI models
+        re.compile(r"(xai/)?grok-3-mini(-fast)?(-beta)?"): ["low", "medium", "high"],
+    }
 
     def __init__(
         self,
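The new `allowed_params` class attribute maps compiled model-ID patterns to the parameter values each model family accepts, and the `raise_if_wrong_params` helper (now imported from `..generation_utils`) validates the configured parameter against it. A minimal standalone sketch of that kind of regex-keyed validation, using hypothetical `ALLOWED_PARAMS` and `check_param` names rather than EuroEval's own code:

# Illustrative only: a regex-keyed parameter check in the spirit of
# `allowed_params` + `raise_if_wrong_params`; the names below are hypothetical.
import re

ALLOWED_PARAMS: dict[re.Pattern[str], list[str]] = {
    re.compile(r"gpt-5-.*"): ["minimal", "low", "medium", "high"],
    re.compile(r"(anthropic/)?claude-(sonnet|opus)-4.*"): ["no-thinking", "thinking"],
}


def check_param(model_id: str, param: str | None) -> None:
    """Raise ValueError if `param` is not valid for `model_id`."""
    if param is None:
        return
    for pattern, allowed in ALLOWED_PARAMS.items():
        if pattern.fullmatch(model_id):
            if param not in allowed:
                raise ValueError(
                    f"Invalid parameter {param!r} for {model_id!r}; "
                    f"allowed values: {', '.join(allowed)}."
                )
            return
    raise ValueError(f"The model {model_id!r} does not accept any parameters.")


check_param("gpt-5-mini", "low")  # passes silently
check_param("anthropic/claude-sonnet-4-20250514", "thinking")  # passes silently

Keying the table on compiled patterns keeps the lookup a simple linear scan while still matching dated model aliases.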
@@ -206,6 +225,10 @@ class LiteLLMModel(BenchmarkModule):
                 The generation kwargs to pass to the model. If None, default values will
                 be used.
         """
+        raise_if_wrong_params(
+            model_config=model_config, allowed_params=self.allowed_params
+        )
+
         # Detect whether the model is an Ollama model, as we need to extract metadata
         # differently for these models
         self.is_ollama = model_config.model_id.startswith(
@@ -217,8 +240,6 @@ class LiteLLMModel(BenchmarkModule):
             else ollama.ShowResponse(model_info=None)
         )
 
-        raise_if_wrong_params(model_config=model_config, allowed_params=ALLOWED_PARAMS)
-
         super().__init__(
             model_config=model_config,
             dataset_config=dataset_config,
@@ -242,21 +263,27 @@ class LiteLLMModel(BenchmarkModule):
         Returns:
             The generative type of the model, or None if it has not been set yet.
         """
-        if self.…
+        if self.benchmark_config.generative_type is not None:
+            type_ = self.benchmark_config.generative_type
+        elif self.is_ollama:
             reasoning_model = "thinking" in (self._ollama_show.capabilities or [])
             type_ = (
                 GenerativeType.REASONING
                 if reasoning_model
                 else GenerativeType.INSTRUCTION_TUNED
             )
-        elif self.model_config.…
+        elif self.model_config.param in {"thinking"}:
             type_ = GenerativeType.REASONING
-        elif self.model_config.…
+        elif self.model_config.param in {"no-thinking"}:
             type_ = GenerativeType.INSTRUCTION_TUNED
         elif re.fullmatch(
             pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
         ):
             type_ = GenerativeType.REASONING
+        elif re.fullmatch(
+            pattern="|".join(BASE_DECODER_MODELS), string=self.model_config.model_id
+        ):
+            type_ = GenerativeType.BASE
         elif supports_reasoning(model=self.model_config.model_id):
             type_ = GenerativeType.REASONING
         else:
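In the `generative_type` hunk above, the model type is resolved in order of precedence: an explicit benchmark-config override, Ollama capabilities, the `thinking`/`no-thinking` parameter, the `REASONING_MODELS` and new `BASE_DECODER_MODELS` regex lists, and finally LiteLLM's `supports_reasoning`. A rough, self-contained sketch of the regex-list step (pattern lists abbreviated and the `classify` helper invented for illustration):

# Sketch of matching a model ID against pattern lists, as the generative_type
# property does with REASONING_MODELS and BASE_DECODER_MODELS above.
import re

REASONING_MODELS = [r"o[1-9](-mini|-preview)?", r"(xai/)?grok-3-mini.*"]
BASE_DECODER_MODELS = [r"gpt-3.5-turbo-instruct.*", r"davinci-[0-9]{3}"]


def classify(model_id: str) -> str:
    # re.fullmatch against the alternation of all patterns in a list
    if re.fullmatch("|".join(REASONING_MODELS), model_id):
        return "reasoning"
    if re.fullmatch("|".join(BASE_DECODER_MODELS), model_id):
        return "base"
    return "instruction-tuned"


print(classify("o3-mini"))                 # reasoning
print(classify("gpt-3.5-turbo-instruct"))  # base
print(classify("gpt-4o"))                  # instruction-tuned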
@@ -279,9 +306,20 @@ class LiteLLMModel(BenchmarkModule):
 
         Returns:
             The generated model outputs.
+
+        Raises:
+            InvalidBenchmark:
+                If the inputs do not contain either 'messages' or 'text' keys.
         """
-        …
-        …
+        model_inputs: list[list[litellm.AllMessageValues] | str]
+        if "messages" in inputs:
+            model_inputs = inputs["messages"]
+        elif "text" in inputs:
+            model_inputs = inputs["text"]
+        else:
+            raise InvalidBenchmark(
+                "The inputs must contain either 'messages' or 'text' keys."
+            )
 
         # Get the mapping from labels to the first token in the label. We call this each
         # time we generate a new dataset since the dataset config can change
@@ -294,22 +332,22 @@ class LiteLLMModel(BenchmarkModule):
         )
 
         all_responses: dict[int, "ModelResponse"] = {}
-        …
-            enumerate(…
+        inputs_to_run: list[tuple[int, list[litellm.AllMessageValues] | str]] = list(
+            enumerate(model_inputs)
         )
         for attempt in range(num_attempts := 10):
-            if not …
+            if not inputs_to_run:
                 break
 
             generation_kwargs = self.generation_kwargs or self.get_generation_kwargs(
                 dataset_config=self.dataset_config
             )
 
-            batch_indices, …
+            batch_indices, batch_inputs = zip(*inputs_to_run)
             successes, failures = safe_run(
                 self._generate_async(
                     model_id=self.model_config.model_id,
-                    …
+                    inputs=list(batch_inputs),
                     **generation_kwargs,
                 )
             )
@@ -321,17 +359,17 @@ class LiteLLMModel(BenchmarkModule):
 
             # If all requests were successful, break
             if not failures:
-                …
+                inputs_to_run = []
                 break
 
             # Put the failed requests back in the queue to try again
-            …
-                (batch_indices[idx], …
+            inputs_to_run = [
+                (batch_indices[idx], model_inputs[batch_indices[idx]])
                 for idx, _ in failures
             ]
             logger.debug(
                 f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
-                f"{len(…
+                f"{len(inputs_to_run):,} failed message(s)"
             )
 
             # Attempt to handle the exceptions, to improve the chance of getting
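The two hunks above implement the batch retry loop: inputs are tracked as (index, input) pairs, successful responses are stored by index, and only the failed indices are re-queued for up to ten attempts. A small synchronous sketch of the same bookkeeping (the `flaky_call` stand-in replaces the real async LiteLLM request):

# Illustrative retry loop over indexed inputs; only failed items are re-queued,
# successes are stored by index and re-ordered at the end.
import random


def flaky_call(text: str) -> str:
    if random.random() < 0.3:
        raise RuntimeError("transient failure")
    return text.upper()


inputs = ["a", "b", "c", "d"]
results: dict[int, str] = {}
to_run = list(enumerate(inputs))

for attempt in range(num_attempts := 10):
    if not to_run:
        break
    failures = []
    for idx, item in to_run:
        try:
            results[idx] = flaky_call(item)
        except RuntimeError:
            failures.append((idx, item))
    to_run = failures  # retry only what failed

ordered = [results[i] for i in range(len(inputs))]
print(ordered)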
@@ -349,14 +387,14 @@ class LiteLLMModel(BenchmarkModule):
         )
 
         # Extract the generations from the model output
-        ordered_responses = [all_responses[i] for i in range(len(…
+        ordered_responses = [all_responses[i] for i in range(len(model_inputs))]
         model_output = self._create_model_output(
             model_responses=ordered_responses, model_id=self.model_config.model_id
         )
 
-        if len(…
+        if len(model_inputs) != len(model_output.sequences):
             raise InvalidBenchmark(
-                f"Number of model inputs ({len(…
+                f"Number of model inputs ({len(model_inputs):,}) does not match the "
                 f"number of model outputs ({len(model_output.sequences):,})."
             )
 
@@ -378,16 +416,24 @@ class LiteLLMModel(BenchmarkModule):
         model_id = self.model_config.model_id
 
         # Error messages that we want to catch and handle
-        stop_messages = […
+        stop_messages = [
+            "stop_sequences",
+            "'stop' is not supported with this model",
+            "'$.stop' is invalid",
+        ]
         logprobs_messages = [
             "you are not allowed to request logprobs",
             "you've reached the maximum number of requests with logprobs",
             "logprobs is not supported",
             "logprobs is not enabled",
         ]
+        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
         logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'top_logprobs'.*\]"
         )
+        max_completion_tokens_pattern = re.compile(
+            r"does not support parameters: \[.*'max_completion_tokens'.*\]"
+        )
         temperature_messages = [
             "'temperature' is not supported with this model.",
             "temperature is not supported with this model",
@@ -406,6 +452,10 @@ class LiteLLMModel(BenchmarkModule):
         )
         requires_thinking_disabled_messages = ["thinking.type: Field required"]
         seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
+        response_format_messages = [
+            "got an unexpected keyword argument 'response_format'",
+            "The model outputs empty dictionaries.",
+        ]
 
         if any(msg.lower() in error_msg for msg in stop_messages):
             log_once(
@@ -430,6 +480,24 @@ class LiteLLMModel(BenchmarkModule):
             generation_kwargs.pop("logprobs", None)
             generation_kwargs.pop("top_logprobs", None)
             return generation_kwargs
+        elif any(msg.lower() in error_msg for msg in top_logprobs_messages):
+            log_once(
+                f"The model {model_id!r} does not support the `top_logprobs` argument, "
+                "so moving the value to `logprobs`.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["logprobs"] = generation_kwargs.pop("top_logprobs", None)
+            return generation_kwargs
+        elif max_completion_tokens_pattern.search(string=error_msg):
+            log_once(
+                f"The model {model_id!r} does not support max_completion_tokens, so "
+                "disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["max_tokens"] = generation_kwargs.pop(
+                "max_completion_tokens", None
+            )
+            return generation_kwargs
         elif any(msg.lower() in error_msg for msg in temperature_messages):
             log_once(
                 f"The model {model_id!r} does not support "
@@ -510,6 +578,14 @@ class LiteLLMModel(BenchmarkModule):
             )
             generation_kwargs.pop("seed", None)
             return generation_kwargs
+        elif any(msg.lower() in error_msg for msg in response_format_messages):
+            log_once(
+                f"The model {model_id!r} does not support the `response_format` "
+                "parameter, so disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs.pop("response_format", None)
+            return generation_kwargs
         # If there are too many I/O connections, we increase the number of allowed file
         # descriptors
         elif "too many open files" in error_msg:
@@ -572,7 +648,7 @@ class LiteLLMModel(BenchmarkModule):
     async def _generate_async(
         self,
         model_id: str,
-        …
+        inputs: list[list[litellm.AllMessageValues] | str],
         **generation_kwargs,
     ) -> tuple[list[tuple[int, "ModelResponse"]], list[tuple[int, Exception]]]:
         """Generate outputs from the model asynchronously.
@@ -580,8 +656,8 @@ class LiteLLMModel(BenchmarkModule):
         Args:
             model_id:
                 The ID of the model to use for generation.
-            …
-                The …
+            inputs:
+                The inputs to pass to the model.
             **generation_kwargs:
                 Additional generation arguments to pass to the model.
 
@@ -604,17 +680,51 @@ class LiteLLMModel(BenchmarkModule):
         # Get the LLM generations asynchronously
         max_concurrent_calls = 20
         semaphore = asyncio.Semaphore(max_concurrent_calls)
-        …
-        …
-        …
-        …
-        )…
-        …
-        …
-        …
-        …
+        if self.generative_type == GenerativeType.BASE:
+            if not all(isinstance(input_, str) for input_ in inputs):
+                raise InvalidBenchmark(
+                    "For base generative models, all inputs must be strings."
+                )
+            requests = [
+                add_semaphore_and_catch_exception(
+                    router.atext_completion(
+                        model=model_id, prompt=input_, **generation_kwargs
+                    ),
+                    semaphore=semaphore,
+                )
+                for input_ in inputs
+                if isinstance(input_, str)
+            ]
+        else:
+            if not all(isinstance(input_, list) for input_ in inputs):
+                raise InvalidBenchmark(
+                    "For instruction-tuned and reasoning generative models, all "
+                    "inputs must be lists of messages."
+                )
+            requests = [
+                add_semaphore_and_catch_exception(
+                    router.acompletion(
+                        model=model_id, messages=input_, **generation_kwargs
+                    ),
+                    semaphore=semaphore,
+                )
+                for input_ in inputs
+                if isinstance(input_, list)
+            ]
         responses = await tqdm_async.gather(*requests, leave=False)
 
+        # If we are performing structured generation and the model just outputs an empty
+        # dictionary, then we convert those to exceptions, to disable structured
+        # generation
+        if "response_format" in generation_kwargs:
+            responses = [
+                RuntimeError("The model outputs empty dictionaries.")
+                if not isinstance(response, Exception)
+                and any(choice.message.content == "{}" for choice in response.choices)
+                else response
+                for response in responses
+            ]
+
         # Separate the successful responses from the failed ones
         successes = [
             (idx, response)
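`_generate_async` now dispatches `router.atext_completion` for base models (plain string prompts) and `router.acompletion` for chat models (message lists), wrapping every call in a semaphore via `add_semaphore_and_catch_exception` so that at most 20 requests are in flight and a single failure does not abort the batch. A standalone sketch of that concurrency pattern, with a `fake_completion` placeholder instead of LiteLLM:

# Sketch of the semaphore-plus-gather pattern: cap in-flight calls, catch
# exceptions per request, then split successes from failures by index.
# `fake_completion` and `with_semaphore` are illustrative stand-ins.
import asyncio


async def fake_completion(prompt: str) -> str:
    await asyncio.sleep(0.01)
    if "bad" in prompt:
        raise RuntimeError("simulated API error")
    return prompt[::-1]


async def with_semaphore(coro, semaphore: asyncio.Semaphore):
    async with semaphore:
        try:
            return await coro
        except Exception as exc:  # returned, not raised, so the batch keeps going
            return exc


async def main() -> None:
    semaphore = asyncio.Semaphore(20)  # max_concurrent_calls
    prompts = ["hello", "bad prompt", "world"]
    requests = [with_semaphore(fake_completion(p), semaphore) for p in prompts]
    responses = await asyncio.gather(*requests)
    successes = [(i, r) for i, r in enumerate(responses) if not isinstance(r, Exception)]
    failures = [(i, r) for i, r in enumerate(responses) if isinstance(r, Exception)]
    print(successes, failures)


asyncio.run(main())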
@@ -630,7 +740,10 @@ class LiteLLMModel(BenchmarkModule):
         # Close connections
         for request in requests:
             if hasattr(request, "close"):
-                …
+                try:
+                    request.close()
+                except RuntimeError as e:
+                    logger.debug(f"RuntimeError during request.close(): {e}")
 
         return successes, failures
 
@@ -663,10 +776,18 @@ class LiteLLMModel(BenchmarkModule):
                 continue
 
             model_response_choices = model_response.choices[0]
-            …
-            …
-            …
-            …
+
+            if isinstance(model_response_choices, litellm.Choices):
+                generated_message: litellm.Message = model_response_choices.message
+                generation_output = generated_message.content or ""
+                generation_output = generation_output.strip()
+            elif isinstance(model_response_choices, litellm.litellm.TextChoices):
+                generation_output = model_response_choices.text or ""
+            else:
+                raise InvalidBenchmark(
+                    "The model response choices must be of type Choices or "
+                    f"TextChoices. Got {type(model_response_choices)}."
+                )
 
             # In the case where we're dealing with a classification task, the model is
             # outputting a JSON dictionary, so we will extract the generated text from
@@ -687,40 +808,55 @@ class LiteLLMModel(BenchmarkModule):
 
             # Structure the model output as a GenerativeModelOutput object
             sequences.append(generation_output)
-            if …
+            if (
+                hasattr(model_response_choices, "logprobs")
+                and model_response_choices.logprobs is not None
+            ):
                 logprobs_obj = model_response_choices.logprobs
+
+                if not isinstance(logprobs_obj, (Logprobs, ChoiceLogprobs)):
+                    log_once(
+                        "The logprobs object is malformed, so we won't use logprobs to "
+                        "determine the labels.",
+                        level=logging.WARNING,
+                    )
+                    continue
+
+                logprobs_list: list[list[tuple[str, float]]]
                 if isinstance(logprobs_obj, ChoiceLogprobs):
-                    logprobs_list …
+                    logprobs_list = [
                         [
                             (top_logprob.token, top_logprob.logprob)
                             for top_logprob in content.top_logprobs
                         ]
-                        for content in …
+                        for content in logprobs_obj.content or list()
+                    ]
+                else:
+                    logprobs_list = [
+                        [
+                            (token, logprob)
+                            for token, logprob in (top_logprobs_dct or dict()).items()
+                        ]
+                        for top_logprobs_dct in logprobs_obj.top_logprobs or list()
                     ]
 
-                    …
-                    …
-                    …
-                    …
-                    …
-                    …
+                # If the model outputted a JSON dictionary, we need to find the
+                # token index of the value within the dictionary, rather than the
+                # first token of the entire output
+                if generation_dct:
+                    key_name = next(iter(generation_dct.keys()))
+                    logprobs_list = [
+                        lst
+                        for lst in logprobs_list
+                        if (
                             lst
-                            …
-                            …
-                            …
-                            …
-                            …
-                            and not key_name.startswith(token)
-                        )
-                    ]
+                            and lst[0]
+                            and (token := lst[0][0].strip(JSON_STRIP_CHARACTERS))
+                            and not key_name.startswith(token)
+                        )
+                    ]
 
-                …
-                else:
-                    log_once(
-                        "The logprobs object is malformed, so we won't use logprobs to "
-                        "determine the labels.",
-                        level=logging.WARNING,
-                    )
+                scores.append(logprobs_list)
 
         if not sequences:
             logger.warning(
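The `_create_model_output` changes above accept two logprob layouts: chat-style `ChoiceLogprobs`, where each content entry carries top-logprob objects with `.token`/`.logprob` attributes, and text-completion-style `Logprobs`, where `top_logprobs` is a list of token-to-logprob dictionaries. Both are normalised to `list[list[tuple[str, float]]]`. A simplified sketch with stand-in dataclasses (not litellm's real types):

# Normalising the two logprob layouts into list[list[tuple[str, float]]].
# TopLogprob and ChatContent are simplified stand-ins for litellm's types.
from dataclasses import dataclass


@dataclass
class TopLogprob:
    token: str
    logprob: float


@dataclass
class ChatContent:
    top_logprobs: list[TopLogprob]


def from_chat(contents: list[ChatContent]) -> list[list[tuple[str, float]]]:
    return [[(t.token, t.logprob) for t in c.top_logprobs] for c in contents]


def from_text(top_logprobs: list[dict[str, float] | None]) -> list[list[tuple[str, float]]]:
    return [list((d or {}).items()) for d in top_logprobs]


chat = [ChatContent([TopLogprob("yes", -0.1), TopLogprob("no", -2.3)])]
text = [{"yes": -0.1, "no": -2.3}, None]
print(from_chat(chat))  # [[('yes', -0.1), ('no', -2.3)]]
print(from_text(text))  # [[('yes', -0.1), ('no', -2.3)], []]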
@@ -1047,7 +1183,7 @@ class LiteLLMModel(BenchmarkModule):
             Whether the model exists, or an error describing why we cannot check
             whether the model exists.
         """
-        model_id …
+        model_id = split_model_id(model_id=model_id).model_id
         if model_id in litellm.model_list:
             return True
 
@@ -1135,10 +1271,29 @@ class LiteLLMModel(BenchmarkModule):
         Returns:
             The model configuration.
         """
-        …
+        model_id_components = split_model_id(model_id=model_id)
+
+        # Backwards compatibility: If the revision is set but not the parameter, we
+        # assume that the revision is actually the parameter and log this as a warning.
+        if model_id_components.revision != "main" and model_id_components.param is None:
+            proper_model_id = (
+                f"{model_id_components.model_id}#{model_id_components.revision}"
+            )
+            log_once(
+                f"The model ID {model_id!r} specifies a revision "
+                f"{model_id_components.revision!r} but not a parameter. We assume "
+                "that the revision is actually the parameter and set the revision "
+                "to 'main'. In the future, use the new '#' syntax to specify the "
+                f"parameter (in this case, this would be {proper_model_id!r}), as this "
+                "will be an error in future versions of EuroEval."
+            )
+            model_id_components.param = model_id_components.revision
+            model_id_components.revision = "main"
+
         return ModelConfig(
-            model_id=model_id,
-            revision=revision,
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            param=model_id_components.param,
             task="text-generation",
             languages=list(),
             merge=False,
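The hunk above routes model IDs through `split_model_id` and, for backwards compatibility, reinterprets a bare revision as the parameter while warning that the new '#' syntax should be used. A rough illustration of that parsing rule, assuming '@' marks the revision and '#' marks the parameter (the actual separators and return type live in EuroEval's `split_model_id`, not here):

# Hypothetical parser for IDs like "gpt-5-mini#low" or "some/model@main#thinking",
# mirroring the backwards-compatibility rule in the diff above: if only a
# revision is given, treat it as the parameter and reset the revision to "main".
from dataclasses import dataclass


@dataclass
class ModelIdComponents:
    model_id: str
    revision: str = "main"
    param: str | None = None


def parse_model_id(raw: str) -> ModelIdComponents:
    base, _, param = raw.partition("#")
    model_id, _, revision = base.partition("@")
    components = ModelIdComponents(model_id, revision or "main", param or None)
    if components.revision != "main" and components.param is None:
        # Old syntax: the "revision" was really the parameter.
        components.param = components.revision
        components.revision = "main"
    return components


print(parse_model_id("gpt-5-mini#low"))
print(parse_model_id("o3-mini@high"))  # old style, re-interpreted as param="high"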
@@ -1207,7 +1362,7 @@ class LiteLLMModel(BenchmarkModule):
                 few_shot_examples=few_shot_examples,
                 model_config=self.model_config,
                 dataset_config=self.dataset_config,
-                …
+                generative_type=self.generative_type,
                 always_populate_text_field=False,
                 tokeniser=None,
             ),
@@ -1313,7 +1468,7 @@ class LiteLLMModel(BenchmarkModule):
         if self.buffer["first_label_token_mapping"]:
             generation_kwargs["logprobs"] = True
             generation_kwargs["top_logprobs"] = MAX_LITELLM_LOGPROBS
-        if self.model_config.…
+        if self.model_config.param == "thinking":
             generation_kwargs["thinking"] = dict(
                 type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
             )
@@ -1321,16 +1476,16 @@ class LiteLLMModel(BenchmarkModule):
                 f"Enabling thinking mode for model {self.model_config.model_id!r}",
                 level=logging.DEBUG,
             )
-        elif self.model_config.…
+        elif self.model_config.param == "no-thinking":
             generation_kwargs["thinking"] = dict(budget_tokens=0)
             log_once(
                 f"Disabling thinking mode for model {self.model_config.model_id!r}",
                 level=logging.DEBUG,
             )
-        elif self.model_config.…
-            generation_kwargs["reasoning_effort"] = self.model_config.…
+        elif self.model_config.param in {"minimal", "low", "medium", "high"}:
+            generation_kwargs["reasoning_effort"] = self.model_config.param
             log_once(
-                f"Enabling reasoning effort {self.model_config.…
+                f"Enabling reasoning effort {self.model_config.param!r} for model "
                 f"{self.model_config.model_id!r}",
                 level=logging.DEBUG,
             )
@@ -1338,14 +1493,18 @@ class LiteLLMModel(BenchmarkModule):
         # First attempt is a test run with a single conversation to handle errors
         # quickly. We repeat this multiple times to deal with different types of
         # errors, and stop if we get a successful response.
-        …
-        …
-        …
-        …
+        test_input: list[litellm.AllMessageValues] | str
+        if self.generative_type == GenerativeType.BASE:
+            test_input = "Test message"
+        else:
+            test_input = [
+                litellm.ChatCompletionUserMessage(role="user", content="Test message")
+            ]
+        for _ in range(num_attempts := 10):
             _, failures = safe_run(
                 self._generate_async(
                     model_id=self.model_config.model_id,
-                    …
+                    inputs=[test_input],
                     **generation_kwargs,
                 )
             )
@@ -1355,47 +1514,15 @@ class LiteLLMModel(BenchmarkModule):
                 generation_kwargs = self._handle_exception(
                     error=error, **generation_kwargs
                 )
+        else:
+            raise InvalidModel(
+                "Failed to get a successful response from the model "
+                f"{self.model_config.model_id!r} after {num_attempts} attempts."
+            )
 
         return generation_kwargs
 
 
-def raise_if_wrong_params(
-    model_config: ModelConfig, allowed_params: dict[str, list[str]]
-) -> None:
-    """Raise an error if the model configuration has invalid parameters.
-
-    Args:
-        model_config:
-            The model configuration.
-        allowed_params:
-            The allowed parameters for the model.
-
-    Raises:
-        InvalidModel:
-            If the model configuration has invalid parameters.
-    """
-    param = model_config.revision
-    if param == "":
-        return
-    for model_regex, allowed_params_list in allowed_params.items():
-        if re.fullmatch(pattern=model_regex, string=model_config.model_id):
-            if param not in allowed_params_list:
-                msg = (
-                    f"Invalid parameter {param!r} for model {model_config.model_id!r}."
-                )
-                if allowed_params_list:
-                    msg += f" Allowed parameters are: {', '.join(allowed_params_list)}."
-                else:
-                    msg += " No parameters are allowed."
-                raise InvalidModel(msg)
-            return
-        else:
-            raise InvalidModel(
-                f"The parameter {param!r} is not supported for the model "
-                f"{model_config.model_id!r}."
-            )
-
-
 def try_download_ollama_model(model_id: str) -> bool:
     """Try to download an Ollama model.
 