EuroEval 15.16.0-py3-none-any.whl → 16.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (63)
  1. euroeval/__init__.py +3 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +161 -114
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +13 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +53 -7
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +38 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +46 -14
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +234 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  41. euroeval/prompt_templates/multiple_choice.py +23 -2
  42. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  43. euroeval/prompt_templates/reading_comprehension.py +42 -2
  44. euroeval/prompt_templates/sentiment_classification.py +46 -2
  45. euroeval/prompt_templates/summarization.py +24 -4
  46. euroeval/scores.py +7 -2
  47. euroeval/speed_benchmark.py +6 -6
  48. euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  49. euroeval/task_group_utils/question_answering.py +35 -28
  50. euroeval/task_group_utils/sequence_classification.py +96 -23
  51. euroeval/task_group_utils/text_to_text.py +7 -3
  52. euroeval/task_group_utils/token_classification.py +47 -75
  53. euroeval/tasks.py +31 -6
  54. euroeval/tokenization_utils.py +295 -207
  55. euroeval/utils.py +118 -34
  56. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +11 -14
  57. euroeval-16.0.0.dist-info/RECORD +69 -0
  58. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
  59. euroeval/human_evaluation.py +0 -738
  60. euroeval/metrics.py +0 -470
  61. euroeval-15.16.0.dist-info/RECORD +0 -63
  62. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
@@ -2,8 +2,8 @@
 
 import asyncio
 import collections.abc as c
+import json
 import logging
-import os
 import re
 import typing as t
 from functools import cache, cached_property, partial
@@ -38,7 +38,12 @@ from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
 from tqdm.auto import tqdm
 
-from ..constants import MAX_LOGPROBS, REASONING_MAX_TOKENS, TASKS_USING_JSON
+from ..constants import (
+    JSON_STRIP_CHARACTERS,
+    LITELLM_CLASSIFICATION_OUTPUT_KEY,
+    MAX_LITELLM_LOGPROBS,
+    REASONING_MAX_TOKENS,
+)
 from ..data_models import (
     BenchmarkConfig,
     DatasetConfig,
@@ -67,16 +72,18 @@ from ..task_group_utils import (
     text_to_text,
     token_classification,
 )
+from ..tasks import NER
 from ..tokenization_utils import get_first_label_token_mapping
 from ..types import ExtractLabelsFunction
 from ..utils import (
     add_semaphore_and_catch_exception,
     create_model_cache_dir,
+    get_hf_token,
     log_once,
     safe_run,
 )
 from .base import BenchmarkModule
-from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer
+from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokeniser
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -155,7 +162,7 @@ ALLOWED_PARAMS = {
     r"(anthropic/)?claude-(sonnet|opus)-4.*": ["no-thinking", "thinking"],
     # Gemini models
     r"(gemini/)?gemini-2.5-flash-lite.*": ["no-thinking", "thinking"],
-    r"(gemini/)?gemini-2.5-flash-[0-9].*": ["no-thinking", "thinking"],
+    r"(gemini/)?gemini-2.5-flash.*": ["no-thinking", "thinking"],
     # xAI models
     r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "medium", "high"],
 }
@@ -181,6 +188,8 @@ class LiteLLMModel(BenchmarkModule):
         model_config: ModelConfig,
         dataset_config: DatasetConfig,
         benchmark_config: BenchmarkConfig,
+        log_metadata: bool = True,
+        **generation_kwargs: dict[str, t.Any],
     ) -> None:
         """Initialise the model.
 
@@ -191,6 +200,11 @@ class LiteLLMModel(BenchmarkModule):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log the model metadata.
+            generation_kwargs:
+                The generation kwargs to pass to the model. If None, default values will
+                be used.
         """
         # Detect whether the model is an Ollama model, as we need to extract metadata
         # differently for these models
@@ -209,13 +223,16 @@ class LiteLLMModel(BenchmarkModule):
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
         )
 
+        self.generation_kwargs = generation_kwargs
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
             dataset_config=self.dataset_config,
             model_config=self.model_config,
-            tokenizer=None,
+            tokeniser=None,
             generative_type=self.generative_type,
+            log_metadata=self.log_metadata,
         )
 
     @property
@@ -245,11 +262,12 @@ class LiteLLMModel(BenchmarkModule):
         else:
             type_ = GenerativeType.INSTRUCTION_TUNED
 
-        log_once(
-            f"Detected generative type {type_.name!r} for model "
-            f"{self.model_config.model_id!r}",
-            level=logging.DEBUG,
-        )
+        if self.log_metadata:
+            log_once(
+                f"Detected generative type {type_.name!r} for model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
         return type_
 
     def generate(self, inputs: dict) -> GenerativeModelOutput:
@@ -270,32 +288,11 @@ class LiteLLMModel(BenchmarkModule):
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
             dataset_config=self.dataset_config,
             model_config=self.model_config,
-            tokenizer=None,
+            tokeniser=None,
             generative_type=self.generative_type,
+            log_metadata=self.log_metadata,
         )
 
-        # Sanity check that "JSON" is included in the prompt, as some models require
-        # this
-        if self.dataset_config.task in TASKS_USING_JSON:
-            for conversation in conversations:
-                if not conversation:
-                    raise InvalidBenchmark(
-                        "Encountered an empty conversation in 'messages'."
-                    )
-                last_message = conversation[-1]
-                assert isinstance(last_message, dict), (
-                    f"Expected dict message, got {type(last_message)}"
-                )
-                assert "content" in last_message, (
-                    "Expected 'content' key in the last message of the conversation."
-                )
-                assert isinstance(last_message["content"], str), (
-                    "Expected 'content' to be a string."
-                )
-                assert "json" in last_message["content"].lower(), (
-                    "Prompt must contain 'json' for JSON tasks."
-                )
-
         all_responses: dict[int, "ModelResponse"] = {}
         conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
             enumerate(conversations)
@@ -304,12 +301,16 @@ class LiteLLMModel(BenchmarkModule):
             if not conversations_to_run:
                 break
 
+            generation_kwargs = self.generation_kwargs or self.get_generation_kwargs(
+                dataset_config=self.dataset_config
+            )
+
             batch_indices, batch_conversations = zip(*conversations_to_run)
             successes, failures = safe_run(
                 self._generate_async(
                     model_id=self.model_config.model_id,
                     conversations=list(batch_conversations),
-                    **self.get_generation_kwargs(dataset_config=self.dataset_config),
+                    **generation_kwargs,
                 )
             )
 
@@ -336,11 +337,8 @@ class LiteLLMModel(BenchmarkModule):
             # Attempt to handle the exceptions, to improve the chance of getting
             # successful generations next time around
            for _, error in failures:
-                self._handle_exception(
-                    error=error,
-                    generation_kwargs=self.get_generation_kwargs(
-                        dataset_config=self.dataset_config
-                    ),
+                generation_kwargs = self._handle_exception(
+                    error=error, **generation_kwargs
                )
 
             # Sleep for a second to avoid pinging the API server too quickly
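The two hunks above rework the retry loop: `_handle_exception` now returns the adjusted generation kwargs instead of mutating a throwaway dict, and only the failed conversations are retried with whatever came back. For illustration, a minimal self-contained sketch of that pattern, assuming hypothetical `call_api` and `adjust_kwargs` helpers that are not part of EuroEval:

    import time
    import typing as t


    def call_api(conversation: list[dict[str, str]], **kwargs: t.Any) -> str:
        """Stand-in for the real completion call (router.acompletion in the diff)."""
        return "dummy response"


    def adjust_kwargs(error: Exception, **kwargs: t.Any) -> dict[str, t.Any]:
        """Drop parameters the provider rejected, loosely mirroring _handle_exception."""
        msg = str(error).lower()
        if "logprobs" in msg:
            kwargs.pop("logprobs", None)
            kwargs.pop("top_logprobs", None)
        elif "temperature" in msg:
            kwargs.pop("temperature", None)
        return kwargs


    def generate_with_retries(
        conversations: list[list[dict[str, str]]], **kwargs: t.Any
    ) -> dict[int, str]:
        responses: dict[int, str] = {}
        pending = list(enumerate(conversations))
        for _ in range(5):  # bounded number of retry rounds
            if not pending:
                break
            failures: list[tuple[int, Exception]] = []
            for idx, conversation in pending:
                try:
                    responses[idx] = call_api(conversation, **kwargs)
                except Exception as error:  # noqa: BLE001
                    failures.append((idx, error))
            # Each failure updates the kwargs; only the failed items are retried
            for _, error in failures:
                kwargs = adjust_kwargs(error, **kwargs)
            pending = failures
            if pending:
                time.sleep(1)  # avoid pinging the API too quickly
        return responses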
@@ -364,9 +362,7 @@ class LiteLLMModel(BenchmarkModule):
 
         return model_output
 
-    def _handle_exception(
-        self, error: Exception, generation_kwargs: dict[str, t.Any]
-    ) -> None:
+    def _handle_exception(self, error: Exception, **generation_kwargs) -> dict:
         """Handle an exception from the model.
 
         Args:
@@ -374,6 +370,9 @@ class LiteLLMModel(BenchmarkModule):
                 The exception to handle.
             generation_kwargs:
                 The generation kwargs to pass to the model.
+
+        Returns:
+            The updated generation kwargs to pass to the model.
         """
         error_msg = str(error).lower()
         model_id = self.model_config.model_id
@@ -386,6 +385,9 @@ class LiteLLMModel(BenchmarkModule):
             "logprobs is not supported",
             "logprobs is not enabled",
         ]
+        logprobs_pattern = re.compile(
+            r"does not support parameters: \[.*'top_logprobs'.*\]"
+        )
         temperature_messages = [
             "'temperature' is not supported with this model.",
             "temperature is not supported with this model",
@@ -403,6 +405,7 @@ class LiteLLMModel(BenchmarkModule):
             r"[0-9]+ and ([0-9]+)\."
         )
         requires_thinking_disabled_messages = ["thinking.type: Field required"]
+        seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
 
         if any(msg.lower() in error_msg for msg in stop_messages):
             log_once(
@@ -411,9 +414,10 @@
                 level=logging.DEBUG,
             )
             generation_kwargs["stop"] = None
-            return
+            return generation_kwargs
         elif (
             any(msg.lower() in error_msg for msg in logprobs_messages)
+            or logprobs_pattern.search(string=error_msg)
             # Special case for Vertex AI models, since they have strict rate
             # limits on using logprobs. They also have a cap of 5 logprobs, but
             # we ignore this since the rate limiting makes it unusable anyway.
@@ -425,7 +429,7 @@
             )
             generation_kwargs.pop("logprobs", None)
             generation_kwargs.pop("top_logprobs", None)
-            return
+            return generation_kwargs
         elif any(msg.lower() in error_msg for msg in temperature_messages):
             log_once(
                 f"The model {model_id!r} does not support "
@@ -433,7 +437,7 @@
                 level=logging.DEBUG,
             )
             generation_kwargs.pop("temperature", None)
-            return
+            return generation_kwargs
         elif any(msg.lower() in error_msg for msg in temperature_must_be_one_messages):
             log_once(
                 f"The model {model_id!r} requires "
@@ -441,8 +445,11 @@
                 level=logging.DEBUG,
             )
             generation_kwargs["temperature"] = 1.0
-            return
-        elif any(msg.lower() in error_msg for msg in max_items_messages):
+            return generation_kwargs
+        elif (
+            any(msg.lower() in error_msg for msg in max_items_messages)
+            and self.dataset_config.task == NER
+        ):
             log_once(
                 f"The model {model_id!r} does not support "
                 "maxItems in the JSON schema, so disabling it.",
@@ -454,7 +461,7 @@
             }
             pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
             generation_kwargs["response_format"] = pydantic_class
-            return
+            return generation_kwargs
         elif any(msg.lower() in error_msg for msg in no_json_schema_messages):
             log_once(
                 f"The model {self.model_config.model_id!r} does not support "
@@ -462,7 +469,7 @@
                 level=logging.DEBUG,
             )
             generation_kwargs["response_format"] = dict(type="json_object")
-            return
+            return generation_kwargs
         elif thinking_match := thinking_budget_pattern.search(string=error_msg):
             thinking_budget = int(thinking_match.group(1))
             if thinking_budget >= REASONING_MAX_TOKENS:
@@ -471,7 +478,7 @@
                     f"{thinking_budget:,} tokens, which is within the limit of "
                    f"{REASONING_MAX_TOKENS:,} tokens. This should not happen. The "
                    f"error message was: {error_msg}."
-                )
+                ) from error
             log_once(
                 f"The model {model_id!r} can at most use {thinking_budget:,} tokens "
                 "for reasoning, which is less than the default of "
@@ -482,7 +489,7 @@
             generation_kwargs["thinking"] = dict(
                 type="enabled", budget_tokens=thinking_budget - 1
             )
-            return
+            return generation_kwargs
         elif (
             any(msg.lower() in error_msg for msg in requires_thinking_disabled_messages)
             and self.generative_type != GenerativeType.REASONING
@@ -494,59 +501,73 @@ class LiteLLMModel(BenchmarkModule):
                 level=logging.DEBUG,
             )
             generation_kwargs["thinking"] = dict(type="disabled")
-            return
+            return generation_kwargs
+        elif re.search(pattern=seed_pattern, string=error_msg):
+            log_once(
+                f"The model {model_id!r} does not support the `seed` parameter, so "
+                "disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs.pop("seed", None)
+            return generation_kwargs
+        # If there are too many I/O connections, we increase the number of allowed file
+        # descriptors
+        elif "too many open files" in error_msg:
+            raise InvalidBenchmark(
+                "There are too many file descriptors running. See the current "
+                "value by running `ulimit -n`. Try increasing it by running "
+                "`ulimit -n <new-value>` and try again."
+            ) from error
         elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
             logger.debug(
                 f"Service temporarily unavailable. The error message was: {error}. "
-                f"Retrying in 5 seconds..."
+                "Retrying in 10 seconds..."
             )
-            sleep(5)
-            return
+            sleep(10)
+            return generation_kwargs
         elif isinstance(error, UnsupportedParamsError):
             unsupported_param_match = re.search(
                 pattern=r"(?<=does not support parameters\: \[')([^ ']+)(?='\])",
                 string=error.message,
             )
             if unsupported_param_match is None:
-                raise InvalidModel(error.message)
+                raise InvalidModel(error.message) from error
             else:
                 unsupported_param = unsupported_param_match.group(0)
                 raise InvalidModel(
                     f"The model {model_id!r} does not support the parameter "
                     f"{unsupported_param!r}. Try again without this parameter. "
                     "Skipping this model."
-                )
+                ) from error
         elif isinstance(error, (APIConnectionError, OSError)):
-            # If there are too many I/O connections, we increase the number of allowed
-            # file descriptors
-            if "too many open files" in error_msg:
-                raise InvalidBenchmark(
-                    "There are too many file descriptors running. See the current "
-                    "value by running `ulimit -n`. Try increasing it by running "
-                    "`ulimit -n <new-value>` and try again."
-                )
             raise InvalidBenchmark(
                 f"Encountered {type(error)} during generation: {error}."
-            )
+            ) from error
+
+        if isinstance(error, NotFoundError):
+            raise InvalidModel(
+                f"The model {model_id!r} was not found. Please check the model ID "
+                "and try again."
+            ) from error
 
         if isinstance(error, RateLimitError):
             raise InvalidModel(
                 f"You have encountered your rate limit for model {model_id!r}. "
                 "Skipping."
-            )
+            ) from error
 
         if isinstance(error, AuthenticationError):
             raise NeedsAdditionalArgument(
                 cli_argument="--api-key",
                 script_argument="api_key=<your-api-key>",
                 run_with_cli=self.benchmark_config.run_with_cli,
-            )
+            ) from error
 
         raise InvalidBenchmark(
             f"Failed to generate text. The error message was: {error}"
-        )
+        ) from error
 
     async def _generate_async(
         self,
@@ -573,9 +594,9 @@ class LiteLLMModel(BenchmarkModule):
         # for all the requests, preventing "too many open files" errors
         router = Router(
             model_list=[
-                dict(
+                litellm.DeploymentTypedDict(
                     model_name=self.model_config.model_id,
-                    litellm_params=generation_kwargs,
+                    litellm_params=litellm.LiteLLMParamsTypedDict(model=model_id),
                 )
             ]
         )
@@ -585,7 +606,9 @@
         semaphore = asyncio.Semaphore(max_concurrent_calls)
         requests = [
             add_semaphore_and_catch_exception(
-                router.acompletion(model=model_id, messages=conversation),
+                router.acompletion(
+                    model=model_id, messages=conversation, **generation_kwargs
+                ),
                 semaphore=semaphore,
             )
             for conversation in conversations
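In the hunks above every `acompletion` call is wrapped in a shared `asyncio.Semaphore` via `add_semaphore_and_catch_exception`, so only a bounded number of requests are in flight and exceptions come back as values rather than aborting the gather. A rough sketch of that concurrency pattern with a stand-in coroutine (the names below are illustrative, not EuroEval's API):

    import asyncio
    import typing as t


    async def with_semaphore(
        coro: t.Awaitable[str], semaphore: asyncio.Semaphore
    ) -> str | Exception:
        """Run a coroutine under the shared concurrency limit, returning errors as values."""
        async with semaphore:
            try:
                return await coro
            except Exception as error:  # noqa: BLE001
                return error


    async def fake_completion(message: str) -> str:
        """Stand-in for the router.acompletion(...) call above."""
        await asyncio.sleep(0.01)
        return f"response to {message!r}"


    async def main() -> None:
        semaphore = asyncio.Semaphore(10)  # at most 10 requests in flight at a time
        requests = [
            with_semaphore(fake_completion(msg), semaphore)
            for msg in ("first prompt", "second prompt", "third prompt")
        ]
        responses = await asyncio.gather(*requests)
        successes = [r for r in responses if not isinstance(r, Exception)]
        failures = [r for r in responses if isinstance(r, Exception)]
        print(f"{len(successes)} succeeded, {len(failures)} failed")


    asyncio.run(main())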
@@ -645,6 +668,23 @@
             generation_output = generated_message.content or ""
             generation_output = generation_output.strip()
 
+            # In the case where we're dealing with a classification task, the model is
+            # outputting a JSON dictionary, so we will extract the generated text from
+            # within the dictionary
+            generation_dct: dict[str, t.Any] | None = None
+            if LITELLM_CLASSIFICATION_OUTPUT_KEY in generation_output:
+                try:
+                    generation_dct = json.loads(generation_output)
+                    assert isinstance(generation_dct, dict)
+                    if set(generation_dct.keys()) == {
+                        LITELLM_CLASSIFICATION_OUTPUT_KEY
+                    }:
+                        generation_output = str(
+                            generation_dct[LITELLM_CLASSIFICATION_OUTPUT_KEY]
+                        ).strip()
+                except json.JSONDecodeError:
+                    pass
+
             # Structure the model output as a GenerativeModelOutput object
             sequences.append(generation_output)
             if hasattr(model_response_choices, "logprobs"):
@@ -657,6 +697,23 @@
                     ]
                     for content in model_response_choices.logprobs.content or list()
                 ]
+
+                # If the model outputted a JSON dictionary, we need to find the
+                # token index of the value within the dictionary, rather than the
+                # first token of the entire output
+                if generation_dct:
+                    key_name = next(iter(generation_dct.keys()))
+                    logprobs_list = [
+                        lst
+                        for lst in logprobs_list
+                        if (
+                            lst
+                            and lst[0]
+                            and (token := lst[0][0].strip(JSON_STRIP_CHARACTERS))
+                            and not key_name.startswith(token)
+                        )
+                    ]
+
                 scores.append(logprobs_list)
             else:
                 log_once(
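With the new classification `response_format`, the model returns a single-key JSON dictionary; the two hunks above unwrap the label from that dictionary and drop the logprob entries that merely spell out the JSON key and punctuation, so the first remaining entry scores the label itself. A standalone sketch of the same unwrapping, assuming the output key is "label" (the real value comes from `LITELLM_CLASSIFICATION_OUTPUT_KEY` in `euroeval.constants`, and `JSON_STRIP_CHARACTERS` is only approximated here):

    import json

    OUTPUT_KEY = "label"  # assumed stand-in for LITELLM_CLASSIFICATION_OUTPUT_KEY
    STRIP_CHARACTERS = ' {}"\':,\n'  # assumed stand-in for JSON_STRIP_CHARACTERS

    raw_output = '{"label": "positive"}'

    generation_dct = None
    generation_output = raw_output.strip()
    if OUTPUT_KEY in generation_output:
        try:
            candidate = json.loads(generation_output)
            if isinstance(candidate, dict) and set(candidate) == {OUTPUT_KEY}:
                generation_dct = candidate
                generation_output = str(candidate[OUTPUT_KEY]).strip()
        except json.JSONDecodeError:
            pass

    # Logprobs arrive per generated token; drop the entries that only spell out the
    # JSON key and punctuation, so the first remaining entry scores the label itself.
    logprobs_list = [
        [('{"', -0.1)],
        [("label", -0.2)],
        [('":"', -0.3)],
        [("positive", -0.4)],
    ]
    if generation_dct:
        key_name = next(iter(generation_dct))
        logprobs_list = [
            lst
            for lst in logprobs_list
            if (
                lst
                and lst[0]
                and (token := lst[0][0].strip(STRIP_CHARACTERS))
                and not key_name.startswith(token)
            )
        ]

    print(generation_output)  # positive
    print(logprobs_list)      # [[('positive', -0.4)]]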
@@ -730,9 +787,7 @@
             repo_info = hf_api.model_info(
                 repo_id=model_id,
                 revision="main",
-                token=os.getenv("HUGGINGFACE_API_KEY")
-                or self.benchmark_config.api_key
-                or True,
+                token=get_hf_token(api_key=self.benchmark_config.api_key),
             )
         except (
             RepositoryNotFoundError,
@@ -789,7 +844,7 @@
                 run_with_cli=self.benchmark_config.run_with_cli,
             )
 
-        tokenizer = load_tokenizer(
+        tokeniser = load_tokeniser(
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -801,10 +856,10 @@
         ):
             vocab_size = hf_config.vocab_size
         elif (
-            hasattr(tokenizer, "vocab_size")
-            and tokenizer.vocab_size is not None
+            hasattr(tokeniser, "vocab_size")
+            and tokeniser.vocab_size is not None
         ):
-            vocab_size = tokenizer.vocab_size
+            vocab_size = tokeniser.vocab_size
         else:
             vocab_size = -1
         return vocab_size
@@ -835,13 +890,15 @@
             if context_length_keys:
                 context_length = model_info[context_length_keys[0]]
                 if context_length is not None:
-                    log_once(
-                        f"Detected context length key {context_length_keys[0]!r} "
-                        f"for Ollama model {ollama_model_id!r}",
-                        level=logging.DEBUG,
-                    )
+                    if self.log_metadata:
+                        log_once(
+                            f"Detected context length key "
+                            f"{context_length_keys[0]!r} for Ollama model "
+                            f"{ollama_model_id!r}",
+                            level=logging.DEBUG,
+                        )
                     return int(context_length)
-            else:
+            elif self.log_metadata:
                 log_once(
                     f"Tried to get the maximum length of the Ollama model "
                     f"{ollama_model_id!r}, but could not find a context length. "
@@ -869,7 +926,7 @@
                 run_with_cli=self.benchmark_config.run_with_cli,
             )
 
-        tokenizer = load_tokenizer(
+        tokeniser = load_tokeniser(
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -877,18 +934,18 @@
 
         all_max_lengths: list[int] = list()
 
-        # Add the registered max length of the tokenizer
+        # Add the registered max length of the tokeniser
         if hasattr(
-            tokenizer, "model_max_length"
-        ) and tokenizer.model_max_length < int(1e30):
-            all_max_lengths.append(tokenizer.model_max_length)
+            tokeniser, "model_max_length"
+        ) and tokeniser.model_max_length < int(1e30):
+            all_max_lengths.append(tokeniser.model_max_length)
 
         # Add the max length derived from the model's input sizes
-        if hasattr(tokenizer, "max_model_input_sizes"):
+        if hasattr(tokeniser, "max_model_input_sizes"):
             all_max_lengths.extend(
                 [
                     size
-                    for size in tokenizer.max_model_input_sizes.values()
+                    for size in tokeniser.max_model_input_sizes.values()
                     if size is not None
                 ]
             )
@@ -1026,7 +1083,7 @@
                     f"Service temporarily unavailable. The error message was: {e}. "
                     "Retrying in 10 seconds..."
                 )
-                sleep(5)
+                sleep(10)
             except APIError as e:
                 if "'503 Service Unavailable" not in str(e):
                     raise e
@@ -1136,7 +1193,10 @@
 
         if self.benchmark_config.few_shot:
             few_shot_examples = extract_few_shot_examples(
-                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
+                dataset=dataset,
+                dataset_config=self.dataset_config,
+                benchmark_config=self.benchmark_config,
+                itr_idx=itr_idx,
             )
         else:
             few_shot_examples = list()
@@ -1149,7 +1209,7 @@
                 dataset_config=self.dataset_config,
                 instruction_model=True,
                 always_populate_text_field=False,
-                tokenizer=None,
+                tokeniser=None,
             ),
             batched=True,
             load_from_cache_file=False,
@@ -1174,7 +1234,6 @@
         """
         # Set the core generation arguments
         generation_kwargs: dict[str, t.Any] = dict(
-            model=self.model_config.model_id,
             max_completion_tokens=(
                 REASONING_MAX_TOKENS
                 if self.generative_type == GenerativeType.REASONING
@@ -1191,7 +1250,7 @@
 
         # Set up the `response_format` generation argument if we are dealing with a task
         # using structured generation
-        if dataset_config.task in TASKS_USING_JSON:
+        if dataset_config.task.uses_structured_output:
             if self.generative_type == GenerativeType.REASONING:
                 log_once(
                     f"The model {self.model_config.model_id!r} is a reasoning model "
@@ -1200,12 +1259,21 @@
                     level=logging.DEBUG,
                 )
             elif supports_response_schema(model=self.model_config.model_id):
-                ner_tag_names = list(dataset_config.prompt_label_mapping.values())
-                keys_and_their_types: dict[str, t.Any] = {
-                    tag_name: (conlist(str, max_length=5), ...)
-                    for tag_name in ner_tag_names
-                }
-                pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+                if dataset_config.task == NER:
+                    ner_tag_names = list(dataset_config.prompt_label_mapping.values())
+                    keys_and_their_types: dict[str, t.Any] = {
+                        tag_name: (conlist(str, max_length=5), ...)
+                        for tag_name in ner_tag_names
+                    }
+                    pydantic_class = create_model(
+                        "AnswerFormat", **keys_and_their_types
+                    )
+                else:
+                    raise InvalidBenchmark(
+                        "This task requires structured generation, but it has not "
+                        "been implemented for this task yet. Please open an issue "
+                        "at https://github.com/EuroEval/EuroEval/issues."
+                    )
                 generation_kwargs["response_format"] = pydantic_class
                 log_once(
                     "Enabling structured generation for model "
@@ -1221,6 +1289,16 @@
                     "the model does not support schemas.",
                     level=logging.DEBUG,
                 )
+        elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
+            localised_labels = [
+                self.dataset_config.prompt_label_mapping[label]
+                for label in self.dataset_config.labels
+            ]
+            keys_and_their_types = {
+                LITELLM_CLASSIFICATION_OUTPUT_KEY: (t.Literal[*localised_labels], ...)
+            }
+            pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+            generation_kwargs["response_format"] = pydantic_class
 
         # If the model is an Ollama reasoning model, we ensure that thinking is enabled
         if self.is_ollama and self.generative_type == GenerativeType.REASONING:
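The `response_format` schemas above are built dynamically with pydantic: a `conlist`-per-tag model for NER, and a single `Literal`-typed key over the localised labels for classification tasks that use logprobs. A small sketch of both constructions, where the "label" key name stands in for `LITELLM_CLASSIFICATION_OUTPUT_KEY` and the tag and label values are made up for illustration:

    import typing as t

    from pydantic import conlist, create_model

    # NER: one key per localised entity tag, each holding at most five strings.
    ner_tag_names = ["person", "location", "organisation", "miscellaneous"]
    ner_format = create_model(
        "NERAnswerFormat",
        **{tag: (conlist(str, max_length=5), ...) for tag in ner_tag_names},
    )

    # Classification: a single key whose value must be one of the localised labels.
    # The Literal unpacking matches the diff and needs Python 3.11+.
    localised_labels = ["positive", "neutral", "negative"]
    classification_format = create_model(
        "ClassificationAnswerFormat", label=(t.Literal[*localised_labels], ...)
    )

    print(ner_format.model_json_schema())
    print(classification_format.model_json_schema())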
@@ -1234,7 +1312,7 @@
         # Handle manually set parameters
         if self.buffer["first_label_token_mapping"]:
             generation_kwargs["logprobs"] = True
-            generation_kwargs["top_logprobs"] = MAX_LOGPROBS
+            generation_kwargs["top_logprobs"] = MAX_LITELLM_LOGPROBS
         if self.model_config.revision == "thinking":
             generation_kwargs["thinking"] = dict(
                 type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
@@ -1260,7 +1338,7 @@
         # First attempt is a test run with a single conversation to handle errors
         # quickly. We repeat this multiple times to deal with different types of
         # errors, and stop if we get a successful response.
-        test_conversation = [
+        test_conversation: list[litellm.AllMessageValues] = [
             litellm.ChatCompletionUserMessage(role="user", content="Test message")
         ]
         for _ in range(5):
@@ -1274,7 +1352,9 @@
             if not failures:
                 break
             for _, error in failures:
-                self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+                generation_kwargs = self._handle_exception(
+                    error=error, **generation_kwargs
+                )
 
         return generation_kwargs
 
@@ -1350,11 +1430,11 @@ def try_download_ollama_model(model_id: str) -> bool:
            for model_obj in ollama.list().models
            if model_obj.model is not None
        ]
-    except ConnectionError:
+    except ConnectionError as e:
        raise InvalidModel(
            "Ollama does not seem to be running, so we cannot evaluate the model "
            f"{model_id!r}. Please make sure that Ollama is running and try again."
-        )
+        ) from e
 
    ollama_model_id = "/".join(model_id.split("/")[1:])
    if ollama_model_id not in downloaded_ollama_models:
@@ -1384,12 +1464,12 @@ def try_download_ollama_model(model_id: str) -> bool:
                raise InvalidModel(
                    f"Failed to download Ollama model {ollama_model_id}. "
                    f"The error message was: {inner_e}"
-                )
+                ) from inner_e
            else:
                raise InvalidModel(
                    f"Failed to download Ollama model {ollama_model_id}. "
                    f"The error message was: {e}"
-                )
+                ) from e
 
        # Download the model
        with tqdm(