EuroEval 16.3.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +9 -2
- euroeval/benchmark_config_factory.py +51 -50
- euroeval/benchmark_modules/base.py +9 -21
- euroeval/benchmark_modules/fresh.py +2 -1
- euroeval/benchmark_modules/hf.py +101 -71
- euroeval/benchmark_modules/litellm.py +115 -53
- euroeval/benchmark_modules/vllm.py +107 -92
- euroeval/benchmarker.py +144 -121
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +86 -8
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +80 -29
- euroeval/data_models.py +338 -330
- euroeval/dataset_configs/__init__.py +12 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +55 -93
- euroeval/dataset_configs/dutch.py +48 -87
- euroeval/dataset_configs/english.py +45 -77
- euroeval/dataset_configs/estonian.py +42 -34
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -69
- euroeval/dataset_configs/french.py +39 -75
- euroeval/dataset_configs/german.py +45 -82
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -79
- euroeval/dataset_configs/latvian.py +28 -35
- euroeval/dataset_configs/lithuanian.py +28 -26
- euroeval/dataset_configs/norwegian.py +72 -115
- euroeval/dataset_configs/polish.py +33 -61
- euroeval/dataset_configs/portuguese.py +33 -66
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/spanish.py +42 -77
- euroeval/dataset_configs/swedish.py +52 -90
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +24 -17
- euroeval/generation.py +15 -14
- euroeval/generation_utils.py +8 -8
- euroeval/languages.py +395 -323
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +21 -6
- euroeval/metrics/llm_as_a_judge.py +6 -4
- euroeval/metrics/pipeline.py +17 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +17 -19
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +99 -42
- euroeval/prompt_templates/multiple_choice.py +102 -38
- euroeval/prompt_templates/named_entity_recognition.py +172 -51
- euroeval/prompt_templates/reading_comprehension.py +119 -42
- euroeval/prompt_templates/sentiment_classification.py +110 -40
- euroeval/prompt_templates/summarization.py +85 -40
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +11 -10
- euroeval/speed_benchmark.py +5 -6
- euroeval/task_group_utils/multiple_choice_classification.py +2 -4
- euroeval/task_group_utils/question_answering.py +24 -16
- euroeval/task_group_utils/sequence_classification.py +48 -35
- euroeval/task_group_utils/text_to_text.py +19 -9
- euroeval/task_group_utils/token_classification.py +21 -17
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +33 -22
- euroeval/types.py +10 -9
- euroeval/utils.py +35 -149
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +196 -39
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.3.0.dist-info/RECORD +0 -71
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
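
Two of the new modules in the listing above, euroeval/caching_utils.py and euroeval/logging_utils.py, are not shown in full here, but the euroeval/benchmark_modules/litellm.py diff below imports cache_arguments, log, log_once and get_pbar from them. The following is only a rough sketch, inferred from those call sites (log(msg, level=...), log_once(msg, level=...), get_pbar(desc=..., unit_scale=..., unit=...)), of how the logging helpers could plausibly be shaped; the actual implementations may differ.

# Hypothetical sketch of euroeval/logging_utils.py, inferred from the call sites
# in the diff below. The real module contents are not included in this diff.
import logging

from tqdm.auto import tqdm

logger = logging.getLogger("euroeval")
_seen_messages: set[str] = set()


def log(message: str, level: int = logging.INFO) -> None:
    """Log `message` on the shared 'euroeval' logger at the given level."""
    logger.log(level, message)


def log_once(message: str, level: int = logging.INFO) -> None:
    """Log `message` only the first time it is encountered."""
    if message not in _seen_messages:
        _seen_messages.add(message)
        logger.log(level, message)


def get_pbar(**tqdm_kwargs) -> tqdm:
    """Return a tqdm progress bar with shared default settings."""
    tqdm_kwargs.setdefault("leave", False)
    return tqdm(**tqdm_kwargs)

The point of such helpers is to route all output through the shared "euroeval" logger with per-call levels and to deduplicate repeated warnings, which is consistent with the diff below replacing the module-level logger with log(..., level=...) and log_once(...) calls.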
--- euroeval/benchmark_modules/litellm.py (16.3.0)
+++ euroeval/benchmark_modules/litellm.py (16.5.0)
@@ -6,7 +6,7 @@ import json
 import logging
 import re
 import typing as t
-from functools import
+from functools import cached_property, partial
 from time import sleep

 import litellm
@@ -36,8 +36,8 @@ from litellm.utils import supports_reasoning, supports_response_schema
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
-from tqdm.auto import tqdm

+from ..caching_utils import cache_arguments
 from ..constants import (
     JSON_STRIP_CHARACTERS,
     LITELLM_CLASSIFICATION_OUTPUT_KEY,
@@ -70,6 +70,7 @@ from ..generation_utils import (
     extract_few_shot_examples,
     raise_if_wrong_params,
 )
+from ..logging_utils import get_pbar, log, log_once
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -83,7 +84,6 @@ from ..utils import (
     add_semaphore_and_catch_exception,
     create_model_cache_dir,
     get_hf_token,
-    log_once,
     safe_run,
     split_model_id,
 )
@@ -95,8 +95,6 @@ if t.TYPE_CHECKING:
     from litellm.types.utils import ModelResponse
     from transformers.trainer import Trainer

-logger = logging.getLogger("euroeval")
-

 VOCAB_SIZE_MAPPING = {
     # OpenAI models
@@ -133,6 +131,7 @@ MODEL_MAX_LENGTH_MAPPING = {
     r"gpt-4.1.*": 1_047_576,
     # Anthropic models
     r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
+    r"(anthropic/)?claude-(opus|sonnet|haiku)-[1-9](-[1-9])?-[0-9]{8}": 200_000,
     # Gemini models
     r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
     r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
@@ -311,7 +310,7 @@ class LiteLLMModel(BenchmarkModule):
             InvalidBenchmark:
                 If the inputs do not contain either 'messages' or 'text' keys.
         """
-        model_inputs:
+        model_inputs: c.Sequence[c.Sequence[litellm.AllMessageValues] | str]
         if "messages" in inputs:
             model_inputs = inputs["messages"]
         elif "text" in inputs:
@@ -332,9 +331,9 @@ class LiteLLMModel(BenchmarkModule):
         )

         all_responses: dict[int, "ModelResponse"] = {}
-        inputs_to_run:
-
-        )
+        inputs_to_run: c.Sequence[
+            tuple[int, c.Sequence[litellm.AllMessageValues] | str]
+        ] = list(enumerate(model_inputs))
         for attempt in range(num_attempts := 10):
             if not inputs_to_run:
                 break
@@ -367,10 +366,11 @@ class LiteLLMModel(BenchmarkModule):
                 (batch_indices[idx], model_inputs[batch_indices[idx]])
                 for idx, _ in failures
             ]
-
+            log(
                 f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
                 f"{len(inputs_to_run):,} failed message(s). Here is the first error: "
-                f"{failures[0][1]}."
+                f"{failures[0][1]}.",
+                level=logging.DEBUG,
             )

             # Attempt to handle the exceptions, to improve the chance of getting
@@ -422,14 +422,19 @@ class LiteLLMModel(BenchmarkModule):
             "'stop' is not supported with this model",
             "'$.stop' is invalid",
         ]
+        stop_pattern = re.compile(r"does not support parameters: \[.*'stop'.*\]")
         logprobs_messages = [
             "you are not allowed to request logprobs",
             "you've reached the maximum number of requests with logprobs",
             "logprobs is not supported",
             "logprobs is not enabled",
+            "Invalid value at 'generation_config.response_logprobs' (TYPE_BOOL)",
         ]
-        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
         logprobs_pattern = re.compile(
+            r"does not support parameters: \[.*'logprobs'.*\]"
+        )
+        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
+        top_logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'top_logprobs'.*\]"
         )
         max_completion_tokens_pattern = re.compile(
@@ -438,6 +443,7 @@ class LiteLLMModel(BenchmarkModule):
         temperature_messages = [
             "'temperature' is not supported with this model.",
             "temperature is not supported with this model",
+            r"does not support parameters: \[.*'temperature'.*\]",
         ]
         temperature_must_be_one_messages = [
             "`temperature` may only be set to 1",
@@ -454,10 +460,14 @@ class LiteLLMModel(BenchmarkModule):
         requires_thinking_disabled_messages = ["thinking.type: Field required"]
         seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
         response_format_messages = [
-            "got an unexpected keyword argument 'response_format'"
+            "got an unexpected keyword argument 'response_format'",
+            "the model returned empty outputs",
         ]

-        if
+        if (
+            any(msg.lower() in error_msg for msg in stop_messages)
+            or stop_pattern.search(string=error_msg) is not None
+        ):
             log_once(
                 f"The model {model_id!r} does not support "
                 "stop sequences, so disabling them.",
@@ -467,7 +477,7 @@ class LiteLLMModel(BenchmarkModule):
             return generation_kwargs
         elif (
             any(msg.lower() in error_msg for msg in logprobs_messages)
-            or logprobs_pattern.search(string=error_msg)
+            or logprobs_pattern.search(string=error_msg) is not None
            # Special case for Vertex AI models, since they have strict rate
            # limits on using logprobs. They also have a cap of 5 logprobs, but
            # we ignore this since the rate limiting makes it unusable anyway.
@@ -477,10 +487,15 @@ class LiteLLMModel(BenchmarkModule):
                 f"The model {model_id!r} does not support logprobs, so disabling it.",
                 level=logging.DEBUG,
             )
+            self.buffer["first_label_token_mapping"] = False
             generation_kwargs.pop("logprobs", None)
             generation_kwargs.pop("top_logprobs", None)
+            generation_kwargs.pop("response_format", None)
             return generation_kwargs
-        elif
+        elif (
+            any(msg.lower() in error_msg for msg in top_logprobs_messages)
+            or top_logprobs_pattern.search(string=error_msg) is not None
+        ):
             log_once(
                 f"The model {model_id!r} does not support the `top_logprobs` argument, "
                 "so moving the value to `logprobs`.",
@@ -525,7 +540,7 @@ class LiteLLMModel(BenchmarkModule):
             )
             ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
             keys_and_their_types = {
-                tag_name: (
+                tag_name: (c.Sequence[str], ...) for tag_name in ner_tag_names
             }
             pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
             generation_kwargs["response_format"] = pydantic_class
@@ -597,9 +612,10 @@ class LiteLLMModel(BenchmarkModule):
         elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
-
+            log(
                 f"Service temporarily unavailable. The error message was: {error}. "
-                "Retrying in 10 seconds..."
+                "Retrying in 10 seconds...",
+                level=logging.DEBUG,
             )
             sleep(10)
             return generation_kwargs
@@ -629,10 +645,32 @@ class LiteLLMModel(BenchmarkModule):
             ) from error

         if isinstance(error, RateLimitError):
-
+            log(
                 f"You have encountered your rate limit for model {model_id!r}. "
-                "
-
+                "Retrying in 10 seconds...",
+                level=logging.DEBUG,
+            )
+            sleep(10)
+            return generation_kwargs
+
+        if (
+            isinstance(error, BadRequestError)
+            and (
+                retry_match := re.search(
+                    pattern=r"\bretry in ([0-9]+(.[0-9]+)?) ?(s|seconds)\b",
+                    string=error_msg,
+                )
+            )
+            is not None
+        ):
+            retry_seconds = float(retry_match.group(1))
+            log(
+                f"Bad request error encountered. Retrying in {retry_seconds:.1f} "
+                "seconds...",
+                level=logging.DEBUG,
+            )
+            sleep(retry_seconds)
+            return generation_kwargs

         if isinstance(error, AuthenticationError):
             raise NeedsAdditionalArgument(
@@ -648,9 +686,11 @@ class LiteLLMModel(BenchmarkModule):
     async def _generate_async(
         self,
         model_id: str,
-        inputs:
+        inputs: c.Sequence[c.Sequence[litellm.AllMessageValues] | str],
         **generation_kwargs,
-    ) -> tuple[
+    ) -> tuple[
+        c.Sequence[tuple[int, "ModelResponse"]], c.Sequence[tuple[int, Exception]]
+    ]:
         """Generate outputs from the model asynchronously.

         Args:
@@ -711,7 +751,19 @@ class LiteLLMModel(BenchmarkModule):
             for input_ in inputs
             if isinstance(input_, list)
         ]
-        responses = await tqdm_async.gather(
+        responses = await tqdm_async.gather(
+            *requests, colour="yellow", ascii="—▰", leave=False
+        )
+
+        # If the outputs are empty, convert them to exceptions
+        if all(
+            not isinstance(response, Exception)
+            and response.choices[0].message.content == "{}"
+            for response in responses
+        ):
+            responses = [ValueError("The model returned empty outputs.")] * len(
+                responses
+            )

         # Separate the successful responses from the failed ones
         successes = [
@@ -731,13 +783,15 @@ class LiteLLMModel(BenchmarkModule):
             try:
                 request.close()
             except RuntimeError as e:
-
+                log(
+                    f"RuntimeError during request.close(): {e}", level=logging.DEBUG
+                )

         return successes, failures

     @staticmethod
     def _create_model_output(
-        model_responses:
+        model_responses: c.Sequence["ModelResponse"], model_id: str
     ) -> GenerativeModelOutput:
         """Create a GenerativeModelOutput object from a list of ModelResponse objects.

@@ -756,10 +810,11 @@ class LiteLLMModel(BenchmarkModule):
         for model_response in model_responses:
             if not model_response.choices:
                 sequences.append("")
-
+                log(
                     f"The model {model_id!r} did not end up "
                     "generating any text. This is likely because the model ran "
-                    "out of tokens while reasoning. Returning an empty string."
+                    "out of tokens while reasoning. Returning an empty string.",
+                    level=logging.WARNING,
                 )
                 continue

@@ -810,7 +865,7 @@ class LiteLLMModel(BenchmarkModule):
                 )
                 continue

-            logprobs_list:
+            logprobs_list: c.Sequence[c.Sequence[tuple[str, float]]]
             if isinstance(logprobs_obj, ChoiceLogprobs):
                 logprobs_list = [
                     [
@@ -847,11 +902,12 @@ class LiteLLMModel(BenchmarkModule):
             scores.append(logprobs_list)

         if not sequences:
-
+            log(
                 "No sequences were generated by the model "
                 f"{model_id!r}. This may be due to the "
                 "model running out of tokens or an issue with the input data. "
-                "Returning an empty GenerativeModelOutput."
+                "Returning an empty GenerativeModelOutput.",
+                level=logging.WARNING,
             )
             return GenerativeModelOutput(sequences=[], scores=None)

@@ -1105,7 +1161,7 @@ class LiteLLMModel(BenchmarkModule):
         return -1

     @property
-    def data_collator(self) -> c.Callable[[
+    def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.

         Returns:
@@ -1130,6 +1186,7 @@ class LiteLLMModel(BenchmarkModule):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    model_config=self.model_config,
                     first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
@@ -1205,17 +1262,19 @@ class LiteLLMModel(BenchmarkModule):
                 ServiceUnavailableError,
                 InternalServerError,
             ) as e:
-
+                log(
                     f"Service temporarily unavailable. The error message was: {e}. "
-                    "Retrying in 10 seconds..."
+                    "Retrying in 10 seconds...",
+                    level=logging.DEBUG,
                 )
                 sleep(10)
             except APIError as e:
                 if "'503 Service Unavailable" not in str(e):
                     raise e
-
+                log(
                     f"Failed to check if model {model_id!r} exists. Retrying in 10 "
-                    "seconds..."
+                    "seconds...",
+                    level=logging.WARNING,
                 )
                 sleep(10)
             except (BadRequestError, NotFoundError):
@@ -1228,21 +1287,25 @@ class LiteLLMModel(BenchmarkModule):
                     case 0:
                         pass
                     case 1:
-
+                        log(
                             f"Could not find the model ID {model_id!r}. Did you mean "
-                            f"{candidate_models[0]!r}?"
+                            f"{candidate_models[0]!r}?",
+                            level=logging.WARNING,
                         )
                     case _:
                         candidate_models_str = "', '".join(candidate_models)
-
+                        log(
                             f"Could not find the model ID {model_id!r}. Did you mean "
-
+                            "any of the following model IDs: "
+                            f"'{candidate_models_str}'?",
+                            level=logging.WARNING,
                         )
                 return False
         else:
-
+            log(
                 f"Failed to check if model {model_id!r} exists after {num_attempts} "
-                "attempts. Assuming it does not exist."
+                "attempts. Assuming it does not exist.",
+                level=logging.ERROR,
             )
             return False

@@ -1275,7 +1338,8 @@ class LiteLLMModel(BenchmarkModule):
                 "that the revision is actually the parameter and set the revision "
                 "to 'main'. In the future, use the new '#' syntax to specify the "
                 f"parameter (in this case, this would be {proper_model_id!r}), as this "
-                "will be an error in future versions of EuroEval."
+                "will be an error in future versions of EuroEval.",
+                level=logging.WARNING,
             )
             model_id_components.param = model_id_components.revision
             model_id_components.revision = "main"
@@ -1363,7 +1427,7 @@ class LiteLLMModel(BenchmarkModule):

         return dataset

-    @
+    @cache_arguments()
     def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
         """Get the generation arguments for the model.

@@ -1483,7 +1547,7 @@ class LiteLLMModel(BenchmarkModule):
         # First attempt is a test run with a single conversation to handle errors
         # quickly. We repeat this multiple times to deal with different types of
         # errors, and stop if we get a successful response.
-        test_input:
+        test_input: c.Sequence[litellm.AllMessageValues] | str
        if self.generative_type == GenerativeType.BASE:
            test_input = "Test message"
        else:
@@ -1542,7 +1606,7 @@ def try_download_ollama_model(model_id: str) -> bool:
         )

     try:
-        downloaded_ollama_models:
+        downloaded_ollama_models: c.Sequence[str] = [
             model_obj.model
             for model_obj in ollama.list().models
             if model_obj.model is not None
@@ -1571,7 +1635,8 @@ def try_download_ollama_model(model_id: str) -> bool:
                 f"The model {model_id!r} cannot be found on Ollama, but the "
                 f"model {model_id_with_prefix} *was* found, so we would "
                 "recommend you cancelling this run and trying the evaluation "
-                "with that model ID instead."
+                "with that model ID instead.",
+                level=logging.WARNING,
             )
             return False
         except ollama.ResponseError as inner_e:
@@ -1589,11 +1654,8 @@ def try_download_ollama_model(model_id: str) -> bool:
         ) from e

     # Download the model
-    with
-        desc=f"Downloading {ollama_model_id}",
-        unit_scale=True,
-        unit="B",
-        leave=False,
+    with get_pbar(
+        desc=f"Downloading {ollama_model_id}", unit_scale=True, unit="B"
     ) as pbar:
         for status in response:
             if status.total is not None: