EuroEval 16.2.2-py3-none-any.whl → 16.4.0-py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (65)
  1. euroeval/__init__.py +7 -4
  2. euroeval/benchmark_config_factory.py +0 -4
  3. euroeval/benchmark_modules/base.py +3 -16
  4. euroeval/benchmark_modules/fresh.py +5 -2
  5. euroeval/benchmark_modules/hf.py +107 -66
  6. euroeval/benchmark_modules/litellm.py +103 -55
  7. euroeval/benchmark_modules/vllm.py +155 -82
  8. euroeval/benchmarker.py +184 -129
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +1 -1
  12. euroeval/constants.py +9 -0
  13. euroeval/data_loading.py +14 -11
  14. euroeval/data_models.py +12 -4
  15. euroeval/dataset_configs/__init__.py +3 -0
  16. euroeval/dataset_configs/czech.py +79 -0
  17. euroeval/dataset_configs/danish.py +10 -13
  18. euroeval/dataset_configs/dutch.py +0 -3
  19. euroeval/dataset_configs/english.py +0 -3
  20. euroeval/dataset_configs/estonian.py +11 -1
  21. euroeval/dataset_configs/finnish.py +0 -3
  22. euroeval/dataset_configs/french.py +0 -3
  23. euroeval/dataset_configs/german.py +0 -3
  24. euroeval/dataset_configs/italian.py +0 -3
  25. euroeval/dataset_configs/latvian.py +2 -4
  26. euroeval/dataset_configs/lithuanian.py +68 -0
  27. euroeval/dataset_configs/norwegian.py +0 -3
  28. euroeval/dataset_configs/polish.py +0 -3
  29. euroeval/dataset_configs/portuguese.py +0 -3
  30. euroeval/dataset_configs/slovak.py +60 -0
  31. euroeval/dataset_configs/spanish.py +0 -3
  32. euroeval/dataset_configs/swedish.py +10 -15
  33. euroeval/finetuning.py +21 -15
  34. euroeval/generation.py +10 -10
  35. euroeval/generation_utils.py +2 -3
  36. euroeval/logging_utils.py +250 -0
  37. euroeval/metrics/base.py +0 -3
  38. euroeval/metrics/huggingface.py +10 -6
  39. euroeval/metrics/llm_as_a_judge.py +5 -3
  40. euroeval/metrics/pipeline.py +22 -9
  41. euroeval/metrics/speed.py +0 -3
  42. euroeval/model_cache.py +11 -14
  43. euroeval/model_config.py +4 -5
  44. euroeval/model_loading.py +3 -0
  45. euroeval/prompt_templates/linguistic_acceptability.py +30 -3
  46. euroeval/prompt_templates/multiple_choice.py +34 -1
  47. euroeval/prompt_templates/named_entity_recognition.py +71 -11
  48. euroeval/prompt_templates/reading_comprehension.py +41 -3
  49. euroeval/prompt_templates/sentiment_classification.py +34 -1
  50. euroeval/prompt_templates/summarization.py +26 -6
  51. euroeval/scores.py +7 -7
  52. euroeval/speed_benchmark.py +3 -5
  53. euroeval/task_group_utils/multiple_choice_classification.py +0 -3
  54. euroeval/task_group_utils/question_answering.py +0 -3
  55. euroeval/task_group_utils/sequence_classification.py +43 -31
  56. euroeval/task_group_utils/text_to_text.py +17 -8
  57. euroeval/task_group_utils/token_classification.py +10 -9
  58. euroeval/tokenisation_utils.py +22 -20
  59. euroeval/utils.py +30 -147
  60. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA +182 -61
  61. euroeval-16.4.0.dist-info/RECORD +75 -0
  62. euroeval-16.2.2.dist-info/RECORD +0 -70
  63. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
  64. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
  65. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -6,7 +6,7 @@ import json
 import logging
 import re
 import typing as t
-from functools import cache, cached_property, partial
+from functools import cached_property, partial
 from time import sleep
 
 import litellm
@@ -36,8 +36,8 @@ from litellm.utils import supports_reasoning, supports_response_schema
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
-from tqdm.auto import tqdm
 
+from ..caching_utils import cache_arguments
 from ..constants import (
     JSON_STRIP_CHARACTERS,
     LITELLM_CLASSIFICATION_OUTPUT_KEY,
@@ -70,6 +70,7 @@ from ..generation_utils import (
     extract_few_shot_examples,
     raise_if_wrong_params,
 )
+from ..logging_utils import get_pbar, log, log_once
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
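
Note: the new euroeval/logging_utils.py module itself (+250 lines in the file list above) is not included in this section, so the following is only a minimal sketch of what wrappers with these call signatures could look like, inferred purely from the call sites in this diff (log(msg, level=...), log_once(msg, level=...), get_pbar(desc=..., ...)); the real implementation will differ.

# Hypothetical sketch only -- the real euroeval/logging_utils.py is not shown in this diff.
import logging

from tqdm.auto import tqdm

logger = logging.getLogger("euroeval")
_seen_messages: set[str] = set()


def log(message: str, level: int = logging.INFO) -> None:
    """Log a message at the given level on the shared 'euroeval' logger."""
    logger.log(level, message)


def log_once(message: str, level: int = logging.INFO) -> None:
    """Log a message only the first time it is seen, to avoid repeated warnings."""
    if message not in _seen_messages:
        _seen_messages.add(message)
        logger.log(level, message)


def get_pbar(**tqdm_kwargs) -> tqdm:
    """Return a tqdm progress bar with shared default styling (assumed defaults)."""
    tqdm_kwargs.setdefault("colour", "yellow")
    tqdm_kwargs.setdefault("leave", False)
    return tqdm(**tqdm_kwargs)
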
@@ -83,7 +84,6 @@ from ..utils import (
     add_semaphore_and_catch_exception,
     create_model_cache_dir,
     get_hf_token,
-    log_once,
     safe_run,
     split_model_id,
 )
@@ -95,8 +95,6 @@ if t.TYPE_CHECKING:
     from litellm.types.utils import ModelResponse
     from transformers.trainer import Trainer
 
-logger = logging.getLogger("euroeval")
-
 
 VOCAB_SIZE_MAPPING = {
     # OpenAI models
@@ -133,6 +131,7 @@ MODEL_MAX_LENGTH_MAPPING = {
     r"gpt-4.1.*": 1_047_576,
     # Anthropic models
     r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
+    r"(anthropic/)?claude-(opus|sonnet|haiku)-[1-9](-[1-9])?-[0-9]{8}": 200_000,
     # Gemini models
     r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
     r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
@@ -367,9 +366,11 @@ class LiteLLMModel(BenchmarkModule):
                 (batch_indices[idx], model_inputs[batch_indices[idx]])
                 for idx, _ in failures
             ]
-            logger.debug(
+            log(
                 f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
-                f"{len(inputs_to_run):,} failed message(s)"
+                f"{len(inputs_to_run):,} failed message(s). Here is the first error: "
+                f"{failures[0][1]}.",
+                level=logging.DEBUG,
             )
 
             # Attempt to handle the exceptions, to improve the chance of getting
@@ -421,14 +422,19 @@ class LiteLLMModel(BenchmarkModule):
             "'stop' is not supported with this model",
             "'$.stop' is invalid",
         ]
+        stop_pattern = re.compile(r"does not support parameters: \[.*'stop'.*\]")
         logprobs_messages = [
             "you are not allowed to request logprobs",
             "you've reached the maximum number of requests with logprobs",
             "logprobs is not supported",
             "logprobs is not enabled",
+            "Invalid value at 'generation_config.response_logprobs' (TYPE_BOOL)",
         ]
-        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
         logprobs_pattern = re.compile(
+            r"does not support parameters: \[.*'logprobs'.*\]"
+        )
+        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
+        top_logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'top_logprobs'.*\]"
         )
         max_completion_tokens_pattern = re.compile(
@@ -437,6 +443,7 @@ class LiteLLMModel(BenchmarkModule):
         temperature_messages = [
             "'temperature' is not supported with this model.",
             "temperature is not supported with this model",
+            r"does not support parameters: \[.*'temperature'.*\]",
         ]
         temperature_must_be_one_messages = [
             "`temperature` may only be set to 1",
@@ -454,10 +461,13 @@ class LiteLLMModel(BenchmarkModule):
         seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
         response_format_messages = [
             "got an unexpected keyword argument 'response_format'",
-            "The model outputs empty dictionaries.",
+            "the model returned empty outputs",
         ]
 
-        if any(msg.lower() in error_msg for msg in stop_messages):
+        if (
+            any(msg.lower() in error_msg for msg in stop_messages)
+            or stop_pattern.search(string=error_msg) is not None
+        ):
             log_once(
                 f"The model {model_id!r} does not support "
                 "stop sequences, so disabling them.",
@@ -467,7 +477,7 @@ class LiteLLMModel(BenchmarkModule):
             return generation_kwargs
         elif (
             any(msg.lower() in error_msg for msg in logprobs_messages)
-            or logprobs_pattern.search(string=error_msg)
+            or logprobs_pattern.search(string=error_msg) is not None
             # Special case for Vertex AI models, since they have strict rate
             # limits on using logprobs. They also have a cap of 5 logprobs, but
             # we ignore this since the rate limiting makes it unusable anyway.
@@ -477,10 +487,15 @@ class LiteLLMModel(BenchmarkModule):
                 f"The model {model_id!r} does not support logprobs, so disabling it.",
                 level=logging.DEBUG,
             )
+            self.buffer["first_label_token_mapping"] = False
             generation_kwargs.pop("logprobs", None)
             generation_kwargs.pop("top_logprobs", None)
+            generation_kwargs.pop("response_format", None)
             return generation_kwargs
-        elif any(msg.lower() in error_msg for msg in top_logprobs_messages):
+        elif (
+            any(msg.lower() in error_msg for msg in top_logprobs_messages)
+            or top_logprobs_pattern.search(string=error_msg) is not None
+        ):
             log_once(
                 f"The model {model_id!r} does not support the `top_logprobs` argument, "
                 "so moving the value to `logprobs`.",
@@ -597,9 +612,10 @@ class LiteLLMModel(BenchmarkModule):
         elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
-            logger.debug(
+            log(
                 f"Service temporarily unavailable. The error message was: {error}. "
-                "Retrying in 10 seconds..."
+                "Retrying in 10 seconds...",
+                level=logging.DEBUG,
             )
             sleep(10)
             return generation_kwargs
@@ -629,10 +645,32 @@ class LiteLLMModel(BenchmarkModule):
             ) from error
 
         if isinstance(error, RateLimitError):
-            raise InvalidModel(
+            log(
                 f"You have encountered your rate limit for model {model_id!r}. "
-                "Skipping."
-            ) from error
+                "Retrying in 10 seconds...",
+                level=logging.DEBUG,
+            )
+            sleep(10)
+            return generation_kwargs
+
+        if (
+            isinstance(error, BadRequestError)
+            and (
+                retry_match := re.search(
+                    pattern=r"\bretry in ([0-9]+(.[0-9]+)?) ?(s|seconds)\b",
+                    string=error_msg,
+                )
+            )
+            is not None
+        ):
+            retry_seconds = float(retry_match.group(1))
+            log(
+                f"Bad request error encountered. Retrying in {retry_seconds:.1f} "
+                "seconds...",
+                level=logging.DEBUG,
+            )
+            sleep(retry_seconds)
+            return generation_kwargs
 
         if isinstance(error, AuthenticationError):
             raise NeedsAdditionalArgument(
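
The new BadRequestError branch parses a suggested retry delay out of the provider's error text. A standalone demonstration of that regex on a made-up error message (the message is a sample, not an actual provider response):

# Demonstration of the retry-delay parsing added above; the error message is invented.
import re

error_msg = "rate limit exceeded for this request, please retry in 7.5 seconds"
retry_match = re.search(
    pattern=r"\bretry in ([0-9]+(.[0-9]+)?) ?(s|seconds)\b",
    string=error_msg,
)
if retry_match is not None:
    retry_seconds = float(retry_match.group(1))
    print(f"Sleeping for {retry_seconds:.1f} seconds before retrying")  # 7.5 seconds
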
@@ -711,19 +749,19 @@ class LiteLLMModel(BenchmarkModule):
             for input_ in inputs
             if isinstance(input_, list)
         ]
-        responses = await tqdm_async.gather(*requests, leave=False)
-
-        # If we are performing structured generation and the model just outputs an empty
-        # dictionary, then we convert those to exceptions, to disable structured
-        # generation
-        if "response_format" in generation_kwargs:
-            responses = [
-                RuntimeError("The model outputs empty dictionaries.")
-                if not isinstance(response, Exception)
-                and any(choice.message.content == "{}" for choice in response.choices)
-                else response
-                for response in responses
-            ]
+        responses = await tqdm_async.gather(
+            *requests, colour="yellow", ascii="—▰", leave=False
+        )
+
+        # If the outputs are empty, convert them to exceptions
+        if all(
+            not isinstance(response, Exception)
+            and response.choices[0].message.content == "{}"
+            for response in responses
+        ):
+            responses = [ValueError("The model returned empty outputs.")] * len(
+                responses
+            )
 
         # Separate the successful responses from the failed ones
         successes = [
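
The ValueError message chosen here matches the new "the model returned empty outputs" entry in response_format_messages earlier in the file, so the exception handler can recognise the failure on the retry pass; presumably that branch then drops response_format, although the branch itself falls outside this section. A simplified illustration of the string matching only (EuroEval's actual handler method and signature are not shown here):

# Simplified illustration of how the new error message matches the updated list.
error = ValueError("The model returned empty outputs.")
error_msg = str(error).lower()
response_format_messages = [
    "got an unexpected keyword argument 'response_format'",
    "the model returned empty outputs",
]
if any(msg.lower() in error_msg for msg in response_format_messages):
    print("Recognised as a structured-generation failure; retry without response_format")
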
@@ -743,7 +781,9 @@ class LiteLLMModel(BenchmarkModule):
             try:
                 request.close()
             except RuntimeError as e:
-                logger.debug(f"RuntimeError during request.close(): {e}")
+                log(
+                    f"RuntimeError during request.close(): {e}", level=logging.DEBUG
+                )
 
         return successes, failures
 
@@ -768,10 +808,11 @@ class LiteLLMModel(BenchmarkModule):
         for model_response in model_responses:
             if not model_response.choices:
                 sequences.append("")
-                logger.warning(
+                log(
                     f"The model {model_id!r} did not end up "
                     "generating any text. This is likely because the model ran "
-                    "out of tokens while reasoning. Returning an empty string."
+                    "out of tokens while reasoning. Returning an empty string.",
+                    level=logging.WARNING,
                 )
                 continue
 
@@ -859,11 +900,12 @@ class LiteLLMModel(BenchmarkModule):
             scores.append(logprobs_list)
 
         if not sequences:
-            logger.warning(
+            log(
                 "No sequences were generated by the model "
                 f"{model_id!r}. This may be due to the "
                 "model running out of tokens or an issue with the input data. "
-                "Returning an empty GenerativeModelOutput."
+                "Returning an empty GenerativeModelOutput.",
+                level=logging.WARNING,
             )
             return GenerativeModelOutput(sequences=[], scores=None)
 
@@ -984,7 +1026,7 @@ class LiteLLMModel(BenchmarkModule):
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
-            model_cache_dir=self.model_config.model_cache_dir,
+            model_config=self.model_config,
         )
 
         if (
@@ -1067,7 +1109,7 @@ class LiteLLMModel(BenchmarkModule):
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
-            model_cache_dir=self.model_config.model_cache_dir,
+            model_config=self.model_config,
         )
 
         all_max_lengths: list[int] = list()
@@ -1142,6 +1184,7 @@ class LiteLLMModel(BenchmarkModule):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    model_config=self.model_config,
                     first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
@@ -1217,17 +1260,19 @@ class LiteLLMModel(BenchmarkModule):
                 ServiceUnavailableError,
                 InternalServerError,
             ) as e:
-                logger.debug(
+                log(
                     f"Service temporarily unavailable. The error message was: {e}. "
-                    "Retrying in 10 seconds..."
+                    "Retrying in 10 seconds...",
+                    level=logging.DEBUG,
                 )
                 sleep(10)
             except APIError as e:
                 if "'503 Service Unavailable" not in str(e):
                     raise e
-                logger.warning(
+                log(
                     f"Failed to check if model {model_id!r} exists. Retrying in 10 "
-                    "seconds..."
+                    "seconds...",
+                    level=logging.WARNING,
                 )
                 sleep(10)
             except (BadRequestError, NotFoundError):
@@ -1240,21 +1285,25 @@ class LiteLLMModel(BenchmarkModule):
                     case 0:
                         pass
                     case 1:
-                        logger.warning(
+                        log(
                             f"Could not find the model ID {model_id!r}. Did you mean "
-                            f"{candidate_models[0]!r}?"
+                            f"{candidate_models[0]!r}?",
+                            level=logging.WARNING,
                         )
                     case _:
                         candidate_models_str = "', '".join(candidate_models)
-                        logger.warning(
+                        log(
                             f"Could not find the model ID {model_id!r}. Did you mean "
-                            f"any of the following model IDs: '{candidate_models_str}'?"
+                            "any of the following model IDs: "
+                            f"'{candidate_models_str}'?",
+                            level=logging.WARNING,
                         )
                 return False
         else:
-            logger.error(
+            log(
                 f"Failed to check if model {model_id!r} exists after {num_attempts} "
-                "attempts. Assuming it does not exist."
+                "attempts. Assuming it does not exist.",
+                level=logging.ERROR,
             )
             return False
 
@@ -1287,7 +1336,8 @@ class LiteLLMModel(BenchmarkModule):
             "that the revision is actually the parameter and set the revision "
             "to 'main'. In the future, use the new '#' syntax to specify the "
             f"parameter (in this case, this would be {proper_model_id!r}), as this "
-            "will be an error in future versions of EuroEval."
+            "will be an error in future versions of EuroEval.",
+            level=logging.WARNING,
         )
         model_id_components.param = model_id_components.revision
         model_id_components.revision = "main"
@@ -1375,7 +1425,7 @@ class LiteLLMModel(BenchmarkModule):
 
         return dataset
 
-    @cache
+    @cache_arguments()
    def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
         """Get the generation arguments for the model.
 
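
euroeval/caching_utils.py (+79 lines in the file list) is not shown in this section, so the decorator below is only a hypothetical sketch of how an argument-caching decorator of this shape could work, assuming it keys on the repr of the arguments instead of requiring them to be hashable the way functools.cache does; the real implementation will differ.

# Hypothetical sketch only -- euroeval/caching_utils.py (new in this release) is not
# shown in this section. Assumes the decorator keys on the repr of the arguments, so
# that unhashable arguments (e.g. config objects) can be cached, unlike functools.cache.
import functools
import typing as t


def cache_arguments() -> t.Callable:
    """Decorator factory caching a function's result per distinct argument repr."""

    def decorator(func: t.Callable) -> t.Callable:
        results: dict[str, t.Any] = {}

        @functools.wraps(func)
        def wrapper(*args: t.Any, **kwargs: t.Any) -> t.Any:
            key = repr((args, kwargs))
            if key not in results:
                results[key] = func(*args, **kwargs)
            return results[key]

        return wrapper

    return decorator
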
@@ -1583,7 +1633,8 @@ def try_download_ollama_model(model_id: str) -> bool:
                 f"The model {model_id!r} cannot be found on Ollama, but the "
                 f"model {model_id_with_prefix} *was* found, so we would "
                 "recommend you cancelling this run and trying the evaluation "
-                "with that model ID instead."
+                "with that model ID instead.",
+                level=logging.WARNING,
             )
             return False
         except ollama.ResponseError as inner_e:
@@ -1601,11 +1652,8 @@ def try_download_ollama_model(model_id: str) -> bool:
         ) from e
 
     # Download the model
-    with tqdm(
-        desc=f"Downloading {ollama_model_id}",
-        unit_scale=True,
-        unit="B",
-        leave=False,
+    with get_pbar(
+        desc=f"Downloading {ollama_model_id}", unit_scale=True, unit="B"
     ) as pbar:
         for status in response:
             if status.total is not None: