EuroEval 15.6.1__py3-none-any.whl → 15.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


euroeval/benchmark_modules/litellm.py CHANGED
@@ -33,6 +33,7 @@ from litellm.exceptions import (
  )
  from litellm.llms.vertex_ai.common_utils import VertexAIError
  from litellm.types.utils import ChoiceLogprobs, ModelResponse
+ from pydantic import conlist, create_model
  from requests.exceptions import RequestException
  from tqdm.auto import tqdm
  from transformers.trainer import Trainer
@@ -104,6 +105,7 @@ MODEL_MAX_LENGTH_MAPPING = {
  r"o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
  r"o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
  r"o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
+ r"gpt-4.1.*": 1_047_576,
  # Anthropic models
  r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
  # Gemini models
@@ -135,20 +137,23 @@ ALLOWED_PARAMS = {
  r"gpt-4.*": [],
  r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "high"],
  # Anthropic models
- r"(anthropic/)?claude-3-.*": [],
- r"(anthropic/)?claude-3.5-.*": [],
- r"(anthropic/)?claude-3.7-sonnet.*": ["thinking"],
+ r"(anthropic/)?claude-3-(haiku|sonnet|opus).*": [],
+ r"(anthropic/)?claude-3-5-.*": [],
+ r"(anthropic/)?claude-3-7-sonnet.*": ["thinking"],
  # Gemini models
  r"(gemini/)?gemini-.*": [],
  # xAI models
- r"(xai/)?grok.*": [],
+ r"(xai/)?grok-2.*": [],
+ r"(xai/)?grok-3(-fast)?(-beta)?": [],
+ r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "high"],
  }


  REASONING_MODELS = [
  r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?",
  r"(gemini/)?gemini.*thinking.*",
- r"(gemini/)?gemini-2.5-pro.*",
+ r"(gemini/)?gemini-2.5.*",
+ r"(xai/)?grok-3-mini.*",
  ]
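
The updated REASONING_MODELS patterns above feed the re.fullmatch check in the generative_type property further down in this file. A minimal sketch of that classification, using the pattern list copied from this diff and a few hypothetical model ids:

import re

# Pattern list copied from the REASONING_MODELS hunk above; the model ids used
# below are hypothetical examples, not taken from the package.
REASONING_MODELS = [
    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?",
    r"(gemini/)?gemini.*thinking.*",
    r"(gemini/)?gemini-2.5.*",
    r"(xai/)?grok-3-mini.*",
]

def is_reasoning_model(model_id: str) -> bool:
    # Same re.fullmatch call as in the generative_type property in the hunks below.
    return re.fullmatch(pattern="|".join(REASONING_MODELS), string=model_id) is not None

print(is_reasoning_model("gemini/gemini-2.5-flash"))  # True
print(is_reasoning_model("xai/grok-3-mini-beta"))     # True
print(is_reasoning_model("gpt-4.1-mini"))             # False
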

@@ -190,7 +195,10 @@ class LiteLLMModel(BenchmarkModule):
  )

  self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
- dataset_config=self.dataset_config, tokenizer=None
+ dataset_config=self.dataset_config,
+ model_config=self.model_config,
+ tokenizer=None,
+ generative_type=self.generative_type,
  )

  @property
@@ -201,13 +209,20 @@ class LiteLLMModel(BenchmarkModule):
  The generative type of the model, or None if it has not been set yet.
  """
  if self.model_config.revision == "thinking":
- return GenerativeType.REASONING
+ type_ = GenerativeType.REASONING
  elif re.fullmatch(
  pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
  ):
- return GenerativeType.REASONING
+ type_ = GenerativeType.REASONING
  else:
- return GenerativeType.INSTRUCTION_TUNED
+ type_ = GenerativeType.INSTRUCTION_TUNED
+
+ log_once(
+ f"Detected generative type {type_.name!r} for model "
+ f"{self.model_config.model_id!r}",
+ level=logging.DEBUG,
+ )
+ return type_

  def generate(self, inputs: dict) -> GenerativeModelOutput:
  """Generate outputs from the model.
@@ -243,7 +258,10 @@ class LiteLLMModel(BenchmarkModule):
  # Get the mapping from labels to the first token in the label. We call this each
  # time we generate a new dataset since the dataset config can change
  self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
- dataset_config=self.dataset_config, tokenizer=None
+ dataset_config=self.dataset_config,
+ model_config=self.model_config,
+ tokenizer=None,
+ generative_type=self.generative_type,
  )

  if self.buffer["first_label_token_mapping"]:
@@ -254,16 +272,41 @@ class LiteLLMModel(BenchmarkModule):
  assert "json" in messages[0]["content"].lower(), (
  "Prompt must contain 'json' for JSON tasks."
  )
- generation_kwargs["response_format"] = dict(type="json_object")
- log_once(
- "Enabling JSON response format for model "
- f"{self.model_config.model_id!r}",
- level=logging.DEBUG,
- )
+ if self.generative_type == GenerativeType.REASONING:
+ log_once(
+ f"The model {self.model_config.model_id!r} is a reasoning model "
+ "and thus does not support structured generation, so we do not "
+ "enable it.",
+ level=logging.DEBUG,
+ )
+ elif litellm.utils.supports_response_schema(
+ model=self.model_config.model_id
+ ):
+ ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+ keys_and_their_types: dict[str, t.Any] = {
+ tag_name: (conlist(str, max_length=5), ...)
+ for tag_name in ner_tag_names
+ }
+ pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+ generation_kwargs["response_format"] = pydantic_class
+ log_once(
+ "Enabling structured generation for model "
+ f"{self.model_config.model_id!r} with the JSON schema "
+ f"{pydantic_class.model_json_schema()}",
+ level=logging.DEBUG,
+ )
+ else:
+ generation_kwargs["response_format"] = dict(type="json_object")
+ log_once(
+ "Enabling structured JSON generation for model "
+ f"{self.model_config.model_id!r} with no custom JSON schema, as "
+ "the model does not support schemas.",
+ level=logging.DEBUG,
+ )

  if self.model_config.revision == "thinking":
  generation_kwargs["thinking"] = dict(
- type="enabled", budget_tokens=REASONING_MAX_TOKENS
+ type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
  )
  log_once(
  f"Enabling thinking mode for model {self.model_config.model_id!r}",
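
The structured-generation branch above builds a pydantic model on the fly and passes its JSON schema as the response format. A minimal sketch of that construction, assuming hypothetical NER tag names in place of dataset_config.prompt_label_mapping.values(); note that the max_length=5 constraint is what produces the "maxItems" schema keyword that the error handling further down has to strip for providers that reject it:

from pydantic import conlist, create_model

# Hypothetical tag names; in the package they come from the dataset's
# prompt_label_mapping.
ner_tag_names = ["person", "location", "organisation", "miscellaneous"]

# Each field is a list of strings with at most five items, mirroring the
# conlist(str, max_length=5) fields built in the hunk above.
keys_and_their_types = {
    tag_name: (conlist(str, max_length=5), ...) for tag_name in ner_tag_names
}
AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)

# The schema that ends up in `response_format`; the list-length constraint is
# rendered as the JSON Schema keyword "maxItems".
print(AnswerFormat.model_json_schema())
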
@@ -280,28 +323,42 @@ class LiteLLMModel(BenchmarkModule):
  # This drops generation kwargs that are not supported by the model
  litellm.drop_params = True

+ # Error messages that we want to catch and handle
+ stop_messages = ["stop_sequences", "'stop' is not supported with this model"]
+ logprobs_messages = [
+ "you are not allowed to request logprobs",
+ "you've reached the maximum number of requests with logprobs",
+ "logprobs is not supported",
+ "logprobs is not enabled",
+ ]
+ temperature_messages = [
+ "'temperature' is not supported with this model.",
+ "temperature is not supported with this model",
+ ]
+ temperature_must_be_one_messages = [
+ "`temperature` may only be set to 1",
+ "'temperature' does not support 0.0 with this model. Only the default "
+ "(1) value is supported",
+ ]
+ max_items_messages = ["'maxItems' is not permitted."]
+ no_json_schema_messages = ["Property keys should match pattern"]
+
  # Extract the generated sequences from the model response. Some APIs cannot
  # handle using newlines as stop sequences, so we try both.
  num_attempts = 10
  for _ in range(num_attempts):
- stop_messages = ["stop_sequences"]
- logprobs_messages = [
- "you are not allowed to request logprobs",
- "you've reached the maximum number of requests with logprobs",
- "logprobs is not supported",
- "logprobs is not enabled",
- ]
- temperature_messages = [
- "'temperature' is not supported with this model.",
- "temperature is not supported with this model",
- ]
  try:
- model_response = litellm.completion(
- messages=messages, max_retries=3, **generation_kwargs
+ model_response = litellm.completion_with_retries(
+ messages=messages, **generation_kwargs
  )
  break
  except (BadRequestError, RateLimitError) as e:
  if any(msg.lower() in str(e).lower() for msg in stop_messages):
+ log_once(
+ f"The model {self.model_config.model_id!r} does not support "
+ "stop sequences, so disabling them.",
+ level=logging.DEBUG,
+ )
  generation_kwargs["stop"] = None
  elif (
  any(msg.lower() in str(e).lower() for msg in logprobs_messages)
@@ -310,10 +367,55 @@ class LiteLLMModel(BenchmarkModule):
  # we ignore this since the rate limiting makes it unusable anyway.
  or (isinstance(e, VertexAIError) and "logprobs" in str(e).lower())
  ):
+ log_once(
+ f"The model {self.model_config.model_id!r} does not support "
+ "logprobs, so disabling it.",
+ level=logging.DEBUG,
+ )
  generation_kwargs.pop("logprobs")
  generation_kwargs.pop("top_logprobs")
  elif any(msg.lower() in str(e).lower() for msg in temperature_messages):
+ log_once(
+ f"The model {self.model_config.model_id!r} does not support "
+ "temperature, so disabling it.",
+ level=logging.DEBUG,
+ )
  generation_kwargs.pop("temperature")
+ elif any(
+ msg.lower() in str(e).lower()
+ for msg in temperature_must_be_one_messages
+ ):
+ log_once(
+ f"The model {self.model_config.model_id!r} requires "
+ "temperature to be set to 1, so setting it.",
+ level=logging.DEBUG,
+ )
+ generation_kwargs["temperature"] = 1.0
+ elif any(msg.lower() in str(e).lower() for msg in max_items_messages):
+ log_once(
+ f"The model {self.model_config.model_id!r} does not support "
+ "maxItems in the JSON schema, so disabling it.",
+ level=logging.DEBUG,
+ )
+ ner_tag_names = list(
+ self.dataset_config.prompt_label_mapping.values()
+ )
+ keys_and_their_types = {
+ tag_name: (list[str], ...) for tag_name in ner_tag_names
+ }
+ pydantic_class = create_model(
+ "AnswerFormat", **keys_and_their_types
+ )
+ generation_kwargs["response_format"] = pydantic_class
+ elif any(
+ msg.lower() in str(e).lower() for msg in no_json_schema_messages
+ ):
+ log_once(
+ f"The model {self.model_config.model_id!r} does not support "
+ "JSON schemas, so using the vanilla JSON format.",
+ level=logging.DEBUG,
+ )
+ generation_kwargs["response_format"] = dict(type="json_object")
  elif isinstance(e, RateLimitError):
  raise InvalidModel(
  "You have encountered your rate limit for model "
@@ -332,6 +434,7 @@ class LiteLLMModel(BenchmarkModule):
  Timeout,
  ServiceUnavailableError,
  InternalServerError,
+ SystemError,
  ) as e:
  logger.debug(
  f"Service temporarily unavailable. The error message was: {e}. "
@@ -359,9 +462,11 @@ class LiteLLMModel(BenchmarkModule):
  "reasoning. Returning an empty string."
  )
  return GenerativeModelOutput(sequences=[""])
+
  model_response_choices = model_response.choices[0]
  assert isinstance(model_response_choices, litellm.Choices)
- generation_output = model_response_choices.message["content"] or ""
+ generated_message: litellm.Message = model_response_choices.message
+ generation_output = generated_message.content or ""
  generation_output = generation_output.strip()

  # Structure the model output as a GenerativeModelOutput object

euroeval/benchmark_modules/vllm.py CHANGED
@@ -132,7 +132,7 @@ class VLLMModel(HuggingFaceEncoderModel):
  self._model: LLM = model
  self._tokenizer: PreTrainedTokenizer = tokenizer
  self.end_of_reasoning_token_id = get_end_of_reasoning_token_id(
- model=self._model, tokenizer=self._tokenizer
+ model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
  )

  # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
@@ -146,7 +146,10 @@ class VLLMModel(HuggingFaceEncoderModel):
  self.buffer |= dict(
  instruction_model=self._tokenizer.chat_template is not None,
  first_label_token_mapping=get_first_label_token_mapping(
- dataset_config=self.dataset_config, tokenizer=self._tokenizer
+ dataset_config=self.dataset_config,
+ model_config=self.model_config,
+ tokenizer=self._tokenizer,
+ generative_type=self.generative_type,
  ),
  )
  if self.model_config.adapter_base_model_id is not None:
@@ -332,30 +335,40 @@ class VLLMModel(HuggingFaceEncoderModel):
  if end_of_chat_token:
  stop_tokens.append(end_of_chat_token)

+ logits_processor = None
  if self.dataset_config.task in TASKS_USING_JSON:
- ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
- keys_and_their_types: dict[str, t.Any] = {
- tag_name: (conlist(str, max_length=5), ...)
- for tag_name in ner_tag_names
- }
- pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
- logits_processor = JSONLogitsProcessor(
- schema=pydantic_class,
- tokenizer=adapt_tokenizer(tokenizer=self._tokenizer), # type: ignore
- whitespace_pattern=r" ?",
- )
- log_once(
- "Using structured generation with the schema "
- f"{pydantic_class.model_json_schema()}",
- level=logging.DEBUG,
- )
- else:
- logits_processor = None
+ if self.generative_type == GenerativeType.REASONING:
+ log_once(
+ f"The model {self.model_config.model_id!r} is a reasoning model "
+ "and thus does not support structured generation, so we do not "
+ "enable it.",
+ level=logging.DEBUG,
+ )
+ else:
+ ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+ keys_and_their_types: dict[str, t.Any] = {
+ tag_name: (conlist(str, max_length=5), ...)
+ for tag_name in ner_tag_names
+ }
+ pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+ logits_processor = JSONLogitsProcessor(
+ schema=pydantic_class,
+ tokenizer=adapt_tokenizer(tokenizer=self._tokenizer), # type: ignore
+ whitespace_pattern=r" ?",
+ )
+ log_once(
+ "Using structured generation with the JSON schema "
+ f"{pydantic_class.model_json_schema()}",
+ level=logging.DEBUG,
+ )

  # Get the mapping from labels to the first token in the label. We call this each
  # time we generate a new dataset since the dataset config can change
  self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
- dataset_config=self.dataset_config, tokenizer=self._tokenizer
+ dataset_config=self.dataset_config,
+ model_config=self.model_config,
+ tokenizer=self._tokenizer,
+ generative_type=self.generative_type,
  )

  # Define the parameters used for vLLM generation
@@ -391,7 +404,10 @@ class VLLMModel(HuggingFaceEncoderModel):
  ) and should_prompts_be_stripped(
  labels_to_be_generated=labels_to_be_generated, tokenizer=self._tokenizer
  ):
- log_once(message="Stripping prompts.", level=logging.DEBUG)
+ log_once(
+ f"Stripping prompts for model {self.model_config.model_id!r}.",
+ level=logging.DEBUG,
+ )
  prompts = [prompt.strip() for prompt in prompts]

  # Generate sequences using vLLM
@@ -411,18 +427,64 @@ class VLLMModel(HuggingFaceEncoderModel):
  f"Encountered error during vLLM generation: {str(e)}. Retrying..."
  )
  sleep(1)
+ except ValueError as e:
+ # Truncate the prompts if they are too long for the model
+ truncate_error_messages = [
+ r"prompt \(length [0-9]+\) is longer than the maximum model length"
+ ]
+ if any(
+ re.search(pattern, str(e), flags=re.IGNORECASE) is not None
+ for pattern in truncate_error_messages
+ ):
+ logger.info(
+ "Prompts are too long, so truncating them and trying again..."
+ )
+ tokenized_prompts = self._tokenizer(
+ text=prompts,
+ truncation=True,
+ max_length=max(
+ self._tokenizer.model_max_length - max_tokens, 0
+ ),
+ )
+ prompts = self._tokenizer.batch_decode(
+ sequences=tokenized_prompts.input_ids, skip_special_tokens=True
+ )
+ else:
+ raise InvalidBenchmark(
+ f"An error occurred during vLLM generation: {str(e)}"
+ )
  else:
  raise InvalidBenchmark(
  f"Could not generate sequences after {num_attempts} attempts."
  )

+ # When we shorten the prompts then some residual model outputs persist, so we
+ # need to filter these out
+ num_extra_outputs = len(raw_outputs) - len(prompts)
+ if num_extra_outputs > 0:
+ raw_outputs = raw_outputs[num_extra_outputs:]
+ if not all(
+ raw_output.prompt == prompt
+ for raw_output, prompt in zip(raw_outputs, prompts)
+ ):
+ raise InvalidBenchmark(
+ f"The prompts and the model outputs do not match. There were "
+ f"{num_extra_outputs!r} extra outputs."
+ )
+ else:
+ logger.debug(
+ f"Filtered out {num_extra_outputs:,} extra outputs from the model, "
+ "which occured as we interupted the generation when we truncated "
+ "the prompts."
+ )
+
  # Parse the raw model outputs
  completion_ids: list[list[int]] = [
  output.outputs[0].token_ids for output in raw_outputs
  ]
  if self.end_of_reasoning_token_id in completion_ids[0]:
  completion_ids = [
- token_ids[token_ids.index(self.end_of_reasoning_token_id) + 2 :]
+ token_ids[token_ids.index(self.end_of_reasoning_token_id) + 1 :]
  if self.end_of_reasoning_token_id in token_ids
  else token_ids
  for token_ids in completion_ids
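
The new ValueError branch above re-tokenises over-long prompts with truncation and decodes them back to text before retrying. A minimal, self-contained sketch of that round-trip with a Hugging Face tokenizer; the "gpt2" checkpoint and the 32-token generation budget are arbitrary stand-ins, not values from the package:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer
max_tokens = 32  # stand-in for the generation budget
prompts = ["lorem ipsum " * 1_000]  # far longer than the model's context window

# Tokenise with truncation so that prompt plus generation fits the context
# window, then decode back to plain text, as in the hunk above.
tokenized_prompts = tokenizer(
    text=prompts,
    truncation=True,
    max_length=max(tokenizer.model_max_length - max_tokens, 0),
)
prompts = tokenizer.batch_decode(
    sequences=tokenized_prompts.input_ids, skip_special_tokens=True
)

print(len(tokenizer(text=prompts[0]).input_ids))  # now fits within the budget
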
@@ -435,6 +497,13 @@ class VLLMModel(HuggingFaceEncoderModel):
  )
  completions = [completion.strip() for completion in completions]

+ # Sanity check
+ if len(completions) != len(prompts):
+ breakpoint()
+ raise InvalidBenchmark(
+ f"Expected {len(prompts):,} completions, but got {len(completions):,}."
+ )
+
  # Add logprobs scores to the output
  if self.buffer["first_label_token_mapping"]:
  scores: list[list[list[tuple[str, float]]]] = [
@@ -809,7 +878,8 @@ class VLLMModel(HuggingFaceEncoderModel):
  if name.lower() in language_codes:
  chat_template = candidate_template
  log_once(
- f"Using the {name!r} chat template for the tokenizer.",
+ f"Using the {name!r} chat template for the tokenizer for "
+ f"model {self.model_config.model_id!r}.",
  level=logging.DEBUG,
  )
  break
@@ -1169,7 +1239,7 @@ def clear_vllm() -> None:


  def get_end_of_reasoning_token_id(
- model: "LLM", tokenizer: "PreTrainedTokenizer"
+ model: "LLM", tokenizer: "PreTrainedTokenizer", model_id: str
  ) -> int | None:
  """Get the end of reasoning token ID for a generative model.

@@ -1182,6 +1252,8 @@ def get_end_of_reasoning_token_id(
  The vLLM model.
  tokenizer:
  The tokenizer.
+ model_id:
+ The model ID.

  Returns:
  The end of reasoning token ID, or None if it could not be found.
@@ -1220,10 +1292,8 @@ def get_end_of_reasoning_token_id(
  completion_match = re.search(pattern=r"<\w+>", string=completion)
  if completion_match is None and prompt_match is None:
  log_once(
- message=(
- "Could not find a reasoning token, so assuming the model is not a "
- "reasoning model."
- ),
+ f"Could not find a reasoning token for model {model_id!r}, so assuming "
+ "the model is not a reasoning model.",
  level=logging.DEBUG,
  )
  return None
@@ -1249,20 +1319,17 @@ def get_end_of_reasoning_token_id(
  or end_of_reasoning_token not in special_tokens
  ):
  log_once(
- message=(
- f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
- f"token {end_of_reasoning_token!r}, but one of them is not registered "
- "as a special token, so assuming it is not a real reasoning token."
- ),
+ f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+ f"token {end_of_reasoning_token!r} for model {model_id!r}, but one of "
+ "them is not registered as a special token, so assuming it is not a "
+ "real reasoning token.",
  level=logging.DEBUG,
  )
  return None

  log_once(
- message=(
- f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
- f"token {end_of_reasoning_token!r}."
- ),
+ f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+ f"token {end_of_reasoning_token!r} for model {model_id!r}.",
  level=logging.DEBUG,
  )

euroeval/benchmarker.py CHANGED
@@ -782,7 +782,11 @@ class Benchmarker:
  dataset_languages=[
  language.code for language in dataset_config.languages
  ],
- model=model_config.model_id,
+ model=(
+ f"{model_config.model_id}@{model_config.revision}"
+ if model_config.revision and model_config.revision != "main"
+ else model_config.model_id
+ ),
  results=results,
  num_model_parameters=model.num_params,
  max_sequence_length=model.model_max_length,
@@ -1076,6 +1080,10 @@ def initial_logging(
  benchmark_config:
  The general benchmark configuration.
  """
+ model_id = model_config.model_id
+ if model_config.revision and model_config.revision != "main":
+ model_id += f"@{model_config.revision}"
+
  split_type = "validation" if not benchmark_config.evaluate_test_split else "test"
  if model_config.task in GENERATIVE_PIPELINE_TAGS:
  if benchmark_config.few_shot:
@@ -1084,8 +1092,9 @@ def initial_logging(
  eval_type = "Zero-shot benchmarking"
  else:
  eval_type = "Benchmarking"
+
  logger.info(
- f"{eval_type} {model_config.model_id} on the {split_type} split of "
+ f"{eval_type} {model_id} on the {split_type} split of "
  f"{dataset_config.pretty_name}"
  )

@@ -1095,6 +1104,7 @@
  "meaning that the resulting evaluation will not be included in the "
  "official leaderboard."
  )
+
  if benchmark_config.debug:
  logger.info(
  "Running in debug mode. This will output additional information, as "
euroeval/constants.py CHANGED
@@ -16,7 +16,7 @@ MAX_CONTEXT_LENGTH = 5_000

  # We need to raise the amount of tokens generated for reasoning models, to give them
  # time to think
- REASONING_MAX_TOKENS = 8_192
+ REASONING_MAX_TOKENS = 32_768


  # The Hugging Face Hub pipeline tags used to classify models as generative
euroeval/data_loading.py CHANGED
@@ -39,32 +39,9 @@ def load_data(
  HuggingFaceHubDown:
  If the Hugging Face Hub is down.
  """
- num_attempts = 5
- for _ in range(num_attempts):
- try:
- dataset = load_dataset(
- path=dataset_config.huggingface_id,
- cache_dir=benchmark_config.cache_dir,
- token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
- )
- break
- except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
- logger.warning(
- f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
- )
- time.sleep(1)
- continue
- except HfHubHTTPError:
- raise HuggingFaceHubDown()
- else:
- raise InvalidBenchmark(
- f"Failed to load dataset {dataset_config.huggingface_id!r} after "
- f"{num_attempts} attempts."
- )
-
- assert isinstance(dataset, DatasetDict) # type: ignore[used-before-def]
-
- dataset = DatasetDict({key: dataset[key] for key in ["train", "val", "test"]})
+ dataset = load_raw_data(
+ dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
+ )

  if not benchmark_config.evaluate_test_split:
  dataset["test"] = dataset["val"]
@@ -101,3 +78,48 @@ def load_data(
  for idx in range(benchmark_config.num_iterations)
  ]
  return datasets
+
+
+ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> DatasetDict:
+ """Load the raw dataset.
+
+ Args:
+ dataset_config:
+ The configuration for the dataset.
+ cache_dir:
+ The directory to cache the dataset.
+
+ Returns:
+ The dataset.
+ """
+ num_attempts = 5
+ for _ in range(num_attempts):
+ try:
+ dataset = load_dataset(
+ path=dataset_config.huggingface_id,
+ cache_dir=cache_dir,
+ token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
+ )
+ break
+ except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
+ logger.warning(
+ f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
+ )
+ time.sleep(1)
+ continue
+ except HfHubHTTPError:
+ raise HuggingFaceHubDown()
+ else:
+ raise InvalidBenchmark(
+ f"Failed to load dataset {dataset_config.huggingface_id!r} after "
+ f"{num_attempts} attempts."
+ )
+ assert isinstance(dataset, DatasetDict) # type: ignore[used-before-def]
+ required_keys = ["train", "val", "test"]
+ missing_keys = [key for key in required_keys if key not in dataset]
+ if missing_keys:
+ raise InvalidBenchmark(
+ "The dataset is missing the following required splits: "
+ f"{', '.join(missing_keys)}"
+ )
+ return DatasetDict({key: dataset[key] for key in required_keys})
euroeval/data_models.py CHANGED
@@ -521,14 +521,6 @@ class DatasetConfig:

  Returns:
  The natural string representation of the labels in specified language.
-
- Raises:
- NotImplementedError:
- If `and_separator` or `or_separator` are `None`, see `Language`.
-
- Example:
- >>> get_labels_str(language=DA)
- "'a', 'b', 'c' eller 'd'"
  """
  main_language = self.languages[0]

euroeval/dataset_configs/finnish.py ADDED
@@ -0,0 +1,60 @@
+ """All Finnish dataset configurations used in EuroEval."""
+
+ from ..data_models import DatasetConfig
+ from ..languages import FI
+ from ..tasks import COMMON_SENSE, LA, NER, RC, SENT, SUMM
+
+ ### Official datasets ###
+
+ SCANDISENT_FI_CONFIG = DatasetConfig(
+ name="scandisent-fi",
+ pretty_name="the truncated version of the Finnish part of the binary sentiment "
+ "classification dataset ScandiSent",
+ huggingface_id="EuroEval/scandisent-fi-mini",
+ task=SENT,
+ languages=[FI],
+ _labels=["negative", "positive"],
+ )
+
+ TURKU_NER_FI_CONFIG = DatasetConfig(
+ name="turku-ner-fi",
+ pretty_name="the Finnish part of the named entity recognition dataset Turku NER",
+ huggingface_id="EuroEval/turku-ner-fi-mini",
+ task=NER,
+ languages=[FI],
+ )
+
+ TYDIQA_FI_CONFIG = DatasetConfig(
+ name="tydiqa-fi",
+ pretty_name="the Finnish part of the TydiQA reading comprehension dataset",
+ huggingface_id="EuroEval/tydiqa-fi-mini",
+ task=RC,
+ languages=[FI],
+ )
+
+ XLSUM_FI_CONFIG = DatasetConfig(
+ name="xlsum-fi",
+ pretty_name="the Finnish summarisation dataset XL-Sum",
+ huggingface_id="EuroEval/xlsum-fi-mini",
+ task=SUMM,
+ languages=[FI],
+ )
+
+ HELLASWAG_FI_CONFIG = DatasetConfig(
+ name="hellaswag-fi",
+ pretty_name="the truncated version of the Finnish common-sense reasoning dataset "
+ "HellaSwag-fi, translated from the English HellaSwag dataset",
+ huggingface_id="EuroEval/hellaswag-fi-mini",
+ task=COMMON_SENSE,
+ languages=[FI],
+ )
+
+ SCALA_FI_CONFIG = DatasetConfig(
+ name="scala-fi",
+ pretty_name="the Finnish part of the linguistic acceptability dataset ScaLA",
+ huggingface_id="EuroEval/scala-fi",
+ task=LA,
+ languages=[FI],
+ )
+
+ ### Unofficial datasets ###

euroeval/prompt_templates/linguistic_acceptability.py CHANGED
@@ -1,7 +1,7 @@
  """Templates for the Linguistic Acceptability task."""

  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+ from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV

  LA_TEMPLATES = {
  DA: PromptConfig(
@@ -36,6 +36,14 @@ LA_TEMPLATES = {
  default_instruction_prompt="Texto: {text}\n\nDetermina si el texto es "
  "gramaticalmente correcto o no. Responde con {labels_str}, y nada más.",
  ),
+ FI: PromptConfig(
+ default_prompt_label_mapping=dict(correct="kyllä", incorrect="ei"),
+ default_prompt_prefix="Seuraavat ovat lauseita ja ovatko ne "
+ "kieliopillisesti oikein.",
+ default_prompt_template="Lause: {text}\nKieliopillisesti oikein: {label}",
+ default_instruction_prompt="Lause: {text}\n\nMääritä onko lause "
+ "oikein vai ei. Vastaa {labels_str}, ja ei mitään muuta.",
+ ),
  FO: PromptConfig(
  default_prompt_label_mapping=dict(correct="ja", incorrect="nei"),
  default_prompt_prefix="Hetta eru nakrir setningar og um teir eru mállæruliga "

euroeval/prompt_templates/multiple_choice.py CHANGED
@@ -1,7 +1,7 @@
  """Templates for all multiple choice tasks."""

  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FR, IS, IT, NB, NL, NN, NO, SV
+ from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV

  # TODO: Missing Faroese
  MULTIPLE_CHOICE_TEMPLATES = {
@@ -36,6 +36,13 @@ MULTIPLE_CHOICE_TEMPLATES = {
  "usando solo {labels_str}, y nada más.",
  default_prompt_label_mapping="auto",
  ),
+ FI: PromptConfig(
+ default_prompt_prefix="Seuraavat ovat monivalintakysymyksiä (vastauksineen).",
+ default_prompt_template="Kysymys: {text}\nVastaus: {label}",
+ default_instruction_prompt="Kysymys: {text}\n\nVastaa yllä olevaan kysymykseen "
+ "käyttämällä {labels_str}, äläkä mitään muuta.",
+ default_prompt_label_mapping="auto",
+ ),
  FR: PromptConfig(
  default_prompt_prefix="Les questions suivantes sont des questions à choix "
  "multiples (avec réponses).",

euroeval/prompt_templates/named_entity_recognition.py CHANGED
@@ -1,7 +1,7 @@
  """Templates for the Named Entity Recognition task."""

  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+ from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV

  NER_TEMPLATES = {
  DA: PromptConfig(
@@ -80,6 +80,25 @@ NER_TEMPLATES = {
  "claves {labels_str}. Los valores deben ser listas de las "
  "entidades nombradas de ese tipo, exactamente como aparecen en la oración.",
  ),
+ FI: PromptConfig(
+ default_prompt_label_mapping={
+ "b-per": "henkilö",
+ "i-per": "henkilö",
+ "b-loc": "paikka",
+ "i-loc": "paikka",
+ "b-org": "organisaatio",
+ "i-org": "organisaatio",
+ "b-misc": "muut",
+ "i-misc": "muut",
+ },
+ default_prompt_prefix="Seuraavassa on lauseita ja JSON-sanakirjoja, jotka "
+ "sisältävät annetussa lauseessa esiintyvät nimetyt entiteetit.",
+ default_prompt_template="Lause: {text}\nNimetyt entiteetit: {label}",
+ default_instruction_prompt="Lause: {text}\n\nTunnista lauseessa olevat "
+ "entiteetit. Tulosta ne JSON-sanakirjana, jonka avaimet ovat {labels_str}. "
+ "Arvojen tulee olla listoja kyseisen tyypin nimetyistä entiteeteistä "
+ "täsmälleen siinä muodossa kuin ne esiintyvät lauseessa.",
+ ),
  FO: PromptConfig(
  default_prompt_label_mapping={
  "b-per": "persónur",

euroeval/prompt_templates/reading_comprehension.py CHANGED
@@ -1,7 +1,7 @@
  """Templates for the Reading Comprehension task."""

  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+ from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV

  RC_TEMPLATES = {
  DA: PromptConfig(
@@ -39,6 +39,16 @@ RC_TEMPLATES = {
  "sobre el texto anterior en máximo 3 palabras.\n\nPregunta: {question}",
  default_prompt_label_mapping=dict(),
  ),
+ FI: PromptConfig(
+ default_prompt_prefix="Seuraavassa on tekstejä ja niihin liittyviä kysymyksiä "
+ "ja vastauksia.",
+ default_prompt_template="Teksti: {text}\nKysymys: {question} "
+ "\nVastaa enintään 3 sanalla: {label}",
+ default_instruction_prompt="Teksti: {text}\n\nVastaa seuraavaan "
+ "kysymykseen yllä olevasta tekstistä enintään 3 sanalla.\n\n"
+ "Kysymys: {question}",
+ default_prompt_label_mapping=dict(),
+ ),
  FO: PromptConfig(
  default_prompt_prefix="Hetta eru tekstir saman við spurningum og svar.",
  default_prompt_template="Tekstur: {text}\nSpurningur: {question}\nSvara við í "

euroeval/prompt_templates/sentiment_classification.py CHANGED
@@ -1,7 +1,7 @@
  """Templates for the Sentiment Analysis task."""

  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+ from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV

  SENT_TEMPLATES = {
  DA: PromptConfig(
@@ -44,6 +44,16 @@ SENT_TEMPLATES = {
  default_instruction_prompt="Documento: {text}\n\nClasifica el sentimiento del "
  "documento. Responde con {labels_str}, y nada más.",
  ),
+ FI: PromptConfig(
+ default_prompt_label_mapping=dict(
+ positive="positiivinen", neutral="neutrali", negative="negatiivinen"
+ ),
+ default_prompt_prefix="Seuraavassa on arvosteluja ja niiden tunnesävy, joka "
+ "voi olla {labels_str}.",
+ default_prompt_template="Teksti: {text}\nTunnesävy: {label}",
+ default_instruction_prompt="Teksti: {text}\n\nLuokittele arvostelun tunnesävy. "
+ "Vastaa vain {labels_str}, ei muuta.",
+ ),
  FO: PromptConfig(
  default_prompt_label_mapping=dict(
  positive="positivt", neutral="neutralt", negative="negativt"

euroeval/prompt_templates/summarization.py CHANGED
@@ -1,7 +1,7 @@
  """Templates for the Summarization task."""

  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FR, IS, IT, NB, NL, NN, NO, SV
+ from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV

  # TODO: Missing Faroese
  SUMM_TEMPLATES = {
@@ -36,6 +36,14 @@ SUMM_TEMPLATES = {
  "documento anterior.",
  default_prompt_label_mapping=dict(),
  ),
+ FI: PromptConfig(
+ default_prompt_prefix="Seuraavassa on artikkeleita ja niihin liittyviä "
+ "tiivistelmiä.",
+ default_prompt_template="Uutisartikkeli: {text}\nTiivistelmä: {target_text}",
+ default_instruction_prompt="Uutisartikkeli: {text}\n\nKirjoita tiivistelmä "
+ "yllä olevasta artikkelista.",
+ default_prompt_label_mapping=dict(),
+ ),
  FR: PromptConfig(
  default_prompt_prefix="Les documents suivants sont accompagnés d'un résumé.",
  default_prompt_template="Document: {text}\nRésumé: {target_text}",

euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -132,22 +132,23 @@ def extract_labels_from_generation(
  The predicted labels.
  """
  if model_output.scores is not None:
- return get_closest_logprobs_labels(
+ labels = get_closest_logprobs_labels(
  generation_logprobs=model_output.scores,
  dataset_config=dataset_config,
  first_label_token_mapping=first_label_token_mapping,
  )
- else:
- return get_closest_word_edit_labels(
- generated_sequences=model_output.sequences, dataset_config=dataset_config
- )
+ if labels is not None:
+ return labels
+ return get_closest_word_edit_labels(
+ generated_sequences=model_output.sequences, dataset_config=dataset_config
+ )


  def get_closest_logprobs_labels(
  generation_logprobs: list[list[list[tuple[str, float]]]],
  dataset_config: "DatasetConfig",
  first_label_token_mapping: dict[str, str] | bool,
- ) -> list[str]:
+ ) -> list[str] | None:
  """Get the labels with the highest predicted logprob value.

  In case a candidate label is split into multiple tokens, we only use the first
@@ -167,7 +168,7 @@ def get_closest_logprobs_labels(
  mapping is outputted then the model will always output scores).

  Returns:
- The predicted labels.
+ The predicted labels, or None if labels could not be extracted.

  Raises:
  InvalidBenchmark:
@@ -193,10 +194,7 @@ def get_closest_logprobs_labels(
  # We want to use the first generated label which contains a unique candidate
  # label, as the output label
  output_label: str | None = None
- previously_generated_labels: list[str] = list()
- for label_idx, generated_label in enumerate(generated_labels):
- generated_label = "".join(previously_generated_labels) + generated_label
-
+ for generated_label in generated_labels:
  # Get the candidate labels that starts with the generated label
  if isinstance(first_label_token_mapping, dict):
  if any(
@@ -222,31 +220,28 @@ def get_closest_logprobs_labels(
  if candidate_label.startswith(generated_label)
  }

- # If we can uniquely determine the output label, we break the loop. If
- # there are multiple possible labels then we store the current one, and
- # concatenate it with the next generated label. We can only do this if
- # the current one is the first one, however, since we're using greedy
- # sampling. In case this happens for a label that is not the first one,
- # we warn the user.
+ # If we can uniquely determine the output label, we break the loop.
  if len(candidate_output_labels) == 1:
  output_label = candidate_output_labels.pop()
  break
+
+ # If we have multiple candidate labels, we cannot uniquely determine the
+ # output label, so we abandon extracting the labels using logprobs and
+ # fall back to using word edit distance.
  elif len(candidate_output_labels) > 1:
- if label_idx == 0:
- previously_generated_labels.append(generated_label)
- else:
- output_label = candidate_output_labels.pop()
- candidate_output_labels.add(output_label)
- raise InvalidBenchmark(
- "Multiple candidate labels found for the generated label "
- f"{generated_label!r}: {candidate_output_labels}. Since "
- "this is not the first generated label, we cannot "
- "concatenate it with the next generated label. We are thus "
- f"forced to use the arbitrary {output_label!r} as the "
- "output label, potentially resulting in worse performance. "
- "Please report this issue to the EuroEval team at "
- "github.com/EuroEval/EuroEval/issues."
- )
+ log_once(
+ "Multiple candidate labels found for the generated label "
+ f"{generated_label!r}: {candidate_output_labels}. This means "
+ "that using logprobs to extract the labels is not reliable, "
+ "and we will instead fall back to extracting the labels "
+ "using word edit distance.",
+ level=logging.DEBUG,
+ )
+ return None
+
+ # If no candidate label is found, we ignore the generated label, as it
+ # basically means that the model is just really bad at generating
+ # labels.
  elif len(candidate_output_labels) == 0:
  logger.debug(
  f"No candidate label found for the generated label "

euroeval/task_group_utils/text_to_text.py CHANGED
@@ -10,11 +10,7 @@ from evaluate import EvaluationModule
  from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
  from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
  from ..exceptions import InvalidBenchmark
- from ..utils import (
- HiddenPrints,
- clear_memory,
- raise_if_model_output_contains_nan_values,
- )
+ from ..utils import HiddenPrints, raise_if_model_output_contains_nan_values

  if t.TYPE_CHECKING:
  from transformers.trainer_utils import EvalPrediction
@@ -89,20 +85,8 @@ def compute_metrics(
  score_dict: dict[str, float] | None = metric.compute(
  predictions=predictions, references=labels, **cfg.compute_kwargs
  )
-
- # Clear the cache of the BERTScorer to avoid memory leaks
- for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
- if hasattr(metric, attribute):
- delattr(metric, attribute)
-
- clear_memory()
  break
  except Exception as e:
- # Clear the cache of the BERTScorer to avoid memory leaks
- if hasattr(metric, "cached_bertscorer"):
- del metric.cached_bertscorer
- clear_memory()
-
  oom_error = [
  "CUDA out of memory",
  "CUDA error",
@@ -111,16 +95,7 @@ def compute_metrics(
  if not any(error in str(e) for error in oom_error):
  raise InvalidBenchmark(str(e))

- if cfg.compute_kwargs.get("batch_size", 1) > 1:
- batch_size = cfg.compute_kwargs["batch_size"]
- cfg.compute_kwargs["batch_size"] = batch_size // 2
- logger.debug(
- "Out of memory error occurred during the computation of "
- f"the metric {cfg.pretty_name}. Reducing the batch size to "
- f"{cfg.compute_kwargs['batch_size']}."
- )
- elif cfg.compute_kwargs.get("device", "cpu") != "cpu":
- cfg.compute_kwargs["batch_size"] = 32
+ if cfg.compute_kwargs.get("device", "cpu") != "cpu":
  cfg.compute_kwargs["device"] = "cpu"
  logger.debug(
  "Out of memory error occurred during the computation of "
@@ -129,6 +104,14 @@ def compute_metrics(
  )
  else:
  raise InvalidBenchmark(str(e))
+ finally:
+ for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
+ if hasattr(metric, attribute):
+ logger.debug(
+ f"Deleting the {attribute!r} attribute of the metric "
+ f"{cfg.pretty_name} to free up memory."
+ )
+ delattr(metric, attribute)

  # The metric returns None if we are running on multi-GPU and the current
  # process is not the main process
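
The hunk above replaces ad-hoc cache deletion with a single finally block, so attributes holding large scorer objects are released whether metric.compute succeeds or raises. A minimal sketch of the pattern with a dummy metric; the attribute name comes from the removed code above, while the metric class and the contents of METRIC_ATTRIBUTES_TAKING_UP_MEMORY are assumptions for illustration:

METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]  # assumed contents

class DummyMetric:
    """Stand-in for an evaluate metric that caches a large scorer object."""

    def compute(self) -> dict[str, float]:
        self.cached_bertscorer = object()  # stands in for a BERTScorer instance
        return {"f1": 1.0}

metric = DummyMetric()
try:
    score_dict = metric.compute()
finally:
    # Drop the cached attributes regardless of whether compute() succeeded.
    for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
        if hasattr(metric, attribute):
            delattr(metric, attribute)

assert not hasattr(metric, "cached_bertscorer")
print(score_dict)
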
euroeval/tasks.py CHANGED
@@ -142,7 +142,7 @@ SUMM = Task(
  huggingface_id="bertscore",
  results_key="f1",
  compute_kwargs=dict(
- model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=32
+ model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
  ),
  ),
  MetricConfig(

euroeval/tokenization_utils.py CHANGED
@@ -7,6 +7,7 @@ import typing as t
  import torch

  from .constants import TASK_GROUPS_USING_LOGPROBS
+ from .enums import GenerativeType
  from .exceptions import InvalidModel
  from .utils import log_once

@@ -14,7 +15,7 @@ if t.TYPE_CHECKING:
  from transformers.tokenization_utils import PreTrainedTokenizer
  from transformers.tokenization_utils_base import PreTrainedTokenizerBase

- from .data_models import DatasetConfig
+ from .data_models import DatasetConfig, ModelConfig


  logger = logging.getLogger("euroeval")
@@ -254,35 +255,50 @@ def get_end_of_chat_token_ids(tokenizer: "PreTrainedTokenizer") -> list[int] | N


  def get_first_label_token_mapping(
- dataset_config: "DatasetConfig", tokenizer: "PreTrainedTokenizer | None"
+ dataset_config: "DatasetConfig",
+ model_config: "ModelConfig",
+ tokenizer: "PreTrainedTokenizer | None",
+ generative_type: "GenerativeType | None",
  ) -> dict[str, str] | bool:
  """Check if the model should output scores.

  Args:
  dataset_config:
  The dataset configuration.
+ model_config:
+ The model configuration.
  tokenizer:
  The tokenizer, or None if not available.
+ generative_type:
+ The generative type, or None if not available.

  Returns:
  A mapping from labels to the first token in each label, or alternatively a
  Boolean value indicating whether the model should output scores (if the mapping
  is outputted then the model will always output scores).
  """
+ if generative_type == GenerativeType.REASONING:
+ log_once(
+ f"The model {model_config.model_id!r} is a reasoning model and "
+ "thus does not support logprobs, so we do not enable it.",
+ level=logging.DEBUG,
+ )
+ return False
+
  # If we do not have any tokenizer, then we cannot check if the model should output
  # scores and we just assume it should if the dataset supports it
  output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
  if tokenizer is None:
  if output_scores:
  log_once(
- "The model will output scores, since the dataset supports it and no "
- "tokenizer is available.",
+ f"The model {model_config.model_id!r} will output scores, since the "
+ "dataset supports it and no tokenizer is available.",
  level=logging.DEBUG,
  )
  else:
  log_once(
- "The model will not output scores, since the dataset does not support "
- "it and no tokenizer is available.",
+ f"The model {model_config.model_id!r} will not output scores, since "
+ "the dataset does not support it and no tokenizer is available.",
  level=logging.DEBUG,
  )
  return output_scores

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 15.6.1
+ Version: 15.7.0
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -32,7 +32,7 @@ Requires-Python: <4.0,>=3.10
  Requires-Dist: accelerate>=0.34.2
  Requires-Dist: bert-score>=0.3.13
  Requires-Dist: click>=8.1.3
- Requires-Dist: datasets>=2.15.0
+ Requires-Dist: datasets>=3.5.0
  Requires-Dist: demjson3>=3.0.6
  Requires-Dist: evaluate>=0.4.1
  Requires-Dist: huggingface-hub>=0.30.1
@@ -239,6 +239,18 @@ A huge thank you to all the contributors who have helped make this project a suc
  <a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
  <a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>

+
+ ### Contribute to EuroEval
+
+ We welcome contributions to EuroEval! Whether you're fixing bugs, adding features, or
+ contributing new datasets, your help makes this project better for everyone.
+
+ - **General contributions**: Check out our [contribution guidelines](CONTRIBUTING.md)
+ for information on how to get started.
+ - **Adding datasets**: If you're interested in adding a new dataset to EuroEval, we have
+ a [dedicated guide](NEW_DATASET_GUIDE.md) with step-by-step instructions.
+
+
  ### Special Thanks
  - Thanks to [Google](https://google.com/) for sponsoring Gemini credits as part of their
  [Google Cloud for Researchers Program](https://cloud.google.com/edu/researchers).

@@ -1,11 +1,11 @@
  euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
  euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
- euroeval/benchmarker.py,sha256=7LVFr7zL7OeJPs7WVYwekNnEmiIKPXHydcbAkW99MUk,48080
+ euroeval/benchmarker.py,sha256=gOLNpW11cBX_8AvotnlGNbejtOM4acmXS3aovNREqhA,48434
  euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
  euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
- euroeval/constants.py,sha256=t2mAT8tE3Dn2lXWHTnaFoaOIaUcdiBjJTASCt7nSdkg,1984
- euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
- euroeval/data_models.py,sha256=oZLrGg1dhIIwbgtEzq4U_fu_ZbBsz35mrqsyizuZNPw,23138
+ euroeval/constants.py,sha256=p6kp_R6-Tq5LBvyXyT6Sa6N3SkjEElGS2LSZRBoQaYs,1985
+ euroeval/data_loading.py,sha256=L_REtxefte5Ke4xE_Cz01zkfCyKlOYhSqT5ZXXulHPc,3992
+ euroeval/data_models.py,sha256=Nlb2s26u5OvQ2AITAt25NMpeI1IHM2_qqbpyU_bZhiY,22907
  euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
  euroeval/exceptions.py,sha256=LRd7HoudupRp5-AX3L0X4hIAWCa6JVx-LViHPg7u7dg,5821
  euroeval/finetuning.py,sha256=IieAhgvxjeLHAHBief1Ay-STcCosQmrDHFTRTXFZX0Q,10743
@@ -17,21 +17,22 @@ euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
  euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
  euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
  euroeval/speed_benchmark.py,sha256=J7VKWMf7GU_l0lRR8f0QeUr_vAaBQqTbgQ_yToHhp_0,3980
- euroeval/tasks.py,sha256=VVXFDcEM250KTGXd1pxQb8vwdia4ZJxgTUY5Kdsa-ik,7070
- euroeval/tokenization_utils.py,sha256=PNuS-FTdVrL9TWNDGlq42MvUggKwmyYM0BnC5I37IO0,11876
+ euroeval/tasks.py,sha256=87gbe__K5KNIb1aBSuwGnMPmZgamJFecNNYmNgMxaVo,7069
+ euroeval/tokenization_utils.py,sha256=fbMVAOkRdcpf9L2SVechPpmWYgDXgQcc-sDrYu21wFI,12487
  euroeval/types.py,sha256=E0JhLfg-ek5pdFcYJbnGRUSodHxkuR3o8XGuIrBcuRM,2485
  euroeval/utils.py,sha256=DyWhtdFlAM1TZuiYXWNPN8KxNrZGNa-J3WfS6DGwkvM,10467
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
  euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
  euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
  euroeval/benchmark_modules/hf.py,sha256=yFApLL4_ia5Kw2iat5RSI8h5RhI4OP04HlzYidlhBCs,44012
- euroeval/benchmark_modules/litellm.py,sha256=wohdi1WoeJ-JEdQLgg2q3JbZJA77XO7yGZaTRvbRU4o,47575
- euroeval/benchmark_modules/vllm.py,sha256=FTpwal5WdrVsOpkjm_RXwf6-2PrNrrP1LO6BVGYb6GE,48086
+ euroeval/benchmark_modules/litellm.py,sha256=9Fhh7Zyn6F4JBlRoQkST1wIeb8z0YliRRrcmD5pONs4,52551
+ euroeval/benchmark_modules/vllm.py,sha256=vwAE7SGRhePqkzAt1S-FKPelEqe8VMGwah9Nj2J1hLs,51295
  euroeval/dataset_configs/__init__.py,sha256=fkD1hzW7szJLc1MdK-AY4EBFWBUX5Z8t4f9uBHQnRvU,1858
  euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
  euroeval/dataset_configs/dutch.py,sha256=N3zL0vGe4OyPgVU_AiYNNfk96jSc_JDtKrVIHbaEYCU,3536
  euroeval/dataset_configs/english.py,sha256=yHw7D0zSNVbiSBAjR1mWX4V5FSkhqy4y-o-pnyWCLxE,2323
  euroeval/dataset_configs/faroese.py,sha256=QQgLe5gv0f3AtXe5rV65xZ98gFgyITQPDr3UwO4Bnv4,1350
+ euroeval/dataset_configs/finnish.py,sha256=BIfcxdgJu4CfevHAjzwH7cYd8Xd9DGrm49lcJZcGVQM,1730
  euroeval/dataset_configs/french.py,sha256=ATsj8_9_GxFTQgmfrniPQFZ1R9hoQCI1_ieWTnscFHU,2382
  euroeval/dataset_configs/german.py,sha256=QO6PrBQY6kyZeQMU1vg6KrC_sKyj9U2ukS9nbKO19is,2560
  euroeval/dataset_configs/icelandic.py,sha256=mncl7X4yO9gBmYqXMBfm7FKU1jcKryerSgd0dqlIA_4,4198
@@ -40,20 +41,20 @@ euroeval/dataset_configs/norwegian.py,sha256=2SD5681gZFa1Ig-AEpnyStbivan_bq_Pada
  euroeval/dataset_configs/spanish.py,sha256=fc0dHWU7-g_p6kaSGA8nD1vLVQF_yqR2PkixrYyWywc,2212
  euroeval/dataset_configs/swedish.py,sha256=SOD2nKQTVwTpTvr362mDPHon42kr9vWs5C0mK02Fh-o,2811
  euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
- euroeval/prompt_templates/linguistic_acceptability.py,sha256=sx_WqLm7N6Thll6COUCCA0lXe9RMZ7WhoH6X498pixM,6232
- euroeval/prompt_templates/multiple_choice.py,sha256=H0CDQPs_WzgSJ7oI_FBzHs0TOF0Na2qZYJLhDC7S8tk,4710
- euroeval/prompt_templates/named_entity_recognition.py,sha256=T65oFEtVT8JRF9c7bq2nPm233rftPdEAGic0DU-toko,11835
- euroeval/prompt_templates/reading_comprehension.py,sha256=WbQoal_tjoTt7qsmSZXEWwlI77vgiANcZoZC1l1AZjc,6090
- euroeval/prompt_templates/sentiment_classification.py,sha256=LcFD89e5nMOv4u-Unj8_jHpNjKMmgKPEfz0-e39VbsM,6639
- euroeval/prompt_templates/summarization.py,sha256=eX0uUTf_5Xorm6f_TlBBNwLC9zKvR7YJkP0RSaLWgIw,4585
+ euroeval/prompt_templates/linguistic_acceptability.py,sha256=FAIJKS26EVRxlLHk1C3lN0GDtd5AM0MwvaMf-NNIxfU,6677
+ euroeval/prompt_templates/multiple_choice.py,sha256=6iEqiPpT-3WJN_gsyhyapnwsrcsYGdVkSkzwn-VKKxw,5101
+ euroeval/prompt_templates/named_entity_recognition.py,sha256=Xd6gBJD2e1l8-We2Ujor7crRUBcbgnNeeVknBIrTMJo,12737
+ euroeval/prompt_templates/reading_comprehension.py,sha256=yLqryWQAW04GULz_EyNDLOS7ZrDUeasuLFt-dtqCnYk,6585
+ euroeval/prompt_templates/sentiment_classification.py,sha256=LDOwjGQ2kqhwgNyphPywQeolwNB09o-xYWc9RUbzc84,7136
+ euroeval/prompt_templates/summarization.py,sha256=mcWeKNhGWmp7IG_iY64T-VOSabQg5wKddjSbJNYFDp8,4984
  euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
  euroeval/task_group_utils/multiple_choice_classification.py,sha256=nB78TzOgd0HBvTclmjOYJid9ZVAgu8IHZsqB_n1SAZU,6178
  euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iYvfoGt0EUObSaXRCGmk,27700
- euroeval/task_group_utils/sequence_classification.py,sha256=gqd0-l5o7vAY5QIpGSkSqwJwez3Y0r5SqOiywfPNW8A,12239
- euroeval/task_group_utils/text_to_text.py,sha256=QECnGdZ0YLjsbMc6LwXqVi4KMuITdiOjmJUNQtAAOW0,5712
+ euroeval/task_group_utils/sequence_classification.py,sha256=xPz1gJioK96iv2bNoDWiC2EJkhRvRd7QZNgY8bT237c,11703
+ euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
  euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
- euroeval-15.6.1.dist-info/METADATA,sha256=4i98IBxn6yWh4ugBW-SnljmDfKEXBSfRGjZyf_dlOUs,13183
- euroeval-15.6.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- euroeval-15.6.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
- euroeval-15.6.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
- euroeval-15.6.1.dist-info/RECORD,,
+ euroeval-15.7.0.dist-info/METADATA,sha256=8oMsbhHWeO7j4KQdn4lpt-O94Nw0erwRoD_Ogk6CX2U,13669
+ euroeval-15.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ euroeval-15.7.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+ euroeval-15.7.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+ euroeval-15.7.0.dist-info/RECORD,,