PyPI - EuroEval - Versions diffs - 16.0.0__py3-none-any.whl → 16.1.0__py3-none-any.whl - Mend

EuroEval 16.0.0__py3-none-any.whl → 16.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of EuroEval might be problematic. Click here for more details.

Files changed (51) hide show

euroeval/__init__.py +5 -0
euroeval/benchmark_config_factory.py +6 -1
euroeval/benchmark_modules/base.py +2 -0
euroeval/benchmark_modules/fresh.py +7 -1
euroeval/benchmark_modules/hf.py +26 -21
euroeval/benchmark_modules/litellm.py +258 -131
euroeval/benchmark_modules/vllm.py +120 -68
euroeval/benchmarker.py +11 -2
euroeval/cli.py +14 -1
euroeval/constants.py +7 -1
euroeval/data_models.py +95 -20
euroeval/dataset_configs/__init__.py +1 -0
euroeval/dataset_configs/danish.py +14 -3
euroeval/dataset_configs/dutch.py +14 -0
euroeval/dataset_configs/english.py +22 -0
euroeval/dataset_configs/estonian.py +15 -7
euroeval/dataset_configs/finnish.py +14 -0
euroeval/dataset_configs/french.py +14 -0
euroeval/dataset_configs/german.py +23 -0
euroeval/dataset_configs/italian.py +14 -0
euroeval/dataset_configs/latvian.py +14 -0
euroeval/dataset_configs/norwegian.py +14 -0
euroeval/dataset_configs/polish.py +126 -0
euroeval/dataset_configs/portuguese.py +14 -0
euroeval/dataset_configs/spanish.py +14 -0
euroeval/dataset_configs/swedish.py +25 -0
euroeval/enums.py +12 -0
euroeval/generation.py +17 -8
euroeval/generation_utils.py +102 -16
euroeval/metrics/pipeline.py +51 -9
euroeval/model_cache.py +13 -1
euroeval/prompt_templates/linguistic_acceptability.py +9 -0
euroeval/prompt_templates/multiple_choice.py +27 -1
euroeval/prompt_templates/named_entity_recognition.py +20 -0
euroeval/prompt_templates/reading_comprehension.py +11 -0
euroeval/prompt_templates/sentiment_classification.py +15 -0
euroeval/prompt_templates/summarization.py +27 -1
euroeval/scores.py +5 -0
euroeval/task_group_utils/multiple_choice_classification.py +2 -2
euroeval/task_group_utils/question_answering.py +29 -29
euroeval/task_group_utils/sequence_classification.py +71 -81
euroeval/task_group_utils/token_classification.py +17 -3
euroeval/tasks.py +12 -10
euroeval/{tokenization_utils.py → tokenisation_utils.py} +41 -25
euroeval/utils.py +67 -3
{euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/METADATA +3 -1
euroeval-16.1.0.dist-info/RECORD +70 -0
euroeval-16.0.0.dist-info/RECORD +0 -69
{euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/WHEEL +0 -0
{euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/entry_points.txt +0 -0
{euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/licenses/LICENSE +0 -0

euroeval/task_group_utils/token_classification.py CHANGED Viewed

@@ -215,6 +215,20 @@ def extract_labels_from_generation(
         prompt_label_mapping = dataset_config.prompt_label_mapping
         for prompt_tag_name, named_entities in prediction_dict.items():
+            if not isinstance(named_entities, list):
+                logger.debug(
+                    "The model produced an invalid format for the named entities. "
+                    f"Expected a list but got {type(named_entities)}. Skipping."
+                )
+                continue
+            try:
+                named_entities = [str(ne) for ne in named_entities]
+            except Exception:
+                logger.debug(
+                    "The model produced an invalid format for the named entities. "
+                    f"Expected a list of strings but got {named_entities}. Skipping."
+                )
+                continue
             try:
                 tag_name = [
                     tag[2:]
@@ -259,7 +273,7 @@ def tokenize_and_align_labels(
     Returns:
         A dictionary containing the tokenized data as well as labels.
     """
-    # Tokenize the texts. We use the `is_split_into_words` argument here because
+    # Tokenise the texts. We use the `is_split_into_words` argument here because
     # the texts in our dataset are lists of words (with a label for each word)
     tokenized_inputs = tokeniser(
         examples["tokens"], is_split_into_words=True, truncation=True, padding=True
@@ -382,7 +396,7 @@ def handle_unk_tokens(
     Args:
         tokeniser:
-            The tokeniser used to tokenize the words.
+            The tokeniser used to tokenise the words.
         tokens:
             The list of tokens.
         words:
@@ -409,7 +423,7 @@ def handle_unk_tokens(
         # Fetch the word
         word = words[word_idx]
-        # Tokenize the word, which is now a list containing at least one UNK token
+        # Tokenise the word, which is now a list containing at least one UNK token
         tokens_with_unk = tokeniser.convert_ids_to_tokens(
             tokeniser.encode(word, add_special_tokens=False)
         )

euroeval/tasks.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """All benchmarks tasks used in EuroEval."""
 from . import metrics as m
+from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import Task
 from .enums import GenerativeType, ModelType, TaskGroup
 from .prompt_templates import (
@@ -28,7 +29,7 @@ LA = Task(
     template_dict=LA_TEMPLATES,
     metrics=[m.mcc_metric, m.macro_f1_metric],
     default_num_few_shot_examples=12,
-    default_max_generated_tokens=10,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["correct", "incorrect"],
     uses_logprobs=True,
 )
@@ -73,7 +74,7 @@ SENT = Task(
     template_dict=SENT_TEMPLATES,
     metrics=[m.mcc_metric, m.macro_f1_metric],
     default_num_few_shot_examples=12,
-    default_max_generated_tokens=10,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["positive", "neutral", "negative"],
     uses_logprobs=True,
 )
@@ -87,7 +88,7 @@ SUMM = Task(
     default_num_few_shot_examples=1,
     default_max_generated_tokens=256,
     default_labels=[],
-    allowed_model_types=[ModelType.GENERATIVE],
+    default_allowed_model_types=[ModelType.GENERATIVE],
 )
@@ -97,7 +98,7 @@ KNOW = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=10,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
     uses_logprobs=True,
 )
@@ -109,7 +110,7 @@ MCRC = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=10,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
     uses_logprobs=True,
 )
@@ -121,7 +122,7 @@ COMMON_SENSE = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=10,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
     uses_logprobs=True,
 )
@@ -133,15 +134,16 @@ EUROPEAN_VALUES = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.european_values_metric],
     default_num_few_shot_examples=0,
-    default_max_generated_tokens=10,
-    default_labels=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
-    allowed_model_types=[ModelType.GENERATIVE],
-    allowed_generative_types=[
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
+    default_allowed_model_types=[ModelType.GENERATIVE],
+    default_allowed_generative_types=[
         GenerativeType.INSTRUCTION_TUNED,
         GenerativeType.REASONING,
     ],
     requires_zero_shot=True,
     uses_logprobs=True,
+    default_allow_invalid_model_outputs=False,
 )

euroeval/{tokenization_utils.py → tokenisation_utils.py} RENAMED Viewed

@@ -1,4 +1,4 @@
-"""Utility functions related to tokenization."""
+"""Utility functions related to tokenisation."""
 import logging
 import re
@@ -7,9 +7,8 @@ import typing as t
 import torch
 from transformers import MistralCommonTokenizer
-from euroeval.exceptions import InvalidModel
 from .enums import GenerativeType
+from .exceptions import InvalidModel
 from .utils import log_once
 if t.TYPE_CHECKING:
@@ -80,8 +79,8 @@ def should_prompts_be_stripped(
     """Determine if we should strip the prompts for few-shot evaluation.
     This is the case if the tokeniser needs to include the space as part of the label
-    token. The strategy is thus to tokenize a label with a preceeding colon (as in the
-    prompts), i.e., ": positive", and check if the tokenization starts with the tokens
+    token. The strategy is thus to tokenise a label with a preceeding colon (as in the
+    prompts), i.e., ": positive", and check if the tokenisation starts with the tokens
     of ": ". If this is the case, then we should not strip the prompts, since the
     tokeniser produces the whitespace token separately.
@@ -89,7 +88,7 @@ def should_prompts_be_stripped(
         labels_to_be_generated:
             The labels that are to be generated.
         tokeniser:
-            The tokeniser used to tokenize the labels.
+            The tokeniser used to tokenise the labels.
     Returns:
         Whether we should strip the prompts.
@@ -125,7 +124,7 @@ def should_prefix_space_be_added_to_labels(
         labels_to_be_generated:
             The labels that are to be generated.
         tokeniser:
-            The tokeniser used to tokenize the labels.
+            The tokeniser used to tokenise the labels.
     Returns:
         Whether we should add a prefix space to the labels.
@@ -319,7 +318,9 @@ def get_pad_token(
     return pad_token, pad_token_id
-def get_end_of_chat_token_ids(tokeniser: "PreTrainedTokenizer") -> list[int] | None:
+def get_end_of_chat_token_ids(
+    tokeniser: "PreTrainedTokenizer", generative_type: GenerativeType | None
+) -> list[int] | None:
     """Get the end token ID for chat models.
     This is only relevant for tokenisers with a chat template.
@@ -327,20 +328,23 @@ def get_end_of_chat_token_ids(tokeniser: "PreTrainedTokenizer") -> list[int] | N
     Args:
         tokeniser:
             The tokeniser.
+        generative_type:
+            The generative type, or None if not available.
     Returns:
         The token IDs used to end chats, or None if the tokeniser does not have a chat
         template or if no end-of-chat token could be found.
     """
-    if not has_chat_template(tokeniser=tokeniser):
+    if generative_type == GenerativeType.BASE:
         return None
     user_message: dict[str, str] = dict(role="user", content="X")
     token_ids = apply_chat_template(
         conversation=[user_message],
         tokeniser=tokeniser,
-        tokenize=True,
+        tokenise=True,
         add_generation_prompt=False,
+        enable_thinking=generative_type == GenerativeType.REASONING,
     )
     assert isinstance(token_ids, list)
@@ -421,7 +425,7 @@ def get_first_label_token_mapping(
         for label in dataset_config.labels
     ]
-    # Tokenize some text containing each label, which we will use to extract the
+    # Tokenise some text containing each label, which we will use to extract the
     # first token of each label
     all_tokens: list[list[str]]
     if not has_chat_template(tokeniser=tokeniser):
@@ -440,11 +444,13 @@ def get_first_label_token_mapping(
                         dict(role="user", content=""),
                         dict(role="assistant", content=label),
                         # Adding extra user message as Mistral tokenisers require
-                        # conversamtions to end with a user message
+                        # conversations to end with a user message
                         dict(role="user", content=""),
                     ],
                     tokeniser=tokeniser,
-                    tokenize=True,
+                    tokenise=True,
+                    add_generation_prompt=True,
+                    enable_thinking=generative_type == GenerativeType.REASONING,
                 )
             )
             for label in local_labels
@@ -538,9 +544,10 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
 def apply_chat_template(
     conversation: list[dict[str, str]],
     tokeniser: "PreTrainedTokenizer",
-    tokenize: bool = False,
-    add_generation_prompt: bool = True,
-    **transformers_tokeniser_kwargs,
+    tokenise: bool,
+    add_generation_prompt: bool,
+    enable_thinking: bool,
+    **extra_kwargs,
 ) -> str | list[int]:
     """Apply the chat template to a prompt.
@@ -549,38 +556,47 @@ def apply_chat_template(
             The conversation to apply the chat template to.
         tokeniser:
             The tokeniser.
-        tokenize:
-            Whether to tokenize the resulting prompt, returning a list of token IDs
+        tokenise:
+            Whether to tokenise the resulting prompt, returning a list of token IDs
             instead of a string.
         add_generation_prompt:
             Whether to add a generation prompt at the end of the conversation. This is
             only relevant for regular Hugging Face tokenisers, as Mistral tokenisers
             always add a generation prompt.
-        **transformers_tokeniser_kwargs:
-            Additional keyword arguments to pass to the tokeniser, in case the tokeniser
-            is a regular Hugging Face tokeniser.
+        enable_thinking:
+            Whether to enable special handling for reasoning models, such as adding
+            special tokens for thinking. This is only relevant for regular Hugging
+            Face tokenisers, as Mistral tokenisers always handle reasoning models.
+        **extra_kwargs:
+            Extra keyword arguments to pass to the tokeniser's `apply_chat_template`
+            method. Only relevant for regular Hugging Face tokenisers.
     Returns:
         The prompt with the chat template applied, either as a string or a list of
-        token IDs, depending on the value of `tokenize`.
+        token IDs, depending on the value of `tokenise`.
     Raises:
         InvalidModel:
             If the tokeniser does not have a chat template.
     """
+    # Ensure that the first user message is not empty, as this can cause issues with
+    # Jinja2
+    conversation[0]["content"] = conversation[0]["content"] or " "
     if not has_chat_template(tokeniser=tokeniser):
         raise InvalidModel(
             "The tokeniser does not have a chat template, so cannot apply it."
         )
     elif isinstance(tokeniser, MistralCommonTokenizer):
         templated_prompt = tokeniser.apply_chat_template(
-            conversation=conversation, tokenize=tokenize
+            conversation=conversation, tokenize=tokenise
         )
     else:
         templated_prompt = tokeniser.apply_chat_template(
             conversation=conversation,
             add_generation_prompt=add_generation_prompt,
-            tokenize=tokenize,
-            **transformers_tokeniser_kwargs,
+            tokenize=tokenise,
+            enable_thinking=enable_thinking,
+            **extra_kwargs,
         )
     return templated_prompt

euroeval/utils.py CHANGED Viewed

@@ -4,7 +4,6 @@ import asyncio
 import gc
 import importlib
 import importlib.metadata
-import importlib.util
 import logging
 import os
 import random
@@ -25,11 +24,12 @@ from datasets.utils import disable_progress_bar
 from requests.exceptions import RequestException
 from transformers import logging as tf_logging
-from .exceptions import NaNValueInModelOutput
+from .exceptions import InvalidBenchmark, InvalidModel, NaNValueInModelOutput
 if t.TYPE_CHECKING:
     from types import TracebackType
+    from .data_models import ModelIdComponents
     from .types import Predictions
@@ -347,7 +347,8 @@ def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
     loop = asyncio.new_event_loop()
     try:
         asyncio.set_event_loop(loop)
-        return loop.run_until_complete(coroutine)
+        response = loop.run_until_complete(coroutine)
+        return response
     finally:
         loop.close()
         asyncio.set_event_loop(None)
@@ -457,3 +458,66 @@ def get_hf_token(api_key: str | None) -> str | bool:
             level=logging.DEBUG,
         )
         return False
+def extract_multiple_choice_labels(
+    prompt: str, candidate_labels: list[str]
+) -> list[str]:
+    """Extract multiple choice labels from a prompt.
+    Args:
+        prompt:
+            The prompt to extract the labels from.
+        candidate_labels:
+            The candidate labels to look for in the prompt.
+    Returns:
+        The extracted labels.
+    """
+    sample_candidate_labels: list[str] = list()
+    for candidate_label in candidate_labels:
+        candidate_label_match = re.search(
+            pattern=rf"\b{candidate_label}\. ", string=prompt, flags=re.IGNORECASE
+        )
+        if candidate_label_match is not None:
+            sample_candidate_labels.append(candidate_label)
+    if not sample_candidate_labels:
+        raise InvalidBenchmark(
+            "Could not extract any candidate labels from the prompt. Please ensure "
+            "that the candidate labels are present in the prompt, each followed by a "
+            "dot and a space (e.g., 'a. '). The candidate labels are: "
+            f"{', '.join(candidate_labels)}. Here is the prompt: {prompt!r}"
+        )
+    return sample_candidate_labels
+def split_model_id(model_id: str) -> "ModelIdComponents":
+    """Split a model ID into its components.
+    Args:
+        model_id:
+            The model ID to split.
+    Returns:
+        The split model ID.
+    Raises:
+        If the model ID is not valid.
+    """
+    # Importing here to avoid circular imports
+    from .data_models import ModelIdComponents
+    # Attempt to extract the model ID, revision, and param using regex
+    model_id_match = re.match(pattern=r"^[^@#]+", string=model_id)
+    revision_match = re.search(pattern=r"@([^@#]+)", string=model_id)
+    param_match = re.search(pattern=r"#([^@#]+)", string=model_id)
+    # If we cannot extract the model ID, raise an error
+    if model_id_match is None:
+        raise InvalidModel(f"The model ID {model_id!r} is not valid.")
+    model_id = model_id_match.group()
+    # Extract the revision and param and return the result
+    revision = revision_match.group(1) if revision_match is not None else "main"
+    param = param_match.group(1) if param_match is not None else None
+    return ModelIdComponents(model_id=model_id, revision=revision, param=param)

{euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 16.0.0
+Version: 16.1.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,10 +61,12 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown

euroeval-16.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,70 @@
+euroeval/__init__.py,sha256=8jqSCcDWvwwNb1guPi8cLAekPSOX9V8DpRx_v3-c19E,3730
+euroeval/benchmark_config_factory.py,sha256=NzNSiqix4hlVXk3xnyzdg2WDxomkectf97UWdVS3POo,11667
+euroeval/benchmarker.py,sha256=JkhvYxhVpQPcWmDLzwnB8Yy6tTqj3yfDWTefklbI7RM,50355
+euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
+euroeval/cli.py,sha256=wUGetj9Ld4wkS872ZOfYqHIJMh58o8L2MDi78wU5nxI,9099
+euroeval/constants.py,sha256=NN7kcwQdlDyyGFSrLjsL_qKVRyoRqZ9sKO5SjlgtRwA,2741
+euroeval/data_loading.py,sha256=F3fHyR7FoS_a1dx_DyqtcxdB-jxWwE3RCNRvWcp5z1c,4527
+euroeval/data_models.py,sha256=S-PATp4F1wBwvra6wtjlJFXxZbZB_vEpJHXcdTTKA70,27593
+euroeval/enums.py,sha256=SeFek-Lre2Q5sxbP5svqjDZFZR2vlJhg9dkRH4JvU1g,3436
+euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
+euroeval/finetuning.py,sha256=G86pxxjOAgtcEWpyYDwYOV9pM7WG2Uu9fu7GdDso8dI,11426
+euroeval/generation.py,sha256=MSrd0oIkoqwKsCOaIkY2CFF_urXLOfNR1OO5nMvcCpY,12476
+euroeval/generation_utils.py,sha256=OtEXLhI6L1vlbC768dH3xzj0qkokz43m0vswGKrRmBA,18061
+euroeval/languages.py,sha256=G2cJI8lDT7eOFHxNR9opJ6zWjdxFDwm8P8HY_4WKFI4,33815
+euroeval/model_cache.py,sha256=h61cL_fy2Sd1sqYZis5lAWqvQIfQXXt_v8QZeftKNkg,9226
+euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
+euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
+euroeval/scores.py,sha256=HQQqyjdgm853FZ_ifIdnSltKfBhsY7pOITov6F3Et5o,3165
+euroeval/speed_benchmark.py,sha256=3iz_bfJgAoJ9K2HNjufyrBMjHVT8PAjuY_NocBGwKe0,4044
+euroeval/tasks.py,sha256=3qEOBAMmfeqgXqlGkCKzQ-s0Yw-0-jPRgFZ97EZCFng,4535
+euroeval/tokenisation_utils.py,sha256=jRIi9m8XmGh3LeZna47AWmJI9U9m4ojXQynQTe7kzWc,21344
+euroeval/types.py,sha256=SCKOALV_-F1PAIwQ7qHNdSF1Uy29TSu9nIc1NYJGUUs,2754
+euroeval/utils.py,sha256=c0tFw1IXZIqgLU4EfY_k28iJ1ZlCZ_oFoKZH2sGCKYg,16499
+euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
+euroeval/benchmark_modules/base.py,sha256=mHF8XS6GGUXV-sJtxmI5WJBWPLMHuh-4Z4OWjC25x9Y,11566
+euroeval/benchmark_modules/fresh.py,sha256=TveSQiFBi3xXgCEQBdHwkUQ685PDkKW0y3G5Yt5rkeM,10655
+euroeval/benchmark_modules/hf.py,sha256=oBjVumnSM9PW7ZocQwCGLKpbeGFWLN_71DBotxZo1aY,44038
+euroeval/benchmark_modules/litellm.py,sha256=6EKjHnUoPCpuupISZHXqZsXLG8tyiA1-G12a5C6L8MM,64629
+euroeval/benchmark_modules/vllm.py,sha256=sYFdVzB9CZX6_sGI4xghDyXoVn6I95_nbeFUWeSMXcc,43132
+euroeval/dataset_configs/__init__.py,sha256=uuIZmElpJV8iupo5oDj3TeQhBDRANdWpLKYFASLirHA,2046
+euroeval/dataset_configs/danish.py,sha256=QABfgI7m-0-5AimDXegp5ssDSLcM2VrAI_RWsinSZP4,5631
+euroeval/dataset_configs/dutch.py,sha256=63Ro2yFym5MuIDXf5953vUYenw9B0kZSCmZbXjdy4Rs,5517
+euroeval/dataset_configs/english.py,sha256=7lS12Tj7FnMGkS4xj7UoZyymNX6PGXTVl5muPswIgAE,4737
+euroeval/dataset_configs/estonian.py,sha256=tdnz0gmMR9yO5rm3SsIz-Wd0LmlCvi3UJ2M5r4VwkSE,3093
+euroeval/dataset_configs/faroese.py,sha256=sFC25nwlPtnl6hwkPOxPkwVggPGTjw167YhSBnLl1EA,3039
+euroeval/dataset_configs/finnish.py,sha256=esb5nu4HAEdqiP7F9klmME-tkjme01Qd89TOxTB1S20,4390
+euroeval/dataset_configs/french.py,sha256=lZKhJcTpaG8n3y8u5KY61UfU9YzEHF9tIPKm8UakoBs,4720
+euroeval/dataset_configs/german.py,sha256=gF0idcfDt5Iy89ozwgEXEYR_ukyYurdQSS1KITPz5aM,5130
+euroeval/dataset_configs/icelandic.py,sha256=qX-szARxqzJ9l-h0k5iXirC5StpW_B3BOakZQ14zmpM,5797
+euroeval/dataset_configs/italian.py,sha256=tJ_-OYRJ8wJX7ZCwdE4KJIScn1ijYigAXK3lDTZTA3E,5004
+euroeval/dataset_configs/latvian.py,sha256=-zVftcd7Zl6MbrqL-zqBSixsIiPsbt5ZAqldE2wFOEI,2713
+euroeval/dataset_configs/norwegian.py,sha256=ccLM2Zkf5eaFH1K1KyzqoMwkVNcXgjMQTxIhPf4tl_E,7745
+euroeval/dataset_configs/polish.py,sha256=Z-9PT9KaopQUmBgFk5F85ve3pjQwTJqouG8IFgg5iqw,3672
+euroeval/dataset_configs/portuguese.py,sha256=gQ054SdLQ5fkm4IAP6Mdh5RcPDJPDITcuyaLKZit_9o,4089
+euroeval/dataset_configs/spanish.py,sha256=DvJlMK6OQg4qmxKzQA2IficlBMB7BafvxqIVuTKiZyw,4902
+euroeval/dataset_configs/swedish.py,sha256=YWHp7hbJ25o36csSg9uXaQCEJK1BPb7u2RQZiCe0lNs,5445
+euroeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,199
+euroeval/metrics/base.py,sha256=4vnRIPfKUwTNe0ZVm5YC2jQNecwchGUpN6nAH5cX0PM,2288
+euroeval/metrics/huggingface.py,sha256=b_Z_FUELQcmK7HeJh0zlAZs3pim1uNHnFLu7nvlZ4_A,5824
+euroeval/metrics/llm_as_a_judge.py,sha256=YCUHWK3_bkMEYvL7Q79ZAK3V0M1m5rq5zJYdtMxa4fs,9686
+euroeval/metrics/pipeline.py,sha256=Wcan3eDWV7t4WRXMPWCCe_JsA-fZnIfZU2ESinbbL2I,10284
+euroeval/metrics/speed.py,sha256=tLna031y0SVzAv6lvXBxf8IOSiw9dvLlonky2zM3MnE,1369
+euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
+euroeval/prompt_templates/linguistic_acceptability.py,sha256=pRR1QBnYt5DnfxQp6dw1OYFZfIct-1R9pfdgPGpjoco,8667
+euroeval/prompt_templates/multiple_choice.py,sha256=Q-8-ETqG-RZeLzR8v8WUBIN7djiNSfNpmYnZRUWcd84,6905
+euroeval/prompt_templates/named_entity_recognition.py,sha256=LT7J6Y9rUCJFimpnwujBZq_V5buSmXHJteIXbTOoaCE,16442
+euroeval/prompt_templates/reading_comprehension.py,sha256=ogzmhiSZO6egrdxxQiWz6a0XMdC0vws-lg5yRKQoYV0,8730
+euroeval/prompt_templates/sentiment_classification.py,sha256=BwnTpSdsAN_rL693ImgtKIRc5T_2G6ptWW0jCdC02NQ,9454
+euroeval/prompt_templates/summarization.py,sha256=4Sqwj6C7yNfqj4FFFCseJMLDoSZ13aIOgY0SjIzzsNo,6593
+euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
+euroeval/task_group_utils/multiple_choice_classification.py,sha256=i5sidJGAXnENRoB6pOelyaUeGP1qoxwPSzD-F9RLwWk,7106
+euroeval/task_group_utils/question_answering.py,sha256=eUczZntrC9lhCUQlwNQB49i-5Ei12cdRnrfq4pE-T7Y,27750
+euroeval/task_group_utils/sequence_classification.py,sha256=qWUUrh4X4jK2XfUzP4aoPDoJhVJifrnDEaaw_F48hig,16080
+euroeval/task_group_utils/text_to_text.py,sha256=7f4hGAs5WNJ9PmW1mLhjDMrPxrYAvw5axXsneiJop1w,4993
+euroeval/task_group_utils/token_classification.py,sha256=Yjai937ia1nZBMOWySqCXr_dA6WiVLGvmb4Hm_TU0Bg,17118
+euroeval-16.1.0.dist-info/METADATA,sha256=pYdW0IZwY8vatTA55EERxBK1kMaQuGhqzNys5xiSqsM,13729
+euroeval-16.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-16.1.0.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
+euroeval-16.1.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+euroeval-16.1.0.dist-info/RECORD,,

euroeval-16.0.0.dist-info/RECORD DELETED Viewed

@@ -1,69 +0,0 @@
-euroeval/__init__.py,sha256=MgFG1amMgiTJmK_hcQ7nnX-o4KFhlD1P5xKUBTloPCQ,3564
-euroeval/benchmark_config_factory.py,sha256=ZKzGkWr-Mr4wEMYNXUHsYkd2R-dxnNyETZJJ-Fq-my0,11386
-euroeval/benchmarker.py,sha256=YNqhl2QchqzbGMGu8QoJAG_mnYbcJ46ksfaS0x78fiw,49847
-euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
-euroeval/cli.py,sha256=RR45NiHMI9hphqBJ7Xopde-C18Be9JgJxgg6eYPFVMM,8594
-euroeval/constants.py,sha256=HWJ3PJRS-ZbAMXTvujiK8QP7IiS4RHkjnegv3oi52w0,2499
-euroeval/data_loading.py,sha256=F3fHyR7FoS_a1dx_DyqtcxdB-jxWwE3RCNRvWcp5z1c,4527
-euroeval/data_models.py,sha256=NdzD1ER3GHJp51UXLGTW8iTYwzZlITH2nO0vanTkEWU,24272
-euroeval/enums.py,sha256=V73E8FTL1aRz74OKcxokTYLnO7Q8HGs2QI0JPZI4qQo,3032
-euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
-euroeval/finetuning.py,sha256=G86pxxjOAgtcEWpyYDwYOV9pM7WG2Uu9fu7GdDso8dI,11426
-euroeval/generation.py,sha256=wm2u8fDGDgtWxCReG3N6v4_lLvo0OHTpR88ThGSRH7A,12139
-euroeval/generation_utils.py,sha256=vU-j9kjFDuPlSizEaRByx_XJyyAVpE8PdGOm9i--9zQ,14613
-euroeval/languages.py,sha256=G2cJI8lDT7eOFHxNR9opJ6zWjdxFDwm8P8HY_4WKFI4,33815
-euroeval/model_cache.py,sha256=HgXTgn4RMBqIjKaTmYzxu0f4NIwbXx1XJFbvbITqy4E,8686
-euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
-euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
-euroeval/scores.py,sha256=gJ7DSQVyE2_8qZxJPuUJcFk7Byj2D7nevE23kd4XMbA,3004
-euroeval/speed_benchmark.py,sha256=3iz_bfJgAoJ9K2HNjufyrBMjHVT8PAjuY_NocBGwKe0,4044
-euroeval/tasks.py,sha256=jl8HicriMSN_LfHANokVGFqzgV53QcJ5dmzb297xI04,4173
-euroeval/tokenization_utils.py,sha256=icEfttWReKRC5MbREOuxTHOPpuVvH6uHhnqz1w7qIyA,20565
-euroeval/types.py,sha256=SCKOALV_-F1PAIwQ7qHNdSF1Uy29TSu9nIc1NYJGUUs,2754
-euroeval/utils.py,sha256=O4JIROPfbA7MD9SbOY0CifoCckYjmdNjXYjOxDwBnwM,14149
-euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
-euroeval/benchmark_modules/base.py,sha256=vYW97bnlzqxxcIq6lY-zd0o6zxyDRMhT85jOhdKnoYE,11482
-euroeval/benchmark_modules/fresh.py,sha256=_iRTHt9qUkq7jPOlgwx7IwZG48dK4mjMrh7KiEHeUjE,10462
-euroeval/benchmark_modules/hf.py,sha256=HDXuVwt0kZUyL9x3aG5pEjSdGCRfzegqT0xKZYprjU0,43843
-euroeval/benchmark_modules/litellm.py,sha256=M6ct5ppcYfO-Il5VMRm3PuyAeQ-rtS22UKyRStLnqfM,59210
-euroeval/benchmark_modules/vllm.py,sha256=dTwGGOFQ7wqYXg7x2YBUJNQcO6OwqjTMBfUf5OveXNk,41289
-euroeval/dataset_configs/__init__.py,sha256=lEOr4kJzgtUymeNBVhd-VwdUK0YTUZ3GjUMlLz5fGWk,2010
-euroeval/dataset_configs/danish.py,sha256=3n9e0r-hYRI2hPOgLDMQsO8bPgZKjw7OcFCUsCvdmk4,5294
-euroeval/dataset_configs/dutch.py,sha256=tY7FDw7BmhXxNfI1hqfasxQXP0QbYTqknokTZ7gqdRY,5079
-euroeval/dataset_configs/english.py,sha256=Y4yc3AQu8WojqENj0sy4-rIlx1LhPnsCQ0DeonqDsVs,4128
-euroeval/dataset_configs/estonian.py,sha256=o13P_XkrdhLFCz9l8LJy-TSY3JIN7XmByxesEDiagnc,2879
-euroeval/dataset_configs/faroese.py,sha256=sFC25nwlPtnl6hwkPOxPkwVggPGTjw167YhSBnLl1EA,3039
-euroeval/dataset_configs/finnish.py,sha256=7iXjjpJ23tupvtXwJF3TH1Tzwhxw0RFaoBv38HclsJc,3950
-euroeval/dataset_configs/french.py,sha256=9ofGQpnjw0j_lPB0SuWMvbuWVZXfOvROMqZ03d-EAHs,4281
-euroeval/dataset_configs/german.py,sha256=qsJO2YCND8Kuc_atSWXjkoD2itUQNbUsExiGk7P0OnE,4459
-euroeval/dataset_configs/icelandic.py,sha256=qX-szARxqzJ9l-h0k5iXirC5StpW_B3BOakZQ14zmpM,5797
-euroeval/dataset_configs/italian.py,sha256=xoS_oIFXnTraiV9PX2dBsE1GyodlAbma5dEB7yM_Q8A,4564
-euroeval/dataset_configs/latvian.py,sha256=tibwTbe-atsRZEBbegJ6nbr1Oh4RthUYhZoHPVVawq0,2273
-euroeval/dataset_configs/norwegian.py,sha256=eTX0KpjH60FyLGrUTfspvNvYaL-Ytfw3DTFftlriVM0,7295
-euroeval/dataset_configs/portuguese.py,sha256=x-Idrdo_EtmB_xoabwKivKG091DvFEQEbO6MTcjZVqs,3646
-euroeval/dataset_configs/spanish.py,sha256=5m3Qh328YPhbN8jFPIy9Sa7ZWob02ToCWzlDoT8IsSw,4462
-euroeval/dataset_configs/swedish.py,sha256=j_I7ba9a0nXzEPvpnPTuNFEkS51pnUPrnRwcqGh7tu0,4715
-euroeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,199
-euroeval/metrics/base.py,sha256=4vnRIPfKUwTNe0ZVm5YC2jQNecwchGUpN6nAH5cX0PM,2288
-euroeval/metrics/huggingface.py,sha256=b_Z_FUELQcmK7HeJh0zlAZs3pim1uNHnFLu7nvlZ4_A,5824
-euroeval/metrics/llm_as_a_judge.py,sha256=YCUHWK3_bkMEYvL7Q79ZAK3V0M1m5rq5zJYdtMxa4fs,9686
-euroeval/metrics/pipeline.py,sha256=T65p2sxPnwh2WgCjqsqzvE3XOzizNY7rlSm8KPR7sCk,8883
-euroeval/metrics/speed.py,sha256=tLna031y0SVzAv6lvXBxf8IOSiw9dvLlonky2zM3MnE,1369
-euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
-euroeval/prompt_templates/linguistic_acceptability.py,sha256=9ZIyv_hfI2Aj20Uy9SY1izq5OBRV844PXPiZCNCOoEY,8207
-euroeval/prompt_templates/multiple_choice.py,sha256=TCMKB0xS5IEa8f4YEUjsoifcUpaIv4yOL4FisVvPwok,6423
-euroeval/prompt_templates/named_entity_recognition.py,sha256=_ZRVDcnbXvTs_C2NXy78oMbCLFDtW9SuxmvSVg51Umo,15554
-euroeval/prompt_templates/reading_comprehension.py,sha256=eRMN-kCT3wuImbuFXzZYfo5WiVhCFWJkCYwRUDtpeWo,8208
-euroeval/prompt_templates/sentiment_classification.py,sha256=eIXn-aAY7LKeXqxzMKoqdVbihA2f1RaNQk7DhceuQdQ,8887
-euroeval/prompt_templates/summarization.py,sha256=GvnKuYJKbJ_2QkdtSWp_h4RhfOXdq-7_yYeClJSPaTY,6137
-euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
-euroeval/task_group_utils/multiple_choice_classification.py,sha256=lNEOWi3ckLBnMP1QoSTxNxT-s6kBz2XH17mrmjQlv5s,7075
-euroeval/task_group_utils/question_answering.py,sha256=vdEbcZy7BE6ICA7kWkPYmPW4eVuIiZ_4uJRLUexDhwY,27750
-euroeval/task_group_utils/sequence_classification.py,sha256=K_hFWY6D5WR8-uy6ZikCq3ighHNHSyzW7A62vwDkwDs,16512
-euroeval/task_group_utils/text_to_text.py,sha256=7f4hGAs5WNJ9PmW1mLhjDMrPxrYAvw5axXsneiJop1w,4993
-euroeval/task_group_utils/token_classification.py,sha256=6bN9soT1kLthutCpqUT-jDmZZw9Mt7H3tjI4zVvE4BY,16469
-euroeval-16.0.0.dist-info/METADATA,sha256=uvzi8Bkgab8rKhgKavqFnv8rpL0KntFIYMZ7f1Joa0U,13544
-euroeval-16.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-euroeval-16.0.0.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
-euroeval-16.0.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
-euroeval-16.0.0.dist-info/RECORD,,

{euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

EuroEval 16.0.0__py3-none-any.whl → 16.1.0__py3-none-any.whl

Potentially problematic release.

EuroEval 16.0.0__py3-none-any.whl → 16.1.0__py3-none-any.whl