EuroEval 15.13.0-py3-none-any.whl → 15.15.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

euroeval/__init__.py CHANGED
@@ -86,6 +86,13 @@ os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"
  os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"


+ # Allow long max model length in vLLM. This happens when vLLM registers that the model
+ # has a shorter context length than the value we are inserting. But since we do a
+ # thorough check of the model's config before setting the context length, we trust our
+ # own checks and ignore the internal vLLM check.
+ os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
+
+
  # Avoid the "Unclosed client session" error when evaluating Ollama models with LiteLLM.
  # The error comes from the `aiohttp` package, and this environment variable forces the
  # use of `httpx` instead.
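As background, VLLM_ALLOW_LONG_MAX_MODEL_LEN makes vLLM accept a max_model_len larger than the one it derives from the model's own config instead of raising an error, so EuroEval's own context-length checks take precedence. A minimal sketch of the effect, with a placeholder model ID and length (not values used by EuroEval):

    import os

    # Must be set before the vLLM engine is created; EuroEval sets it at import time.
    os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"

    from vllm import LLM  # noqa: E402

    # Without the flag above, vLLM rejects a max_model_len larger than the value it
    # derives from the model config; with it, vLLM only logs a warning.
    llm = LLM(model="some-org/some-model", max_model_len=32_768)  # placeholder values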
euroeval/benchmark_modules/litellm.py CHANGED
@@ -31,6 +31,7 @@ from litellm.exceptions import (
  from litellm.llms.vertex_ai.common_utils import VertexAIError
  from litellm.router import Router
  from litellm.types.utils import ChoiceLogprobs
+ from litellm.utils import supports_reasoning, supports_response_schema
  from pydantic import conlist, create_model
  from requests.exceptions import RequestException
  from tqdm.asyncio import tqdm as tqdm_async
@@ -234,6 +235,8 @@ class LiteLLMModel(BenchmarkModule):
  pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
  ):
  type_ = GenerativeType.REASONING
+ elif supports_reasoning(model=self.model_config.model_id):
+ type_ = GenerativeType.REASONING
  else:
  type_ = GenerativeType.INSTRUCTION_TUNED

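For reference, supports_reasoning and supports_response_schema are LiteLLM capability lookups keyed on the model ID. A simplified sketch of the fallback order introduced above, with a stand-in model list and plain strings in place of EuroEval's own constants and enums:

    import re

    from litellm.utils import supports_reasoning

    REASONING_MODELS = ["o1", "o3", "deepseek-r1"]  # stand-in for EuroEval's own list


    def infer_generative_type(model_id: str) -> str:
        """Classify a model as reasoning or instruction-tuned."""
        # Models matching the explicit list take precedence.
        if re.search(pattern="|".join(REASONING_MODELS), string=model_id):
            return "reasoning"
        # Otherwise fall back to LiteLLM's capability metadata for the model.
        if supports_reasoning(model=model_id):
            return "reasoning"
        return "instruction-tuned"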
@@ -314,9 +317,7 @@ class LiteLLMModel(BenchmarkModule):
  "enable it.",
  level=logging.DEBUG,
  )
- elif litellm.utils.supports_response_schema(
- model=self.model_config.model_id
- ):
+ elif supports_response_schema(model=self.model_config.model_id):
  ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
  keys_and_their_types: dict[str, t.Any] = {
  tag_name: (conlist(str, max_length=5), ...)
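As an illustration of the response schema that is sent when supports_response_schema reports support: the NER answer format is a pydantic model built dynamically from the dataset's tag names (the tag names below are example values):

    import typing as t

    from pydantic import conlist, create_model

    ner_tag_names = ["person", "location", "organisation", "miscellaneous"]  # examples
    keys_and_their_types: dict[str, t.Any] = {
        tag_name: (conlist(str, max_length=5), ...) for tag_name in ner_tag_names
    }
    AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)

    # The JSON schema of this model is what gets passed to the provider as the
    # required response format.
    print(AnswerFormat.model_json_schema())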
@@ -361,7 +362,7 @@ class LiteLLMModel(BenchmarkModule):
  level=logging.DEBUG,
  )
  elif self.model_config.revision == "no-thinking":
- generation_kwargs["thinking"] = dict(type="disabled", budget_tokens=0)
+ generation_kwargs["thinking"] = dict(budget_tokens=0)
  log_once(
  f"Disabling thinking mode for model {self.model_config.model_id!r}",
  level=logging.DEBUG,
@@ -377,6 +378,19 @@ class LiteLLMModel(BenchmarkModule):
  # Drop generation kwargs that are not supported by the model
  litellm.drop_params = True

+ # First attempt is a test run with a single conversation to handle errors
+ # quickly
+ test_conversation = conversations[0]
+ _, failures = safe_run(
+ self._generate_async(
+ model_id=self.model_config.model_id,
+ conversations=[test_conversation],
+ **generation_kwargs,
+ )
+ )
+ for _, error in failures:
+ self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+
  all_responses: dict[int, "ModelResponse"] = {}
  conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
  enumerate(conversations)
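The new block is a fail-fast probe: one conversation is generated first so that configuration problems (an invalid thinking budget, an unsupported parameter, and so on) surface and can be handled before the full batch is launched. A simplified sketch of the pattern, with hypothetical helper names standing in for EuroEval's internals:

    import asyncio
    from typing import Any


    async def generate_one(conversation: list[dict[str, Any]], **kwargs: Any) -> str:
        """Hypothetical stand-in for a single LiteLLM completion call."""
        ...


    async def generate_all(
        conversations: list[list[dict[str, Any]]], **kwargs: Any
    ) -> list[str]:
        # Probe with the first conversation; an error here can be handled (for
        # example by adjusting kwargs) before time is spent on the whole batch.
        await generate_one(conversations[0], **kwargs)
        # Only then fan out over the full batch.
        return await asyncio.gather(*(generate_one(c, **kwargs) for c in conversations))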
@@ -477,6 +491,7 @@ class LiteLLMModel(BenchmarkModule):
  r"the thinking budget [0-9]+ is invalid. please choose a value between "
  r"[0-9]+ and ([0-9]+)\."
  )
+ requires_thinking_disabled_messages = ["thinking.type: Field required"]

  if any(msg.lower() in error_msg for msg in stop_messages):
  log_once(
@@ -557,6 +572,18 @@ class LiteLLMModel(BenchmarkModule):
  type="enabled", budget_tokens=thinking_budget - 1
  )
  return
+ elif (
+ any(msg.lower() in error_msg for msg in requires_thinking_disabled_messages)
+ and self.generative_type != GenerativeType.REASONING
+ ):
+ log_once(
+ f"The model {model_id!r} requires the `thinking.type` field to be "
+ f"set to `disabled` rather than just setting `budget_tokens` to 0. "
+ "Setting `thinking.type` to `disabled`.",
+ level=logging.DEBUG,
+ )
+ generation_kwargs["thinking"] = dict(type="disabled")
+ return
  elif isinstance(
  error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
  ):
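For reference, these are the two shapes of the Anthropic-style thinking parameter involved in this fallback, as passed through LiteLLM's generation kwargs (whether a given provider accepts each shape is provider-specific):

    # Initial attempt for the "no-thinking" revision: only a zero budget.
    generation_kwargs = {"thinking": {"budget_tokens": 0}}

    # Fallback when the provider responds with "thinking.type: Field required".
    generation_kwargs = {"thinking": {"type": "disabled"}}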
euroeval/benchmark_modules/vllm.py CHANGED
@@ -77,10 +77,7 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
  destroy_model_parallel,
  )
  from vllm.lora.request import LoRARequest
-
- if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
- from outlines.models.vllm import adapt_tokenizer
- from outlines.processors.structured import JSONLogitsProcessor
+ from vllm.sampling_params import GuidedDecodingParams

  if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
  import ray
@@ -327,7 +324,7 @@ class VLLMModel(HuggingFaceEncoderModel):
  if end_of_chat_token:
  stop_tokens.append(end_of_chat_token)

- logits_processor = None
+ structured_generation_schema = None
  if self.dataset_config.task in TASKS_USING_JSON:
  if self.generative_type == GenerativeType.REASONING:
  log_once(
@@ -342,15 +339,13 @@ class VLLMModel(HuggingFaceEncoderModel):
  tag_name: (conlist(str, max_length=5), ...)
  for tag_name in ner_tag_names
  }
- pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
- logits_processor = JSONLogitsProcessor(
- schema=pydantic_class,
- tokenizer=adapt_tokenizer(tokenizer=self._tokenizer), # type: ignore
- whitespace_pattern=r" ?",
+ answer_format_class = create_model(
+ "AnswerFormat", **keys_and_their_types
  )
+ structured_generation_schema = answer_format_class.model_json_schema()
  log_once(
  "Using structured generation with the JSON schema "
- f"{pydantic_class.model_json_schema()}",
+ f"{structured_generation_schema}",
  level=logging.DEBUG,
  )

@@ -374,7 +369,11 @@ class VLLMModel(HuggingFaceEncoderModel):
  logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
  temperature=0.0,
  stop=[stop_token for stop_token in stop_tokens if stop_token],
- logits_processors=[logits_processor] if logits_processor else None,
+ guided_decoding=(
+ GuidedDecodingParams(json=structured_generation_schema)
+ if structured_generation_schema
+ else None
+ ),
  )

  # If any of the prompts are empty then we need to replace them with a BOS token
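The structured-generation path thus moves from outlines' JSONLogitsProcessor to vLLM's built-in guided decoding, which in recent vLLM releases is typically backed by xgrammar. A minimal sketch of the new API with a toy schema and a placeholder model ID (EuroEval passes the AnswerFormat schema built above):

    from vllm import LLM, SamplingParams
    from vllm.sampling_params import GuidedDecodingParams

    # Toy JSON schema; EuroEval passes the pydantic AnswerFormat schema here.
    schema = {
        "type": "object",
        "properties": {"person": {"type": "array", "items": {"type": "string"}}},
        "required": ["person"],
    }

    sampling_params = SamplingParams(
        temperature=0.0,
        guided_decoding=GuidedDecodingParams(json=schema),
    )

    llm = LLM(model="some-org/some-model")  # placeholder model ID
    outputs = llm.generate(["List the people mentioned: ..."], sampling_params)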
euroeval/data_models.py CHANGED
@@ -259,7 +259,7 @@ class BenchmarkResult(pydantic.BaseModel):
  transformers_version: str | None = get_package_version("transformers")
  torch_version: str | None = get_package_version("torch")
  vllm_version: str | None = get_package_version("vllm")
- outlines_version: str | None = get_package_version("outlines")
+ xgrammar_version: str | None = get_package_version("xgrammar")

  @classmethod
  def from_dict(cls, config: dict) -> "BenchmarkResult":
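The recorded field switches from outlines_version to xgrammar_version, matching the structured-generation backend change above. A hedged sketch of what such a version lookup typically looks like (the real get_package_version helper may differ):

    import importlib.metadata


    def get_package_version(package_name: str) -> str | None:
        """Return the installed version of a package, or None if not installed."""
        try:
            return importlib.metadata.version(package_name)
        except importlib.metadata.PackageNotFoundError:
            return None


    xgrammar_version = get_package_version("xgrammar")  # None if xgrammar is absent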
euroeval/dataset_configs/danish.py CHANGED
@@ -128,3 +128,13 @@ MULTI_WIKI_QA_DA_CONFIG = DatasetConfig(
  languages=[DA],
  unofficial=True,
  )
+
+ GOLDENSWAG_DA_CONFIG = DatasetConfig(
+ name="goldenswag-da",
+ pretty_name="the truncated version of the Danish common-sense reasoning "
+ "dataset GoldenSwag-da, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-da-mini",
+ task=COMMON_SENSE,
+ languages=[DA],
+ unofficial=True,
+ )
euroeval/dataset_configs/dutch.py CHANGED
@@ -120,3 +120,13 @@ MULTI_WIKI_QA_NL_CONFIG = DatasetConfig(
  languages=[NL],
  unofficial=True,
  )
+
+ GOLDENSWAG_NL_CONFIG = DatasetConfig(
+ name="goldenswag-nl",
+ pretty_name="the truncated version of the Dutch common-sense reasoning "
+ "dataset GoldenSwag-nl, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-nl-mini",
+ task=COMMON_SENSE,
+ languages=[NL],
+ unofficial=True,
+ )
euroeval/dataset_configs/finnish.py CHANGED
@@ -78,3 +78,13 @@ MULTI_WIKI_QA_FI_CONFIG = DatasetConfig(
  languages=[FI],
  unofficial=True,
  )
+
+ GOLDENSWAG_FI_CONFIG = DatasetConfig(
+ name="goldenswag-fi",
+ pretty_name="the truncated version of the Finnish common-sense reasoning "
+ "dataset GoldenSwag-fi, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-fi-mini",
+ task=COMMON_SENSE,
+ languages=[FI],
+ unofficial=True,
+ )
euroeval/dataset_configs/french.py CHANGED
@@ -91,3 +91,13 @@ MULTI_WIKI_QA_FR_CONFIG = DatasetConfig(
  languages=[FR],
  unofficial=True,
  )
+
+ GOLDENSWAG_FR_CONFIG = DatasetConfig(
+ name="goldenswag-fr",
+ pretty_name="the truncated version of the French common-sense reasoning "
+ "dataset GoldenSwag-fr, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-fr-mini",
+ task=COMMON_SENSE,
+ languages=[FR],
+ unofficial=True,
+ )
euroeval/dataset_configs/german.py CHANGED
@@ -99,3 +99,13 @@ MULTI_WIKI_QA_DE_CONFIG = DatasetConfig(
  languages=[DE],
  unofficial=True,
  )
+
+ GOLDENSWAG_DE_CONFIG = DatasetConfig(
+ name="goldenswag-de",
+ pretty_name="the truncated version of the German common-sense reasoning "
+ "dataset GoldenSwag-de, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-de-mini",
+ task=COMMON_SENSE,
+ languages=[DE],
+ unofficial=True,
+ )
euroeval/dataset_configs/italian.py CHANGED
@@ -99,3 +99,13 @@ MULTI_WIKI_QA_IT_CONFIG = DatasetConfig(
  languages=[IT],
  unofficial=True,
  )
+
+ GOLDENSWAG_IT_CONFIG = DatasetConfig(
+ name="goldenswag-it",
+ pretty_name="the truncated version of the Italian common-sense reasoning "
+ "dataset GoldenSwag-it, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-it-mini",
+ task=COMMON_SENSE,
+ languages=[IT],
+ unofficial=True,
+ )
euroeval/dataset_configs/spanish.py CHANGED
@@ -97,3 +97,13 @@ MULTI_WIKI_QA_ES_CONFIG = DatasetConfig(
  languages=[ES],
  unofficial=True,
  )
+
+ GOLDENSWAG_ES_CONFIG = DatasetConfig(
+ name="goldenswag-es",
+ pretty_name="the truncated version of the Spanish common-sense reasoning "
+ "dataset GoldenSwag-es, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-es-mini",
+ task=COMMON_SENSE,
+ languages=[ES],
+ unofficial=True,
+ )
euroeval/dataset_configs/swedish.py CHANGED
@@ -108,3 +108,13 @@ MULTI_WIKI_QA_SV_CONFIG = DatasetConfig(
  languages=[SV],
  unofficial=True,
  )
+
+ GOLDENSWAG_SV_CONFIG = DatasetConfig(
+ name="goldenswag-sv",
+ pretty_name="the truncated version of the Swedish common-sense reasoning "
+ "dataset GoldenSwag-sv, translated from the English GoldenSwag dataset",
+ huggingface_id="EuroEval/goldenswag-sv-mini",
+ task=COMMON_SENSE,
+ languages=[SV],
+ unofficial=True,
+ )
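Each of the eight new GoldenSwag configs is marked unofficial=True, so it is presumably only evaluated when requested explicitly. A hedged sketch of running one of them through the Python API (argument names follow the EuroEval documentation; the model ID is a placeholder):

    from euroeval import Benchmarker

    benchmarker = Benchmarker()
    benchmarker.benchmark(model="some-org/some-model", dataset="goldenswag-da")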
euroeval/generation.py CHANGED
@@ -235,7 +235,7 @@ def generate_single_iteration(
  )

  itr_scores: dict[str, float] = model.compute_metrics(
- model_outputs_and_labels=(all_preds, ground_truth)
+ model_outputs_and_labels=(all_preds, ground_truth), dataset=dataset
  )

  return itr_scores
euroeval/human_evaluation.py CHANGED
@@ -620,7 +620,8 @@ class HumanEvaluator:
  )
  ground_truth = self.active_dataset["label"]
  itr_scores: dict[str, float] = self.compute_metrics(
- model_outputs_and_labels=(all_preds, ground_truth)
+ model_outputs_and_labels=(all_preds, ground_truth),
+ dataset=self.active_dataset,
  )

  # We reverse the order, as the Info messages are printed in reverse order
euroeval/metrics.py CHANGED
@@ -14,6 +14,7 @@ from .exceptions import InvalidBenchmark
  from .utils import HiddenPrints

  if t.TYPE_CHECKING:
+ from datasets.arrow_dataset import Dataset
  from evaluate import EvaluationModule

  logger = logging.getLogger(__name__)
@@ -49,7 +50,9 @@ class Metric(abc.ABC):
  )

  @abc.abstractmethod
- def __call__(self, predictions: t.Sequence, references: t.Sequence) -> float | None:
+ def __call__(
+ self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
+ ) -> float | None:
  """Calculate the metric score.

  Args:
@@ -57,6 +60,9 @@ class Metric(abc.ABC):
  The model predictions.
  references:
  The ground truth references.
+ dataset:
+ The dataset used for evaluation. This is only used in case any
+ additional metadata is used to compute the metrics.

  Returns:
  The calculated metric score, or None if the score should be ignored.
@@ -125,7 +131,9 @@ class HuggingFaceMetric(Metric):
  )
  self.metric: "EvaluationModule | None" = None

- def __call__(self, predictions: t.Sequence, references: t.Sequence) -> float | None:
+ def __call__(
+ self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
+ ) -> float | None:
  """Calculate the metric score.

  Args:
@@ -133,6 +141,9 @@ class HuggingFaceMetric(Metric):
  The model predictions.
  references:
  The ground truth references.
+ dataset:
+ The dataset used for evaluation. This is only used in case any
+ additional metadata is used to compute the metrics.

  Returns:
  The calculated metric score, or None if the score should be ignored.
@@ -213,7 +224,9 @@ class LLMAsAJudgeMetric(Metric):
  self.condition_formatting_fn = condition_formatting_fn
  self.system_prompt = system_prompt

- def __call__(self, predictions: t.Sequence, references: t.Sequence) -> float | None:
+ def __call__(
+ self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
+ ) -> float | None:
  """Calculate the metric score using the judge model.

  Args:
@@ -221,6 +234,9 @@ class LLMAsAJudgeMetric(Metric):
  The model predictions.
  references:
  The ground truth references.
+ dataset:
+ The dataset used for evaluation. This is only used in case any
+ additional metadata is used to compute the metrics.

  Returns:
  The calculated metric score, or None if the score should be ignored.
@@ -343,7 +359,7 @@ class SpeedMetric(Metric):
  postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
  )

- def __call__(self, _: t.Sequence, __: t.Sequence) -> float | None:
+ def __call__(self, _: t.Sequence, __: t.Sequence, ___: "Dataset") -> float | None:
  """Not used with the speed metric, but required for consistency."""
  raise NotImplementedError

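Every metric's __call__ now receives the evaluation dataset as a third argument, so metrics that need per-example metadata can reach it while existing metrics simply ignore it. A hedged sketch of a callable following the new signature (not EuroEval's actual base class, which adds naming and postprocessing machinery):

    import typing as t

    from datasets.arrow_dataset import Dataset


    class ExactMatch:
        """Toy metric illustrating the updated call signature."""

        def __call__(
            self, predictions: t.Sequence, references: t.Sequence, dataset: "Dataset"
        ) -> float | None:
            # The dataset is available for metrics that need extra metadata; a plain
            # exact-match score does not use it.
            if len(predictions) == 0:
                return None
            correct = sum(p == r for p, r in zip(predictions, references))
            return correct / len(predictions)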
euroeval/prompt_templates/multiple_choice.py CHANGED
@@ -69,7 +69,7 @@ MULTIPLE_CHOICE_TEMPLATES = {
  IT: PromptConfig(
  default_prompt_prefix="Le seguenti sono domande a scelta multipla "
  "(con relative risposte).",
- default_prompt_template="Domanda: {text}\nRéponse: {label}",
+ default_prompt_template="Domanda: {text}\nRisposta: {label}",
  default_instruction_prompt="Domanda: {text}\n\nRispondete alla domanda "
  "precedente con {labels_str}, e nient'altro.",
  default_prompt_label_mapping="auto",
euroeval/task_group_utils/question_answering.py CHANGED
@@ -149,6 +149,7 @@ class QuestionAnsweringTrainer(Trainer):
  def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
+ dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics needed for evaluation.

@@ -158,6 +159,9 @@ def compute_metrics(
  contains the true labels.
  dataset_config:
  The configuration of the dataset.
+ dataset:
+ The dataset used for evaluation. This is only used in case any additional
+ metadata is used to compute the metrics.

  Returns:
  A dictionary with the names of the metrics as keys and the metric values as
@@ -181,7 +185,9 @@

  results: dict[str, float] = dict()
  for metric in dataset_config.task.metrics:
- score: float | None = metric(predictions=predictions, references=labels)
+ score: float | None = metric(
+ predictions=predictions, references=labels, dataset=dataset
+ )

  # The metric returns None if we are running on multi-GPU and the current
  # process is not the main process
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -11,6 +11,7 @@ from ..exceptions import InvalidBenchmark
  from ..utils import log_once, raise_if_model_output_contains_nan_values

  if t.TYPE_CHECKING:
+ from datasets.arrow_dataset import Dataset
  from transformers.trainer_utils import EvalPrediction

  from ..data_models import DatasetConfig, GenerativeModelOutput
@@ -23,6 +24,7 @@ logger = logging.getLogger("euroeval")
  def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
+ dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics needed for evaluation.

@@ -32,6 +34,9 @@ def compute_metrics(
  contains the true labels.
  dataset_config:
  The configuration of the dataset.
+ dataset:
+ The dataset used for evaluation. This is only used in case any additional
+ metadata is used to compute the metrics.

  Returns:
  A dictionary with the names of the metrics as keys and the metric values as
@@ -73,7 +78,9 @@

  results: dict[str, float] = dict()
  for metric in dataset_config.task.metrics:
- score: float | None = metric(predictions=predictions, references=label_ids)
+ score: float | None = metric(
+ predictions=predictions, references=label_ids, dataset=dataset
+ )

  # The metric returns None if we are running on multi-GPU and the current
  # process is not the main process
euroeval/task_group_utils/text_to_text.py CHANGED
@@ -11,6 +11,7 @@ from ..metrics import HuggingFaceMetric
  from ..utils import raise_if_model_output_contains_nan_values

  if t.TYPE_CHECKING:
+ from datasets.arrow_dataset import Dataset
  from transformers.trainer_utils import EvalPrediction

  from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
@@ -24,6 +25,7 @@ def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
  benchmark_config: "BenchmarkConfig",
+ dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics needed for evaluation.

@@ -35,6 +37,9 @@ def compute_metrics(
  The configuration of the dataset.
  benchmark_config:
  The configuration of the benchmark.
+ dataset:
+ The dataset used for evaluation. This is only used in case any additional
+ metadata is used to compute the metrics.

  Returns:
  A dictionary with the names of the metrics as keys and the metric values as
@@ -69,7 +74,9 @@

  while True:
  try:
- score: float | None = metric(predictions=predictions, references=labels)
+ score: float | None = metric(
+ predictions=predictions, references=labels, dataset=dataset
+ )
  break
  except Exception as e:
  oom_error = [
euroeval/task_group_utils/token_classification.py CHANGED
@@ -12,6 +12,7 @@ from ..exceptions import InvalidBenchmark
  from ..utils import raise_if_model_output_contains_nan_values

  if t.TYPE_CHECKING:
+ from datasets.arrow_dataset import Dataset
  from transformers.tokenization_utils import PreTrainedTokenizer
  from transformers.tokenization_utils_base import BatchEncoding
  from transformers.trainer_utils import EvalPrediction
@@ -27,6 +28,7 @@ def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  has_misc_tags: bool,
  dataset_config: "DatasetConfig",
+ dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics needed for evaluation.

@@ -38,6 +40,9 @@ def compute_metrics(
  Whether the dataset has MISC tags.
  dataset_config:
  The configuration of the dataset.
+ dataset:
+ The dataset used for evaluation. This is only used in case any additional
+ metadata is used to compute the metrics.

  Returns:
  A dictionary with the names of the metrics as keys and the metric values as
@@ -136,7 +141,9 @@
  for metric in dataset_config.task.metrics
  if metric.name == "micro_f1"
  )
- micro_f1_score = metric(predictions=predictions, references=list(labels))
+ micro_f1_score = metric(
+ predictions=predictions, references=list(labels), dataset=dataset
+ )

  # Compute the metrics without MISC tags
  # We manually set the F1 metric to be 100% if both the labels and the models
@@ -158,7 +165,7 @@
  if metric.name == "micro_f1_no_misc"
  )
  micro_f1_no_misc_score = metric(
- predictions=predictions_no_misc, references=labels_no_misc
+ predictions=predictions_no_misc, references=labels_no_misc, dataset=dataset
  )

  # Raise error if the metrics are invalid
euroeval/types.py CHANGED
@@ -5,6 +5,7 @@ import typing as t
  from transformers.trainer_utils import EvalPrediction

  if t.TYPE_CHECKING:
+ from datasets.arrow_dataset import Dataset
  from numpy.typing import NDArray

  from .data_models import GenerativeModelOutput
@@ -25,12 +26,16 @@ class ComputeMetricsFunction(t.Protocol):
  "NDArray | list[str] | list[list[str]]",
  "NDArray | list[str] | list[list[str]]",
  ],
+ dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics.

  Args:
  model_outputs_and_labels:
  The model outputs and labels.
+ dataset:
+ The dataset used for evaluation. This is only used in case any
+ additional metadata is used to compute the metrics.

  Returns:
  The computed metrics.
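Correspondingly, any callable bound as a ComputeMetricsFunction now has to accept the dataset argument as well. A hedged sketch of a conforming function, with types simplified relative to the real protocol:

    import typing as t


    def compute_accuracy(
        model_outputs_and_labels: tuple[list[str], list[str]],
        dataset: t.Any,  # datasets.arrow_dataset.Dataset in the real protocol
    ) -> dict[str, float]:
        predictions, labels = model_outputs_and_labels
        if not labels:
            return {"accuracy": 0.0}
        correct = sum(pred == label for pred, label in zip(predictions, labels))
        return {"accuracy": correct / len(labels)}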
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 15.13.0
+ Version: 15.15.0
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,13 +61,11 @@ Provides-Extra: all
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
  Requires-Dist: gradio>=4.26.0; extra == 'all'
- Requires-Dist: outlines>=0.1.11; extra == 'all'
- Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'all'
+ Requires-Dist: vllm>=0.10.0; (platform_system == 'Linux') and extra == 'all'
  Provides-Extra: generative
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
- Requires-Dist: outlines>=0.1.11; extra == 'generative'
- Requires-Dist: vllm>=0.9.1; (platform_system == 'Linux') and extra == 'generative'
+ Requires-Dist: vllm>=0.10.0; (platform_system == 'Linux') and extra == 'generative'
  Provides-Extra: human-evaluation
  Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
  Provides-Extra: test
@@ -1,19 +1,19 @@
- euroeval/__init__.py,sha256=fZyR9R3C3vwGJS3CrCJ6ySr_FDnMu_Aqnz0FdadWEEs,3399
+ euroeval/__init__.py,sha256=ZZoVc6tKWz_h8Pw2n26PV-q_Gd4TM_02O235ZBRUNJw,3756
  euroeval/benchmark_config_factory.py,sha256=jKC8bEzJSGGCcG8aWsPxiyHX6fjOQYQWvkp1MIUuHYM,11564
  euroeval/benchmarker.py,sha256=SDBzdCa4I8u1XDeN_1mKTFzfaaQbbY_oWcHt3niADxk,48497
  euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
  euroeval/cli.py,sha256=h81Lswm_q9htkYz-GQQQVIsdsUPnfe3LDH8AZdBcpKs,8602
  euroeval/constants.py,sha256=0KHrH74zGM8vNF4uZG_a5qFJRZH5YgyQULYZtCKlo68,2452
  euroeval/data_loading.py,sha256=DP-cqwN_d0Y-KaN8P8c3fDr6PX80UYROHgRwX82ix4w,4156
- euroeval/data_models.py,sha256=gPHyIoN2A5_O-cJgyb6jhn6enH8zsiIBI09W_wdHMQs,22031
+ euroeval/data_models.py,sha256=qSCNq3PV7qo--gibqEvvu4cXkEkhGGAb6UiZW8U_KiU,22031
  euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
  euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
  euroeval/finetuning.py,sha256=BrPZ-6qFY8K-dwfaRwNetVYfYburoQwLQty6pn6iP_s,11340
- euroeval/generation.py,sha256=1fqFEWwM2RzI3uPZem95VFWbN8EfrKZQTrHEP34ihHs,11622
+ euroeval/generation.py,sha256=lmvu__6w3cLxi0zBtXSlyZvV8CJpV3BdajUoIEA9ElA,11639
  euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
- euroeval/human_evaluation.py,sha256=Jtz3K5Lqne48wPZWf4EAd3d-n_wX27nGJHigjhV1D7s,27537
+ euroeval/human_evaluation.py,sha256=FLuTl1DHxCiWB_laVVQHIH86yXvA_ZeNNSrUmyExZXI,27579
  euroeval/languages.py,sha256=cr_Z5jtaHb2XY0zeOhuk3ATHX74PODzt6gMPC2zMD7c,8594
- euroeval/metrics.py,sha256=nxosyoRjlk7TcoAOkjU7zx2TB43b9tA8M1m4V1s5eKU,15516
+ euroeval/metrics.py,sha256=d59VRsjGFA2h2s4J8zRgdGxCu_pA3YhfvKxkK6pN6GI,16185
  euroeval/model_cache.py,sha256=HgXTgn4RMBqIjKaTmYzxu0f4NIwbXx1XJFbvbITqy4E,8686
  euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
  euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
@@ -21,43 +21,43 @@ euroeval/scores.py,sha256=TatSbjia7Zwj71gQFyV_gCHyppMbOgeaZgNCib8G86k,2849
  euroeval/speed_benchmark.py,sha256=6bFGeMmtdl_6owkxNQ3ZKiyQQS58k0NApzlsbDgBW5s,4037
  euroeval/tasks.py,sha256=btxf29M5rUP7JjBl6u9aQlHQAxrJNP4bRbdEQtDnmDA,3376
  euroeval/tokenization_utils.py,sha256=LxgGs7juS5PuMYt5LL2X6eVXdtnpi-A2jFxqcWpF6NA,17931
- euroeval/types.py,sha256=EIYMNOqqHqibnbNw-fvdst6HwTvq32gtxhr7jL7i-xM,2511
+ euroeval/types.py,sha256=SCKOALV_-F1PAIwQ7qHNdSF1Uy29TSu9nIc1NYJGUUs,2754
  euroeval/utils.py,sha256=5R7y67xe0ODaje7k8nOu2AFS3Ph2gcsiWpIq5rjSSuA,11613
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
  euroeval/benchmark_modules/base.py,sha256=D1oKD16KBvxEoBUfqwvzvcDc1hx6letdD3v1PnBmF4A,10669
  euroeval/benchmark_modules/fresh.py,sha256=sg_AXNPApFObCzCRWhCgKxfr-eqQsT6Ri0xx0_Yy5JM,10293
  euroeval/benchmark_modules/hf.py,sha256=-W_bWEdm0zePkn4nDz4l0T4hhJJnlfwHrtIO3m5BrUs,44725
- euroeval/benchmark_modules/litellm.py,sha256=_gKBbJsXzo_cHJVaeuQpHRBENEZUGS_vcC-uGIhhmHA,52111
- euroeval/benchmark_modules/vllm.py,sha256=kq3PMUuRT0NOky6XSHl1JeHTDGehwcub0HcGC5S_Wv4,38834
+ euroeval/benchmark_modules/litellm.py,sha256=qv-k2ntk48OF4ikevQ95k4zLbBkZYOZ2z-GAisA-tFY,53374
+ euroeval/benchmark_modules/vllm.py,sha256=Uq81tgNSkajuawdJ1lH1s9Te9wubYd-CyBbM-B5YZcA,38693
  euroeval/dataset_configs/__init__.py,sha256=EbjEyHwBtSztASl8_xblD8hessruDdV4Eg1vXrmGOuY,1935
- euroeval/dataset_configs/danish.py,sha256=-y-n08hTApwTdSVdjRlZYa3gOX92cTGhg8xsuG-Lhww,3691
- euroeval/dataset_configs/dutch.py,sha256=siyFeEKYx2gBpyqQPtOZ0cD8FTsIMUqzRX5xrQfrNXI,3480
+ euroeval/dataset_configs/danish.py,sha256=0lDtvpgszXY1XaPjTU8yA3oNCU8W2OllvrBWgn6pkhk,4027
+ euroeval/dataset_configs/dutch.py,sha256=ekZxLL9d09BUMijCxy9EFa2heNQVvySPySOjhWdtJc8,3815
  euroeval/dataset_configs/english.py,sha256=uQAaGWpHk8xqFCeIhmmPXYTb1cZomeEdRaRe9qIZQrg,2858
  euroeval/dataset_configs/faroese.py,sha256=gkgxQTWGFbfg9Eo1z-NSLROgKDcaij9tAN2mfgtrt0M,1647
- euroeval/dataset_configs/finnish.py,sha256=OyveLgyii0hOlo6HZsqAq4rwDrj8tl2qstRfQKugURo,2342
- euroeval/dataset_configs/french.py,sha256=DKKZEtohWkw_ouBaxWcPzp-K6NhQNtvCKxj8NLbIpUc,2678
- euroeval/dataset_configs/german.py,sha256=3bfRgkqIGkAhcw4kwcJN9PKuJSmi1r6AFTJY-IWKgWM,2856
+ euroeval/dataset_configs/finnish.py,sha256=UZwy0_d17O2L-v2AKOu3OlDwFPcLGTZNAOt7ZKlr4K8,2679
+ euroeval/dataset_configs/french.py,sha256=Hei2M4bGIz8hVtaPKQlQATcmK-0bFBNEocEszR3gia0,3014
+ euroeval/dataset_configs/german.py,sha256=sRYtOl6CYf4kZkeINfff6xoKBG4OsDxb2b72lKwELGc,3192
  euroeval/dataset_configs/icelandic.py,sha256=g21IHjcwEZvf_yJ9PobeuBOqRiLOk0oCdEjY34g-UMk,4497
- euroeval/dataset_configs/italian.py,sha256=rHLMkSXT0kFoQlkwHODxO50WBRIfGtkAnW_C-sfIu74,2957
+ euroeval/dataset_configs/italian.py,sha256=4SEmdUyfGbbwMPhv_9nL3JNJtoDKHLAlWuvr7Ihmi9o,3294
  euroeval/dataset_configs/norwegian.py,sha256=-WvQM44xCwjrqBzlAy4rjf6v87fGera2JmZV_069TeQ,6003
  euroeval/dataset_configs/portuguese.py,sha256=3SqbwD0PNTILGALzh50pVoEwC-spRD75ZeE2NEj151E,2367
- euroeval/dataset_configs/spanish.py,sha256=VKfBIpBRR38ckuULw7Ftmc-0smsm6GshUAik2-Y1Npw,2855
- euroeval/dataset_configs/swedish.py,sha256=WpExi4TJqy_Ruwy4Kvde94jM605vT_88el_KKUzLV4E,3108
+ euroeval/dataset_configs/spanish.py,sha256=Bm0Z19Mh2qYXR0RIRlqEkzfVb5KiqJRectfuY7JLql4,3192
+ euroeval/dataset_configs/swedish.py,sha256=js4paNsuC0nQzPpf6_BzHBf7MT60XUpP1-qM2uxRtQs,3445
  euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
  euroeval/prompt_templates/linguistic_acceptability.py,sha256=ZN71BEt4HAhSYY-GWjh-S-iVvq5AODQJThkrjDhy4oM,7138
- euroeval/prompt_templates/multiple_choice.py,sha256=F9ItGQtnaaez15A8MQ1UCpKRDsLM-AZyRdYetGAofa0,5494
+ euroeval/prompt_templates/multiple_choice.py,sha256=wHnQCE5bv947L6hSK5zJitE37V-PbuNYAp156mWaIYA,5494
  euroeval/prompt_templates/named_entity_recognition.py,sha256=ga21s9T4_Hhbf88boWm7gnL7OgD7txuS_EeDgXaxEoE,13602
  euroeval/prompt_templates/reading_comprehension.py,sha256=3Nch-9zHfUDIwy-k5mP-TRhHQRQ9nad8HdhpJ1S8nGc,7072
  euroeval/prompt_templates/sentiment_classification.py,sha256=2Xsmj8lbaAXACHhwbbR4dWhoKyKB87TqpMO-ssQ-Djo,7649
  euroeval/prompt_templates/summarization.py,sha256=I98LlUOBVa_xo02npq7BWKKZOXGqm-_15i64QzbEsb0,5334
  euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
  euroeval/task_group_utils/multiple_choice_classification.py,sha256=yfy8lczpZ_MY-Y4FQx3Et9vEUpuD3YMFjF3wQGCfMNw,6632
- euroeval/task_group_utils/question_answering.py,sha256=agwtWOmctgat98yqgFiMSPY6zmoaPgYVyzMmOkNjr58,27284
- euroeval/task_group_utils/sequence_classification.py,sha256=igmD24aMNN7QBJ8NDzgEnGwM-jq_zhC37QxazNm7GZ4,12711
- euroeval/task_group_utils/text_to_text.py,sha256=xOpja-W4E-1peMjZX8G-3G5iRgmFHHygrQ5WN1hB3FI,4550
- euroeval/task_group_utils/token_classification.py,sha256=wCy3aI-Sn9f-87tHzAnYDA6EbY3ah3xao1SnfnoRNz4,17490
- euroeval-15.13.0.dist-info/METADATA,sha256=HnDtAE2-sYFmSl4yM9PQhgUrfklR_OB5C5aXPOgz5U8,13478
- euroeval-15.13.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- euroeval-15.13.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
- euroeval-15.13.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
- euroeval-15.13.0.dist-info/RECORD,,
+ euroeval/task_group_utils/question_answering.py,sha256=6jpiHukzA7IrJh4vVYyZDDyvD5Xc2GsxoXzpm_PHpXw,27503
+ euroeval/task_group_utils/sequence_classification.py,sha256=ihJO55f3Dy565d3ByYGMuSINasnjAADaTrM59LwZzA0,12977
+ euroeval/task_group_utils/text_to_text.py,sha256=go0y6X9QAv5iywlLAclb8cqFX_3QlAT-1-VNZ9zMWFA,4832
+ euroeval/task_group_utils/token_classification.py,sha256=BDqOfopdH5Bbj67HTEbZd9KZtNCDNket8NrCTfxZFzQ,17773
+ euroeval-15.15.0.dist-info/METADATA,sha256=ldIaYcwIlgDbuHPz_uHKrcYbmh-GLB9T239BjqYRalk,13377
+ euroeval-15.15.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ euroeval-15.15.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+ euroeval-15.15.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+ euroeval-15.15.0.dist-info/RECORD,,