EuroEval 15.10.1__py3-none-any.whl → 15.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. euroeval/__init__.py +7 -0
  2. euroeval/benchmark_config_factory.py +7 -0
  3. euroeval/benchmark_modules/base.py +29 -29
  4. euroeval/benchmark_modules/fresh.py +31 -19
  5. euroeval/benchmark_modules/hf.py +27 -23
  6. euroeval/benchmark_modules/litellm.py +50 -30
  7. euroeval/benchmark_modules/vllm.py +22 -26
  8. euroeval/benchmarker.py +8 -1
  9. euroeval/callbacks.py +17 -13
  10. euroeval/cli.py +10 -0
  11. euroeval/data_loading.py +10 -5
  12. euroeval/data_models.py +9 -40
  13. euroeval/dataset_configs/__init__.py +1 -0
  14. euroeval/dataset_configs/english.py +13 -4
  15. euroeval/dataset_configs/norwegian.py +8 -0
  16. euroeval/dataset_configs/portuguese.py +74 -0
  17. euroeval/dataset_configs/spanish.py +4 -3
  18. euroeval/finetuning.py +9 -8
  19. euroeval/generation.py +27 -8
  20. euroeval/human_evaluation.py +14 -13
  21. euroeval/languages.py +1 -2
  22. euroeval/metrics.py +452 -0
  23. euroeval/prompt_templates/linguistic_acceptability.py +9 -1
  24. euroeval/prompt_templates/multiple_choice.py +9 -1
  25. euroeval/prompt_templates/named_entity_recognition.py +20 -1
  26. euroeval/prompt_templates/sentiment_classification.py +11 -1
  27. euroeval/prompt_templates/summarization.py +8 -1
  28. euroeval/scores.py +14 -19
  29. euroeval/speed_benchmark.py +6 -7
  30. euroeval/task_group_utils/multiple_choice_classification.py +6 -4
  31. euroeval/task_group_utils/question_answering.py +5 -28
  32. euroeval/task_group_utils/sequence_classification.py +6 -30
  33. euroeval/task_group_utils/text_to_text.py +19 -34
  34. euroeval/task_group_utils/token_classification.py +18 -30
  35. euroeval/tasks.py +11 -136
  36. euroeval/types.py +6 -4
  37. {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/METADATA +10 -10
  38. euroeval-15.12.0.dist-info/RECORD +63 -0
  39. {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/licenses/LICENSE +1 -1
  40. euroeval-15.10.1.dist-info/RECORD +0 -61
  41. {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/WHEEL +0 -0
  42. {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/entry_points.txt +0 -0
euroeval/prompt_templates/multiple_choice.py CHANGED
@@ -1,7 +1,7 @@
 """Templates for all multiple choice tasks."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 # TODO: Missing Faroese
 MULTIPLE_CHOICE_TEMPLATES = {
@@ -36,6 +36,14 @@ MULTIPLE_CHOICE_TEMPLATES = {
         "usando solo {labels_str}, y nada más.",
         default_prompt_label_mapping="auto",
     ),
+    PT: PromptConfig(
+        default_prompt_prefix="As seguintes são perguntas de escolha múltipla "
+        "(com respostas).",
+        default_prompt_template="Pergunta: {text}\nResposta: {label}",
+        default_instruction_prompt="Pergunta: {text}\n\nResponde à pergunta "
+        "acima usando só {labels_str}, e nada mais.",
+        default_prompt_label_mapping="auto",
+    ),
     FI: PromptConfig(
         default_prompt_prefix="Seuraavat ovat monivalintakysymyksiä (vastauksineen).",
         default_prompt_template="Kysymys: {text}\nVastaus: {label}"
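
Note (not part of the diff): the new PT entry reuses the same PromptConfig fields as the existing languages. As a rough illustration of how a few-shot prompt could be assembled from these fields, here is a minimal, hypothetical sketch; only the field values come from the hunk above, and the render_few_shot_prompt helper is not part of EuroEval.

    def render_few_shot_prompt(
        prefix: str, template: str, examples: list[dict], new_text: str
    ) -> str:
        # Render each few-shot example with its label filled in.
        shots = [template.format(**example) for example in examples]
        # Render the final example with an empty label for the model to complete.
        final = template.format(text=new_text, label="").rstrip()
        return "\n\n".join([prefix, *shots, final])

    prompt = render_few_shot_prompt(
        prefix="As seguintes são perguntas de escolha múltipla (com respostas).",
        template="Pergunta: {text}\nResposta: {label}",
        examples=[{"text": "2 + 2?\na. 3\nb. 4", "label": "b"}],
        new_text="3 + 3?\na. 5\nb. 6",
    )
    print(prompt)
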
euroeval/prompt_templates/named_entity_recognition.py CHANGED
@@ -1,7 +1,7 @@
 """Templates for the Named Entity Recognition task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 NER_TEMPLATES = {
     DA: PromptConfig(
@@ -80,6 +80,25 @@ NER_TEMPLATES = {
         "claves {labels_str}. Los valores deben ser listas de las "
         "entidades nombradas de ese tipo, exactamente como aparecen en la oración.",
     ),
+    PT: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "pessoa",
+            "i-per": "pessoa",
+            "b-loc": "local",
+            "i-loc": "local",
+            "b-org": "organização",
+            "i-org": "organização",
+            "b-misc": "diverso",
+            "i-misc": "diverso",
+        },
+        default_prompt_prefix="Seguem-se frases e dicionários JSON com as entidades "
+        "mencionadas presentes na frase indicada.",
+        default_prompt_template="Frase: {text}\nEntidades mencionadas: {label}",
+        default_instruction_prompt="Frase: {text}\n\nIdentifica as entidades "
+        "mencionadas na frase. Deves devolver um dicionário JSON com as chaves "
+        "{labels_str}. Os valores devem ser listas contendo as entidades "
+        "mencionadas desse tipo, tal como ocorrem na frase.",
+    ),
     FI: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "henkilö",
euroeval/prompt_templates/sentiment_classification.py CHANGED
@@ -1,7 +1,7 @@
 """Templates for the Sentiment Analysis task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 SENT_TEMPLATES = {
     DA: PromptConfig(
@@ -44,6 +44,16 @@ SENT_TEMPLATES = {
         default_instruction_prompt="Documento: {text}\n\nClasifica el sentimiento del "
         "documento. Responde con {labels_str}, y nada más.",
     ),
+    PT: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positivo", neutral="neutro", negative="negativo"
+        ),
+        default_prompt_prefix="Abaixo encontras documentos e os seus "
+        "sentimentos correspondentes, que podem ser {labels_str}.",
+        default_prompt_template="Documento: {text}\nSentimento: {label}",
+        default_instruction_prompt="Documento: {text}\n\nClassifica o "
+        "sentimento do documento. Responde apenas com {labels_str}.",
+    ),
     FI: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="positiivinen", neutral="neutrali", negative="negatiivinen"
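
Note (not part of the diff): both the NER and the sentiment templates above feed their default_prompt_label_mapping values into the {labels_str} placeholder, with the B-/I- variants of each NER tag collapsing onto a single name. A short, hypothetical sketch of how such a string could be built; the build_labels_str helper is not from the codebase.

    def build_labels_str(mapping: dict[str, str], conjunction: str = "ou") -> str:
        # Deduplicate the mapped names while preserving their order.
        names = [f"'{name}'" for name in dict.fromkeys(mapping.values())]
        return ", ".join(names[:-1]) + f" {conjunction} " + names[-1]

    pt_sentiment = dict(positive="positivo", neutral="neutro", negative="negativo")
    print(build_labels_str(pt_sentiment))  # 'positivo', 'neutro' ou 'negativo'
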
euroeval/prompt_templates/summarization.py CHANGED
@@ -1,7 +1,7 @@
 """Templates for the Summarization task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 # TODO: Missing Faroese
 SUMM_TEMPLATES = {
@@ -36,6 +36,13 @@ SUMM_TEMPLATES = {
         "documento anterior.",
         default_prompt_label_mapping=dict(),
     ),
+    PT: PromptConfig(
+        default_prompt_prefix="Abaixo encontras documentos com resumos associados.",
+        default_prompt_template="Documento: {text}\nResumo: {target_text}",
+        default_instruction_prompt="Documento: {text}\n\nEscreve um resumo do "
+        "documento anterior.",
+        default_prompt_label_mapping=dict(),
+    ),
     FI: PromptConfig(
         default_prompt_prefix="Seuraavassa on artikkeleita ja niihin liittyviä "
         "tiivistelmiä.",
euroeval/scores.py CHANGED
@@ -7,7 +7,7 @@ import warnings
 import numpy as np
 
 if t.TYPE_CHECKING:
-    from .data_models import MetricConfig
+    from .metrics import Metric
     from .types import ScoreDict
 
 logger = logging.getLogger("euroeval")
@@ -15,7 +15,7 @@ logger = logging.getLogger("euroeval")
 
 def log_scores(
     dataset_name: str,
-    metric_configs: list["MetricConfig"],
+    metrics: list["Metric"],
     scores: list[dict[str, float]],
     model_id: str,
     model_revision: str,
@@ -25,7 +25,7 @@ def log_scores(
     Args:
         dataset_name:
             Name of the dataset.
-        metric_configs:
+        metrics:
             List of metrics to log.
         scores:
             The scores that are to be logged. This is a list of dictionaries full of
@@ -46,19 +46,19 @@ def log_scores(
     logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")
 
     total_dict: dict[str, float] = dict()
-    for metric_cfg in metric_configs:
-        test_score, test_se = aggregate_scores(scores=scores, metric_config=metric_cfg)
-        test_score, test_score_str = metric_cfg.postprocessing_fn(test_score)
-        test_se, test_se_str = metric_cfg.postprocessing_fn(test_se)
-        total_dict[f"test_{metric_cfg.name}"] = test_score
-        total_dict[f"test_{metric_cfg.name}_se"] = test_se
-        logger.info(f"{metric_cfg.pretty_name}: {test_score_str} ± {test_se_str}")
+    for metric in metrics:
+        test_score, test_se = aggregate_scores(scores=scores, metric=metric)
+        test_score, test_score_str = metric.postprocessing_fn(test_score)
+        test_se, test_se_str = metric.postprocessing_fn(test_se)
+        total_dict[f"test_{metric.name}"] = test_score
+        total_dict[f"test_{metric.name}_se"] = test_se
+        logger.info(f"{metric.pretty_name}: {test_score_str} ± {test_se_str}")
 
     return dict(raw=scores, total=total_dict)
 
 
 def aggregate_scores(
-    scores: list[dict[str, float]], metric_config: "MetricConfig"
+    scores: list[dict[str, float]], metric: "Metric"
 ) -> tuple[float, float]:
     """Helper function to compute the mean with confidence intervals.
 
@@ -66,9 +66,8 @@ def aggregate_scores(
         scores:
            Dictionary with the names of the metrics as keys, of the form
            "<split>_<metric_name>", such as "val_f1", and values the metric values.
-        metric_config:
-            The configuration of the metric, which is used to collect the correct
-            metric from `scores`.
+        metric:
+            The metric, which is used to collect the correct metric from `scores`.
 
     Returns:
         A pair of floats, containing the score and the radius of its 95% confidence
@@ -78,11 +77,7 @@ def aggregate_scores(
         warnings.simplefilter("ignore")
 
         test_scores = [
-            (
-                dct[metric_config.name]
-                if metric_config.name in dct
-                else dct[f"test_{metric_config.name}"]
-            )
+            dct[metric.name] if metric.name in dct else dct[f"test_{metric.name}"]
             for dct in scores
         ]
         test_score = np.mean(test_scores).item()
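
Note (not part of the diff): aggregate_scores returns the mean score together with the radius of a 95% confidence interval, but the interval computation itself falls outside this hunk. A minimal sketch under the usual normal-approximation assumption (1.96 times the standard error of the mean); the helper name is hypothetical.

    import numpy as np

    def mean_with_ci_radius(test_scores: list[float]) -> tuple[float, float]:
        # Mean of the per-iteration scores.
        mean = float(np.mean(test_scores))
        # Standard error of the mean, scaled to a 95% confidence radius.
        se = float(np.std(test_scores, ddof=1) / np.sqrt(len(test_scores)))
        return mean, 1.96 * se

    print(mean_with_ci_radius([0.71, 0.74, 0.69, 0.73]))
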
euroeval/speed_benchmark.py CHANGED
@@ -1,21 +1,20 @@
 """Benchmarking model inference speed."""
 
 import logging
+import typing as t
 
 import pyinfer
 from tqdm.auto import tqdm
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
-from .benchmark_modules import (
-    BenchmarkModule,
-    HuggingFaceEncoderModel,
-    LiteLLMModel,
-    VLLMModel,
-)
-from .data_models import BenchmarkConfig
+from .benchmark_modules import HuggingFaceEncoderModel, LiteLLMModel, VLLMModel
 from .exceptions import InvalidBenchmark
 from .utils import clear_memory
 
+if t.TYPE_CHECKING:
+    from .benchmark_modules import BenchmarkModule
+    from .data_models import BenchmarkConfig
+
 logger = logging.getLogger("euroeval")
 
 
euroeval/task_group_utils/multiple_choice_classification.py CHANGED
@@ -7,14 +7,15 @@ import typing as t
 from collections import defaultdict
 
 import numpy as np
-from datasets import Dataset
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.tokenization_utils_base import BatchEncoding
 from transformers.trainer import Trainer
 
 from ..exceptions import InvalidBenchmark
 
 if t.TYPE_CHECKING:
+    from datasets import Dataset
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.tokenization_utils_base import BatchEncoding
+
     from ..types import Labels, Predictions
 
 logger = logging.getLogger("euroeval")
@@ -147,7 +148,8 @@ def postprocess_predictions_and_labels(
 
     Args:
         predictions:
-            The model predictions, of shape (num_examples, 2).
+            The model predictions, of shape (num_examples, 2), corresponding to the
+            False/True probabilities for each example.
         dataset:
             The dataset containing the examples.
 
euroeval/task_group_utils/question_answering.py CHANGED
@@ -5,13 +5,10 @@
 import typing as t
 from collections import defaultdict
 
-import evaluate
 import numpy as np
-from evaluate import EvaluationModule
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from transformers.trainer import Trainer
 
-from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
 from ..tokenization_utils import get_special_token_metadata
 from ..utils import raise_if_model_output_contains_nan_values
@@ -26,6 +23,7 @@ if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
     from transformers.training_args import TrainingArguments
 
+    from ..data_models import DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 logger = logging.getLogger("euroeval")
@@ -151,7 +149,6 @@ class QuestionAnsweringTrainer(Trainer):
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
-    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
 
@@ -161,8 +158,6 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
-        benchmark_config:
-            The configuration of the benchmark.
 
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
@@ -178,17 +173,6 @@ def compute_metrics(
     assert not isinstance(model_outputs, tuple)
     raise_if_model_output_contains_nan_values(model_output=model_outputs)
 
-    metrics = {
-        metric_cfg.name: (
-            evaluate.load(
-                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
-            )
-            if metric_cfg.huggingface_id != ""
-            else None
-        )
-        for metric_cfg in dataset_config.task.metrics
-    }
-
     model_output_dtype = np.asarray(model_outputs).dtype
     if model_output_dtype in [np.float16, np.float32, np.float64]:
         predictions = np.asarray(model_outputs).argmax(axis=-1)
@@ -196,20 +180,13 @@ def compute_metrics(
         predictions = model_outputs
 
     results: dict[str, float] = dict()
-    for cfg in dataset_config.task.metrics:
-        metric = metrics[cfg.name]
-        assert isinstance(metric, EvaluationModule)
-        score_dict: dict[str, float] | None = metric.compute(
-            predictions=predictions, references=labels, **cfg.compute_kwargs
-        )
+    for metric in dataset_config.task.metrics:
+        score: float | None = metric(predictions=predictions, references=labels)
 
         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
-        if score_dict is not None:
-            scores = score_dict[cfg.results_key]
-            if isinstance(scores, list):
-                scores = sum(scores) / len(scores)
-            results[cfg.name] = scores
+        if score is not None:
+            results[metric.name] = score
 
     return results
 
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -4,19 +4,16 @@
 import re
 import typing as t
 
-import evaluate
 import Levenshtein
 import numpy as np
-from evaluate import EvaluationModule
 
-from ..data_models import BenchmarkConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
 from ..utils import log_once, raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
 
-    from ..data_models import DatasetConfig
+    from ..data_models import DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 
@@ -26,7 +23,6 @@ logger = logging.getLogger("euroeval")
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
-    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
 
@@ -36,8 +32,6 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
-        benchmark_config:
-            The configuration of the benchmark.
 
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
@@ -51,17 +45,6 @@ def compute_metrics(
     if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
         model_outputs = model_outputs[0]
 
-    metrics = {
-        metric_cfg.name: (
-            evaluate.load(
-                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
-            )
-            if metric_cfg.huggingface_id != ""
-            else None
-        )
-        for metric_cfg in dataset_config.task.metrics
-    }
-
     model_output_dtype = np.asarray(model_outputs).dtype
     if model_output_dtype in [np.float16, np.float32, np.float64]:
         predictions = np.asarray(model_outputs).argmax(axis=-1)
@@ -89,27 +72,20 @@ def compute_metrics(
     ]
 
     results: dict[str, float] = dict()
-    for cfg in dataset_config.task.metrics:
-        metric = metrics[cfg.name]
-        assert isinstance(metric, EvaluationModule)
-        score_dict: dict[str, float] | None = metric.compute(
-            predictions=predictions, references=label_ids, **cfg.compute_kwargs
-        )
+    for metric in dataset_config.task.metrics:
+        score: float | None = metric(predictions=predictions, references=label_ids)
 
         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
-        if score_dict is not None:
-            scores = score_dict[cfg.results_key]
-            if isinstance(scores, list):
-                scores = sum(scores) / len(scores)
-            results[cfg.name] = scores
+        if score is not None:
+            results[metric.name] = score
 
     return results
 
 
 def extract_labels_from_generation(
     input_batch: dict[str, list],
-    model_output: GenerativeModelOutput,
+    model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
     first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
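
Note (not part of the diff): across the task_group_utils modules, the inline evaluate.load(...) calls and results_key handling are replaced by calling the task's metric objects directly. The Metric class itself lives in the new euroeval/metrics.py, which is not shown in this diff view; the sketch below only illustrates the calling convention these hunks rely on, using a hypothetical accuracy metric, and is not the actual implementation.

    import typing as t

    class Metric(t.Protocol):
        # Assumed interface: a metric has a name and is called directly with
        # predictions and references, returning a single float, or None when
        # the current process should not report a score (e.g. non-main ranks).
        name: str

        def __call__(
            self, predictions: t.Sequence, references: t.Sequence
        ) -> float | None: ...

    class AccuracyMetric:
        # Hypothetical stand-in metric used only for this illustration.
        name = "accuracy"

        def __call__(
            self, predictions: t.Sequence, references: t.Sequence
        ) -> float | None:
            return sum(p == r for p, r in zip(predictions, references)) / len(references)

    results: dict[str, float] = {}
    for metric in [AccuracyMetric()]:
        score = metric(predictions=[1, 0, 1], references=[1, 1, 1])
        if score is not None:
            results[metric.name] = score
    print(results)
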
euroeval/task_group_utils/text_to_text.py CHANGED
@@ -3,18 +3,17 @@
 import logging
 import typing as t
 
-import evaluate
 import numpy as np
-from evaluate import EvaluationModule
 
 from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
-from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
-from ..utils import HiddenPrints, raise_if_model_output_contains_nan_values
+from ..metrics import HuggingFaceMetric
+from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
 
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 
@@ -51,17 +50,6 @@ def compute_metrics(
     assert not isinstance(model_outputs, tuple)
     raise_if_model_output_contains_nan_values(model_output=model_outputs)
 
-    metrics = {
-        metric_cfg.name: (
-            evaluate.load(
-                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
-            )
-            if metric_cfg.huggingface_id != ""
-            else None
-        )
-        for metric_cfg in dataset_config.task.metrics
-    }
-
     model_output_dtype = np.asarray(model_outputs).dtype
     output_is_prob = model_output_dtype in [np.float16, np.float32, np.float64]
     if output_is_prob:
@@ -70,21 +58,18 @@ def compute_metrics(
         predictions = model_outputs
 
     results: dict[str, float] = dict()
-    for cfg in dataset_config.task.metrics:
-        metric = metrics[cfg.name]
-        assert isinstance(metric, EvaluationModule)
-
+    for metric in dataset_config.task.metrics:
         # Some metrics can be computed on hardware accelerators. In this case we
         # start by setting the device to the same device as the model
-        if cfg.compute_kwargs.get("device", None) == "auto":
-            cfg.compute_kwargs["device"] = benchmark_config.device.type
+        if (
+            isinstance(metric, HuggingFaceMetric)
+            and metric.compute_kwargs.get("device", None) == "auto"
+        ):
+            metric.compute_kwargs["device"] = benchmark_config.device.type
 
         while True:
             try:
-                with HiddenPrints():
-                    score_dict: dict[str, float] | None = metric.compute(
-                        predictions=predictions, references=labels, **cfg.compute_kwargs
-                    )
+                score: float | None = metric(predictions=predictions, references=labels)
                 break
             except Exception as e:
                 oom_error = [
@@ -95,11 +80,14 @@ def compute_metrics(
                 if not any(error in str(e) for error in oom_error):
                     raise InvalidBenchmark(str(e))
 
-                if cfg.compute_kwargs.get("device", "cpu") != "cpu":
-                    cfg.compute_kwargs["device"] = "cpu"
+                if (
+                    isinstance(metric, HuggingFaceMetric)
+                    and metric.compute_kwargs.get("device", "cpu") != "cpu"
+                ):
+                    metric.compute_kwargs["device"] = "cpu"
                     logger.debug(
                         "Out of memory error occurred during the computation of "
-                        f"the metric {cfg.pretty_name}. Moving the computation to "
+                        f"the metric {metric.pretty_name}. Moving the computation to "
                         "the CPU."
                     )
                 else:
@@ -109,17 +97,14 @@ def compute_metrics(
                         if hasattr(metric, attribute):
                             logger.debug(
                                 f"Deleting the {attribute!r} attribute of the metric "
-                                f"{cfg.pretty_name} to free up memory."
+                                f"{metric.pretty_name} to free up memory."
                             )
                             delattr(metric, attribute)
 
         # The metric returns None if we are running on multi-GPU and the current
        # process is not the main process
-        if score_dict is not None:
-            scores = score_dict[cfg.results_key]
-            if isinstance(scores, list):
-                scores = sum(scores) / len(scores)
-            results[cfg.name] = scores
+        if score is not None:
+            results[metric.name] = score
 
     return results
 
euroeval/task_group_utils/token_classification.py CHANGED
@@ -6,19 +6,17 @@
 from copy import deepcopy
 
 import demjson3
-import evaluate
 import numpy as np
-from evaluate import EvaluationModule
-from transformers.tokenization_utils import PreTrainedTokenizer
 
-from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
+    from transformers.tokenization_utils import PreTrainedTokenizer
     from transformers.tokenization_utils_base import BatchEncoding
     from transformers.trainer_utils import EvalPrediction
 
+    from ..data_models import DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 
@@ -29,7 +27,6 @@ def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     has_misc_tags: bool,
     dataset_config: "DatasetConfig",
-    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
 
@@ -41,8 +38,6 @@ def compute_metrics(
             Whether the dataset has MISC tags.
         dataset_config:
             The configuration of the dataset.
-        benchmark_config:
-            The configuration of the benchmark.
 
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
@@ -55,17 +50,6 @@ def compute_metrics(
     if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
         model_outputs = model_outputs[0]
 
-    metrics = {
-        metric_cfg.name: (
-            evaluate.load(
-                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
-            )
-            if metric_cfg.huggingface_id != ""
-            else None
-        )
-        for metric_cfg in dataset_config.task.metrics
-    }
-
     predictions: list[list[str]]
     if not isinstance(model_outputs[0][0], str):
         raw_predictions: list[list[int]] = np.argmax(model_outputs, axis=-1).tolist()
@@ -145,11 +129,14 @@ def compute_metrics(
         all(ner_tag == "o" for ner_tag in label_list) for label_list in labels
     )
     if predictions_all_zero and labels_all_zero:
-        results = dict(overall_f1=1.0)
+        micro_f1_score: float | None = 1.0
     else:
-        metric = metrics["micro_f1"]
-        assert isinstance(metric, EvaluationModule)
-        results = metric.compute(predictions=predictions, references=labels)
+        metric = next(
+            metric
+            for metric in dataset_config.task.metrics
+            if metric.name == "micro_f1"
+        )
+        micro_f1_score = metric(predictions=predictions, references=list(labels))
 
     # Compute the metrics without MISC tags
     # We manually set the F1 metric to be 100% if both the labels and the models
@@ -163,21 +150,22 @@ def compute_metrics(
         all(ner_tag == "o" for ner_tag in label_list) for label_list in labels_no_misc
     )
     if predictions_no_misc_all_zero and labels_no_misc_all_zero:
-        results_no_misc = dict(overall_f1=1.0)
+        micro_f1_no_misc_score: float | None = 1.0
     else:
-        metric = metrics["micro_f1_no_misc"]
-        assert isinstance(metric, EvaluationModule)
-        results_no_misc = metric.compute(
+        metric = next(
+            metric
+            for metric in dataset_config.task.metrics
+            if metric.name == "micro_f1_no_misc"
+        )
+        micro_f1_no_misc_score = metric(
             predictions=predictions_no_misc, references=labels_no_misc
         )
 
     # Raise error if the metrics are invalid
-    if results is None or results_no_misc is None:
+    if micro_f1_score is None or micro_f1_no_misc_score is None:
         raise InvalidBenchmark("The predictions and labels are not of the same length.")
 
-    return dict(
-        micro_f1_no_misc=results_no_misc["overall_f1"], micro_f1=results["overall_f1"]
-    )
+    return dict(micro_f1_no_misc=micro_f1_no_misc_score, micro_f1=micro_f1_score)
 
 
 def extract_labels_from_generation
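
Note (not part of the diff): the new euroeval/metrics.py (452 added lines, not included in this diff view) is what makes the direct metric calls above work. Judging only from the removed inline code and the HuggingFaceMetric references in text_to_text.py, a rough sketch of what such a wrapper might look like is given below; the constructor arguments and behaviour are assumptions, not the actual implementation.

    import typing as t

    import evaluate

    class HuggingFaceMetric:
        """Assumed wrapper around an `evaluate` module, mirroring the removed inline logic."""

        def __init__(
            self,
            name: str,
            pretty_name: str,
            huggingface_id: str,
            results_key: str,
            compute_kwargs: dict[str, t.Any] | None = None,
        ) -> None:
            self.name = name
            self.pretty_name = pretty_name
            self.huggingface_id = huggingface_id
            self.results_key = results_key
            self.compute_kwargs = compute_kwargs or {}
            self._module: "evaluate.EvaluationModule | None" = None

        def __call__(
            self, predictions: t.Sequence, references: t.Sequence
        ) -> float | None:
            # Load the underlying Hugging Face metric lazily on first use.
            if self._module is None:
                self._module = evaluate.load(path=self.huggingface_id)
            score_dict = self._module.compute(
                predictions=predictions, references=references, **self.compute_kwargs
            )
            # None is returned on non-main processes in multi-GPU runs.
            if score_dict is None:
                return None
            score = score_dict[self.results_key]
            # Some metrics return one value per example; average those.
            if isinstance(score, list):
                score = sum(score) / len(score)
            return float(score)

    # Hypothetical usage, mirroring how the task metrics are called in the hunks above:
    # micro_f1 = HuggingFaceMetric("micro_f1", "Micro-average F1", "seqeval", "overall_f1")
    # score = micro_f1(predictions=[["B-PER", "O"]], references=[["B-PER", "O"]])
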