EuroEval 15.4.1__py3-none-any.whl → 15.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of EuroEval has been flagged as potentially problematic.

@@ -25,11 +25,12 @@ from urllib3.exceptions import RequestError
 
 from ..constants import (
     GENERATIVE_PIPELINE_TAGS,
+    MAX_CONTEXT_LENGTH,
     MAX_LOGPROBS,
     MERGE_TAGS,
     REASONING_MAX_TOKENS,
-    TASK_GROUPS_USING_LOGPROBS,
     TASKS_USING_JSON,
+    VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
 from ..data_models import (
     BenchmarkConfig,
@@ -65,6 +66,8 @@ from ..utils import (
     get_bos_token,
     get_end_of_chat_token_ids,
     get_eos_token,
+    get_first_label_token_mapping,
+    get_min_cuda_compute_capability,
     log_once,
     should_prompts_be_stripped,
 )
@@ -120,11 +123,8 @@ class VLLMModel(HuggingFaceEncoderModel):
         ):
             raise NeedsExtraInstalled(extra="generative")
 
-        output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
         model, tokenizer = load_model_and_tokenizer(
-            model_config=model_config,
-            benchmark_config=benchmark_config,
-            output_scores=output_scores,
+            model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: LLM = model
         self._tokenizer: PreTrainedTokenizer = tokenizer
@@ -140,11 +140,16 @@ class VLLMModel(HuggingFaceEncoderModel):
             benchmark_config=benchmark_config,
         )
 
-        self.buffer["output_scores"] = output_scores
-        self.buffer["instruction_model"] = self._tokenizer.chat_template is not None
+        self.buffer |= dict(
+            instruction_model=self._tokenizer.chat_template is not None,
+            first_label_token_mapping=get_first_label_token_mapping(
+                dataset_config=self.dataset_config, tokenizer=self._tokenizer
+            ),
+        )
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
                 repo_id=self.model_config.model_id,
+                revision=self.model_config.revision,
                 cache_dir=Path(self.model_config.model_cache_dir),
             )
             self.buffer["lora_request"] = LoRARequest(
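
The get_first_label_token_mapping utility used above is not included in this diff. As a rough illustration of the idea, the sketch below (not the released implementation; the function name, signature and tokenizer are assumptions) maps each label to the text of its first token under a Hugging Face tokenizer, which is the dict[str, str] shape that the sequence classification hunks further down consume:

    from transformers import AutoTokenizer, PreTrainedTokenizer

    def build_first_label_token_mapping(
        labels: list[str], tokenizer: PreTrainedTokenizer
    ) -> dict[str, str]:
        """Map each label to the text of its first token under the given tokenizer."""
        mapping: dict[str, str] = {}
        for label in labels:
            token_ids = tokenizer(label, add_special_tokens=False).input_ids
            mapping[label] = tokenizer.decode([token_ids[0]]).strip()
        return mapping

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    print(build_first_label_token_mapping(["positiv", "negativ", "neutral"], tokenizer))

Such a mapping lets the logprobs of the very first generated token be matched back to a full label, which is what the first_label_token_mapping buffer entry is used for below.
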
@@ -182,6 +187,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
                 return text_to_text.extract_labels_from_generation
@@ -335,6 +341,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         else:
             logits_processor = None
 
+        # Get the mapping from labels to the first token in the label. We call this each
+        # time we generate a new dataset since the dataset config can change
+        self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
+            dataset_config=self.dataset_config, tokenizer=self._tokenizer
+        )
+
         # Define the parameters used for vLLM generation
         max_tokens: int = (
             REASONING_MAX_TOKENS
@@ -343,7 +355,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         sampling_params = SamplingParams(
             max_tokens=max_tokens,
-            logprobs=MAX_LOGPROBS if self.buffer["output_scores"] else None,
+            logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
             temperature=0.0,
             stop=[stop_token for stop_token in stop_tokens if stop_token],
             logits_processors=[logits_processor] if logits_processor else None,
@@ -373,12 +385,27 @@ class VLLMModel(HuggingFaceEncoderModel):
 
         # Generate sequences using vLLM
         input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
-        raw_outputs = self._model.generate(
-            prompts=prompts,
-            sampling_params=sampling_params,
-            use_tqdm=(not input_is_a_test),
-            lora_request=self.buffer.get("lora_request"),
-        )
+        num_attempts = 3
+        for _ in range(num_attempts):
+            try:
+                raw_outputs = self._model.generate(
+                    prompts=prompts,
+                    sampling_params=sampling_params,
+                    use_tqdm=(not input_is_a_test),
+                    lora_request=self.buffer.get("lora_request"),
+                )
+                break
+            except TypeError as e:
+                logger.debug(
+                    f"Encountered error during vLLM generation: {str(e)}. Retrying..."
+                )
+                sleep(1)
+        else:
+            raise InvalidBenchmark(
+                f"Could not generate sequences after {num_attempts} attempts."
+            )
+
+        # Parse the raw model outputs
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
@@ -398,7 +425,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         completions = [completion.strip() for completion in completions]
 
         # Add logprobs scores to the output
-        if self.buffer["output_scores"]:
+        if self.buffer["first_label_token_mapping"]:
             scores: list[list[list[tuple[str, float]]]] = [
                 [
                     [
@@ -828,7 +855,7 @@ class VLLMModel(HuggingFaceEncoderModel):
 
 
 def load_model_and_tokenizer(
-    model_config: ModelConfig, benchmark_config: BenchmarkConfig, output_scores: bool
+    model_config: ModelConfig, benchmark_config: BenchmarkConfig
 ) -> "tuple[LLM, PreTrainedTokenizer]":
     """Load the model and tokenizer.
 
@@ -837,22 +864,23 @@ def load_model_and_tokenizer(
             The model configuration.
         benchmark_config:
             The benchmark configuration.
-        output_scores:
-            Whether to output scores.
 
     Returns:
-        The loaded model and tokenizer.
+        A pair (model, tokenizer), with the loaded model and tokenizer
     """
     # Prefer base model ID if the model is an adapter - the adapter will be added on
     # during inference in this case
    model_id = model_config.adapter_base_model_id or model_config.model_id
+    revision = (
+        model_config.revision if model_config.adapter_base_model_id is None else "main"
+    )
 
     hf_model_config = load_hf_model_config(
         model_id=model_id,
         num_labels=0,
         id2label=dict(),
         label2id=dict(),
-        revision=model_config.revision,
+        revision=revision,
         model_cache_dir=model_config.model_cache_dir,
         api_key=benchmark_config.api_key,
         trust_remote_code=benchmark_config.trust_remote_code,
@@ -872,7 +900,27 @@ def load_model_and_tokenizer(
     if quantization == "awq" and importlib.util.find_spec("awq") is None:
         raise NeedsExtraInstalled(extra="quantization")
 
+    # Start with dtype being the "auto" vLLM dtype
     dtype: str | torch.dtype = "auto"
+
+    # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
+    if hf_model_config.torch_dtype == torch.float32:
+        if torch.cuda.is_bf16_supported():
+            logger.info(
+                "You are loading a model with dtype FP32, which we will convert to "
+                "BF16 as FP32 is not supported by vLLM and BF16 is supported by your "
+                "GPU."
+            )
+            dtype = torch.bfloat16
+        else:
+            logger.info(
+                "You are loading a model with dtype FP32, which we will convert to "
+                "FP16 as FP32 is not supported by vLLM and BF16 is not supported by "
+                "your GPU."
+            )
+            dtype = torch.float16
+
+    # If the model is a quantized model, we need to set the dtype to float16
     if quantization is not None and hf_model_config.torch_dtype != torch.float16:
         logger.info(
             "You are loading a quantized model with dtype "
@@ -881,6 +929,24 @@ def load_model_and_tokenizer(
         )
         dtype = torch.float16
 
+    # If the model is a bf16 model, we need to check the CUDA compute capability
+    if hf_model_config.torch_dtype == torch.bfloat16:
+        min_cuda_compute_capability = get_min_cuda_compute_capability()
+        required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
+
+        if min_cuda_compute_capability is not None:
+            if min_cuda_compute_capability < required_capability:
+                logger.info(
+                    "You are loading a model with "
+                    f"dtype {hf_model_config.torch_dtype}, "
+                    "which vLLM only supports for CUDA devices with"
+                    f"CUDA compute capability >={required_capability}. "
+                    "You are using one or more devices with "
+                    f"compute capability {min_cuda_compute_capability}. "
+                    "Setting dtype to float16 instead."
+                )
+                dtype = torch.float16
+
     if model_config.adapter_base_model_id is not None:
         download_dir = str(Path(model_config.model_cache_dir) / "base_model")
     else:
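
The get_min_cuda_compute_capability helper referenced above is likewise not part of this diff. A minimal sketch of what such a check can look like with standard PyTorch calls (the function name and return convention are assumptions, not the released code):

    import torch

    def min_cuda_compute_capability() -> float | None:
        """Return the lowest compute capability across visible CUDA devices, or None."""
        if not torch.cuda.is_available():
            return None
        capabilities = [
            major + minor / 10
            for major, minor in (
                torch.cuda.get_device_capability(idx)
                for idx in range(torch.cuda.device_count())
            )
        ]
        return min(capabilities)

Comparing that value against VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY (8.0, i.e. Ampere or newer) is what decides above whether a bf16 checkpoint is downcast to float16.
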
@@ -902,7 +968,17 @@ def load_model_and_tokenizer(
     if len(true_max_model_len_candidates) > 0:
         true_max_model_len = min(true_max_model_len_candidates)
     else:
-        true_max_model_len = 5_000
+        true_max_model_len = MAX_CONTEXT_LENGTH
+
+    tokenizer = load_tokenizer(
+        model_id=model_config.model_id,
+        revision=model_config.revision,
+        adapter_base_model_id=model_config.adapter_base_model_id,
+        trust_remote_code=benchmark_config.trust_remote_code,
+        model_max_length=true_max_model_len,
+        model_cache_dir=model_config.model_cache_dir,
+        token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+    )
 
     clear_vllm()
 
@@ -913,10 +989,10 @@ def load_model_and_tokenizer(
         model=model_id,
         tokenizer=model_id,
         gpu_memory_utilization=0.95,
-        max_model_len=min(true_max_model_len, 5_000),
+        max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
         download_dir=download_dir,
         trust_remote_code=benchmark_config.trust_remote_code,
-        revision=model_config.revision,
+        revision=revision,
         seed=4242,
         distributed_executor_backend=executor_backend,
         tensor_parallel_size=torch.cuda.device_count(),
@@ -924,7 +1000,6 @@ def load_model_and_tokenizer(
         quantization=quantization,
         dtype=dtype,
         enforce_eager=True,
-        max_logprobs=MAX_LOGPROBS if output_scores else None,
         # TEMP: Prefix caching isn't supported with sliding window in vLLM yet,
         # so we disable it for now
         enable_prefix_caching=False,
@@ -950,16 +1025,6 @@ def load_model_and_tokenizer(
     model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
     model.config = hf_model_config
 
-    tokenizer = load_tokenizer(
-        model_id=model_config.model_id,
-        revision=model_config.revision,
-        adapter_base_model_id=model_config.adapter_base_model_id,
-        trust_remote_code=benchmark_config.trust_remote_code,
-        model_max_length=true_max_model_len,
-        model_cache_dir=model_config.model_cache_dir,
-        token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
-    )
-
     return model, tokenizer
 
 
@@ -994,6 +1059,7 @@ def load_tokenizer(
     Returns:
         The loaded tokenizer.
     """
+    revision = revision if adapter_base_model_id is None else "main"
     config = AutoConfig.from_pretrained(
         adapter_base_model_id or model_id,
         revision=revision,
@@ -1118,15 +1184,13 @@ def get_end_of_reasoning_token_id(
 
     # Generate a completion and remove the BOS token from it, to not confuse it with the
    # potential reasoning token
-    completion = (
-        model.generate(
-            prompts=[prompt],
-            sampling_params=SamplingParams(max_tokens=3, temperature=0.0),
-            use_tqdm=False,
-        )[0]
-        .outputs[0]
-        .text
+    model_output = model.generate(
+        prompts=[prompt],
+        sampling_params=SamplingParams(max_tokens=3, temperature=0.0),
+        use_tqdm=False,
     )
+    completion = model_output[0].outputs[0].text
+
     if tokenizer.bos_token is not None:
         if isinstance(tokenizer.bos_token, str):
             prompt = prompt.replace(tokenizer.bos_token, "").strip()
euroeval/benchmarker.py CHANGED
@@ -366,14 +366,18 @@ class Benchmarker:
             dataset_names=benchmark_config.datasets
         )
 
+        total_benchmarks = len(model_ids) * len(dataset_configs)
+        num_finished_benchmarks = 0
+
         current_benchmark_results: list[BenchmarkResult] = list()
-        for m_id in model_ids:
+        for model_id in model_ids:
             try:
                 model_config = get_model_config(
-                    model_id=m_id, benchmark_config=benchmark_config
+                    model_id=model_id, benchmark_config=benchmark_config
                 )
             except InvalidModel as e:
                 logger.info(e.message)
+                num_finished_benchmarks += len(dataset_configs)
                 continue
 
             loaded_model: BenchmarkModule | None = None
@@ -381,16 +385,18 @@ class Benchmarker:
                 # Skip if we have already benchmarked this model on this dataset and
                 # we are not forcing the benchmark
                 if not benchmark_config.force and model_has_been_benchmarked(
-                    model_id=m_id,
+                    model_id=model_id,
                     dataset=dataset_config.name,
                     few_shot=benchmark_config.few_shot,
                     validation_split=not benchmark_config.evaluate_test_split,
                     benchmark_results=self.benchmark_results,
                 ):
                     logger.debug(
-                        f"Skipping benchmarking {m_id} on {dataset_config.pretty_name},"
-                        " as it has already been benchmarked."
+                        f"Skipping benchmarking {model_id} on "
+                        f"{dataset_config.pretty_name}, as it "
+                        "has already been benchmarked."
                     )
+                    num_finished_benchmarks += 1
                     continue
 
                 # We do not re-initialise generative models as their architecture is not
@@ -413,6 +419,15 @@ class Benchmarker:
                         if benchmark_config.raise_errors:
                             raise e
                         logger.info(e.message)
+
+                        # Add the remaining number of benchmarks for the model to
+                        # our benchmark counter, since we're skipping the
+                        # rest of them
+                        num_finished_benchmarks += (
+                            len(dataset_configs)
+                            - dataset_configs.index(dataset_config)
+                            - 1
+                        )
                         break
                 else:
                     loaded_model.dataset_config = dataset_config
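
To make the skip accounting above concrete, a small worked example (the dataset names are invented):

    dataset_configs = ["dataset-a", "dataset-b", "dataset-c", "dataset-d"]
    dataset_config = "dataset-b"  # the dataset at which the model failed to load
    remaining = len(dataset_configs) - dataset_configs.index(dataset_config) - 1
    print(remaining)  # 2, i.e. "dataset-c" and "dataset-d" are counted as skipped
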
@@ -435,16 +450,24 @@ class Benchmarker:
                     if benchmark_config.raise_errors:
                         raise benchmark_output_or_err
                     logger.info(
-                        f"{m_id} could not be benchmarked on "
+                        f"{model_id} could not be benchmarked on "
                         f"{dataset_config.pretty_name}. Skipping. The error message "
                         f"raised was {benchmark_output_or_err.message!r}."
                     )
+                    num_finished_benchmarks += 1
                    continue
 
                elif isinstance(benchmark_output_or_err, InvalidModel):
                    if benchmark_config.raise_errors:
                        raise benchmark_output_or_err
                    logger.info(benchmark_output_or_err.message)
+
+                    # Add the remaining number of benchmarks for the model to
+                    # our benchmark counter, since we're skipping the
+                    # rest of them
+                    num_finished_benchmarks += (
+                        len(dataset_configs) - dataset_configs.index(dataset_config) - 1
+                    )
                    break
 
                else:
@@ -453,6 +476,12 @@ class Benchmarker:
                 if benchmark_config.save_results:
                     record.append_to_results(results_path=self.results_path)
 
+                num_finished_benchmarks += 1
+                logger.info(
+                    f"Finished {num_finished_benchmarks} out of "
+                    f"{total_benchmarks} benchmarks."
+                )
+
             if benchmark_config.clear_model_cache:
                 clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
 
euroeval/constants.py CHANGED
@@ -7,6 +7,13 @@ from .tasks import NER
 DUMMY_FILL_VALUE = 100
 
 
+# This is the maximum allowed context length for models for the purpose of this
+# benchmark. We will still report the models' true maximum context length in the
+# metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
+# all tokens in the context.
+MAX_CONTEXT_LENGTH = 5_000
+
+
 # We need to raise the amount of tokens generated for reasoning models, to give them
 # time to think
 REASONING_MAX_TOKENS = 8_192
@@ -47,10 +54,13 @@ TASK_GROUPS_USING_LOGPROBS = [
 MAX_LOGPROBS = 10
 
 
-# We make sure to remove these metric attributed after each iteration, to avoid memory
+# We make sure to remove these metric attributes after each iteration, to avoid memory
 # leaks
 METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
 
 
 # Hugging Face Hub tags used to classify models as merge models
 MERGE_TAGS = ["merge", "mergekit"]
+
+# The minimum required CUDA compute capability for using bfloat16 in vLLM
+VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
euroeval/data_models.py CHANGED
@@ -1,7 +1,6 @@
 """Data models used in EuroEval."""
 
 import collections.abc as c
-import importlib.metadata
 import json
 import pathlib
 import re
@@ -13,6 +12,7 @@ import torch
 
 from .enums import Device, InferenceBackend, ModelType, TaskGroup
 from .types import ScoreDict
+from .utils import get_package_version
 
 
 @dataclass
@@ -228,7 +228,11 @@ class BenchmarkResult(pydantic.BaseModel):
     generative_type: str | None
     few_shot: bool
     validation_split: bool
-    euroeval_version: str = importlib.metadata.version("euroeval")
+    euroeval_version: str | None = get_package_version("euroeval")
+    transformers_version: str | None = get_package_version("transformers")
+    torch_version: str | None = get_package_version("torch")
+    vllm_version: str | None = get_package_version("vllm")
+    outlines_version: str | None = get_package_version("outlines")
 
     @classmethod
     def from_dict(cls, config: dict) -> "BenchmarkResult":
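
The get_package_version helper imported from .utils is not shown in this diff. Given the str | None annotations above, a minimal sketch of the behaviour they imply, assuming the helper simply wraps importlib.metadata (an assumption, not the released implementation):

    import importlib.metadata

    def get_package_version(package_name: str) -> str | None:
        """Return the installed version of a package, or None if it is not installed."""
        try:
            return importlib.metadata.version(package_name)
        except importlib.metadata.PackageNotFoundError:
            return None

Returning None instead of raising keeps optional backends such as vllm and outlines from breaking result serialisation when they are not installed.
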
@@ -244,7 +244,7 @@ FOSENT_CONFIG = DatasetConfig(
 ALLOCINE_CONFIG = DatasetConfig(
     name="allocine",
     pretty_name="the truncated version of the French sentiment classification "
-    "dataset Allocine",
+    "dataset AlloCiné",
     huggingface_id="EuroEval/allocine-mini",
     task=SENT,
     languages=[FR],
@@ -1467,9 +1467,9 @@ NORDJYLLAND_NEWS_CONFIG = DatasetConfig(
     max_generated_tokens=256,
 )
 
-MLSUM_CONFIG = DatasetConfig(
-    name="mlsum",
-    pretty_name="the truncated version of the German summarisation dataset MLSum",
+MLSUM_DE_CONFIG = DatasetConfig(
+    name="mlsum-de",
+    pretty_name="the truncated version of the German summarisation dataset MLSum-de",
     huggingface_id="EuroEval/mlsum-mini",
     task=SUMM,
     languages=[DE],
@@ -1484,7 +1484,7 @@ MLSUM_CONFIG = DatasetConfig(
 
 
 MLSUM_ES_CONFIG = DatasetConfig(
-    pretty_name="the truncated version of the Spanish summarisation dataset MLSum",
+    pretty_name="the truncated version of the Spanish summarisation dataset MLSum-es",
     huggingface_id="EuroEval/mlsum-es-mini",
     task=SUMM,
     languages=[ES],
@@ -1643,7 +1643,7 @@ ORANGE_SUM_CONFIG = DatasetConfig(
 
 ILPOST_SUM_CONFIG = DatasetConfig(
     name="ilpost-sum",
-    pretty_name="the truncated version of the Italian summarisation dataset IlPost",
+    pretty_name="the truncated version of the Italian summarisation dataset IlPost-Sum",
     huggingface_id="EuroEval/ilpost-sum",
     task=SUMM,
     languages=[IT],
@@ -10,6 +10,7 @@ import numpy as np
 from evaluate import EvaluationModule
 
 from ..data_models import BenchmarkConfig, GenerativeModelOutput
+from ..exceptions import InvalidBenchmark
 from ..utils import log_once, raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
@@ -110,6 +111,7 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: GenerativeModelOutput,
     dataset_config: "DatasetConfig",
+    first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
     """Extract the predicted labels from the generated output.
 
@@ -121,13 +123,19 @@ def extract_labels_from_generation(
             The raw generated output of the model.
         dataset_config:
             The configuration of the dataset.
+        first_label_token_mapping:
+            A mapping from labels to the first token in each label, or alternatively a
+            Boolean value indicating whether the model should output scores (if the
+            mapping is outputted then the model will always output scores).
 
     Returns:
         The predicted labels.
     """
     if model_output.scores is not None:
         return get_closest_logprobs_labels(
-            generation_logprobs=model_output.scores, dataset_config=dataset_config
+            generation_logprobs=model_output.scores,
+            dataset_config=dataset_config,
+            first_label_token_mapping=first_label_token_mapping,
         )
     else:
         return get_closest_word_edit_labels(
@@ -138,6 +146,7 @@ def extract_labels_from_generation(
 def get_closest_logprobs_labels(
     generation_logprobs: list[list[list[tuple[str, float]]]],
     dataset_config: "DatasetConfig",
+    first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
     """Get the labels with the highest predicted logprob value.
 
@@ -152,6 +161,10 @@ def get_closest_logprobs_labels(
             (batch_size, num_tokens, num_logprobs).
         dataset_config:
             The configuration of the dataset.
+        first_label_token_mapping:
+            A mapping from labels to the first token in each label, or alternatively a
+            Boolean value indicating whether the model should output scores (if the
+            mapping is outputted then the model will always output scores).
 
     Returns:
         The predicted labels.
@@ -162,8 +175,7 @@ def get_closest_logprobs_labels(
     """
     english_labels = list(dataset_config.id2label.values())
     english2local = dataset_config.prompt_label_mapping
-    local_labels = [english2local[lbl].lower() for lbl in english_labels]
-    candidate_labels = local_labels + english_labels
+    candidate_labels = [english2local[lbl].lower() for lbl in english_labels]
 
     output_labels: list[str] = list()
     for sample in generation_logprobs:
@@ -182,38 +194,66 @@ def get_closest_logprobs_labels(
             # label, as the output label
             output_label: str | None = None
             previously_generated_labels: list[str] = list()
-            for generated_label in generated_labels:
+            for label_idx, generated_label in enumerate(generated_labels):
                 generated_label = "".join(previously_generated_labels) + generated_label
 
-                # Get the candidate labels that contain the generated label
-                candidate_output_labels = [
-                    candidate_label
-                    for candidate_label in candidate_labels
-                    if generated_label in candidate_label
-                ]
-
-                # If we can uniquely determine the output label, we break the loop.
-                # Since we have both the original local labels as well as the English
-                # versions, we want to have 0 or 1 candidate labels from each set. This
-                # means that ["positive", "positiv"] is fine as they're both referencing
-                # the same label, but ["negativ", "neutral"] is not. In the bad case we
-                # cannot use the scores and we fall back to using the
-                # candidate label with the highest edit distance.
-                at_most_one_english_label = (
-                    len(set(candidate_output_labels).intersection(english_labels)) <= 1
-                )
-                at_most_one_local_label = (
-                    len(set(candidate_output_labels).intersection(local_labels)) <= 1
-                )
-                if candidate_output_labels:
-                    if at_most_one_english_label and at_most_one_local_label:
-                        output_label = candidate_output_labels[0]
-                        break
-                    else:
+                # Get the candidate labels that starts with the generated label
+                if isinstance(first_label_token_mapping, dict):
+                    if any(
+                        candidate_label not in first_label_token_mapping
+                        for candidate_label in candidate_labels
+                    ):
+                        raise InvalidBenchmark(
+                            "There is a label not present in the first label token "
+                            "mapping - this should never happen! Please report this "
+                            "issue to the EuroEval team at "
+                            "github.com/EuroEval/EuroEval/issues."
+                        )
+
+                    candidate_output_labels = {
+                        candidate_label
+                        for candidate_label in candidate_labels
+                        if generated_label == first_label_token_mapping[candidate_label]
+                    }
+                else:
+                    candidate_output_labels = {
+                        candidate_label
+                        for candidate_label in candidate_labels
+                        if candidate_label.startswith(generated_label)
+                    }
+
+                # If we can uniquely determine the output label, we break the loop. If
+                # there are multiple possible labels then we store the current one, and
+                # concatenate it with the next generated label. We can only do this if
+                # the current one is the first one, however, since we're using greedy
+                # sampling. In case this happens for a label that is not the first one,
+                # we warn the user.
+                if len(candidate_output_labels) == 1:
+                    output_label = candidate_output_labels.pop()
+                    break
+                elif len(candidate_output_labels) > 1:
+                    if label_idx == 0:
                         previously_generated_labels.append(generated_label)
+                    else:
+                        output_label = candidate_output_labels.pop()
+                        candidate_output_labels.add(output_label)
+                        raise InvalidBenchmark(
+                            "Multiple candidate labels found for the generated label "
+                            f"{generated_label!r}: {candidate_output_labels}. Since "
+                            "this is not the first generated label, we cannot "
+                            "concatenate it with the next generated label. We are thus "
+                            f"forced to use the arbitrary {output_label!r} as the "
+                            "output label, potentially resulting in worse performance. "
+                            "Please report this issue to the EuroEval team at "
+                            "github.com/EuroEval/EuroEval/issues."
+                        )
+                elif len(candidate_output_labels) == 0:
+                    logger.debug(
+                        f"No candidate label found for the generated label "
+                        f"{generated_label!r}. The generated label is thus ignored."
+                    )
 
             if output_label is not None:
-                output_label = english2local.get(output_label, output_label)
                 output_labels.append(output_label)
                 break
         else:
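
A small self-contained illustration of the first-token matching above, simplified to the unique-match case (the labels and tokens are invented; in the benchmark the mapping comes from the tokenizer):

    first_label_token_mapping = {"positiv": "pos", "negativ": "neg", "neutral": "neut"}

    def resolve(generated_token: str) -> str | None:
        """Return the unique label whose first token equals the generated token, if any."""
        candidates = {
            label
            for label, first_token in first_label_token_mapping.items()
            if generated_token == first_token
        }
        return candidates.pop() if len(candidates) == 1 else None

    print(resolve("neg"))  # negativ
    print(resolve("foo"))  # None
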
euroeval/types.py CHANGED
@@ -8,9 +8,9 @@ if t.TYPE_CHECKING:
     from .data_models import GenerativeModelOutput
 
 
-ScoreDict = dict[str, dict[str, float] | list[dict[str, float]]]
-Predictions = NDArray | list[str] | list[list[str]]
-Labels = NDArray | list[str] | list[list[str]]
+ScoreDict: t.TypeAlias = dict[str, dict[str, float] | list[dict[str, float]]]
+Predictions: t.TypeAlias = NDArray | list[str] | list[list[str]]
+Labels: t.TypeAlias = NDArray | list[str] | list[list[str]]
 
 
 class ComputeMetricsFunction(t.Protocol):
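
For reference, the explicit t.TypeAlias annotations added above follow PEP 613: they tell static type checkers that each assignment defines a type alias rather than an ordinary module-level variable. A tiny standalone example of the same pattern (names are illustrative):

    import typing as t

    UserId: t.TypeAlias = int | str

    def lookup(user_id: UserId) -> None:
        print(f"Looking up {user_id!r}")

    lookup(42)
    lookup("alice")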