EuroEval 15.4.0__py3-none-any.whl → 15.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

euroeval/generation.py CHANGED
@@ -20,7 +20,12 @@ from .model_cache import (
 from .utils import clear_memory
 
 if t.TYPE_CHECKING:
-    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
+    from .data_models import (
+        BenchmarkConfig,
+        DatasetConfig,
+        GenerativeModelOutput,
+        ModelConfig,
+    )
 
 logger = logging.getLogger("euroeval")
 
@@ -163,6 +168,7 @@ def generate_single_iteration(
     if benchmark_config.debug:
         debug_log(
             batch=batch,
+            model_output=model_output,
             extracted_labels=extracted_labels,  # type: ignore[arg-type]
             dataset_config=dataset_config,
         )
@@ -217,6 +223,7 @@ def generate_single_iteration(
 
 def debug_log(
     batch: dict[str, t.Any],
+    model_output: "GenerativeModelOutput",
     extracted_labels: list[dict | str | list[str]],
     dataset_config: "DatasetConfig",
 ) -> None:
@@ -225,6 +232,8 @@ def debug_log(
     Args:
         batch:
             The batch of examples to evaluate on.
+        model_output:
+            The output of the model.
         extracted_labels:
             The extracted labels from the model output.
         dataset_config:
@@ -290,7 +299,12 @@ def debug_log(
     else:
         input_texts = batch["text"]
 
-    for input_text, prediction, label in zip(input_texts, extracted_labels, labels):
+    for input_text, raw_output, prediction, label in zip(
+        input_texts, model_output.sequences, extracted_labels, labels
+    ):
         logger.info(
-            f"Input: '{input_text}'\nPrediction: '{prediction}'\nLabel: '{label}'"
+            f"Input: '{input_text}'\n"
+            f"Raw outout: '{raw_output}'\n"
+            f"Prediction: '{prediction}'\n"
+            f"Label: '{label}'"
         )
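
The change above threads the raw model generations into debug_log, so debug runs log each example's raw output next to the extracted prediction and the gold label. Below is a minimal, self-contained sketch of the resulting logging loop; the simplified GenerativeModelOutput stand-in and the sample data are made up for illustration and are not taken from the package.

import logging
from dataclasses import dataclass, field

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("euroeval")


@dataclass
class GenerativeModelOutput:
    """Simplified stand-in for euroeval.data_models.GenerativeModelOutput."""

    sequences: list[str] = field(default_factory=list)


# Made-up sample data, for illustration only
input_texts = ["Review: 'Great film, would watch again.'"]
model_output = GenerativeModelOutput(sequences=["positive, I think"])
extracted_labels = ["positive"]
labels = ["positive"]

# Mirrors the new debug_log loop: the raw generation is logged next to the
# extracted prediction and the gold label
for input_text, raw_output, prediction, label in zip(
    input_texts, model_output.sequences, extracted_labels, labels
):
    logger.info(
        f"Input: '{input_text}'\n"
        f"Raw output: '{raw_output}'\n"
        f"Prediction: '{prediction}'\n"
        f"Label: '{label}'"
    )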

euroeval/task_utils/sequence_classification.py CHANGED
@@ -162,9 +162,8 @@ def get_closest_logprobs_labels(
     """
     english_labels = list(dataset_config.id2label.values())
     english2local = dataset_config.prompt_label_mapping
-    candidate_labels = [
-        english2local[lbl].lower() for lbl in english_labels
-    ] + english_labels
+    local_labels = [english2local[lbl].lower() for lbl in english_labels]
+    candidate_labels = local_labels + english_labels
 
     output_labels: list[str] = list()
     for sample in generation_logprobs:
@@ -179,18 +178,39 @@ def get_closest_logprobs_labels(
         ]
         generated_labels = [label for label in generated_labels if label != ""]
 
-        # We want to use the first generated label which starts with a candidate
+        # We want to use the first generated label which contains a unique candidate
         # label, as the output label
         output_label: str | None = None
+        previously_generated_labels: list[str] = list()
         for generated_label in generated_labels:
+            generated_label = "".join(previously_generated_labels) + generated_label
+
+            # Get the candidate labels that contain the generated label
             candidate_output_labels = [
                 candidate_label
                 for candidate_label in candidate_labels
-                if candidate_label.startswith(generated_label)
+                if generated_label in candidate_label
             ]
+
+            # If we can uniquely determine the output label, we break the loop.
+            # Since we have both the original local labels as well as the English
+            # versions, we want to have 0 or 1 candidate labels from each set. This
+            # means that ["positive", "positiv"] is fine as they're both referencing
+            # the same label, but ["negativ", "neutral"] is not. In the bad case we
+            # cannot use the scores and we fall back to using the
+            # candidate label with the highest edit distance.
+            at_most_one_english_label = (
+                len(set(candidate_output_labels).intersection(english_labels)) <= 1
+            )
+            at_most_one_local_label = (
+                len(set(candidate_output_labels).intersection(local_labels)) <= 1
+            )
             if candidate_output_labels:
-                output_label = candidate_output_labels[0]
-                break
+                if at_most_one_english_label and at_most_one_local_label:
+                    output_label = candidate_output_labels[0]
+                    break
+                else:
+                    previously_generated_labels.append(generated_label)
 
         if output_label is not None:
             output_label = english2local.get(output_label, output_label)
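
The reworked extraction accumulates generated fragments across iterations and only accepts a match when it points to at most one English and at most one local label, instead of taking the first candidate that merely starts with the fragment. Below is a rough, self-contained sketch of that matching idea; the label sets and the token stream are made-up examples, and the real function additionally works on logprob tuples and, per its comments, falls back to an edit-distance match when no unique candidate is found.

# Hypothetical label sets, for illustration only (not taken from a real dataset config)
english_labels = ["positive", "negative", "neutral"]
local_labels = ["positiv", "negativ", "neutral"]
candidate_labels = local_labels + english_labels

# Made-up generated fragments: a bare "n" is ambiguous, "n" + "eg" is not
generated_labels = ["n", "eg"]

output_label: str | None = None
previously_generated_labels: list[str] = []
for generated_label in generated_labels:
    # Accumulate earlier ambiguous fragments before matching again
    generated_label = "".join(previously_generated_labels) + generated_label

    # Candidate labels that contain the accumulated fragment
    candidate_output_labels = [
        candidate for candidate in candidate_labels if generated_label in candidate
    ]

    # Accept only if the fragment picks at most one label from each set
    at_most_one_english = len(set(candidate_output_labels) & set(english_labels)) <= 1
    at_most_one_local = len(set(candidate_output_labels) & set(local_labels)) <= 1
    if candidate_output_labels:
        if at_most_one_english and at_most_one_local:
            output_label = candidate_output_labels[0]
            break
        previously_generated_labels.append(generated_label)

print(output_label)  # "negativ": "n" alone matched several labels, "neg" is unique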

euroeval-15.4.0.dist-info/METADATA → euroeval-15.4.1.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.4.0
+Version: 15.4.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,12 +61,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
 Requires-Dist: outlines>=0.1.11; extra == 'all'
-Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: vllm!=0.8.1,>=0.8.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: outlines>=0.1.11; extra == 'generative'
-Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: vllm!=0.8.1,>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test

euroeval-15.4.0.dist-info/RECORD → euroeval-15.4.1.dist-info/RECORD RENAMED
@@ -10,7 +10,7 @@ euroeval/dataset_configs.py,sha256=bjMUXvaEtTpo1Eql_mIRCG3K_lB2DZRdPWEAwR5N4ig,9
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
 euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
-euroeval/generation.py,sha256=UZ9nmKl4rbNBhW41iwpgw_tqfsEfe1UhOnjGudz9GWs,10382
+euroeval/generation.py,sha256=dohSPYc4eASm5tJhNKfBlpJnellKG7nVeyx8yXXxMlE,10721
 euroeval/human_evaluation.py,sha256=5uOm8cZf5uy2jBPs-ih7g8ni-a3hUz8UiXVPh6PzUWw,27675
 euroeval/languages.py,sha256=d1SyG0KVtCAA_PYpFGZCgZcyVLIr7Q8uYKPxNw6WEBc,7909
 euroeval/model_cache.py,sha256=BhkyWrOhjskESbndy218LUv1ZiWRc48ScdH_42dKHtE,8275
@@ -30,11 +30,11 @@ euroeval/benchmark_modules/vllm.py,sha256=5N2ytLR9cZIcPeza-ERQWwyvehDd0F1FUvXY3c
 euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
 euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
-euroeval/task_utils/sequence_classification.py,sha256=FrkvFzxFSnZoXThgpQqvJCIy3_YemyqZFQ1L-YdMMiw,8527
+euroeval/task_utils/sequence_classification.py,sha256=bIsbAj123hEyW40QeSUW8Dpc2SyI3ZPCGexapr9qqjw,9826
 euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
 euroeval/task_utils/token_classification.py,sha256=aW2GGk-dqa7lioIsHirVgD8AMrQEAnVasmjEWQ4xu7w,17778
-euroeval-15.4.0.dist-info/METADATA,sha256=HfNWsANdb8TJAyK__QPBhs7O5qsQp9G_gPlhVVNuK9c,10724
-euroeval-15.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-euroeval-15.4.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
-euroeval-15.4.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
-euroeval-15.4.0.dist-info/RECORD,,
+euroeval-15.4.1.dist-info/METADATA,sha256=OdTP-FAbbF9vUV3OTeV5Y-B6P7FXN2bAalG903ny8hU,10740
+euroeval-15.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.4.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.4.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.4.1.dist-info/RECORD,,