EuroEval: 16.3.0-py3-none-any.whl → 16.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +3 -2
- euroeval/benchmark_config_factory.py +0 -4
- euroeval/benchmark_modules/base.py +3 -16
- euroeval/benchmark_modules/fresh.py +2 -1
- euroeval/benchmark_modules/hf.py +99 -62
- euroeval/benchmark_modules/litellm.py +101 -41
- euroeval/benchmark_modules/vllm.py +91 -83
- euroeval/benchmarker.py +84 -78
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/constants.py +6 -0
- euroeval/data_loading.py +14 -11
- euroeval/data_models.py +12 -4
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/czech.py +79 -0
- euroeval/dataset_configs/danish.py +10 -11
- euroeval/dataset_configs/dutch.py +0 -1
- euroeval/dataset_configs/english.py +0 -1
- euroeval/dataset_configs/estonian.py +11 -1
- euroeval/dataset_configs/finnish.py +0 -1
- euroeval/dataset_configs/french.py +0 -1
- euroeval/dataset_configs/german.py +0 -1
- euroeval/dataset_configs/italian.py +0 -1
- euroeval/dataset_configs/latvian.py +0 -1
- euroeval/dataset_configs/lithuanian.py +9 -3
- euroeval/dataset_configs/norwegian.py +0 -1
- euroeval/dataset_configs/polish.py +0 -1
- euroeval/dataset_configs/portuguese.py +0 -1
- euroeval/dataset_configs/slovak.py +60 -0
- euroeval/dataset_configs/spanish.py +0 -1
- euroeval/dataset_configs/swedish.py +10 -12
- euroeval/finetuning.py +21 -15
- euroeval/generation.py +10 -10
- euroeval/generation_utils.py +2 -3
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +9 -5
- euroeval/metrics/llm_as_a_judge.py +5 -3
- euroeval/metrics/pipeline.py +17 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +11 -14
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/linguistic_acceptability.py +21 -3
- euroeval/prompt_templates/multiple_choice.py +25 -1
- euroeval/prompt_templates/named_entity_recognition.py +51 -11
- euroeval/prompt_templates/reading_comprehension.py +31 -3
- euroeval/prompt_templates/sentiment_classification.py +23 -1
- euroeval/prompt_templates/summarization.py +26 -6
- euroeval/scores.py +7 -7
- euroeval/speed_benchmark.py +3 -5
- euroeval/task_group_utils/multiple_choice_classification.py +0 -3
- euroeval/task_group_utils/question_answering.py +0 -3
- euroeval/task_group_utils/sequence_classification.py +43 -31
- euroeval/task_group_utils/text_to_text.py +17 -8
- euroeval/task_group_utils/token_classification.py +10 -9
- euroeval/tokenisation_utils.py +14 -12
- euroeval/utils.py +29 -146
- {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/METADATA +4 -4
- euroeval-16.4.0.dist-info/RECORD +75 -0
- euroeval-16.3.0.dist-info/RECORD +0 -71
- {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/dataset_configs/czech.py
ADDED
@@ -0,0 +1,79 @@
+"""All Czech dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import CS
+from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+CSFD_SENTIMENT_CONFIG = DatasetConfig(
+    name="csfd-sentiment",
+    pretty_name="the truncated version of the Czech sentiment classification dataset "
+    "CSFD Sentiment",
+    huggingface_id="EuroEval/csfd-sentiment-mini",
+    task=SENT,
+    languages=[CS],
+)
+
+CS_GEC_CONFIG = DatasetConfig(
+    name="cs-gec",
+    pretty_name="the truncated version of the Czech linguistic acceptability dataset "
+    "CS-GEC",
+    huggingface_id="EuroEval/cs-gec-mini",
+    task=LA,
+    languages=[CS],
+)
+
+PONER_CONFIG = DatasetConfig(
+    name="poner",
+    pretty_name="the truncated version of the Czech named entity recognition dataset "
+    "PONER",
+    huggingface_id="EuroEval/poner-mini",
+    task=NER,
+    languages=[CS],
+)
+
+SQAD_CONFIG = DatasetConfig(
+    name="sqad",
+    pretty_name="the truncated version of the Czech reading comprehension dataset SQAD",
+    huggingface_id="EuroEval/sqad-mini",
+    task=RC,
+    languages=[CS],
+)
+
+CZECH_NEWS_CONFIG = DatasetConfig(
+    name="czech-news",
+    pretty_name="the truncated version of the Czech summarisation dataset",
+    huggingface_id="EuroEval/czech-news-mini",
+    task=SUMM,
+    languages=[CS],
+)
+
+UMIMETO_QA_CONFIG = DatasetConfig(
+    name="umimeto-qa",
+    pretty_name="the Czech knowledge dataset UmimetoQA",
+    huggingface_id="EuroEval/umimeto-qa",
+    task=KNOW,
+    languages=[CS],
+)
+
+HELLASWAG_CS_CONFIG = DatasetConfig(
+    name="hellaswag-cs",
+    pretty_name="the truncated version of the Czech common-sense reasoning dataset "
+    "HellaSwag-cs, translated from the English HellaSwag dataset",
+    huggingface_id="EuroEval/hellaswag-cs-mini",
+    task=COMMON_SENSE,
+    languages=[CS],
+)
+
+
+### Unofficial datasets ###
+
+SCALA_CS_CONFIG = DatasetConfig(
+    name="scala-cs",
+    pretty_name="the Czech part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-cs",
+    task=LA,
+    languages=[CS],
+    unofficial=True,
+)
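For context: each of the new Czech configurations above is selected by its `name` field, and euroeval/dataset_configs/__init__.py (+2 lines in this release) presumably registers the new czech and slovak modules. A minimal usage sketch, assuming the `Benchmarker` entry point from euroeval/benchmarker.py accepts `model` and `dataset` arguments (its exact signature is not part of this diff):

# Hypothetical usage sketch; the Benchmarker call signature is assumed,
# not shown in this diff.
from euroeval import Benchmarker

benchmarker = Benchmarker()

# "csfd-sentiment" and "sqad" are the `name` fields of two of the new
# Czech dataset configurations added above; the model ID is only an example.
benchmarker.benchmark(
    model="google/gemma-2-2b-it",
    dataset=["csfd-sentiment", "sqad"],
)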
euroeval/dataset_configs/danish.py
CHANGED
@@ -32,11 +32,11 @@ DANSK_CONFIG = DatasetConfig(
     languages=[DA],
 )
 
-
-    name="
-    pretty_name="the
-    "dataset
-    huggingface_id="EuroEval/
+MULTI_WIKI_QA_DA_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-da",
+    pretty_name="the truncated version of the Danish part of the reading "
+    "comprehension dataset MultiWikiQA",
+    huggingface_id="EuroEval/multi-wiki-qa-da-mini",
     task=RC,
     languages=[DA],
 )
@@ -129,11 +129,11 @@ BELEBELE_DA_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
-
-    name="
-    pretty_name="the
-    "
-    huggingface_id="EuroEval/
+SCANDIQA_DA_CONFIG = DatasetConfig(
+    name="scandiqa-da",
+    pretty_name="the Danish part of the truncated version of the question answering "
+    "dataset ScandiQA",
+    huggingface_id="EuroEval/scandiqa-da-mini",
     task=RC,
     languages=[DA],
     unofficial=True,
@@ -156,7 +156,6 @@ WINOGRANDE_DA_CONFIG = DatasetConfig(
     huggingface_id="EuroEval/winogrande-da",
     task=COMMON_SENSE,
     languages=[DA],
-    splits=["train", "test"],
     _labels=["a", "b"],
     unofficial=True,
 )
euroeval/dataset_configs/estonian.py
CHANGED
@@ -94,10 +94,20 @@ SCALA_ET_CONFIG = DatasetConfig(
 
 EXAM_ET_CONFIG = DatasetConfig(
     name="exam-et",
-    pretty_name="the Estonian knowledge
+    pretty_name="the Estonian knowledge dataset Exam-et",
     huggingface_id="EuroEval/exam-et",
     task=KNOW,
     languages=[ET],
     _labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"],
     unofficial=True,
 )
+
+MMLU_ET_CONFIG = DatasetConfig(
+    name="mmlu-et",
+    pretty_name="the truncated version of the Estonian knowledge dataset MMLU-et, "
+    "translated from the English MMLU dataset",
+    huggingface_id="EuroEval/mmlu-et-mini",
+    task=KNOW,
+    languages=[ET],
+    unofficial=True,
+)
euroeval/dataset_configs/lithuanian.py
CHANGED
@@ -2,7 +2,7 @@
 
 from ..data_models import DatasetConfig
 from ..languages import LT
-from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
+from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
 
 ### Official datasets ###
 
@@ -41,13 +41,20 @@ MULTI_WIKI_QA_LT_CONFIG = DatasetConfig(
     languages=[LT],
 )
 
+LRYTAS_CONFIG = DatasetConfig(
+    name="lrytas",
+    pretty_name="the truncated version of the Lithuanian summarisation dataset Lrytas",
+    huggingface_id="EuroEval/lrytas-mini",
+    task=SUMM,
+    languages=[LT],
+)
+
 LT_HISTORY_CONFIG = DatasetConfig(
     name="lt-history",
     pretty_name="the Lithuanian knowledge dataset LT-History",
     huggingface_id="EuroEval/lt-history",
     task=KNOW,
     languages=[LT],
-    splits=["train", "test"],
 )
 
 WINOGRANDE_LT_CONFIG = DatasetConfig(
@@ -57,6 +64,5 @@ WINOGRANDE_LT_CONFIG = DatasetConfig(
     huggingface_id="EuroEval/winogrande-lt",
     task=COMMON_SENSE,
     languages=[LT],
-    splits=["train", "test"],
     _labels=["a", "b"],
 )
euroeval/dataset_configs/slovak.py
ADDED
@@ -0,0 +1,60 @@
+"""All Slovak dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import SK
+from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
+
+### Official datasets ###
+
+CSFD_SENTIMENT_SK_CONFIG = DatasetConfig(
+    name="csfd-sentiment-sk",
+    pretty_name="the truncated version of the Slovak sentiment classification dataset "
+    "CSFD-sentiment-sk",
+    huggingface_id="EuroEval/csfd-sentiment-sk-mini",
+    task=SENT,
+    languages=[SK],
+)
+
+SCALA_SK_CONFIG = DatasetConfig(
+    name="scala-sk",
+    pretty_name="the Slovak part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-sk",
+    task=LA,
+    languages=[SK],
+)
+
+UNER_SK_CONFIG = DatasetConfig(
+    name="uner-sk",
+    pretty_name="the truncated version of the Slovak named entity recognition dataset "
+    "UNER-sk",
+    huggingface_id="EuroEval/uner-sk-mini",
+    task=NER,
+    languages=[SK],
+)
+
+MULTI_WIKI_QA_SK_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-sk",
+    pretty_name="the truncated version of the Slovak part of the reading comprehension "
+    "dataset MultiWikiQA",
+    huggingface_id="EuroEval/multi-wiki-qa-sk-mini",
+    task=RC,
+    languages=[SK],
+)
+
+MMLU_SK_CONFIG = DatasetConfig(
+    name="mmlu-sk",
+    pretty_name="the truncated version of the Slovak knowledge dataset MMLU-sk, "
+    "translated from the English MMLU dataset",
+    huggingface_id="EuroEval/mmlu-sk-mini",
+    task=KNOW,
+    languages=[SK],
+)
+
+WINOGRANDE_SK_CONFIG = DatasetConfig(
+    name="winogrande-sk",
+    pretty_name="the Slovak common-sense reasoning dataset Winogrande-sk, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-sk",
+    task=COMMON_SENSE,
+    languages=[SK],
+)
euroeval/dataset_configs/swedish.py
CHANGED
@@ -32,11 +32,11 @@ SUC3_CONFIG = DatasetConfig(
     languages=[SV],
 )
 
-
-    name="
-    pretty_name="the
-    "dataset
-    huggingface_id="EuroEval/
+MULTI_WIKI_QA_SV_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-sv",
+    pretty_name="the truncated version of the Swedish part of the reading "
+    "comprehension dataset MultiWikiQA",
+    huggingface_id="EuroEval/multi-wiki-qa-sv-mini",
     task=RC,
     languages=[SV],
 )
@@ -110,11 +110,11 @@ BELEBELE_SV_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
-
-    name="
-    pretty_name="the
-    "
-    huggingface_id="EuroEval/
+SCANDIQA_SV_CONFIG = DatasetConfig(
+    name="scandiqa-sv",
+    pretty_name="the Swedish part of the truncated version of the question answering "
+    "dataset ScandiQA",
+    huggingface_id="EuroEval/scandiqa-sv-mini",
     task=RC,
     languages=[SV],
     unofficial=True,
@@ -137,7 +137,6 @@ WINOGRANDE_SV_CONFIG = DatasetConfig(
     huggingface_id="EuroEval/winogrande-sv",
     task=COMMON_SENSE,
     languages=[SV],
-    splits=["train", "test"],
     _labels=["a", "b"],
     unofficial=True,
 )
@@ -174,6 +173,5 @@ SKOLPROV_CONFIG = DatasetConfig(
     huggingface_id="EuroEval/skolprov",
     task=KNOW,
     languages=[SV],
-    splits=["train", "test"],
     unofficial=True,
 )
euroeval/finetuning.py
CHANGED
@@ -6,7 +6,6 @@ import typing as t
 from functools import partial
 
 import torch
-from tqdm.auto import tqdm
 from transformers.trainer_callback import (
     EarlyStoppingCallback,
     PrinterCallback,
@@ -18,13 +17,9 @@ from transformers.training_args import OptimizerNames, TrainingArguments
 from .callbacks import NeverLeaveProgressCallback
 from .enums import DataType
 from .exceptions import InvalidBenchmark, NaNValueInModelOutput
+from .logging_utils import block_terminal_output, get_pbar, log, log_once
 from .model_loading import load_model
-from .utils import
-    block_terminal_output,
-    clear_memory,
-    enforce_reproducibility,
-    log_once,
-)
+from .utils import clear_memory, enforce_reproducibility
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -32,8 +27,6 @@ if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def finetune(
     model: "BenchmarkModule",
@@ -58,6 +51,10 @@
 
     Returns:
         A list of dicts containing the scores for each metric for each iteration.
+
+    Raises:
+        InvalidBenchmark:
+            If the benchmark could not be completed.
     """
     # Set the data type to use for the model weights
     using_cuda = benchmark_config.device == torch.device("cuda")
@@ -70,7 +67,7 @@
 
     bs: int = benchmark_config.batch_size
     scores: list[dict[str, float]] = list()
-    for idx in
+    for idx in get_pbar(
         iterable=range(benchmark_config.num_iterations),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -80,7 +77,7 @@
         model_already_initialized = idx == 0
 
         # Run a loop here to deal with automatic reduction of batch size
-
+        for _ in range(num_attempts := 10):
             # Clear GPU memory
             if not model_already_initialized:
                 try:
@@ -112,7 +109,10 @@
                 )
 
                 scores.append(itr_scores)
-
+                log(
+                    f"Test scores for iteration {idx}: {itr_scores}",
+                    level=logging.DEBUG,
+                )
 
                 break
 
@@ -123,9 +123,10 @@
                 if dtype != DataType.FP32:
                     dtype = DataType.FP32
                     model_already_initialized = False
-
+                    log(
                         "NaN value detected in model outputs while using mixed "
-                        "precision. Retrying with full fp32 precision."
+                        "precision. Retrying with full fp32 precision.",
+                        level=logging.DEBUG,
                     )
                 else:
                     raise InvalidBenchmark(
@@ -151,7 +152,12 @@
                 model_already_initialized = False
 
                 bs //= 2
-
+                log(f"Reduced batch size to {bs}", level=logging.DEBUG)
+
+        else:
+            raise InvalidBenchmark(
+                f"Could not benchmark the model after {num_attempts} attempts!"
+            )
 
     return scores
 
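The `get_pbar`, `log`, and `log_once` helpers used above come from the new euroeval/logging_utils.py (+250 lines), which is not included in this diff. Judging only from the call sites visible here, a rough sketch of what such helpers could look like (everything beyond the visible calls is an assumption):

# Rough sketch inferred from the call sites in this diff; the real
# euroeval/logging_utils.py is not shown here and will differ.
import logging
import typing as t

from tqdm.auto import tqdm

logger = logging.getLogger("euroeval")


def log(message: str, level: int = logging.INFO) -> None:
    """Log a message on the shared 'euroeval' logger at the given level."""
    logger.log(level=level, msg=message)


def get_pbar(
    iterable: t.Iterable, desc: str | None = None, disable: bool = False
) -> tqdm:
    """Wrap an iterable in a tqdm progress bar with shared defaults."""
    return tqdm(iterable=iterable, desc=desc, disable=disable, leave=False)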
euroeval/generation.py
CHANGED
@@ -11,12 +11,13 @@ from tqdm.auto import tqdm
 
 from .enums import BatchingPreference, TaskGroup
 from .exceptions import InvalidBenchmark
+from .logging_utils import get_pbar, log, log_once
 from .model_cache import (
     ModelCache,
     load_cached_model_outputs,
     split_dataset_into_cached_and_non_cached,
 )
-from .utils import clear_memory
+from .utils import clear_memory
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -29,8 +30,6 @@ if t.TYPE_CHECKING:
         ModelConfig,
     )
 
-logger = logging.getLogger("euroeval")
-
 
 def generate(
     model: "BenchmarkModule",
@@ -78,7 +77,7 @@
     )
 
     scores: list[dict[str, float]] = list()
-    for idx in
+    for idx in get_pbar(
        iterable=range(len(datasets)),
        desc="Benchmarking",
        disable=not benchmark_config.progress_bar,
@@ -90,7 +89,7 @@
            dataset_config=dataset_config,
            benchmark_config=benchmark_config,
        )
-
+        log(f"Test scores for iteration {idx}: {test_scores}", level=logging.DEBUG)
         scores.append(test_scores)
         clear_memory()
 
@@ -142,14 +141,14 @@ def generate_single_iteration(
     itr: t.Iterable
     match model.batching_preference:
         case BatchingPreference.SINGLE_SAMPLE:
-            itr =
+            itr = get_pbar(iterable=non_cached_dataset)
         case BatchingPreference.ALL_AT_ONCE:
             itr = [non_cached_dataset[:]]
         case _:
             num_batches = len(non_cached_dataset) // benchmark_config.batch_size
             if len(non_cached_dataset) % benchmark_config.batch_size != 0:
                 num_batches += 1
-            itr =
+            itr = get_pbar(
                 iterable=mit.batched(
                     iterable=non_cached_dataset, n=benchmark_config.batch_size
                 ),
@@ -297,7 +296,7 @@
                 + "\n"
                 + "\t".join(labels)
             )
-
+            log("\n\n".join(log_msgs), level=logging.DEBUG)
             return
 
         case (
@@ -347,6 +346,7 @@
             if labels[idx]:
                 data_to_log["Label"] = labels[idx]
             data_to_log |= {key.capitalize(): batch[key][idx] for key in metadata_keys}
-
-                "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items())
+            log(
+                "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items()),
+                level=logging.DEBUG,
             )
euroeval/generation_utils.py
CHANGED
@@ -9,8 +9,9 @@ import typing as t
 
 from .enums import GenerativeType, TaskGroup
 from .exceptions import InvalidBenchmark, InvalidModel
+from .logging_utils import log_once
 from .tokenisation_utils import apply_chat_template
-from .utils import extract_multiple_choice_labels
+from .utils import extract_multiple_choice_labels
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -18,8 +19,6 @@ if t.TYPE_CHECKING:
 
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def extract_few_shot_examples(
     dataset: "DatasetDict",