EuroEval 15.10.1__py3-none-any.whl → 15.11.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- euroeval/__init__.py +7 -0
- euroeval/benchmark_modules/base.py +29 -29
- euroeval/benchmark_modules/fresh.py +31 -19
- euroeval/benchmark_modules/hf.py +27 -23
- euroeval/benchmark_modules/litellm.py +50 -30
- euroeval/benchmark_modules/vllm.py +21 -25
- euroeval/benchmarker.py +1 -1
- euroeval/callbacks.py +17 -13
- euroeval/data_loading.py +10 -5
- euroeval/data_models.py +2 -40
- euroeval/dataset_configs/english.py +13 -4
- euroeval/dataset_configs/norwegian.py +8 -0
- euroeval/finetuning.py +9 -8
- euroeval/generation.py +5 -4
- euroeval/generation_utils.py +1 -0
- euroeval/human_evaluation.py +13 -13
- euroeval/metrics.py +452 -0
- euroeval/scores.py +14 -19
- euroeval/speed_benchmark.py +6 -7
- euroeval/task_group_utils/multiple_choice_classification.py +6 -4
- euroeval/task_group_utils/question_answering.py +5 -28
- euroeval/task_group_utils/sequence_classification.py +6 -30
- euroeval/task_group_utils/text_to_text.py +19 -34
- euroeval/task_group_utils/token_classification.py +18 -30
- euroeval/tasks.py +11 -136
- euroeval/types.py +6 -4
- {euroeval-15.10.1.dist-info → euroeval-15.11.0.dist-info}/METADATA +10 -10
- {euroeval-15.10.1.dist-info → euroeval-15.11.0.dist-info}/RECORD +31 -30
- {euroeval-15.10.1.dist-info → euroeval-15.11.0.dist-info}/licenses/LICENSE +1 -1
- {euroeval-15.10.1.dist-info → euroeval-15.11.0.dist-info}/WHEEL +0 -0
- {euroeval-15.10.1.dist-info → euroeval-15.11.0.dist-info}/entry_points.txt +0 -0
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -13,14 +13,11 @@ from pathlib import Path
 from time import sleep

 import torch
-from datasets import DatasetDict
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
 from tqdm.auto import tqdm
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError

 from ..constants import (
@@ -34,13 +31,7 @@ from ..constants import (
     TASKS_USING_JSON,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
-from ..data_models import (
-    BenchmarkConfig,
-    DatasetConfig,
-    GenerativeModelOutput,
-    ModelConfig,
-    Task,
-)
+from ..data_models import GenerativeModelOutput, ModelConfig
 from ..enums import (
     BatchingPreference,
     GenerativeType,
@@ -94,6 +85,13 @@ if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
     import ray

+if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.trainer import Trainer
+
+    from ..data_models import BenchmarkConfig, DatasetConfig, Task
+
 logger = logging.getLogger("euroeval")


@@ -106,9 +104,9 @@ class VLLMModel(HuggingFaceEncoderModel):

     def __init__(
         self,
-        model_config: ModelConfig,
-        dataset_config: DatasetConfig,
-        benchmark_config: BenchmarkConfig,
+        model_config: "ModelConfig",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
     ) -> None:
         """Initialise the vLLM model.

@@ -129,8 +127,8 @@ class VLLMModel(HuggingFaceEncoderModel):
         model, tokenizer = load_model_and_tokenizer(
             model_config=model_config, benchmark_config=benchmark_config
         )
-        self._model: LLM = model
-        self._tokenizer: PreTrainedTokenizer = tokenizer
+        self._model: "LLM" = model
+        self._tokenizer: "PreTrainedTokenizer" = tokenizer
         self.end_of_reasoning_token = get_end_of_reasoning_token(
             model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
         )
@@ -230,8 +228,8 @@ class VLLMModel(HuggingFaceEncoderModel):
         )

     def prepare_dataset(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> DatasetDict:
+        self, dataset: "DatasetDict", task: "Task", itr_idx: int
+    ) -> "DatasetDict":
         """Prepare the dataset for the model.

         This includes things like tokenisation.
@@ -293,7 +291,7 @@ class VLLMModel(HuggingFaceEncoderModel):

         return dataset

-    def generate(self, inputs: dict) -> GenerativeModelOutput:
+    def generate(self, inputs: dict) -> "GenerativeModelOutput":
         """Generate outputs from the model.

         Args:
@@ -524,7 +522,7 @@ class VLLMModel(HuggingFaceEncoderModel):

     @classmethod
     def model_exists(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
     ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
         """Check if a model exists.

@@ -558,8 +556,8 @@ class VLLMModel(HuggingFaceEncoderModel):

     @classmethod
     def get_model_config(
-        cls, model_id: str, benchmark_config: BenchmarkConfig
-    ) -> ModelConfig:
+        cls, model_id: str, benchmark_config: "BenchmarkConfig"
+    ) -> "ModelConfig":
         """Fetch the model configuration.

         Args:
@@ -628,8 +626,8 @@ class VLLMModel(HuggingFaceEncoderModel):


 def load_model_and_tokenizer(
-    model_config: ModelConfig, benchmark_config: BenchmarkConfig
-) -> tuple[LLM, PreTrainedTokenizer]:
+    model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
+) -> tuple["LLM", "PreTrainedTokenizer"]:
     """Load the model and tokenizer.

     Args:
@@ -1017,7 +1015,6 @@ def get_custom_stop_tokens(
     """
     candidate_stop_tokens = CUSTOM_STOP_TOKENS

-    # Create a prompt to check if the model uses the reasoning tokens
     prompt = "Hello"
     if tokenizer.chat_template is not None:
         templated_prompt = tokenizer.apply_chat_template(
@@ -1028,7 +1025,6 @@ def get_custom_stop_tokens(
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt

-    # Check that the beginning-of-reasoning token is actually used by the model
     max_tokens = REASONING_MAX_TOKENS if is_reasoning_model else 10
     completion = (
         model.generate(
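Most of the changes in this file, and in the modules below, follow one pattern: heavyweight imports are moved under `typing.TYPE_CHECKING` and the corresponding annotations become quoted forward references, so the symbols are only imported by type checkers rather than at runtime. A minimal sketch of that pattern, using a hypothetical `heavy_library` module rather than EuroEval's own imports:

```python
import typing as t

if t.TYPE_CHECKING:
    # Only evaluated by static type checkers (mypy, pyright), never at runtime,
    # so importing an expensive module here adds no import-time cost.
    from heavy_library import HeavyModel  # hypothetical module and class


def run(model: "HeavyModel") -> str:
    # The quoted annotation is a forward reference: type checkers resolve it,
    # but at runtime it stays a plain string and HeavyModel need not exist.
    return type(model).__name__


print(run.__annotations__["model"])  # 'HeavyModel' -- still just a string at runtime
```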
euroeval/benchmarker.py
CHANGED
@@ -767,7 +767,7 @@ class Benchmarker:

         results = log_scores(
             dataset_name=dataset_config.pretty_name,
-
+            metrics=dataset_config.task.metrics,
             scores=scores,
             model_id=model_config.model_id,
             model_revision=model_config.revision,
euroeval/callbacks.py
CHANGED
@@ -1,12 +1,16 @@
 """Callbacks for the Hugging Face Trainer."""

 import sys
+import typing as t
 from collections.abc import Sized

-from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-from transformers.trainer_callback import ProgressCallback
-
+from transformers.trainer_callback import ProgressCallback
+
+if t.TYPE_CHECKING:
+    from torch.utils.data import DataLoader
+    from transformers.trainer_callback import TrainerControl, TrainerState
+    from transformers.training_args import TrainingArguments


 class NeverLeaveProgressCallback(ProgressCallback):
@@ -20,9 +24,9 @@ class NeverLeaveProgressCallback(ProgressCallback):

     def on_train_begin(
         self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
+        args: "TrainingArguments",
+        state: "TrainerState",
+        control: "TrainerControl",
         **kwargs: str,
     ) -> None:
         """Callback actions when training begins."""
@@ -38,9 +42,9 @@ class NeverLeaveProgressCallback(ProgressCallback):

     def on_step_end(
         self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
+        args: "TrainingArguments",
+        state: "TrainerState",
+        control: "TrainerControl",
         **kwargs: str,
     ) -> None:
         """Callback actions when a training step ends."""
@@ -50,10 +54,10 @@ class NeverLeaveProgressCallback(ProgressCallback):

     def on_prediction_step(
         self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        eval_dataloader: DataLoader | None = None,
+        args: "TrainingArguments",
+        state: "TrainerState",
+        control: "TrainerControl",
+        eval_dataloader: "DataLoader | None" = None,
         **kwargs: str,
     ) -> None:
         """Callback actions when a prediction step ends."""
euroeval/data_loading.py
CHANGED
@@ -3,23 +3,28 @@
 import logging
 import sys
 import time
+import typing as t

 import requests
-from datasets import
+from datasets import DatasetDict, load_dataset
 from datasets.exceptions import DatasetsError
 from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator

-from .data_models import BenchmarkConfig, DatasetConfig
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
 from .utils import unscramble

+if t.TYPE_CHECKING:
+    from datasets import Dataset
+
+    from .data_models import BenchmarkConfig, DatasetConfig
+
 logger = logging.getLogger("euroeval")


 def load_data(
     rng: Generator, dataset_config: "DatasetConfig", benchmark_config: "BenchmarkConfig"
-) -> list[DatasetDict]:
+) -> list["DatasetDict"]:
     """Load the raw bootstrapped datasets.

     Args:
@@ -56,7 +61,7 @@ def load_data(
         dataset["test"] = dataset["test"].select(range(1))

     # Bootstrap the splits
-    bootstrapped_splits: dict[str, list[Dataset]] = dict()
+    bootstrapped_splits: dict[str, list["Dataset"]] = dict()
     for split in ["train", "val", "test"]:
         bootstrap_indices = rng.integers(
             0,
@@ -80,7 +85,7 @@ def load_data(
     return datasets


-def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> DatasetDict:
+def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
     """Load the raw dataset.

     Args:
euroeval/data_models.py
CHANGED
@@ -1,6 +1,5 @@
 """Data models used in EuroEval."""

-import collections.abc as c
 import json
 import pathlib
 import re
@@ -11,48 +10,11 @@ import pydantic
 import torch

 from .enums import Device, InferenceBackend, ModelType, TaskGroup
+from .metrics import Metric
 from .types import ScoreDict
 from .utils import get_package_version


-@dataclass
-class MetricConfig:
-    """Configuration for a metric.
-
-    Attributes:
-        name:
-            The name of the metric.
-        pretty_name:
-            A longer prettier name for the metric, which allows cases and spaces. Used
-            for logging.
-        huggingface_id:
-            The Hugging Face ID of the metric.
-        results_key:
-            The name of the key used to extract the metric scores from the results
-            dictionary.
-        compute_kwargs:
-            Keyword arguments to pass to the metric's compute function. Defaults to
-            an empty dictionary.
-        postprocessing_fn:
-            A function to apply to the metric scores after they are computed, taking
-            the score to the postprocessed score along with its string representation.
-            Defaults to x -> (100 * x, f"{x:.2%}").
-    """
-
-    name: str
-    pretty_name: str
-    huggingface_id: str
-    results_key: str
-    compute_kwargs: dict[str, t.Any] = field(default_factory=dict)
-    postprocessing_fn: c.Callable[[float], tuple[float, str]] = field(
-        default_factory=lambda: lambda raw_score: (100 * raw_score, f"{raw_score:.2%}")
-    )
-
-    def __hash__(self) -> int:
-        """Return a hash of the metric configuration."""
-        return hash(self.name)
-
-
 @dataclass
 class Language:
     """A benchmarkable language.
@@ -147,7 +109,7 @@ class Task:
     name: str
     task_group: TaskGroup
     template_dict: dict["Language", "PromptConfig"]
-    metrics: list[MetricConfig]
+    metrics: list[Metric]
     default_num_few_shot_examples: int
     default_max_generated_tokens: int
     default_labels: list[str]
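The removed `MetricConfig` dataclass is superseded by a `Metric` class in the new `euroeval/metrics.py` module (listed as +452 lines in the file summary above); its exact interface is not visible in this diff. What is visible is the default post-processing being removed here, which maps a raw score in [0, 1] to a percentage plus a display string. A minimal sketch of just that convention, with a hypothetical function name:

```python
def default_postprocessing(raw_score: float) -> tuple[float, str]:
    """Mirror of the removed default: scale to a percentage and format for logging."""
    return 100 * raw_score, f"{raw_score:.2%}"


score, pretty = default_postprocessing(0.75)
print(score, pretty)  # 75.0 75.00%
```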
euroeval/dataset_configs/english.py
CHANGED

@@ -49,10 +49,10 @@ CNN_DAILYMAIL_CONFIG = DatasetConfig(
     languages=[EN],
 )

-
-    name="
-    pretty_name="the
-    huggingface_id="EuroEval/
+LIFE_IN_THE_UK_CONFIG = DatasetConfig(
+    name="life-in-the-uk",
+    pretty_name="the English knowledge dataset Life in the UK",
+    huggingface_id="EuroEval/life-in-the-uk",
     task=KNOW,
     languages=[EN],
 )
@@ -86,3 +86,12 @@ BELEBELE_CONFIG = DatasetConfig(
     languages=[EN],
     unofficial=True,
 )
+
+MMLU_CONFIG = DatasetConfig(
+    name="mmlu",
+    pretty_name="the truncated version of the English knowledge dataset MMLU",
+    huggingface_id="EuroEval/mmlu-mini",
+    task=KNOW,
+    languages=[EN],
+    unofficial=True,
+)
euroeval/dataset_configs/norwegian.py
CHANGED

@@ -76,6 +76,14 @@ NRK_QUIZ_QA_CONFIG = DatasetConfig(
     languages=[NB, NN, NO],
 )

+IDIOMS_NO_CONFIG = DatasetConfig(
+    name="idioms-no",
+    pretty_name="the Norwegian knowledge dataset Idioms-no",
+    huggingface_id="EuroEval/idioms-no",
+    task=KNOW,
+    languages=[NB, NN, NO],
+)
+
 NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
     name="nor-common-sense-qa",
     pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
euroeval/finetuning.py
CHANGED
@@ -5,7 +5,6 @@ import sys
 import typing as t

 import torch
-from datasets import DatasetDict
 from tqdm.auto import tqdm
 from transformers.trainer_callback import (
     EarlyStoppingCallback,
@@ -15,7 +14,6 @@ from transformers.trainer_callback import (
 from transformers.trainer_utils import IntervalStrategy
 from transformers.training_args import OptimizerNames, TrainingArguments

-from .benchmark_modules import BenchmarkModule
 from .callbacks import NeverLeaveProgressCallback
 from .enums import DataType
 from .exceptions import InvalidBenchmark, NaNValueInModelOutput
@@ -28,14 +26,17 @@ from .utils import (
 )

 if t.TYPE_CHECKING:
+    from datasets import DatasetDict
+
+    from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig

 logger = logging.getLogger("euroeval")


 def finetune(
-    model: BenchmarkModule,
-    datasets: list[DatasetDict],
+    model: "BenchmarkModule",
+    datasets: list["DatasetDict"],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -155,9 +156,9 @@ def finetune(


 def finetune_single_iteration(
-    model: BenchmarkModule | None,
-    dataset: DatasetDict,
-    training_args: TrainingArguments,
+    model: "BenchmarkModule | None",
+    dataset: "DatasetDict",
+    training_args: "TrainingArguments",
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -254,7 +255,7 @@ def get_training_args(
     iteration_idx: int,
     dtype: DataType,
     batch_size: int | None = None,
-) -> TrainingArguments:
+) -> "TrainingArguments":
     """Get the training arguments for the current iteration.

     Args:
euroeval/generation.py
CHANGED
@@ -6,10 +6,8 @@ import typing as t
 from pathlib import Path

 import more_itertools as mit
-from datasets import Dataset, DatasetDict
 from tqdm.auto import tqdm

-from .benchmark_modules import BenchmarkModule
 from .enums import BatchingPreference, TaskGroup
 from .exceptions import InvalidBenchmark
 from .model_cache import (
@@ -20,6 +18,9 @@ from .model_cache import (
 from .utils import clear_memory

 if t.TYPE_CHECKING:
+    from datasets import Dataset, DatasetDict
+
+    from .benchmark_modules import BenchmarkModule
     from .data_models import (
         BenchmarkConfig,
         DatasetConfig,
@@ -32,7 +33,7 @@ logger = logging.getLogger("euroeval")

 def generate(
     model: "BenchmarkModule",
-    datasets: list[DatasetDict],
+    datasets: list["DatasetDict"],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
@@ -100,7 +101,7 @@ def generate(


 def generate_single_iteration(
-    dataset: Dataset,
+    dataset: "Dataset",
     model: "BenchmarkModule",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
euroeval/generation_utils.py
CHANGED
euroeval/human_evaluation.py
CHANGED
@@ -3,6 +3,7 @@
 import importlib.util
 import json
 import logging
+import typing as t
 from collections import defaultdict
 from functools import partial
 from pathlib import Path
@@ -24,13 +25,15 @@ from .task_group_utils import (
     token_classification,
 )
 from .tasks import NER
-from .types import ComputeMetricsFunction, ExtractLabelsFunction, ScoreDict
 from .utils import enforce_reproducibility

 if importlib.util.find_spec("gradio") is not None:
     import gradio as gr
     from gradio.components import HTML, Button, Dropdown, Markdown, Textbox

+if t.TYPE_CHECKING:
+    from .types import ComputeMetricsFunction, ExtractLabelsFunction, ScoreDict
+
 logger = logging.getLogger("euroeval")


@@ -86,8 +89,8 @@ class HumanEvaluator:
             }
         )

-        self.extract_labels_from_generation: ExtractLabelsFunction
-        self.compute_metrics: ComputeMetricsFunction
+        self.extract_labels_from_generation: "ExtractLabelsFunction"
+        self.compute_metrics: "ComputeMetricsFunction"

     def create_app(self) -> "gr.Blocks":
         """Create the Gradio app for human evaluation.
@@ -342,7 +345,6 @@ class HumanEvaluator:
         self.compute_metrics = partial(
             sequence_classification.compute_metrics,
             dataset_config=self.dataset_config,
-            benchmark_config=benchmark_config,
         )
         self.extract_labels_from_generation = partial(
             sequence_classification.extract_labels_from_generation,
@@ -362,7 +364,6 @@ class HumanEvaluator:
             token_classification.compute_metrics,
             has_misc_tags=self.has_misc_tags,
             dataset_config=self.dataset_config,
-            benchmark_config=benchmark_config,
         )
         self.extract_labels_from_generation = partial(
             token_classification.extract_labels_from_generation,
@@ -372,7 +373,6 @@ class HumanEvaluator:
         self.compute_metrics = partial(
             question_answering.compute_metrics,
             dataset_config=self.dataset_config,
-            benchmark_config=benchmark_config,
         )
         self.extract_labels_from_generation = (
             question_answering.extract_labels_from_generation
@@ -641,7 +641,7 @@ class HumanEvaluator:
         # only a single iteration, so the results from the current annotation should be
         # added to the previous results.
         results_path = Path.cwd() / "euroeval_benchmark_results.jsonl"
-        results: ScoreDict = defaultdict(list)
+        results: "ScoreDict" = defaultdict(list)
         if results_path.exists():
             all_results = [
                 json.loads(line.strip())
@@ -664,15 +664,15 @@ class HumanEvaluator:

         # Aggregate scores
         total_dict: dict[str, float] = dict()
-        for
+        for metric in self.dataset_config.task.metrics:
             test_score, test_se = aggregate_scores(
                 scores=results["raw"],  # type: ignore[arg-type]
-
+                metric=metric,
             )
-            test_score, _ =
-            test_se, _ =
-            total_dict[f"test_{
-            total_dict[f"test_{
+            test_score, _ = metric.postprocessing_fn(test_score)
+            test_se, _ = metric.postprocessing_fn(test_se)
+            total_dict[f"test_{metric.name}"] = test_score
+            total_dict[f"test_{metric.name}_se"] = test_se
         results["total"] = total_dict

         benchmark_result = BenchmarkResult(