EuroEval 16.2.2__py3-none-any.whl → 16.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +4 -2
- euroeval/benchmark_modules/fresh.py +3 -1
- euroeval/benchmark_modules/hf.py +8 -4
- euroeval/benchmark_modules/litellm.py +5 -17
- euroeval/benchmark_modules/vllm.py +88 -23
- euroeval/benchmarker.py +110 -61
- euroeval/cli.py +1 -1
- euroeval/constants.py +3 -0
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +0 -2
- euroeval/dataset_configs/dutch.py +0 -2
- euroeval/dataset_configs/english.py +0 -2
- euroeval/dataset_configs/finnish.py +0 -2
- euroeval/dataset_configs/french.py +0 -2
- euroeval/dataset_configs/german.py +0 -2
- euroeval/dataset_configs/italian.py +0 -2
- euroeval/dataset_configs/latvian.py +2 -3
- euroeval/dataset_configs/lithuanian.py +62 -0
- euroeval/dataset_configs/norwegian.py +0 -2
- euroeval/dataset_configs/polish.py +0 -2
- euroeval/dataset_configs/portuguese.py +0 -2
- euroeval/dataset_configs/spanish.py +0 -2
- euroeval/dataset_configs/swedish.py +0 -3
- euroeval/metrics/huggingface.py +1 -1
- euroeval/metrics/pipeline.py +5 -0
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +9 -0
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +10 -0
- euroeval/prompt_templates/sentiment_classification.py +11 -0
- euroeval/tokenisation_utils.py +8 -8
- euroeval/utils.py +1 -1
- {euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/METADATA +181 -60
- euroeval-16.3.0.dist-info/RECORD +71 -0
- euroeval-16.2.2.dist-info/RECORD +0 -70
- {euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/WHEEL +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED
@@ -103,8 +103,10 @@ os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
 os.environ["VLLM_USE_V1"] = "1"
 
 
-# Use the FlashInfer flash-attention backend for vLLM
-os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
+# Use the FlashInfer flash-attention backend for vLLM, unless the user has already
+# specified a different backend.
+if os.getenv("VLLM_ATTENTION_BACKEND") is None:
+    os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
 
 
 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
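In practice this means a backend exported by the user before importing EuroEval now wins over the FlashInfer default. A minimal sketch, assuming `FLASH_ATTN` is the alternative vLLM backend you want to keep (the backend name is an assumption, not something this diff prescribes):

import os

# Choose a backend before importing euroeval; the new conditional in
# euroeval/__init__.py only applies the FLASHINFER default when this variable
# is still unset, so the value below is preserved.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"  # assumed backend name

import euroeval  # noqa: F401  # importing the package runs the __init__ code above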
euroeval/benchmark_modules/fresh.py
CHANGED
@@ -1,5 +1,6 @@
 """Freshly initialised encoder models."""
 
+import re
 import typing as t
 from functools import cached_property
 from json import JSONDecodeError
@@ -45,6 +46,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
     """A freshly initialised encoder model."""
 
     fresh_model = True
+    allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}
 
     def __init__(
         self,
@@ -294,7 +296,7 @@ def load_model_and_tokeniser(
         token=get_hf_token(api_key=benchmark_config.api_key),
         add_prefix_space=prefix,
         cache_dir=model_config.model_cache_dir,
-        use_fast=True,
+        use_fast=False if model_config.param == "slow-tokenizer" else True,
         verbose=False,
         trust_remote_code=benchmark_config.trust_remote_code,
     )
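The new `allowed_params` attribute maps model-ID patterns to the extra parameters those models accept, and the catch-all pattern means every fresh encoder model may now be run with the `slow-tokenizer` parameter. A sketch of how such a mapping can be interpreted; the lookup helper and the use of `fullmatch` are illustrative assumptions, not EuroEval's own resolution code:

import re

allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}  # taken from the hunk above

def params_allowed_for(model_id: str) -> list[str]:
    # Collect the parameters whose model-ID pattern matches the given model.
    allowed: list[str] = []
    for pattern, params in allowed_params.items():
        if pattern.fullmatch(model_id):
            allowed.extend(params)
    return allowed

print(params_allowed_for("bert-base-cased"))  # ['slow-tokenizer']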
euroeval/benchmark_modules/hf.py
CHANGED
@@ -2,6 +2,7 @@
 
 import collections.abc as c
 import logging
+import re
 import typing as t
 from functools import cached_property, partial
 from json import JSONDecodeError
@@ -93,6 +94,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
     fresh_model = False
     batching_preference = BatchingPreference.NO_PREFERENCE
     high_priority = True
+    allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}
 
     def __init__(
         self,
@@ -690,7 +692,7 @@ def load_model_and_tokeniser(
         model=model,
         model_id=model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
-
+        model_config=model_config,
     )
 
     return model, tokeniser
@@ -880,7 +882,7 @@ def load_tokeniser(
     model: "PreTrainedModel | None",
     model_id: str,
     trust_remote_code: bool,
-
+    model_config: "ModelConfig",
     token: str | bool,
 ) -> "PreTrainedTokenizer":
     """Load the tokeniser.
 
@@ -892,17 +894,19 @@ def load_tokeniser(
            The model identifier. Used for logging.
        trust_remote_code:
            Whether to trust remote code.
+       model_config:
+           The model configuration.
 
    Returns:
        The loaded tokeniser.
    """
    loading_kwargs: dict[str, bool | str] = dict(
-        use_fast=True,
+        use_fast=False if model_config.param == "slow-tokenizer" else True,
        verbose=False,
        trust_remote_code=trust_remote_code,
        padding_side="right",
        truncation_side="right",
-        cache_dir=model_cache_dir,
+        cache_dir=model_config.model_cache_dir,
    )
 
    # If the model is a subclass of a certain model types then we have to add a prefix
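The recurring `use_fast=False if model_config.param == "slow-tokenizer" else True` change is the core of the new `slow-tokenizer` parameter: it switches tokeniser loading from the fast (Rust-backed) implementation to the slow (pure-Python) one. A minimal self-contained sketch of the same pattern, with `param` standing in for `model_config.param` and an illustrative function name:

from transformers import AutoTokenizer, PreTrainedTokenizerBase

def load_tokeniser_sketch(model_id: str, param: str | None) -> PreTrainedTokenizerBase:
    # "slow-tokenizer" forces the slow tokeniser; anything else keeps the fast one.
    return AutoTokenizer.from_pretrained(
        model_id,
        use_fast=False if param == "slow-tokenizer" else True,
    )

tokeniser = load_tokeniser_sketch("bert-base-cased", param="slow-tokenizer")
print(tokeniser.is_fast)  # False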
euroeval/benchmark_modules/litellm.py
CHANGED
@@ -369,7 +369,8 @@ class LiteLLMModel(BenchmarkModule):
            ]
            logger.debug(
                f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
-                f"{len(inputs_to_run):,} failed message(s)"
+                f"{len(inputs_to_run):,} failed message(s). Here is the first error: "
+                f"{failures[0][1]}."
            )
 
            # Attempt to handle the exceptions, to improve the chance of getting
@@ -453,8 +454,7 @@ class LiteLLMModel(BenchmarkModule):
        requires_thinking_disabled_messages = ["thinking.type: Field required"]
        seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
        response_format_messages = [
-            "got an unexpected keyword argument 'response_format'"
-            "The model outputs empty dictionaries.",
+            "got an unexpected keyword argument 'response_format'"
        ]
 
        if any(msg.lower() in error_msg for msg in stop_messages):
@@ -713,18 +713,6 @@ class LiteLLMModel(BenchmarkModule):
        ]
        responses = await tqdm_async.gather(*requests, leave=False)
 
-        # If we are performing structured generation and the model just outputs an empty
-        # dictionary, then we convert those to exceptions, to disable structured
-        # generation
-        if "response_format" in generation_kwargs:
-            responses = [
-                RuntimeError("The model outputs empty dictionaries.")
-                if not isinstance(response, Exception)
-                and any(choice.message.content == "{}" for choice in response.choices)
-                else response
-                for response in responses
-            ]
-
        # Separate the successful responses from the failed ones
        successes = [
            (idx, response)
@@ -984,7 +972,7 @@ class LiteLLMModel(BenchmarkModule):
            model=None,
            model_id=model_id,
            trust_remote_code=self.benchmark_config.trust_remote_code,
-
+            model_config=self.model_config,
        )
 
        if (
@@ -1067,7 +1055,7 @@ class LiteLLMModel(BenchmarkModule):
            model=None,
            model_id=model_id,
            trust_remote_code=self.benchmark_config.trust_remote_code,
-
+            model_config=self.model_config,
        )
 
        all_max_lengths: list[int] = list()
euroeval/benchmark_modules/vllm.py
CHANGED
@@ -104,7 +104,7 @@ class VLLMModel(HuggingFaceEncoderModel):
    fresh_model = False
    batching_preference = BatchingPreference.ALL_AT_ONCE
    high_priority = True
-    allowed_params = {re.compile(r".*"): ["thinking", "no-thinking"]}
+    allowed_params = {re.compile(r".*"): ["thinking", "no-thinking", "slow-tokenizer"]}
 
    def __init__(
        self,
@@ -559,11 +559,34 @@ class VLLMModel(HuggingFaceEncoderModel):
                torch.LongTensor(completion_id) for completion_id in completion_ids
            ]
        )
-        if
-
-
-
-
+        if (
+            self.end_of_reasoning_token is not None
+            and self.generative_type == GenerativeType.REASONING
+        ):
+            for idx in range(len(completions)):
+                if self.end_of_reasoning_token in completions[idx]:
+                    completions[idx] = completions[idx].split(
+                        self.end_of_reasoning_token
+                    )[-1]
+                elif self.benchmark_config.verbose:
+                    logger.warning(
+                        f"The model {self.model_config.model_id!r} is a reasoning "
+                        "model, but the generated output does not contain the end of "
+                        f"reasoning token ({self.end_of_reasoning_token!r}). Using "
+                        "an empty string as the prediction instead."
+                    )
+                    completions[idx] = ""
+                else:
+                    log_once(
+                        f"The model {self.model_config.model_id!r} is a reasoning "
+                        "model, but the generated output does not contain the end of "
+                        f"reasoning token ({self.end_of_reasoning_token!r}). Using "
+                        "an empty string as the prediction instead. Only showing "
+                        "this warning once - see all occurrences if you run with the "
+                        "`verbose` flag.",
+                        level=logging.WARNING,
+                    )
+                    completions[idx] = ""
        stop_token_pattern = re.compile(
            "|".join(re.escape(stop_token) for stop_token in stop_tokens)
        )
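For reasoning models, the added block keeps only the text after the end-of-reasoning token and falls back to an empty prediction when that token never appears in the generation. A standalone sketch of the same post-processing; the function name is illustrative, and `</think>` is one of the reasoning tokens listed in euroeval/constants.py:

def strip_reasoning(completion: str, end_of_reasoning_token: str) -> str:
    # Keep only what follows the end-of-reasoning token; if the model never
    # closed its reasoning block, treat the prediction as empty.
    if end_of_reasoning_token in completion:
        return completion.split(end_of_reasoning_token)[-1]
    return ""

print(strip_reasoning("<think>some deliberation</think>positive", "</think>"))  # positive
print(strip_reasoning("<think>ran out of budget mid-thought", "</think>"))      # '' (empty)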
@@ -830,9 +853,12 @@ def load_model_and_tokeniser(
        adapter_base_model_id=model_config.adapter_base_model_id,
        trust_remote_code=benchmark_config.trust_remote_code,
        model_max_length=true_max_model_len,
-
+        model_config=model_config,
        token=get_hf_token(api_key=benchmark_config.api_key),
    )
+    vllm_tokenisation_params = get_vllm_tokenisation_params(
+        tokeniser=tokeniser, model_config=model_config
+    )
 
    clear_vllm()
 
@@ -865,16 +891,7 @@ def load_model_and_tokeniser(
            enable_prefix_caching=False,
            enable_lora=model_config.adapter_base_model_id is not None,
            max_lora_rank=256,
-
-            tokenizer_mode="mistral"
-            if isinstance(tokeniser, MistralCommonTokenizer)
-            else "auto",
-            config_format="mistral"
-            if isinstance(tokeniser, MistralCommonTokenizer)
-            else "auto",
-            load_format="mistral"
-            if isinstance(tokeniser, MistralCommonTokenizer)
-            else "auto",
+            **vllm_tokenisation_params,
        )
    except (RuntimeError, ValueError, OSError) as e:
        if "awaiting a review from the repo authors" in str(e):
@@ -903,7 +920,7 @@ def load_tokeniser(
    adapter_base_model_id: str | None,
    trust_remote_code: bool,
    model_max_length: int,
-
+    model_config: "ModelConfig",
    token: str | bool,
 ) -> "PreTrainedTokenizer":
    """Load the tokeniser.
@@ -920,8 +937,8 @@ def load_tokeniser(
            Whether to trust remote code.
        model_max_length:
            The maximum length of the model.
-
-            The
+        model_config:
+            The model configuration.
        token:
            The Hugging Face API token.
 
@@ -932,7 +949,7 @@ def load_tokeniser(
    config = AutoConfig.from_pretrained(
        adapter_base_model_id or model_id,
        revision=revision,
-        cache_dir=model_cache_dir,
+        cache_dir=model_config.model_cache_dir,
        token=token,
        trust_remote_code=trust_remote_code,
        local_files_only=not internet_connection_available(),
@@ -940,15 +957,25 @@ def load_tokeniser(
    num_retries = 5
    for _ in range(num_retries):
        try:
+            # Mistral instruction-tuned models need a custom tokeniser
+            if model_id.startswith("mistralai/") and "base" not in model_id.lower():
+                tokeniser = MistralCommonTokenizer.from_pretrained(
+                    model_id,
+                    padding_side="left",
+                    truncation_side="left",
+                    model_max_length=model_max_length,
+                    token=token,
+                )
+                break
            tokeniser = AutoTokenizer.from_pretrained(
                model_id,
-                use_fast=True,
+                use_fast=False if model_config.param == "slow-tokenizer" else True,
                verbose=False,
                trust_remote_code=trust_remote_code,
                padding_side="left",
                truncation_side="left",
                model_max_length=model_max_length,
-                cache_dir=model_cache_dir,
+                cache_dir=model_config.model_cache_dir,
                config=config,
                token=token,
                local_files_only=not internet_connection_available(),
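The new branch routes Mistral instruction-tuned models through `MistralCommonTokenizer` before the regular `AutoTokenizer` path. A small sketch of just the gating condition as it appears in the hunk; the helper name and the second model ID are illustrative:

def uses_mistral_tokeniser(model_id: str) -> bool:
    # Any "mistralai/" model whose ID does not contain "base" is treated as
    # instruction-tuned and gets the MistralCommonTokenizer.
    return model_id.startswith("mistralai/") and "base" not in model_id.lower()

print(uses_mistral_tokeniser("mistralai/Mistral-7B-Instruct-v0.3"))  # True
print(uses_mistral_tokeniser("mistralai/hypothetical-base-model"))   # False ("base" in the ID)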
@@ -1189,3 +1216,41 @@ def get_pbar_without_leave(*tqdm_args, **tqdm_kwargs) -> tqdm:
    """
    tqdm_kwargs.pop("leave", None)  # Remove the 'leave' key if it exists
    return tqdm(*tqdm_args, leave=False, **tqdm_kwargs)
+
+
+def get_vllm_tokenisation_params(
+    tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
+) -> dict[str, t.Any]:
+    """Get the tokenisation parameters for vLLM.
+
+    Args:
+        tokeniser:
+            The tokeniser.
+        model_config:
+            The model configuration.
+
+    Returns:
+        A dictionary of tokenisation parameters to pass to vLLM.
+    """
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        tokeniser_mode = "mistral"
+    elif model_config.param == "slow-tokenizer":
+        tokeniser_mode = "slow"
+    else:
+        tokeniser_mode = "auto"
+
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        config_format = "mistral"
+    else:
+        config_format = "auto"
+
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        load_format = "mistral"
+    else:
+        load_format = "auto"
+
+    return dict(
+        tokenizer_mode=tokeniser_mode,
+        config_format=config_format,
+        load_format=load_format,
+    )
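Condensed, the helper resolves vLLM's three loading knobs from two facts: whether the tokeniser is a `MistralCommonTokenizer` and whether the `slow-tokenizer` parameter was requested. A toy restatement of that decision table (not the shipped implementation):

def tokenisation_params_sketch(is_mistral_tokeniser: bool, param: str | None) -> dict[str, str]:
    # Mirrors the branching in get_vllm_tokenisation_params above.
    if is_mistral_tokeniser:
        mode = fmt = "mistral"
    else:
        mode = "slow" if param == "slow-tokenizer" else "auto"
        fmt = "auto"
    return dict(tokenizer_mode=mode, config_format=fmt, load_format=fmt)

assert tokenisation_params_sketch(False, "slow-tokenizer") == {
    "tokenizer_mode": "slow", "config_format": "auto", "load_format": "auto"
}
assert tokenisation_params_sketch(True, None)["tokenizer_mode"] == "mistral"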
euroeval/benchmarker.py
CHANGED
@@ -12,6 +12,7 @@ from time import sleep
 
 from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
 from torch.distributed import destroy_process_group
+from tqdm.auto import tqdm
 
 from .benchmark_config_factory import build_benchmark_config
 from .constants import GENERATIVE_PIPELINE_TAGS
@@ -32,6 +33,7 @@ from .utils import (
    get_package_version,
    internet_connection_available,
    log_once,
+    split_model_id,
 )
 
 if t.TYPE_CHECKING:
@@ -82,7 +84,7 @@ class Benchmarker:
        num_iterations: int = 10,
        api_base: str | None = None,
        api_version: str | None = None,
-        gpu_memory_utilization: float = 0.
+        gpu_memory_utilization: float = 0.8,
        generative_type: GenerativeType | None = None,
        debug: bool = False,
        run_with_cli: bool = False,
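The default GPU memory utilisation for vLLM is now 0.8 in both the `Benchmarker` constructor and the corresponding CLI option. A minimal sketch of overriding it from Python, assuming the `Benchmarker` entry point exported by the package:

from euroeval import Benchmarker

# Reserve less GPU memory for vLLM than the 0.8 default, e.g. on a shared GPU.
benchmarker = Benchmarker(gpu_memory_utilization=0.6)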
@@ -607,46 +609,90 @@ class Benchmarker:
            dataset_names=benchmark_config.datasets
        )
 
-
-
-
-
-
-
+        # Get all the model configs
+        model_configs: list[ModelConfig] = list()
+        for model_id in tqdm(
+            iterable=model_ids,
+            desc="Fetching model configurations",
+            disable=not benchmark_config.verbose or not benchmark_config.progress_bar,
+        ):
            try:
                model_config = get_model_config(
                    model_id=model_id, benchmark_config=benchmark_config
                )
+                model_configs.append(model_config)
            except InvalidModel as e:
                logger.info(e.message)
-
+
+        # Create a dictionary that takes each model config to the dataset configs that
+        # we need to benchmark the model on. Here we remove the datasets that the model
+        # has already been benchmarked on, or datasets that the model cannot be
+        # benchmarked on.
+        model_config_to_dataset_configs: dict[ModelConfig, list[DatasetConfig]] = {
+            model_config: [
+                dataset_config
+                for dataset_config in dataset_configs
+                if (
+                    benchmark_config.force
+                    or not model_has_been_benchmarked(
+                        model_config=model_config,
+                        dataset_config=dataset_config,
+                        benchmark_config=benchmark_config,
+                        benchmark_results=self.benchmark_results,
+                    )
+                )
+                and model_config.model_type in dataset_config.allowed_model_types
+            ]
+            for model_config in model_configs
+        }
+
+        total_benchmarks = sum(
+            len(dataset_configs)
+            for dataset_configs in model_config_to_dataset_configs.values()
+        )
+        if total_benchmarks == 0:
+            logger.info(
+                "No benchmarks to run, as all the selected models have already been "
+                "benchmarked on all the selected datasets."
+            )
+            return list()
+
+        logger.info(f"Initiated evaluation of {total_benchmarks:,} benchmarks.")
+
+        num_finished_benchmarks = 0
+        current_benchmark_results: list[BenchmarkResult] = list()
+        for model_config in model_configs:
+            if not model_config_to_dataset_configs[model_config]:
+                logger.debug(
+                    f"Skipping model {model_config.model_id!r} because it has "
+                    "already been benchmarked on all valid datasets."
+                )
                continue
 
            if model_config.adapter_base_model_id:
                open_issue_msg = (
-                    "If offline support is important to you, please "
-                    "
+                    "If offline support is important to you, please consider opening "
+                    "an issue at https://github.com/EuroEval/EuroEval/issues."
                )
                if not internet_connection_available():
                    raise InvalidModel(
                        "Offline benchmarking of models with adapters is not currently "
-                        "supported. "
-
+                        "supported. An active internet connection is required. "
+                        "{open_issue_msg}"
                    )
                elif benchmark_config.download_only:
                    log_once(
                        "You are using download only mode with a model that includes "
-                        "an adapter. "
-                        "
-                        "
-                        "An internet connection will be required during evaluation. "
+                        "an adapter. Please note that offline benchmarking of "
+                        "adapter models is not currently supported - an internet "
+                        "connection will be required during evaluation in this case. "
                        f"{open_issue_msg}",
                        level=logging.WARNING,
                    )
 
            loaded_model: BenchmarkModule | None = None
            benchmark_params_to_revert: dict[str, t.Any] = dict()
-            for dataset_config in
+            for dataset_config in model_config_to_dataset_configs[model_config]:
                # Revert any changes to the benchmark configuration made for the
                # previous dataset
                for param, value in benchmark_params_to_revert.items():
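The net effect of this rewrite is that the work is planned up front: every model is paired with the datasets it still needs, and the run exits early when nothing is left to do. A toy restatement with plain strings instead of EuroEval's config objects; all names and data below are illustrative:

models = ["model-a", "model-b"]
datasets = ["angry-tweets", "scala-da"]
already_done = {("model-a", "angry-tweets")}  # pretend this pair has a stored result

pending = {
    model: [ds for ds in datasets if (model, ds) not in already_done]
    for model in models
}
total_benchmarks = sum(len(ds_list) for ds_list in pending.values())
print(total_benchmarks)  # 3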
@@ -674,34 +720,6 @@ class Benchmarker:
                    benchmark_params_to_revert["few_shot"] = True
                    benchmark_config.few_shot = False
 
-                # Skip if we have already benchmarked this model on this dataset and
-                # we are not forcing the benchmark
-                if not benchmark_config.force and model_has_been_benchmarked(
-                    model_id=model_id,
-                    dataset=dataset_config.name,
-                    few_shot=benchmark_config.few_shot,
-                    validation_split=not benchmark_config.evaluate_test_split,
-                    benchmark_results=self.benchmark_results,
-                ):
-                    logger.debug(
-                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it has already been "
-                        "benchmarked."
-                    )
-                    num_finished_benchmarks += 1
-                    continue
-
-                # Skip if the model type should not be benchmarked on this dataset
-                model_type = model_config.model_type
-                allowed_model_types = dataset_config.allowed_model_types
-                if model_type not in allowed_model_types:
-                    logger.debug(
-                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it is of type {model_type}, "
-                        f"and the only allowed model types are {allowed_model_types}."
-                    )
-                    continue
-
                # We do not re-initialise generative models as their architecture is not
                # customised to specific datasets
                if model_config.model_type == ModelType.GENERATIVE:
@@ -735,6 +753,22 @@ class Benchmarker:
                else:
                    loaded_model.dataset_config = dataset_config
 
+                # Skip the benchmark if the model is not of the correct
+                # generative type
+                if (
+                    loaded_model.generative_type
+                    not in dataset_config.allowed_generative_types
+                ):
+                    logger.debug(
+                        f"Skipping the benchmark of model "
+                        f"{model_config.model_id!r}on dataset "
+                        f"{dataset_config.name!r} because the model has generative "
+                        f"type {loaded_model.generative_type} and the dataset "
+                        f"only allows {dataset_config.allowed_generative_types}."
+                    )
+                    num_finished_benchmarks += 1
+                    continue
+
                # Benchmark a single model on a single dataset
                benchmark_output_or_err = self._benchmark_single(
                    model=loaded_model,
@@ -969,23 +1003,20 @@
 
 
 def model_has_been_benchmarked(
-
-
-
-    validation_split: bool,
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
    benchmark_results: list[BenchmarkResult],
 ) -> bool:
    """Checks whether a model has already been benchmarked on a dataset.
 
    Args:
-
-            The model
-
-            The dataset.
-
-
-        validation_split:
-            Whether the model was evaluated on the validation split.
+        model_config:
+            The configuration of the model we are evaluating.
+        dataset_config:
+            The configuration of the dataset we are evaluating on.
+        benchmark_config:
+            The general benchmark configuration.
        benchmark_results:
            The benchmark results.
 
@@ -993,10 +1024,28 @@ def model_has_been_benchmarked(
        Whether the model has already been evaluated on the dataset.
    """
    for record in benchmark_results:
-
-
-
-
+        model_id_components = split_model_id(model_id=record.model)
+        same_model_id = model_id_components.model_id == model_config.model_id
+        same_revision = model_id_components.revision == model_config.revision
+        same_param = model_id_components.param == model_config.param
+        same_dataset = record.dataset == dataset_config.name
+        same_split = (
+            record.validation_split != benchmark_config.evaluate_test_split
+            or "val" not in dataset_config.splits
+        )
+        same_num_shots = (
+            record.few_shot == benchmark_config.few_shot
+            or not record.generative
+            or dataset_config.task.requires_zero_shot
+        )
+        if (
+            same_model_id
+            and same_revision
+            and same_param
+            and same_dataset
+            and same_split
+            and same_num_shots
+        ):
            return True
    return False
 
euroeval/cli.py
CHANGED
@@ -188,7 +188,7 @@ from .tasks import get_all_tasks
 )
 @click.option(
    "--gpu-memory-utilization",
-    default=0.
+    default=0.8,
    show_default=True,
    help="The GPU memory utilization to use for vLLM. A larger value will result in "
    "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
euroeval/constants.py
CHANGED
@@ -50,9 +50,11 @@ METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
 # Hugging Face Hub tags used to classify models as merge models
 MERGE_TAGS = ["merge", "mergekit"]
 
+
 # The minimum required CUDA compute capability for using bfloat16 in vLLM
 VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
 
+
 # Used to detect whether a model is a reasoning model
 REASONING_TOKENS = [
     ("<think>", "</think>"),
@@ -60,6 +62,7 @@ REASONING_TOKENS = [
     ("<reasoning>", "</reasoning>"),
 ]
 
+
 # These tokens are sometimes used by models to indicate the end of a generated
 # response, but they do not use them as a proper EOS token, so we have to deal with them
 # manually. We only use them as stop tokens if they actually appear in the model's
euroeval/dataset_configs/__init__.py
CHANGED
@@ -14,6 +14,7 @@ from .german import *  # noqa: F403
 from .icelandic import *  # noqa: F403
 from .italian import *  # noqa: F403
 from .latvian import *  # noqa: F403
+from .lithuanian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
 from .polish import *  # noqa: F403
 from .portuguese import *  # noqa: F403
euroeval/dataset_configs/danish.py
CHANGED
@@ -1,7 +1,6 @@
 """All Danish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import DA
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -159,7 +158,6 @@ WINOGRANDE_DA_CONFIG = DatasetConfig(
    languages=[DA],
    splits=["train", "test"],
    _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
    unofficial=True,
 )
 
euroeval/dataset_configs/dutch.py
CHANGED
@@ -1,7 +1,6 @@
 """All Dutch dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import NL
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -152,7 +151,6 @@ WINOGRANDE_NL_CONFIG = DatasetConfig(
    languages=[NL],
    splits=["train", "test"],
    _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
    unofficial=True,
 )
 
euroeval/dataset_configs/english.py
CHANGED
@@ -1,7 +1,6 @@
 """All English dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import EN
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -135,7 +134,6 @@ WINOGRANDE_CONFIG = DatasetConfig(
    languages=[EN],
    splits=["train", "test"],
    _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
    unofficial=True,
 )
 