EuroEval 15.6.1-py3-none-any.whl → 15.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/benchmark_modules/litellm.py +148 -284
- euroeval/benchmark_modules/vllm.py +115 -338
- euroeval/benchmarker.py +13 -2
- euroeval/constants.py +1 -1
- euroeval/data_loading.py +48 -26
- euroeval/data_models.py +3 -9
- euroeval/dataset_configs/dutch.py +5 -16
- euroeval/dataset_configs/finnish.py +60 -0
- euroeval/generation_utils.py +346 -0
- euroeval/prompt_templates/linguistic_acceptability.py +9 -1
- euroeval/prompt_templates/multiple_choice.py +8 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -1
- euroeval/prompt_templates/reading_comprehension.py +11 -1
- euroeval/prompt_templates/sentiment_classification.py +11 -1
- euroeval/prompt_templates/summarization.py +9 -1
- euroeval/scores.py +7 -1
- euroeval/task_group_utils/sequence_classification.py +27 -32
- euroeval/task_group_utils/text_to_text.py +10 -27
- euroeval/tasks.py +1 -1
- euroeval/tokenization_utils.py +22 -6
- {euroeval-15.6.1.dist-info → euroeval-15.7.1.dist-info}/METADATA +14 -2
- {euroeval-15.6.1.dist-info → euroeval-15.7.1.dist-info}/RECORD +25 -23
- {euroeval-15.6.1.dist-info → euroeval-15.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.6.1.dist-info → euroeval-15.7.1.dist-info}/entry_points.txt +0 -0
- {euroeval-15.6.1.dist-info → euroeval-15.7.1.dist-info}/licenses/LICENSE +0 -0
@@ -3,11 +3,9 @@
 import collections.abc as c
 import contextlib
 import importlib.util
-import itertools as it
 import json
 import logging
 import os
-import random
 import re
 import sys
 import typing as t
@@ -56,6 +54,7 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
+from ..generation_utils import apply_prompt, extract_few_shot_examples
 from ..languages import get_all_languages
 from ..task_group_utils import (
     question_answering,
@@ -132,7 +131,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         self._model: LLM = model
         self._tokenizer: PreTrainedTokenizer = tokenizer
         self.end_of_reasoning_token_id = get_end_of_reasoning_token_id(
-            model=self._model, tokenizer=self._tokenizer
+            model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
         )

         # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
@@ -146,7 +145,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         self.buffer |= dict(
             instruction_model=self._tokenizer.chat_template is not None,
             first_label_token_mapping=get_first_label_token_mapping(
-                dataset_config=self.dataset_config,
+                dataset_config=self.dataset_config,
+                model_config=self.model_config,
+                tokenizer=self._tokenizer,
+                generative_type=self.generative_type,
             ),
         )
         if self.model_config.adapter_base_model_id is not None:
@@ -255,14 +257,22 @@ class VLLMModel(HuggingFaceEncoderModel):
         )

         if self.benchmark_config.few_shot:
-            few_shot_examples =
-                dataset=dataset,
+            few_shot_examples = extract_few_shot_examples(
+                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
             )
         else:
             few_shot_examples = list()

         dataset["test"] = dataset["test"].map(
-            partial(
+            partial(
+                apply_prompt,
+                few_shot_examples=few_shot_examples,
+                model_config=self.model_config,
+                dataset_config=self.dataset_config,
+                instruction_model=self.buffer["instruction_model"],
+                always_populate_text_field=True,
+                tokenizer=self._tokenizer,
+            ),
             batched=True,
             load_from_cache_file=False,
             keep_in_memory=True,
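The prompt construction above now goes through the shared apply_prompt helper, handed to datasets' map via functools.partial. A minimal illustration of that partial-plus-batched-map pattern, using a toy add_prefix function and a two-row dataset that are not part of EuroEval:

from functools import partial

from datasets import Dataset


def add_prefix(examples: dict, prefix: str) -> dict:
    # Batched map: `examples` is a dict mapping column names to lists of values.
    examples["text"] = [prefix + text for text in examples["text"]]
    return examples


dataset = Dataset.from_dict({"text": ["hello", "world"]})
dataset = dataset.map(partial(add_prefix, prefix=">> "), batched=True)
print(dataset["text"])  # ['>> hello', '>> world']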
@@ -332,30 +342,40 @@ class VLLMModel(HuggingFaceEncoderModel):
         if end_of_chat_token:
             stop_tokens.append(end_of_chat_token)

+        logits_processor = None
         if self.dataset_config.task in TASKS_USING_JSON:
-            [removed lines 336-353 not captured in this diff view]
+            if self.generative_type == GenerativeType.REASONING:
+                log_once(
+                    f"The model {self.model_config.model_id!r} is a reasoning model "
+                    "and thus does not support structured generation, so we do not "
+                    "enable it.",
+                    level=logging.DEBUG,
+                )
+            else:
+                ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+                keys_and_their_types: dict[str, t.Any] = {
+                    tag_name: (conlist(str, max_length=5), ...)
+                    for tag_name in ner_tag_names
+                }
+                pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+                logits_processor = JSONLogitsProcessor(
+                    schema=pydantic_class,
+                    tokenizer=adapt_tokenizer(tokenizer=self._tokenizer),  # type: ignore
+                    whitespace_pattern=r" ?",
+                )
+                log_once(
+                    "Using structured generation with the JSON schema "
+                    f"{pydantic_class.model_json_schema()}",
+                    level=logging.DEBUG,
+                )

         # Get the mapping from labels to the first token in the label. We call this each
         # time we generate a new dataset since the dataset config can change
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
-            dataset_config=self.dataset_config,
+            dataset_config=self.dataset_config,
+            model_config=self.model_config,
+            tokenizer=self._tokenizer,
+            generative_type=self.generative_type,
         )

         # Define the parameters used for vLLM generation
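For the structured-generation branch above, the JSON schema handed to the logits processor is built dynamically with pydantic. A small standalone sketch of just that schema-building step, with made-up NER tag names standing in for the dataset config's prompt label mapping:

import typing as t

from pydantic import conlist, create_model

# Hypothetical tag names; EuroEval takes these from the dataset's prompt label mapping.
ner_tag_names = ["person", "location", "organisation", "miscellaneous"]

# Each tag maps to a required list of at most five strings, mirroring the diff above.
keys_and_their_types: dict[str, t.Any] = {
    tag_name: (conlist(str, max_length=5), ...) for tag_name in ner_tag_names
}
AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)

# The resulting JSON schema is what gets passed to the JSON logits processor.
print(AnswerFormat.model_json_schema())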
@@ -391,7 +411,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         ) and should_prompts_be_stripped(
             labels_to_be_generated=labels_to_be_generated, tokenizer=self._tokenizer
         ):
-            log_once(
+            log_once(
+                f"Stripping prompts for model {self.model_config.model_id!r}.",
+                level=logging.DEBUG,
+            )
             prompts = [prompt.strip() for prompt in prompts]

         # Generate sequences using vLLM
@@ -411,18 +434,65 @@ class VLLMModel(HuggingFaceEncoderModel):
                     f"Encountered error during vLLM generation: {str(e)}. Retrying..."
                 )
                 sleep(1)
+            except ValueError as e:
+                # Truncate the prompts if they are too long for the model
+                truncate_error_messages = [
+                    r"prompt \(length [0-9]+\) is longer than the maximum model length"
+                ]
+                if any(
+                    re.search(pattern, str(e), flags=re.IGNORECASE) is not None
+                    for pattern in truncate_error_messages
+                ):
+                    logger.info(
+                        "Prompts are too long, so truncating them and trying again..."
+                    )
+                    logger.debug(f"The error message was: {str(e)}")
+                    tokenized_prompts = self._tokenizer(
+                        text=prompts,
+                        truncation=True,
+                        max_length=max(
+                            self._tokenizer.model_max_length - max_tokens, 0
+                        ),
+                    )
+                    prompts = self._tokenizer.batch_decode(
+                        sequences=tokenized_prompts.input_ids, skip_special_tokens=True
+                    )
+                else:
+                    raise InvalidBenchmark(
+                        f"An error occurred during vLLM generation: {str(e)}"
+                    )
         else:
             raise InvalidBenchmark(
                 f"Could not generate sequences after {num_attempts} attempts."
             )

+        # When we shorten the prompts then some residual model outputs persist, so we
+        # need to filter these out
+        num_extra_outputs = len(raw_outputs) - len(prompts)
+        if num_extra_outputs > 0:
+            raw_outputs = raw_outputs[num_extra_outputs:]
+            if not all(
+                raw_output.prompt == prompt
+                for raw_output, prompt in zip(raw_outputs, prompts)
+            ):
+                raise InvalidBenchmark(
+                    f"The prompts and the model outputs do not match. There were "
+                    f"{num_extra_outputs!r} extra outputs."
+                )
+            else:
+                logger.debug(
+                    f"Filtered out {num_extra_outputs:,} extra outputs from the model, "
+                    "which occured as we interupted the generation when we truncated "
+                    "the prompts."
+                )
+
         # Parse the raw model outputs
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
         if self.end_of_reasoning_token_id in completion_ids[0]:
             completion_ids = [
-                token_ids[token_ids.index(self.end_of_reasoning_token_id) +
+                token_ids[token_ids.index(self.end_of_reasoning_token_id) + 1 :]
                 if self.end_of_reasoning_token_id in token_ids
                 else token_ids
                 for token_ids in completion_ids
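The new ValueError handler retries after shortening the prompts. The core step, tokenising with truncation and decoding back to text so that prompt plus generation fits the context window, can be sketched in isolation as follows; the gpt2 tokenizer and the 256-token generation budget are illustrative stand-ins, not EuroEval's values:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative tokenizer
prompts = ["some very long prompt ..."] * 4
max_tokens = 256  # tokens reserved for the generation itself

# Cut every prompt down so that prompt + generation fits the model context.
encoded = tokenizer(
    text=prompts,
    truncation=True,
    max_length=max(tokenizer.model_max_length - max_tokens, 0),
)
prompts = tokenizer.batch_decode(encoded.input_ids, skip_special_tokens=True)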
@@ -435,6 +505,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         completions = [completion.strip() for completion in completions]

+        # Sanity check
+        if len(completions) != len(prompts):
+            raise InvalidBenchmark(
+                f"Expected {len(prompts):,} completions, but got {len(completions):,}."
+            )
+
         # Add logprobs scores to the output
         if self.buffer["first_label_token_mapping"]:
             scores: list[list[list[tuple[str, float]]]] = [
@@ -546,302 +622,6 @@ class VLLMModel(HuggingFaceEncoderModel):

         return model_config

-    def _extract_few_shot_examples(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> list[dict[str, t.Any]]:
-        """Extract few-shot examples from a dataset.
-
-        This will always extract the examples from the training split.
-
-        We ensure that the few-shot examples are unique by picking them one at a time.
-
-        Args:
-            dataset:
-                The dataset to extract the few-shot examples from.
-            task:
-                The task that is being benchmarked.
-            itr_idx:
-                The index of the dataset in the iterator.
-
-        Returns:
-            The few-shot examples.
-        """
-        random_seed = 4242 + itr_idx
-        num_few_shots = self.dataset_config.num_few_shot_examples
-        few_shot_examples: list[dict[str, t.Any]] = list()
-        shuffled_train = dataset["train"].shuffle(seed=random_seed)
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                labels = it.cycle(self.dataset_config.labels)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: x["label"].lower() == label.lower()
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TEXT_TO_TEXT:
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-                labels = it.cycle(
-                    [
-                        label.lower()
-                        for label in self.dataset_config.labels
-                        if label.lower().startswith("b-")
-                    ]
-                )
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: label in [tag.lower() for tag in x["labels"]]
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["tokens"] != example["tokens"]
-                    )
-
-            case TaskGroup.QUESTION_ANSWERING:
-                # Locate the maximum number of tokens that constitutes a short example
-                for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
-                    train_with_short_examples = dataset["train"].filter(
-                        lambda example: len(example["context"]) < max_num_tokens
-                    )
-                    num_short_examples = len(train_with_short_examples)
-                    if num_short_examples >= self.dataset_config.num_few_shot_examples:
-                        break
-                else:
-                    raise InvalidBenchmark(
-                        "Could not find enough short examples for few-shot learning."
-                    )
-
-                shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["context"] != example["context"]
-                    )
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        random.seed(random_seed)
-        random.shuffle(few_shot_examples)
-        return few_shot_examples
-
-    def _apply_prompt(
-        self,
-        examples: dict[str, t.Any],
-        few_shot_examples: list[dict[str, t.Any]],
-        task: Task,
-    ) -> dict[str, t.Any]:
-        """Apply prompt template to an example, potentially with few-shot examples.
-
-        Args:
-            examples:
-                The examples to apply the few-shot examples to.
-            few_shot_examples:
-                The few-shot examples to apply.
-            task:
-                The task that is being benchmarked.
-
-        Returns:
-            The example with the few-shot examples applied.
-        """
-
-        def create_prompt(**kwargs: str) -> tuple[str, str]:
-            """Create a prompt from the given keyword arguments.
-
-            Args:
-                kwargs:
-                    The keyword arguments to use in the prompt.
-
-            Returns:
-                A pair (prompt, label), where "label" is an empty string if the model is
-                not instruction tuned (as in this case it is included in the prompt).
-            """
-            label_key = "label" if "label" in kwargs else "target_text"
-            label = kwargs.pop(label_key)
-            assert label is not None, (
-                f"Found a None label for the prompt: {kwargs}. This should not happen."
-            )
-            label_mapping = self.dataset_config.prompt_label_mapping
-            label = label_mapping.get(label, label)
-            if self.buffer["instruction_model"]:
-                prompt = self.dataset_config.instruction_prompt.format(**kwargs)
-                return prompt, label
-            else:
-                kwargs[label_key] = label
-                return self.dataset_config.prompt_template.format(**kwargs), ""
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        label=example["label"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), label="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TEXT_TO_TEXT:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        target_text=example["target_text"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), target_text="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-
-                def create_label(example: dict) -> str:
-                    prompt_labels = self.dataset_config.prompt_label_mapping.values()
-                    labels: dict[str, list[str]] = {
-                        prompt_label: list() for prompt_label in prompt_labels
-                    }
-                    for token, label in zip(example["tokens"], example["labels"]):
-                        label = label.lower()
-                        if label == "o":
-                            continue
-                        prompt_label = self.dataset_config.prompt_label_mapping[label]
-                        if label.startswith("b-"):
-                            labels[prompt_label].append(token)
-                        elif label.startswith("i-"):
-                            labels[prompt_label][-1] += " " + token
-                    return json.dumps(labels, ensure_ascii=False)
-
-                few_shot_sections = [
-                    create_prompt(
-                        text=" ".join(example["tokens"]).replace("\n", " ").strip(),
-                        label=create_label(example=example),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=" ".join(tokens).replace("\n", " ").strip(), label=""
-                    )
-                    for tokens in examples["tokens"]
-                ]
-
-            case TaskGroup.QUESTION_ANSWERING:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["context"].replace("\n", " ").strip(),
-                        question=example["question"].replace("\n", " ").strip(),
-                        label=example["answers"]["text"][0].replace("\n", " "),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=context.replace("\n", " ").strip(),
-                        question=question.replace("\n", " ").strip(),
-                        label="",
-                    )
-                    for context, question in zip(
-                        examples["context"], examples["question"]
-                    )
-                ]
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        if self.buffer["instruction_model"]:
-            few_shot_messages = [
-                dict(role=role, content=content)
-                for prompt, label in few_shot_sections
-                for role, content in [("user", prompt), ("assistant", label)]
-            ]
-
-            messages_list = [
-                few_shot_messages + [dict(role="user", content=prompt)]
-                for prompt, _ in new_sections
-            ]
-
-            # Pick the chat template that matches the language of the dataset, if such a
-            # template exists
-            chat_template: str | None = None
-            if isinstance(self._tokenizer.chat_template, dict):
-                language_codes = [
-                    language.code for language in self.dataset_config.languages
-                ]
-                for name, candidate_template in self._tokenizer.chat_template.items():
-                    if name.lower() in language_codes:
-                        chat_template = candidate_template
-                        log_once(
-                            f"Using the {name!r} chat template for the tokenizer.",
-                            level=logging.DEBUG,
-                        )
-                        break
-
-            texts = [
-                self._tokenizer.apply_chat_template(
-                    conversation=messages,
-                    tokenize=False,
-                    add_generation_prompt=True,
-                    chat_template=chat_template,
-                )
-                for messages in messages_list
-            ]
-
-            examples["text"] = texts
-
-        else:
-            prompt_prefix = ""
-            if self.dataset_config.prompt_prefix:
-                prompt_prefix = self.dataset_config.prompt_prefix + "\n\n"
-
-            few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
-            if few_shot_prompt:
-                few_shot_prompt += "\n\n"
-
-            examples["text"] = [
-                prompt_prefix + few_shot_prompt + new_prompt
-                for new_prompt, _ in new_sections
-            ]
-
-        return examples
-
     @property
     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.
@@ -1169,7 +949,7 @@ def clear_vllm() -> None:


 def get_end_of_reasoning_token_id(
-    model: "LLM", tokenizer: "PreTrainedTokenizer"
+    model: "LLM", tokenizer: "PreTrainedTokenizer", model_id: str
 ) -> int | None:
     """Get the end of reasoning token ID for a generative model.

@@ -1182,6 +962,8 @@ def get_end_of_reasoning_token_id(
             The vLLM model.
         tokenizer:
             The tokenizer.
+        model_id:
+            The model ID.

     Returns:
         The end of reasoning token ID, or None if it could not be found.
@@ -1220,10 +1002,8 @@ def get_end_of_reasoning_token_id(
     completion_match = re.search(pattern=r"<\w+>", string=completion)
     if completion_match is None and prompt_match is None:
         log_once(
-            [removed lines 1223-1224 not captured in this diff view]
-                "reasoning model."
-            ),
+            f"Could not find a reasoning token for model {model_id!r}, so assuming "
+            "the model is not a reasoning model.",
             level=logging.DEBUG,
         )
         return None
@@ -1249,20 +1029,17 @@ def get_end_of_reasoning_token_id(
         or end_of_reasoning_token not in special_tokens
     ):
         log_once(
-            [removed lines 1252-1255 not captured in this diff view]
-            ),
+            f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+            f"token {end_of_reasoning_token!r} for model {model_id!r}, but one of "
+            "them is not registered as a special token, so assuming it is not a "
+            "real reasoning token.",
             level=logging.DEBUG,
         )
         return None

     log_once(
-        [removed lines 1262-1263 not captured in this diff view]
-            f"token {end_of_reasoning_token!r}."
-        ),
+        f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+        f"token {end_of_reasoning_token!r} for model {model_id!r}.",
         level=logging.DEBUG,
     )

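The reworked log messages come from the same detection logic visible in the context lines: a "<token>"-style marker found with the <\w+> regex only counts as a reasoning token when the tokenizer registers it as a special token. A rough standalone illustration of that check, with an invented completion string and an arbitrary tokenizer:

import re

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative tokenizer
completion = "<think> reasoning goes here </think> The answer is 42."

match = re.search(pattern=r"<\w+>", string=completion)
if match is not None:
    candidate = match.group()
    # Only trust the marker if it is a registered special token of the tokenizer.
    is_reasoning_token = candidate in tokenizer.all_special_tokens
    print(candidate, is_reasoning_token)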
euroeval/benchmarker.py
CHANGED
@@ -774,6 +774,7 @@ class Benchmarker:
             metric_configs=dataset_config.task.metrics,
             scores=scores,
             model_id=model_config.model_id,
+            model_revision=model_config.revision,
         )

         record = BenchmarkResult(
@@ -782,7 +783,11 @@ class Benchmarker:
             dataset_languages=[
                 language.code for language in dataset_config.languages
             ],
-            model=
+            model=(
+                f"{model_config.model_id}@{model_config.revision}"
+                if model_config.revision and model_config.revision != "main"
+                else model_config.model_id
+            ),
             results=results,
             num_model_parameters=model.num_params,
             max_sequence_length=model.model_max_length,
@@ -1076,6 +1081,10 @@ def initial_logging(
         benchmark_config:
             The general benchmark configuration.
     """
+    model_id = model_config.model_id
+    if model_config.revision and model_config.revision != "main":
+        model_id += f"@{model_config.revision}"
+
    split_type = "validation" if not benchmark_config.evaluate_test_split else "test"
    if model_config.task in GENERATIVE_PIPELINE_TAGS:
        if benchmark_config.few_shot:
@@ -1084,8 +1093,9 @@ def initial_logging(
             eval_type = "Zero-shot benchmarking"
         else:
             eval_type = "Benchmarking"
+
         logger.info(
-            f"{eval_type} {
+            f"{eval_type} {model_id} on the {split_type} split of "
             f"{dataset_config.pretty_name}"
         )

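The benchmarker.py changes apply one convention in several places: the revision is appended to the model ID with an "@" separator, but only when it is set and differs from the default "main" branch. A tiny hypothetical helper capturing that rule:

def format_model_id(model_id: str, revision: str | None) -> str:
    # Append the revision only when it is set and is not the default branch.
    if revision and revision != "main":
        return f"{model_id}@{revision}"
    return model_id


assert format_model_id("org/model", "main") == "org/model"
assert format_model_id("org/model", None) == "org/model"
assert format_model_id("org/model", "abc1234") == "org/model@abc1234"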
@@ -1095,6 +1105,7 @@ def initial_logging(
             "meaning that the resulting evaluation will not be included in the "
             "official leaderboard."
         )
+
     if benchmark_config.debug:
         logger.info(
             "Running in debug mode. This will output additional information, as "
euroeval/constants.py
CHANGED
@@ -16,7 +16,7 @@ MAX_CONTEXT_LENGTH = 5_000

 # We need to raise the amount of tokens generated for reasoning models, to give them
 # time to think
-REASONING_MAX_TOKENS =
+REASONING_MAX_TOKENS = 32_768


 # The Hugging Face Hub pipeline tags used to classify models as generative
|