EuroEval 15.13.0__py3-none-any.whl → 15.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of EuroEval has been flagged as a potentially problematic release.
- euroeval/benchmark_modules/litellm.py +31 -4
- {euroeval-15.13.0.dist-info → euroeval-15.14.0.dist-info}/METADATA +1 -1
- {euroeval-15.13.0.dist-info → euroeval-15.14.0.dist-info}/RECORD +6 -6
- {euroeval-15.13.0.dist-info → euroeval-15.14.0.dist-info}/WHEEL +0 -0
- {euroeval-15.13.0.dist-info → euroeval-15.14.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.13.0.dist-info → euroeval-15.14.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/litellm.py

@@ -31,6 +31,7 @@ from litellm.exceptions import (
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
 from litellm.types.utils import ChoiceLogprobs
+from litellm.utils import supports_reasoning, supports_response_schema
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
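The two helpers added to the imports are part of litellm's public utils; both answer capability questions from litellm's model metadata map. A minimal sketch of their use (the model IDs are illustrative, not from the diff):

    from litellm.utils import supports_reasoning, supports_response_schema

    # Each returns a bool looked up from litellm's model capability metadata.
    print(supports_reasoning(model="anthropic/claude-3-7-sonnet-20250219"))
    print(supports_response_schema(model="gpt-4o-mini"))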
@@ -234,6 +235,8 @@ class LiteLLMModel(BenchmarkModule):
             pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
         ):
             type_ = GenerativeType.REASONING
+        elif supports_reasoning(model=self.model_config.model_id):
+            type_ = GenerativeType.REASONING
         else:
             type_ = GenerativeType.INSTRUCTION_TUNED
 
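The new elif means a model missed by the REASONING_MODELS regex can still be classified as a reasoning model via litellm's metadata. A standalone sketch of the resulting decision order (REASONING_MODELS below is an illustrative stand-in for EuroEval's real list):

    import re

    from litellm.utils import supports_reasoning

    REASONING_MODELS = ["o1", "o3"]  # stand-in; EuroEval maintains its own list

    def generative_type(model_id: str) -> str:
        if re.search(pattern="|".join(REASONING_MODELS), string=model_id):
            return "reasoning"  # the explicit pattern list still wins first
        elif supports_reasoning(model=model_id):
            return "reasoning"  # new fallback via litellm's capability lookup
        else:
            return "instruction-tuned"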
@@ -314,9 +317,7 @@ class LiteLLMModel(BenchmarkModule):
                 "enable it.",
                 level=logging.DEBUG,
             )
-        elif
-            model=self.model_config.model_id
-        ):
+        elif supports_response_schema(model=self.model_config.model_id):
             ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
             keys_and_their_types: dict[str, t.Any] = {
                 tag_name: (conlist(str, max_length=5), ...)
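For context, the branch guarded by supports_response_schema goes on to build a pydantic model for structured NER output. A minimal sketch with the same pieces (the tag names and the "AnswerFormat" model name are assumptions for illustration):

    import typing as t

    from pydantic import conlist, create_model

    ner_tag_names = ["person", "location"]  # illustrative tags
    keys_and_their_types: dict[str, t.Any] = {
        tag_name: (conlist(str, max_length=5), ...)  # at most 5 entities per tag
        for tag_name in ner_tag_names
    }
    AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)
    print(AnswerFormat(person=["Ada Lovelace"], location=[]).model_dump())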
@@ -361,7 +362,7 @@ class LiteLLMModel(BenchmarkModule):
                 level=logging.DEBUG,
             )
         elif self.model_config.revision == "no-thinking":
-            generation_kwargs["thinking"] = dict(
+            generation_kwargs["thinking"] = dict(budget_tokens=0)
             log_once(
                 f"Disabling thinking mode for model {self.model_config.model_id!r}",
                 level=logging.DEBUG,
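For the call shape this feeds into: litellm.completion accepts a thinking parameter, and with drop_params enabled as in this module, kwargs a provider does not support are silently dropped. A hedged sketch (the model ID and prompt are examples only):

    import litellm

    litellm.drop_params = True  # mirrors the module's setting below
    response = litellm.completion(
        model="anthropic/claude-3-7-sonnet-20250219",
        messages=[{"role": "user", "content": "Hello"}],
        thinking=dict(budget_tokens=0),  # the new "no-thinking" value
    )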
@@ -377,6 +378,19 @@ class LiteLLMModel(BenchmarkModule):
         # Drop generation kwargs that are not supported by the model
         litellm.drop_params = True
 
+        # First attempt is a test run with a single conversation to handle errors
+        # quickly
+        test_conversation = conversations[0]
+        _, failures = safe_run(
+            self._generate_async(
+                model_id=self.model_config.model_id,
+                conversations=[test_conversation],
+                **generation_kwargs,
+            )
+        )
+        for _, error in failures:
+            self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+
         all_responses: dict[int, "ModelResponse"] = {}
         conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
             enumerate(conversations)
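safe_run and _generate_async are EuroEval internals, but the pattern this hunk adds, probing with a single conversation so configuration errors surface cheaply before the full batch is dispatched, can be sketched generically (every name below is a stand-in, not EuroEval's API):

    import asyncio

    async def generate(conversations: list, **kwargs) -> tuple[list, list]:
        # Stand-in for `_generate_async`: returns (responses, failures), where
        # each failure pairs a conversation index with the raised exception.
        responses, failures = [], []
        for idx, conversation in enumerate(conversations):
            try:
                responses.append((idx, f"reply to {conversation!r}"))  # model call here
            except Exception as error:
                failures.append((idx, error))
        return responses, failures

    def probe_then_run(conversations: list, **kwargs) -> list:
        # Test run: one conversation, so errors are handled before the batch.
        _, failures = asyncio.run(generate(conversations[:1], **kwargs))
        for _, error in failures:
            raise error  # EuroEval instead routes this through `_handle_exception`,
                         # which can repair `generation_kwargs` for a retry
        responses, _ = asyncio.run(generate(conversations, **kwargs))
        return responses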
@@ -477,6 +491,7 @@ class LiteLLMModel(BenchmarkModule):
             r"the thinking budget [0-9]+ is invalid. please choose a value between "
             r"[0-9]+ and ([0-9]+)\."
         )
+        requires_thinking_disabled_messages = ["thinking.type: Field required"]
 
         if any(msg.lower() in error_msg for msg in stop_messages):
             log_once(
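The new list plugs into the same case-insensitive substring matching already used for stop_messages, e.g. (the error text is illustrative):

    requires_thinking_disabled_messages = ["thinking.type: Field required"]
    error_msg = "badrequesterror: thinking.type: field required"  # assumed lowercased
    if any(msg.lower() in error_msg for msg in requires_thinking_disabled_messages):
        print("provider wants thinking.type explicitly set to 'disabled'")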
@@ -557,6 +572,18 @@ class LiteLLMModel(BenchmarkModule):
                 type="enabled", budget_tokens=thinking_budget - 1
             )
             return
+        elif (
+            any(msg.lower() in error_msg for msg in requires_thinking_disabled_messages)
+            and self.generative_type != GenerativeType.REASONING
+        ):
+            log_once(
+                f"The model {model_id!r} requires the `thinking.type` field to be "
+                f"set to `disabled` rather than just setting `budget_tokens` to 0. "
+                "Setting `thinking.type` to `disabled`.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["thinking"] = dict(type="disabled")
+            return
         elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
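Taken together with the earlier hunks, three thinking shapes now appear in the module; the values below are the ones visible in this diff, except the enabled budget, which is illustrative:

    thinking_enabled = dict(type="enabled", budget_tokens=1024)  # budget illustrative
    thinking_zero_budget = dict(budget_tokens=0)  # first attempt for "no-thinking"
    thinking_disabled = dict(type="disabled")  # fallback when the provider requires
                                               # an explicit thinking.type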
{euroeval-15.13.0.dist-info → euroeval-15.14.0.dist-info}/RECORD

@@ -27,7 +27,7 @@ euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwR
 euroeval/benchmark_modules/base.py,sha256=D1oKD16KBvxEoBUfqwvzvcDc1hx6letdD3v1PnBmF4A,10669
 euroeval/benchmark_modules/fresh.py,sha256=sg_AXNPApFObCzCRWhCgKxfr-eqQsT6Ri0xx0_Yy5JM,10293
 euroeval/benchmark_modules/hf.py,sha256=-W_bWEdm0zePkn4nDz4l0T4hhJJnlfwHrtIO3m5BrUs,44725
-euroeval/benchmark_modules/litellm.py,sha256=
+euroeval/benchmark_modules/litellm.py,sha256=qv-k2ntk48OF4ikevQ95k4zLbBkZYOZ2z-GAisA-tFY,53374
 euroeval/benchmark_modules/vllm.py,sha256=kq3PMUuRT0NOky6XSHl1JeHTDGehwcub0HcGC5S_Wv4,38834
 euroeval/dataset_configs/__init__.py,sha256=EbjEyHwBtSztASl8_xblD8hessruDdV4Eg1vXrmGOuY,1935
 euroeval/dataset_configs/danish.py,sha256=-y-n08hTApwTdSVdjRlZYa3gOX92cTGhg8xsuG-Lhww,3691
@@ -56,8 +56,8 @@ euroeval/task_group_utils/question_answering.py,sha256=agwtWOmctgat98yqgFiMSPY6z
 euroeval/task_group_utils/sequence_classification.py,sha256=igmD24aMNN7QBJ8NDzgEnGwM-jq_zhC37QxazNm7GZ4,12711
 euroeval/task_group_utils/text_to_text.py,sha256=xOpja-W4E-1peMjZX8G-3G5iRgmFHHygrQ5WN1hB3FI,4550
 euroeval/task_group_utils/token_classification.py,sha256=wCy3aI-Sn9f-87tHzAnYDA6EbY3ah3xao1SnfnoRNz4,17490
-euroeval-15.13.0.dist-info/METADATA,sha256=
-euroeval-15.13.0.dist-info/WHEEL,sha256=
-euroeval-15.13.0.dist-info/entry_points.txt,sha256=
-euroeval-15.13.0.dist-info/licenses/LICENSE,sha256=
-euroeval-15.13.0.dist-info/RECORD,,
+euroeval-15.14.0.dist-info/METADATA,sha256=uQY74VCgn3TRCTXJGCb8ilS-3U5UL69lbhNGQw2NGTM,13478
+euroeval-15.14.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.14.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.14.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+euroeval-15.14.0.dist-info/RECORD,,