EuroEval 15.13.0__py3-none-any.whl → 15.14.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic. Click here for more details.

@@ -31,6 +31,7 @@ from litellm.exceptions import (
31
31
  from litellm.llms.vertex_ai.common_utils import VertexAIError
32
32
  from litellm.router import Router
33
33
  from litellm.types.utils import ChoiceLogprobs
34
+ from litellm.utils import supports_reasoning, supports_response_schema
34
35
  from pydantic import conlist, create_model
35
36
  from requests.exceptions import RequestException
36
37
  from tqdm.asyncio import tqdm as tqdm_async
@@ -234,6 +235,8 @@ class LiteLLMModel(BenchmarkModule):
234
235
  pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
235
236
  ):
236
237
  type_ = GenerativeType.REASONING
238
+ elif supports_reasoning(model=self.model_config.model_id):
239
+ type_ = GenerativeType.REASONING
237
240
  else:
238
241
  type_ = GenerativeType.INSTRUCTION_TUNED
239
242
 
@@ -314,9 +317,7 @@ class LiteLLMModel(BenchmarkModule):
314
317
  "enable it.",
315
318
  level=logging.DEBUG,
316
319
  )
317
- elif litellm.utils.supports_response_schema(
318
- model=self.model_config.model_id
319
- ):
320
+ elif supports_response_schema(model=self.model_config.model_id):
320
321
  ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
321
322
  keys_and_their_types: dict[str, t.Any] = {
322
323
  tag_name: (conlist(str, max_length=5), ...)
@@ -361,7 +362,7 @@ class LiteLLMModel(BenchmarkModule):
361
362
  level=logging.DEBUG,
362
363
  )
363
364
  elif self.model_config.revision == "no-thinking":
364
- generation_kwargs["thinking"] = dict(type="disabled", budget_tokens=0)
365
+ generation_kwargs["thinking"] = dict(budget_tokens=0)
365
366
  log_once(
366
367
  f"Disabling thinking mode for model {self.model_config.model_id!r}",
367
368
  level=logging.DEBUG,
@@ -377,6 +378,19 @@ class LiteLLMModel(BenchmarkModule):
377
378
  # Drop generation kwargs that are not supported by the model
378
379
  litellm.drop_params = True
379
380
 
381
+ # First attempt is a test run with a single conversation to handle errors
382
+ # quickly
383
+ test_conversation = conversations[0]
384
+ _, failures = safe_run(
385
+ self._generate_async(
386
+ model_id=self.model_config.model_id,
387
+ conversations=[test_conversation],
388
+ **generation_kwargs,
389
+ )
390
+ )
391
+ for _, error in failures:
392
+ self._handle_exception(error=error, generation_kwargs=generation_kwargs)
393
+
380
394
  all_responses: dict[int, "ModelResponse"] = {}
381
395
  conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
382
396
  enumerate(conversations)
@@ -477,6 +491,7 @@ class LiteLLMModel(BenchmarkModule):
477
491
  r"the thinking budget [0-9]+ is invalid. please choose a value between "
478
492
  r"[0-9]+ and ([0-9]+)\."
479
493
  )
494
+ requires_thinking_disabled_messages = ["thinking.type: Field required"]
480
495
 
481
496
  if any(msg.lower() in error_msg for msg in stop_messages):
482
497
  log_once(
@@ -557,6 +572,18 @@ class LiteLLMModel(BenchmarkModule):
557
572
  type="enabled", budget_tokens=thinking_budget - 1
558
573
  )
559
574
  return
575
+ elif (
576
+ any(msg.lower() in error_msg for msg in requires_thinking_disabled_messages)
577
+ and self.generative_type != GenerativeType.REASONING
578
+ ):
579
+ log_once(
580
+ f"The model {model_id!r} requires the `thinking.type` field to be "
581
+ f"set to `disabled` rather than just setting `budget_tokens` to 0. "
582
+ "Setting `thinking.type` to `disabled`.",
583
+ level=logging.DEBUG,
584
+ )
585
+ generation_kwargs["thinking"] = dict(type="disabled")
586
+ return
560
587
  elif isinstance(
561
588
  error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
562
589
  ):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: EuroEval
3
- Version: 15.13.0
3
+ Version: 15.14.0
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -27,7 +27,7 @@ euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwR
27
27
  euroeval/benchmark_modules/base.py,sha256=D1oKD16KBvxEoBUfqwvzvcDc1hx6letdD3v1PnBmF4A,10669
28
28
  euroeval/benchmark_modules/fresh.py,sha256=sg_AXNPApFObCzCRWhCgKxfr-eqQsT6Ri0xx0_Yy5JM,10293
29
29
  euroeval/benchmark_modules/hf.py,sha256=-W_bWEdm0zePkn4nDz4l0T4hhJJnlfwHrtIO3m5BrUs,44725
30
- euroeval/benchmark_modules/litellm.py,sha256=_gKBbJsXzo_cHJVaeuQpHRBENEZUGS_vcC-uGIhhmHA,52111
30
+ euroeval/benchmark_modules/litellm.py,sha256=qv-k2ntk48OF4ikevQ95k4zLbBkZYOZ2z-GAisA-tFY,53374
31
31
  euroeval/benchmark_modules/vllm.py,sha256=kq3PMUuRT0NOky6XSHl1JeHTDGehwcub0HcGC5S_Wv4,38834
32
32
  euroeval/dataset_configs/__init__.py,sha256=EbjEyHwBtSztASl8_xblD8hessruDdV4Eg1vXrmGOuY,1935
33
33
  euroeval/dataset_configs/danish.py,sha256=-y-n08hTApwTdSVdjRlZYa3gOX92cTGhg8xsuG-Lhww,3691
@@ -56,8 +56,8 @@ euroeval/task_group_utils/question_answering.py,sha256=agwtWOmctgat98yqgFiMSPY6z
56
56
  euroeval/task_group_utils/sequence_classification.py,sha256=igmD24aMNN7QBJ8NDzgEnGwM-jq_zhC37QxazNm7GZ4,12711
57
57
  euroeval/task_group_utils/text_to_text.py,sha256=xOpja-W4E-1peMjZX8G-3G5iRgmFHHygrQ5WN1hB3FI,4550
58
58
  euroeval/task_group_utils/token_classification.py,sha256=wCy3aI-Sn9f-87tHzAnYDA6EbY3ah3xao1SnfnoRNz4,17490
59
- euroeval-15.13.0.dist-info/METADATA,sha256=HnDtAE2-sYFmSl4yM9PQhgUrfklR_OB5C5aXPOgz5U8,13478
60
- euroeval-15.13.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
61
- euroeval-15.13.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
62
- euroeval-15.13.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
63
- euroeval-15.13.0.dist-info/RECORD,,
59
+ euroeval-15.14.0.dist-info/METADATA,sha256=uQY74VCgn3TRCTXJGCb8ilS-3U5UL69lbhNGQw2NGTM,13478
60
+ euroeval-15.14.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
61
+ euroeval-15.14.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
62
+ euroeval-15.14.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
63
+ euroeval-15.14.0.dist-info/RECORD,,