EuroEval 15.13.0__py3-none-any.whl → 15.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of EuroEval has been flagged as a potentially problematic release.
- euroeval/benchmark_modules/litellm.py +31 -4
- {euroeval-15.13.0.dist-info → euroeval-15.14.0.dist-info}/METADATA +1 -1
- {euroeval-15.13.0.dist-info → euroeval-15.14.0.dist-info}/RECORD +6 -6
- {euroeval-15.13.0.dist-info → euroeval-15.14.0.dist-info}/WHEEL +0 -0
- {euroeval-15.13.0.dist-info → euroeval-15.14.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.13.0.dist-info → euroeval-15.14.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/litellm.py

@@ -31,6 +31,7 @@ from litellm.exceptions import (
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
 from litellm.types.utils import ChoiceLogprobs
+from litellm.utils import supports_reasoning, supports_response_schema
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
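The two helpers added to the imports are part of litellm's public utils; both answer capability questions from litellm's model metadata map. A minimal sketch of their use (the model IDs are illustrative, not from the diff):

    from litellm.utils import supports_reasoning, supports_response_schema

    # Each returns a bool looked up from litellm's model capability metadata.
    print(supports_reasoning(model="anthropic/claude-3-7-sonnet-20250219"))
    print(supports_response_schema(model="gpt-4o-mini"))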
@@ -234,6 +235,8 @@ class LiteLLMModel(BenchmarkModule):
             pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
         ):
             type_ = GenerativeType.REASONING
+        elif supports_reasoning(model=self.model_config.model_id):
+            type_ = GenerativeType.REASONING
         else:
             type_ = GenerativeType.INSTRUCTION_TUNED
 
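The new elif means a model missed by the REASONING_MODELS regex can still be classified as a reasoning model via litellm's metadata. A standalone sketch of the resulting decision order (REASONING_MODELS below is an illustrative stand-in for EuroEval's real list):

    import re

    from litellm.utils import supports_reasoning

    REASONING_MODELS = ["o1", "o3"]  # stand-in; EuroEval maintains its own list

    def generative_type(model_id: str) -> str:
        if re.search(pattern="|".join(REASONING_MODELS), string=model_id):
            return "reasoning"  # the explicit pattern list still wins first
        elif supports_reasoning(model=model_id):
            return "reasoning"  # new fallback via litellm's capability lookup
        else:
            return "instruction-tuned"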
@@ -314,9 +317,7 @@ class LiteLLMModel(BenchmarkModule):
                 "enable it.",
                 level=logging.DEBUG,
             )
-        elif
-            model=self.model_config.model_id
-        ):
+        elif supports_response_schema(model=self.model_config.model_id):
             ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
             keys_and_their_types: dict[str, t.Any] = {
                 tag_name: (conlist(str, max_length=5), ...)
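For context, the branch guarded by supports_response_schema goes on to build a pydantic model for structured NER output. A minimal sketch with the same pieces (the tag names and the "AnswerFormat" model name are assumptions for illustration):

    import typing as t

    from pydantic import conlist, create_model

    ner_tag_names = ["person", "location"]  # illustrative tags
    keys_and_their_types: dict[str, t.Any] = {
        tag_name: (conlist(str, max_length=5), ...)  # at most 5 entities per tag
        for tag_name in ner_tag_names
    }
    AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)
    print(AnswerFormat(person=["Ada Lovelace"], location=[]).model_dump())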
@@ -361,7 +362,7 @@ class LiteLLMModel(BenchmarkModule):
                 level=logging.DEBUG,
             )
         elif self.model_config.revision == "no-thinking":
-            generation_kwargs["thinking"] = dict(
+            generation_kwargs["thinking"] = dict(budget_tokens=0)
             log_once(
                 f"Disabling thinking mode for model {self.model_config.model_id!r}",
                 level=logging.DEBUG,
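For the call shape this feeds into: litellm.completion accepts a thinking parameter, and with drop_params enabled as in this module, kwargs a provider does not support are silently dropped. A hedged sketch (the model ID and prompt are examples only):

    import litellm

    litellm.drop_params = True  # mirrors the module's setting below
    response = litellm.completion(
        model="anthropic/claude-3-7-sonnet-20250219",
        messages=[{"role": "user", "content": "Hello"}],
        thinking=dict(budget_tokens=0),  # the new "no-thinking" value
    )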
@@ -377,6 +378,19 @@ class LiteLLMModel(BenchmarkModule):
         # Drop generation kwargs that are not supported by the model
         litellm.drop_params = True
 
+        # First attempt is a test run with a single conversation to handle errors
+        # quickly
+        test_conversation = conversations[0]
+        _, failures = safe_run(
+            self._generate_async(
+                model_id=self.model_config.model_id,
+                conversations=[test_conversation],
+                **generation_kwargs,
+            )
+        )
+        for _, error in failures:
+            self._handle_exception(error=error, generation_kwargs=generation_kwargs)
+
         all_responses: dict[int, "ModelResponse"] = {}
         conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
             enumerate(conversations)
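safe_run and _generate_async are EuroEval internals, but the pattern this hunk adds, probing with a single conversation so configuration errors surface cheaply before the full batch is dispatched, can be sketched generically (every name below is a stand-in, not EuroEval's API):

    import asyncio

    async def generate(conversations: list, **kwargs) -> tuple[list, list]:
        # Stand-in for `_generate_async`: returns (responses, failures), where
        # each failure pairs a conversation index with the raised exception.
        responses, failures = [], []
        for idx, conversation in enumerate(conversations):
            try:
                responses.append((idx, f"reply to {conversation!r}"))  # model call here
            except Exception as error:
                failures.append((idx, error))
        return responses, failures

    def probe_then_run(conversations: list, **kwargs) -> list:
        # Test run: one conversation, so errors are handled before the batch.
        _, failures = asyncio.run(generate(conversations[:1], **kwargs))
        for _, error in failures:
            raise error  # EuroEval instead routes this through `_handle_exception`,
                         # which can repair `generation_kwargs` for a retry
        responses, _ = asyncio.run(generate(conversations, **kwargs))
        return responses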
@@ -477,6 +491,7 @@ class LiteLLMModel(BenchmarkModule):
             r"the thinking budget [0-9]+ is invalid. please choose a value between "
             r"[0-9]+ and ([0-9]+)\."
         )
+        requires_thinking_disabled_messages = ["thinking.type: Field required"]
 
         if any(msg.lower() in error_msg for msg in stop_messages):
             log_once(
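The new list plugs into the same case-insensitive substring matching already used for stop_messages, e.g. (the error text is illustrative):

    requires_thinking_disabled_messages = ["thinking.type: Field required"]
    error_msg = "badrequesterror: thinking.type: field required"  # assumed lowercased
    if any(msg.lower() in error_msg for msg in requires_thinking_disabled_messages):
        print("provider wants thinking.type explicitly set to 'disabled'")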
@@ -557,6 +572,18 @@ class LiteLLMModel(BenchmarkModule):
                 type="enabled", budget_tokens=thinking_budget - 1
             )
             return
+        elif (
+            any(msg.lower() in error_msg for msg in requires_thinking_disabled_messages)
+            and self.generative_type != GenerativeType.REASONING
+        ):
+            log_once(
+                f"The model {model_id!r} requires the `thinking.type` field to be "
+                f"set to `disabled` rather than just setting `budget_tokens` to 0. "
+                "Setting `thinking.type` to `disabled`.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["thinking"] = dict(type="disabled")
+            return
         elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
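Taken together with the earlier hunks, three thinking shapes now appear in the module; the values below are the ones visible in this diff, except the enabled budget, which is illustrative:

    thinking_enabled = dict(type="enabled", budget_tokens=1024)  # budget illustrative
    thinking_zero_budget = dict(budget_tokens=0)  # first attempt for "no-thinking"
    thinking_disabled = dict(type="disabled")  # fallback when the provider requires
                                               # an explicit thinking.type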
{euroeval-15.13.0.dist-info → euroeval-15.14.0.dist-info}/RECORD

@@ -27,7 +27,7 @@ euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwR
 euroeval/benchmark_modules/base.py,sha256=D1oKD16KBvxEoBUfqwvzvcDc1hx6letdD3v1PnBmF4A,10669
 euroeval/benchmark_modules/fresh.py,sha256=sg_AXNPApFObCzCRWhCgKxfr-eqQsT6Ri0xx0_Yy5JM,10293
 euroeval/benchmark_modules/hf.py,sha256=-W_bWEdm0zePkn4nDz4l0T4hhJJnlfwHrtIO3m5BrUs,44725
-euroeval/benchmark_modules/litellm.py,sha256=
+euroeval/benchmark_modules/litellm.py,sha256=qv-k2ntk48OF4ikevQ95k4zLbBkZYOZ2z-GAisA-tFY,53374
 euroeval/benchmark_modules/vllm.py,sha256=kq3PMUuRT0NOky6XSHl1JeHTDGehwcub0HcGC5S_Wv4,38834
 euroeval/dataset_configs/__init__.py,sha256=EbjEyHwBtSztASl8_xblD8hessruDdV4Eg1vXrmGOuY,1935
 euroeval/dataset_configs/danish.py,sha256=-y-n08hTApwTdSVdjRlZYa3gOX92cTGhg8xsuG-Lhww,3691
@@ -56,8 +56,8 @@ euroeval/task_group_utils/question_answering.py,sha256=agwtWOmctgat98yqgFiMSPY6z
 euroeval/task_group_utils/sequence_classification.py,sha256=igmD24aMNN7QBJ8NDzgEnGwM-jq_zhC37QxazNm7GZ4,12711
 euroeval/task_group_utils/text_to_text.py,sha256=xOpja-W4E-1peMjZX8G-3G5iRgmFHHygrQ5WN1hB3FI,4550
 euroeval/task_group_utils/token_classification.py,sha256=wCy3aI-Sn9f-87tHzAnYDA6EbY3ah3xao1SnfnoRNz4,17490
-euroeval-15.13.0.dist-info/METADATA,sha256=
-euroeval-15.13.0.dist-info/WHEEL,sha256=
-euroeval-15.13.0.dist-info/entry_points.txt,sha256=
-euroeval-15.13.0.dist-info/licenses/LICENSE,sha256=
-euroeval-15.13.0.dist-info/RECORD,,
+euroeval-15.14.0.dist-info/METADATA,sha256=uQY74VCgn3TRCTXJGCb8ilS-3U5UL69lbhNGQw2NGTM,13478
+euroeval-15.14.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.14.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.14.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+euroeval-15.14.0.dist-info/RECORD,,