ScandEval 16.10.1 → 16.12.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scandeval/__init__.py +0 -9
- scandeval/benchmark_config_factory.py +5 -0
- scandeval/benchmark_modules/hf.py +36 -8
- scandeval/benchmark_modules/litellm.py +119 -22
- scandeval/benchmark_modules/vllm.py +202 -94
- scandeval/benchmarker.py +28 -7
- scandeval/cli.py +13 -0
- scandeval/constants.py +31 -2
- scandeval/data_models.py +12 -2
- scandeval/dataset_configs/dutch.py +10 -0
- scandeval/logging_utils.py +1 -1
- scandeval/metrics/__init__.py +1 -0
- scandeval/metrics/bias.py +237 -0
- scandeval/metrics/huggingface.py +5 -3
- scandeval/metrics/llm_as_a_judge.py +79 -15
- scandeval/model_loading.py +2 -1
- scandeval/task_group_utils/sequence_classification.py +12 -3
- scandeval/tasks.py +22 -0
- scandeval/tokenisation_utils.py +12 -1
- scandeval/types.py +39 -0
- scandeval/utils.py +38 -66
- {scandeval-16.10.1.dist-info → scandeval-16.12.0.dist-info}/METADATA +50 -24
- {scandeval-16.10.1.dist-info → scandeval-16.12.0.dist-info}/RECORD +26 -25
- {scandeval-16.10.1.dist-info → scandeval-16.12.0.dist-info}/licenses/LICENSE +1 -1
- {scandeval-16.10.1.dist-info → scandeval-16.12.0.dist-info}/WHEEL +0 -0
- {scandeval-16.10.1.dist-info → scandeval-16.12.0.dist-info}/entry_points.txt +0 -0
scandeval/__init__.py
CHANGED
@@ -110,15 +110,6 @@ os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
 os.environ["VLLM_USE_V1"] = "1"


-# Use the FlashInfer flash-attention backend for vLLM, unless the user has already
-# specified a different backend.
-if os.getenv("VLLM_ATTENTION_BACKEND") is None:
-    os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
-    os.environ["USER_HAS_SET_VLLM_ATTENTION_BACKEND"] = "0"
-else:
-    os.environ["USER_HAS_SET_VLLM_ATTENTION_BACKEND"] = "1"
-
-
 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
 # former and LiteLLM uses the latter
 if os.getenv("HUGGINGFACE_API_KEY"):
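The hunk above drops the implicit FlashInfer default for vLLM's attention backend; the hunks further down add an `attention_backend` option to the benchmark config instead. If a setup relied on the old implicit default, a minimal workaround (my assumption, not something this diff prescribes) is to set the environment variable yourself before importing scandeval:

```python
import os

# Restore the old behaviour manually (assumption: you still want FlashInfer and
# have it installed); scandeval no longer sets this for you as of 16.12.0.
os.environ.setdefault("VLLM_ATTENTION_BACKEND", "FLASHINFER")

import scandeval  # noqa: E402
```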
scandeval/benchmark_config_factory.py
CHANGED
@@ -1,6 +1,7 @@
 """Factory class for creating dataset configurations."""

 import collections.abc as c
+import importlib.util
 import sys
 import typing as t
 from pathlib import Path
@@ -13,6 +14,9 @@ from .enums import Device
 from .exceptions import InvalidBenchmark
 from .languages import get_all_languages

+if importlib.util.find_spec("vllm") is not None:
+    pass
+
 if t.TYPE_CHECKING:
     from .data_models import Language

@@ -68,6 +72,7 @@ def build_benchmark_config(
         api_base=benchmark_config_params.api_base,
         api_version=benchmark_config_params.api_version,
         gpu_memory_utilization=benchmark_config_params.gpu_memory_utilization,
+        attention_backend=benchmark_config_params.attention_backend,
         generative_type=benchmark_config_params.generative_type,
         debug=benchmark_config_params.debug,
         run_with_cli=benchmark_config_params.run_with_cli,
scandeval/benchmark_modules/hf.py
CHANGED
@@ -758,12 +758,35 @@ def get_model_repo_info(
     # model info object.
     model_info: HfApiModelInfo | None = None
     if Path(model_id).is_dir():
-
-
-
-
-
+        if Path(model_id, "config.json").exists():
+            log_once(
+                f"The local model directory {model_id!r} has a 'config.json' file, so "
+                "we're skipping looking up model information from the Hugging Face "
+                "Hub.",
+                level=logging.DEBUG,
+            )
             model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
+        elif Path(model_id, "adapter_config.json").exists():
+            log_once(
+                f"The local model directory {model_id!r} has an 'adapter_config.json' "
+                "file, so we're skipping looking up model information from the Hugging "
+                "Face Hub.",
+                level=logging.DEBUG,
+            )
+            model_info = HfApiModelInfo(
+                id=model_id,
+                tags=None,
+                pipeline_tag=None,
+                siblings=[dict(rfilename="adapter_config.json")],
+            )
+        else:
+            log_once(
+                f"The local model directory {model_id} does not contain any of the "
+                f"required files: {LOCAL_MODELS_REQUIRED_FILES}. Skipping this "
+                f"model.",
+                level=logging.WARNING,
+            )
+            return None

     # If we have not internet, and the model_id is not a directory for a local model
     # we also just create a dummy model info object.
@@ -863,8 +886,9 @@ def get_model_repo_info(
             for tag in GENERATIVE_PIPELINE_TAGS
             for class_name in TASK_MAPPING.get(tag, dict()).values()  # type: ignore[attr-defined]
         ]
-        if class_names is not None and any(
-            class_name in generative_class_names for class_name in class_names
+        if class_names is not None and (
+            any(class_name in generative_class_names for class_name in class_names)
+            or any("ForCausalLM" in class_name for class_name in class_names)
         ):
             pipeline_tag = "text-generation"
         else:
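The condition above now also treats any architecture whose class name contains "ForCausalLM" as generative, even when it is missing from `TASK_MAPPING`. A quick illustration with made-up values (the real `class_names` come from the Hub model config, which is not part of this diff):

```python
# Hypothetical architectures listed in a model's config.json
class_names = ["MistralForCausalLM"]
# Mapping that, for the sake of the example, does not know about Mistral
generative_class_names = ["GPT2LMHeadModel"]

is_generative = class_names is not None and (
    any(class_name in generative_class_names for class_name in class_names)
    or any("ForCausalLM" in class_name for class_name in class_names)
)
print(is_generative)  # True, thanks to the new "ForCausalLM" fallback
```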
@@ -1108,7 +1132,11 @@ def load_hf_model_config(
     )

     # Ensure that the PAD token ID is set
-    if
+    if (
+        hasattr(config, "eos_token_id")
+        and config.eos_token_id is not None
+        and (not hasattr(config, "pad_token_id") or config.pad_token_id is None)
+    ):
         if isinstance(config.eos_token_id, list):
             config.pad_token_id = config.eos_token_id[0]
         else:
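The fallback picks the first EOS token ID as the PAD token when a config defines no pad token. A self-contained sketch with a dummy config object (not the real transformers config, just illustrative):

```python
from types import SimpleNamespace

# Dummy stand-in for a Hugging Face model config with no pad token set
config = SimpleNamespace(eos_token_id=[2, 32000], pad_token_id=None)

if (
    hasattr(config, "eos_token_id")
    and config.eos_token_id is not None
    and (not hasattr(config, "pad_token_id") or config.pad_token_id is None)
):
    if isinstance(config.eos_token_id, list):
        config.pad_token_id = config.eos_token_id[0]
    else:
        config.pad_token_id = config.eos_token_id

print(config.pad_token_id)  # 2
```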
scandeval/benchmark_modules/litellm.py
CHANGED
@@ -4,6 +4,7 @@ import asyncio
 import collections.abc as c
 import json
 import logging
+import os
 import re
 import typing as t
 from functools import cached_property, partial
@@ -32,9 +33,10 @@ from litellm.exceptions import (
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
+from litellm.types.router import RouterRateLimitError
 from litellm.types.utils import ChoiceLogprobs, Logprobs
 from litellm.utils import supports_reasoning, supports_response_schema
-from pydantic import conlist, create_model
+from pydantic import ValidationError, conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async

@@ -99,12 +101,13 @@ if t.TYPE_CHECKING:

 VOCAB_SIZE_MAPPING = {
     # OpenAI models
+    r"gpt-5\.2.*": -1,
     r"gpt-5-.*": 100_256,
     r"gpt-4-(32k)?(-[0-9]{4})?": 100_256,
     r"gpt-4-[0-9]{4}-preview": 100_256,
     r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
     r"gpt-4-(vision|turbo)(-preview)?": 100_256,
-    r"gpt-3
+    r"gpt-3\.5-turbo-instruct(-[0-9]{4})?": 100_256,
     r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_019,
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
     # Anthropic models
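These dictionaries are keyed by regexes over model IDs. The lookup helper itself is not part of this diff, but conceptually the first key that matches the model ID wins; a minimal sketch of that idea using the new `gpt-5\.2.*` entry (the mapping excerpt and helper below are illustrative, not ScandEval's actual lookup code):

```python
import re

VOCAB_SIZE_MAPPING = {
    r"gpt-5\.2.*": -1,
    r"gpt-5-.*": 100_256,
    r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_019,
}


def lookup_vocab_size(model_id: str) -> int | None:
    """Return the vocab size for the first regex key matching the model ID."""
    for pattern, vocab_size in VOCAB_SIZE_MAPPING.items():
        if re.fullmatch(pattern, model_id):
            return vocab_size
    return None


print(lookup_vocab_size("gpt-5.2-pro"))  # -1, i.e. "unknown / not applicable"
print(lookup_vocab_size("gpt-4o-mini"))  # 200019
```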
@@ -113,23 +116,27 @@ VOCAB_SIZE_MAPPING = {
     r"(gemini/)?gemini-[1-9](\.[0-9])?-(flash|pro).*": 256_128,
     # xAI models
     r"(xai/)?grok.*": -1,
+    # Chat.dk models
+    r"(ordbogen/)?odin-medium.*": -1,
+    r"(ordbogen/)?odin-large.*": -1,
 }


 MODEL_MAX_LENGTH_MAPPING = {
     # OpenAI models
+    r"gpt-5\.2.*": 400_000,
     r"gpt-5-.*": 272_000,
     r"gpt-4(-[0-9]{4})?": 8_191,
     r"gpt-4-32k(-[0-9]{4})?": 32_767,
     r"gpt-4-[0-9]{4}-preview": 128_000,
     r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
     r"gpt-4-(vision|turbo)(-preview)?": 128_000,
-    r"gpt-3
+    r"gpt-3\.5-turbo-instruct(-[0-9]{4})?": 4_095,
     r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
     r"o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
     r"o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
     r"o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
-    r"gpt-4
+    r"gpt-4\.1.*": 1_047_576,
     # Anthropic models
     r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
     r"(anthropic/)?claude-(opus|sonnet|haiku)-[1-9](-[1-9])?-[0-9]{8}": 200_000,
@@ -139,12 +146,15 @@ MODEL_MAX_LENGTH_MAPPING = {
     r"(gemini/)?gemini-[23](\.[05])?.*": 1_048_576,
     # xAI models
     r"(xai/)?grok.*": 131_072,
+    # Chat.dk models
+    r"(ordbogen/)?odin-medium.*": 131_072,
+    r"(ordbogen/)?odin-large.*": 202_752,
 }


 NUM_PARAMS_MAPPING = {
     # OpenAI models
-    r"gpt-5
+    r"gpt-5.*": -1,
     r"gpt-4.*": -1,
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
     # Anthropic models
@@ -155,6 +165,9 @@ NUM_PARAMS_MAPPING = {
     r"(gemini/)?gemini-[23](.[05])?.*": -1,
     # xAI models
     r"(xai/)?grok.*": -1,
+    # Chat.dk models
+    r"(ordbogen/)?odin-medium.*": -1,
+    r"(ordbogen/)?odin-large.*": -1,
 }


@@ -164,6 +177,7 @@ REASONING_MODELS = [
     r"(gemini/)?gemini-2.5.*",
     r"(xai/)?grok-3-mini.*",
     r".*gpt-oss.*",
+    r"(ordbogen/)?odin-.*",
 ]

 BASE_DECODER_MODELS = [
@@ -186,6 +200,8 @@ CUSTOM_INFERENCE_API_PREFIXES = [
     "openai/",
 ]

+UNOFFICIAL_INFERENCE_API_PREFIXES = ["ordbogen/"]
+

 class LiteLLMModel(BenchmarkModule):
     """A generative model from LiteLLM."""
@@ -220,7 +236,7 @@ class LiteLLMModel(BenchmarkModule):
         dataset_config: DatasetConfig,
         benchmark_config: BenchmarkConfig,
         log_metadata: bool = True,
-        **generation_kwargs
+        **generation_kwargs,
     ) -> None:
         """Initialise the model.

@@ -241,6 +257,10 @@ class LiteLLMModel(BenchmarkModule):
             model_config=model_config, allowed_params=self.allowed_params
         )

+        set_up_benchmark_config_for_model(
+            benchmark_config=benchmark_config, model_id=model_config.model_id
+        )
+
         # Detect whether the model is an Ollama model, as we need to extract metadata
         # differently for these models
         self.is_ollama = model_config.model_id.startswith(
@@ -401,7 +421,7 @@ class LiteLLMModel(BenchmarkModule):
             http_429_errors = [
                 idx
                 for idx, (_, error) in enumerate(failures)
-                if isinstance(error, RateLimitError)
+                if isinstance(error, RateLimitError)
             ]
             if http_429_errors and self.buffer["max_concurrent_calls"] > 1:
                 failures = [
@@ -417,7 +437,6 @@ class LiteLLMModel(BenchmarkModule):
                    f"{self.buffer['max_concurrent_calls']:,} due to rate limiting.",
                    level=logging.DEBUG,
                )
-                continue

             # Attempt to handle the exceptions, to improve the chance of getting
             # successful generations next time around
@@ -483,11 +502,13 @@ class LiteLLMModel(BenchmarkModule):
             "you've reached the maximum number of requests with logprobs",
             "logprobs is not supported",
             "logprobs is not enabled",
-            "Invalid value at 'generation_config.response_logprobs' (TYPE_BOOL)",
         ]
         logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'logprobs'.*\]"
         )
+        logprobs_argument_should_be_bool_messages = [
+            "Invalid value at 'generation_config.response_logprobs' (TYPE_BOOL)"
+        ]
         top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
         top_logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'top_logprobs'.*\]"
@@ -548,6 +569,17 @@ class LiteLLMModel(BenchmarkModule):
             generation_kwargs.pop("top_logprobs", None)
             generation_kwargs.pop("response_format", None)
             return generation_kwargs, 0
+        elif any(
+            msg.lower() in error_msg
+            for msg in logprobs_argument_should_be_bool_messages
+        ):
+            log_once(
+                f"The model {model_id!r} requires the `logprobs` argument to be a "
+                "Boolean, so setting it to True.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["logprobs"] = True
+            return generation_kwargs, 0
         elif (
             any(msg.lower() in error_msg for msg in top_logprobs_messages)
             or top_logprobs_pattern.search(string=error_msg) is not None
@@ -700,23 +732,25 @@ class LiteLLMModel(BenchmarkModule):
             ) from error

         if (
-            isinstance(error, (RateLimitError, BadRequestError))
+            isinstance(error, (RateLimitError, RouterRateLimitError, BadRequestError))
             and (
                 retry_match := re.search(
-                    pattern=
+                    pattern=(
+                        r"\b(try( again)?|retry) in ([0-9]+(\.[0-9]+)?) ?(s|seconds?)\b"
+                    ),
                     string=error_msg,
                     flags=re.IGNORECASE,
                 )
             )
             is not None
         ):
-            retry_seconds = float(retry_match.group(
+            retry_seconds = float(retry_match.group(3))
             log_once(
                 f"You have encountered your rate limit for model {model_id!r}.",
                 level=logging.DEBUG,
             )
             return generation_kwargs, int(retry_seconds)
-        elif isinstance(error, RateLimitError):
+        elif isinstance(error, (RateLimitError, RouterRateLimitError)):
             log_once(
                 f"You have encountered your rate limit for model {model_id!r}.",
                 level=logging.DEBUG,
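The new retry pattern pulls the wait time out of rate-limit error messages in several phrasings ("try again in 7s", "retry in 1.5 seconds", and so on); the number sits in capture group 3. A quick check of the pattern against a made-up provider error message:

```python
import re

pattern = r"\b(try( again)?|retry) in ([0-9]+(\.[0-9]+)?) ?(s|seconds?)\b"

# Hypothetical provider error message, just to exercise the regex
error_msg = "Rate limit reached for gpt-4o. Please try again in 2.5 seconds."

match = re.search(pattern, error_msg, flags=re.IGNORECASE)
assert match is not None
print(float(match.group(3)))  # 2.5 -> returned (as an int) as the retry delay in seconds
```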
@@ -919,12 +953,37 @@ class LiteLLMModel(BenchmarkModule):
             logprobs_obj = model_response_choices.logprobs

             if not isinstance(logprobs_obj, (Logprobs, ChoiceLogprobs)):
-
-                    "The logprobs object is malformed, so we won't use logprobs
-                    "determine the labels."
-
+                error_msg = (
+                    "The logprobs object is malformed, so we won't use logprobs "
+                    "to determine the labels."
+                )
+                if not isinstance(logprobs_obj, list):
+                    log_once(error_msg, level=logging.WARNING)
+                    continue
+
+                # Some APIs have implemented the logprobs differently, being a list
+                # of ChoiceLogprobs dictionaries rather than having that list being
+                # under the 'content' key, so we deal with that here.
+                # TODO: Maybe remove this in future if all APIs standardise this
+                try:
+                    choice_logprobs_list = [
+                        ChoiceLogprobs.model_validate(item) for item in logprobs_obj
+                    ]
+                except ValidationError:
+                    log_once(error_msg, level=logging.WARNING)
+                    continue
+                if not all(
+                    len(item.content or []) == 1 for item in choice_logprobs_list
+                ):
+                    log_once(error_msg, level=logging.WARNING)
+                    continue
+                logprobs_obj = ChoiceLogprobs(
+                    content=[
+                        item.content[0]
+                        for item in choice_logprobs_list
+                        if item.content
+                    ]
                 )
-                continue

             logprobs_list: c.Sequence[c.Sequence[tuple[str, float]]]
             if isinstance(logprobs_obj, ChoiceLogprobs):
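The comment in the hunk refers to APIs that return logprobs as a bare per-token list instead of nesting the tokens under a 'content' key. Roughly (my own illustration of the two payload shapes, with OpenAI-style field names; not taken from the diff):

```python
# Shape used by OpenAI-style APIs: one object with the tokens under "content"
standard_payload = {
    "content": [
        {"token": "a", "logprob": -0.12, "top_logprobs": []},
        {"token": "b", "logprob": -2.31, "top_logprobs": []},
    ]
}

# Shape used by some other APIs: a list of single-token objects, each already
# looking like the "content"-keyed object above
variant_payload = [
    {"content": [{"token": "a", "logprob": -0.12, "top_logprobs": []}]},
    {"content": [{"token": "b", "logprob": -2.31, "top_logprobs": []}]},
]

# The new code validates each list item and flattens it back into a single
# "content" list, so both shapes end up looking like `standard_payload`.
flattened = {"content": [item["content"][0] for item in variant_payload]}
assert flattened == standard_payload
```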
@@ -964,10 +1023,9 @@ class LiteLLMModel(BenchmarkModule):

         if not sequences:
             log(
-                "No sequences were generated by the model "
-
-                "
-                "Returning an empty GenerativeModelOutput.",
+                f"No sequences were generated by the model {model_id!r}. This may be "
+                "due to the model running out of tokens or an issue with the input "
+                "data. Returning an empty GenerativeModelOutput.",
                 level=logging.WARNING,
             )
             return GenerativeModelOutput(sequences=[], scores=None)
@@ -1295,6 +1353,10 @@ class LiteLLMModel(BenchmarkModule):
         if model_id in litellm.model_list:
             return True

+        set_up_benchmark_config_for_model(
+            benchmark_config=benchmark_config, model_id=model_id
+        )
+
         # Separate check for Ollama models
         if model_id.startswith("ollama/") or model_id.startswith("ollama_chat/"):
             ollama_model_exists = try_download_ollama_model(
@@ -1596,6 +1658,11 @@ class LiteLLMModel(BenchmarkModule):
                 level=logging.DEBUG,
             )

+        # If the model is a Chat.dk model, we make sure reasoning traces are not
+        # included in the output
+        if self.model_config.model_id.startswith("ordbogen/"):
+            generation_kwargs["include_reasoning"] = False
+
         # Handle manually set parameters
         if self.buffer["first_label_token_mapping"]:
             generation_kwargs["logprobs"] = True
@@ -1784,6 +1851,12 @@ def clean_model_id(model_id: str, benchmark_config: BenchmarkConfig) -> str:
     Returns:
         The cleaned model ID.
     """
+    # Remove unofficial prefixes
+    for unofficial_prefix in UNOFFICIAL_INFERENCE_API_PREFIXES:
+        model_id = re.sub(
+            pattern=rf"^{re.escape(unofficial_prefix)}", repl="", string=model_id
+        )
+
     if benchmark_config.api_base is not None and not any(
         model_id.startswith(prefix) for prefix in CUSTOM_INFERENCE_API_PREFIXES
     ):
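The anchored re.escape/re.sub call only strips a prefix when it appears at the very start of the model ID. A quick illustration (the second model ID is made up):

```python
import re

UNOFFICIAL_INFERENCE_API_PREFIXES = ["ordbogen/"]


def strip_unofficial_prefixes(model_id: str) -> str:
    """Strip any leading unofficial prefix from the model ID."""
    for unofficial_prefix in UNOFFICIAL_INFERENCE_API_PREFIXES:
        model_id = re.sub(
            pattern=rf"^{re.escape(unofficial_prefix)}", repl="", string=model_id
        )
    return model_id


print(strip_unofficial_prefixes("ordbogen/odin-large"))    # "odin-large"
print(strip_unofficial_prefixes("my-org/ordbogen-clone"))  # unchanged: prefix not at the start
```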
@@ -1792,4 +1865,28 @@ def clean_model_id(model_id: str, benchmark_config: BenchmarkConfig) -> str:
         else:
             prefix = "openai/"
         model_id = prefix + model_id
+
+    # When we want to evaluate an OpenAI model on a custom inference server, such as HF
+    # inference endpoints, LiteLLM gets confused since it's already using the `openai/`
+    # prefix. We thus have to add it twice, and this hack here is to ensure that we
+    # don't store the results with model ID `openai/openai/...`.
+    elif benchmark_config.api_base is not None and model_id.startswith("openai/"):
+        model_id = "openai/openai/" + re.sub(r"(openai/)*", "", model_id)
+
     return model_id
+
+
+def set_up_benchmark_config_for_model(
+    benchmark_config: BenchmarkConfig, model_id: str
+) -> None:
+    """Set up the benchmark configuration for the model.
+
+    Args:
+        benchmark_config:
+            The benchmark configuration to set up.
+        model_id:
+            The model ID.
+    """
+    if model_id.startswith("ordbogen/"):
+        benchmark_config.api_key = os.getenv("ORDBOGEN_API_KEY")
+        benchmark_config.api_base = "https://api.ordbogen.ai/v1"