ScandEval 16.10.1__py3-none-any.whl → 16.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scandeval/__init__.py CHANGED
@@ -110,15 +110,6 @@ os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
 os.environ["VLLM_USE_V1"] = "1"
 
 
-# Use the FlashInfer flash-attention backend for vLLM, unless the user has already
-# specified a different backend.
-if os.getenv("VLLM_ATTENTION_BACKEND") is None:
-    os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
-    os.environ["USER_HAS_SET_VLLM_ATTENTION_BACKEND"] = "0"
-else:
-    os.environ["USER_HAS_SET_VLLM_ATTENTION_BACKEND"] = "1"
-
-
 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
 # former and LiteLLM uses the latter
 if os.getenv("HUGGINGFACE_API_KEY"):
@@ -1,6 +1,7 @@
 """Factory class for creating dataset configurations."""
 
 import collections.abc as c
+import importlib.util
 import sys
 import typing as t
 from pathlib import Path
@@ -13,6 +14,9 @@ from .enums import Device
 from .exceptions import InvalidBenchmark
 from .languages import get_all_languages
 
+if importlib.util.find_spec("vllm") is not None:
+    pass
+
 if t.TYPE_CHECKING:
     from .data_models import Language
 
@@ -68,6 +72,7 @@ def build_benchmark_config(
         api_base=benchmark_config_params.api_base,
         api_version=benchmark_config_params.api_version,
         gpu_memory_utilization=benchmark_config_params.gpu_memory_utilization,
+        attention_backend=benchmark_config_params.attention_backend,
         generative_type=benchmark_config_params.generative_type,
         debug=benchmark_config_params.debug,
         run_with_cli=benchmark_config_params.run_with_cli,
@@ -758,12 +758,35 @@ def get_model_repo_info(
     # model info object.
     model_info: HfApiModelInfo | None = None
     if Path(model_id).is_dir():
-        log(f"Checking for local model in {model_id}.", level=logging.DEBUG)
-        if all(
-            (Path(model_id) / required_file).exists()
-            for required_file in LOCAL_MODELS_REQUIRED_FILES
-        ):
+        if Path(model_id, "config.json").exists():
+            log_once(
+                f"The local model directory {model_id!r} has a 'config.json' file, so "
+                "we're skipping looking up model information from the Hugging Face "
+                "Hub.",
+                level=logging.DEBUG,
+            )
             model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
+        elif Path(model_id, "adapter_config.json").exists():
+            log_once(
+                f"The local model directory {model_id!r} has an 'adapter_config.json' "
+                "file, so we're skipping looking up model information from the Hugging "
+                "Face Hub.",
+                level=logging.DEBUG,
+            )
+            model_info = HfApiModelInfo(
+                id=model_id,
+                tags=None,
+                pipeline_tag=None,
+                siblings=[dict(rfilename="adapter_config.json")],
+            )
+        else:
+            log_once(
+                f"The local model directory {model_id} does not contain any of the "
+                f"required files: {LOCAL_MODELS_REQUIRED_FILES}. Skipping this "
+                f"model.",
+                level=logging.WARNING,
+            )
+            return None
 
     # If we have not internet, and the model_id is not a directory for a local model
     # we also just create a dummy model info object.
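
The rewritten check above decides how to treat a local model directory purely from which config file it contains. A standalone sketch of the same decision (the helper name is illustrative, not part of the package):

    from pathlib import Path

    def classify_local_model(model_dir: str) -> str | None:
        """Illustrative helper mirroring the new check."""
        if Path(model_dir, "config.json").exists():
            return "full model"          # treated as a regular local model
        if Path(model_dir, "adapter_config.json").exists():
            return "adapter"             # treated as an adapter (e.g. LoRA)
        return None                      # neither file found: skipped with a warning
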
@@ -863,8 +886,9 @@ def get_model_repo_info(
         for tag in GENERATIVE_PIPELINE_TAGS
         for class_name in TASK_MAPPING.get(tag, dict()).values()  # type: ignore[attr-defined]
     ]
-    if class_names is not None and any(
-        class_name in generative_class_names for class_name in class_names
+    if class_names is not None and (
+        any(class_name in generative_class_names for class_name in class_names)
+        or any("ForCausalLM" in class_name for class_name in class_names)
     ):
         pipeline_tag = "text-generation"
     else:
@@ -1108,7 +1132,11 @@ load_hf_model_config(
     )
 
     # Ensure that the PAD token ID is set
-    if config.eos_token_id is not None and config.pad_token_id is None:
+    if (
+        hasattr(config, "eos_token_id")
+        and config.eos_token_id is not None
+        and (not hasattr(config, "pad_token_id") or config.pad_token_id is None)
+    ):
         if isinstance(config.eos_token_id, list):
             config.pad_token_id = config.eos_token_id[0]
         else:
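
The extra `hasattr` guards above appear to protect against config objects that do not define `eos_token_id` or `pad_token_id` at all. A minimal sketch of the same fallback on a plain `transformers.PretrainedConfig` (purely illustrative; the token IDs are made up):

    from transformers import PretrainedConfig

    # A list-valued eos_token_id contributes its first entry as the pad token.
    config = PretrainedConfig(eos_token_id=[128001, 128009])
    if (
        hasattr(config, "eos_token_id")
        and config.eos_token_id is not None
        and (not hasattr(config, "pad_token_id") or config.pad_token_id is None)
    ):
        config.pad_token_id = (
            config.eos_token_id[0]
            if isinstance(config.eos_token_id, list)
            else config.eos_token_id
        )
    print(config.pad_token_id)  # 128001
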
@@ -4,6 +4,7 @@ import asyncio
 import collections.abc as c
 import json
 import logging
+import os
 import re
 import typing as t
 from functools import cached_property, partial
@@ -32,9 +33,10 @@ from litellm.exceptions import (
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
+from litellm.types.router import RouterRateLimitError
 from litellm.types.utils import ChoiceLogprobs, Logprobs
 from litellm.utils import supports_reasoning, supports_response_schema
-from pydantic import conlist, create_model
+from pydantic import ValidationError, conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
 
@@ -99,12 +101,13 @@ if t.TYPE_CHECKING:
 
 VOCAB_SIZE_MAPPING = {
     # OpenAI models
+    r"gpt-5\.2.*": -1,
     r"gpt-5-.*": 100_256,
     r"gpt-4-(32k)?(-[0-9]{4})?": 100_256,
     r"gpt-4-[0-9]{4}-preview": 100_256,
     r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
     r"gpt-4-(vision|turbo)(-preview)?": 100_256,
-    r"gpt-3.5-turbo-instruct(-[0-9]{4})?": 100_256,
+    r"gpt-3\.5-turbo-instruct(-[0-9]{4})?": 100_256,
     r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_019,
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
     # Anthropic models
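
The escaping fixes in this and the following hunk matter because an unescaped `.` in a regex matches any character, so the old patterns could match unintended model IDs. A quick illustration (the odd model ID is invented purely for the demonstration):

    import re

    loose = re.compile(r"gpt-3.5-turbo-instruct(-[0-9]{4})?")
    strict = re.compile(r"gpt-3\.5-turbo-instruct(-[0-9]{4})?")
    print(bool(loose.fullmatch("gpt-315-turbo-instruct")))   # True: '.' matches '1'
    print(bool(strict.fullmatch("gpt-315-turbo-instruct")))  # False
    print(bool(strict.fullmatch("gpt-3.5-turbo-instruct")))  # True
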
@@ -113,23 +116,27 @@ VOCAB_SIZE_MAPPING = {
     r"(gemini/)?gemini-[1-9](\.[0-9])?-(flash|pro).*": 256_128,
     # xAI models
     r"(xai/)?grok.*": -1,
+    # Chat.dk models
+    r"(ordbogen/)?odin-medium.*": -1,
+    r"(ordbogen/)?odin-large.*": -1,
 }
 
 
 MODEL_MAX_LENGTH_MAPPING = {
     # OpenAI models
+    r"gpt-5\.2.*": 400_000,
     r"gpt-5-.*": 272_000,
     r"gpt-4(-[0-9]{4})?": 8_191,
     r"gpt-4-32k(-[0-9]{4})?": 32_767,
     r"gpt-4-[0-9]{4}-preview": 128_000,
     r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
     r"gpt-4-(vision|turbo)(-preview)?": 128_000,
-    r"gpt-3.5-turbo-instruct(-[0-9]{4})?": 4_095,
+    r"gpt-3\.5-turbo-instruct(-[0-9]{4})?": 4_095,
     r"gpt-4o(-mini)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
     r"o1-(mini|preview)(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 128_000,
     r"o1(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
     r"o[2-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 200_000,
-    r"gpt-4.1.*": 1_047_576,
+    r"gpt-4\.1.*": 1_047_576,
     # Anthropic models
     r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
     r"(anthropic/)?claude-(opus|sonnet|haiku)-[1-9](-[1-9])?-[0-9]{8}": 200_000,
@@ -139,12 +146,15 @@ MODEL_MAX_LENGTH_MAPPING = {
     r"(gemini/)?gemini-[23](\.[05])?.*": 1_048_576,
     # xAI models
     r"(xai/)?grok.*": 131_072,
+    # Chat.dk models
+    r"(ordbogen/)?odin-medium.*": 131_072,
+    r"(ordbogen/)?odin-large.*": 202_752,
 }
 
 
 NUM_PARAMS_MAPPING = {
     # OpenAI models
-    r"gpt-5-.*": -1,
+    r"gpt-5.*": -1,
     r"gpt-4.*": -1,
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
     # Anthropic models
@@ -155,6 +165,9 @@ NUM_PARAMS_MAPPING = {
     r"(gemini/)?gemini-[23](.[05])?.*": -1,
     # xAI models
     r"(xai/)?grok.*": -1,
+    # Chat.dk models
+    r"(ordbogen/)?odin-medium.*": -1,
+    r"(ordbogen/)?odin-large.*": -1,
 }
 
 
@@ -164,6 +177,7 @@ REASONING_MODELS = [
     r"(gemini/)?gemini-2.5.*",
     r"(xai/)?grok-3-mini.*",
     r".*gpt-oss.*",
+    r"(ordbogen/)?odin-.*",
 ]
 
 BASE_DECODER_MODELS = [
@@ -186,6 +200,8 @@ CUSTOM_INFERENCE_API_PREFIXES = [
     "openai/",
 ]
 
+UNOFFICIAL_INFERENCE_API_PREFIXES = ["ordbogen/"]
+
 
 class LiteLLMModel(BenchmarkModule):
     """A generative model from LiteLLM."""
@@ -220,7 +236,7 @@ class LiteLLMModel(BenchmarkModule):
         dataset_config: DatasetConfig,
         benchmark_config: BenchmarkConfig,
         log_metadata: bool = True,
-        **generation_kwargs: dict[str, t.Any],
+        **generation_kwargs,
     ) -> None:
         """Initialise the model.
 
@@ -241,6 +257,10 @@ class LiteLLMModel(BenchmarkModule):
             model_config=model_config, allowed_params=self.allowed_params
         )
 
+        set_up_benchmark_config_for_model(
+            benchmark_config=benchmark_config, model_id=model_config.model_id
+        )
+
         # Detect whether the model is an Ollama model, as we need to extract metadata
         # differently for these models
         self.is_ollama = model_config.model_id.startswith(
@@ -401,7 +421,7 @@ class LiteLLMModel(BenchmarkModule):
         http_429_errors = [
             idx
             for idx, (_, error) in enumerate(failures)
-            if isinstance(error, RateLimitError) and "Error code: 429" in str(error)
+            if isinstance(error, RateLimitError)
         ]
         if http_429_errors and self.buffer["max_concurrent_calls"] > 1:
             failures = [
@@ -417,7 +437,6 @@ class LiteLLMModel(BenchmarkModule):
                 f"{self.buffer['max_concurrent_calls']:,} due to rate limiting.",
                 level=logging.DEBUG,
             )
-            continue
 
         # Attempt to handle the exceptions, to improve the chance of getting
         # successful generations next time around
@@ -483,11 +502,13 @@ class LiteLLMModel(BenchmarkModule):
             "you've reached the maximum number of requests with logprobs",
             "logprobs is not supported",
             "logprobs is not enabled",
-            "Invalid value at 'generation_config.response_logprobs' (TYPE_BOOL)",
         ]
         logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'logprobs'.*\]"
         )
+        logprobs_argument_should_be_bool_messages = [
+            "Invalid value at 'generation_config.response_logprobs' (TYPE_BOOL)"
+        ]
         top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
         top_logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'top_logprobs'.*\]"
@@ -548,6 +569,17 @@ class LiteLLMModel(BenchmarkModule):
             generation_kwargs.pop("top_logprobs", None)
             generation_kwargs.pop("response_format", None)
             return generation_kwargs, 0
+        elif any(
+            msg.lower() in error_msg
+            for msg in logprobs_argument_should_be_bool_messages
+        ):
+            log_once(
+                f"The model {model_id!r} requires the `logprobs` argument to be a "
+                "Boolean, so setting it to True.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["logprobs"] = True
+            return generation_kwargs, 0
         elif (
             any(msg.lower() in error_msg for msg in top_logprobs_messages)
             or top_logprobs_pattern.search(string=error_msg) is not None
@@ -700,23 +732,25 @@ class LiteLLMModel(BenchmarkModule):
             ) from error
 
         if (
-            isinstance(error, (RateLimitError, BadRequestError))
+            isinstance(error, (RateLimitError, RouterRateLimitError, BadRequestError))
             and (
                 retry_match := re.search(
-                    pattern=r"\bretry in ([0-9]+(.[0-9]+)?) ?(s|seconds)\b",
+                    pattern=(
+                        r"\b(try( again)?|retry) in ([0-9]+(\.[0-9]+)?) ?(s|seconds?)\b"
+                    ),
                     string=error_msg,
                     flags=re.IGNORECASE,
                 )
             )
             is not None
         ):
-            retry_seconds = float(retry_match.group(1))
+            retry_seconds = float(retry_match.group(3))
             log_once(
                 f"You have encountered your rate limit for model {model_id!r}.",
                 level=logging.DEBUG,
             )
             return generation_kwargs, int(retry_seconds)
-        elif isinstance(error, RateLimitError):
+        elif isinstance(error, (RateLimitError, RouterRateLimitError)):
             log_once(
                 f"You have encountered your rate limit for model {model_id!r}.",
                 level=logging.DEBUG,
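
The broadened retry pattern above also matches phrasings such as "try again in 1.5s", and because of the added alternation the retry time moves from capture group 1 to group 3. A standalone check:

    import re

    pattern = r"\b(try( again)?|retry) in ([0-9]+(\.[0-9]+)?) ?(s|seconds?)\b"
    for msg in ("Please try again in 1.5s", "Rate limited, retry in 20 seconds"):
        match = re.search(pattern, msg, flags=re.IGNORECASE)
        if match is not None:
            print(float(match.group(3)))  # prints 1.5, then 20.0
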
@@ -919,12 +953,37 @@ class LiteLLMModel(BenchmarkModule):
                 logprobs_obj = model_response_choices.logprobs
 
                 if not isinstance(logprobs_obj, (Logprobs, ChoiceLogprobs)):
-                    log_once(
-                        "The logprobs object is malformed, so we won't use logprobs to "
-                        "determine the labels.",
-                        level=logging.WARNING,
+                    error_msg = (
+                        "The logprobs object is malformed, so we won't use logprobs "
+                        "to determine the labels."
+                    )
+                    if not isinstance(logprobs_obj, list):
+                        log_once(error_msg, level=logging.WARNING)
+                        continue
+
+                    # Some APIs have implemented the logprobs differently, being a list
+                    # of ChoiceLogprobs dictionaries rather than having that list being
+                    # under the 'content' key, so we deal with that here.
+                    # TODO: Maybe remove this in future if all APIs standardise this
+                    try:
+                        choice_logprobs_list = [
+                            ChoiceLogprobs.model_validate(item) for item in logprobs_obj
+                        ]
+                    except ValidationError:
+                        log_once(error_msg, level=logging.WARNING)
+                        continue
+                    if not all(
+                        len(item.content or []) == 1 for item in choice_logprobs_list
+                    ):
+                        log_once(error_msg, level=logging.WARNING)
+                        continue
+                    logprobs_obj = ChoiceLogprobs(
+                        content=[
+                            item.content[0]
+                            for item in choice_logprobs_list
+                            if item.content
+                        ]
                     )
-                    continue
 
                 logprobs_list: c.Sequence[c.Sequence[tuple[str, float]]]
                 if isinstance(logprobs_obj, ChoiceLogprobs):
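
The new branch above folds a list of per-choice logprob dictionaries back into a single object before the usual handling. A self-contained sketch of that normalisation, using hypothetical stand-in models rather than the actual litellm types:

    from pydantic import BaseModel, ValidationError

    # Stand-ins for the real litellm types, only to show the shape handling.
    class TokenLogprob(BaseModel):
        token: str
        logprob: float

    class PerChoiceLogprobs(BaseModel):
        content: list[TokenLogprob] | None = None

    # Shape returned by some APIs: one wrapper object per token, instead of a
    # single object whose 'content' holds the whole list.
    raw = [
        {"content": [{"token": "a", "logprob": -0.1}]},
        {"content": [{"token": "b", "logprob": -0.2}]},
    ]
    try:
        items = [PerChoiceLogprobs.model_validate(entry) for entry in raw]
    except ValidationError:
        items = []
    if items and all(len(item.content or []) == 1 for item in items):
        merged = PerChoiceLogprobs(
            content=[item.content[0] for item in items if item.content]
        )
        print([(t.token, t.logprob) for t in merged.content or []])
        # [('a', -0.1), ('b', -0.2)]
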
@@ -964,10 +1023,9 @@ class LiteLLMModel(BenchmarkModule):
 
         if not sequences:
             log(
-                "No sequences were generated by the model "
-                f"{model_id!r}. This may be due to the "
-                "model running out of tokens or an issue with the input data. "
-                "Returning an empty GenerativeModelOutput.",
+                f"No sequences were generated by the model {model_id!r}. This may be "
+                "due to the model running out of tokens or an issue with the input "
+                "data. Returning an empty GenerativeModelOutput.",
                 level=logging.WARNING,
             )
             return GenerativeModelOutput(sequences=[], scores=None)
@@ -1295,6 +1353,10 @@ class LiteLLMModel(BenchmarkModule):
         if model_id in litellm.model_list:
             return True
 
+        set_up_benchmark_config_for_model(
+            benchmark_config=benchmark_config, model_id=model_id
+        )
+
         # Separate check for Ollama models
         if model_id.startswith("ollama/") or model_id.startswith("ollama_chat/"):
             ollama_model_exists = try_download_ollama_model(
@@ -1596,6 +1658,11 @@ class LiteLLMModel(BenchmarkModule):
                 level=logging.DEBUG,
             )
 
+        # If the model is a Chat.dk model, we make sure reasoning traces are not
+        # included in the output
+        if self.model_config.model_id.startswith("ordbogen/"):
+            generation_kwargs["include_reasoning"] = False
+
         # Handle manually set parameters
         if self.buffer["first_label_token_mapping"]:
             generation_kwargs["logprobs"] = True
@@ -1784,6 +1851,12 @@ def clean_model_id(model_id: str, benchmark_config: BenchmarkConfig) -> str:
     Returns:
         The cleaned model ID.
     """
+    # Remove unofficial prefixes
+    for unofficial_prefix in UNOFFICIAL_INFERENCE_API_PREFIXES:
+        model_id = re.sub(
+            pattern=rf"^{re.escape(unofficial_prefix)}", repl="", string=model_id
+        )
+
     if benchmark_config.api_base is not None and not any(
         model_id.startswith(prefix) for prefix in CUSTOM_INFERENCE_API_PREFIXES
     ):
@@ -1792,4 +1865,28 @@ def clean_model_id(model_id: str, benchmark_config: BenchmarkConfig) -> str:
         else:
             prefix = "openai/"
         model_id = prefix + model_id
+
+    # When we want to evaluate an OpenAI model on a custom inference server, such as HF
+    # inference endpoints, LiteLLM gets confused since it's already using the `openai/`
+    # prefix. We thus have to add it twice, and this hack here is to ensure that we
+    # don't store the results with model ID `openai/openai/...`.
+    elif benchmark_config.api_base is not None and model_id.startswith("openai/"):
+        model_id = "openai/openai/" + re.sub(r"(openai/)*", "", model_id)
+
     return model_id
+
+
+def set_up_benchmark_config_for_model(
+    benchmark_config: BenchmarkConfig, model_id: str
+) -> None:
+    """Set up the benchmark configuration for the model.
+
+    Args:
+        benchmark_config:
+            The benchmark configuration to set up.
+        model_id:
+            The model ID.
+    """
+    if model_id.startswith("ordbogen/"):
+        benchmark_config.api_key = os.getenv("ORDBOGEN_API_KEY")
+        benchmark_config.api_base = "https://api.ordbogen.ai/v1"
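
For reference, the two regex manipulations in `clean_model_id` behave as follows (standalone illustration):

    import re

    # Stripping an unofficial prefix such as "ordbogen/", anchored at the start:
    print(re.sub(rf"^{re.escape('ordbogen/')}", "", "ordbogen/odin-large"))
    # -> odin-large

    # Collapsing any repeated "openai/" prefixes before re-adding exactly two,
    # so results are not stored under IDs like "openai/openai/openai/gpt-4o":
    model_id = "openai/openai/gpt-4o"
    print("openai/openai/" + re.sub(r"(openai/)*", "", model_id))
    # -> openai/openai/gpt-4o
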