ScandEval 16.10.1 → 16.12.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -21,6 +21,7 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer
 from urllib3.exceptions import RequestError
 
 from ..constants import (
+    ATTENTION_BACKENDS,
     CUSTOM_STOP_TOKENS,
     GENERATION_KWARGS,
     GENERATIVE_PIPELINE_TAGS,
@@ -71,7 +72,6 @@ from ..tokenisation_utils import (
 )
 from ..types import ExtractLabelsFunction, Tokeniser
 from ..utils import (
-    attention_backend,
     clear_memory,
     create_model_cache_dir,
     get_hf_token,
@@ -90,18 +90,23 @@ except ImportError:
     )
 
 if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
-    from vllm import LLM, SamplingParams  # type: ignore[missing-import]
-    from vllm.distributed.parallel_state import (  # type: ignore[missing-import]
+    import vllm.config
+
+    # MacOS/CPU installs an older version of vLLM, which doesn't have the attention
+    # config
+    if hasattr(vllm.config, "attention"):
+        from vllm.config.attention import AttentionConfig
+
+    from vllm import LLM, SamplingParams
+    from vllm.distributed.parallel_state import (
         destroy_distributed_environment,
         destroy_model_parallel,
     )
-    from vllm.lora.request import LoRARequest  # type: ignore[missing-import]
-    from vllm.sampling_params import (  # type: ignore[missing-import]
-        StructuredOutputsParams,
-    )
+    from vllm.lora.request import LoRARequest
+    from vllm.sampling_params import StructuredOutputsParams
 
 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
-    import ray  # type: ignore[missing-import]
+    import ray
 
 
 if t.TYPE_CHECKING:
@@ -111,7 +116,9 @@ if t.TYPE_CHECKING:
     from ..data_models import BenchmarkConfig, DatasetConfig, Task
 
 
-MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS: dict[re.Pattern, str] = {
+MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS: dict[
+    re.Pattern, t.Literal[*ATTENTION_BACKENDS]  # pyrefly: ignore[invalid-literal]
+] = {
     re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "TRITON_ATTN",
     re.compile(r"google/gemma-3-1b.*", flags=re.IGNORECASE): "TRITON_ATTN",
     re.compile(r"google/gemma-3n.*", flags=re.IGNORECASE): "TRITON_ATTN",
@@ -153,7 +160,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")
 
-        if shutil.which("nvcc") is None:
+        if torch.cuda.is_available() and shutil.which("nvcc") is None:
            raise NeedsSystemDependency(
                dependency="nvcc",
                instructions=(
@@ -163,23 +170,43 @@ class VLLMModel(HuggingFaceEncoderModel):
                ),
            )
 
+        if not torch.cuda.is_available() and (
+            dataset_config.task.task_group
+            in [
+                TaskGroup.SEQUENCE_CLASSIFICATION,
+                TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+            ]
+            or dataset_config.task.uses_structured_output
+        ):
+            raise InvalidBenchmark(
+                "We currently require CUDA to benchmark generative models on tasks "
+                "that uses structured generation, which includes the current task "
+                f"{dataset_config.task.name}. This is due to an xgrammar issue, which "
+                "will hopefully be fixed soon."
+            )
+
        raise_if_wrong_params(
            model_config=model_config, allowed_params=self.allowed_params
        )
 
-        # See if the model requires a particular attention backend
-        default_flash_attention_backend = None
-        for pattern, backend in MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS.items():
-            if re.search(pattern=pattern, string=model_config.model_id):
-                default_flash_attention_backend = backend
-                break
+        # Determine the attention backend to use:
+        # Override for models that require a specific backend, otherwise use user's
+        # choice from CLI (defaults to FLASHINFER)
+        if hasattr(vllm.config, "attention"):
+            for pattern, backend in MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS.items():
+                if re.search(pattern=pattern, string=model_config.model_id):
+                    attention_backend = backend
+                    break
+            else:
+                attention_backend = benchmark_config.attention_backend
+        else:
+            attention_backend = benchmark_config.attention_backend
 
-        with (
-            no_terminal_output(disable=benchmark_config.verbose),
-            attention_backend(value=default_flash_attention_backend),
-        ):
+        with no_terminal_output(disable=benchmark_config.verbose):
            model, tokeniser = load_model_and_tokeniser(
-                model_config=model_config, benchmark_config=benchmark_config
+                model_config=model_config,
+                benchmark_config=benchmark_config,
+                attention_backend=attention_backend,
            )
        self._model: "LLM" = model
        self._tokeniser: Tokeniser = tokeniser
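
For orientation, the selection logic above reduces to a simple rule: a per-model regex override wins, otherwise the user's CLI choice (default FLASHINFER) is used. A minimal standalone sketch of that rule follows; the override table subset, helper name, and example model IDs are illustrative, not code from the package.

import re

# Illustrative subset of the override table from the diff above
OVERRIDES: dict[re.Pattern, str] = {
    re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "TRITON_ATTN",
    re.compile(r"google/gemma-3n.*", flags=re.IGNORECASE): "TRITON_ATTN",
}

def resolve_attention_backend(model_id: str, cli_choice: str = "FLASHINFER") -> str:
    """Return the per-model override if one matches, otherwise the CLI choice."""
    for pattern, backend in OVERRIDES.items():
        if pattern.search(model_id):
            return backend
    return cli_choice

assert resolve_attention_backend("openai/gpt-oss-20b") == "TRITON_ATTN"
assert resolve_attention_backend("some-org/some-model") == "FLASHINFER"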
@@ -216,11 +243,14 @@ class VLLMModel(HuggingFaceEncoderModel):
                )
            )
        if self.model_config.adapter_base_model_id is not None:
-            adapter_path = snapshot_download(
-                repo_id=self.model_config.model_id,
-                revision=self.model_config.revision,
-                cache_dir=Path(self.model_config.model_cache_dir),
-            )
+            if Path(self.model_config.model_id).exists():
+                adapter_path = self.model_config.model_id
+            else:
+                adapter_path = snapshot_download(
+                    repo_id=self.model_config.model_id,
+                    revision=self.model_config.revision,
+                    cache_dir=Path(self.model_config.model_cache_dir),
+                )
            self.buffer["lora_request"] = LoRARequest(
                lora_name="adapter", lora_int_id=1, lora_path=adapter_path
            )
@@ -500,7 +530,8 @@ class VLLMModel(HuggingFaceEncoderModel):
                log_once(
                    f"Using temperature={temperature} with the model "
                    f"{self.model_config.model_id!r} as specified in its "
-                    "generation configuration."
+                    "generation configuration.",
+                    level=logging.DEBUG,
                )
        if "top_p" in changed_params:
            top_p = changed_params["top_p"]
@@ -508,7 +539,8 @@ class VLLMModel(HuggingFaceEncoderModel):
                log_once(
                    f"Using top_p={top_p} with the model "
                    f"{self.model_config.model_id!r} as specified in its "
-                    "generation configuration."
+                    "generation configuration.",
+                    level=logging.DEBUG,
                )
        if "top_k" in changed_params:
            top_k = changed_params["top_k"]
@@ -516,7 +548,8 @@ class VLLMModel(HuggingFaceEncoderModel):
                log_once(
                    f"Using top_k={top_k} with the model "
                    f"{self.model_config.model_id!r} as specified in its "
-                    "generation configuration."
+                    "generation configuration.",
+                    level=logging.DEBUG,
                )
        if "repetition_penalty" in changed_params:
            repetition_penalty = changed_params["repetition_penalty"]
@@ -524,8 +557,10 @@ class VLLMModel(HuggingFaceEncoderModel):
                log_once(
                    f"Using repetition_penalty={repetition_penalty} with the model "
                    f"{self.model_config.model_id!r} as specified in its "
-                    "generation configuration."
+                    "generation configuration.",
+                    level=logging.DEBUG,
                )
+
        max_tokens: int = (
            REASONING_MAX_TOKENS
            if self.generative_type == GenerativeType.REASONING
@@ -538,7 +573,7 @@ class VLLMModel(HuggingFaceEncoderModel):
            else None,
            temperature=generation_kwargs["temperature"],
            top_p=generation_kwargs["top_p"],
-            top_k=generation_kwargs["top_k"],
+            top_k=int(generation_kwargs["top_k"]),
            repetition_penalty=generation_kwargs["repetition_penalty"],
            stop=[stop_token for stop_token in stop_tokens if stop_token],
            structured_outputs=structured_outputs,
@@ -547,10 +582,12 @@ class VLLMModel(HuggingFaceEncoderModel):
        # If any of the prompts are empty then we need to replace them with a BOS token
        # so that the vLLM model can generate from them
        prompts: c.Sequence[str] = inputs["text"]
-        if any(len(prompt) == 0 for prompt in prompts):
+        if any(len(prompt.strip()) == 0 for prompt in prompts):
            log("Found empty prompts, replacing with BOS token.", level=logging.DEBUG)
            prompts = [
-                prompt if len(prompt) > 0 else str(self._tokeniser.bos_token)
+                prompt
+                if len(prompt.strip()) > 0
+                else str(self._tokeniser.bos_token or "x")
                for prompt in prompts
            ]
 
@@ -567,16 +604,78 @@ class VLLMModel(HuggingFaceEncoderModel):
            )
        prompts = [prompt.strip() for prompt in prompts]
 
-        # Truncate the prompts if needed, but only if it's not a reasoning model
-        if self.generative_type != GenerativeType.REASONING:
-            max_tokens_per_prompt = (
-                min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH) - max_tokens
-            )
-            tokenized_prompts = self._tokeniser(
-                text=list(prompts), truncation=True, max_length=max_tokens_per_prompt
+        # Truncate the prompts if needed
+        max_tokens_per_prompt = min(
+            self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH
+        )
+        max_tokens_per_prompt -= min(
+            self.dataset_config.max_generated_tokens, max_tokens_per_prompt - 1
+        )
+        tokenized_prompts = self._tokeniser(
+            text=prompts, max_length=max_tokens_per_prompt
+        )
+        if any(
+            len(input_ids) >= max_tokens_per_prompt
+            for input_ids in tokenized_prompts.input_ids
+        ):
+            log(
+                f"Truncating prompts for the model {self.model_config.model_id!r} "
+                f"to a maximum of {max_tokens_per_prompt:,} tokens.",
+                level=logging.DEBUG,
            )
-            prompts = self._tokeniser.batch_decode(
-                sequences=tokenized_prompts.input_ids, skip_special_tokens=True
+            match self.generative_type:
+                case GenerativeType.BASE:
+                    truncated_tokenized_prompts = self._tokeniser(
+                        text=prompts, max_length=max_tokens_per_prompt, truncation=True
+                    )
+                    prompts = self._tokeniser.batch_decode(
+                        sequences=truncated_tokenized_prompts.input_ids,
+                        skip_special_tokens=True,
+                    )
+                case GenerativeType.INSTRUCTION_TUNED | GenerativeType.REASONING:
+                    assert self.end_of_chat_token_ids is not None, (
+                        "The end-of-chat token IDs should be set for instruction-tuned "
+                        "and reasoning models."
+                    )
+                    end_of_chat_token = self._tokeniser.decode(
+                        list(self.end_of_chat_token_ids)
+                    )
+                    prompt_segments: list[list[str]] = [
+                        prompt.replace(self._tokeniser.bos_token, "").split(
+                            end_of_chat_token
+                        )
+                        for prompt in prompts
+                    ]
+                    for num_few_shots_to_remove in range(
+                        1, self.dataset_config.num_few_shot_examples + 1
+                    ):
+                        new_prompts = [
+                            end_of_chat_token.join(
+                                prompt_segment[2 * num_few_shots_to_remove :]
+                            )
+                            for prompt_segment in prompt_segments
+                        ]
+                        tokenized_prompts = self._tokeniser(
+                            text=new_prompts, max_length=max_tokens_per_prompt
+                        )
+                        if all(
+                            len(input_ids) < max_tokens_per_prompt
+                            for input_ids in tokenized_prompts.input_ids
+                        ):
+                            prompts = new_prompts
+                            break
+                    else:
+                        raise InvalidBenchmark(
+                            "Truncation of prompts failed, some prompts are still too "
+                            "long."
+                        )
+                case _:
+                    raise InvalidBenchmark("The model type is not set!")
+        else:
+            log(
+                f"Truncation of prompts for model {self.model_config.model_id!r} is "
+                "not needed, so skipping truncation.",
+                level=logging.DEBUG,
            )
 
        # Generate sequences using vLLM
@@ -598,10 +697,11 @@ class VLLMModel(HuggingFaceEncoderModel):
                    level=logging.DEBUG,
                )
                sleep(1)
-            except ValueError as e:
+            except (ValueError, RuntimeError) as e:
                # Truncate the prompts if they are too long for the model
                truncate_error_messages = [
-                    r"prompt \(length [0-9]+\) is longer than the maximum model length"
+                    r"prompt \(length [0-9]+\) is longer than the maximum model length",
+                    "Sampled token IDs exceed the max model length",
                ]
                if any(
                    re.search(pattern, str(e), flags=re.IGNORECASE) is not None
@@ -873,7 +973,11 @@ class VLLMModel(HuggingFaceEncoderModel):
 
 
 def load_model_and_tokeniser(
-    model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
+    model_config: "ModelConfig",
+    benchmark_config: "BenchmarkConfig",
+    attention_backend: t.Literal[
+        *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+    ],
 ) -> tuple["LLM", Tokeniser]:
     """Load the model and tokeniser.
 
@@ -882,6 +986,8 @@ def load_model_and_tokeniser(
            The model configuration.
        benchmark_config:
            The benchmark configuration.
+        attention_backend:
+            The attention backend to use.
 
    Returns:
        A pair (model, tokeniser), with the loaded model and tokeniser
@@ -905,19 +1011,6 @@ def load_model_and_tokeniser(
        run_with_cli=benchmark_config.run_with_cli,
    )
 
-    quantization = None
-    if hasattr(hf_model_config, "quantization_config"):
-        quantization = hf_model_config.quantization_config.get("quant_method")
-
-    # The quantised models require extra dependencies
-    if quantization == "gptq" and (
-        importlib.util.find_spec("auto_gptq") is None
-        or importlib.util.find_spec("optimum") is None
-    ):
-        raise NeedsExtraInstalled(extra="quantization")
-    if quantization == "awq" and importlib.util.find_spec("awq") is None:
-        raise NeedsExtraInstalled(extra="quantization")
-
    # Start with dtype being the "auto" vLLM dtype
    dtype: str | torch.dtype = "auto"
 
@@ -940,23 +1033,6 @@ def load_model_and_tokeniser(
        )
        dtype = torch.float16
 
-    # If the model is a quantized model, we might need to change the dtype
-    if quantization == "mxfp4" and hf_model_config.dtype is None:
-        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
-        log(
-            "You are loading a quantized model where `dtype` has not been set. "
-            f"Setting dtype to {dtype!r}.",
-            level=logging.DEBUG,
-        )
-    elif quantization is not None and hf_model_config.dtype != torch.float16:
-        log(
-            "You are loading a quantized model with dtype "
-            f"{hf_model_config.dtype}, which vLLM does not support. Setting "
-            "dtype to float16 instead.",
-            level=logging.WARNING,
-        )
-        dtype = torch.float16
-
    # If the model is a bf16 model, we need to check the CUDA compute capability
    if hf_model_config.dtype == torch.bfloat16:
        min_cuda_compute_capability = get_min_cuda_compute_capability()
@@ -974,6 +1050,28 @@ def load_model_and_tokeniser(
        )
        dtype = torch.float16
 
+    quantization = None
+    if hasattr(hf_model_config, "quantization_config"):
+        quantization = hf_model_config.quantization_config.get("quant_method")
+
+    # The quantised models require extra dependencies
+    if quantization == "gptq" and (
+        importlib.util.find_spec("auto_gptq") is None
+        or importlib.util.find_spec("optimum") is None
+    ):
+        raise NeedsExtraInstalled(extra="quantization")
+    if quantization == "awq" and importlib.util.find_spec("awq") is None:
+        raise NeedsExtraInstalled(extra="quantization")
+
+    # If the model is a quantized model, let vLLM decide the dtype
+    if quantization is not None:
+        log(
+            f"You are loading a quantized model with quantization {quantization}. "
+            "Forcing the vLLM dtype to 'auto'",
+            level=logging.WARNING,
+        )
+        dtype = "auto"
+
    if model_config.adapter_base_model_id is not None:
        download_dir = str(Path(model_config.model_cache_dir) / "base_model")
    else:
@@ -1006,10 +1104,15 @@ def load_model_and_tokeniser(
        model_config=model_config,
        token=get_hf_token(api_key=benchmark_config.api_key),
    )
-    vllm_tokenisation_params = get_vllm_tokenisation_params(
+    vllm_params = get_vllm_tokenisation_params(
        tokeniser=tokeniser, model_config=model_config
    )
 
+    # MacOS/CPU installs an older version of vLLM, which doesn't have the attention
+    # config
+    if hasattr(vllm.config, "attention"):
+        vllm_params["attention_config"] = AttentionConfig(backend=attention_backend)
+
    clear_vllm()
 
    distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size = (
@@ -1017,19 +1120,21 @@ def load_model_and_tokeniser(
    )
 
    try:
+        model_location = (
+            model_id
+            if internet_connection_available() or Path(model_id).is_dir()
+            else resolve_model_path(download_dir=download_dir)
+        )
+
+        max_model_len = min(
+            true_max_model_len, MAX_CONTEXT_LENGTH + REASONING_MAX_TOKENS
+        )
        model = LLM(
-            model=(
-                model_id
-                if internet_connection_available()
-                else resolve_model_path(download_dir=download_dir)
-            ),
-            tokenizer=(
-                model_id
-                if internet_connection_available()
-                else resolve_model_path(download_dir=download_dir)
-            ),
+            model=model_location,
+            tokenizer=model_location,
            gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
-            max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
+            max_model_len=max_model_len,
+            max_num_batched_tokens=max_model_len,
            download_dir=download_dir,
            trust_remote_code=benchmark_config.trust_remote_code,
            revision=revision,
@@ -1046,7 +1151,7 @@ def load_model_and_tokeniser(
            enable_prefix_caching=False,
            enable_lora=model_config.adapter_base_model_id is not None,
            max_lora_rank=256,
-            **vllm_tokenisation_params,
+            **vllm_params,
        )
    except (RuntimeError, ValueError, OSError) as e:
        if "awaiting a review from the repo authors" in str(e):
@@ -1071,11 +1176,11 @@ def load_model_and_tokeniser(
                (
                    "Since you're running in verbose mode, you might see a descriptive "
                    "error above already. Note however that if the error message urges "
-                    "you to set the environment variable `VLLM_ATTENTION_BACKEND` to "
-                    "'FLEX_ATTENTION', please try setting it to 'TRITON_ATTN' first, "
-                    "as that often solves the issue, whereas 'FLEX_ATTENTION' usually "
-                    "doesn't. If you don't see any descriptive error above, then you "
-                    "can try "
+                    "you to use the attention backend 'FLEX_ATTENTION', please try "
+                    "setting it to 'TRITON_ATTN' instead using the "
+                    "`--attention-backend` CLI argument, as that often solves the "
+                    "issue, whereas 'FLEX_ATTENTION' usually doesn't. If you don't "
+                    "see any descriptive error above, then you can try "
                )
                if benchmark_config.verbose
                else "Try "
@@ -1450,6 +1555,9 @@ def select_backend_and_parallelism() -> tuple[str, int, int]:
        - tensor_parallel_size (int): Number of GPUs per node.
        - pipeline_parallel_size (int): Number of stages across nodes.
    """
+    if not torch.cuda.is_available():
+        return "mp", 1, 1
+
    if not ray.is_initialized():
        try:
            ray.init(address="auto", ignore_reinit_error=True)
@@ -1476,7 +1584,7 @@ def select_backend_and_parallelism() -> tuple[str, int, int]:
        pipeline_parallel_size = max(1, total_gpus // tensor_parallel_size)
        log_once(
            f"Detected a multi-node setup with {pipeline_parallel_size:,} nodes, each "
-            "with {tensor_parallel_size:,} GPUs, so using `ray` as the "
+            f"with {tensor_parallel_size:,} GPUs, so using `ray` as the "
            "distributed backend.",
            level=logging.DEBUG,
        )
scandeval/benchmarker.py CHANGED
@@ -15,7 +15,7 @@ from time import sleep
 from torch.distributed import destroy_process_group
 
 from .benchmark_config_factory import build_benchmark_config
-from .constants import GENERATIVE_PIPELINE_TAGS
+from .constants import ATTENTION_BACKENDS, GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data, load_raw_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
@@ -79,6 +79,7 @@ class Benchmarker:
        api_base: str | None = None,
        api_version: str | None = None,
        gpu_memory_utilization: float = 0.8,
+        attention_backend: str = "FLASHINFER",
        generative_type: GenerativeType | None = None,
        custom_datasets_file: Path | str = Path("custom_datasets.py"),
        debug: bool = False,
@@ -149,6 +150,9 @@ class Benchmarker:
                is generative. A larger value will result in faster evaluation, but at
                the risk of running out of GPU memory. Only reduce this if you are
                running out of GPU memory. Defaults to 0.9.
+            attention_backend:
+                The attention backend to use for vLLM. Defaults to FLASHINFER. Only
+                relevant if the model is generative.
            generative_type:
                The type of generative model to benchmark. Only relevant if the model is
                generative. If not specified, then the type will be inferred based on
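
The new argument is simply forwarded from the Benchmarker into the vLLM loader. A hypothetical call via the Python API (model and dataset names are placeholders) might look like this:

from scandeval import Benchmarker

# Select the Triton attention backend instead of the FLASHINFER default
benchmarker = Benchmarker(attention_backend="TRITON_ATTN", gpu_memory_utilization=0.8)
benchmarker.benchmark(model="google/gemma-3-1b-it", dataset="angry-tweets")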
@@ -264,6 +268,7 @@ class Benchmarker:
            requires_safetensors=requires_safetensors,
            download_only=download_only,
            gpu_memory_utilization=gpu_memory_utilization,
+            attention_backend=attention_backend,
            generative_type=generative_type,
            custom_datasets_file=Path(custom_datasets_file),
            verbose=verbose,
@@ -385,6 +390,10 @@ class Benchmarker:
        download_only: bool | None = None,
        gpu_memory_utilization: float | None = None,
        generative_type: GenerativeType | None = None,
+        attention_backend: t.Literal[
+            *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+        ]
+        | None = None,
        custom_datasets_file: Path | str | None = None,
        force: bool | None = None,
        verbose: bool | None = None,
@@ -638,6 +647,11 @@ class Benchmarker:
                if generative_type is not None
                else self.benchmark_config_default_params.generative_type
            ),
+            attention_backend=(
+                attention_backend
+                if attention_backend is not None
+                else self.benchmark_config_default_params.attention_backend
+            ),
            custom_datasets_file=(
                Path(custom_datasets_file)
                if custom_datasets_file is not None
@@ -1045,8 +1059,16 @@ class Benchmarker:
                if model.generative_type is not None
                else None
            ),
-            few_shot=benchmark_config.few_shot,
-            validation_split=not benchmark_config.evaluate_test_split,
+            few_shot=(
+                None
+                if dataset_config.task.requires_zero_shot
+                else benchmark_config.few_shot
+            ),
+            validation_split=(
+                None
+                if "val" not in dataset_config.splits
+                else not benchmark_config.evaluate_test_split
+            ),
        )
        log(f"Results:\n{results}", level=logging.DEBUG)
        return record
@@ -1122,12 +1144,10 @@ def get_record(
    same_revision = model_id_components.revision == model_config.revision
    same_param = model_id_components.param == model_config.param
    same_dataset = record.dataset == dataset_config.name
-    same_split = (
-        record.validation_split != benchmark_config.evaluate_test_split
-        or "val" not in dataset_config.splits
-    )
+    same_split = record.validation_split != benchmark_config.evaluate_test_split
    same_num_shots = (
        record.few_shot == benchmark_config.few_shot
+        or record.few_shot is None
        or not record.generative
        or dataset_config.task.requires_zero_shot
    )
@@ -1225,6 +1245,7 @@ def initial_logging(
        f"{dataset_config.logging_string} ({num_finished_benchmarks + 1}/"
        f"{num_total_benchmarks} benchmarks)...",
        prefix=f"\n[{dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]",
+        level=logging.INFO,
    )
 
    if dataset_config.unofficial:
scandeval/cli.py CHANGED
@@ -170,6 +170,17 @@ from .languages import get_all_languages
    "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
    "if you are running out of GPU memory. Only relevant if the model is generative.",
 )
+@click.option(
+    "--attention-backend",
+    default="FLASHINFER",
+    show_default=True,
+    type=click.Choice(
+        ["FLASHINFER", "FLASH_ATTN", "TRITON_ATTN", "FLEX_ATTENTION"],
+        case_sensitive=True,
+    ),
+    help="The attention backend to use for vLLM. Only relevant if the model is "
+    "generative.",
+)
 @click.option(
    "--requires-safetensors",
    is_flag=True,
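
On the command line, the same choice is made by appending the new flag to an existing invocation, e.g. `scandeval ... --attention-backend TRITON_ATTN`. Values outside the four listed choices are rejected by `click.Choice`, and omitting the flag keeps the FLASHINFER default.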
@@ -254,6 +265,7 @@ def benchmark(
    api_base: str | None,
    api_version: str | None,
    gpu_memory_utilization: float,
+    attention_backend: str,
    requires_safetensors: bool,
    generative_type: str | None,
    custom_datasets_file: Path,
@@ -285,6 +297,7 @@ def benchmark(
        api_base=api_base,
        api_version=api_version,
        gpu_memory_utilization=gpu_memory_utilization,
+        attention_backend=attention_backend,
        generative_type=GenerativeType[generative_type.upper()]
        if generative_type
        else None,
scandeval/constants.py CHANGED
@@ -33,8 +33,8 @@ GENERATIVE_PIPELINE_TAGS = [
 # Used to disallow non-generative models to be evaluated on these task groups
 GENERATIVE_DATASET_TASK_GROUPS = [TaskGroup.TEXT_TO_TEXT]
 
-# Local models are required to have these files in their directory
-LOCAL_MODELS_REQUIRED_FILES = ["config.json"]
+# Local models are required to have one of these files in their directory
+LOCAL_MODELS_REQUIRED_FILES = ["config.json", "adapter_config.json"]
 
 # The number of top log probabilities to return for generative models. For several APIs
 # this is the maximum number of log probabilities that can be returned
@@ -105,3 +105,32 @@ GENERATION_KWARGS = {
    "top_k": 0,
    "repetition_penalty": 1.0,
 }
+
+# This is a mirror of `AttentionBackendEnum` in vLLM, but since we don't have access to
+# this when running on CPU/MacOS (as we can only run an old vLLM version), we have to
+# define it here
+ATTENTION_BACKENDS: list[str] = [
+    "FLASH_ATTN",
+    "FLASH_ATTN_DIFFKV",
+    "TRITON_ATTN",
+    "ROCM_ATTN",
+    "ROCM_AITER_MLA",
+    "ROCM_AITER_TRITON_MLA",
+    "ROCM_AITER_FA",
+    "ROCM_AITER_MLA_SPARSE",
+    "TORCH_SDPA",
+    "FLASHINFER",
+    "FLASHINFER_MLA",
+    "TRITON_MLA",
+    "CUTLASS_MLA",
+    "FLASHMLA",
+    "FLASHMLA_SPARSE",
+    "FLASH_ATTN_MLA",
+    "IPEX",
+    "NO_ATTENTION",
+    "FLEX_ATTENTION",
+    "TREE_ATTN",
+    "ROCM_AITER_UNIFIED_ATTN",
+    "CPU_ATTN",
+    "CUSTOM",
+]
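
Mirroring the vLLM enum as plain strings lets the rest of the codebase type-hint and validate backend names without importing vLLM at all. A rough sketch of how the constant can be consumed (the helper below is illustrative, not part of the package; the starred `Literal` subscription needs Python 3.11+, hence the pyrefly ignores seen elsewhere in this diff):

import typing as t

from scandeval.constants import ATTENTION_BACKENDS

# Build a Literal type from the runtime list, as the package does for its annotations
AttentionBackend = t.Literal[*ATTENTION_BACKENDS]  # pyrefly: ignore[invalid-literal]

def validate_backend(value: str) -> str:
    """Reject backend names that the mirrored vLLM enum does not contain."""
    if value not in ATTENTION_BACKENDS:
        raise ValueError(f"Unknown attention backend: {value!r}")
    return value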