ScandEval 16.11.0__py3-none-any.whl → 16.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. scandeval/__init__.py +0 -9
  2. scandeval/async_utils.py +46 -0
  3. scandeval/benchmark_config_factory.py +31 -2
  4. scandeval/benchmark_modules/fresh.py +2 -1
  5. scandeval/benchmark_modules/hf.py +76 -23
  6. scandeval/benchmark_modules/litellm.py +33 -15
  7. scandeval/benchmark_modules/vllm.py +97 -44
  8. scandeval/benchmarker.py +29 -33
  9. scandeval/cli.py +11 -0
  10. scandeval/constants.py +36 -2
  11. scandeval/custom_dataset_configs.py +152 -0
  12. scandeval/data_loading.py +87 -31
  13. scandeval/data_models.py +405 -224
  14. scandeval/dataset_configs/__init__.py +51 -25
  15. scandeval/dataset_configs/albanian.py +1 -1
  16. scandeval/dataset_configs/belarusian.py +47 -0
  17. scandeval/dataset_configs/bulgarian.py +1 -1
  18. scandeval/dataset_configs/catalan.py +1 -1
  19. scandeval/dataset_configs/croatian.py +1 -1
  20. scandeval/dataset_configs/danish.py +3 -2
  21. scandeval/dataset_configs/dutch.py +16 -5
  22. scandeval/dataset_configs/english.py +4 -3
  23. scandeval/dataset_configs/estonian.py +8 -7
  24. scandeval/dataset_configs/faroese.py +1 -1
  25. scandeval/dataset_configs/finnish.py +5 -4
  26. scandeval/dataset_configs/french.py +6 -5
  27. scandeval/dataset_configs/german.py +4 -3
  28. scandeval/dataset_configs/greek.py +1 -1
  29. scandeval/dataset_configs/hungarian.py +1 -1
  30. scandeval/dataset_configs/icelandic.py +4 -3
  31. scandeval/dataset_configs/italian.py +4 -3
  32. scandeval/dataset_configs/latvian.py +2 -2
  33. scandeval/dataset_configs/lithuanian.py +1 -1
  34. scandeval/dataset_configs/norwegian.py +6 -5
  35. scandeval/dataset_configs/polish.py +4 -3
  36. scandeval/dataset_configs/portuguese.py +5 -4
  37. scandeval/dataset_configs/romanian.py +2 -2
  38. scandeval/dataset_configs/serbian.py +1 -1
  39. scandeval/dataset_configs/slovene.py +1 -1
  40. scandeval/dataset_configs/spanish.py +4 -3
  41. scandeval/dataset_configs/swedish.py +4 -3
  42. scandeval/dataset_configs/ukrainian.py +1 -1
  43. scandeval/generation_utils.py +6 -6
  44. scandeval/metrics/__init__.py +1 -0
  45. scandeval/metrics/bias.py +237 -0
  46. scandeval/metrics/huggingface.py +2 -1
  47. scandeval/metrics/llm_as_a_judge.py +1 -1
  48. scandeval/metrics/pipeline.py +1 -1
  49. scandeval/model_cache.py +34 -4
  50. scandeval/prompt_templates/linguistic_acceptability.py +9 -0
  51. scandeval/prompt_templates/multiple_choice.py +9 -0
  52. scandeval/prompt_templates/named_entity_recognition.py +21 -0
  53. scandeval/prompt_templates/reading_comprehension.py +10 -0
  54. scandeval/prompt_templates/sentiment_classification.py +11 -0
  55. scandeval/string_utils.py +157 -0
  56. scandeval/task_group_utils/sequence_classification.py +2 -5
  57. scandeval/task_group_utils/token_classification.py +2 -4
  58. scandeval/tasks.py +22 -0
  59. scandeval/tokenisation_utils.py +12 -1
  60. scandeval/utils.py +13 -383
  61. scandeval-16.13.0.dist-info/METADATA +334 -0
  62. scandeval-16.13.0.dist-info/RECORD +94 -0
  63. scandeval-16.11.0.dist-info/METADATA +0 -649
  64. scandeval-16.11.0.dist-info/RECORD +0 -89
  65. {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
  66. {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
  67. {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/benchmark_modules/vllm.py CHANGED
@@ -21,6 +21,7 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer
 from urllib3.exceptions import RequestError
 
 from ..constants import (
+    ATTENTION_BACKENDS,
     CUSTOM_STOP_TOKENS,
     GENERATION_KWARGS,
     GENERATIVE_PIPELINE_TAGS,
@@ -53,6 +54,8 @@ from ..generation_utils import (
 )
 from ..languages import get_all_languages
 from ..logging_utils import get_pbar, log, log_once, no_terminal_output
+from ..model_cache import create_model_cache_dir
+from ..string_utils import split_model_id
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -71,14 +74,11 @@ from ..tokenisation_utils import (
 )
 from ..types import ExtractLabelsFunction, Tokeniser
 from ..utils import (
-    attention_backend,
     clear_memory,
-    create_model_cache_dir,
     get_hf_token,
     get_min_cuda_compute_capability,
     internet_connection_available,
     resolve_model_path,
-    split_model_id,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
 
@@ -90,18 +90,23 @@ except ImportError:
     )
 
 if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
-    from vllm import LLM, SamplingParams  # type: ignore[missing-import]
-    from vllm.distributed.parallel_state import (  # type: ignore[missing-import]
+    import vllm.config
+
+    # MacOS/CPU installs an older version of vLLM, which doesn't have the attention
+    # config
+    if hasattr(vllm.config, "attention"):
+        from vllm.config.attention import AttentionConfig
+
+    from vllm import LLM, SamplingParams
+    from vllm.distributed.parallel_state import (
         destroy_distributed_environment,
         destroy_model_parallel,
     )
-    from vllm.lora.request import LoRARequest  # type: ignore[missing-import]
-    from vllm.sampling_params import (  # type: ignore[missing-import]
-        StructuredOutputsParams,
-    )
+    from vllm.lora.request import LoRARequest
+    from vllm.sampling_params import StructuredOutputsParams
 
 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
-    import ray  # type: ignore[missing-import]
+    import ray
 
 
 if t.TYPE_CHECKING:
@@ -111,7 +116,9 @@ if t.TYPE_CHECKING:
     from ..data_models import BenchmarkConfig, DatasetConfig, Task
 
 
-MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS: dict[re.Pattern, str] = {
+MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS: dict[
+    re.Pattern, t.Literal[*ATTENTION_BACKENDS]  # pyrefly: ignore[invalid-literal]
+] = {
     re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "TRITON_ATTN",
     re.compile(r"google/gemma-3-1b.*", flags=re.IGNORECASE): "TRITON_ATTN",
     re.compile(r"google/gemma-3n.*", flags=re.IGNORECASE): "TRITON_ATTN",
@@ -153,7 +160,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")
 
-        if shutil.which("nvcc") is None:
+        if torch.cuda.is_available() and shutil.which("nvcc") is None:
             raise NeedsSystemDependency(
                 dependency="nvcc",
                 instructions=(
@@ -163,23 +170,43 @@ class VLLMModel(HuggingFaceEncoderModel):
                 ),
             )
 
+        if not torch.cuda.is_available() and (
+            dataset_config.task.task_group
+            in [
+                TaskGroup.SEQUENCE_CLASSIFICATION,
+                TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+            ]
+            or dataset_config.task.uses_structured_output
+        ):
+            raise InvalidBenchmark(
+                "We currently require CUDA to benchmark generative models on tasks "
+                "that uses structured generation, which includes the current task "
+                f"{dataset_config.task.name}. This is due to an xgrammar issue, which "
+                "will hopefully be fixed soon."
+            )
+
         raise_if_wrong_params(
             model_config=model_config, allowed_params=self.allowed_params
         )
 
-        # See if the model requires a particular attention backend
-        default_flash_attention_backend = None
-        for pattern, backend in MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS.items():
-            if re.search(pattern=pattern, string=model_config.model_id):
-                default_flash_attention_backend = backend
-                break
+        # Determine the attention backend to use:
+        # Override for models that require a specific backend, otherwise use user's
+        # choice from CLI (defaults to FLASHINFER)
+        if hasattr(vllm.config, "attention"):
+            for pattern, backend in MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS.items():
+                if re.search(pattern=pattern, string=model_config.model_id):
+                    attention_backend = backend
+                    break
+            else:
+                attention_backend = benchmark_config.attention_backend
+        else:
+            attention_backend = benchmark_config.attention_backend
 
-        with (
-            no_terminal_output(disable=benchmark_config.verbose),
-            attention_backend(value=default_flash_attention_backend),
-        ):
+        with no_terminal_output(disable=benchmark_config.verbose):
             model, tokeniser = load_model_and_tokeniser(
-                model_config=model_config, benchmark_config=benchmark_config
+                model_config=model_config,
+                benchmark_config=benchmark_config,
+                attention_backend=attention_backend,
             )
         self._model: "LLM" = model
         self._tokeniser: Tokeniser = tokeniser
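For reference, the added for/else block resolves the backend as follows: a regex match in MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS overrides the user's choice, and the loop's else branch falls back to the CLI value. This is a minimal standalone sketch with a made-up model ID, using the FLASHINFER default to stand in for benchmark_config.attention_backend:

    import re

    MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS = {
        re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "TRITON_ATTN",
    }
    model_id = "openai/gpt-oss-20b"  # hypothetical model ID
    user_choice = "FLASHINFER"       # the CLI default

    for pattern, backend in MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS.items():
        if re.search(pattern=pattern, string=model_id):
            attention_backend = backend  # model-specific override wins
            break
    else:
        attention_backend = user_choice  # no pattern matched, keep the user's choice

    print(attention_backend)  # -> "TRITON_ATTN"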
@@ -216,11 +243,14 @@ class VLLMModel(HuggingFaceEncoderModel):
             )
         )
         if self.model_config.adapter_base_model_id is not None:
-            adapter_path = snapshot_download(
-                repo_id=self.model_config.model_id,
-                revision=self.model_config.revision,
-                cache_dir=Path(self.model_config.model_cache_dir),
-            )
+            if Path(self.model_config.model_id).exists():
+                adapter_path = self.model_config.model_id
+            else:
+                adapter_path = snapshot_download(
+                    repo_id=self.model_config.model_id,
+                    revision=self.model_config.revision,
+                    cache_dir=Path(self.model_config.model_cache_dir),
+                )
             self.buffer["lora_request"] = LoRARequest(
                 lora_name="adapter", lora_int_id=1, lora_path=adapter_path
             )
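The adapter handling above now prefers an existing local directory over a Hub download. A rough sketch of that decision, assuming huggingface_hub's snapshot_download (the helper name resolve_adapter_path is invented for illustration):

    from pathlib import Path

    from huggingface_hub import snapshot_download

    def resolve_adapter_path(model_id: str, revision: str, cache_dir: str) -> str:
        # A model ID that points at an existing local directory is used directly as
        # the LoRA adapter path; anything else is downloaded from the Hub.
        if Path(model_id).exists():
            return model_id
        return snapshot_download(
            repo_id=model_id, revision=revision, cache_dir=Path(cache_dir)
        )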
@@ -543,7 +573,7 @@ class VLLMModel(HuggingFaceEncoderModel):
             else None,
             temperature=generation_kwargs["temperature"],
             top_p=generation_kwargs["top_p"],
-            top_k=generation_kwargs["top_k"],
+            top_k=int(generation_kwargs["top_k"]),
             repetition_penalty=generation_kwargs["repetition_penalty"],
             stop=[stop_token for stop_token in stop_tokens if stop_token],
             structured_outputs=structured_outputs,
@@ -552,10 +582,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         # If any of the prompts are empty then we need to replace them with a BOS token
         # so that the vLLM model can generate from them
         prompts: c.Sequence[str] = inputs["text"]
-        if any(len(prompt) == 0 for prompt in prompts):
+        if any(len(prompt.strip()) == 0 for prompt in prompts):
             log("Found empty prompts, replacing with BOS token.", level=logging.DEBUG)
             prompts = [
-                prompt if len(prompt) > 0 else str(self._tokeniser.bos_token)
+                prompt
+                if len(prompt.strip()) > 0
+                else str(self._tokeniser.bos_token or "x")
                 for prompt in prompts
             ]
 
@@ -583,7 +615,7 @@ class VLLMModel(HuggingFaceEncoderModel):
             text=prompts, max_length=max_tokens_per_prompt
         )
         if any(
-            len(input_ids) > max_tokens_per_prompt
+            len(input_ids) >= max_tokens_per_prompt
             for input_ids in tokenized_prompts.input_ids
         ):
             log(
@@ -615,7 +647,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                         for prompt in prompts
                     ]
                     for num_few_shots_to_remove in range(
-                        0, self.dataset_config.num_few_shot_examples + 1
+                        1, self.dataset_config.num_few_shot_examples + 1
                    ):
                        new_prompts = [
                            end_of_chat_token.join(
@@ -627,7 +659,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                         text=new_prompts, max_length=max_tokens_per_prompt
                     )
                     if all(
-                        len(input_ids) <= max_tokens_per_prompt
+                        len(input_ids) < max_tokens_per_prompt
                         for input_ids in tokenized_prompts.input_ids
                     ):
                         prompts = new_prompts
@@ -637,6 +669,8 @@ class VLLMModel(HuggingFaceEncoderModel):
                             "Truncation of prompts failed, some prompts are still too "
                             "long."
                         )
+                case _:
+                    raise InvalidBenchmark("The model type is not set!")
         else:
             log(
                 f"Truncation of prompts for model {self.model_config.model_id!r} is "
@@ -939,7 +973,11 @@ class VLLMModel(HuggingFaceEncoderModel):
 
 
 def load_model_and_tokeniser(
-    model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
+    model_config: "ModelConfig",
+    benchmark_config: "BenchmarkConfig",
+    attention_backend: t.Literal[
+        *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+    ],
 ) -> tuple["LLM", Tokeniser]:
     """Load the model and tokeniser.
 
@@ -948,6 +986,8 @@ def load_model_and_tokeniser(
             The model configuration.
         benchmark_config:
             The benchmark configuration.
+        attention_backend:
+            The attention backend to use.
 
     Returns:
         A pair (model, tokeniser), with the loaded model and tokeniser
@@ -1064,10 +1104,15 @@ def load_model_and_tokeniser(
         model_config=model_config,
         token=get_hf_token(api_key=benchmark_config.api_key),
     )
-    vllm_tokenisation_params = get_vllm_tokenisation_params(
+    vllm_params = get_vllm_tokenisation_params(
         tokeniser=tokeniser, model_config=model_config
     )
 
+    # MacOS/CPU installs an older version of vLLM, which doesn't have the attention
+    # config
+    if hasattr(vllm.config, "attention"):
+        vllm_params["attention_config"] = AttentionConfig(backend=attention_backend)
+
     clear_vllm()
 
     distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size = (
@@ -1080,11 +1125,16 @@ def load_model_and_tokeniser(
             if internet_connection_available() or Path(model_id).is_dir()
             else resolve_model_path(download_dir=download_dir)
         )
+
+        max_model_len = min(
+            true_max_model_len, MAX_CONTEXT_LENGTH + REASONING_MAX_TOKENS
+        )
         model = LLM(
             model=model_location,
             tokenizer=model_location,
             gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
-            max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
+            max_model_len=max_model_len,
+            max_num_batched_tokens=max_model_len,
             download_dir=download_dir,
             trust_remote_code=benchmark_config.trust_remote_code,
             revision=revision,
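The context length handed to vLLM is now capped at MAX_CONTEXT_LENGTH plus REASONING_MAX_TOKENS and reused as max_num_batched_tokens. A worked example with made-up constant values (the real values live elsewhere in the package and are not part of this diff):

    MAX_CONTEXT_LENGTH = 8_192       # hypothetical value
    REASONING_MAX_TOKENS = 8_192     # hypothetical value
    true_max_model_len = 131_072     # e.g. what the model config advertises

    max_model_len = min(true_max_model_len, MAX_CONTEXT_LENGTH + REASONING_MAX_TOKENS)
    print(max_model_len)  # -> 16384, also passed as max_num_batched_tokens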
@@ -1094,14 +1144,14 @@ def load_model_and_tokeniser(
             pipeline_parallel_size=pipeline_parallel_size,
             disable_custom_all_reduce=True,
             quantization=quantization,
-            dtype=dtype,
+            dtype=dtype,  # pyrefly: ignore[bad-argument-type]
             enforce_eager=True,
             # TEMP: Prefix caching isn't supported with sliding window in vLLM yet,
             # so we disable it for now
             enable_prefix_caching=False,
             enable_lora=model_config.adapter_base_model_id is not None,
             max_lora_rank=256,
-            **vllm_tokenisation_params,
+            **vllm_params,
         )
 
     except (RuntimeError, ValueError, OSError) as e:
@@ -1126,11 +1176,11 @@ def load_model_and_tokeniser(
                 (
                     "Since you're running in verbose mode, you might see a descriptive "
                     "error above already. Note however that if the error message urges "
-                    "you to set the environment variable `VLLM_ATTENTION_BACKEND` to "
-                    "'FLEX_ATTENTION', please try setting it to 'TRITON_ATTN' first, "
-                    "as that often solves the issue, whereas 'FLEX_ATTENTION' usually "
-                    "doesn't. If you don't see any descriptive error above, then you "
-                    "can try "
+                    "you to use the attention backend 'FLEX_ATTENTION', please try "
+                    "setting it to 'TRITON_ATTN' instead using the "
+                    "`--attention-backend` CLI argument, as that often solves the "
+                    "issue, whereas 'FLEX_ATTENTION' usually doesn't. If you don't "
+                    "see any descriptive error above, then you can try "
                 )
                 if benchmark_config.verbose
                 else "Try "
@@ -1505,6 +1555,9 @@ def select_backend_and_parallelism() -> tuple[str, int, int]:
         - tensor_parallel_size (int): Number of GPUs per node.
         - pipeline_parallel_size (int): Number of stages across nodes.
     """
+    if not torch.cuda.is_available():
+        return "mp", 1, 1
+
     if not ray.is_initialized():
         try:
             ray.init(address="auto", ignore_reinit_error=True)
scandeval/benchmarker.py CHANGED
@@ -15,10 +15,9 @@ from time import sleep
 from torch.distributed import destroy_process_group
 
 from .benchmark_config_factory import build_benchmark_config
-from .constants import GENERATIVE_PIPELINE_TAGS
+from .constants import ATTENTION_BACKENDS, GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data, load_raw_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
-from .dataset_configs import get_all_dataset_configs
 from .enums import Device, GenerativeType, ModelType
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
@@ -28,12 +27,9 @@ from .model_config import get_model_config
 from .model_loading import load_model
 from .scores import log_scores
 from .speed_benchmark import benchmark_speed
+from .string_utils import split_model_id
 from .tasks import SPEED
-from .utils import (
-    enforce_reproducibility,
-    internet_connection_available,
-    split_model_id,
-)
+from .utils import enforce_reproducibility, internet_connection_available
 
 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
@@ -79,6 +75,9 @@ class Benchmarker:
         api_base: str | None = None,
         api_version: str | None = None,
         gpu_memory_utilization: float = 0.8,
+        attention_backend: t.Literal[
+            *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+        ] = "FLASHINFER",
         generative_type: GenerativeType | None = None,
         custom_datasets_file: Path | str = Path("custom_datasets.py"),
         debug: bool = False,
@@ -149,6 +148,9 @@ class Benchmarker:
                 is generative. A larger value will result in faster evaluation, but at
                 the risk of running out of GPU memory. Only reduce this if you are
                 running out of GPU memory. Defaults to 0.9.
+            attention_backend:
+                The attention backend to use for vLLM. Defaults to FLASHINFER. Only
+                relevant if the model is generative.
             generative_type:
                 The type of generative model to benchmark. Only relevant if the model is
                 generative. If not specified, then the type will be inferred based on
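Together with the --attention-backend CLI option added in scandeval/cli.py below, the new parameter might be used like this. This is a sketch only, assuming Benchmarker is exported at the package top level and that benchmark() accepts the model ID via its model argument; the model ID itself is a placeholder:

    from scandeval import Benchmarker

    # "TRITON_ATTN" is one of the entries in ATTENTION_BACKENDS defined in
    # scandeval/constants.py; the default is "FLASHINFER".
    benchmarker = Benchmarker(attention_backend="TRITON_ATTN")
    benchmarker.benchmark(model="<org>/<generative-model>")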
@@ -264,6 +266,7 @@ class Benchmarker:
             requires_safetensors=requires_safetensors,
             download_only=download_only,
             gpu_memory_utilization=gpu_memory_utilization,
+            attention_backend=attention_backend,
             generative_type=generative_type,
             custom_datasets_file=Path(custom_datasets_file),
             verbose=verbose,
@@ -341,7 +344,9 @@ class Benchmarker:
             f"Loading data for {dataset_config.logging_string}", level=logging.INFO
         )
         dataset = load_raw_data(
-            dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
+            dataset_config=dataset_config,
+            cache_dir=benchmark_config.cache_dir,
+            api_key=benchmark_config.api_key,
         )
         del dataset
 
@@ -385,6 +390,10 @@ class Benchmarker:
         download_only: bool | None = None,
         gpu_memory_utilization: float | None = None,
         generative_type: GenerativeType | None = None,
+        attention_backend: t.Literal[
+            *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+        ]
+        | None = None,
         custom_datasets_file: Path | str | None = None,
         force: bool | None = None,
         verbose: bool | None = None,
@@ -504,6 +513,11 @@ class Benchmarker:
             ValueError:
                 If both `task` and `dataset` are specified.
         """
+        log(
+            "Started EuroEval run. Run with `--verbose` for more information.",
+            level=logging.INFO,
+        )
+
         if task is not None and dataset is not None:
             raise ValueError("Only one of `task` and `dataset` can be specified.")
 
@@ -638,6 +652,11 @@ class Benchmarker:
                 if generative_type is not None
                 else self.benchmark_config_default_params.generative_type
             ),
+            attention_backend=(
+                attention_backend
+                if attention_backend is not None
+                else self.benchmark_config_default_params.attention_backend
+            ),
             custom_datasets_file=(
                 Path(custom_datasets_file)
                 if custom_datasets_file is not None
@@ -776,7 +795,7 @@ class Benchmarker:
 
         # Update the benchmark config if the dataset requires it
         if (
-            "val" not in dataset_config.splits
+            dataset_config.val_split is None
             and not benchmark_config.evaluate_test_split
         ):
             log(
@@ -1052,7 +1071,7 @@ class Benchmarker:
             ),
             validation_split=(
                 None
-                if "val" not in dataset_config.splits
+                if dataset_config.val_split is None
                 else not benchmark_config.evaluate_test_split
             ),
         )
@@ -1167,29 +1186,6 @@ def clear_model_cache_fn(cache_dir: str) -> None:
             rmtree(sub_model_dir)
 
 
-def prepare_dataset_configs(
-    dataset_names: c.Sequence[str], custom_datasets_file: Path
-) -> c.Sequence["DatasetConfig"]:
-    """Prepare the dataset configuration(s) to be benchmarked.
-
-    Args:
-        dataset_names:
-            The dataset names to benchmark.
-        custom_datasets_file:
-            A path to a Python file containing custom dataset configurations.
-
-    Returns:
-        The prepared list of model IDs.
-    """
-    return [
-        cfg
-        for cfg in get_all_dataset_configs(
-            custom_datasets_file=custom_datasets_file
-        ).values()
-        if cfg.name in dataset_names
-    ]
-
-
 def initial_logging(
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
scandeval/cli.py CHANGED
@@ -5,6 +5,7 @@ from pathlib import Path
 import click
 
 from .benchmarker import Benchmarker
+from .constants import ATTENTION_BACKENDS
 from .data_models import DatasetConfig
 from .enums import Device, GenerativeType
 from .languages import get_all_languages
@@ -170,6 +171,14 @@ from .languages import get_all_languages
     "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
     "if you are running out of GPU memory. Only relevant if the model is generative.",
 )
+@click.option(
+    "--attention-backend",
+    default="FLASHINFER",
+    show_default=True,
+    type=click.Choice(ATTENTION_BACKENDS, case_sensitive=True),
+    help="The attention backend to use for vLLM. Only relevant if the model is "
+    "generative.",
+)
 @click.option(
     "--requires-safetensors",
     is_flag=True,
@@ -254,6 +263,7 @@ def benchmark(
     api_base: str | None,
     api_version: str | None,
     gpu_memory_utilization: float,
+    attention_backend: str,
     requires_safetensors: bool,
     generative_type: str | None,
     custom_datasets_file: Path,
@@ -285,6 +295,7 @@ def benchmark(
         api_base=api_base,
         api_version=api_version,
         gpu_memory_utilization=gpu_memory_utilization,
+        attention_backend=attention_backend,
         generative_type=GenerativeType[generative_type.upper()]
         if generative_type
         else None,
scandeval/constants.py CHANGED
@@ -33,8 +33,8 @@ GENERATIVE_PIPELINE_TAGS = [
 # Used to disallow non-generative models to be evaluated on these task groups
 GENERATIVE_DATASET_TASK_GROUPS = [TaskGroup.TEXT_TO_TEXT]
 
-# Local models are required to have these files in their directory
-LOCAL_MODELS_REQUIRED_FILES = ["config.json"]
+# Local models are required to have one of these files in their directory
+LOCAL_MODELS_REQUIRED_FILES = ["config.json", "adapter_config.json"]
 
 # The number of top log probabilities to return for generative models. For several APIs
 # this is the maximum number of log probabilities that can be returned
@@ -105,3 +105,37 @@ GENERATION_KWARGS = {
     "top_k": 0,
     "repetition_penalty": 1.0,
 }
+
+# This is a mirror of `AttentionBackendEnum` in vLLM, but since we don't have access to
+# this when running on CPU/MacOS (as we can only run an old vLLM version), we have to
+# define it here
+ATTENTION_BACKENDS: list[str] = [
+    "FLASH_ATTN",
+    "FLASH_ATTN_DIFFKV",
+    "TRITON_ATTN",
+    "ROCM_ATTN",
+    "ROCM_AITER_MLA",
+    "ROCM_AITER_TRITON_MLA",
+    "ROCM_AITER_FA",
+    "ROCM_AITER_MLA_SPARSE",
+    "TORCH_SDPA",
+    "FLASHINFER",
+    "FLASHINFER_MLA",
+    "TRITON_MLA",
+    "CUTLASS_MLA",
+    "FLASHMLA",
+    "FLASHMLA_SPARSE",
+    "FLASH_ATTN_MLA",
+    "IPEX",
+    "NO_ATTENTION",
+    "FLEX_ATTENTION",
+    "TREE_ATTN",
+    "ROCM_AITER_UNIFIED_ATTN",
+    "CPU_ATTN",
+    "CUSTOM",
+]
+
+# If a dataset configuration has more than this number of languages, we won't log any of
+# the languages. This is for instance the case for the speed benchmark, which has all
+# the languages. The threshold of 5 is somewhat arbitrary.
+MAX_NUMBER_OF_LOGGING_LANGUAGES = 5
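The new threshold is only defined in this file; as a rough illustration of the behaviour described in the comment (the helper below is hypothetical, and the actual call site is elsewhere in the package and not shown in this diff):

    MAX_NUMBER_OF_LOGGING_LANGUAGES = 5

    def loggable_languages(language_codes: list[str]) -> str:
        # Skip listing languages entirely when a dataset covers more than the
        # threshold, e.g. the speed benchmark, which covers all of them.
        if len(language_codes) > MAX_NUMBER_OF_LOGGING_LANGUAGES:
            return ""
        return ", ".join(language_codes)

    print(loggable_languages(["da", "sv", "no"]))  # -> "da, sv, no"
    print(loggable_languages(["da", "sv", "no", "is", "fo", "nl", "de"]))  # -> ""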