ScandEval 16.11.0-py3-none-any.whl → 16.13.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scandeval/__init__.py +0 -9
- scandeval/async_utils.py +46 -0
- scandeval/benchmark_config_factory.py +31 -2
- scandeval/benchmark_modules/fresh.py +2 -1
- scandeval/benchmark_modules/hf.py +76 -23
- scandeval/benchmark_modules/litellm.py +33 -15
- scandeval/benchmark_modules/vllm.py +97 -44
- scandeval/benchmarker.py +29 -33
- scandeval/cli.py +11 -0
- scandeval/constants.py +36 -2
- scandeval/custom_dataset_configs.py +152 -0
- scandeval/data_loading.py +87 -31
- scandeval/data_models.py +405 -224
- scandeval/dataset_configs/__init__.py +51 -25
- scandeval/dataset_configs/albanian.py +1 -1
- scandeval/dataset_configs/belarusian.py +47 -0
- scandeval/dataset_configs/bulgarian.py +1 -1
- scandeval/dataset_configs/catalan.py +1 -1
- scandeval/dataset_configs/croatian.py +1 -1
- scandeval/dataset_configs/danish.py +3 -2
- scandeval/dataset_configs/dutch.py +16 -5
- scandeval/dataset_configs/english.py +4 -3
- scandeval/dataset_configs/estonian.py +8 -7
- scandeval/dataset_configs/faroese.py +1 -1
- scandeval/dataset_configs/finnish.py +5 -4
- scandeval/dataset_configs/french.py +6 -5
- scandeval/dataset_configs/german.py +4 -3
- scandeval/dataset_configs/greek.py +1 -1
- scandeval/dataset_configs/hungarian.py +1 -1
- scandeval/dataset_configs/icelandic.py +4 -3
- scandeval/dataset_configs/italian.py +4 -3
- scandeval/dataset_configs/latvian.py +2 -2
- scandeval/dataset_configs/lithuanian.py +1 -1
- scandeval/dataset_configs/norwegian.py +6 -5
- scandeval/dataset_configs/polish.py +4 -3
- scandeval/dataset_configs/portuguese.py +5 -4
- scandeval/dataset_configs/romanian.py +2 -2
- scandeval/dataset_configs/serbian.py +1 -1
- scandeval/dataset_configs/slovene.py +1 -1
- scandeval/dataset_configs/spanish.py +4 -3
- scandeval/dataset_configs/swedish.py +4 -3
- scandeval/dataset_configs/ukrainian.py +1 -1
- scandeval/generation_utils.py +6 -6
- scandeval/metrics/__init__.py +1 -0
- scandeval/metrics/bias.py +237 -0
- scandeval/metrics/huggingface.py +2 -1
- scandeval/metrics/llm_as_a_judge.py +1 -1
- scandeval/metrics/pipeline.py +1 -1
- scandeval/model_cache.py +34 -4
- scandeval/prompt_templates/linguistic_acceptability.py +9 -0
- scandeval/prompt_templates/multiple_choice.py +9 -0
- scandeval/prompt_templates/named_entity_recognition.py +21 -0
- scandeval/prompt_templates/reading_comprehension.py +10 -0
- scandeval/prompt_templates/sentiment_classification.py +11 -0
- scandeval/string_utils.py +157 -0
- scandeval/task_group_utils/sequence_classification.py +2 -5
- scandeval/task_group_utils/token_classification.py +2 -4
- scandeval/tasks.py +22 -0
- scandeval/tokenisation_utils.py +12 -1
- scandeval/utils.py +13 -383
- scandeval-16.13.0.dist-info/METADATA +334 -0
- scandeval-16.13.0.dist-info/RECORD +94 -0
- scandeval-16.11.0.dist-info/METADATA +0 -649
- scandeval-16.11.0.dist-info/RECORD +0 -89
- {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
- {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
- {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/benchmark_modules/vllm.py  CHANGED

@@ -21,6 +21,7 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer
 from urllib3.exceptions import RequestError
 
 from ..constants import (
+    ATTENTION_BACKENDS,
     CUSTOM_STOP_TOKENS,
     GENERATION_KWARGS,
     GENERATIVE_PIPELINE_TAGS,
@@ -53,6 +54,8 @@ from ..generation_utils import (
 )
 from ..languages import get_all_languages
 from ..logging_utils import get_pbar, log, log_once, no_terminal_output
+from ..model_cache import create_model_cache_dir
+from ..string_utils import split_model_id
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -71,14 +74,11 @@ from ..tokenisation_utils import (
 )
 from ..types import ExtractLabelsFunction, Tokeniser
 from ..utils import (
-    attention_backend,
     clear_memory,
-    create_model_cache_dir,
     get_hf_token,
     get_min_cuda_compute_capability,
     internet_connection_available,
     resolve_model_path,
-    split_model_id,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
 
@@ -90,18 +90,23 @@ except ImportError:
     )
 
 if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
-    from vllm import LLM, SamplingParams
-    from vllm.distributed.parallel_state import (
+    import vllm.config
+
+    # MacOS/CPU installs an older version of vLLM, which doesn't have the attention
+    # config
+    if hasattr(vllm.config, "attention"):
+        from vllm.config.attention import AttentionConfig
+
+    from vllm import LLM, SamplingParams
+    from vllm.distributed.parallel_state import (
         destroy_distributed_environment,
         destroy_model_parallel,
     )
-    from vllm.lora.request import LoRARequest
-    from vllm.sampling_params import (
-        StructuredOutputsParams,
-    )
+    from vllm.lora.request import LoRARequest
+    from vllm.sampling_params import StructuredOutputsParams
 
 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
-    import ray
+    import ray
@@ -111,7 +116,9 @@ if t.TYPE_CHECKING:
     from ..data_models import BenchmarkConfig, DatasetConfig, Task
 
 
-MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS: dict[
+MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS: dict[
+    re.Pattern, t.Literal[*ATTENTION_BACKENDS]  # pyrefly: ignore[invalid-literal]
+] = {
     re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "TRITON_ATTN",
     re.compile(r"google/gemma-3-1b.*", flags=re.IGNORECASE): "TRITON_ATTN",
     re.compile(r"google/gemma-3n.*", flags=re.IGNORECASE): "TRITON_ATTN",
@@ -153,7 +160,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")
 
-        if shutil.which("nvcc") is None:
+        if torch.cuda.is_available() and shutil.which("nvcc") is None:
             raise NeedsSystemDependency(
                 dependency="nvcc",
                 instructions=(
@@ -163,23 +170,43 @@ class VLLMModel(HuggingFaceEncoderModel):
                 ),
             )
 
+        if not torch.cuda.is_available() and (
+            dataset_config.task.task_group
+            in [
+                TaskGroup.SEQUENCE_CLASSIFICATION,
+                TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+            ]
+            or dataset_config.task.uses_structured_output
+        ):
+            raise InvalidBenchmark(
+                "We currently require CUDA to benchmark generative models on tasks "
+                "that uses structured generation, which includes the current task "
+                f"{dataset_config.task.name}. This is due to an xgrammar issue, which "
+                "will hopefully be fixed soon."
+            )
+
         raise_if_wrong_params(
             model_config=model_config, allowed_params=self.allowed_params
         )
 
-        #
-
-
-
-
-
+        # Determine the attention backend to use:
+        # Override for models that require a specific backend, otherwise use user's
+        # choice from CLI (defaults to FLASHINFER)
+        if hasattr(vllm.config, "attention"):
+            for pattern, backend in MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS.items():
+                if re.search(pattern=pattern, string=model_config.model_id):
+                    attention_backend = backend
+                    break
+            else:
+                attention_backend = benchmark_config.attention_backend
+        else:
+            attention_backend = benchmark_config.attention_backend
 
-        with (
-            no_terminal_output(disable=benchmark_config.verbose),
-            attention_backend(value=default_flash_attention_backend),
-        ):
+        with no_terminal_output(disable=benchmark_config.verbose):
             model, tokeniser = load_model_and_tokeniser(
-                model_config=model_config,
+                model_config=model_config,
+                benchmark_config=benchmark_config,
+                attention_backend=attention_backend,
             )
             self._model: "LLM" = model
             self._tokeniser: Tokeniser = tokeniser
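As context for the hunk above: the added block picks an attention backend per model by matching the model ID against MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS and otherwise falls back to the user's --attention-backend choice (default FLASHINFER). The standalone sketch below restates that for/else lookup; the OVERRIDES table and the pick_attention_backend name are illustrative only and are not part of the package.

    import re

    # Abridged mirror of MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS from the diff above.
    OVERRIDES: dict[re.Pattern, str] = {
        re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "TRITON_ATTN",
        re.compile(r"google/gemma-3-1b.*", flags=re.IGNORECASE): "TRITON_ATTN",
    }

    def pick_attention_backend(model_id: str, cli_choice: str = "FLASHINFER") -> str:
        """Return a per-model override if one matches, otherwise the user's choice."""
        for pattern, backend in OVERRIDES.items():
            if re.search(pattern=pattern, string=model_id):
                return backend
        return cli_choice

    assert pick_attention_backend("openai/gpt-oss-20b") == "TRITON_ATTN"
    assert pick_attention_backend("mistralai/Mistral-7B-v0.1") == "FLASHINFER"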
@@ -216,11 +243,14 @@ class VLLMModel(HuggingFaceEncoderModel):
             )
         )
         if self.model_config.adapter_base_model_id is not None:
-            adapter_path = snapshot_download(
-                repo_id=self.model_config.model_id,
-                revision=self.model_config.revision,
-                cache_dir=Path(self.model_config.model_cache_dir),
-            )
+            if Path(self.model_config.model_id).exists():
+                adapter_path = self.model_config.model_id
+            else:
+                adapter_path = snapshot_download(
+                    repo_id=self.model_config.model_id,
+                    revision=self.model_config.revision,
+                    cache_dir=Path(self.model_config.model_cache_dir),
+                )
             self.buffer["lora_request"] = LoRARequest(
                 lora_name="adapter", lora_int_id=1, lora_path=adapter_path
             )
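The adapter hunk above lets a LoRA adapter be given either as a local directory or as a Hub repository ID. Below is a minimal sketch of that resolution, assuming huggingface_hub.snapshot_download as in the diff itself; the resolve_adapter_path helper name is hypothetical.

    from pathlib import Path

    from huggingface_hub import snapshot_download

    def resolve_adapter_path(model_id: str, revision: str, cache_dir: str) -> str:
        """Use a local adapter directory as-is, otherwise fetch it from the Hub."""
        if Path(model_id).exists():
            return model_id
        return snapshot_download(
            repo_id=model_id, revision=revision, cache_dir=cache_dir
        )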
@@ -543,7 +573,7 @@ class VLLMModel(HuggingFaceEncoderModel):
             else None,
             temperature=generation_kwargs["temperature"],
             top_p=generation_kwargs["top_p"],
-            top_k=generation_kwargs["top_k"],
+            top_k=int(generation_kwargs["top_k"]),
             repetition_penalty=generation_kwargs["repetition_penalty"],
             stop=[stop_token for stop_token in stop_tokens if stop_token],
             structured_outputs=structured_outputs,
@@ -552,10 +582,12 @@
         # If any of the prompts are empty then we need to replace them with a BOS token
         # so that the vLLM model can generate from them
         prompts: c.Sequence[str] = inputs["text"]
-        if any(len(prompt) == 0 for prompt in prompts):
+        if any(len(prompt.strip()) == 0 for prompt in prompts):
             log("Found empty prompts, replacing with BOS token.", level=logging.DEBUG)
             prompts = [
-                prompt
+                prompt
+                if len(prompt.strip()) > 0
+                else str(self._tokeniser.bos_token or "x")
                 for prompt in prompts
             ]
 
@@ -583,7 +615,7 @@
                     text=prompts, max_length=max_tokens_per_prompt
                 )
                 if any(
-                    len(input_ids)
+                    len(input_ids) >= max_tokens_per_prompt
                     for input_ids in tokenized_prompts.input_ids
                 ):
                     log(
@@ -615,7 +647,7 @@
                         for prompt in prompts
                     ]
                    for num_few_shots_to_remove in range(
-
+                        1, self.dataset_config.num_few_shot_examples + 1
                    ):
                        new_prompts = [
                            end_of_chat_token.join(
@@ -627,7 +659,7 @@
                             text=new_prompts, max_length=max_tokens_per_prompt
                         )
                        if all(
-                            len(input_ids)
+                            len(input_ids) < max_tokens_per_prompt
                            for input_ids in tokenized_prompts.input_ids
                        ):
                            prompts = new_prompts
@@ -637,6 +669,8 @@
                             "Truncation of prompts failed, some prompts are still too "
                             "long."
                         )
+                case _:
+                    raise InvalidBenchmark("The model type is not set!")
         else:
             log(
                 f"Truncation of prompts for model {self.model_config.model_id!r} is "
@@ -939,7 +973,11 @@
 
 
 def load_model_and_tokeniser(
-    model_config: "ModelConfig",
+    model_config: "ModelConfig",
+    benchmark_config: "BenchmarkConfig",
+    attention_backend: t.Literal[
+        *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+    ],
 ) -> tuple["LLM", Tokeniser]:
     """Load the model and tokeniser.
 
@@ -948,6 +986,8 @@ def load_model_and_tokeniser(
             The model configuration.
         benchmark_config:
             The benchmark configuration.
+        attention_backend:
+            The attention backend to use.
 
     Returns:
         A pair (model, tokeniser), with the loaded model and tokeniser
@@ -1064,10 +1104,15 @@
         model_config=model_config,
         token=get_hf_token(api_key=benchmark_config.api_key),
     )
-
+    vllm_params = get_vllm_tokenisation_params(
         tokeniser=tokeniser, model_config=model_config
     )
 
+    # MacOS/CPU installs an older version of vLLM, which doesn't have the attention
+    # config
+    if hasattr(vllm.config, "attention"):
+        vllm_params["attention_config"] = AttentionConfig(backend=attention_backend)
+
     clear_vllm()
 
     distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size = (
@@ -1080,11 +1125,16 @@
             if internet_connection_available() or Path(model_id).is_dir()
             else resolve_model_path(download_dir=download_dir)
         )
+
+        max_model_len = min(
+            true_max_model_len, MAX_CONTEXT_LENGTH + REASONING_MAX_TOKENS
+        )
         model = LLM(
             model=model_location,
             tokenizer=model_location,
             gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
-            max_model_len=
+            max_model_len=max_model_len,
+            max_num_batched_tokens=max_model_len,
             download_dir=download_dir,
             trust_remote_code=benchmark_config.trust_remote_code,
             revision=revision,
@@ -1094,14 +1144,14 @@
             pipeline_parallel_size=pipeline_parallel_size,
             disable_custom_all_reduce=True,
             quantization=quantization,
-            dtype=dtype,
+            dtype=dtype,  # pyrefly: ignore[bad-argument-type]
             enforce_eager=True,
             # TEMP: Prefix caching isn't supported with sliding window in vLLM yet,
             # so we disable it for now
             enable_prefix_caching=False,
             enable_lora=model_config.adapter_base_model_id is not None,
             max_lora_rank=256,
-            **
+            **vllm_params,
         )
     except (RuntimeError, ValueError, OSError) as e:
         if "awaiting a review from the repo authors" in str(e):
@@ -1126,11 +1176,11 @@
             (
                 "Since you're running in verbose mode, you might see a descriptive "
                 "error above already. Note however that if the error message urges "
-                "you to
-                "
-                "as that often solves the
-                "doesn't. If you don't
-                "can try "
+                "you to use the attention backend 'FLEX_ATTENTION', please try "
+                "setting it to 'TRITON_ATTN' instead using the "
+                "`--attention-backend` CLI argument, as that often solves the "
+                "issue, whereas 'FLEX_ATTENTION' usually doesn't. If you don't "
+                "see any descriptive error above, then you can try "
             )
             if benchmark_config.verbose
             else "Try "
@@ -1505,6 +1555,9 @@ def select_backend_and_parallelism() -> tuple[str, int, int]:
         - tensor_parallel_size (int): Number of GPUs per node.
         - pipeline_parallel_size (int): Number of stages across nodes.
     """
+    if not torch.cuda.is_available():
+        return "mp", 1, 1
+
     if not ray.is_initialized():
         try:
             ray.init(address="auto", ignore_reinit_error=True)
scandeval/benchmarker.py  CHANGED
@@ -15,10 +15,9 @@ from time import sleep
 from torch.distributed import destroy_process_group
 
 from .benchmark_config_factory import build_benchmark_config
-from .constants import GENERATIVE_PIPELINE_TAGS
+from .constants import ATTENTION_BACKENDS, GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data, load_raw_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
-from .dataset_configs import get_all_dataset_configs
 from .enums import Device, GenerativeType, ModelType
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
@@ -28,12 +27,9 @@ from .model_config import get_model_config
 from .model_loading import load_model
 from .scores import log_scores
 from .speed_benchmark import benchmark_speed
+from .string_utils import split_model_id
 from .tasks import SPEED
-from .utils import (
-    enforce_reproducibility,
-    internet_connection_available,
-    split_model_id,
-)
+from .utils import enforce_reproducibility, internet_connection_available
 
 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
@@ -79,6 +75,9 @@ class Benchmarker:
         api_base: str | None = None,
         api_version: str | None = None,
         gpu_memory_utilization: float = 0.8,
+        attention_backend: t.Literal[
+            *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+        ] = "FLASHINFER",
         generative_type: GenerativeType | None = None,
         custom_datasets_file: Path | str = Path("custom_datasets.py"),
         debug: bool = False,
@@ -149,6 +148,9 @@
                 is generative. A larger value will result in faster evaluation, but at
                 the risk of running out of GPU memory. Only reduce this if you are
                 running out of GPU memory. Defaults to 0.9.
+            attention_backend:
+                The attention backend to use for vLLM. Defaults to FLASHINFER. Only
+                relevant if the model is generative.
             generative_type:
                 The type of generative model to benchmark. Only relevant if the model is
                 generative. If not specified, then the type will be inferred based on
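For Python API users, the new keyword is passed when constructing the Benchmarker (a later hunk also adds it as a per-call override in benchmark()). The sketch below is a hedged usage example: the top-level import follows the package's documented usage, and the model and dataset arguments are assumptions about the existing benchmark() signature rather than part of this diff.

    from scandeval import Benchmarker

    # Evaluate a generative model with an explicit vLLM attention backend.
    benchmarker = Benchmarker(attention_backend="TRITON_ATTN")
    benchmarker.benchmark(model="google/gemma-3-1b-it", dataset="angry-tweets")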
@@ -264,6 +266,7 @@
             requires_safetensors=requires_safetensors,
             download_only=download_only,
             gpu_memory_utilization=gpu_memory_utilization,
+            attention_backend=attention_backend,
             generative_type=generative_type,
             custom_datasets_file=Path(custom_datasets_file),
             verbose=verbose,
@@ -341,7 +344,9 @@
                 f"Loading data for {dataset_config.logging_string}", level=logging.INFO
             )
             dataset = load_raw_data(
-                dataset_config=dataset_config,
+                dataset_config=dataset_config,
+                cache_dir=benchmark_config.cache_dir,
+                api_key=benchmark_config.api_key,
             )
             del dataset
 
@@ -385,6 +390,10 @@
         download_only: bool | None = None,
         gpu_memory_utilization: float | None = None,
         generative_type: GenerativeType | None = None,
+        attention_backend: t.Literal[
+            *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+        ]
+        | None = None,
         custom_datasets_file: Path | str | None = None,
         force: bool | None = None,
         verbose: bool | None = None,
@@ -504,6 +513,11 @@
             ValueError:
                 If both `task` and `dataset` are specified.
         """
+        log(
+            "Started EuroEval run. Run with `--verbose` for more information.",
+            level=logging.INFO,
+        )
+
         if task is not None and dataset is not None:
             raise ValueError("Only one of `task` and `dataset` can be specified.")
 
@@ -638,6 +652,11 @@
                 if generative_type is not None
                 else self.benchmark_config_default_params.generative_type
             ),
+            attention_backend=(
+                attention_backend
+                if attention_backend is not None
+                else self.benchmark_config_default_params.attention_backend
+            ),
             custom_datasets_file=(
                 Path(custom_datasets_file)
                 if custom_datasets_file is not None
@@ -776,7 +795,7 @@
 
         # Update the benchmark config if the dataset requires it
         if (
-
+            dataset_config.val_split is None
             and not benchmark_config.evaluate_test_split
         ):
             log(
@@ -1052,7 +1071,7 @@
             ),
             validation_split=(
                 None
-                if
+                if dataset_config.val_split is None
                 else not benchmark_config.evaluate_test_split
             ),
         )
@@ -1167,29 +1186,6 @@ def clear_model_cache_fn(cache_dir: str) -> None:
             rmtree(sub_model_dir)
 
 
-def prepare_dataset_configs(
-    dataset_names: c.Sequence[str], custom_datasets_file: Path
-) -> c.Sequence["DatasetConfig"]:
-    """Prepare the dataset configuration(s) to be benchmarked.
-
-    Args:
-        dataset_names:
-            The dataset names to benchmark.
-        custom_datasets_file:
-            A path to a Python file containing custom dataset configurations.
-
-    Returns:
-        The prepared list of model IDs.
-    """
-    return [
-        cfg
-        for cfg in get_all_dataset_configs(
-            custom_datasets_file=custom_datasets_file
-        ).values()
-        if cfg.name in dataset_names
-    ]
-
-
 def initial_logging(
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
scandeval/cli.py  CHANGED
@@ -5,6 +5,7 @@ from pathlib import Path
 import click
 
 from .benchmarker import Benchmarker
+from .constants import ATTENTION_BACKENDS
 from .data_models import DatasetConfig
 from .enums import Device, GenerativeType
 from .languages import get_all_languages
@@ -170,6 +171,14 @@ from .languages import get_all_languages
     "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
     "if you are running out of GPU memory. Only relevant if the model is generative.",
 )
+@click.option(
+    "--attention-backend",
+    default="FLASHINFER",
+    show_default=True,
+    type=click.Choice(ATTENTION_BACKENDS, case_sensitive=True),
+    help="The attention backend to use for vLLM. Only relevant if the model is "
+    "generative.",
+)
 @click.option(
     "--requires-safetensors",
     is_flag=True,
@@ -254,6 +263,7 @@ def benchmark(
     api_base: str | None,
     api_version: str | None,
     gpu_memory_utilization: float,
+    attention_backend: str,
     requires_safetensors: bool,
     generative_type: str | None,
     custom_datasets_file: Path,
@@ -285,6 +295,7 @@
         api_base=api_base,
         api_version=api_version,
         gpu_memory_utilization=gpu_memory_utilization,
+        attention_backend=attention_backend,
         generative_type=GenerativeType[generative_type.upper()]
         if generative_type
         else None,
scandeval/constants.py  CHANGED
@@ -33,8 +33,8 @@ GENERATIVE_PIPELINE_TAGS = [
 # Used to disallow non-generative models to be evaluated on these task groups
 GENERATIVE_DATASET_TASK_GROUPS = [TaskGroup.TEXT_TO_TEXT]
 
-# Local models are required to have these files in their directory
-LOCAL_MODELS_REQUIRED_FILES = ["config.json"]
+# Local models are required to have one of these files in their directory
+LOCAL_MODELS_REQUIRED_FILES = ["config.json", "adapter_config.json"]
 
 # The number of top log probabilities to return for generative models. For several APIs
 # this is the maximum number of log probabilities that can be returned
@@ -105,3 +105,37 @@ GENERATION_KWARGS = {
     "top_k": 0,
     "repetition_penalty": 1.0,
 }
+
+# This is a mirror of `AttentionBackendEnum` in vLLM, but since we don't have access to
+# this when running on CPU/MacOS (as we can only run an old vLLM version), we have to
+# define it here
+ATTENTION_BACKENDS: list[str] = [
+    "FLASH_ATTN",
+    "FLASH_ATTN_DIFFKV",
+    "TRITON_ATTN",
+    "ROCM_ATTN",
+    "ROCM_AITER_MLA",
+    "ROCM_AITER_TRITON_MLA",
+    "ROCM_AITER_FA",
+    "ROCM_AITER_MLA_SPARSE",
+    "TORCH_SDPA",
+    "FLASHINFER",
+    "FLASHINFER_MLA",
+    "TRITON_MLA",
+    "CUTLASS_MLA",
+    "FLASHMLA",
+    "FLASHMLA_SPARSE",
+    "FLASH_ATTN_MLA",
+    "IPEX",
+    "NO_ATTENTION",
+    "FLEX_ATTENTION",
+    "TREE_ATTN",
+    "ROCM_AITER_UNIFIED_ATTN",
+    "CPU_ATTN",
+    "CUSTOM",
+]
+
+# If a dataset configuration has more than this number of languages, we won't log any of
+# the languages. This is for instance the case for the speed benchmark, which has all
+# the languages. The threshold of 5 is somewhat arbitrary.
+MAX_NUMBER_OF_LOGGING_LANGUAGES = 5