ScandEval 16.10.1__py3-none-any.whl → 16.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scandeval/__init__.py +0 -9
- scandeval/benchmark_config_factory.py +5 -0
- scandeval/benchmark_modules/hf.py +36 -8
- scandeval/benchmark_modules/litellm.py +119 -22
- scandeval/benchmark_modules/vllm.py +202 -94
- scandeval/benchmarker.py +28 -7
- scandeval/cli.py +13 -0
- scandeval/constants.py +31 -2
- scandeval/data_models.py +12 -2
- scandeval/dataset_configs/dutch.py +10 -0
- scandeval/logging_utils.py +1 -1
- scandeval/metrics/__init__.py +1 -0
- scandeval/metrics/bias.py +237 -0
- scandeval/metrics/huggingface.py +5 -3
- scandeval/metrics/llm_as_a_judge.py +79 -15
- scandeval/model_loading.py +2 -1
- scandeval/task_group_utils/sequence_classification.py +12 -3
- scandeval/tasks.py +22 -0
- scandeval/tokenisation_utils.py +12 -1
- scandeval/types.py +39 -0
- scandeval/utils.py +38 -66
- {scandeval-16.10.1.dist-info → scandeval-16.12.0.dist-info}/METADATA +50 -24
- {scandeval-16.10.1.dist-info → scandeval-16.12.0.dist-info}/RECORD +26 -25
- {scandeval-16.10.1.dist-info → scandeval-16.12.0.dist-info}/licenses/LICENSE +1 -1
- {scandeval-16.10.1.dist-info → scandeval-16.12.0.dist-info}/WHEEL +0 -0
- {scandeval-16.10.1.dist-info → scandeval-16.12.0.dist-info}/entry_points.txt +0 -0
@@ -21,6 +21,7 @@ from transformers.models.auto.tokenization_auto import AutoTokenizer
 from urllib3.exceptions import RequestError
 
 from ..constants import (
+    ATTENTION_BACKENDS,
     CUSTOM_STOP_TOKENS,
     GENERATION_KWARGS,
     GENERATIVE_PIPELINE_TAGS,
@@ -71,7 +72,6 @@ from ..tokenisation_utils import (
 )
 from ..types import ExtractLabelsFunction, Tokeniser
 from ..utils import (
-    attention_backend,
     clear_memory,
     create_model_cache_dir,
     get_hf_token,
@@ -90,18 +90,23 @@ except ImportError:
     )
 
 if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
-
-
+    import vllm.config
+
+    # MacOS/CPU installs an older version of vLLM, which doesn't have the attention
+    # config
+    if hasattr(vllm.config, "attention"):
+        from vllm.config.attention import AttentionConfig
+
+    from vllm import LLM, SamplingParams
+    from vllm.distributed.parallel_state import (
        destroy_distributed_environment,
        destroy_model_parallel,
     )
-    from vllm.lora.request import LoRARequest
-    from vllm.sampling_params import (
-        StructuredOutputsParams,
-    )
+    from vllm.lora.request import LoRARequest
+    from vllm.sampling_params import StructuredOutputsParams
 
 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
-    import ray
+    import ray
 
 
 if t.TYPE_CHECKING:
@@ -111,7 +116,9 @@ if t.TYPE_CHECKING:
     from ..data_models import BenchmarkConfig, DatasetConfig, Task
 
 
-MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS: dict[
+MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS: dict[
+    re.Pattern, t.Literal[*ATTENTION_BACKENDS]  # pyrefly: ignore[invalid-literal]
+] = {
     re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "TRITON_ATTN",
     re.compile(r"google/gemma-3-1b.*", flags=re.IGNORECASE): "TRITON_ATTN",
     re.compile(r"google/gemma-3n.*", flags=re.IGNORECASE): "TRITON_ATTN",
@@ -153,7 +160,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")
 
-        if shutil.which("nvcc") is None:
+        if torch.cuda.is_available() and shutil.which("nvcc") is None:
             raise NeedsSystemDependency(
                 dependency="nvcc",
                 instructions=(
@@ -163,23 +170,43 @@ class VLLMModel(HuggingFaceEncoderModel):
                 ),
             )
 
+        if not torch.cuda.is_available() and (
+            dataset_config.task.task_group
+            in [
+                TaskGroup.SEQUENCE_CLASSIFICATION,
+                TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+            ]
+            or dataset_config.task.uses_structured_output
+        ):
+            raise InvalidBenchmark(
+                "We currently require CUDA to benchmark generative models on tasks "
+                "that uses structured generation, which includes the current task "
+                f"{dataset_config.task.name}. This is due to an xgrammar issue, which "
+                "will hopefully be fixed soon."
+            )
+
         raise_if_wrong_params(
             model_config=model_config, allowed_params=self.allowed_params
         )
 
-        #
-
-
-
-
-
+        # Determine the attention backend to use:
+        # Override for models that require a specific backend, otherwise use user's
+        # choice from CLI (defaults to FLASHINFER)
+        if hasattr(vllm.config, "attention"):
+            for pattern, backend in MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS.items():
+                if re.search(pattern=pattern, string=model_config.model_id):
+                    attention_backend = backend
+                    break
+            else:
+                attention_backend = benchmark_config.attention_backend
+        else:
+            attention_backend = benchmark_config.attention_backend
 
-        with (
-            no_terminal_output(disable=benchmark_config.verbose),
-            attention_backend(value=default_flash_attention_backend),
-        ):
+        with no_terminal_output(disable=benchmark_config.verbose):
             model, tokeniser = load_model_and_tokeniser(
-                model_config=model_config,
+                model_config=model_config,
+                benchmark_config=benchmark_config,
+                attention_backend=attention_backend,
             )
         self._model: "LLM" = model
         self._tokeniser: Tokeniser = tokeniser
@@ -216,11 +243,14 @@ class VLLMModel(HuggingFaceEncoderModel):
            )
        )
        if self.model_config.adapter_base_model_id is not None:
-
-
-
-
-
+            if Path(self.model_config.model_id).exists():
+                adapter_path = self.model_config.model_id
+            else:
+                adapter_path = snapshot_download(
+                    repo_id=self.model_config.model_id,
+                    revision=self.model_config.revision,
+                    cache_dir=Path(self.model_config.model_cache_dir),
+                )
            self.buffer["lora_request"] = LoRARequest(
                lora_name="adapter", lora_int_id=1, lora_path=adapter_path
            )
@@ -500,7 +530,8 @@ class VLLMModel(HuggingFaceEncoderModel):
            log_once(
                f"Using temperature={temperature} with the model "
                f"{self.model_config.model_id!r} as specified in its "
-                "generation configuration."
+                "generation configuration.",
+                level=logging.DEBUG,
            )
        if "top_p" in changed_params:
            top_p = changed_params["top_p"]
@@ -508,7 +539,8 @@ class VLLMModel(HuggingFaceEncoderModel):
            log_once(
                f"Using top_p={top_p} with the model "
                f"{self.model_config.model_id!r} as specified in its "
-                "generation configuration."
+                "generation configuration.",
+                level=logging.DEBUG,
            )
        if "top_k" in changed_params:
            top_k = changed_params["top_k"]
@@ -516,7 +548,8 @@ class VLLMModel(HuggingFaceEncoderModel):
            log_once(
                f"Using top_k={top_k} with the model "
                f"{self.model_config.model_id!r} as specified in its "
-                "generation configuration."
+                "generation configuration.",
+                level=logging.DEBUG,
            )
        if "repetition_penalty" in changed_params:
            repetition_penalty = changed_params["repetition_penalty"]
@@ -524,8 +557,10 @@ class VLLMModel(HuggingFaceEncoderModel):
            log_once(
                f"Using repetition_penalty={repetition_penalty} with the model "
                f"{self.model_config.model_id!r} as specified in its "
-                "generation configuration."
+                "generation configuration.",
+                level=logging.DEBUG,
            )
+
        max_tokens: int = (
            REASONING_MAX_TOKENS
            if self.generative_type == GenerativeType.REASONING
@@ -538,7 +573,7 @@ class VLLMModel(HuggingFaceEncoderModel):
            else None,
            temperature=generation_kwargs["temperature"],
            top_p=generation_kwargs["top_p"],
-            top_k=generation_kwargs["top_k"],
+            top_k=int(generation_kwargs["top_k"]),
            repetition_penalty=generation_kwargs["repetition_penalty"],
            stop=[stop_token for stop_token in stop_tokens if stop_token],
            structured_outputs=structured_outputs,
@@ -547,10 +582,12 @@ class VLLMModel(HuggingFaceEncoderModel):
        # If any of the prompts are empty then we need to replace them with a BOS token
        # so that the vLLM model can generate from them
        prompts: c.Sequence[str] = inputs["text"]
-        if any(len(prompt) == 0 for prompt in prompts):
+        if any(len(prompt.strip()) == 0 for prompt in prompts):
            log("Found empty prompts, replacing with BOS token.", level=logging.DEBUG)
            prompts = [
-                prompt
+                prompt
+                if len(prompt.strip()) > 0
+                else str(self._tokeniser.bos_token or "x")
                for prompt in prompts
            ]
 
@@ -567,16 +604,78 @@ class VLLMModel(HuggingFaceEncoderModel):
        )
        prompts = [prompt.strip() for prompt in prompts]
 
-        # Truncate the prompts if needed
-
-
-
-
-
+        # Truncate the prompts if needed
+        max_tokens_per_prompt = min(
+            self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH
+        )
+        max_tokens_per_prompt -= min(
+            self.dataset_config.max_generated_tokens, max_tokens_per_prompt - 1
+        )
+        tokenized_prompts = self._tokeniser(
+            text=prompts, max_length=max_tokens_per_prompt
+        )
+        if any(
+            len(input_ids) >= max_tokens_per_prompt
+            for input_ids in tokenized_prompts.input_ids
+        ):
+            log(
+                f"Truncating prompts for the model {self.model_config.model_id!r} "
+                f"to a maximum of {max_tokens_per_prompt:,} tokens.",
+                level=logging.DEBUG,
            )
-
-
+            match self.generative_type:
+                case GenerativeType.BASE:
+                    truncated_tokenized_prompts = self._tokeniser(
+                        text=prompts, max_length=max_tokens_per_prompt, truncation=True
+                    )
+                    prompts = self._tokeniser.batch_decode(
+                        sequences=truncated_tokenized_prompts.input_ids,
+                        skip_special_tokens=True,
+                    )
+                case GenerativeType.INSTRUCTION_TUNED | GenerativeType.REASONING:
+                    assert self.end_of_chat_token_ids is not None, (
+                        "The end-of-chat token IDs should be set for instruction-tuned "
+                        "and reasoning models."
+                    )
+                    end_of_chat_token = self._tokeniser.decode(
+                        list(self.end_of_chat_token_ids)
+                    )
+                    prompt_segments: list[list[str]] = [
+                        prompt.replace(self._tokeniser.bos_token, "").split(
+                            end_of_chat_token
+                        )
+                        for prompt in prompts
+                    ]
+                    for num_few_shots_to_remove in range(
+                        1, self.dataset_config.num_few_shot_examples + 1
+                    ):
+                        new_prompts = [
+                            end_of_chat_token.join(
+                                prompt_segment[2 * num_few_shots_to_remove :]
+                            )
+                            for prompt_segment in prompt_segments
+                        ]
+                        tokenized_prompts = self._tokeniser(
+                            text=new_prompts, max_length=max_tokens_per_prompt
+                        )
+                        if all(
+                            len(input_ids) < max_tokens_per_prompt
+                            for input_ids in tokenized_prompts.input_ids
+                        ):
+                            prompts = new_prompts
+                            break
+                    else:
+                        raise InvalidBenchmark(
+                            "Truncation of prompts failed, some prompts are still too "
+                            "long."
+                        )
+                case _:
+                    raise InvalidBenchmark("The model type is not set!")
+        else:
+            log(
+                f"Truncation of prompts for model {self.model_config.model_id!r} is "
+                "not needed, so skipping truncation.",
+                level=logging.DEBUG,
            )
 
        # Generate sequences using vLLM
@@ -598,10 +697,11 @@ class VLLMModel(HuggingFaceEncoderModel):
                level=logging.DEBUG,
            )
            sleep(1)
-        except ValueError as e:
+        except (ValueError, RuntimeError) as e:
            # Truncate the prompts if they are too long for the model
            truncate_error_messages = [
-                r"prompt \(length [0-9]+\) is longer than the maximum model length"
+                r"prompt \(length [0-9]+\) is longer than the maximum model length",
+                "Sampled token IDs exceed the max model length",
            ]
            if any(
                re.search(pattern, str(e), flags=re.IGNORECASE) is not None
@@ -873,7 +973,11 @@ class VLLMModel(HuggingFaceEncoderModel):
 
 
 def load_model_and_tokeniser(
-    model_config: "ModelConfig",
+    model_config: "ModelConfig",
+    benchmark_config: "BenchmarkConfig",
+    attention_backend: t.Literal[
+        *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+    ],
 ) -> tuple["LLM", Tokeniser]:
     """Load the model and tokeniser.
 
@@ -882,6 +986,8 @@ def load_model_and_tokeniser(
            The model configuration.
        benchmark_config:
            The benchmark configuration.
+        attention_backend:
+            The attention backend to use.
 
     Returns:
        A pair (model, tokeniser), with the loaded model and tokeniser
@@ -905,19 +1011,6 @@ def load_model_and_tokeniser(
        run_with_cli=benchmark_config.run_with_cli,
     )
 
-    quantization = None
-    if hasattr(hf_model_config, "quantization_config"):
-        quantization = hf_model_config.quantization_config.get("quant_method")
-
-    # The quantised models require extra dependencies
-    if quantization == "gptq" and (
-        importlib.util.find_spec("auto_gptq") is None
-        or importlib.util.find_spec("optimum") is None
-    ):
-        raise NeedsExtraInstalled(extra="quantization")
-    if quantization == "awq" and importlib.util.find_spec("awq") is None:
-        raise NeedsExtraInstalled(extra="quantization")
-
     # Start with dtype being the "auto" vLLM dtype
     dtype: str | torch.dtype = "auto"
 
@@ -940,23 +1033,6 @@ def load_model_and_tokeniser(
        )
        dtype = torch.float16
 
-    # If the model is a quantized model, we might need to change the dtype
-    if quantization == "mxfp4" and hf_model_config.dtype is None:
-        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
-        log(
-            "You are loading a quantized model where `dtype` has not been set. "
-            f"Setting dtype to {dtype!r}.",
-            level=logging.DEBUG,
-        )
-    elif quantization is not None and hf_model_config.dtype != torch.float16:
-        log(
-            "You are loading a quantized model with dtype "
-            f"{hf_model_config.dtype}, which vLLM does not support. Setting "
-            "dtype to float16 instead.",
-            level=logging.WARNING,
-        )
-        dtype = torch.float16
-
     # If the model is a bf16 model, we need to check the CUDA compute capability
     if hf_model_config.dtype == torch.bfloat16:
        min_cuda_compute_capability = get_min_cuda_compute_capability()
@@ -974,6 +1050,28 @@ def load_model_and_tokeniser(
        )
        dtype = torch.float16
 
+    quantization = None
+    if hasattr(hf_model_config, "quantization_config"):
+        quantization = hf_model_config.quantization_config.get("quant_method")
+
+    # The quantised models require extra dependencies
+    if quantization == "gptq" and (
+        importlib.util.find_spec("auto_gptq") is None
+        or importlib.util.find_spec("optimum") is None
+    ):
+        raise NeedsExtraInstalled(extra="quantization")
+    if quantization == "awq" and importlib.util.find_spec("awq") is None:
+        raise NeedsExtraInstalled(extra="quantization")
+
+    # If the model is a quantized model, let vLLM decide the dtype
+    if quantization is not None:
+        log(
+            f"You are loading a quantized model with quantization {quantization}. "
+            "Forcing the vLLM dtype to 'auto'",
+            level=logging.WARNING,
+        )
+        dtype = "auto"
+
     if model_config.adapter_base_model_id is not None:
        download_dir = str(Path(model_config.model_cache_dir) / "base_model")
     else:
@@ -1006,10 +1104,15 @@ def load_model_and_tokeniser(
        model_config=model_config,
        token=get_hf_token(api_key=benchmark_config.api_key),
     )
-
+    vllm_params = get_vllm_tokenisation_params(
        tokeniser=tokeniser, model_config=model_config
     )
 
+    # MacOS/CPU installs an older version of vLLM, which doesn't have the attention
+    # config
+    if hasattr(vllm.config, "attention"):
+        vllm_params["attention_config"] = AttentionConfig(backend=attention_backend)
+
     clear_vllm()
 
     distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size = (
@@ -1017,19 +1120,21 @@ def load_model_and_tokeniser(
     )
 
     try:
+        model_location = (
+            model_id
+            if internet_connection_available() or Path(model_id).is_dir()
+            else resolve_model_path(download_dir=download_dir)
+        )
+
+        max_model_len = min(
+            true_max_model_len, MAX_CONTEXT_LENGTH + REASONING_MAX_TOKENS
+        )
        model = LLM(
-            model=
-
-                if internet_connection_available()
-                else resolve_model_path(download_dir=download_dir)
-            ),
-            tokenizer=(
-                model_id
-                if internet_connection_available()
-                else resolve_model_path(download_dir=download_dir)
-            ),
+            model=model_location,
+            tokenizer=model_location,
            gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
-            max_model_len=
+            max_model_len=max_model_len,
+            max_num_batched_tokens=max_model_len,
            download_dir=download_dir,
            trust_remote_code=benchmark_config.trust_remote_code,
            revision=revision,
@@ -1046,7 +1151,7 @@ def load_model_and_tokeniser(
            enable_prefix_caching=False,
            enable_lora=model_config.adapter_base_model_id is not None,
            max_lora_rank=256,
-            **
+            **vllm_params,
        )
     except (RuntimeError, ValueError, OSError) as e:
        if "awaiting a review from the repo authors" in str(e):
@@ -1071,11 +1176,11 @@ def load_model_and_tokeniser(
            (
                "Since you're running in verbose mode, you might see a descriptive "
                "error above already. Note however that if the error message urges "
-                "you to
-                "
-                "as that often solves the
-                "doesn't. If you don't
-                "can try "
+                "you to use the attention backend 'FLEX_ATTENTION', please try "
+                "setting it to 'TRITON_ATTN' instead using the "
+                "`--attention-backend` CLI argument, as that often solves the "
+                "issue, whereas 'FLEX_ATTENTION' usually doesn't. If you don't "
+                "see any descriptive error above, then you can try "
            )
            if benchmark_config.verbose
            else "Try "
@@ -1450,6 +1555,9 @@ def select_backend_and_parallelism() -> tuple[str, int, int]:
        - tensor_parallel_size (int): Number of GPUs per node.
        - pipeline_parallel_size (int): Number of stages across nodes.
     """
+    if not torch.cuda.is_available():
+        return "mp", 1, 1
+
     if not ray.is_initialized():
        try:
            ray.init(address="auto", ignore_reinit_error=True)
@@ -1476,7 +1584,7 @@ def select_backend_and_parallelism() -> tuple[str, int, int]:
     pipeline_parallel_size = max(1, total_gpus // tensor_parallel_size)
     log_once(
        f"Detected a multi-node setup with {pipeline_parallel_size:,} nodes, each "
-        "with {tensor_parallel_size:,} GPUs, so using `ray` as the "
+        f"with {tensor_parallel_size:,} GPUs, so using `ray` as the "
        "distributed backend.",
        level=logging.DEBUG,
     )
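
Read on its own, the backend selection added to `VLLMModel` above is a small lookup: a regex table maps known-problematic model IDs to a forced backend, and everything else falls back to the backend chosen in the benchmark config. The following is a minimal standalone sketch of that logic, not the library's code; the helper name `pick_attention_backend` is invented for illustration, while the pattern table mirrors the hunks above.

import re

# Patterns copied from the diff above; all currently force the Triton backend.
MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS: dict[re.Pattern, str] = {
    re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "TRITON_ATTN",
    re.compile(r"google/gemma-3-1b.*", flags=re.IGNORECASE): "TRITON_ATTN",
    re.compile(r"google/gemma-3n.*", flags=re.IGNORECASE): "TRITON_ATTN",
}

def pick_attention_backend(model_id: str, configured_backend: str) -> str:
    """Return a forced backend for known-problematic models, else the configured one."""
    for pattern, backend in MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS.items():
        if re.search(pattern=pattern, string=model_id):
            return backend
    return configured_backend

print(pick_attention_backend("google/gemma-3-1b-it", "FLASHINFER"))  # -> TRITON_ATTN
print(pick_attention_backend("mistralai/Mistral-7B-v0.1", "FLASHINFER"))  # -> FLASHINFER
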
scandeval/benchmarker.py CHANGED
@@ -15,7 +15,7 @@ from time import sleep
 from torch.distributed import destroy_process_group
 
 from .benchmark_config_factory import build_benchmark_config
-from .constants import GENERATIVE_PIPELINE_TAGS
+from .constants import ATTENTION_BACKENDS, GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data, load_raw_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
@@ -79,6 +79,7 @@ class Benchmarker:
        api_base: str | None = None,
        api_version: str | None = None,
        gpu_memory_utilization: float = 0.8,
+        attention_backend: str = "FLASHINFER",
        generative_type: GenerativeType | None = None,
        custom_datasets_file: Path | str = Path("custom_datasets.py"),
        debug: bool = False,
@@ -149,6 +150,9 @@ class Benchmarker:
                is generative. A larger value will result in faster evaluation, but at
                the risk of running out of GPU memory. Only reduce this if you are
                running out of GPU memory. Defaults to 0.9.
+            attention_backend:
+                The attention backend to use for vLLM. Defaults to FLASHINFER. Only
+                relevant if the model is generative.
            generative_type:
                The type of generative model to benchmark. Only relevant if the model is
                generative. If not specified, then the type will be inferred based on
@@ -264,6 +268,7 @@ class Benchmarker:
            requires_safetensors=requires_safetensors,
            download_only=download_only,
            gpu_memory_utilization=gpu_memory_utilization,
+            attention_backend=attention_backend,
            generative_type=generative_type,
            custom_datasets_file=Path(custom_datasets_file),
            verbose=verbose,
@@ -385,6 +390,10 @@ class Benchmarker:
        download_only: bool | None = None,
        gpu_memory_utilization: float | None = None,
        generative_type: GenerativeType | None = None,
+        attention_backend: t.Literal[
+            *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+        ]
+        | None = None,
        custom_datasets_file: Path | str | None = None,
        force: bool | None = None,
        verbose: bool | None = None,
@@ -638,6 +647,11 @@ class Benchmarker:
                if generative_type is not None
                else self.benchmark_config_default_params.generative_type
            ),
+            attention_backend=(
+                attention_backend
+                if attention_backend is not None
+                else self.benchmark_config_default_params.attention_backend
+            ),
            custom_datasets_file=(
                Path(custom_datasets_file)
                if custom_datasets_file is not None
@@ -1045,8 +1059,16 @@ class Benchmarker:
                if model.generative_type is not None
                else None
            ),
-            few_shot=
-
+            few_shot=(
+                None
+                if dataset_config.task.requires_zero_shot
+                else benchmark_config.few_shot
+            ),
+            validation_split=(
+                None
+                if "val" not in dataset_config.splits
+                else not benchmark_config.evaluate_test_split
+            ),
        )
        log(f"Results:\n{results}", level=logging.DEBUG)
        return record
@@ -1122,12 +1144,10 @@ def get_record(
     same_revision = model_id_components.revision == model_config.revision
     same_param = model_id_components.param == model_config.param
     same_dataset = record.dataset == dataset_config.name
-    same_split =
-        record.validation_split != benchmark_config.evaluate_test_split
-        or "val" not in dataset_config.splits
-    )
+    same_split = record.validation_split != benchmark_config.evaluate_test_split
     same_num_shots = (
        record.few_shot == benchmark_config.few_shot
+        or record.few_shot is None
        or not record.generative
        or dataset_config.task.requires_zero_shot
     )
@@ -1225,6 +1245,7 @@ def initial_logging(
        f"{dataset_config.logging_string} ({num_finished_benchmarks + 1}/"
        f"{num_total_benchmarks} benchmarks)...",
        prefix=f"\n[{dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]",
+        level=logging.INFO,
     )
 
     if dataset_config.unofficial:
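
For reference, the new `attention_backend` argument surfaces directly on the `Benchmarker` constructor shown above. The snippet below is a minimal usage sketch, assuming the usual `from scandeval import Benchmarker` import; only `gpu_memory_utilization` and `attention_backend` are taken from this diff, and the commented-out `benchmark(...)` call is indicative rather than a verbatim signature.

from scandeval import Benchmarker

benchmarker = Benchmarker(
    gpu_memory_utilization=0.8,
    attention_backend="TRITON_ATTN",  # overrides the "FLASHINFER" default
)
# benchmarker.benchmark(...)  # run evaluations as usual
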
scandeval/cli.py CHANGED
@@ -170,6 +170,17 @@ from .languages import get_all_languages
     "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
     "if you are running out of GPU memory. Only relevant if the model is generative.",
 )
+@click.option(
+    "--attention-backend",
+    default="FLASHINFER",
+    show_default=True,
+    type=click.Choice(
+        ["FLASHINFER", "FLASH_ATTN", "TRITON_ATTN", "FLEX_ATTENTION"],
+        case_sensitive=True,
+    ),
+    help="The attention backend to use for vLLM. Only relevant if the model is "
+    "generative.",
+)
 @click.option(
     "--requires-safetensors",
     is_flag=True,
@@ -254,6 +265,7 @@ def benchmark(
     api_base: str | None,
     api_version: str | None,
     gpu_memory_utilization: float,
+    attention_backend: str,
     requires_safetensors: bool,
     generative_type: str | None,
     custom_datasets_file: Path,
@@ -285,6 +297,7 @@ def benchmark(
        api_base=api_base,
        api_version=api_version,
        gpu_memory_utilization=gpu_memory_utilization,
+        attention_backend=attention_backend,
        generative_type=GenerativeType[generative_type.upper()]
        if generative_type
        else None,
scandeval/constants.py CHANGED
@@ -33,8 +33,8 @@ GENERATIVE_PIPELINE_TAGS = [
 # Used to disallow non-generative models to be evaluated on these task groups
 GENERATIVE_DATASET_TASK_GROUPS = [TaskGroup.TEXT_TO_TEXT]
 
-# Local models are required to have these files in their directory
-LOCAL_MODELS_REQUIRED_FILES = ["config.json"]
+# Local models are required to have one of these files in their directory
+LOCAL_MODELS_REQUIRED_FILES = ["config.json", "adapter_config.json"]
 
 # The number of top log probabilities to return for generative models. For several APIs
 # this is the maximum number of log probabilities that can be returned
@@ -105,3 +105,32 @@ GENERATION_KWARGS = {
     "top_k": 0,
     "repetition_penalty": 1.0,
 }
+
+# This is a mirror of `AttentionBackendEnum` in vLLM, but since we don't have access to
+# this when running on CPU/MacOS (as we can only run an old vLLM version), we have to
+# define it here
+ATTENTION_BACKENDS: list[str] = [
+    "FLASH_ATTN",
+    "FLASH_ATTN_DIFFKV",
+    "TRITON_ATTN",
+    "ROCM_ATTN",
+    "ROCM_AITER_MLA",
+    "ROCM_AITER_TRITON_MLA",
+    "ROCM_AITER_FA",
+    "ROCM_AITER_MLA_SPARSE",
+    "TORCH_SDPA",
+    "FLASHINFER",
+    "FLASHINFER_MLA",
+    "TRITON_MLA",
+    "CUTLASS_MLA",
+    "FLASHMLA",
+    "FLASHMLA_SPARSE",
+    "FLASH_ATTN_MLA",
+    "IPEX",
+    "NO_ATTENTION",
+    "FLEX_ATTENTION",
+    "TREE_ATTN",
+    "ROCM_AITER_UNIFIED_ATTN",
+    "CPU_ATTN",
+    "CUSTOM",
+]
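
The `ATTENTION_BACKENDS` list above is also what the `t.Literal[*ATTENTION_BACKENDS]` annotations in `vllm.py` and `benchmarker.py` unpack into. Below is a small sketch of that typing trick with an abridged list; it assumes Python 3.11+ (star-unpacking inside a subscript), and static checkers typically reject the non-literal spread, which is why the hunks above carry `pyrefly: ignore[invalid-literal]` comments.

import typing as t

ATTENTION_BACKENDS: list[str] = ["FLASH_ATTN", "TRITON_ATTN", "FLASHINFER"]  # abridged

# At runtime this evaluates to Literal["FLASH_ATTN", "TRITON_ATTN", "FLASHINFER"],
# so the alias can be used in signatures that accept a backend name.
AttentionBackend = t.Literal[*ATTENTION_BACKENDS]

def describe(backend: AttentionBackend) -> str:
    return f"vLLM attention backend: {backend}"

print(describe("TRITON_ATTN"))
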