ScandEval 16.8.0-py3-none-any.whl → 16.10.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scandeval/benchmark_modules/hf.py +18 -3
- scandeval/benchmark_modules/litellm.py +14 -13
- scandeval/benchmark_modules/vllm.py +127 -9
- scandeval/benchmarker.py +0 -11
- scandeval/cli.py +39 -39
- scandeval/constants.py +9 -0
- scandeval/data_models.py +5 -0
- scandeval/dataset_configs/__init__.py +1 -0
- scandeval/dataset_configs/albanian.py +64 -0
- scandeval/dataset_configs/dutch.py +31 -1
- scandeval/dataset_configs/swedish.py +9 -0
- scandeval/logging_utils.py +1 -0
- scandeval/metrics/huggingface.py +82 -0
- scandeval/metrics/llm_as_a_judge.py +1 -3
- scandeval/model_config.py +2 -2
- scandeval/prompt_templates/__init__.py +1 -0
- scandeval/prompt_templates/linguistic_acceptability.py +9 -0
- scandeval/prompt_templates/multiple_choice.py +9 -0
- scandeval/prompt_templates/named_entity_recognition.py +20 -0
- scandeval/prompt_templates/reading_comprehension.py +9 -0
- scandeval/prompt_templates/sentiment_classification.py +11 -0
- scandeval/prompt_templates/simplification.py +23 -0
- scandeval/prompt_templates/summarization.py +11 -0
- scandeval/task_group_utils/question_answering.py +30 -19
- scandeval/task_group_utils/sequence_classification.py +4 -4
- scandeval/task_group_utils/text_to_text.py +3 -4
- scandeval/task_group_utils/token_classification.py +6 -8
- scandeval/tasks.py +11 -0
- scandeval/tokenisation_utils.py +7 -1
- scandeval/types.py +7 -1
- scandeval/utils.py +5 -6
- {scandeval-16.8.0.dist-info → scandeval-16.10.0.dist-info}/METADATA +21 -3
- {scandeval-16.8.0.dist-info → scandeval-16.10.0.dist-info}/RECORD +36 -34
- {scandeval-16.8.0.dist-info → scandeval-16.10.0.dist-info}/WHEEL +1 -1
- {scandeval-16.8.0.dist-info → scandeval-16.10.0.dist-info}/entry_points.txt +0 -0
- {scandeval-16.8.0.dist-info → scandeval-16.10.0.dist-info}/licenses/LICENSE +0 -0

scandeval/benchmark_modules/hf.py
CHANGED

@@ -33,7 +33,6 @@ from transformers.modelcard import TASK_MAPPING
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
-from transformers.tokenization_mistral_common import MistralCommonTokenizer
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError

@@ -80,6 +79,13 @@ from ..utils import (
 )
 from .base import BenchmarkModule

+try:
+    from transformers.tokenization_mistral_common import MistralCommonTokenizer
+except ImportError:
+    from transformers.tokenization_mistral_common import (
+        MistralCommonBackend as MistralCommonTokenizer,
+    )
+
 if t.TYPE_CHECKING:
     from transformers.configuration_utils import PretrainedConfig
     from transformers.tokenization_utils import PreTrainedTokenizer

@@ -175,7 +181,16 @@ class HuggingFaceEncoderModel(BenchmarkModule):
             and repo_info.safetensors is not None
             and "total" in repo_info.safetensors
         ):
-
+            num_params_candidates: list[int] = [repo_info.safetensors["total"]]
+            if "parameters" in repo_info.safetensors and isinstance(
+                repo_info.safetensors["parameters"], dict
+            ):
+                num_params_candidates.extend(
+                    int(v)
+                    for v in repo_info.safetensors["parameters"].values()
+                    if isinstance(v, int) or (isinstance(v, str) and v.isdigit())
+                )
+            num_params = max(num_params_candidates)
         elif (
             hasattr(self._model.config, "num_params")
             and self._model.config.num_params is not None

@@ -1146,7 +1161,7 @@ def setup_model_for_question_answering(model: "PreTrainedModel") -> "PreTrainedModel":
             "The token type embeddings of the model do not have a `data` "
             "attribute, which is needed to modify the embeddings."
         )
-    token_type_embeddings.weight.data = torch.cat(
+    token_type_embeddings.weight.data = torch.cat(
         (
             token_type_embedding_tensor,
             torch.rand_like(token_type_embedding_tensor),

scandeval/benchmark_modules/litellm.py
CHANGED

@@ -110,7 +110,7 @@ VOCAB_SIZE_MAPPING = {
     # Anthropic models
     r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
     # Gemini models
-    r"(gemini/)?gemini-[1-9]\.[0-9]
+    r"(gemini/)?gemini-[1-9](\.[0-9])?-(flash|pro).*": 256_128,
     # xAI models
     r"(xai/)?grok.*": -1,
 }

@@ -136,7 +136,7 @@ MODEL_MAX_LENGTH_MAPPING = {
     # Gemini models
     r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
     r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
-    r"(gemini/)?gemini-
+    r"(gemini/)?gemini-[23](\.[05])?.*": 1_048_576,
     # xAI models
     r"(xai/)?grok.*": 131_072,
 }

@@ -152,7 +152,7 @@ NUM_PARAMS_MAPPING = {
     # Gemini models
     r"(gemini/)?gemini-1.5-flash-8b": 8_000_000_000,
     r"(gemini/)?gemini-1.5-flash-[0-9]+": -1,
-    r"(gemini/)?gemini-
+    r"(gemini/)?gemini-[23](.[05])?.*": -1,
     # xAI models
     r"(xai/)?grok.*": -1,
 }

@@ -208,8 +208,8 @@ class LiteLLMModel(BenchmarkModule):
         "thinking",
     ],
     # Gemini models
-    re.compile(r"(gemini/)?gemini-2
-    re.compile(r"(gemini/)?gemini-2
+    re.compile(r"(gemini/)?gemini-2\.5-flash-lite.*"): ["no-thinking", "thinking"],
+    re.compile(r"(gemini/)?gemini-(2\.5|3)-flash.*"): ["no-thinking", "thinking"],
     # xAI models
     re.compile(r"(xai/)?grok-3-mini(-fast)?(-beta)?"): ["low", "medium", "high"],
 }

@@ -517,6 +517,7 @@ class LiteLLMModel(BenchmarkModule):
         response_format_messages = [
             "got an unexpected keyword argument 'response_format'",
             "the model returned empty outputs",
+            "'maxitems' is not supported",
         ]

         if (

@@ -838,14 +839,14 @@ class LiteLLMModel(BenchmarkModule):
         ]

         # Close connections
-
-
-
-
-
-
-
-
+        semaphore.release()
+        router.reset()
+        try:
+            loop = asyncio.get_event_loop()
+            if not loop.is_closed():
+                loop.close()
+        except RuntimeError:
+            pass  # Already closed

         return successes, failures

scandeval/benchmark_modules/vllm.py
CHANGED

@@ -15,13 +15,14 @@ from time import sleep
 import torch
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
+from transformers.generation.configuration_utils import GenerationConfig
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
-from transformers.tokenization_mistral_common import MistralCommonTokenizer
 from urllib3.exceptions import RequestError

 from ..constants import (
     CUSTOM_STOP_TOKENS,
+    GENERATION_KWARGS,
     GENERATIVE_PIPELINE_TAGS,
     MAX_CONTEXT_LENGTH,
     MAX_VLLM_LOGPROBS,

@@ -81,6 +82,13 @@ from ..utils import (
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config

+try:
+    from transformers.tokenization_mistral_common import MistralCommonTokenizer
+except ImportError:
+    from transformers.tokenization_mistral_common import (
+        MistralCommonBackend as MistralCommonTokenizer,
+    )
+
 if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
     from vllm import LLM, SamplingParams  # type: ignore[missing-import]
     from vllm.distributed.parallel_state import (  # type: ignore[missing-import]

@@ -92,6 +100,10 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
         StructuredOutputsParams,
     )

+if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
+    import ray  # type: ignore[missing-import]
+
+
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
     from transformers.trainer import Trainer

@@ -100,10 +112,11 @@ if t.TYPE_CHECKING:


 MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS: dict[re.Pattern, str] = {
-    re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "
-    re.compile(r"google/gemma-3-1b.*", flags=re.IGNORECASE): "
-    re.compile(r"google/gemma-3n.*", flags=re.IGNORECASE): "
+    re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "TRITON_ATTN",
+    re.compile(r"google/gemma-3-1b.*", flags=re.IGNORECASE): "TRITON_ATTN",
+    re.compile(r"google/gemma-3n.*", flags=re.IGNORECASE): "TRITON_ATTN",
     re.compile(r"google/gemma-3-(4|12|27)b.*", flags=re.IGNORECASE): "TRITON_ATTN",
+    re.compile(r"PleIAs/Pleias-3b-Preview", flags=re.IGNORECASE): "TRITON_ATTN",
 }

@@ -478,6 +491,41 @@ class VLLMModel(HuggingFaceEncoderModel):
         )

         # Define the parameters used for vLLM generation
+        generation_kwargs = GENERATION_KWARGS.copy()
+        if (generation_config := self.model_config.generation_config) is not None:
+            changed_params = generation_config.to_diff_dict()
+            if "temperature" in changed_params:
+                temperature = changed_params["temperature"]
+                generation_kwargs["temperature"] = temperature
+                log_once(
+                    f"Using temperature={temperature} with the model "
+                    f"{self.model_config.model_id!r} as specified in its "
+                    "generation configuration."
+                )
+            if "top_p" in changed_params:
+                top_p = changed_params["top_p"]
+                generation_kwargs["top_p"] = top_p
+                log_once(
+                    f"Using top_p={top_p} with the model "
+                    f"{self.model_config.model_id!r} as specified in its "
+                    "generation configuration."
+                )
+            if "top_k" in changed_params:
+                top_k = changed_params["top_k"]
+                generation_kwargs["top_k"] = top_k
+                log_once(
+                    f"Using top_k={top_k} with the model "
+                    f"{self.model_config.model_id!r} as specified in its "
+                    "generation configuration."
+                )
+            if "repetition_penalty" in changed_params:
+                repetition_penalty = changed_params["repetition_penalty"]
+                generation_kwargs["repetition_penalty"] = repetition_penalty
+                log_once(
+                    f"Using repetition_penalty={repetition_penalty} with the model "
+                    f"{self.model_config.model_id!r} as specified in its "
+                    "generation configuration."
+                )
         max_tokens: int = (
             REASONING_MAX_TOKENS
             if self.generative_type == GenerativeType.REASONING

@@ -488,7 +536,10 @@ class VLLMModel(HuggingFaceEncoderModel):
             logprobs=MAX_VLLM_LOGPROBS
             if self.buffer["first_label_token_mapping"]
             else None,
-            temperature=
+            temperature=generation_kwargs["temperature"],
+            top_p=generation_kwargs["top_p"],
+            top_k=generation_kwargs["top_k"],
+            repetition_penalty=generation_kwargs["repetition_penalty"],
             stop=[stop_token for stop_token in stop_tokens if stop_token],
             structured_outputs=structured_outputs,
         )

@@ -762,6 +813,16 @@ class VLLMModel(HuggingFaceEncoderModel):
         if model_info is None:
             raise InvalidModel(f"The model {model_id!r} could not be found.")

+        try:
+            generation_config = GenerationConfig.from_pretrained(
+                pretrained_model_name=model_id_components.model_id,
+                revision=model_id_components.revision,
+                cache_dir=benchmark_config.cache_dir,
+                token=benchmark_config.api_key,
+            )
+        except OSError:
+            generation_config = None
+
         language_mapping = get_all_languages()
         language_codes = list(language_mapping.keys())

@@ -783,6 +844,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 cache_dir=benchmark_config.cache_dir, model_id=model_id
             ),
             adapter_base_model_id=model_info.adapter_base_model_id,
+            generation_config=generation_config,
         )

         return model_config

@@ -950,6 +1012,10 @@ def load_model_and_tokeniser(

     clear_vllm()

+    distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size = (
+        select_backend_and_parallelism()
+    )
+
     try:
         model = LLM(
             model=(

@@ -968,8 +1034,9 @@ def load_model_and_tokeniser(
             trust_remote_code=benchmark_config.trust_remote_code,
             revision=revision,
             seed=4242,
-            distributed_executor_backend=
-            tensor_parallel_size=
+            distributed_executor_backend=distributed_executor_backend,
+            tensor_parallel_size=tensor_parallel_size,
+            pipeline_parallel_size=pipeline_parallel_size,
             disable_custom_all_reduce=True,
             quantization=quantization,
             dtype=dtype,

@@ -1005,8 +1072,8 @@ def load_model_and_tokeniser(
             "Since you're running in verbose mode, you might see a descriptive "
             "error above already. Note however that if the error message urges "
             "you to set the environment variable `VLLM_ATTENTION_BACKEND` to "
-            "'FLEX_ATTENTION', please try setting it to '
-            "that often solves the issue, whereas 'FLEX_ATTENTION' usually "
+            "'FLEX_ATTENTION', please try setting it to 'TRITON_ATTN' first, "
+            "as that often solves the issue, whereas 'FLEX_ATTENTION' usually "
             "doesn't. If you don't see any descriptive error above, then you "
             "can try "
         )

@@ -1372,3 +1439,54 @@ def get_vllm_tokenisation_params(
         config_format=config_format,
         load_format=load_format,
     )
+
+
+def select_backend_and_parallelism() -> tuple[str, int, int]:
+    """Determine the distributed backend and parallelism for vLLM.
+
+    Returns:
+        Tuple containing:
+            - backend (str): "ray" if multi-node Ray is available, else "mp".
+            - tensor_parallel_size (int): Number of GPUs per node.
+            - pipeline_parallel_size (int): Number of stages across nodes.
+    """
+    if not ray.is_initialized():
+        try:
+            ray.init(address="auto", ignore_reinit_error=True)
+        except Exception as e:
+            log_once(
+                f"Ray initialisation failed with a {type(e)} exception: {e}",
+                level=logging.DEBUG,
+            )
+
+    is_ray = ray.is_initialized()
+    local_gpu_count = torch.cuda.device_count()
+
+    if is_ray:
+        resources = ray.cluster_resources()
+        total_gpus = int(resources.get("GPU", 0))
+    else:
+        total_gpus = local_gpu_count
+
+    using_multiple_nodes = total_gpus > local_gpu_count
+    if is_ray and using_multiple_nodes:
+        distributed_executor_backend = "ray"
+        tensor_parallel_size = local_gpu_count if local_gpu_count > 0 else 1
+        pipeline_parallel_size = max(1, total_gpus // tensor_parallel_size)
+        log_once(
+            f"Detected a multi-node setup with {pipeline_parallel_size:,} nodes, each "
+            "with {tensor_parallel_size:,} GPUs, so using `ray` as the "
+            "distributed backend.",
+            level=logging.DEBUG,
+        )
+    else:
+        distributed_executor_backend = "mp"
+        tensor_parallel_size = local_gpu_count if local_gpu_count > 0 else 1
+        pipeline_parallel_size = 1
+        log_once(
+            f"Detected a single-node setup with {tensor_parallel_size:,} GPUs, "
+            "so using the multiprocessing distributed backend.",
+            level=logging.DEBUG,
+        )
+
+    return distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size
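As a rough illustration of the new `select_backend_and_parallelism` helper above, the backend and parallelism values it returns are derived purely from GPU counts; the two-node, 4-GPU-per-node cluster below is hypothetical, not taken from the diff:

    # Illustrative sketch of the arithmetic in select_backend_and_parallelism();
    # the GPU counts are hypothetical example values, not from the diff.
    local_gpu_count = 4   # what torch.cuda.device_count() would report on one node
    total_gpus = 8        # what ray.cluster_resources().get("GPU", 0) would report

    tensor_parallel_size = local_gpu_count if local_gpu_count > 0 else 1  # -> 4
    pipeline_parallel_size = max(1, total_gpus // tensor_parallel_size)   # -> 2
    backend = "ray" if total_gpus > local_gpu_count else "mp"             # -> "ray"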
scandeval/benchmarker.py
CHANGED

@@ -12,7 +12,6 @@ from pathlib import Path
 from shutil import rmtree
 from time import sleep

-from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
 from torch.distributed import destroy_process_group

 from .benchmark_config_factory import build_benchmark_config

@@ -32,7 +31,6 @@ from .speed_benchmark import benchmark_speed
 from .tasks import SPEED
 from .utils import (
     enforce_reproducibility,
-    get_package_version,
     internet_connection_available,
     split_model_id,
 )

@@ -194,15 +192,6 @@ class Benchmarker:
             msg += "the argument `download_only` was set to True."
             raise ValueError(msg)

-        # Bail early if hf_transfer is enabled but not installed.
-        if HF_HUB_ENABLE_HF_TRANSFER and get_package_version("hf_transfer") is None:
-            raise ImportError(
-                "Fast download using 'hf_transfer' is enabled "
-                "(HF_HUB_ENABLE_HF_TRANSFER=1) but the 'hf_transfer' "
-                "package is not available in your environment. "
-                "Try installing it with `pip install hf_transfer`."
-            )
-
         # Deprecation warnings
         if batch_size is not None:
             if run_with_cli:
scandeval/cli.py
CHANGED

@@ -37,26 +37,6 @@ from .languages import get_all_languages
     help="""The languages to benchmark, both for models and datasets. If "all" then all
     models will be benchmarked on all datasets.""",
 )
-@click.option(
-    "--model-language",
-    "-ml",
-    default=None,
-    show_default=True,
-    multiple=True,
-    metavar="ISO 639-1 LANGUAGE CODE",
-    type=click.Choice(["all"] + list(get_all_languages().keys())),
-    help="""This option is deprecated - please use --language instead.""",
-)
-@click.option(
-    "--dataset-language",
-    "-dl",
-    default=None,
-    show_default=True,
-    multiple=True,
-    metavar="ISO 639-1 LANGUAGE CODE",
-    type=click.Choice(["all"] + list(get_all_languages().keys())),
-    help="""This option is deprecated - please use --language instead.""",
-)
 @click.option(
     "--dataset",
     default=None,

@@ -65,13 +45,6 @@ from .languages import get_all_languages
     help="""The name of the benchmark dataset. We recommend to use the `task` and
     `language` options instead of this option.""",
 )
-@click.option(
-    "--batch-size",
-    default=None,
-    type=click.Choice(["1", "2", "4", "8", "16", "32"]),
-    help="This option is deprecated - please use --finetuning-batch-size instead.",
-    deprecated=True,
-)
 @click.option(
     "--finetuning-batch-size",
     default="32",

@@ -197,14 +170,6 @@ from .languages import get_all_languages
     "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
     "if you are running out of GPU memory. Only relevant if the model is generative.",
 )
-@click.option(
-    "--debug/--no-debug",
-    default=False,
-    show_default=True,
-    help="Whether to run the benchmark in debug mode. This prints out extra "
-    "information and stores all outputs to the current working directory. Only "
-    "relevant if the model is generative.",
-)
 @click.option(
     "--requires-safetensors",
     is_flag=True,

@@ -232,15 +197,47 @@ from .languages import get_all_languages
     help="Only download the requested model weights and datasets, and exit.",
     default=False,
 )
+@click.option(
+    "--debug/--no-debug",
+    default=False,
+    show_default=True,
+    help="Whether to run the benchmark in debug mode. This prints out extra "
+    "information and stores all outputs to the current working directory. Only "
+    "relevant if the model is generative.",
+)
+@click.option(
+    "--model-language",
+    "-ml",
+    default=None,
+    show_default=True,
+    multiple=True,
+    metavar="ISO 639-1 LANGUAGE CODE",
+    type=click.Choice(["all"] + list(get_all_languages().keys())),
+    help="""This option is deprecated - please use --language instead.""",
+)
+@click.option(
+    "--dataset-language",
+    "-dl",
+    default=None,
+    show_default=True,
+    multiple=True,
+    metavar="ISO 639-1 LANGUAGE CODE",
+    type=click.Choice(["all"] + list(get_all_languages().keys())),
+    help="""This option is deprecated - please use --language instead.""",
+)
+@click.option(
+    "--batch-size",
+    default=None,
+    type=click.Choice(["1", "2", "4", "8", "16", "32"]),
+    help="This option is deprecated - please use --finetuning-batch-size instead.",
+    deprecated=True,
+)
 def benchmark(
     model: tuple[str],
     dataset: tuple[str | DatasetConfig],
     language: tuple[str],
-    model_language: tuple[str],
-    dataset_language: tuple[str],
     raise_errors: bool,
     task: tuple[str],
-    batch_size: str | None,
     finetuning_batch_size: str,
     progress_bar: bool,
     save_results: bool,

@@ -257,11 +254,14 @@ def benchmark(
     api_base: str | None,
     api_version: str | None,
     gpu_memory_utilization: float,
-    debug: bool,
     requires_safetensors: bool,
     generative_type: str | None,
     custom_datasets_file: Path,
     download_only: bool,
+    debug: bool,
+    model_language: tuple[str],
+    dataset_language: tuple[str],
+    batch_size: str | None,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
     Benchmarker(
scandeval/constants.py
CHANGED

@@ -96,3 +96,12 @@ NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10

 # We only allow loading local datasets in these file formats
 SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS = ["csv"]
+
+# These are default generation parameters, and can be overridden if a generative model
+# has a `generation_config.json` file in its repository
+GENERATION_KWARGS = {
+    "temperature": 0.0,
+    "top_p": 1.0,
+    "top_k": 0,
+    "repetition_penalty": 1.0,
+}
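For reference, a minimal sketch of how these defaults are meant to interact with a model's `generation_config.json`, mirroring the override logic added in `vllm.py` above (the sample temperature is a hypothetical value):

    # Hypothetical example: a model repository ships a generation_config.json
    # with a non-default temperature, which overrides the benchmark default.
    defaults = dict(GENERATION_KWARGS)
    changed_params = {"temperature": 0.6}  # e.g. from GenerationConfig.to_diff_dict()
    generation_kwargs = {
        **defaults,
        **{k: v for k, v in changed_params.items() if k in defaults},
    }
    # generation_kwargs["temperature"] is now 0.6; the other defaults are unchanged.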
scandeval/data_models.py
CHANGED

@@ -10,6 +10,7 @@ from pathlib import Path

 import pydantic
 import torch
+from transformers.generation.configuration_utils import GenerationConfig

 from .enums import Device, GenerativeType, ModelType, TaskGroup
 from .exceptions import InvalidBenchmark

@@ -709,6 +710,9 @@ class ModelConfig:
         adapter_base_model_id:
             The model ID of the base model if the model is an adapter model. Can be None
             if the model is not an adapter model.
+        generation_config (optional):
+            The generation configuration for generative models, if specified in the
+            model repository. Defaults to no generation configuration.
     """

     model_id: str

@@ -722,6 +726,7 @@ class ModelConfig:
     fresh: bool
     model_cache_dir: str
     adapter_base_model_id: str | None
+    generation_config: GenerationConfig | None = None

     def __hash__(self) -> int:
         """Return a hash of the model configuration."""

scandeval/dataset_configs/__init__.py
CHANGED

@@ -6,6 +6,7 @@ from ..data_models import DatasetConfig
 from ..languages import get_all_languages
 from ..tasks import SPEED
 from ..utils import load_custom_datasets_module
+from .albanian import *  # noqa: F403
 from .bosnian import *  # noqa: F403
 from .bulgarian import *  # noqa: F403
 from .catalan import *  # noqa: F403

scandeval/dataset_configs/albanian.py
ADDED

@@ -0,0 +1,64 @@
+"""All Albanian dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import ALBANIAN
+from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+MMS_SQ_CONFIG = DatasetConfig(
+    name="mms-sq",
+    pretty_name="MMS-sq",
+    source="EuroEval/mms-sq-mini",
+    task=SENT,
+    languages=[ALBANIAN],
+)
+
+SCALA_SQ_CONFIG = DatasetConfig(
+    name="scala-sq",
+    pretty_name="ScaLA-sq",
+    source="EuroEval/scala-sq",
+    task=LA,
+    languages=[ALBANIAN],
+)
+
+WIKIANN_SQ_CONFIG = DatasetConfig(
+    name="wikiann-sq",
+    pretty_name="WikiANN-sq",
+    source="EuroEval/wikiann-sq-mini",
+    task=NER,
+    languages=[ALBANIAN],
+)
+
+MULTI_WIKI_QA_SQ_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-sq",
+    pretty_name="MultiWikiQA-sq",
+    source="EuroEval/multi-wiki-qa-sq-mini",
+    task=RC,
+    languages=[ALBANIAN],
+)
+
+LR_SUM_SQ_CONFIG = DatasetConfig(
+    name="lr-sum-sq",
+    pretty_name="LRSum-sq",
+    source="EuroEval/lr-sum-sq-mini",
+    task=SUMM,
+    languages=[ALBANIAN],
+)
+
+GLOBAL_MMLU_LITE_SQ_CONFIG = DatasetConfig(
+    name="global-mmlu-lite-sq",
+    pretty_name="GlobalMMLULite-sq",
+    source="EuroEval/global-mmlu-lite-sq",
+    task=KNOW,
+    languages=[ALBANIAN],
+)
+
+WINOGRANDE_SQ_CONFIG = DatasetConfig(
+    name="winogrande-sq",
+    pretty_name="Winogrande-sq",
+    source="EuroEval/winogrande-sq",
+    task=COMMON_SENSE,
+    languages=[ALBANIAN],
+    _labels=["a", "b"],
+)

scandeval/dataset_configs/dutch.py
CHANGED

@@ -2,7 +2,18 @@

 from ..data_models import DatasetConfig
 from ..languages import DUTCH
-from ..tasks import
+from ..tasks import (
+    COMMON_SENSE,
+    EUROPEAN_VALUES,
+    KNOW,
+    LA,
+    MCRC,
+    NER,
+    RC,
+    SENT,
+    SIMPL,
+    SUMM,
+)

 ### Official datasets ###


@@ -122,6 +133,16 @@ MULTI_WIKI_QA_NL_CONFIG = DatasetConfig(
     unofficial=True,
 )

+COPA_NL_CONFIG = DatasetConfig(
+    name="copa-nl",
+    pretty_name="COPA-nl",
+    source="EuroEval/copa-nl",
+    task=COMMON_SENSE,
+    languages=[DUTCH],
+    unofficial=True,
+    _labels=["a", "b"],
+)
+
 GOLDENSWAG_NL_CONFIG = DatasetConfig(
     name="goldenswag-nl",
     pretty_name="GoldenSwag-nl",

@@ -140,3 +161,12 @@ WINOGRANDE_NL_CONFIG = DatasetConfig(
     _labels=["a", "b"],
     unofficial=True,
 )
+
+DUIDELIJKE_TAAL_NL_CONFIG = DatasetConfig(
+    name="duidelijke-taal",
+    pretty_name="Duidelijke Taal",
+    source="EuroEval/duidelijke-taal",
+    task=SIMPL,
+    languages=[DUTCH],
+    unofficial=True,
+)

scandeval/dataset_configs/swedish.py
CHANGED

@@ -139,3 +139,12 @@ SKOLPROV_CONFIG = DatasetConfig(
     languages=[SWEDISH],
     unofficial=True,
 )
+
+SWEDISH_FACTS_CONFIG = DatasetConfig(
+    name="swedish-facts",
+    pretty_name="Swedish Facts",
+    source="EuroEval/swedish-facts",
+    task=KNOW,
+    languages=[SWEDISH],
+    unofficial=True,
+)
scandeval/logging_utils.py
CHANGED

@@ -140,6 +140,7 @@ def block_terminal_output() -> None:
     logging.getLogger("openai").setLevel(logging.CRITICAL)
     logging.getLogger("httpx").setLevel(logging.CRITICAL)
     litellm.suppress_debug_info = True  # type: ignore[bad-assignment]
+    litellm.turn_off_message_logging = True

     # Disable vLLM logging
     logging.getLogger("vllm").setLevel(logging.CRITICAL)