ScandEval 16.8.0__py3-none-any.whl → 16.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. scandeval/benchmark_modules/hf.py +18 -3
  2. scandeval/benchmark_modules/litellm.py +14 -13
  3. scandeval/benchmark_modules/vllm.py +127 -9
  4. scandeval/benchmarker.py +0 -11
  5. scandeval/cli.py +39 -39
  6. scandeval/constants.py +9 -0
  7. scandeval/data_models.py +5 -0
  8. scandeval/dataset_configs/__init__.py +1 -0
  9. scandeval/dataset_configs/albanian.py +64 -0
  10. scandeval/dataset_configs/dutch.py +31 -1
  11. scandeval/dataset_configs/swedish.py +9 -0
  12. scandeval/logging_utils.py +1 -0
  13. scandeval/metrics/huggingface.py +82 -0
  14. scandeval/metrics/llm_as_a_judge.py +1 -3
  15. scandeval/model_config.py +2 -2
  16. scandeval/prompt_templates/__init__.py +1 -0
  17. scandeval/prompt_templates/linguistic_acceptability.py +9 -0
  18. scandeval/prompt_templates/multiple_choice.py +9 -0
  19. scandeval/prompt_templates/named_entity_recognition.py +20 -0
  20. scandeval/prompt_templates/reading_comprehension.py +9 -0
  21. scandeval/prompt_templates/sentiment_classification.py +11 -0
  22. scandeval/prompt_templates/simplification.py +23 -0
  23. scandeval/prompt_templates/summarization.py +11 -0
  24. scandeval/task_group_utils/question_answering.py +30 -19
  25. scandeval/task_group_utils/sequence_classification.py +4 -4
  26. scandeval/task_group_utils/text_to_text.py +3 -4
  27. scandeval/task_group_utils/token_classification.py +6 -8
  28. scandeval/tasks.py +11 -0
  29. scandeval/tokenisation_utils.py +7 -1
  30. scandeval/types.py +7 -1
  31. scandeval/utils.py +5 -6
  32. {scandeval-16.8.0.dist-info → scandeval-16.10.0.dist-info}/METADATA +21 -3
  33. {scandeval-16.8.0.dist-info → scandeval-16.10.0.dist-info}/RECORD +36 -34
  34. {scandeval-16.8.0.dist-info → scandeval-16.10.0.dist-info}/WHEEL +1 -1
  35. {scandeval-16.8.0.dist-info → scandeval-16.10.0.dist-info}/entry_points.txt +0 -0
  36. {scandeval-16.8.0.dist-info → scandeval-16.10.0.dist-info}/licenses/LICENSE +0 -0
scandeval/benchmark_modules/hf.py CHANGED
@@ -33,7 +33,6 @@ from transformers.modelcard import TASK_MAPPING
  from transformers.modeling_utils import PreTrainedModel
  from transformers.models.auto.configuration_auto import AutoConfig
  from transformers.models.auto.tokenization_auto import AutoTokenizer
- from transformers.tokenization_mistral_common import MistralCommonTokenizer
  from transformers.tokenization_utils_base import PreTrainedTokenizerBase
  from transformers.trainer import Trainer
  from urllib3.exceptions import RequestError
@@ -80,6 +79,13 @@ from ..utils import (
  )
  from .base import BenchmarkModule

+ try:
+     from transformers.tokenization_mistral_common import MistralCommonTokenizer
+ except ImportError:
+     from transformers.tokenization_mistral_common import (
+         MistralCommonBackend as MistralCommonTokenizer,
+     )
+
  if t.TYPE_CHECKING:
      from transformers.configuration_utils import PretrainedConfig
      from transformers.tokenization_utils import PreTrainedTokenizer
@@ -175,7 +181,16 @@ class HuggingFaceEncoderModel(BenchmarkModule):
      and repo_info.safetensors is not None
      and "total" in repo_info.safetensors
  ):
-     num_params = repo_info.safetensors["total"]
+     num_params_candidates: list[int] = [repo_info.safetensors["total"]]
+     if "parameters" in repo_info.safetensors and isinstance(
+         repo_info.safetensors["parameters"], dict
+     ):
+         num_params_candidates.extend(
+             int(v)
+             for v in repo_info.safetensors["parameters"].values()
+             if isinstance(v, int) or (isinstance(v, str) and v.isdigit())
+         )
+     num_params = max(num_params_candidates)
  elif (
      hasattr(self._model.config, "num_params")
      and self._model.config.num_params is not None
@@ -1146,7 +1161,7 @@ def setup_model_for_question_answering(model: "PreTrainedModel") -> "PreTrainedM
      "The token type embeddings of the model do not have a `data` "
      "attribute, which is needed to modify the embeddings."
  )
- token_type_embeddings.weight.data = torch.cat(  # type: ignore[missing-attribute]
+ token_type_embeddings.weight.data = torch.cat(
      (
          token_type_embedding_tensor,
          torch.rand_like(token_type_embedding_tensor),
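The parameter-count change above prefers the largest value reported in the safetensors metadata instead of always trusting "total". A minimal standalone sketch of that selection logic, assuming a made-up metadata dict shaped like the Hugging Face Hub safetensors info (not taken from the diff):

    # Hypothetical metadata where "total" undercounts and the per-dtype breakdown is larger.
    safetensors = {"total": 7_000_000_000, "parameters": {"BF16": "7241732096"}}

    candidates: list[int] = [safetensors["total"]]
    if isinstance(safetensors.get("parameters"), dict):
        candidates.extend(
            int(v)
            for v in safetensors["parameters"].values()
            if isinstance(v, int) or (isinstance(v, str) and v.isdigit())
        )
    num_params = max(candidates)  # -> 7_241_732_096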
scandeval/benchmark_modules/litellm.py CHANGED
@@ -110,7 +110,7 @@ VOCAB_SIZE_MAPPING = {
      # Anthropic models
      r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
      # Gemini models
-     r"(gemini/)?gemini-[1-9]\.[0-9]-(flash|pro).*": 256_128,
+     r"(gemini/)?gemini-[1-9](\.[0-9])?-(flash|pro).*": 256_128,
      # xAI models
      r"(xai/)?grok.*": -1,
  }
@@ -136,7 +136,7 @@ MODEL_MAX_LENGTH_MAPPING = {
      # Gemini models
      r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
      r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
-     r"(gemini/)?gemini-2\.(0|5).*": 1_048_576,
+     r"(gemini/)?gemini-[23](\.[05])?.*": 1_048_576,
      # xAI models
      r"(xai/)?grok.*": 131_072,
  }
@@ -152,7 +152,7 @@ NUM_PARAMS_MAPPING = {
      # Gemini models
      r"(gemini/)?gemini-1.5-flash-8b": 8_000_000_000,
      r"(gemini/)?gemini-1.5-flash-[0-9]+": -1,
-     r"(gemini/)?gemini-2.(0|5).*": -1,
+     r"(gemini/)?gemini-[23](.[05])?.*": -1,
      # xAI models
      r"(xai/)?grok.*": -1,
  }
@@ -208,8 +208,8 @@ class LiteLLMModel(BenchmarkModule):
          "thinking",
      ],
      # Gemini models
-     re.compile(r"(gemini/)?gemini-2.5-flash-lite.*"): ["no-thinking", "thinking"],
-     re.compile(r"(gemini/)?gemini-2.5-flash.*"): ["no-thinking", "thinking"],
+     re.compile(r"(gemini/)?gemini-2\.5-flash-lite.*"): ["no-thinking", "thinking"],
+     re.compile(r"(gemini/)?gemini-(2\.5|3)-flash.*"): ["no-thinking", "thinking"],
      # xAI models
      re.compile(r"(xai/)?grok-3-mini(-fast)?(-beta)?"): ["low", "medium", "high"],
  }
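The relaxed Gemini patterns above now also cover model IDs that carry only a major version. A minimal sketch of the effect on the vocabulary-size pattern, assuming illustrative model IDs:

    import re

    OLD = r"(gemini/)?gemini-[1-9]\.[0-9]-(flash|pro).*"
    NEW = r"(gemini/)?gemini-[1-9](\.[0-9])?-(flash|pro).*"

    for model_id in ["gemini-1.5-pro", "gemini-3-flash-preview"]:
        # The old pattern requires a ".<minor>" part; the new one makes it optional.
        print(model_id, bool(re.fullmatch(OLD, model_id)), bool(re.fullmatch(NEW, model_id)))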
@@ -517,6 +517,7 @@ class LiteLLMModel(BenchmarkModule):
      response_format_messages = [
          "got an unexpected keyword argument 'response_format'",
          "the model returned empty outputs",
+         "'maxitems' is not supported",
      ]

      if (
@@ -838,14 +839,14 @@ class LiteLLMModel(BenchmarkModule):
      ]

      # Close connections
-     for request in requests:
-         if hasattr(request, "close"):
-             try:
-                 request.close()
-             except RuntimeError as e:
-                 log(
-                     f"RuntimeError during request.close(): {e}", level=logging.DEBUG
-                 )
+     semaphore.release()
+     router.reset()
+     try:
+         loop = asyncio.get_event_loop()
+         if not loop.is_closed():
+             loop.close()
+     except RuntimeError:
+         pass  # Already closed

      return successes, failures

scandeval/benchmark_modules/vllm.py CHANGED
@@ -15,13 +15,14 @@ from time import sleep
  import torch
  from huggingface_hub import snapshot_download
  from pydantic import conlist, create_model
+ from transformers.generation.configuration_utils import GenerationConfig
  from transformers.models.auto.configuration_auto import AutoConfig
  from transformers.models.auto.tokenization_auto import AutoTokenizer
- from transformers.tokenization_mistral_common import MistralCommonTokenizer
  from urllib3.exceptions import RequestError

  from ..constants import (
      CUSTOM_STOP_TOKENS,
+     GENERATION_KWARGS,
      GENERATIVE_PIPELINE_TAGS,
      MAX_CONTEXT_LENGTH,
      MAX_VLLM_LOGPROBS,
@@ -81,6 +82,13 @@ from ..utils import (
  )
  from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config

+ try:
+     from transformers.tokenization_mistral_common import MistralCommonTokenizer
+ except ImportError:
+     from transformers.tokenization_mistral_common import (
+         MistralCommonBackend as MistralCommonTokenizer,
+     )
+
  if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
      from vllm import LLM, SamplingParams  # type: ignore[missing-import]
      from vllm.distributed.parallel_state import (  # type: ignore[missing-import]
@@ -92,6 +100,10 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
          StructuredOutputsParams,
      )

+ if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
+     import ray  # type: ignore[missing-import]
+
+
  if t.TYPE_CHECKING:
      from datasets import DatasetDict
      from transformers.trainer import Trainer
@@ -100,10 +112,11 @@ if t.TYPE_CHECKING:


  MODELS_REQUIRING_CUSTOM_ATTENTION_BACKENDS: dict[re.Pattern, str] = {
-     re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "FLASH_ATTN",
-     re.compile(r"google/gemma-3-1b.*", flags=re.IGNORECASE): "FLASH_ATTN",
-     re.compile(r"google/gemma-3n.*", flags=re.IGNORECASE): "FLASH_ATTN",
+     re.compile(r".*gpt-oss.*", flags=re.IGNORECASE): "TRITON_ATTN",
+     re.compile(r"google/gemma-3-1b.*", flags=re.IGNORECASE): "TRITON_ATTN",
+     re.compile(r"google/gemma-3n.*", flags=re.IGNORECASE): "TRITON_ATTN",
      re.compile(r"google/gemma-3-(4|12|27)b.*", flags=re.IGNORECASE): "TRITON_ATTN",
+     re.compile(r"PleIAs/Pleias-3b-Preview", flags=re.IGNORECASE): "TRITON_ATTN",
  }

@@ -478,6 +491,41 @@ class VLLMModel(HuggingFaceEncoderModel):
      )

      # Define the parameters used for vLLM generation
+     generation_kwargs = GENERATION_KWARGS.copy()
+     if (generation_config := self.model_config.generation_config) is not None:
+         changed_params = generation_config.to_diff_dict()
+         if "temperature" in changed_params:
+             temperature = changed_params["temperature"]
+             generation_kwargs["temperature"] = temperature
+             log_once(
+                 f"Using temperature={temperature} with the model "
+                 f"{self.model_config.model_id!r} as specified in its "
+                 "generation configuration."
+             )
+         if "top_p" in changed_params:
+             top_p = changed_params["top_p"]
+             generation_kwargs["top_p"] = top_p
+             log_once(
+                 f"Using top_p={top_p} with the model "
+                 f"{self.model_config.model_id!r} as specified in its "
+                 "generation configuration."
+             )
+         if "top_k" in changed_params:
+             top_k = changed_params["top_k"]
+             generation_kwargs["top_k"] = top_k
+             log_once(
+                 f"Using top_k={top_k} with the model "
+                 f"{self.model_config.model_id!r} as specified in its "
+                 "generation configuration."
+             )
+         if "repetition_penalty" in changed_params:
+             repetition_penalty = changed_params["repetition_penalty"]
+             generation_kwargs["repetition_penalty"] = repetition_penalty
+             log_once(
+                 f"Using repetition_penalty={repetition_penalty} with the model "
+                 f"{self.model_config.model_id!r} as specified in its "
+                 "generation configuration."
+             )
      max_tokens: int = (
          REASONING_MAX_TOKENS
          if self.generative_type == GenerativeType.REASONING
@@ -488,7 +536,10 @@ class VLLMModel(HuggingFaceEncoderModel):
          logprobs=MAX_VLLM_LOGPROBS
          if self.buffer["first_label_token_mapping"]
          else None,
-         temperature=0.0,
+         temperature=generation_kwargs["temperature"],
+         top_p=generation_kwargs["top_p"],
+         top_k=generation_kwargs["top_k"],
+         repetition_penalty=generation_kwargs["repetition_penalty"],
          stop=[stop_token for stop_token in stop_tokens if stop_token],
          structured_outputs=structured_outputs,
      )
@@ -762,6 +813,16 @@ class VLLMModel(HuggingFaceEncoderModel):
      if model_info is None:
          raise InvalidModel(f"The model {model_id!r} could not be found.")

+     try:
+         generation_config = GenerationConfig.from_pretrained(
+             pretrained_model_name=model_id_components.model_id,
+             revision=model_id_components.revision,
+             cache_dir=benchmark_config.cache_dir,
+             token=benchmark_config.api_key,
+         )
+     except OSError:
+         generation_config = None
+
      language_mapping = get_all_languages()
      language_codes = list(language_mapping.keys())

@@ -783,6 +844,7 @@ class VLLMModel(HuggingFaceEncoderModel):
          cache_dir=benchmark_config.cache_dir, model_id=model_id
      ),
      adapter_base_model_id=model_info.adapter_base_model_id,
+     generation_config=generation_config,
  )

  return model_config
@@ -950,6 +1012,10 @@ def load_model_and_tokeniser(

  clear_vllm()

+ distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size = (
+     select_backend_and_parallelism()
+ )
+
  try:
      model = LLM(
          model=(
@@ -968,8 +1034,9 @@ def load_model_and_tokeniser(
          trust_remote_code=benchmark_config.trust_remote_code,
          revision=revision,
          seed=4242,
-         distributed_executor_backend="mp",
-         tensor_parallel_size=torch.cuda.device_count(),
+         distributed_executor_backend=distributed_executor_backend,
+         tensor_parallel_size=tensor_parallel_size,
+         pipeline_parallel_size=pipeline_parallel_size,
          disable_custom_all_reduce=True,
          quantization=quantization,
          dtype=dtype,
@@ -1005,8 +1072,8 @@ def load_model_and_tokeniser(
          "Since you're running in verbose mode, you might see a descriptive "
          "error above already. Note however that if the error message urges "
          "you to set the environment variable `VLLM_ATTENTION_BACKEND` to "
-         "'FLEX_ATTENTION', please try setting it to 'FLASH_ATTN' first, as "
-         "that often solves the issue, whereas 'FLEX_ATTENTION' usually "
+         "'FLEX_ATTENTION', please try setting it to 'TRITON_ATTN' first, "
+         "as that often solves the issue, whereas 'FLEX_ATTENTION' usually "
          "doesn't. If you don't see any descriptive error above, then you "
          "can try "
      )
@@ -1372,3 +1439,54 @@ def get_vllm_tokenisation_params(
      config_format=config_format,
      load_format=load_format,
  )
+
+
+ def select_backend_and_parallelism() -> tuple[str, int, int]:
+     """Determine the distributed backend and parallelism for vLLM.
+
+     Returns:
+         Tuple containing:
+         - backend (str): "ray" if multi-node Ray is available, else "mp".
+         - tensor_parallel_size (int): Number of GPUs per node.
+         - pipeline_parallel_size (int): Number of stages across nodes.
+     """
+     if not ray.is_initialized():
+         try:
+             ray.init(address="auto", ignore_reinit_error=True)
+         except Exception as e:
+             log_once(
+                 f"Ray initialisation failed with a {type(e)} exception: {e}",
+                 level=logging.DEBUG,
+             )
+
+     is_ray = ray.is_initialized()
+     local_gpu_count = torch.cuda.device_count()
+
+     if is_ray:
+         resources = ray.cluster_resources()
+         total_gpus = int(resources.get("GPU", 0))
+     else:
+         total_gpus = local_gpu_count
+
+     using_multiple_nodes = total_gpus > local_gpu_count
+     if is_ray and using_multiple_nodes:
+         distributed_executor_backend = "ray"
+         tensor_parallel_size = local_gpu_count if local_gpu_count > 0 else 1
+         pipeline_parallel_size = max(1, total_gpus // tensor_parallel_size)
+         log_once(
+             f"Detected a multi-node setup with {pipeline_parallel_size:,} nodes, each "
+             f"with {tensor_parallel_size:,} GPUs, so using `ray` as the "
+             "distributed backend.",
+             level=logging.DEBUG,
+         )
+     else:
+         distributed_executor_backend = "mp"
+         tensor_parallel_size = local_gpu_count if local_gpu_count > 0 else 1
+         pipeline_parallel_size = 1
+         log_once(
+             f"Detected a single-node setup with {tensor_parallel_size:,} GPUs, "
+             "so using the multiprocessing distributed backend.",
+             level=logging.DEBUG,
+         )
+
+     return distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size
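The tuple returned by select_backend_and_parallelism() is what load_model_and_tokeniser now forwards to the vLLM constructor (see the @@ -968,8 +1034,9 @@ hunk above). A minimal sketch of that wiring, with a placeholder model ID that is not taken from the diff:

    from vllm import LLM

    backend, tp_size, pp_size = select_backend_and_parallelism()
    model = LLM(
        model="my-org/my-model",                # placeholder model ID
        distributed_executor_backend=backend,   # "ray" on a multi-node Ray cluster, else "mp"
        tensor_parallel_size=tp_size,           # GPUs per node
        pipeline_parallel_size=pp_size,         # pipeline stages across nodes (1 on a single node)
    )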
scandeval/benchmarker.py CHANGED
@@ -12,7 +12,6 @@ from pathlib import Path
  from shutil import rmtree
  from time import sleep

- from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
  from torch.distributed import destroy_process_group

  from .benchmark_config_factory import build_benchmark_config
@@ -32,7 +31,6 @@ from .speed_benchmark import benchmark_speed
  from .tasks import SPEED
  from .utils import (
      enforce_reproducibility,
-     get_package_version,
      internet_connection_available,
      split_model_id,
  )
@@ -194,15 +192,6 @@ class Benchmarker:
      msg += "the argument `download_only` was set to True."
      raise ValueError(msg)

-     # Bail early if hf_transfer is enabled but not installed.
-     if HF_HUB_ENABLE_HF_TRANSFER and get_package_version("hf_transfer") is None:
-         raise ImportError(
-             "Fast download using 'hf_transfer' is enabled "
-             "(HF_HUB_ENABLE_HF_TRANSFER=1) but the 'hf_transfer' "
-             "package is not available in your environment. "
-             "Try installing it with `pip install hf_transfer`."
-         )
-
      # Deprecation warnings
      if batch_size is not None:
          if run_with_cli:
scandeval/cli.py CHANGED
@@ -37,26 +37,6 @@ from .languages import get_all_languages
      help="""The languages to benchmark, both for models and datasets. If "all" then all
  models will be benchmarked on all datasets.""",
  )
- @click.option(
-     "--model-language",
-     "-ml",
-     default=None,
-     show_default=True,
-     multiple=True,
-     metavar="ISO 639-1 LANGUAGE CODE",
-     type=click.Choice(["all"] + list(get_all_languages().keys())),
-     help="""This option is deprecated - please use --language instead.""",
- )
- @click.option(
-     "--dataset-language",
-     "-dl",
-     default=None,
-     show_default=True,
-     multiple=True,
-     metavar="ISO 639-1 LANGUAGE CODE",
-     type=click.Choice(["all"] + list(get_all_languages().keys())),
-     help="""This option is deprecated - please use --language instead.""",
- )
  @click.option(
      "--dataset",
      default=None,
@@ -65,13 +45,6 @@ from .languages import get_all_languages
      help="""The name of the benchmark dataset. We recommend to use the `task` and
  `language` options instead of this option.""",
  )
- @click.option(
-     "--batch-size",
-     default=None,
-     type=click.Choice(["1", "2", "4", "8", "16", "32"]),
-     help="This option is deprecated - please use --finetuning-batch-size instead.",
-     deprecated=True,
- )
  @click.option(
      "--finetuning-batch-size",
      default="32",
@@ -197,14 +170,6 @@ from .languages import get_all_languages
      "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
      "if you are running out of GPU memory. Only relevant if the model is generative.",
  )
- @click.option(
-     "--debug/--no-debug",
-     default=False,
-     show_default=True,
-     help="Whether to run the benchmark in debug mode. This prints out extra "
-     "information and stores all outputs to the current working directory. Only "
-     "relevant if the model is generative.",
- )
  @click.option(
      "--requires-safetensors",
      is_flag=True,
@@ -232,15 +197,47 @@ from .languages import get_all_languages
      help="Only download the requested model weights and datasets, and exit.",
      default=False,
  )
+ @click.option(
+     "--debug/--no-debug",
+     default=False,
+     show_default=True,
+     help="Whether to run the benchmark in debug mode. This prints out extra "
+     "information and stores all outputs to the current working directory. Only "
+     "relevant if the model is generative.",
+ )
+ @click.option(
+     "--model-language",
+     "-ml",
+     default=None,
+     show_default=True,
+     multiple=True,
+     metavar="ISO 639-1 LANGUAGE CODE",
+     type=click.Choice(["all"] + list(get_all_languages().keys())),
+     help="""This option is deprecated - please use --language instead.""",
+ )
+ @click.option(
+     "--dataset-language",
+     "-dl",
+     default=None,
+     show_default=True,
+     multiple=True,
+     metavar="ISO 639-1 LANGUAGE CODE",
+     type=click.Choice(["all"] + list(get_all_languages().keys())),
+     help="""This option is deprecated - please use --language instead.""",
+ )
+ @click.option(
+     "--batch-size",
+     default=None,
+     type=click.Choice(["1", "2", "4", "8", "16", "32"]),
+     help="This option is deprecated - please use --finetuning-batch-size instead.",
+     deprecated=True,
+ )
  def benchmark(
      model: tuple[str],
      dataset: tuple[str | DatasetConfig],
      language: tuple[str],
-     model_language: tuple[str],
-     dataset_language: tuple[str],
      raise_errors: bool,
      task: tuple[str],
-     batch_size: str | None,
      finetuning_batch_size: str,
      progress_bar: bool,
      save_results: bool,
@@ -257,11 +254,14 @@ def benchmark(
      api_base: str | None,
      api_version: str | None,
      gpu_memory_utilization: float,
-     debug: bool,
      requires_safetensors: bool,
      generative_type: str | None,
      custom_datasets_file: Path,
      download_only: bool,
+     debug: bool,
+     model_language: tuple[str],
+     dataset_language: tuple[str],
+     batch_size: str | None,
  ) -> None:
      """Benchmark pretrained language models on language tasks."""
      Benchmarker(
scandeval/constants.py CHANGED
@@ -96,3 +96,12 @@ NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10

  # We only allow loading local datasets in these file formats
  SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS = ["csv"]
+
+ # These are default generation parameters, and can be overridden if a generative model
+ # has a `generation_config.json` file in its repository
+ GENERATION_KWARGS = {
+     "temperature": 0.0,
+     "top_p": 1.0,
+     "top_k": 0,
+     "repetition_penalty": 1.0,
+ }
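These defaults only change when a model repository ships its own generation configuration; the vllm.py hunks above load it with GenerationConfig.from_pretrained and apply only the parameters it explicitly overrides. A minimal sketch of that flow, using a placeholder repository ID:

    from transformers import GenerationConfig

    generation_kwargs = GENERATION_KWARGS.copy()
    try:
        # Placeholder repo ID; only repos with a generation_config.json resolve here.
        generation_config = GenerationConfig.from_pretrained("my-org/my-model")
    except OSError:
        generation_config = None

    if generation_config is not None:
        # to_diff_dict() only reports values that differ from the library defaults.
        changed = generation_config.to_diff_dict()
        for param in ("temperature", "top_p", "top_k", "repetition_penalty"):
            if param in changed:
                generation_kwargs[param] = changed[param]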
scandeval/data_models.py CHANGED
@@ -10,6 +10,7 @@ from pathlib import Path

  import pydantic
  import torch
+ from transformers.generation.configuration_utils import GenerationConfig

  from .enums import Device, GenerativeType, ModelType, TaskGroup
  from .exceptions import InvalidBenchmark
@@ -709,6 +710,9 @@ class ModelConfig:
      adapter_base_model_id:
          The model ID of the base model if the model is an adapter model. Can be None
          if the model is not an adapter model.
+     generation_config (optional):
+         The generation configuration for generative models, if specified in the
+         model repository. Defaults to no generation configuration.
      """

      model_id: str
@@ -722,6 +726,7 @@ class ModelConfig:
      fresh: bool
      model_cache_dir: str
      adapter_base_model_id: str | None
+     generation_config: GenerationConfig | None = None

      def __hash__(self) -> int:
          """Return a hash of the model configuration."""
scandeval/dataset_configs/__init__.py CHANGED
@@ -6,6 +6,7 @@ from ..data_models import DatasetConfig
  from ..languages import get_all_languages
  from ..tasks import SPEED
  from ..utils import load_custom_datasets_module
+ from .albanian import *  # noqa: F403
  from .bosnian import *  # noqa: F403
  from .bulgarian import *  # noqa: F403
  from .catalan import *  # noqa: F403
scandeval/dataset_configs/albanian.py ADDED
@@ -0,0 +1,64 @@
+ """All Albanian dataset configurations used in EuroEval."""
+
+ from ..data_models import DatasetConfig
+ from ..languages import ALBANIAN
+ from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+
+ ### Official datasets ###
+
+ MMS_SQ_CONFIG = DatasetConfig(
+     name="mms-sq",
+     pretty_name="MMS-sq",
+     source="EuroEval/mms-sq-mini",
+     task=SENT,
+     languages=[ALBANIAN],
+ )
+
+ SCALA_SQ_CONFIG = DatasetConfig(
+     name="scala-sq",
+     pretty_name="ScaLA-sq",
+     source="EuroEval/scala-sq",
+     task=LA,
+     languages=[ALBANIAN],
+ )
+
+ WIKIANN_SQ_CONFIG = DatasetConfig(
+     name="wikiann-sq",
+     pretty_name="WikiANN-sq",
+     source="EuroEval/wikiann-sq-mini",
+     task=NER,
+     languages=[ALBANIAN],
+ )
+
+ MULTI_WIKI_QA_SQ_CONFIG = DatasetConfig(
+     name="multi-wiki-qa-sq",
+     pretty_name="MultiWikiQA-sq",
+     source="EuroEval/multi-wiki-qa-sq-mini",
+     task=RC,
+     languages=[ALBANIAN],
+ )
+
+ LR_SUM_SQ_CONFIG = DatasetConfig(
+     name="lr-sum-sq",
+     pretty_name="LRSum-sq",
+     source="EuroEval/lr-sum-sq-mini",
+     task=SUMM,
+     languages=[ALBANIAN],
+ )
+
+ GLOBAL_MMLU_LITE_SQ_CONFIG = DatasetConfig(
+     name="global-mmlu-lite-sq",
+     pretty_name="GlobalMMLULite-sq",
+     source="EuroEval/global-mmlu-lite-sq",
+     task=KNOW,
+     languages=[ALBANIAN],
+ )
+
+ WINOGRANDE_SQ_CONFIG = DatasetConfig(
+     name="winogrande-sq",
+     pretty_name="Winogrande-sq",
+     source="EuroEval/winogrande-sq",
+     task=COMMON_SENSE,
+     languages=[ALBANIAN],
+     _labels=["a", "b"],
+ )
scandeval/dataset_configs/dutch.py CHANGED
@@ -2,7 +2,18 @@

  from ..data_models import DatasetConfig
  from ..languages import DUTCH
- from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+ from ..tasks import (
+     COMMON_SENSE,
+     EUROPEAN_VALUES,
+     KNOW,
+     LA,
+     MCRC,
+     NER,
+     RC,
+     SENT,
+     SIMPL,
+     SUMM,
+ )

  ### Official datasets ###

@@ -122,6 +133,16 @@ MULTI_WIKI_QA_NL_CONFIG = DatasetConfig(
      unofficial=True,
  )

+ COPA_NL_CONFIG = DatasetConfig(
+     name="copa-nl",
+     pretty_name="COPA-nl",
+     source="EuroEval/copa-nl",
+     task=COMMON_SENSE,
+     languages=[DUTCH],
+     unofficial=True,
+     _labels=["a", "b"],
+ )
+
  GOLDENSWAG_NL_CONFIG = DatasetConfig(
      name="goldenswag-nl",
      pretty_name="GoldenSwag-nl",
@@ -140,3 +161,12 @@ WINOGRANDE_NL_CONFIG = DatasetConfig(
      _labels=["a", "b"],
      unofficial=True,
  )
+
+ DUIDELIJKE_TAAL_NL_CONFIG = DatasetConfig(
+     name="duidelijke-taal",
+     pretty_name="Duidelijke Taal",
+     source="EuroEval/duidelijke-taal",
+     task=SIMPL,
+     languages=[DUTCH],
+     unofficial=True,
+ )
scandeval/dataset_configs/swedish.py CHANGED
@@ -139,3 +139,12 @@ SKOLPROV_CONFIG = DatasetConfig(
      languages=[SWEDISH],
      unofficial=True,
  )
+
+ SWEDISH_FACTS_CONFIG = DatasetConfig(
+     name="swedish-facts",
+     pretty_name="Swedish Facts",
+     source="EuroEval/swedish-facts",
+     task=KNOW,
+     languages=[SWEDISH],
+     unofficial=True,
+ )
scandeval/logging_utils.py CHANGED
@@ -140,6 +140,7 @@ def block_terminal_output() -> None:
      logging.getLogger("openai").setLevel(logging.CRITICAL)
      logging.getLogger("httpx").setLevel(logging.CRITICAL)
      litellm.suppress_debug_info = True  # type: ignore[bad-assignment]
+     litellm.turn_off_message_logging = True

      # Disable vLLM logging
      logging.getLogger("vllm").setLevel(logging.CRITICAL)