ScandEval 16.9.0__py3-none-any.whl → 16.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scandeval/benchmark_modules/litellm.py CHANGED
@@ -110,7 +110,7 @@ VOCAB_SIZE_MAPPING = {
     # Anthropic models
     r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
     # Gemini models
-    r"(gemini/)?gemini-[1-9]\.[0-9]-(flash|pro).*": 256_128,
+    r"(gemini/)?gemini-[1-9](\.[0-9])?-(flash|pro).*": 256_128,
     # xAI models
     r"(xai/)?grok.*": -1,
 }
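All three Gemini pattern changes in this file follow the same idea: the minor-version component becomes optional, so major-only model IDs such as `gemini-3-flash` now match. A quick standard-library sanity check of the new vocab-size pattern (our own illustration, not code from the package):

```python
import re

# New 16.10.1 pattern: the "\.[0-9]" minor version is now optional.
NEW = r"(gemini/)?gemini-[1-9](\.[0-9])?-(flash|pro).*"
OLD = r"(gemini/)?gemini-[1-9]\.[0-9]-(flash|pro).*"

assert re.fullmatch(NEW, "gemini/gemini-2.5-flash")  # matched before and after
assert re.fullmatch(NEW, "gemini-3-flash-preview")   # only the new pattern matches
assert not re.fullmatch(OLD, "gemini-3-flash-preview")
```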
@@ -136,7 +136,7 @@ MODEL_MAX_LENGTH_MAPPING = {
     # Gemini models
     r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
     r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
-    r"(gemini/)?gemini-2\.(0|5).*": 1_048_576,
+    r"(gemini/)?gemini-[23](\.[05])?.*": 1_048_576,
     # xAI models
     r"(xai/)?grok.*": 131_072,
 }
@@ -152,7 +152,7 @@ NUM_PARAMS_MAPPING = {
     # Gemini models
     r"(gemini/)?gemini-1.5-flash-8b": 8_000_000_000,
     r"(gemini/)?gemini-1.5-flash-[0-9]+": -1,
-    r"(gemini/)?gemini-2.(0|5).*": -1,
+    r"(gemini/)?gemini-[23](.[05])?.*": -1,
     # xAI models
     r"(xai/)?grok.*": -1,
 }
@@ -208,8 +208,8 @@ class LiteLLMModel(BenchmarkModule):
             "thinking",
         ],
         # Gemini models
-        re.compile(r"(gemini/)?gemini-2.5-flash-lite.*"): ["no-thinking", "thinking"],
-        re.compile(r"(gemini/)?gemini-2.5-flash.*"): ["no-thinking", "thinking"],
+        re.compile(r"(gemini/)?gemini-2\.5-flash-lite.*"): ["no-thinking", "thinking"],
+        re.compile(r"(gemini/)?gemini-(2\.5|3)-flash.*"): ["no-thinking", "thinking"],
         # xAI models
         re.compile(r"(xai/)?grok-3-mini(-fast)?(-beta)?"): ["low", "medium", "high"],
     }
@@ -517,6 +517,7 @@ class LiteLLMModel(BenchmarkModule):
        response_format_messages = [
            "got an unexpected keyword argument 'response_format'",
            "the model returned empty outputs",
+           "'maxitems' is not supported",
        ]

        if (
@@ -838,14 +839,14 @@ class LiteLLMModel(BenchmarkModule):
         ]

         # Close connections
-        for request in requests:
-            if hasattr(request, "close"):
-                try:
-                    request.close()
-                except RuntimeError as e:
-                    log(
-                        f"RuntimeError during request.close(): {e}", level=logging.DEBUG
-                    )
+        semaphore.release()
+        router.reset()
+        try:
+            loop = asyncio.get_event_loop()
+            if not loop.is_closed():
+                loop.close()
+        except RuntimeError:
+            pass  # Already closed

         return successes, failures
scandeval/benchmark_modules/vllm.py CHANGED
@@ -15,12 +15,14 @@ from time import sleep
 import torch
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
+from transformers.generation.configuration_utils import GenerationConfig
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 from urllib3.exceptions import RequestError

 from ..constants import (
     CUSTOM_STOP_TOKENS,
+    GENERATION_KWARGS,
     GENERATIVE_PIPELINE_TAGS,
     MAX_CONTEXT_LENGTH,
     MAX_VLLM_LOGPROBS,
@@ -98,6 +100,10 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
         StructuredOutputsParams,
     )

+if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
+    import ray  # type: ignore[missing-import]
+
+
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
     from transformers.trainer import Trainer
@@ -485,6 +491,41 @@ class VLLMModel(HuggingFaceEncoderModel):
         )

         # Define the parameters used for vLLM generation
+        generation_kwargs = GENERATION_KWARGS.copy()
+        if (generation_config := self.model_config.generation_config) is not None:
+            changed_params = generation_config.to_diff_dict()
+            if "temperature" in changed_params:
+                temperature = changed_params["temperature"]
+                generation_kwargs["temperature"] = temperature
+                log_once(
+                    f"Using temperature={temperature} with the model "
+                    f"{self.model_config.model_id!r} as specified in its "
+                    "generation configuration."
+                )
+            if "top_p" in changed_params:
+                top_p = changed_params["top_p"]
+                generation_kwargs["top_p"] = top_p
+                log_once(
+                    f"Using top_p={top_p} with the model "
+                    f"{self.model_config.model_id!r} as specified in its "
+                    "generation configuration."
+                )
+            if "top_k" in changed_params:
+                top_k = changed_params["top_k"]
+                generation_kwargs["top_k"] = top_k
+                log_once(
+                    f"Using top_k={top_k} with the model "
+                    f"{self.model_config.model_id!r} as specified in its "
+                    "generation configuration."
+                )
+            if "repetition_penalty" in changed_params:
+                repetition_penalty = changed_params["repetition_penalty"]
+                generation_kwargs["repetition_penalty"] = repetition_penalty
+                log_once(
+                    f"Using repetition_penalty={repetition_penalty} with the model "
+                    f"{self.model_config.model_id!r} as specified in its "
+                    "generation configuration."
+                )
         max_tokens: int = (
             REASONING_MAX_TOKENS
             if self.generative_type == GenerativeType.REASONING
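The four blocks above are intentionally repetitive: each copies one sampling parameter from the model's `generation_config.json` into the benchmark defaults and logs the override. Functionally, the merge reduces to the sketch below (a compressed illustration, not the shipped code; `GENERATION_KWARGS` is the defaults dict added to scandeval/constants.py in this release):

```python
# Greedy-decoding defaults, as defined in scandeval/constants.py below.
GENERATION_KWARGS = {
    "temperature": 0.0,
    "top_p": 1.0,
    "top_k": 0,
    "repetition_penalty": 1.0,
}

def merge_generation_kwargs(generation_config) -> dict:
    """Overlay repo-specified sampling parameters on the greedy defaults.

    ``generation_config`` is a ``transformers.GenerationConfig`` or None;
    its ``to_diff_dict()`` returns only the values the repo explicitly set.
    """
    kwargs = dict(GENERATION_KWARGS)
    if generation_config is None:
        return kwargs
    changed = generation_config.to_diff_dict()
    for key in kwargs:
        if key in changed:
            kwargs[key] = changed[key]
    return kwargs
```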
@@ -495,7 +536,10 @@ class VLLMModel(HuggingFaceEncoderModel):
             logprobs=MAX_VLLM_LOGPROBS
             if self.buffer["first_label_token_mapping"]
             else None,
-            temperature=0.0,
+            temperature=generation_kwargs["temperature"],
+            top_p=generation_kwargs["top_p"],
+            top_k=generation_kwargs["top_k"],
+            repetition_penalty=generation_kwargs["repetition_penalty"],
             stop=[stop_token for stop_token in stop_tokens if stop_token],
             structured_outputs=structured_outputs,
         )
@@ -769,6 +813,16 @@ class VLLMModel(HuggingFaceEncoderModel):
         if model_info is None:
             raise InvalidModel(f"The model {model_id!r} could not be found.")

+        try:
+            generation_config = GenerationConfig.from_pretrained(
+                pretrained_model_name=model_id_components.model_id,
+                revision=model_id_components.revision,
+                cache_dir=benchmark_config.cache_dir,
+                token=benchmark_config.api_key,
+            )
+        except OSError:
+            generation_config = None
+
         language_mapping = get_all_languages()
         language_codes = list(language_mapping.keys())
@@ -790,6 +844,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 cache_dir=benchmark_config.cache_dir, model_id=model_id
             ),
             adapter_base_model_id=model_info.adapter_base_model_id,
+            generation_config=generation_config,
         )

         return model_config
@@ -957,6 +1012,10 @@ def load_model_and_tokeniser(

     clear_vllm()

+    distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size = (
+        select_backend_and_parallelism()
+    )
+
     try:
         model = LLM(
             model=(
@@ -975,8 +1034,9 @@ def load_model_and_tokeniser(
             trust_remote_code=benchmark_config.trust_remote_code,
             revision=revision,
             seed=4242,
-            distributed_executor_backend="mp",
-            tensor_parallel_size=torch.cuda.device_count(),
+            distributed_executor_backend=distributed_executor_backend,
+            tensor_parallel_size=tensor_parallel_size,
+            pipeline_parallel_size=pipeline_parallel_size,
             disable_custom_all_reduce=True,
             quantization=quantization,
             dtype=dtype,
@@ -1379,3 +1439,55 @@ def get_vllm_tokenisation_params(
         config_format=config_format,
         load_format=load_format,
     )
+
+
+def select_backend_and_parallelism() -> tuple[str, int, int]:
+    """Determine the distributed backend and parallelism for vLLM.
+
+    Returns:
+        Tuple containing:
+        - backend (str): "ray" if multi-node Ray is available, else "mp".
+        - tensor_parallel_size (int): Number of GPUs per node.
+        - pipeline_parallel_size (int): Number of stages across nodes.
+    """
+    if not ray.is_initialized():
+        try:
+            ray.init(address="auto", ignore_reinit_error=True)
+        except Exception as e:
+            if "could not find any running ray instance" not in str(e).lower():
+                log_once(
+                    f"Ray initialisation failed with a {type(e)} exception: {e}",
+                    level=logging.DEBUG,
+                )
+
+    is_ray = ray.is_initialized()
+    local_gpu_count = torch.cuda.device_count()
+
+    if is_ray:
+        resources = ray.cluster_resources()
+        total_gpus = int(resources.get("GPU", 0))
+    else:
+        total_gpus = local_gpu_count
+
+    using_multiple_nodes = total_gpus > local_gpu_count
+    if is_ray and using_multiple_nodes:
+        distributed_executor_backend = "ray"
+        tensor_parallel_size = local_gpu_count if local_gpu_count > 0 else 1
+        pipeline_parallel_size = max(1, total_gpus // tensor_parallel_size)
+        log_once(
+            f"Detected a multi-node setup with {pipeline_parallel_size:,} nodes, each "
+            f"with {tensor_parallel_size:,} GPUs, so using `ray` as the "
+            "distributed backend.",
+            level=logging.DEBUG,
+        )
+    else:
+        distributed_executor_backend = "mp"
+        tensor_parallel_size = local_gpu_count if local_gpu_count > 0 else 1
+        pipeline_parallel_size = 1
+        log_once(
+            f"Detected a single-node setup with {tensor_parallel_size:,} GPUs, "
+            "so using the multiprocessing distributed backend.",
+            level=logging.DEBUG,
+        )
+
+    return distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size
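A worked example of the arithmetic above: on a Ray cluster exposing 8 GPUs in total with 4 on the local node, tensor parallelism stays within a node (size 4) while pipeline parallelism spans the nodes (size 2). A dependency-free restatement of just that calculation (the helper name is ours, not the package's):

```python
def split_parallelism(total_gpus: int, local_gpu_count: int) -> tuple[str, int, int]:
    """Mirror the backend/parallelism arithmetic from the diff above."""
    tensor_parallel_size = local_gpu_count if local_gpu_count > 0 else 1
    if total_gpus > local_gpu_count:  # GPUs on other nodes -> multi-node Ray
        return "ray", tensor_parallel_size, max(1, total_gpus // tensor_parallel_size)
    return "mp", tensor_parallel_size, 1

assert split_parallelism(total_gpus=8, local_gpu_count=4) == ("ray", 4, 2)
assert split_parallelism(total_gpus=4, local_gpu_count=4) == ("mp", 4, 1)
```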
scandeval/cli.py CHANGED
@@ -37,26 +37,6 @@ from .languages import get_all_languages
     help="""The languages to benchmark, both for models and datasets. If "all" then all
     models will be benchmarked on all datasets.""",
 )
-@click.option(
-    "--model-language",
-    "-ml",
-    default=None,
-    show_default=True,
-    multiple=True,
-    metavar="ISO 639-1 LANGUAGE CODE",
-    type=click.Choice(["all"] + list(get_all_languages().keys())),
-    help="""This option is deprecated - please use --language instead.""",
-)
-@click.option(
-    "--dataset-language",
-    "-dl",
-    default=None,
-    show_default=True,
-    multiple=True,
-    metavar="ISO 639-1 LANGUAGE CODE",
-    type=click.Choice(["all"] + list(get_all_languages().keys())),
-    help="""This option is deprecated - please use --language instead.""",
-)
 @click.option(
     "--dataset",
     default=None,
@@ -65,13 +45,6 @@ from .languages import get_all_languages
     help="""The name of the benchmark dataset. We recommend to use the `task` and
     `language` options instead of this option.""",
 )
-@click.option(
-    "--batch-size",
-    default=None,
-    type=click.Choice(["1", "2", "4", "8", "16", "32"]),
-    help="This option is deprecated - please use --finetuning-batch-size instead.",
-    deprecated=True,
-)
 @click.option(
     "--finetuning-batch-size",
     default="32",
@@ -197,14 +170,6 @@ from .languages import get_all_languages
     "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
     "if you are running out of GPU memory. Only relevant if the model is generative.",
 )
-@click.option(
-    "--debug/--no-debug",
-    default=False,
-    show_default=True,
-    help="Whether to run the benchmark in debug mode. This prints out extra "
-    "information and stores all outputs to the current working directory. Only "
-    "relevant if the model is generative.",
-)
 @click.option(
     "--requires-safetensors",
     is_flag=True,
@@ -232,15 +197,47 @@ from .languages import get_all_languages
     help="Only download the requested model weights and datasets, and exit.",
     default=False,
 )
+@click.option(
+    "--debug/--no-debug",
+    default=False,
+    show_default=True,
+    help="Whether to run the benchmark in debug mode. This prints out extra "
+    "information and stores all outputs to the current working directory. Only "
+    "relevant if the model is generative.",
+)
+@click.option(
+    "--model-language",
+    "-ml",
+    default=None,
+    show_default=True,
+    multiple=True,
+    metavar="ISO 639-1 LANGUAGE CODE",
+    type=click.Choice(["all"] + list(get_all_languages().keys())),
+    help="""This option is deprecated - please use --language instead.""",
+)
+@click.option(
+    "--dataset-language",
+    "-dl",
+    default=None,
+    show_default=True,
+    multiple=True,
+    metavar="ISO 639-1 LANGUAGE CODE",
+    type=click.Choice(["all"] + list(get_all_languages().keys())),
+    help="""This option is deprecated - please use --language instead.""",
+)
+@click.option(
+    "--batch-size",
+    default=None,
+    type=click.Choice(["1", "2", "4", "8", "16", "32"]),
+    help="This option is deprecated - please use --finetuning-batch-size instead.",
+    deprecated=True,
+)
 def benchmark(
     model: tuple[str],
     dataset: tuple[str | DatasetConfig],
     language: tuple[str],
-    model_language: tuple[str],
-    dataset_language: tuple[str],
     raise_errors: bool,
     task: tuple[str],
-    batch_size: str | None,
     finetuning_batch_size: str,
     progress_bar: bool,
     save_results: bool,
@@ -257,11 +254,14 @@ def benchmark(
     api_base: str | None,
     api_version: str | None,
     gpu_memory_utilization: float,
-    debug: bool,
     requires_safetensors: bool,
     generative_type: str | None,
     custom_datasets_file: Path,
     download_only: bool,
+    debug: bool,
+    model_language: tuple[str],
+    dataset_language: tuple[str],
+    batch_size: str | None,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
     Benchmarker(
scandeval/constants.py CHANGED
@@ -96,3 +96,12 @@ NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10

 # We only allow loading local datasets in these file formats
 SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS = ["csv"]
+
+# These are default generation parameters, and can be overridden if a generative model
+# has a `generation_config.json` file in its repository
+GENERATION_KWARGS = {
+    "temperature": 0.0,
+    "top_p": 1.0,
+    "top_k": 0,
+    "repetition_penalty": 1.0,
+}
scandeval/data_models.py CHANGED
@@ -10,6 +10,7 @@ from pathlib import Path

 import pydantic
 import torch
+from transformers.generation.configuration_utils import GenerationConfig

 from .enums import Device, GenerativeType, ModelType, TaskGroup
 from .exceptions import InvalidBenchmark
@@ -709,6 +710,9 @@ class ModelConfig:
         adapter_base_model_id:
             The model ID of the base model if the model is an adapter model. Can be None
             if the model is not an adapter model.
+        generation_config (optional):
+            The generation configuration for generative models, if specified in the
+            model repository. Defaults to no generation configuration.
     """

     model_id: str
@@ -722,6 +726,7 @@ class ModelConfig:
     fresh: bool
     model_cache_dir: str
     adapter_base_model_id: str | None
+    generation_config: GenerationConfig | None = None

     def __hash__(self) -> int:
         """Return a hash of the model configuration."""
scandeval/dataset_configs/__init__.py CHANGED
@@ -6,6 +6,7 @@ from ..data_models import DatasetConfig
 from ..languages import get_all_languages
 from ..tasks import SPEED
 from ..utils import load_custom_datasets_module
+from .albanian import *  # noqa: F403
 from .bosnian import *  # noqa: F403
 from .bulgarian import *  # noqa: F403
 from .catalan import *  # noqa: F403
scandeval/dataset_configs/albanian.py ADDED
@@ -0,0 +1,64 @@
+"""All Albanian dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import ALBANIAN
+from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+MMS_SQ_CONFIG = DatasetConfig(
+    name="mms-sq",
+    pretty_name="MMS-sq",
+    source="EuroEval/mms-sq-mini",
+    task=SENT,
+    languages=[ALBANIAN],
+)
+
+SCALA_SQ_CONFIG = DatasetConfig(
+    name="scala-sq",
+    pretty_name="ScaLA-sq",
+    source="EuroEval/scala-sq",
+    task=LA,
+    languages=[ALBANIAN],
+)
+
+WIKIANN_SQ_CONFIG = DatasetConfig(
+    name="wikiann-sq",
+    pretty_name="WikiANN-sq",
+    source="EuroEval/wikiann-sq-mini",
+    task=NER,
+    languages=[ALBANIAN],
+)
+
+MULTI_WIKI_QA_SQ_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-sq",
+    pretty_name="MultiWikiQA-sq",
+    source="EuroEval/multi-wiki-qa-sq-mini",
+    task=RC,
+    languages=[ALBANIAN],
+)
+
+LR_SUM_SQ_CONFIG = DatasetConfig(
+    name="lr-sum-sq",
+    pretty_name="LRSum-sq",
+    source="EuroEval/lr-sum-sq-mini",
+    task=SUMM,
+    languages=[ALBANIAN],
+)
+
+GLOBAL_MMLU_LITE_SQ_CONFIG = DatasetConfig(
+    name="global-mmlu-lite-sq",
+    pretty_name="GlobalMMLULite-sq",
+    source="EuroEval/global-mmlu-lite-sq",
+    task=KNOW,
+    languages=[ALBANIAN],
+)
+
+WINOGRANDE_SQ_CONFIG = DatasetConfig(
+    name="winogrande-sq",
+    pretty_name="Winogrande-sq",
+    source="EuroEval/winogrande-sq",
+    task=COMMON_SENSE,
+    languages=[ALBANIAN],
+    _labels=["a", "b"],
+)
scandeval/dataset_configs/dutch.py CHANGED
@@ -2,7 +2,18 @@

 from ..data_models import DatasetConfig
 from ..languages import DUTCH
-from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+from ..tasks import (
+    COMMON_SENSE,
+    EUROPEAN_VALUES,
+    KNOW,
+    LA,
+    MCRC,
+    NER,
+    RC,
+    SENT,
+    SIMPL,
+    SUMM,
+)

 ### Official datasets ###

@@ -63,6 +74,14 @@ HELLASWAG_NL_CONFIG = DatasetConfig(
     languages=[DUTCH],
 )

+DUIDELIJKE_TAAL_NL_CONFIG = DatasetConfig(
+    name="duidelijke-taal",
+    pretty_name="Duidelijke Taal",
+    source="EuroEval/duidelijke-taal",
+    task=SIMPL,
+    languages=[DUTCH],
+)
+
 VALEU_NL_CONFIG = DatasetConfig(
     name="valeu-nl",
     pretty_name="VaLEU-nl",
@@ -122,6 +141,16 @@ MULTI_WIKI_QA_NL_CONFIG = DatasetConfig(
     unofficial=True,
 )

+COPA_NL_CONFIG = DatasetConfig(
+    name="copa-nl",
+    pretty_name="COPA-nl",
+    source="EuroEval/copa-nl",
+    task=COMMON_SENSE,
+    languages=[DUTCH],
+    unofficial=True,
+    _labels=["a", "b"],
+)
+
 GOLDENSWAG_NL_CONFIG = DatasetConfig(
     name="goldenswag-nl",
     pretty_name="GoldenSwag-nl",
scandeval/dataset_configs/norwegian.py CHANGED
@@ -27,7 +27,7 @@ SCALA_NN_CONFIG = DatasetConfig(
     pretty_name="ScaLA-nn",
     source="EuroEval/scala-nn",
     task=LA,
-    languages=[NORWEGIAN_NYNORSK],
+    languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
 )

 NORNE_NB_CONFIG = DatasetConfig(
@@ -43,7 +43,7 @@ NORNE_NN_CONFIG = DatasetConfig(
     pretty_name="NorNE-nn",
     source="EuroEval/norne-nn-mini",
     task=NER,
-    languages=[NORWEGIAN_NYNORSK],
+    languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
 )

 NORQUAD_CONFIG = DatasetConfig(
@@ -197,7 +197,7 @@ MULTI_WIKI_QA_NN_CONFIG = DatasetConfig(
     pretty_name="MultiWikiQA-nn",
     source="EuroEval/multi-wiki-qa-nn-mini",
     task=RC,
-    languages=[NORWEGIAN_NYNORSK],
+    languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
     unofficial=True,
 )
scandeval/logging_utils.py CHANGED
@@ -140,6 +140,7 @@ def block_terminal_output() -> None:
     logging.getLogger("openai").setLevel(logging.CRITICAL)
     logging.getLogger("httpx").setLevel(logging.CRITICAL)
     litellm.suppress_debug_info = True  # type: ignore[bad-assignment]
+    litellm.turn_off_message_logging = True

     # Disable vLLM logging
     logging.getLogger("vllm").setLevel(logging.CRITICAL)
scandeval/metrics/huggingface.py CHANGED
@@ -8,6 +8,7 @@ import evaluate
 import numpy as np
 from datasets import DownloadConfig, DownloadMode

+from ..exceptions import InvalidBenchmark
 from ..logging_utils import no_terminal_output
 from .base import Metric

@@ -149,6 +150,75 @@ class HuggingFaceMetric(Metric):
         return score


+class SourceBasedMetric(HuggingFaceMetric):
+    """Subclass of HuggingFaceMetric for metrics also requiring source text as input."""
+
+    def __call__(
+        self,
+        predictions: c.Sequence,
+        references: c.Sequence,
+        dataset: "Dataset",
+        dataset_config: "DatasetConfig",
+        benchmark_config: "BenchmarkConfig",
+    ) -> float | None:
+        """Calculate metric score for metrics requiring original source text.
+
+        Passes the source text to the evaluate function via its `sources` param.
+
+        Args:
+            predictions:
+                The model predictions.
+            references:
+                The ground truth references.
+            dataset:
+                The dataset used for evaluation. This is used for collecting the source
+                text and in case any additional metadata is used to compute the metrics.
+            dataset_config:
+                The dataset configuration.
+            benchmark_config:
+                The benchmark configuration.
+
+        Returns:
+            The calculated metric score, or None if the score should be ignored.
+        """
+        if dataset is None:
+            raise InvalidBenchmark("SourceBasedMetric requires `dataset` to be passed.")
+
+        if self.metric is None:
+            self.metric = evaluate.load(path=self.huggingface_id)
+
+        sources = dataset["text"]
+
+        if not len(sources) == len(predictions):
+            raise InvalidBenchmark(
+                f"SourceBasedMetric expects same number of inputs as predictions. "
+                f"Got {len(sources)} sources and {len(predictions)} predictions "
+                f"instead."
+            )
+
+        with no_terminal_output(disable=benchmark_config.verbose):
+            results = self.metric.compute(
+                sources=sources,
+                predictions=predictions,
+                references=[[r] for r in references],
+                **self.compute_kwargs,
+            )
+
+        # The metric returns None if we are running on multi-GPU and the current
+        # process is not the main process
+        if results is None:
+            return None
+
+        # Convert the results to a float score
+        score = results[self.results_key]
+        if isinstance(score, list):
+            score = sum(score) / len(score)
+        if isinstance(score, np.floating):
+            score = float(score)
+
+        return score
+
+
 mcc_metric = HuggingFaceMetric(
     name="mcc",
     pretty_name="Matthew's Correlation Coefficient",
@@ -214,3 +284,15 @@ accuracy_metric = HuggingFaceMetric(
     huggingface_id="accuracy",
     results_key="accuracy",
 )
+
+meteor_metric = HuggingFaceMetric(
+    name="meteor", pretty_name="METEOR", huggingface_id="meteor", results_key="meteor"
+)
+
+sari_metric = SourceBasedMetric(
+    name="sari",
+    pretty_name="SARI",
+    huggingface_id="sari",
+    results_key="sari",
+    postprocessing_fn=lambda x: (x, f"{x:.2f}%"),
+)
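SARI scores a simplification against both the source text and the references, which is why the new `SourceBasedMetric` threads `sources` through to `evaluate` (and why the wheel gains a `sacrebleu` requirement; see METADATA below). A minimal standalone use of the Hugging Face `sari` metric, with example sentences of our own:

```python
import evaluate

sari = evaluate.load("sari")

# One source document, one system output, one list of references per example.
result = sari.compute(
    sources=["About 95 species are currently accepted."],
    predictions=["About 95 species are accepted."],
    references=[[
        "About 95 species are currently known.",
        "About 95 species are accepted.",
    ]],
)
print(result["sari"])  # Higher is better, on a 0-100 scale.
```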
scandeval/prompt_templates/__init__.py CHANGED
@@ -6,5 +6,6 @@ from .multiple_choice import MULTIPLE_CHOICE_TEMPLATES
 from .named_entity_recognition import NER_TEMPLATES
 from .reading_comprehension import RC_TEMPLATES
 from .sentiment_classification import SENT_TEMPLATES
+from .simplification import SIMPL_TEMPLATES
 from .summarization import SUMM_TEMPLATES
 from .token_classification import TOKEN_CLASSIFICATION_TEMPLATES
scandeval/prompt_templates/linguistic_acceptability.py CHANGED
@@ -4,6 +4,7 @@ import typing as t

 from ..data_models import PromptConfig
 from ..languages import (
+    ALBANIAN,
     BULGARIAN,
     CATALAN,
     CROATIAN,
@@ -40,6 +41,14 @@ if t.TYPE_CHECKING:
     from ..languages import Language

 LA_TEMPLATES: dict["Language", PromptConfig] = {
+    ALBANIAN: PromptConfig(
+        default_prompt_label_mapping=dict(correct="po", incorrect="jo"),
+        default_prompt_prefix="Më poshtë janë fjali dhe nëse janë gramatikisht të "
+        "sakta.",
+        default_prompt_template="Fjali: {text}\nGramatikisht e saktë: {label}",
+        default_instruction_prompt="Fjali: {text}\n\nPërcaktoni nëse fjalia është "
+        "gramatikisht e saktë apo jo. Përgjigjuni me {labels_str}, dhe asgjë tjetër.",
+    ),
     BULGARIAN: PromptConfig(
         default_prompt_label_mapping=dict(correct="да", incorrect="не"),
         default_prompt_prefix="Следват изречения и дали са граматически правилни.",
scandeval/prompt_templates/multiple_choice.py CHANGED
@@ -4,6 +4,7 @@ import typing as t

 from ..data_models import PromptConfig
 from ..languages import (
+    ALBANIAN,
     BULGARIAN,
     CATALAN,
     CROATIAN,
@@ -40,6 +41,14 @@ if t.TYPE_CHECKING:

 # TODO: Missing Faroese
 MULTIPLE_CHOICE_TEMPLATES: dict["Language", PromptConfig] = {
+    ALBANIAN: PromptConfig(
+        default_prompt_prefix="Më poshtë janë pyetje me zgjedhje të shumëfishtë "
+        "(me përgjigje).",
+        default_prompt_template="Pyetje: {text}\nPërgjigje: {label}",
+        default_instruction_prompt="Pyetje: {text}\n\nPërgjigjuni pyetjes së "
+        "mësipërme duke u përgjigjur me {labels_str}, dhe asgjë tjetër.",
+        default_prompt_label_mapping="auto",
+    ),
     BULGARIAN: PromptConfig(
         default_prompt_prefix="Следват въпроси с множествен избор (с отговори).",
         default_prompt_template="Въпрос: {text}\nОтговор: {label}",
scandeval/prompt_templates/named_entity_recognition.py CHANGED
@@ -4,6 +4,7 @@ import typing as t

 from ..data_models import PromptConfig
 from ..languages import (
+    ALBANIAN,
     BOSNIAN,
     BULGARIAN,
     CATALAN,
@@ -42,6 +43,25 @@ if t.TYPE_CHECKING:


 NER_TEMPLATES: dict["Language", PromptConfig] = {
+    ALBANIAN: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "person",
+            "i-per": "person",
+            "b-loc": "vendndodhje",
+            "i-loc": "vendndodhje",
+            "b-org": "organizatë",
+            "i-org": "organizatë",
+            "b-misc": "të ndryshme",
+            "i-misc": "të ndryshme",
+        },
+        default_prompt_prefix="Më poshtë janë fjali dhe fjalorë JSON me entitetet e "
+        "emërtuara që shfaqen në fjalinë e dhënë.",
+        default_prompt_template="Fjali: {text}\nEntitete të emërtuara: {label}",
+        default_instruction_prompt="Fjali: {text}\n\nIdentifikoni entitetet e "
+        "emërtuara në fjali. Duhet t’i jepni ato si një fjalor JSON me çelësat "
+        "{labels_str}. Vlerat duhet të jenë lista të entiteteve të emërtuara të atij "
+        "lloji, saktësisht ashtu siç shfaqen në fjali.",
+    ),
     BOSNIAN: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "osoba",
scandeval/prompt_templates/reading_comprehension.py CHANGED
@@ -4,6 +4,7 @@ import typing as t

 from ..data_models import PromptConfig
 from ..languages import (
+    ALBANIAN,
     BOSNIAN,
     BULGARIAN,
     CATALAN,
@@ -41,6 +42,14 @@ if t.TYPE_CHECKING:
     from ..languages import Language

 RC_TEMPLATES: dict["Language", PromptConfig] = {
+    ALBANIAN: PromptConfig(
+        default_prompt_prefix="Më poshtë janë tekste me pyetje dhe përgjigje.",
+        default_prompt_template="Tekst: {text}\nPyetje: {question}\nPërgjigje me "
+        "maksimum 3 fjalë: {label}",
+        default_instruction_prompt="Tekst: {text}\n\nPërgjigjuni pyetjes së mëposhtme "
+        "rreth tekstit të mësipërm me maksimum 3 fjalë.\n\nPyetje: {question}",
+        default_prompt_label_mapping=dict(),
+    ),
     BOSNIAN: PromptConfig(
         default_prompt_prefix="Slijede tekstovi s pitanjima i odgovorima.",
         default_prompt_template="Tekst: {text}\nPitanje: {question}\nOdgovor s "
scandeval/prompt_templates/sentiment_classification.py CHANGED
@@ -4,6 +4,7 @@ import typing as t

 from ..data_models import PromptConfig
 from ..languages import (
+    ALBANIAN,
     BOSNIAN,
     BULGARIAN,
     CATALAN,
@@ -41,6 +42,16 @@ if t.TYPE_CHECKING:
     from ..languages import Language

 SENT_TEMPLATES: dict["Language", PromptConfig] = {
+    ALBANIAN: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="pozitive", neutral="neutrale", negative="negative"
+        ),
+        default_prompt_prefix="Më poshtë janë dokumentet dhe ndjenjat e tyre, të cilat "
+        "mund të jenë {labels_str}.",
+        default_prompt_template="Dokument: {text}\nNdjenja: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlasifikoni ndjenjën në "
+        "dokument. Përgjigjuni vetëm me {labels_str}, dhe asgjë tjetër.",
+    ),
     BOSNIAN: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="pozitivno", neutral="neutralno", negative="negativno"
scandeval/prompt_templates/simplification.py ADDED
@@ -0,0 +1,23 @@
+"""Templates for the Simplification task."""
+
+from ..data_models import PromptConfig
+from ..languages import DUTCH, ENGLISH
+
+SIMPL_TEMPLATES = {
+    ENGLISH: PromptConfig(
+        default_prompt_prefix="The following are documents with accompanying "
+        "simplifications.",
+        default_prompt_template="Document: {text}\nSimplification: {target_text}",
+        default_instruction_prompt="Document: {text}\n\nWrite a simplification "
+        "of the above document.",
+        default_prompt_label_mapping=dict(),
+    ),
+    DUTCH: PromptConfig(
+        default_prompt_prefix="Hieronder volgen documenten met bijbehorende "
+        "versimpelingen.",
+        default_prompt_template="Document: {text}\nVersimpeling: {target_text}",
+        default_instruction_prompt="Document: {text}\n\nVersimpel het "
+        "bovenstaande document.",
+        default_prompt_label_mapping=dict(),
+    ),
+}
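For a sense of how a `PromptConfig` becomes a few-shot prompt, here is a rough `str.format` illustration using the English template above (the package assembles prompts through its own machinery, so this is only a sketch):

```python
prefix = "The following are documents with accompanying simplifications."
template = "Document: {text}\nSimplification: {target_text}"

few_shot = [("The committee convened to deliberate.", "The committee met to discuss.")]
prompt = prefix + "\n\n" + "\n\n".join(
    template.format(text=doc, target_text=simp) for doc, simp in few_shot
)
# Leave the final slot empty for the model to complete.
prompt += "\n\n" + template.format(text="New document to simplify.", target_text="")
print(prompt)
```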
scandeval/prompt_templates/summarization.py CHANGED
@@ -4,6 +4,7 @@ import typing as t

 from ..data_models import PromptConfig
 from ..languages import (
+    ALBANIAN,
     BOSNIAN,
     CATALAN,
     CZECH,
@@ -37,6 +38,16 @@ if t.TYPE_CHECKING:

 # TODO: Missing Faroese
 SUMM_TEMPLATES: dict["Language", PromptConfig] = {
+    ALBANIAN: PromptConfig(
+        default_prompt_prefix=(
+            "Më poshtë janë dokumente me përmbledhje të bashkëngjitura."
+        ),
+        default_prompt_template=("Dokument: {text}\nPërmbledhje: {target_text}"),
+        default_instruction_prompt=(
+            "Dokument: {text}\n\nShkruani një përmbledhje të dokumentit të mësipërm."
+        ),
+        default_prompt_label_mapping=dict(),
+    ),
     BOSNIAN: PromptConfig(
         default_prompt_prefix="Slijede dokumenti s priloženim sažecima.",
         default_prompt_template="Dokument: {text}\nSažetak: {target_text}",
scandeval/tasks.py CHANGED
@@ -11,6 +11,7 @@ from .prompt_templates import (
     NER_TEMPLATES,
     RC_TEMPLATES,
     SENT_TEMPLATES,
+    SIMPL_TEMPLATES,
     SUMM_TEMPLATES,
     TOKEN_CLASSIFICATION_TEMPLATES,
 )
@@ -71,6 +72,16 @@ SENT = Task(
     uses_logprobs=True,
 )

+SIMPL = Task(
+    name="simplification",
+    task_group=TaskGroup.TEXT_TO_TEXT,
+    template_dict=SIMPL_TEMPLATES,
+    metrics=[m.meteor_metric, m.sari_metric],
+    default_num_few_shot_examples=3,
+    default_max_generated_tokens=256,
+    default_labels=[],
+    default_allowed_model_types=[ModelType.GENERATIVE],
+)

 SUMM = Task(
     name="summarization",
scandeval/utils.py CHANGED
@@ -306,14 +306,13 @@ def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:

     Returns:
         The result of the coroutine.
     """
-    loop = asyncio.new_event_loop()
     try:
+        loop = asyncio.get_event_loop()
+    except RuntimeError:  # If the current event loop is closed
+        loop = asyncio.new_event_loop()
         asyncio.set_event_loop(loop)
-        response = loop.run_until_complete(coroutine)
-        return response
-    finally:
-        loop.close()
-        asyncio.set_event_loop(None)
+    response = loop.run_until_complete(coroutine)
+    return response


 async def add_semaphore_and_catch_exception(
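The revised `safe_run` reuses the thread's event loop across calls instead of creating and closing a fresh one each time; the loop is now torn down once, in the litellm.py cleanup shown earlier. A simplified standalone model of the new behaviour (not the package's exact code):

```python
import asyncio

def safe_run(coroutine):
    """Run a coroutine on the current loop, creating one only if needed."""
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:  # No usable loop in this thread, or it was closed
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    return loop.run_until_complete(coroutine)

async def add(a: int, b: int) -> int:
    return a + b

# Two calls now share one loop rather than paying setup/teardown twice.
assert safe_run(add(1, 2)) == 3
assert safe_run(add(3, 4)) == 7
```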
scandeval-16.10.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ScandEval
-Version: 16.9.0
+Version: 16.10.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -50,6 +50,7 @@ Requires-Dist: pydantic>=2.6.0
 Requires-Dist: pyinfer>=0.0.3
 Requires-Dist: python-dotenv>=1.0.1
 Requires-Dist: rouge-score>=0.1.2
+Requires-Dist: sacrebleu>=2.5.1
 Requires-Dist: sacremoses>=0.1.1
 Requires-Dist: scikit-learn==1.6.1
 Requires-Dist: sentencepiece>=0.1.96
@@ -62,11 +63,13 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: ray>=2.53.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: timm>=1.0.19; extra == 'all'
 Requires-Dist: vllm[flashinfer]==0.11.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: ray>=2.53.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: timm>=1.0.19; extra == 'generative'
 Requires-Dist: vllm[flashinfer]==0.11.0; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown
@@ -575,6 +578,20 @@ A huge thank you to all the contributors who have helped make this project a suc
         alt="Contributor avatar for mrkowalski"
     />
 </a>
+<a href="https://github.com/simonevanbruggen">
+    <img
+        src="https://avatars.githubusercontent.com/u/24842609"
+        width=50
+        alt="Contributor avatar for simonevanbruggen"
+    />
+</a>
+<a href="https://github.com/tvosch">
+    <img
+        src="https://avatars.githubusercontent.com/u/110661769"
+        width=50
+        alt="Contributor avatar for tvosch"
+    />
+</a>

 ### Contribute to EuroEval

scandeval-16.10.1.dist-info/RECORD CHANGED
@@ -3,40 +3,41 @@ scandeval/benchmark_config_factory.py,sha256=2stmcqKwx0G9pAiA0atunqDchJ9eoezp1Wh
 scandeval/benchmarker.py,sha256=ARH1ATYAunKNRgIQTDvGqMN_M-ygG0SIQw-hfTOuC6U,53556
 scandeval/caching_utils.py,sha256=lLUbkpDdJZy4xodIpwIz5d-WNKGuszbr_d9dyiJ5kZc,2591
 scandeval/callbacks.py,sha256=l8f6Zr8EoHfVFsI1ZnMUK0Y8uZB00Nvaz_I6XDn6avE,2515
-scandeval/cli.py,sha256=QRpylEtrJ34WXrkrWBL8WPmhjvU_sjh9Z_czNuQt66w,9411
-scandeval/constants.py,sha256=1Ew9yBPNu2blYb3v4HD5V_RGZV_MJ9PXNiakDrwMiGs,3509
+scandeval/cli.py,sha256=zvPGomSdrcjxc4uhmh8SkB4s2d7U9JYhxBJ34vznqUI,9411
+scandeval/constants.py,sha256=wF7fQwaX8yZIypq_eh5RcaQFEhABR7dJxQaAX82b4P8,3766
 scandeval/data_loading.py,sha256=8ryYEmj6di1f9QefGfNajxObQ9iapIGuAsL8m9KzDyI,7050
-scandeval/data_models.py,sha256=FKJudSbSGfc6rRetk0hHrIQxWKlYxz6l5Xf8Tk5zcFU,30228
+scandeval/data_models.py,sha256=vRGKrYr1YFBcH4ngOHrESicbTaIcz-joKz58JN5YMFE,30548
 scandeval/enums.py,sha256=SeFek-Lre2Q5sxbP5svqjDZFZR2vlJhg9dkRH4JvU1g,3436
 scandeval/exceptions.py,sha256=4-N2OIo5PJ2aciLjagNAVhdHPxpq2QxywbBqJ8lkKj0,5780
 scandeval/finetuning.py,sha256=dTjchPHLFRD65ZrEmtj5TfMTPZ6PODn77t372fgTNwE,11983
 scandeval/generation.py,sha256=ccE-S0jxkM99XziIdeaBbk8yRGv4YBkzZkoabhFCSKA,13382
 scandeval/generation_utils.py,sha256=A6YCiiMrMEUHq5BcVEjsouIKMPGt0sCfPzsJY1GVyk0,20092
 scandeval/languages.py,sha256=gUSosFbvf1eEQHjVsKhXdJ4jiGXC-9lMkOL8AsBG33Q,37295
-scandeval/logging_utils.py,sha256=l7eafHBZrx66AGaxT3pngwXYXSlVbew7Ph-pg9zPSpk,9478
+scandeval/logging_utils.py,sha256=Pd6DyHTPHCUsjtriomJboiTB35UdXvzxwnNpGTuec-g,9522
 scandeval/model_cache.py,sha256=sjMYW0klnHt2yAFLavDTsp_InxPeSOuVEFo-Rh_31UM,10219
 scandeval/model_config.py,sha256=fxHfgpw-9vj3hwke28DguVGvG9TU06nkTXT0V6KAMpQ,2761
 scandeval/model_loading.py,sha256=bE51L4-AaVgo9h10UsKH_47CB4tOJGU988HxotQ5sYE,2342
 scandeval/scores.py,sha256=9a1XtppFbp8GJFc9JdThGxqBY0YUE7-92oyrlxScjNk,3281
 scandeval/speed_benchmark.py,sha256=VUOvauc9tuAegThNT2g1a-Z1l7DEmKq57dHI4t16o5A,4068
-scandeval/tasks.py,sha256=XaEI1IKpHU66DII-6D_8FishBur8kZ7Hx4aojqlmf48,5642
+scandeval/tasks.py,sha256=mgE6Vx_1WD9-aY-yeBxc_09Uyz-tqk69xISMWVYcrsY,5980
 scandeval/tokenisation_utils.py,sha256=Sa8V91J4NDFBF-qbConPsQvUkW_02cJp0gySz_Q3NDo,21191
 scandeval/types.py,sha256=-VNeeDEvlNwfemszpvuGb3Dr9Gu3Eqc6XRmR11HLRi4,3293
-scandeval/utils.py,sha256=FkCWe3Olj1Sf5EpDstoJdP7dWKY9Tww4xyrNIs7FDiM,18360
+scandeval/utils.py,sha256=BIAP9TWmY_xv6tuCUgmnYifoeodxlz8N2Q0We3frgLU,18389
 scandeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 scandeval/benchmark_modules/base.py,sha256=5YAsCMILKTRXFx_ylGQ7iS5AFKN25iFdkBjj8KzzElw,11445
 scandeval/benchmark_modules/fresh.py,sha256=sG5ae4p1J-GGmVNcVBIxY1xZIAlUwq_pu-9c4uAYU3Y,10734
 scandeval/benchmark_modules/hf.py,sha256=f89E7XoMqsBHhYnMYBgy7ZuXDsAQ7VaIqMfFrHyjg8g,47363
-scandeval/benchmark_modules/litellm.py,sha256=oHSOfugP_SO9k59UvFUPvbcANzEpfNL-hLD_PzOIkmY,71600
-scandeval/benchmark_modules/vllm.py,sha256=1A_ouFN8svoje6RiETwAl_M5TJnrciSb-oGpTbGyEgg,52450
-scandeval/dataset_configs/__init__.py,sha256=LT-6JXnQVgI9CekcoHLtumYMJrgaen9mQTUQy1Y-4CY,3185
+scandeval/benchmark_modules/litellm.py,sha256=TH35CQhoVinlmfHnAW-XJE21o96YfiIv993m0ASS80E,71590
+scandeval/benchmark_modules/vllm.py,sha256=pFCBuIp2m2KIlVMlqc7sGp1twiENvRHx3ppVs0bFvFo,57319
+scandeval/dataset_configs/__init__.py,sha256=GFI_W9GKd3OSDdhhJzHc8mwoP9b32IHIIyvPBI-hK6k,3223
+scandeval/dataset_configs/albanian.py,sha256=D__dli7JO3yeHzzdJ3FFyUGw-z20f1yI6QLnws-WB8I,1473
 scandeval/dataset_configs/bosnian.py,sha256=golIWqwW1pFwSkuBM1v0yhHDblB2FoJgK24aO7kKm7M,877
 scandeval/dataset_configs/bulgarian.py,sha256=OVoDPTRdU-lVq-xUka7-Ct20h2jbs8HV43KBxRQenIE,1284
 scandeval/dataset_configs/catalan.py,sha256=SXwRJjIcMMN7rVuhFRZSnCGDoMfabW5HFoZOkq0Jpg0,1427
 scandeval/dataset_configs/croatian.py,sha256=U5oBTjttpWTWonTEzZAf-G3nvQICRQmw6Kla-HWn_5k,1260
 scandeval/dataset_configs/czech.py,sha256=ghv2yNw839G-utll8PQRSjyKYbM5gfoQhFKy664GTCI,1562
 scandeval/dataset_configs/danish.py,sha256=LEKs04vK2KnV0CYheT7FeS-g3iHBvf2bQxyl0D_LbTg,3293
-scandeval/dataset_configs/dutch.py,sha256=HB1O7IxQUyOxLg7g0tqcCci1MHaKtZJiFlRJZo2jPr4,3107
+scandeval/dataset_configs/dutch.py,sha256=OZJmaqGguXY5D9hz0zFNrwGQPRXgxZonctSc8Gsy9sY,3550
 scandeval/dataset_configs/english.py,sha256=nc9nGwxf1tHVMUhQeND61yJbpTO4rJaAusPZlstqtq0,2817
 scandeval/dataset_configs/estonian.py,sha256=bWiKA_dJ7WUE8Z_1YZnSewhi4ZdCQBGJZ7pQxkCwMcU,2757
 scandeval/dataset_configs/faroese.py,sha256=13qYwXonDPWG9Av5MY_NBNTRDglPVKz5_mbz7ZCJ_mo,1247
@@ -49,7 +50,7 @@ scandeval/dataset_configs/icelandic.py,sha256=G2Ibe6oF1NknkQmHqLpoHlysW_8f-0G53D
 scandeval/dataset_configs/italian.py,sha256=qhjAQChnQanzs7EyN1DSAJ4OOU41HAlWqWntQOtbWCw,2761
 scandeval/dataset_configs/latvian.py,sha256=wbwIDieq5Lplng5Jzx9LEqq4d8b5LnNOyCUmT64b4bA,1928
 scandeval/dataset_configs/lithuanian.py,sha256=RPqKwsysO1TYeQuEEsbhzGcSFHDX94lk1hgl1CfQaMU,1724
-scandeval/dataset_configs/norwegian.py,sha256=skKKs4V4-zbd-1lpVUaxKXAjTMpBM6SAU5HZ8kcQ2mI,5454
+scandeval/dataset_configs/norwegian.py,sha256=k70T78rTY3pmmVRxG3i_J1j7td_boFHJetkyITskIL0,5487
 scandeval/dataset_configs/polish.py,sha256=nN_NT8cUK2iv1L_zO_aCYOk2R7ACSDZgvI7e0hIaFAM,2074
 scandeval/dataset_configs/portuguese.py,sha256=m9lEeVtI_yNvIdTIEOn3HFK_ilY2tn3-acC981hjZFM,2401
 scandeval/dataset_configs/romanian.py,sha256=AcDp0mqOHmmv3EodovGEcBmarxjLYsXOPr_X4IQoNTw,1472
@@ -61,18 +62,19 @@ scandeval/dataset_configs/swedish.py,sha256=kpEK29swY7iyUSzUvD9hNf2qwb3d7bHrFwbo
 scandeval/dataset_configs/ukrainian.py,sha256=spbCmCOU27jOfz6FZxqCIfVmDN5l8H-7VCl-k-8eAIo,1527
 scandeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,199
 scandeval/metrics/base.py,sha256=dUBby-ZzettMjdcjek6rw0JTZMuScX4cQ2Rd6untKHY,2525
-scandeval/metrics/huggingface.py,sha256=w0iTFIavi4Q4IGJCSFpcCX1ce28e8D6S1WjllNggi18,6735
+scandeval/metrics/huggingface.py,sha256=W1hPuIGBALOogGN2yTGTJUsylsMII3A66fEe9nB8N2k,9493
 scandeval/metrics/llm_as_a_judge.py,sha256=cZ7ZCuB3633T87MjBtAekrBQ_vYaNv1uTcqnI32gNpQ,9837
 scandeval/metrics/pipeline.py,sha256=GTIqaFkn-nTLU4xBi8-zP1J4Ytv3qeFVuRB4OcuwkOw,10876
 scandeval/metrics/speed.py,sha256=G5hEQcrtqxF070ZZwLDh61iZnq2CSW2o6ZM7zR4lOTY,1298
-scandeval/prompt_templates/__init__.py,sha256=HN6Qspqm10ik6RKoPBJsvM-Nng9sywQojZbtbCqj4Z8,475
+scandeval/prompt_templates/__init__.py,sha256=p3CUcSaJiiUm6EQyhceDUjotH7GdyHolMznAn2f44as,519
 scandeval/prompt_templates/classification.py,sha256=QuZh6hTMaqMYTsoruAhwjVP9381zzlQmDIwSeyGnav0,10121
-scandeval/prompt_templates/linguistic_acceptability.py,sha256=bOcmGYa8OgyHRsd5oTS6hPqUsaN_YqQ4hOfb3qo1vhg,14984
-scandeval/prompt_templates/multiple_choice.py,sha256=pbTUcU-n0Zu8NgX2tO-ArdlTJktT_k3onzdKbyFzCdk,12536
-scandeval/prompt_templates/named_entity_recognition.py,sha256=IEGMedQ8VJw1L_lU7JNGp7G9qlmgI3d_8xRB-R9YKPE,29264
-scandeval/prompt_templates/reading_comprehension.py,sha256=mcf8SzDuktmAaqV7gQbZU91cn90fzyFSg32TBkqrWxk,15844
-scandeval/prompt_templates/sentiment_classification.py,sha256=occxjsJuJ0SdqZxpWlsqN9VPE75wTCG8Ii83Pay1ju4,16860
-scandeval/prompt_templates/summarization.py,sha256=fmx3xzSho2LAz1xZe2wQp9DgSWdes-zUtbgEvC6pK5A,10331
+scandeval/prompt_templates/linguistic_acceptability.py,sha256=V31apMLPNhTeDJO6va_04SjuDSXMOJEFurIeSldDi7o,15474
+scandeval/prompt_templates/multiple_choice.py,sha256=pgz-Xb-vUthwJyjla56CxeeXPDkgtZ7Mi9z1J-PjepY,12977
+scandeval/prompt_templates/named_entity_recognition.py,sha256=U9KYr4eIbiMdHECc35CjkNUDoiRd6Jd8w0v35kRWGL4,30197
+scandeval/prompt_templates/reading_comprehension.py,sha256=4C16Mf1MGtEZG9x8PxrJmK1Cxfz9kzjrJLNS725_5oI,16319
+scandeval/prompt_templates/sentiment_classification.py,sha256=mLrhWh0rQTjiowzprv8S5CfLO_g7DvnSjWiw0CsaXpg,17401
+scandeval/prompt_templates/simplification.py,sha256=DF50F1JSxy00ZOO3OJJZOtoTlkGjE35krjjbDaW7RUk,900
+scandeval/prompt_templates/summarization.py,sha256=LKiz5fd6A0J5NyoLBeyrZ4ir1skDB2pytKCEeF4zbmw,10770
 scandeval/prompt_templates/token_classification.py,sha256=8Uw34mN2xQ_5es-nz7vCK-GgDg_oE-zsAzPJPzAxFrQ,15531
 scandeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 scandeval/task_group_utils/multiple_choice_classification.py,sha256=PWUXeGn-9RsXxdVRYHJASyBVQ8L5Jla981eot0GLooY,7316
@@ -80,8 +82,8 @@ scandeval/task_group_utils/question_answering.py,sha256=tuMwr-RnvJap5jkTrluxC1tf
 scandeval/task_group_utils/sequence_classification.py,sha256=VhiggNrB7Gi2x-99MPL0RR2VZRv-wpJerXulgQH6wcU,16556
 scandeval/task_group_utils/text_to_text.py,sha256=p6zzjob70qQUpfUOs0LToSzavE1ERqRAHu_727Jb2mM,5476
 scandeval/task_group_utils/token_classification.py,sha256=8dF32KQAYAFnnn7DPHX-yvJmRrMBmT2CyFREacyTwvQ,17321
-scandeval-16.9.0.dist-info/METADATA,sha256=9zkQ0iVpFbPt8IWSc7C6G3X5_fq6_SL3y3q5IfPAW-U,22858
-scandeval-16.9.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-scandeval-16.9.0.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
-scandeval-16.9.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
-scandeval-16.9.0.dist-info/RECORD,,
+scandeval-16.10.1.dist-info/METADATA,sha256=IYJza42KMRZdoc2-8z9NHaniGAH4K7hT1WHCyFT-Wow,23435
+scandeval-16.10.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+scandeval-16.10.1.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
+scandeval-16.10.1.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+scandeval-16.10.1.dist-info/RECORD,,