ScandEval 16.9.0__py3-none-any.whl → 16.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scandeval/benchmark_modules/litellm.py +14 -13
- scandeval/benchmark_modules/vllm.py +115 -3
- scandeval/cli.py +39 -39
- scandeval/constants.py +9 -0
- scandeval/data_models.py +5 -0
- scandeval/dataset_configs/__init__.py +1 -0
- scandeval/dataset_configs/albanian.py +64 -0
- scandeval/dataset_configs/dutch.py +30 -1
- scandeval/dataset_configs/norwegian.py +3 -3
- scandeval/logging_utils.py +1 -0
- scandeval/metrics/huggingface.py +82 -0
- scandeval/prompt_templates/__init__.py +1 -0
- scandeval/prompt_templates/linguistic_acceptability.py +9 -0
- scandeval/prompt_templates/multiple_choice.py +9 -0
- scandeval/prompt_templates/named_entity_recognition.py +20 -0
- scandeval/prompt_templates/reading_comprehension.py +9 -0
- scandeval/prompt_templates/sentiment_classification.py +11 -0
- scandeval/prompt_templates/simplification.py +23 -0
- scandeval/prompt_templates/summarization.py +11 -0
- scandeval/tasks.py +11 -0
- scandeval/utils.py +5 -6
- {scandeval-16.9.0.dist-info → scandeval-16.10.1.dist-info}/METADATA +18 -1
- {scandeval-16.9.0.dist-info → scandeval-16.10.1.dist-info}/RECORD +26 -24
- {scandeval-16.9.0.dist-info → scandeval-16.10.1.dist-info}/WHEEL +0 -0
- {scandeval-16.9.0.dist-info → scandeval-16.10.1.dist-info}/entry_points.txt +0 -0
- {scandeval-16.9.0.dist-info → scandeval-16.10.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -110,7 +110,7 @@ VOCAB_SIZE_MAPPING = {
|
|
|
110
110
|
# Anthropic models
|
|
111
111
|
r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": -1,
|
|
112
112
|
# Gemini models
|
|
113
|
-
r"(gemini/)?gemini-[1-9]\.[0-9]
|
|
113
|
+
r"(gemini/)?gemini-[1-9](\.[0-9])?-(flash|pro).*": 256_128,
|
|
114
114
|
# xAI models
|
|
115
115
|
r"(xai/)?grok.*": -1,
|
|
116
116
|
}
|
|
@@ -136,7 +136,7 @@ MODEL_MAX_LENGTH_MAPPING = {
|
|
|
136
136
|
# Gemini models
|
|
137
137
|
r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
|
|
138
138
|
r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
|
|
139
|
-
r"(gemini/)?gemini-
|
|
139
|
+
r"(gemini/)?gemini-[23](\.[05])?.*": 1_048_576,
|
|
140
140
|
# xAI models
|
|
141
141
|
r"(xai/)?grok.*": 131_072,
|
|
142
142
|
}
|
|
@@ -152,7 +152,7 @@ NUM_PARAMS_MAPPING = {
|
|
|
152
152
|
# Gemini models
|
|
153
153
|
r"(gemini/)?gemini-1.5-flash-8b": 8_000_000_000,
|
|
154
154
|
r"(gemini/)?gemini-1.5-flash-[0-9]+": -1,
|
|
155
|
-
r"(gemini/)?gemini-
|
|
155
|
+
r"(gemini/)?gemini-[23](.[05])?.*": -1,
|
|
156
156
|
# xAI models
|
|
157
157
|
r"(xai/)?grok.*": -1,
|
|
158
158
|
}
|
|
@@ -208,8 +208,8 @@ class LiteLLMModel(BenchmarkModule):
|
|
|
208
208
|
"thinking",
|
|
209
209
|
],
|
|
210
210
|
# Gemini models
|
|
211
|
-
re.compile(r"(gemini/)?gemini-2
|
|
212
|
-
re.compile(r"(gemini/)?gemini-2
|
|
211
|
+
re.compile(r"(gemini/)?gemini-2\.5-flash-lite.*"): ["no-thinking", "thinking"],
|
|
212
|
+
re.compile(r"(gemini/)?gemini-(2\.5|3)-flash.*"): ["no-thinking", "thinking"],
|
|
213
213
|
# xAI models
|
|
214
214
|
re.compile(r"(xai/)?grok-3-mini(-fast)?(-beta)?"): ["low", "medium", "high"],
|
|
215
215
|
}
|
|
@@ -517,6 +517,7 @@ class LiteLLMModel(BenchmarkModule):
|
|
|
517
517
|
response_format_messages = [
|
|
518
518
|
"got an unexpected keyword argument 'response_format'",
|
|
519
519
|
"the model returned empty outputs",
|
|
520
|
+
"'maxitems' is not supported",
|
|
520
521
|
]
|
|
521
522
|
|
|
522
523
|
if (
|
|
@@ -838,14 +839,14 @@ class LiteLLMModel(BenchmarkModule):
|
|
|
838
839
|
]
|
|
839
840
|
|
|
840
841
|
# Close connections
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
842
|
+
semaphore.release()
|
|
843
|
+
router.reset()
|
|
844
|
+
try:
|
|
845
|
+
loop = asyncio.get_event_loop()
|
|
846
|
+
if not loop.is_closed():
|
|
847
|
+
loop.close()
|
|
848
|
+
except RuntimeError:
|
|
849
|
+
pass # Already closed
|
|
849
850
|
|
|
850
851
|
return successes, failures
|
|
851
852
|
|
|
@@ -15,12 +15,14 @@ from time import sleep
|
|
|
15
15
|
import torch
|
|
16
16
|
from huggingface_hub import snapshot_download
|
|
17
17
|
from pydantic import conlist, create_model
|
|
18
|
+
from transformers.generation.configuration_utils import GenerationConfig
|
|
18
19
|
from transformers.models.auto.configuration_auto import AutoConfig
|
|
19
20
|
from transformers.models.auto.tokenization_auto import AutoTokenizer
|
|
20
21
|
from urllib3.exceptions import RequestError
|
|
21
22
|
|
|
22
23
|
from ..constants import (
|
|
23
24
|
CUSTOM_STOP_TOKENS,
|
|
25
|
+
GENERATION_KWARGS,
|
|
24
26
|
GENERATIVE_PIPELINE_TAGS,
|
|
25
27
|
MAX_CONTEXT_LENGTH,
|
|
26
28
|
MAX_VLLM_LOGPROBS,
|
|
@@ -98,6 +100,10 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
|
|
|
98
100
|
StructuredOutputsParams,
|
|
99
101
|
)
|
|
100
102
|
|
|
103
|
+
if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
|
|
104
|
+
import ray # type: ignore[missing-import]
|
|
105
|
+
|
|
106
|
+
|
|
101
107
|
if t.TYPE_CHECKING:
|
|
102
108
|
from datasets import DatasetDict
|
|
103
109
|
from transformers.trainer import Trainer
|
|
@@ -485,6 +491,41 @@ class VLLMModel(HuggingFaceEncoderModel):
|
|
|
485
491
|
)
|
|
486
492
|
|
|
487
493
|
# Define the parameters used for vLLM generation
|
|
494
|
+
generation_kwargs = GENERATION_KWARGS.copy()
|
|
495
|
+
if (generation_config := self.model_config.generation_config) is not None:
|
|
496
|
+
changed_params = generation_config.to_diff_dict()
|
|
497
|
+
if "temperature" in changed_params:
|
|
498
|
+
temperature = changed_params["temperature"]
|
|
499
|
+
generation_kwargs["temperature"] = temperature
|
|
500
|
+
log_once(
|
|
501
|
+
f"Using temperature={temperature} with the model "
|
|
502
|
+
f"{self.model_config.model_id!r} as specified in its "
|
|
503
|
+
"generation configuration."
|
|
504
|
+
)
|
|
505
|
+
if "top_p" in changed_params:
|
|
506
|
+
top_p = changed_params["top_p"]
|
|
507
|
+
generation_kwargs["top_p"] = top_p
|
|
508
|
+
log_once(
|
|
509
|
+
f"Using top_p={top_p} with the model "
|
|
510
|
+
f"{self.model_config.model_id!r} as specified in its "
|
|
511
|
+
"generation configuration."
|
|
512
|
+
)
|
|
513
|
+
if "top_k" in changed_params:
|
|
514
|
+
top_k = changed_params["top_k"]
|
|
515
|
+
generation_kwargs["top_k"] = top_k
|
|
516
|
+
log_once(
|
|
517
|
+
f"Using top_k={top_k} with the model "
|
|
518
|
+
f"{self.model_config.model_id!r} as specified in its "
|
|
519
|
+
"generation configuration."
|
|
520
|
+
)
|
|
521
|
+
if "repetition_penalty" in changed_params:
|
|
522
|
+
repetition_penalty = changed_params["repetition_penalty"]
|
|
523
|
+
generation_kwargs["repetition_penalty"] = repetition_penalty
|
|
524
|
+
log_once(
|
|
525
|
+
f"Using repetition_penalty={repetition_penalty} with the model "
|
|
526
|
+
f"{self.model_config.model_id!r} as specified in its "
|
|
527
|
+
"generation configuration."
|
|
528
|
+
)
|
|
488
529
|
max_tokens: int = (
|
|
489
530
|
REASONING_MAX_TOKENS
|
|
490
531
|
if self.generative_type == GenerativeType.REASONING
|
|
@@ -495,7 +536,10 @@ class VLLMModel(HuggingFaceEncoderModel):
|
|
|
495
536
|
logprobs=MAX_VLLM_LOGPROBS
|
|
496
537
|
if self.buffer["first_label_token_mapping"]
|
|
497
538
|
else None,
|
|
498
|
-
temperature=
|
|
539
|
+
temperature=generation_kwargs["temperature"],
|
|
540
|
+
top_p=generation_kwargs["top_p"],
|
|
541
|
+
top_k=generation_kwargs["top_k"],
|
|
542
|
+
repetition_penalty=generation_kwargs["repetition_penalty"],
|
|
499
543
|
stop=[stop_token for stop_token in stop_tokens if stop_token],
|
|
500
544
|
structured_outputs=structured_outputs,
|
|
501
545
|
)
|
|
@@ -769,6 +813,16 @@ class VLLMModel(HuggingFaceEncoderModel):
|
|
|
769
813
|
if model_info is None:
|
|
770
814
|
raise InvalidModel(f"The model {model_id!r} could not be found.")
|
|
771
815
|
|
|
816
|
+
try:
|
|
817
|
+
generation_config = GenerationConfig.from_pretrained(
|
|
818
|
+
pretrained_model_name=model_id_components.model_id,
|
|
819
|
+
revision=model_id_components.revision,
|
|
820
|
+
cache_dir=benchmark_config.cache_dir,
|
|
821
|
+
token=benchmark_config.api_key,
|
|
822
|
+
)
|
|
823
|
+
except OSError:
|
|
824
|
+
generation_config = None
|
|
825
|
+
|
|
772
826
|
language_mapping = get_all_languages()
|
|
773
827
|
language_codes = list(language_mapping.keys())
|
|
774
828
|
|
|
@@ -790,6 +844,7 @@ class VLLMModel(HuggingFaceEncoderModel):
|
|
|
790
844
|
cache_dir=benchmark_config.cache_dir, model_id=model_id
|
|
791
845
|
),
|
|
792
846
|
adapter_base_model_id=model_info.adapter_base_model_id,
|
|
847
|
+
generation_config=generation_config,
|
|
793
848
|
)
|
|
794
849
|
|
|
795
850
|
return model_config
|
|
@@ -957,6 +1012,10 @@ def load_model_and_tokeniser(
|
|
|
957
1012
|
|
|
958
1013
|
clear_vllm()
|
|
959
1014
|
|
|
1015
|
+
distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size = (
|
|
1016
|
+
select_backend_and_parallelism()
|
|
1017
|
+
)
|
|
1018
|
+
|
|
960
1019
|
try:
|
|
961
1020
|
model = LLM(
|
|
962
1021
|
model=(
|
|
@@ -975,8 +1034,9 @@ def load_model_and_tokeniser(
|
|
|
975
1034
|
trust_remote_code=benchmark_config.trust_remote_code,
|
|
976
1035
|
revision=revision,
|
|
977
1036
|
seed=4242,
|
|
978
|
-
distributed_executor_backend=
|
|
979
|
-
tensor_parallel_size=
|
|
1037
|
+
distributed_executor_backend=distributed_executor_backend,
|
|
1038
|
+
tensor_parallel_size=tensor_parallel_size,
|
|
1039
|
+
pipeline_parallel_size=pipeline_parallel_size,
|
|
980
1040
|
disable_custom_all_reduce=True,
|
|
981
1041
|
quantization=quantization,
|
|
982
1042
|
dtype=dtype,
|
|
@@ -1379,3 +1439,55 @@ def get_vllm_tokenisation_params(
|
|
|
1379
1439
|
config_format=config_format,
|
|
1380
1440
|
load_format=load_format,
|
|
1381
1441
|
)
|
|
1442
|
+
|
|
1443
|
+
|
|
1444
|
+
def select_backend_and_parallelism() -> tuple[str, int, int]:
    """Determine the distributed backend and parallelism for vLLM.

    Tries to attach to an already running Ray cluster. If that cluster exposes more
    GPUs than the local machine has, Ray is used as the distributed backend, with
    tensor parallelism within a node and pipeline parallelism across nodes.
    Otherwise the multiprocessing backend is used on the local GPUs only.

    Returns:
        Tuple containing:
            - backend (str): "ray" if multi-node Ray is available, else "mp".
            - tensor_parallel_size (int): Number of GPUs per node.
            - pipeline_parallel_size (int): Number of stages across nodes.
    """
    # `ray` is only imported at module level when it is installed, so every use of
    # it has to be guarded to avoid a NameError on installations without Ray
    ray_is_installed = importlib.util.find_spec("ray") is not None

    if ray_is_installed and not ray.is_initialized():
        try:
            ray.init(address="auto", ignore_reinit_error=True)
        except Exception as e:
            # Not finding a running Ray instance is the ordinary single-node case,
            # so only other failures are worth logging
            if "could not find any running ray instance" not in str(e).lower():
                log_once(
                    f"Ray initialisation failed with a {type(e)} exception: {e}",
                    level=logging.DEBUG,
                )

    is_ray = ray_is_installed and ray.is_initialized()
    local_gpu_count = torch.cuda.device_count()

    if is_ray:
        resources = ray.cluster_resources()
        total_gpus = int(resources.get("GPU", 0))
    else:
        total_gpus = local_gpu_count

    # Always use at least one device per node, also on CPU-only machines
    tensor_parallel_size = max(1, local_gpu_count)

    using_multiple_nodes = total_gpus > local_gpu_count
    if is_ray and using_multiple_nodes:
        distributed_executor_backend = "ray"
        pipeline_parallel_size = max(1, total_gpus // tensor_parallel_size)
        log_once(
            f"Detected a multi-node setup with {pipeline_parallel_size:,} nodes, each "
            f"with {tensor_parallel_size:,} GPUs, so using `ray` as the "
            "distributed backend.",
            level=logging.DEBUG,
        )
    else:
        distributed_executor_backend = "mp"
        pipeline_parallel_size = 1
        log_once(
            f"Detected a single-node setup with {tensor_parallel_size:,} GPUs, "
            "so using the multiprocessing distributed backend.",
            level=logging.DEBUG,
        )

    return distributed_executor_backend, tensor_parallel_size, pipeline_parallel_size
|
scandeval/cli.py
CHANGED
|
@@ -37,26 +37,6 @@ from .languages import get_all_languages
|
|
|
37
37
|
help="""The languages to benchmark, both for models and datasets. If "all" then all
|
|
38
38
|
models will be benchmarked on all datasets.""",
|
|
39
39
|
)
|
|
40
|
-
@click.option(
|
|
41
|
-
"--model-language",
|
|
42
|
-
"-ml",
|
|
43
|
-
default=None,
|
|
44
|
-
show_default=True,
|
|
45
|
-
multiple=True,
|
|
46
|
-
metavar="ISO 639-1 LANGUAGE CODE",
|
|
47
|
-
type=click.Choice(["all"] + list(get_all_languages().keys())),
|
|
48
|
-
help="""This option is deprecated - please use --language instead.""",
|
|
49
|
-
)
|
|
50
|
-
@click.option(
|
|
51
|
-
"--dataset-language",
|
|
52
|
-
"-dl",
|
|
53
|
-
default=None,
|
|
54
|
-
show_default=True,
|
|
55
|
-
multiple=True,
|
|
56
|
-
metavar="ISO 639-1 LANGUAGE CODE",
|
|
57
|
-
type=click.Choice(["all"] + list(get_all_languages().keys())),
|
|
58
|
-
help="""This option is deprecated - please use --language instead.""",
|
|
59
|
-
)
|
|
60
40
|
@click.option(
|
|
61
41
|
"--dataset",
|
|
62
42
|
default=None,
|
|
@@ -65,13 +45,6 @@ from .languages import get_all_languages
|
|
|
65
45
|
help="""The name of the benchmark dataset. We recommend to use the `task` and
|
|
66
46
|
`language` options instead of this option.""",
|
|
67
47
|
)
|
|
68
|
-
@click.option(
|
|
69
|
-
"--batch-size",
|
|
70
|
-
default=None,
|
|
71
|
-
type=click.Choice(["1", "2", "4", "8", "16", "32"]),
|
|
72
|
-
help="This option is deprecated - please use --finetuning-batch-size instead.",
|
|
73
|
-
deprecated=True,
|
|
74
|
-
)
|
|
75
48
|
@click.option(
|
|
76
49
|
"--finetuning-batch-size",
|
|
77
50
|
default="32",
|
|
@@ -197,14 +170,6 @@ from .languages import get_all_languages
|
|
|
197
170
|
"faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
|
|
198
171
|
"if you are running out of GPU memory. Only relevant if the model is generative.",
|
|
199
172
|
)
|
|
200
|
-
@click.option(
|
|
201
|
-
"--debug/--no-debug",
|
|
202
|
-
default=False,
|
|
203
|
-
show_default=True,
|
|
204
|
-
help="Whether to run the benchmark in debug mode. This prints out extra "
|
|
205
|
-
"information and stores all outputs to the current working directory. Only "
|
|
206
|
-
"relevant if the model is generative.",
|
|
207
|
-
)
|
|
208
173
|
@click.option(
|
|
209
174
|
"--requires-safetensors",
|
|
210
175
|
is_flag=True,
|
|
@@ -232,15 +197,47 @@ from .languages import get_all_languages
|
|
|
232
197
|
help="Only download the requested model weights and datasets, and exit.",
|
|
233
198
|
default=False,
|
|
234
199
|
)
|
|
200
|
+
@click.option(
|
|
201
|
+
"--debug/--no-debug",
|
|
202
|
+
default=False,
|
|
203
|
+
show_default=True,
|
|
204
|
+
help="Whether to run the benchmark in debug mode. This prints out extra "
|
|
205
|
+
"information and stores all outputs to the current working directory. Only "
|
|
206
|
+
"relevant if the model is generative.",
|
|
207
|
+
)
|
|
208
|
+
@click.option(
|
|
209
|
+
"--model-language",
|
|
210
|
+
"-ml",
|
|
211
|
+
default=None,
|
|
212
|
+
show_default=True,
|
|
213
|
+
multiple=True,
|
|
214
|
+
metavar="ISO 639-1 LANGUAGE CODE",
|
|
215
|
+
type=click.Choice(["all"] + list(get_all_languages().keys())),
|
|
216
|
+
help="""This option is deprecated - please use --language instead.""",
|
|
217
|
+
)
|
|
218
|
+
@click.option(
|
|
219
|
+
"--dataset-language",
|
|
220
|
+
"-dl",
|
|
221
|
+
default=None,
|
|
222
|
+
show_default=True,
|
|
223
|
+
multiple=True,
|
|
224
|
+
metavar="ISO 639-1 LANGUAGE CODE",
|
|
225
|
+
type=click.Choice(["all"] + list(get_all_languages().keys())),
|
|
226
|
+
help="""This option is deprecated - please use --language instead.""",
|
|
227
|
+
)
|
|
228
|
+
@click.option(
|
|
229
|
+
"--batch-size",
|
|
230
|
+
default=None,
|
|
231
|
+
type=click.Choice(["1", "2", "4", "8", "16", "32"]),
|
|
232
|
+
help="This option is deprecated - please use --finetuning-batch-size instead.",
|
|
233
|
+
deprecated=True,
|
|
234
|
+
)
|
|
235
235
|
def benchmark(
|
|
236
236
|
model: tuple[str],
|
|
237
237
|
dataset: tuple[str | DatasetConfig],
|
|
238
238
|
language: tuple[str],
|
|
239
|
-
model_language: tuple[str],
|
|
240
|
-
dataset_language: tuple[str],
|
|
241
239
|
raise_errors: bool,
|
|
242
240
|
task: tuple[str],
|
|
243
|
-
batch_size: str | None,
|
|
244
241
|
finetuning_batch_size: str,
|
|
245
242
|
progress_bar: bool,
|
|
246
243
|
save_results: bool,
|
|
@@ -257,11 +254,14 @@ def benchmark(
|
|
|
257
254
|
api_base: str | None,
|
|
258
255
|
api_version: str | None,
|
|
259
256
|
gpu_memory_utilization: float,
|
|
260
|
-
debug: bool,
|
|
261
257
|
requires_safetensors: bool,
|
|
262
258
|
generative_type: str | None,
|
|
263
259
|
custom_datasets_file: Path,
|
|
264
260
|
download_only: bool,
|
|
261
|
+
debug: bool,
|
|
262
|
+
model_language: tuple[str],
|
|
263
|
+
dataset_language: tuple[str],
|
|
264
|
+
batch_size: str | None,
|
|
265
265
|
) -> None:
|
|
266
266
|
"""Benchmark pretrained language models on language tasks."""
|
|
267
267
|
Benchmarker(
|
scandeval/constants.py
CHANGED
|
@@ -96,3 +96,12 @@ NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
|
|
|
96
96
|
|
|
97
97
|
# We only allow loading local datasets in these file formats
|
|
98
98
|
SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS = ["csv"]
|
|
99
|
+
|
|
100
|
+
# These are default generation parameters, and can be overridden if a generative model
|
|
101
|
+
# has a `generation_config.json` file in its repository
|
|
102
|
+
GENERATION_KWARGS = {
|
|
103
|
+
"temperature": 0.0,
|
|
104
|
+
"top_p": 1.0,
|
|
105
|
+
"top_k": 0,
|
|
106
|
+
"repetition_penalty": 1.0,
|
|
107
|
+
}
|
scandeval/data_models.py
CHANGED
|
@@ -10,6 +10,7 @@ from pathlib import Path
|
|
|
10
10
|
|
|
11
11
|
import pydantic
|
|
12
12
|
import torch
|
|
13
|
+
from transformers.generation.configuration_utils import GenerationConfig
|
|
13
14
|
|
|
14
15
|
from .enums import Device, GenerativeType, ModelType, TaskGroup
|
|
15
16
|
from .exceptions import InvalidBenchmark
|
|
@@ -709,6 +710,9 @@ class ModelConfig:
|
|
|
709
710
|
adapter_base_model_id:
|
|
710
711
|
The model ID of the base model if the model is an adapter model. Can be None
|
|
711
712
|
if the model is not an adapter model.
|
|
713
|
+
generation_config (optional):
|
|
714
|
+
The generation configuration for generative models, if specified in the
|
|
715
|
+
model repository. Defaults to no generation configuration.
|
|
712
716
|
"""
|
|
713
717
|
|
|
714
718
|
model_id: str
|
|
@@ -722,6 +726,7 @@ class ModelConfig:
|
|
|
722
726
|
fresh: bool
|
|
723
727
|
model_cache_dir: str
|
|
724
728
|
adapter_base_model_id: str | None
|
|
729
|
+
generation_config: GenerationConfig | None = None
|
|
725
730
|
|
|
726
731
|
def __hash__(self) -> int:
|
|
727
732
|
"""Return a hash of the model configuration."""
|
|
@@ -6,6 +6,7 @@ from ..data_models import DatasetConfig
|
|
|
6
6
|
from ..languages import get_all_languages
|
|
7
7
|
from ..tasks import SPEED
|
|
8
8
|
from ..utils import load_custom_datasets_module
|
|
9
|
+
from .albanian import * # noqa: F403
|
|
9
10
|
from .bosnian import * # noqa: F403
|
|
10
11
|
from .bulgarian import * # noqa: F403
|
|
11
12
|
from .catalan import * # noqa: F403
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""All Albanian dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import ALBANIAN
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
# Sentiment classification
MMS_SQ_CONFIG = DatasetConfig(
    task=SENT,
    languages=[ALBANIAN],
    name="mms-sq",
    pretty_name="MMS-sq",
    source="EuroEval/mms-sq-mini",
)

# Linguistic acceptability
SCALA_SQ_CONFIG = DatasetConfig(
    task=LA,
    languages=[ALBANIAN],
    name="scala-sq",
    pretty_name="ScaLA-sq",
    source="EuroEval/scala-sq",
)

# Named entity recognition
WIKIANN_SQ_CONFIG = DatasetConfig(
    task=NER,
    languages=[ALBANIAN],
    name="wikiann-sq",
    pretty_name="WikiANN-sq",
    source="EuroEval/wikiann-sq-mini",
)

# Reading comprehension
MULTI_WIKI_QA_SQ_CONFIG = DatasetConfig(
    task=RC,
    languages=[ALBANIAN],
    name="multi-wiki-qa-sq",
    pretty_name="MultiWikiQA-sq",
    source="EuroEval/multi-wiki-qa-sq-mini",
)

# Summarisation
LR_SUM_SQ_CONFIG = DatasetConfig(
    task=SUMM,
    languages=[ALBANIAN],
    name="lr-sum-sq",
    pretty_name="LRSum-sq",
    source="EuroEval/lr-sum-sq-mini",
)

# Uses the KNOW task
GLOBAL_MMLU_LITE_SQ_CONFIG = DatasetConfig(
    task=KNOW,
    languages=[ALBANIAN],
    name="global-mmlu-lite-sq",
    pretty_name="GlobalMMLULite-sq",
    source="EuroEval/global-mmlu-lite-sq",
)

# Uses the COMMON_SENSE task; the only config here that restricts the labels
WINOGRANDE_SQ_CONFIG = DatasetConfig(
    task=COMMON_SENSE,
    languages=[ALBANIAN],
    name="winogrande-sq",
    pretty_name="Winogrande-sq",
    source="EuroEval/winogrande-sq",
    _labels=["a", "b"],
)
|
|
@@ -2,7 +2,18 @@
|
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
4
|
from ..languages import DUTCH
|
|
5
|
-
from ..tasks import
|
|
5
|
+
from ..tasks import (
|
|
6
|
+
COMMON_SENSE,
|
|
7
|
+
EUROPEAN_VALUES,
|
|
8
|
+
KNOW,
|
|
9
|
+
LA,
|
|
10
|
+
MCRC,
|
|
11
|
+
NER,
|
|
12
|
+
RC,
|
|
13
|
+
SENT,
|
|
14
|
+
SIMPL,
|
|
15
|
+
SUMM,
|
|
16
|
+
)
|
|
6
17
|
|
|
7
18
|
### Official datasets ###
|
|
8
19
|
|
|
@@ -63,6 +74,14 @@ HELLASWAG_NL_CONFIG = DatasetConfig(
|
|
|
63
74
|
languages=[DUTCH],
|
|
64
75
|
)
|
|
65
76
|
|
|
77
|
+
DUIDELIJKE_TAAL_NL_CONFIG = DatasetConfig(
|
|
78
|
+
name="duidelijke-taal",
|
|
79
|
+
pretty_name="Duidelijke Taal",
|
|
80
|
+
source="EuroEval/duidelijke-taal",
|
|
81
|
+
task=SIMPL,
|
|
82
|
+
languages=[DUTCH],
|
|
83
|
+
)
|
|
84
|
+
|
|
66
85
|
VALEU_NL_CONFIG = DatasetConfig(
|
|
67
86
|
name="valeu-nl",
|
|
68
87
|
pretty_name="VaLEU-nl",
|
|
@@ -122,6 +141,16 @@ MULTI_WIKI_QA_NL_CONFIG = DatasetConfig(
|
|
|
122
141
|
unofficial=True,
|
|
123
142
|
)
|
|
124
143
|
|
|
144
|
+
COPA_NL_CONFIG = DatasetConfig(
|
|
145
|
+
name="copa-nl",
|
|
146
|
+
pretty_name="COPA-nl",
|
|
147
|
+
source="EuroEval/copa-nl",
|
|
148
|
+
task=COMMON_SENSE,
|
|
149
|
+
languages=[DUTCH],
|
|
150
|
+
unofficial=True,
|
|
151
|
+
_labels=["a", "b"],
|
|
152
|
+
)
|
|
153
|
+
|
|
125
154
|
GOLDENSWAG_NL_CONFIG = DatasetConfig(
|
|
126
155
|
name="goldenswag-nl",
|
|
127
156
|
pretty_name="GoldenSwag-nl",
|
|
@@ -27,7 +27,7 @@ SCALA_NN_CONFIG = DatasetConfig(
|
|
|
27
27
|
pretty_name="ScaLA-nn",
|
|
28
28
|
source="EuroEval/scala-nn",
|
|
29
29
|
task=LA,
|
|
30
|
-
languages=[NORWEGIAN_NYNORSK],
|
|
30
|
+
languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
31
31
|
)
|
|
32
32
|
|
|
33
33
|
NORNE_NB_CONFIG = DatasetConfig(
|
|
@@ -43,7 +43,7 @@ NORNE_NN_CONFIG = DatasetConfig(
|
|
|
43
43
|
pretty_name="NorNE-nn",
|
|
44
44
|
source="EuroEval/norne-nn-mini",
|
|
45
45
|
task=NER,
|
|
46
|
-
languages=[NORWEGIAN_NYNORSK],
|
|
46
|
+
languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
47
47
|
)
|
|
48
48
|
|
|
49
49
|
NORQUAD_CONFIG = DatasetConfig(
|
|
@@ -197,7 +197,7 @@ MULTI_WIKI_QA_NN_CONFIG = DatasetConfig(
|
|
|
197
197
|
pretty_name="MultiWikiQA-nn",
|
|
198
198
|
source="EuroEval/multi-wiki-qa-nn-mini",
|
|
199
199
|
task=RC,
|
|
200
|
-
languages=[NORWEGIAN_NYNORSK],
|
|
200
|
+
languages=[NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
201
201
|
unofficial=True,
|
|
202
202
|
)
|
|
203
203
|
|
scandeval/logging_utils.py
CHANGED
|
@@ -140,6 +140,7 @@ def block_terminal_output() -> None:
|
|
|
140
140
|
logging.getLogger("openai").setLevel(logging.CRITICAL)
|
|
141
141
|
logging.getLogger("httpx").setLevel(logging.CRITICAL)
|
|
142
142
|
litellm.suppress_debug_info = True # type: ignore[bad-assignment]
|
|
143
|
+
litellm.turn_off_message_logging = True
|
|
143
144
|
|
|
144
145
|
# Disable vLLM logging
|
|
145
146
|
logging.getLogger("vllm").setLevel(logging.CRITICAL)
|
scandeval/metrics/huggingface.py
CHANGED
|
@@ -8,6 +8,7 @@ import evaluate
|
|
|
8
8
|
import numpy as np
|
|
9
9
|
from datasets import DownloadConfig, DownloadMode
|
|
10
10
|
|
|
11
|
+
from ..exceptions import InvalidBenchmark
|
|
11
12
|
from ..logging_utils import no_terminal_output
|
|
12
13
|
from .base import Metric
|
|
13
14
|
|
|
@@ -149,6 +150,75 @@ class HuggingFaceMetric(Metric):
|
|
|
149
150
|
return score
|
|
150
151
|
|
|
151
152
|
|
|
153
|
+
class SourceBasedMetric(HuggingFaceMetric):
    """Subclass of HuggingFaceMetric for metrics also requiring source text as input."""

    def __call__(
        self,
        predictions: c.Sequence,
        references: c.Sequence,
        dataset: "Dataset",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
    ) -> float | None:
        """Calculate metric score for metrics requiring original source text.

        Passes the source text to the evaluate function via its `sources` param.

        Args:
            predictions:
                The model predictions.
            references:
                The ground truth references.
            dataset:
                The dataset used for evaluation. This is used for collecting the source
                text and in case any additional metadata is used to compute the metrics.
            dataset_config:
                The dataset configuration.
            benchmark_config:
                The benchmark configuration.

        Returns:
            The calculated metric score, or None if the score should be ignored.

        Raises:
            InvalidBenchmark:
                If no dataset is passed, or if the number of source texts differs
                from the number of predictions.
        """
        if dataset is None:
            raise InvalidBenchmark("SourceBasedMetric requires `dataset` to be passed.")

        # Load the metric lazily, the first time it is needed
        if self.metric is None:
            self.metric = evaluate.load(path=self.huggingface_id)

        # The source texts are read from the `text` column of the dataset
        sources = dataset["text"]

        if len(sources) != len(predictions):
            raise InvalidBenchmark(
                "SourceBasedMetric expects same number of inputs as predictions. "
                f"Got {len(sources)} sources and {len(predictions)} predictions "
                "instead."
            )

        with no_terminal_output(disable=benchmark_config.verbose):
            results = self.metric.compute(
                sources=sources,
                predictions=predictions,
                # Each reference is wrapped in its own single-element list
                references=[[r] for r in references],
                **self.compute_kwargs,
            )

        # The metric returns None if we are running on multi-GPU and the current
        # process is not the main process
        if results is None:
            return None

        # Convert the results to a float score
        score = results[self.results_key]
        if isinstance(score, list):
            score = sum(score) / len(score)
        if isinstance(score, np.floating):
            score = float(score)

        return score
|
|
220
|
+
|
|
221
|
+
|
|
152
222
|
mcc_metric = HuggingFaceMetric(
|
|
153
223
|
name="mcc",
|
|
154
224
|
pretty_name="Matthew's Correlation Coefficient",
|
|
@@ -214,3 +284,15 @@ accuracy_metric = HuggingFaceMetric(
|
|
|
214
284
|
huggingface_id="accuracy",
|
|
215
285
|
results_key="accuracy",
|
|
216
286
|
)
|
|
287
|
+
|
|
288
|
+
meteor_metric = HuggingFaceMetric(
|
|
289
|
+
name="meteor", pretty_name="METEOR", huggingface_id="meteor", results_key="meteor"
|
|
290
|
+
)
|
|
291
|
+
|
|
292
|
+
sari_metric = SourceBasedMetric(
|
|
293
|
+
name="sari",
|
|
294
|
+
pretty_name="SARI",
|
|
295
|
+
huggingface_id="sari",
|
|
296
|
+
results_key="sari",
|
|
297
|
+
postprocessing_fn=lambda x: (x, f"{x:.2f}%"),
|
|
298
|
+
)
|
|
@@ -6,5 +6,6 @@ from .multiple_choice import MULTIPLE_CHOICE_TEMPLATES
|
|
|
6
6
|
from .named_entity_recognition import NER_TEMPLATES
|
|
7
7
|
from .reading_comprehension import RC_TEMPLATES
|
|
8
8
|
from .sentiment_classification import SENT_TEMPLATES
|
|
9
|
+
from .simplification import SIMPL_TEMPLATES
|
|
9
10
|
from .summarization import SUMM_TEMPLATES
|
|
10
11
|
from .token_classification import TOKEN_CLASSIFICATION_TEMPLATES
|
|
@@ -4,6 +4,7 @@ import typing as t
|
|
|
4
4
|
|
|
5
5
|
from ..data_models import PromptConfig
|
|
6
6
|
from ..languages import (
|
|
7
|
+
ALBANIAN,
|
|
7
8
|
BULGARIAN,
|
|
8
9
|
CATALAN,
|
|
9
10
|
CROATIAN,
|
|
@@ -40,6 +41,14 @@ if t.TYPE_CHECKING:
|
|
|
40
41
|
from ..languages import Language
|
|
41
42
|
|
|
42
43
|
LA_TEMPLATES: dict["Language", PromptConfig] = {
|
|
44
|
+
ALBANIAN: PromptConfig(
|
|
45
|
+
default_prompt_label_mapping=dict(correct="po", incorrect="jo"),
|
|
46
|
+
default_prompt_prefix="Më poshtë janë fjali dhe nëse janë gramatikisht të "
|
|
47
|
+
"sakta.",
|
|
48
|
+
default_prompt_template="Fjali: {text}\nGramatikisht e saktë: {label}",
|
|
49
|
+
default_instruction_prompt="Fjali: {text}\n\nPërcaktoni nëse fjalia është "
|
|
50
|
+
"gramatikisht e saktë apo jo. Përgjigjuni me {labels_str}, dhe asgjë tjetër.",
|
|
51
|
+
),
|
|
43
52
|
BULGARIAN: PromptConfig(
|
|
44
53
|
default_prompt_label_mapping=dict(correct="да", incorrect="не"),
|
|
45
54
|
default_prompt_prefix="Следват изречения и дали са граматически правилни.",
|
|
@@ -4,6 +4,7 @@ import typing as t
|
|
|
4
4
|
|
|
5
5
|
from ..data_models import PromptConfig
|
|
6
6
|
from ..languages import (
|
|
7
|
+
ALBANIAN,
|
|
7
8
|
BULGARIAN,
|
|
8
9
|
CATALAN,
|
|
9
10
|
CROATIAN,
|
|
@@ -40,6 +41,14 @@ if t.TYPE_CHECKING:
|
|
|
40
41
|
|
|
41
42
|
# TODO: Missing Faroese
|
|
42
43
|
MULTIPLE_CHOICE_TEMPLATES: dict["Language", PromptConfig] = {
|
|
44
|
+
ALBANIAN: PromptConfig(
|
|
45
|
+
default_prompt_prefix="Më poshtë janë pyetje me zgjedhje të shumëfishtë "
|
|
46
|
+
"(me përgjigje).",
|
|
47
|
+
default_prompt_template="Pyetje: {text}\nPërgjigje: {label}",
|
|
48
|
+
default_instruction_prompt="Pyetje: {text}\n\nPërgjigjuni pyetjes së "
|
|
49
|
+
"mësipërme duke u përgjigjur me {labels_str}, dhe asgjë tjetër.",
|
|
50
|
+
default_prompt_label_mapping="auto",
|
|
51
|
+
),
|
|
43
52
|
BULGARIAN: PromptConfig(
|
|
44
53
|
default_prompt_prefix="Следват въпроси с множествен избор (с отговори).",
|
|
45
54
|
default_prompt_template="Въпрос: {text}\nОтговор: {label}",
|
|
@@ -4,6 +4,7 @@ import typing as t
|
|
|
4
4
|
|
|
5
5
|
from ..data_models import PromptConfig
|
|
6
6
|
from ..languages import (
|
|
7
|
+
ALBANIAN,
|
|
7
8
|
BOSNIAN,
|
|
8
9
|
BULGARIAN,
|
|
9
10
|
CATALAN,
|
|
@@ -42,6 +43,25 @@ if t.TYPE_CHECKING:
|
|
|
42
43
|
|
|
43
44
|
|
|
44
45
|
NER_TEMPLATES: dict["Language", PromptConfig] = {
|
|
46
|
+
ALBANIAN: PromptConfig(
|
|
47
|
+
default_prompt_label_mapping={
|
|
48
|
+
"b-per": "person",
|
|
49
|
+
"i-per": "person",
|
|
50
|
+
"b-loc": "vendndodhje",
|
|
51
|
+
"i-loc": "vendndodhje",
|
|
52
|
+
"b-org": "organizatë",
|
|
53
|
+
"i-org": "organizatë",
|
|
54
|
+
"b-misc": "të ndryshme",
|
|
55
|
+
"i-misc": "të ndryshme",
|
|
56
|
+
},
|
|
57
|
+
default_prompt_prefix="Më poshtë janë fjali dhe fjalorë JSON me entitetet e "
|
|
58
|
+
"emërtuara që shfaqen në fjalinë e dhënë.",
|
|
59
|
+
default_prompt_template="Fjali: {text}\nEntitete të emërtuara: {label}",
|
|
60
|
+
default_instruction_prompt="Fjali: {text}\n\nIdentifikoni entitetet e "
|
|
61
|
+
"emërtuara në fjali. Duhet t’i jepni ato si një fjalor JSON me çelësat "
|
|
62
|
+
"{labels_str}. Vlerat duhet të jenë lista të entiteteve të emërtuara të atij "
|
|
63
|
+
"lloji, saktësisht ashtu siç shfaqen në fjali.",
|
|
64
|
+
),
|
|
45
65
|
BOSNIAN: PromptConfig(
|
|
46
66
|
default_prompt_label_mapping={
|
|
47
67
|
"b-per": "osoba",
|
|
@@ -4,6 +4,7 @@ import typing as t
|
|
|
4
4
|
|
|
5
5
|
from ..data_models import PromptConfig
|
|
6
6
|
from ..languages import (
|
|
7
|
+
ALBANIAN,
|
|
7
8
|
BOSNIAN,
|
|
8
9
|
BULGARIAN,
|
|
9
10
|
CATALAN,
|
|
@@ -41,6 +42,14 @@ if t.TYPE_CHECKING:
|
|
|
41
42
|
from ..languages import Language
|
|
42
43
|
|
|
43
44
|
RC_TEMPLATES: dict["Language", PromptConfig] = {
|
|
45
|
+
ALBANIAN: PromptConfig(
|
|
46
|
+
default_prompt_prefix="Më poshtë janë tekste me pyetje dhe përgjigje.",
|
|
47
|
+
default_prompt_template="Tekst: {text}\nPyetje: {question}\nPërgjigje me "
|
|
48
|
+
"maksimum 3 fjalë: {label}",
|
|
49
|
+
default_instruction_prompt="Tekst: {text}\n\nPërgjigjuni pyetjes së mëposhtme "
|
|
50
|
+
"rreth tekstit të mësipërm me maksimum 3 fjalë.\n\nPyetje: {question}",
|
|
51
|
+
default_prompt_label_mapping=dict(),
|
|
52
|
+
),
|
|
44
53
|
BOSNIAN: PromptConfig(
|
|
45
54
|
default_prompt_prefix="Slijede tekstovi s pitanjima i odgovorima.",
|
|
46
55
|
default_prompt_template="Tekst: {text}\nPitanje: {question}\nOdgovor s "
|
|
@@ -4,6 +4,7 @@ import typing as t
|
|
|
4
4
|
|
|
5
5
|
from ..data_models import PromptConfig
|
|
6
6
|
from ..languages import (
|
|
7
|
+
ALBANIAN,
|
|
7
8
|
BOSNIAN,
|
|
8
9
|
BULGARIAN,
|
|
9
10
|
CATALAN,
|
|
@@ -41,6 +42,16 @@ if t.TYPE_CHECKING:
|
|
|
41
42
|
from ..languages import Language
|
|
42
43
|
|
|
43
44
|
SENT_TEMPLATES: dict["Language", PromptConfig] = {
|
|
45
|
+
ALBANIAN: PromptConfig(
|
|
46
|
+
default_prompt_label_mapping=dict(
|
|
47
|
+
positive="pozitive", neutral="neutrale", negative="negative"
|
|
48
|
+
),
|
|
49
|
+
default_prompt_prefix="Më poshtë janë dokumentet dhe ndjenjat e tyre, të cilat "
|
|
50
|
+
"mund të jenë {labels_str}.",
|
|
51
|
+
default_prompt_template="Dokument: {text}\nNdjenja: {label}",
|
|
52
|
+
default_instruction_prompt="Dokument: {text}\n\nKlasifikoni ndjenjën në "
|
|
53
|
+
"dokument. Përgjigjuni vetëm me {labels_str}, dhe asgjë tjetër.",
|
|
54
|
+
),
|
|
44
55
|
BOSNIAN: PromptConfig(
|
|
45
56
|
default_prompt_label_mapping=dict(
|
|
46
57
|
positive="pozitivno", neutral="neutralno", negative="negativno"
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Templates for the Simplification task."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import PromptConfig
|
|
4
|
+
from ..languages import DUTCH, ENGLISH
|
|
5
|
+
|
|
6
|
+
SIMPL_TEMPLATES = {
|
|
7
|
+
ENGLISH: PromptConfig(
|
|
8
|
+
default_prompt_prefix="The following are documents with accompanying "
|
|
9
|
+
"simplifications.",
|
|
10
|
+
default_prompt_template="Document: {text}\nSimplification: {target_text}",
|
|
11
|
+
default_instruction_prompt="Document: {text}\n\nWrite a simplification "
|
|
12
|
+
"of the above document.",
|
|
13
|
+
default_prompt_label_mapping=dict(),
|
|
14
|
+
),
|
|
15
|
+
DUTCH: PromptConfig(
|
|
16
|
+
default_prompt_prefix="Hieronder volgen documenten met bijbehorende "
|
|
17
|
+
"versimpelingen.",
|
|
18
|
+
default_prompt_template="Document: {text}\nVersimpeling: {target_text}",
|
|
19
|
+
default_instruction_prompt="Document: {text}\n\nVersimpel het "
|
|
20
|
+
"bovenstaande document.",
|
|
21
|
+
default_prompt_label_mapping=dict(),
|
|
22
|
+
),
|
|
23
|
+
}
|
|
@@ -4,6 +4,7 @@ import typing as t
|
|
|
4
4
|
|
|
5
5
|
from ..data_models import PromptConfig
|
|
6
6
|
from ..languages import (
|
|
7
|
+
ALBANIAN,
|
|
7
8
|
BOSNIAN,
|
|
8
9
|
CATALAN,
|
|
9
10
|
CZECH,
|
|
@@ -37,6 +38,16 @@ if t.TYPE_CHECKING:
|
|
|
37
38
|
|
|
38
39
|
# TODO: Missing Faroese
|
|
39
40
|
SUMM_TEMPLATES: dict["Language", PromptConfig] = {
|
|
41
|
+
ALBANIAN: PromptConfig(
|
|
42
|
+
default_prompt_prefix=(
|
|
43
|
+
"Më poshtë janë dokumente me përmbledhje të bashkëngjitura."
|
|
44
|
+
),
|
|
45
|
+
default_prompt_template=("Dokument: {text}\nPërmbledhje: {target_text}"),
|
|
46
|
+
default_instruction_prompt=(
|
|
47
|
+
"Dokument: {text}\n\nShkruani një përmbledhje të dokumentit të mësipërm."
|
|
48
|
+
),
|
|
49
|
+
default_prompt_label_mapping=dict(),
|
|
50
|
+
),
|
|
40
51
|
BOSNIAN: PromptConfig(
|
|
41
52
|
default_prompt_prefix="Slijede dokumenti s priloženim sažecima.",
|
|
42
53
|
default_prompt_template="Dokument: {text}\nSažetak: {target_text}",
|
scandeval/tasks.py
CHANGED
|
@@ -11,6 +11,7 @@ from .prompt_templates import (
|
|
|
11
11
|
NER_TEMPLATES,
|
|
12
12
|
RC_TEMPLATES,
|
|
13
13
|
SENT_TEMPLATES,
|
|
14
|
+
SIMPL_TEMPLATES,
|
|
14
15
|
SUMM_TEMPLATES,
|
|
15
16
|
TOKEN_CLASSIFICATION_TEMPLATES,
|
|
16
17
|
)
|
|
@@ -71,6 +72,16 @@ SENT = Task(
|
|
|
71
72
|
uses_logprobs=True,
|
|
72
73
|
)
|
|
73
74
|
|
|
75
|
+
SIMPL = Task(
|
|
76
|
+
name="simplification",
|
|
77
|
+
task_group=TaskGroup.TEXT_TO_TEXT,
|
|
78
|
+
template_dict=SIMPL_TEMPLATES,
|
|
79
|
+
metrics=[m.meteor_metric, m.sari_metric],
|
|
80
|
+
default_num_few_shot_examples=3,
|
|
81
|
+
default_max_generated_tokens=256,
|
|
82
|
+
default_labels=[],
|
|
83
|
+
default_allowed_model_types=[ModelType.GENERATIVE],
|
|
84
|
+
)
|
|
74
85
|
|
|
75
86
|
SUMM = Task(
|
|
76
87
|
name="summarization",
|
scandeval/utils.py
CHANGED
|
@@ -306,14 +306,13 @@ def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
|
|
|
306
306
|
Returns:
|
|
307
307
|
The result of the coroutine.
|
|
308
308
|
"""
|
|
309
|
-
loop = asyncio.new_event_loop()
|
|
310
309
|
try:
|
|
310
|
+
loop = asyncio.get_event_loop()
|
|
311
|
+
except RuntimeError: # If the current event loop is closed
|
|
312
|
+
loop = asyncio.new_event_loop()
|
|
311
313
|
asyncio.set_event_loop(loop)
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
finally:
|
|
315
|
-
loop.close()
|
|
316
|
-
asyncio.set_event_loop(None)
|
|
314
|
+
response = loop.run_until_complete(coroutine)
|
|
315
|
+
return response
|
|
317
316
|
|
|
318
317
|
|
|
319
318
|
async def add_semaphore_and_catch_exception(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ScandEval
|
|
3
|
-
Version: 16.
|
|
3
|
+
Version: 16.10.1
|
|
4
4
|
Summary: The robust European language model benchmark.
|
|
5
5
|
Project-URL: Repository, https://github.com/EuroEval/EuroEval
|
|
6
6
|
Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
|
|
@@ -50,6 +50,7 @@ Requires-Dist: pydantic>=2.6.0
|
|
|
50
50
|
Requires-Dist: pyinfer>=0.0.3
|
|
51
51
|
Requires-Dist: python-dotenv>=1.0.1
|
|
52
52
|
Requires-Dist: rouge-score>=0.1.2
|
|
53
|
+
Requires-Dist: sacrebleu>=2.5.1
|
|
53
54
|
Requires-Dist: sacremoses>=0.1.1
|
|
54
55
|
Requires-Dist: scikit-learn==1.6.1
|
|
55
56
|
Requires-Dist: sentencepiece>=0.1.96
|
|
@@ -62,11 +63,13 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
|
|
|
62
63
|
Provides-Extra: all
|
|
63
64
|
Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
|
|
64
65
|
Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
|
|
66
|
+
Requires-Dist: ray>=2.53.0; (platform_system == 'Linux') and extra == 'all'
|
|
65
67
|
Requires-Dist: timm>=1.0.19; extra == 'all'
|
|
66
68
|
Requires-Dist: vllm[flashinfer]==0.11.0; (platform_system == 'Linux') and extra == 'all'
|
|
67
69
|
Provides-Extra: generative
|
|
68
70
|
Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
|
|
69
71
|
Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
|
|
72
|
+
Requires-Dist: ray>=2.53.0; (platform_system == 'Linux') and extra == 'generative'
|
|
70
73
|
Requires-Dist: timm>=1.0.19; extra == 'generative'
|
|
71
74
|
Requires-Dist: vllm[flashinfer]==0.11.0; (platform_system == 'Linux') and extra == 'generative'
|
|
72
75
|
Description-Content-Type: text/markdown
|
|
@@ -575,6 +578,20 @@ A huge thank you to all the contributors who have helped make this project a suc
|
|
|
575
578
|
alt="Contributor avatar for mrkowalski"
|
|
576
579
|
/>
|
|
577
580
|
</a>
|
|
581
|
+
<a href="https://github.com/simonevanbruggen">
|
|
582
|
+
<img
|
|
583
|
+
src="https://avatars.githubusercontent.com/u/24842609"
|
|
584
|
+
width=50
|
|
585
|
+
alt="Contributor avatar for simonevanbruggen"
|
|
586
|
+
/>
|
|
587
|
+
</a>
|
|
588
|
+
<a href="https://github.com/tvosch">
|
|
589
|
+
<img
|
|
590
|
+
src="https://avatars.githubusercontent.com/u/110661769"
|
|
591
|
+
width=50
|
|
592
|
+
alt="Contributor avatar for tvosch"
|
|
593
|
+
/>
|
|
594
|
+
</a>
|
|
578
595
|
|
|
579
596
|
### Contribute to EuroEval
|
|
580
597
|
|
|
@@ -3,40 +3,41 @@ scandeval/benchmark_config_factory.py,sha256=2stmcqKwx0G9pAiA0atunqDchJ9eoezp1Wh
|
|
|
3
3
|
scandeval/benchmarker.py,sha256=ARH1ATYAunKNRgIQTDvGqMN_M-ygG0SIQw-hfTOuC6U,53556
|
|
4
4
|
scandeval/caching_utils.py,sha256=lLUbkpDdJZy4xodIpwIz5d-WNKGuszbr_d9dyiJ5kZc,2591
|
|
5
5
|
scandeval/callbacks.py,sha256=l8f6Zr8EoHfVFsI1ZnMUK0Y8uZB00Nvaz_I6XDn6avE,2515
|
|
6
|
-
scandeval/cli.py,sha256=
|
|
7
|
-
scandeval/constants.py,sha256=
|
|
6
|
+
scandeval/cli.py,sha256=zvPGomSdrcjxc4uhmh8SkB4s2d7U9JYhxBJ34vznqUI,9411
|
|
7
|
+
scandeval/constants.py,sha256=wF7fQwaX8yZIypq_eh5RcaQFEhABR7dJxQaAX82b4P8,3766
|
|
8
8
|
scandeval/data_loading.py,sha256=8ryYEmj6di1f9QefGfNajxObQ9iapIGuAsL8m9KzDyI,7050
|
|
9
|
-
scandeval/data_models.py,sha256=
|
|
9
|
+
scandeval/data_models.py,sha256=vRGKrYr1YFBcH4ngOHrESicbTaIcz-joKz58JN5YMFE,30548
|
|
10
10
|
scandeval/enums.py,sha256=SeFek-Lre2Q5sxbP5svqjDZFZR2vlJhg9dkRH4JvU1g,3436
|
|
11
11
|
scandeval/exceptions.py,sha256=4-N2OIo5PJ2aciLjagNAVhdHPxpq2QxywbBqJ8lkKj0,5780
|
|
12
12
|
scandeval/finetuning.py,sha256=dTjchPHLFRD65ZrEmtj5TfMTPZ6PODn77t372fgTNwE,11983
|
|
13
13
|
scandeval/generation.py,sha256=ccE-S0jxkM99XziIdeaBbk8yRGv4YBkzZkoabhFCSKA,13382
|
|
14
14
|
scandeval/generation_utils.py,sha256=A6YCiiMrMEUHq5BcVEjsouIKMPGt0sCfPzsJY1GVyk0,20092
|
|
15
15
|
scandeval/languages.py,sha256=gUSosFbvf1eEQHjVsKhXdJ4jiGXC-9lMkOL8AsBG33Q,37295
|
|
16
|
-
scandeval/logging_utils.py,sha256=
|
|
16
|
+
scandeval/logging_utils.py,sha256=Pd6DyHTPHCUsjtriomJboiTB35UdXvzxwnNpGTuec-g,9522
|
|
17
17
|
scandeval/model_cache.py,sha256=sjMYW0klnHt2yAFLavDTsp_InxPeSOuVEFo-Rh_31UM,10219
|
|
18
18
|
scandeval/model_config.py,sha256=fxHfgpw-9vj3hwke28DguVGvG9TU06nkTXT0V6KAMpQ,2761
|
|
19
19
|
scandeval/model_loading.py,sha256=bE51L4-AaVgo9h10UsKH_47CB4tOJGU988HxotQ5sYE,2342
|
|
20
20
|
scandeval/scores.py,sha256=9a1XtppFbp8GJFc9JdThGxqBY0YUE7-92oyrlxScjNk,3281
|
|
21
21
|
scandeval/speed_benchmark.py,sha256=VUOvauc9tuAegThNT2g1a-Z1l7DEmKq57dHI4t16o5A,4068
|
|
22
|
-
scandeval/tasks.py,sha256=
|
|
22
|
+
scandeval/tasks.py,sha256=mgE6Vx_1WD9-aY-yeBxc_09Uyz-tqk69xISMWVYcrsY,5980
|
|
23
23
|
scandeval/tokenisation_utils.py,sha256=Sa8V91J4NDFBF-qbConPsQvUkW_02cJp0gySz_Q3NDo,21191
|
|
24
24
|
scandeval/types.py,sha256=-VNeeDEvlNwfemszpvuGb3Dr9Gu3Eqc6XRmR11HLRi4,3293
|
|
25
|
-
scandeval/utils.py,sha256=
|
|
25
|
+
scandeval/utils.py,sha256=BIAP9TWmY_xv6tuCUgmnYifoeodxlz8N2Q0We3frgLU,18389
|
|
26
26
|
scandeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
|
|
27
27
|
scandeval/benchmark_modules/base.py,sha256=5YAsCMILKTRXFx_ylGQ7iS5AFKN25iFdkBjj8KzzElw,11445
|
|
28
28
|
scandeval/benchmark_modules/fresh.py,sha256=sG5ae4p1J-GGmVNcVBIxY1xZIAlUwq_pu-9c4uAYU3Y,10734
|
|
29
29
|
scandeval/benchmark_modules/hf.py,sha256=f89E7XoMqsBHhYnMYBgy7ZuXDsAQ7VaIqMfFrHyjg8g,47363
|
|
30
|
-
scandeval/benchmark_modules/litellm.py,sha256=
|
|
31
|
-
scandeval/benchmark_modules/vllm.py,sha256=
|
|
32
|
-
scandeval/dataset_configs/__init__.py,sha256=
|
|
30
|
+
scandeval/benchmark_modules/litellm.py,sha256=TH35CQhoVinlmfHnAW-XJE21o96YfiIv993m0ASS80E,71590
|
|
31
|
+
scandeval/benchmark_modules/vllm.py,sha256=pFCBuIp2m2KIlVMlqc7sGp1twiENvRHx3ppVs0bFvFo,57319
|
|
32
|
+
scandeval/dataset_configs/__init__.py,sha256=GFI_W9GKd3OSDdhhJzHc8mwoP9b32IHIIyvPBI-hK6k,3223
|
|
33
|
+
scandeval/dataset_configs/albanian.py,sha256=D__dli7JO3yeHzzdJ3FFyUGw-z20f1yI6QLnws-WB8I,1473
|
|
33
34
|
scandeval/dataset_configs/bosnian.py,sha256=golIWqwW1pFwSkuBM1v0yhHDblB2FoJgK24aO7kKm7M,877
|
|
34
35
|
scandeval/dataset_configs/bulgarian.py,sha256=OVoDPTRdU-lVq-xUka7-Ct20h2jbs8HV43KBxRQenIE,1284
|
|
35
36
|
scandeval/dataset_configs/catalan.py,sha256=SXwRJjIcMMN7rVuhFRZSnCGDoMfabW5HFoZOkq0Jpg0,1427
|
|
36
37
|
scandeval/dataset_configs/croatian.py,sha256=U5oBTjttpWTWonTEzZAf-G3nvQICRQmw6Kla-HWn_5k,1260
|
|
37
38
|
scandeval/dataset_configs/czech.py,sha256=ghv2yNw839G-utll8PQRSjyKYbM5gfoQhFKy664GTCI,1562
|
|
38
39
|
scandeval/dataset_configs/danish.py,sha256=LEKs04vK2KnV0CYheT7FeS-g3iHBvf2bQxyl0D_LbTg,3293
|
|
39
|
-
scandeval/dataset_configs/dutch.py,sha256=
|
|
40
|
+
scandeval/dataset_configs/dutch.py,sha256=OZJmaqGguXY5D9hz0zFNrwGQPRXgxZonctSc8Gsy9sY,3550
|
|
40
41
|
scandeval/dataset_configs/english.py,sha256=nc9nGwxf1tHVMUhQeND61yJbpTO4rJaAusPZlstqtq0,2817
|
|
41
42
|
scandeval/dataset_configs/estonian.py,sha256=bWiKA_dJ7WUE8Z_1YZnSewhi4ZdCQBGJZ7pQxkCwMcU,2757
|
|
42
43
|
scandeval/dataset_configs/faroese.py,sha256=13qYwXonDPWG9Av5MY_NBNTRDglPVKz5_mbz7ZCJ_mo,1247
|
|
@@ -49,7 +50,7 @@ scandeval/dataset_configs/icelandic.py,sha256=G2Ibe6oF1NknkQmHqLpoHlysW_8f-0G53D
|
|
|
49
50
|
scandeval/dataset_configs/italian.py,sha256=qhjAQChnQanzs7EyN1DSAJ4OOU41HAlWqWntQOtbWCw,2761
|
|
50
51
|
scandeval/dataset_configs/latvian.py,sha256=wbwIDieq5Lplng5Jzx9LEqq4d8b5LnNOyCUmT64b4bA,1928
|
|
51
52
|
scandeval/dataset_configs/lithuanian.py,sha256=RPqKwsysO1TYeQuEEsbhzGcSFHDX94lk1hgl1CfQaMU,1724
|
|
52
|
-
scandeval/dataset_configs/norwegian.py,sha256=
|
|
53
|
+
scandeval/dataset_configs/norwegian.py,sha256=k70T78rTY3pmmVRxG3i_J1j7td_boFHJetkyITskIL0,5487
|
|
53
54
|
scandeval/dataset_configs/polish.py,sha256=nN_NT8cUK2iv1L_zO_aCYOk2R7ACSDZgvI7e0hIaFAM,2074
|
|
54
55
|
scandeval/dataset_configs/portuguese.py,sha256=m9lEeVtI_yNvIdTIEOn3HFK_ilY2tn3-acC981hjZFM,2401
|
|
55
56
|
scandeval/dataset_configs/romanian.py,sha256=AcDp0mqOHmmv3EodovGEcBmarxjLYsXOPr_X4IQoNTw,1472
|
|
@@ -61,18 +62,19 @@ scandeval/dataset_configs/swedish.py,sha256=kpEK29swY7iyUSzUvD9hNf2qwb3d7bHrFwbo
|
|
|
61
62
|
scandeval/dataset_configs/ukrainian.py,sha256=spbCmCOU27jOfz6FZxqCIfVmDN5l8H-7VCl-k-8eAIo,1527
|
|
62
63
|
scandeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,199
|
|
63
64
|
scandeval/metrics/base.py,sha256=dUBby-ZzettMjdcjek6rw0JTZMuScX4cQ2Rd6untKHY,2525
|
|
64
|
-
scandeval/metrics/huggingface.py,sha256=
|
|
65
|
+
scandeval/metrics/huggingface.py,sha256=W1hPuIGBALOogGN2yTGTJUsylsMII3A66fEe9nB8N2k,9493
|
|
65
66
|
scandeval/metrics/llm_as_a_judge.py,sha256=cZ7ZCuB3633T87MjBtAekrBQ_vYaNv1uTcqnI32gNpQ,9837
|
|
66
67
|
scandeval/metrics/pipeline.py,sha256=GTIqaFkn-nTLU4xBi8-zP1J4Ytv3qeFVuRB4OcuwkOw,10876
|
|
67
68
|
scandeval/metrics/speed.py,sha256=G5hEQcrtqxF070ZZwLDh61iZnq2CSW2o6ZM7zR4lOTY,1298
|
|
68
|
-
scandeval/prompt_templates/__init__.py,sha256=
|
|
69
|
+
scandeval/prompt_templates/__init__.py,sha256=p3CUcSaJiiUm6EQyhceDUjotH7GdyHolMznAn2f44as,519
|
|
69
70
|
scandeval/prompt_templates/classification.py,sha256=QuZh6hTMaqMYTsoruAhwjVP9381zzlQmDIwSeyGnav0,10121
|
|
70
|
-
scandeval/prompt_templates/linguistic_acceptability.py,sha256=
|
|
71
|
-
scandeval/prompt_templates/multiple_choice.py,sha256=
|
|
72
|
-
scandeval/prompt_templates/named_entity_recognition.py,sha256=
|
|
73
|
-
scandeval/prompt_templates/reading_comprehension.py,sha256=
|
|
74
|
-
scandeval/prompt_templates/sentiment_classification.py,sha256=
|
|
75
|
-
scandeval/prompt_templates/
|
|
71
|
+
scandeval/prompt_templates/linguistic_acceptability.py,sha256=V31apMLPNhTeDJO6va_04SjuDSXMOJEFurIeSldDi7o,15474
|
|
72
|
+
scandeval/prompt_templates/multiple_choice.py,sha256=pgz-Xb-vUthwJyjla56CxeeXPDkgtZ7Mi9z1J-PjepY,12977
|
|
73
|
+
scandeval/prompt_templates/named_entity_recognition.py,sha256=U9KYr4eIbiMdHECc35CjkNUDoiRd6Jd8w0v35kRWGL4,30197
|
|
74
|
+
scandeval/prompt_templates/reading_comprehension.py,sha256=4C16Mf1MGtEZG9x8PxrJmK1Cxfz9kzjrJLNS725_5oI,16319
|
|
75
|
+
scandeval/prompt_templates/sentiment_classification.py,sha256=mLrhWh0rQTjiowzprv8S5CfLO_g7DvnSjWiw0CsaXpg,17401
|
|
76
|
+
scandeval/prompt_templates/simplification.py,sha256=DF50F1JSxy00ZOO3OJJZOtoTlkGjE35krjjbDaW7RUk,900
|
|
77
|
+
scandeval/prompt_templates/summarization.py,sha256=LKiz5fd6A0J5NyoLBeyrZ4ir1skDB2pytKCEeF4zbmw,10770
|
|
76
78
|
scandeval/prompt_templates/token_classification.py,sha256=8Uw34mN2xQ_5es-nz7vCK-GgDg_oE-zsAzPJPzAxFrQ,15531
|
|
77
79
|
scandeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
|
|
78
80
|
scandeval/task_group_utils/multiple_choice_classification.py,sha256=PWUXeGn-9RsXxdVRYHJASyBVQ8L5Jla981eot0GLooY,7316
|
|
@@ -80,8 +82,8 @@ scandeval/task_group_utils/question_answering.py,sha256=tuMwr-RnvJap5jkTrluxC1tf
|
|
|
80
82
|
scandeval/task_group_utils/sequence_classification.py,sha256=VhiggNrB7Gi2x-99MPL0RR2VZRv-wpJerXulgQH6wcU,16556
|
|
81
83
|
scandeval/task_group_utils/text_to_text.py,sha256=p6zzjob70qQUpfUOs0LToSzavE1ERqRAHu_727Jb2mM,5476
|
|
82
84
|
scandeval/task_group_utils/token_classification.py,sha256=8dF32KQAYAFnnn7DPHX-yvJmRrMBmT2CyFREacyTwvQ,17321
|
|
83
|
-
scandeval-16.
|
|
84
|
-
scandeval-16.
|
|
85
|
-
scandeval-16.
|
|
86
|
-
scandeval-16.
|
|
87
|
-
scandeval-16.
|
|
85
|
+
scandeval-16.10.1.dist-info/METADATA,sha256=IYJza42KMRZdoc2-8z9NHaniGAH4K7hT1WHCyFT-Wow,23435
|
|
86
|
+
scandeval-16.10.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
87
|
+
scandeval-16.10.1.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
|
|
88
|
+
scandeval-16.10.1.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
|
|
89
|
+
scandeval-16.10.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|