EuroEval 15.15.0__py3-none-any.whl → 16.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +3 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +323 -193
- euroeval/benchmark_modules/vllm.py +166 -112
- euroeval/benchmarker.py +59 -33
- euroeval/cli.py +3 -3
- euroeval/constants.py +13 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +53 -7
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +38 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +8 -7
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +46 -14
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +234 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +17 -6
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +96 -23
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +47 -75
- euroeval/tasks.py +31 -6
- euroeval/tokenization_utils.py +295 -207
- euroeval/utils.py +118 -34
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +12 -14
- euroeval-16.0.0.dist-info/RECORD +69 -0
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -468
- euroeval-15.15.0.dist-info/RECORD +0 -63
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmarker.py
CHANGED
@@ -15,7 +15,7 @@ from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
 from torch.distributed import destroy_process_group
 
 from .benchmark_config_factory import build_benchmark_config
-from .constants import …
+from .constants import GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
@@ -81,7 +81,7 @@ class Benchmarker:
         gpu_memory_utilization: float = 0.9,
         debug: bool = False,
         run_with_cli: bool = False,
-        …
+        requires_safetensors: bool = False,
     ) -> None:
         """Initialise the benchmarker.
 
@@ -156,7 +156,7 @@ class Benchmarker:
             run_with_cli:
                 Whether the benchmarker is being run from the command-line interface.
                 Defaults to False.
-            …
+            requires_safetensors:
                 Whether to only allow models that use the safetensors format. Defaults
                 to False.
 
@@ -201,11 +201,11 @@ class Benchmarker:
             gpu_memory_utilization=gpu_memory_utilization,
             debug=debug,
             run_with_cli=run_with_cli,
-            …
+            requires_safetensors=requires_safetensors,
         )
 
         self.benchmark_config = build_benchmark_config(
-            …
+            **self.benchmark_config_default_params.model_dump()
         )
 
         # Initialise variable storing model lists, so we only have to fetch it once
@@ -249,7 +249,7 @@ class Benchmarker:
         evaluate_test_split: bool | None = None,
         few_shot: bool | None = None,
         num_iterations: int | None = None,
-        …
+        requires_safetensors: bool | None = None,
     ) -> list[BenchmarkResult]:
         """Benchmarks models on datasets.
 
@@ -327,7 +327,7 @@ class Benchmarker:
                 to be used for power users, and scores will not be allowed on the
                 leaderboards if this is changed. Defaults to the value specified when
                 initialising the benchmarker.
-            …
+            requires_safetensors:
                 Whether to only allow models that use the safetensors format. Defaults
                 to the value specified when initialising the benchmarker.
 
@@ -361,7 +361,7 @@ class Benchmarker:
             evaluate_test_split=evaluate_test_split,
             few_shot=few_shot,
             num_iterations=num_iterations,
-            …
+            requires_safetensors=requires_safetensors,
         )
 
         adjust_logging_level(verbose=benchmark_config.verbose)
@@ -379,9 +379,46 @@ class Benchmarker:
 
         current_benchmark_results: list[BenchmarkResult] = list()
         for model_id in model_ids:
-            …
+            # Load the model configuration, or skip the model if it is invalid
+            try:
+                model_config = get_model_config(
+                    model_id=model_id, benchmark_config=benchmark_config
+                )
+            except InvalidModel as e:
+                logger.info(e.message)
+                num_finished_benchmarks += len(dataset_configs)
+                continue
+
             loaded_model: BenchmarkModule | None = None
+            benchmark_params_to_revert: dict[str, t.Any] = dict()
             for dataset_config in dataset_configs:
+                # Revert any changes to the benchmark configuration made for the
+                # previous dataset
+                for param, value in benchmark_params_to_revert.items():
+                    setattr(benchmark_config, param, value)
+                benchmark_params_to_revert = dict()
+
+                # Update the benchmark config if the dataset requires it
+                if (
+                    "val" not in dataset_config.splits
+                    and not benchmark_config.evaluate_test_split
+                ):
+                    logger.debug(
+                        "The dataset does not have a validation split, so even though "
+                        "you requested evaluating the validation split (the default), "
+                        "we will evaluate on the test split."
+                    )
+                    benchmark_params_to_revert["evaluate_test_split"] = False
+                    benchmark_config.evaluate_test_split = True
+                if dataset_config.task.requires_zero_shot and benchmark_config.few_shot:
+                    logger.debug(
+                        "The task requires zero-shot evaluation, so even though you "
+                        "requested few-shot evaluation (the default), we will evaluate "
+                        "zero-shot."
+                    )
+                    benchmark_params_to_revert["few_shot"] = True
+                    benchmark_config.few_shot = False
+
                 # Skip if we have already benchmarked this model on this dataset and
                 # we are not forcing the benchmark
                 if not benchmark_config.force and model_has_been_benchmarked(
@@ -399,25 +436,14 @@ class Benchmarker:
                     num_finished_benchmarks += 1
                     continue
 
-                if …
-                    …
-                    …
-                    …
-                )
-                except InvalidModel as e:
-                    logger.info(e.message)
-                    num_finished_benchmarks += len(dataset_configs)
-                    continue
-
-                # Skip if the model is an encoder model and the task is generative
-                task_is_generative = (
-                    dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
-                )
-                if model_config.model_type == ModelType.ENCODER and task_is_generative:
+                # Skip if the model type should not be benchmarked on this dataset
+                model_type = model_config.model_type
+                allowed_model_types = dataset_config.task.allowed_model_types
+                if model_type not in allowed_model_types:
                     logger.debug(
                         f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it is …
-                        "the …
+                        f"{dataset_config.pretty_name}, as it is of type {model_type}, "
+                        f"and the only allowed model types are {allowed_model_types}."
                     )
                     continue
 
@@ -536,7 +562,7 @@ class Benchmarker:
         api_version: str | None | None = None,
         debug: bool | None = None,
         run_with_cli: bool | None = None,
-        …
+        requires_safetensors: bool | None = None,
     ) -> "BenchmarkConfig":
         """Get an updated benchmark configuration.
 
@@ -610,7 +636,7 @@ class Benchmarker:
             run_with_cli:
                 Whether the benchmarker is being run from the command-line interface.
                 If None, then this value will not be updated.
-            …
+            requires_safetensors:
                 Whether to only allow models that use the safetensors format. If None,
                 then this value will not be updated.
 
@@ -667,8 +693,8 @@ class Benchmarker:
            benchmark_config_params.debug = debug
        if run_with_cli is not None:
            benchmark_config_params.run_with_cli = run_with_cli
-       if …
-           benchmark_config_params.…
+       if requires_safetensors is not None:
+           benchmark_config_params.requires_safetensors = requires_safetensors
 
        return build_benchmark_config(**benchmark_config_params.model_dump())
 
@@ -858,7 +884,7 @@ class Benchmarker:
         evaluate_test_split: bool | None = None,
         few_shot: bool | None = None,
         num_iterations: int | None = None,
-        …
+        requires_safetensors: bool | None = None,
     ) -> list[BenchmarkResult]:
         """Benchmarks models on datasets.
 
@@ -936,7 +962,7 @@ class Benchmarker:
                 to be used for power users, and scores will not be allowed on the
                 leaderboards if this is changed. Defaults to the value specified when
                 initialising the benchmarker.
-            …
+            requires_safetensors:
                 Whether to only allow models that use the safetensors format. Defaults
                 to the value specified when initialising the benchmarker.
 
@@ -972,7 +998,7 @@ class Benchmarker:
             evaluate_test_split=evaluate_test_split,
             few_shot=few_shot,
             num_iterations=num_iterations,
-            …
+            requires_safetensors=requires_safetensors,
         )
 
 
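The change that recurs throughout this file is the new requires_safetensors option, threaded through the constructor, benchmark(), and the config-update path. As a minimal sketch of how it might be used from Python (the model and dataset identifiers, and the model/dataset keyword names, are placeholders rather than values taken from this diff):

from euroeval import Benchmarker

# Only consider models that ship safetensors weights (new option in 16.0.0)
benchmarker = Benchmarker(requires_safetensors=True)

# The flag can also be overridden on a per-call basis
results = benchmarker.benchmark(
    model="some-org/some-model",  # placeholder model ID
    dataset="some-dataset",       # placeholder dataset name
    requires_safetensors=True,
)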
euroeval/cli.py
CHANGED
@@ -203,7 +203,7 @@ from .tasks import get_all_tasks
     "relevant if the model is generative.",
 )
 @click.option(
-    "--…
+    "--requires-safetensors",
     is_flag=True,
     help="Only allow loading models that have safetensors weights available",
     default=False,
@@ -233,7 +233,7 @@ def benchmark(
     api_version: str | None,
     gpu_memory_utilization: float,
     debug: bool,
-    …
+    requires_safetensors: bool,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
     models = list(model)
@@ -270,7 +270,7 @@ def benchmark(
         gpu_memory_utilization=gpu_memory_utilization,
         debug=debug,
         run_with_cli=True,
-        …
+        requires_safetensors=requires_safetensors,
     )
 
     # Perform the benchmark evaluation
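The same option is exposed on the command line as a --requires-safetensors flag. Assuming the package's usual euroeval entry point and --model option (neither is shown in this diff), an invocation would look roughly like:

euroeval --model some-org/some-model --requires-safetensors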
euroeval/constants.py
CHANGED
@@ -1,7 +1,6 @@
 """Constants used throughout the project."""
 
 from .enums import TaskGroup
-from .tasks import NER
 
 # This is used as input to generative models; it cannot be a special token
 DUMMY_FILL_VALUE = 100
@@ -11,7 +10,7 @@ DUMMY_FILL_VALUE = 100
 # benchmark. We will still report the models' true maximum context length in the
 # metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
 # all tokens in the context.
-MAX_CONTEXT_LENGTH = …
+MAX_CONTEXT_LENGTH = 8_192
 
 
 # We need to raise the amount of tokens generated for reasoning models, to give them
@@ -37,21 +36,10 @@ GENERATIVE_DATASET_TASK_GROUPS = [TaskGroup.TEXT_TO_TEXT]
 LOCAL_MODELS_REQUIRED_FILES = ["config.json"]
 
 
-# Tasks where we use structured generation for generative models
-TASKS_USING_JSON = [NER]
-
-
-# Tasks where we use log probabilities for generative models, rather than the raw
-# completion
-TASK_GROUPS_USING_LOGPROBS = [
-    TaskGroup.SEQUENCE_CLASSIFICATION,
-    TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
-]
-
-
 # The number of top log probabilities to return for generative models. For several APIs
 # this is the maximum number of log probabilities that can be returned
-…
+MAX_VLLM_LOGPROBS = 20
+MAX_LITELLM_LOGPROBS = 8
 
 
 # We make sure to remove these metric attributes after each iteration, to avoid memory
@@ -77,3 +65,13 @@ REASONING_TOKENS = [
 # manually. We only use them as stop tokens if they actually appear in the model's
 # output
 CUSTOM_STOP_TOKENS = ["<sep>"]
+
+
+# For classification tasks we force LiteLLM models to output a JSON dictionary with a
+# single key and the values being restricted to the allowed labels. This is the key we
+# use
+LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
+
+
+# These characters are stripped from JSON output when trying to identify the label
+JSON_STRIP_CHARACTERS = ' {}\n\r":'
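To make the intent of the two new JSON-related constants concrete, here is a minimal, self-contained sketch of pulling a classification label out of a LiteLLM-style JSON reply. Only the two constant values come from this diff; the extract_label helper and its fallback logic are illustrative assumptions, not code from the package.

import json

LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
JSON_STRIP_CHARACTERS = ' {}\n\r":'

def extract_label(raw_output: str) -> str:
    """Illustrative only: recover the label from a (possibly malformed) JSON reply."""
    try:
        return json.loads(raw_output)[LITELLM_CLASSIFICATION_OUTPUT_KEY]
    except (json.JSONDecodeError, KeyError, TypeError):
        # Fall back to stripping JSON punctuation and the output key itself
        stripped = raw_output.strip(JSON_STRIP_CHARACTERS)
        stripped = stripped.removeprefix(LITELLM_CLASSIFICATION_OUTPUT_KEY)
        return stripped.strip(JSON_STRIP_CHARACTERS)

print(extract_label('{"label": "positive"}'))  # -> positive
print(extract_label('label: "negative"'))      # -> negative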
euroeval/data_loading.py
CHANGED
@@ -12,6 +12,7 @@ from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator
 
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
+from .tasks import EUROPEAN_VALUES
 from .utils import unscramble
 
 if t.TYPE_CHECKING:
@@ -48,40 +49,45 @@ def load_data(
         dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
     )
 
-    if not benchmark_config.evaluate_test_split:
+    if not benchmark_config.evaluate_test_split and "val" in dataset:
         dataset["test"] = dataset["val"]
 
     # Remove empty examples from the datasets
     for text_feature in ["tokens", "text"]:
-        …
-        …
+        for split in dataset_config.splits:
+            if text_feature in dataset[split].features:
+                dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)
 
-    # If we are testing then truncate the test set
-    …
+    # If we are testing then truncate the test set, unless we need the full set for
+    # evaluation
+    if hasattr(sys, "_called_from_test") and dataset_config.task != EUROPEAN_VALUES:
         dataset["test"] = dataset["test"].select(range(1))
 
-    # Bootstrap the splits
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
-    …
+    # Bootstrap the splits, if applicable
+    if dataset_config.bootstrap_samples:
+        bootstrapped_splits: dict[str, list["Dataset"]] = dict()
+        for split in dataset_config.splits:
+            bootstrap_indices = rng.integers(
+                0,
+                len(dataset[split]),
+                size=(benchmark_config.num_iterations, len(dataset[split])),
+            )
+            bootstrapped_splits[split] = [
+                dataset[split].select(bootstrap_indices[idx])
+                for idx in range(benchmark_config.num_iterations)
+            ]
+        datasets = [
+            DatasetDict(
+                {
+                    split: bootstrapped_splits[split][idx]
+                    for split in dataset_config.splits
+                }
+            )
             for idx in range(benchmark_config.num_iterations)
         ]
+    else:
+        datasets = [dataset] * benchmark_config.num_iterations
 
-    datasets = [
-        DatasetDict(
-            {
-                split: bootstrapped_splits[split][idx]
-                for split in ["train", "val", "test"]
-            }
-        )
-        for idx in range(benchmark_config.num_iterations)
-    ]
     return datasets
 
 
@@ -113,7 +119,7 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
             requests.ConnectionError,
             requests.ReadTimeout,
         ):
-            logger.…
+            logger.debug(
                 f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
             )
             time.sleep(1)
@@ -126,11 +132,10 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
         f"{num_attempts} attempts."
     )
     assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
-
-    missing_keys = [key for key in required_keys if key not in dataset]
+    missing_keys = [key for key in dataset_config.splits if key not in dataset]
     if missing_keys:
         raise InvalidBenchmark(
             "The dataset is missing the following required splits: "
             f"{', '.join(missing_keys)}"
         )
-    return DatasetDict({key: dataset[key] for key in …
+    return DatasetDict({key: dataset[key] for key in dataset_config.splits})
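The new bootstrap branch draws, for every iteration, a full row of with-replacement indices per split and then selects those rows from the split. The same idea in a self-contained sketch, using a plain list in place of a Hugging Face Dataset and a fixed seed purely to make the example reproducible:

import numpy as np

rng = np.random.default_rng(seed=0)
split = ["ex0", "ex1", "ex2", "ex3", "ex4"]  # stand-in for one dataset split
num_iterations = 3

# One row of with-replacement indices per iteration, mirroring load_data()
bootstrap_indices = rng.integers(0, len(split), size=(num_iterations, len(split)))

# Each iteration gets its own resampled view of the split
bootstrapped = [
    [split[i] for i in bootstrap_indices[idx]] for idx in range(num_iterations)
]
for resample in bootstrapped:
    print(resample)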
euroeval/data_models.py
CHANGED
@@ -9,11 +9,14 @@ from dataclasses import dataclass, field
 import pydantic
 import torch
 
-from .enums import Device, …
-from .metrics import Metric
+from .enums import Device, GenerativeType, ModelType, TaskGroup
 from .types import ScoreDict
 from .utils import get_package_version
 
+if t.TYPE_CHECKING:
+    from .enums import InferenceBackend
+    from .metrics import Metric
+
 
 @dataclass
 class Language:
@@ -104,15 +107,51 @@ class Task:
             using few-shot evaluation.
         default_labels:
             The default labels for datasets using this task.
+        requires_zero_shot (optional):
+            Whether to only allow zero-shot evaluation for this task. If True, the
+            task will not be evaluated using few-shot examples.
+        uses_structured_output (optional):
+            Whether the task uses structured output. If True, the task will return
+            structured output (e.g., BIO tags for NER). Defaults to False.
+        uses_logprobs (optional):
+            Whether the task uses log probabilities. If True, the task will return
+            log probabilities for the generated tokens. Defaults to False.
+        requires_logprobs (optional):
+            Whether the task requires log probabilities. Implies `uses_logprobs`.
+        allowed_model_types (optional):
+            A list of model types that are allowed to be evaluated on this task.
+            Defaults to all model types being allowed.
+        allowed_generative_types (optional):
+            A list of generative model types that are allowed to be evaluated on this
+            task. If None, all generative model types are allowed. Only relevant if
+            `allowed_model_types` includes generative models.
     """
 
     name: str
     task_group: TaskGroup
     template_dict: dict["Language", "PromptConfig"]
-    metrics: list[Metric]
+    metrics: list["Metric"]
     default_num_few_shot_examples: int
     default_max_generated_tokens: int
     default_labels: list[str]
+    requires_zero_shot: bool = False
+    uses_structured_output: bool = False
+    uses_logprobs: bool = False
+    requires_logprobs: bool = False
+    allowed_model_types: list[ModelType] = field(
+        default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
+    )
+    allowed_generative_types: list[GenerativeType] = field(
+        default_factory=lambda: [
+            GenerativeType.BASE,
+            GenerativeType.INSTRUCTION_TUNED,
+            GenerativeType.REASONING,
+        ]
+    )
+
+    def __post_init__(self) -> None:
+        """Post-initialisation checks."""
+        self.uses_logprobs = self.uses_logprobs or self.requires_logprobs
 
     def __hash__(self) -> int:
         """Return a hash of the task."""
@@ -177,7 +216,7 @@ class BenchmarkConfig:
             Whether to run the benchmark in debug mode.
         run_with_cli:
             Whether the benchmark is being run with the CLI.
-        …
+        requires_safetensors:
             Whether to only allow models that use the safetensors format.
     """
 
@@ -204,7 +243,7 @@ class BenchmarkConfig:
     gpu_memory_utilization: float
     debug: bool
     run_with_cli: bool
-    …
+    requires_safetensors: bool
 
 
 class BenchmarkConfigParams(pydantic.BaseModel):
@@ -236,7 +275,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     gpu_memory_utilization: float
     debug: bool
     run_with_cli: bool
-    …
+    requires_safetensors: bool
 
 
 class BenchmarkResult(pydantic.BaseModel):
@@ -356,6 +395,11 @@ class DatasetConfig:
             to a 1:1 mapping between the labels and themselves. If None then the mapping
             will be set to the default mapping for the task and language. Defaults to
             None.
+        splits (optional):
+            The names of the splits in the dataset. If not provided, defaults to
+            ["train", "val", "test"].
+        bootstrap_samples (optional):
+            Whether to bootstrap the dataset samples. Defaults to True.
         unofficial (optional):
             Whether the dataset is unofficial. Defaults to False.
     """
@@ -372,6 +416,8 @@
     _max_generated_tokens: int | None = None
     _labels: list[str] | None = None
     _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
+    splits: list[str] = field(default_factory=lambda: ["train", "val", "test"])
+    bootstrap_samples: bool = True
     unofficial: bool = False
 
     @property
@@ -546,7 +592,7 @@ class ModelConfig:
     revision: str
     task: str
     languages: list[Language]
-    inference_backend: InferenceBackend
+    inference_backend: "InferenceBackend"
     merge: bool
     model_type: ModelType
     fresh: bool
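One behavioural detail worth noting is the new __post_init__, which makes requires_logprobs imply uses_logprobs. A trimmed-down stand-in dataclass (not the real Task, which has many more required fields) shows the effect:

from dataclasses import dataclass

@dataclass
class TaskSketch:
    """Stand-in for the new Task fields, illustrating __post_init__."""

    name: str
    uses_logprobs: bool = False
    requires_logprobs: bool = False

    def __post_init__(self) -> None:
        # Requiring log probabilities implies using them
        self.uses_logprobs = self.uses_logprobs or self.requires_logprobs

task = TaskSketch(name="sequence-classification", requires_logprobs=True)
print(task.uses_logprobs)  # True, even though it was never set explicitly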
euroeval/dataset_configs/__init__.py
CHANGED
@@ -6,12 +6,14 @@ from ..tasks import SPEED
 from .danish import *  # noqa: F403
 from .dutch import *  # noqa: F403
 from .english import *  # noqa: F403
+from .estonian import *  # noqa: F403
 from .faroese import *  # noqa: F403
 from .finnish import *  # noqa: F403
 from .french import *  # noqa: F403
 from .german import *  # noqa: F403
 from .icelandic import *  # noqa: F403
 from .italian import *  # noqa: F403
+from .latvian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
 from .portuguese import *  # noqa: F403
 from .spanish import *  # noqa: F403
euroeval/dataset_configs/danish.py
CHANGED
@@ -2,7 +2,7 @@
 
 from ..data_models import DatasetConfig
 from ..languages import DA
-from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
 ### Official datasets ###
 
@@ -76,6 +76,17 @@ HELLASWAG_DA_CONFIG = DatasetConfig(
     languages=[DA],
 )
 
+EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
+    name="european-values-da",
+    pretty_name="the Danish version of the European values evaluation dataset",
+    huggingface_id="EuroEval/european-values-da",
+    task=EUROPEAN_VALUES,
+    languages=[DA],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+)
+
 
 ### Unofficial datasets ###
 
@@ -138,3 +149,29 @@ GOLDENSWAG_DA_CONFIG = DatasetConfig(
     languages=[DA],
     unofficial=True,
 )
+
+EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
+    name="european-values-situational-da",
+    pretty_name="the Danish version of the European values evaluation dataset, where "
+    "the questions are phrased in a situational way",
+    huggingface_id="EuroEval/european-values-situational-da",
+    task=EUROPEAN_VALUES,
+    languages=[DA],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_COMPLETIONS_DA_CONFIG = DatasetConfig(
+    name="european-values-completions-da",
+    pretty_name="the Danish version of the European values evaluation dataset, where "
+    "the questions are phrased as sentence completions",
+    huggingface_id="EuroEval/european-values-completions-da",
+    task=EUROPEAN_VALUES,
+    languages=[DA],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+    unofficial=True,
+)
euroeval/dataset_configs/dutch.py
CHANGED
@@ -2,7 +2,7 @@
 
 from ..data_models import DatasetConfig
 from ..languages import NL
-from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
 ### Official datasets ###
 
@@ -69,6 +69,17 @@ HELLASWAG_NL_CONFIG = DatasetConfig(
     languages=[NL],
 )
 
+EUROPEAN_VALUES_NL_CONFIG = DatasetConfig(
+    name="european-values-nl",
+    pretty_name="the Dutch version of the European values evaluation dataset",
+    huggingface_id="EuroEval/european-values-nl",
+    task=EUROPEAN_VALUES,
+    languages=[NL],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+)
+
 
 ### Unofficial datasets ###
 
@@ -130,3 +141,29 @@ GOLDENSWAG_NL_CONFIG = DatasetConfig(
     languages=[NL],
     unofficial=True,
 )
+
+EUROPEAN_VALUES_SITUATIONAL_NL_CONFIG = DatasetConfig(
+    name="european-values-situational-nl",
+    pretty_name="the Dutch version of the European values evaluation dataset, where "
+    "the questions are phrased in a situational way",
+    huggingface_id="EuroEval/european-values-situational-nl",
+    task=EUROPEAN_VALUES,
+    languages=[NL],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_COMPLETIONS_NL_CONFIG = DatasetConfig(
+    name="european-values-completions-nl",
+    pretty_name="the Dutch version of the European values evaluation dataset, where "
+    "the questions are phrased as sentence completions",
+    huggingface_id="EuroEval/european-values-completions-nl",
+    task=EUROPEAN_VALUES,
+    languages=[NL],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+    unofficial=True,
+)