EuroEval 15.16.0__py3-none-any.whl → 16.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval was flagged as potentially problematic by the registry.
- euroeval/__init__.py +8 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +190 -110
- euroeval/benchmark_modules/vllm.py +199 -139
- euroeval/benchmarker.py +49 -22
- euroeval/cli.py +3 -3
- euroeval/constants.py +19 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +73 -23
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +35 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +6 -6
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +90 -20
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +276 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/model_cache.py +13 -1
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +19 -8
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +128 -42
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +59 -73
- euroeval/tasks.py +33 -6
- euroeval/tokenization_utils.py +294 -207
- euroeval/utils.py +150 -35
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
- euroeval-16.0.1.dist-info/RECORD +69 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -470
- euroeval-15.16.0.dist-info/RECORD +0 -63
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmarker.py
CHANGED

@@ -15,7 +15,7 @@ from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
 from torch.distributed import destroy_process_group

 from .benchmark_config_factory import build_benchmark_config
-from .constants import
+from .constants import GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
@@ -81,7 +81,7 @@ class Benchmarker:
         gpu_memory_utilization: float = 0.9,
         debug: bool = False,
         run_with_cli: bool = False,
-
+        requires_safetensors: bool = False,
     ) -> None:
         """Initialise the benchmarker.

@@ -156,7 +156,7 @@
             run_with_cli:
                 Whether the benchmarker is being run from the command-line interface.
                 Defaults to False.
-
+            requires_safetensors:
                 Whether to only allow models that use the safetensors format. Defaults
                 to False.

@@ -201,11 +201,11 @@
             gpu_memory_utilization=gpu_memory_utilization,
             debug=debug,
             run_with_cli=run_with_cli,
-
+            requires_safetensors=requires_safetensors,
         )

         self.benchmark_config = build_benchmark_config(
-
+            **self.benchmark_config_default_params.model_dump()
         )

         # Initialise variable storing model lists, so we only have to fetch it once
@@ -249,7 +249,7 @@
         evaluate_test_split: bool | None = None,
         few_shot: bool | None = None,
         num_iterations: int | None = None,
-
+        requires_safetensors: bool | None = None,
     ) -> list[BenchmarkResult]:
         """Benchmarks models on datasets.

@@ -327,7 +327,7 @@
                 to be used for power users, and scores will not be allowed on the
                 leaderboards if this is changed. Defaults to the value specified when
                 initialising the benchmarker.
-
+            requires_safetensors:
                 Whether to only allow models that use the safetensors format. Defaults
                 to the value specified when initialising the benchmarker.

@@ -361,7 +361,7 @@
             evaluate_test_split=evaluate_test_split,
             few_shot=few_shot,
             num_iterations=num_iterations,
-
+            requires_safetensors=requires_safetensors,
         )

         adjust_logging_level(verbose=benchmark_config.verbose)
@@ -390,7 +390,35 @@
                 continue

             loaded_model: BenchmarkModule | None = None
+            benchmark_params_to_revert: dict[str, t.Any] = dict()
             for dataset_config in dataset_configs:
+                # Revert any changes to the benchmark configuration made for the
+                # previous dataset
+                for param, value in benchmark_params_to_revert.items():
+                    setattr(benchmark_config, param, value)
+                benchmark_params_to_revert = dict()
+
+                # Update the benchmark config if the dataset requires it
+                if (
+                    "val" not in dataset_config.splits
+                    and not benchmark_config.evaluate_test_split
+                ):
+                    logger.debug(
+                        "The dataset does not have a validation split, so even though "
+                        "you requested evaluating the validation split (the default), "
+                        "we will evaluate on the test split."
+                    )
+                    benchmark_params_to_revert["evaluate_test_split"] = False
+                    benchmark_config.evaluate_test_split = True
+                if dataset_config.task.requires_zero_shot and benchmark_config.few_shot:
+                    logger.debug(
+                        "The task requires zero-shot evaluation, so even though you "
+                        "requested few-shot evaluation (the default), we will evaluate "
+                        "zero-shot."
+                    )
+                    benchmark_params_to_revert["few_shot"] = True
+                    benchmark_config.few_shot = False
+
                 # Skip if we have already benchmarked this model on this dataset and
                 # we are not forcing the benchmark
                 if not benchmark_config.force and model_has_been_benchmarked(
@@ -408,15 +436,14 @@
                     num_finished_benchmarks += 1
                     continue

-                # Skip if the model
-
-
-
-                if model_config.model_type == ModelType.ENCODER and task_is_generative:
+                # Skip if the model type should not be benchmarked on this dataset
+                model_type = model_config.model_type
+                allowed_model_types = dataset_config.task.allowed_model_types
+                if model_type not in allowed_model_types:
                     logger.debug(
                         f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it is
-                        "the
+                        f"{dataset_config.pretty_name}, as it is of type {model_type}, "
+                        f"and the only allowed model types are {allowed_model_types}."
                     )
                     continue

@@ -535,7 +562,7 @@
         api_version: str | None | None = None,
         debug: bool | None = None,
         run_with_cli: bool | None = None,
-
+        requires_safetensors: bool | None = None,
     ) -> "BenchmarkConfig":
         """Get an updated benchmark configuration.

@@ -609,7 +636,7 @@
             run_with_cli:
                 Whether the benchmarker is being run from the command-line interface.
                 If None, then this value will not be updated.
-
+            requires_safetensors:
                 Whether to only allow models that use the safetensors format. If None,
                 then this value will not be updated.

@@ -666,8 +693,8 @@
             benchmark_config_params.debug = debug
         if run_with_cli is not None:
             benchmark_config_params.run_with_cli = run_with_cli
-        if
-            benchmark_config_params.
+        if requires_safetensors is not None:
+            benchmark_config_params.requires_safetensors = requires_safetensors

         return build_benchmark_config(**benchmark_config_params.model_dump())

@@ -857,7 +884,7 @@
         evaluate_test_split: bool | None = None,
         few_shot: bool | None = None,
         num_iterations: int | None = None,
-
+        requires_safetensors: bool | None = None,
     ) -> list[BenchmarkResult]:
         """Benchmarks models on datasets.

@@ -935,7 +962,7 @@
                 to be used for power users, and scores will not be allowed on the
                 leaderboards if this is changed. Defaults to the value specified when
                 initialising the benchmarker.
-
+            requires_safetensors:
                 Whether to only allow models that use the safetensors format. Defaults
                 to the value specified when initialising the benchmarker.

@@ -971,7 +998,7 @@
             evaluate_test_split=evaluate_test_split,
             few_shot=few_shot,
             num_iterations=num_iterations,
-
+            requires_safetensors=requires_safetensors,
         )

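The user-facing change in this file is the new `requires_safetensors` flag, threaded from the constructor through `benchmark()` and into the benchmark configuration. A minimal usage sketch follows; the `model` keyword and the placeholder ID are assumptions based on the package's public API rather than something shown in this diff:

from euroeval import Benchmarker

# Only evaluate models that ship safetensors weights (new flag in 16.0.x).
benchmarker = Benchmarker(requires_safetensors=True)

# Assumed call shape: the `model` keyword and the ID below are placeholders,
# not values taken from this diff.
results = benchmarker.benchmark(model="<model-id>")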
euroeval/cli.py
CHANGED

@@ -203,7 +203,7 @@ from .tasks import get_all_tasks
     "relevant if the model is generative.",
 )
 @click.option(
-    "--
+    "--requires-safetensors",
     is_flag=True,
     help="Only allow loading models that have safetensors weights available",
     default=False,
@@ -233,7 +233,7 @@ def benchmark(
     api_version: str | None,
     gpu_memory_utilization: float,
     debug: bool,
-
+    requires_safetensors: bool,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
     models = list(model)
@@ -270,7 +270,7 @@ def benchmark(
         gpu_memory_utilization=gpu_memory_utilization,
         debug=debug,
         run_with_cli=True,
-
+        requires_safetensors=requires_safetensors,
     )

     # Perform the benchmark evaluation
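On the command line the same setting is exposed as the boolean `--requires-safetensors` flag (default False), so a run would look roughly like `euroeval --model <model-id> --requires-safetensors`, assuming the `euroeval` entry point declared in the wheel's entry_points.txt.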
euroeval/constants.py
CHANGED

@@ -1,7 +1,6 @@
 """Constants used throughout the project."""

 from .enums import TaskGroup
-from .tasks import NER

 # This is used as input to generative models; it cannot be a special token
 DUMMY_FILL_VALUE = 100
@@ -11,7 +10,7 @@ DUMMY_FILL_VALUE = 100
 # benchmark. We will still report the models' true maximum context length in the
 # metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
 # all tokens in the context.
-MAX_CONTEXT_LENGTH =
+MAX_CONTEXT_LENGTH = 8_192


 # We need to raise the amount of tokens generated for reasoning models, to give them
@@ -37,21 +36,10 @@ GENERATIVE_DATASET_TASK_GROUPS = [TaskGroup.TEXT_TO_TEXT]
 LOCAL_MODELS_REQUIRED_FILES = ["config.json"]


-# Tasks where we use structured generation for generative models
-TASKS_USING_JSON = [NER]
-
-
-# Tasks where we use log probabilities for generative models, rather than the raw
-# completion
-TASK_GROUPS_USING_LOGPROBS = [
-    TaskGroup.SEQUENCE_CLASSIFICATION,
-    TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
-]
-
-
 # The number of top log probabilities to return for generative models. For several APIs
 # this is the maximum number of log probabilities that can be returned
-
+MAX_VLLM_LOGPROBS = 20
+MAX_LITELLM_LOGPROBS = 8


 # We make sure to remove these metric attributes after each iteration, to avoid memory
@@ -77,3 +65,19 @@ REASONING_TOKENS = [
 # manually. We only use them as stop tokens if they actually appear in the model's
 # output
 CUSTOM_STOP_TOKENS = ["<sep>"]
+
+
+# For classification tasks we force LiteLLM models to output a JSON dictionary with a
+# single key and the values being restricted to the allowed labels. This is the key we
+# use
+LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
+
+
+# These characters are stripped from JSON output when trying to identify the label
+JSON_STRIP_CHARACTERS = ' {}\n\r":'
+
+
+# The number of tokens we generate when evaluating generative models on classification
+# tasks. We also use this to determine whether we should store logprobs in the model
+# outputs (and cache).
+NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
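The three new constants back the JSON-based classification flow for LiteLLM models. Below is a minimal, self-contained sketch of how they could be combined to recover a label from a JSON-style completion; it is illustrative only and is not the package's actual parsing code (that lives in the task_group_utils modules listed above):

# Values copied from the constants above.
LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
JSON_STRIP_CHARACTERS = ' {}\n\r":'

raw_output = '{"label": "positive"}\n'

# Drop everything up to and including the key, then strip the surrounding JSON
# punctuation from the value. Illustrative sketch only.
_, _, value = raw_output.partition(LITELLM_CLASSIFICATION_OUTPUT_KEY)
label = value.strip(JSON_STRIP_CHARACTERS)
print(label)  # positive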
euroeval/data_loading.py
CHANGED

@@ -12,6 +12,7 @@ from huggingface_hub.errors import HfHubHTTPError
 from numpy.random import Generator

 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
+from .tasks import EUROPEAN_VALUES
 from .utils import unscramble

 if t.TYPE_CHECKING:
@@ -48,40 +49,45 @@ def load_data(
         dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
     )

-    if not benchmark_config.evaluate_test_split:
+    if not benchmark_config.evaluate_test_split and "val" in dataset:
         dataset["test"] = dataset["val"]

     # Remove empty examples from the datasets
     for text_feature in ["tokens", "text"]:
-
-
+        for split in dataset_config.splits:
+            if text_feature in dataset[split].features:
+                dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)

-    # If we are testing then truncate the test set
-
+    # If we are testing then truncate the test set, unless we need the full set for
+    # evaluation
+    if hasattr(sys, "_called_from_test") and dataset_config.task != EUROPEAN_VALUES:
         dataset["test"] = dataset["test"].select(range(1))

-    # Bootstrap the splits
-
-
-
-
-
-
-
+    # Bootstrap the splits, if applicable
+    if dataset_config.bootstrap_samples:
+        bootstrapped_splits: dict[str, list["Dataset"]] = dict()
+        for split in dataset_config.splits:
+            bootstrap_indices = rng.integers(
+                0,
+                len(dataset[split]),
+                size=(benchmark_config.num_iterations, len(dataset[split])),
+            )
+            bootstrapped_splits[split] = [
+                dataset[split].select(bootstrap_indices[idx])
+                for idx in range(benchmark_config.num_iterations)
+            ]
+        datasets = [
+            DatasetDict(
+                {
+                    split: bootstrapped_splits[split][idx]
+                    for split in dataset_config.splits
+                }
+            )
             for idx in range(benchmark_config.num_iterations)
         ]
+    else:
+        datasets = [dataset] * benchmark_config.num_iterations

-    datasets = [
-        DatasetDict(
-            {
-                split: bootstrapped_splits[split][idx]
-                for split in ["train", "val", "test"]
-            }
-        )
-        for idx in range(benchmark_config.num_iterations)
-    ]
     return datasets


@@ -113,7 +119,7 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
             requests.ConnectionError,
             requests.ReadTimeout,
         ):
-            logger.
+            logger.debug(
                 f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
             )
             time.sleep(1)
@@ -126,11 +132,10 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
             f"{num_attempts} attempts."
         )
     assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
-
-    missing_keys = [key for key in required_keys if key not in dataset]
+    missing_keys = [key for key in dataset_config.splits if key not in dataset]
     if missing_keys:
         raise InvalidBenchmark(
             "The dataset is missing the following required splits: "
             f"{', '.join(missing_keys)}"
         )
-    return DatasetDict({key: dataset[key] for key in
+    return DatasetDict({key: dataset[key] for key in dataset_config.splits})
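The new bootstrapping branch draws, for each split, a matrix of resampled indices with shape (num_iterations, split length) and builds one DatasetDict per iteration; when `bootstrap_samples` is False, the same DatasetDict is simply reused for every iteration. A small stand-alone sketch of the index-drawing step (the sizes are hypothetical, not values from this diff):

import numpy as np

rng = np.random.default_rng(seed=4242)
num_iterations = 3  # hypothetical benchmark_config.num_iterations
split_size = 5      # hypothetical len(dataset[split])

# One row of indices, sampled with replacement, per bootstrap iteration.
bootstrap_indices = rng.integers(0, split_size, size=(num_iterations, split_size))
print(bootstrap_indices.shape)  # (3, 5)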
euroeval/data_models.py
CHANGED

@@ -9,11 +9,14 @@ from dataclasses import dataclass, field
 import pydantic
 import torch

-from .enums import Device,
-from .metrics import Metric
+from .enums import Device, GenerativeType, ModelType, TaskGroup
 from .types import ScoreDict
 from .utils import get_package_version

+if t.TYPE_CHECKING:
+    from .enums import InferenceBackend
+    from .metrics import Metric
+

 @dataclass
 class Language:
@@ -104,15 +107,58 @@ class Task:
             using few-shot evaluation.
         default_labels:
             The default labels for datasets using this task.
+        requires_zero_shot (optional):
+            Whether to only allow zero-shot evaluation for this task. If True, the
+            task will not be evaluated using few-shot examples.
+        uses_structured_output (optional):
+            Whether the task uses structured output. If True, the task will return
+            structured output (e.g., BIO tags for NER). Defaults to False.
+        uses_logprobs (optional):
+            Whether the task uses log probabilities. If True, the task will return
+            log probabilities for the generated tokens. Defaults to False.
+        requires_logprobs (optional):
+            Whether the task requires log probabilities. Implies `uses_logprobs`.
+        allowed_model_types (optional):
+            A list of model types that are allowed to be evaluated on this task.
+            Defaults to all model types being allowed.
+        allowed_generative_types (optional):
+            A list of generative model types that are allowed to be evaluated on this
+            task. If None, all generative model types are allowed. Only relevant if
+            `allowed_model_types` includes generative models.
+        allow_invalid_model_outputs (optional):
+            Whether to allow invalid model outputs. This is only relevant for generative
+            models on classification tasks, where the model may generate an output
+            which is not one of the allowed labels. If True, the model output will be
+            mapped to the closest valid label. If False, the model output will be
+            considered incorrect and the evaluation will be aborted. Defaults to True.
     """

     name: str
     task_group: TaskGroup
     template_dict: dict["Language", "PromptConfig"]
-    metrics: list[Metric]
+    metrics: list["Metric"]
     default_num_few_shot_examples: int
     default_max_generated_tokens: int
     default_labels: list[str]
+    requires_zero_shot: bool = False
+    uses_structured_output: bool = False
+    uses_logprobs: bool = False
+    requires_logprobs: bool = False
+    allowed_model_types: list[ModelType] = field(
+        default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
+    )
+    allowed_generative_types: list[GenerativeType] = field(
+        default_factory=lambda: [
+            GenerativeType.BASE,
+            GenerativeType.INSTRUCTION_TUNED,
+            GenerativeType.REASONING,
+        ]
+    )
+    allow_invalid_model_outputs: bool = True
+
+    def __post_init__(self) -> None:
+        """Post-initialisation checks."""
+        self.uses_logprobs = self.uses_logprobs or self.requires_logprobs

     def __hash__(self) -> int:
         """Return a hash of the task."""
@@ -177,7 +223,7 @@ class BenchmarkConfig:
            Whether to run the benchmark in debug mode.
        run_with_cli:
            Whether the benchmark is being run with the CLI.
-
+       requires_safetensors:
            Whether to only allow models that use the safetensors format.
    """

@@ -204,7 +250,7 @@
     gpu_memory_utilization: float
     debug: bool
     run_with_cli: bool
-
+    requires_safetensors: bool


 class BenchmarkConfigParams(pydantic.BaseModel):
@@ -236,7 +282,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     gpu_memory_utilization: float
     debug: bool
     run_with_cli: bool
-
+    requires_safetensors: bool


 class BenchmarkResult(pydantic.BaseModel):
@@ -356,6 +402,11 @@ class DatasetConfig:
            to a 1:1 mapping between the labels and themselves. If None then the mapping
            will be set to the default mapping for the task and language. Defaults to
            None.
+       splits (optional):
+           The names of the splits in the dataset. If not provided, defaults to
+           ["train", "val", "test"].
+       bootstrap_samples (optional):
+           Whether to bootstrap the dataset samples. Defaults to True.
        unofficial (optional):
            Whether the dataset is unofficial. Defaults to False.
    """
@@ -372,6 +423,8 @@
     _max_generated_tokens: int | None = None
     _labels: list[str] | None = None
     _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
+    splits: list[str] = field(default_factory=lambda: ["train", "val", "test"])
+    bootstrap_samples: bool = True
     unofficial: bool = False

     @property
@@ -384,7 +437,6 @@
             if self._prompt_prefix is None
             else self._prompt_prefix
         )
-        prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
         return prompt_prefix

     @property
@@ -397,7 +449,6 @@
             if self._prompt_template is None
             else self._prompt_template
         )
-        prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
         return prompt_template

     @property
@@ -410,9 +461,6 @@
             if self._instruction_prompt is None
             else self._instruction_prompt
         )
-        instruction_prompt = instruction_prompt.replace(
-            "{labels_str}", self._labels_str
-        )
         return instruction_prompt

     @property
@@ -473,15 +521,16 @@
         """Return a hash of the dataset configuration."""
         return hash(self.name)

-
-    def _labels_str(self) -> str:
+    def get_labels_str(self, labels: list[str] | None = None) -> str:
         """Converts a set of labels to a natural string, in the specified language.

         If the task is NER, we separate using 'and' and use the mapped labels instead of
         the BIO NER labels.

         Args:
-
+            labels (optional):
+                The labels to convert to a natural string. If None, uses all the labels
+                in the dataset. Defaults to None.

         Returns:
             The natural string representation of the labels in specified language.
@@ -493,16 +542,17 @@
         else:
             sep_word = main_language.or_separator

-
-
-
-
-
-
-
+        if labels is None:
+            labels = list()
+            for english_label in self.labels:
+                if english_label not in self.prompt_label_mapping:
+                    continue
+                label = self.prompt_label_mapping[english_label]
+                if label not in labels:
+                    labels.append(label)

         # Convert labels to single-quoted labels - and remove duplicates
-        quoted_labels = [f"'{label}'" for label in
+        quoted_labels = [f"'{label}'" for label in labels]

         if not quoted_labels:
             return ""
@@ -546,7 +596,7 @@ class ModelConfig:
     revision: str
     task: str
     languages: list[Language]
-    inference_backend: InferenceBackend
+    inference_backend: "InferenceBackend"
     merge: bool
     model_type: ModelType
     fresh: bool
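A subtle point in the new `Task` fields is the `__post_init__` coupling: setting `requires_logprobs=True` also switches on `uses_logprobs`. A stand-alone sketch of that pattern (a toy class, not the real `Task`, which has many more required fields):

from dataclasses import dataclass


@dataclass
class TaskFlags:
    """Toy stand-in for the logprob-related flags on Task."""

    uses_logprobs: bool = False
    requires_logprobs: bool = False

    def __post_init__(self) -> None:
        # Requiring logprobs implies using them, mirroring Task.__post_init__.
        self.uses_logprobs = self.uses_logprobs or self.requires_logprobs


assert TaskFlags(requires_logprobs=True).uses_logprobs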
euroeval/dataset_configs/__init__.py
CHANGED

@@ -6,12 +6,14 @@ from ..tasks import SPEED
 from .danish import *  # noqa: F403
 from .dutch import *  # noqa: F403
 from .english import *  # noqa: F403
+from .estonian import *  # noqa: F403
 from .faroese import *  # noqa: F403
 from .finnish import *  # noqa: F403
 from .french import *  # noqa: F403
 from .german import *  # noqa: F403
 from .icelandic import *  # noqa: F403
 from .italian import *  # noqa: F403
+from .latvian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
 from .portuguese import *  # noqa: F403
 from .spanish import *  # noqa: F403
euroeval/dataset_configs/danish.py
CHANGED

@@ -2,7 +2,7 @@

 from ..data_models import DatasetConfig
 from ..languages import DA
-from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

 ### Official datasets ###

@@ -76,6 +76,16 @@ HELLASWAG_DA_CONFIG = DatasetConfig(
     languages=[DA],
 )

+EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
+    name="european-values-da",
+    pretty_name="the Danish version of the European values evaluation dataset",
+    huggingface_id="EuroEval/european-values-da",
+    task=EUROPEAN_VALUES,
+    languages=[DA],
+    splits=["test"],
+    bootstrap_samples=False,
+)
+

 ### Unofficial datasets ###

@@ -138,3 +148,27 @@ GOLDENSWAG_DA_CONFIG = DatasetConfig(
     languages=[DA],
     unofficial=True,
 )
+
+EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
+    name="european-values-situational-da",
+    pretty_name="the Danish version of the European values evaluation dataset, where "
+    "the questions are phrased in a situational way",
+    huggingface_id="EuroEval/european-values-situational-da",
+    task=EUROPEAN_VALUES,
+    languages=[DA],
+    splits=["test"],
+    bootstrap_samples=False,
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_COMPLETIONS_DA_CONFIG = DatasetConfig(
+    name="european-values-completions-da",
+    pretty_name="the Danish version of the European values evaluation dataset, where "
+    "the questions are phrased as sentence completions",
+    huggingface_id="EuroEval/european-values-completions-da",
+    task=EUROPEAN_VALUES,
+    languages=[DA],
+    splits=["test"],
+    bootstrap_samples=False,
+    unofficial=True,
+)