EuroEval 16.4.0__py3-none-any.whl → 16.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic. Click here for more details.
- euroeval/__init__.py +6 -0
- euroeval/benchmark_config_factory.py +51 -46
- euroeval/benchmark_modules/base.py +6 -5
- euroeval/benchmark_modules/hf.py +2 -9
- euroeval/benchmark_modules/litellm.py +14 -12
- euroeval/benchmark_modules/vllm.py +17 -10
- euroeval/benchmarker.py +61 -44
- euroeval/caching_utils.py +1 -1
- euroeval/cli.py +86 -8
- euroeval/constants.py +3 -0
- euroeval/data_loading.py +78 -30
- euroeval/data_models.py +326 -326
- euroeval/dataset_configs/__init__.py +10 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +25 -29
- euroeval/dataset_configs/danish.py +51 -88
- euroeval/dataset_configs/dutch.py +48 -86
- euroeval/dataset_configs/english.py +45 -76
- euroeval/dataset_configs/estonian.py +36 -38
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -68
- euroeval/dataset_configs/french.py +39 -74
- euroeval/dataset_configs/german.py +45 -81
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -78
- euroeval/dataset_configs/latvian.py +28 -34
- euroeval/dataset_configs/lithuanian.py +22 -26
- euroeval/dataset_configs/norwegian.py +72 -114
- euroeval/dataset_configs/polish.py +33 -60
- euroeval/dataset_configs/portuguese.py +33 -65
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +19 -24
- euroeval/dataset_configs/spanish.py +42 -76
- euroeval/dataset_configs/swedish.py +48 -84
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +3 -2
- euroeval/generation.py +5 -4
- euroeval/generation_utils.py +6 -5
- euroeval/languages.py +395 -323
- euroeval/metrics/huggingface.py +14 -3
- euroeval/metrics/llm_as_a_judge.py +1 -1
- euroeval/model_cache.py +6 -5
- euroeval/model_loading.py +1 -1
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +82 -43
- euroeval/prompt_templates/multiple_choice.py +81 -41
- euroeval/prompt_templates/named_entity_recognition.py +125 -44
- euroeval/prompt_templates/reading_comprehension.py +92 -43
- euroeval/prompt_templates/sentiment_classification.py +91 -43
- euroeval/prompt_templates/summarization.py +64 -39
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +4 -3
- euroeval/speed_benchmark.py +2 -1
- euroeval/task_group_utils/multiple_choice_classification.py +2 -1
- euroeval/task_group_utils/question_answering.py +24 -13
- euroeval/task_group_utils/sequence_classification.py +5 -4
- euroeval/task_group_utils/text_to_text.py +2 -1
- euroeval/task_group_utils/token_classification.py +11 -8
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +19 -10
- euroeval/types.py +10 -9
- euroeval/utils.py +6 -3
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.4.0.dist-info/RECORD +0 -75
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
4
|
from ..languages import get_all_languages
|
|
5
5
|
from ..tasks import SPEED
|
|
6
|
+
from .bulgarian import * # noqa: F403
|
|
6
7
|
from .czech import * # noqa: F403
|
|
7
8
|
from .danish import * # noqa: F403
|
|
8
9
|
from .dutch import * # noqa: F403
|
|
@@ -12,6 +13,7 @@ from .faroese import * # noqa: F403
|
|
|
12
13
|
from .finnish import * # noqa: F403
|
|
13
14
|
from .french import * # noqa: F403
|
|
14
15
|
from .german import * # noqa: F403
|
|
16
|
+
from .greek import * # noqa: F403
|
|
15
17
|
from .icelandic import * # noqa: F403
|
|
16
18
|
from .italian import * # noqa: F403
|
|
17
19
|
from .latvian import * # noqa: F403
|
|
@@ -19,9 +21,11 @@ from .lithuanian import * # noqa: F403
|
|
|
19
21
|
from .norwegian import * # noqa: F403
|
|
20
22
|
from .polish import * # noqa: F403
|
|
21
23
|
from .portuguese import * # noqa: F403
|
|
24
|
+
from .serbian import * # noqa: F403
|
|
22
25
|
from .slovak import * # noqa: F403
|
|
23
26
|
from .spanish import * # noqa: F403
|
|
24
27
|
from .swedish import * # noqa: F403
|
|
28
|
+
from .ukrainian import * # noqa: F403
|
|
25
29
|
|
|
26
30
|
|
|
27
31
|
def get_all_dataset_configs() -> dict[str, DatasetConfig]:
|
|
@@ -31,7 +35,9 @@ def get_all_dataset_configs() -> dict[str, DatasetConfig]:
|
|
|
31
35
|
A mapping between names of datasets and their configurations.
|
|
32
36
|
"""
|
|
33
37
|
dataset_configs = [
|
|
34
|
-
cfg
|
|
38
|
+
cfg
|
|
39
|
+
for cfg in globals().values()
|
|
40
|
+
if isinstance(cfg, DatasetConfig) and cfg.task != SPEED
|
|
35
41
|
]
|
|
36
42
|
assert len(dataset_configs) == len({cfg.name for cfg in dataset_configs}), (
|
|
37
43
|
"There are duplicate dataset configurations. Please ensure that each dataset "
|
|
@@ -62,8 +68,9 @@ def get_dataset_config(dataset_name: str) -> DatasetConfig:
|
|
|
62
68
|
|
|
63
69
|
SPEED_CONFIG = DatasetConfig(
|
|
64
70
|
name="speed",
|
|
65
|
-
pretty_name="
|
|
66
|
-
|
|
71
|
+
pretty_name="",
|
|
72
|
+
source="",
|
|
67
73
|
task=SPEED,
|
|
68
74
|
languages=list(get_all_languages().values()),
|
|
75
|
+
_logging_string="the speed estimation benchmark",
|
|
69
76
|
)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""All Bulgarian dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import BULGARIAN
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
CINEXIO_CONFIG = DatasetConfig(
|
|
10
|
+
name="cinexio",
|
|
11
|
+
pretty_name="Cinexio",
|
|
12
|
+
source="EuroEval/cinexio-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[BULGARIAN],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
SCALA_BG_CONFIG = DatasetConfig(
|
|
18
|
+
name="scala-bg",
|
|
19
|
+
pretty_name="ScaLA-bg",
|
|
20
|
+
source="EuroEval/scala-bg",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[BULGARIAN],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
BG_NER_BSNLP_CONFIG = DatasetConfig(
|
|
26
|
+
name="bg-ner-bsnlp",
|
|
27
|
+
pretty_name="BG-NER-BSNLp",
|
|
28
|
+
source="EuroEval/bg-ner-bsnlp-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[BULGARIAN],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
MULTI_WIKI_QA_BG_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-bg",
|
|
35
|
+
pretty_name="MultiWikiQA-bg",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-bg-mini",
|
|
37
|
+
task=RC,
|
|
38
|
+
languages=[BULGARIAN],
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
EXAMS_BG_CONFIG = DatasetConfig(
|
|
42
|
+
name="exams-bg",
|
|
43
|
+
pretty_name="Exams-bg",
|
|
44
|
+
source="EuroEval/exams-bg-mini",
|
|
45
|
+
task=KNOW,
|
|
46
|
+
languages=[BULGARIAN],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
WINOGRANDE_BG_CONFIG = DatasetConfig(
|
|
50
|
+
name="winogrande-bg",
|
|
51
|
+
pretty_name="Winogrande-bg",
|
|
52
|
+
source="EuroEval/winogrande-bg",
|
|
53
|
+
task=COMMON_SENSE,
|
|
54
|
+
languages=[BULGARIAN],
|
|
55
|
+
_labels=["a", "b"],
|
|
56
|
+
)
|
|
@@ -1,69 +1,65 @@
|
|
|
1
1
|
"""All Czech dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
4
|
+
from ..languages import CZECH
|
|
5
5
|
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
CSFD_SENTIMENT_CONFIG = DatasetConfig(
|
|
10
10
|
name="csfd-sentiment",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/csfd-sentiment-mini",
|
|
11
|
+
pretty_name="CSFD Sentiment",
|
|
12
|
+
source="EuroEval/csfd-sentiment-mini",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[CZECH],
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
CS_GEC_CONFIG = DatasetConfig(
|
|
19
18
|
name="cs-gec",
|
|
20
|
-
pretty_name="
|
|
21
|
-
"
|
|
22
|
-
huggingface_id="EuroEval/cs-gec-mini",
|
|
19
|
+
pretty_name="CS-GEC",
|
|
20
|
+
source="EuroEval/cs-gec-mini",
|
|
23
21
|
task=LA,
|
|
24
|
-
languages=[
|
|
22
|
+
languages=[CZECH],
|
|
25
23
|
)
|
|
26
24
|
|
|
27
25
|
PONER_CONFIG = DatasetConfig(
|
|
28
26
|
name="poner",
|
|
29
|
-
pretty_name="
|
|
30
|
-
"
|
|
31
|
-
huggingface_id="EuroEval/poner-mini",
|
|
27
|
+
pretty_name="PoNER",
|
|
28
|
+
source="EuroEval/poner-mini",
|
|
32
29
|
task=NER,
|
|
33
|
-
languages=[
|
|
30
|
+
languages=[CZECH],
|
|
34
31
|
)
|
|
35
32
|
|
|
36
33
|
SQAD_CONFIG = DatasetConfig(
|
|
37
34
|
name="sqad",
|
|
38
|
-
pretty_name="
|
|
39
|
-
|
|
35
|
+
pretty_name="SQAD",
|
|
36
|
+
source="EuroEval/sqad-mini",
|
|
40
37
|
task=RC,
|
|
41
|
-
languages=[
|
|
38
|
+
languages=[CZECH],
|
|
42
39
|
)
|
|
43
40
|
|
|
44
41
|
CZECH_NEWS_CONFIG = DatasetConfig(
|
|
45
42
|
name="czech-news",
|
|
46
|
-
pretty_name="
|
|
47
|
-
|
|
43
|
+
pretty_name="Czech News",
|
|
44
|
+
source="EuroEval/czech-news-mini",
|
|
48
45
|
task=SUMM,
|
|
49
|
-
languages=[
|
|
46
|
+
languages=[CZECH],
|
|
50
47
|
)
|
|
51
48
|
|
|
52
49
|
UMIMETO_QA_CONFIG = DatasetConfig(
|
|
53
50
|
name="umimeto-qa",
|
|
54
|
-
pretty_name="
|
|
55
|
-
|
|
51
|
+
pretty_name="Umimeto QA",
|
|
52
|
+
source="EuroEval/umimeto-qa",
|
|
56
53
|
task=KNOW,
|
|
57
|
-
languages=[
|
|
54
|
+
languages=[CZECH],
|
|
58
55
|
)
|
|
59
56
|
|
|
60
57
|
HELLASWAG_CS_CONFIG = DatasetConfig(
|
|
61
58
|
name="hellaswag-cs",
|
|
62
|
-
pretty_name="
|
|
63
|
-
"
|
|
64
|
-
huggingface_id="EuroEval/hellaswag-cs-mini",
|
|
59
|
+
pretty_name="HellaSwag-cs",
|
|
60
|
+
source="EuroEval/hellaswag-cs-mini",
|
|
65
61
|
task=COMMON_SENSE,
|
|
66
|
-
languages=[
|
|
62
|
+
languages=[CZECH],
|
|
67
63
|
)
|
|
68
64
|
|
|
69
65
|
|
|
@@ -71,9 +67,9 @@ HELLASWAG_CS_CONFIG = DatasetConfig(
|
|
|
71
67
|
|
|
72
68
|
SCALA_CS_CONFIG = DatasetConfig(
|
|
73
69
|
name="scala-cs",
|
|
74
|
-
pretty_name="
|
|
75
|
-
|
|
70
|
+
pretty_name="ScaLA-cs",
|
|
71
|
+
source="EuroEval/scala-cs",
|
|
76
72
|
task=LA,
|
|
77
|
-
languages=[
|
|
73
|
+
languages=[CZECH],
|
|
78
74
|
unofficial=True,
|
|
79
75
|
)
|
|
@@ -1,87 +1,81 @@
|
|
|
1
1
|
"""All Danish dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
4
|
+
from ..languages import DANISH
|
|
5
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
ANGRY_TWEETS_CONFIG = DatasetConfig(
|
|
10
10
|
name="angry-tweets",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/angry-tweets-mini",
|
|
11
|
+
pretty_name="AngryTweets",
|
|
12
|
+
source="EuroEval/angry-tweets-mini",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[DANISH],
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
SCALA_DA_CONFIG = DatasetConfig(
|
|
19
18
|
name="scala-da",
|
|
20
|
-
pretty_name="
|
|
21
|
-
|
|
19
|
+
pretty_name="ScaLA-da",
|
|
20
|
+
source="EuroEval/scala-da",
|
|
22
21
|
task=LA,
|
|
23
|
-
languages=[
|
|
22
|
+
languages=[DANISH],
|
|
24
23
|
)
|
|
25
24
|
|
|
26
25
|
DANSK_CONFIG = DatasetConfig(
|
|
27
26
|
name="dansk",
|
|
28
|
-
pretty_name="
|
|
29
|
-
"
|
|
30
|
-
huggingface_id="EuroEval/dansk-mini",
|
|
27
|
+
pretty_name="DANSK",
|
|
28
|
+
source="EuroEval/dansk-mini",
|
|
31
29
|
task=NER,
|
|
32
|
-
languages=[
|
|
30
|
+
languages=[DANISH],
|
|
33
31
|
)
|
|
34
32
|
|
|
35
33
|
MULTI_WIKI_QA_DA_CONFIG = DatasetConfig(
|
|
36
34
|
name="multi-wiki-qa-da",
|
|
37
|
-
pretty_name="
|
|
38
|
-
"
|
|
39
|
-
huggingface_id="EuroEval/multi-wiki-qa-da-mini",
|
|
35
|
+
pretty_name="MultiWikiQA-da",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-da-mini",
|
|
40
37
|
task=RC,
|
|
41
|
-
languages=[
|
|
38
|
+
languages=[DANISH],
|
|
42
39
|
)
|
|
43
40
|
|
|
44
41
|
NORDJYLLAND_NEWS_CONFIG = DatasetConfig(
|
|
45
42
|
name="nordjylland-news",
|
|
46
|
-
pretty_name="
|
|
47
|
-
"
|
|
48
|
-
huggingface_id="EuroEval/nordjylland-news-mini",
|
|
43
|
+
pretty_name="Nordjylland News",
|
|
44
|
+
source="EuroEval/nordjylland-news-mini",
|
|
49
45
|
task=SUMM,
|
|
50
|
-
languages=[
|
|
46
|
+
languages=[DANISH],
|
|
51
47
|
)
|
|
52
48
|
|
|
53
49
|
DANSKE_TALEMAADER_CONFIG = DatasetConfig(
|
|
54
50
|
name="danske-talemaader",
|
|
55
|
-
pretty_name="
|
|
56
|
-
"
|
|
57
|
-
huggingface_id="EuroEval/danske-talemaader",
|
|
51
|
+
pretty_name="Danske Talemåder",
|
|
52
|
+
source="EuroEval/danske-talemaader",
|
|
58
53
|
task=KNOW,
|
|
59
|
-
languages=[
|
|
54
|
+
languages=[DANISH],
|
|
60
55
|
)
|
|
61
56
|
|
|
62
57
|
DANISH_CITIZEN_TESTS_CONFIG = DatasetConfig(
|
|
63
58
|
name="danish-citizen-tests",
|
|
64
|
-
pretty_name="
|
|
65
|
-
|
|
59
|
+
pretty_name="Danish Citizen Tests",
|
|
60
|
+
source="EuroEval/danish-citizen-tests-updated",
|
|
66
61
|
task=KNOW,
|
|
67
|
-
languages=[
|
|
62
|
+
languages=[DANISH],
|
|
68
63
|
)
|
|
69
64
|
|
|
70
65
|
HELLASWAG_DA_CONFIG = DatasetConfig(
|
|
71
66
|
name="hellaswag-da",
|
|
72
|
-
pretty_name="
|
|
73
|
-
"
|
|
74
|
-
huggingface_id="EuroEval/hellaswag-da-mini",
|
|
67
|
+
pretty_name="HellaSwag-da",
|
|
68
|
+
source="EuroEval/hellaswag-da-mini",
|
|
75
69
|
task=COMMON_SENSE,
|
|
76
|
-
languages=[
|
|
70
|
+
languages=[DANISH],
|
|
77
71
|
)
|
|
78
72
|
|
|
79
|
-
|
|
80
|
-
name="
|
|
81
|
-
pretty_name="
|
|
82
|
-
|
|
73
|
+
VALEU_DA_CONFIG = DatasetConfig(
|
|
74
|
+
name="valeu-da",
|
|
75
|
+
pretty_name="ValEU-da",
|
|
76
|
+
source="EuroEval/european-values-da",
|
|
83
77
|
task=EUROPEAN_VALUES,
|
|
84
|
-
languages=[
|
|
78
|
+
languages=[DANISH],
|
|
85
79
|
splits=["test"],
|
|
86
80
|
bootstrap_samples=False,
|
|
87
81
|
)
|
|
@@ -91,95 +85,64 @@ EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
|
|
|
91
85
|
|
|
92
86
|
DANE_CONFIG = DatasetConfig(
|
|
93
87
|
name="dane",
|
|
94
|
-
pretty_name="
|
|
95
|
-
"
|
|
96
|
-
huggingface_id="EuroEval/dane-mini",
|
|
88
|
+
pretty_name="DaNE",
|
|
89
|
+
source="EuroEval/dane-mini",
|
|
97
90
|
task=NER,
|
|
98
|
-
languages=[
|
|
91
|
+
languages=[DANISH],
|
|
99
92
|
unofficial=True,
|
|
100
93
|
)
|
|
101
94
|
|
|
102
95
|
MMLU_DA_CONFIG = DatasetConfig(
|
|
103
96
|
name="mmlu-da",
|
|
104
|
-
pretty_name="
|
|
105
|
-
"
|
|
106
|
-
huggingface_id="EuroEval/mmlu-da-mini",
|
|
97
|
+
pretty_name="MMLU-da",
|
|
98
|
+
source="EuroEval/mmlu-da-mini",
|
|
107
99
|
task=KNOW,
|
|
108
|
-
languages=[
|
|
100
|
+
languages=[DANISH],
|
|
109
101
|
unofficial=True,
|
|
110
102
|
)
|
|
111
103
|
|
|
112
104
|
ARC_DA_CONFIG = DatasetConfig(
|
|
113
105
|
name="arc-da",
|
|
114
|
-
pretty_name="
|
|
115
|
-
"
|
|
116
|
-
huggingface_id="EuroEval/arc-da-mini",
|
|
106
|
+
pretty_name="ARC-da",
|
|
107
|
+
source="EuroEval/arc-da-mini",
|
|
117
108
|
task=KNOW,
|
|
118
|
-
languages=[
|
|
109
|
+
languages=[DANISH],
|
|
119
110
|
unofficial=True,
|
|
120
111
|
)
|
|
121
112
|
|
|
122
113
|
BELEBELE_DA_CONFIG = DatasetConfig(
|
|
123
114
|
name="belebele-da",
|
|
124
|
-
pretty_name="
|
|
125
|
-
"
|
|
126
|
-
huggingface_id="EuroEval/belebele-da-mini",
|
|
115
|
+
pretty_name="Belebele-da",
|
|
116
|
+
source="EuroEval/belebele-da-mini",
|
|
127
117
|
task=MCRC,
|
|
128
|
-
languages=[
|
|
118
|
+
languages=[DANISH],
|
|
129
119
|
unofficial=True,
|
|
130
120
|
)
|
|
131
121
|
|
|
132
122
|
SCANDIQA_DA_CONFIG = DatasetConfig(
|
|
133
123
|
name="scandiqa-da",
|
|
134
|
-
pretty_name="
|
|
135
|
-
"
|
|
136
|
-
huggingface_id="EuroEval/scandiqa-da-mini",
|
|
124
|
+
pretty_name="ScandiQA-da",
|
|
125
|
+
source="EuroEval/scandiqa-da-mini",
|
|
137
126
|
task=RC,
|
|
138
|
-
languages=[
|
|
127
|
+
languages=[DANISH],
|
|
139
128
|
unofficial=True,
|
|
140
129
|
)
|
|
141
130
|
|
|
142
131
|
GOLDENSWAG_DA_CONFIG = DatasetConfig(
|
|
143
132
|
name="goldenswag-da",
|
|
144
|
-
pretty_name="
|
|
145
|
-
"
|
|
146
|
-
huggingface_id="EuroEval/goldenswag-da-mini",
|
|
133
|
+
pretty_name="GoldenSwag-da",
|
|
134
|
+
source="EuroEval/goldenswag-da-mini",
|
|
147
135
|
task=COMMON_SENSE,
|
|
148
|
-
languages=[
|
|
136
|
+
languages=[DANISH],
|
|
149
137
|
unofficial=True,
|
|
150
138
|
)
|
|
151
139
|
|
|
152
140
|
WINOGRANDE_DA_CONFIG = DatasetConfig(
|
|
153
141
|
name="winogrande-da",
|
|
154
|
-
pretty_name="
|
|
155
|
-
"
|
|
156
|
-
huggingface_id="EuroEval/winogrande-da",
|
|
142
|
+
pretty_name="Winogrande-da",
|
|
143
|
+
source="EuroEval/winogrande-da",
|
|
157
144
|
task=COMMON_SENSE,
|
|
158
|
-
languages=[
|
|
145
|
+
languages=[DANISH],
|
|
159
146
|
_labels=["a", "b"],
|
|
160
147
|
unofficial=True,
|
|
161
148
|
)
|
|
162
|
-
|
|
163
|
-
EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
|
|
164
|
-
name="european-values-situational-da",
|
|
165
|
-
pretty_name="the Danish version of the European values evaluation dataset, where "
|
|
166
|
-
"the questions are phrased in a situational way",
|
|
167
|
-
huggingface_id="EuroEval/european-values-situational-da",
|
|
168
|
-
task=EUROPEAN_VALUES,
|
|
169
|
-
languages=[DA],
|
|
170
|
-
splits=["test"],
|
|
171
|
-
bootstrap_samples=False,
|
|
172
|
-
unofficial=True,
|
|
173
|
-
)
|
|
174
|
-
|
|
175
|
-
EUROPEAN_VALUES_COMPLETIONS_DA_CONFIG = DatasetConfig(
|
|
176
|
-
name="european-values-completions-da",
|
|
177
|
-
pretty_name="the Danish version of the European values evaluation dataset, where "
|
|
178
|
-
"the questions are phrased as sentence completions",
|
|
179
|
-
huggingface_id="EuroEval/european-values-completions-da",
|
|
180
|
-
task=EUROPEAN_VALUES,
|
|
181
|
-
languages=[DA],
|
|
182
|
-
splits=["test"],
|
|
183
|
-
bootstrap_samples=False,
|
|
184
|
-
unofficial=True,
|
|
185
|
-
)
|
|
@@ -1,80 +1,74 @@
|
|
|
1
1
|
"""All Dutch dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
4
|
+
from ..languages import DUTCH
|
|
5
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
DBRD_CONFIG = DatasetConfig(
|
|
10
10
|
name="dbrd",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/dbrd-mini",
|
|
11
|
+
pretty_name="DBRD",
|
|
12
|
+
source="EuroEval/dbrd-mini",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[DUTCH],
|
|
16
15
|
_labels=["negative", "positive"],
|
|
17
16
|
)
|
|
18
17
|
|
|
19
18
|
SCALA_NL_CONFIG = DatasetConfig(
|
|
20
19
|
name="scala-nl",
|
|
21
|
-
pretty_name="
|
|
22
|
-
|
|
20
|
+
pretty_name="ScaLA-nl",
|
|
21
|
+
source="EuroEval/scala-nl",
|
|
23
22
|
task=LA,
|
|
24
|
-
languages=[
|
|
23
|
+
languages=[DUTCH],
|
|
25
24
|
)
|
|
26
25
|
|
|
27
26
|
CONLL_NL_CONFIG = DatasetConfig(
|
|
28
27
|
name="conll-nl",
|
|
29
|
-
pretty_name="
|
|
30
|
-
"
|
|
31
|
-
huggingface_id="EuroEval/conll-nl-mini",
|
|
28
|
+
pretty_name="CoNLL-nl",
|
|
29
|
+
source="EuroEval/conll-nl-mini",
|
|
32
30
|
task=NER,
|
|
33
|
-
languages=[
|
|
31
|
+
languages=[DUTCH],
|
|
34
32
|
)
|
|
35
33
|
|
|
36
34
|
SQUAD_NL_CONFIG = DatasetConfig(
|
|
37
35
|
name="squad-nl",
|
|
38
|
-
pretty_name="
|
|
39
|
-
"
|
|
40
|
-
huggingface_id="EuroEval/squad-nl-v2-mini",
|
|
36
|
+
pretty_name="SQuAD-nl",
|
|
37
|
+
source="EuroEval/squad-nl-v2-mini",
|
|
41
38
|
task=RC,
|
|
42
|
-
languages=[
|
|
39
|
+
languages=[DUTCH],
|
|
43
40
|
)
|
|
44
41
|
|
|
45
42
|
WIKI_LINGUA_NL_CONFIG = DatasetConfig(
|
|
46
43
|
name="wiki-lingua-nl",
|
|
47
|
-
pretty_name="
|
|
48
|
-
"
|
|
49
|
-
huggingface_id="EuroEval/wiki-lingua-nl-mini",
|
|
44
|
+
pretty_name="WikiLingua-nl",
|
|
45
|
+
source="EuroEval/wiki-lingua-nl-mini",
|
|
50
46
|
task=SUMM,
|
|
51
|
-
languages=[
|
|
47
|
+
languages=[DUTCH],
|
|
52
48
|
)
|
|
53
49
|
|
|
54
50
|
MMLU_NL_CONFIG = DatasetConfig(
|
|
55
51
|
name="mmlu-nl",
|
|
56
|
-
pretty_name="
|
|
57
|
-
"
|
|
58
|
-
huggingface_id="EuroEval/mmlu-nl-mini",
|
|
52
|
+
pretty_name="MMLU-nl",
|
|
53
|
+
source="EuroEval/mmlu-nl-mini",
|
|
59
54
|
task=KNOW,
|
|
60
|
-
languages=[
|
|
55
|
+
languages=[DUTCH],
|
|
61
56
|
)
|
|
62
57
|
|
|
63
58
|
HELLASWAG_NL_CONFIG = DatasetConfig(
|
|
64
59
|
name="hellaswag-nl",
|
|
65
|
-
pretty_name="
|
|
66
|
-
"
|
|
67
|
-
huggingface_id="EuroEval/hellaswag-nl-mini",
|
|
60
|
+
pretty_name="HellaSwag-nl",
|
|
61
|
+
source="EuroEval/hellaswag-nl-mini",
|
|
68
62
|
task=COMMON_SENSE,
|
|
69
|
-
languages=[
|
|
63
|
+
languages=[DUTCH],
|
|
70
64
|
)
|
|
71
65
|
|
|
72
|
-
|
|
73
|
-
name="
|
|
74
|
-
pretty_name="
|
|
75
|
-
|
|
66
|
+
VALEU_NL_CONFIG = DatasetConfig(
|
|
67
|
+
name="valeu-nl",
|
|
68
|
+
pretty_name="VaLEU-nl",
|
|
69
|
+
source="EuroEval/european-values-nl",
|
|
76
70
|
task=EUROPEAN_VALUES,
|
|
77
|
-
languages=[
|
|
71
|
+
languages=[DUTCH],
|
|
78
72
|
splits=["test"],
|
|
79
73
|
bootstrap_samples=False,
|
|
80
74
|
_instruction_prompt="{text}",
|
|
@@ -85,96 +79,64 @@ EUROPEAN_VALUES_NL_CONFIG = DatasetConfig(
|
|
|
85
79
|
|
|
86
80
|
DUTCH_COLA_CONFIG = DatasetConfig(
|
|
87
81
|
name="dutch-cola",
|
|
88
|
-
pretty_name="
|
|
89
|
-
"
|
|
90
|
-
huggingface_id="EuroEval/dutch-cola",
|
|
82
|
+
pretty_name="Dutch CoLA",
|
|
83
|
+
source="EuroEval/dutch-cola",
|
|
91
84
|
task=LA,
|
|
92
|
-
languages=[
|
|
85
|
+
languages=[DUTCH],
|
|
93
86
|
unofficial=True,
|
|
94
87
|
)
|
|
95
88
|
|
|
96
89
|
DUTCH_COLA_FULL_CONFIG = DatasetConfig(
|
|
97
90
|
name="dutch-cola-full",
|
|
98
|
-
pretty_name="
|
|
99
|
-
|
|
91
|
+
pretty_name="Dutch CoLA Full",
|
|
92
|
+
source="EuroEval/dutch-cola-full",
|
|
100
93
|
task=LA,
|
|
101
|
-
languages=[
|
|
94
|
+
languages=[DUTCH],
|
|
102
95
|
unofficial=True,
|
|
103
96
|
)
|
|
104
97
|
|
|
105
98
|
ARC_NL_CONFIG = DatasetConfig(
|
|
106
99
|
name="arc-nl",
|
|
107
|
-
pretty_name="
|
|
108
|
-
"
|
|
109
|
-
huggingface_id="EuroEval/arc-nl-mini",
|
|
100
|
+
pretty_name="ARC-nl",
|
|
101
|
+
source="EuroEval/arc-nl-mini",
|
|
110
102
|
task=KNOW,
|
|
111
|
-
languages=[
|
|
103
|
+
languages=[DUTCH],
|
|
112
104
|
unofficial=True,
|
|
113
105
|
)
|
|
114
106
|
|
|
115
107
|
BELEBELE_NL_CONFIG = DatasetConfig(
|
|
116
108
|
name="belebele-nl",
|
|
117
|
-
pretty_name="
|
|
118
|
-
"
|
|
119
|
-
huggingface_id="EuroEval/belebele-nl-mini",
|
|
109
|
+
pretty_name="Belebele-nl",
|
|
110
|
+
source="EuroEval/belebele-nl-mini",
|
|
120
111
|
task=MCRC,
|
|
121
|
-
languages=[
|
|
112
|
+
languages=[DUTCH],
|
|
122
113
|
unofficial=True,
|
|
123
114
|
)
|
|
124
115
|
|
|
125
116
|
MULTI_WIKI_QA_NL_CONFIG = DatasetConfig(
|
|
126
117
|
name="multi-wiki-qa-nl",
|
|
127
|
-
pretty_name="
|
|
128
|
-
"
|
|
129
|
-
huggingface_id="EuroEval/multi-wiki-qa-nl-mini",
|
|
118
|
+
pretty_name="MultiWikiQA-nl",
|
|
119
|
+
source="EuroEval/multi-wiki-qa-nl-mini",
|
|
130
120
|
task=RC,
|
|
131
|
-
languages=[
|
|
121
|
+
languages=[DUTCH],
|
|
132
122
|
unofficial=True,
|
|
133
123
|
)
|
|
134
124
|
|
|
135
125
|
GOLDENSWAG_NL_CONFIG = DatasetConfig(
|
|
136
126
|
name="goldenswag-nl",
|
|
137
|
-
pretty_name="
|
|
138
|
-
"
|
|
139
|
-
huggingface_id="EuroEval/goldenswag-nl-mini",
|
|
127
|
+
pretty_name="GoldenSwag-nl",
|
|
128
|
+
source="EuroEval/goldenswag-nl-mini",
|
|
140
129
|
task=COMMON_SENSE,
|
|
141
|
-
languages=[
|
|
130
|
+
languages=[DUTCH],
|
|
142
131
|
unofficial=True,
|
|
143
132
|
)
|
|
144
133
|
|
|
145
134
|
WINOGRANDE_NL_CONFIG = DatasetConfig(
|
|
146
135
|
name="winogrande-nl",
|
|
147
|
-
pretty_name="
|
|
148
|
-
"
|
|
149
|
-
huggingface_id="EuroEval/winogrande-nl",
|
|
136
|
+
pretty_name="Winogrande-nl",
|
|
137
|
+
source="EuroEval/winogrande-nl",
|
|
150
138
|
task=COMMON_SENSE,
|
|
151
|
-
languages=[
|
|
139
|
+
languages=[DUTCH],
|
|
152
140
|
_labels=["a", "b"],
|
|
153
141
|
unofficial=True,
|
|
154
142
|
)
|
|
155
|
-
|
|
156
|
-
EUROPEAN_VALUES_SITUATIONAL_NL_CONFIG = DatasetConfig(
|
|
157
|
-
name="european-values-situational-nl",
|
|
158
|
-
pretty_name="the Dutch version of the European values evaluation dataset, where "
|
|
159
|
-
"the questions are phrased in a situational way",
|
|
160
|
-
huggingface_id="EuroEval/european-values-situational-nl",
|
|
161
|
-
task=EUROPEAN_VALUES,
|
|
162
|
-
languages=[NL],
|
|
163
|
-
splits=["test"],
|
|
164
|
-
bootstrap_samples=False,
|
|
165
|
-
_instruction_prompt="{text}",
|
|
166
|
-
unofficial=True,
|
|
167
|
-
)
|
|
168
|
-
|
|
169
|
-
EUROPEAN_VALUES_COMPLETIONS_NL_CONFIG = DatasetConfig(
|
|
170
|
-
name="european-values-completions-nl",
|
|
171
|
-
pretty_name="the Dutch version of the European values evaluation dataset, where "
|
|
172
|
-
"the questions are phrased as sentence completions",
|
|
173
|
-
huggingface_id="EuroEval/european-values-completions-nl",
|
|
174
|
-
task=EUROPEAN_VALUES,
|
|
175
|
-
languages=[NL],
|
|
176
|
-
splits=["test"],
|
|
177
|
-
bootstrap_samples=False,
|
|
178
|
-
_instruction_prompt="{text}",
|
|
179
|
-
unofficial=True,
|
|
180
|
-
)
|