EuroEval 16.3.0__py3-none-any.whl → 16.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic. Click here for more details.
- euroeval/__init__.py +9 -2
- euroeval/benchmark_config_factory.py +51 -50
- euroeval/benchmark_modules/base.py +9 -21
- euroeval/benchmark_modules/fresh.py +2 -1
- euroeval/benchmark_modules/hf.py +101 -71
- euroeval/benchmark_modules/litellm.py +115 -53
- euroeval/benchmark_modules/vllm.py +107 -92
- euroeval/benchmarker.py +144 -121
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +86 -8
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +80 -29
- euroeval/data_models.py +338 -330
- euroeval/dataset_configs/__init__.py +12 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +55 -93
- euroeval/dataset_configs/dutch.py +48 -87
- euroeval/dataset_configs/english.py +45 -77
- euroeval/dataset_configs/estonian.py +42 -34
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -69
- euroeval/dataset_configs/french.py +39 -75
- euroeval/dataset_configs/german.py +45 -82
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -79
- euroeval/dataset_configs/latvian.py +28 -35
- euroeval/dataset_configs/lithuanian.py +28 -26
- euroeval/dataset_configs/norwegian.py +72 -115
- euroeval/dataset_configs/polish.py +33 -61
- euroeval/dataset_configs/portuguese.py +33 -66
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/spanish.py +42 -77
- euroeval/dataset_configs/swedish.py +52 -90
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +24 -17
- euroeval/generation.py +15 -14
- euroeval/generation_utils.py +8 -8
- euroeval/languages.py +395 -323
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +21 -6
- euroeval/metrics/llm_as_a_judge.py +6 -4
- euroeval/metrics/pipeline.py +17 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +17 -19
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +99 -42
- euroeval/prompt_templates/multiple_choice.py +102 -38
- euroeval/prompt_templates/named_entity_recognition.py +172 -51
- euroeval/prompt_templates/reading_comprehension.py +119 -42
- euroeval/prompt_templates/sentiment_classification.py +110 -40
- euroeval/prompt_templates/summarization.py +85 -40
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +11 -10
- euroeval/speed_benchmark.py +5 -6
- euroeval/task_group_utils/multiple_choice_classification.py +2 -4
- euroeval/task_group_utils/question_answering.py +24 -16
- euroeval/task_group_utils/sequence_classification.py +48 -35
- euroeval/task_group_utils/text_to_text.py +19 -9
- euroeval/task_group_utils/token_classification.py +21 -17
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +33 -22
- euroeval/types.py +10 -9
- euroeval/utils.py +35 -149
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +196 -39
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.3.0.dist-info/RECORD +0 -71
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""All Serbian dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import SERBIAN
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
MMS_SR_CONFIG = DatasetConfig(
|
|
10
|
+
name="mms-sr",
|
|
11
|
+
pretty_name="MMS-sr",
|
|
12
|
+
source="EuroEval/mms-sr-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[SERBIAN],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
SCALA_SR_CONFIG = DatasetConfig(
|
|
18
|
+
name="scala-sr",
|
|
19
|
+
pretty_name="ScaLA-sr",
|
|
20
|
+
source="EuroEval/scala-sr",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[SERBIAN],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
UNER_SR_CONFIG = DatasetConfig(
|
|
26
|
+
name="uner-sr",
|
|
27
|
+
pretty_name="UNER-sr",
|
|
28
|
+
source="EuroEval/uner-sr-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[SERBIAN],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
MULTI_WIKI_QA_SR_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-sr",
|
|
35
|
+
pretty_name="MultiWikiQA-sr",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-sr-mini",
|
|
37
|
+
task=RC,
|
|
38
|
+
languages=[SERBIAN],
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
LR_SUM_SR_CONFIG = DatasetConfig(
|
|
42
|
+
name="lr-sum-sr",
|
|
43
|
+
pretty_name="LRSum-sr",
|
|
44
|
+
source="EuroEval/lr-sum-sr-mini",
|
|
45
|
+
task=SUMM,
|
|
46
|
+
languages=[SERBIAN],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
MMLU_SR_CONFIG = DatasetConfig(
|
|
50
|
+
name="mmlu-sr",
|
|
51
|
+
pretty_name="MMLU-sr",
|
|
52
|
+
source="EuroEval/mmlu-sr-mini",
|
|
53
|
+
task=KNOW,
|
|
54
|
+
languages=[SERBIAN],
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
WINOGRANDE_SR_CONFIG = DatasetConfig(
|
|
58
|
+
name="winogrande-sr",
|
|
59
|
+
pretty_name="Winogrande-sr",
|
|
60
|
+
source="EuroEval/winogrande-sr",
|
|
61
|
+
task=COMMON_SENSE,
|
|
62
|
+
languages=[SERBIAN],
|
|
63
|
+
_labels=["a", "b"],
|
|
64
|
+
)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""All Slovak dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import SLOVAK
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
CSFD_SENTIMENT_SK_CONFIG = DatasetConfig(
|
|
10
|
+
name="csfd-sentiment-sk",
|
|
11
|
+
pretty_name="CSFD Sentiment SK",
|
|
12
|
+
source="EuroEval/csfd-sentiment-sk-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[SLOVAK],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
SCALA_SK_CONFIG = DatasetConfig(
|
|
18
|
+
name="scala-sk",
|
|
19
|
+
pretty_name="ScaLA-sk",
|
|
20
|
+
source="EuroEval/scala-sk",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[SLOVAK],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
UNER_SK_CONFIG = DatasetConfig(
|
|
26
|
+
name="uner-sk",
|
|
27
|
+
pretty_name="UNER-sk",
|
|
28
|
+
source="EuroEval/uner-sk-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[SLOVAK],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
MULTI_WIKI_QA_SK_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-sk",
|
|
35
|
+
pretty_name="MultiWikiQA-sk",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-sk-mini",
|
|
37
|
+
task=RC,
|
|
38
|
+
languages=[SLOVAK],
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
MMLU_SK_CONFIG = DatasetConfig(
|
|
42
|
+
name="mmlu-sk",
|
|
43
|
+
pretty_name="MMLU-sk",
|
|
44
|
+
source="EuroEval/mmlu-sk-mini",
|
|
45
|
+
task=KNOW,
|
|
46
|
+
languages=[SLOVAK],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
WINOGRANDE_SK_CONFIG = DatasetConfig(
|
|
50
|
+
name="winogrande-sk",
|
|
51
|
+
pretty_name="Winogrande-sk",
|
|
52
|
+
source="EuroEval/winogrande-sk",
|
|
53
|
+
task=COMMON_SENSE,
|
|
54
|
+
languages=[SLOVAK],
|
|
55
|
+
)
|
|
@@ -1,77 +1,73 @@
|
|
|
1
1
|
"""All Spanish dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
4
|
+
from ..languages import SPANISH
|
|
5
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
SENTIMENT_HEADLINES_CONFIG = DatasetConfig(
|
|
10
10
|
name="sentiment-headlines-es",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/sentiment-headlines-es",
|
|
11
|
+
pretty_name="Sentiment Headlines ES",
|
|
12
|
+
source="EuroEval/sentiment-headlines-es",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[SPANISH],
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
SCALA_ES_CONFIG = DatasetConfig(
|
|
19
18
|
name="scala-es",
|
|
20
|
-
pretty_name="
|
|
21
|
-
|
|
19
|
+
pretty_name="ScaLA-es",
|
|
20
|
+
source="EuroEval/scala-es",
|
|
22
21
|
task=LA,
|
|
23
|
-
languages=[
|
|
22
|
+
languages=[SPANISH],
|
|
24
23
|
)
|
|
25
24
|
|
|
26
25
|
CONLL_ES_CONFIG = DatasetConfig(
|
|
27
26
|
name="conll-es",
|
|
28
|
-
pretty_name="
|
|
29
|
-
"
|
|
30
|
-
huggingface_id="EuroEval/conll-es-mini",
|
|
27
|
+
pretty_name="CoNLL-es",
|
|
28
|
+
source="EuroEval/conll-es-mini",
|
|
31
29
|
task=NER,
|
|
32
|
-
languages=[
|
|
30
|
+
languages=[SPANISH],
|
|
33
31
|
)
|
|
34
32
|
|
|
35
33
|
MLQA_ES_CONFIG = DatasetConfig(
|
|
36
34
|
name="mlqa-es",
|
|
37
|
-
pretty_name="
|
|
38
|
-
|
|
35
|
+
pretty_name="MLQA-es",
|
|
36
|
+
source="EuroEval/mlqa-es",
|
|
39
37
|
task=RC,
|
|
40
|
-
languages=[
|
|
38
|
+
languages=[SPANISH],
|
|
41
39
|
)
|
|
42
40
|
|
|
43
41
|
MLSUM_ES_CONFIG = DatasetConfig(
|
|
44
42
|
name="mlsum-es",
|
|
45
|
-
pretty_name="
|
|
46
|
-
|
|
43
|
+
pretty_name="MLSUM-es",
|
|
44
|
+
source="EuroEval/mlsum-es-mini",
|
|
47
45
|
task=SUMM,
|
|
48
|
-
languages=[
|
|
46
|
+
languages=[SPANISH],
|
|
49
47
|
)
|
|
50
48
|
|
|
51
49
|
MMLU_ES_CONFIG = DatasetConfig(
|
|
52
50
|
name="mmlu-es",
|
|
53
|
-
pretty_name="
|
|
54
|
-
"
|
|
55
|
-
huggingface_id="EuroEval/mmlu-es-mini",
|
|
51
|
+
pretty_name="MMLU-es",
|
|
52
|
+
source="EuroEval/mmlu-es-mini",
|
|
56
53
|
task=KNOW,
|
|
57
|
-
languages=[
|
|
54
|
+
languages=[SPANISH],
|
|
58
55
|
)
|
|
59
56
|
|
|
60
57
|
HELLASWAG_ES_CONFIG = DatasetConfig(
|
|
61
58
|
name="hellaswag-es",
|
|
62
|
-
pretty_name="
|
|
63
|
-
"
|
|
64
|
-
huggingface_id="EuroEval/hellaswag-es-mini",
|
|
59
|
+
pretty_name="HellaSwag-es",
|
|
60
|
+
source="EuroEval/hellaswag-es-mini",
|
|
65
61
|
task=COMMON_SENSE,
|
|
66
|
-
languages=[
|
|
62
|
+
languages=[SPANISH],
|
|
67
63
|
)
|
|
68
64
|
|
|
69
|
-
|
|
70
|
-
name="
|
|
71
|
-
pretty_name="
|
|
72
|
-
|
|
65
|
+
VALEU_ES_CONFIG = DatasetConfig(
|
|
66
|
+
name="valeu-es",
|
|
67
|
+
pretty_name="VaLEU-es",
|
|
68
|
+
source="EuroEval/european-values-es",
|
|
73
69
|
task=EUROPEAN_VALUES,
|
|
74
|
-
languages=[
|
|
70
|
+
languages=[SPANISH],
|
|
75
71
|
splits=["test"],
|
|
76
72
|
bootstrap_samples=False,
|
|
77
73
|
_instruction_prompt="{text}",
|
|
@@ -82,77 +78,46 @@ EUROPEAN_VALUES_ES_CONFIG = DatasetConfig(
|
|
|
82
78
|
|
|
83
79
|
XQUAD_ES_CONFIG = DatasetConfig(
|
|
84
80
|
name="xquad-es",
|
|
85
|
-
pretty_name="
|
|
86
|
-
|
|
81
|
+
pretty_name="XQuAD-es",
|
|
82
|
+
source="EuroEval/xquad-es",
|
|
87
83
|
task=RC,
|
|
88
|
-
languages=[
|
|
84
|
+
languages=[SPANISH],
|
|
89
85
|
unofficial=True,
|
|
90
86
|
)
|
|
91
87
|
|
|
92
88
|
BELEBELE_ES_CONFIG = DatasetConfig(
|
|
93
89
|
name="belebele-es",
|
|
94
|
-
pretty_name="
|
|
95
|
-
"
|
|
96
|
-
huggingface_id="EuroEval/belebele-es-mini",
|
|
90
|
+
pretty_name="Belebele-es",
|
|
91
|
+
source="EuroEval/belebele-es-mini",
|
|
97
92
|
task=MCRC,
|
|
98
|
-
languages=[
|
|
93
|
+
languages=[SPANISH],
|
|
99
94
|
unofficial=True,
|
|
100
95
|
)
|
|
101
96
|
|
|
102
97
|
MULTI_WIKI_QA_ES_CONFIG = DatasetConfig(
|
|
103
98
|
name="multi-wiki-qa-es",
|
|
104
|
-
pretty_name="
|
|
105
|
-
"
|
|
106
|
-
huggingface_id="EuroEval/multi-wiki-qa-es-mini",
|
|
99
|
+
pretty_name="MultiWikiQA-es",
|
|
100
|
+
source="EuroEval/multi-wiki-qa-es-mini",
|
|
107
101
|
task=RC,
|
|
108
|
-
languages=[
|
|
102
|
+
languages=[SPANISH],
|
|
109
103
|
unofficial=True,
|
|
110
104
|
)
|
|
111
105
|
|
|
112
106
|
GOLDENSWAG_ES_CONFIG = DatasetConfig(
|
|
113
107
|
name="goldenswag-es",
|
|
114
|
-
pretty_name="
|
|
115
|
-
"
|
|
116
|
-
huggingface_id="EuroEval/goldenswag-es-mini",
|
|
108
|
+
pretty_name="GoldenSwag-es",
|
|
109
|
+
source="EuroEval/goldenswag-es-mini",
|
|
117
110
|
task=COMMON_SENSE,
|
|
118
|
-
languages=[
|
|
111
|
+
languages=[SPANISH],
|
|
119
112
|
unofficial=True,
|
|
120
113
|
)
|
|
121
114
|
|
|
122
115
|
WINOGRANDE_ES_CONFIG = DatasetConfig(
|
|
123
116
|
name="winogrande-es",
|
|
124
|
-
pretty_name="
|
|
125
|
-
"
|
|
126
|
-
huggingface_id="EuroEval/winogrande-es",
|
|
117
|
+
pretty_name="Winogrande-es",
|
|
118
|
+
source="EuroEval/winogrande-es",
|
|
127
119
|
task=COMMON_SENSE,
|
|
128
|
-
languages=[
|
|
129
|
-
splits=["train", "test"],
|
|
120
|
+
languages=[SPANISH],
|
|
130
121
|
_labels=["a", "b"],
|
|
131
122
|
unofficial=True,
|
|
132
123
|
)
|
|
133
|
-
|
|
134
|
-
EUROPEAN_VALUES_SITUATIONAL_ES_CONFIG = DatasetConfig(
|
|
135
|
-
name="european-values-situational-es",
|
|
136
|
-
pretty_name="the Spanish version of the European values evaluation dataset, where "
|
|
137
|
-
"the questions are phrased in a situational way",
|
|
138
|
-
huggingface_id="EuroEval/european-values-situational-es",
|
|
139
|
-
task=EUROPEAN_VALUES,
|
|
140
|
-
languages=[ES],
|
|
141
|
-
splits=["test"],
|
|
142
|
-
bootstrap_samples=False,
|
|
143
|
-
_instruction_prompt="{text}",
|
|
144
|
-
unofficial=True,
|
|
145
|
-
)
|
|
146
|
-
|
|
147
|
-
EUROPEAN_VALUES_COMPLETIONS_ES_CONFIG = DatasetConfig(
|
|
148
|
-
name="european-values-completions-es",
|
|
149
|
-
pretty_name="the Spanish version of the European values evaluation dataset, where "
|
|
150
|
-
"the questions are phrased as sentence completions",
|
|
151
|
-
huggingface_id="EuroEval/european-values-completions-es",
|
|
152
|
-
task=EUROPEAN_VALUES,
|
|
153
|
-
languages=[ES],
|
|
154
|
-
splits=["test"],
|
|
155
|
-
bootstrap_samples=False,
|
|
156
|
-
_instruction_prompt="{text}",
|
|
157
|
-
unofficial=True,
|
|
158
|
-
)
|
|
@@ -1,78 +1,73 @@
|
|
|
1
1
|
"""All Swedish dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
4
|
+
from ..languages import SWEDISH
|
|
5
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
SWEREC_CONFIG = DatasetConfig(
|
|
10
10
|
name="swerec",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/swerec-mini",
|
|
11
|
+
pretty_name="SweReC",
|
|
12
|
+
source="EuroEval/swerec-mini",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[SWEDISH],
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
SCALA_SV_CONFIG = DatasetConfig(
|
|
19
18
|
name="scala-sv",
|
|
20
|
-
pretty_name="
|
|
21
|
-
|
|
19
|
+
pretty_name="ScaLA-sv",
|
|
20
|
+
source="EuroEval/scala-sv",
|
|
22
21
|
task=LA,
|
|
23
|
-
languages=[
|
|
22
|
+
languages=[SWEDISH],
|
|
24
23
|
)
|
|
25
24
|
|
|
26
25
|
SUC3_CONFIG = DatasetConfig(
|
|
27
26
|
name="suc3",
|
|
28
|
-
pretty_name="
|
|
29
|
-
"
|
|
30
|
-
huggingface_id="EuroEval/suc3-mini",
|
|
27
|
+
pretty_name="SUC3",
|
|
28
|
+
source="EuroEval/suc3-mini",
|
|
31
29
|
task=NER,
|
|
32
|
-
languages=[
|
|
30
|
+
languages=[SWEDISH],
|
|
33
31
|
)
|
|
34
32
|
|
|
35
|
-
|
|
36
|
-
name="
|
|
37
|
-
pretty_name="
|
|
38
|
-
"
|
|
39
|
-
huggingface_id="EuroEval/scandiqa-sv-mini",
|
|
33
|
+
MULTI_WIKI_QA_SV_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-sv",
|
|
35
|
+
pretty_name="MultiWikiQA-sv",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-sv-mini",
|
|
40
37
|
task=RC,
|
|
41
|
-
languages=[
|
|
38
|
+
languages=[SWEDISH],
|
|
42
39
|
)
|
|
43
40
|
|
|
44
41
|
SWEDN_CONFIG = DatasetConfig(
|
|
45
42
|
name="swedn",
|
|
46
|
-
pretty_name="
|
|
47
|
-
|
|
43
|
+
pretty_name="SweDN",
|
|
44
|
+
source="EuroEval/swedn-mini",
|
|
48
45
|
task=SUMM,
|
|
49
|
-
languages=[
|
|
46
|
+
languages=[SWEDISH],
|
|
50
47
|
)
|
|
51
48
|
|
|
52
49
|
MMLU_SV_CONFIG = DatasetConfig(
|
|
53
50
|
name="mmlu-sv",
|
|
54
|
-
pretty_name="
|
|
55
|
-
"
|
|
56
|
-
huggingface_id="EuroEval/mmlu-sv-mini",
|
|
51
|
+
pretty_name="MMLU-sv",
|
|
52
|
+
source="EuroEval/mmlu-sv-mini",
|
|
57
53
|
task=KNOW,
|
|
58
|
-
languages=[
|
|
54
|
+
languages=[SWEDISH],
|
|
59
55
|
)
|
|
60
56
|
|
|
61
57
|
HELLASWAG_SV_CONFIG = DatasetConfig(
|
|
62
58
|
name="hellaswag-sv",
|
|
63
|
-
pretty_name="
|
|
64
|
-
"
|
|
65
|
-
huggingface_id="EuroEval/hellaswag-sv-mini",
|
|
59
|
+
pretty_name="HellaSwag-sv",
|
|
60
|
+
source="EuroEval/hellaswag-sv-mini",
|
|
66
61
|
task=COMMON_SENSE,
|
|
67
|
-
languages=[
|
|
62
|
+
languages=[SWEDISH],
|
|
68
63
|
)
|
|
69
64
|
|
|
70
|
-
|
|
71
|
-
name="
|
|
72
|
-
pretty_name="
|
|
73
|
-
|
|
65
|
+
VALEU_SV_CONFIG = DatasetConfig(
|
|
66
|
+
name="valeu-sv",
|
|
67
|
+
pretty_name="VaLEU-sv",
|
|
68
|
+
source="EuroEval/european-values-sv",
|
|
74
69
|
task=EUROPEAN_VALUES,
|
|
75
|
-
languages=[
|
|
70
|
+
languages=[SWEDISH],
|
|
76
71
|
splits=["test"],
|
|
77
72
|
bootstrap_samples=False,
|
|
78
73
|
_instruction_prompt="{text}",
|
|
@@ -83,97 +78,64 @@ EUROPEAN_VALUES_SV_CONFIG = DatasetConfig(
|
|
|
83
78
|
|
|
84
79
|
SCHIBSTED_SV_CONFIG = DatasetConfig(
|
|
85
80
|
name="schibsted-sv",
|
|
86
|
-
pretty_name="
|
|
87
|
-
|
|
81
|
+
pretty_name="Schibsted-sv",
|
|
82
|
+
source="EuroEval/schibsted-article-summaries-sv",
|
|
88
83
|
task=SUMM,
|
|
89
|
-
languages=[
|
|
84
|
+
languages=[SWEDISH],
|
|
90
85
|
unofficial=True,
|
|
91
86
|
)
|
|
92
87
|
|
|
93
88
|
ARC_SV_CONFIG = DatasetConfig(
|
|
94
89
|
name="arc-sv",
|
|
95
|
-
pretty_name="
|
|
96
|
-
"
|
|
97
|
-
huggingface_id="EuroEval/arc-sv-mini",
|
|
90
|
+
pretty_name="ARC-sv",
|
|
91
|
+
source="EuroEval/arc-sv-mini",
|
|
98
92
|
task=KNOW,
|
|
99
|
-
languages=[
|
|
93
|
+
languages=[SWEDISH],
|
|
100
94
|
unofficial=True,
|
|
101
95
|
)
|
|
102
96
|
|
|
103
97
|
BELEBELE_SV_CONFIG = DatasetConfig(
|
|
104
98
|
name="belebele-sv",
|
|
105
|
-
pretty_name="
|
|
106
|
-
"
|
|
107
|
-
huggingface_id="EuroEval/belebele-sv-mini",
|
|
99
|
+
pretty_name="Belebele-sv",
|
|
100
|
+
source="EuroEval/belebele-sv-mini",
|
|
108
101
|
task=MCRC,
|
|
109
|
-
languages=[
|
|
102
|
+
languages=[SWEDISH],
|
|
110
103
|
unofficial=True,
|
|
111
104
|
)
|
|
112
105
|
|
|
113
|
-
|
|
114
|
-
name="
|
|
115
|
-
pretty_name="
|
|
116
|
-
"
|
|
117
|
-
huggingface_id="EuroEval/multi-wiki-qa-sv-mini",
|
|
106
|
+
SCANDIQA_SV_CONFIG = DatasetConfig(
|
|
107
|
+
name="scandiqa-sv",
|
|
108
|
+
pretty_name="ScandiQA-sv",
|
|
109
|
+
source="EuroEval/scandiqa-sv-mini",
|
|
118
110
|
task=RC,
|
|
119
|
-
languages=[
|
|
111
|
+
languages=[SWEDISH],
|
|
120
112
|
unofficial=True,
|
|
121
113
|
)
|
|
122
114
|
|
|
123
115
|
GOLDENSWAG_SV_CONFIG = DatasetConfig(
|
|
124
116
|
name="goldenswag-sv",
|
|
125
|
-
pretty_name="
|
|
126
|
-
"
|
|
127
|
-
huggingface_id="EuroEval/goldenswag-sv-mini",
|
|
117
|
+
pretty_name="GoldenSwag-sv",
|
|
118
|
+
source="EuroEval/goldenswag-sv-mini",
|
|
128
119
|
task=COMMON_SENSE,
|
|
129
|
-
languages=[
|
|
120
|
+
languages=[SWEDISH],
|
|
130
121
|
unofficial=True,
|
|
131
122
|
)
|
|
132
123
|
|
|
133
124
|
WINOGRANDE_SV_CONFIG = DatasetConfig(
|
|
134
125
|
name="winogrande-sv",
|
|
135
|
-
pretty_name="
|
|
136
|
-
"
|
|
137
|
-
huggingface_id="EuroEval/winogrande-sv",
|
|
126
|
+
pretty_name="Winogrande-sv",
|
|
127
|
+
source="EuroEval/winogrande-sv",
|
|
138
128
|
task=COMMON_SENSE,
|
|
139
|
-
languages=[
|
|
140
|
-
splits=["train", "test"],
|
|
129
|
+
languages=[SWEDISH],
|
|
141
130
|
_labels=["a", "b"],
|
|
142
131
|
unofficial=True,
|
|
143
132
|
)
|
|
144
133
|
|
|
145
|
-
EUROPEAN_VALUES_SITUATIONAL_SV_CONFIG = DatasetConfig(
|
|
146
|
-
name="european-values-situational-sv",
|
|
147
|
-
pretty_name="the Swedish version of the European values evaluation dataset, where "
|
|
148
|
-
"the questions are phrased in a situational way",
|
|
149
|
-
huggingface_id="EuroEval/european-values-situational-sv",
|
|
150
|
-
task=EUROPEAN_VALUES,
|
|
151
|
-
languages=[SV],
|
|
152
|
-
splits=["test"],
|
|
153
|
-
bootstrap_samples=False,
|
|
154
|
-
_instruction_prompt="{text}",
|
|
155
|
-
unofficial=True,
|
|
156
|
-
)
|
|
157
|
-
|
|
158
|
-
EUROPEAN_VALUES_COMPLETIONS_SV_CONFIG = DatasetConfig(
|
|
159
|
-
name="european-values-completions-sv",
|
|
160
|
-
pretty_name="the Swedish version of the European values evaluation dataset, where "
|
|
161
|
-
"the questions are phrased as sentence completions",
|
|
162
|
-
huggingface_id="EuroEval/european-values-completions-sv",
|
|
163
|
-
task=EUROPEAN_VALUES,
|
|
164
|
-
languages=[SV],
|
|
165
|
-
splits=["test"],
|
|
166
|
-
bootstrap_samples=False,
|
|
167
|
-
_instruction_prompt="{text}",
|
|
168
|
-
unofficial=True,
|
|
169
|
-
)
|
|
170
|
-
|
|
171
134
|
SKOLPROV_CONFIG = DatasetConfig(
|
|
172
135
|
name="skolprov",
|
|
173
|
-
pretty_name="
|
|
174
|
-
|
|
136
|
+
pretty_name="Skolprov",
|
|
137
|
+
source="EuroEval/skolprov",
|
|
175
138
|
task=KNOW,
|
|
176
|
-
languages=[
|
|
177
|
-
splits=["train", "test"],
|
|
139
|
+
languages=[SWEDISH],
|
|
178
140
|
unofficial=True,
|
|
179
141
|
)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""All Ukrainian dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import UKRAINIAN
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
CROSS_DOMAIN_UK_REVIEWS_CONFIG = DatasetConfig(
|
|
10
|
+
name="cross-domain-uk-reviews",
|
|
11
|
+
pretty_name="Cross Domain Ukrainian Reviews",
|
|
12
|
+
source="EuroEval/cross-domain-uk-reviews-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[UKRAINIAN],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
SCALA_UK_CONFIG = DatasetConfig(
|
|
18
|
+
name="scala-uk",
|
|
19
|
+
pretty_name="ScaLA-uk",
|
|
20
|
+
source="EuroEval/scala-uk",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[UKRAINIAN],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
NER_UK_CONFIG = DatasetConfig(
|
|
26
|
+
name="ner-uk",
|
|
27
|
+
pretty_name="NER-uk",
|
|
28
|
+
source="EuroEval/ner-uk-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[UKRAINIAN],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
MULTI_WIKI_QA_UK_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-uk",
|
|
35
|
+
pretty_name="MultiWikiQA-uk",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-uk-mini",
|
|
37
|
+
task=RC,
|
|
38
|
+
languages=[UKRAINIAN],
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
LR_SUM_UK_CONFIG = DatasetConfig(
|
|
42
|
+
name="lr-sum-uk",
|
|
43
|
+
pretty_name="LRSum-uk",
|
|
44
|
+
source="EuroEval/lr-sum-uk-mini",
|
|
45
|
+
task=SUMM,
|
|
46
|
+
languages=[UKRAINIAN],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
GLOBAL_MMLU_UK_CONFIG = DatasetConfig(
|
|
50
|
+
name="global-mmlu-uk",
|
|
51
|
+
pretty_name="GlobalMMLU-uk",
|
|
52
|
+
source="EuroEval/global-mmlu-uk-mini",
|
|
53
|
+
task=KNOW,
|
|
54
|
+
languages=[UKRAINIAN],
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
WINOGRANDE_UK_CONFIG = DatasetConfig(
|
|
58
|
+
name="winogrande-uk",
|
|
59
|
+
pretty_name="Winogrande-uk",
|
|
60
|
+
source="EuroEval/winogrande-uk",
|
|
61
|
+
task=COMMON_SENSE,
|
|
62
|
+
languages=[UKRAINIAN],
|
|
63
|
+
_labels=["a", "b"],
|
|
64
|
+
)
|
euroeval/exceptions.py
CHANGED
|
@@ -145,7 +145,7 @@ class NeedsAdditionalArgument(InvalidModel):
|
|
|
145
145
|
else:
|
|
146
146
|
self.message = (
|
|
147
147
|
f"The model you are trying to load requires the `{script_argument}` "
|
|
148
|
-
"argument
|
|
148
|
+
"argument to be passed to the `Benchmarker` class. Please pass the "
|
|
149
149
|
"argument and try again."
|
|
150
150
|
)
|
|
151
151
|
super().__init__(self.message)
|