EuroEval 16.4.0__py3-none-any.whl → 16.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic. Click here for more details.
- euroeval/__init__.py +6 -0
- euroeval/benchmark_config_factory.py +51 -46
- euroeval/benchmark_modules/base.py +6 -5
- euroeval/benchmark_modules/hf.py +2 -9
- euroeval/benchmark_modules/litellm.py +14 -12
- euroeval/benchmark_modules/vllm.py +17 -10
- euroeval/benchmarker.py +61 -44
- euroeval/caching_utils.py +1 -1
- euroeval/cli.py +86 -8
- euroeval/constants.py +3 -0
- euroeval/data_loading.py +78 -30
- euroeval/data_models.py +326 -326
- euroeval/dataset_configs/__init__.py +10 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +25 -29
- euroeval/dataset_configs/danish.py +51 -88
- euroeval/dataset_configs/dutch.py +48 -86
- euroeval/dataset_configs/english.py +45 -76
- euroeval/dataset_configs/estonian.py +36 -38
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -68
- euroeval/dataset_configs/french.py +39 -74
- euroeval/dataset_configs/german.py +45 -81
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -78
- euroeval/dataset_configs/latvian.py +28 -34
- euroeval/dataset_configs/lithuanian.py +22 -26
- euroeval/dataset_configs/norwegian.py +72 -114
- euroeval/dataset_configs/polish.py +33 -60
- euroeval/dataset_configs/portuguese.py +33 -65
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +19 -24
- euroeval/dataset_configs/spanish.py +42 -76
- euroeval/dataset_configs/swedish.py +48 -84
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +3 -2
- euroeval/generation.py +5 -4
- euroeval/generation_utils.py +6 -5
- euroeval/languages.py +395 -323
- euroeval/metrics/huggingface.py +14 -3
- euroeval/metrics/llm_as_a_judge.py +1 -1
- euroeval/model_cache.py +6 -5
- euroeval/model_loading.py +1 -1
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +82 -43
- euroeval/prompt_templates/multiple_choice.py +81 -41
- euroeval/prompt_templates/named_entity_recognition.py +125 -44
- euroeval/prompt_templates/reading_comprehension.py +92 -43
- euroeval/prompt_templates/sentiment_classification.py +91 -43
- euroeval/prompt_templates/summarization.py +64 -39
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +4 -3
- euroeval/speed_benchmark.py +2 -1
- euroeval/task_group_utils/multiple_choice_classification.py +2 -1
- euroeval/task_group_utils/question_answering.py +24 -13
- euroeval/task_group_utils/sequence_classification.py +5 -4
- euroeval/task_group_utils/text_to_text.py +2 -1
- euroeval/task_group_utils/token_classification.py +11 -8
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +19 -10
- euroeval/types.py +10 -9
- euroeval/utils.py +6 -3
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.4.0.dist-info/RECORD +0 -75
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,105 +1,99 @@
|
|
|
1
1
|
"""All Norwegian dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
4
|
+
from ..languages import NORWEGIAN, NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK
|
|
5
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
NOREC_CONFIG = DatasetConfig(
|
|
10
10
|
name="norec",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/norec-mini",
|
|
11
|
+
pretty_name="NoReC",
|
|
12
|
+
source="EuroEval/norec-mini",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
SCALA_NB_CONFIG = DatasetConfig(
|
|
19
18
|
name="scala-nb",
|
|
20
|
-
pretty_name="
|
|
21
|
-
|
|
19
|
+
pretty_name="ScaLA-nb",
|
|
20
|
+
source="EuroEval/scala-nb",
|
|
22
21
|
task=LA,
|
|
23
|
-
languages=[
|
|
22
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN],
|
|
24
23
|
)
|
|
25
24
|
|
|
26
25
|
SCALA_NN_CONFIG = DatasetConfig(
|
|
27
26
|
name="scala-nn",
|
|
28
|
-
pretty_name="
|
|
29
|
-
|
|
27
|
+
pretty_name="ScaLA-nn",
|
|
28
|
+
source="EuroEval/scala-nn",
|
|
30
29
|
task=LA,
|
|
31
|
-
languages=[
|
|
30
|
+
languages=[NORWEGIAN_NYNORSK],
|
|
32
31
|
)
|
|
33
32
|
|
|
34
33
|
NORNE_NB_CONFIG = DatasetConfig(
|
|
35
34
|
name="norne-nb",
|
|
36
|
-
pretty_name="
|
|
37
|
-
"
|
|
38
|
-
huggingface_id="EuroEval/norne-nb-mini",
|
|
35
|
+
pretty_name="NorNE-nb",
|
|
36
|
+
source="EuroEval/norne-nb-mini",
|
|
39
37
|
task=NER,
|
|
40
|
-
languages=[
|
|
38
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN],
|
|
41
39
|
)
|
|
42
40
|
|
|
43
41
|
NORNE_NN_CONFIG = DatasetConfig(
|
|
44
42
|
name="norne-nn",
|
|
45
|
-
pretty_name="
|
|
46
|
-
"
|
|
47
|
-
huggingface_id="EuroEval/norne-nn-mini",
|
|
43
|
+
pretty_name="NorNE-nn",
|
|
44
|
+
source="EuroEval/norne-nn-mini",
|
|
48
45
|
task=NER,
|
|
49
|
-
languages=[
|
|
46
|
+
languages=[NORWEGIAN_NYNORSK],
|
|
50
47
|
)
|
|
51
48
|
|
|
52
49
|
NORQUAD_CONFIG = DatasetConfig(
|
|
53
50
|
name="norquad",
|
|
54
|
-
pretty_name="
|
|
55
|
-
"
|
|
56
|
-
huggingface_id="EuroEval/norquad-mini",
|
|
51
|
+
pretty_name="NorQuAD",
|
|
52
|
+
source="EuroEval/norquad-mini",
|
|
57
53
|
task=RC,
|
|
58
|
-
languages=[
|
|
54
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
59
55
|
_num_few_shot_examples=2,
|
|
60
56
|
)
|
|
61
57
|
|
|
62
58
|
NO_SAMMENDRAG_CONFIG = DatasetConfig(
|
|
63
59
|
name="no-sammendrag",
|
|
64
|
-
pretty_name="
|
|
65
|
-
"
|
|
66
|
-
huggingface_id="EuroEval/no-sammendrag-mini",
|
|
60
|
+
pretty_name="NoSammendrag",
|
|
61
|
+
source="EuroEval/no-sammendrag-mini",
|
|
67
62
|
task=SUMM,
|
|
68
|
-
languages=[
|
|
63
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
69
64
|
)
|
|
70
65
|
|
|
71
66
|
NRK_QUIZ_QA_CONFIG = DatasetConfig(
|
|
72
67
|
name="nrk-quiz-qa",
|
|
73
|
-
pretty_name="
|
|
74
|
-
|
|
68
|
+
pretty_name="NRK Quiz QA",
|
|
69
|
+
source="EuroEval/nrk-quiz-qa-mini",
|
|
75
70
|
task=KNOW,
|
|
76
|
-
languages=[
|
|
71
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
77
72
|
)
|
|
78
73
|
|
|
79
74
|
IDIOMS_NO_CONFIG = DatasetConfig(
|
|
80
75
|
name="idioms-no",
|
|
81
|
-
pretty_name="
|
|
82
|
-
|
|
76
|
+
pretty_name="Idioms-no",
|
|
77
|
+
source="EuroEval/idioms-no",
|
|
83
78
|
task=KNOW,
|
|
84
|
-
languages=[
|
|
79
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
85
80
|
)
|
|
86
81
|
|
|
87
82
|
NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
|
|
88
83
|
name="nor-common-sense-qa",
|
|
89
|
-
pretty_name="
|
|
90
|
-
"
|
|
91
|
-
huggingface_id="EuroEval/nor-common-sense-qa",
|
|
84
|
+
pretty_name="NorCommonSenseQA",
|
|
85
|
+
source="EuroEval/nor-common-sense-qa",
|
|
92
86
|
task=COMMON_SENSE,
|
|
93
|
-
languages=[
|
|
87
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
94
88
|
_labels=["a", "b", "c", "d", "e"],
|
|
95
89
|
)
|
|
96
90
|
|
|
97
|
-
|
|
98
|
-
name="
|
|
99
|
-
pretty_name="
|
|
100
|
-
|
|
91
|
+
VALEU_NO_CONFIG = DatasetConfig(
|
|
92
|
+
name="valeu-no",
|
|
93
|
+
pretty_name="VaLEU-no",
|
|
94
|
+
source="EuroEval/european-values-no",
|
|
101
95
|
task=EUROPEAN_VALUES,
|
|
102
|
-
languages=[
|
|
96
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
103
97
|
splits=["test"],
|
|
104
98
|
bootstrap_samples=False,
|
|
105
99
|
_instruction_prompt="{text}",
|
|
@@ -110,145 +104,109 @@ EUROPEAN_VALUES_NO_CONFIG = DatasetConfig(
|
|
|
110
104
|
|
|
111
105
|
NO_COLA_CONFIG = DatasetConfig(
|
|
112
106
|
name="no-cola",
|
|
113
|
-
pretty_name="
|
|
114
|
-
"
|
|
115
|
-
huggingface_id="EuroEval/no-cola-mini",
|
|
107
|
+
pretty_name="NoCoLA",
|
|
108
|
+
source="EuroEval/no-cola-mini",
|
|
116
109
|
task=LA,
|
|
117
|
-
languages=[
|
|
110
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN],
|
|
118
111
|
unofficial=True,
|
|
119
112
|
)
|
|
120
113
|
|
|
121
114
|
NORGLM_MULTI_QA = DatasetConfig(
|
|
122
115
|
name="norglm-multi-qa",
|
|
123
|
-
pretty_name="
|
|
124
|
-
"
|
|
125
|
-
huggingface_id="EuroEval/norglm-multi-qa",
|
|
116
|
+
pretty_name="NorGLM-Multi-QA",
|
|
117
|
+
source="EuroEval/norglm-multi-qa",
|
|
126
118
|
task=RC,
|
|
127
|
-
languages=[
|
|
119
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
128
120
|
unofficial=True,
|
|
129
121
|
)
|
|
130
122
|
|
|
131
123
|
NORGLM_MULTI_SUM = DatasetConfig(
|
|
132
124
|
name="norglm-multi-sum",
|
|
133
|
-
pretty_name="
|
|
134
|
-
"
|
|
135
|
-
huggingface_id="EuroEval/norglm-multi-sum",
|
|
125
|
+
pretty_name="NorGLM-Multi-Sum",
|
|
126
|
+
source="EuroEval/norglm-multi-sum",
|
|
136
127
|
task=SUMM,
|
|
137
|
-
languages=[
|
|
128
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
138
129
|
unofficial=True,
|
|
139
130
|
)
|
|
140
131
|
|
|
141
132
|
SCHIBSTED_NO_CONFIG = DatasetConfig(
|
|
142
133
|
name="schibsted-no",
|
|
143
|
-
pretty_name="
|
|
144
|
-
|
|
134
|
+
pretty_name="Schibsted-no",
|
|
135
|
+
source="EuroEval/schibsted-article-summaries-no",
|
|
145
136
|
task=SUMM,
|
|
146
|
-
languages=[
|
|
137
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
147
138
|
unofficial=True,
|
|
148
139
|
)
|
|
149
140
|
|
|
150
141
|
PERSONAL_SUM_CONFIG = DatasetConfig(
|
|
151
142
|
name="personal-sum",
|
|
152
|
-
pretty_name="
|
|
153
|
-
|
|
143
|
+
pretty_name="Personal Sum",
|
|
144
|
+
source="EuroEval/personal-sum",
|
|
154
145
|
task=SUMM,
|
|
155
|
-
languages=[
|
|
146
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
156
147
|
unofficial=True,
|
|
157
148
|
)
|
|
158
149
|
|
|
159
150
|
MMLU_NO_CONFIG = DatasetConfig(
|
|
160
151
|
name="mmlu-no",
|
|
161
|
-
pretty_name="
|
|
162
|
-
"
|
|
163
|
-
huggingface_id="EuroEval/mmlu-no-mini",
|
|
152
|
+
pretty_name="MMLU-no",
|
|
153
|
+
source="EuroEval/mmlu-no-mini",
|
|
164
154
|
task=KNOW,
|
|
165
|
-
languages=[
|
|
155
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
166
156
|
unofficial=True,
|
|
167
157
|
)
|
|
168
158
|
|
|
169
159
|
ARC_NO_CONFIG = DatasetConfig(
|
|
170
160
|
name="arc-no",
|
|
171
|
-
pretty_name="
|
|
172
|
-
"
|
|
173
|
-
huggingface_id="EuroEval/arc-no-mini",
|
|
161
|
+
pretty_name="ARC-no",
|
|
162
|
+
source="EuroEval/arc-no-mini",
|
|
174
163
|
task=KNOW,
|
|
175
|
-
languages=[
|
|
164
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
176
165
|
unofficial=True,
|
|
177
166
|
)
|
|
178
167
|
|
|
179
168
|
HELLASWAG_NO_CONFIG = DatasetConfig(
|
|
180
169
|
name="hellaswag-no",
|
|
181
|
-
pretty_name="
|
|
182
|
-
"
|
|
183
|
-
huggingface_id="EuroEval/hellaswag-no-mini",
|
|
170
|
+
pretty_name="HellaSwag-no",
|
|
171
|
+
source="EuroEval/hellaswag-no-mini",
|
|
184
172
|
task=COMMON_SENSE,
|
|
185
|
-
languages=[
|
|
173
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
186
174
|
unofficial=True,
|
|
187
175
|
)
|
|
188
176
|
|
|
189
177
|
BELEBELE_NO_CONFIG = DatasetConfig(
|
|
190
178
|
name="belebele-no",
|
|
191
|
-
pretty_name="
|
|
192
|
-
"
|
|
193
|
-
huggingface_id="EuroEval/belebele-no-mini",
|
|
179
|
+
pretty_name="Belebele-no",
|
|
180
|
+
source="EuroEval/belebele-no-mini",
|
|
194
181
|
task=MCRC,
|
|
195
|
-
languages=[
|
|
182
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
196
183
|
unofficial=True,
|
|
197
184
|
)
|
|
198
185
|
|
|
199
186
|
MULTI_WIKI_QA_NB_CONFIG = DatasetConfig(
|
|
200
187
|
name="multi-wiki-qa-nb",
|
|
201
|
-
pretty_name="
|
|
202
|
-
"
|
|
203
|
-
huggingface_id="EuroEval/multi-wiki-qa-no-mini",
|
|
188
|
+
pretty_name="MultiWikiQA-nb",
|
|
189
|
+
source="EuroEval/multi-wiki-qa-no-mini",
|
|
204
190
|
task=RC,
|
|
205
|
-
languages=[
|
|
191
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN],
|
|
206
192
|
unofficial=True,
|
|
207
193
|
)
|
|
208
194
|
|
|
209
195
|
MULTI_WIKI_QA_NN_CONFIG = DatasetConfig(
|
|
210
196
|
name="multi-wiki-qa-nn",
|
|
211
|
-
pretty_name="
|
|
212
|
-
"
|
|
213
|
-
huggingface_id="EuroEval/multi-wiki-qa-nn-mini",
|
|
197
|
+
pretty_name="MultiWikiQA-nn",
|
|
198
|
+
source="EuroEval/multi-wiki-qa-nn-mini",
|
|
214
199
|
task=RC,
|
|
215
|
-
languages=[
|
|
200
|
+
languages=[NORWEGIAN_NYNORSK],
|
|
216
201
|
unofficial=True,
|
|
217
202
|
)
|
|
218
203
|
|
|
219
204
|
WINOGRANDE_NO_CONFIG = DatasetConfig(
|
|
220
205
|
name="winogrande-no",
|
|
221
|
-
pretty_name="
|
|
222
|
-
"
|
|
223
|
-
huggingface_id="EuroEval/winogrande-no",
|
|
206
|
+
pretty_name="Winogrande-no",
|
|
207
|
+
source="EuroEval/winogrande-no",
|
|
224
208
|
task=COMMON_SENSE,
|
|
225
|
-
languages=[
|
|
209
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
226
210
|
_labels=["a", "b"],
|
|
227
211
|
unofficial=True,
|
|
228
212
|
)
|
|
229
|
-
|
|
230
|
-
EUROPEAN_VALUES_SITUATIONAL_NO_CONFIG = DatasetConfig(
|
|
231
|
-
name="european-values-situational-no",
|
|
232
|
-
pretty_name="the Norwegian version of the European values evaluation dataset, "
|
|
233
|
-
"where the questions are phrased in a situational way",
|
|
234
|
-
huggingface_id="EuroEval/european-values-situational-no",
|
|
235
|
-
task=EUROPEAN_VALUES,
|
|
236
|
-
languages=[NB, NN, NO],
|
|
237
|
-
splits=["test"],
|
|
238
|
-
bootstrap_samples=False,
|
|
239
|
-
_instruction_prompt="{text}",
|
|
240
|
-
unofficial=True,
|
|
241
|
-
)
|
|
242
|
-
|
|
243
|
-
EUROPEAN_VALUES_COMPLETIONS_NO_CONFIG = DatasetConfig(
|
|
244
|
-
name="european-values-completions-no",
|
|
245
|
-
pretty_name="the Norwegian version of the European values evaluation dataset, "
|
|
246
|
-
"where the questions are phrased as sentence completions",
|
|
247
|
-
huggingface_id="EuroEval/european-values-completions-no",
|
|
248
|
-
task=EUROPEAN_VALUES,
|
|
249
|
-
languages=[NO],
|
|
250
|
-
splits=["test"],
|
|
251
|
-
bootstrap_samples=False,
|
|
252
|
-
_instruction_prompt="{text}",
|
|
253
|
-
unofficial=True,
|
|
254
|
-
)
|
|
@@ -1,75 +1,74 @@
|
|
|
1
1
|
"""All Polish dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
4
|
+
from ..languages import POLISH
|
|
5
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
POLEMO2_CONFIG = DatasetConfig(
|
|
10
10
|
name="polemo2",
|
|
11
|
-
pretty_name="
|
|
12
|
-
|
|
11
|
+
pretty_name="Polemo2",
|
|
12
|
+
source="EuroEval/polemo2-mini",
|
|
13
13
|
task=SENT,
|
|
14
|
-
languages=[
|
|
14
|
+
languages=[POLISH],
|
|
15
15
|
)
|
|
16
16
|
|
|
17
17
|
SCALA_PL_CONFIG = DatasetConfig(
|
|
18
18
|
name="scala-pl",
|
|
19
|
-
pretty_name="
|
|
20
|
-
|
|
19
|
+
pretty_name="ScaLA-pl",
|
|
20
|
+
source="EuroEval/scala-pl",
|
|
21
21
|
task=LA,
|
|
22
|
-
languages=[
|
|
22
|
+
languages=[POLISH],
|
|
23
23
|
)
|
|
24
24
|
|
|
25
25
|
KPWR_NER_CONFIG = DatasetConfig(
|
|
26
26
|
name="kpwr-ner",
|
|
27
|
-
pretty_name="
|
|
28
|
-
|
|
27
|
+
pretty_name="KPWr-NER",
|
|
28
|
+
source="EuroEval/kpwr-ner",
|
|
29
29
|
task=NER,
|
|
30
|
-
languages=[
|
|
30
|
+
languages=[POLISH],
|
|
31
31
|
)
|
|
32
32
|
|
|
33
33
|
POQUAD_CONFIG = DatasetConfig(
|
|
34
34
|
name="poquad",
|
|
35
|
-
pretty_name="
|
|
36
|
-
|
|
35
|
+
pretty_name="PoQuAD",
|
|
36
|
+
source="EuroEval/poquad-mini",
|
|
37
37
|
task=RC,
|
|
38
|
-
languages=[
|
|
38
|
+
languages=[POLISH],
|
|
39
39
|
)
|
|
40
40
|
|
|
41
41
|
PSC_CONFIG = DatasetConfig(
|
|
42
42
|
name="psc",
|
|
43
|
-
pretty_name="
|
|
44
|
-
|
|
43
|
+
pretty_name="PSC",
|
|
44
|
+
source="EuroEval/psc-mini",
|
|
45
45
|
task=SUMM,
|
|
46
|
-
languages=[
|
|
46
|
+
languages=[POLISH],
|
|
47
47
|
)
|
|
48
48
|
|
|
49
49
|
LLMZSZL_CONFIG = DatasetConfig(
|
|
50
50
|
name="llmzszl",
|
|
51
|
-
pretty_name="
|
|
52
|
-
|
|
51
|
+
pretty_name="LLMzSzŁ",
|
|
52
|
+
source="EuroEval/llmzszl-mini",
|
|
53
53
|
task=KNOW,
|
|
54
|
-
languages=[
|
|
54
|
+
languages=[POLISH],
|
|
55
55
|
)
|
|
56
56
|
|
|
57
57
|
WINOGRANDE_PL_CONFIG = DatasetConfig(
|
|
58
58
|
name="winogrande-pl",
|
|
59
|
-
pretty_name="
|
|
60
|
-
"
|
|
61
|
-
huggingface_id="EuroEval/winogrande-pl",
|
|
59
|
+
pretty_name="Winogrande-pl",
|
|
60
|
+
source="EuroEval/winogrande-pl",
|
|
62
61
|
task=COMMON_SENSE,
|
|
63
|
-
languages=[
|
|
62
|
+
languages=[POLISH],
|
|
64
63
|
_labels=["a", "b"],
|
|
65
64
|
)
|
|
66
65
|
|
|
67
|
-
|
|
68
|
-
name="
|
|
69
|
-
pretty_name="
|
|
70
|
-
|
|
66
|
+
VALEU_PL_CONFIG = DatasetConfig(
|
|
67
|
+
name="valeu-pl",
|
|
68
|
+
pretty_name="VaLEU-pl",
|
|
69
|
+
source="EuroEval/european-values-pl",
|
|
71
70
|
task=EUROPEAN_VALUES,
|
|
72
|
-
languages=[
|
|
71
|
+
languages=[POLISH],
|
|
73
72
|
splits=["test"],
|
|
74
73
|
bootstrap_samples=False,
|
|
75
74
|
_instruction_prompt="{text}",
|
|
@@ -80,44 +79,18 @@ EUROPEAN_VALUES_PL_CONFIG = DatasetConfig(
|
|
|
80
79
|
|
|
81
80
|
MULTI_WIKI_QA_PL_CONFIG = DatasetConfig(
|
|
82
81
|
name="multi-wiki-qa-pl",
|
|
83
|
-
pretty_name="
|
|
84
|
-
"
|
|
85
|
-
huggingface_id="EuroEval/multi-wiki-qa-pl-mini",
|
|
82
|
+
pretty_name="MultiWikiQA-pl",
|
|
83
|
+
source="EuroEval/multi-wiki-qa-pl-mini",
|
|
86
84
|
task=RC,
|
|
87
|
-
languages=[
|
|
85
|
+
languages=[POLISH],
|
|
88
86
|
unofficial=True,
|
|
89
87
|
)
|
|
90
88
|
|
|
91
89
|
GOLDENSWAG_PL_CONFIG = DatasetConfig(
|
|
92
90
|
name="goldenswag-pl",
|
|
93
|
-
pretty_name="
|
|
94
|
-
"
|
|
95
|
-
huggingface_id="EuroEval/goldenswag-pl-mini",
|
|
91
|
+
pretty_name="GoldenSwag-pl",
|
|
92
|
+
source="EuroEval/goldenswag-pl-mini",
|
|
96
93
|
task=COMMON_SENSE,
|
|
97
|
-
languages=[
|
|
98
|
-
unofficial=True,
|
|
99
|
-
)
|
|
100
|
-
|
|
101
|
-
EUROPEAN_VALUES_SITUATIONAL_PL_CONFIG = DatasetConfig(
|
|
102
|
-
name="european-values-situational-pl",
|
|
103
|
-
pretty_name="the Polish version of the European values evaluation dataset, where "
|
|
104
|
-
"the questions are phrased in a situational way",
|
|
105
|
-
huggingface_id="EuroEval/european-values-situational-pl",
|
|
106
|
-
task=EUROPEAN_VALUES,
|
|
107
|
-
languages=[PL],
|
|
108
|
-
splits=["test"],
|
|
109
|
-
bootstrap_samples=False,
|
|
110
|
-
unofficial=True,
|
|
111
|
-
)
|
|
112
|
-
|
|
113
|
-
EUROPEAN_VALUES_COMPLETIONS_PL_CONFIG = DatasetConfig(
|
|
114
|
-
name="european-values-completions-pl",
|
|
115
|
-
pretty_name="the Polish version of the European values evaluation dataset, where "
|
|
116
|
-
"the questions are phrased as sentence completions",
|
|
117
|
-
huggingface_id="EuroEval/european-values-completions-pl",
|
|
118
|
-
task=EUROPEAN_VALUES,
|
|
119
|
-
languages=[PL],
|
|
120
|
-
splits=["test"],
|
|
121
|
-
bootstrap_samples=False,
|
|
94
|
+
languages=[POLISH],
|
|
122
95
|
unofficial=True,
|
|
123
96
|
)
|
|
@@ -1,78 +1,74 @@
|
|
|
1
1
|
"""All Portuguese dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
4
|
+
from ..languages import EUROPEAN_PORTUGUESE, PORTUGUESE
|
|
5
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
SST2_PT_CONFIG = DatasetConfig(
|
|
10
10
|
name="sst2-pt",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/sst2-pt-mini",
|
|
11
|
+
pretty_name="SST2-pt",
|
|
12
|
+
source="EuroEval/sst2-pt-mini",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
16
15
|
_labels=["positive", "negative"],
|
|
17
16
|
)
|
|
18
17
|
|
|
19
18
|
SCALA_PT = DatasetConfig(
|
|
20
19
|
name="scala-pt",
|
|
21
|
-
pretty_name="
|
|
22
|
-
|
|
20
|
+
pretty_name="ScaLA-pt",
|
|
21
|
+
source="EuroEval/scala-pt",
|
|
23
22
|
task=LA,
|
|
24
|
-
languages=[
|
|
23
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
25
24
|
)
|
|
26
25
|
|
|
27
26
|
HAREM_CONFIG = DatasetConfig(
|
|
28
27
|
name="harem",
|
|
29
|
-
pretty_name="
|
|
30
|
-
|
|
28
|
+
pretty_name="HAREM",
|
|
29
|
+
source="EuroEval/harem",
|
|
31
30
|
task=NER,
|
|
32
|
-
languages=[
|
|
31
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
33
32
|
)
|
|
34
33
|
|
|
35
34
|
MULTI_WIKI_QA_PT_CONFIG = DatasetConfig(
|
|
36
35
|
name="multi-wiki-qa-pt",
|
|
37
|
-
pretty_name="
|
|
38
|
-
"
|
|
39
|
-
huggingface_id="EuroEval/multi-wiki-qa-pt-pt-mini",
|
|
36
|
+
pretty_name="MultiWikiQA-pt",
|
|
37
|
+
source="EuroEval/multi-wiki-qa-pt-pt-mini",
|
|
40
38
|
task=RC,
|
|
41
|
-
languages=[
|
|
39
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
42
40
|
)
|
|
43
41
|
|
|
44
42
|
PUBLICO_CONFIG = DatasetConfig(
|
|
45
43
|
name="publico",
|
|
46
|
-
pretty_name="
|
|
47
|
-
|
|
44
|
+
pretty_name="Publico",
|
|
45
|
+
source="EuroEval/publico-mini",
|
|
48
46
|
task=SUMM,
|
|
49
|
-
languages=[
|
|
47
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
50
48
|
)
|
|
51
49
|
|
|
52
50
|
MMLU_PT_CONFIG = DatasetConfig(
|
|
53
51
|
name="mmlu-pt",
|
|
54
|
-
pretty_name="
|
|
55
|
-
"
|
|
56
|
-
huggingface_id="EuroEval/mmlu-pt-mini",
|
|
52
|
+
pretty_name="MMLU-pt",
|
|
53
|
+
source="EuroEval/mmlu-pt-mini",
|
|
57
54
|
task=KNOW,
|
|
58
|
-
languages=[
|
|
55
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
59
56
|
)
|
|
60
57
|
|
|
61
58
|
GOLDENSWAG_PT_CONFIG = DatasetConfig(
|
|
62
59
|
name="goldenswag-pt",
|
|
63
|
-
pretty_name="
|
|
64
|
-
"
|
|
65
|
-
huggingface_id="EuroEval/goldenswag-pt-mini",
|
|
60
|
+
pretty_name="GoldenSwag-pt",
|
|
61
|
+
source="EuroEval/goldenswag-pt-mini",
|
|
66
62
|
task=COMMON_SENSE,
|
|
67
|
-
languages=[
|
|
63
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
68
64
|
)
|
|
69
65
|
|
|
70
|
-
|
|
71
|
-
name="
|
|
72
|
-
pretty_name="
|
|
73
|
-
|
|
66
|
+
VALEU_PT_CONFIG = DatasetConfig(
|
|
67
|
+
name="valeu-pt",
|
|
68
|
+
pretty_name="VaLEU-pt",
|
|
69
|
+
source="EuroEval/european-values-pt",
|
|
74
70
|
task=EUROPEAN_VALUES,
|
|
75
|
-
languages=[
|
|
71
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
76
72
|
splits=["test"],
|
|
77
73
|
bootstrap_samples=False,
|
|
78
74
|
_instruction_prompt="{text}",
|
|
@@ -83,47 +79,19 @@ EUROPEAN_VALUES_PT_CONFIG = DatasetConfig(
|
|
|
83
79
|
|
|
84
80
|
BOOLQ_PT_CONFIG = DatasetConfig(
|
|
85
81
|
name="boolq-pt",
|
|
86
|
-
pretty_name="
|
|
87
|
-
"
|
|
88
|
-
huggingface_id="EuroEval/boolq-pt",
|
|
82
|
+
pretty_name="BoolQ-pt",
|
|
83
|
+
source="EuroEval/boolq-pt",
|
|
89
84
|
task=MCRC,
|
|
90
|
-
languages=[
|
|
85
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
91
86
|
unofficial=True,
|
|
92
87
|
)
|
|
93
88
|
|
|
94
89
|
WINOGRANDE_PT_CONFIG = DatasetConfig(
|
|
95
90
|
name="winogrande-pt",
|
|
96
|
-
pretty_name="
|
|
97
|
-
"
|
|
98
|
-
huggingface_id="EuroEval/winogrande-pt",
|
|
91
|
+
pretty_name="Winogrande-pt",
|
|
92
|
+
source="EuroEval/winogrande-pt",
|
|
99
93
|
task=COMMON_SENSE,
|
|
100
|
-
languages=[
|
|
94
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
101
95
|
_labels=["a", "b"],
|
|
102
96
|
unofficial=True,
|
|
103
97
|
)
|
|
104
|
-
|
|
105
|
-
EUROPEAN_VALUES_SITUATIONAL_PT_CONFIG = DatasetConfig(
|
|
106
|
-
name="european-values-situational-pt",
|
|
107
|
-
pretty_name="the Portuguese version of the European values evaluation dataset, "
|
|
108
|
-
"where the questions are phrased in a situational way",
|
|
109
|
-
huggingface_id="EuroEval/european-values-situational-pt",
|
|
110
|
-
task=EUROPEAN_VALUES,
|
|
111
|
-
languages=[PT],
|
|
112
|
-
splits=["test"],
|
|
113
|
-
bootstrap_samples=False,
|
|
114
|
-
_instruction_prompt="{text}",
|
|
115
|
-
unofficial=True,
|
|
116
|
-
)
|
|
117
|
-
|
|
118
|
-
EUROPEAN_VALUES_COMPLETIONS_PT_CONFIG = DatasetConfig(
|
|
119
|
-
name="european-values-completions-pt",
|
|
120
|
-
pretty_name="the Portuguese version of the European values evaluation dataset, "
|
|
121
|
-
"where the questions are phrased as sentence completions",
|
|
122
|
-
huggingface_id="EuroEval/european-values-completions-pt",
|
|
123
|
-
task=EUROPEAN_VALUES,
|
|
124
|
-
languages=[PT],
|
|
125
|
-
splits=["test"],
|
|
126
|
-
bootstrap_samples=False,
|
|
127
|
-
_instruction_prompt="{text}",
|
|
128
|
-
unofficial=True,
|
|
129
|
-
)
|