EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. euroeval/__init__.py +32 -14
  2. euroeval/benchmark_config_factory.py +92 -180
  3. euroeval/benchmark_modules/base.py +49 -39
  4. euroeval/benchmark_modules/fresh.py +35 -21
  5. euroeval/benchmark_modules/hf.py +280 -244
  6. euroeval/benchmark_modules/litellm.py +752 -312
  7. euroeval/benchmark_modules/vllm.py +570 -268
  8. euroeval/benchmarker.py +651 -528
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +49 -38
  12. euroeval/constants.py +44 -25
  13. euroeval/data_loading.py +111 -55
  14. euroeval/data_models.py +490 -323
  15. euroeval/dataset_configs/__init__.py +26 -4
  16. euroeval/dataset_configs/bosnian.py +39 -0
  17. euroeval/dataset_configs/bulgarian.py +56 -0
  18. euroeval/dataset_configs/croatian.py +56 -0
  19. euroeval/dataset_configs/czech.py +75 -0
  20. euroeval/dataset_configs/danish.py +78 -50
  21. euroeval/dataset_configs/dutch.py +74 -44
  22. euroeval/dataset_configs/english.py +71 -36
  23. euroeval/dataset_configs/estonian.py +111 -0
  24. euroeval/dataset_configs/faroese.py +25 -18
  25. euroeval/dataset_configs/finnish.py +63 -26
  26. euroeval/dataset_configs/french.py +65 -32
  27. euroeval/dataset_configs/german.py +77 -36
  28. euroeval/dataset_configs/greek.py +64 -0
  29. euroeval/dataset_configs/icelandic.py +68 -57
  30. euroeval/dataset_configs/italian.py +68 -36
  31. euroeval/dataset_configs/latvian.py +87 -0
  32. euroeval/dataset_configs/lithuanian.py +64 -0
  33. euroeval/dataset_configs/norwegian.py +98 -72
  34. euroeval/dataset_configs/polish.py +96 -0
  35. euroeval/dataset_configs/portuguese.py +63 -40
  36. euroeval/dataset_configs/serbian.py +64 -0
  37. euroeval/dataset_configs/slovak.py +55 -0
  38. euroeval/dataset_configs/slovene.py +56 -0
  39. euroeval/dataset_configs/spanish.py +68 -34
  40. euroeval/dataset_configs/swedish.py +82 -41
  41. euroeval/dataset_configs/ukrainian.py +64 -0
  42. euroeval/enums.py +12 -6
  43. euroeval/exceptions.py +21 -1
  44. euroeval/finetuning.py +34 -26
  45. euroeval/generation.py +76 -41
  46. euroeval/generation_utils.py +169 -34
  47. euroeval/languages.py +1020 -188
  48. euroeval/logging_utils.py +268 -0
  49. euroeval/metrics/__init__.py +6 -0
  50. euroeval/metrics/base.py +85 -0
  51. euroeval/metrics/huggingface.py +216 -0
  52. euroeval/metrics/llm_as_a_judge.py +260 -0
  53. euroeval/metrics/pipeline.py +289 -0
  54. euroeval/metrics/speed.py +48 -0
  55. euroeval/model_cache.py +40 -21
  56. euroeval/model_config.py +4 -5
  57. euroeval/model_loading.py +3 -0
  58. euroeval/prompt_templates/__init__.py +2 -0
  59. euroeval/prompt_templates/classification.py +206 -0
  60. euroeval/prompt_templates/linguistic_acceptability.py +157 -22
  61. euroeval/prompt_templates/multiple_choice.py +159 -17
  62. euroeval/prompt_templates/named_entity_recognition.py +318 -21
  63. euroeval/prompt_templates/reading_comprehension.py +207 -16
  64. euroeval/prompt_templates/sentiment_classification.py +205 -22
  65. euroeval/prompt_templates/summarization.py +122 -22
  66. euroeval/prompt_templates/token_classification.py +279 -0
  67. euroeval/scores.py +20 -9
  68. euroeval/speed_benchmark.py +11 -12
  69. euroeval/task_group_utils/multiple_choice_classification.py +21 -12
  70. euroeval/task_group_utils/question_answering.py +101 -73
  71. euroeval/task_group_utils/sequence_classification.py +144 -61
  72. euroeval/task_group_utils/text_to_text.py +33 -12
  73. euroeval/task_group_utils/token_classification.py +86 -89
  74. euroeval/tasks.py +75 -16
  75. euroeval/tokenisation_utils.py +603 -0
  76. euroeval/types.py +17 -11
  77. euroeval/utils.py +332 -137
  78. euroeval-16.7.1.dist-info/METADATA +623 -0
  79. euroeval-16.7.1.dist-info/RECORD +84 -0
  80. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
  81. euroeval/human_evaluation.py +0 -737
  82. euroeval/metrics.py +0 -452
  83. euroeval/tokenization_utils.py +0 -498
  84. euroeval-15.12.0.dist-info/METADATA +0 -285
  85. euroeval-15.12.0.dist-info/RECORD +0 -63
  86. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
  87. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,87 @@
1
+ """All Latvian dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import LATVIAN
5
+ from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
6
+
7
+ ### Official datasets ###
8
+
9
+ LATVIAN_TWITTER_SENTIMENT_CONFIG = DatasetConfig(
10
+ name="latvian-twitter-sentiment",
11
+ pretty_name="Latvian Twitter Sentiment",
12
+ source="EuroEval/latvian-twitter-sentiment-mini",
13
+ task=SENT,
14
+ languages=[LATVIAN],
15
+ )
16
+
17
+ SCALA_LV_CONFIG = DatasetConfig(
18
+ name="scala-lv",
19
+ pretty_name="ScaLA-lv",
20
+ source="EuroEval/scala-lv",
21
+ task=LA,
22
+ languages=[LATVIAN],
23
+ )
24
+
25
+ FULLSTACK_NER_LV_CONFIG = DatasetConfig(
26
+ name="fullstack-ner-lv",
27
+ pretty_name="FullStack NER-lv",
28
+ source="EuroEval/fullstack-ner-lv-mini",
29
+ task=NER,
30
+ languages=[LATVIAN],
31
+ )
32
+
33
+ MULTI_WIKI_QA_LV_CONFIG = DatasetConfig(
34
+ name="multi-wiki-qa-lv",
35
+ pretty_name="MultiWikiQA-lv",
36
+ source="EuroEval/multi-wiki-qa-lv-mini",
37
+ task=RC,
38
+ languages=[LATVIAN],
39
+ )
40
+
41
+ LSM_CONFIG = DatasetConfig(
42
+ name="lsm",
43
+ pretty_name="LSM",
44
+ source="EuroEval/lsm-mini",
45
+ task=SUMM,
46
+ languages=[LATVIAN],
47
+ )
48
+
49
+
50
+ MMLU_LV_CONFIG = DatasetConfig(
51
+ name="mmlu-lv",
52
+ pretty_name="MMLU-lv",
53
+ source="EuroEval/mmlu-lv-mini",
54
+ task=KNOW,
55
+ languages=[LATVIAN],
56
+ )
57
+
58
+ COPA_LV_CONFIG = DatasetConfig(
59
+ name="copa-lv",
60
+ pretty_name="COPA-lv",
61
+ source="EuroEval/copa-lv",
62
+ task=COMMON_SENSE,
63
+ languages=[LATVIAN],
64
+ _labels=["a", "b"],
65
+ )
66
+
67
+
68
+ ### Unofficial datasets ###
69
+
70
+ WIKIANN_LV_CONFIG = DatasetConfig(
71
+ name="wikiann-lv",
72
+ pretty_name="WikiANN-lv",
73
+ source="EuroEval/wikiann-lv-mini",
74
+ task=NER,
75
+ languages=[LATVIAN],
76
+ unofficial=True,
77
+ )
78
+
79
+ WINOGRANDE_LV_CONFIG = DatasetConfig(
80
+ name="winogrande-lv",
81
+ pretty_name="Winogrande-lv",
82
+ source="EuroEval/winogrande-lv",
83
+ task=COMMON_SENSE,
84
+ languages=[LATVIAN],
85
+ _labels=["a", "b"],
86
+ unofficial=True,
87
+ )
@@ -0,0 +1,64 @@
1
+ """All Lithuanian dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import LITHUANIAN
5
+ from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
6
+
7
+ ### Official datasets ###
8
+
9
+ LITHUANIAN_EMOTIONS_CONFIG = DatasetConfig(
10
+ name="lithuanian-emotions",
11
+ pretty_name="Lithuanian Emotions",
12
+ source="EuroEval/lithuanian-emotions-mini",
13
+ task=SENT,
14
+ languages=[LITHUANIAN],
15
+ )
16
+
17
+ SCALA_LT_CONFIG = DatasetConfig(
18
+ name="scala-lt",
19
+ pretty_name="ScaLA-lt",
20
+ source="EuroEval/scala-lt",
21
+ task=LA,
22
+ languages=[LITHUANIAN],
23
+ )
24
+
25
+ WIKIANN_LT_CONFIG = DatasetConfig(
26
+ name="wikiann-lt",
27
+ pretty_name="WikiANN-lt",
28
+ source="EuroEval/wikiann-lt-mini",
29
+ task=NER,
30
+ languages=[LITHUANIAN],
31
+ )
32
+
33
+ MULTI_WIKI_QA_LT_CONFIG = DatasetConfig(
34
+ name="multi-wiki-qa-lt",
35
+ pretty_name="MultiWikiQA-lt",
36
+ source="EuroEval/multi-wiki-qa-lt-mini",
37
+ task=RC,
38
+ languages=[LITHUANIAN],
39
+ )
40
+
41
+ LRYTAS_CONFIG = DatasetConfig(
42
+ name="lrytas",
43
+ pretty_name="Lrytas",
44
+ source="EuroEval/lrytas-mini",
45
+ task=SUMM,
46
+ languages=[LITHUANIAN],
47
+ )
48
+
49
+ LT_HISTORY_CONFIG = DatasetConfig(
50
+ name="lt-history",
51
+ pretty_name="LT-History",
52
+ source="EuroEval/lt-history",
53
+ task=KNOW,
54
+ languages=[LITHUANIAN],
55
+ )
56
+
57
+ WINOGRANDE_LT_CONFIG = DatasetConfig(
58
+ name="winogrande-lt",
59
+ pretty_name="Winogrande-lt",
60
+ source="EuroEval/winogrande-lt",
61
+ task=COMMON_SENSE,
62
+ languages=[LITHUANIAN],
63
+ _labels=["a", "b"],
64
+ )
@@ -1,186 +1,212 @@
1
1
  """All Norwegian dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..languages import NB, NN, NO
5
- from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
4
+ from ..languages import NORWEGIAN, NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK
5
+ from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
6
6
 
7
7
  ### Official datasets ###
8
8
 
9
9
  NOREC_CONFIG = DatasetConfig(
10
10
  name="norec",
11
- pretty_name="the truncated version of the Norwegian sentiment classification "
12
- "dataset NoReC",
13
- huggingface_id="EuroEval/norec-mini",
11
+ pretty_name="NoReC",
12
+ source="EuroEval/norec-mini",
14
13
  task=SENT,
15
- languages=[NB, NN, NO],
14
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
16
15
  )
17
16
 
18
17
  SCALA_NB_CONFIG = DatasetConfig(
19
18
  name="scala-nb",
20
- pretty_name="the Bokmål part of the linguistic acceptability dataset ScaLA",
21
- huggingface_id="EuroEval/scala-nb",
19
+ pretty_name="ScaLA-nb",
20
+ source="EuroEval/scala-nb",
22
21
  task=LA,
23
- languages=[NB, NO],
22
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN],
24
23
  )
25
24
 
26
25
  SCALA_NN_CONFIG = DatasetConfig(
27
26
  name="scala-nn",
28
- pretty_name="the Nynorsk part of the linguistic acceptability dataset ScaLA",
29
- huggingface_id="EuroEval/scala-nn",
27
+ pretty_name="ScaLA-nn",
28
+ source="EuroEval/scala-nn",
30
29
  task=LA,
31
- languages=[NN],
30
+ languages=[NORWEGIAN_NYNORSK],
32
31
  )
33
32
 
34
33
  NORNE_NB_CONFIG = DatasetConfig(
35
34
  name="norne-nb",
36
- pretty_name="the truncated version of the Bokmål part of the Norwegian named "
37
- "entity recognition dataset NorNE",
38
- huggingface_id="EuroEval/norne-nb-mini",
35
+ pretty_name="NorNE-nb",
36
+ source="EuroEval/norne-nb-mini",
39
37
  task=NER,
40
- languages=[NB, NO],
38
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN],
41
39
  )
42
40
 
43
41
  NORNE_NN_CONFIG = DatasetConfig(
44
42
  name="norne-nn",
45
- pretty_name="the truncated version of the Nynorsk part of the Norwegian named "
46
- "entity recognition dataset NorNE",
47
- huggingface_id="EuroEval/norne-nn-mini",
43
+ pretty_name="NorNE-nn",
44
+ source="EuroEval/norne-nn-mini",
48
45
  task=NER,
49
- languages=[NN],
46
+ languages=[NORWEGIAN_NYNORSK],
50
47
  )
51
48
 
52
49
  NORQUAD_CONFIG = DatasetConfig(
53
50
  name="norquad",
54
- pretty_name="the truncated version of the Norwegian question answering "
55
- "dataset NorQuAD",
56
- huggingface_id="EuroEval/norquad-mini",
51
+ pretty_name="NorQuAD",
52
+ source="EuroEval/norquad-mini",
57
53
  task=RC,
58
- languages=[NB, NN, NO],
54
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
59
55
  _num_few_shot_examples=2,
60
56
  )
61
57
 
62
58
  NO_SAMMENDRAG_CONFIG = DatasetConfig(
63
59
  name="no-sammendrag",
64
- pretty_name="the truncated version of the Norwegian summarisation dataset "
65
- "Norske Sammendrag",
66
- huggingface_id="EuroEval/no-sammendrag-mini",
60
+ pretty_name="NoSammendrag",
61
+ source="EuroEval/no-sammendrag-mini",
67
62
  task=SUMM,
68
- languages=[NB, NN, NO],
63
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
69
64
  )
70
65
 
71
66
  NRK_QUIZ_QA_CONFIG = DatasetConfig(
72
67
  name="nrk-quiz-qa",
73
- pretty_name="the truncated version of the Norwegian knowledge dataset NRK Quiz QA",
74
- huggingface_id="EuroEval/nrk-quiz-qa-mini",
68
+ pretty_name="NRK Quiz QA",
69
+ source="EuroEval/nrk-quiz-qa-mini",
75
70
  task=KNOW,
76
- languages=[NB, NN, NO],
71
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
77
72
  )
78
73
 
79
74
  IDIOMS_NO_CONFIG = DatasetConfig(
80
75
  name="idioms-no",
81
- pretty_name="the Norwegian knowledge dataset Idioms-no",
82
- huggingface_id="EuroEval/idioms-no",
76
+ pretty_name="Idioms-no",
77
+ source="EuroEval/idioms-no",
83
78
  task=KNOW,
84
- languages=[NB, NN, NO],
79
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
85
80
  )
86
81
 
87
82
  NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
88
83
  name="nor-common-sense-qa",
89
- pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
90
- "NorCommonSenseQA",
91
- huggingface_id="EuroEval/nor-common-sense-qa",
84
+ pretty_name="NorCommonSenseQA",
85
+ source="EuroEval/nor-common-sense-qa",
92
86
  task=COMMON_SENSE,
93
- languages=[NB, NN, NO],
87
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
94
88
  _labels=["a", "b", "c", "d", "e"],
95
89
  )
96
90
 
91
+ VALEU_NO_CONFIG = DatasetConfig(
92
+ name="valeu-no",
93
+ pretty_name="VaLEU-no",
94
+ source="EuroEval/european-values-no",
95
+ task=EUROPEAN_VALUES,
96
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
97
+ splits=["test"],
98
+ bootstrap_samples=False,
99
+ _instruction_prompt="{text}",
100
+ )
101
+
97
102
 
98
103
  ### Unofficial datasets ###
99
104
 
100
105
  NO_COLA_CONFIG = DatasetConfig(
101
106
  name="no-cola",
102
- pretty_name="the truncated version of the Norwegian linguistic acceptability "
103
- "dataset NoCoLA",
104
- huggingface_id="EuroEval/no-cola-mini",
107
+ pretty_name="NoCoLA",
108
+ source="EuroEval/no-cola-mini",
105
109
  task=LA,
106
- languages=[NB, NO],
110
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN],
107
111
  unofficial=True,
108
112
  )
109
113
 
110
114
  NORGLM_MULTI_QA = DatasetConfig(
111
115
  name="norglm-multi-qa",
112
- pretty_name="the question answering part of the Norwegian NorGLM multi-task human "
113
- "annotated dataset NO-Multi-QA-Sum",
114
- huggingface_id="EuroEval/norglm-multi-qa",
116
+ pretty_name="NorGLM-Multi-QA",
117
+ source="EuroEval/norglm-multi-qa",
115
118
  task=RC,
116
- languages=[NB, NN, NO],
119
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
117
120
  unofficial=True,
118
121
  )
119
122
 
120
123
  NORGLM_MULTI_SUM = DatasetConfig(
121
124
  name="norglm-multi-sum",
122
- pretty_name="the summarisation part of the Norwegian NorGLM multi-task human "
123
- "annotated dataset NO-Multi-QA-Sum",
124
- huggingface_id="EuroEval/norglm-multi-sum",
125
+ pretty_name="NorGLM-Multi-Sum",
126
+ source="EuroEval/norglm-multi-sum",
125
127
  task=SUMM,
126
- languages=[NB, NN, NO],
128
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
127
129
  unofficial=True,
128
130
  )
129
131
 
130
132
  SCHIBSTED_NO_CONFIG = DatasetConfig(
131
133
  name="schibsted-no",
132
- pretty_name="the Norwegian summarisation dataset Schibsted-no",
133
- huggingface_id="EuroEval/schibsted-article-summaries-no",
134
+ pretty_name="Schibsted-no",
135
+ source="EuroEval/schibsted-article-summaries-no",
134
136
  task=SUMM,
135
- languages=[NB, NN, NO],
137
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
136
138
  unofficial=True,
137
139
  )
138
140
 
139
141
  PERSONAL_SUM_CONFIG = DatasetConfig(
140
142
  name="personal-sum",
141
- pretty_name="the Norwegian summarisation dataset personal-sum",
142
- huggingface_id="EuroEval/personal-sum",
143
+ pretty_name="Personal Sum",
144
+ source="EuroEval/personal-sum",
143
145
  task=SUMM,
144
- languages=[NB, NN, NO],
146
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
145
147
  unofficial=True,
146
148
  )
147
149
 
148
150
  MMLU_NO_CONFIG = DatasetConfig(
149
151
  name="mmlu-no",
150
- pretty_name="the truncated version of the Norwegian knowledge dataset MMLU-no, "
151
- "translated from the English MMLU dataset",
152
- huggingface_id="EuroEval/mmlu-no-mini",
152
+ pretty_name="MMLU-no",
153
+ source="EuroEval/mmlu-no-mini",
153
154
  task=KNOW,
154
- languages=[NB, NN, NO],
155
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
155
156
  unofficial=True,
156
157
  )
157
158
 
158
159
  ARC_NO_CONFIG = DatasetConfig(
159
160
  name="arc-no",
160
- pretty_name="the truncated version of the Norwegian knowledge dataset ARC-no, "
161
- "translated from the English ARC dataset",
162
- huggingface_id="EuroEval/arc-no-mini",
161
+ pretty_name="ARC-no",
162
+ source="EuroEval/arc-no-mini",
163
163
  task=KNOW,
164
- languages=[NB, NN, NO],
164
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
165
165
  unofficial=True,
166
166
  )
167
167
 
168
168
  HELLASWAG_NO_CONFIG = DatasetConfig(
169
169
  name="hellaswag-no",
170
- pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
171
- "HellaSwag-no, translated from the English HellaSwag dataset",
172
- huggingface_id="EuroEval/hellaswag-no-mini",
170
+ pretty_name="HellaSwag-no",
171
+ source="EuroEval/hellaswag-no-mini",
173
172
  task=COMMON_SENSE,
174
- languages=[NB, NN, NO],
173
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
175
174
  unofficial=True,
176
175
  )
177
176
 
178
177
  BELEBELE_NO_CONFIG = DatasetConfig(
179
178
  name="belebele-no",
180
- pretty_name="the Norwegian multiple choice reading comprehension dataset "
181
- "BeleBele-no, translated from the English BeleBele dataset",
182
- huggingface_id="EuroEval/belebele-no-mini",
179
+ pretty_name="Belebele-no",
180
+ source="EuroEval/belebele-no-mini",
183
181
  task=MCRC,
184
- languages=[NB, NN, NO],
182
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
183
+ unofficial=True,
184
+ )
185
+
186
+ MULTI_WIKI_QA_NB_CONFIG = DatasetConfig(
187
+ name="multi-wiki-qa-nb",
188
+ pretty_name="MultiWikiQA-nb",
189
+ source="EuroEval/multi-wiki-qa-no-mini",
190
+ task=RC,
191
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN],
192
+ unofficial=True,
193
+ )
194
+
195
+ MULTI_WIKI_QA_NN_CONFIG = DatasetConfig(
196
+ name="multi-wiki-qa-nn",
197
+ pretty_name="MultiWikiQA-nn",
198
+ source="EuroEval/multi-wiki-qa-nn-mini",
199
+ task=RC,
200
+ languages=[NORWEGIAN_NYNORSK],
201
+ unofficial=True,
202
+ )
203
+
204
+ WINOGRANDE_NO_CONFIG = DatasetConfig(
205
+ name="winogrande-no",
206
+ pretty_name="Winogrande-no",
207
+ source="EuroEval/winogrande-no",
208
+ task=COMMON_SENSE,
209
+ languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
210
+ _labels=["a", "b"],
185
211
  unofficial=True,
186
212
  )
@@ -0,0 +1,96 @@
1
+ """All Polish dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import POLISH
5
+ from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, NER, RC, SENT, SUMM
6
+
7
+ ### Official datasets ###
8
+
9
+ POLEMO2_CONFIG = DatasetConfig(
10
+ name="polemo2",
11
+ pretty_name="Polemo2",
12
+ source="EuroEval/polemo2-mini",
13
+ task=SENT,
14
+ languages=[POLISH],
15
+ )
16
+
17
+ SCALA_PL_CONFIG = DatasetConfig(
18
+ name="scala-pl",
19
+ pretty_name="ScaLA-pl",
20
+ source="EuroEval/scala-pl",
21
+ task=LA,
22
+ languages=[POLISH],
23
+ )
24
+
25
+ KPWR_NER_CONFIG = DatasetConfig(
26
+ name="kpwr-ner",
27
+ pretty_name="KPWr-NER",
28
+ source="EuroEval/kpwr-ner",
29
+ task=NER,
30
+ languages=[POLISH],
31
+ )
32
+
33
+ POQUAD_CONFIG = DatasetConfig(
34
+ name="poquad",
35
+ pretty_name="PoQuAD",
36
+ source="EuroEval/poquad-mini",
37
+ task=RC,
38
+ languages=[POLISH],
39
+ )
40
+
41
+ PSC_CONFIG = DatasetConfig(
42
+ name="psc",
43
+ pretty_name="PSC",
44
+ source="EuroEval/psc-mini",
45
+ task=SUMM,
46
+ languages=[POLISH],
47
+ )
48
+
49
+ LLMZSZL_CONFIG = DatasetConfig(
50
+ name="llmzszl",
51
+ pretty_name="LLMzSzŁ",
52
+ source="EuroEval/llmzszl-mini",
53
+ task=KNOW,
54
+ languages=[POLISH],
55
+ )
56
+
57
+ WINOGRANDE_PL_CONFIG = DatasetConfig(
58
+ name="winogrande-pl",
59
+ pretty_name="Winogrande-pl",
60
+ source="EuroEval/winogrande-pl",
61
+ task=COMMON_SENSE,
62
+ languages=[POLISH],
63
+ _labels=["a", "b"],
64
+ )
65
+
66
+ VALEU_PL_CONFIG = DatasetConfig(
67
+ name="valeu-pl",
68
+ pretty_name="VaLEU-pl",
69
+ source="EuroEval/european-values-pl",
70
+ task=EUROPEAN_VALUES,
71
+ languages=[POLISH],
72
+ splits=["test"],
73
+ bootstrap_samples=False,
74
+ _instruction_prompt="{text}",
75
+ )
76
+
77
+
78
+ ### Unofficial datasets ###
79
+
80
+ MULTI_WIKI_QA_PL_CONFIG = DatasetConfig(
81
+ name="multi-wiki-qa-pl",
82
+ pretty_name="MultiWikiQA-pl",
83
+ source="EuroEval/multi-wiki-qa-pl-mini",
84
+ task=RC,
85
+ languages=[POLISH],
86
+ unofficial=True,
87
+ )
88
+
89
+ GOLDENSWAG_PL_CONFIG = DatasetConfig(
90
+ name="goldenswag-pl",
91
+ pretty_name="GoldenSwag-pl",
92
+ source="EuroEval/goldenswag-pl-mini",
93
+ task=COMMON_SENSE,
94
+ languages=[POLISH],
95
+ unofficial=True,
96
+ )
@@ -1,64 +1,77 @@
1
1
  """All Portuguese dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..languages import PT
5
- from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, SENT, SUMM
4
+ from ..languages import EUROPEAN_PORTUGUESE, PORTUGUESE
5
+ from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
6
6
 
7
7
  ### Official datasets ###
8
8
 
9
9
  SST2_PT_CONFIG = DatasetConfig(
10
10
  name="sst2-pt",
11
- pretty_name="the truncated version of the Portuguese sentiment classification "
12
- "dataset SST2-pt, translated from the English SST2 dataset",
13
- huggingface_id="EuroEval/sst2-pt-mini",
11
+ pretty_name="SST2-pt",
12
+ source="EuroEval/sst2-pt-mini",
14
13
  task=SENT,
15
- languages=[PT],
14
+ languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
16
15
  _labels=["positive", "negative"],
17
16
  )
18
17
 
19
-
20
- MMLU_PT_CONFIG = DatasetConfig(
21
- name="mmlu-pt",
22
- pretty_name="the truncated version of the Portuguese knowledge dataset MMLU-pt, "
23
- "translated from the English MMLU dataset",
24
- huggingface_id="EuroEval/mmlu-pt-mini",
25
- task=KNOW,
26
- languages=[PT],
27
- )
28
-
29
-
30
- GOLDENSWAG_PT_CONFIG = DatasetConfig(
31
- name="goldenswag-pt",
32
- pretty_name="the truncated version of the Portuguese common-sense reasoning "
33
- "dataset GoldenSwag-pt, translated from the English GoldenSwag dataset",
34
- huggingface_id="EuroEval/goldenswag-pt-mini",
35
- task=COMMON_SENSE,
36
- languages=[PT],
37
- )
38
-
39
-
40
18
  SCALA_PT = DatasetConfig(
41
19
  name="scala-pt",
42
- pretty_name="the Portuguese part of the linguistic acceptability dataset ScaLA",
43
- huggingface_id="EuroEval/scala-pt",
20
+ pretty_name="ScaLA-pt",
21
+ source="EuroEval/scala-pt",
44
22
  task=LA,
45
- languages=[PT],
23
+ languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
46
24
  )
47
25
 
48
26
  HAREM_CONFIG = DatasetConfig(
49
27
  name="harem",
50
- pretty_name="the Portuguese named entity recognition dataset HAREM",
51
- huggingface_id="EuroEval/harem",
28
+ pretty_name="HAREM",
29
+ source="EuroEval/harem",
52
30
  task=NER,
53
- languages=[PT],
31
+ languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
32
+ )
33
+
34
+ MULTI_WIKI_QA_PT_CONFIG = DatasetConfig(
35
+ name="multi-wiki-qa-pt",
36
+ pretty_name="MultiWikiQA-pt",
37
+ source="EuroEval/multi-wiki-qa-pt-pt-mini",
38
+ task=RC,
39
+ languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
54
40
  )
55
41
 
56
42
  PUBLICO_CONFIG = DatasetConfig(
57
43
  name="publico",
58
- pretty_name="the truncated version of the Portuguese summarisation dataset Público",
59
- huggingface_id="EuroEval/publico-mini",
44
+ pretty_name="Publico",
45
+ source="EuroEval/publico-mini",
60
46
  task=SUMM,
61
- languages=[PT],
47
+ languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
48
+ )
49
+
50
+ MMLU_PT_CONFIG = DatasetConfig(
51
+ name="mmlu-pt",
52
+ pretty_name="MMLU-pt",
53
+ source="EuroEval/mmlu-pt-mini",
54
+ task=KNOW,
55
+ languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
56
+ )
57
+
58
+ GOLDENSWAG_PT_CONFIG = DatasetConfig(
59
+ name="goldenswag-pt",
60
+ pretty_name="GoldenSwag-pt",
61
+ source="EuroEval/goldenswag-pt-mini",
62
+ task=COMMON_SENSE,
63
+ languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
64
+ )
65
+
66
+ VALEU_PT_CONFIG = DatasetConfig(
67
+ name="valeu-pt",
68
+ pretty_name="VaLEU-pt",
69
+ source="EuroEval/european-values-pt",
70
+ task=EUROPEAN_VALUES,
71
+ languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
72
+ splits=["test"],
73
+ bootstrap_samples=False,
74
+ _instruction_prompt="{text}",
62
75
  )
63
76
 
64
77
 
@@ -66,9 +79,19 @@ PUBLICO_CONFIG = DatasetConfig(
66
79
 
67
80
  BOOLQ_PT_CONFIG = DatasetConfig(
68
81
  name="boolq-pt",
69
- pretty_name="the Portuguese multiple choice reading comprehension dataset "
70
- "BoolQ-pt, translated from the English BoolQ dataset",
71
- huggingface_id="EuroEval/boolq-pt",
82
+ pretty_name="BoolQ-pt",
83
+ source="EuroEval/boolq-pt",
72
84
  task=MCRC,
73
- languages=[PT],
85
+ languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
86
+ unofficial=True,
87
+ )
88
+
89
+ WINOGRANDE_PT_CONFIG = DatasetConfig(
90
+ name="winogrande-pt",
91
+ pretty_name="Winogrande-pt",
92
+ source="EuroEval/winogrande-pt",
93
+ task=COMMON_SENSE,
94
+ languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
95
+ _labels=["a", "b"],
96
+ unofficial=True,
74
97
  )