EuroEval: euroeval-16.4.0-py3-none-any.whl → euroeval-16.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic. More details are linked from the original diff page.

Files changed (71):
  1. euroeval/__init__.py +6 -0
  2. euroeval/benchmark_config_factory.py +51 -46
  3. euroeval/benchmark_modules/base.py +6 -5
  4. euroeval/benchmark_modules/hf.py +2 -9
  5. euroeval/benchmark_modules/litellm.py +14 -12
  6. euroeval/benchmark_modules/vllm.py +17 -10
  7. euroeval/benchmarker.py +61 -44
  8. euroeval/caching_utils.py +1 -1
  9. euroeval/cli.py +86 -8
  10. euroeval/constants.py +3 -0
  11. euroeval/data_loading.py +78 -30
  12. euroeval/data_models.py +326 -326
  13. euroeval/dataset_configs/__init__.py +10 -3
  14. euroeval/dataset_configs/bulgarian.py +56 -0
  15. euroeval/dataset_configs/czech.py +25 -29
  16. euroeval/dataset_configs/danish.py +51 -88
  17. euroeval/dataset_configs/dutch.py +48 -86
  18. euroeval/dataset_configs/english.py +45 -76
  19. euroeval/dataset_configs/estonian.py +36 -38
  20. euroeval/dataset_configs/faroese.py +19 -60
  21. euroeval/dataset_configs/finnish.py +36 -68
  22. euroeval/dataset_configs/french.py +39 -74
  23. euroeval/dataset_configs/german.py +45 -81
  24. euroeval/dataset_configs/greek.py +64 -0
  25. euroeval/dataset_configs/icelandic.py +54 -91
  26. euroeval/dataset_configs/italian.py +42 -78
  27. euroeval/dataset_configs/latvian.py +28 -34
  28. euroeval/dataset_configs/lithuanian.py +22 -26
  29. euroeval/dataset_configs/norwegian.py +72 -114
  30. euroeval/dataset_configs/polish.py +33 -60
  31. euroeval/dataset_configs/portuguese.py +33 -65
  32. euroeval/dataset_configs/serbian.py +64 -0
  33. euroeval/dataset_configs/slovak.py +19 -24
  34. euroeval/dataset_configs/spanish.py +42 -76
  35. euroeval/dataset_configs/swedish.py +48 -84
  36. euroeval/dataset_configs/ukrainian.py +64 -0
  37. euroeval/exceptions.py +1 -1
  38. euroeval/finetuning.py +3 -2
  39. euroeval/generation.py +5 -4
  40. euroeval/generation_utils.py +6 -5
  41. euroeval/languages.py +395 -323
  42. euroeval/metrics/huggingface.py +14 -3
  43. euroeval/metrics/llm_as_a_judge.py +1 -1
  44. euroeval/model_cache.py +6 -5
  45. euroeval/model_loading.py +1 -1
  46. euroeval/prompt_templates/__init__.py +2 -0
  47. euroeval/prompt_templates/classification.py +206 -0
  48. euroeval/prompt_templates/linguistic_acceptability.py +82 -43
  49. euroeval/prompt_templates/multiple_choice.py +81 -41
  50. euroeval/prompt_templates/named_entity_recognition.py +125 -44
  51. euroeval/prompt_templates/reading_comprehension.py +92 -43
  52. euroeval/prompt_templates/sentiment_classification.py +91 -43
  53. euroeval/prompt_templates/summarization.py +64 -39
  54. euroeval/prompt_templates/token_classification.py +279 -0
  55. euroeval/scores.py +4 -3
  56. euroeval/speed_benchmark.py +2 -1
  57. euroeval/task_group_utils/multiple_choice_classification.py +2 -1
  58. euroeval/task_group_utils/question_answering.py +24 -13
  59. euroeval/task_group_utils/sequence_classification.py +5 -4
  60. euroeval/task_group_utils/text_to_text.py +2 -1
  61. euroeval/task_group_utils/token_classification.py +11 -8
  62. euroeval/tasks.py +44 -1
  63. euroeval/tokenisation_utils.py +19 -10
  64. euroeval/types.py +10 -9
  65. euroeval/utils.py +6 -3
  66. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
  67. euroeval-16.5.0.dist-info/RECORD +81 -0
  68. euroeval-16.4.0.dist-info/RECORD +0 -75
  69. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
  70. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
  71. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -3,6 +3,7 @@
3
3
  from ..data_models import DatasetConfig
4
4
  from ..languages import get_all_languages
5
5
  from ..tasks import SPEED
6
+ from .bulgarian import * # noqa: F403
6
7
  from .czech import * # noqa: F403
7
8
  from .danish import * # noqa: F403
8
9
  from .dutch import * # noqa: F403
@@ -12,6 +13,7 @@ from .faroese import * # noqa: F403
12
13
  from .finnish import * # noqa: F403
13
14
  from .french import * # noqa: F403
14
15
  from .german import * # noqa: F403
16
+ from .greek import * # noqa: F403
15
17
  from .icelandic import * # noqa: F403
16
18
  from .italian import * # noqa: F403
17
19
  from .latvian import * # noqa: F403
@@ -19,9 +21,11 @@ from .lithuanian import * # noqa: F403
19
21
  from .norwegian import * # noqa: F403
20
22
  from .polish import * # noqa: F403
21
23
  from .portuguese import * # noqa: F403
24
+ from .serbian import * # noqa: F403
22
25
  from .slovak import * # noqa: F403
23
26
  from .spanish import * # noqa: F403
24
27
  from .swedish import * # noqa: F403
28
+ from .ukrainian import * # noqa: F403
25
29
 
26
30
 
27
31
  def get_all_dataset_configs() -> dict[str, DatasetConfig]:
@@ -31,7 +35,9 @@ def get_all_dataset_configs() -> dict[str, DatasetConfig]:
31
35
  A mapping between names of datasets and their configurations.
32
36
  """
33
37
  dataset_configs = [
34
- cfg for cfg in globals().values() if isinstance(cfg, DatasetConfig)
38
+ cfg
39
+ for cfg in globals().values()
40
+ if isinstance(cfg, DatasetConfig) and cfg.task != SPEED
35
41
  ]
36
42
  assert len(dataset_configs) == len({cfg.name for cfg in dataset_configs}), (
37
43
  "There are duplicate dataset configurations. Please ensure that each dataset "
@@ -62,8 +68,9 @@ def get_dataset_config(dataset_name: str) -> DatasetConfig:
62
68
 
63
69
  SPEED_CONFIG = DatasetConfig(
64
70
  name="speed",
65
- pretty_name="the speed estimation benchmark",
66
- huggingface_id="",
71
+ pretty_name="",
72
+ source="",
67
73
  task=SPEED,
68
74
  languages=list(get_all_languages().values()),
75
+ _logging_string="the speed estimation benchmark",
69
76
  )
@@ -0,0 +1,56 @@
1
+ """All Bulgarian dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import BULGARIAN
5
+ from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
6
+
7
+ ### Official datasets ###
8
+
9
+ CINEXIO_CONFIG = DatasetConfig(
10
+ name="cinexio",
11
+ pretty_name="Cinexio",
12
+ source="EuroEval/cinexio-mini",
13
+ task=SENT,
14
+ languages=[BULGARIAN],
15
+ )
16
+
17
+ SCALA_BG_CONFIG = DatasetConfig(
18
+ name="scala-bg",
19
+ pretty_name="ScaLA-bg",
20
+ source="EuroEval/scala-bg",
21
+ task=LA,
22
+ languages=[BULGARIAN],
23
+ )
24
+
25
+ BG_NER_BSNLP_CONFIG = DatasetConfig(
26
+ name="bg-ner-bsnlp",
27
+ pretty_name="BG-NER-BSNLp",
28
+ source="EuroEval/bg-ner-bsnlp-mini",
29
+ task=NER,
30
+ languages=[BULGARIAN],
31
+ )
32
+
33
+ MULTI_WIKI_QA_BG_CONFIG = DatasetConfig(
34
+ name="multi-wiki-qa-bg",
35
+ pretty_name="MultiWikiQA-bg",
36
+ source="EuroEval/multi-wiki-qa-bg-mini",
37
+ task=RC,
38
+ languages=[BULGARIAN],
39
+ )
40
+
41
+ EXAMS_BG_CONFIG = DatasetConfig(
42
+ name="exams-bg",
43
+ pretty_name="Exams-bg",
44
+ source="EuroEval/exams-bg-mini",
45
+ task=KNOW,
46
+ languages=[BULGARIAN],
47
+ )
48
+
49
+ WINOGRANDE_BG_CONFIG = DatasetConfig(
50
+ name="winogrande-bg",
51
+ pretty_name="Winogrande-bg",
52
+ source="EuroEval/winogrande-bg",
53
+ task=COMMON_SENSE,
54
+ languages=[BULGARIAN],
55
+ _labels=["a", "b"],
56
+ )
@@ -1,69 +1,65 @@
1
1
  """All Czech dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..languages import CS
4
+ from ..languages import CZECH
5
5
  from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
6
6
 
7
7
  ### Official datasets ###
8
8
 
9
9
  CSFD_SENTIMENT_CONFIG = DatasetConfig(
10
10
  name="csfd-sentiment",
11
- pretty_name="the truncated version of the Czech sentiment classification dataset "
12
- "CSFD Sentiment",
13
- huggingface_id="EuroEval/csfd-sentiment-mini",
11
+ pretty_name="CSFD Sentiment",
12
+ source="EuroEval/csfd-sentiment-mini",
14
13
  task=SENT,
15
- languages=[CS],
14
+ languages=[CZECH],
16
15
  )
17
16
 
18
17
  CS_GEC_CONFIG = DatasetConfig(
19
18
  name="cs-gec",
20
- pretty_name="the truncated version of the Czech linguistic acceptability dataset "
21
- "CS-GEC",
22
- huggingface_id="EuroEval/cs-gec-mini",
19
+ pretty_name="CS-GEC",
20
+ source="EuroEval/cs-gec-mini",
23
21
  task=LA,
24
- languages=[CS],
22
+ languages=[CZECH],
25
23
  )
26
24
 
27
25
  PONER_CONFIG = DatasetConfig(
28
26
  name="poner",
29
- pretty_name="the truncated version of the Czech named entity recognition dataset "
30
- "PONER",
31
- huggingface_id="EuroEval/poner-mini",
27
+ pretty_name="PoNER",
28
+ source="EuroEval/poner-mini",
32
29
  task=NER,
33
- languages=[CS],
30
+ languages=[CZECH],
34
31
  )
35
32
 
36
33
  SQAD_CONFIG = DatasetConfig(
37
34
  name="sqad",
38
- pretty_name="the truncated version of the Czech reading comprehension dataset SQAD",
39
- huggingface_id="EuroEval/sqad-mini",
35
+ pretty_name="SQAD",
36
+ source="EuroEval/sqad-mini",
40
37
  task=RC,
41
- languages=[CS],
38
+ languages=[CZECH],
42
39
  )
43
40
 
44
41
  CZECH_NEWS_CONFIG = DatasetConfig(
45
42
  name="czech-news",
46
- pretty_name="the truncated version of the Czech summarisation dataset",
47
- huggingface_id="EuroEval/czech-news-mini",
43
+ pretty_name="Czech News",
44
+ source="EuroEval/czech-news-mini",
48
45
  task=SUMM,
49
- languages=[CS],
46
+ languages=[CZECH],
50
47
  )
51
48
 
52
49
  UMIMETO_QA_CONFIG = DatasetConfig(
53
50
  name="umimeto-qa",
54
- pretty_name="the Czech knowledge dataset UmimetoQA",
55
- huggingface_id="EuroEval/umimeto-qa",
51
+ pretty_name="Umimeto QA",
52
+ source="EuroEval/umimeto-qa",
56
53
  task=KNOW,
57
- languages=[CS],
54
+ languages=[CZECH],
58
55
  )
59
56
 
60
57
  HELLASWAG_CS_CONFIG = DatasetConfig(
61
58
  name="hellaswag-cs",
62
- pretty_name="the truncated version of the Czech common-sense reasoning dataset "
63
- "HellaSwag-cs, translated from the English HellaSwag dataset",
64
- huggingface_id="EuroEval/hellaswag-cs-mini",
59
+ pretty_name="HellaSwag-cs",
60
+ source="EuroEval/hellaswag-cs-mini",
65
61
  task=COMMON_SENSE,
66
- languages=[CS],
62
+ languages=[CZECH],
67
63
  )
68
64
 
69
65
 
@@ -71,9 +67,9 @@ HELLASWAG_CS_CONFIG = DatasetConfig(
71
67
 
72
68
  SCALA_CS_CONFIG = DatasetConfig(
73
69
  name="scala-cs",
74
- pretty_name="the Czech part of the linguistic acceptability dataset ScaLA",
75
- huggingface_id="EuroEval/scala-cs",
70
+ pretty_name="ScaLA-cs",
71
+ source="EuroEval/scala-cs",
76
72
  task=LA,
77
- languages=[CS],
73
+ languages=[CZECH],
78
74
  unofficial=True,
79
75
  )
@@ -1,87 +1,81 @@
1
1
  """All Danish dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..languages import DA
4
+ from ..languages import DANISH
5
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
6
6
 
7
7
  ### Official datasets ###
8
8
 
9
9
  ANGRY_TWEETS_CONFIG = DatasetConfig(
10
10
  name="angry-tweets",
11
- pretty_name="the truncated version of the Danish sentiment classification "
12
- "dataset AngryTweets",
13
- huggingface_id="EuroEval/angry-tweets-mini",
11
+ pretty_name="AngryTweets",
12
+ source="EuroEval/angry-tweets-mini",
14
13
  task=SENT,
15
- languages=[DA],
14
+ languages=[DANISH],
16
15
  )
17
16
 
18
17
  SCALA_DA_CONFIG = DatasetConfig(
19
18
  name="scala-da",
20
- pretty_name="the Danish part of the linguistic acceptability dataset ScaLA",
21
- huggingface_id="EuroEval/scala-da",
19
+ pretty_name="ScaLA-da",
20
+ source="EuroEval/scala-da",
22
21
  task=LA,
23
- languages=[DA],
22
+ languages=[DANISH],
24
23
  )
25
24
 
26
25
  DANSK_CONFIG = DatasetConfig(
27
26
  name="dansk",
28
- pretty_name="the truncated version of the Danish named entity recognition "
29
- "dataset DANSK",
30
- huggingface_id="EuroEval/dansk-mini",
27
+ pretty_name="DANSK",
28
+ source="EuroEval/dansk-mini",
31
29
  task=NER,
32
- languages=[DA],
30
+ languages=[DANISH],
33
31
  )
34
32
 
35
33
  MULTI_WIKI_QA_DA_CONFIG = DatasetConfig(
36
34
  name="multi-wiki-qa-da",
37
- pretty_name="the truncated version of the Danish part of the reading "
38
- "comprehension dataset MultiWikiQA",
39
- huggingface_id="EuroEval/multi-wiki-qa-da-mini",
35
+ pretty_name="MultiWikiQA-da",
36
+ source="EuroEval/multi-wiki-qa-da-mini",
40
37
  task=RC,
41
- languages=[DA],
38
+ languages=[DANISH],
42
39
  )
43
40
 
44
41
  NORDJYLLAND_NEWS_CONFIG = DatasetConfig(
45
42
  name="nordjylland-news",
46
- pretty_name="the truncated version of the Danish summarisation dataset "
47
- "Nordjylland News",
48
- huggingface_id="EuroEval/nordjylland-news-mini",
43
+ pretty_name="Nordjylland News",
44
+ source="EuroEval/nordjylland-news-mini",
49
45
  task=SUMM,
50
- languages=[DA],
46
+ languages=[DANISH],
51
47
  )
52
48
 
53
49
  DANSKE_TALEMAADER_CONFIG = DatasetConfig(
54
50
  name="danske-talemaader",
55
- pretty_name="the truncated version of the Danish knowledge dataset Danske "
56
- "Talemåder",
57
- huggingface_id="EuroEval/danske-talemaader",
51
+ pretty_name="Danske Talemåder",
52
+ source="EuroEval/danske-talemaader",
58
53
  task=KNOW,
59
- languages=[DA],
54
+ languages=[DANISH],
60
55
  )
61
56
 
62
57
  DANISH_CITIZEN_TESTS_CONFIG = DatasetConfig(
63
58
  name="danish-citizen-tests",
64
- pretty_name="the Danish knowledge dataset Danish Citizen Tests",
65
- huggingface_id="EuroEval/danish-citizen-tests-updated",
59
+ pretty_name="Danish Citizen Tests",
60
+ source="EuroEval/danish-citizen-tests-updated",
66
61
  task=KNOW,
67
- languages=[DA],
62
+ languages=[DANISH],
68
63
  )
69
64
 
70
65
  HELLASWAG_DA_CONFIG = DatasetConfig(
71
66
  name="hellaswag-da",
72
- pretty_name="the truncated version of the Danish common-sense reasoning dataset "
73
- "HellaSwag-da, translated from the English HellaSwag dataset",
74
- huggingface_id="EuroEval/hellaswag-da-mini",
67
+ pretty_name="HellaSwag-da",
68
+ source="EuroEval/hellaswag-da-mini",
75
69
  task=COMMON_SENSE,
76
- languages=[DA],
70
+ languages=[DANISH],
77
71
  )
78
72
 
79
- EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
80
- name="european-values-da",
81
- pretty_name="the Danish version of the European values evaluation dataset",
82
- huggingface_id="EuroEval/european-values-da",
73
+ VALEU_DA_CONFIG = DatasetConfig(
74
+ name="valeu-da",
75
+ pretty_name="ValEU-da",
76
+ source="EuroEval/european-values-da",
83
77
  task=EUROPEAN_VALUES,
84
- languages=[DA],
78
+ languages=[DANISH],
85
79
  splits=["test"],
86
80
  bootstrap_samples=False,
87
81
  )
@@ -91,95 +85,64 @@ EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
91
85
 
92
86
  DANE_CONFIG = DatasetConfig(
93
87
  name="dane",
94
- pretty_name="the truncated version of the Danish named entity recognition "
95
- "dataset DaNE",
96
- huggingface_id="EuroEval/dane-mini",
88
+ pretty_name="DaNE",
89
+ source="EuroEval/dane-mini",
97
90
  task=NER,
98
- languages=[DA],
91
+ languages=[DANISH],
99
92
  unofficial=True,
100
93
  )
101
94
 
102
95
  MMLU_DA_CONFIG = DatasetConfig(
103
96
  name="mmlu-da",
104
- pretty_name="the truncated version of the Danish knowledge dataset MMLU-da, "
105
- "translated from the English MMLU dataset",
106
- huggingface_id="EuroEval/mmlu-da-mini",
97
+ pretty_name="MMLU-da",
98
+ source="EuroEval/mmlu-da-mini",
107
99
  task=KNOW,
108
- languages=[DA],
100
+ languages=[DANISH],
109
101
  unofficial=True,
110
102
  )
111
103
 
112
104
  ARC_DA_CONFIG = DatasetConfig(
113
105
  name="arc-da",
114
- pretty_name="the truncated version of the Danish knowledge dataset ARC-da, "
115
- "translated from the English ARC dataset",
116
- huggingface_id="EuroEval/arc-da-mini",
106
+ pretty_name="ARC-da",
107
+ source="EuroEval/arc-da-mini",
117
108
  task=KNOW,
118
- languages=[DA],
109
+ languages=[DANISH],
119
110
  unofficial=True,
120
111
  )
121
112
 
122
113
  BELEBELE_DA_CONFIG = DatasetConfig(
123
114
  name="belebele-da",
124
- pretty_name="the Danish multiple choice reading comprehension dataset BeleBele-da, "
125
- "translated from the English BeleBele dataset",
126
- huggingface_id="EuroEval/belebele-da-mini",
115
+ pretty_name="Belebele-da",
116
+ source="EuroEval/belebele-da-mini",
127
117
  task=MCRC,
128
- languages=[DA],
118
+ languages=[DANISH],
129
119
  unofficial=True,
130
120
  )
131
121
 
132
122
  SCANDIQA_DA_CONFIG = DatasetConfig(
133
123
  name="scandiqa-da",
134
- pretty_name="the Danish part of the truncated version of the question answering "
135
- "dataset ScandiQA",
136
- huggingface_id="EuroEval/scandiqa-da-mini",
124
+ pretty_name="ScandiQA-da",
125
+ source="EuroEval/scandiqa-da-mini",
137
126
  task=RC,
138
- languages=[DA],
127
+ languages=[DANISH],
139
128
  unofficial=True,
140
129
  )
141
130
 
142
131
  GOLDENSWAG_DA_CONFIG = DatasetConfig(
143
132
  name="goldenswag-da",
144
- pretty_name="the truncated version of the Danish common-sense reasoning "
145
- "dataset GoldenSwag-da, translated from the English GoldenSwag dataset",
146
- huggingface_id="EuroEval/goldenswag-da-mini",
133
+ pretty_name="GoldenSwag-da",
134
+ source="EuroEval/goldenswag-da-mini",
147
135
  task=COMMON_SENSE,
148
- languages=[DA],
136
+ languages=[DANISH],
149
137
  unofficial=True,
150
138
  )
151
139
 
152
140
  WINOGRANDE_DA_CONFIG = DatasetConfig(
153
141
  name="winogrande-da",
154
- pretty_name="the Danish common-sense reasoning dataset Winogrande-da, translated "
155
- "from the English Winogrande dataset",
156
- huggingface_id="EuroEval/winogrande-da",
142
+ pretty_name="Winogrande-da",
143
+ source="EuroEval/winogrande-da",
157
144
  task=COMMON_SENSE,
158
- languages=[DA],
145
+ languages=[DANISH],
159
146
  _labels=["a", "b"],
160
147
  unofficial=True,
161
148
  )
162
-
163
- EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
164
- name="european-values-situational-da",
165
- pretty_name="the Danish version of the European values evaluation dataset, where "
166
- "the questions are phrased in a situational way",
167
- huggingface_id="EuroEval/european-values-situational-da",
168
- task=EUROPEAN_VALUES,
169
- languages=[DA],
170
- splits=["test"],
171
- bootstrap_samples=False,
172
- unofficial=True,
173
- )
174
-
175
- EUROPEAN_VALUES_COMPLETIONS_DA_CONFIG = DatasetConfig(
176
- name="european-values-completions-da",
177
- pretty_name="the Danish version of the European values evaluation dataset, where "
178
- "the questions are phrased as sentence completions",
179
- huggingface_id="EuroEval/european-values-completions-da",
180
- task=EUROPEAN_VALUES,
181
- languages=[DA],
182
- splits=["test"],
183
- bootstrap_samples=False,
184
- unofficial=True,
185
- )
@@ -1,80 +1,74 @@
1
1
  """All Dutch dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..languages import NL
4
+ from ..languages import DUTCH
5
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
6
6
 
7
7
  ### Official datasets ###
8
8
 
9
9
  DBRD_CONFIG = DatasetConfig(
10
10
  name="dbrd",
11
- pretty_name="the truncated version of the Dutch sentiment classification "
12
- "dataset DBRD",
13
- huggingface_id="EuroEval/dbrd-mini",
11
+ pretty_name="DBRD",
12
+ source="EuroEval/dbrd-mini",
14
13
  task=SENT,
15
- languages=[NL],
14
+ languages=[DUTCH],
16
15
  _labels=["negative", "positive"],
17
16
  )
18
17
 
19
18
  SCALA_NL_CONFIG = DatasetConfig(
20
19
  name="scala-nl",
21
- pretty_name="the Dutch part of the linguistic acceptability dataset ScaLA",
22
- huggingface_id="EuroEval/scala-nl",
20
+ pretty_name="ScaLA-nl",
21
+ source="EuroEval/scala-nl",
23
22
  task=LA,
24
- languages=[NL],
23
+ languages=[DUTCH],
25
24
  )
26
25
 
27
26
  CONLL_NL_CONFIG = DatasetConfig(
28
27
  name="conll-nl",
29
- pretty_name="the Dutch part of the truncated version of the named entity "
30
- "recognition dataset CoNLL 2002",
31
- huggingface_id="EuroEval/conll-nl-mini",
28
+ pretty_name="CoNLL-nl",
29
+ source="EuroEval/conll-nl-mini",
32
30
  task=NER,
33
- languages=[NL],
31
+ languages=[DUTCH],
34
32
  )
35
33
 
36
34
  SQUAD_NL_CONFIG = DatasetConfig(
37
35
  name="squad-nl",
38
- pretty_name="the truncated version of the Dutch reading comprehension dataset "
39
- "SQuAD-nl, translated from the English SQuAD dataset",
40
- huggingface_id="EuroEval/squad-nl-v2-mini",
36
+ pretty_name="SQuAD-nl",
37
+ source="EuroEval/squad-nl-v2-mini",
41
38
  task=RC,
42
- languages=[NL],
39
+ languages=[DUTCH],
43
40
  )
44
41
 
45
42
  WIKI_LINGUA_NL_CONFIG = DatasetConfig(
46
43
  name="wiki-lingua-nl",
47
- pretty_name="the Dutch part of the truncated version of the summarisation dataset "
48
- "WikiLingua",
49
- huggingface_id="EuroEval/wiki-lingua-nl-mini",
44
+ pretty_name="WikiLingua-nl",
45
+ source="EuroEval/wiki-lingua-nl-mini",
50
46
  task=SUMM,
51
- languages=[NL],
47
+ languages=[DUTCH],
52
48
  )
53
49
 
54
50
  MMLU_NL_CONFIG = DatasetConfig(
55
51
  name="mmlu-nl",
56
- pretty_name="the truncated version of the Dutch knowledge dataset MMLU-nl, "
57
- "translated from the English MMLU dataset",
58
- huggingface_id="EuroEval/mmlu-nl-mini",
52
+ pretty_name="MMLU-nl",
53
+ source="EuroEval/mmlu-nl-mini",
59
54
  task=KNOW,
60
- languages=[NL],
55
+ languages=[DUTCH],
61
56
  )
62
57
 
63
58
  HELLASWAG_NL_CONFIG = DatasetConfig(
64
59
  name="hellaswag-nl",
65
- pretty_name="the truncated version of the Dutch common-sense reasoning dataset "
66
- "HellaSwag-nl, translated from the English HellaSwag dataset",
67
- huggingface_id="EuroEval/hellaswag-nl-mini",
60
+ pretty_name="HellaSwag-nl",
61
+ source="EuroEval/hellaswag-nl-mini",
68
62
  task=COMMON_SENSE,
69
- languages=[NL],
63
+ languages=[DUTCH],
70
64
  )
71
65
 
72
- EUROPEAN_VALUES_NL_CONFIG = DatasetConfig(
73
- name="european-values-nl",
74
- pretty_name="the Dutch version of the European values evaluation dataset",
75
- huggingface_id="EuroEval/european-values-nl",
66
+ VALEU_NL_CONFIG = DatasetConfig(
67
+ name="valeu-nl",
68
+ pretty_name="VaLEU-nl",
69
+ source="EuroEval/european-values-nl",
76
70
  task=EUROPEAN_VALUES,
77
- languages=[NL],
71
+ languages=[DUTCH],
78
72
  splits=["test"],
79
73
  bootstrap_samples=False,
80
74
  _instruction_prompt="{text}",
@@ -85,96 +79,64 @@ EUROPEAN_VALUES_NL_CONFIG = DatasetConfig(
85
79
 
86
80
  DUTCH_COLA_CONFIG = DatasetConfig(
87
81
  name="dutch-cola",
88
- pretty_name="the truncated version of the Dutch linguistic acceptability dataset "
89
- "Dutch CoLA",
90
- huggingface_id="EuroEval/dutch-cola",
82
+ pretty_name="Dutch CoLA",
83
+ source="EuroEval/dutch-cola",
91
84
  task=LA,
92
- languages=[NL],
85
+ languages=[DUTCH],
93
86
  unofficial=True,
94
87
  )
95
88
 
96
89
  DUTCH_COLA_FULL_CONFIG = DatasetConfig(
97
90
  name="dutch-cola-full",
98
- pretty_name="the Dutch linguistic acceptability dataset Dutch CoLA",
99
- huggingface_id="EuroEval/dutch-cola-full",
91
+ pretty_name="Dutch CoLA Full",
92
+ source="EuroEval/dutch-cola-full",
100
93
  task=LA,
101
- languages=[NL],
94
+ languages=[DUTCH],
102
95
  unofficial=True,
103
96
  )
104
97
 
105
98
  ARC_NL_CONFIG = DatasetConfig(
106
99
  name="arc-nl",
107
- pretty_name="the truncated version of the Dutch knowledge dataset ARC-nl, "
108
- "translated from the English ARC dataset",
109
- huggingface_id="EuroEval/arc-nl-mini",
100
+ pretty_name="ARC-nl",
101
+ source="EuroEval/arc-nl-mini",
110
102
  task=KNOW,
111
- languages=[NL],
103
+ languages=[DUTCH],
112
104
  unofficial=True,
113
105
  )
114
106
 
115
107
  BELEBELE_NL_CONFIG = DatasetConfig(
116
108
  name="belebele-nl",
117
- pretty_name="the Dutch multiple choice reading comprehension dataset BeleBele-nl, "
118
- "translated from the English BeleBele dataset",
119
- huggingface_id="EuroEval/belebele-nl-mini",
109
+ pretty_name="Belebele-nl",
110
+ source="EuroEval/belebele-nl-mini",
120
111
  task=MCRC,
121
- languages=[NL],
112
+ languages=[DUTCH],
122
113
  unofficial=True,
123
114
  )
124
115
 
125
116
  MULTI_WIKI_QA_NL_CONFIG = DatasetConfig(
126
117
  name="multi-wiki-qa-nl",
127
- pretty_name="the truncated version of the Dutch part of the reading "
128
- "comprehension dataset MultiWikiQA",
129
- huggingface_id="EuroEval/multi-wiki-qa-nl-mini",
118
+ pretty_name="MultiWikiQA-nl",
119
+ source="EuroEval/multi-wiki-qa-nl-mini",
130
120
  task=RC,
131
- languages=[NL],
121
+ languages=[DUTCH],
132
122
  unofficial=True,
133
123
  )
134
124
 
135
125
  GOLDENSWAG_NL_CONFIG = DatasetConfig(
136
126
  name="goldenswag-nl",
137
- pretty_name="the truncated version of the Dutch common-sense reasoning "
138
- "dataset GoldenSwag-nl, translated from the English GoldenSwag dataset",
139
- huggingface_id="EuroEval/goldenswag-nl-mini",
127
+ pretty_name="GoldenSwag-nl",
128
+ source="EuroEval/goldenswag-nl-mini",
140
129
  task=COMMON_SENSE,
141
- languages=[NL],
130
+ languages=[DUTCH],
142
131
  unofficial=True,
143
132
  )
144
133
 
145
134
  WINOGRANDE_NL_CONFIG = DatasetConfig(
146
135
  name="winogrande-nl",
147
- pretty_name="the Dutch common-sense reasoning dataset Winogrande-nl, translated "
148
- "from the English Winogrande dataset",
149
- huggingface_id="EuroEval/winogrande-nl",
136
+ pretty_name="Winogrande-nl",
137
+ source="EuroEval/winogrande-nl",
150
138
  task=COMMON_SENSE,
151
- languages=[NL],
139
+ languages=[DUTCH],
152
140
  _labels=["a", "b"],
153
141
  unofficial=True,
154
142
  )
155
-
156
- EUROPEAN_VALUES_SITUATIONAL_NL_CONFIG = DatasetConfig(
157
- name="european-values-situational-nl",
158
- pretty_name="the Dutch version of the European values evaluation dataset, where "
159
- "the questions are phrased in a situational way",
160
- huggingface_id="EuroEval/european-values-situational-nl",
161
- task=EUROPEAN_VALUES,
162
- languages=[NL],
163
- splits=["test"],
164
- bootstrap_samples=False,
165
- _instruction_prompt="{text}",
166
- unofficial=True,
167
- )
168
-
169
- EUROPEAN_VALUES_COMPLETIONS_NL_CONFIG = DatasetConfig(
170
- name="european-values-completions-nl",
171
- pretty_name="the Dutch version of the European values evaluation dataset, where "
172
- "the questions are phrased as sentence completions",
173
- huggingface_id="EuroEval/european-values-completions-nl",
174
- task=EUROPEAN_VALUES,
175
- languages=[NL],
176
- splits=["test"],
177
- bootstrap_samples=False,
178
- _instruction_prompt="{text}",
179
- unofficial=True,
180
- )