EuroEval: euroeval-15.12.0-py3-none-any.whl → euroeval-16.7.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. euroeval/__init__.py +32 -14
  2. euroeval/benchmark_config_factory.py +92 -180
  3. euroeval/benchmark_modules/base.py +49 -39
  4. euroeval/benchmark_modules/fresh.py +35 -21
  5. euroeval/benchmark_modules/hf.py +280 -244
  6. euroeval/benchmark_modules/litellm.py +752 -312
  7. euroeval/benchmark_modules/vllm.py +570 -268
  8. euroeval/benchmarker.py +651 -528
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +49 -38
  12. euroeval/constants.py +44 -25
  13. euroeval/data_loading.py +111 -55
  14. euroeval/data_models.py +490 -323
  15. euroeval/dataset_configs/__init__.py +26 -4
  16. euroeval/dataset_configs/bosnian.py +39 -0
  17. euroeval/dataset_configs/bulgarian.py +56 -0
  18. euroeval/dataset_configs/croatian.py +56 -0
  19. euroeval/dataset_configs/czech.py +75 -0
  20. euroeval/dataset_configs/danish.py +78 -50
  21. euroeval/dataset_configs/dutch.py +74 -44
  22. euroeval/dataset_configs/english.py +71 -36
  23. euroeval/dataset_configs/estonian.py +111 -0
  24. euroeval/dataset_configs/faroese.py +25 -18
  25. euroeval/dataset_configs/finnish.py +63 -26
  26. euroeval/dataset_configs/french.py +65 -32
  27. euroeval/dataset_configs/german.py +77 -36
  28. euroeval/dataset_configs/greek.py +64 -0
  29. euroeval/dataset_configs/icelandic.py +68 -57
  30. euroeval/dataset_configs/italian.py +68 -36
  31. euroeval/dataset_configs/latvian.py +87 -0
  32. euroeval/dataset_configs/lithuanian.py +64 -0
  33. euroeval/dataset_configs/norwegian.py +98 -72
  34. euroeval/dataset_configs/polish.py +96 -0
  35. euroeval/dataset_configs/portuguese.py +63 -40
  36. euroeval/dataset_configs/serbian.py +64 -0
  37. euroeval/dataset_configs/slovak.py +55 -0
  38. euroeval/dataset_configs/slovene.py +56 -0
  39. euroeval/dataset_configs/spanish.py +68 -34
  40. euroeval/dataset_configs/swedish.py +82 -41
  41. euroeval/dataset_configs/ukrainian.py +64 -0
  42. euroeval/enums.py +12 -6
  43. euroeval/exceptions.py +21 -1
  44. euroeval/finetuning.py +34 -26
  45. euroeval/generation.py +76 -41
  46. euroeval/generation_utils.py +169 -34
  47. euroeval/languages.py +1020 -188
  48. euroeval/logging_utils.py +268 -0
  49. euroeval/metrics/__init__.py +6 -0
  50. euroeval/metrics/base.py +85 -0
  51. euroeval/metrics/huggingface.py +216 -0
  52. euroeval/metrics/llm_as_a_judge.py +260 -0
  53. euroeval/metrics/pipeline.py +289 -0
  54. euroeval/metrics/speed.py +48 -0
  55. euroeval/model_cache.py +40 -21
  56. euroeval/model_config.py +4 -5
  57. euroeval/model_loading.py +3 -0
  58. euroeval/prompt_templates/__init__.py +2 -0
  59. euroeval/prompt_templates/classification.py +206 -0
  60. euroeval/prompt_templates/linguistic_acceptability.py +157 -22
  61. euroeval/prompt_templates/multiple_choice.py +159 -17
  62. euroeval/prompt_templates/named_entity_recognition.py +318 -21
  63. euroeval/prompt_templates/reading_comprehension.py +207 -16
  64. euroeval/prompt_templates/sentiment_classification.py +205 -22
  65. euroeval/prompt_templates/summarization.py +122 -22
  66. euroeval/prompt_templates/token_classification.py +279 -0
  67. euroeval/scores.py +20 -9
  68. euroeval/speed_benchmark.py +11 -12
  69. euroeval/task_group_utils/multiple_choice_classification.py +21 -12
  70. euroeval/task_group_utils/question_answering.py +101 -73
  71. euroeval/task_group_utils/sequence_classification.py +144 -61
  72. euroeval/task_group_utils/text_to_text.py +33 -12
  73. euroeval/task_group_utils/token_classification.py +86 -89
  74. euroeval/tasks.py +75 -16
  75. euroeval/tokenisation_utils.py +603 -0
  76. euroeval/types.py +17 -11
  77. euroeval/utils.py +332 -137
  78. euroeval-16.7.1.dist-info/METADATA +623 -0
  79. euroeval-16.7.1.dist-info/RECORD +84 -0
  80. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
  81. euroeval/human_evaluation.py +0 -737
  82. euroeval/metrics.py +0 -452
  83. euroeval/tokenization_utils.py +0 -498
  84. euroeval-15.12.0.dist-info/METADATA +0 -285
  85. euroeval-15.12.0.dist-info/RECORD +0 -63
  86. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
  87. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
@@ -3,19 +3,33 @@
3
3
  from ..data_models import DatasetConfig
4
4
  from ..languages import get_all_languages
5
5
  from ..tasks import SPEED
6
+ from ..utils import load_custom_datasets_module
7
+ from .bosnian import * # noqa: F403
8
+ from .bulgarian import * # noqa: F403
9
+ from .croatian import * # noqa: F403
10
+ from .czech import * # noqa: F403
6
11
  from .danish import * # noqa: F403
7
12
  from .dutch import * # noqa: F403
8
13
  from .english import * # noqa: F403
14
+ from .estonian import * # noqa: F403
9
15
  from .faroese import * # noqa: F403
10
16
  from .finnish import * # noqa: F403
11
17
  from .french import * # noqa: F403
12
18
  from .german import * # noqa: F403
19
+ from .greek import * # noqa: F403
13
20
  from .icelandic import * # noqa: F403
14
21
  from .italian import * # noqa: F403
22
+ from .latvian import * # noqa: F403
23
+ from .lithuanian import * # noqa: F403
15
24
  from .norwegian import * # noqa: F403
25
+ from .polish import * # noqa: F403
16
26
  from .portuguese import * # noqa: F403
27
+ from .serbian import * # noqa: F403
28
+ from .slovak import * # noqa: F403
29
+ from .slovene import * # noqa: F403
17
30
  from .spanish import * # noqa: F403
18
31
  from .swedish import * # noqa: F403
32
+ from .ukrainian import * # noqa: F403
19
33
 
20
34
 
21
35
  def get_all_dataset_configs() -> dict[str, DatasetConfig]:
@@ -24,14 +38,21 @@ def get_all_dataset_configs() -> dict[str, DatasetConfig]:
24
38
  Returns:
25
39
  A mapping between names of datasets and their configurations.
26
40
  """
41
+ globals_dict = globals()
42
+ module = load_custom_datasets_module()
43
+ if module is not None:
44
+ globals_dict |= vars(module)
27
45
  dataset_configs = [
28
- cfg for cfg in globals().values() if isinstance(cfg, DatasetConfig)
46
+ cfg
47
+ for cfg in globals_dict.values()
48
+ if isinstance(cfg, DatasetConfig) and cfg.task != SPEED
29
49
  ]
30
50
  assert len(dataset_configs) == len({cfg.name for cfg in dataset_configs}), (
31
51
  "There are duplicate dataset configurations. Please ensure that each dataset "
32
52
  "has a unique name."
33
53
  )
34
- return {cfg.name: cfg for cfg in dataset_configs}
54
+ mapping = {cfg.name: cfg for cfg in dataset_configs}
55
+ return mapping
35
56
 
36
57
 
37
58
  def get_dataset_config(dataset_name: str) -> DatasetConfig:
@@ -56,8 +77,9 @@ def get_dataset_config(dataset_name: str) -> DatasetConfig:
56
77
 
57
78
  SPEED_CONFIG = DatasetConfig(
58
79
  name="speed",
59
- pretty_name="the speed estimation benchmark",
60
- huggingface_id="",
80
+ pretty_name="",
81
+ source="",
61
82
  task=SPEED,
62
83
  languages=list(get_all_languages().values()),
84
+ _logging_string="the speed estimation benchmark",
63
85
  )
@@ -0,0 +1,39 @@
1
+ """All Bosnian dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import BOSNIAN
5
+ from ..tasks import NER, RC, SENT, SUMM
6
+
7
+ ### Official datasets ###
8
+
9
+ MMS_BS_CONFIG = DatasetConfig(
10
+ name="mms-bs",
11
+ pretty_name="MMS-bs",
12
+ source="EuroEval/mms-bs-mini",
13
+ task=SENT,
14
+ languages=[BOSNIAN],
15
+ )
16
+
17
+ WIKIANN_BS_CONFIG = DatasetConfig(
18
+ name="wikiann-bs",
19
+ pretty_name="WikiANN-bs",
20
+ source="EuroEval/wikiann-bs-mini",
21
+ task=NER,
22
+ languages=[BOSNIAN],
23
+ )
24
+
25
+ MULTI_WIKI_QA_BS_CONFIG = DatasetConfig(
26
+ name="multi-wiki-qa-bs",
27
+ pretty_name="MultiWikiQA-bs",
28
+ source="EuroEval/multi-wiki-qa-bs-mini",
29
+ task=RC,
30
+ languages=[BOSNIAN],
31
+ )
32
+
33
+ LR_SUM_BS_CONFIG = DatasetConfig(
34
+ name="lr-sum-bs",
35
+ pretty_name="LRSum-bs",
36
+ source="EuroEval/lr-sum-bs-mini",
37
+ task=SUMM,
38
+ languages=[BOSNIAN],
39
+ )
@@ -0,0 +1,56 @@
1
+ """All Bulgarian dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import BULGARIAN
5
+ from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
6
+
7
+ ### Official datasets ###
8
+
9
+ CINEXIO_CONFIG = DatasetConfig(
10
+ name="cinexio",
11
+ pretty_name="Cinexio",
12
+ source="EuroEval/cinexio-mini",
13
+ task=SENT,
14
+ languages=[BULGARIAN],
15
+ )
16
+
17
+ SCALA_BG_CONFIG = DatasetConfig(
18
+ name="scala-bg",
19
+ pretty_name="ScaLA-bg",
20
+ source="EuroEval/scala-bg",
21
+ task=LA,
22
+ languages=[BULGARIAN],
23
+ )
24
+
25
+ BG_NER_BSNLP_CONFIG = DatasetConfig(
26
+ name="bg-ner-bsnlp",
27
+ pretty_name="BG-NER-BSNLp",
28
+ source="EuroEval/bg-ner-bsnlp-mini",
29
+ task=NER,
30
+ languages=[BULGARIAN],
31
+ )
32
+
33
+ MULTI_WIKI_QA_BG_CONFIG = DatasetConfig(
34
+ name="multi-wiki-qa-bg",
35
+ pretty_name="MultiWikiQA-bg",
36
+ source="EuroEval/multi-wiki-qa-bg-mini",
37
+ task=RC,
38
+ languages=[BULGARIAN],
39
+ )
40
+
41
+ EXAMS_BG_CONFIG = DatasetConfig(
42
+ name="exams-bg",
43
+ pretty_name="Exams-bg",
44
+ source="EuroEval/exams-bg-mini",
45
+ task=KNOW,
46
+ languages=[BULGARIAN],
47
+ )
48
+
49
+ WINOGRANDE_BG_CONFIG = DatasetConfig(
50
+ name="winogrande-bg",
51
+ pretty_name="Winogrande-bg",
52
+ source="EuroEval/winogrande-bg",
53
+ task=COMMON_SENSE,
54
+ languages=[BULGARIAN],
55
+ _labels=["a", "b"],
56
+ )
@@ -0,0 +1,56 @@
1
+ """All Croatian dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import CROATIAN
5
+ from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
6
+
7
+ ### Official datasets ###
8
+
9
+ MMS_HR_CONFIG = DatasetConfig(
10
+ name="mms-hr",
11
+ pretty_name="MMS-hr",
12
+ source="EuroEval/mms-hr-mini",
13
+ task=SENT,
14
+ languages=[CROATIAN],
15
+ )
16
+
17
+ SCALA_HR_CONFIG = DatasetConfig(
18
+ name="scala-hr",
19
+ pretty_name="ScaLA-hr",
20
+ source="EuroEval/scala-hr",
21
+ task=LA,
22
+ languages=[CROATIAN],
23
+ )
24
+
25
+ WIKIANN_HR_CONFIG = DatasetConfig(
26
+ name="wikiann-hr",
27
+ pretty_name="WikiANN-hr",
28
+ source="EuroEval/wikiann-hr-mini",
29
+ task=NER,
30
+ languages=[CROATIAN],
31
+ )
32
+
33
+ MULTI_WIKI_QA_HR_CONFIG = DatasetConfig(
34
+ name="multi-wiki-qa-hr",
35
+ pretty_name="MultiWikiQA-hr",
36
+ source="EuroEval/multi-wiki-qa-hr-mini",
37
+ task=RC,
38
+ languages=[CROATIAN],
39
+ )
40
+
41
+ MMLU_HR_CONFIG = DatasetConfig(
42
+ name="mmlu-hr",
43
+ pretty_name="MMLU-hr",
44
+ source="EuroEval/mmlu-hr-mini",
45
+ task=KNOW,
46
+ languages=[CROATIAN],
47
+ )
48
+
49
+ WINOGRANDE_HR_CONFIG = DatasetConfig(
50
+ name="winogrande-hr",
51
+ pretty_name="Winogrande-hr",
52
+ source="EuroEval/winogrande-hr",
53
+ task=COMMON_SENSE,
54
+ languages=[CROATIAN],
55
+ _labels=["a", "b"],
56
+ )
@@ -0,0 +1,75 @@
1
+ """All Czech dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import CZECH
5
+ from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
6
+
7
+ ### Official datasets ###
8
+
9
+ CSFD_SENTIMENT_CONFIG = DatasetConfig(
10
+ name="csfd-sentiment",
11
+ pretty_name="CSFD Sentiment",
12
+ source="EuroEval/csfd-sentiment-mini",
13
+ task=SENT,
14
+ languages=[CZECH],
15
+ )
16
+
17
+ CS_GEC_CONFIG = DatasetConfig(
18
+ name="cs-gec",
19
+ pretty_name="CS-GEC",
20
+ source="EuroEval/cs-gec-mini",
21
+ task=LA,
22
+ languages=[CZECH],
23
+ )
24
+
25
+ PONER_CONFIG = DatasetConfig(
26
+ name="poner",
27
+ pretty_name="PoNER",
28
+ source="EuroEval/poner-mini",
29
+ task=NER,
30
+ languages=[CZECH],
31
+ )
32
+
33
+ SQAD_CONFIG = DatasetConfig(
34
+ name="sqad",
35
+ pretty_name="SQAD",
36
+ source="EuroEval/sqad-mini",
37
+ task=RC,
38
+ languages=[CZECH],
39
+ )
40
+
41
+ CZECH_NEWS_CONFIG = DatasetConfig(
42
+ name="czech-news",
43
+ pretty_name="Czech News",
44
+ source="EuroEval/czech-news-mini",
45
+ task=SUMM,
46
+ languages=[CZECH],
47
+ )
48
+
49
+ UMIMETO_QA_CONFIG = DatasetConfig(
50
+ name="umimeto-qa",
51
+ pretty_name="Umimeto QA",
52
+ source="EuroEval/umimeto-qa",
53
+ task=KNOW,
54
+ languages=[CZECH],
55
+ )
56
+
57
+ HELLASWAG_CS_CONFIG = DatasetConfig(
58
+ name="hellaswag-cs",
59
+ pretty_name="HellaSwag-cs",
60
+ source="EuroEval/hellaswag-cs-mini",
61
+ task=COMMON_SENSE,
62
+ languages=[CZECH],
63
+ )
64
+
65
+
66
+ ### Unofficial datasets ###
67
+
68
+ SCALA_CS_CONFIG = DatasetConfig(
69
+ name="scala-cs",
70
+ pretty_name="ScaLA-cs",
71
+ source="EuroEval/scala-cs",
72
+ task=LA,
73
+ languages=[CZECH],
74
+ unofficial=True,
75
+ )
@@ -1,79 +1,83 @@
1
1
  """All Danish dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..languages import DA
5
- from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
4
+ from ..languages import DANISH
5
+ from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
6
6
 
7
7
  ### Official datasets ###
8
8
 
9
9
  ANGRY_TWEETS_CONFIG = DatasetConfig(
10
10
  name="angry-tweets",
11
- pretty_name="the truncated version of the Danish sentiment classification "
12
- "dataset AngryTweets",
13
- huggingface_id="EuroEval/angry-tweets-mini",
11
+ pretty_name="AngryTweets",
12
+ source="EuroEval/angry-tweets-mini",
14
13
  task=SENT,
15
- languages=[DA],
14
+ languages=[DANISH],
16
15
  )
17
16
 
18
17
  SCALA_DA_CONFIG = DatasetConfig(
19
18
  name="scala-da",
20
- pretty_name="the Danish part of the linguistic acceptability dataset ScaLA",
21
- huggingface_id="EuroEval/scala-da",
19
+ pretty_name="ScaLA-da",
20
+ source="EuroEval/scala-da",
22
21
  task=LA,
23
- languages=[DA],
22
+ languages=[DANISH],
24
23
  )
25
24
 
26
25
  DANSK_CONFIG = DatasetConfig(
27
26
  name="dansk",
28
- pretty_name="the truncated version of the Danish named entity recognition "
29
- "dataset DANSK",
30
- huggingface_id="EuroEval/dansk-mini",
27
+ pretty_name="DANSK",
28
+ source="EuroEval/dansk-mini",
31
29
  task=NER,
32
- languages=[DA],
30
+ languages=[DANISH],
33
31
  )
34
32
 
35
- SCANDIQA_DA_CONFIG = DatasetConfig(
36
- name="scandiqa-da",
37
- pretty_name="the Danish part of the truncated version of the question answering "
38
- "dataset ScandiQA",
39
- huggingface_id="EuroEval/scandiqa-da-mini",
33
+ MULTI_WIKI_QA_DA_CONFIG = DatasetConfig(
34
+ name="multi-wiki-qa-da",
35
+ pretty_name="MultiWikiQA-da",
36
+ source="EuroEval/multi-wiki-qa-da-mini",
40
37
  task=RC,
41
- languages=[DA],
38
+ languages=[DANISH],
42
39
  )
43
40
 
44
41
  NORDJYLLAND_NEWS_CONFIG = DatasetConfig(
45
42
  name="nordjylland-news",
46
- pretty_name="the truncated version of the Danish summarisation dataset "
47
- "Nordjylland News",
48
- huggingface_id="EuroEval/nordjylland-news-mini",
43
+ pretty_name="Nordjylland News",
44
+ source="EuroEval/nordjylland-news-mini",
49
45
  task=SUMM,
50
- languages=[DA],
46
+ languages=[DANISH],
51
47
  )
52
48
 
53
49
  DANSKE_TALEMAADER_CONFIG = DatasetConfig(
54
50
  name="danske-talemaader",
55
- pretty_name="the truncated version of the Danish knowledge dataset Danske "
56
- "Talemåder",
57
- huggingface_id="EuroEval/danske-talemaader",
51
+ pretty_name="Danske Talemåder",
52
+ source="EuroEval/danske-talemaader",
58
53
  task=KNOW,
59
- languages=[DA],
54
+ languages=[DANISH],
60
55
  )
61
56
 
62
57
  DANISH_CITIZEN_TESTS_CONFIG = DatasetConfig(
63
58
  name="danish-citizen-tests",
64
- pretty_name="the Danish knowledge dataset Danish Citizen Tests",
65
- huggingface_id="EuroEval/danish-citizen-tests-updated",
59
+ pretty_name="Danish Citizen Tests",
60
+ source="EuroEval/danish-citizen-tests-updated",
66
61
  task=KNOW,
67
- languages=[DA],
62
+ languages=[DANISH],
68
63
  )
69
64
 
70
65
  HELLASWAG_DA_CONFIG = DatasetConfig(
71
66
  name="hellaswag-da",
72
- pretty_name="the truncated version of the Danish common-sense reasoning dataset "
73
- "HellaSwag-da, translated from the English HellaSwag dataset",
74
- huggingface_id="EuroEval/hellaswag-da-mini",
67
+ pretty_name="HellaSwag-da",
68
+ source="EuroEval/hellaswag-da-mini",
75
69
  task=COMMON_SENSE,
76
- languages=[DA],
70
+ languages=[DANISH],
71
+ )
72
+
73
+ VALEU_DA_CONFIG = DatasetConfig(
74
+ name="valeu-da",
75
+ pretty_name="ValEU-da",
76
+ source="EuroEval/european-values-da",
77
+ task=EUROPEAN_VALUES,
78
+ languages=[DANISH],
79
+ splits=["test"],
80
+ bootstrap_samples=False,
77
81
  )
78
82
 
79
83
 
@@ -81,40 +85,64 @@ HELLASWAG_DA_CONFIG = DatasetConfig(
81
85
 
82
86
  DANE_CONFIG = DatasetConfig(
83
87
  name="dane",
84
- pretty_name="the truncated version of the Danish named entity recognition "
85
- "dataset DaNE",
86
- huggingface_id="EuroEval/dane-mini",
88
+ pretty_name="DaNE",
89
+ source="EuroEval/dane-mini",
87
90
  task=NER,
88
- languages=[DA],
91
+ languages=[DANISH],
89
92
  unofficial=True,
90
93
  )
91
94
 
92
95
  MMLU_DA_CONFIG = DatasetConfig(
93
96
  name="mmlu-da",
94
- pretty_name="the truncated version of the Danish knowledge dataset MMLU-da, "
95
- "translated from the English MMLU dataset",
96
- huggingface_id="EuroEval/mmlu-da-mini",
97
+ pretty_name="MMLU-da",
98
+ source="EuroEval/mmlu-da-mini",
97
99
  task=KNOW,
98
- languages=[DA],
100
+ languages=[DANISH],
99
101
  unofficial=True,
100
102
  )
101
103
 
102
104
  ARC_DA_CONFIG = DatasetConfig(
103
105
  name="arc-da",
104
- pretty_name="the truncated version of the Danish knowledge dataset ARC-da, "
105
- "translated from the English ARC dataset",
106
- huggingface_id="EuroEval/arc-da-mini",
106
+ pretty_name="ARC-da",
107
+ source="EuroEval/arc-da-mini",
107
108
  task=KNOW,
108
- languages=[DA],
109
+ languages=[DANISH],
109
110
  unofficial=True,
110
111
  )
111
112
 
112
113
  BELEBELE_DA_CONFIG = DatasetConfig(
113
114
  name="belebele-da",
114
- pretty_name="the Danish multiple choice reading comprehension dataset BeleBele-da, "
115
- "translated from the English BeleBele dataset",
116
- huggingface_id="EuroEval/belebele-da-mini",
115
+ pretty_name="Belebele-da",
116
+ source="EuroEval/belebele-da-mini",
117
117
  task=MCRC,
118
- languages=[DA],
118
+ languages=[DANISH],
119
+ unofficial=True,
120
+ )
121
+
122
+ SCANDIQA_DA_CONFIG = DatasetConfig(
123
+ name="scandiqa-da",
124
+ pretty_name="ScandiQA-da",
125
+ source="EuroEval/scandiqa-da-mini",
126
+ task=RC,
127
+ languages=[DANISH],
128
+ unofficial=True,
129
+ )
130
+
131
+ GOLDENSWAG_DA_CONFIG = DatasetConfig(
132
+ name="goldenswag-da",
133
+ pretty_name="GoldenSwag-da",
134
+ source="EuroEval/goldenswag-da-mini",
135
+ task=COMMON_SENSE,
136
+ languages=[DANISH],
137
+ unofficial=True,
138
+ )
139
+
140
+ WINOGRANDE_DA_CONFIG = DatasetConfig(
141
+ name="winogrande-da",
142
+ pretty_name="Winogrande-da",
143
+ source="EuroEval/winogrande-da",
144
+ task=COMMON_SENSE,
145
+ languages=[DANISH],
146
+ _labels=["a", "b"],
119
147
  unofficial=True,
120
148
  )
@@ -1,72 +1,77 @@
1
1
  """All Dutch dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..languages import NL
5
- from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
4
+ from ..languages import DUTCH
5
+ from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
6
6
 
7
7
  ### Official datasets ###
8
8
 
9
9
  DBRD_CONFIG = DatasetConfig(
10
10
  name="dbrd",
11
- pretty_name="the truncated version of the Dutch sentiment classification "
12
- "dataset DBRD",
13
- huggingface_id="EuroEval/dbrd-mini",
11
+ pretty_name="DBRD",
12
+ source="EuroEval/dbrd-mini",
14
13
  task=SENT,
15
- languages=[NL],
14
+ languages=[DUTCH],
16
15
  _labels=["negative", "positive"],
17
16
  )
18
17
 
19
18
  SCALA_NL_CONFIG = DatasetConfig(
20
19
  name="scala-nl",
21
- pretty_name="the Dutch part of the linguistic acceptability dataset ScaLA",
22
- huggingface_id="EuroEval/scala-nl",
20
+ pretty_name="ScaLA-nl",
21
+ source="EuroEval/scala-nl",
23
22
  task=LA,
24
- languages=[NL],
23
+ languages=[DUTCH],
25
24
  )
26
25
 
27
26
  CONLL_NL_CONFIG = DatasetConfig(
28
27
  name="conll-nl",
29
- pretty_name="the Dutch part of the truncated version of the named entity "
30
- "recognition dataset CoNLL 2002",
31
- huggingface_id="EuroEval/conll-nl-mini",
28
+ pretty_name="CoNLL-nl",
29
+ source="EuroEval/conll-nl-mini",
32
30
  task=NER,
33
- languages=[NL],
31
+ languages=[DUTCH],
34
32
  )
35
33
 
36
34
  SQUAD_NL_CONFIG = DatasetConfig(
37
35
  name="squad-nl",
38
- pretty_name="the truncated version of the Dutch reading comprehension dataset "
39
- "SQuAD-nl, translated from the English SQuAD dataset",
40
- huggingface_id="EuroEval/squad-nl-v2-mini",
36
+ pretty_name="SQuAD-nl",
37
+ source="EuroEval/squad-nl-v2-mini",
41
38
  task=RC,
42
- languages=[NL],
39
+ languages=[DUTCH],
43
40
  )
44
41
 
45
42
  WIKI_LINGUA_NL_CONFIG = DatasetConfig(
46
43
  name="wiki-lingua-nl",
47
- pretty_name="the Dutch part of the truncated version of the summarisation dataset "
48
- "WikiLingua",
49
- huggingface_id="EuroEval/wiki-lingua-nl-mini",
44
+ pretty_name="WikiLingua-nl",
45
+ source="EuroEval/wiki-lingua-nl-mini",
50
46
  task=SUMM,
51
- languages=[NL],
47
+ languages=[DUTCH],
52
48
  )
53
49
 
54
50
  MMLU_NL_CONFIG = DatasetConfig(
55
51
  name="mmlu-nl",
56
- pretty_name="the truncated version of the Dutch knowledge dataset MMLU-nl, "
57
- "translated from the English MMLU dataset",
58
- huggingface_id="EuroEval/mmlu-nl-mini",
52
+ pretty_name="MMLU-nl",
53
+ source="EuroEval/mmlu-nl-mini",
59
54
  task=KNOW,
60
- languages=[NL],
55
+ languages=[DUTCH],
61
56
  )
62
57
 
63
58
  HELLASWAG_NL_CONFIG = DatasetConfig(
64
59
  name="hellaswag-nl",
65
- pretty_name="the truncated version of the Dutch common-sense reasoning dataset "
66
- "HellaSwag-nl, translated from the English HellaSwag dataset",
67
- huggingface_id="EuroEval/hellaswag-nl-mini",
60
+ pretty_name="HellaSwag-nl",
61
+ source="EuroEval/hellaswag-nl-mini",
68
62
  task=COMMON_SENSE,
69
- languages=[NL],
63
+ languages=[DUTCH],
64
+ )
65
+
66
+ VALEU_NL_CONFIG = DatasetConfig(
67
+ name="valeu-nl",
68
+ pretty_name="VaLEU-nl",
69
+ source="EuroEval/european-values-nl",
70
+ task=EUROPEAN_VALUES,
71
+ languages=[DUTCH],
72
+ splits=["test"],
73
+ bootstrap_samples=False,
74
+ _instruction_prompt="{text}",
70
75
  )
71
76
 
72
77
 
@@ -74,39 +79,64 @@ HELLASWAG_NL_CONFIG = DatasetConfig(
74
79
 
75
80
  DUTCH_COLA_CONFIG = DatasetConfig(
76
81
  name="dutch-cola",
77
- pretty_name="the truncated version of the Dutch linguistic acceptability dataset "
78
- "Dutch CoLA",
79
- huggingface_id="EuroEval/dutch-cola",
82
+ pretty_name="Dutch CoLA",
83
+ source="EuroEval/dutch-cola",
80
84
  task=LA,
81
- languages=[NL],
85
+ languages=[DUTCH],
82
86
  unofficial=True,
83
87
  )
84
88
 
85
89
  DUTCH_COLA_FULL_CONFIG = DatasetConfig(
86
90
  name="dutch-cola-full",
87
- pretty_name="the Dutch linguistic acceptability dataset Dutch CoLA",
88
- huggingface_id="EuroEval/dutch-cola-full",
91
+ pretty_name="Dutch CoLA Full",
92
+ source="EuroEval/dutch-cola-full",
89
93
  task=LA,
90
- languages=[NL],
94
+ languages=[DUTCH],
91
95
  unofficial=True,
92
96
  )
93
97
 
94
98
  ARC_NL_CONFIG = DatasetConfig(
95
99
  name="arc-nl",
96
- pretty_name="the truncated version of the Dutch knowledge dataset ARC-nl, "
97
- "translated from the English ARC dataset",
98
- huggingface_id="EuroEval/arc-nl-mini",
100
+ pretty_name="ARC-nl",
101
+ source="EuroEval/arc-nl-mini",
99
102
  task=KNOW,
100
- languages=[NL],
103
+ languages=[DUTCH],
101
104
  unofficial=True,
102
105
  )
103
106
 
104
107
  BELEBELE_NL_CONFIG = DatasetConfig(
105
108
  name="belebele-nl",
106
- pretty_name="the Dutch multiple choice reading comprehension dataset BeleBele-nl, "
107
- "translated from the English BeleBele dataset",
108
- huggingface_id="EuroEval/belebele-nl-mini",
109
+ pretty_name="Belebele-nl",
110
+ source="EuroEval/belebele-nl-mini",
109
111
  task=MCRC,
110
- languages=[NL],
112
+ languages=[DUTCH],
113
+ unofficial=True,
114
+ )
115
+
116
+ MULTI_WIKI_QA_NL_CONFIG = DatasetConfig(
117
+ name="multi-wiki-qa-nl",
118
+ pretty_name="MultiWikiQA-nl",
119
+ source="EuroEval/multi-wiki-qa-nl-mini",
120
+ task=RC,
121
+ languages=[DUTCH],
122
+ unofficial=True,
123
+ )
124
+
125
+ GOLDENSWAG_NL_CONFIG = DatasetConfig(
126
+ name="goldenswag-nl",
127
+ pretty_name="GoldenSwag-nl",
128
+ source="EuroEval/goldenswag-nl-mini",
129
+ task=COMMON_SENSE,
130
+ languages=[DUTCH],
131
+ unofficial=True,
132
+ )
133
+
134
+ WINOGRANDE_NL_CONFIG = DatasetConfig(
135
+ name="winogrande-nl",
136
+ pretty_name="Winogrande-nl",
137
+ source="EuroEval/winogrande-nl",
138
+ task=COMMON_SENSE,
139
+ languages=[DUTCH],
140
+ _labels=["a", "b"],
111
141
  unofficial=True,
112
142
  )