EuroEval 15.16.0__py3-none-any.whl → 16.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic. Click here for more details.
- euroeval/__init__.py +3 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +190 -110
- euroeval/benchmark_modules/vllm.py +161 -114
- euroeval/benchmarker.py +49 -22
- euroeval/cli.py +3 -3
- euroeval/constants.py +13 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +53 -7
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +38 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +6 -6
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +46 -14
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +234 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +17 -6
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +96 -23
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +47 -75
- euroeval/tasks.py +31 -6
- euroeval/tokenization_utils.py +295 -207
- euroeval/utils.py +118 -34
- {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +11 -14
- euroeval-16.0.0.dist-info/RECORD +69 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -470
- euroeval-15.16.0.dist-info/RECORD +0 -63
- {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""All Estonian dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import ET
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
ESTONIAN_VALENCE_CONFIG = DatasetConfig(
|
|
10
|
+
name="estonian-valence",
|
|
11
|
+
pretty_name="the Estonian sentiment classification dataset Estonian Valence",
|
|
12
|
+
huggingface_id="EuroEval/estonian-valence",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[ET],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
GRAMMAR_ET_CONFIG = DatasetConfig(
|
|
18
|
+
name="grammar-et",
|
|
19
|
+
pretty_name="the Estonian linguistic acceptability dataset Grammar-et",
|
|
20
|
+
huggingface_id="EuroEval/grammar-et",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[ET],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
ESTNER_CONFIG = DatasetConfig(
|
|
26
|
+
name="estner",
|
|
27
|
+
pretty_name="the Estonian named entity recognition dataset EstNER",
|
|
28
|
+
huggingface_id="EuroEval/estner-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[ET],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
MULTI_WIKI_QA_ET_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-et",
|
|
35
|
+
pretty_name="the truncated version of the Estonian part of the reading "
|
|
36
|
+
"comprehension dataset MultiWikiQA",
|
|
37
|
+
huggingface_id="EuroEval/multi-wiki-qa-et-mini",
|
|
38
|
+
task=RC,
|
|
39
|
+
languages=[ET],
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
ERR_NEWS_CONFIG = DatasetConfig(
|
|
43
|
+
name="err-news",
|
|
44
|
+
pretty_name="the Estonian summarisation dataset ErrNews",
|
|
45
|
+
huggingface_id="EuroEval/err-news-mini",
|
|
46
|
+
task=SUMM,
|
|
47
|
+
languages=[ET],
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
EXAM_ET_CONFIG = DatasetConfig(
|
|
51
|
+
name="exam-et",
|
|
52
|
+
pretty_name="the Estonian knowledge assessment dataset Exam-et",
|
|
53
|
+
huggingface_id="EuroEval/exam-et",
|
|
54
|
+
task=KNOW,
|
|
55
|
+
languages=[ET],
|
|
56
|
+
_labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"],
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
WINOGRANDE_ET_CONFIG = DatasetConfig(
|
|
60
|
+
name="winogrande-et",
|
|
61
|
+
pretty_name="the Estonian common-sense reasoning dataset Winogrande-et",
|
|
62
|
+
huggingface_id="EuroEval/winogrande-et",
|
|
63
|
+
task=COMMON_SENSE,
|
|
64
|
+
languages=[ET],
|
|
65
|
+
_prompt_prefix="Sulle esitatakse lüngaga (_) tekstülesanded, "
|
|
66
|
+
"igal ülesandel on kaks vastusevarianti (a ja b).",
|
|
67
|
+
_prompt_template="Tekstülesanne: {text}\nVastus: {label}",
|
|
68
|
+
_instruction_prompt="Tekstülesanne: {text}\n\n"
|
|
69
|
+
"Sinu ülesanne on valida lünka sobiv vastusevariant. "
|
|
70
|
+
"Vasta ainult {labels_str}. Muud vastused ei ole lubatud.",
|
|
71
|
+
_labels=["a", "b"],
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
EUROPEAN_VALUES_ET_CONFIG = DatasetConfig(
|
|
75
|
+
name="european-values-et",
|
|
76
|
+
pretty_name="the Estonian version of the European values evaluation dataset",
|
|
77
|
+
huggingface_id="EuroEval/european-values-et",
|
|
78
|
+
task=EUROPEAN_VALUES,
|
|
79
|
+
languages=[ET],
|
|
80
|
+
splits=["test"],
|
|
81
|
+
bootstrap_samples=False,
|
|
82
|
+
_instruction_prompt="{text}",
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
### Unofficial datasets ###
|
|
87
|
+
|
|
88
|
+
SCALA_ET_CONFIG = DatasetConfig(
|
|
89
|
+
name="scala-et",
|
|
90
|
+
pretty_name="the Estonian part of the linguistic acceptability dataset ScaLA",
|
|
91
|
+
huggingface_id="EuroEval/scala-et",
|
|
92
|
+
task=LA,
|
|
93
|
+
languages=[ET],
|
|
94
|
+
unofficial=True,
|
|
95
|
+
)
|
|
@@ -40,6 +40,44 @@ FOQA_CONFIG = DatasetConfig(
|
|
|
40
40
|
languages=[FO],
|
|
41
41
|
)
|
|
42
42
|
|
|
43
|
+
# TODO: No Faroese version of the European values dataset exists yet
|
|
44
|
+
# EUROPEAN_VALUES_FO_CONFIG = DatasetConfig(
|
|
45
|
+
# name="european-values-fo",
|
|
46
|
+
# pretty_name="the Faroese version of the European values evaluation dataset",
|
|
47
|
+
# huggingface_id="EuroEval/european-values-fo",
|
|
48
|
+
# task=EUROPEAN_VALUES,
|
|
49
|
+
# languages=[FO],
|
|
50
|
+
# splits=["test"],
|
|
51
|
+
# bootstrap_samples=False,
|
|
52
|
+
# _instruction_prompt="{text}",
|
|
53
|
+
# )
|
|
54
|
+
#
|
|
55
|
+
# EUROPEAN_VALUES_SITUATIONAL_FO_CONFIG = DatasetConfig(
|
|
56
|
+
# name="european-values-situational-fo",
|
|
57
|
+
# pretty_name="the Faroese version of the European values evaluation dataset, "
|
|
58
|
+
# "where the questions are phrased in a situational way",
|
|
59
|
+
# huggingface_id="EuroEval/european-values-situational-fo",
|
|
60
|
+
# task=EUROPEAN_VALUES,
|
|
61
|
+
# languages=[FO],
|
|
62
|
+
# splits=["test"],
|
|
63
|
+
# bootstrap_samples=False,
|
|
64
|
+
# _instruction_prompt="{text}",
|
|
65
|
+
# unofficial=True,
|
|
66
|
+
# )
|
|
67
|
+
#
|
|
68
|
+
# EUROPEAN_VALUES_COMPLETIONS_FO_CONFIG = DatasetConfig(
|
|
69
|
+
# name="european-values-completions-fo",
|
|
70
|
+
# pretty_name="the Faroese version of the European values evaluation dataset, "
|
|
71
|
+
# "where the questions are phrased as sentence completions",
|
|
72
|
+
# huggingface_id="EuroEval/european-values-completions-fo",
|
|
73
|
+
# task=EUROPEAN_VALUES,
|
|
74
|
+
# languages=[FO],
|
|
75
|
+
# splits=["test"],
|
|
76
|
+
# bootstrap_samples=False,
|
|
77
|
+
# _instruction_prompt="{text}",
|
|
78
|
+
# unofficial=True,
|
|
79
|
+
# )
|
|
80
|
+
|
|
43
81
|
|
|
44
82
|
### Unofficial datasets ###
|
|
45
83
|
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
4
|
from ..languages import FI
|
|
5
|
-
from ..tasks import COMMON_SENSE, LA, MCRC, NER, RC, SENT, SUMM
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
@@ -57,6 +57,18 @@ SCALA_FI_CONFIG = DatasetConfig(
|
|
|
57
57
|
languages=[FI],
|
|
58
58
|
)
|
|
59
59
|
|
|
60
|
+
EUROPEAN_VALUES_FI_CONFIG = DatasetConfig(
|
|
61
|
+
name="european-values-fi",
|
|
62
|
+
pretty_name="the Finnish version of the European values evaluation dataset",
|
|
63
|
+
huggingface_id="EuroEval/european-values-fi",
|
|
64
|
+
task=EUROPEAN_VALUES,
|
|
65
|
+
languages=[FI],
|
|
66
|
+
splits=["test"],
|
|
67
|
+
bootstrap_samples=False,
|
|
68
|
+
_instruction_prompt="{text}",
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
|
|
60
72
|
### Unofficial datasets ###
|
|
61
73
|
|
|
62
74
|
BELEBELE_FI_CONFIG = DatasetConfig(
|
|
@@ -88,3 +100,29 @@ GOLDENSWAG_FI_CONFIG = DatasetConfig(
|
|
|
88
100
|
languages=[FI],
|
|
89
101
|
unofficial=True,
|
|
90
102
|
)
|
|
103
|
+
|
|
104
|
+
EUROPEAN_VALUES_SITUATIONAL_FI_CONFIG = DatasetConfig(
|
|
105
|
+
name="european-values-situational-fi",
|
|
106
|
+
pretty_name="the Finnish version of the European values evaluation dataset, where "
|
|
107
|
+
"the questions are phrased in a situational way",
|
|
108
|
+
huggingface_id="EuroEval/european-values-situational-fi",
|
|
109
|
+
task=EUROPEAN_VALUES,
|
|
110
|
+
languages=[FI],
|
|
111
|
+
splits=["test"],
|
|
112
|
+
bootstrap_samples=False,
|
|
113
|
+
_instruction_prompt="{text}",
|
|
114
|
+
unofficial=True,
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
EUROPEAN_VALUES_COMPLETIONS_FI_CONFIG = DatasetConfig(
|
|
118
|
+
name="european-values-completions-fi",
|
|
119
|
+
pretty_name="the Finnish version of the European values evaluation dataset, where "
|
|
120
|
+
"the questions are phrased as sentence completions",
|
|
121
|
+
huggingface_id="EuroEval/european-values-completions-fi",
|
|
122
|
+
task=EUROPEAN_VALUES,
|
|
123
|
+
languages=[FI],
|
|
124
|
+
splits=["test"],
|
|
125
|
+
bootstrap_samples=False,
|
|
126
|
+
_instruction_prompt="{text}",
|
|
127
|
+
unofficial=True,
|
|
128
|
+
)
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
4
|
from ..languages import FR
|
|
5
|
-
from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
@@ -69,6 +69,17 @@ HELLASWAG_FR_CONFIG = DatasetConfig(
|
|
|
69
69
|
languages=[FR],
|
|
70
70
|
)
|
|
71
71
|
|
|
72
|
+
EUROPEAN_VALUES_FR_CONFIG = DatasetConfig(
|
|
73
|
+
name="european-values-fr",
|
|
74
|
+
pretty_name="the French version of the European values evaluation dataset",
|
|
75
|
+
huggingface_id="EuroEval/european-values-fr",
|
|
76
|
+
task=EUROPEAN_VALUES,
|
|
77
|
+
languages=[FR],
|
|
78
|
+
splits=["test"],
|
|
79
|
+
bootstrap_samples=False,
|
|
80
|
+
_instruction_prompt="{text}",
|
|
81
|
+
)
|
|
82
|
+
|
|
72
83
|
|
|
73
84
|
### Unofficial datasets ###
|
|
74
85
|
|
|
@@ -101,3 +112,29 @@ GOLDENSWAG_FR_CONFIG = DatasetConfig(
|
|
|
101
112
|
languages=[FR],
|
|
102
113
|
unofficial=True,
|
|
103
114
|
)
|
|
115
|
+
|
|
116
|
+
EUROPEAN_VALUES_SITUATIONAL_FR_CONFIG = DatasetConfig(
|
|
117
|
+
name="european-values-situational-fr",
|
|
118
|
+
pretty_name="the French version of the European values evaluation dataset, where "
|
|
119
|
+
"the questions are phrased in a situational way",
|
|
120
|
+
huggingface_id="EuroEval/european-values-situational-fr",
|
|
121
|
+
task=EUROPEAN_VALUES,
|
|
122
|
+
languages=[FR],
|
|
123
|
+
splits=["test"],
|
|
124
|
+
bootstrap_samples=False,
|
|
125
|
+
_instruction_prompt="{text}",
|
|
126
|
+
unofficial=True,
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
EUROPEAN_VALUES_COMPLETIONS_FR_CONFIG = DatasetConfig(
|
|
130
|
+
name="european-values-completions-fr",
|
|
131
|
+
pretty_name="the French version of the European values evaluation dataset, where "
|
|
132
|
+
"the questions are phrased as sentence completions",
|
|
133
|
+
huggingface_id="EuroEval/european-values-completions-fr",
|
|
134
|
+
task=EUROPEAN_VALUES,
|
|
135
|
+
languages=[FR],
|
|
136
|
+
splits=["test"],
|
|
137
|
+
bootstrap_samples=False,
|
|
138
|
+
_instruction_prompt="{text}",
|
|
139
|
+
unofficial=True,
|
|
140
|
+
)
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
4
|
from ..languages import DE
|
|
5
|
-
from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
@@ -67,6 +67,17 @@ HELLASWAG_DE_CONFIG = DatasetConfig(
|
|
|
67
67
|
languages=[DE],
|
|
68
68
|
)
|
|
69
69
|
|
|
70
|
+
EUROPEAN_VALUES_DE_CONFIG = DatasetConfig(
|
|
71
|
+
name="european-values-de",
|
|
72
|
+
pretty_name="the German version of the European values evaluation dataset",
|
|
73
|
+
huggingface_id="EuroEval/european-values-de",
|
|
74
|
+
task=EUROPEAN_VALUES,
|
|
75
|
+
languages=[DE],
|
|
76
|
+
splits=["test"],
|
|
77
|
+
bootstrap_samples=False,
|
|
78
|
+
_instruction_prompt="{text}",
|
|
79
|
+
)
|
|
80
|
+
|
|
70
81
|
|
|
71
82
|
### Unofficial datasets ###
|
|
72
83
|
|
|
@@ -109,3 +120,29 @@ GOLDENSWAG_DE_CONFIG = DatasetConfig(
|
|
|
109
120
|
languages=[DE],
|
|
110
121
|
unofficial=True,
|
|
111
122
|
)
|
|
123
|
+
|
|
124
|
+
EUROPEAN_VALUES_SITUATIONAL_DE_CONFIG = DatasetConfig(
|
|
125
|
+
name="european-values-situational-de",
|
|
126
|
+
pretty_name="the German version of the European values evaluation dataset, where "
|
|
127
|
+
"the questions are phrased in a situational way",
|
|
128
|
+
huggingface_id="EuroEval/european-values-situational-de",
|
|
129
|
+
task=EUROPEAN_VALUES,
|
|
130
|
+
languages=[DE],
|
|
131
|
+
splits=["test"],
|
|
132
|
+
bootstrap_samples=False,
|
|
133
|
+
_instruction_prompt="{text}",
|
|
134
|
+
unofficial=True,
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
EUROPEAN_VALUES_COMPLETIONS_DE_CONFIG = DatasetConfig(
|
|
138
|
+
name="european-values-completions-de",
|
|
139
|
+
pretty_name="the German version of the European values evaluation dataset, where "
|
|
140
|
+
"the questions are phrased as sentence completions",
|
|
141
|
+
huggingface_id="EuroEval/european-values-completions-de",
|
|
142
|
+
task=EUROPEAN_VALUES,
|
|
143
|
+
languages=[DE],
|
|
144
|
+
splits=["test"],
|
|
145
|
+
bootstrap_samples=False,
|
|
146
|
+
_instruction_prompt="{text}",
|
|
147
|
+
unofficial=True,
|
|
148
|
+
)
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
4
|
from ..languages import IS
|
|
5
|
-
from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
@@ -66,6 +66,18 @@ WINOGRANDE_IS_CONFIG = DatasetConfig(
|
|
|
66
66
|
huggingface_id="EuroEval/winogrande-is",
|
|
67
67
|
task=COMMON_SENSE,
|
|
68
68
|
languages=[IS],
|
|
69
|
+
_labels=["a", "b"],
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
EUROPEAN_VALUES_IS_CONFIG = DatasetConfig(
|
|
73
|
+
name="european-values-is",
|
|
74
|
+
pretty_name="the Icelandic version of the European values evaluation dataset",
|
|
75
|
+
huggingface_id="EuroEval/european-values-is",
|
|
76
|
+
task=EUROPEAN_VALUES,
|
|
77
|
+
languages=[IS],
|
|
78
|
+
splits=["test"],
|
|
79
|
+
bootstrap_samples=False,
|
|
80
|
+
_instruction_prompt="{text}",
|
|
69
81
|
)
|
|
70
82
|
|
|
71
83
|
|
|
@@ -156,3 +168,29 @@ MULTI_WIKI_QA_IS_CONFIG = DatasetConfig(
|
|
|
156
168
|
languages=[IS],
|
|
157
169
|
unofficial=True,
|
|
158
170
|
)
|
|
171
|
+
|
|
172
|
+
EUROPEAN_VALUES_SITUATIONAL_IS_CONFIG = DatasetConfig(
|
|
173
|
+
name="european-values-situational-is",
|
|
174
|
+
pretty_name="the Icelandic version of the European values evaluation dataset, "
|
|
175
|
+
"where the questions are phrased in a situational way",
|
|
176
|
+
huggingface_id="EuroEval/european-values-situational-is",
|
|
177
|
+
task=EUROPEAN_VALUES,
|
|
178
|
+
languages=[IS],
|
|
179
|
+
splits=["test"],
|
|
180
|
+
bootstrap_samples=False,
|
|
181
|
+
_instruction_prompt="{text}",
|
|
182
|
+
unofficial=True,
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
EUROPEAN_VALUES_COMPLETIONS_IS_CONFIG = DatasetConfig(
|
|
186
|
+
name="european-values-completions-is",
|
|
187
|
+
pretty_name="the Icelandic version of the European values evaluation dataset, "
|
|
188
|
+
"where the questions are phrased as sentence completions",
|
|
189
|
+
huggingface_id="EuroEval/european-values-completions-is",
|
|
190
|
+
task=EUROPEAN_VALUES,
|
|
191
|
+
languages=[IS],
|
|
192
|
+
splits=["test"],
|
|
193
|
+
bootstrap_samples=False,
|
|
194
|
+
_instruction_prompt="{text}",
|
|
195
|
+
unofficial=True,
|
|
196
|
+
)
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
4
|
from ..languages import IT
|
|
5
|
-
from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
@@ -67,6 +67,17 @@ HELLASWAG_IT_CONFIG = DatasetConfig(
|
|
|
67
67
|
languages=[IT],
|
|
68
68
|
)
|
|
69
69
|
|
|
70
|
+
EUROPEAN_VALUES_IT_CONFIG = DatasetConfig(
|
|
71
|
+
name="european-values-it",
|
|
72
|
+
pretty_name="the Italian version of the European values evaluation dataset",
|
|
73
|
+
huggingface_id="EuroEval/european-values-it",
|
|
74
|
+
task=EUROPEAN_VALUES,
|
|
75
|
+
languages=[IT],
|
|
76
|
+
splits=["test"],
|
|
77
|
+
bootstrap_samples=False,
|
|
78
|
+
_instruction_prompt="{text}",
|
|
79
|
+
)
|
|
80
|
+
|
|
70
81
|
|
|
71
82
|
### Unofficial datasets ###
|
|
72
83
|
|
|
@@ -109,3 +120,29 @@ GOLDENSWAG_IT_CONFIG = DatasetConfig(
|
|
|
109
120
|
languages=[IT],
|
|
110
121
|
unofficial=True,
|
|
111
122
|
)
|
|
123
|
+
|
|
124
|
+
EUROPEAN_VALUES_SITUATIONAL_IT_CONFIG = DatasetConfig(
|
|
125
|
+
name="european-values-situational-it",
|
|
126
|
+
pretty_name="the Italian version of the European values evaluation dataset, "
|
|
127
|
+
"where the questions are phrased in a situational way",
|
|
128
|
+
huggingface_id="EuroEval/european-values-situational-it",
|
|
129
|
+
task=EUROPEAN_VALUES,
|
|
130
|
+
languages=[IT],
|
|
131
|
+
splits=["test"],
|
|
132
|
+
bootstrap_samples=False,
|
|
133
|
+
_instruction_prompt="{text}",
|
|
134
|
+
unofficial=True,
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
EUROPEAN_VALUES_COMPLETIONS_IT_CONFIG = DatasetConfig(
|
|
138
|
+
name="european-values-completions-it",
|
|
139
|
+
pretty_name="the Italian version of the European values evaluation dataset, where "
|
|
140
|
+
"the questions are phrased as sentence completions",
|
|
141
|
+
huggingface_id="EuroEval/european-values-completions-it",
|
|
142
|
+
task=EUROPEAN_VALUES,
|
|
143
|
+
languages=[IT],
|
|
144
|
+
splits=["test"],
|
|
145
|
+
bootstrap_samples=False,
|
|
146
|
+
_instruction_prompt="{text}",
|
|
147
|
+
unofficial=True,
|
|
148
|
+
)
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""All Latvian dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import LV
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
LATVIAN_TWITTER_SENTIMENT_CONFIG = DatasetConfig(
|
|
10
|
+
name="latvian-twitter-sentiment",
|
|
11
|
+
pretty_name="the truncated version of the Latvian sentiment classification dataset",
|
|
12
|
+
huggingface_id="EuroEval/latvian-twitter-sentiment-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[LV],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
SCALA_LV_CONFIG = DatasetConfig(
|
|
18
|
+
name="scala-lv",
|
|
19
|
+
pretty_name="the Latvian part of the linguistic acceptability dataset ScaLA",
|
|
20
|
+
huggingface_id="EuroEval/scala-lv",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[LV],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
FULLSTACK_NER_LV_CONFIG = DatasetConfig(
|
|
26
|
+
name="fullstack-ner-lv",
|
|
27
|
+
pretty_name="the truncated version of the FullStack NER dataset",
|
|
28
|
+
huggingface_id="EuroEval/fullstack-ner-lv-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[LV],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
MULTI_WIKI_QA_LV_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-lv",
|
|
35
|
+
pretty_name="the truncated version of the Latvian part of the reading "
|
|
36
|
+
"comprehension dataset MultiWikiQA",
|
|
37
|
+
huggingface_id="EuroEval/multi-wiki-qa-lv-mini",
|
|
38
|
+
task=RC,
|
|
39
|
+
languages=[LV],
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
LSM_CONFIG = DatasetConfig(
|
|
43
|
+
name="lsm",
|
|
44
|
+
pretty_name="the truncated version of the Latvian summarisation dataset LSM",
|
|
45
|
+
huggingface_id="EuroEval/lsm-mini",
|
|
46
|
+
task=SUMM,
|
|
47
|
+
languages=[LV],
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
MMLU_LV_CONFIG = DatasetConfig(
|
|
52
|
+
name="mmlu-lv",
|
|
53
|
+
pretty_name="the truncated version of the Latvian knowledge dataset MMLU-lv, "
|
|
54
|
+
"translated from the English MMLU dataset",
|
|
55
|
+
huggingface_id="EuroEval/mmlu-lv-mini",
|
|
56
|
+
task=KNOW,
|
|
57
|
+
languages=[LV],
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
COPA_LV_CONFIG = DatasetConfig(
|
|
61
|
+
name="copa-lv",
|
|
62
|
+
pretty_name="the Latvian common-sense reasoning dataset COPA-lv, translated from "
|
|
63
|
+
"the English COPA dataset",
|
|
64
|
+
huggingface_id="EuroEval/copa-lv",
|
|
65
|
+
task=COMMON_SENSE,
|
|
66
|
+
languages=[LV],
|
|
67
|
+
_labels=["a", "b"],
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
### Unofficial datasets ###
|
|
72
|
+
|
|
73
|
+
WIKIANN_LV_CONFIG = DatasetConfig(
|
|
74
|
+
name="wikiann-lv",
|
|
75
|
+
pretty_name="the truncated version of the Latvian part of the named entity "
|
|
76
|
+
"recognition dataset WikiANN",
|
|
77
|
+
huggingface_id="EuroEval/wikiann-lv-mini",
|
|
78
|
+
task=NER,
|
|
79
|
+
languages=[LV],
|
|
80
|
+
unofficial=True,
|
|
81
|
+
)
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
4
|
from ..languages import NB, NN, NO
|
|
5
|
-
from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
@@ -94,6 +94,17 @@ NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
|
|
|
94
94
|
_labels=["a", "b", "c", "d", "e"],
|
|
95
95
|
)
|
|
96
96
|
|
|
97
|
+
EUROPEAN_VALUES_NO_CONFIG = DatasetConfig(
|
|
98
|
+
name="european-values-no",
|
|
99
|
+
pretty_name="the Norwegian version of the European values evaluation dataset",
|
|
100
|
+
huggingface_id="EuroEval/european-values-no",
|
|
101
|
+
task=EUROPEAN_VALUES,
|
|
102
|
+
languages=[NB, NN, NO],
|
|
103
|
+
splits=["test"],
|
|
104
|
+
bootstrap_samples=False,
|
|
105
|
+
_instruction_prompt="{text}",
|
|
106
|
+
)
|
|
107
|
+
|
|
97
108
|
|
|
98
109
|
### Unofficial datasets ###
|
|
99
110
|
|
|
@@ -204,3 +215,29 @@ MULTI_WIKI_QA_NN_CONFIG = DatasetConfig(
|
|
|
204
215
|
languages=[NN],
|
|
205
216
|
unofficial=True,
|
|
206
217
|
)
|
|
218
|
+
|
|
219
|
+
EUROPEAN_VALUES_SITUATIONAL_NO_CONFIG = DatasetConfig(
|
|
220
|
+
name="european-values-situational-no",
|
|
221
|
+
pretty_name="the Norwegian version of the European values evaluation dataset, "
|
|
222
|
+
"where the questions are phrased in a situational way",
|
|
223
|
+
huggingface_id="EuroEval/european-values-situational-no",
|
|
224
|
+
task=EUROPEAN_VALUES,
|
|
225
|
+
languages=[NB, NN, NO],
|
|
226
|
+
splits=["test"],
|
|
227
|
+
bootstrap_samples=False,
|
|
228
|
+
_instruction_prompt="{text}",
|
|
229
|
+
unofficial=True,
|
|
230
|
+
)
|
|
231
|
+
|
|
232
|
+
EUROPEAN_VALUES_COMPLETIONS_NO_CONFIG = DatasetConfig(
|
|
233
|
+
name="european-values-completions-no",
|
|
234
|
+
pretty_name="the Norwegian version of the European values evaluation dataset, "
|
|
235
|
+
"where the questions are phrased as sentence completions",
|
|
236
|
+
huggingface_id="EuroEval/european-values-completions-no",
|
|
237
|
+
task=EUROPEAN_VALUES,
|
|
238
|
+
languages=[NO],
|
|
239
|
+
splits=["test"],
|
|
240
|
+
bootstrap_samples=False,
|
|
241
|
+
_instruction_prompt="{text}",
|
|
242
|
+
unofficial=True,
|
|
243
|
+
)
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
4
|
from ..languages import PT
|
|
5
|
-
from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
@@ -67,6 +67,17 @@ GOLDENSWAG_PT_CONFIG = DatasetConfig(
|
|
|
67
67
|
languages=[PT],
|
|
68
68
|
)
|
|
69
69
|
|
|
70
|
+
EUROPEAN_VALUES_PT_CONFIG = DatasetConfig(
|
|
71
|
+
name="european-values-pt",
|
|
72
|
+
pretty_name="the Portuguese version of the European values evaluation dataset",
|
|
73
|
+
huggingface_id="EuroEval/european-values-pt",
|
|
74
|
+
task=EUROPEAN_VALUES,
|
|
75
|
+
languages=[PT],
|
|
76
|
+
splits=["test"],
|
|
77
|
+
bootstrap_samples=False,
|
|
78
|
+
_instruction_prompt="{text}",
|
|
79
|
+
)
|
|
80
|
+
|
|
70
81
|
|
|
71
82
|
### Unofficial datasets ###
|
|
72
83
|
|
|
@@ -79,3 +90,29 @@ BOOLQ_PT_CONFIG = DatasetConfig(
|
|
|
79
90
|
languages=[PT],
|
|
80
91
|
unofficial=True,
|
|
81
92
|
)
|
|
93
|
+
|
|
94
|
+
EUROPEAN_VALUES_SITUATIONAL_PT_CONFIG = DatasetConfig(
|
|
95
|
+
name="european-values-situational-pt",
|
|
96
|
+
pretty_name="the Portuguese version of the European values evaluation dataset, "
|
|
97
|
+
"where the questions are phrased in a situational way",
|
|
98
|
+
huggingface_id="EuroEval/european-values-situational-pt",
|
|
99
|
+
task=EUROPEAN_VALUES,
|
|
100
|
+
languages=[PT],
|
|
101
|
+
splits=["test"],
|
|
102
|
+
bootstrap_samples=False,
|
|
103
|
+
_instruction_prompt="{text}",
|
|
104
|
+
unofficial=True,
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
EUROPEAN_VALUES_COMPLETIONS_PT_CONFIG = DatasetConfig(
|
|
108
|
+
name="european-values-completions-pt",
|
|
109
|
+
pretty_name="the Portuguese version of the European values evaluation dataset, "
|
|
110
|
+
"where the questions are phrased as sentence completions",
|
|
111
|
+
huggingface_id="EuroEval/european-values-completions-pt",
|
|
112
|
+
task=EUROPEAN_VALUES,
|
|
113
|
+
languages=[PT],
|
|
114
|
+
splits=["test"],
|
|
115
|
+
bootstrap_samples=False,
|
|
116
|
+
_instruction_prompt="{text}",
|
|
117
|
+
unofficial=True,
|
|
118
|
+
)
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
4
|
from ..languages import ES
|
|
5
|
-
from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
@@ -66,6 +66,17 @@ HELLASWAG_ES_CONFIG = DatasetConfig(
|
|
|
66
66
|
languages=[ES],
|
|
67
67
|
)
|
|
68
68
|
|
|
69
|
+
EUROPEAN_VALUES_ES_CONFIG = DatasetConfig(
|
|
70
|
+
name="european-values-es",
|
|
71
|
+
pretty_name="the Spanish version of the European values evaluation dataset",
|
|
72
|
+
huggingface_id="EuroEval/european-values-es",
|
|
73
|
+
task=EUROPEAN_VALUES,
|
|
74
|
+
languages=[ES],
|
|
75
|
+
splits=["test"],
|
|
76
|
+
bootstrap_samples=False,
|
|
77
|
+
_instruction_prompt="{text}",
|
|
78
|
+
)
|
|
79
|
+
|
|
69
80
|
|
|
70
81
|
### Unofficial datasets ###
|
|
71
82
|
|
|
@@ -107,3 +118,29 @@ GOLDENSWAG_ES_CONFIG = DatasetConfig(
|
|
|
107
118
|
languages=[ES],
|
|
108
119
|
unofficial=True,
|
|
109
120
|
)
|
|
121
|
+
|
|
122
|
+
EUROPEAN_VALUES_SITUATIONAL_ES_CONFIG = DatasetConfig(
|
|
123
|
+
name="european-values-situational-es",
|
|
124
|
+
pretty_name="the Spanish version of the European values evaluation dataset, where "
|
|
125
|
+
"the questions are phrased in a situational way",
|
|
126
|
+
huggingface_id="EuroEval/european-values-situational-es",
|
|
127
|
+
task=EUROPEAN_VALUES,
|
|
128
|
+
languages=[ES],
|
|
129
|
+
splits=["test"],
|
|
130
|
+
bootstrap_samples=False,
|
|
131
|
+
_instruction_prompt="{text}",
|
|
132
|
+
unofficial=True,
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
EUROPEAN_VALUES_COMPLETIONS_ES_CONFIG = DatasetConfig(
|
|
136
|
+
name="european-values-completions-es",
|
|
137
|
+
pretty_name="the Spanish version of the European values evaluation dataset, where "
|
|
138
|
+
"the questions are phrased as sentence completions",
|
|
139
|
+
huggingface_id="EuroEval/european-values-completions-es",
|
|
140
|
+
task=EUROPEAN_VALUES,
|
|
141
|
+
languages=[ES],
|
|
142
|
+
splits=["test"],
|
|
143
|
+
bootstrap_samples=False,
|
|
144
|
+
_instruction_prompt="{text}",
|
|
145
|
+
unofficial=True,
|
|
146
|
+
)
|