EuroEval 16.0.1-py3-none-any.whl → 16.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/benchmark_config_factory.py +6 -1
- euroeval/benchmark_modules/base.py +2 -0
- euroeval/benchmark_modules/fresh.py +7 -1
- euroeval/benchmark_modules/hf.py +26 -21
- euroeval/benchmark_modules/litellm.py +258 -131
- euroeval/benchmark_modules/vllm.py +79 -40
- euroeval/benchmarker.py +11 -2
- euroeval/cli.py +14 -1
- euroeval/constants.py +1 -1
- euroeval/data_models.py +77 -6
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +14 -0
- euroeval/dataset_configs/dutch.py +14 -0
- euroeval/dataset_configs/english.py +22 -0
- euroeval/dataset_configs/estonian.py +15 -7
- euroeval/dataset_configs/finnish.py +14 -0
- euroeval/dataset_configs/french.py +14 -0
- euroeval/dataset_configs/german.py +23 -0
- euroeval/dataset_configs/italian.py +14 -0
- euroeval/dataset_configs/latvian.py +14 -0
- euroeval/dataset_configs/norwegian.py +14 -0
- euroeval/dataset_configs/polish.py +126 -0
- euroeval/dataset_configs/portuguese.py +14 -0
- euroeval/dataset_configs/spanish.py +14 -0
- euroeval/dataset_configs/swedish.py +25 -0
- euroeval/enums.py +12 -0
- euroeval/generation.py +17 -8
- euroeval/generation_utils.py +65 -11
- euroeval/metrics/pipeline.py +1 -1
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +27 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +11 -0
- euroeval/prompt_templates/sentiment_classification.py +15 -0
- euroeval/prompt_templates/summarization.py +27 -1
- euroeval/scores.py +5 -0
- euroeval/task_group_utils/question_answering.py +29 -29
- euroeval/task_group_utils/sequence_classification.py +11 -34
- euroeval/task_group_utils/token_classification.py +3 -3
- euroeval/tasks.py +4 -4
- euroeval/{tokenization_utils.py → tokenisation_utils.py} +50 -28
- euroeval/utils.py +36 -3
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/METADATA +1 -1
- euroeval-16.1.1.dist-info/RECORD +70 -0
- euroeval-16.0.1.dist-info/RECORD +0 -69
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/WHEEL +0 -0
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/entry_points.txt +0 -0
- {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/licenses/LICENSE +0 -0
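Note the module rename in the list above: euroeval/tokenization_utils.py is now euroeval/tokenisation_utils.py, so any code importing the old path directly will break on upgrade. A minimal compatibility sketch, assuming the module stays importable at these paths (it is an internal module, so treat that as an assumption):

try:
    # 16.1.1 and later (British spelling, per this diff)
    from euroeval.tokenisation_utils import apply_chat_template
except ImportError:
    # 16.0.1 and earlier (American spelling)
    from euroeval.tokenization_utils import apply_chat_template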
euroeval/dataset_configs/estonian.py
CHANGED

@@ -47,13 +47,12 @@ ERR_NEWS_CONFIG = DatasetConfig(
     languages=[ET],
 )
 
-
-    name="
-    pretty_name="the Estonian knowledge
-    huggingface_id="EuroEval/
+TRIVIA_ET_CONFIG = DatasetConfig(
+    name="trivia-et",
+    pretty_name="the Estonian knowledge dataset Trivia-et",
+    huggingface_id="EuroEval/trivia-et",
     task=KNOW,
     languages=[ET],
-    _labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"],
 )
 
 WINOGRANDE_ET_CONFIG = DatasetConfig(
@@ -82,8 +81,7 @@ EUROPEAN_VALUES_ET_CONFIG = DatasetConfig(
     _instruction_prompt="{text}",
 )
 
-
-### Unofficial datasets ###
+### Unofficial datasets ###
 
 SCALA_ET_CONFIG = DatasetConfig(
     name="scala-et",
@@ -93,3 +91,13 @@ SCALA_ET_CONFIG = DatasetConfig(
     languages=[ET],
     unofficial=True,
 )
+
+EXAM_ET_CONFIG = DatasetConfig(
+    name="exam-et",
+    pretty_name="the Estonian knowledge assessment dataset Exam-et",
+    huggingface_id="EuroEval/exam-et",
+    task=KNOW,
+    languages=[ET],
+    _labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"],
+    unofficial=True,
+)
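The new exam-et config spells out fifteen option labels by hand. For readers skimming the diff, the list is simply the first fifteen lowercase letters; a quick sanity check (illustration only, not EuroEval code):

import string

labels = list(string.ascii_lowercase[:15])
assert labels == ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"]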
euroeval/dataset_configs/finnish.py
CHANGED

@@ -1,6 +1,7 @@
 """All Finnish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import FI
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -101,6 +102,19 @@ GOLDENSWAG_FI_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_FI_CONFIG = DatasetConfig(
+    name="winogrande-fi",
+    pretty_name="the Finnish common-sense reasoning dataset Winogrande-fi, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-fi",
+    task=COMMON_SENSE,
+    languages=[FI],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_FI_CONFIG = DatasetConfig(
     name="european-values-situational-fi",
     pretty_name="the Finnish version of the European values evaluation dataset, where "
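The Winogrande block added here for Finnish recurs nearly verbatim for French, German, Italian, Latvian, Norwegian, Polish, Portuguese, Spanish and Swedish below; only the language code, language list and pretty name vary. A hypothetical factory capturing the shared shape (illustration only, not part of EuroEval; import paths follow the relative imports in this diff):

from euroeval.data_models import DatasetConfig
from euroeval.enums import ModelType
from euroeval.tasks import COMMON_SENSE


def make_winogrande_config(code: str, language_name: str, languages: list) -> DatasetConfig:
    # Mirrors the block repeated in this release: a binary-choice dataset with
    # train/test splits, restricted to generative models and marked unofficial.
    return DatasetConfig(
        name=f"winogrande-{code}",
        pretty_name=f"the {language_name} common-sense reasoning dataset "
        f"Winogrande-{code}, translated from the English Winogrande dataset",
        huggingface_id=f"EuroEval/winogrande-{code}",
        task=COMMON_SENSE,
        languages=languages,
        splits=["train", "test"],
        _labels=["a", "b"],
        _allowed_model_types=[ModelType.GENERATIVE],
        unofficial=True,
    )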
euroeval/dataset_configs/french.py
CHANGED

@@ -1,6 +1,7 @@
 """All French dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import FR
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -113,6 +114,19 @@ GOLDENSWAG_FR_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_FR_CONFIG = DatasetConfig(
+    name="winogrande-fr",
+    pretty_name="the French common-sense reasoning dataset Winogrande-fr, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-fr",
+    task=COMMON_SENSE,
+    languages=[FR],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_FR_CONFIG = DatasetConfig(
     name="european-values-situational-fr",
     pretty_name="the French version of the European values evaluation dataset, where "
euroeval/dataset_configs/german.py
CHANGED

@@ -1,6 +1,7 @@
 """All German dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import DE
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -81,6 +82,15 @@ EUROPEAN_VALUES_DE_CONFIG = DatasetConfig(
 
 ### Unofficial datasets ###
 
+XQUAD_DE_CONFIG = DatasetConfig(
+    name="xquad-de",
+    pretty_name="the German version of the reading comprehension dataset XQuAD",
+    huggingface_id="EuroEval/xquad-de",
+    task=RC,
+    languages=[DE],
+    unofficial=True,
+)
+
 ARC_DE_CONFIG = DatasetConfig(
     name="arc-de",
     pretty_name="the truncated version of the German knowledge dataset ARC-de, "
@@ -121,6 +131,19 @@ GOLDENSWAG_DE_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_DE_CONFIG = DatasetConfig(
+    name="winogrande-de",
+    pretty_name="the German common-sense reasoning dataset Winogrande-de, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-de",
+    task=COMMON_SENSE,
+    languages=[DE],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_DE_CONFIG = DatasetConfig(
     name="european-values-situational-de",
     pretty_name="the German version of the European values evaluation dataset, where "
euroeval/dataset_configs/italian.py
CHANGED

@@ -1,6 +1,7 @@
 """All Italian dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import IT
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -121,6 +122,19 @@ GOLDENSWAG_IT_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_IT_CONFIG = DatasetConfig(
+    name="winogrande-it",
+    pretty_name="the Italian common-sense reasoning dataset Winogrande-it, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-it",
+    task=COMMON_SENSE,
+    languages=[IT],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_IT_CONFIG = DatasetConfig(
     name="european-values-situational-it",
     pretty_name="the Italian version of the European values evaluation dataset, "
euroeval/dataset_configs/latvian.py
CHANGED

@@ -1,6 +1,7 @@
 """All Latvian dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import LV
 from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
 
@@ -79,3 +80,16 @@ WIKIANN_LV_CONFIG = DatasetConfig(
     languages=[LV],
     unofficial=True,
 )
+
+WINOGRANDE_LV_CONFIG = DatasetConfig(
+    name="winogrande-lv",
+    pretty_name="the Latvian common-sense reasoning dataset Winogrande-lv, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-lv",
+    task=COMMON_SENSE,
+    languages=[LV],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
euroeval/dataset_configs/norwegian.py
CHANGED

@@ -1,6 +1,7 @@
 """All Norwegian dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import NB, NN, NO
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -216,6 +217,19 @@ MULTI_WIKI_QA_NN_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_NO_CONFIG = DatasetConfig(
+    name="winogrande-no",
+    pretty_name="the Norwegian common-sense reasoning dataset Winogrande-no, "
+    "translated from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-no",
+    task=COMMON_SENSE,
+    languages=[NB, NN, NO],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_NO_CONFIG = DatasetConfig(
    name="european-values-situational-no",
    pretty_name="the Norwegian version of the European values evaluation dataset, "
euroeval/dataset_configs/polish.py
ADDED

@@ -0,0 +1,126 @@
+"""All Polish dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..enums import ModelType
+from ..languages import PL
+from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+POLEMO2_CONFIG = DatasetConfig(
+    name="polemo2",
+    pretty_name="the Polish sentiment classification dataset PolEmo2",
+    huggingface_id="EuroEval/polemo2-mini",
+    task=SENT,
+    languages=[PL],
+)
+
+SCALA_PL_CONFIG = DatasetConfig(
+    name="scala-pl",
+    pretty_name="the Polish part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-pl",
+    task=LA,
+    languages=[PL],
+)
+
+KPWR_NER_CONFIG = DatasetConfig(
+    name="kpwr-ner",
+    pretty_name="the Polish entity recognition dataset KPWr-NER",
+    huggingface_id="EuroEval/kpwr-ner",
+    task=NER,
+    languages=[PL],
+)
+
+POQUAD_CONFIG = DatasetConfig(
+    name="poquad",
+    pretty_name="the Polish question answering dataset PoQuAD",
+    huggingface_id="EuroEval/poquad-mini",
+    task=RC,
+    languages=[PL],
+)
+
+PSC_CONFIG = DatasetConfig(
+    name="psc",
+    pretty_name="the Polish summarisation dataset PSC",
+    huggingface_id="EuroEval/psc-mini",
+    task=SUMM,
+    languages=[PL],
+)
+
+LLMZSZL_CONFIG = DatasetConfig(
+    name="llmzszl",
+    pretty_name="the Polish knowledge dataset LLMzSzŁ",
+    huggingface_id="EuroEval/llmzszl-mini",
+    task=KNOW,
+    languages=[PL],
+)
+
+WINOGRANDE_PL_CONFIG = DatasetConfig(
+    name="winogrande-pl",
+    pretty_name="the Polish common-sense reasoning dataset Winogrande-pl, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-pl",
+    task=COMMON_SENSE,
+    languages=[PL],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+)
+
+EUROPEAN_VALUES_PL_CONFIG = DatasetConfig(
+    name="european-values-pl",
+    pretty_name="the Polish version of the European values evaluation dataset",
+    huggingface_id="EuroEval/european-values-pl",
+    task=EUROPEAN_VALUES,
+    languages=[PL],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+)
+
+
+### Unofficial datasets ###
+
+MULTI_WIKI_QA_PL_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-pl",
+    pretty_name="the truncated version of the Polish part of the reading "
+    "comprehension dataset MultiWikiQA",
+    huggingface_id="EuroEval/multi-wiki-qa-pl-mini",
+    task=RC,
+    languages=[PL],
+    unofficial=True,
+)
+
+GOLDENSWAG_PL_CONFIG = DatasetConfig(
+    name="goldenswag-pl",
+    pretty_name="the truncated version of the Polish common-sense reasoning "
+    "dataset GoldenSwag-pl, translated from the English GoldenSwag dataset",
+    huggingface_id="EuroEval/goldenswag-pl-mini",
+    task=COMMON_SENSE,
+    languages=[PL],
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_SITUATIONAL_PL_CONFIG = DatasetConfig(
+    name="european-values-situational-pl",
+    pretty_name="the Polish version of the European values evaluation dataset, where "
+    "the questions are phrased in a situational way",
+    huggingface_id="EuroEval/european-values-situational-pl",
+    task=EUROPEAN_VALUES,
+    languages=[PL],
+    splits=["test"],
+    bootstrap_samples=False,
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_COMPLETIONS_PL_CONFIG = DatasetConfig(
+    name="european-values-completions-pl",
+    pretty_name="the Polish version of the European values evaluation dataset, where "
+    "the questions are phrased as sentence completions",
+    huggingface_id="EuroEval/european-values-completions-pl",
+    task=EUROPEAN_VALUES,
+    languages=[PL],
+    splits=["test"],
+    bootstrap_samples=False,
+    unofficial=True,
+)
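Since polish.py is a brand-new module, a quick way to confirm the configs are present after upgrading is to import them from the path shown in this diff (object names taken directly from the file above):

from euroeval.dataset_configs.polish import POLEMO2_CONFIG, WINOGRANDE_PL_CONFIG

print(POLEMO2_CONFIG.name)        # polemo2
print(WINOGRANDE_PL_CONFIG.name)  # winogrande-pl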
euroeval/dataset_configs/portuguese.py
CHANGED

@@ -1,6 +1,7 @@
 """All Portuguese dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import PT
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -91,6 +92,19 @@ BOOLQ_PT_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_PT_CONFIG = DatasetConfig(
+    name="winogrande-pt",
+    pretty_name="the Portuguese common-sense reasoning dataset Winogrande-pt, "
+    "translated from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-pt",
+    task=COMMON_SENSE,
+    languages=[PT],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_PT_CONFIG = DatasetConfig(
     name="european-values-situational-pt",
     pretty_name="the Portuguese version of the European values evaluation dataset, "
euroeval/dataset_configs/spanish.py
CHANGED

@@ -1,6 +1,7 @@
 """All Spanish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import ES
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -119,6 +120,19 @@ GOLDENSWAG_ES_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_ES_CONFIG = DatasetConfig(
+    name="winogrande-es",
+    pretty_name="the Spanish common-sense reasoning dataset Winogrande-es, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-es",
+    task=COMMON_SENSE,
+    languages=[ES],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_ES_CONFIG = DatasetConfig(
     name="european-values-situational-es",
     pretty_name="the Spanish version of the European values evaluation dataset, where "
euroeval/dataset_configs/swedish.py
CHANGED

@@ -1,6 +1,7 @@
 """All Swedish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import SV
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -130,6 +131,19 @@ GOLDENSWAG_SV_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_SV_CONFIG = DatasetConfig(
+    name="winogrande-sv",
+    pretty_name="the Swedish common-sense reasoning dataset Winogrande-sv, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-sv",
+    task=COMMON_SENSE,
+    languages=[SV],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_SV_CONFIG = DatasetConfig(
     name="european-values-situational-sv",
     pretty_name="the Swedish version of the European values evaluation dataset, where "
@@ -155,3 +169,14 @@ EUROPEAN_VALUES_COMPLETIONS_SV_CONFIG = DatasetConfig(
     _instruction_prompt="{text}",
     unofficial=True,
 )
+
+SKOLPROV_CONFIG = DatasetConfig(
+    name="skolprov",
+    pretty_name="the Swedish knowledge dataset Skolprov",
+    huggingface_id="EuroEval/skolprov",
+    task=KNOW,
+    languages=[SV],
+    splits=["train", "test"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
euroeval/enums.py
CHANGED
@@ -12,6 +12,14 @@ class AutoStrEnum(str, Enum):
     ) -> str:
         return name.lower()
 
+    def __str__(self) -> str:
+        """Return the value in upper case for better readability."""
+        return self.value.upper()
+
+    def __repr__(self) -> str:
+        """Return the value in upper case for better readability."""
+        return self.value.upper()
+
 
 class Device(AutoStrEnum):
     """The compute device to use for the evaluation.
@@ -60,6 +68,10 @@ class ModelType(AutoStrEnum):
     ENCODER = auto()
     GENERATIVE = auto()
 
+    def __repr__(self) -> str:
+        """Return the value in upper case for better readability."""
+        return self.value.upper()
+
 
 class GenerativeType(AutoStrEnum):
     """The type of a generative model.
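The effect of the new __str__/__repr__ overrides is that enum members print as their upper-cased names while the stored value stays lowercase. A standalone sketch of the behaviour (reimplemented here rather than imported, so it runs in isolation):

from enum import Enum, auto


class AutoStrEnum(str, Enum):
    def _generate_next_value_(name, start, count, last_values) -> str:  # noqa: N805
        # auto() values become the lower-cased member name
        return name.lower()

    def __str__(self) -> str:
        return self.value.upper()

    def __repr__(self) -> str:
        return self.value.upper()


class ModelType(AutoStrEnum):
    ENCODER = auto()
    GENERATIVE = auto()


print(ModelType.GENERATIVE)        # GENERATIVE
print(repr(ModelType.GENERATIVE))  # GENERATIVE
print(ModelType.GENERATIVE.value)  # generative (stored value unchanged)

The extra __repr__ added to ModelType in the diff is already inherited from AutoStrEnum, so it appears redundant.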
euroeval/generation.py
CHANGED
@@ -307,7 +307,7 @@ def debug_log(
                     for label in batch["label"]
                 ]
             else:
-                labels = [
+                labels = [None] * len(extracted_labels)
 
         case TaskGroup.QUESTION_ANSWERING:
             extracted_labels = [
@@ -330,12 +330,21 @@ def debug_log(
     else:
         input_texts = batch["text"]
 
-
-
-
+    metadata_keys: list[str] = [
+        key
+        for key in batch.keys()
+        if key not in ["text", "messages", "label", "labels", "target_text"]
+    ]
+
+    for idx in range(len(input_texts)):
+        data_to_log: dict[str, t.Any] = {
+            "Input": input_texts[idx],
+            "Raw output": model_output.sequences[idx],
+            "Prediction": extracted_labels[idx],
+        }
+        if labels[idx]:
+            data_to_log["Label"] = labels[idx]
+        data_to_log |= {key.capitalize(): batch[key][idx] for key in metadata_keys}
         logger.info(
-            f"
-            f"Raw output: '{raw_output}'\n"
-            f"Prediction: '{prediction}'\n"
-            f"Label: '{label}'"
+            "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items())
         )
euroeval/generation_utils.py
CHANGED
@@ -4,11 +4,12 @@ import itertools as it
 import json
 import logging
 import random
+import re
 import typing as t
 
-from .enums import TaskGroup
-from .exceptions import InvalidBenchmark
-from .tokenization_utils import apply_chat_template
+from .enums import GenerativeType, TaskGroup
+from .exceptions import InvalidBenchmark, InvalidModel
+from .tokenisation_utils import apply_chat_template
 from .utils import extract_multiple_choice_labels, log_once
 
 if t.TYPE_CHECKING:
@@ -173,7 +174,7 @@ def apply_prompt(
     few_shot_examples: list[dict[str, t.Any]],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
-
+    generative_type: GenerativeType | None,
     always_populate_text_field: bool,
     tokeniser: "PreTrainedTokenizer | None",
 ) -> dict[str, t.Any]:
@@ -184,10 +185,12 @@ def apply_prompt(
            The examples to apply the few-shot examples to.
        few_shot_examples:
            The few-shot examples to apply.
+       model_config:
+           The model configuration.
        dataset_config:
            The dataset configuration.
-
-
+       generative_type:
+           The generative type of the model.
        always_populate_text_field:
            Whether to always populate the 'text' field in the examples, as opposed to
            the 'messages' field.
@@ -198,7 +201,11 @@ def apply_prompt(
        The example with the few-shot examples applied.
    """
    # Sanity check
-    if
+    if (
+        generative_type in {GenerativeType.INSTRUCTION_TUNED, GenerativeType.REASONING}
+        and always_populate_text_field
+        and tokeniser is None
+    ):
        raise ValueError(
            "The `tokeniser` argument must be provided when the model is instruction "
            "tuned and when we are not just returning the raw messages."
@@ -222,7 +229,10 @@ def apply_prompt(
        )
        label_mapping = dataset_config.prompt_label_mapping
        label = label_mapping.get(label, label)
-        if
+        if generative_type in {
+            GenerativeType.INSTRUCTION_TUNED,
+            GenerativeType.REASONING,
+        }:
            prompt = dataset_config.instruction_prompt.format(**kwargs)
            return prompt, label
        else:
@@ -348,7 +358,7 @@ def apply_prompt(
            f"Unsupported task group: {dataset_config.task.task_group}."
        )
 
-    if
+    if generative_type in {GenerativeType.INSTRUCTION_TUNED, GenerativeType.REASONING}:
        few_shot_messages = [
            dict(role=role, content=content)
            for prompt, label in few_shot_sections
@@ -362,7 +372,6 @@ def apply_prompt(
 
        if not always_populate_text_field:
            examples["messages"] = messages_list
-
        else:
            assert tokeniser is not None
 
@@ -389,6 +398,9 @@ def apply_prompt(
                apply_chat_template(
                    conversation=messages,
                    tokeniser=tokeniser,
+                   tokenise=False,
+                   add_generation_prompt=True,
+                   enable_thinking=(generative_type == GenerativeType.REASONING),
                    chat_template=chat_template,
                )
                for messages in messages_list
@@ -399,7 +411,10 @@ def apply_prompt(
    else:
        prompt_prefix = ""
        if dataset_config.prompt_prefix:
-
+            labels_str = dataset_config.get_labels_str()
+            prompt_prefix = (
+                dataset_config.prompt_prefix.format(labels_str=labels_str) + "\n\n"
+            )
 
        few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
        if few_shot_prompt:
@@ -414,3 +429,42 @@ def apply_prompt(
    examples["prompt"] = [new_prompt for new_prompt, _ in new_sections]
 
    return examples
+
+
+def raise_if_wrong_params(
+    model_config: "ModelConfig", allowed_params: dict[re.Pattern, list[str]]
+) -> None:
+    """Raise an error if the model configuration has invalid parameters.
+
+    Args:
+        model_config:
+            The model configuration.
+        allowed_params:
+            The allowed parameters for the model, being a dictionary mapping a regex
+            pattern matching the model ID to a list of allowed parameters for those
+            models.
+
+    Raises:
+        InvalidModel:
+            If the model configuration has invalid parameters.
+    """
+    if model_config.param is None:
+        return
+    for model_regex, allowed_params_list in allowed_params.items():
+        if re.fullmatch(pattern=model_regex, string=model_config.model_id):
+            if model_config.param not in allowed_params_list:
+                msg = (
+                    f"Invalid parameter {model_config.param!r} for model "
+                    f"{model_config.model_id!r}."
+                )
+                if allowed_params_list:
+                    msg += f" Allowed parameters are: {', '.join(allowed_params_list)}."
+                else:
+                    msg += " No parameters are allowed."
+                raise InvalidModel(msg)
+            return
+    else:
+        raise InvalidModel(
+            f"The parameter {model_config.param!r} is not supported for the model "
+            f"{model_config.model_id!r}."
+        )
euroeval/metrics/pipeline.py
CHANGED
@@ -217,7 +217,7 @@ def european_values_preprocessing_fn(
     )
 
     # Double check that we reshaped the predictions correctly
-    for idx, pred in enumerate(
+    for idx, pred in enumerate(integer_predictions):
         assert arr[idx // 5, idx % 5] == pred, (
             f"Reshaped predictions do not match the original predictions at index "
             f"{idx}: {arr[idx // 5, idx % 5]} != {pred}."