EuroEval 16.4.0__py3-none-any.whl → 16.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic. Click here for more details.
- euroeval/__init__.py +6 -0
- euroeval/benchmark_config_factory.py +51 -46
- euroeval/benchmark_modules/base.py +6 -5
- euroeval/benchmark_modules/hf.py +2 -9
- euroeval/benchmark_modules/litellm.py +14 -12
- euroeval/benchmark_modules/vllm.py +17 -10
- euroeval/benchmarker.py +61 -44
- euroeval/caching_utils.py +1 -1
- euroeval/cli.py +86 -8
- euroeval/constants.py +3 -0
- euroeval/data_loading.py +78 -30
- euroeval/data_models.py +326 -326
- euroeval/dataset_configs/__init__.py +10 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +25 -29
- euroeval/dataset_configs/danish.py +51 -88
- euroeval/dataset_configs/dutch.py +48 -86
- euroeval/dataset_configs/english.py +45 -76
- euroeval/dataset_configs/estonian.py +36 -38
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -68
- euroeval/dataset_configs/french.py +39 -74
- euroeval/dataset_configs/german.py +45 -81
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -78
- euroeval/dataset_configs/latvian.py +28 -34
- euroeval/dataset_configs/lithuanian.py +22 -26
- euroeval/dataset_configs/norwegian.py +72 -114
- euroeval/dataset_configs/polish.py +33 -60
- euroeval/dataset_configs/portuguese.py +33 -65
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +19 -24
- euroeval/dataset_configs/spanish.py +42 -76
- euroeval/dataset_configs/swedish.py +48 -84
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +3 -2
- euroeval/generation.py +5 -4
- euroeval/generation_utils.py +6 -5
- euroeval/languages.py +395 -323
- euroeval/metrics/huggingface.py +14 -3
- euroeval/metrics/llm_as_a_judge.py +1 -1
- euroeval/model_cache.py +6 -5
- euroeval/model_loading.py +1 -1
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +82 -43
- euroeval/prompt_templates/multiple_choice.py +81 -41
- euroeval/prompt_templates/named_entity_recognition.py +125 -44
- euroeval/prompt_templates/reading_comprehension.py +92 -43
- euroeval/prompt_templates/sentiment_classification.py +91 -43
- euroeval/prompt_templates/summarization.py +64 -39
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +4 -3
- euroeval/speed_benchmark.py +2 -1
- euroeval/task_group_utils/multiple_choice_classification.py +2 -1
- euroeval/task_group_utils/question_answering.py +24 -13
- euroeval/task_group_utils/sequence_classification.py +5 -4
- euroeval/task_group_utils/text_to_text.py +2 -1
- euroeval/task_group_utils/token_classification.py +11 -8
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +19 -10
- euroeval/types.py +10 -9
- euroeval/utils.py +6 -3
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.4.0.dist-info/RECORD +0 -75
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""All Serbian dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import SERBIAN
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
MMS_SR_CONFIG = DatasetConfig(
|
|
10
|
+
name="mms-sr",
|
|
11
|
+
pretty_name="MMS-sr",
|
|
12
|
+
source="EuroEval/mms-sr-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[SERBIAN],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
SCALA_SR_CONFIG = DatasetConfig(
|
|
18
|
+
name="scala-sr",
|
|
19
|
+
pretty_name="ScaLA-sr",
|
|
20
|
+
source="EuroEval/scala-sr",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[SERBIAN],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
UNER_SR_CONFIG = DatasetConfig(
|
|
26
|
+
name="uner-sr",
|
|
27
|
+
pretty_name="UNER-sr",
|
|
28
|
+
source="EuroEval/uner-sr-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[SERBIAN],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
MULTI_WIKI_QA_SR_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-sr",
|
|
35
|
+
pretty_name="MultiWikiQA-sr",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-sr-mini",
|
|
37
|
+
task=RC,
|
|
38
|
+
languages=[SERBIAN],
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
LR_SUM_SR_CONFIG = DatasetConfig(
|
|
42
|
+
name="lr-sum-sr",
|
|
43
|
+
pretty_name="LRSum-sr",
|
|
44
|
+
source="EuroEval/lr-sum-sr-mini",
|
|
45
|
+
task=SUMM,
|
|
46
|
+
languages=[SERBIAN],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
MMLU_SR_CONFIG = DatasetConfig(
|
|
50
|
+
name="mmlu-sr",
|
|
51
|
+
pretty_name="MMLU-sr",
|
|
52
|
+
source="EuroEval/mmlu-sr-mini",
|
|
53
|
+
task=KNOW,
|
|
54
|
+
languages=[SERBIAN],
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
WINOGRANDE_SR_CONFIG = DatasetConfig(
|
|
58
|
+
name="winogrande-sr",
|
|
59
|
+
pretty_name="Winogrande-sr",
|
|
60
|
+
source="EuroEval/winogrande-sr",
|
|
61
|
+
task=COMMON_SENSE,
|
|
62
|
+
languages=[SERBIAN],
|
|
63
|
+
_labels=["a", "b"],
|
|
64
|
+
)
|
|
@@ -1,60 +1,55 @@
|
|
|
1
1
|
"""All Slovak dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
4
|
+
from ..languages import SLOVAK
|
|
5
5
|
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
CSFD_SENTIMENT_SK_CONFIG = DatasetConfig(
|
|
10
10
|
name="csfd-sentiment-sk",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/csfd-sentiment-sk-mini",
|
|
11
|
+
pretty_name="CSFD Sentiment SK",
|
|
12
|
+
source="EuroEval/csfd-sentiment-sk-mini",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[SLOVAK],
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
SCALA_SK_CONFIG = DatasetConfig(
|
|
19
18
|
name="scala-sk",
|
|
20
|
-
pretty_name="
|
|
21
|
-
|
|
19
|
+
pretty_name="ScaLA-sk",
|
|
20
|
+
source="EuroEval/scala-sk",
|
|
22
21
|
task=LA,
|
|
23
|
-
languages=[
|
|
22
|
+
languages=[SLOVAK],
|
|
24
23
|
)
|
|
25
24
|
|
|
26
25
|
UNER_SK_CONFIG = DatasetConfig(
|
|
27
26
|
name="uner-sk",
|
|
28
|
-
pretty_name="
|
|
29
|
-
"
|
|
30
|
-
huggingface_id="EuroEval/uner-sk-mini",
|
|
27
|
+
pretty_name="UNER-sk",
|
|
28
|
+
source="EuroEval/uner-sk-mini",
|
|
31
29
|
task=NER,
|
|
32
|
-
languages=[
|
|
30
|
+
languages=[SLOVAK],
|
|
33
31
|
)
|
|
34
32
|
|
|
35
33
|
MULTI_WIKI_QA_SK_CONFIG = DatasetConfig(
|
|
36
34
|
name="multi-wiki-qa-sk",
|
|
37
|
-
pretty_name="
|
|
38
|
-
"
|
|
39
|
-
huggingface_id="EuroEval/multi-wiki-qa-sk-mini",
|
|
35
|
+
pretty_name="MultiWikiQA-sk",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-sk-mini",
|
|
40
37
|
task=RC,
|
|
41
|
-
languages=[
|
|
38
|
+
languages=[SLOVAK],
|
|
42
39
|
)
|
|
43
40
|
|
|
44
41
|
MMLU_SK_CONFIG = DatasetConfig(
|
|
45
42
|
name="mmlu-sk",
|
|
46
|
-
pretty_name="
|
|
47
|
-
"
|
|
48
|
-
huggingface_id="EuroEval/mmlu-sk-mini",
|
|
43
|
+
pretty_name="MMLU-sk",
|
|
44
|
+
source="EuroEval/mmlu-sk-mini",
|
|
49
45
|
task=KNOW,
|
|
50
|
-
languages=[
|
|
46
|
+
languages=[SLOVAK],
|
|
51
47
|
)
|
|
52
48
|
|
|
53
49
|
WINOGRANDE_SK_CONFIG = DatasetConfig(
|
|
54
50
|
name="winogrande-sk",
|
|
55
|
-
pretty_name="
|
|
56
|
-
"
|
|
57
|
-
huggingface_id="EuroEval/winogrande-sk",
|
|
51
|
+
pretty_name="Winogrande-sk",
|
|
52
|
+
source="EuroEval/winogrande-sk",
|
|
58
53
|
task=COMMON_SENSE,
|
|
59
|
-
languages=[
|
|
54
|
+
languages=[SLOVAK],
|
|
60
55
|
)
|
|
@@ -1,77 +1,73 @@
|
|
|
1
1
|
"""All Spanish dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
4
|
+
from ..languages import SPANISH
|
|
5
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
SENTIMENT_HEADLINES_CONFIG = DatasetConfig(
|
|
10
10
|
name="sentiment-headlines-es",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/sentiment-headlines-es",
|
|
11
|
+
pretty_name="Sentiment Headlines ES",
|
|
12
|
+
source="EuroEval/sentiment-headlines-es",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[SPANISH],
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
SCALA_ES_CONFIG = DatasetConfig(
|
|
19
18
|
name="scala-es",
|
|
20
|
-
pretty_name="
|
|
21
|
-
|
|
19
|
+
pretty_name="ScaLA-es",
|
|
20
|
+
source="EuroEval/scala-es",
|
|
22
21
|
task=LA,
|
|
23
|
-
languages=[
|
|
22
|
+
languages=[SPANISH],
|
|
24
23
|
)
|
|
25
24
|
|
|
26
25
|
CONLL_ES_CONFIG = DatasetConfig(
|
|
27
26
|
name="conll-es",
|
|
28
|
-
pretty_name="
|
|
29
|
-
"
|
|
30
|
-
huggingface_id="EuroEval/conll-es-mini",
|
|
27
|
+
pretty_name="CoNLL-es",
|
|
28
|
+
source="EuroEval/conll-es-mini",
|
|
31
29
|
task=NER,
|
|
32
|
-
languages=[
|
|
30
|
+
languages=[SPANISH],
|
|
33
31
|
)
|
|
34
32
|
|
|
35
33
|
MLQA_ES_CONFIG = DatasetConfig(
|
|
36
34
|
name="mlqa-es",
|
|
37
|
-
pretty_name="
|
|
38
|
-
|
|
35
|
+
pretty_name="MLQA-es",
|
|
36
|
+
source="EuroEval/mlqa-es",
|
|
39
37
|
task=RC,
|
|
40
|
-
languages=[
|
|
38
|
+
languages=[SPANISH],
|
|
41
39
|
)
|
|
42
40
|
|
|
43
41
|
MLSUM_ES_CONFIG = DatasetConfig(
|
|
44
42
|
name="mlsum-es",
|
|
45
|
-
pretty_name="
|
|
46
|
-
|
|
43
|
+
pretty_name="MLSUM-es",
|
|
44
|
+
source="EuroEval/mlsum-es-mini",
|
|
47
45
|
task=SUMM,
|
|
48
|
-
languages=[
|
|
46
|
+
languages=[SPANISH],
|
|
49
47
|
)
|
|
50
48
|
|
|
51
49
|
MMLU_ES_CONFIG = DatasetConfig(
|
|
52
50
|
name="mmlu-es",
|
|
53
|
-
pretty_name="
|
|
54
|
-
"
|
|
55
|
-
huggingface_id="EuroEval/mmlu-es-mini",
|
|
51
|
+
pretty_name="MMLU-es",
|
|
52
|
+
source="EuroEval/mmlu-es-mini",
|
|
56
53
|
task=KNOW,
|
|
57
|
-
languages=[
|
|
54
|
+
languages=[SPANISH],
|
|
58
55
|
)
|
|
59
56
|
|
|
60
57
|
HELLASWAG_ES_CONFIG = DatasetConfig(
|
|
61
58
|
name="hellaswag-es",
|
|
62
|
-
pretty_name="
|
|
63
|
-
"
|
|
64
|
-
huggingface_id="EuroEval/hellaswag-es-mini",
|
|
59
|
+
pretty_name="HellaSwag-es",
|
|
60
|
+
source="EuroEval/hellaswag-es-mini",
|
|
65
61
|
task=COMMON_SENSE,
|
|
66
|
-
languages=[
|
|
62
|
+
languages=[SPANISH],
|
|
67
63
|
)
|
|
68
64
|
|
|
69
|
-
|
|
70
|
-
name="
|
|
71
|
-
pretty_name="
|
|
72
|
-
|
|
65
|
+
VALEU_ES_CONFIG = DatasetConfig(
|
|
66
|
+
name="valeu-es",
|
|
67
|
+
pretty_name="VaLEU-es",
|
|
68
|
+
source="EuroEval/european-values-es",
|
|
73
69
|
task=EUROPEAN_VALUES,
|
|
74
|
-
languages=[
|
|
70
|
+
languages=[SPANISH],
|
|
75
71
|
splits=["test"],
|
|
76
72
|
bootstrap_samples=False,
|
|
77
73
|
_instruction_prompt="{text}",
|
|
@@ -82,76 +78,46 @@ EUROPEAN_VALUES_ES_CONFIG = DatasetConfig(
|
|
|
82
78
|
|
|
83
79
|
XQUAD_ES_CONFIG = DatasetConfig(
|
|
84
80
|
name="xquad-es",
|
|
85
|
-
pretty_name="
|
|
86
|
-
|
|
81
|
+
pretty_name="XQuAD-es",
|
|
82
|
+
source="EuroEval/xquad-es",
|
|
87
83
|
task=RC,
|
|
88
|
-
languages=[
|
|
84
|
+
languages=[SPANISH],
|
|
89
85
|
unofficial=True,
|
|
90
86
|
)
|
|
91
87
|
|
|
92
88
|
BELEBELE_ES_CONFIG = DatasetConfig(
|
|
93
89
|
name="belebele-es",
|
|
94
|
-
pretty_name="
|
|
95
|
-
"
|
|
96
|
-
huggingface_id="EuroEval/belebele-es-mini",
|
|
90
|
+
pretty_name="Belebele-es",
|
|
91
|
+
source="EuroEval/belebele-es-mini",
|
|
97
92
|
task=MCRC,
|
|
98
|
-
languages=[
|
|
93
|
+
languages=[SPANISH],
|
|
99
94
|
unofficial=True,
|
|
100
95
|
)
|
|
101
96
|
|
|
102
97
|
MULTI_WIKI_QA_ES_CONFIG = DatasetConfig(
|
|
103
98
|
name="multi-wiki-qa-es",
|
|
104
|
-
pretty_name="
|
|
105
|
-
"
|
|
106
|
-
huggingface_id="EuroEval/multi-wiki-qa-es-mini",
|
|
99
|
+
pretty_name="MultiWikiQA-es",
|
|
100
|
+
source="EuroEval/multi-wiki-qa-es-mini",
|
|
107
101
|
task=RC,
|
|
108
|
-
languages=[
|
|
102
|
+
languages=[SPANISH],
|
|
109
103
|
unofficial=True,
|
|
110
104
|
)
|
|
111
105
|
|
|
112
106
|
GOLDENSWAG_ES_CONFIG = DatasetConfig(
|
|
113
107
|
name="goldenswag-es",
|
|
114
|
-
pretty_name="
|
|
115
|
-
"
|
|
116
|
-
huggingface_id="EuroEval/goldenswag-es-mini",
|
|
108
|
+
pretty_name="GoldenSwag-es",
|
|
109
|
+
source="EuroEval/goldenswag-es-mini",
|
|
117
110
|
task=COMMON_SENSE,
|
|
118
|
-
languages=[
|
|
111
|
+
languages=[SPANISH],
|
|
119
112
|
unofficial=True,
|
|
120
113
|
)
|
|
121
114
|
|
|
122
115
|
WINOGRANDE_ES_CONFIG = DatasetConfig(
|
|
123
116
|
name="winogrande-es",
|
|
124
|
-
pretty_name="
|
|
125
|
-
"
|
|
126
|
-
huggingface_id="EuroEval/winogrande-es",
|
|
117
|
+
pretty_name="Winogrande-es",
|
|
118
|
+
source="EuroEval/winogrande-es",
|
|
127
119
|
task=COMMON_SENSE,
|
|
128
|
-
languages=[
|
|
120
|
+
languages=[SPANISH],
|
|
129
121
|
_labels=["a", "b"],
|
|
130
122
|
unofficial=True,
|
|
131
123
|
)
|
|
132
|
-
|
|
133
|
-
EUROPEAN_VALUES_SITUATIONAL_ES_CONFIG = DatasetConfig(
|
|
134
|
-
name="european-values-situational-es",
|
|
135
|
-
pretty_name="the Spanish version of the European values evaluation dataset, where "
|
|
136
|
-
"the questions are phrased in a situational way",
|
|
137
|
-
huggingface_id="EuroEval/european-values-situational-es",
|
|
138
|
-
task=EUROPEAN_VALUES,
|
|
139
|
-
languages=[ES],
|
|
140
|
-
splits=["test"],
|
|
141
|
-
bootstrap_samples=False,
|
|
142
|
-
_instruction_prompt="{text}",
|
|
143
|
-
unofficial=True,
|
|
144
|
-
)
|
|
145
|
-
|
|
146
|
-
EUROPEAN_VALUES_COMPLETIONS_ES_CONFIG = DatasetConfig(
|
|
147
|
-
name="european-values-completions-es",
|
|
148
|
-
pretty_name="the Spanish version of the European values evaluation dataset, where "
|
|
149
|
-
"the questions are phrased as sentence completions",
|
|
150
|
-
huggingface_id="EuroEval/european-values-completions-es",
|
|
151
|
-
task=EUROPEAN_VALUES,
|
|
152
|
-
languages=[ES],
|
|
153
|
-
splits=["test"],
|
|
154
|
-
bootstrap_samples=False,
|
|
155
|
-
_instruction_prompt="{text}",
|
|
156
|
-
unofficial=True,
|
|
157
|
-
)
|
|
@@ -1,78 +1,73 @@
|
|
|
1
1
|
"""All Swedish dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
4
|
+
from ..languages import SWEDISH
|
|
5
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
SWEREC_CONFIG = DatasetConfig(
|
|
10
10
|
name="swerec",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/swerec-mini",
|
|
11
|
+
pretty_name="SweReC",
|
|
12
|
+
source="EuroEval/swerec-mini",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[SWEDISH],
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
SCALA_SV_CONFIG = DatasetConfig(
|
|
19
18
|
name="scala-sv",
|
|
20
|
-
pretty_name="
|
|
21
|
-
|
|
19
|
+
pretty_name="ScaLA-sv",
|
|
20
|
+
source="EuroEval/scala-sv",
|
|
22
21
|
task=LA,
|
|
23
|
-
languages=[
|
|
22
|
+
languages=[SWEDISH],
|
|
24
23
|
)
|
|
25
24
|
|
|
26
25
|
SUC3_CONFIG = DatasetConfig(
|
|
27
26
|
name="suc3",
|
|
28
|
-
pretty_name="
|
|
29
|
-
"
|
|
30
|
-
huggingface_id="EuroEval/suc3-mini",
|
|
27
|
+
pretty_name="SUC3",
|
|
28
|
+
source="EuroEval/suc3-mini",
|
|
31
29
|
task=NER,
|
|
32
|
-
languages=[
|
|
30
|
+
languages=[SWEDISH],
|
|
33
31
|
)
|
|
34
32
|
|
|
35
33
|
MULTI_WIKI_QA_SV_CONFIG = DatasetConfig(
|
|
36
34
|
name="multi-wiki-qa-sv",
|
|
37
|
-
pretty_name="
|
|
38
|
-
"
|
|
39
|
-
huggingface_id="EuroEval/multi-wiki-qa-sv-mini",
|
|
35
|
+
pretty_name="MultiWikiQA-sv",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-sv-mini",
|
|
40
37
|
task=RC,
|
|
41
|
-
languages=[
|
|
38
|
+
languages=[SWEDISH],
|
|
42
39
|
)
|
|
43
40
|
|
|
44
41
|
SWEDN_CONFIG = DatasetConfig(
|
|
45
42
|
name="swedn",
|
|
46
|
-
pretty_name="
|
|
47
|
-
|
|
43
|
+
pretty_name="SweDN",
|
|
44
|
+
source="EuroEval/swedn-mini",
|
|
48
45
|
task=SUMM,
|
|
49
|
-
languages=[
|
|
46
|
+
languages=[SWEDISH],
|
|
50
47
|
)
|
|
51
48
|
|
|
52
49
|
MMLU_SV_CONFIG = DatasetConfig(
|
|
53
50
|
name="mmlu-sv",
|
|
54
|
-
pretty_name="
|
|
55
|
-
"
|
|
56
|
-
huggingface_id="EuroEval/mmlu-sv-mini",
|
|
51
|
+
pretty_name="MMLU-sv",
|
|
52
|
+
source="EuroEval/mmlu-sv-mini",
|
|
57
53
|
task=KNOW,
|
|
58
|
-
languages=[
|
|
54
|
+
languages=[SWEDISH],
|
|
59
55
|
)
|
|
60
56
|
|
|
61
57
|
HELLASWAG_SV_CONFIG = DatasetConfig(
|
|
62
58
|
name="hellaswag-sv",
|
|
63
|
-
pretty_name="
|
|
64
|
-
"
|
|
65
|
-
huggingface_id="EuroEval/hellaswag-sv-mini",
|
|
59
|
+
pretty_name="HellaSwag-sv",
|
|
60
|
+
source="EuroEval/hellaswag-sv-mini",
|
|
66
61
|
task=COMMON_SENSE,
|
|
67
|
-
languages=[
|
|
62
|
+
languages=[SWEDISH],
|
|
68
63
|
)
|
|
69
64
|
|
|
70
|
-
|
|
71
|
-
name="
|
|
72
|
-
pretty_name="
|
|
73
|
-
|
|
65
|
+
VALEU_SV_CONFIG = DatasetConfig(
|
|
66
|
+
name="valeu-sv",
|
|
67
|
+
pretty_name="VaLEU-sv",
|
|
68
|
+
source="EuroEval/european-values-sv",
|
|
74
69
|
task=EUROPEAN_VALUES,
|
|
75
|
-
languages=[
|
|
70
|
+
languages=[SWEDISH],
|
|
76
71
|
splits=["test"],
|
|
77
72
|
bootstrap_samples=False,
|
|
78
73
|
_instruction_prompt="{text}",
|
|
@@ -83,95 +78,64 @@ EUROPEAN_VALUES_SV_CONFIG = DatasetConfig(
|
|
|
83
78
|
|
|
84
79
|
SCHIBSTED_SV_CONFIG = DatasetConfig(
|
|
85
80
|
name="schibsted-sv",
|
|
86
|
-
pretty_name="
|
|
87
|
-
|
|
81
|
+
pretty_name="Schibsted-sv",
|
|
82
|
+
source="EuroEval/schibsted-article-summaries-sv",
|
|
88
83
|
task=SUMM,
|
|
89
|
-
languages=[
|
|
84
|
+
languages=[SWEDISH],
|
|
90
85
|
unofficial=True,
|
|
91
86
|
)
|
|
92
87
|
|
|
93
88
|
ARC_SV_CONFIG = DatasetConfig(
|
|
94
89
|
name="arc-sv",
|
|
95
|
-
pretty_name="
|
|
96
|
-
"
|
|
97
|
-
huggingface_id="EuroEval/arc-sv-mini",
|
|
90
|
+
pretty_name="ARC-sv",
|
|
91
|
+
source="EuroEval/arc-sv-mini",
|
|
98
92
|
task=KNOW,
|
|
99
|
-
languages=[
|
|
93
|
+
languages=[SWEDISH],
|
|
100
94
|
unofficial=True,
|
|
101
95
|
)
|
|
102
96
|
|
|
103
97
|
BELEBELE_SV_CONFIG = DatasetConfig(
|
|
104
98
|
name="belebele-sv",
|
|
105
|
-
pretty_name="
|
|
106
|
-
"
|
|
107
|
-
huggingface_id="EuroEval/belebele-sv-mini",
|
|
99
|
+
pretty_name="Belebele-sv",
|
|
100
|
+
source="EuroEval/belebele-sv-mini",
|
|
108
101
|
task=MCRC,
|
|
109
|
-
languages=[
|
|
102
|
+
languages=[SWEDISH],
|
|
110
103
|
unofficial=True,
|
|
111
104
|
)
|
|
112
105
|
|
|
113
106
|
SCANDIQA_SV_CONFIG = DatasetConfig(
|
|
114
107
|
name="scandiqa-sv",
|
|
115
|
-
pretty_name="
|
|
116
|
-
"
|
|
117
|
-
huggingface_id="EuroEval/scandiqa-sv-mini",
|
|
108
|
+
pretty_name="ScandiQA-sv",
|
|
109
|
+
source="EuroEval/scandiqa-sv-mini",
|
|
118
110
|
task=RC,
|
|
119
|
-
languages=[
|
|
111
|
+
languages=[SWEDISH],
|
|
120
112
|
unofficial=True,
|
|
121
113
|
)
|
|
122
114
|
|
|
123
115
|
GOLDENSWAG_SV_CONFIG = DatasetConfig(
|
|
124
116
|
name="goldenswag-sv",
|
|
125
|
-
pretty_name="
|
|
126
|
-
"
|
|
127
|
-
huggingface_id="EuroEval/goldenswag-sv-mini",
|
|
117
|
+
pretty_name="GoldenSwag-sv",
|
|
118
|
+
source="EuroEval/goldenswag-sv-mini",
|
|
128
119
|
task=COMMON_SENSE,
|
|
129
|
-
languages=[
|
|
120
|
+
languages=[SWEDISH],
|
|
130
121
|
unofficial=True,
|
|
131
122
|
)
|
|
132
123
|
|
|
133
124
|
WINOGRANDE_SV_CONFIG = DatasetConfig(
|
|
134
125
|
name="winogrande-sv",
|
|
135
|
-
pretty_name="
|
|
136
|
-
"
|
|
137
|
-
huggingface_id="EuroEval/winogrande-sv",
|
|
126
|
+
pretty_name="Winogrande-sv",
|
|
127
|
+
source="EuroEval/winogrande-sv",
|
|
138
128
|
task=COMMON_SENSE,
|
|
139
|
-
languages=[
|
|
129
|
+
languages=[SWEDISH],
|
|
140
130
|
_labels=["a", "b"],
|
|
141
131
|
unofficial=True,
|
|
142
132
|
)
|
|
143
133
|
|
|
144
|
-
EUROPEAN_VALUES_SITUATIONAL_SV_CONFIG = DatasetConfig(
|
|
145
|
-
name="european-values-situational-sv",
|
|
146
|
-
pretty_name="the Swedish version of the European values evaluation dataset, where "
|
|
147
|
-
"the questions are phrased in a situational way",
|
|
148
|
-
huggingface_id="EuroEval/european-values-situational-sv",
|
|
149
|
-
task=EUROPEAN_VALUES,
|
|
150
|
-
languages=[SV],
|
|
151
|
-
splits=["test"],
|
|
152
|
-
bootstrap_samples=False,
|
|
153
|
-
_instruction_prompt="{text}",
|
|
154
|
-
unofficial=True,
|
|
155
|
-
)
|
|
156
|
-
|
|
157
|
-
EUROPEAN_VALUES_COMPLETIONS_SV_CONFIG = DatasetConfig(
|
|
158
|
-
name="european-values-completions-sv",
|
|
159
|
-
pretty_name="the Swedish version of the European values evaluation dataset, where "
|
|
160
|
-
"the questions are phrased as sentence completions",
|
|
161
|
-
huggingface_id="EuroEval/european-values-completions-sv",
|
|
162
|
-
task=EUROPEAN_VALUES,
|
|
163
|
-
languages=[SV],
|
|
164
|
-
splits=["test"],
|
|
165
|
-
bootstrap_samples=False,
|
|
166
|
-
_instruction_prompt="{text}",
|
|
167
|
-
unofficial=True,
|
|
168
|
-
)
|
|
169
|
-
|
|
170
134
|
SKOLPROV_CONFIG = DatasetConfig(
|
|
171
135
|
name="skolprov",
|
|
172
|
-
pretty_name="
|
|
173
|
-
|
|
136
|
+
pretty_name="Skolprov",
|
|
137
|
+
source="EuroEval/skolprov",
|
|
174
138
|
task=KNOW,
|
|
175
|
-
languages=[
|
|
139
|
+
languages=[SWEDISH],
|
|
176
140
|
unofficial=True,
|
|
177
141
|
)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""All Ukrainian dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import UKRAINIAN
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
CROSS_DOMAIN_UK_REVIEWS_CONFIG = DatasetConfig(
|
|
10
|
+
name="cross-domain-uk-reviews",
|
|
11
|
+
pretty_name="Cross Domain Ukrainian Reviews",
|
|
12
|
+
source="EuroEval/cross-domain-uk-reviews-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[UKRAINIAN],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
SCALA_UK_CONFIG = DatasetConfig(
|
|
18
|
+
name="scala-uk",
|
|
19
|
+
pretty_name="ScaLA-uk",
|
|
20
|
+
source="EuroEval/scala-uk",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[UKRAINIAN],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
NER_UK_CONFIG = DatasetConfig(
|
|
26
|
+
name="ner-uk",
|
|
27
|
+
pretty_name="NER-uk",
|
|
28
|
+
source="EuroEval/ner-uk-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[UKRAINIAN],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
MULTI_WIKI_QA_UK_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-uk",
|
|
35
|
+
pretty_name="MultiWikiQA-uk",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-uk-mini",
|
|
37
|
+
task=RC,
|
|
38
|
+
languages=[UKRAINIAN],
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
LR_SUM_UK_CONFIG = DatasetConfig(
|
|
42
|
+
name="lr-sum-uk",
|
|
43
|
+
pretty_name="LRSum-uk",
|
|
44
|
+
source="EuroEval/lr-sum-uk-mini",
|
|
45
|
+
task=SUMM,
|
|
46
|
+
languages=[UKRAINIAN],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
GLOBAL_MMLU_UK_CONFIG = DatasetConfig(
|
|
50
|
+
name="global-mmlu-uk",
|
|
51
|
+
pretty_name="GlobalMMLU-uk",
|
|
52
|
+
source="EuroEval/global-mmlu-uk-mini",
|
|
53
|
+
task=KNOW,
|
|
54
|
+
languages=[UKRAINIAN],
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
WINOGRANDE_UK_CONFIG = DatasetConfig(
|
|
58
|
+
name="winogrande-uk",
|
|
59
|
+
pretty_name="Winogrande-uk",
|
|
60
|
+
source="EuroEval/winogrande-uk",
|
|
61
|
+
task=COMMON_SENSE,
|
|
62
|
+
languages=[UKRAINIAN],
|
|
63
|
+
_labels=["a", "b"],
|
|
64
|
+
)
|
euroeval/exceptions.py
CHANGED
|
@@ -145,7 +145,7 @@ class NeedsAdditionalArgument(InvalidModel):
|
|
|
145
145
|
else:
|
|
146
146
|
self.message = (
|
|
147
147
|
f"The model you are trying to load requires the `{script_argument}` "
|
|
148
|
-
"argument
|
|
148
|
+
"argument to be passed to the `Benchmarker` class. Please pass the "
|
|
149
149
|
"argument and try again."
|
|
150
150
|
)
|
|
151
151
|
super().__init__(self.message)
|
euroeval/finetuning.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
"""Functions related to the finetuning of models."""
|
|
2
2
|
|
|
3
|
+
import collections.abc as c
|
|
3
4
|
import logging
|
|
4
5
|
import sys
|
|
5
6
|
import typing as t
|
|
@@ -30,11 +31,11 @@ if t.TYPE_CHECKING:
|
|
|
30
31
|
|
|
31
32
|
def finetune(
|
|
32
33
|
model: "BenchmarkModule",
|
|
33
|
-
datasets:
|
|
34
|
+
datasets: c.Sequence["DatasetDict"],
|
|
34
35
|
model_config: "ModelConfig",
|
|
35
36
|
dataset_config: "DatasetConfig",
|
|
36
37
|
benchmark_config: "BenchmarkConfig",
|
|
37
|
-
) ->
|
|
38
|
+
) -> c.Sequence[dict[str, float]]:
|
|
38
39
|
"""Evaluate a model on a dataset through finetuning.
|
|
39
40
|
|
|
40
41
|
Args:
|