EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +32 -14
- euroeval/benchmark_config_factory.py +92 -180
- euroeval/benchmark_modules/base.py +49 -39
- euroeval/benchmark_modules/fresh.py +35 -21
- euroeval/benchmark_modules/hf.py +280 -244
- euroeval/benchmark_modules/litellm.py +752 -312
- euroeval/benchmark_modules/vllm.py +570 -268
- euroeval/benchmarker.py +651 -528
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +49 -38
- euroeval/constants.py +44 -25
- euroeval/data_loading.py +111 -55
- euroeval/data_models.py +490 -323
- euroeval/dataset_configs/__init__.py +26 -4
- euroeval/dataset_configs/bosnian.py +39 -0
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/croatian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +78 -50
- euroeval/dataset_configs/dutch.py +74 -44
- euroeval/dataset_configs/english.py +71 -36
- euroeval/dataset_configs/estonian.py +111 -0
- euroeval/dataset_configs/faroese.py +25 -18
- euroeval/dataset_configs/finnish.py +63 -26
- euroeval/dataset_configs/french.py +65 -32
- euroeval/dataset_configs/german.py +77 -36
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +68 -57
- euroeval/dataset_configs/italian.py +68 -36
- euroeval/dataset_configs/latvian.py +87 -0
- euroeval/dataset_configs/lithuanian.py +64 -0
- euroeval/dataset_configs/norwegian.py +98 -72
- euroeval/dataset_configs/polish.py +96 -0
- euroeval/dataset_configs/portuguese.py +63 -40
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/slovene.py +56 -0
- euroeval/dataset_configs/spanish.py +68 -34
- euroeval/dataset_configs/swedish.py +82 -41
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/enums.py +12 -6
- euroeval/exceptions.py +21 -1
- euroeval/finetuning.py +34 -26
- euroeval/generation.py +76 -41
- euroeval/generation_utils.py +169 -34
- euroeval/languages.py +1020 -188
- euroeval/logging_utils.py +268 -0
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +85 -0
- euroeval/metrics/huggingface.py +216 -0
- euroeval/metrics/llm_as_a_judge.py +260 -0
- euroeval/metrics/pipeline.py +289 -0
- euroeval/metrics/speed.py +48 -0
- euroeval/model_cache.py +40 -21
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +157 -22
- euroeval/prompt_templates/multiple_choice.py +159 -17
- euroeval/prompt_templates/named_entity_recognition.py +318 -21
- euroeval/prompt_templates/reading_comprehension.py +207 -16
- euroeval/prompt_templates/sentiment_classification.py +205 -22
- euroeval/prompt_templates/summarization.py +122 -22
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +20 -9
- euroeval/speed_benchmark.py +11 -12
- euroeval/task_group_utils/multiple_choice_classification.py +21 -12
- euroeval/task_group_utils/question_answering.py +101 -73
- euroeval/task_group_utils/sequence_classification.py +144 -61
- euroeval/task_group_utils/text_to_text.py +33 -12
- euroeval/task_group_utils/token_classification.py +86 -89
- euroeval/tasks.py +75 -16
- euroeval/tokenisation_utils.py +603 -0
- euroeval/types.py +17 -11
- euroeval/utils.py +332 -137
- euroeval-16.7.1.dist-info/METADATA +623 -0
- euroeval-16.7.1.dist-info/RECORD +84 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -737
- euroeval/metrics.py +0 -452
- euroeval/tokenization_utils.py +0 -498
- euroeval-15.12.0.dist-info/METADATA +0 -285
- euroeval-15.12.0.dist-info/RECORD +0 -63
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,91 +1,132 @@
|
|
|
1
1
|
"""All German dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
5
|
-
from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
4
|
+
from ..languages import GERMAN
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
SB10K_CONFIG = DatasetConfig(
|
|
10
10
|
name="sb10k",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/sb10k-mini",
|
|
11
|
+
pretty_name="SB10K",
|
|
12
|
+
source="EuroEval/sb10k-mini",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[GERMAN],
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
SCALA_DE_CONFIG = DatasetConfig(
|
|
19
18
|
name="scala-de",
|
|
20
|
-
pretty_name="
|
|
21
|
-
|
|
19
|
+
pretty_name="ScaLA-de",
|
|
20
|
+
source="EuroEval/scala-de",
|
|
22
21
|
task=LA,
|
|
23
|
-
languages=[
|
|
22
|
+
languages=[GERMAN],
|
|
24
23
|
)
|
|
25
24
|
|
|
26
25
|
GERMEVAL_CONFIG = DatasetConfig(
|
|
27
26
|
name="germeval",
|
|
28
|
-
pretty_name="
|
|
29
|
-
"
|
|
30
|
-
huggingface_id="EuroEval/germeval-mini",
|
|
27
|
+
pretty_name="GermEval",
|
|
28
|
+
source="EuroEval/germeval-mini",
|
|
31
29
|
task=NER,
|
|
32
|
-
languages=[
|
|
30
|
+
languages=[GERMAN],
|
|
33
31
|
)
|
|
34
32
|
|
|
35
33
|
GERMANQUAD_CONFIG = DatasetConfig(
|
|
36
34
|
name="germanquad",
|
|
37
|
-
pretty_name="
|
|
38
|
-
"
|
|
39
|
-
huggingface_id="EuroEval/germanquad-mini",
|
|
35
|
+
pretty_name="GermanQuAD",
|
|
36
|
+
source="EuroEval/germanquad-mini",
|
|
40
37
|
task=RC,
|
|
41
|
-
languages=[
|
|
38
|
+
languages=[GERMAN],
|
|
42
39
|
)
|
|
43
40
|
|
|
44
41
|
MLSUM_DE_CONFIG = DatasetConfig(
|
|
45
42
|
name="mlsum-de",
|
|
46
|
-
pretty_name="
|
|
47
|
-
|
|
43
|
+
pretty_name="MLSUM-de",
|
|
44
|
+
source="EuroEval/mlsum-mini",
|
|
48
45
|
task=SUMM,
|
|
49
|
-
languages=[
|
|
46
|
+
languages=[GERMAN],
|
|
50
47
|
)
|
|
51
48
|
|
|
52
49
|
MMLU_DE_CONFIG = DatasetConfig(
|
|
53
50
|
name="mmlu-de",
|
|
54
|
-
pretty_name="
|
|
55
|
-
"
|
|
56
|
-
huggingface_id="EuroEval/mmlu-de-mini",
|
|
51
|
+
pretty_name="MMLU-de",
|
|
52
|
+
source="EuroEval/mmlu-de-mini",
|
|
57
53
|
task=KNOW,
|
|
58
|
-
languages=[
|
|
54
|
+
languages=[GERMAN],
|
|
59
55
|
)
|
|
60
56
|
|
|
61
57
|
HELLASWAG_DE_CONFIG = DatasetConfig(
|
|
62
58
|
name="hellaswag-de",
|
|
63
|
-
pretty_name="
|
|
64
|
-
"
|
|
65
|
-
huggingface_id="EuroEval/hellaswag-de-mini",
|
|
59
|
+
pretty_name="HellaSwag-de",
|
|
60
|
+
source="EuroEval/hellaswag-de-mini",
|
|
66
61
|
task=COMMON_SENSE,
|
|
67
|
-
languages=[
|
|
62
|
+
languages=[GERMAN],
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
VALEU_DE_CONFIG = DatasetConfig(
|
|
66
|
+
name="valeu-de",
|
|
67
|
+
pretty_name="VaLEU-de",
|
|
68
|
+
source="EuroEval/european-values-de",
|
|
69
|
+
task=EUROPEAN_VALUES,
|
|
70
|
+
languages=[GERMAN],
|
|
71
|
+
splits=["test"],
|
|
72
|
+
bootstrap_samples=False,
|
|
73
|
+
_instruction_prompt="{text}",
|
|
68
74
|
)
|
|
69
75
|
|
|
70
76
|
|
|
71
77
|
### Unofficial datasets ###
|
|
72
78
|
|
|
79
|
+
XQUAD_DE_CONFIG = DatasetConfig(
|
|
80
|
+
name="xquad-de",
|
|
81
|
+
pretty_name="XQuAD-de",
|
|
82
|
+
source="EuroEval/xquad-de",
|
|
83
|
+
task=RC,
|
|
84
|
+
languages=[GERMAN],
|
|
85
|
+
unofficial=True,
|
|
86
|
+
)
|
|
87
|
+
|
|
73
88
|
ARC_DE_CONFIG = DatasetConfig(
|
|
74
89
|
name="arc-de",
|
|
75
|
-
pretty_name="
|
|
76
|
-
"
|
|
77
|
-
huggingface_id="EuroEval/arc-de-mini",
|
|
90
|
+
pretty_name="ARC-de",
|
|
91
|
+
source="EuroEval/arc-de-mini",
|
|
78
92
|
task=KNOW,
|
|
79
|
-
languages=[
|
|
93
|
+
languages=[GERMAN],
|
|
80
94
|
unofficial=True,
|
|
81
95
|
)
|
|
82
96
|
|
|
83
97
|
BELEBELE_DE_CONFIG = DatasetConfig(
|
|
84
98
|
name="belebele-de",
|
|
85
|
-
pretty_name="
|
|
86
|
-
"
|
|
87
|
-
huggingface_id="EuroEval/belebele-de-mini",
|
|
99
|
+
pretty_name="Belebele-de",
|
|
100
|
+
source="EuroEval/belebele-de-mini",
|
|
88
101
|
task=MCRC,
|
|
89
|
-
languages=[
|
|
102
|
+
languages=[GERMAN],
|
|
103
|
+
unofficial=True,
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
MULTI_WIKI_QA_DE_CONFIG = DatasetConfig(
|
|
107
|
+
name="multi-wiki-qa-de",
|
|
108
|
+
pretty_name="MultiWikiQA-de",
|
|
109
|
+
source="EuroEval/multi-wiki-qa-de-mini",
|
|
110
|
+
task=RC,
|
|
111
|
+
languages=[GERMAN],
|
|
112
|
+
unofficial=True,
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
GOLDENSWAG_DE_CONFIG = DatasetConfig(
|
|
116
|
+
name="goldenswag-de",
|
|
117
|
+
pretty_name="GoldenSwag-de",
|
|
118
|
+
source="EuroEval/goldenswag-de-mini",
|
|
119
|
+
task=COMMON_SENSE,
|
|
120
|
+
languages=[GERMAN],
|
|
121
|
+
unofficial=True,
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
WINOGRANDE_DE_CONFIG = DatasetConfig(
|
|
125
|
+
name="winogrande-de",
|
|
126
|
+
pretty_name="Winogrande-de",
|
|
127
|
+
source="EuroEval/winogrande-de",
|
|
128
|
+
task=COMMON_SENSE,
|
|
129
|
+
languages=[GERMAN],
|
|
130
|
+
_labels=["a", "b"],
|
|
90
131
|
unofficial=True,
|
|
91
132
|
)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""All Greek dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import GREEK
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
GREEK_SA_CONFIG = DatasetConfig(
|
|
10
|
+
name="greek-sa",
|
|
11
|
+
pretty_name="Greek Sentiment Analysis",
|
|
12
|
+
source="EuroEval/greek-sa-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[GREEK],
|
|
15
|
+
_labels=["negative", "positive"],
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
SCALA_EL_CONFIG = DatasetConfig(
|
|
19
|
+
name="scala-el",
|
|
20
|
+
pretty_name="ScaLA-el",
|
|
21
|
+
source="EuroEval/scala-el",
|
|
22
|
+
task=LA,
|
|
23
|
+
languages=[GREEK],
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
ELNER_CONFIG = DatasetConfig(
|
|
27
|
+
name="elner",
|
|
28
|
+
pretty_name="ElNER",
|
|
29
|
+
source="EuroEval/elner-mini",
|
|
30
|
+
task=NER,
|
|
31
|
+
languages=[GREEK],
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
MULTI_WIKI_QA_EL_CONFIG = DatasetConfig(
|
|
35
|
+
name="multi-wiki-qa-el",
|
|
36
|
+
pretty_name="MultiWikiQA-el",
|
|
37
|
+
source="EuroEval/multi-wiki-qa-el-mini",
|
|
38
|
+
task=RC,
|
|
39
|
+
languages=[GREEK],
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
GREEK_WIKIPEDIA_CONFIG = DatasetConfig(
|
|
43
|
+
name="greek-wikipedia",
|
|
44
|
+
pretty_name="Greek Wikipedia",
|
|
45
|
+
source="EuroEval/greek-wikipedia-mini",
|
|
46
|
+
task=SUMM,
|
|
47
|
+
languages=[GREEK],
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
GLOBAL_MMLU_EL_CONFIG = DatasetConfig(
|
|
51
|
+
name="global-mmlu-el",
|
|
52
|
+
pretty_name="GlobalMMLU-el",
|
|
53
|
+
source="EuroEval/global-mmlu-el-mini",
|
|
54
|
+
task=KNOW,
|
|
55
|
+
languages=[GREEK],
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
WINOGRANDE_EL_CONFIG = DatasetConfig(
|
|
59
|
+
name="winogrande-el",
|
|
60
|
+
pretty_name="Winogrande-el",
|
|
61
|
+
source="EuroEval/winogrande-el",
|
|
62
|
+
task=COMMON_SENSE,
|
|
63
|
+
languages=[GREEK],
|
|
64
|
+
)
|
|
@@ -1,71 +1,77 @@
|
|
|
1
1
|
"""All Icelandic dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
5
|
-
from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
4
|
+
from ..languages import ICELANDIC
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
HOTTER_AND_COLDER_SENTIMENT_CONFIG = DatasetConfig(
|
|
10
10
|
name="hotter-and-colder-sentiment",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"and
|
|
13
|
-
huggingface_id="EuroEval/hotter-and-colder-sentiment",
|
|
11
|
+
pretty_name="Hotter and Colder Sentiment",
|
|
12
|
+
source="EuroEval/hotter-and-colder-sentiment",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[ICELANDIC],
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
SCALA_IS_CONFIG = DatasetConfig(
|
|
19
18
|
name="scala-is",
|
|
20
|
-
pretty_name="
|
|
21
|
-
|
|
19
|
+
pretty_name="ScaLA-is",
|
|
20
|
+
source="EuroEval/scala-is",
|
|
22
21
|
task=LA,
|
|
23
|
-
languages=[
|
|
22
|
+
languages=[ICELANDIC],
|
|
24
23
|
)
|
|
25
24
|
|
|
26
25
|
MIM_GOLD_NER_CONFIG = DatasetConfig(
|
|
27
26
|
name="mim-gold-ner",
|
|
28
|
-
pretty_name="
|
|
29
|
-
"
|
|
30
|
-
huggingface_id="EuroEval/mim-gold-ner-mini",
|
|
27
|
+
pretty_name="MIM-GOLD-NER",
|
|
28
|
+
source="EuroEval/mim-gold-ner-mini",
|
|
31
29
|
task=NER,
|
|
32
|
-
languages=[
|
|
30
|
+
languages=[ICELANDIC],
|
|
33
31
|
)
|
|
34
32
|
|
|
35
33
|
NQII_CONFIG = DatasetConfig(
|
|
36
34
|
name="nqii",
|
|
37
|
-
pretty_name="
|
|
38
|
-
"
|
|
39
|
-
huggingface_id="EuroEval/nqii-mini",
|
|
35
|
+
pretty_name="NQiI",
|
|
36
|
+
source="EuroEval/nqii-mini",
|
|
40
37
|
task=RC,
|
|
41
|
-
languages=[
|
|
38
|
+
languages=[ICELANDIC],
|
|
42
39
|
)
|
|
43
40
|
|
|
44
41
|
RRN_CONFIG = DatasetConfig(
|
|
45
42
|
name="rrn",
|
|
46
|
-
pretty_name="
|
|
47
|
-
"
|
|
48
|
-
huggingface_id="EuroEval/rrn-mini",
|
|
43
|
+
pretty_name="RRN",
|
|
44
|
+
source="EuroEval/rrn-mini",
|
|
49
45
|
task=SUMM,
|
|
50
|
-
languages=[
|
|
46
|
+
languages=[ICELANDIC],
|
|
51
47
|
)
|
|
52
48
|
|
|
53
49
|
ICELANDIC_KNOWLEDGE_CONFIG = DatasetConfig(
|
|
54
50
|
name="icelandic-knowledge",
|
|
55
|
-
pretty_name="
|
|
56
|
-
"
|
|
57
|
-
huggingface_id="EuroEval/icelandic-knowledge",
|
|
51
|
+
pretty_name="Icelandic Knowledge",
|
|
52
|
+
source="EuroEval/icelandic-knowledge",
|
|
58
53
|
task=KNOW,
|
|
59
|
-
languages=[
|
|
54
|
+
languages=[ICELANDIC],
|
|
60
55
|
)
|
|
61
56
|
|
|
62
57
|
WINOGRANDE_IS_CONFIG = DatasetConfig(
|
|
63
58
|
name="winogrande-is",
|
|
64
|
-
pretty_name="
|
|
65
|
-
"
|
|
66
|
-
huggingface_id="EuroEval/winogrande-is",
|
|
59
|
+
pretty_name="Winogrande-is",
|
|
60
|
+
source="EuroEval/winogrande-is",
|
|
67
61
|
task=COMMON_SENSE,
|
|
68
|
-
languages=[
|
|
62
|
+
languages=[ICELANDIC],
|
|
63
|
+
_labels=["a", "b"],
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
VALEU_IS_CONFIG = DatasetConfig(
|
|
67
|
+
name="valeu-is",
|
|
68
|
+
pretty_name="VaLEU-is",
|
|
69
|
+
source="EuroEval/european-values-is",
|
|
70
|
+
task=EUROPEAN_VALUES,
|
|
71
|
+
languages=[ICELANDIC],
|
|
72
|
+
splits=["test"],
|
|
73
|
+
bootstrap_samples=False,
|
|
74
|
+
_instruction_prompt="{text}",
|
|
69
75
|
)
|
|
70
76
|
|
|
71
77
|
|
|
@@ -73,76 +79,81 @@ WINOGRANDE_IS_CONFIG = DatasetConfig(
|
|
|
73
79
|
|
|
74
80
|
ICE_EC_CONFIG = DatasetConfig(
|
|
75
81
|
name="ice-ec",
|
|
76
|
-
pretty_name="
|
|
77
|
-
|
|
82
|
+
pretty_name="ICE-EC",
|
|
83
|
+
source="EuroEval/ice-ec",
|
|
78
84
|
task=LA,
|
|
79
|
-
languages=[
|
|
85
|
+
languages=[ICELANDIC],
|
|
80
86
|
unofficial=True,
|
|
81
87
|
)
|
|
82
88
|
|
|
83
89
|
ICE_EC_FULL_CONFIG = DatasetConfig(
|
|
84
90
|
name="ice-ec-full",
|
|
85
|
-
pretty_name="
|
|
86
|
-
|
|
91
|
+
pretty_name="ICE-EC Full",
|
|
92
|
+
source="EuroEval/ice-ec-full",
|
|
87
93
|
task=LA,
|
|
88
|
-
languages=[
|
|
94
|
+
languages=[ICELANDIC],
|
|
89
95
|
unofficial=True,
|
|
90
96
|
)
|
|
91
97
|
|
|
92
98
|
ICE_LINGUISTIC_CONFIG = DatasetConfig(
|
|
93
99
|
name="ice-linguistic",
|
|
94
|
-
pretty_name="
|
|
95
|
-
|
|
100
|
+
pretty_name="IceLinguistic",
|
|
101
|
+
source="EuroEval/ice-linguistic",
|
|
96
102
|
task=LA,
|
|
97
|
-
languages=[
|
|
103
|
+
languages=[ICELANDIC],
|
|
98
104
|
unofficial=True,
|
|
99
105
|
)
|
|
100
106
|
|
|
101
107
|
ICELANDIC_QA_CONFIG = DatasetConfig(
|
|
102
108
|
name="icelandic-qa",
|
|
103
|
-
pretty_name="
|
|
104
|
-
|
|
109
|
+
pretty_name="Icelandic QA",
|
|
110
|
+
source="EuroEval/icelandic-qa",
|
|
105
111
|
task=RC,
|
|
106
|
-
languages=[
|
|
112
|
+
languages=[ICELANDIC],
|
|
107
113
|
unofficial=True,
|
|
108
114
|
)
|
|
109
115
|
|
|
110
116
|
MMLU_IS_CONFIG = DatasetConfig(
|
|
111
117
|
name="mmlu-is",
|
|
112
|
-
pretty_name="
|
|
113
|
-
"
|
|
114
|
-
huggingface_id="EuroEval/mmlu-is-mini",
|
|
118
|
+
pretty_name="MMLU-is",
|
|
119
|
+
source="EuroEval/mmlu-is-mini",
|
|
115
120
|
task=KNOW,
|
|
116
|
-
languages=[
|
|
121
|
+
languages=[ICELANDIC],
|
|
117
122
|
unofficial=True,
|
|
118
123
|
)
|
|
119
124
|
|
|
120
125
|
ARC_IS_CONFIG = DatasetConfig(
|
|
121
126
|
name="arc-is",
|
|
122
|
-
pretty_name="
|
|
123
|
-
"
|
|
124
|
-
huggingface_id="EuroEval/arc-is-mini",
|
|
127
|
+
pretty_name="ARC-is",
|
|
128
|
+
source="EuroEval/arc-is-mini",
|
|
125
129
|
task=KNOW,
|
|
126
|
-
languages=[
|
|
130
|
+
languages=[ICELANDIC],
|
|
127
131
|
unofficial=True,
|
|
128
132
|
)
|
|
129
133
|
|
|
130
134
|
HELLASWAG_IS_CONFIG = DatasetConfig(
|
|
131
135
|
name="hellaswag-is",
|
|
132
|
-
pretty_name="
|
|
133
|
-
"
|
|
134
|
-
huggingface_id="EuroEval/hellaswag-is-mini",
|
|
136
|
+
pretty_name="HellaSwag-is",
|
|
137
|
+
source="EuroEval/hellaswag-is-mini",
|
|
135
138
|
task=COMMON_SENSE,
|
|
136
|
-
languages=[
|
|
139
|
+
languages=[ICELANDIC],
|
|
137
140
|
unofficial=True,
|
|
138
141
|
)
|
|
139
142
|
|
|
140
143
|
BELEBELE_IS_CONFIG = DatasetConfig(
|
|
141
144
|
name="belebele-is",
|
|
142
|
-
pretty_name="
|
|
143
|
-
"
|
|
144
|
-
huggingface_id="EuroEval/belebele-is-mini",
|
|
145
|
+
pretty_name="Belebele-is",
|
|
146
|
+
source="EuroEval/belebele-is-mini",
|
|
145
147
|
task=MCRC,
|
|
146
|
-
languages=[
|
|
148
|
+
languages=[ICELANDIC],
|
|
149
|
+
unofficial=True,
|
|
150
|
+
)
|
|
151
|
+
|
|
152
|
+
MULTI_WIKI_QA_IS_CONFIG = DatasetConfig(
|
|
153
|
+
name="multi-wiki-qa-is",
|
|
154
|
+
pretty_name="MultiWikiQA-is",
|
|
155
|
+
source="EuroEval/multi-wiki-qa-is-mini",
|
|
156
|
+
task=RC,
|
|
157
|
+
languages=[ICELANDIC],
|
|
147
158
|
unofficial=True,
|
|
148
159
|
)
|
|
@@ -1,70 +1,76 @@
|
|
|
1
1
|
"""All Italian dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
5
|
-
from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
4
|
+
from ..languages import ITALIAN
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
SENTIPOLC_CONFIG = DatasetConfig(
|
|
10
10
|
name="sentipolc16",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/sentipolc16-mini",
|
|
11
|
+
pretty_name="Sentipolc16",
|
|
12
|
+
source="EuroEval/sentipolc16-mini",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[ITALIAN],
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
SCALA_IT_CONFIG = DatasetConfig(
|
|
19
18
|
name="scala-it",
|
|
20
|
-
pretty_name="
|
|
21
|
-
|
|
19
|
+
pretty_name="ScaLA-it",
|
|
20
|
+
source="EuroEval/scala-it",
|
|
22
21
|
task=LA,
|
|
23
|
-
languages=[
|
|
22
|
+
languages=[ITALIAN],
|
|
24
23
|
)
|
|
25
24
|
|
|
26
25
|
MULTINERD_IT_CONFIG = DatasetConfig(
|
|
27
26
|
name="multinerd-it",
|
|
28
|
-
pretty_name="
|
|
29
|
-
"
|
|
30
|
-
huggingface_id="EuroEval/multinerd-mini-it",
|
|
27
|
+
pretty_name="MultiNERD-it",
|
|
28
|
+
source="EuroEval/multinerd-mini-it",
|
|
31
29
|
task=NER,
|
|
32
|
-
languages=[
|
|
30
|
+
languages=[ITALIAN],
|
|
33
31
|
)
|
|
34
32
|
|
|
35
33
|
SQUAD_IT_CONFIG = DatasetConfig(
|
|
36
34
|
name="squad-it",
|
|
37
|
-
pretty_name="
|
|
38
|
-
"
|
|
39
|
-
huggingface_id="EuroEval/squad-it-mini",
|
|
35
|
+
pretty_name="SQuAD-it",
|
|
36
|
+
source="EuroEval/squad-it-mini",
|
|
40
37
|
task=RC,
|
|
41
|
-
languages=[
|
|
38
|
+
languages=[ITALIAN],
|
|
42
39
|
)
|
|
43
40
|
|
|
44
41
|
ILPOST_SUM_CONFIG = DatasetConfig(
|
|
45
42
|
name="ilpost-sum",
|
|
46
|
-
pretty_name="
|
|
47
|
-
|
|
43
|
+
pretty_name="IlPost-Sum",
|
|
44
|
+
source="EuroEval/ilpost-sum",
|
|
48
45
|
task=SUMM,
|
|
49
|
-
languages=[
|
|
46
|
+
languages=[ITALIAN],
|
|
50
47
|
)
|
|
51
48
|
|
|
52
49
|
MMLU_IT_CONFIG = DatasetConfig(
|
|
53
50
|
name="mmlu-it",
|
|
54
|
-
pretty_name="
|
|
55
|
-
"
|
|
56
|
-
huggingface_id="EuroEval/mmlu-it-mini",
|
|
51
|
+
pretty_name="MMLU-it",
|
|
52
|
+
source="EuroEval/mmlu-it-mini",
|
|
57
53
|
task=KNOW,
|
|
58
|
-
languages=[
|
|
54
|
+
languages=[ITALIAN],
|
|
59
55
|
)
|
|
60
56
|
|
|
61
57
|
HELLASWAG_IT_CONFIG = DatasetConfig(
|
|
62
58
|
name="hellaswag-it",
|
|
63
|
-
pretty_name="
|
|
64
|
-
"
|
|
65
|
-
huggingface_id="EuroEval/hellaswag-it-mini",
|
|
59
|
+
pretty_name="HellaSwag-it",
|
|
60
|
+
source="EuroEval/hellaswag-it-mini",
|
|
66
61
|
task=COMMON_SENSE,
|
|
67
|
-
languages=[
|
|
62
|
+
languages=[ITALIAN],
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
VALEU_IT_CONFIG = DatasetConfig(
|
|
66
|
+
name="valeu-it",
|
|
67
|
+
pretty_name="VaLEU-it",
|
|
68
|
+
source="EuroEval/european-values-it",
|
|
69
|
+
task=EUROPEAN_VALUES,
|
|
70
|
+
languages=[ITALIAN],
|
|
71
|
+
splits=["test"],
|
|
72
|
+
bootstrap_samples=False,
|
|
73
|
+
_instruction_prompt="{text}",
|
|
68
74
|
)
|
|
69
75
|
|
|
70
76
|
|
|
@@ -72,20 +78,46 @@ HELLASWAG_IT_CONFIG = DatasetConfig(
|
|
|
72
78
|
|
|
73
79
|
WIKINEURAL_IT_CONFIG = DatasetConfig(
|
|
74
80
|
name="wikineural-it",
|
|
75
|
-
pretty_name="
|
|
76
|
-
"
|
|
77
|
-
huggingface_id="EuroEval/wikineural-mini-it",
|
|
81
|
+
pretty_name="WikiNeural-it",
|
|
82
|
+
source="EuroEval/wikineural-mini-it",
|
|
78
83
|
task=NER,
|
|
79
|
-
languages=[
|
|
84
|
+
languages=[ITALIAN],
|
|
80
85
|
unofficial=True,
|
|
81
86
|
)
|
|
82
87
|
|
|
83
88
|
BELEBELE_IT_CONFIG = DatasetConfig(
|
|
84
89
|
name="belebele-it",
|
|
85
|
-
pretty_name="
|
|
86
|
-
"
|
|
87
|
-
huggingface_id="EuroEval/belebele-it-mini",
|
|
90
|
+
pretty_name="Belebele-it",
|
|
91
|
+
source="EuroEval/belebele-it-mini",
|
|
88
92
|
task=MCRC,
|
|
89
|
-
languages=[
|
|
93
|
+
languages=[ITALIAN],
|
|
94
|
+
unofficial=True,
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
MULTI_WIKI_QA_IT_CONFIG = DatasetConfig(
|
|
98
|
+
name="multi-wiki-qa-it",
|
|
99
|
+
pretty_name="MultiWikiQA-it",
|
|
100
|
+
source="EuroEval/multi-wiki-qa-it-mini",
|
|
101
|
+
task=RC,
|
|
102
|
+
languages=[ITALIAN],
|
|
103
|
+
unofficial=True,
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
GOLDENSWAG_IT_CONFIG = DatasetConfig(
|
|
107
|
+
name="goldenswag-it",
|
|
108
|
+
pretty_name="GoldenSwag-it",
|
|
109
|
+
source="EuroEval/goldenswag-it-mini",
|
|
110
|
+
task=COMMON_SENSE,
|
|
111
|
+
languages=[ITALIAN],
|
|
112
|
+
unofficial=True,
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
WINOGRANDE_IT_CONFIG = DatasetConfig(
|
|
116
|
+
name="winogrande-it",
|
|
117
|
+
pretty_name="Winogrande-it",
|
|
118
|
+
source="EuroEval/winogrande-it",
|
|
119
|
+
task=COMMON_SENSE,
|
|
120
|
+
languages=[ITALIAN],
|
|
121
|
+
_labels=["a", "b"],
|
|
90
122
|
unofficial=True,
|
|
91
123
|
)
|