EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +32 -14
- euroeval/benchmark_config_factory.py +92 -180
- euroeval/benchmark_modules/base.py +49 -39
- euroeval/benchmark_modules/fresh.py +35 -21
- euroeval/benchmark_modules/hf.py +280 -244
- euroeval/benchmark_modules/litellm.py +752 -312
- euroeval/benchmark_modules/vllm.py +570 -268
- euroeval/benchmarker.py +651 -528
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +49 -38
- euroeval/constants.py +44 -25
- euroeval/data_loading.py +111 -55
- euroeval/data_models.py +490 -323
- euroeval/dataset_configs/__init__.py +26 -4
- euroeval/dataset_configs/bosnian.py +39 -0
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/croatian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +78 -50
- euroeval/dataset_configs/dutch.py +74 -44
- euroeval/dataset_configs/english.py +71 -36
- euroeval/dataset_configs/estonian.py +111 -0
- euroeval/dataset_configs/faroese.py +25 -18
- euroeval/dataset_configs/finnish.py +63 -26
- euroeval/dataset_configs/french.py +65 -32
- euroeval/dataset_configs/german.py +77 -36
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +68 -57
- euroeval/dataset_configs/italian.py +68 -36
- euroeval/dataset_configs/latvian.py +87 -0
- euroeval/dataset_configs/lithuanian.py +64 -0
- euroeval/dataset_configs/norwegian.py +98 -72
- euroeval/dataset_configs/polish.py +96 -0
- euroeval/dataset_configs/portuguese.py +63 -40
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/slovene.py +56 -0
- euroeval/dataset_configs/spanish.py +68 -34
- euroeval/dataset_configs/swedish.py +82 -41
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/enums.py +12 -6
- euroeval/exceptions.py +21 -1
- euroeval/finetuning.py +34 -26
- euroeval/generation.py +76 -41
- euroeval/generation_utils.py +169 -34
- euroeval/languages.py +1020 -188
- euroeval/logging_utils.py +268 -0
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +85 -0
- euroeval/metrics/huggingface.py +216 -0
- euroeval/metrics/llm_as_a_judge.py +260 -0
- euroeval/metrics/pipeline.py +289 -0
- euroeval/metrics/speed.py +48 -0
- euroeval/model_cache.py +40 -21
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +157 -22
- euroeval/prompt_templates/multiple_choice.py +159 -17
- euroeval/prompt_templates/named_entity_recognition.py +318 -21
- euroeval/prompt_templates/reading_comprehension.py +207 -16
- euroeval/prompt_templates/sentiment_classification.py +205 -22
- euroeval/prompt_templates/summarization.py +122 -22
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +20 -9
- euroeval/speed_benchmark.py +11 -12
- euroeval/task_group_utils/multiple_choice_classification.py +21 -12
- euroeval/task_group_utils/question_answering.py +101 -73
- euroeval/task_group_utils/sequence_classification.py +144 -61
- euroeval/task_group_utils/text_to_text.py +33 -12
- euroeval/task_group_utils/token_classification.py +86 -89
- euroeval/tasks.py +75 -16
- euroeval/tokenisation_utils.py +603 -0
- euroeval/types.py +17 -11
- euroeval/utils.py +332 -137
- euroeval-16.7.1.dist-info/METADATA +623 -0
- euroeval-16.7.1.dist-info/RECORD +84 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -737
- euroeval/metrics.py +0 -452
- euroeval/tokenization_utils.py +0 -498
- euroeval-15.12.0.dist-info/METADATA +0 -285
- euroeval-15.12.0.dist-info/RECORD +0 -63
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""All Latvian dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import LATVIAN
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
LATVIAN_TWITTER_SENTIMENT_CONFIG = DatasetConfig(
|
|
10
|
+
name="latvian-twitter-sentiment",
|
|
11
|
+
pretty_name="Latvian Twitter Sentiment",
|
|
12
|
+
source="EuroEval/latvian-twitter-sentiment-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[LATVIAN],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
SCALA_LV_CONFIG = DatasetConfig(
|
|
18
|
+
name="scala-lv",
|
|
19
|
+
pretty_name="ScaLA-lv",
|
|
20
|
+
source="EuroEval/scala-lv",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[LATVIAN],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
FULLSTACK_NER_LV_CONFIG = DatasetConfig(
|
|
26
|
+
name="fullstack-ner-lv",
|
|
27
|
+
pretty_name="FullStack NER-lv",
|
|
28
|
+
source="EuroEval/fullstack-ner-lv-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[LATVIAN],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
MULTI_WIKI_QA_LV_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-lv",
|
|
35
|
+
pretty_name="MultiWikiQA-lv",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-lv-mini",
|
|
37
|
+
task=RC,
|
|
38
|
+
languages=[LATVIAN],
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
LSM_CONFIG = DatasetConfig(
|
|
42
|
+
name="lsm",
|
|
43
|
+
pretty_name="LSM",
|
|
44
|
+
source="EuroEval/lsm-mini",
|
|
45
|
+
task=SUMM,
|
|
46
|
+
languages=[LATVIAN],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
MMLU_LV_CONFIG = DatasetConfig(
|
|
51
|
+
name="mmlu-lv",
|
|
52
|
+
pretty_name="MMLU-lv",
|
|
53
|
+
source="EuroEval/mmlu-lv-mini",
|
|
54
|
+
task=KNOW,
|
|
55
|
+
languages=[LATVIAN],
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
COPA_LV_CONFIG = DatasetConfig(
|
|
59
|
+
name="copa-lv",
|
|
60
|
+
pretty_name="COPA-lv",
|
|
61
|
+
source="EuroEval/copa-lv",
|
|
62
|
+
task=COMMON_SENSE,
|
|
63
|
+
languages=[LATVIAN],
|
|
64
|
+
_labels=["a", "b"],
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
### Unofficial datasets ###
|
|
69
|
+
|
|
70
|
+
WIKIANN_LV_CONFIG = DatasetConfig(
|
|
71
|
+
name="wikiann-lv",
|
|
72
|
+
pretty_name="WikiANN-lv",
|
|
73
|
+
source="EuroEval/wikiann-lv-mini",
|
|
74
|
+
task=NER,
|
|
75
|
+
languages=[LATVIAN],
|
|
76
|
+
unofficial=True,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
WINOGRANDE_LV_CONFIG = DatasetConfig(
|
|
80
|
+
name="winogrande-lv",
|
|
81
|
+
pretty_name="Winogrande-lv",
|
|
82
|
+
source="EuroEval/winogrande-lv",
|
|
83
|
+
task=COMMON_SENSE,
|
|
84
|
+
languages=[LATVIAN],
|
|
85
|
+
_labels=["a", "b"],
|
|
86
|
+
unofficial=True,
|
|
87
|
+
)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""All Lithuanian dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import LITHUANIAN
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
LITHUANIAN_EMOTIONS_CONFIG = DatasetConfig(
|
|
10
|
+
name="lithuanian-emotions",
|
|
11
|
+
pretty_name="Lithuanian Emotions",
|
|
12
|
+
source="EuroEval/lithuanian-emotions-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[LITHUANIAN],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
SCALA_LT_CONFIG = DatasetConfig(
|
|
18
|
+
name="scala-lt",
|
|
19
|
+
pretty_name="ScaLA-lt",
|
|
20
|
+
source="EuroEval/scala-lt",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[LITHUANIAN],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
WIKIANN_LT_CONFIG = DatasetConfig(
|
|
26
|
+
name="wikiann-lt",
|
|
27
|
+
pretty_name="WikiANN-lt",
|
|
28
|
+
source="EuroEval/wikiann-lt-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[LITHUANIAN],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
MULTI_WIKI_QA_LT_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-lt",
|
|
35
|
+
pretty_name="MultiWikiQA-lt",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-lt-mini",
|
|
37
|
+
task=RC,
|
|
38
|
+
languages=[LITHUANIAN],
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
LRYTAS_CONFIG = DatasetConfig(
|
|
42
|
+
name="lrytas",
|
|
43
|
+
pretty_name="Lrytas",
|
|
44
|
+
source="EuroEval/lrytas-mini",
|
|
45
|
+
task=SUMM,
|
|
46
|
+
languages=[LITHUANIAN],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
LT_HISTORY_CONFIG = DatasetConfig(
|
|
50
|
+
name="lt-history",
|
|
51
|
+
pretty_name="LT-History",
|
|
52
|
+
source="EuroEval/lt-history",
|
|
53
|
+
task=KNOW,
|
|
54
|
+
languages=[LITHUANIAN],
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
WINOGRANDE_LT_CONFIG = DatasetConfig(
|
|
58
|
+
name="winogrande-lt",
|
|
59
|
+
pretty_name="Winogrande-lt",
|
|
60
|
+
source="EuroEval/winogrande-lt",
|
|
61
|
+
task=COMMON_SENSE,
|
|
62
|
+
languages=[LITHUANIAN],
|
|
63
|
+
_labels=["a", "b"],
|
|
64
|
+
)
|
|
@@ -1,186 +1,212 @@
|
|
|
1
1
|
"""All Norwegian dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
5
|
-
from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
4
|
+
from ..languages import NORWEGIAN, NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
NOREC_CONFIG = DatasetConfig(
|
|
10
10
|
name="norec",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/norec-mini",
|
|
11
|
+
pretty_name="NoReC",
|
|
12
|
+
source="EuroEval/norec-mini",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
SCALA_NB_CONFIG = DatasetConfig(
|
|
19
18
|
name="scala-nb",
|
|
20
|
-
pretty_name="
|
|
21
|
-
|
|
19
|
+
pretty_name="ScaLA-nb",
|
|
20
|
+
source="EuroEval/scala-nb",
|
|
22
21
|
task=LA,
|
|
23
|
-
languages=[
|
|
22
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN],
|
|
24
23
|
)
|
|
25
24
|
|
|
26
25
|
SCALA_NN_CONFIG = DatasetConfig(
|
|
27
26
|
name="scala-nn",
|
|
28
|
-
pretty_name="
|
|
29
|
-
|
|
27
|
+
pretty_name="ScaLA-nn",
|
|
28
|
+
source="EuroEval/scala-nn",
|
|
30
29
|
task=LA,
|
|
31
|
-
languages=[
|
|
30
|
+
languages=[NORWEGIAN_NYNORSK],
|
|
32
31
|
)
|
|
33
32
|
|
|
34
33
|
NORNE_NB_CONFIG = DatasetConfig(
|
|
35
34
|
name="norne-nb",
|
|
36
|
-
pretty_name="
|
|
37
|
-
"
|
|
38
|
-
huggingface_id="EuroEval/norne-nb-mini",
|
|
35
|
+
pretty_name="NorNE-nb",
|
|
36
|
+
source="EuroEval/norne-nb-mini",
|
|
39
37
|
task=NER,
|
|
40
|
-
languages=[
|
|
38
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN],
|
|
41
39
|
)
|
|
42
40
|
|
|
43
41
|
NORNE_NN_CONFIG = DatasetConfig(
|
|
44
42
|
name="norne-nn",
|
|
45
|
-
pretty_name="
|
|
46
|
-
"
|
|
47
|
-
huggingface_id="EuroEval/norne-nn-mini",
|
|
43
|
+
pretty_name="NorNE-nn",
|
|
44
|
+
source="EuroEval/norne-nn-mini",
|
|
48
45
|
task=NER,
|
|
49
|
-
languages=[
|
|
46
|
+
languages=[NORWEGIAN_NYNORSK],
|
|
50
47
|
)
|
|
51
48
|
|
|
52
49
|
NORQUAD_CONFIG = DatasetConfig(
|
|
53
50
|
name="norquad",
|
|
54
|
-
pretty_name="
|
|
55
|
-
"
|
|
56
|
-
huggingface_id="EuroEval/norquad-mini",
|
|
51
|
+
pretty_name="NorQuAD",
|
|
52
|
+
source="EuroEval/norquad-mini",
|
|
57
53
|
task=RC,
|
|
58
|
-
languages=[
|
|
54
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
59
55
|
_num_few_shot_examples=2,
|
|
60
56
|
)
|
|
61
57
|
|
|
62
58
|
NO_SAMMENDRAG_CONFIG = DatasetConfig(
|
|
63
59
|
name="no-sammendrag",
|
|
64
|
-
pretty_name="
|
|
65
|
-
"
|
|
66
|
-
huggingface_id="EuroEval/no-sammendrag-mini",
|
|
60
|
+
pretty_name="NoSammendrag",
|
|
61
|
+
source="EuroEval/no-sammendrag-mini",
|
|
67
62
|
task=SUMM,
|
|
68
|
-
languages=[
|
|
63
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
69
64
|
)
|
|
70
65
|
|
|
71
66
|
NRK_QUIZ_QA_CONFIG = DatasetConfig(
|
|
72
67
|
name="nrk-quiz-qa",
|
|
73
|
-
pretty_name="
|
|
74
|
-
|
|
68
|
+
pretty_name="NRK Quiz QA",
|
|
69
|
+
source="EuroEval/nrk-quiz-qa-mini",
|
|
75
70
|
task=KNOW,
|
|
76
|
-
languages=[
|
|
71
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
77
72
|
)
|
|
78
73
|
|
|
79
74
|
IDIOMS_NO_CONFIG = DatasetConfig(
|
|
80
75
|
name="idioms-no",
|
|
81
|
-
pretty_name="
|
|
82
|
-
|
|
76
|
+
pretty_name="Idioms-no",
|
|
77
|
+
source="EuroEval/idioms-no",
|
|
83
78
|
task=KNOW,
|
|
84
|
-
languages=[
|
|
79
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
85
80
|
)
|
|
86
81
|
|
|
87
82
|
NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
|
|
88
83
|
name="nor-common-sense-qa",
|
|
89
|
-
pretty_name="
|
|
90
|
-
"
|
|
91
|
-
huggingface_id="EuroEval/nor-common-sense-qa",
|
|
84
|
+
pretty_name="NorCommonSenseQA",
|
|
85
|
+
source="EuroEval/nor-common-sense-qa",
|
|
92
86
|
task=COMMON_SENSE,
|
|
93
|
-
languages=[
|
|
87
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
94
88
|
_labels=["a", "b", "c", "d", "e"],
|
|
95
89
|
)
|
|
96
90
|
|
|
91
|
+
VALEU_NO_CONFIG = DatasetConfig(
|
|
92
|
+
name="valeu-no",
|
|
93
|
+
pretty_name="VaLEU-no",
|
|
94
|
+
source="EuroEval/european-values-no",
|
|
95
|
+
task=EUROPEAN_VALUES,
|
|
96
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
97
|
+
splits=["test"],
|
|
98
|
+
bootstrap_samples=False,
|
|
99
|
+
_instruction_prompt="{text}",
|
|
100
|
+
)
|
|
101
|
+
|
|
97
102
|
|
|
98
103
|
### Unofficial datasets ###
|
|
99
104
|
|
|
100
105
|
NO_COLA_CONFIG = DatasetConfig(
|
|
101
106
|
name="no-cola",
|
|
102
|
-
pretty_name="
|
|
103
|
-
"
|
|
104
|
-
huggingface_id="EuroEval/no-cola-mini",
|
|
107
|
+
pretty_name="NoCoLA",
|
|
108
|
+
source="EuroEval/no-cola-mini",
|
|
105
109
|
task=LA,
|
|
106
|
-
languages=[
|
|
110
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN],
|
|
107
111
|
unofficial=True,
|
|
108
112
|
)
|
|
109
113
|
|
|
110
114
|
NORGLM_MULTI_QA = DatasetConfig(
|
|
111
115
|
name="norglm-multi-qa",
|
|
112
|
-
pretty_name="
|
|
113
|
-
"
|
|
114
|
-
huggingface_id="EuroEval/norglm-multi-qa",
|
|
116
|
+
pretty_name="NorGLM-Multi-QA",
|
|
117
|
+
source="EuroEval/norglm-multi-qa",
|
|
115
118
|
task=RC,
|
|
116
|
-
languages=[
|
|
119
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
117
120
|
unofficial=True,
|
|
118
121
|
)
|
|
119
122
|
|
|
120
123
|
NORGLM_MULTI_SUM = DatasetConfig(
|
|
121
124
|
name="norglm-multi-sum",
|
|
122
|
-
pretty_name="
|
|
123
|
-
"
|
|
124
|
-
huggingface_id="EuroEval/norglm-multi-sum",
|
|
125
|
+
pretty_name="NorGLM-Multi-Sum",
|
|
126
|
+
source="EuroEval/norglm-multi-sum",
|
|
125
127
|
task=SUMM,
|
|
126
|
-
languages=[
|
|
128
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
127
129
|
unofficial=True,
|
|
128
130
|
)
|
|
129
131
|
|
|
130
132
|
SCHIBSTED_NO_CONFIG = DatasetConfig(
|
|
131
133
|
name="schibsted-no",
|
|
132
|
-
pretty_name="
|
|
133
|
-
|
|
134
|
+
pretty_name="Schibsted-no",
|
|
135
|
+
source="EuroEval/schibsted-article-summaries-no",
|
|
134
136
|
task=SUMM,
|
|
135
|
-
languages=[
|
|
137
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
136
138
|
unofficial=True,
|
|
137
139
|
)
|
|
138
140
|
|
|
139
141
|
PERSONAL_SUM_CONFIG = DatasetConfig(
|
|
140
142
|
name="personal-sum",
|
|
141
|
-
pretty_name="
|
|
142
|
-
|
|
143
|
+
pretty_name="Personal Sum",
|
|
144
|
+
source="EuroEval/personal-sum",
|
|
143
145
|
task=SUMM,
|
|
144
|
-
languages=[
|
|
146
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
145
147
|
unofficial=True,
|
|
146
148
|
)
|
|
147
149
|
|
|
148
150
|
MMLU_NO_CONFIG = DatasetConfig(
|
|
149
151
|
name="mmlu-no",
|
|
150
|
-
pretty_name="
|
|
151
|
-
"
|
|
152
|
-
huggingface_id="EuroEval/mmlu-no-mini",
|
|
152
|
+
pretty_name="MMLU-no",
|
|
153
|
+
source="EuroEval/mmlu-no-mini",
|
|
153
154
|
task=KNOW,
|
|
154
|
-
languages=[
|
|
155
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
155
156
|
unofficial=True,
|
|
156
157
|
)
|
|
157
158
|
|
|
158
159
|
ARC_NO_CONFIG = DatasetConfig(
|
|
159
160
|
name="arc-no",
|
|
160
|
-
pretty_name="
|
|
161
|
-
"
|
|
162
|
-
huggingface_id="EuroEval/arc-no-mini",
|
|
161
|
+
pretty_name="ARC-no",
|
|
162
|
+
source="EuroEval/arc-no-mini",
|
|
163
163
|
task=KNOW,
|
|
164
|
-
languages=[
|
|
164
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
165
165
|
unofficial=True,
|
|
166
166
|
)
|
|
167
167
|
|
|
168
168
|
HELLASWAG_NO_CONFIG = DatasetConfig(
|
|
169
169
|
name="hellaswag-no",
|
|
170
|
-
pretty_name="
|
|
171
|
-
"
|
|
172
|
-
huggingface_id="EuroEval/hellaswag-no-mini",
|
|
170
|
+
pretty_name="HellaSwag-no",
|
|
171
|
+
source="EuroEval/hellaswag-no-mini",
|
|
173
172
|
task=COMMON_SENSE,
|
|
174
|
-
languages=[
|
|
173
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
175
174
|
unofficial=True,
|
|
176
175
|
)
|
|
177
176
|
|
|
178
177
|
BELEBELE_NO_CONFIG = DatasetConfig(
|
|
179
178
|
name="belebele-no",
|
|
180
|
-
pretty_name="
|
|
181
|
-
"
|
|
182
|
-
huggingface_id="EuroEval/belebele-no-mini",
|
|
179
|
+
pretty_name="Belebele-no",
|
|
180
|
+
source="EuroEval/belebele-no-mini",
|
|
183
181
|
task=MCRC,
|
|
184
|
-
languages=[
|
|
182
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
183
|
+
unofficial=True,
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
MULTI_WIKI_QA_NB_CONFIG = DatasetConfig(
|
|
187
|
+
name="multi-wiki-qa-nb",
|
|
188
|
+
pretty_name="MultiWikiQA-nb",
|
|
189
|
+
source="EuroEval/multi-wiki-qa-no-mini",
|
|
190
|
+
task=RC,
|
|
191
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN],
|
|
192
|
+
unofficial=True,
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
MULTI_WIKI_QA_NN_CONFIG = DatasetConfig(
|
|
196
|
+
name="multi-wiki-qa-nn",
|
|
197
|
+
pretty_name="MultiWikiQA-nn",
|
|
198
|
+
source="EuroEval/multi-wiki-qa-nn-mini",
|
|
199
|
+
task=RC,
|
|
200
|
+
languages=[NORWEGIAN_NYNORSK],
|
|
201
|
+
unofficial=True,
|
|
202
|
+
)
|
|
203
|
+
|
|
204
|
+
WINOGRANDE_NO_CONFIG = DatasetConfig(
|
|
205
|
+
name="winogrande-no",
|
|
206
|
+
pretty_name="Winogrande-no",
|
|
207
|
+
source="EuroEval/winogrande-no",
|
|
208
|
+
task=COMMON_SENSE,
|
|
209
|
+
languages=[NORWEGIAN_BOKMÅL, NORWEGIAN_NYNORSK, NORWEGIAN],
|
|
210
|
+
_labels=["a", "b"],
|
|
185
211
|
unofficial=True,
|
|
186
212
|
)
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""All Polish dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import POLISH
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
POLEMO2_CONFIG = DatasetConfig(
|
|
10
|
+
name="polemo2",
|
|
11
|
+
pretty_name="Polemo2",
|
|
12
|
+
source="EuroEval/polemo2-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[POLISH],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
SCALA_PL_CONFIG = DatasetConfig(
|
|
18
|
+
name="scala-pl",
|
|
19
|
+
pretty_name="ScaLA-pl",
|
|
20
|
+
source="EuroEval/scala-pl",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[POLISH],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
KPWR_NER_CONFIG = DatasetConfig(
|
|
26
|
+
name="kpwr-ner",
|
|
27
|
+
pretty_name="KPWr-NER",
|
|
28
|
+
source="EuroEval/kpwr-ner",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[POLISH],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
POQUAD_CONFIG = DatasetConfig(
|
|
34
|
+
name="poquad",
|
|
35
|
+
pretty_name="PoQuAD",
|
|
36
|
+
source="EuroEval/poquad-mini",
|
|
37
|
+
task=RC,
|
|
38
|
+
languages=[POLISH],
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
PSC_CONFIG = DatasetConfig(
|
|
42
|
+
name="psc",
|
|
43
|
+
pretty_name="PSC",
|
|
44
|
+
source="EuroEval/psc-mini",
|
|
45
|
+
task=SUMM,
|
|
46
|
+
languages=[POLISH],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
LLMZSZL_CONFIG = DatasetConfig(
|
|
50
|
+
name="llmzszl",
|
|
51
|
+
pretty_name="LLMzSzŁ",
|
|
52
|
+
source="EuroEval/llmzszl-mini",
|
|
53
|
+
task=KNOW,
|
|
54
|
+
languages=[POLISH],
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
WINOGRANDE_PL_CONFIG = DatasetConfig(
|
|
58
|
+
name="winogrande-pl",
|
|
59
|
+
pretty_name="Winogrande-pl",
|
|
60
|
+
source="EuroEval/winogrande-pl",
|
|
61
|
+
task=COMMON_SENSE,
|
|
62
|
+
languages=[POLISH],
|
|
63
|
+
_labels=["a", "b"],
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
VALEU_PL_CONFIG = DatasetConfig(
|
|
67
|
+
name="valeu-pl",
|
|
68
|
+
pretty_name="VaLEU-pl",
|
|
69
|
+
source="EuroEval/european-values-pl",
|
|
70
|
+
task=EUROPEAN_VALUES,
|
|
71
|
+
languages=[POLISH],
|
|
72
|
+
splits=["test"],
|
|
73
|
+
bootstrap_samples=False,
|
|
74
|
+
_instruction_prompt="{text}",
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
### Unofficial datasets ###
|
|
79
|
+
|
|
80
|
+
MULTI_WIKI_QA_PL_CONFIG = DatasetConfig(
|
|
81
|
+
name="multi-wiki-qa-pl",
|
|
82
|
+
pretty_name="MultiWikiQA-pl",
|
|
83
|
+
source="EuroEval/multi-wiki-qa-pl-mini",
|
|
84
|
+
task=RC,
|
|
85
|
+
languages=[POLISH],
|
|
86
|
+
unofficial=True,
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
GOLDENSWAG_PL_CONFIG = DatasetConfig(
|
|
90
|
+
name="goldenswag-pl",
|
|
91
|
+
pretty_name="GoldenSwag-pl",
|
|
92
|
+
source="EuroEval/goldenswag-pl-mini",
|
|
93
|
+
task=COMMON_SENSE,
|
|
94
|
+
languages=[POLISH],
|
|
95
|
+
unofficial=True,
|
|
96
|
+
)
|
|
@@ -1,64 +1,77 @@
|
|
|
1
1
|
"""All Portuguese dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import PT
|
|
5
|
-
from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, SENT, SUMM
|
|
4
|
+
from ..languages import EUROPEAN_PORTUGUESE, PORTUGUESE
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
SST2_PT_CONFIG = DatasetConfig(
|
|
10
10
|
name="sst2-pt",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/sst2-pt-mini",
|
|
11
|
+
pretty_name="SST2-pt",
|
|
12
|
+
source="EuroEval/sst2-pt-mini",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[PT],
|
|
14
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
16
15
|
_labels=["positive", "negative"],
|
|
17
16
|
)
|
|
18
17
|
|
|
19
|
-
|
|
20
|
-
MMLU_PT_CONFIG = DatasetConfig(
|
|
21
|
-
name="mmlu-pt",
|
|
22
|
-
pretty_name="the truncated version of the Portuguese knowledge dataset MMLU-pt, "
|
|
23
|
-
"translated from the English MMLU dataset",
|
|
24
|
-
huggingface_id="EuroEval/mmlu-pt-mini",
|
|
25
|
-
task=KNOW,
|
|
26
|
-
languages=[PT],
|
|
27
|
-
)
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
GOLDENSWAG_PT_CONFIG = DatasetConfig(
|
|
31
|
-
name="goldenswag-pt",
|
|
32
|
-
pretty_name="the truncated version of the Portuguese common-sense reasoning "
|
|
33
|
-
"dataset GoldenSwag-pt, translated from the English GoldenSwag dataset",
|
|
34
|
-
huggingface_id="EuroEval/goldenswag-pt-mini",
|
|
35
|
-
task=COMMON_SENSE,
|
|
36
|
-
languages=[PT],
|
|
37
|
-
)
|
|
38
|
-
|
|
39
|
-
|
|
40
18
|
SCALA_PT = DatasetConfig(
|
|
41
19
|
name="scala-pt",
|
|
42
|
-
pretty_name="
|
|
43
|
-
|
|
20
|
+
pretty_name="ScaLA-pt",
|
|
21
|
+
source="EuroEval/scala-pt",
|
|
44
22
|
task=LA,
|
|
45
|
-
languages=[PT],
|
|
23
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
46
24
|
)
|
|
47
25
|
|
|
48
26
|
HAREM_CONFIG = DatasetConfig(
|
|
49
27
|
name="harem",
|
|
50
|
-
pretty_name="
|
|
51
|
-
|
|
28
|
+
pretty_name="HAREM",
|
|
29
|
+
source="EuroEval/harem",
|
|
52
30
|
task=NER,
|
|
53
|
-
languages=[PT],
|
|
31
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
MULTI_WIKI_QA_PT_CONFIG = DatasetConfig(
|
|
35
|
+
name="multi-wiki-qa-pt",
|
|
36
|
+
pretty_name="MultiWikiQA-pt",
|
|
37
|
+
source="EuroEval/multi-wiki-qa-pt-pt-mini",
|
|
38
|
+
task=RC,
|
|
39
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
54
40
|
)
|
|
55
41
|
|
|
56
42
|
PUBLICO_CONFIG = DatasetConfig(
|
|
57
43
|
name="publico",
|
|
58
|
-
pretty_name="
|
|
59
|
-
|
|
44
|
+
pretty_name="Publico",
|
|
45
|
+
source="EuroEval/publico-mini",
|
|
60
46
|
task=SUMM,
|
|
61
|
-
languages=[PT],
|
|
47
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
48
|
+
)
|
|
49
|
+
|
|
50
|
+
MMLU_PT_CONFIG = DatasetConfig(
|
|
51
|
+
name="mmlu-pt",
|
|
52
|
+
pretty_name="MMLU-pt",
|
|
53
|
+
source="EuroEval/mmlu-pt-mini",
|
|
54
|
+
task=KNOW,
|
|
55
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
GOLDENSWAG_PT_CONFIG = DatasetConfig(
|
|
59
|
+
name="goldenswag-pt",
|
|
60
|
+
pretty_name="GoldenSwag-pt",
|
|
61
|
+
source="EuroEval/goldenswag-pt-mini",
|
|
62
|
+
task=COMMON_SENSE,
|
|
63
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
VALEU_PT_CONFIG = DatasetConfig(
|
|
67
|
+
name="valeu-pt",
|
|
68
|
+
pretty_name="VaLEU-pt",
|
|
69
|
+
source="EuroEval/european-values-pt",
|
|
70
|
+
task=EUROPEAN_VALUES,
|
|
71
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
72
|
+
splits=["test"],
|
|
73
|
+
bootstrap_samples=False,
|
|
74
|
+
_instruction_prompt="{text}",
|
|
62
75
|
)
|
|
63
76
|
|
|
64
77
|
|
|
@@ -66,9 +79,19 @@ PUBLICO_CONFIG = DatasetConfig(
|
|
|
66
79
|
|
|
67
80
|
BOOLQ_PT_CONFIG = DatasetConfig(
|
|
68
81
|
name="boolq-pt",
|
|
69
|
-
pretty_name="
|
|
70
|
-
"
|
|
71
|
-
huggingface_id="EuroEval/boolq-pt",
|
|
82
|
+
pretty_name="BoolQ-pt",
|
|
83
|
+
source="EuroEval/boolq-pt",
|
|
72
84
|
task=MCRC,
|
|
73
|
-
languages=[PT],
|
|
85
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
86
|
+
unofficial=True,
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
WINOGRANDE_PT_CONFIG = DatasetConfig(
|
|
90
|
+
name="winogrande-pt",
|
|
91
|
+
pretty_name="Winogrande-pt",
|
|
92
|
+
source="EuroEval/winogrande-pt",
|
|
93
|
+
task=COMMON_SENSE,
|
|
94
|
+
languages=[PORTUGUESE, EUROPEAN_PORTUGUESE],
|
|
95
|
+
_labels=["a", "b"],
|
|
96
|
+
unofficial=True,
|
|
74
97
|
)
|