EuroEval 15.5.0-py3-none-any.whl → 15.6.1-py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- euroeval/benchmark_modules/base.py +3 -2
- euroeval/benchmark_modules/fresh.py +8 -6
- euroeval/benchmark_modules/hf.py +33 -31
- euroeval/benchmark_modules/litellm.py +120 -56
- euroeval/benchmark_modules/vllm.py +41 -26
- euroeval/benchmarker.py +23 -21
- euroeval/callbacks.py +2 -2
- euroeval/constants.py +1 -1
- euroeval/data_models.py +261 -42
- euroeval/dataset_configs/__init__.py +61 -0
- euroeval/dataset_configs/danish.py +120 -0
- euroeval/dataset_configs/dutch.py +123 -0
- euroeval/dataset_configs/english.py +88 -0
- euroeval/dataset_configs/faroese.py +54 -0
- euroeval/dataset_configs/french.py +83 -0
- euroeval/dataset_configs/german.py +91 -0
- euroeval/dataset_configs/icelandic.py +148 -0
- euroeval/dataset_configs/italian.py +81 -0
- euroeval/dataset_configs/norwegian.py +178 -0
- euroeval/dataset_configs/spanish.py +78 -0
- euroeval/dataset_configs/swedish.py +100 -0
- euroeval/exceptions.py +10 -10
- euroeval/finetuning.py +6 -10
- euroeval/generation.py +1 -0
- euroeval/human_evaluation.py +2 -2
- euroeval/languages.py +20 -13
- euroeval/model_cache.py +1 -1
- euroeval/model_loading.py +1 -12
- euroeval/prompt_templates/__init__.py +8 -0
- euroeval/prompt_templates/linguistic_acceptability.py +112 -0
- euroeval/prompt_templates/multiple_choice.py +97 -0
- euroeval/prompt_templates/named_entity_recognition.py +257 -0
- euroeval/prompt_templates/reading_comprehension.py +118 -0
- euroeval/prompt_templates/sentiment_classification.py +137 -0
- euroeval/prompt_templates/summarization.py +97 -0
- euroeval/speed_benchmark.py +1 -1
- euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
- euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
- euroeval/{task_utils → task_group_utils}/sequence_classification.py +1 -1
- euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
- euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
- euroeval/tasks.py +54 -0
- euroeval/tokenization_utils.py +343 -0
- euroeval/types.py +3 -1
- euroeval/utils.py +2 -347
- {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/METADATA +31 -9
- euroeval-15.6.1.dist-info/RECORD +59 -0
- euroeval/dataset_configs.py +0 -2408
- euroeval-15.5.0.dist-info/RECORD +0 -40
- /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
- {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/WHEEL +0 -0
- {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/entry_points.txt +0 -0
- {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/licenses/LICENSE +0 -0
euroeval/dataset_configs/norwegian.py
ADDED
@@ -0,0 +1,178 @@
+"""All Norwegian dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import NB, NN, NO
+from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+NOREC_CONFIG = DatasetConfig(
+    name="norec",
+    pretty_name="the truncated version of the Norwegian sentiment classification "
+    "dataset NoReC",
+    huggingface_id="EuroEval/norec-mini",
+    task=SENT,
+    languages=[NB, NN, NO],
+)
+
+SCALA_NB_CONFIG = DatasetConfig(
+    name="scala-nb",
+    pretty_name="the Bokmål part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-nb",
+    task=LA,
+    languages=[NB, NO],
+)
+
+SCALA_NN_CONFIG = DatasetConfig(
+    name="scala-nn",
+    pretty_name="the Nynorsk part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-nn",
+    task=LA,
+    languages=[NN],
+)
+
+NORNE_NB_CONFIG = DatasetConfig(
+    name="norne-nb",
+    pretty_name="the truncated version of the Bokmål part of the Norwegian named "
+    "entity recognition dataset NorNE",
+    huggingface_id="EuroEval/norne-nb-mini",
+    task=NER,
+    languages=[NB, NO],
+)
+
+NORNE_NN_CONFIG = DatasetConfig(
+    name="norne-nn",
+    pretty_name="the truncated version of the Nynorsk part of the Norwegian named "
+    "entity recognition dataset NorNE",
+    huggingface_id="EuroEval/norne-nn-mini",
+    task=NER,
+    languages=[NN],
+)
+
+NORQUAD_CONFIG = DatasetConfig(
+    name="norquad",
+    pretty_name="the truncated version of the Norwegian question answering "
+    "dataset NorQuAD",
+    huggingface_id="EuroEval/norquad-mini",
+    task=RC,
+    languages=[NB, NN, NO],
+    _num_few_shot_examples=2,
+)
+
+NO_SAMMENDRAG_CONFIG = DatasetConfig(
+    name="no-sammendrag",
+    pretty_name="the truncated version of the Norwegian summarisation dataset "
+    "Norske Sammendrag",
+    huggingface_id="EuroEval/no-sammendrag-mini",
+    task=SUMM,
+    languages=[NB, NN, NO],
+)
+
+NRK_QUIZ_QA_CONFIG = DatasetConfig(
+    name="nrk-quiz-qa",
+    pretty_name="the truncated version of the Norwegian knowledge dataset NRK Quiz QA",
+    huggingface_id="EuroEval/nrk-quiz-qa-mini",
+    task=KNOW,
+    languages=[NB, NN, NO],
+)
+
+NOR_COMMON_SENSE_QA_CONFIG = DatasetConfig(
+    name="nor-common-sense-qa",
+    pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
+    "NorCommonSenseQA",
+    huggingface_id="EuroEval/nor-common-sense-qa",
+    task=COMMON_SENSE,
+    languages=[NB, NN, NO],
+    _labels=["a", "b", "c", "d", "e"],
+)
+
+
+### Unofficial datasets ###
+
+NO_COLA_CONFIG = DatasetConfig(
+    name="no-cola",
+    pretty_name="the truncated version of the Norwegian linguistic acceptability "
+    "dataset NoCoLA",
+    huggingface_id="EuroEval/no-cola-mini",
+    task=LA,
+    languages=[NB, NO],
+    unofficial=True,
+)
+
+NORGLM_MULTI_QA = DatasetConfig(
+    name="norglm-multi-qa",
+    pretty_name="the question answering part of the Norwegian NorGLM multi-task human "
+    "annotated dataset NO-Multi-QA-Sum",
+    huggingface_id="EuroEval/norglm-multi-qa",
+    task=RC,
+    languages=[NB, NN, NO],
+    unofficial=True,
+)
+
+NORGLM_MULTI_SUM = DatasetConfig(
+    name="norglm-multi-sum",
+    pretty_name="the summarisation part of the Norwegian NorGLM multi-task human "
+    "annotated dataset NO-Multi-QA-Sum",
+    huggingface_id="EuroEval/norglm-multi-sum",
+    task=SUMM,
+    languages=[NB, NN, NO],
+    unofficial=True,
+)
+
+SCHIBSTED_NO_CONFIG = DatasetConfig(
+    name="schibsted-no",
+    pretty_name="the Norwegian summarisation dataset Schibsted-no",
+    huggingface_id="EuroEval/schibsted-article-summaries-no",
+    task=SUMM,
+    languages=[NB, NN, NO],
+    unofficial=True,
+)
+
+PERSONAL_SUM_CONFIG = DatasetConfig(
+    name="personal-sum",
+    pretty_name="the Norwegian summarisation dataset personal-sum",
+    huggingface_id="EuroEval/personal-sum",
+    task=SUMM,
+    languages=[NB, NN, NO],
+    unofficial=True,
+)
+
+MMLU_NO_CONFIG = DatasetConfig(
+    name="mmlu-no",
+    pretty_name="the truncated version of the Norwegian knowledge dataset MMLU-no, "
+    "translated from the English MMLU dataset",
+    huggingface_id="EuroEval/mmlu-no-mini",
+    task=KNOW,
+    languages=[NB, NN, NO],
+    unofficial=True,
+)
+
+ARC_NO_CONFIG = DatasetConfig(
+    name="arc-no",
+    pretty_name="the truncated version of the Norwegian knowledge dataset ARC-no, "
+    "translated from the English ARC dataset",
+    huggingface_id="EuroEval/arc-no-mini",
+    task=KNOW,
+    languages=[NB, NN, NO],
+    unofficial=True,
+)
+
+HELLASWAG_NO_CONFIG = DatasetConfig(
+    name="hellaswag-no",
+    pretty_name="the truncated version of the Norwegian common-sense reasoning dataset "
+    "HellaSwag-no, translated from the English HellaSwag dataset",
+    huggingface_id="EuroEval/hellaswag-no-mini",
+    task=COMMON_SENSE,
+    languages=[NB, NN, NO],
+    unofficial=True,
+)
+
+BELEBELE_NO_CONFIG = DatasetConfig(
+    name="belebele-no",
+    pretty_name="the Norwegian multiple choice reading comprehension dataset "
+    "BeleBele-no, translated from the English BeleBele dataset",
+    huggingface_id="EuroEval/belebele-no-mini",
+    task=MCRC,
+    languages=[NB, NN, NO],
+    unofficial=True,
+)
euroeval/dataset_configs/spanish.py
ADDED
@@ -0,0 +1,78 @@
+"""All Spanish dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import ES
+from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+SENTIMENT_HEADLINES_CONFIG = DatasetConfig(
+    name="sentiment-headlines-es",
+    pretty_name="the truncated version of the Spanish sentiment headlines dataset",
+    huggingface_id="EuroEval/sentiment-headlines-es",
+    task=SENT,
+    languages=[ES],
+)
+
+SCALA_ES_CONFIG = DatasetConfig(
+    name="scala-es",
+    pretty_name="the Spanish part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-es",
+    task=LA,
+    languages=[ES],
+)
+
+CONLL_ES_CONFIG = DatasetConfig(
+    name="conll-es",
+    pretty_name="the Spanish part of the truncated version of the named entity "
+    "recognition dataset CoNLL 2002",
+    huggingface_id="EuroEval/conll-es-mini",
+    task=NER,
+    languages=[ES],
+)
+
+MLQA_ES_CONFIG = DatasetConfig(
+    name="mlqa-es",
+    pretty_name="the Spanish version of the MLQA reading comprehension dataset",
+    huggingface_id="EuroEval/mlqa-es",
+    task=RC,
+    languages=[ES],
+)
+
+MLSUM_ES_CONFIG = DatasetConfig(
+    name="mlsum-es",
+    pretty_name="the truncated version of the Spanish summarisation dataset MLSum-es",
+    huggingface_id="EuroEval/mlsum-es-mini",
+    task=SUMM,
+    languages=[ES],
+)
+
+MMLU_ES_CONFIG = DatasetConfig(
+    name="mmlu-es",
+    pretty_name="the truncated version of the Spanish knowledge dataset MMLU-es, "
+    "translated from the English MMLU dataset",
+    huggingface_id="EuroEval/mmlu-es-mini",
+    task=KNOW,
+    languages=[ES],
+)
+
+HELLASWAG_ES_CONFIG = DatasetConfig(
+    name="hellaswag-es",
+    pretty_name="the truncated version of the Spanish common-sense reasoning dataset "
+    "HellaSwag-es, translated from the English HellaSwag dataset",
+    huggingface_id="EuroEval/hellaswag-es-mini",
+    task=COMMON_SENSE,
+    languages=[ES],
+)
+
+
+### Unofficial datasets ###
+
+XQUAD_ES_CONFIG = DatasetConfig(
+    name="xquad-es",
+    pretty_name="the Spanish version of the XQuAD reading comprehension dataset",
+    huggingface_id="EuroEval/xquad-es",
+    task=RC,
+    languages=[ES],
+    unofficial=True,
+)
euroeval/dataset_configs/swedish.py
ADDED
@@ -0,0 +1,100 @@
+"""All Swedish dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import SV
+from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+SWEREC_CONFIG = DatasetConfig(
+    name="swerec",
+    pretty_name="the truncated version of the Swedish sentiment classification "
+    "dataset SweReC",
+    huggingface_id="EuroEval/swerec-mini",
+    task=SENT,
+    languages=[SV],
+)
+
+SCALA_SV_CONFIG = DatasetConfig(
+    name="scala-sv",
+    pretty_name="The Swedish part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-sv",
+    task=LA,
+    languages=[SV],
+)
+
+SUC3_CONFIG = DatasetConfig(
+    name="suc3",
+    pretty_name="the truncated version of the Swedish named entity recognition "
+    "dataset SUC 3.0",
+    huggingface_id="EuroEval/suc3-mini",
+    task=NER,
+    languages=[SV],
+)
+
+SCANDIQA_SV_CONFIG = DatasetConfig(
+    name="scandiqa-sv",
+    pretty_name="the Swedish part of the truncated version of the question answering "
+    "dataset ScandiQA",
+    huggingface_id="EuroEval/scandiqa-sv-mini",
+    task=RC,
+    languages=[SV],
+)
+
+SWEDN_CONFIG = DatasetConfig(
+    name="swedn",
+    pretty_name="the truncated version of the Swedish summarisation dataset SweDN",
+    huggingface_id="EuroEval/swedn-mini",
+    task=SUMM,
+    languages=[SV],
+)
+
+MMLU_SV_CONFIG = DatasetConfig(
+    name="mmlu-sv",
+    pretty_name="the truncated version of the Swedish knowledge dataset MMLU-sv, "
+    "translated from the English MMLU dataset",
+    huggingface_id="EuroEval/mmlu-sv-mini",
+    task=KNOW,
+    languages=[SV],
+)
+
+HELLASWAG_SV_CONFIG = DatasetConfig(
+    name="hellaswag-sv",
+    pretty_name="the truncated version of the Swedish common-sense reasoning dataset "
+    "HellaSwag-sv, translated from the English HellaSwag dataset",
+    huggingface_id="EuroEval/hellaswag-sv-mini",
+    task=COMMON_SENSE,
+    languages=[SV],
+)
+
+
+### Unofficial datasets ###
+
+SCHIBSTED_SV_CONFIG = DatasetConfig(
+    name="schibsted-sv",
+    pretty_name="the Swedish summarisation dataset Schibsted-sv",
+    huggingface_id="EuroEval/schibsted-article-summaries-sv",
+    task=SUMM,
+    languages=[SV],
+    unofficial=True,
+)
+
+ARC_SV_CONFIG = DatasetConfig(
+    name="arc-sv",
+    pretty_name="the truncated version of the Swedish knowledge dataset ARC-sv, "
+    "translated from the English ARC dataset",
+    huggingface_id="EuroEval/arc-sv-mini",
+    task=KNOW,
+    languages=[SV],
+    unofficial=True,
+)
+
+BELEBELE_SV_CONFIG = DatasetConfig(
+    name="belebele-sv",
+    pretty_name="the Swedish multiple choice reading comprehension dataset "
+    "BeleBele-sv, translated from the English BeleBele dataset",
+    huggingface_id="EuroEval/belebele-sv-mini",
+    task=MCRC,
+    languages=[SV],
+    unofficial=True,
+)
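The three files above replace part of the monolithic euroeval/dataset_configs.py (deleted in this release, per the file list above) with per-language modules. As a rough illustration of how a registered config is consumed — a minimal sketch assuming the public Benchmarker API and that get_all_dataset_configs (imported in human_evaluation.py below) returns a name-to-config mapping:

    # Minimal sketch, not part of the diff. Assumes `Benchmarker.benchmark`
    # accepts a `dataset` argument and that `get_all_dataset_configs` maps
    # dataset names to DatasetConfig objects.
    from euroeval import Benchmarker
    from euroeval.dataset_configs import get_all_dataset_configs

    configs = get_all_dataset_configs()
    print(configs["norec"].huggingface_id)  # expected: "EuroEval/norec-mini"

    benchmarker = Benchmarker(progress_bar=False)
    benchmarker.benchmark(model="<model-id>", dataset="norec")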
euroeval/exceptions.py
CHANGED
@@ -7,7 +7,7 @@ class InvalidBenchmark(Exception):
     def __init__(
         self, message: str = "This model cannot be benchmarked on the given dataset."
     ) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             message:
@@ -23,7 +23,7 @@ class InvalidModel(Exception):
     def __init__(
         self, message: str = "The model cannot be benchmarked on any datasets."
     ) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             message:
@@ -39,7 +39,7 @@ class HuggingFaceHubDown(Exception):
     def __init__(
         self, message: str = "The Hugging Face Hub is currently down."
     ) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             message:
@@ -55,7 +55,7 @@ class NoInternetConnection(Exception):
     def __init__(
         self, message: str = "There is currently no internet connection."
     ) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             message:
@@ -71,7 +71,7 @@ class NaNValueInModelOutput(Exception):
     def __init__(
         self, message: str = "There is a NaN value in the model output."
     ) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             message:
@@ -93,7 +93,7 @@ class FlashAttentionNotInstalled(Exception):
             "pip install flash-attn --no-build-isolation`."
         ),
     ) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             message:
@@ -107,7 +107,7 @@ class NeedsExtraInstalled(InvalidModel):
     """The evaluation requires extra to be installed."""
 
     def __init__(self, extra: str) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             extra:
@@ -126,7 +126,7 @@ class NeedsManualDependency(InvalidModel):
     """The evaluation requires a dependency to be manually installed."""
 
     def __init__(self, package: str) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             package:
@@ -146,7 +146,7 @@ class NeedsAdditionalArgument(InvalidModel):
     def __init__(
         self, cli_argument: str, script_argument: str, run_with_cli: bool
     ) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             cli_argument:
@@ -177,7 +177,7 @@ class NeedsEnvironmentVariable(InvalidModel):
     """The evaluation requires an environment variable to be set."""
 
    def __init__(self, env_var: str) -> None:
-        """
+        """Initialise the exception.
 
         Args:
             env_var:
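Every hunk in this file makes the same mechanical change: the first line of each __init__ docstring gains a summary sentence, which pydocstyle-style linters (e.g. Ruff's D rules) require. A sketch of the resulting pattern; the class docstring and the body of __init__ are assumptions for illustration, not verbatim package code:

    class InvalidBenchmark(Exception):
        """The model cannot be benchmarked on the given dataset."""

        def __init__(
            self,
            message: str = "This model cannot be benchmarked on the given dataset.",
        ) -> None:
            """Initialise the exception.

            Args:
                message:
                    The exception message.
            """
            # Assumed body: store the message and delegate to Exception.
            self.message = message
            super().__init__(self.message)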
euroeval/finetuning.py
CHANGED
@@ -7,14 +7,13 @@ import typing as t
 import torch
 from datasets import DatasetDict
 from tqdm.auto import tqdm
-from transformers import (
+from transformers.trainer_callback import (
     EarlyStoppingCallback,
-    IntervalStrategy,
     PrinterCallback,
     ProgressCallback,
-    TrainingArguments,
 )
-from transformers.training_args import OptimizerNames
+from transformers.trainer_utils import IntervalStrategy
+from transformers.training_args import OptimizerNames, TrainingArguments
 
 from .benchmark_modules import BenchmarkModule
 from .callbacks import NeverLeaveProgressCallback
@@ -67,9 +66,6 @@ def finetune(
     else:
         dtype = DataType.FP32
 
-    # TEMP
-    dtype = DataType.FP32
-
     bs: int = benchmark_config.batch_size
     scores: list[dict[str, float]] = list()
     for idx in tqdm(
@@ -212,7 +208,7 @@ def finetune_single_iteration(
 
     if not benchmark_config.verbose:
 
-        def no_logging(logs: dict[str, float]) -> None:
+        def no_logging(logs: dict[str, float], start_time: float | None = None) -> None:
             return
 
         trainer.log = no_logging
@@ -292,7 +288,7 @@ def get_training_args(
 
     training_args = TrainingArguments(
         output_dir=model_config.model_cache_dir,
-        evaluation_strategy=IntervalStrategy.STEPS,
+        eval_strategy=IntervalStrategy.STEPS,
         logging_strategy=logging_strategy,
         save_strategy=IntervalStrategy.STEPS,
         eval_steps=30,
@@ -304,11 +300,11 @@ def get_training_args(
         save_total_limit=1,
         per_device_train_batch_size=batch_size,
         per_device_eval_batch_size=batch_size,
+        optim=OptimizerNames.ADAMW_TORCH,
         learning_rate=2e-5,
         warmup_ratio=0.01,
         gradient_accumulation_steps=32 // batch_size,
         load_best_model_at_end=True,
-        optim=OptimizerNames.ADAMW_TORCH,
         seed=4242 + iteration_idx,
         fp16=dtype == DataType.FP16,
         bf16=dtype == DataType.BF16,
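Several independent fixes here: the transformers imports move to concrete submodules, a leftover "# TEMP" override that forced FP32 training is removed, no_logging gains the start_time parameter that newer transformers versions pass to Trainer.log, and the deprecated evaluation_strategy argument becomes eval_strategy. A minimal sketch of the updated TrainingArguments call under the new imports; values not visible in the hunks are placeholders:

    from transformers.trainer_utils import IntervalStrategy
    from transformers.training_args import OptimizerNames, TrainingArguments

    args = TrainingArguments(
        output_dir="./model-cache",            # placeholder path
        eval_strategy=IntervalStrategy.STEPS,  # renamed from `evaluation_strategy`
        save_strategy=IntervalStrategy.STEPS,
        eval_steps=30,
        save_steps=30,                         # assumed; must align with eval_steps
        load_best_model_at_end=True,
        optim=OptimizerNames.ADAMW_TORCH,
        learning_rate=2e-5,
        warmup_ratio=0.01,
        seed=4242,
    )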
euroeval/generation.py
CHANGED
@@ -133,6 +133,7 @@ def generate_single_iteration(
     all_preds: list[str] = list()
 
     if len(non_cached_dataset) > 0:
+        itr: t.Iterable
         match model.batching_preference:
             case BatchingPreference.SINGLE_SAMPLE:
                 itr = tqdm(iterable=non_cached_dataset, leave=False)
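The added "itr: t.Iterable" line pre-declares the variable so that a static type checker accepts a different iterable type being assigned in each match arm. A self-contained illustration of the same pattern (not EuroEval code):

    import typing as t

    def choose_iterable(batched: bool, data: list[str]) -> t.Iterable:
        itr: t.Iterable  # declared once, so both assignments below type-check
        if batched:
            itr = [data]      # a list of batches
        else:
            itr = iter(data)  # an iterator over single samples
        return itr

    print(list(choose_iterable(False, ["a", "b"])))  # ['a', 'b']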
euroeval/human_evaluation.py
CHANGED
@@ -17,7 +17,7 @@ from .dataset_configs import SPEED_CONFIG, get_all_dataset_configs
 from .enums import GenerativeType, TaskGroup
 from .exceptions import NeedsExtraInstalled
 from .scores import aggregate_scores
-from .task_utils import (
+from .task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,
@@ -44,7 +44,7 @@ class HumanEvaluator:
         description: str,
         dummy_model_id: str = "mistralai/Mistral-7B-v0.1",
     ) -> None:
-        """
+        """Initialise the HumanEvaluator.
 
         Args:
             annotator_id:
euroeval/languages.py
CHANGED
@@ -17,6 +17,26 @@ def get_all_languages() -> dict[str, Language]:
     return {cfg.code: cfg for cfg in globals().values() if isinstance(cfg, Language)}
 
 
+### Currently Supported Languages ###
+DA = Language(code="da", name="Danish", _and_separator="og", _or_separator="eller")
+NL = Language(code="nl", name="Dutch", _and_separator="en", _or_separator="of")
+EN = Language(code="en", name="English", _and_separator="and", _or_separator="or")
+FO = Language(code="fo", name="Faroese", _and_separator="og", _or_separator="ella")
+FR = Language(code="fr", name="French", _and_separator="et", _or_separator="ou")
+DE = Language(code="de", name="German", _and_separator="und", _or_separator="oder")
+IS = Language(code="is", name="Icelandic", _and_separator="og", _or_separator="eða")
+IT = Language(code="it", name="Italian", _and_separator="e", _or_separator="o")
+NO = Language(code="no", name="Norwegian", _and_separator="og", _or_separator="eller")
+NB = Language(
+    code="nb", name="Norwegian Bokmål", _and_separator="og", _or_separator="eller"
+)
+NN = Language(
+    code="nn", name="Norwegian Nynorsk", _and_separator="og", _or_separator="eller"
+)
+ES = Language(code="es", name="Spanish", _and_separator="y", _or_separator="o")
+SV = Language(code="sv", name="Swedish", _and_separator="och", _or_separator="eller")
+
+
 AB = Language(code="ab", name="Abkhazian")
 AA = Language(code="aa", name="Afar")
 AF = Language(code="af", name="Afrikaans")
@@ -52,25 +72,19 @@ CO = Language(code="co", name="Corsican")
 CR = Language(code="cr", name="Cree")
 HR = Language(code="hr", name="Croatian")
 CS = Language(code="cs", name="Czech")
-DA = Language(code="da", name="Danish")
 DV = Language(code="dv", name="Divehi")
-NL = Language(code="nl", name="Dutch")
 DZ = Language(code="dz", name="Dzongkha")
-EN = Language(code="en", name="English")
 EO = Language(code="eo", name="Esperanto")
 ET = Language(code="et", name="Estonian")
 EE = Language(code="ee", name="Ewe")
-FO = Language(code="fo", name="Faroese")
 FJ = Language(code="fj", name="Fijian")
 FI = Language(code="fi", name="Finnish")
-FR = Language(code="fr", name="French")
 FY = Language(code="fy", name="Western Frisian")
 FF = Language(code="ff", name="Fulah")
 GD = Language(code="gd", name="Gaelic")
 GL = Language(code="gl", name="Galician")
 LG = Language(code="lg", name="Ganda")
 KA = Language(code="ka", name="Georgian")
-DE = Language(code="de", name="German")
 EL = Language(code="el", name="Greek")
 KL = Language(code="kl", name="Greenlandic")
 GN = Language(code="gn", name="Guarani")
@@ -82,7 +96,6 @@ HZ = Language(code="hz", name="Herero")
 HI = Language(code="hi", name="Hindi")
 HO = Language(code="ho", name="Hiri Motu")
 HU = Language(code="hu", name="Hungarian")
-IS = Language(code="is", name="Icelandic")
 IO = Language(code="io", name="Ido")
 IG = Language(code="ig", name="Igbo")
 ID = Language(code="id", name="Indonesian")
@@ -91,7 +104,6 @@ IE = Language(code="ie", name="Interlingue")
 IU = Language(code="iu", name="Inuktitut")
 IK = Language(code="ik", name="Inupiaq")
 GA = Language(code="ga", name="Irish")
-IT = Language(code="it", name="Italian")
 JA = Language(code="ja", name="Japanese")
 KN = Language(code="kn", name="Kannada")
 KR = Language(code="kr", name="Kanuri")
@@ -130,9 +142,6 @@ ND = Language(code="nd", name="Northern Ndebele")
 NR = Language(code="nr", name="South Ndebele")
 NG = Language(code="ng", name="Ndonga")
 NE = Language(code="ne", name="Nepali")
-NO = Language(code="no", name="Norwegian")
-NB = Language(code="nb", name="Norwegian Bokmål")
-NN = Language(code="nn", name="Norwegian Nynorsk")
 II = Language(code="ii", name="Sichuan Yi")
 OC = Language(code="oc", name="Occitan")
 OJ = Language(code="oj", name="Ojibwa")
@@ -163,11 +172,9 @@ SK = Language(code="sk", name="Slovak")
 SL = Language(code="sl", name="Slovenian")
 SO = Language(code="so", name="Somali")
 ST = Language(code="st", name="Sotho")
-ES = Language(code="es", name="Spanish")
 SU = Language(code="su", name="Sundanese")
 SW = Language(code="sw", name="Swahili")
 SS = Language(code="ss", name="Swati")
-SV = Language(code="sv", name="Swedish")
 TL = Language(code="tl", name="Tagalog")
 TY = Language(code="ty", name="Tahitian")
 TG = Language(code="tg", name="Tajik")
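The supported languages move to the top of the module and gain _and_separator/_or_separator fields, which presumably let prompts join alternatives in the target language. Purely illustrative — join_with_or is a hypothetical helper, not a EuroEval function:

    def join_with_or(items: list[str], or_separator: str) -> str:
        # Hypothetical helper for "a, b eller c" style joining.
        if len(items) < 2:
            return "".join(items)
        return f"{', '.join(items[:-1])} {or_separator} {items[-1]}"

    # Norwegian defines _or_separator="eller":
    print(join_with_or(["positiv", "negativ", "nøytral"], "eller"))
    # -> "positiv, negativ eller nøytral"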
euroeval/model_cache.py
CHANGED
euroeval/model_loading.py
CHANGED
@@ -8,9 +8,8 @@ from .benchmark_modules import (
     LiteLLMModel,
     VLLMModel,
 )
-from .constants import GENERATIVE_DATASET_TASK_GROUPS
 from .enums import InferenceBackend, ModelType
-from .exceptions import InvalidBenchmark, InvalidModel
+from .exceptions import InvalidModel
 
 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
@@ -59,16 +58,6 @@ def load_model(
         f"inference backend {model_config.inference_backend!r}."
     )
 
-    # Refuse to benchmark non-generative models on generative tasks
-    if (
-        dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
-        and not model_config.model_type == ModelType.GENERATIVE
-    ):
-        raise InvalidBenchmark(
-            f"Cannot benchmark non-generative model {model_config.model_id!r} on "
-            f"generative task {dataset_config.task.name!r}."
-        )
-
     model = model_class(
         model_config=model_config,
         dataset_config=dataset_config,
euroeval/prompt_templates/__init__.py
ADDED
@@ -0,0 +1,8 @@
+"""The different prompt templates used in EuroEval."""
+
+from .linguistic_acceptability import LA_TEMPLATES
+from .multiple_choice import MULTIPLE_CHOICE_TEMPLATES
+from .named_entity_recognition import NER_TEMPLATES
+from .reading_comprehension import RC_TEMPLATES
+from .sentiment_classification import SENT_TEMPLATES
+from .summarization import SUMM_TEMPLATES
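This new package re-exports one template table per task. The diff does not show how the *_TEMPLATES objects are keyed; a plausible reading, given that the deleted monolithic dataset_configs.py carried per-language prompts, is a per-language mapping — treat the sketch below as an assumption, not the package's documented API:

    from euroeval.prompt_templates import SENT_TEMPLATES

    # Assumption: each *_TEMPLATES is a per-language mapping; inspect it to
    # confirm the key type before relying on it.
    print(type(SENT_TEMPLATES))
    if isinstance(SENT_TEMPLATES, dict):
        print(sorted(str(key) for key in SENT_TEMPLATES)[:5])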