EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +32 -14
- euroeval/benchmark_config_factory.py +92 -180
- euroeval/benchmark_modules/base.py +49 -39
- euroeval/benchmark_modules/fresh.py +35 -21
- euroeval/benchmark_modules/hf.py +280 -244
- euroeval/benchmark_modules/litellm.py +752 -312
- euroeval/benchmark_modules/vllm.py +570 -268
- euroeval/benchmarker.py +651 -528
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +49 -38
- euroeval/constants.py +44 -25
- euroeval/data_loading.py +111 -55
- euroeval/data_models.py +490 -323
- euroeval/dataset_configs/__init__.py +26 -4
- euroeval/dataset_configs/bosnian.py +39 -0
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/croatian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +78 -50
- euroeval/dataset_configs/dutch.py +74 -44
- euroeval/dataset_configs/english.py +71 -36
- euroeval/dataset_configs/estonian.py +111 -0
- euroeval/dataset_configs/faroese.py +25 -18
- euroeval/dataset_configs/finnish.py +63 -26
- euroeval/dataset_configs/french.py +65 -32
- euroeval/dataset_configs/german.py +77 -36
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +68 -57
- euroeval/dataset_configs/italian.py +68 -36
- euroeval/dataset_configs/latvian.py +87 -0
- euroeval/dataset_configs/lithuanian.py +64 -0
- euroeval/dataset_configs/norwegian.py +98 -72
- euroeval/dataset_configs/polish.py +96 -0
- euroeval/dataset_configs/portuguese.py +63 -40
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/slovene.py +56 -0
- euroeval/dataset_configs/spanish.py +68 -34
- euroeval/dataset_configs/swedish.py +82 -41
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/enums.py +12 -6
- euroeval/exceptions.py +21 -1
- euroeval/finetuning.py +34 -26
- euroeval/generation.py +76 -41
- euroeval/generation_utils.py +169 -34
- euroeval/languages.py +1020 -188
- euroeval/logging_utils.py +268 -0
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +85 -0
- euroeval/metrics/huggingface.py +216 -0
- euroeval/metrics/llm_as_a_judge.py +260 -0
- euroeval/metrics/pipeline.py +289 -0
- euroeval/metrics/speed.py +48 -0
- euroeval/model_cache.py +40 -21
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +157 -22
- euroeval/prompt_templates/multiple_choice.py +159 -17
- euroeval/prompt_templates/named_entity_recognition.py +318 -21
- euroeval/prompt_templates/reading_comprehension.py +207 -16
- euroeval/prompt_templates/sentiment_classification.py +205 -22
- euroeval/prompt_templates/summarization.py +122 -22
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +20 -9
- euroeval/speed_benchmark.py +11 -12
- euroeval/task_group_utils/multiple_choice_classification.py +21 -12
- euroeval/task_group_utils/question_answering.py +101 -73
- euroeval/task_group_utils/sequence_classification.py +144 -61
- euroeval/task_group_utils/text_to_text.py +33 -12
- euroeval/task_group_utils/token_classification.py +86 -89
- euroeval/tasks.py +75 -16
- euroeval/tokenisation_utils.py +603 -0
- euroeval/types.py +17 -11
- euroeval/utils.py +332 -137
- euroeval-16.7.1.dist-info/METADATA +623 -0
- euroeval-16.7.1.dist-info/RECORD +84 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -737
- euroeval/metrics.py +0 -452
- euroeval/tokenization_utils.py +0 -498
- euroeval-15.12.0.dist-info/METADATA +0 -285
- euroeval-15.12.0.dist-info/RECORD +0 -63
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""All Serbian dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import SERBIAN
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
MMS_SR_CONFIG = DatasetConfig(
|
|
10
|
+
name="mms-sr",
|
|
11
|
+
pretty_name="MMS-sr",
|
|
12
|
+
source="EuroEval/mms-sr-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[SERBIAN],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
SCALA_SR_CONFIG = DatasetConfig(
|
|
18
|
+
name="scala-sr",
|
|
19
|
+
pretty_name="ScaLA-sr",
|
|
20
|
+
source="EuroEval/scala-sr",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[SERBIAN],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
UNER_SR_CONFIG = DatasetConfig(
|
|
26
|
+
name="uner-sr",
|
|
27
|
+
pretty_name="UNER-sr",
|
|
28
|
+
source="EuroEval/uner-sr-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[SERBIAN],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
MULTI_WIKI_QA_SR_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-sr",
|
|
35
|
+
pretty_name="MultiWikiQA-sr",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-sr-mini",
|
|
37
|
+
task=RC,
|
|
38
|
+
languages=[SERBIAN],
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
LR_SUM_SR_CONFIG = DatasetConfig(
|
|
42
|
+
name="lr-sum-sr",
|
|
43
|
+
pretty_name="LRSum-sr",
|
|
44
|
+
source="EuroEval/lr-sum-sr-mini",
|
|
45
|
+
task=SUMM,
|
|
46
|
+
languages=[SERBIAN],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
MMLU_SR_CONFIG = DatasetConfig(
|
|
50
|
+
name="mmlu-sr",
|
|
51
|
+
pretty_name="MMLU-sr",
|
|
52
|
+
source="EuroEval/mmlu-sr-mini",
|
|
53
|
+
task=KNOW,
|
|
54
|
+
languages=[SERBIAN],
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
WINOGRANDE_SR_CONFIG = DatasetConfig(
|
|
58
|
+
name="winogrande-sr",
|
|
59
|
+
pretty_name="Winogrande-sr",
|
|
60
|
+
source="EuroEval/winogrande-sr",
|
|
61
|
+
task=COMMON_SENSE,
|
|
62
|
+
languages=[SERBIAN],
|
|
63
|
+
_labels=["a", "b"],
|
|
64
|
+
)
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""All Slovak dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import SLOVAK
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
CSFD_SENTIMENT_SK_CONFIG = DatasetConfig(
|
|
10
|
+
name="csfd-sentiment-sk",
|
|
11
|
+
pretty_name="CSFD Sentiment SK",
|
|
12
|
+
source="EuroEval/csfd-sentiment-sk-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[SLOVAK],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
SCALA_SK_CONFIG = DatasetConfig(
|
|
18
|
+
name="scala-sk",
|
|
19
|
+
pretty_name="ScaLA-sk",
|
|
20
|
+
source="EuroEval/scala-sk",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[SLOVAK],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
UNER_SK_CONFIG = DatasetConfig(
|
|
26
|
+
name="uner-sk",
|
|
27
|
+
pretty_name="UNER-sk",
|
|
28
|
+
source="EuroEval/uner-sk-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[SLOVAK],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
MULTI_WIKI_QA_SK_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-sk",
|
|
35
|
+
pretty_name="MultiWikiQA-sk",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-sk-mini",
|
|
37
|
+
task=RC,
|
|
38
|
+
languages=[SLOVAK],
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
MMLU_SK_CONFIG = DatasetConfig(
|
|
42
|
+
name="mmlu-sk",
|
|
43
|
+
pretty_name="MMLU-sk",
|
|
44
|
+
source="EuroEval/mmlu-sk-mini",
|
|
45
|
+
task=KNOW,
|
|
46
|
+
languages=[SLOVAK],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
WINOGRANDE_SK_CONFIG = DatasetConfig(
|
|
50
|
+
name="winogrande-sk",
|
|
51
|
+
pretty_name="Winogrande-sk",
|
|
52
|
+
source="EuroEval/winogrande-sk",
|
|
53
|
+
task=COMMON_SENSE,
|
|
54
|
+
languages=[SLOVAK],
|
|
55
|
+
)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""All Slovene dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import SLOVENE
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
SENTINEWS_CONFIG = DatasetConfig(
|
|
10
|
+
name="sentinews",
|
|
11
|
+
pretty_name="Sentinews-sl",
|
|
12
|
+
source="EuroEval/sentinews-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[SLOVENE],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
SCALA_SL_CONFIG = DatasetConfig(
|
|
18
|
+
name="scala-sl",
|
|
19
|
+
pretty_name="ScaLA-sl",
|
|
20
|
+
source="EuroEval/scala-sl",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[SLOVENE],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
SSJ500K_NER_CONFIG = DatasetConfig(
|
|
26
|
+
name="ssj500k-ner",
|
|
27
|
+
pretty_name="ssj500k-NER",
|
|
28
|
+
source="EuroEval/ssj500k-ner-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[SLOVENE],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
MULTI_WIKI_QA_SL_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-sl",
|
|
35
|
+
pretty_name="MultiWikiQA-sl",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-sl-mini",
|
|
37
|
+
task=RC,
|
|
38
|
+
languages=[SLOVENE],
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
MMLU_SL_CONFIG = DatasetConfig(
|
|
42
|
+
name="mmlu-sl",
|
|
43
|
+
pretty_name="MMLU-sl",
|
|
44
|
+
source="EuroEval/mmlu-sl-mini",
|
|
45
|
+
task=KNOW,
|
|
46
|
+
languages=[SLOVENE],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
WINOGRANDE_SL_CONFIG = DatasetConfig(
|
|
50
|
+
name="winogrande-sl",
|
|
51
|
+
pretty_name="Winogrande-sl",
|
|
52
|
+
source="EuroEval/winogrande-sl",
|
|
53
|
+
task=COMMON_SENSE,
|
|
54
|
+
languages=[SLOVENE],
|
|
55
|
+
_labels=["a", "b"],
|
|
56
|
+
)
|
|
@@ -1,69 +1,76 @@
|
|
|
1
1
|
"""All Spanish dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
5
|
-
from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
4
|
+
from ..languages import SPANISH
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
SENTIMENT_HEADLINES_CONFIG = DatasetConfig(
|
|
10
10
|
name="sentiment-headlines-es",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/sentiment-headlines-es",
|
|
11
|
+
pretty_name="Sentiment Headlines ES",
|
|
12
|
+
source="EuroEval/sentiment-headlines-es",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[SPANISH],
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
SCALA_ES_CONFIG = DatasetConfig(
|
|
19
18
|
name="scala-es",
|
|
20
|
-
pretty_name="
|
|
21
|
-
|
|
19
|
+
pretty_name="ScaLA-es",
|
|
20
|
+
source="EuroEval/scala-es",
|
|
22
21
|
task=LA,
|
|
23
|
-
languages=[
|
|
22
|
+
languages=[SPANISH],
|
|
24
23
|
)
|
|
25
24
|
|
|
26
25
|
CONLL_ES_CONFIG = DatasetConfig(
|
|
27
26
|
name="conll-es",
|
|
28
|
-
pretty_name="
|
|
29
|
-
"
|
|
30
|
-
huggingface_id="EuroEval/conll-es-mini",
|
|
27
|
+
pretty_name="CoNLL-es",
|
|
28
|
+
source="EuroEval/conll-es-mini",
|
|
31
29
|
task=NER,
|
|
32
|
-
languages=[
|
|
30
|
+
languages=[SPANISH],
|
|
33
31
|
)
|
|
34
32
|
|
|
35
33
|
MLQA_ES_CONFIG = DatasetConfig(
|
|
36
34
|
name="mlqa-es",
|
|
37
|
-
pretty_name="
|
|
38
|
-
|
|
35
|
+
pretty_name="MLQA-es",
|
|
36
|
+
source="EuroEval/mlqa-es",
|
|
39
37
|
task=RC,
|
|
40
|
-
languages=[
|
|
38
|
+
languages=[SPANISH],
|
|
41
39
|
)
|
|
42
40
|
|
|
43
41
|
MLSUM_ES_CONFIG = DatasetConfig(
|
|
44
42
|
name="mlsum-es",
|
|
45
|
-
pretty_name="
|
|
46
|
-
|
|
43
|
+
pretty_name="MLSUM-es",
|
|
44
|
+
source="EuroEval/mlsum-es-mini",
|
|
47
45
|
task=SUMM,
|
|
48
|
-
languages=[
|
|
46
|
+
languages=[SPANISH],
|
|
49
47
|
)
|
|
50
48
|
|
|
51
49
|
MMLU_ES_CONFIG = DatasetConfig(
|
|
52
50
|
name="mmlu-es",
|
|
53
|
-
pretty_name="
|
|
54
|
-
"
|
|
55
|
-
huggingface_id="EuroEval/mmlu-es-mini",
|
|
51
|
+
pretty_name="MMLU-es",
|
|
52
|
+
source="EuroEval/mmlu-es-mini",
|
|
56
53
|
task=KNOW,
|
|
57
|
-
languages=[
|
|
54
|
+
languages=[SPANISH],
|
|
58
55
|
)
|
|
59
56
|
|
|
60
57
|
HELLASWAG_ES_CONFIG = DatasetConfig(
|
|
61
58
|
name="hellaswag-es",
|
|
62
|
-
pretty_name="
|
|
63
|
-
"
|
|
64
|
-
huggingface_id="EuroEval/hellaswag-es-mini",
|
|
59
|
+
pretty_name="HellaSwag-es",
|
|
60
|
+
source="EuroEval/hellaswag-es-mini",
|
|
65
61
|
task=COMMON_SENSE,
|
|
66
|
-
languages=[
|
|
62
|
+
languages=[SPANISH],
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
VALEU_ES_CONFIG = DatasetConfig(
|
|
66
|
+
name="valeu-es",
|
|
67
|
+
pretty_name="VaLEU-es",
|
|
68
|
+
source="EuroEval/european-values-es",
|
|
69
|
+
task=EUROPEAN_VALUES,
|
|
70
|
+
languages=[SPANISH],
|
|
71
|
+
splits=["test"],
|
|
72
|
+
bootstrap_samples=False,
|
|
73
|
+
_instruction_prompt="{text}",
|
|
67
74
|
)
|
|
68
75
|
|
|
69
76
|
|
|
@@ -71,19 +78,46 @@ HELLASWAG_ES_CONFIG = DatasetConfig(
|
|
|
71
78
|
|
|
72
79
|
XQUAD_ES_CONFIG = DatasetConfig(
|
|
73
80
|
name="xquad-es",
|
|
74
|
-
pretty_name="
|
|
75
|
-
|
|
81
|
+
pretty_name="XQuAD-es",
|
|
82
|
+
source="EuroEval/xquad-es",
|
|
76
83
|
task=RC,
|
|
77
|
-
languages=[
|
|
84
|
+
languages=[SPANISH],
|
|
78
85
|
unofficial=True,
|
|
79
86
|
)
|
|
80
87
|
|
|
81
88
|
BELEBELE_ES_CONFIG = DatasetConfig(
|
|
82
89
|
name="belebele-es",
|
|
83
|
-
pretty_name="
|
|
84
|
-
"
|
|
85
|
-
huggingface_id="EuroEval/belebele-es-mini",
|
|
90
|
+
pretty_name="Belebele-es",
|
|
91
|
+
source="EuroEval/belebele-es-mini",
|
|
86
92
|
task=MCRC,
|
|
87
|
-
languages=[
|
|
93
|
+
languages=[SPANISH],
|
|
94
|
+
unofficial=True,
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
MULTI_WIKI_QA_ES_CONFIG = DatasetConfig(
|
|
98
|
+
name="multi-wiki-qa-es",
|
|
99
|
+
pretty_name="MultiWikiQA-es",
|
|
100
|
+
source="EuroEval/multi-wiki-qa-es-mini",
|
|
101
|
+
task=RC,
|
|
102
|
+
languages=[SPANISH],
|
|
103
|
+
unofficial=True,
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
GOLDENSWAG_ES_CONFIG = DatasetConfig(
|
|
107
|
+
name="goldenswag-es",
|
|
108
|
+
pretty_name="GoldenSwag-es",
|
|
109
|
+
source="EuroEval/goldenswag-es-mini",
|
|
110
|
+
task=COMMON_SENSE,
|
|
111
|
+
languages=[SPANISH],
|
|
112
|
+
unofficial=True,
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
WINOGRANDE_ES_CONFIG = DatasetConfig(
|
|
116
|
+
name="winogrande-es",
|
|
117
|
+
pretty_name="Winogrande-es",
|
|
118
|
+
source="EuroEval/winogrande-es",
|
|
119
|
+
task=COMMON_SENSE,
|
|
120
|
+
languages=[SPANISH],
|
|
121
|
+
_labels=["a", "b"],
|
|
88
122
|
unofficial=True,
|
|
89
123
|
)
|
|
@@ -1,70 +1,76 @@
|
|
|
1
1
|
"""All Swedish dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
5
|
-
from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
4
|
+
from ..languages import SWEDISH
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
SWEREC_CONFIG = DatasetConfig(
|
|
10
10
|
name="swerec",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/swerec-mini",
|
|
11
|
+
pretty_name="SweReC",
|
|
12
|
+
source="EuroEval/swerec-mini",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[SWEDISH],
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
SCALA_SV_CONFIG = DatasetConfig(
|
|
19
18
|
name="scala-sv",
|
|
20
|
-
pretty_name="
|
|
21
|
-
|
|
19
|
+
pretty_name="ScaLA-sv",
|
|
20
|
+
source="EuroEval/scala-sv",
|
|
22
21
|
task=LA,
|
|
23
|
-
languages=[
|
|
22
|
+
languages=[SWEDISH],
|
|
24
23
|
)
|
|
25
24
|
|
|
26
25
|
SUC3_CONFIG = DatasetConfig(
|
|
27
26
|
name="suc3",
|
|
28
|
-
pretty_name="
|
|
29
|
-
"
|
|
30
|
-
huggingface_id="EuroEval/suc3-mini",
|
|
27
|
+
pretty_name="SUC3",
|
|
28
|
+
source="EuroEval/suc3-mini",
|
|
31
29
|
task=NER,
|
|
32
|
-
languages=[
|
|
30
|
+
languages=[SWEDISH],
|
|
33
31
|
)
|
|
34
32
|
|
|
35
|
-
|
|
36
|
-
name="
|
|
37
|
-
pretty_name="
|
|
38
|
-
"
|
|
39
|
-
huggingface_id="EuroEval/scandiqa-sv-mini",
|
|
33
|
+
MULTI_WIKI_QA_SV_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-sv",
|
|
35
|
+
pretty_name="MultiWikiQA-sv",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-sv-mini",
|
|
40
37
|
task=RC,
|
|
41
|
-
languages=[
|
|
38
|
+
languages=[SWEDISH],
|
|
42
39
|
)
|
|
43
40
|
|
|
44
41
|
SWEDN_CONFIG = DatasetConfig(
|
|
45
42
|
name="swedn",
|
|
46
|
-
pretty_name="
|
|
47
|
-
|
|
43
|
+
pretty_name="SweDN",
|
|
44
|
+
source="EuroEval/swedn-mini",
|
|
48
45
|
task=SUMM,
|
|
49
|
-
languages=[
|
|
46
|
+
languages=[SWEDISH],
|
|
50
47
|
)
|
|
51
48
|
|
|
52
49
|
MMLU_SV_CONFIG = DatasetConfig(
|
|
53
50
|
name="mmlu-sv",
|
|
54
|
-
pretty_name="
|
|
55
|
-
"
|
|
56
|
-
huggingface_id="EuroEval/mmlu-sv-mini",
|
|
51
|
+
pretty_name="MMLU-sv",
|
|
52
|
+
source="EuroEval/mmlu-sv-mini",
|
|
57
53
|
task=KNOW,
|
|
58
|
-
languages=[
|
|
54
|
+
languages=[SWEDISH],
|
|
59
55
|
)
|
|
60
56
|
|
|
61
57
|
HELLASWAG_SV_CONFIG = DatasetConfig(
|
|
62
58
|
name="hellaswag-sv",
|
|
63
|
-
pretty_name="
|
|
64
|
-
"
|
|
65
|
-
huggingface_id="EuroEval/hellaswag-sv-mini",
|
|
59
|
+
pretty_name="HellaSwag-sv",
|
|
60
|
+
source="EuroEval/hellaswag-sv-mini",
|
|
66
61
|
task=COMMON_SENSE,
|
|
67
|
-
languages=[
|
|
62
|
+
languages=[SWEDISH],
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
VALEU_SV_CONFIG = DatasetConfig(
|
|
66
|
+
name="valeu-sv",
|
|
67
|
+
pretty_name="VaLEU-sv",
|
|
68
|
+
source="EuroEval/european-values-sv",
|
|
69
|
+
task=EUROPEAN_VALUES,
|
|
70
|
+
languages=[SWEDISH],
|
|
71
|
+
splits=["test"],
|
|
72
|
+
bootstrap_samples=False,
|
|
73
|
+
_instruction_prompt="{text}",
|
|
68
74
|
)
|
|
69
75
|
|
|
70
76
|
|
|
@@ -72,29 +78,64 @@ HELLASWAG_SV_CONFIG = DatasetConfig(
|
|
|
72
78
|
|
|
73
79
|
SCHIBSTED_SV_CONFIG = DatasetConfig(
|
|
74
80
|
name="schibsted-sv",
|
|
75
|
-
pretty_name="
|
|
76
|
-
|
|
81
|
+
pretty_name="Schibsted-sv",
|
|
82
|
+
source="EuroEval/schibsted-article-summaries-sv",
|
|
77
83
|
task=SUMM,
|
|
78
|
-
languages=[
|
|
84
|
+
languages=[SWEDISH],
|
|
79
85
|
unofficial=True,
|
|
80
86
|
)
|
|
81
87
|
|
|
82
88
|
ARC_SV_CONFIG = DatasetConfig(
|
|
83
89
|
name="arc-sv",
|
|
84
|
-
pretty_name="
|
|
85
|
-
"
|
|
86
|
-
huggingface_id="EuroEval/arc-sv-mini",
|
|
90
|
+
pretty_name="ARC-sv",
|
|
91
|
+
source="EuroEval/arc-sv-mini",
|
|
87
92
|
task=KNOW,
|
|
88
|
-
languages=[
|
|
93
|
+
languages=[SWEDISH],
|
|
89
94
|
unofficial=True,
|
|
90
95
|
)
|
|
91
96
|
|
|
92
97
|
BELEBELE_SV_CONFIG = DatasetConfig(
|
|
93
98
|
name="belebele-sv",
|
|
94
|
-
pretty_name="
|
|
95
|
-
"
|
|
96
|
-
huggingface_id="EuroEval/belebele-sv-mini",
|
|
99
|
+
pretty_name="Belebele-sv",
|
|
100
|
+
source="EuroEval/belebele-sv-mini",
|
|
97
101
|
task=MCRC,
|
|
98
|
-
languages=[
|
|
102
|
+
languages=[SWEDISH],
|
|
103
|
+
unofficial=True,
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
SCANDIQA_SV_CONFIG = DatasetConfig(
|
|
107
|
+
name="scandiqa-sv",
|
|
108
|
+
pretty_name="ScandiQA-sv",
|
|
109
|
+
source="EuroEval/scandiqa-sv-mini",
|
|
110
|
+
task=RC,
|
|
111
|
+
languages=[SWEDISH],
|
|
112
|
+
unofficial=True,
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
GOLDENSWAG_SV_CONFIG = DatasetConfig(
|
|
116
|
+
name="goldenswag-sv",
|
|
117
|
+
pretty_name="GoldenSwag-sv",
|
|
118
|
+
source="EuroEval/goldenswag-sv-mini",
|
|
119
|
+
task=COMMON_SENSE,
|
|
120
|
+
languages=[SWEDISH],
|
|
121
|
+
unofficial=True,
|
|
122
|
+
)
|
|
123
|
+
|
|
124
|
+
WINOGRANDE_SV_CONFIG = DatasetConfig(
|
|
125
|
+
name="winogrande-sv",
|
|
126
|
+
pretty_name="Winogrande-sv",
|
|
127
|
+
source="EuroEval/winogrande-sv",
|
|
128
|
+
task=COMMON_SENSE,
|
|
129
|
+
languages=[SWEDISH],
|
|
130
|
+
_labels=["a", "b"],
|
|
131
|
+
unofficial=True,
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
SKOLPROV_CONFIG = DatasetConfig(
|
|
135
|
+
name="skolprov",
|
|
136
|
+
pretty_name="Skolprov",
|
|
137
|
+
source="EuroEval/skolprov",
|
|
138
|
+
task=KNOW,
|
|
139
|
+
languages=[SWEDISH],
|
|
99
140
|
unofficial=True,
|
|
100
141
|
)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""All Ukrainian dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import UKRAINIAN
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
CROSS_DOMAIN_UK_REVIEWS_CONFIG = DatasetConfig(
|
|
10
|
+
name="cross-domain-uk-reviews",
|
|
11
|
+
pretty_name="Cross Domain Ukrainian Reviews",
|
|
12
|
+
source="EuroEval/cross-domain-uk-reviews-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[UKRAINIAN],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
SCALA_UK_CONFIG = DatasetConfig(
|
|
18
|
+
name="scala-uk",
|
|
19
|
+
pretty_name="ScaLA-uk",
|
|
20
|
+
source="EuroEval/scala-uk",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[UKRAINIAN],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
NER_UK_CONFIG = DatasetConfig(
|
|
26
|
+
name="ner-uk",
|
|
27
|
+
pretty_name="NER-uk",
|
|
28
|
+
source="EuroEval/ner-uk-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[UKRAINIAN],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
MULTI_WIKI_QA_UK_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-uk",
|
|
35
|
+
pretty_name="MultiWikiQA-uk",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-uk-mini",
|
|
37
|
+
task=RC,
|
|
38
|
+
languages=[UKRAINIAN],
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
LR_SUM_UK_CONFIG = DatasetConfig(
|
|
42
|
+
name="lr-sum-uk",
|
|
43
|
+
pretty_name="LRSum-uk",
|
|
44
|
+
source="EuroEval/lr-sum-uk-mini",
|
|
45
|
+
task=SUMM,
|
|
46
|
+
languages=[UKRAINIAN],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
GLOBAL_MMLU_UK_CONFIG = DatasetConfig(
|
|
50
|
+
name="global-mmlu-uk",
|
|
51
|
+
pretty_name="GlobalMMLU-uk",
|
|
52
|
+
source="EuroEval/global-mmlu-uk-mini",
|
|
53
|
+
task=KNOW,
|
|
54
|
+
languages=[UKRAINIAN],
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
WINOGRANDE_UK_CONFIG = DatasetConfig(
|
|
58
|
+
name="winogrande-uk",
|
|
59
|
+
pretty_name="Winogrande-uk",
|
|
60
|
+
source="EuroEval/winogrande-uk",
|
|
61
|
+
task=COMMON_SENSE,
|
|
62
|
+
languages=[UKRAINIAN],
|
|
63
|
+
_labels=["a", "b"],
|
|
64
|
+
)
|
euroeval/enums.py
CHANGED
|
@@ -12,6 +12,14 @@ class AutoStrEnum(str, Enum):
|
|
|
12
12
|
) -> str:
|
|
13
13
|
return name.lower()
|
|
14
14
|
|
|
15
|
+
def __str__(self) -> str:
|
|
16
|
+
"""Return the value in upper case for better readability."""
|
|
17
|
+
return self.value.upper()
|
|
18
|
+
|
|
19
|
+
def __repr__(self) -> str:
|
|
20
|
+
"""Return the value in upper case for better readability."""
|
|
21
|
+
return self.value.upper()
|
|
22
|
+
|
|
15
23
|
|
|
16
24
|
class Device(AutoStrEnum):
|
|
17
25
|
"""The compute device to use for the evaluation.
|
|
@@ -40,14 +48,11 @@ class InferenceBackend(AutoStrEnum):
|
|
|
40
48
|
VLLM library.
|
|
41
49
|
LITELLM:
|
|
42
50
|
LiteLLM library.
|
|
43
|
-
NONE:
|
|
44
|
-
No inference backend used (e.g., for human evaluation).
|
|
45
51
|
"""
|
|
46
52
|
|
|
47
53
|
TRANSFORMERS = auto()
|
|
48
54
|
VLLM = auto()
|
|
49
55
|
LITELLM = auto()
|
|
50
|
-
NONE = auto()
|
|
51
56
|
|
|
52
57
|
|
|
53
58
|
class ModelType(AutoStrEnum):
|
|
@@ -58,13 +63,14 @@ class ModelType(AutoStrEnum):
|
|
|
58
63
|
An encoder (i.e., BERT-style) model.
|
|
59
64
|
GENERATIVE:
|
|
60
65
|
A generative model. Can be either decoder or encoder-decoder (aka seq2seq).
|
|
61
|
-
HUMAN:
|
|
62
|
-
Human evaluator.
|
|
63
66
|
"""
|
|
64
67
|
|
|
65
68
|
ENCODER = auto()
|
|
66
69
|
GENERATIVE = auto()
|
|
67
|
-
|
|
70
|
+
|
|
71
|
+
def __repr__(self) -> str:
|
|
72
|
+
"""Return the value in upper case for better readability."""
|
|
73
|
+
return self.value.upper()
|
|
68
74
|
|
|
69
75
|
|
|
70
76
|
class GenerativeType(AutoStrEnum):
|
euroeval/exceptions.py
CHANGED
|
@@ -118,6 +118,26 @@ class NeedsManualDependency(InvalidModel):
|
|
|
118
118
|
super().__init__(self.message)
|
|
119
119
|
|
|
120
120
|
|
|
121
|
+
class NeedsSystemDependency(InvalidModel):
|
|
122
|
+
"""The evaluation requires a system-level dependency to be installed."""
|
|
123
|
+
|
|
124
|
+
def __init__(self, dependency: str, instructions: str) -> None:
|
|
125
|
+
"""Initialise the exception.
|
|
126
|
+
|
|
127
|
+
Args:
|
|
128
|
+
dependency:
|
|
129
|
+
The system dependency that needs to be installed.
|
|
130
|
+
instructions:
|
|
131
|
+
Instructions on how to install the dependency.
|
|
132
|
+
"""
|
|
133
|
+
self.dependency = dependency
|
|
134
|
+
self.message = (
|
|
135
|
+
f"The model you are trying to load requires `{dependency}` to be "
|
|
136
|
+
f"installed. {instructions}"
|
|
137
|
+
)
|
|
138
|
+
super().__init__(self.message)
|
|
139
|
+
|
|
140
|
+
|
|
121
141
|
class NeedsAdditionalArgument(InvalidModel):
|
|
122
142
|
"""The evaluation requires additional arguments to the `euroeval` command."""
|
|
123
143
|
|
|
@@ -145,7 +165,7 @@ class NeedsAdditionalArgument(InvalidModel):
|
|
|
145
165
|
else:
|
|
146
166
|
self.message = (
|
|
147
167
|
f"The model you are trying to load requires the `{script_argument}` "
|
|
148
|
-
"argument
|
|
168
|
+
"argument to be passed to the `Benchmarker` class. Please pass the "
|
|
149
169
|
"argument and try again."
|
|
150
170
|
)
|
|
151
171
|
super().__init__(self.message)
|