EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +32 -14
- euroeval/benchmark_config_factory.py +92 -180
- euroeval/benchmark_modules/base.py +49 -39
- euroeval/benchmark_modules/fresh.py +35 -21
- euroeval/benchmark_modules/hf.py +280 -244
- euroeval/benchmark_modules/litellm.py +752 -312
- euroeval/benchmark_modules/vllm.py +570 -268
- euroeval/benchmarker.py +651 -528
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +49 -38
- euroeval/constants.py +44 -25
- euroeval/data_loading.py +111 -55
- euroeval/data_models.py +490 -323
- euroeval/dataset_configs/__init__.py +26 -4
- euroeval/dataset_configs/bosnian.py +39 -0
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/croatian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +78 -50
- euroeval/dataset_configs/dutch.py +74 -44
- euroeval/dataset_configs/english.py +71 -36
- euroeval/dataset_configs/estonian.py +111 -0
- euroeval/dataset_configs/faroese.py +25 -18
- euroeval/dataset_configs/finnish.py +63 -26
- euroeval/dataset_configs/french.py +65 -32
- euroeval/dataset_configs/german.py +77 -36
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +68 -57
- euroeval/dataset_configs/italian.py +68 -36
- euroeval/dataset_configs/latvian.py +87 -0
- euroeval/dataset_configs/lithuanian.py +64 -0
- euroeval/dataset_configs/norwegian.py +98 -72
- euroeval/dataset_configs/polish.py +96 -0
- euroeval/dataset_configs/portuguese.py +63 -40
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/slovene.py +56 -0
- euroeval/dataset_configs/spanish.py +68 -34
- euroeval/dataset_configs/swedish.py +82 -41
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/enums.py +12 -6
- euroeval/exceptions.py +21 -1
- euroeval/finetuning.py +34 -26
- euroeval/generation.py +76 -41
- euroeval/generation_utils.py +169 -34
- euroeval/languages.py +1020 -188
- euroeval/logging_utils.py +268 -0
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +85 -0
- euroeval/metrics/huggingface.py +216 -0
- euroeval/metrics/llm_as_a_judge.py +260 -0
- euroeval/metrics/pipeline.py +289 -0
- euroeval/metrics/speed.py +48 -0
- euroeval/model_cache.py +40 -21
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +157 -22
- euroeval/prompt_templates/multiple_choice.py +159 -17
- euroeval/prompt_templates/named_entity_recognition.py +318 -21
- euroeval/prompt_templates/reading_comprehension.py +207 -16
- euroeval/prompt_templates/sentiment_classification.py +205 -22
- euroeval/prompt_templates/summarization.py +122 -22
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +20 -9
- euroeval/speed_benchmark.py +11 -12
- euroeval/task_group_utils/multiple_choice_classification.py +21 -12
- euroeval/task_group_utils/question_answering.py +101 -73
- euroeval/task_group_utils/sequence_classification.py +144 -61
- euroeval/task_group_utils/text_to_text.py +33 -12
- euroeval/task_group_utils/token_classification.py +86 -89
- euroeval/tasks.py +75 -16
- euroeval/tokenisation_utils.py +603 -0
- euroeval/types.py +17 -11
- euroeval/utils.py +332 -137
- euroeval-16.7.1.dist-info/METADATA +623 -0
- euroeval-16.7.1.dist-info/RECORD +84 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -737
- euroeval/metrics.py +0 -452
- euroeval/tokenization_utils.py +0 -498
- euroeval-15.12.0.dist-info/METADATA +0 -285
- euroeval-15.12.0.dist-info/RECORD +0 -63
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -3,19 +3,33 @@
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
4
|
from ..languages import get_all_languages
|
|
5
5
|
from ..tasks import SPEED
|
|
6
|
+
from ..utils import load_custom_datasets_module
|
|
7
|
+
from .bosnian import * # noqa: F403
|
|
8
|
+
from .bulgarian import * # noqa: F403
|
|
9
|
+
from .croatian import * # noqa: F403
|
|
10
|
+
from .czech import * # noqa: F403
|
|
6
11
|
from .danish import * # noqa: F403
|
|
7
12
|
from .dutch import * # noqa: F403
|
|
8
13
|
from .english import * # noqa: F403
|
|
14
|
+
from .estonian import * # noqa: F403
|
|
9
15
|
from .faroese import * # noqa: F403
|
|
10
16
|
from .finnish import * # noqa: F403
|
|
11
17
|
from .french import * # noqa: F403
|
|
12
18
|
from .german import * # noqa: F403
|
|
19
|
+
from .greek import * # noqa: F403
|
|
13
20
|
from .icelandic import * # noqa: F403
|
|
14
21
|
from .italian import * # noqa: F403
|
|
22
|
+
from .latvian import * # noqa: F403
|
|
23
|
+
from .lithuanian import * # noqa: F403
|
|
15
24
|
from .norwegian import * # noqa: F403
|
|
25
|
+
from .polish import * # noqa: F403
|
|
16
26
|
from .portuguese import * # noqa: F403
|
|
27
|
+
from .serbian import * # noqa: F403
|
|
28
|
+
from .slovak import * # noqa: F403
|
|
29
|
+
from .slovene import * # noqa: F403
|
|
17
30
|
from .spanish import * # noqa: F403
|
|
18
31
|
from .swedish import * # noqa: F403
|
|
32
|
+
from .ukrainian import * # noqa: F403
|
|
19
33
|
|
|
20
34
|
|
|
21
35
|
def get_all_dataset_configs() -> dict[str, DatasetConfig]:
|
|
@@ -24,14 +38,21 @@ def get_all_dataset_configs() -> dict[str, DatasetConfig]:
|
|
|
24
38
|
Returns:
|
|
25
39
|
A mapping between names of datasets and their configurations.
|
|
26
40
|
"""
|
|
41
|
+
globals_dict = globals()
|
|
42
|
+
module = load_custom_datasets_module()
|
|
43
|
+
if module is not None:
|
|
44
|
+
globals_dict |= vars(module)
|
|
27
45
|
dataset_configs = [
|
|
28
|
-
cfg
|
|
46
|
+
cfg
|
|
47
|
+
for cfg in globals_dict.values()
|
|
48
|
+
if isinstance(cfg, DatasetConfig) and cfg.task != SPEED
|
|
29
49
|
]
|
|
30
50
|
assert len(dataset_configs) == len({cfg.name for cfg in dataset_configs}), (
|
|
31
51
|
"There are duplicate dataset configurations. Please ensure that each dataset "
|
|
32
52
|
"has a unique name."
|
|
33
53
|
)
|
|
34
|
-
|
|
54
|
+
mapping = {cfg.name: cfg for cfg in dataset_configs}
|
|
55
|
+
return mapping
|
|
35
56
|
|
|
36
57
|
|
|
37
58
|
def get_dataset_config(dataset_name: str) -> DatasetConfig:
|
|
@@ -56,8 +77,9 @@ def get_dataset_config(dataset_name: str) -> DatasetConfig:
|
|
|
56
77
|
|
|
57
78
|
SPEED_CONFIG = DatasetConfig(
|
|
58
79
|
name="speed",
|
|
59
|
-
pretty_name="
|
|
60
|
-
|
|
80
|
+
pretty_name="",
|
|
81
|
+
source="",
|
|
61
82
|
task=SPEED,
|
|
62
83
|
languages=list(get_all_languages().values()),
|
|
84
|
+
_logging_string="the speed estimation benchmark",
|
|
63
85
|
)
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""All Bosnian dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import BOSNIAN
|
|
5
|
+
from ..tasks import NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
MMS_BS_CONFIG = DatasetConfig(
|
|
10
|
+
name="mms-bs",
|
|
11
|
+
pretty_name="MMS-bs",
|
|
12
|
+
source="EuroEval/mms-bs-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[BOSNIAN],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
WIKIANN_BS_CONFIG = DatasetConfig(
|
|
18
|
+
name="wikiann-bs",
|
|
19
|
+
pretty_name="WikiANN-bs",
|
|
20
|
+
source="EuroEval/wikiann-bs-mini",
|
|
21
|
+
task=NER,
|
|
22
|
+
languages=[BOSNIAN],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
MULTI_WIKI_QA_BS_CONFIG = DatasetConfig(
|
|
26
|
+
name="multi-wiki-qa-bs",
|
|
27
|
+
pretty_name="MultiWikiQA-bs",
|
|
28
|
+
source="EuroEval/multi-wiki-qa-bs-mini",
|
|
29
|
+
task=RC,
|
|
30
|
+
languages=[BOSNIAN],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
LR_SUM_BS_CONFIG = DatasetConfig(
|
|
34
|
+
name="lr-sum-bs",
|
|
35
|
+
pretty_name="LRSum-bs",
|
|
36
|
+
source="EuroEval/lr-sum-bs-mini",
|
|
37
|
+
task=SUMM,
|
|
38
|
+
languages=[BOSNIAN],
|
|
39
|
+
)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""All Bulgarian dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import BULGARIAN
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
CINEXIO_CONFIG = DatasetConfig(
|
|
10
|
+
name="cinexio",
|
|
11
|
+
pretty_name="Cinexio",
|
|
12
|
+
source="EuroEval/cinexio-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[BULGARIAN],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
SCALA_BG_CONFIG = DatasetConfig(
|
|
18
|
+
name="scala-bg",
|
|
19
|
+
pretty_name="ScaLA-bg",
|
|
20
|
+
source="EuroEval/scala-bg",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[BULGARIAN],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
BG_NER_BSNLP_CONFIG = DatasetConfig(
|
|
26
|
+
name="bg-ner-bsnlp",
|
|
27
|
+
pretty_name="BG-NER-BSNLp",
|
|
28
|
+
source="EuroEval/bg-ner-bsnlp-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[BULGARIAN],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
MULTI_WIKI_QA_BG_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-bg",
|
|
35
|
+
pretty_name="MultiWikiQA-bg",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-bg-mini",
|
|
37
|
+
task=RC,
|
|
38
|
+
languages=[BULGARIAN],
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
EXAMS_BG_CONFIG = DatasetConfig(
|
|
42
|
+
name="exams-bg",
|
|
43
|
+
pretty_name="Exams-bg",
|
|
44
|
+
source="EuroEval/exams-bg-mini",
|
|
45
|
+
task=KNOW,
|
|
46
|
+
languages=[BULGARIAN],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
WINOGRANDE_BG_CONFIG = DatasetConfig(
|
|
50
|
+
name="winogrande-bg",
|
|
51
|
+
pretty_name="Winogrande-bg",
|
|
52
|
+
source="EuroEval/winogrande-bg",
|
|
53
|
+
task=COMMON_SENSE,
|
|
54
|
+
languages=[BULGARIAN],
|
|
55
|
+
_labels=["a", "b"],
|
|
56
|
+
)
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""All Croatian dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import CROATIAN
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
MMS_HR_CONFIG = DatasetConfig(
|
|
10
|
+
name="mms-hr",
|
|
11
|
+
pretty_name="MMS-hr",
|
|
12
|
+
source="EuroEval/mms-hr-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[CROATIAN],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
SCALA_HR_CONFIG = DatasetConfig(
|
|
18
|
+
name="scala-hr",
|
|
19
|
+
pretty_name="ScaLA-hr",
|
|
20
|
+
source="EuroEval/scala-hr",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[CROATIAN],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
WIKIANN_HR_CONFIG = DatasetConfig(
|
|
26
|
+
name="wikiann-hr",
|
|
27
|
+
pretty_name="WikiANN-hr",
|
|
28
|
+
source="EuroEval/wikiann-hr-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[CROATIAN],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
MULTI_WIKI_QA_HR_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-hr",
|
|
35
|
+
pretty_name="MultiWikiQA-hr",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-hr-mini",
|
|
37
|
+
task=RC,
|
|
38
|
+
languages=[CROATIAN],
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
MMLU_HR_CONFIG = DatasetConfig(
|
|
42
|
+
name="mmlu-hr",
|
|
43
|
+
pretty_name="MMLU-hr",
|
|
44
|
+
source="EuroEval/mmlu-hr-mini",
|
|
45
|
+
task=KNOW,
|
|
46
|
+
languages=[CROATIAN],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
WINOGRANDE_HR_CONFIG = DatasetConfig(
|
|
50
|
+
name="winogrande-hr",
|
|
51
|
+
pretty_name="Winogrande-hr",
|
|
52
|
+
source="EuroEval/winogrande-hr",
|
|
53
|
+
task=COMMON_SENSE,
|
|
54
|
+
languages=[CROATIAN],
|
|
55
|
+
_labels=["a", "b"],
|
|
56
|
+
)
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""All Czech dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import CZECH
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
CSFD_SENTIMENT_CONFIG = DatasetConfig(
|
|
10
|
+
name="csfd-sentiment",
|
|
11
|
+
pretty_name="CSFD Sentiment",
|
|
12
|
+
source="EuroEval/csfd-sentiment-mini",
|
|
13
|
+
task=SENT,
|
|
14
|
+
languages=[CZECH],
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
CS_GEC_CONFIG = DatasetConfig(
|
|
18
|
+
name="cs-gec",
|
|
19
|
+
pretty_name="CS-GEC",
|
|
20
|
+
source="EuroEval/cs-gec-mini",
|
|
21
|
+
task=LA,
|
|
22
|
+
languages=[CZECH],
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
PONER_CONFIG = DatasetConfig(
|
|
26
|
+
name="poner",
|
|
27
|
+
pretty_name="PoNER",
|
|
28
|
+
source="EuroEval/poner-mini",
|
|
29
|
+
task=NER,
|
|
30
|
+
languages=[CZECH],
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
SQAD_CONFIG = DatasetConfig(
|
|
34
|
+
name="sqad",
|
|
35
|
+
pretty_name="SQAD",
|
|
36
|
+
source="EuroEval/sqad-mini",
|
|
37
|
+
task=RC,
|
|
38
|
+
languages=[CZECH],
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
CZECH_NEWS_CONFIG = DatasetConfig(
|
|
42
|
+
name="czech-news",
|
|
43
|
+
pretty_name="Czech News",
|
|
44
|
+
source="EuroEval/czech-news-mini",
|
|
45
|
+
task=SUMM,
|
|
46
|
+
languages=[CZECH],
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
UMIMETO_QA_CONFIG = DatasetConfig(
|
|
50
|
+
name="umimeto-qa",
|
|
51
|
+
pretty_name="Umimeto QA",
|
|
52
|
+
source="EuroEval/umimeto-qa",
|
|
53
|
+
task=KNOW,
|
|
54
|
+
languages=[CZECH],
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
HELLASWAG_CS_CONFIG = DatasetConfig(
|
|
58
|
+
name="hellaswag-cs",
|
|
59
|
+
pretty_name="HellaSwag-cs",
|
|
60
|
+
source="EuroEval/hellaswag-cs-mini",
|
|
61
|
+
task=COMMON_SENSE,
|
|
62
|
+
languages=[CZECH],
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
### Unofficial datasets ###
|
|
67
|
+
|
|
68
|
+
SCALA_CS_CONFIG = DatasetConfig(
|
|
69
|
+
name="scala-cs",
|
|
70
|
+
pretty_name="ScaLA-cs",
|
|
71
|
+
source="EuroEval/scala-cs",
|
|
72
|
+
task=LA,
|
|
73
|
+
languages=[CZECH],
|
|
74
|
+
unofficial=True,
|
|
75
|
+
)
|
|
@@ -1,79 +1,83 @@
|
|
|
1
1
|
"""All Danish dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
5
|
-
from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
4
|
+
from ..languages import DANISH
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
ANGRY_TWEETS_CONFIG = DatasetConfig(
|
|
10
10
|
name="angry-tweets",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/angry-tweets-mini",
|
|
11
|
+
pretty_name="AngryTweets",
|
|
12
|
+
source="EuroEval/angry-tweets-mini",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[DANISH],
|
|
16
15
|
)
|
|
17
16
|
|
|
18
17
|
SCALA_DA_CONFIG = DatasetConfig(
|
|
19
18
|
name="scala-da",
|
|
20
|
-
pretty_name="
|
|
21
|
-
|
|
19
|
+
pretty_name="ScaLA-da",
|
|
20
|
+
source="EuroEval/scala-da",
|
|
22
21
|
task=LA,
|
|
23
|
-
languages=[
|
|
22
|
+
languages=[DANISH],
|
|
24
23
|
)
|
|
25
24
|
|
|
26
25
|
DANSK_CONFIG = DatasetConfig(
|
|
27
26
|
name="dansk",
|
|
28
|
-
pretty_name="
|
|
29
|
-
"
|
|
30
|
-
huggingface_id="EuroEval/dansk-mini",
|
|
27
|
+
pretty_name="DANSK",
|
|
28
|
+
source="EuroEval/dansk-mini",
|
|
31
29
|
task=NER,
|
|
32
|
-
languages=[
|
|
30
|
+
languages=[DANISH],
|
|
33
31
|
)
|
|
34
32
|
|
|
35
|
-
|
|
36
|
-
name="
|
|
37
|
-
pretty_name="
|
|
38
|
-
"
|
|
39
|
-
huggingface_id="EuroEval/scandiqa-da-mini",
|
|
33
|
+
MULTI_WIKI_QA_DA_CONFIG = DatasetConfig(
|
|
34
|
+
name="multi-wiki-qa-da",
|
|
35
|
+
pretty_name="MultiWikiQA-da",
|
|
36
|
+
source="EuroEval/multi-wiki-qa-da-mini",
|
|
40
37
|
task=RC,
|
|
41
|
-
languages=[
|
|
38
|
+
languages=[DANISH],
|
|
42
39
|
)
|
|
43
40
|
|
|
44
41
|
NORDJYLLAND_NEWS_CONFIG = DatasetConfig(
|
|
45
42
|
name="nordjylland-news",
|
|
46
|
-
pretty_name="
|
|
47
|
-
"
|
|
48
|
-
huggingface_id="EuroEval/nordjylland-news-mini",
|
|
43
|
+
pretty_name="Nordjylland News",
|
|
44
|
+
source="EuroEval/nordjylland-news-mini",
|
|
49
45
|
task=SUMM,
|
|
50
|
-
languages=[
|
|
46
|
+
languages=[DANISH],
|
|
51
47
|
)
|
|
52
48
|
|
|
53
49
|
DANSKE_TALEMAADER_CONFIG = DatasetConfig(
|
|
54
50
|
name="danske-talemaader",
|
|
55
|
-
pretty_name="
|
|
56
|
-
"
|
|
57
|
-
huggingface_id="EuroEval/danske-talemaader",
|
|
51
|
+
pretty_name="Danske Talemåder",
|
|
52
|
+
source="EuroEval/danske-talemaader",
|
|
58
53
|
task=KNOW,
|
|
59
|
-
languages=[
|
|
54
|
+
languages=[DANISH],
|
|
60
55
|
)
|
|
61
56
|
|
|
62
57
|
DANISH_CITIZEN_TESTS_CONFIG = DatasetConfig(
|
|
63
58
|
name="danish-citizen-tests",
|
|
64
|
-
pretty_name="
|
|
65
|
-
|
|
59
|
+
pretty_name="Danish Citizen Tests",
|
|
60
|
+
source="EuroEval/danish-citizen-tests-updated",
|
|
66
61
|
task=KNOW,
|
|
67
|
-
languages=[
|
|
62
|
+
languages=[DANISH],
|
|
68
63
|
)
|
|
69
64
|
|
|
70
65
|
HELLASWAG_DA_CONFIG = DatasetConfig(
|
|
71
66
|
name="hellaswag-da",
|
|
72
|
-
pretty_name="
|
|
73
|
-
"
|
|
74
|
-
huggingface_id="EuroEval/hellaswag-da-mini",
|
|
67
|
+
pretty_name="HellaSwag-da",
|
|
68
|
+
source="EuroEval/hellaswag-da-mini",
|
|
75
69
|
task=COMMON_SENSE,
|
|
76
|
-
languages=[
|
|
70
|
+
languages=[DANISH],
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
VALEU_DA_CONFIG = DatasetConfig(
|
|
74
|
+
name="valeu-da",
|
|
75
|
+
pretty_name="ValEU-da",
|
|
76
|
+
source="EuroEval/european-values-da",
|
|
77
|
+
task=EUROPEAN_VALUES,
|
|
78
|
+
languages=[DANISH],
|
|
79
|
+
splits=["test"],
|
|
80
|
+
bootstrap_samples=False,
|
|
77
81
|
)
|
|
78
82
|
|
|
79
83
|
|
|
@@ -81,40 +85,64 @@ HELLASWAG_DA_CONFIG = DatasetConfig(
|
|
|
81
85
|
|
|
82
86
|
DANE_CONFIG = DatasetConfig(
|
|
83
87
|
name="dane",
|
|
84
|
-
pretty_name="
|
|
85
|
-
"
|
|
86
|
-
huggingface_id="EuroEval/dane-mini",
|
|
88
|
+
pretty_name="DaNE",
|
|
89
|
+
source="EuroEval/dane-mini",
|
|
87
90
|
task=NER,
|
|
88
|
-
languages=[
|
|
91
|
+
languages=[DANISH],
|
|
89
92
|
unofficial=True,
|
|
90
93
|
)
|
|
91
94
|
|
|
92
95
|
MMLU_DA_CONFIG = DatasetConfig(
|
|
93
96
|
name="mmlu-da",
|
|
94
|
-
pretty_name="
|
|
95
|
-
"
|
|
96
|
-
huggingface_id="EuroEval/mmlu-da-mini",
|
|
97
|
+
pretty_name="MMLU-da",
|
|
98
|
+
source="EuroEval/mmlu-da-mini",
|
|
97
99
|
task=KNOW,
|
|
98
|
-
languages=[
|
|
100
|
+
languages=[DANISH],
|
|
99
101
|
unofficial=True,
|
|
100
102
|
)
|
|
101
103
|
|
|
102
104
|
ARC_DA_CONFIG = DatasetConfig(
|
|
103
105
|
name="arc-da",
|
|
104
|
-
pretty_name="
|
|
105
|
-
"
|
|
106
|
-
huggingface_id="EuroEval/arc-da-mini",
|
|
106
|
+
pretty_name="ARC-da",
|
|
107
|
+
source="EuroEval/arc-da-mini",
|
|
107
108
|
task=KNOW,
|
|
108
|
-
languages=[
|
|
109
|
+
languages=[DANISH],
|
|
109
110
|
unofficial=True,
|
|
110
111
|
)
|
|
111
112
|
|
|
112
113
|
BELEBELE_DA_CONFIG = DatasetConfig(
|
|
113
114
|
name="belebele-da",
|
|
114
|
-
pretty_name="
|
|
115
|
-
"
|
|
116
|
-
huggingface_id="EuroEval/belebele-da-mini",
|
|
115
|
+
pretty_name="Belebele-da",
|
|
116
|
+
source="EuroEval/belebele-da-mini",
|
|
117
117
|
task=MCRC,
|
|
118
|
-
languages=[
|
|
118
|
+
languages=[DANISH],
|
|
119
|
+
unofficial=True,
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
SCANDIQA_DA_CONFIG = DatasetConfig(
|
|
123
|
+
name="scandiqa-da",
|
|
124
|
+
pretty_name="ScandiQA-da",
|
|
125
|
+
source="EuroEval/scandiqa-da-mini",
|
|
126
|
+
task=RC,
|
|
127
|
+
languages=[DANISH],
|
|
128
|
+
unofficial=True,
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
GOLDENSWAG_DA_CONFIG = DatasetConfig(
|
|
132
|
+
name="goldenswag-da",
|
|
133
|
+
pretty_name="GoldenSwag-da",
|
|
134
|
+
source="EuroEval/goldenswag-da-mini",
|
|
135
|
+
task=COMMON_SENSE,
|
|
136
|
+
languages=[DANISH],
|
|
137
|
+
unofficial=True,
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
WINOGRANDE_DA_CONFIG = DatasetConfig(
|
|
141
|
+
name="winogrande-da",
|
|
142
|
+
pretty_name="Winogrande-da",
|
|
143
|
+
source="EuroEval/winogrande-da",
|
|
144
|
+
task=COMMON_SENSE,
|
|
145
|
+
languages=[DANISH],
|
|
146
|
+
_labels=["a", "b"],
|
|
119
147
|
unofficial=True,
|
|
120
148
|
)
|
|
@@ -1,72 +1,77 @@
|
|
|
1
1
|
"""All Dutch dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..languages import
|
|
5
|
-
from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
4
|
+
from ..languages import DUTCH
|
|
5
|
+
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
6
6
|
|
|
7
7
|
### Official datasets ###
|
|
8
8
|
|
|
9
9
|
DBRD_CONFIG = DatasetConfig(
|
|
10
10
|
name="dbrd",
|
|
11
|
-
pretty_name="
|
|
12
|
-
"
|
|
13
|
-
huggingface_id="EuroEval/dbrd-mini",
|
|
11
|
+
pretty_name="DBRD",
|
|
12
|
+
source="EuroEval/dbrd-mini",
|
|
14
13
|
task=SENT,
|
|
15
|
-
languages=[
|
|
14
|
+
languages=[DUTCH],
|
|
16
15
|
_labels=["negative", "positive"],
|
|
17
16
|
)
|
|
18
17
|
|
|
19
18
|
SCALA_NL_CONFIG = DatasetConfig(
|
|
20
19
|
name="scala-nl",
|
|
21
|
-
pretty_name="
|
|
22
|
-
|
|
20
|
+
pretty_name="ScaLA-nl",
|
|
21
|
+
source="EuroEval/scala-nl",
|
|
23
22
|
task=LA,
|
|
24
|
-
languages=[
|
|
23
|
+
languages=[DUTCH],
|
|
25
24
|
)
|
|
26
25
|
|
|
27
26
|
CONLL_NL_CONFIG = DatasetConfig(
|
|
28
27
|
name="conll-nl",
|
|
29
|
-
pretty_name="
|
|
30
|
-
"
|
|
31
|
-
huggingface_id="EuroEval/conll-nl-mini",
|
|
28
|
+
pretty_name="CoNLL-nl",
|
|
29
|
+
source="EuroEval/conll-nl-mini",
|
|
32
30
|
task=NER,
|
|
33
|
-
languages=[
|
|
31
|
+
languages=[DUTCH],
|
|
34
32
|
)
|
|
35
33
|
|
|
36
34
|
SQUAD_NL_CONFIG = DatasetConfig(
|
|
37
35
|
name="squad-nl",
|
|
38
|
-
pretty_name="
|
|
39
|
-
"
|
|
40
|
-
huggingface_id="EuroEval/squad-nl-v2-mini",
|
|
36
|
+
pretty_name="SQuAD-nl",
|
|
37
|
+
source="EuroEval/squad-nl-v2-mini",
|
|
41
38
|
task=RC,
|
|
42
|
-
languages=[
|
|
39
|
+
languages=[DUTCH],
|
|
43
40
|
)
|
|
44
41
|
|
|
45
42
|
WIKI_LINGUA_NL_CONFIG = DatasetConfig(
|
|
46
43
|
name="wiki-lingua-nl",
|
|
47
|
-
pretty_name="
|
|
48
|
-
"
|
|
49
|
-
huggingface_id="EuroEval/wiki-lingua-nl-mini",
|
|
44
|
+
pretty_name="WikiLingua-nl",
|
|
45
|
+
source="EuroEval/wiki-lingua-nl-mini",
|
|
50
46
|
task=SUMM,
|
|
51
|
-
languages=[
|
|
47
|
+
languages=[DUTCH],
|
|
52
48
|
)
|
|
53
49
|
|
|
54
50
|
MMLU_NL_CONFIG = DatasetConfig(
|
|
55
51
|
name="mmlu-nl",
|
|
56
|
-
pretty_name="
|
|
57
|
-
"
|
|
58
|
-
huggingface_id="EuroEval/mmlu-nl-mini",
|
|
52
|
+
pretty_name="MMLU-nl",
|
|
53
|
+
source="EuroEval/mmlu-nl-mini",
|
|
59
54
|
task=KNOW,
|
|
60
|
-
languages=[
|
|
55
|
+
languages=[DUTCH],
|
|
61
56
|
)
|
|
62
57
|
|
|
63
58
|
HELLASWAG_NL_CONFIG = DatasetConfig(
|
|
64
59
|
name="hellaswag-nl",
|
|
65
|
-
pretty_name="
|
|
66
|
-
"
|
|
67
|
-
huggingface_id="EuroEval/hellaswag-nl-mini",
|
|
60
|
+
pretty_name="HellaSwag-nl",
|
|
61
|
+
source="EuroEval/hellaswag-nl-mini",
|
|
68
62
|
task=COMMON_SENSE,
|
|
69
|
-
languages=[
|
|
63
|
+
languages=[DUTCH],
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
VALEU_NL_CONFIG = DatasetConfig(
|
|
67
|
+
name="valeu-nl",
|
|
68
|
+
pretty_name="VaLEU-nl",
|
|
69
|
+
source="EuroEval/european-values-nl",
|
|
70
|
+
task=EUROPEAN_VALUES,
|
|
71
|
+
languages=[DUTCH],
|
|
72
|
+
splits=["test"],
|
|
73
|
+
bootstrap_samples=False,
|
|
74
|
+
_instruction_prompt="{text}",
|
|
70
75
|
)
|
|
71
76
|
|
|
72
77
|
|
|
@@ -74,39 +79,64 @@ HELLASWAG_NL_CONFIG = DatasetConfig(
|
|
|
74
79
|
|
|
75
80
|
DUTCH_COLA_CONFIG = DatasetConfig(
|
|
76
81
|
name="dutch-cola",
|
|
77
|
-
pretty_name="
|
|
78
|
-
"
|
|
79
|
-
huggingface_id="EuroEval/dutch-cola",
|
|
82
|
+
pretty_name="Dutch CoLA",
|
|
83
|
+
source="EuroEval/dutch-cola",
|
|
80
84
|
task=LA,
|
|
81
|
-
languages=[
|
|
85
|
+
languages=[DUTCH],
|
|
82
86
|
unofficial=True,
|
|
83
87
|
)
|
|
84
88
|
|
|
85
89
|
DUTCH_COLA_FULL_CONFIG = DatasetConfig(
|
|
86
90
|
name="dutch-cola-full",
|
|
87
|
-
pretty_name="
|
|
88
|
-
|
|
91
|
+
pretty_name="Dutch CoLA Full",
|
|
92
|
+
source="EuroEval/dutch-cola-full",
|
|
89
93
|
task=LA,
|
|
90
|
-
languages=[
|
|
94
|
+
languages=[DUTCH],
|
|
91
95
|
unofficial=True,
|
|
92
96
|
)
|
|
93
97
|
|
|
94
98
|
ARC_NL_CONFIG = DatasetConfig(
|
|
95
99
|
name="arc-nl",
|
|
96
|
-
pretty_name="
|
|
97
|
-
"
|
|
98
|
-
huggingface_id="EuroEval/arc-nl-mini",
|
|
100
|
+
pretty_name="ARC-nl",
|
|
101
|
+
source="EuroEval/arc-nl-mini",
|
|
99
102
|
task=KNOW,
|
|
100
|
-
languages=[
|
|
103
|
+
languages=[DUTCH],
|
|
101
104
|
unofficial=True,
|
|
102
105
|
)
|
|
103
106
|
|
|
104
107
|
BELEBELE_NL_CONFIG = DatasetConfig(
|
|
105
108
|
name="belebele-nl",
|
|
106
|
-
pretty_name="
|
|
107
|
-
"
|
|
108
|
-
huggingface_id="EuroEval/belebele-nl-mini",
|
|
109
|
+
pretty_name="Belebele-nl",
|
|
110
|
+
source="EuroEval/belebele-nl-mini",
|
|
109
111
|
task=MCRC,
|
|
110
|
-
languages=[
|
|
112
|
+
languages=[DUTCH],
|
|
113
|
+
unofficial=True,
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
MULTI_WIKI_QA_NL_CONFIG = DatasetConfig(
|
|
117
|
+
name="multi-wiki-qa-nl",
|
|
118
|
+
pretty_name="MultiWikiQA-nl",
|
|
119
|
+
source="EuroEval/multi-wiki-qa-nl-mini",
|
|
120
|
+
task=RC,
|
|
121
|
+
languages=[DUTCH],
|
|
122
|
+
unofficial=True,
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
GOLDENSWAG_NL_CONFIG = DatasetConfig(
|
|
126
|
+
name="goldenswag-nl",
|
|
127
|
+
pretty_name="GoldenSwag-nl",
|
|
128
|
+
source="EuroEval/goldenswag-nl-mini",
|
|
129
|
+
task=COMMON_SENSE,
|
|
130
|
+
languages=[DUTCH],
|
|
131
|
+
unofficial=True,
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
WINOGRANDE_NL_CONFIG = DatasetConfig(
|
|
135
|
+
name="winogrande-nl",
|
|
136
|
+
pretty_name="Winogrande-nl",
|
|
137
|
+
source="EuroEval/winogrande-nl",
|
|
138
|
+
task=COMMON_SENSE,
|
|
139
|
+
languages=[DUTCH],
|
|
140
|
+
_labels=["a", "b"],
|
|
111
141
|
unofficial=True,
|
|
112
142
|
)
|