EuroEval 16.2.2__py3-none-any.whl → 16.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic. Click here for more details.
- euroeval/__init__.py +7 -4
- euroeval/benchmark_config_factory.py +0 -4
- euroeval/benchmark_modules/base.py +3 -16
- euroeval/benchmark_modules/fresh.py +5 -2
- euroeval/benchmark_modules/hf.py +107 -66
- euroeval/benchmark_modules/litellm.py +103 -55
- euroeval/benchmark_modules/vllm.py +155 -82
- euroeval/benchmarker.py +184 -129
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +1 -1
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +14 -11
- euroeval/data_models.py +12 -4
- euroeval/dataset_configs/__init__.py +3 -0
- euroeval/dataset_configs/czech.py +79 -0
- euroeval/dataset_configs/danish.py +10 -13
- euroeval/dataset_configs/dutch.py +0 -3
- euroeval/dataset_configs/english.py +0 -3
- euroeval/dataset_configs/estonian.py +11 -1
- euroeval/dataset_configs/finnish.py +0 -3
- euroeval/dataset_configs/french.py +0 -3
- euroeval/dataset_configs/german.py +0 -3
- euroeval/dataset_configs/italian.py +0 -3
- euroeval/dataset_configs/latvian.py +2 -4
- euroeval/dataset_configs/lithuanian.py +68 -0
- euroeval/dataset_configs/norwegian.py +0 -3
- euroeval/dataset_configs/polish.py +0 -3
- euroeval/dataset_configs/portuguese.py +0 -3
- euroeval/dataset_configs/slovak.py +60 -0
- euroeval/dataset_configs/spanish.py +0 -3
- euroeval/dataset_configs/swedish.py +10 -15
- euroeval/finetuning.py +21 -15
- euroeval/generation.py +10 -10
- euroeval/generation_utils.py +2 -3
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +10 -6
- euroeval/metrics/llm_as_a_judge.py +5 -3
- euroeval/metrics/pipeline.py +22 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +11 -14
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/linguistic_acceptability.py +30 -3
- euroeval/prompt_templates/multiple_choice.py +34 -1
- euroeval/prompt_templates/named_entity_recognition.py +71 -11
- euroeval/prompt_templates/reading_comprehension.py +41 -3
- euroeval/prompt_templates/sentiment_classification.py +34 -1
- euroeval/prompt_templates/summarization.py +26 -6
- euroeval/scores.py +7 -7
- euroeval/speed_benchmark.py +3 -5
- euroeval/task_group_utils/multiple_choice_classification.py +0 -3
- euroeval/task_group_utils/question_answering.py +0 -3
- euroeval/task_group_utils/sequence_classification.py +43 -31
- euroeval/task_group_utils/text_to_text.py +17 -8
- euroeval/task_group_utils/token_classification.py +10 -9
- euroeval/tokenisation_utils.py +22 -20
- euroeval/utils.py +30 -147
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA +182 -61
- euroeval-16.4.0.dist-info/RECORD +75 -0
- euroeval-16.2.2.dist-info/RECORD +0 -70
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/constants.py
CHANGED
|
@@ -1,7 +1,13 @@
|
|
|
1
1
|
"""Constants used throughout the project."""
|
|
2
2
|
|
|
3
|
+
from typing import TypeVar
|
|
4
|
+
|
|
3
5
|
from .enums import TaskGroup
|
|
4
6
|
|
|
7
|
+
# Type variable used for generic typing
|
|
8
|
+
T = TypeVar("T", bound=object)
|
|
9
|
+
|
|
10
|
+
|
|
5
11
|
# This is used as input to generative models; it cannot be a special token
|
|
6
12
|
DUMMY_FILL_VALUE = 100
|
|
7
13
|
|
|
@@ -50,9 +56,11 @@ METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
|
|
|
50
56
|
# Hugging Face Hub tags used to classify models as merge models
|
|
51
57
|
MERGE_TAGS = ["merge", "mergekit"]
|
|
52
58
|
|
|
59
|
+
|
|
53
60
|
# The minimum required CUDA compute capability for using bfloat16 in vLLM
|
|
54
61
|
VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
|
|
55
62
|
|
|
63
|
+
|
|
56
64
|
# Used to detect whether a model is a reasoning model
|
|
57
65
|
REASONING_TOKENS = [
|
|
58
66
|
("<think>", "</think>"),
|
|
@@ -60,6 +68,7 @@ REASONING_TOKENS = [
|
|
|
60
68
|
("<reasoning>", "</reasoning>"),
|
|
61
69
|
]
|
|
62
70
|
|
|
71
|
+
|
|
63
72
|
# These tokens are sometimes used by models to indicate the end of a generated
|
|
64
73
|
# response, but they do not use them as a proper EOS token, so we have to deal with them
|
|
65
74
|
# manually. We only use them as stop tokens if they actually appear in the model's
|
euroeval/data_loading.py
CHANGED
|
@@ -12,6 +12,7 @@ from huggingface_hub.errors import HfHubHTTPError
|
|
|
12
12
|
from numpy.random import Generator
|
|
13
13
|
|
|
14
14
|
from .exceptions import HuggingFaceHubDown, InvalidBenchmark
|
|
15
|
+
from .logging_utils import log, no_terminal_output
|
|
15
16
|
from .tasks import EUROPEAN_VALUES
|
|
16
17
|
from .utils import unscramble
|
|
17
18
|
|
|
@@ -20,8 +21,6 @@ if t.TYPE_CHECKING:
|
|
|
20
21
|
|
|
21
22
|
from .data_models import BenchmarkConfig, DatasetConfig
|
|
22
23
|
|
|
23
|
-
logger = logging.getLogger("euroeval")
|
|
24
|
-
|
|
25
24
|
|
|
26
25
|
def load_data(
|
|
27
26
|
rng: Generator, dataset_config: "DatasetConfig", benchmark_config: "BenchmarkConfig"
|
|
@@ -106,11 +105,12 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
|
|
|
106
105
|
num_attempts = 5
|
|
107
106
|
for _ in range(num_attempts):
|
|
108
107
|
try:
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
108
|
+
with no_terminal_output():
|
|
109
|
+
dataset = load_dataset(
|
|
110
|
+
path=dataset_config.huggingface_id,
|
|
111
|
+
cache_dir=cache_dir,
|
|
112
|
+
token=unscramble("XbjeOLhwebEaSaDUMqqaPaPIhgOcyOfDpGnX_"),
|
|
113
|
+
)
|
|
114
114
|
break
|
|
115
115
|
except (
|
|
116
116
|
FileNotFoundError,
|
|
@@ -118,9 +118,11 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
|
|
|
118
118
|
DatasetsError,
|
|
119
119
|
requests.ConnectionError,
|
|
120
120
|
requests.ReadTimeout,
|
|
121
|
-
):
|
|
122
|
-
|
|
123
|
-
f"Failed to load dataset {dataset_config.huggingface_id!r}
|
|
121
|
+
) as e:
|
|
122
|
+
log(
|
|
123
|
+
f"Failed to load dataset {dataset_config.huggingface_id!r}, due to "
|
|
124
|
+
f"the following error: {e}. Retrying...",
|
|
125
|
+
level=logging.DEBUG,
|
|
124
126
|
)
|
|
125
127
|
time.sleep(1)
|
|
126
128
|
continue
|
|
@@ -129,7 +131,8 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
|
|
|
129
131
|
else:
|
|
130
132
|
raise InvalidBenchmark(
|
|
131
133
|
f"Failed to load dataset {dataset_config.huggingface_id!r} after "
|
|
132
|
-
f"{num_attempts} attempts."
|
|
134
|
+
f"{num_attempts} attempts. Run with verbose mode to see the individual "
|
|
135
|
+
"errors."
|
|
133
136
|
)
|
|
134
137
|
assert isinstance(dataset, DatasetDict) # type: ignore[used-before-def]
|
|
135
138
|
missing_keys = [key for key in dataset_config.splits if key not in dataset]
|
euroeval/data_models.py
CHANGED
|
@@ -558,14 +558,14 @@ class DatasetConfig:
|
|
|
558
558
|
)
|
|
559
559
|
|
|
560
560
|
@property
|
|
561
|
-
def id2label(self) ->
|
|
561
|
+
def id2label(self) -> "HashableDict":
|
|
562
562
|
"""The mapping from ID to label."""
|
|
563
|
-
return {idx: label for idx, label in enumerate(self.labels)}
|
|
563
|
+
return HashableDict({idx: label for idx, label in enumerate(self.labels)})
|
|
564
564
|
|
|
565
565
|
@property
|
|
566
|
-
def label2id(self) ->
|
|
566
|
+
def label2id(self) -> "HashableDict":
|
|
567
567
|
"""The mapping from label to ID."""
|
|
568
|
-
return {label: i for i, label in enumerate(self.labels)}
|
|
568
|
+
return HashableDict({label: i for i, label in enumerate(self.labels)})
|
|
569
569
|
|
|
570
570
|
@property
|
|
571
571
|
def num_labels(self) -> int:
|
|
@@ -783,3 +783,11 @@ class ModelIdComponents:
|
|
|
783
783
|
model_id: str
|
|
784
784
|
revision: str
|
|
785
785
|
param: str | None
|
|
786
|
+
|
|
787
|
+
|
|
788
|
+
class HashableDict(dict):
|
|
789
|
+
"""A hashable dictionary."""
|
|
790
|
+
|
|
791
|
+
def __hash__(self) -> int: # type: ignore[override]
|
|
792
|
+
"""Return the hash of the dictionary."""
|
|
793
|
+
return hash(frozenset(self.items()))
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
4
|
from ..languages import get_all_languages
|
|
5
5
|
from ..tasks import SPEED
|
|
6
|
+
from .czech import * # noqa: F403
|
|
6
7
|
from .danish import * # noqa: F403
|
|
7
8
|
from .dutch import * # noqa: F403
|
|
8
9
|
from .english import * # noqa: F403
|
|
@@ -14,9 +15,11 @@ from .german import * # noqa: F403
|
|
|
14
15
|
from .icelandic import * # noqa: F403
|
|
15
16
|
from .italian import * # noqa: F403
|
|
16
17
|
from .latvian import * # noqa: F403
|
|
18
|
+
from .lithuanian import * # noqa: F403
|
|
17
19
|
from .norwegian import * # noqa: F403
|
|
18
20
|
from .polish import * # noqa: F403
|
|
19
21
|
from .portuguese import * # noqa: F403
|
|
22
|
+
from .slovak import * # noqa: F403
|
|
20
23
|
from .spanish import * # noqa: F403
|
|
21
24
|
from .swedish import * # noqa: F403
|
|
22
25
|
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""All Czech dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import CS
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
CSFD_SENTIMENT_CONFIG = DatasetConfig(
|
|
10
|
+
name="csfd-sentiment",
|
|
11
|
+
pretty_name="the truncated version of the Czech sentiment classification dataset "
|
|
12
|
+
"CSFD Sentiment",
|
|
13
|
+
huggingface_id="EuroEval/csfd-sentiment-mini",
|
|
14
|
+
task=SENT,
|
|
15
|
+
languages=[CS],
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
CS_GEC_CONFIG = DatasetConfig(
|
|
19
|
+
name="cs-gec",
|
|
20
|
+
pretty_name="the truncated version of the Czech linguistic acceptability dataset "
|
|
21
|
+
"CS-GEC",
|
|
22
|
+
huggingface_id="EuroEval/cs-gec-mini",
|
|
23
|
+
task=LA,
|
|
24
|
+
languages=[CS],
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
PONER_CONFIG = DatasetConfig(
|
|
28
|
+
name="poner",
|
|
29
|
+
pretty_name="the truncated version of the Czech named entity recognition dataset "
|
|
30
|
+
"PONER",
|
|
31
|
+
huggingface_id="EuroEval/poner-mini",
|
|
32
|
+
task=NER,
|
|
33
|
+
languages=[CS],
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
SQAD_CONFIG = DatasetConfig(
|
|
37
|
+
name="sqad",
|
|
38
|
+
pretty_name="the truncated version of the Czech reading comprehension dataset SQAD",
|
|
39
|
+
huggingface_id="EuroEval/sqad-mini",
|
|
40
|
+
task=RC,
|
|
41
|
+
languages=[CS],
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
CZECH_NEWS_CONFIG = DatasetConfig(
|
|
45
|
+
name="czech-news",
|
|
46
|
+
pretty_name="the truncated version of the Czech summarisation dataset",
|
|
47
|
+
huggingface_id="EuroEval/czech-news-mini",
|
|
48
|
+
task=SUMM,
|
|
49
|
+
languages=[CS],
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
UMIMETO_QA_CONFIG = DatasetConfig(
|
|
53
|
+
name="umimeto-qa",
|
|
54
|
+
pretty_name="the Czech knowledge dataset UmimetoQA",
|
|
55
|
+
huggingface_id="EuroEval/umimeto-qa",
|
|
56
|
+
task=KNOW,
|
|
57
|
+
languages=[CS],
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
HELLASWAG_CS_CONFIG = DatasetConfig(
|
|
61
|
+
name="hellaswag-cs",
|
|
62
|
+
pretty_name="the truncated version of the Czech common-sense reasoning dataset "
|
|
63
|
+
"HellaSwag-cs, translated from the English HellaSwag dataset",
|
|
64
|
+
huggingface_id="EuroEval/hellaswag-cs-mini",
|
|
65
|
+
task=COMMON_SENSE,
|
|
66
|
+
languages=[CS],
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
### Unofficial datasets ###
|
|
71
|
+
|
|
72
|
+
SCALA_CS_CONFIG = DatasetConfig(
|
|
73
|
+
name="scala-cs",
|
|
74
|
+
pretty_name="the Czech part of the linguistic acceptability dataset ScaLA",
|
|
75
|
+
huggingface_id="EuroEval/scala-cs",
|
|
76
|
+
task=LA,
|
|
77
|
+
languages=[CS],
|
|
78
|
+
unofficial=True,
|
|
79
|
+
)
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""All Danish dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..enums import ModelType
|
|
5
4
|
from ..languages import DA
|
|
6
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
7
6
|
|
|
@@ -33,11 +32,11 @@ DANSK_CONFIG = DatasetConfig(
|
|
|
33
32
|
languages=[DA],
|
|
34
33
|
)
|
|
35
34
|
|
|
36
|
-
|
|
37
|
-
name="
|
|
38
|
-
pretty_name="the
|
|
39
|
-
"dataset
|
|
40
|
-
huggingface_id="EuroEval/
|
|
35
|
+
MULTI_WIKI_QA_DA_CONFIG = DatasetConfig(
|
|
36
|
+
name="multi-wiki-qa-da",
|
|
37
|
+
pretty_name="the truncated version of the Danish part of the reading "
|
|
38
|
+
"comprehension dataset MultiWikiQA",
|
|
39
|
+
huggingface_id="EuroEval/multi-wiki-qa-da-mini",
|
|
41
40
|
task=RC,
|
|
42
41
|
languages=[DA],
|
|
43
42
|
)
|
|
@@ -130,11 +129,11 @@ BELEBELE_DA_CONFIG = DatasetConfig(
|
|
|
130
129
|
unofficial=True,
|
|
131
130
|
)
|
|
132
131
|
|
|
133
|
-
|
|
134
|
-
name="
|
|
135
|
-
pretty_name="the
|
|
136
|
-
"
|
|
137
|
-
huggingface_id="EuroEval/
|
|
132
|
+
SCANDIQA_DA_CONFIG = DatasetConfig(
|
|
133
|
+
name="scandiqa-da",
|
|
134
|
+
pretty_name="the Danish part of the truncated version of the question answering "
|
|
135
|
+
"dataset ScandiQA",
|
|
136
|
+
huggingface_id="EuroEval/scandiqa-da-mini",
|
|
138
137
|
task=RC,
|
|
139
138
|
languages=[DA],
|
|
140
139
|
unofficial=True,
|
|
@@ -157,9 +156,7 @@ WINOGRANDE_DA_CONFIG = DatasetConfig(
|
|
|
157
156
|
huggingface_id="EuroEval/winogrande-da",
|
|
158
157
|
task=COMMON_SENSE,
|
|
159
158
|
languages=[DA],
|
|
160
|
-
splits=["train", "test"],
|
|
161
159
|
_labels=["a", "b"],
|
|
162
|
-
_allowed_model_types=[ModelType.GENERATIVE],
|
|
163
160
|
unofficial=True,
|
|
164
161
|
)
|
|
165
162
|
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""All Dutch dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..enums import ModelType
|
|
5
4
|
from ..languages import NL
|
|
6
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
7
6
|
|
|
@@ -150,9 +149,7 @@ WINOGRANDE_NL_CONFIG = DatasetConfig(
|
|
|
150
149
|
huggingface_id="EuroEval/winogrande-nl",
|
|
151
150
|
task=COMMON_SENSE,
|
|
152
151
|
languages=[NL],
|
|
153
|
-
splits=["train", "test"],
|
|
154
152
|
_labels=["a", "b"],
|
|
155
|
-
_allowed_model_types=[ModelType.GENERATIVE],
|
|
156
153
|
unofficial=True,
|
|
157
154
|
)
|
|
158
155
|
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""All English dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..enums import ModelType
|
|
5
4
|
from ..languages import EN
|
|
6
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
7
6
|
|
|
@@ -133,9 +132,7 @@ WINOGRANDE_CONFIG = DatasetConfig(
|
|
|
133
132
|
huggingface_id="EuroEval/winogrande-en",
|
|
134
133
|
task=COMMON_SENSE,
|
|
135
134
|
languages=[EN],
|
|
136
|
-
splits=["train", "test"],
|
|
137
135
|
_labels=["a", "b"],
|
|
138
|
-
_allowed_model_types=[ModelType.GENERATIVE],
|
|
139
136
|
unofficial=True,
|
|
140
137
|
)
|
|
141
138
|
|
|
@@ -94,10 +94,20 @@ SCALA_ET_CONFIG = DatasetConfig(
|
|
|
94
94
|
|
|
95
95
|
EXAM_ET_CONFIG = DatasetConfig(
|
|
96
96
|
name="exam-et",
|
|
97
|
-
pretty_name="the Estonian knowledge
|
|
97
|
+
pretty_name="the Estonian knowledge dataset Exam-et",
|
|
98
98
|
huggingface_id="EuroEval/exam-et",
|
|
99
99
|
task=KNOW,
|
|
100
100
|
languages=[ET],
|
|
101
101
|
_labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"],
|
|
102
102
|
unofficial=True,
|
|
103
103
|
)
|
|
104
|
+
|
|
105
|
+
MMLU_ET_CONFIG = DatasetConfig(
|
|
106
|
+
name="mmlu-et",
|
|
107
|
+
pretty_name="the truncated version of the Estonian knowledge dataset MMLU-et, "
|
|
108
|
+
"translated from the English MMLU dataset",
|
|
109
|
+
huggingface_id="EuroEval/mmlu-et-mini",
|
|
110
|
+
task=KNOW,
|
|
111
|
+
languages=[ET],
|
|
112
|
+
unofficial=True,
|
|
113
|
+
)
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""All Finnish dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..enums import ModelType
|
|
5
4
|
from ..languages import FI
|
|
6
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, LA, MCRC, NER, RC, SENT, SUMM
|
|
7
6
|
|
|
@@ -109,9 +108,7 @@ WINOGRANDE_FI_CONFIG = DatasetConfig(
|
|
|
109
108
|
huggingface_id="EuroEval/winogrande-fi",
|
|
110
109
|
task=COMMON_SENSE,
|
|
111
110
|
languages=[FI],
|
|
112
|
-
splits=["train", "test"],
|
|
113
111
|
_labels=["a", "b"],
|
|
114
|
-
_allowed_model_types=[ModelType.GENERATIVE],
|
|
115
112
|
unofficial=True,
|
|
116
113
|
)
|
|
117
114
|
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""All French dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..enums import ModelType
|
|
5
4
|
from ..languages import FR
|
|
6
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
7
6
|
|
|
@@ -121,9 +120,7 @@ WINOGRANDE_FR_CONFIG = DatasetConfig(
|
|
|
121
120
|
huggingface_id="EuroEval/winogrande-fr",
|
|
122
121
|
task=COMMON_SENSE,
|
|
123
122
|
languages=[FR],
|
|
124
|
-
splits=["train", "test"],
|
|
125
123
|
_labels=["a", "b"],
|
|
126
|
-
_allowed_model_types=[ModelType.GENERATIVE],
|
|
127
124
|
unofficial=True,
|
|
128
125
|
)
|
|
129
126
|
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""All German dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..enums import ModelType
|
|
5
4
|
from ..languages import DE
|
|
6
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
7
6
|
|
|
@@ -138,9 +137,7 @@ WINOGRANDE_DE_CONFIG = DatasetConfig(
|
|
|
138
137
|
huggingface_id="EuroEval/winogrande-de",
|
|
139
138
|
task=COMMON_SENSE,
|
|
140
139
|
languages=[DE],
|
|
141
|
-
splits=["train", "test"],
|
|
142
140
|
_labels=["a", "b"],
|
|
143
|
-
_allowed_model_types=[ModelType.GENERATIVE],
|
|
144
141
|
unofficial=True,
|
|
145
142
|
)
|
|
146
143
|
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""All Italian dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..enums import ModelType
|
|
5
4
|
from ..languages import IT
|
|
6
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
7
6
|
|
|
@@ -129,9 +128,7 @@ WINOGRANDE_IT_CONFIG = DatasetConfig(
|
|
|
129
128
|
huggingface_id="EuroEval/winogrande-it",
|
|
130
129
|
task=COMMON_SENSE,
|
|
131
130
|
languages=[IT],
|
|
132
|
-
splits=["train", "test"],
|
|
133
131
|
_labels=["a", "b"],
|
|
134
|
-
_allowed_model_types=[ModelType.GENERATIVE],
|
|
135
132
|
unofficial=True,
|
|
136
133
|
)
|
|
137
134
|
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""All Latvian dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..enums import ModelType
|
|
5
4
|
from ..languages import LV
|
|
6
5
|
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
|
|
7
6
|
|
|
@@ -25,7 +24,8 @@ SCALA_LV_CONFIG = DatasetConfig(
|
|
|
25
24
|
|
|
26
25
|
FULLSTACK_NER_LV_CONFIG = DatasetConfig(
|
|
27
26
|
name="fullstack-ner-lv",
|
|
28
|
-
pretty_name="the truncated version of the
|
|
27
|
+
pretty_name="the truncated version of the Latvian named entity recognition "
|
|
28
|
+
"dataset FullStack-NER-lv",
|
|
29
29
|
huggingface_id="EuroEval/fullstack-ner-lv-mini",
|
|
30
30
|
task=NER,
|
|
31
31
|
languages=[LV],
|
|
@@ -88,8 +88,6 @@ WINOGRANDE_LV_CONFIG = DatasetConfig(
|
|
|
88
88
|
huggingface_id="EuroEval/winogrande-lv",
|
|
89
89
|
task=COMMON_SENSE,
|
|
90
90
|
languages=[LV],
|
|
91
|
-
splits=["train", "test"],
|
|
92
91
|
_labels=["a", "b"],
|
|
93
|
-
_allowed_model_types=[ModelType.GENERATIVE],
|
|
94
92
|
unofficial=True,
|
|
95
93
|
)
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""All Lithuanian dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import LT
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
LITHUANIAN_EMOTIONS_CONFIG = DatasetConfig(
|
|
10
|
+
name="lithuanian-emotions",
|
|
11
|
+
pretty_name="the truncated version of the Lithuanian sentiment "
|
|
12
|
+
"classification dataset Lithuanian Emotions",
|
|
13
|
+
huggingface_id="EuroEval/lithuanian-emotions-mini",
|
|
14
|
+
task=SENT,
|
|
15
|
+
languages=[LT],
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
SCALA_LT_CONFIG = DatasetConfig(
|
|
19
|
+
name="scala-lt",
|
|
20
|
+
pretty_name="the Lithuanian part of the linguistic acceptability dataset ScaLA",
|
|
21
|
+
huggingface_id="EuroEval/scala-lt",
|
|
22
|
+
task=LA,
|
|
23
|
+
languages=[LT],
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
WIKIANN_LT_CONFIG = DatasetConfig(
|
|
27
|
+
name="wikiann-lt",
|
|
28
|
+
pretty_name="the truncated version of the Lithuanian part of the named entity "
|
|
29
|
+
"recognition dataset WikiANN",
|
|
30
|
+
huggingface_id="EuroEval/wikiann-lt-mini",
|
|
31
|
+
task=NER,
|
|
32
|
+
languages=[LT],
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
MULTI_WIKI_QA_LT_CONFIG = DatasetConfig(
|
|
36
|
+
name="multi-wiki-qa-lt",
|
|
37
|
+
pretty_name="the truncated version of the Lithuanian part of the reading "
|
|
38
|
+
"comprehension dataset MultiWikiQA",
|
|
39
|
+
huggingface_id="EuroEval/multi-wiki-qa-lt-mini",
|
|
40
|
+
task=RC,
|
|
41
|
+
languages=[LT],
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
LRYTAS_CONFIG = DatasetConfig(
|
|
45
|
+
name="lrytas",
|
|
46
|
+
pretty_name="the truncated version of the Lithuanian summarisation dataset Lrytas",
|
|
47
|
+
huggingface_id="EuroEval/lrytas-mini",
|
|
48
|
+
task=SUMM,
|
|
49
|
+
languages=[LT],
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
LT_HISTORY_CONFIG = DatasetConfig(
|
|
53
|
+
name="lt-history",
|
|
54
|
+
pretty_name="the Lithuanian knowledge dataset LT-History",
|
|
55
|
+
huggingface_id="EuroEval/lt-history",
|
|
56
|
+
task=KNOW,
|
|
57
|
+
languages=[LT],
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
WINOGRANDE_LT_CONFIG = DatasetConfig(
|
|
61
|
+
name="winogrande-lt",
|
|
62
|
+
pretty_name="the Lithuanian common-sense reasoning dataset Winogrande-lt, "
|
|
63
|
+
"translated from the English Winogrande dataset",
|
|
64
|
+
huggingface_id="EuroEval/winogrande-lt",
|
|
65
|
+
task=COMMON_SENSE,
|
|
66
|
+
languages=[LT],
|
|
67
|
+
_labels=["a", "b"],
|
|
68
|
+
)
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""All Norwegian dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..enums import ModelType
|
|
5
4
|
from ..languages import NB, NN, NO
|
|
6
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
7
6
|
|
|
@@ -224,9 +223,7 @@ WINOGRANDE_NO_CONFIG = DatasetConfig(
|
|
|
224
223
|
huggingface_id="EuroEval/winogrande-no",
|
|
225
224
|
task=COMMON_SENSE,
|
|
226
225
|
languages=[NB, NN, NO],
|
|
227
|
-
splits=["train", "test"],
|
|
228
226
|
_labels=["a", "b"],
|
|
229
|
-
_allowed_model_types=[ModelType.GENERATIVE],
|
|
230
227
|
unofficial=True,
|
|
231
228
|
)
|
|
232
229
|
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""All Polish dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..enums import ModelType
|
|
5
4
|
from ..languages import PL
|
|
6
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, NER, RC, SENT, SUMM
|
|
7
6
|
|
|
@@ -62,9 +61,7 @@ WINOGRANDE_PL_CONFIG = DatasetConfig(
|
|
|
62
61
|
huggingface_id="EuroEval/winogrande-pl",
|
|
63
62
|
task=COMMON_SENSE,
|
|
64
63
|
languages=[PL],
|
|
65
|
-
splits=["train", "test"],
|
|
66
64
|
_labels=["a", "b"],
|
|
67
|
-
_allowed_model_types=[ModelType.GENERATIVE],
|
|
68
65
|
)
|
|
69
66
|
|
|
70
67
|
EUROPEAN_VALUES_PL_CONFIG = DatasetConfig(
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""All Portuguese dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..enums import ModelType
|
|
5
4
|
from ..languages import PT
|
|
6
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
7
6
|
|
|
@@ -99,9 +98,7 @@ WINOGRANDE_PT_CONFIG = DatasetConfig(
|
|
|
99
98
|
huggingface_id="EuroEval/winogrande-pt",
|
|
100
99
|
task=COMMON_SENSE,
|
|
101
100
|
languages=[PT],
|
|
102
|
-
splits=["train", "test"],
|
|
103
101
|
_labels=["a", "b"],
|
|
104
|
-
_allowed_model_types=[ModelType.GENERATIVE],
|
|
105
102
|
unofficial=True,
|
|
106
103
|
)
|
|
107
104
|
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""All Slovak dataset configurations used in EuroEval."""
|
|
2
|
+
|
|
3
|
+
from ..data_models import DatasetConfig
|
|
4
|
+
from ..languages import SK
|
|
5
|
+
from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
|
|
6
|
+
|
|
7
|
+
### Official datasets ###
|
|
8
|
+
|
|
9
|
+
CSFD_SENTIMENT_SK_CONFIG = DatasetConfig(
|
|
10
|
+
name="csfd-sentiment-sk",
|
|
11
|
+
pretty_name="the truncated version of the Slovak sentiment classification dataset "
|
|
12
|
+
"CSFD-sentiment-sk",
|
|
13
|
+
huggingface_id="EuroEval/csfd-sentiment-sk-mini",
|
|
14
|
+
task=SENT,
|
|
15
|
+
languages=[SK],
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
SCALA_SK_CONFIG = DatasetConfig(
|
|
19
|
+
name="scala-sk",
|
|
20
|
+
pretty_name="the Slovak part of the linguistic acceptability dataset ScaLA",
|
|
21
|
+
huggingface_id="EuroEval/scala-sk",
|
|
22
|
+
task=LA,
|
|
23
|
+
languages=[SK],
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
UNER_SK_CONFIG = DatasetConfig(
|
|
27
|
+
name="uner-sk",
|
|
28
|
+
pretty_name="the truncated version of the Slovak named entity recognition dataset "
|
|
29
|
+
"UNER-sk",
|
|
30
|
+
huggingface_id="EuroEval/uner-sk-mini",
|
|
31
|
+
task=NER,
|
|
32
|
+
languages=[SK],
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
MULTI_WIKI_QA_SK_CONFIG = DatasetConfig(
|
|
36
|
+
name="multi-wiki-qa-sk",
|
|
37
|
+
pretty_name="the truncated version of the Slovak part of the reading comprehension "
|
|
38
|
+
"dataset MultiWikiQA",
|
|
39
|
+
huggingface_id="EuroEval/multi-wiki-qa-sk-mini",
|
|
40
|
+
task=RC,
|
|
41
|
+
languages=[SK],
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
MMLU_SK_CONFIG = DatasetConfig(
|
|
45
|
+
name="mmlu-sk",
|
|
46
|
+
pretty_name="the truncated version of the Slovak knowledge dataset MMLU-sk, "
|
|
47
|
+
"translated from the English MMLU dataset",
|
|
48
|
+
huggingface_id="EuroEval/mmlu-sk-mini",
|
|
49
|
+
task=KNOW,
|
|
50
|
+
languages=[SK],
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
WINOGRANDE_SK_CONFIG = DatasetConfig(
|
|
54
|
+
name="winogrande-sk",
|
|
55
|
+
pretty_name="the Slovak common-sense reasoning dataset Winogrande-sk, translated "
|
|
56
|
+
"from the English Winogrande dataset",
|
|
57
|
+
huggingface_id="EuroEval/winogrande-sk",
|
|
58
|
+
task=COMMON_SENSE,
|
|
59
|
+
languages=[SK],
|
|
60
|
+
)
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
"""All Spanish dataset configurations used in EuroEval."""
|
|
2
2
|
|
|
3
3
|
from ..data_models import DatasetConfig
|
|
4
|
-
from ..enums import ModelType
|
|
5
4
|
from ..languages import ES
|
|
6
5
|
from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
|
|
7
6
|
|
|
@@ -127,9 +126,7 @@ WINOGRANDE_ES_CONFIG = DatasetConfig(
|
|
|
127
126
|
huggingface_id="EuroEval/winogrande-es",
|
|
128
127
|
task=COMMON_SENSE,
|
|
129
128
|
languages=[ES],
|
|
130
|
-
splits=["train", "test"],
|
|
131
129
|
_labels=["a", "b"],
|
|
132
|
-
_allowed_model_types=[ModelType.GENERATIVE],
|
|
133
130
|
unofficial=True,
|
|
134
131
|
)
|
|
135
132
|
|