EuroEval 16.2.2__py3-none-any.whl → 16.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic.

Files changed (65)
  1. euroeval/__init__.py +7 -4
  2. euroeval/benchmark_config_factory.py +0 -4
  3. euroeval/benchmark_modules/base.py +3 -16
  4. euroeval/benchmark_modules/fresh.py +5 -2
  5. euroeval/benchmark_modules/hf.py +107 -66
  6. euroeval/benchmark_modules/litellm.py +103 -55
  7. euroeval/benchmark_modules/vllm.py +155 -82
  8. euroeval/benchmarker.py +184 -129
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +1 -1
  12. euroeval/constants.py +9 -0
  13. euroeval/data_loading.py +14 -11
  14. euroeval/data_models.py +12 -4
  15. euroeval/dataset_configs/__init__.py +3 -0
  16. euroeval/dataset_configs/czech.py +79 -0
  17. euroeval/dataset_configs/danish.py +10 -13
  18. euroeval/dataset_configs/dutch.py +0 -3
  19. euroeval/dataset_configs/english.py +0 -3
  20. euroeval/dataset_configs/estonian.py +11 -1
  21. euroeval/dataset_configs/finnish.py +0 -3
  22. euroeval/dataset_configs/french.py +0 -3
  23. euroeval/dataset_configs/german.py +0 -3
  24. euroeval/dataset_configs/italian.py +0 -3
  25. euroeval/dataset_configs/latvian.py +2 -4
  26. euroeval/dataset_configs/lithuanian.py +68 -0
  27. euroeval/dataset_configs/norwegian.py +0 -3
  28. euroeval/dataset_configs/polish.py +0 -3
  29. euroeval/dataset_configs/portuguese.py +0 -3
  30. euroeval/dataset_configs/slovak.py +60 -0
  31. euroeval/dataset_configs/spanish.py +0 -3
  32. euroeval/dataset_configs/swedish.py +10 -15
  33. euroeval/finetuning.py +21 -15
  34. euroeval/generation.py +10 -10
  35. euroeval/generation_utils.py +2 -3
  36. euroeval/logging_utils.py +250 -0
  37. euroeval/metrics/base.py +0 -3
  38. euroeval/metrics/huggingface.py +10 -6
  39. euroeval/metrics/llm_as_a_judge.py +5 -3
  40. euroeval/metrics/pipeline.py +22 -9
  41. euroeval/metrics/speed.py +0 -3
  42. euroeval/model_cache.py +11 -14
  43. euroeval/model_config.py +4 -5
  44. euroeval/model_loading.py +3 -0
  45. euroeval/prompt_templates/linguistic_acceptability.py +30 -3
  46. euroeval/prompt_templates/multiple_choice.py +34 -1
  47. euroeval/prompt_templates/named_entity_recognition.py +71 -11
  48. euroeval/prompt_templates/reading_comprehension.py +41 -3
  49. euroeval/prompt_templates/sentiment_classification.py +34 -1
  50. euroeval/prompt_templates/summarization.py +26 -6
  51. euroeval/scores.py +7 -7
  52. euroeval/speed_benchmark.py +3 -5
  53. euroeval/task_group_utils/multiple_choice_classification.py +0 -3
  54. euroeval/task_group_utils/question_answering.py +0 -3
  55. euroeval/task_group_utils/sequence_classification.py +43 -31
  56. euroeval/task_group_utils/text_to_text.py +17 -8
  57. euroeval/task_group_utils/token_classification.py +10 -9
  58. euroeval/tokenisation_utils.py +22 -20
  59. euroeval/utils.py +30 -147
  60. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA +182 -61
  61. euroeval-16.4.0.dist-info/RECORD +75 -0
  62. euroeval-16.2.2.dist-info/RECORD +0 -70
  63. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
  64. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
  65. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/constants.py CHANGED
@@ -1,7 +1,13 @@
1
1
  """Constants used throughout the project."""
2
2
 
3
+ from typing import TypeVar
4
+
3
5
  from .enums import TaskGroup
4
6
 
7
+ # Type variable used for generic typing
8
+ T = TypeVar("T", bound=object)
9
+
10
+
5
11
  # This is used as input to generative models; it cannot be a special token
6
12
  DUMMY_FILL_VALUE = 100
7
13
 
@@ -50,9 +56,11 @@ METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
50
56
  # Hugging Face Hub tags used to classify models as merge models
51
57
  MERGE_TAGS = ["merge", "mergekit"]
52
58
 
59
+
53
60
  # The minimum required CUDA compute capability for using bfloat16 in vLLM
54
61
  VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
55
62
 
63
+
56
64
  # Used to detect whether a model is a reasoning model
57
65
  REASONING_TOKENS = [
58
66
  ("<think>", "</think>"),
@@ -60,6 +68,7 @@ REASONING_TOKENS = [
60
68
  ("<reasoning>", "</reasoning>"),
61
69
  ]
62
70
 
71
+
63
72
  # These tokens are sometimes used by models to indicate the end of a generated
64
73
  # response, but they do not use them as a proper EOS token, so we have to deal with them
65
74
  # manually. We only use them as stop tokens if they actually appear in the model's
euroeval/data_loading.py CHANGED
@@ -12,6 +12,7 @@ from huggingface_hub.errors import HfHubHTTPError
12
12
  from numpy.random import Generator
13
13
 
14
14
  from .exceptions import HuggingFaceHubDown, InvalidBenchmark
15
+ from .logging_utils import log, no_terminal_output
15
16
  from .tasks import EUROPEAN_VALUES
16
17
  from .utils import unscramble
17
18
 
@@ -20,8 +21,6 @@ if t.TYPE_CHECKING:
20
21
 
21
22
  from .data_models import BenchmarkConfig, DatasetConfig
22
23
 
23
- logger = logging.getLogger("euroeval")
24
-
25
24
 
26
25
  def load_data(
27
26
  rng: Generator, dataset_config: "DatasetConfig", benchmark_config: "BenchmarkConfig"
@@ -106,11 +105,12 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
106
105
  num_attempts = 5
107
106
  for _ in range(num_attempts):
108
107
  try:
109
- dataset = load_dataset(
110
- path=dataset_config.huggingface_id,
111
- cache_dir=cache_dir,
112
- token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
113
- )
108
+ with no_terminal_output():
109
+ dataset = load_dataset(
110
+ path=dataset_config.huggingface_id,
111
+ cache_dir=cache_dir,
112
+ token=unscramble("XbjeOLhwebEaSaDUMqqaPaPIhgOcyOfDpGnX_"),
113
+ )
114
114
  break
115
115
  except (
116
116
  FileNotFoundError,
@@ -118,9 +118,11 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
118
118
  DatasetsError,
119
119
  requests.ConnectionError,
120
120
  requests.ReadTimeout,
121
- ):
122
- logger.debug(
123
- f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
121
+ ) as e:
122
+ log(
123
+ f"Failed to load dataset {dataset_config.huggingface_id!r}, due to "
124
+ f"the following error: {e}. Retrying...",
125
+ level=logging.DEBUG,
124
126
  )
125
127
  time.sleep(1)
126
128
  continue
@@ -129,7 +131,8 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
129
131
  else:
130
132
  raise InvalidBenchmark(
131
133
  f"Failed to load dataset {dataset_config.huggingface_id!r} after "
132
- f"{num_attempts} attempts."
134
+ f"{num_attempts} attempts. Run with verbose mode to see the individual "
135
+ "errors."
133
136
  )
134
137
  assert isinstance(dataset, DatasetDict) # type: ignore[used-before-def]
135
138
  missing_keys = [key for key in dataset_config.splits if key not in dataset]
euroeval/data_models.py CHANGED
@@ -558,14 +558,14 @@ class DatasetConfig:
558
558
  )
559
559
 
560
560
  @property
561
- def id2label(self) -> dict[int, str]:
561
+ def id2label(self) -> "HashableDict":
562
562
  """The mapping from ID to label."""
563
- return {idx: label for idx, label in enumerate(self.labels)}
563
+ return HashableDict({idx: label for idx, label in enumerate(self.labels)})
564
564
 
565
565
  @property
566
- def label2id(self) -> dict[str, int]:
566
+ def label2id(self) -> "HashableDict":
567
567
  """The mapping from label to ID."""
568
- return {label: i for i, label in enumerate(self.labels)}
568
+ return HashableDict({label: i for i, label in enumerate(self.labels)})
569
569
 
570
570
  @property
571
571
  def num_labels(self) -> int:
@@ -783,3 +783,11 @@ class ModelIdComponents:
783
783
  model_id: str
784
784
  revision: str
785
785
  param: str | None
786
+
787
+
788
+ class HashableDict(dict):
789
+ """A hashable dictionary."""
790
+
791
+ def __hash__(self) -> int: # type: ignore[override]
792
+ """Return the hash of the dictionary."""
793
+ return hash(frozenset(self.items()))
euroeval/dataset_configs/__init__.py CHANGED
@@ -3,6 +3,7 @@
3
3
  from ..data_models import DatasetConfig
4
4
  from ..languages import get_all_languages
5
5
  from ..tasks import SPEED
6
+ from .czech import * # noqa: F403
6
7
  from .danish import * # noqa: F403
7
8
  from .dutch import * # noqa: F403
8
9
  from .english import * # noqa: F403
@@ -14,9 +15,11 @@ from .german import * # noqa: F403
14
15
  from .icelandic import * # noqa: F403
15
16
  from .italian import * # noqa: F403
16
17
  from .latvian import * # noqa: F403
18
+ from .lithuanian import * # noqa: F403
17
19
  from .norwegian import * # noqa: F403
18
20
  from .polish import * # noqa: F403
19
21
  from .portuguese import * # noqa: F403
22
+ from .slovak import * # noqa: F403
20
23
  from .spanish import * # noqa: F403
21
24
  from .swedish import * # noqa: F403
22
25
 
euroeval/dataset_configs/czech.py ADDED
@@ -0,0 +1,79 @@
1
+ """All Czech dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import CS
5
+ from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
6
+
7
+ ### Official datasets ###
8
+
9
+ CSFD_SENTIMENT_CONFIG = DatasetConfig(
10
+ name="csfd-sentiment",
11
+ pretty_name="the truncated version of the Czech sentiment classification dataset "
12
+ "CSFD Sentiment",
13
+ huggingface_id="EuroEval/csfd-sentiment-mini",
14
+ task=SENT,
15
+ languages=[CS],
16
+ )
17
+
18
+ CS_GEC_CONFIG = DatasetConfig(
19
+ name="cs-gec",
20
+ pretty_name="the truncated version of the Czech linguistic acceptability dataset "
21
+ "CS-GEC",
22
+ huggingface_id="EuroEval/cs-gec-mini",
23
+ task=LA,
24
+ languages=[CS],
25
+ )
26
+
27
+ PONER_CONFIG = DatasetConfig(
28
+ name="poner",
29
+ pretty_name="the truncated version of the Czech named entity recognition dataset "
30
+ "PONER",
31
+ huggingface_id="EuroEval/poner-mini",
32
+ task=NER,
33
+ languages=[CS],
34
+ )
35
+
36
+ SQAD_CONFIG = DatasetConfig(
37
+ name="sqad",
38
+ pretty_name="the truncated version of the Czech reading comprehension dataset SQAD",
39
+ huggingface_id="EuroEval/sqad-mini",
40
+ task=RC,
41
+ languages=[CS],
42
+ )
43
+
44
+ CZECH_NEWS_CONFIG = DatasetConfig(
45
+ name="czech-news",
46
+ pretty_name="the truncated version of the Czech summarisation dataset",
47
+ huggingface_id="EuroEval/czech-news-mini",
48
+ task=SUMM,
49
+ languages=[CS],
50
+ )
51
+
52
+ UMIMETO_QA_CONFIG = DatasetConfig(
53
+ name="umimeto-qa",
54
+ pretty_name="the Czech knowledge dataset UmimetoQA",
55
+ huggingface_id="EuroEval/umimeto-qa",
56
+ task=KNOW,
57
+ languages=[CS],
58
+ )
59
+
60
+ HELLASWAG_CS_CONFIG = DatasetConfig(
61
+ name="hellaswag-cs",
62
+ pretty_name="the truncated version of the Czech common-sense reasoning dataset "
63
+ "HellaSwag-cs, translated from the English HellaSwag dataset",
64
+ huggingface_id="EuroEval/hellaswag-cs-mini",
65
+ task=COMMON_SENSE,
66
+ languages=[CS],
67
+ )
68
+
69
+
70
+ ### Unofficial datasets ###
71
+
72
+ SCALA_CS_CONFIG = DatasetConfig(
73
+ name="scala-cs",
74
+ pretty_name="the Czech part of the linguistic acceptability dataset ScaLA",
75
+ huggingface_id="EuroEval/scala-cs",
76
+ task=LA,
77
+ languages=[CS],
78
+ unofficial=True,
79
+ )
euroeval/dataset_configs/danish.py CHANGED
@@ -1,7 +1,6 @@
1
1
  """All Danish dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import DA
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -33,11 +32,11 @@ DANSK_CONFIG = DatasetConfig(
33
32
  languages=[DA],
34
33
  )
35
34
 
36
- SCANDIQA_DA_CONFIG = DatasetConfig(
37
- name="scandiqa-da",
38
- pretty_name="the Danish part of the truncated version of the question answering "
39
- "dataset ScandiQA",
40
- huggingface_id="EuroEval/scandiqa-da-mini",
35
+ MULTI_WIKI_QA_DA_CONFIG = DatasetConfig(
36
+ name="multi-wiki-qa-da",
37
+ pretty_name="the truncated version of the Danish part of the reading "
38
+ "comprehension dataset MultiWikiQA",
39
+ huggingface_id="EuroEval/multi-wiki-qa-da-mini",
41
40
  task=RC,
42
41
  languages=[DA],
43
42
  )
@@ -130,11 +129,11 @@ BELEBELE_DA_CONFIG = DatasetConfig(
130
129
  unofficial=True,
131
130
  )
132
131
 
133
- MULTI_WIKI_QA_DA_CONFIG = DatasetConfig(
134
- name="multi-wiki-qa-da",
135
- pretty_name="the truncated version of the Danish part of the reading "
136
- "comprehension dataset MultiWikiQA",
137
- huggingface_id="EuroEval/multi-wiki-qa-da-mini",
132
+ SCANDIQA_DA_CONFIG = DatasetConfig(
133
+ name="scandiqa-da",
134
+ pretty_name="the Danish part of the truncated version of the question answering "
135
+ "dataset ScandiQA",
136
+ huggingface_id="EuroEval/scandiqa-da-mini",
138
137
  task=RC,
139
138
  languages=[DA],
140
139
  unofficial=True,
@@ -157,9 +156,7 @@ WINOGRANDE_DA_CONFIG = DatasetConfig(
157
156
  huggingface_id="EuroEval/winogrande-da",
158
157
  task=COMMON_SENSE,
159
158
  languages=[DA],
160
- splits=["train", "test"],
161
159
  _labels=["a", "b"],
162
- _allowed_model_types=[ModelType.GENERATIVE],
163
160
  unofficial=True,
164
161
  )
165
162
 
euroeval/dataset_configs/dutch.py CHANGED
@@ -1,7 +1,6 @@
1
1
  """All Dutch dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import NL
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -150,9 +149,7 @@ WINOGRANDE_NL_CONFIG = DatasetConfig(
150
149
  huggingface_id="EuroEval/winogrande-nl",
151
150
  task=COMMON_SENSE,
152
151
  languages=[NL],
153
- splits=["train", "test"],
154
152
  _labels=["a", "b"],
155
- _allowed_model_types=[ModelType.GENERATIVE],
156
153
  unofficial=True,
157
154
  )
158
155
 
euroeval/dataset_configs/english.py CHANGED
@@ -1,7 +1,6 @@
1
1
  """All English dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import EN
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -133,9 +132,7 @@ WINOGRANDE_CONFIG = DatasetConfig(
133
132
  huggingface_id="EuroEval/winogrande-en",
134
133
  task=COMMON_SENSE,
135
134
  languages=[EN],
136
- splits=["train", "test"],
137
135
  _labels=["a", "b"],
138
- _allowed_model_types=[ModelType.GENERATIVE],
139
136
  unofficial=True,
140
137
  )
141
138
 
euroeval/dataset_configs/estonian.py CHANGED
@@ -94,10 +94,20 @@ SCALA_ET_CONFIG = DatasetConfig(
94
94
 
95
95
  EXAM_ET_CONFIG = DatasetConfig(
96
96
  name="exam-et",
97
- pretty_name="the Estonian knowledge assessment dataset Exam-et",
97
+ pretty_name="the Estonian knowledge dataset Exam-et",
98
98
  huggingface_id="EuroEval/exam-et",
99
99
  task=KNOW,
100
100
  languages=[ET],
101
101
  _labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"],
102
102
  unofficial=True,
103
103
  )
104
+
105
+ MMLU_ET_CONFIG = DatasetConfig(
106
+ name="mmlu-et",
107
+ pretty_name="the truncated version of the Estonian knowledge dataset MMLU-et, "
108
+ "translated from the English MMLU dataset",
109
+ huggingface_id="EuroEval/mmlu-et-mini",
110
+ task=KNOW,
111
+ languages=[ET],
112
+ unofficial=True,
113
+ )
euroeval/dataset_configs/finnish.py CHANGED
@@ -1,7 +1,6 @@
1
1
  """All Finnish dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import FI
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -109,9 +108,7 @@ WINOGRANDE_FI_CONFIG = DatasetConfig(
109
108
  huggingface_id="EuroEval/winogrande-fi",
110
109
  task=COMMON_SENSE,
111
110
  languages=[FI],
112
- splits=["train", "test"],
113
111
  _labels=["a", "b"],
114
- _allowed_model_types=[ModelType.GENERATIVE],
115
112
  unofficial=True,
116
113
  )
117
114
 
euroeval/dataset_configs/french.py CHANGED
@@ -1,7 +1,6 @@
1
1
  """All French dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import FR
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -121,9 +120,7 @@ WINOGRANDE_FR_CONFIG = DatasetConfig(
121
120
  huggingface_id="EuroEval/winogrande-fr",
122
121
  task=COMMON_SENSE,
123
122
  languages=[FR],
124
- splits=["train", "test"],
125
123
  _labels=["a", "b"],
126
- _allowed_model_types=[ModelType.GENERATIVE],
127
124
  unofficial=True,
128
125
  )
129
126
 
euroeval/dataset_configs/german.py CHANGED
@@ -1,7 +1,6 @@
1
1
  """All German dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import DE
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -138,9 +137,7 @@ WINOGRANDE_DE_CONFIG = DatasetConfig(
138
137
  huggingface_id="EuroEval/winogrande-de",
139
138
  task=COMMON_SENSE,
140
139
  languages=[DE],
141
- splits=["train", "test"],
142
140
  _labels=["a", "b"],
143
- _allowed_model_types=[ModelType.GENERATIVE],
144
141
  unofficial=True,
145
142
  )
146
143
 
euroeval/dataset_configs/italian.py CHANGED
@@ -1,7 +1,6 @@
1
1
  """All Italian dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import IT
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -129,9 +128,7 @@ WINOGRANDE_IT_CONFIG = DatasetConfig(
129
128
  huggingface_id="EuroEval/winogrande-it",
130
129
  task=COMMON_SENSE,
131
130
  languages=[IT],
132
- splits=["train", "test"],
133
131
  _labels=["a", "b"],
134
- _allowed_model_types=[ModelType.GENERATIVE],
135
132
  unofficial=True,
136
133
  )
137
134
 
euroeval/dataset_configs/latvian.py CHANGED
@@ -1,7 +1,6 @@
1
1
  """All Latvian dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import LV
6
5
  from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
7
6
 
@@ -25,7 +24,8 @@ SCALA_LV_CONFIG = DatasetConfig(
25
24
 
26
25
  FULLSTACK_NER_LV_CONFIG = DatasetConfig(
27
26
  name="fullstack-ner-lv",
28
- pretty_name="the truncated version of the FullStack NER dataset",
27
+ pretty_name="the truncated version of the Latvian named entity recognition "
28
+ "dataset FullStack-NER-lv",
29
29
  huggingface_id="EuroEval/fullstack-ner-lv-mini",
30
30
  task=NER,
31
31
  languages=[LV],
@@ -88,8 +88,6 @@ WINOGRANDE_LV_CONFIG = DatasetConfig(
88
88
  huggingface_id="EuroEval/winogrande-lv",
89
89
  task=COMMON_SENSE,
90
90
  languages=[LV],
91
- splits=["train", "test"],
92
91
  _labels=["a", "b"],
93
- _allowed_model_types=[ModelType.GENERATIVE],
94
92
  unofficial=True,
95
93
  )
euroeval/dataset_configs/lithuanian.py ADDED
@@ -0,0 +1,68 @@
1
+ """All Lithuanian dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import LT
5
+ from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
6
+
7
+ ### Official datasets ###
8
+
9
+ LITHUANIAN_EMOTIONS_CONFIG = DatasetConfig(
10
+ name="lithuanian-emotions",
11
+ pretty_name="the truncated version of the Lithuanian sentiment "
12
+ "classification dataset Lithuanian Emotions",
13
+ huggingface_id="EuroEval/lithuanian-emotions-mini",
14
+ task=SENT,
15
+ languages=[LT],
16
+ )
17
+
18
+ SCALA_LT_CONFIG = DatasetConfig(
19
+ name="scala-lt",
20
+ pretty_name="the Lithuanian part of the linguistic acceptability dataset ScaLA",
21
+ huggingface_id="EuroEval/scala-lt",
22
+ task=LA,
23
+ languages=[LT],
24
+ )
25
+
26
+ WIKIANN_LT_CONFIG = DatasetConfig(
27
+ name="wikiann-lt",
28
+ pretty_name="the truncated version of the Lithuanian part of the named entity "
29
+ "recognition dataset WikiANN",
30
+ huggingface_id="EuroEval/wikiann-lt-mini",
31
+ task=NER,
32
+ languages=[LT],
33
+ )
34
+
35
+ MULTI_WIKI_QA_LT_CONFIG = DatasetConfig(
36
+ name="multi-wiki-qa-lt",
37
+ pretty_name="the truncated version of the Lithuanian part of the reading "
38
+ "comprehension dataset MultiWikiQA",
39
+ huggingface_id="EuroEval/multi-wiki-qa-lt-mini",
40
+ task=RC,
41
+ languages=[LT],
42
+ )
43
+
44
+ LRYTAS_CONFIG = DatasetConfig(
45
+ name="lrytas",
46
+ pretty_name="the truncated version of the Lithuanian summarisation dataset Lrytas",
47
+ huggingface_id="EuroEval/lrytas-mini",
48
+ task=SUMM,
49
+ languages=[LT],
50
+ )
51
+
52
+ LT_HISTORY_CONFIG = DatasetConfig(
53
+ name="lt-history",
54
+ pretty_name="the Lithuanian knowledge dataset LT-History",
55
+ huggingface_id="EuroEval/lt-history",
56
+ task=KNOW,
57
+ languages=[LT],
58
+ )
59
+
60
+ WINOGRANDE_LT_CONFIG = DatasetConfig(
61
+ name="winogrande-lt",
62
+ pretty_name="the Lithuanian common-sense reasoning dataset Winogrande-lt, "
63
+ "translated from the English Winogrande dataset",
64
+ huggingface_id="EuroEval/winogrande-lt",
65
+ task=COMMON_SENSE,
66
+ languages=[LT],
67
+ _labels=["a", "b"],
68
+ )
euroeval/dataset_configs/norwegian.py CHANGED
@@ -1,7 +1,6 @@
1
1
  """All Norwegian dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import NB, NN, NO
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -224,9 +223,7 @@ WINOGRANDE_NO_CONFIG = DatasetConfig(
224
223
  huggingface_id="EuroEval/winogrande-no",
225
224
  task=COMMON_SENSE,
226
225
  languages=[NB, NN, NO],
227
- splits=["train", "test"],
228
226
  _labels=["a", "b"],
229
- _allowed_model_types=[ModelType.GENERATIVE],
230
227
  unofficial=True,
231
228
  )
232
229
 
euroeval/dataset_configs/polish.py CHANGED
@@ -1,7 +1,6 @@
1
1
  """All Polish dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import PL
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, NER, RC, SENT, SUMM
7
6
 
@@ -62,9 +61,7 @@ WINOGRANDE_PL_CONFIG = DatasetConfig(
62
61
  huggingface_id="EuroEval/winogrande-pl",
63
62
  task=COMMON_SENSE,
64
63
  languages=[PL],
65
- splits=["train", "test"],
66
64
  _labels=["a", "b"],
67
- _allowed_model_types=[ModelType.GENERATIVE],
68
65
  )
69
66
 
70
67
  EUROPEAN_VALUES_PL_CONFIG = DatasetConfig(
euroeval/dataset_configs/portuguese.py CHANGED
@@ -1,7 +1,6 @@
1
1
  """All Portuguese dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import PT
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -99,9 +98,7 @@ WINOGRANDE_PT_CONFIG = DatasetConfig(
99
98
  huggingface_id="EuroEval/winogrande-pt",
100
99
  task=COMMON_SENSE,
101
100
  languages=[PT],
102
- splits=["train", "test"],
103
101
  _labels=["a", "b"],
104
- _allowed_model_types=[ModelType.GENERATIVE],
105
102
  unofficial=True,
106
103
  )
107
104
 
euroeval/dataset_configs/slovak.py ADDED
@@ -0,0 +1,60 @@
1
+ """All Slovak dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import SK
5
+ from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
6
+
7
+ ### Official datasets ###
8
+
9
+ CSFD_SENTIMENT_SK_CONFIG = DatasetConfig(
10
+ name="csfd-sentiment-sk",
11
+ pretty_name="the truncated version of the Slovak sentiment classification dataset "
12
+ "CSFD-sentiment-sk",
13
+ huggingface_id="EuroEval/csfd-sentiment-sk-mini",
14
+ task=SENT,
15
+ languages=[SK],
16
+ )
17
+
18
+ SCALA_SK_CONFIG = DatasetConfig(
19
+ name="scala-sk",
20
+ pretty_name="the Slovak part of the linguistic acceptability dataset ScaLA",
21
+ huggingface_id="EuroEval/scala-sk",
22
+ task=LA,
23
+ languages=[SK],
24
+ )
25
+
26
+ UNER_SK_CONFIG = DatasetConfig(
27
+ name="uner-sk",
28
+ pretty_name="the truncated version of the Slovak named entity recognition dataset "
29
+ "UNER-sk",
30
+ huggingface_id="EuroEval/uner-sk-mini",
31
+ task=NER,
32
+ languages=[SK],
33
+ )
34
+
35
+ MULTI_WIKI_QA_SK_CONFIG = DatasetConfig(
36
+ name="multi-wiki-qa-sk",
37
+ pretty_name="the truncated version of the Slovak part of the reading comprehension "
38
+ "dataset MultiWikiQA",
39
+ huggingface_id="EuroEval/multi-wiki-qa-sk-mini",
40
+ task=RC,
41
+ languages=[SK],
42
+ )
43
+
44
+ MMLU_SK_CONFIG = DatasetConfig(
45
+ name="mmlu-sk",
46
+ pretty_name="the truncated version of the Slovak knowledge dataset MMLU-sk, "
47
+ "translated from the English MMLU dataset",
48
+ huggingface_id="EuroEval/mmlu-sk-mini",
49
+ task=KNOW,
50
+ languages=[SK],
51
+ )
52
+
53
+ WINOGRANDE_SK_CONFIG = DatasetConfig(
54
+ name="winogrande-sk",
55
+ pretty_name="the Slovak common-sense reasoning dataset Winogrande-sk, translated "
56
+ "from the English Winogrande dataset",
57
+ huggingface_id="EuroEval/winogrande-sk",
58
+ task=COMMON_SENSE,
59
+ languages=[SK],
60
+ )
euroeval/dataset_configs/spanish.py CHANGED
@@ -1,7 +1,6 @@
1
1
  """All Spanish dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import ES
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -127,9 +126,7 @@ WINOGRANDE_ES_CONFIG = DatasetConfig(
127
126
  huggingface_id="EuroEval/winogrande-es",
128
127
  task=COMMON_SENSE,
129
128
  languages=[ES],
130
- splits=["train", "test"],
131
129
  _labels=["a", "b"],
132
- _allowed_model_types=[ModelType.GENERATIVE],
133
130
  unofficial=True,
134
131
  )
135
132