EuroEval 16.4.0__py3-none-any.whl → 16.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic; see the registry advisory for more details.

Files changed (71)
  1. euroeval/__init__.py +6 -0
  2. euroeval/benchmark_config_factory.py +51 -46
  3. euroeval/benchmark_modules/base.py +6 -5
  4. euroeval/benchmark_modules/hf.py +2 -9
  5. euroeval/benchmark_modules/litellm.py +14 -12
  6. euroeval/benchmark_modules/vllm.py +17 -10
  7. euroeval/benchmarker.py +61 -44
  8. euroeval/caching_utils.py +1 -1
  9. euroeval/cli.py +86 -8
  10. euroeval/constants.py +3 -0
  11. euroeval/data_loading.py +78 -30
  12. euroeval/data_models.py +326 -326
  13. euroeval/dataset_configs/__init__.py +10 -3
  14. euroeval/dataset_configs/bulgarian.py +56 -0
  15. euroeval/dataset_configs/czech.py +25 -29
  16. euroeval/dataset_configs/danish.py +51 -88
  17. euroeval/dataset_configs/dutch.py +48 -86
  18. euroeval/dataset_configs/english.py +45 -76
  19. euroeval/dataset_configs/estonian.py +36 -38
  20. euroeval/dataset_configs/faroese.py +19 -60
  21. euroeval/dataset_configs/finnish.py +36 -68
  22. euroeval/dataset_configs/french.py +39 -74
  23. euroeval/dataset_configs/german.py +45 -81
  24. euroeval/dataset_configs/greek.py +64 -0
  25. euroeval/dataset_configs/icelandic.py +54 -91
  26. euroeval/dataset_configs/italian.py +42 -78
  27. euroeval/dataset_configs/latvian.py +28 -34
  28. euroeval/dataset_configs/lithuanian.py +22 -26
  29. euroeval/dataset_configs/norwegian.py +72 -114
  30. euroeval/dataset_configs/polish.py +33 -60
  31. euroeval/dataset_configs/portuguese.py +33 -65
  32. euroeval/dataset_configs/serbian.py +64 -0
  33. euroeval/dataset_configs/slovak.py +19 -24
  34. euroeval/dataset_configs/spanish.py +42 -76
  35. euroeval/dataset_configs/swedish.py +48 -84
  36. euroeval/dataset_configs/ukrainian.py +64 -0
  37. euroeval/exceptions.py +1 -1
  38. euroeval/finetuning.py +3 -2
  39. euroeval/generation.py +5 -4
  40. euroeval/generation_utils.py +6 -5
  41. euroeval/languages.py +395 -323
  42. euroeval/metrics/huggingface.py +14 -3
  43. euroeval/metrics/llm_as_a_judge.py +1 -1
  44. euroeval/model_cache.py +6 -5
  45. euroeval/model_loading.py +1 -1
  46. euroeval/prompt_templates/__init__.py +2 -0
  47. euroeval/prompt_templates/classification.py +206 -0
  48. euroeval/prompt_templates/linguistic_acceptability.py +82 -43
  49. euroeval/prompt_templates/multiple_choice.py +81 -41
  50. euroeval/prompt_templates/named_entity_recognition.py +125 -44
  51. euroeval/prompt_templates/reading_comprehension.py +92 -43
  52. euroeval/prompt_templates/sentiment_classification.py +91 -43
  53. euroeval/prompt_templates/summarization.py +64 -39
  54. euroeval/prompt_templates/token_classification.py +279 -0
  55. euroeval/scores.py +4 -3
  56. euroeval/speed_benchmark.py +2 -1
  57. euroeval/task_group_utils/multiple_choice_classification.py +2 -1
  58. euroeval/task_group_utils/question_answering.py +24 -13
  59. euroeval/task_group_utils/sequence_classification.py +5 -4
  60. euroeval/task_group_utils/text_to_text.py +2 -1
  61. euroeval/task_group_utils/token_classification.py +11 -8
  62. euroeval/tasks.py +44 -1
  63. euroeval/tokenisation_utils.py +19 -10
  64. euroeval/types.py +10 -9
  65. euroeval/utils.py +6 -3
  66. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
  67. euroeval-16.5.0.dist-info/RECORD +81 -0
  68. euroeval-16.4.0.dist-info/RECORD +0 -75
  69. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
  70. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
  71. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,64 @@
1
+ """All Serbian dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import SERBIAN
5
+ from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
6
+
7
+ ### Official datasets ###
8
+
9
+ MMS_SR_CONFIG = DatasetConfig(
10
+ name="mms-sr",
11
+ pretty_name="MMS-sr",
12
+ source="EuroEval/mms-sr-mini",
13
+ task=SENT,
14
+ languages=[SERBIAN],
15
+ )
16
+
17
+ SCALA_SR_CONFIG = DatasetConfig(
18
+ name="scala-sr",
19
+ pretty_name="ScaLA-sr",
20
+ source="EuroEval/scala-sr",
21
+ task=LA,
22
+ languages=[SERBIAN],
23
+ )
24
+
25
+ UNER_SR_CONFIG = DatasetConfig(
26
+ name="uner-sr",
27
+ pretty_name="UNER-sr",
28
+ source="EuroEval/uner-sr-mini",
29
+ task=NER,
30
+ languages=[SERBIAN],
31
+ )
32
+
33
+ MULTI_WIKI_QA_SR_CONFIG = DatasetConfig(
34
+ name="multi-wiki-qa-sr",
35
+ pretty_name="MultiWikiQA-sr",
36
+ source="EuroEval/multi-wiki-qa-sr-mini",
37
+ task=RC,
38
+ languages=[SERBIAN],
39
+ )
40
+
41
+ LR_SUM_SR_CONFIG = DatasetConfig(
42
+ name="lr-sum-sr",
43
+ pretty_name="LRSum-sr",
44
+ source="EuroEval/lr-sum-sr-mini",
45
+ task=SUMM,
46
+ languages=[SERBIAN],
47
+ )
48
+
49
+ MMLU_SR_CONFIG = DatasetConfig(
50
+ name="mmlu-sr",
51
+ pretty_name="MMLU-sr",
52
+ source="EuroEval/mmlu-sr-mini",
53
+ task=KNOW,
54
+ languages=[SERBIAN],
55
+ )
56
+
57
+ WINOGRANDE_SR_CONFIG = DatasetConfig(
58
+ name="winogrande-sr",
59
+ pretty_name="Winogrande-sr",
60
+ source="EuroEval/winogrande-sr",
61
+ task=COMMON_SENSE,
62
+ languages=[SERBIAN],
63
+ _labels=["a", "b"],
64
+ )
@@ -1,60 +1,55 @@
1
1
  """All Slovak dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..languages import SK
4
+ from ..languages import SLOVAK
5
5
  from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
6
6
 
7
7
  ### Official datasets ###
8
8
 
9
9
  CSFD_SENTIMENT_SK_CONFIG = DatasetConfig(
10
10
  name="csfd-sentiment-sk",
11
- pretty_name="the truncated version of the Slovak sentiment classification dataset "
12
- "CSFD-sentiment-sk",
13
- huggingface_id="EuroEval/csfd-sentiment-sk-mini",
11
+ pretty_name="CSFD Sentiment SK",
12
+ source="EuroEval/csfd-sentiment-sk-mini",
14
13
  task=SENT,
15
- languages=[SK],
14
+ languages=[SLOVAK],
16
15
  )
17
16
 
18
17
  SCALA_SK_CONFIG = DatasetConfig(
19
18
  name="scala-sk",
20
- pretty_name="the Slovak part of the linguistic acceptability dataset ScaLA",
21
- huggingface_id="EuroEval/scala-sk",
19
+ pretty_name="ScaLA-sk",
20
+ source="EuroEval/scala-sk",
22
21
  task=LA,
23
- languages=[SK],
22
+ languages=[SLOVAK],
24
23
  )
25
24
 
26
25
  UNER_SK_CONFIG = DatasetConfig(
27
26
  name="uner-sk",
28
- pretty_name="the truncated version of the Slovak named entity recognition dataset "
29
- "UNER-sk",
30
- huggingface_id="EuroEval/uner-sk-mini",
27
+ pretty_name="UNER-sk",
28
+ source="EuroEval/uner-sk-mini",
31
29
  task=NER,
32
- languages=[SK],
30
+ languages=[SLOVAK],
33
31
  )
34
32
 
35
33
  MULTI_WIKI_QA_SK_CONFIG = DatasetConfig(
36
34
  name="multi-wiki-qa-sk",
37
- pretty_name="the truncated version of the Slovak part of the reading comprehension "
38
- "dataset MultiWikiQA",
39
- huggingface_id="EuroEval/multi-wiki-qa-sk-mini",
35
+ pretty_name="MultiWikiQA-sk",
36
+ source="EuroEval/multi-wiki-qa-sk-mini",
40
37
  task=RC,
41
- languages=[SK],
38
+ languages=[SLOVAK],
42
39
  )
43
40
 
44
41
  MMLU_SK_CONFIG = DatasetConfig(
45
42
  name="mmlu-sk",
46
- pretty_name="the truncated version of the Slovak knowledge dataset MMLU-sk, "
47
- "translated from the English MMLU dataset",
48
- huggingface_id="EuroEval/mmlu-sk-mini",
43
+ pretty_name="MMLU-sk",
44
+ source="EuroEval/mmlu-sk-mini",
49
45
  task=KNOW,
50
- languages=[SK],
46
+ languages=[SLOVAK],
51
47
  )
52
48
 
53
49
  WINOGRANDE_SK_CONFIG = DatasetConfig(
54
50
  name="winogrande-sk",
55
- pretty_name="the Slovak common-sense reasoning dataset Winogrande-sk, translated "
56
- "from the English Winogrande dataset",
57
- huggingface_id="EuroEval/winogrande-sk",
51
+ pretty_name="Winogrande-sk",
52
+ source="EuroEval/winogrande-sk",
58
53
  task=COMMON_SENSE,
59
- languages=[SK],
54
+ languages=[SLOVAK],
60
55
  )
@@ -1,77 +1,73 @@
1
1
  """All Spanish dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..languages import ES
4
+ from ..languages import SPANISH
5
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
6
6
 
7
7
  ### Official datasets ###
8
8
 
9
9
  SENTIMENT_HEADLINES_CONFIG = DatasetConfig(
10
10
  name="sentiment-headlines-es",
11
- pretty_name="the truncated version of the Spanish sentiment classification dataset "
12
- "SentimentHeadlines",
13
- huggingface_id="EuroEval/sentiment-headlines-es",
11
+ pretty_name="Sentiment Headlines ES",
12
+ source="EuroEval/sentiment-headlines-es",
14
13
  task=SENT,
15
- languages=[ES],
14
+ languages=[SPANISH],
16
15
  )
17
16
 
18
17
  SCALA_ES_CONFIG = DatasetConfig(
19
18
  name="scala-es",
20
- pretty_name="the Spanish part of the linguistic acceptability dataset ScaLA",
21
- huggingface_id="EuroEval/scala-es",
19
+ pretty_name="ScaLA-es",
20
+ source="EuroEval/scala-es",
22
21
  task=LA,
23
- languages=[ES],
22
+ languages=[SPANISH],
24
23
  )
25
24
 
26
25
  CONLL_ES_CONFIG = DatasetConfig(
27
26
  name="conll-es",
28
- pretty_name="the Spanish part of the truncated version of the named entity "
29
- "recognition dataset CoNLL 2002",
30
- huggingface_id="EuroEval/conll-es-mini",
27
+ pretty_name="CoNLL-es",
28
+ source="EuroEval/conll-es-mini",
31
29
  task=NER,
32
- languages=[ES],
30
+ languages=[SPANISH],
33
31
  )
34
32
 
35
33
  MLQA_ES_CONFIG = DatasetConfig(
36
34
  name="mlqa-es",
37
- pretty_name="the Spanish version of the reading comprehension dataset MLQA",
38
- huggingface_id="EuroEval/mlqa-es",
35
+ pretty_name="MLQA-es",
36
+ source="EuroEval/mlqa-es",
39
37
  task=RC,
40
- languages=[ES],
38
+ languages=[SPANISH],
41
39
  )
42
40
 
43
41
  MLSUM_ES_CONFIG = DatasetConfig(
44
42
  name="mlsum-es",
45
- pretty_name="the truncated version of the Spanish summarisation dataset MLSum-es",
46
- huggingface_id="EuroEval/mlsum-es-mini",
43
+ pretty_name="MLSUM-es",
44
+ source="EuroEval/mlsum-es-mini",
47
45
  task=SUMM,
48
- languages=[ES],
46
+ languages=[SPANISH],
49
47
  )
50
48
 
51
49
  MMLU_ES_CONFIG = DatasetConfig(
52
50
  name="mmlu-es",
53
- pretty_name="the truncated version of the Spanish knowledge dataset MMLU-es, "
54
- "translated from the English MMLU dataset",
55
- huggingface_id="EuroEval/mmlu-es-mini",
51
+ pretty_name="MMLU-es",
52
+ source="EuroEval/mmlu-es-mini",
56
53
  task=KNOW,
57
- languages=[ES],
54
+ languages=[SPANISH],
58
55
  )
59
56
 
60
57
  HELLASWAG_ES_CONFIG = DatasetConfig(
61
58
  name="hellaswag-es",
62
- pretty_name="the truncated version of the Spanish common-sense reasoning dataset "
63
- "HellaSwag-es, translated from the English HellaSwag dataset",
64
- huggingface_id="EuroEval/hellaswag-es-mini",
59
+ pretty_name="HellaSwag-es",
60
+ source="EuroEval/hellaswag-es-mini",
65
61
  task=COMMON_SENSE,
66
- languages=[ES],
62
+ languages=[SPANISH],
67
63
  )
68
64
 
69
- EUROPEAN_VALUES_ES_CONFIG = DatasetConfig(
70
- name="european-values-es",
71
- pretty_name="the Spanish version of the European values evaluation dataset",
72
- huggingface_id="EuroEval/european-values-es",
65
+ VALEU_ES_CONFIG = DatasetConfig(
66
+ name="valeu-es",
67
+ pretty_name="VaLEU-es",
68
+ source="EuroEval/european-values-es",
73
69
  task=EUROPEAN_VALUES,
74
- languages=[ES],
70
+ languages=[SPANISH],
75
71
  splits=["test"],
76
72
  bootstrap_samples=False,
77
73
  _instruction_prompt="{text}",
@@ -82,76 +78,46 @@ EUROPEAN_VALUES_ES_CONFIG = DatasetConfig(
82
78
 
83
79
  XQUAD_ES_CONFIG = DatasetConfig(
84
80
  name="xquad-es",
85
- pretty_name="the Spanish version of the reading comprehension dataset XQuAD",
86
- huggingface_id="EuroEval/xquad-es",
81
+ pretty_name="XQuAD-es",
82
+ source="EuroEval/xquad-es",
87
83
  task=RC,
88
- languages=[ES],
84
+ languages=[SPANISH],
89
85
  unofficial=True,
90
86
  )
91
87
 
92
88
  BELEBELE_ES_CONFIG = DatasetConfig(
93
89
  name="belebele-es",
94
- pretty_name="the Spanish multiple choice reading comprehension dataset "
95
- "BeleBele-es, translated from the English BeleBele dataset",
96
- huggingface_id="EuroEval/belebele-es-mini",
90
+ pretty_name="Belebele-es",
91
+ source="EuroEval/belebele-es-mini",
97
92
  task=MCRC,
98
- languages=[ES],
93
+ languages=[SPANISH],
99
94
  unofficial=True,
100
95
  )
101
96
 
102
97
  MULTI_WIKI_QA_ES_CONFIG = DatasetConfig(
103
98
  name="multi-wiki-qa-es",
104
- pretty_name="the truncated version of the Spanish part of the reading "
105
- "comprehension dataset MultiWikiQA",
106
- huggingface_id="EuroEval/multi-wiki-qa-es-mini",
99
+ pretty_name="MultiWikiQA-es",
100
+ source="EuroEval/multi-wiki-qa-es-mini",
107
101
  task=RC,
108
- languages=[ES],
102
+ languages=[SPANISH],
109
103
  unofficial=True,
110
104
  )
111
105
 
112
106
  GOLDENSWAG_ES_CONFIG = DatasetConfig(
113
107
  name="goldenswag-es",
114
- pretty_name="the truncated version of the Spanish common-sense reasoning "
115
- "dataset GoldenSwag-es, translated from the English GoldenSwag dataset",
116
- huggingface_id="EuroEval/goldenswag-es-mini",
108
+ pretty_name="GoldenSwag-es",
109
+ source="EuroEval/goldenswag-es-mini",
117
110
  task=COMMON_SENSE,
118
- languages=[ES],
111
+ languages=[SPANISH],
119
112
  unofficial=True,
120
113
  )
121
114
 
122
115
  WINOGRANDE_ES_CONFIG = DatasetConfig(
123
116
  name="winogrande-es",
124
- pretty_name="the Spanish common-sense reasoning dataset Winogrande-es, translated "
125
- "from the English Winogrande dataset",
126
- huggingface_id="EuroEval/winogrande-es",
117
+ pretty_name="Winogrande-es",
118
+ source="EuroEval/winogrande-es",
127
119
  task=COMMON_SENSE,
128
- languages=[ES],
120
+ languages=[SPANISH],
129
121
  _labels=["a", "b"],
130
122
  unofficial=True,
131
123
  )
132
-
133
- EUROPEAN_VALUES_SITUATIONAL_ES_CONFIG = DatasetConfig(
134
- name="european-values-situational-es",
135
- pretty_name="the Spanish version of the European values evaluation dataset, where "
136
- "the questions are phrased in a situational way",
137
- huggingface_id="EuroEval/european-values-situational-es",
138
- task=EUROPEAN_VALUES,
139
- languages=[ES],
140
- splits=["test"],
141
- bootstrap_samples=False,
142
- _instruction_prompt="{text}",
143
- unofficial=True,
144
- )
145
-
146
- EUROPEAN_VALUES_COMPLETIONS_ES_CONFIG = DatasetConfig(
147
- name="european-values-completions-es",
148
- pretty_name="the Spanish version of the European values evaluation dataset, where "
149
- "the questions are phrased as sentence completions",
150
- huggingface_id="EuroEval/european-values-completions-es",
151
- task=EUROPEAN_VALUES,
152
- languages=[ES],
153
- splits=["test"],
154
- bootstrap_samples=False,
155
- _instruction_prompt="{text}",
156
- unofficial=True,
157
- )
@@ -1,78 +1,73 @@
1
1
  """All Swedish dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..languages import SV
4
+ from ..languages import SWEDISH
5
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
6
6
 
7
7
  ### Official datasets ###
8
8
 
9
9
  SWEREC_CONFIG = DatasetConfig(
10
10
  name="swerec",
11
- pretty_name="the truncated version of the Swedish sentiment classification "
12
- "dataset SweReC",
13
- huggingface_id="EuroEval/swerec-mini",
11
+ pretty_name="SweReC",
12
+ source="EuroEval/swerec-mini",
14
13
  task=SENT,
15
- languages=[SV],
14
+ languages=[SWEDISH],
16
15
  )
17
16
 
18
17
  SCALA_SV_CONFIG = DatasetConfig(
19
18
  name="scala-sv",
20
- pretty_name="The Swedish part of the linguistic acceptability dataset ScaLA",
21
- huggingface_id="EuroEval/scala-sv",
19
+ pretty_name="ScaLA-sv",
20
+ source="EuroEval/scala-sv",
22
21
  task=LA,
23
- languages=[SV],
22
+ languages=[SWEDISH],
24
23
  )
25
24
 
26
25
  SUC3_CONFIG = DatasetConfig(
27
26
  name="suc3",
28
- pretty_name="the truncated version of the Swedish named entity recognition "
29
- "dataset SUC 3.0",
30
- huggingface_id="EuroEval/suc3-mini",
27
+ pretty_name="SUC3",
28
+ source="EuroEval/suc3-mini",
31
29
  task=NER,
32
- languages=[SV],
30
+ languages=[SWEDISH],
33
31
  )
34
32
 
35
33
  MULTI_WIKI_QA_SV_CONFIG = DatasetConfig(
36
34
  name="multi-wiki-qa-sv",
37
- pretty_name="the truncated version of the Swedish part of the reading "
38
- "comprehension dataset MultiWikiQA",
39
- huggingface_id="EuroEval/multi-wiki-qa-sv-mini",
35
+ pretty_name="MultiWikiQA-sv",
36
+ source="EuroEval/multi-wiki-qa-sv-mini",
40
37
  task=RC,
41
- languages=[SV],
38
+ languages=[SWEDISH],
42
39
  )
43
40
 
44
41
  SWEDN_CONFIG = DatasetConfig(
45
42
  name="swedn",
46
- pretty_name="the truncated version of the Swedish summarisation dataset SweDN",
47
- huggingface_id="EuroEval/swedn-mini",
43
+ pretty_name="SweDN",
44
+ source="EuroEval/swedn-mini",
48
45
  task=SUMM,
49
- languages=[SV],
46
+ languages=[SWEDISH],
50
47
  )
51
48
 
52
49
  MMLU_SV_CONFIG = DatasetConfig(
53
50
  name="mmlu-sv",
54
- pretty_name="the truncated version of the Swedish knowledge dataset MMLU-sv, "
55
- "translated from the English MMLU dataset",
56
- huggingface_id="EuroEval/mmlu-sv-mini",
51
+ pretty_name="MMLU-sv",
52
+ source="EuroEval/mmlu-sv-mini",
57
53
  task=KNOW,
58
- languages=[SV],
54
+ languages=[SWEDISH],
59
55
  )
60
56
 
61
57
  HELLASWAG_SV_CONFIG = DatasetConfig(
62
58
  name="hellaswag-sv",
63
- pretty_name="the truncated version of the Swedish common-sense reasoning dataset "
64
- "HellaSwag-sv, translated from the English HellaSwag dataset",
65
- huggingface_id="EuroEval/hellaswag-sv-mini",
59
+ pretty_name="HellaSwag-sv",
60
+ source="EuroEval/hellaswag-sv-mini",
66
61
  task=COMMON_SENSE,
67
- languages=[SV],
62
+ languages=[SWEDISH],
68
63
  )
69
64
 
70
- EUROPEAN_VALUES_SV_CONFIG = DatasetConfig(
71
- name="european-values-sv",
72
- pretty_name="the Swedish version of the European values evaluation dataset",
73
- huggingface_id="EuroEval/european-values-sv",
65
+ VALEU_SV_CONFIG = DatasetConfig(
66
+ name="valeu-sv",
67
+ pretty_name="VaLEU-sv",
68
+ source="EuroEval/european-values-sv",
74
69
  task=EUROPEAN_VALUES,
75
- languages=[SV],
70
+ languages=[SWEDISH],
76
71
  splits=["test"],
77
72
  bootstrap_samples=False,
78
73
  _instruction_prompt="{text}",
@@ -83,95 +78,64 @@ EUROPEAN_VALUES_SV_CONFIG = DatasetConfig(
83
78
 
84
79
  SCHIBSTED_SV_CONFIG = DatasetConfig(
85
80
  name="schibsted-sv",
86
- pretty_name="the Swedish summarisation dataset Schibsted-sv",
87
- huggingface_id="EuroEval/schibsted-article-summaries-sv",
81
+ pretty_name="Schibsted-sv",
82
+ source="EuroEval/schibsted-article-summaries-sv",
88
83
  task=SUMM,
89
- languages=[SV],
84
+ languages=[SWEDISH],
90
85
  unofficial=True,
91
86
  )
92
87
 
93
88
  ARC_SV_CONFIG = DatasetConfig(
94
89
  name="arc-sv",
95
- pretty_name="the truncated version of the Swedish knowledge dataset ARC-sv, "
96
- "translated from the English ARC dataset",
97
- huggingface_id="EuroEval/arc-sv-mini",
90
+ pretty_name="ARC-sv",
91
+ source="EuroEval/arc-sv-mini",
98
92
  task=KNOW,
99
- languages=[SV],
93
+ languages=[SWEDISH],
100
94
  unofficial=True,
101
95
  )
102
96
 
103
97
  BELEBELE_SV_CONFIG = DatasetConfig(
104
98
  name="belebele-sv",
105
- pretty_name="the Swedish multiple choice reading comprehension dataset "
106
- "BeleBele-sv, translated from the English BeleBele dataset",
107
- huggingface_id="EuroEval/belebele-sv-mini",
99
+ pretty_name="Belebele-sv",
100
+ source="EuroEval/belebele-sv-mini",
108
101
  task=MCRC,
109
- languages=[SV],
102
+ languages=[SWEDISH],
110
103
  unofficial=True,
111
104
  )
112
105
 
113
106
  SCANDIQA_SV_CONFIG = DatasetConfig(
114
107
  name="scandiqa-sv",
115
- pretty_name="the Swedish part of the truncated version of the question answering "
116
- "dataset ScandiQA",
117
- huggingface_id="EuroEval/scandiqa-sv-mini",
108
+ pretty_name="ScandiQA-sv",
109
+ source="EuroEval/scandiqa-sv-mini",
118
110
  task=RC,
119
- languages=[SV],
111
+ languages=[SWEDISH],
120
112
  unofficial=True,
121
113
  )
122
114
 
123
115
  GOLDENSWAG_SV_CONFIG = DatasetConfig(
124
116
  name="goldenswag-sv",
125
- pretty_name="the truncated version of the Swedish common-sense reasoning "
126
- "dataset GoldenSwag-sv, translated from the English GoldenSwag dataset",
127
- huggingface_id="EuroEval/goldenswag-sv-mini",
117
+ pretty_name="GoldenSwag-sv",
118
+ source="EuroEval/goldenswag-sv-mini",
128
119
  task=COMMON_SENSE,
129
- languages=[SV],
120
+ languages=[SWEDISH],
130
121
  unofficial=True,
131
122
  )
132
123
 
133
124
  WINOGRANDE_SV_CONFIG = DatasetConfig(
134
125
  name="winogrande-sv",
135
- pretty_name="the Swedish common-sense reasoning dataset Winogrande-sv, translated "
136
- "from the English Winogrande dataset",
137
- huggingface_id="EuroEval/winogrande-sv",
126
+ pretty_name="Winogrande-sv",
127
+ source="EuroEval/winogrande-sv",
138
128
  task=COMMON_SENSE,
139
- languages=[SV],
129
+ languages=[SWEDISH],
140
130
  _labels=["a", "b"],
141
131
  unofficial=True,
142
132
  )
143
133
 
144
- EUROPEAN_VALUES_SITUATIONAL_SV_CONFIG = DatasetConfig(
145
- name="european-values-situational-sv",
146
- pretty_name="the Swedish version of the European values evaluation dataset, where "
147
- "the questions are phrased in a situational way",
148
- huggingface_id="EuroEval/european-values-situational-sv",
149
- task=EUROPEAN_VALUES,
150
- languages=[SV],
151
- splits=["test"],
152
- bootstrap_samples=False,
153
- _instruction_prompt="{text}",
154
- unofficial=True,
155
- )
156
-
157
- EUROPEAN_VALUES_COMPLETIONS_SV_CONFIG = DatasetConfig(
158
- name="european-values-completions-sv",
159
- pretty_name="the Swedish version of the European values evaluation dataset, where "
160
- "the questions are phrased as sentence completions",
161
- huggingface_id="EuroEval/european-values-completions-sv",
162
- task=EUROPEAN_VALUES,
163
- languages=[SV],
164
- splits=["test"],
165
- bootstrap_samples=False,
166
- _instruction_prompt="{text}",
167
- unofficial=True,
168
- )
169
-
170
134
  SKOLPROV_CONFIG = DatasetConfig(
171
135
  name="skolprov",
172
- pretty_name="the Swedish knowledge dataset Skolprov",
173
- huggingface_id="EuroEval/skolprov",
136
+ pretty_name="Skolprov",
137
+ source="EuroEval/skolprov",
174
138
  task=KNOW,
175
- languages=[SV],
139
+ languages=[SWEDISH],
176
140
  unofficial=True,
177
141
  )
@@ -0,0 +1,64 @@
1
+ """All Ukrainian dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import UKRAINIAN
5
+ from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
6
+
7
+ ### Official datasets ###
8
+
9
+ CROSS_DOMAIN_UK_REVIEWS_CONFIG = DatasetConfig(
10
+ name="cross-domain-uk-reviews",
11
+ pretty_name="Cross Domain Ukrainian Reviews",
12
+ source="EuroEval/cross-domain-uk-reviews-mini",
13
+ task=SENT,
14
+ languages=[UKRAINIAN],
15
+ )
16
+
17
+ SCALA_UK_CONFIG = DatasetConfig(
18
+ name="scala-uk",
19
+ pretty_name="ScaLA-uk",
20
+ source="EuroEval/scala-uk",
21
+ task=LA,
22
+ languages=[UKRAINIAN],
23
+ )
24
+
25
+ NER_UK_CONFIG = DatasetConfig(
26
+ name="ner-uk",
27
+ pretty_name="NER-uk",
28
+ source="EuroEval/ner-uk-mini",
29
+ task=NER,
30
+ languages=[UKRAINIAN],
31
+ )
32
+
33
+ MULTI_WIKI_QA_UK_CONFIG = DatasetConfig(
34
+ name="multi-wiki-qa-uk",
35
+ pretty_name="MultiWikiQA-uk",
36
+ source="EuroEval/multi-wiki-qa-uk-mini",
37
+ task=RC,
38
+ languages=[UKRAINIAN],
39
+ )
40
+
41
+ LR_SUM_UK_CONFIG = DatasetConfig(
42
+ name="lr-sum-uk",
43
+ pretty_name="LRSum-uk",
44
+ source="EuroEval/lr-sum-uk-mini",
45
+ task=SUMM,
46
+ languages=[UKRAINIAN],
47
+ )
48
+
49
+ GLOBAL_MMLU_UK_CONFIG = DatasetConfig(
50
+ name="global-mmlu-uk",
51
+ pretty_name="GlobalMMLU-uk",
52
+ source="EuroEval/global-mmlu-uk-mini",
53
+ task=KNOW,
54
+ languages=[UKRAINIAN],
55
+ )
56
+
57
+ WINOGRANDE_UK_CONFIG = DatasetConfig(
58
+ name="winogrande-uk",
59
+ pretty_name="Winogrande-uk",
60
+ source="EuroEval/winogrande-uk",
61
+ task=COMMON_SENSE,
62
+ languages=[UKRAINIAN],
63
+ _labels=["a", "b"],
64
+ )
euroeval/exceptions.py CHANGED
@@ -145,7 +145,7 @@ class NeedsAdditionalArgument(InvalidModel):
145
145
  else:
146
146
  self.message = (
147
147
  f"The model you are trying to load requires the `{script_argument}` "
148
- "argument to be passed to the `Benchmarker` class. Please pass the "
148
+ "argument to be passed to the `Benchmarker` class. Please pass the "
149
149
  "argument and try again."
150
150
  )
151
151
  super().__init__(self.message)
euroeval/finetuning.py CHANGED
@@ -1,5 +1,6 @@
1
1
  """Functions related to the finetuning of models."""
2
2
 
3
+ import collections.abc as c
3
4
  import logging
4
5
  import sys
5
6
  import typing as t
@@ -30,11 +31,11 @@ if t.TYPE_CHECKING:
30
31
 
31
32
  def finetune(
32
33
  model: "BenchmarkModule",
33
- datasets: list["DatasetDict"],
34
+ datasets: c.Sequence["DatasetDict"],
34
35
  model_config: "ModelConfig",
35
36
  dataset_config: "DatasetConfig",
36
37
  benchmark_config: "BenchmarkConfig",
37
- ) -> list[dict[str, float]]:
38
+ ) -> c.Sequence[dict[str, float]]:
38
39
  """Evaluate a model on a dataset through finetuning.
39
40
 
40
41
  Args: