EuroEval 16.4.0-py3-none-any.whl → 16.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of EuroEval has been flagged as potentially problematic by the registry; see the release page for details.
Files changed (71)
  1. euroeval/__init__.py +6 -0
  2. euroeval/benchmark_config_factory.py +51 -46
  3. euroeval/benchmark_modules/base.py +6 -5
  4. euroeval/benchmark_modules/hf.py +2 -9
  5. euroeval/benchmark_modules/litellm.py +14 -12
  6. euroeval/benchmark_modules/vllm.py +17 -10
  7. euroeval/benchmarker.py +61 -44
  8. euroeval/caching_utils.py +1 -1
  9. euroeval/cli.py +86 -8
  10. euroeval/constants.py +3 -0
  11. euroeval/data_loading.py +78 -30
  12. euroeval/data_models.py +326 -326
  13. euroeval/dataset_configs/__init__.py +10 -3
  14. euroeval/dataset_configs/bulgarian.py +56 -0
  15. euroeval/dataset_configs/czech.py +25 -29
  16. euroeval/dataset_configs/danish.py +51 -88
  17. euroeval/dataset_configs/dutch.py +48 -86
  18. euroeval/dataset_configs/english.py +45 -76
  19. euroeval/dataset_configs/estonian.py +36 -38
  20. euroeval/dataset_configs/faroese.py +19 -60
  21. euroeval/dataset_configs/finnish.py +36 -68
  22. euroeval/dataset_configs/french.py +39 -74
  23. euroeval/dataset_configs/german.py +45 -81
  24. euroeval/dataset_configs/greek.py +64 -0
  25. euroeval/dataset_configs/icelandic.py +54 -91
  26. euroeval/dataset_configs/italian.py +42 -78
  27. euroeval/dataset_configs/latvian.py +28 -34
  28. euroeval/dataset_configs/lithuanian.py +22 -26
  29. euroeval/dataset_configs/norwegian.py +72 -114
  30. euroeval/dataset_configs/polish.py +33 -60
  31. euroeval/dataset_configs/portuguese.py +33 -65
  32. euroeval/dataset_configs/serbian.py +64 -0
  33. euroeval/dataset_configs/slovak.py +19 -24
  34. euroeval/dataset_configs/spanish.py +42 -76
  35. euroeval/dataset_configs/swedish.py +48 -84
  36. euroeval/dataset_configs/ukrainian.py +64 -0
  37. euroeval/exceptions.py +1 -1
  38. euroeval/finetuning.py +3 -2
  39. euroeval/generation.py +5 -4
  40. euroeval/generation_utils.py +6 -5
  41. euroeval/languages.py +395 -323
  42. euroeval/metrics/huggingface.py +14 -3
  43. euroeval/metrics/llm_as_a_judge.py +1 -1
  44. euroeval/model_cache.py +6 -5
  45. euroeval/model_loading.py +1 -1
  46. euroeval/prompt_templates/__init__.py +2 -0
  47. euroeval/prompt_templates/classification.py +206 -0
  48. euroeval/prompt_templates/linguistic_acceptability.py +82 -43
  49. euroeval/prompt_templates/multiple_choice.py +81 -41
  50. euroeval/prompt_templates/named_entity_recognition.py +125 -44
  51. euroeval/prompt_templates/reading_comprehension.py +92 -43
  52. euroeval/prompt_templates/sentiment_classification.py +91 -43
  53. euroeval/prompt_templates/summarization.py +64 -39
  54. euroeval/prompt_templates/token_classification.py +279 -0
  55. euroeval/scores.py +4 -3
  56. euroeval/speed_benchmark.py +2 -1
  57. euroeval/task_group_utils/multiple_choice_classification.py +2 -1
  58. euroeval/task_group_utils/question_answering.py +24 -13
  59. euroeval/task_group_utils/sequence_classification.py +5 -4
  60. euroeval/task_group_utils/text_to_text.py +2 -1
  61. euroeval/task_group_utils/token_classification.py +11 -8
  62. euroeval/tasks.py +44 -1
  63. euroeval/tokenisation_utils.py +19 -10
  64. euroeval/types.py +10 -9
  65. euroeval/utils.py +6 -3
  66. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
  67. euroeval-16.5.0.dist-info/RECORD +81 -0
  68. euroeval-16.4.0.dist-info/RECORD +0 -75
  69. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
  70. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
  71. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
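
The dataset-config diffs below all apply the same schema change: the `huggingface_id` field is renamed to `source`, the two-letter language constants (`FR`, `DE`, ...) are replaced by full-name constants (`FRENCH`, `GERMAN`, ...), and `pretty_name` is reduced to the bare dataset name. A minimal sketch of a 16.5.0-style config, copied from the french.py diff below; the absolute import paths (`euroeval.data_models`, `euroeval.languages`, `euroeval.tasks`) are assumed from the package layout in the file list above rather than shown in this diff:

from euroeval.data_models import DatasetConfig  # assumed absolute path for euroeval/data_models.py
from euroeval.languages import FRENCH           # was `FR` in 16.4.0
from euroeval.tasks import SENT

ALLOCINE_CONFIG = DatasetConfig(
    name="allocine",
    pretty_name="AlloCiné",                      # was a long descriptive phrase in 16.4.0
    source="EuroEval/allocine-mini",             # was `huggingface_id=...` in 16.4.0
    task=SENT,
    languages=[FRENCH],                          # was `[FR]` in 16.4.0
    _labels=["negative", "positive"],
    _prompt_label_mapping=dict(positive="positif", negative="négatif"),
)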
euroeval/dataset_configs/french.py
@@ -1,80 +1,75 @@
 """All French dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..languages import FR
+from ..languages import FRENCH
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
 ### Official datasets ###
 
 ALLOCINE_CONFIG = DatasetConfig(
     name="allocine",
-    pretty_name="the truncated version of the French sentiment classification "
-    "dataset AlloCiné",
-    huggingface_id="EuroEval/allocine-mini",
+    pretty_name="AlloCiné",
+    source="EuroEval/allocine-mini",
     task=SENT,
-    languages=[FR],
+    languages=[FRENCH],
     _labels=["negative", "positive"],
     _prompt_label_mapping=dict(positive="positif", negative="négatif"),
 )
 
 SCALA_FR_CONFIG = DatasetConfig(
     name="scala-fr",
-    pretty_name="the French part of the linguistic acceptability dataset ScaLA",
-    huggingface_id="EuroEval/scala-fr",
+    pretty_name="ScaLA-fr",
+    source="EuroEval/scala-fr",
     task=LA,
-    languages=[FR],
+    languages=[FRENCH],
 )
 
 ELTEC_CONFIG = DatasetConfig(
     name="eltec",
-    pretty_name="the truncated version of the French named entity recognition "
-    "dataset ELTeC",
-    huggingface_id="EuroEval/eltec-mini",
+    pretty_name="ELTeC",
+    source="EuroEval/eltec-mini",
     task=NER,
-    languages=[FR],
+    languages=[FRENCH],
 )
 
 FQUAD_CONFIG = DatasetConfig(
     name="fquad",
-    pretty_name="the truncated version of the French reading comprehension dataset "
-    "FQuAD",
-    huggingface_id="EuroEval/fquad-mini",
+    pretty_name="FQuAD",
+    source="EuroEval/fquad-mini",
     task=RC,
-    languages=[FR],
+    languages=[FRENCH],
 )
 
 ORANGE_SUM_CONFIG = DatasetConfig(
     name="orange-sum",
-    pretty_name="the truncated version of the French summarisation dataset OrangeSum",
-    huggingface_id="EuroEval/orange-sum-mini",
+    pretty_name="OrangeSum",
+    source="EuroEval/orange-sum-mini",
     task=SUMM,
-    languages=[FR],
+    languages=[FRENCH],
 )
 
 MMLU_FR_CONFIG = DatasetConfig(
     name="mmlu-fr",
-    pretty_name="the truncated version of the French knowledge dataset MMLU-fr, "
-    "translated from the English MMLU dataset",
-    huggingface_id="EuroEval/mmlu-fr-mini",
+    pretty_name="MMLU-fr",
+    source="EuroEval/mmlu-fr-mini",
     task=KNOW,
-    languages=[FR],
+    languages=[FRENCH],
 )
 
 HELLASWAG_FR_CONFIG = DatasetConfig(
     name="hellaswag-fr",
-    pretty_name="the truncated version of the French common-sense reasoning dataset "
-    "HellaSwag-fr, translated from the English HellaSwag dataset",
-    huggingface_id="EuroEval/hellaswag-fr-mini",
+    pretty_name="HellaSwag-fr",
+    source="EuroEval/hellaswag-fr-mini",
     task=COMMON_SENSE,
-    languages=[FR],
+    languages=[FRENCH],
 )
 
-EUROPEAN_VALUES_FR_CONFIG = DatasetConfig(
-    name="european-values-fr",
-    pretty_name="the French version of the European values evaluation dataset",
-    huggingface_id="EuroEval/european-values-fr",
+VALEU_FR_CONFIG = DatasetConfig(
+    name="valeu-fr",
+    pretty_name="VaLEU-fr",
+    source="EuroEval/european-values-fr",
     task=EUROPEAN_VALUES,
-    languages=[FR],
+    languages=[FRENCH],
     splits=["test"],
     bootstrap_samples=False,
     _instruction_prompt="{text}",
@@ -85,67 +80,37 @@ EUROPEAN_VALUES_FR_CONFIG = DatasetConfig(
 
 BELEBELE_FR_CONFIG = DatasetConfig(
     name="belebele-fr",
-    pretty_name="the French multiple choice reading comprehension dataset BeleBele-fr, "
-    "translated from the English BeleBele dataset",
-    huggingface_id="EuroEval/belebele-fr-mini",
+    pretty_name="Belebele-fr",
+    source="EuroEval/belebele-fr-mini",
     task=MCRC,
-    languages=[FR],
+    languages=[FRENCH],
     unofficial=True,
 )
 
 MULTI_WIKI_QA_FR_CONFIG = DatasetConfig(
     name="multi-wiki-qa-fr",
-    pretty_name="the truncated version of the French part of the reading "
-    "comprehension dataset MultiWikiQA",
-    huggingface_id="EuroEval/multi-wiki-qa-fr-mini",
+    pretty_name="MultiWikiQA-fr",
+    source="EuroEval/multi-wiki-qa-fr-mini",
     task=RC,
-    languages=[FR],
+    languages=[FRENCH],
     unofficial=True,
 )
 
 GOLDENSWAG_FR_CONFIG = DatasetConfig(
     name="goldenswag-fr",
-    pretty_name="the truncated version of the French common-sense reasoning "
-    "dataset GoldenSwag-fr, translated from the English GoldenSwag dataset",
-    huggingface_id="EuroEval/goldenswag-fr-mini",
+    pretty_name="GoldenSwag-fr",
+    source="EuroEval/goldenswag-fr-mini",
     task=COMMON_SENSE,
-    languages=[FR],
+    languages=[FRENCH],
     unofficial=True,
 )
 
 WINOGRANDE_FR_CONFIG = DatasetConfig(
     name="winogrande-fr",
-    pretty_name="the French common-sense reasoning dataset Winogrande-fr, translated "
-    "from the English Winogrande dataset",
-    huggingface_id="EuroEval/winogrande-fr",
+    pretty_name="Winogrande-fr",
+    source="EuroEval/winogrande-fr",
     task=COMMON_SENSE,
-    languages=[FR],
+    languages=[FRENCH],
     _labels=["a", "b"],
     unofficial=True,
 )
-
-EUROPEAN_VALUES_SITUATIONAL_FR_CONFIG = DatasetConfig(
-    name="european-values-situational-fr",
-    pretty_name="the French version of the European values evaluation dataset, where "
-    "the questions are phrased in a situational way",
-    huggingface_id="EuroEval/european-values-situational-fr",
-    task=EUROPEAN_VALUES,
-    languages=[FR],
-    splits=["test"],
-    bootstrap_samples=False,
-    _instruction_prompt="{text}",
-    unofficial=True,
-)
-
-EUROPEAN_VALUES_COMPLETIONS_FR_CONFIG = DatasetConfig(
-    name="european-values-completions-fr",
-    pretty_name="the French version of the European values evaluation dataset, where "
-    "the questions are phrased as sentence completions",
-    huggingface_id="EuroEval/european-values-completions-fr",
-    task=EUROPEAN_VALUES,
-    languages=[FR],
-    splits=["test"],
-    bootstrap_samples=False,
-    _instruction_prompt="{text}",
-    unofficial=True,
-)
euroeval/dataset_configs/german.py
@@ -1,78 +1,73 @@
 """All German dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..languages import DE
+from ..languages import GERMAN
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
 ### Official datasets ###
 
 SB10K_CONFIG = DatasetConfig(
     name="sb10k",
-    pretty_name="the truncated version of the German sentiment classification "
-    "dataset SB10k",
-    huggingface_id="EuroEval/sb10k-mini",
+    pretty_name="SB10K",
+    source="EuroEval/sb10k-mini",
     task=SENT,
-    languages=[DE],
+    languages=[GERMAN],
 )
 
 SCALA_DE_CONFIG = DatasetConfig(
     name="scala-de",
-    pretty_name="the German part of the linguistic acceptability dataset ScaLA",
-    huggingface_id="EuroEval/scala-de",
+    pretty_name="ScaLA-de",
+    source="EuroEval/scala-de",
     task=LA,
-    languages=[DE],
+    languages=[GERMAN],
 )
 
 GERMEVAL_CONFIG = DatasetConfig(
     name="germeval",
-    pretty_name="the truncated version of the German named entity recognition "
-    "dataset GermEval",
-    huggingface_id="EuroEval/germeval-mini",
+    pretty_name="GermEval",
+    source="EuroEval/germeval-mini",
     task=NER,
-    languages=[DE],
+    languages=[GERMAN],
 )
 
 GERMANQUAD_CONFIG = DatasetConfig(
     name="germanquad",
-    pretty_name="the truncated version of the German reading comprehension dataset "
-    "GermanQuAD",
-    huggingface_id="EuroEval/germanquad-mini",
+    pretty_name="GermanQuAD",
+    source="EuroEval/germanquad-mini",
     task=RC,
-    languages=[DE],
+    languages=[GERMAN],
 )
 
 MLSUM_DE_CONFIG = DatasetConfig(
     name="mlsum-de",
-    pretty_name="the truncated version of the German summarisation dataset MLSum-de",
-    huggingface_id="EuroEval/mlsum-mini",
+    pretty_name="MLSUM-de",
+    source="EuroEval/mlsum-mini",
     task=SUMM,
-    languages=[DE],
+    languages=[GERMAN],
 )
 
 MMLU_DE_CONFIG = DatasetConfig(
     name="mmlu-de",
-    pretty_name="the truncated version of the German knowledge dataset MMLU-de, "
-    "translated from the English MMLU dataset",
-    huggingface_id="EuroEval/mmlu-de-mini",
+    pretty_name="MMLU-de",
+    source="EuroEval/mmlu-de-mini",
     task=KNOW,
-    languages=[DE],
+    languages=[GERMAN],
 )
 
 HELLASWAG_DE_CONFIG = DatasetConfig(
     name="hellaswag-de",
-    pretty_name="the truncated version of the German common-sense reasoning dataset "
-    "HellaSwag-de, translated from the English HellaSwag dataset",
-    huggingface_id="EuroEval/hellaswag-de-mini",
+    pretty_name="HellaSwag-de",
+    source="EuroEval/hellaswag-de-mini",
     task=COMMON_SENSE,
-    languages=[DE],
+    languages=[GERMAN],
 )
 
-EUROPEAN_VALUES_DE_CONFIG = DatasetConfig(
-    name="european-values-de",
-    pretty_name="the German version of the European values evaluation dataset",
-    huggingface_id="EuroEval/european-values-de",
+VALEU_DE_CONFIG = DatasetConfig(
+    name="valeu-de",
+    pretty_name="VaLEU-de",
+    source="EuroEval/european-values-de",
     task=EUROPEAN_VALUES,
-    languages=[DE],
+    languages=[GERMAN],
     splits=["test"],
     bootstrap_samples=False,
     _instruction_prompt="{text}",
@@ -83,86 +78,55 @@ EUROPEAN_VALUES_DE_CONFIG = DatasetConfig(
 
 XQUAD_DE_CONFIG = DatasetConfig(
     name="xquad-de",
-    pretty_name="the German version of the reading comprehension dataset XQuAD",
-    huggingface_id="EuroEval/xquad-de",
+    pretty_name="XQuAD-de",
+    source="EuroEval/xquad-de",
     task=RC,
-    languages=[DE],
+    languages=[GERMAN],
     unofficial=True,
 )
 
 ARC_DE_CONFIG = DatasetConfig(
     name="arc-de",
-    pretty_name="the truncated version of the German knowledge dataset ARC-de, "
-    "translated from the English ARC dataset",
-    huggingface_id="EuroEval/arc-de-mini",
+    pretty_name="ARC-de",
+    source="EuroEval/arc-de-mini",
     task=KNOW,
-    languages=[DE],
+    languages=[GERMAN],
     unofficial=True,
 )
 
 BELEBELE_DE_CONFIG = DatasetConfig(
     name="belebele-de",
-    pretty_name="the German multiple choice reading comprehension dataset BeleBele-de, "
-    "translated from the English BeleBele dataset",
-    huggingface_id="EuroEval/belebele-de-mini",
+    pretty_name="Belebele-de",
+    source="EuroEval/belebele-de-mini",
     task=MCRC,
-    languages=[DE],
+    languages=[GERMAN],
     unofficial=True,
 )
 
 MULTI_WIKI_QA_DE_CONFIG = DatasetConfig(
     name="multi-wiki-qa-de",
-    pretty_name="the truncated version of the German part of the reading "
-    "comprehension dataset MultiWikiQA",
-    huggingface_id="EuroEval/multi-wiki-qa-de-mini",
+    pretty_name="MultiWikiQA-de",
+    source="EuroEval/multi-wiki-qa-de-mini",
     task=RC,
-    languages=[DE],
+    languages=[GERMAN],
     unofficial=True,
 )
 
 GOLDENSWAG_DE_CONFIG = DatasetConfig(
     name="goldenswag-de",
-    pretty_name="the truncated version of the German common-sense reasoning "
-    "dataset GoldenSwag-de, translated from the English GoldenSwag dataset",
-    huggingface_id="EuroEval/goldenswag-de-mini",
+    pretty_name="GoldenSwag-de",
+    source="EuroEval/goldenswag-de-mini",
     task=COMMON_SENSE,
-    languages=[DE],
+    languages=[GERMAN],
     unofficial=True,
 )
 
 WINOGRANDE_DE_CONFIG = DatasetConfig(
     name="winogrande-de",
-    pretty_name="the German common-sense reasoning dataset Winogrande-de, translated "
-    "from the English Winogrande dataset",
-    huggingface_id="EuroEval/winogrande-de",
+    pretty_name="Winogrande-de",
+    source="EuroEval/winogrande-de",
     task=COMMON_SENSE,
-    languages=[DE],
+    languages=[GERMAN],
     _labels=["a", "b"],
     unofficial=True,
 )
-
-EUROPEAN_VALUES_SITUATIONAL_DE_CONFIG = DatasetConfig(
-    name="european-values-situational-de",
-    pretty_name="the German version of the European values evaluation dataset, where "
-    "the questions are phrased in a situational way",
-    huggingface_id="EuroEval/european-values-situational-de",
-    task=EUROPEAN_VALUES,
-    languages=[DE],
-    splits=["test"],
-    bootstrap_samples=False,
-    _instruction_prompt="{text}",
-    unofficial=True,
-)
-
-EUROPEAN_VALUES_COMPLETIONS_DE_CONFIG = DatasetConfig(
-    name="european-values-completions-de",
-    pretty_name="the German version of the European values evaluation dataset, where "
-    "the questions are phrased as sentence completions",
-    huggingface_id="EuroEval/european-values-completions-de",
-    task=EUROPEAN_VALUES,
-    languages=[DE],
-    splits=["test"],
-    bootstrap_samples=False,
-    _instruction_prompt="{text}",
-    unofficial=True,
-)
euroeval/dataset_configs/greek.py (new file)
@@ -0,0 +1,64 @@
+"""All Greek dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import GREEK
+from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+GREEK_SA_CONFIG = DatasetConfig(
+    name="greek-sa",
+    pretty_name="Greek Sentiment Analysis",
+    source="EuroEval/greek-sa-mini",
+    task=SENT,
+    languages=[GREEK],
+    _labels=["negative", "positive"],
+)
+
+SCALA_EL_CONFIG = DatasetConfig(
+    name="scala-el",
+    pretty_name="ScaLA-el",
+    source="EuroEval/scala-el",
+    task=LA,
+    languages=[GREEK],
+)
+
+ELNER_CONFIG = DatasetConfig(
+    name="elner",
+    pretty_name="ElNER",
+    source="EuroEval/elner-mini",
+    task=NER,
+    languages=[GREEK],
+)
+
+MULTI_WIKI_QA_EL_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-el",
+    pretty_name="MultiWikiQA-el",
+    source="EuroEval/multi-wiki-qa-el-mini",
+    task=RC,
+    languages=[GREEK],
+)
+
+GREEK_WIKIPEDIA_CONFIG = DatasetConfig(
+    name="greek-wikipedia",
+    pretty_name="Greek Wikipedia",
+    source="EuroEval/greek-wikipedia-mini",
+    task=SUMM,
+    languages=[GREEK],
+)
+
+GLOBAL_MMLU_EL_CONFIG = DatasetConfig(
+    name="global-mmlu-el",
+    pretty_name="GlobalMMLU-el",
+    source="EuroEval/global-mmlu-el-mini",
+    task=KNOW,
+    languages=[GREEK],
+)
+
+WINOGRANDE_EL_CONFIG = DatasetConfig(
+    name="winogrande-el",
+    pretty_name="Winogrande-el",
+    source="EuroEval/winogrande-el",
+    task=COMMON_SENSE,
+    languages=[GREEK],
+)
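
With 16.5.0 installed, the new Greek datasets should be selectable by the `name` field of the configs above. A hypothetical invocation sketch, assuming EuroEval's usual `Benchmarker` entry point; neither the class nor its keyword arguments appear in this diff, so verify the exact signature against the 16.5.0 documentation:

# Hypothetical usage: evaluate a model on one of the new Greek datasets.
# `Benchmarker` and its arguments are assumed from EuroEval's public API,
# not taken from this diff.
from euroeval import Benchmarker

benchmarker = Benchmarker()
benchmarker.benchmark(
    model="google/gemma-2-2b-it",   # placeholder model ID
    dataset="multi-wiki-qa-el",     # `name` from MULTI_WIKI_QA_EL_CONFIG above
)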