EuroEval 16.2.2__py3-none-any.whl → 16.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic. (The original page linked to more details here; the link target is not preserved in this text.)

Files changed (38):
  1. euroeval/__init__.py +4 -2
  2. euroeval/benchmark_modules/fresh.py +3 -1
  3. euroeval/benchmark_modules/hf.py +8 -4
  4. euroeval/benchmark_modules/litellm.py +5 -17
  5. euroeval/benchmark_modules/vllm.py +88 -23
  6. euroeval/benchmarker.py +110 -61
  7. euroeval/cli.py +1 -1
  8. euroeval/constants.py +3 -0
  9. euroeval/dataset_configs/__init__.py +1 -0
  10. euroeval/dataset_configs/danish.py +0 -2
  11. euroeval/dataset_configs/dutch.py +0 -2
  12. euroeval/dataset_configs/english.py +0 -2
  13. euroeval/dataset_configs/finnish.py +0 -2
  14. euroeval/dataset_configs/french.py +0 -2
  15. euroeval/dataset_configs/german.py +0 -2
  16. euroeval/dataset_configs/italian.py +0 -2
  17. euroeval/dataset_configs/latvian.py +2 -3
  18. euroeval/dataset_configs/lithuanian.py +62 -0
  19. euroeval/dataset_configs/norwegian.py +0 -2
  20. euroeval/dataset_configs/polish.py +0 -2
  21. euroeval/dataset_configs/portuguese.py +0 -2
  22. euroeval/dataset_configs/spanish.py +0 -2
  23. euroeval/dataset_configs/swedish.py +0 -3
  24. euroeval/metrics/huggingface.py +1 -1
  25. euroeval/metrics/pipeline.py +5 -0
  26. euroeval/prompt_templates/linguistic_acceptability.py +9 -0
  27. euroeval/prompt_templates/multiple_choice.py +9 -0
  28. euroeval/prompt_templates/named_entity_recognition.py +20 -0
  29. euroeval/prompt_templates/reading_comprehension.py +10 -0
  30. euroeval/prompt_templates/sentiment_classification.py +11 -0
  31. euroeval/tokenisation_utils.py +8 -8
  32. euroeval/utils.py +1 -1
  33. {euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/METADATA +181 -60
  34. euroeval-16.3.0.dist-info/RECORD +71 -0
  35. euroeval-16.2.2.dist-info/RECORD +0 -70
  36. {euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/WHEEL +0 -0
  37. {euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/entry_points.txt +0 -0
  38. {euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,7 +1,6 @@
1
1
  """All Finnish dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import FI
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -111,7 +110,6 @@ WINOGRANDE_FI_CONFIG = DatasetConfig(
111
110
  languages=[FI],
112
111
  splits=["train", "test"],
113
112
  _labels=["a", "b"],
114
- _allowed_model_types=[ModelType.GENERATIVE],
115
113
  unofficial=True,
116
114
  )
117
115
 
@@ -1,7 +1,6 @@
1
1
  """All French dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import FR
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -123,7 +122,6 @@ WINOGRANDE_FR_CONFIG = DatasetConfig(
123
122
  languages=[FR],
124
123
  splits=["train", "test"],
125
124
  _labels=["a", "b"],
126
- _allowed_model_types=[ModelType.GENERATIVE],
127
125
  unofficial=True,
128
126
  )
129
127
 
@@ -1,7 +1,6 @@
1
1
  """All German dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import DE
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -140,7 +139,6 @@ WINOGRANDE_DE_CONFIG = DatasetConfig(
140
139
  languages=[DE],
141
140
  splits=["train", "test"],
142
141
  _labels=["a", "b"],
143
- _allowed_model_types=[ModelType.GENERATIVE],
144
142
  unofficial=True,
145
143
  )
146
144
 
@@ -1,7 +1,6 @@
1
1
  """All Italian dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import IT
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -131,7 +130,6 @@ WINOGRANDE_IT_CONFIG = DatasetConfig(
131
130
  languages=[IT],
132
131
  splits=["train", "test"],
133
132
  _labels=["a", "b"],
134
- _allowed_model_types=[ModelType.GENERATIVE],
135
133
  unofficial=True,
136
134
  )
137
135
 
@@ -1,7 +1,6 @@
1
1
  """All Latvian dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import LV
6
5
  from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
7
6
 
@@ -25,7 +24,8 @@ SCALA_LV_CONFIG = DatasetConfig(
25
24
 
26
25
  FULLSTACK_NER_LV_CONFIG = DatasetConfig(
27
26
  name="fullstack-ner-lv",
28
- pretty_name="the truncated version of the FullStack NER dataset",
27
+ pretty_name="the truncated version of the Latvian named entity recognition "
28
+ "dataset FullStack-NER-lv",
29
29
  huggingface_id="EuroEval/fullstack-ner-lv-mini",
30
30
  task=NER,
31
31
  languages=[LV],
@@ -90,6 +90,5 @@ WINOGRANDE_LV_CONFIG = DatasetConfig(
90
90
  languages=[LV],
91
91
  splits=["train", "test"],
92
92
  _labels=["a", "b"],
93
- _allowed_model_types=[ModelType.GENERATIVE],
94
93
  unofficial=True,
95
94
  )
@@ -0,0 +1,62 @@
1
+ """All Lithuanian dataset configurations used in EuroEval."""
2
+
3
+ from ..data_models import DatasetConfig
4
+ from ..languages import LT
5
+ from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
6
+
7
+ ### Official datasets ###
8
+
9
+ LITHUANIAN_EMOTIONS_CONFIG = DatasetConfig(
10
+ name="lithuanian-emotions",
11
+ pretty_name="the truncated version of the Lithuanian sentiment "
12
+ "classification dataset Lithuanian Emotions",
13
+ huggingface_id="EuroEval/lithuanian-emotions-mini",
14
+ task=SENT,
15
+ languages=[LT],
16
+ )
17
+
18
+ SCALA_LT_CONFIG = DatasetConfig(
19
+ name="scala-lt",
20
+ pretty_name="the Lithuanian part of the linguistic acceptability dataset ScaLA",
21
+ huggingface_id="EuroEval/scala-lt",
22
+ task=LA,
23
+ languages=[LT],
24
+ )
25
+
26
+ WIKIANN_LT_CONFIG = DatasetConfig(
27
+ name="wikiann-lt",
28
+ pretty_name="the truncated version of the Lithuanian part of the named entity "
29
+ "recognition dataset WikiANN",
30
+ huggingface_id="EuroEval/wikiann-lt-mini",
31
+ task=NER,
32
+ languages=[LT],
33
+ )
34
+
35
+ MULTI_WIKI_QA_LT_CONFIG = DatasetConfig(
36
+ name="multi-wiki-qa-lt",
37
+ pretty_name="the truncated version of the Lithuanian part of the reading "
38
+ "comprehension dataset MultiWikiQA",
39
+ huggingface_id="EuroEval/multi-wiki-qa-lt-mini",
40
+ task=RC,
41
+ languages=[LT],
42
+ )
43
+
44
+ LT_HISTORY_CONFIG = DatasetConfig(
45
+ name="lt-history",
46
+ pretty_name="the Lithuanian knowledge dataset LT-History",
47
+ huggingface_id="EuroEval/lt-history",
48
+ task=KNOW,
49
+ languages=[LT],
50
+ splits=["train", "test"],
51
+ )
52
+
53
+ WINOGRANDE_LT_CONFIG = DatasetConfig(
54
+ name="winogrande-lt",
55
+ pretty_name="the Lithuanian common-sense reasoning dataset Winogrande-lt, "
56
+ "translated from the English Winogrande dataset",
57
+ huggingface_id="EuroEval/winogrande-lt",
58
+ task=COMMON_SENSE,
59
+ languages=[LT],
60
+ splits=["train", "test"],
61
+ _labels=["a", "b"],
62
+ )
@@ -1,7 +1,6 @@
1
1
  """All Norwegian dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import NB, NN, NO
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -226,7 +225,6 @@ WINOGRANDE_NO_CONFIG = DatasetConfig(
226
225
  languages=[NB, NN, NO],
227
226
  splits=["train", "test"],
228
227
  _labels=["a", "b"],
229
- _allowed_model_types=[ModelType.GENERATIVE],
230
228
  unofficial=True,
231
229
  )
232
230
 
@@ -1,7 +1,6 @@
1
1
  """All Polish dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import PL
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, NER, RC, SENT, SUMM
7
6
 
@@ -64,7 +63,6 @@ WINOGRANDE_PL_CONFIG = DatasetConfig(
64
63
  languages=[PL],
65
64
  splits=["train", "test"],
66
65
  _labels=["a", "b"],
67
- _allowed_model_types=[ModelType.GENERATIVE],
68
66
  )
69
67
 
70
68
  EUROPEAN_VALUES_PL_CONFIG = DatasetConfig(
@@ -1,7 +1,6 @@
1
1
  """All Portuguese dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import PT
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -101,7 +100,6 @@ WINOGRANDE_PT_CONFIG = DatasetConfig(
101
100
  languages=[PT],
102
101
  splits=["train", "test"],
103
102
  _labels=["a", "b"],
104
- _allowed_model_types=[ModelType.GENERATIVE],
105
103
  unofficial=True,
106
104
  )
107
105
 
@@ -1,7 +1,6 @@
1
1
  """All Spanish dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import ES
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -129,7 +128,6 @@ WINOGRANDE_ES_CONFIG = DatasetConfig(
129
128
  languages=[ES],
130
129
  splits=["train", "test"],
131
130
  _labels=["a", "b"],
132
- _allowed_model_types=[ModelType.GENERATIVE],
133
131
  unofficial=True,
134
132
  )
135
133
 
@@ -1,7 +1,6 @@
1
1
  """All Swedish dataset configurations used in EuroEval."""
2
2
 
3
3
  from ..data_models import DatasetConfig
4
- from ..enums import ModelType
5
4
  from ..languages import SV
6
5
  from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
7
6
 
@@ -140,7 +139,6 @@ WINOGRANDE_SV_CONFIG = DatasetConfig(
140
139
  languages=[SV],
141
140
  splits=["train", "test"],
142
141
  _labels=["a", "b"],
143
- _allowed_model_types=[ModelType.GENERATIVE],
144
142
  unofficial=True,
145
143
  )
146
144
 
@@ -177,6 +175,5 @@ SKOLPROV_CONFIG = DatasetConfig(
177
175
  task=KNOW,
178
176
  languages=[SV],
179
177
  splits=["train", "test"],
180
- _allowed_model_types=[ModelType.GENERATIVE],
181
178
  unofficial=True,
182
179
  )
@@ -197,7 +197,7 @@ bert_score_metric = HuggingFaceMetric(
197
197
  huggingface_id="bertscore",
198
198
  results_key="f1",
199
199
  compute_kwargs=dict(
200
- model_type="microsoft/mdeberta-v3-base", device="cpu", batch_size=16
200
+ model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
201
201
  ),
202
202
  )
203
203
 
@@ -191,6 +191,11 @@ def european_values_preprocessing_fn(
191
191
  for idx, choice in idx_to_choice.items()
192
192
  if choice is not None
193
193
  }
194
+ if prediction not in idx_to_choice:
195
+ raise InvalidBenchmark(
196
+ f"The prediction {prediction} is not a valid index for the "
197
+ f"question with choices {idx_to_choice}."
198
+ )
194
199
  integer_prediction = idx_to_choice[prediction]
195
200
  integer_predictions.append(integer_prediction)
196
201
 
@@ -14,6 +14,7 @@ from ..languages import (
14
14
  FR,
15
15
  IS,
16
16
  IT,
17
+ LT,
17
18
  LV,
18
19
  NB,
19
20
  NL,
@@ -126,6 +127,14 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
126
127
  default_instruction_prompt="Frase: {text}\n\nStabilite se la frase è "
127
128
  "grammaticalmente corretta o meno. Rispondere con {labels_str}, e nient'altro.",
128
129
  ),
130
+ LT: PromptConfig(
131
+ default_prompt_label_mapping=dict(correct="taip", incorrect="ne"),
132
+ default_prompt_prefix="Toliau pateikti sakiniai ir ar jie yra gramatiškai "
133
+ "teisingi.",
134
+ default_prompt_template="Sakinys: {text}\nGramatiškai teisingas: {label}",
135
+ default_instruction_prompt="Sakinys: {text}\n\nNustatykite, ar sakinys yra "
136
+ "gramatiškai teisingas, ar ne. Atsakykite su {labels_str}, ir nieko kito.",
137
+ ),
129
138
  LV: PromptConfig(
130
139
  default_prompt_label_mapping=dict(correct="jā", incorrect="nē"),
131
140
  default_prompt_prefix="Šie ir teikumi un to gramatiskie pareizumi.",
@@ -13,6 +13,7 @@ from ..languages import (
13
13
  FR,
14
14
  IS,
15
15
  IT,
16
+ LT,
16
17
  LV,
17
18
  NB,
18
19
  NL,
@@ -105,6 +106,14 @@ MULTIPLE_CHOICE_TEMPLATES: dict["Language", PromptConfig] = {
105
106
  "precedente con {labels_str}, e nient'altro.",
106
107
  default_prompt_label_mapping="auto",
107
108
  ),
109
+ LT: PromptConfig(
110
+ default_prompt_prefix="Toliau pateikti daugiavariančiai klausimai "
111
+ "(su atsakymais).",
112
+ default_prompt_template="Klausimas: {text}\nAtsakymas: {label}",
113
+ default_instruction_prompt="Klausimas: {text}\n\nAtsakykite į aukščiau "
114
+ "pateiktą klausimą atsakydami {labels_str}, ir nieko daugiau.",
115
+ default_prompt_label_mapping="auto",
116
+ ),
108
117
  LV: PromptConfig(
109
118
  default_prompt_prefix="Tālāk seko jautājumi ar vairākām atbilžu izvēlēm "
110
119
  "(ar atbildēm).",
@@ -14,6 +14,7 @@ from ..languages import (
14
14
  FR,
15
15
  IS,
16
16
  IT,
17
+ LT,
17
18
  LV,
18
19
  NB,
19
20
  NL,
@@ -241,6 +242,25 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
241
242
  "{labels_str}. I valori devono essere elenchi di entità "
242
243
  "nominate di quel tipo, esattamente come appaiono nella frase.",
243
244
  ),
245
+ LT: PromptConfig(
246
+ default_prompt_label_mapping={
247
+ "b-per": "asmuo",
248
+ "i-per": "asmuo",
249
+ "b-loc": "vieta",
250
+ "i-loc": "vieta",
251
+ "b-org": "organizacija",
252
+ "i-org": "organizacija",
253
+ "b-misc": "kita",
254
+ "i-misc": "kita",
255
+ },
256
+ default_prompt_prefix="Toliau pateikti sakiniai ir JSON žodynai su vardiniais "
257
+ "vienetais, kurie pateikiame sakinyje.",
258
+ default_prompt_template="Sakinys: {text}\nVardiniai vienetai: {label}",
259
+ default_instruction_prompt="Sakinys: {text}\n\nIdentifikuokite vardinius "
260
+ "vienetus sakinyje. Turėtumėte pateikti tai kaip JSON žodyną su raktais "
261
+ "{labels_str}. Reikšmės turi būti to tipo vardinių vienetų sąrašai, "
262
+ "tiksliai taip, kaip jie rodomi sakinyje.",
263
+ ),
244
264
  LV: PromptConfig(
245
265
  default_prompt_label_mapping={
246
266
  "b-per": "persona",
@@ -14,6 +14,7 @@ from ..languages import (
14
14
  FR,
15
15
  IS,
16
16
  IT,
17
+ LT,
17
18
  LV,
18
19
  NB,
19
20
  NL,
@@ -116,6 +117,15 @@ RC_TEMPLATES: dict["Language", PromptConfig] = {
116
117
  "sul in un massimo di 3 parole.\n\nDomanda: {question}",
117
118
  default_prompt_label_mapping=dict(),
118
119
  ),
120
+ LT: PromptConfig(
121
+ default_prompt_prefix="Toliau pateikti tekstai su atitinkamais klausimais ir "
122
+ "atsakymais.",
123
+ default_prompt_template="Tekstas: {text}\nKlausimas: {question}\nAtsakykite ne "
124
+ "daugiau kaip 3 žodžiais: {label}",
125
+ default_instruction_prompt="Tekstas: {text}\n\nAtsakykite į šį klausimą apie "
126
+ "aukščiau pateiktą tekstą ne daugiau kaip 3 žodžiais.\n\nKlausimas: {question}",
127
+ default_prompt_label_mapping=dict(),
128
+ ),
119
129
  LV: PromptConfig(
120
130
  default_prompt_prefix="Turpmāk seko teksti ar atbilstošiem jautājumiem un "
121
131
  "atbildēm.",
@@ -14,6 +14,7 @@ from ..languages import (
14
14
  FR,
15
15
  IS,
16
16
  IT,
17
+ LT,
17
18
  LV,
18
19
  NB,
19
20
  NL,
@@ -153,6 +154,16 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
153
154
  default_instruction_prompt="Documento: {text}\n\nClassificare il sentiment del "
154
155
  "documento. Rispondere con {labels_str}, e nient'altro.",
155
156
  ),
157
+ LT: PromptConfig(
158
+ default_prompt_label_mapping=dict(
159
+ positive="teigiamas", neutral="neutralus", negative="neigiamas"
160
+ ),
161
+ default_prompt_prefix="Toliau pateikti dokumentai ir jų nuotaika, kuri "
162
+ "gali būti {labels_str}.",
163
+ default_prompt_template="Dokumentas: {text}\nNuotaika: {label}",
164
+ default_instruction_prompt="Dokumentas: {text}\n\nKlasifikuokite nuotaiką "
165
+ "dokumente. Atsakykite su {labels_str}, ir nieko kito.",
166
+ ),
156
167
  LV: PromptConfig(
157
168
  default_prompt_label_mapping=dict(
158
169
  positive="pozitīvs", neutral="neitrāls", negative="negatīvs"
@@ -521,7 +521,14 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
521
521
  Returns:
522
522
  Whether the tokeniser has a chat template.
523
523
  """
524
- if hasattr(tokeniser, "chat_template"):
524
+ if isinstance(tokeniser, MistralCommonTokenizer):
525
+ log_once(
526
+ "The tokeniser is a Mistral tokeniser, so assuming that the model is "
527
+ "instruction tuned.",
528
+ level=logging.DEBUG,
529
+ )
530
+ return True
531
+ elif hasattr(tokeniser, "chat_template"):
525
532
  has_template = tokeniser.chat_template is not None
526
533
  if has_template:
527
534
  log_once(
@@ -530,13 +537,6 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
530
537
  level=logging.DEBUG,
531
538
  )
532
539
  return has_template
533
- elif isinstance(tokeniser, MistralCommonTokenizer):
534
- log_once(
535
- "The tokeniser is a Mistral tokeniser, so assuming that the model is "
536
- "instruction tuned.",
537
- level=logging.DEBUG,
538
- )
539
- return True
540
540
  else:
541
541
  log_once(
542
542
  "We cannot find a chat template for the tokeniser, so assuming that the "
euroeval/utils.py CHANGED
@@ -462,7 +462,7 @@ def extract_json_dict_from_string(s: str) -> dict | None:
462
462
  Returns:
463
463
  The extracted JSON dictionary, or None if no JSON dictionary could be found.
464
464
  """
465
- json_regex = r"\{[^{}]+?\}"
465
+ json_regex = r"\{[^{}]*?\}"
466
466
  if (json_match := re.search(pattern=json_regex, string=s, flags=re.DOTALL)) is None:
467
467
  logger.debug(
468
468
  "The model output does not contain any JSON dictionary, so cannot parse "