EuroEval 16.0.1__py3-none-any.whl → 16.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (48)
  1. euroeval/benchmark_config_factory.py +6 -1
  2. euroeval/benchmark_modules/base.py +2 -0
  3. euroeval/benchmark_modules/fresh.py +7 -1
  4. euroeval/benchmark_modules/hf.py +26 -21
  5. euroeval/benchmark_modules/litellm.py +258 -131
  6. euroeval/benchmark_modules/vllm.py +79 -40
  7. euroeval/benchmarker.py +11 -2
  8. euroeval/cli.py +14 -1
  9. euroeval/constants.py +1 -1
  10. euroeval/data_models.py +77 -6
  11. euroeval/dataset_configs/__init__.py +1 -0
  12. euroeval/dataset_configs/danish.py +14 -0
  13. euroeval/dataset_configs/dutch.py +14 -0
  14. euroeval/dataset_configs/english.py +22 -0
  15. euroeval/dataset_configs/estonian.py +15 -7
  16. euroeval/dataset_configs/finnish.py +14 -0
  17. euroeval/dataset_configs/french.py +14 -0
  18. euroeval/dataset_configs/german.py +23 -0
  19. euroeval/dataset_configs/italian.py +14 -0
  20. euroeval/dataset_configs/latvian.py +14 -0
  21. euroeval/dataset_configs/norwegian.py +14 -0
  22. euroeval/dataset_configs/polish.py +126 -0
  23. euroeval/dataset_configs/portuguese.py +14 -0
  24. euroeval/dataset_configs/spanish.py +14 -0
  25. euroeval/dataset_configs/swedish.py +25 -0
  26. euroeval/enums.py +12 -0
  27. euroeval/generation.py +17 -8
  28. euroeval/generation_utils.py +58 -10
  29. euroeval/metrics/pipeline.py +1 -1
  30. euroeval/prompt_templates/linguistic_acceptability.py +9 -0
  31. euroeval/prompt_templates/multiple_choice.py +27 -1
  32. euroeval/prompt_templates/named_entity_recognition.py +20 -0
  33. euroeval/prompt_templates/reading_comprehension.py +11 -0
  34. euroeval/prompt_templates/sentiment_classification.py +15 -0
  35. euroeval/prompt_templates/summarization.py +27 -1
  36. euroeval/scores.py +5 -0
  37. euroeval/task_group_utils/question_answering.py +29 -29
  38. euroeval/task_group_utils/sequence_classification.py +10 -33
  39. euroeval/task_group_utils/token_classification.py +3 -3
  40. euroeval/tasks.py +4 -4
  41. euroeval/{tokenization_utils.py → tokenisation_utils.py} +40 -23
  42. euroeval/utils.py +36 -3
  43. {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/METADATA +1 -1
  44. euroeval-16.1.0.dist-info/RECORD +70 -0
  45. euroeval-16.0.1.dist-info/RECORD +0 -69
  46. {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/WHEEL +0 -0
  47. {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/entry_points.txt +0 -0
  48. {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/licenses/LICENSE +0 -0
euroeval/dataset_configs/estonian.py CHANGED
@@ -47,13 +47,12 @@ ERR_NEWS_CONFIG = DatasetConfig(
     languages=[ET],
 )
 
-EXAM_ET_CONFIG = DatasetConfig(
-    name="exam-et",
-    pretty_name="the Estonian knowledge assessment dataset Exam-et",
-    huggingface_id="EuroEval/exam-et",
+TRIVIA_ET_CONFIG = DatasetConfig(
+    name="trivia-et",
+    pretty_name="the Estonian knowledge dataset Trivia-et",
+    huggingface_id="EuroEval/trivia-et",
     task=KNOW,
     languages=[ET],
-    _labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"],
 )
 
 WINOGRANDE_ET_CONFIG = DatasetConfig(
@@ -82,8 +81,7 @@ EUROPEAN_VALUES_ET_CONFIG = DatasetConfig(
     _instruction_prompt="{text}",
 )
 
-
-### Unofficial datasets ###
+### Unofficial datasets ###
 
 SCALA_ET_CONFIG = DatasetConfig(
     name="scala-et",
@@ -93,3 +91,13 @@ SCALA_ET_CONFIG = DatasetConfig(
     languages=[ET],
     unofficial=True,
 )
+
+EXAM_ET_CONFIG = DatasetConfig(
+    name="exam-et",
+    pretty_name="the Estonian knowledge assessment dataset Exam-et",
+    huggingface_id="EuroEval/exam-et",
+    task=KNOW,
+    languages=[ET],
+    _labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"],
+    unofficial=True,
+)
euroeval/dataset_configs/finnish.py CHANGED
@@ -1,6 +1,7 @@
 """All Finnish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import FI
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -101,6 +102,19 @@ GOLDENSWAG_FI_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_FI_CONFIG = DatasetConfig(
+    name="winogrande-fi",
+    pretty_name="the Finnish common-sense reasoning dataset Winogrande-fi, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-fi",
+    task=COMMON_SENSE,
+    languages=[FI],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_FI_CONFIG = DatasetConfig(
     name="european-values-situational-fi",
     pretty_name="the Finnish version of the European values evaluation dataset, where "
euroeval/dataset_configs/french.py CHANGED
@@ -1,6 +1,7 @@
 """All French dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import FR
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -113,6 +114,19 @@ GOLDENSWAG_FR_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_FR_CONFIG = DatasetConfig(
+    name="winogrande-fr",
+    pretty_name="the French common-sense reasoning dataset Winogrande-fr, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-fr",
+    task=COMMON_SENSE,
+    languages=[FR],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_FR_CONFIG = DatasetConfig(
     name="european-values-situational-fr",
     pretty_name="the French version of the European values evaluation dataset, where "
euroeval/dataset_configs/german.py CHANGED
@@ -1,6 +1,7 @@
 """All German dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import DE
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -81,6 +82,15 @@ EUROPEAN_VALUES_DE_CONFIG = DatasetConfig(
 
 ### Unofficial datasets ###
 
+XQUAD_DE_CONFIG = DatasetConfig(
+    name="xquad-de",
+    pretty_name="the German version of the reading comprehension dataset XQuAD",
+    huggingface_id="EuroEval/xquad-de",
+    task=RC,
+    languages=[DE],
+    unofficial=True,
+)
+
 ARC_DE_CONFIG = DatasetConfig(
     name="arc-de",
     pretty_name="the truncated version of the German knowledge dataset ARC-de, "
@@ -121,6 +131,19 @@ GOLDENSWAG_DE_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_DE_CONFIG = DatasetConfig(
+    name="winogrande-de",
+    pretty_name="the German common-sense reasoning dataset Winogrande-de, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-de",
+    task=COMMON_SENSE,
+    languages=[DE],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_DE_CONFIG = DatasetConfig(
     name="european-values-situational-de",
     pretty_name="the German version of the European values evaluation dataset, where "
euroeval/dataset_configs/italian.py CHANGED
@@ -1,6 +1,7 @@
 """All Italian dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import IT
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -121,6 +122,19 @@ GOLDENSWAG_IT_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_IT_CONFIG = DatasetConfig(
+    name="winogrande-it",
+    pretty_name="the Italian common-sense reasoning dataset Winogrande-it, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-it",
+    task=COMMON_SENSE,
+    languages=[IT],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_IT_CONFIG = DatasetConfig(
     name="european-values-situational-it",
     pretty_name="the Italian version of the European values evaluation dataset, "
euroeval/dataset_configs/latvian.py CHANGED
@@ -1,6 +1,7 @@
 """All Latvian dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import LV
 from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
 
@@ -79,3 +80,16 @@ WIKIANN_LV_CONFIG = DatasetConfig(
     languages=[LV],
     unofficial=True,
 )
+
+WINOGRANDE_LV_CONFIG = DatasetConfig(
+    name="winogrande-lv",
+    pretty_name="the Latvian common-sense reasoning dataset Winogrande-lv, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-lv",
+    task=COMMON_SENSE,
+    languages=[LV],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
euroeval/dataset_configs/norwegian.py CHANGED
@@ -1,6 +1,7 @@
 """All Norwegian dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import NB, NN, NO
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -216,6 +217,19 @@ MULTI_WIKI_QA_NN_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_NO_CONFIG = DatasetConfig(
+    name="winogrande-no",
+    pretty_name="the Norwegian common-sense reasoning dataset Winogrande-no, "
+    "translated from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-no",
+    task=COMMON_SENSE,
+    languages=[NB, NN, NO],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_NO_CONFIG = DatasetConfig(
     name="european-values-situational-no",
     pretty_name="the Norwegian version of the European values evaluation dataset, "
euroeval/dataset_configs/polish.py ADDED
@@ -0,0 +1,126 @@
+"""All Polish dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..enums import ModelType
+from ..languages import PL
+from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+POLEMO2_CONFIG = DatasetConfig(
+    name="polemo2",
+    pretty_name="the Polish sentiment classification dataset PolEmo2",
+    huggingface_id="EuroEval/polemo2-mini",
+    task=SENT,
+    languages=[PL],
+)
+
+SCALA_PL_CONFIG = DatasetConfig(
+    name="scala-pl",
+    pretty_name="the Polish part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-pl",
+    task=LA,
+    languages=[PL],
+)
+
+KPWR_NER_CONFIG = DatasetConfig(
+    name="kpwr-ner",
+    pretty_name="the Polish entity recognition dataset KPWr-NER",
+    huggingface_id="EuroEval/kpwr-ner",
+    task=NER,
+    languages=[PL],
+)
+
+POQUAD_CONFIG = DatasetConfig(
+    name="poquad",
+    pretty_name="the Polish question answering dataset PoQuAD",
+    huggingface_id="EuroEval/poquad-mini",
+    task=RC,
+    languages=[PL],
+)
+
+PSC_CONFIG = DatasetConfig(
+    name="psc",
+    pretty_name="the Polish summarisation dataset PSC",
+    huggingface_id="EuroEval/psc-mini",
+    task=SUMM,
+    languages=[PL],
+)
+
+LLMZSZL_CONFIG = DatasetConfig(
+    name="llmzszl",
+    pretty_name="the Polish knowledge dataset LLMzSzŁ",
+    huggingface_id="EuroEval/llmzszl-mini",
+    task=KNOW,
+    languages=[PL],
+)
+
+WINOGRANDE_PL_CONFIG = DatasetConfig(
+    name="winogrande-pl",
+    pretty_name="the Polish common-sense reasoning dataset Winogrande-pl, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-pl",
+    task=COMMON_SENSE,
+    languages=[PL],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+)
+
+EUROPEAN_VALUES_PL_CONFIG = DatasetConfig(
+    name="european-values-pl",
+    pretty_name="the Polish version of the European values evaluation dataset",
+    huggingface_id="EuroEval/european-values-pl",
+    task=EUROPEAN_VALUES,
+    languages=[PL],
+    splits=["test"],
+    bootstrap_samples=False,
+    _instruction_prompt="{text}",
+)
+
+
+### Unofficial datasets ###
+
+MULTI_WIKI_QA_PL_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-pl",
+    pretty_name="the truncated version of the Polish part of the reading "
+    "comprehension dataset MultiWikiQA",
+    huggingface_id="EuroEval/multi-wiki-qa-pl-mini",
+    task=RC,
+    languages=[PL],
+    unofficial=True,
+)
+
+GOLDENSWAG_PL_CONFIG = DatasetConfig(
+    name="goldenswag-pl",
+    pretty_name="the truncated version of the Polish common-sense reasoning "
+    "dataset GoldenSwag-pl, translated from the English GoldenSwag dataset",
+    huggingface_id="EuroEval/goldenswag-pl-mini",
+    task=COMMON_SENSE,
+    languages=[PL],
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_SITUATIONAL_PL_CONFIG = DatasetConfig(
+    name="european-values-situational-pl",
+    pretty_name="the Polish version of the European values evaluation dataset, where "
+    "the questions are phrased in a situational way",
+    huggingface_id="EuroEval/european-values-situational-pl",
+    task=EUROPEAN_VALUES,
+    languages=[PL],
+    splits=["test"],
+    bootstrap_samples=False,
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_COMPLETIONS_PL_CONFIG = DatasetConfig(
+    name="european-values-completions-pl",
+    pretty_name="the Polish version of the European values evaluation dataset, where "
+    "the questions are phrased as sentence completions",
+    huggingface_id="EuroEval/european-values-completions-pl",
+    task=EUROPEAN_VALUES,
+    languages=[PL],
+    splits=["test"],
+    bootstrap_samples=False,
+    unofficial=True,
+)
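The new Polish module mirrors the other language files: seven official datasets plus unofficial variants. A minimal usage sketch for one of the new datasets follows; the Benchmarker call signature is assumed from EuroEval's README rather than from this diff.

# Sketch only: evaluating a model on the new Polish sentiment dataset.
# The Benchmarker API shown here is assumed, not part of this diff.
from euroeval import Benchmarker

benchmarker = Benchmarker()
benchmarker(model="<model-id>", dataset="polemo2")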
euroeval/dataset_configs/portuguese.py CHANGED
@@ -1,6 +1,7 @@
 """All Portuguese dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import PT
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -91,6 +92,19 @@ BOOLQ_PT_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_PT_CONFIG = DatasetConfig(
+    name="winogrande-pt",
+    pretty_name="the Portuguese common-sense reasoning dataset Winogrande-pt, "
+    "translated from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-pt",
+    task=COMMON_SENSE,
+    languages=[PT],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_PT_CONFIG = DatasetConfig(
     name="european-values-situational-pt",
     pretty_name="the Portuguese version of the European values evaluation dataset, "
euroeval/dataset_configs/spanish.py CHANGED
@@ -1,6 +1,7 @@
 """All Spanish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import ES
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -119,6 +120,19 @@ GOLDENSWAG_ES_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_ES_CONFIG = DatasetConfig(
+    name="winogrande-es",
+    pretty_name="the Spanish common-sense reasoning dataset Winogrande-es, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-es",
+    task=COMMON_SENSE,
+    languages=[ES],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_ES_CONFIG = DatasetConfig(
     name="european-values-situational-es",
     pretty_name="the Spanish version of the European values evaluation dataset, where "
euroeval/dataset_configs/swedish.py CHANGED
@@ -1,6 +1,7 @@
 """All Swedish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import SV
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -130,6 +131,19 @@ GOLDENSWAG_SV_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_SV_CONFIG = DatasetConfig(
+    name="winogrande-sv",
+    pretty_name="the Swedish common-sense reasoning dataset Winogrande-sv, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-sv",
+    task=COMMON_SENSE,
+    languages=[SV],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_SV_CONFIG = DatasetConfig(
     name="european-values-situational-sv",
     pretty_name="the Swedish version of the European values evaluation dataset, where "
@@ -155,3 +169,14 @@ EUROPEAN_VALUES_COMPLETIONS_SV_CONFIG = DatasetConfig(
     _instruction_prompt="{text}",
     unofficial=True,
 )
+
+SKOLPROV_CONFIG = DatasetConfig(
+    name="skolprov",
+    pretty_name="the Swedish knowledge dataset Skolprov",
+    huggingface_id="EuroEval/skolprov",
+    task=KNOW,
+    languages=[SV],
+    splits=["train", "test"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
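Every new Winogrande variant, plus Skolprov, sets _allowed_model_types=[ModelType.GENERATIVE], restricting the dataset to generative models. A minimal sketch of what such a gate implies; the helper below is hypothetical, and the real filtering logic lives elsewhere in EuroEval and may differ.

from euroeval.enums import ModelType

def dataset_allowed_for(dataset_config, model_type: ModelType) -> bool:
    # Hypothetical helper: a dataset with no restriction accepts every
    # model type; otherwise the model's type must be listed.
    allowed = getattr(dataset_config, "_allowed_model_types", None)
    return allowed is None or model_type in allowed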
euroeval/enums.py CHANGED
@@ -12,6 +12,14 @@ class AutoStrEnum(str, Enum):
     ) -> str:
         return name.lower()
 
+    def __str__(self) -> str:
+        """Return the value in upper case for better readability."""
+        return self.value.upper()
+
+    def __repr__(self) -> str:
+        """Return the value in upper case for better readability."""
+        return self.value.upper()
+
 
 class Device(AutoStrEnum):
     """The compute device to use for the evaluation.
@@ -60,6 +68,10 @@ class ModelType(AutoStrEnum):
     ENCODER = auto()
     GENERATIVE = auto()
 
+    def __repr__(self) -> str:
+        """Return the value in upper case for better readability."""
+        return self.value.upper()
+
 
 class GenerativeType(AutoStrEnum):
     """The type of a generative model.
euroeval/generation.py CHANGED
@@ -307,7 +307,7 @@ def debug_log(
                     for label in batch["label"]
                 ]
             else:
-                labels = ["N/A"] * len(extracted_labels)
+                labels = [None] * len(extracted_labels)
 
         case TaskGroup.QUESTION_ANSWERING:
             extracted_labels = [
@@ -330,12 +330,21 @@ def debug_log(
     else:
         input_texts = batch["text"]
 
-    for input_text, raw_output, prediction, label in zip(
-        input_texts, model_output.sequences, extracted_labels, labels
-    ):
+    metadata_keys: list[str] = [
+        key
+        for key in batch.keys()
+        if key not in ["text", "messages", "label", "labels", "target_text"]
+    ]
+
+    for idx in range(len(input_texts)):
+        data_to_log: dict[str, t.Any] = {
+            "Input": input_texts[idx],
+            "Raw output": model_output.sequences[idx],
+            "Prediction": extracted_labels[idx],
+        }
+        if labels[idx]:
+            data_to_log["Label"] = labels[idx]
+        data_to_log |= {key.capitalize(): batch[key][idx] for key in metadata_keys}
         logger.info(
-            f"Input: '{input_text}'\n"
-            f"Raw output: '{raw_output}'\n"
-            f"Prediction: '{prediction}'\n"
-            f"Label: '{label}'"
+            "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items())
        )
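debug_log now assembles a dict, drops the label entry when there is no gold label (the [None] change above makes that entry falsy), and appends any extra metadata columns from the batch. A sketch of the resulting log format, with made-up sample values:

# Sketch of the new log formatting; the values are invented for illustration.
data_to_log = {
    "Input": "Tekst: To jest dobry film",
    "Raw output": "pozytywny",
    "Prediction": "positive",
}
label = None  # with labels = [None] * n, missing labels are simply skipped
if label:
    data_to_log["Label"] = label
print("\n".join(f"{key}: {value!r}" for key, value in data_to_log.items()))
# Input: 'Tekst: To jest dobry film'
# Raw output: 'pozytywny'
# Prediction: 'positive'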
euroeval/generation_utils.py CHANGED
@@ -4,11 +4,12 @@ import itertools as it
 import json
 import logging
 import random
+import re
 import typing as t
 
-from .enums import TaskGroup
-from .exceptions import InvalidBenchmark
-from .tokenization_utils import apply_chat_template
+from .enums import GenerativeType, TaskGroup
+from .exceptions import InvalidBenchmark, InvalidModel
+from .tokenisation_utils import apply_chat_template
 from .utils import extract_multiple_choice_labels, log_once
 
 if t.TYPE_CHECKING:
@@ -173,7 +174,7 @@ def apply_prompt(
     few_shot_examples: list[dict[str, t.Any]],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
-    instruction_model: bool,
+    generative_type: GenerativeType | None,
     always_populate_text_field: bool,
     tokeniser: "PreTrainedTokenizer | None",
 ) -> dict[str, t.Any]:
@@ -184,10 +185,12 @@ def apply_prompt(
             The examples to apply the few-shot examples to.
         few_shot_examples:
             The few-shot examples to apply.
+        model_config:
+            The model configuration.
         dataset_config:
             The dataset configuration.
-        instruction_model:
-            Whether the model is instruction-tuned.
+        generative_type:
+            The generative type of the model.
         always_populate_text_field:
             Whether to always populate the 'text' field in the examples, as opposed to
             the 'messages' field.
@@ -198,7 +201,11 @@ def apply_prompt(
         The example with the few-shot examples applied.
     """
     # Sanity check
-    if instruction_model and always_populate_text_field and tokeniser is None:
+    if (
+        generative_type == GenerativeType.INSTRUCTION_TUNED
+        and always_populate_text_field
+        and tokeniser is None
+    ):
         raise ValueError(
             "The `tokeniser` argument must be provided when the model is instruction "
             "tuned and when we are not just returning the raw messages."
@@ -222,7 +229,7 @@ def apply_prompt(
             )
             label_mapping = dataset_config.prompt_label_mapping
             label = label_mapping.get(label, label)
-            if instruction_model:
+            if generative_type == GenerativeType.INSTRUCTION_TUNED:
                 prompt = dataset_config.instruction_prompt.format(**kwargs)
                 return prompt, label
             else:
@@ -348,7 +355,7 @@ def apply_prompt(
                 f"Unsupported task group: {dataset_config.task.task_group}."
             )
 
-    if instruction_model:
+    if generative_type == GenerativeType.INSTRUCTION_TUNED:
        few_shot_messages = [
            dict(role=role, content=content)
            for prompt, label in few_shot_sections
@@ -362,7 +369,6 @@ def apply_prompt(
 
         if not always_populate_text_field:
             examples["messages"] = messages_list
-
         else:
             assert tokeniser is not None
 
@@ -389,6 +395,9 @@ def apply_prompt(
                 apply_chat_template(
                     conversation=messages,
                     tokeniser=tokeniser,
+                    tokenise=False,
+                    add_generation_prompt=True,
+                    enable_thinking=(generative_type == GenerativeType.REASONING),
                     chat_template=chat_template,
                 )
                 for messages in messages_list
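The three new keyword arguments mirror Hugging Face's chat-template API, with enable_thinking switched on only for reasoning models. A sketch of the equivalent underlying call (note that the Hugging Face kwarg is spelled tokenize, while EuroEval's wrapper uses tokenise; enable_thinking is a template-specific kwarg honoured by e.g. Qwen3 chat templates):

from transformers import AutoTokenizer

# Example model; any chat model with a chat template behaves similarly.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
messages = [{"role": "user", "content": "Hello!"}]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # return a prompt string, not token IDs
    add_generation_prompt=True,  # append the assistant turn header
    enable_thinking=False,       # forwarded to the Jinja template
)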
@@ -414,3 +423,42 @@ def apply_prompt(
     examples["prompt"] = [new_prompt for new_prompt, _ in new_sections]
 
     return examples
+
+
+def raise_if_wrong_params(
+    model_config: "ModelConfig", allowed_params: dict[re.Pattern, list[str]]
+) -> None:
+    """Raise an error if the model configuration has invalid parameters.
+
+    Args:
+        model_config:
+            The model configuration.
+        allowed_params:
+            The allowed parameters for the model, being a dictionary mapping a regex
+            pattern matching the model ID to a list of allowed parameters for those
+            models.
+
+    Raises:
+        InvalidModel:
+            If the model configuration has invalid parameters.
+    """
+    if model_config.param is None:
+        return
+    for model_regex, allowed_params_list in allowed_params.items():
+        if re.fullmatch(pattern=model_regex, string=model_config.model_id):
+            if model_config.param not in allowed_params_list:
+                msg = (
+                    f"Invalid parameter {model_config.param!r} for model "
+                    f"{model_config.model_id!r}."
+                )
+                if allowed_params_list:
+                    msg += f" Allowed parameters are: {', '.join(allowed_params_list)}."
+                else:
+                    msg += " No parameters are allowed."
+                raise InvalidModel(msg)
+            return
+    else:
+        raise InvalidModel(
+            f"The parameter {model_config.param!r} is not supported for the model "
+            f"{model_config.model_id!r}."
+        )
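A usage sketch for the new helper; the regex patterns, model IDs and parameter names below are hypothetical, not EuroEval's actual tables. A model whose ID matches a pattern must use one of the listed parameters, and any parameter on a non-matching model is rejected via the for/else branch:

import re
from dataclasses import dataclass

@dataclass
class FakeModelConfig:
    # Stand-in for euroeval's ModelConfig; only the two attributes the
    # helper actually reads are included.
    model_id: str
    param: str | None

allowed_params = {
    re.compile(r"openai/o[0-9].*"): ["low", "medium", "high"],
    re.compile(r"openai/gpt-4.*"): [],
}

raise_if_wrong_params(FakeModelConfig("openai/o3", "medium"), allowed_params)  # passes
# FakeModelConfig("openai/o3", "max")        -> InvalidModel, listing the allowed values
# FakeModelConfig("openai/gpt-4o", "high")   -> InvalidModel, "No parameters are allowed."
# FakeModelConfig("other/model", "whatever") -> InvalidModel via the for/else branch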
euroeval/metrics/pipeline.py CHANGED
@@ -217,7 +217,7 @@ def european_values_preprocessing_fn(
     )
 
     # Double check that we reshaped the predictions correctly
-    for idx, pred in enumerate(predictions):
+    for idx, pred in enumerate(integer_predictions):
         assert arr[idx // 5, idx % 5] == pred, (
             f"Reshaped predictions do not match the original predictions at index "
             f"{idx}: {arr[idx // 5, idx % 5]} != {pred}."
euroeval/prompt_templates/linguistic_acceptability.py CHANGED
@@ -19,6 +19,7 @@ from ..languages import (
     NL,
     NN,
     NO,
+    PL,
     PT,
     SV,
 )
@@ -67,6 +68,14 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
         default_instruction_prompt="Lause: {text}\n\nOtsusta, kas lause on "
         "grammatiliselt õige või mitte. Vasta {labels_str}, ja mitte midagi muud.",
     ),
+    PL: PromptConfig(
+        default_prompt_label_mapping=dict(correct="tak", incorrect="nie"),
+        default_prompt_prefix="Poniżej znajdują się teksty i czy są "
+        "gramatycznie poprawne.",
+        default_prompt_template="Tekst: {text}\nGramatycznie poprawny: {label}",
+        default_instruction_prompt="Tekst: {text}\n\nOkreśl czy tekst jest "
+        "gramatycznie poprawny czy nie. Odpowiedz {labels_str}, i nic więcej.",
+    ),
     PT: PromptConfig(
         default_prompt_label_mapping=dict(correct="sim", incorrect="não"),
         default_prompt_prefix="Seguem-se abaixo textos e se são