EuroEval 16.0.1-py3-none-any.whl → 16.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (48)
  1. euroeval/benchmark_config_factory.py +6 -1
  2. euroeval/benchmark_modules/base.py +2 -0
  3. euroeval/benchmark_modules/fresh.py +7 -1
  4. euroeval/benchmark_modules/hf.py +26 -21
  5. euroeval/benchmark_modules/litellm.py +258 -131
  6. euroeval/benchmark_modules/vllm.py +79 -40
  7. euroeval/benchmarker.py +11 -2
  8. euroeval/cli.py +14 -1
  9. euroeval/constants.py +1 -1
  10. euroeval/data_models.py +77 -6
  11. euroeval/dataset_configs/__init__.py +1 -0
  12. euroeval/dataset_configs/danish.py +14 -0
  13. euroeval/dataset_configs/dutch.py +14 -0
  14. euroeval/dataset_configs/english.py +22 -0
  15. euroeval/dataset_configs/estonian.py +15 -7
  16. euroeval/dataset_configs/finnish.py +14 -0
  17. euroeval/dataset_configs/french.py +14 -0
  18. euroeval/dataset_configs/german.py +23 -0
  19. euroeval/dataset_configs/italian.py +14 -0
  20. euroeval/dataset_configs/latvian.py +14 -0
  21. euroeval/dataset_configs/norwegian.py +14 -0
  22. euroeval/dataset_configs/polish.py +126 -0
  23. euroeval/dataset_configs/portuguese.py +14 -0
  24. euroeval/dataset_configs/spanish.py +14 -0
  25. euroeval/dataset_configs/swedish.py +25 -0
  26. euroeval/enums.py +12 -0
  27. euroeval/generation.py +17 -8
  28. euroeval/generation_utils.py +65 -11
  29. euroeval/metrics/pipeline.py +1 -1
  30. euroeval/prompt_templates/linguistic_acceptability.py +9 -0
  31. euroeval/prompt_templates/multiple_choice.py +27 -1
  32. euroeval/prompt_templates/named_entity_recognition.py +20 -0
  33. euroeval/prompt_templates/reading_comprehension.py +11 -0
  34. euroeval/prompt_templates/sentiment_classification.py +15 -0
  35. euroeval/prompt_templates/summarization.py +27 -1
  36. euroeval/scores.py +5 -0
  37. euroeval/task_group_utils/question_answering.py +29 -29
  38. euroeval/task_group_utils/sequence_classification.py +11 -34
  39. euroeval/task_group_utils/token_classification.py +3 -3
  40. euroeval/tasks.py +4 -4
  41. euroeval/{tokenization_utils.py → tokenisation_utils.py} +50 -28
  42. euroeval/utils.py +36 -3
  43. {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/METADATA +1 -1
  44. euroeval-16.1.1.dist-info/RECORD +70 -0
  45. euroeval-16.0.1.dist-info/RECORD +0 -69
  46. {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/WHEEL +0 -0
  47. {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/entry_points.txt +0 -0
  48. {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py CHANGED
@@ -44,7 +44,11 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
-from ..generation_utils import apply_prompt, extract_few_shot_examples
+from ..generation_utils import (
+    apply_prompt,
+    extract_few_shot_examples,
+    raise_if_wrong_params,
+)
 from ..languages import get_all_languages
 from ..task_group_utils import (
     question_answering,
@@ -52,7 +56,7 @@ from ..task_group_utils import (
     text_to_text,
     token_classification,
 )
-from ..tokenization_utils import (
+from ..tokenisation_utils import (
     apply_chat_template,
     get_bos_token,
     get_end_of_chat_token_ids,
@@ -69,6 +73,7 @@ from ..utils import (
     get_hf_token,
     get_min_cuda_compute_capability,
     log_once,
+    split_model_id,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
 
@@ -97,6 +102,7 @@ class VLLMModel(HuggingFaceEncoderModel):
     fresh_model = False
     batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = True
+    allowed_params = {re.compile(r".*"): ["thinking", "no-thinking"]}
 
     def __init__(
         self,
@@ -120,42 +126,46 @@ class VLLMModel(HuggingFaceEncoderModel):
         if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")
 
+        raise_if_wrong_params(
+            model_config=model_config, allowed_params=self.allowed_params
+        )
+
         model, tokeniser = load_model_and_tokeniser(
             model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: "LLM" = model
         self._tokeniser: "PreTrainedTokenizer" = tokeniser
+
+        # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
+        # to call the `__init__` method of the `BenchmarkModule` class.
+        super(HuggingFaceEncoderModel, self).__init__(
+            model_config=model_config,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
+        )
+
         self.end_of_reasoning_token = get_end_of_reasoning_token(
             model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
         )
         self.end_of_chat_token_ids = get_end_of_chat_token_ids(
-            tokeniser=self._tokeniser
+            tokeniser=self._tokeniser, generative_type=self.generative_type
         )
         self.custom_stop_tokens = get_custom_stop_tokens(
             model=self._model,
             tokeniser=self._tokeniser,
             model_id=model_config.model_id,
-            is_reasoning_model=self.end_of_reasoning_token is not None,
-        )
-
-        # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
-        # to call the `__init__` method of the `BenchmarkModule` class.
-        super(HuggingFaceEncoderModel, self).__init__(
-            model_config=model_config,
-            dataset_config=dataset_config,
-            benchmark_config=benchmark_config,
-            log_metadata=log_metadata,
+            generative_type=self.generative_type,
         )
 
         self.buffer |= dict(
-            instruction_model=has_chat_template(tokeniser=self._tokeniser),
             first_label_token_mapping=get_first_label_token_mapping(
                 dataset_config=self.dataset_config,
                 model_config=self.model_config,
                 tokeniser=self._tokeniser,
                 generative_type=self.generative_type,
                 log_metadata=self.log_metadata,
-            ),
+            )
         )
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
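
Note: the body of the new `raise_if_wrong_params` helper (imported from `generation_utils` above) is not part of this diff. Below is a minimal sketch of the check it presumably performs, assuming the regex keys of `allowed_params` are matched against the model ID and that an unsupported `#param` suffix raises `InvalidModel`; the function name is hypothetical and marked as such.

import re

from euroeval.data_models import ModelConfig
from euroeval.exceptions import InvalidModel


def raise_if_wrong_params_sketch(
    model_config: ModelConfig, allowed_params: dict[re.Pattern, list[str]]
) -> None:
    """Illustrative check: reject a model `#param` that no pattern allows."""
    if model_config.param is None:
        return
    for pattern, params in allowed_params.items():
        # Assumption: each regex key is matched against the model ID.
        if pattern.search(model_config.model_id) and model_config.param in params:
            return
    raise InvalidModel(
        f"The parameter {model_config.param!r} is not supported for the model "
        f"{model_config.model_id!r}."
    )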
@@ -187,16 +197,36 @@ class VLLMModel(HuggingFaceEncoderModel):
             The generative type of the model, or None if it has not been set yet.
         """
         if not hasattr(self, "_tokeniser"):
+            log_once(
+                "The generative type of the model has not been set yet as the "
+                "tokeniser has not been loaded.",
+                level=logging.DEBUG,
+            )
             return None
-        elif self.end_of_reasoning_token is not None:
-            return GenerativeType.REASONING
+        elif self.benchmark_config.generative_type is not None:
+            type_ = self.benchmark_config.generative_type
+        elif self.model_config.param in {"thinking"}:
+            type_ = GenerativeType.REASONING
+        elif self.model_config.param in {"no-thinking"}:
+            type_ = GenerativeType.INSTRUCTION_TUNED
+        elif (
+            hasattr(self, "end_of_reasoning_token")
+            and self.end_of_reasoning_token is not None
+        ):
+            type_ = GenerativeType.REASONING
         elif (
             has_chat_template(tokeniser=self._tokeniser)
             or "instruct" in self.model_config.model_id.lower()
         ):
-            return GenerativeType.INSTRUCTION_TUNED
+            type_ = GenerativeType.INSTRUCTION_TUNED
         else:
-            return GenerativeType.BASE
+            type_ = GenerativeType.BASE
+        log_once(
+            f"Detected generative type {type_.name!r} for model "
+            f"{self.model_config.model_id!r}",
+            level=logging.DEBUG,
+        )
+        return type_
 
     @property
     def extract_labels_from_generation(self) -> ExtractLabelsFunction:
@@ -285,7 +315,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 few_shot_examples=few_shot_examples,
                 model_config=self.model_config,
                 dataset_config=self.dataset_config,
-                instruction_model=self.buffer["instruction_model"],
+                generative_type=self.generative_type,
                 always_populate_text_field=True,
                 tokeniser=self._tokeniser,
             ),
@@ -313,7 +343,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         """
         # Get stopping tokens
         stop_tokens: list[str] = self.custom_stop_tokens.copy()
-        if self.buffer["instruction_model"] is False:
+        if self.generative_type == GenerativeType.BASE:
             stop_tokens.append("\n\n")
         if self._tokeniser.pad_token_id is not None:
             assert isinstance(self._tokeniser.pad_token, str), (
@@ -430,9 +460,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         labels_to_be_generated = list(self.dataset_config.prompt_label_mapping.values())
         if len(labels_to_be_generated) == 0:
             labels_to_be_generated = ["negative", "positive"]
-        if not self.buffer.get(
-            "instruction_model", False
-        ) and should_prompts_be_stripped(
+        if self.generative_type == GenerativeType.BASE and should_prompts_be_stripped(
             labels_to_be_generated=labels_to_be_generated, tokeniser=self._tokeniser
         ):
             log_once(
@@ -590,9 +618,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         if using_api:
             return False
 
-        model_id, revision = (
-            model_id.split("@") if "@" in model_id else (model_id, "main")
-        )
+        model_id_components = split_model_id(model_id=model_id)
+        model_id = model_id_components.model_id
+        revision = model_id_components.revision
+
         model_info = get_model_repo_info(
             model_id=model_id, revision=revision, benchmark_config=benchmark_config
         )
@@ -616,11 +645,11 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The model configuration.
         """
-        model_id, revision = (
-            model_id.split("@") if "@" in model_id else (model_id, "main")
-        )
+        model_id_components = split_model_id(model_id=model_id)
         model_info = get_model_repo_info(
-            model_id=model_id, revision=revision, benchmark_config=benchmark_config
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            benchmark_config=benchmark_config,
         )
         if model_info is None:
             raise InvalidModel(f"The model {model_id!r} could not be found.")
@@ -629,8 +658,9 @@ class VLLMModel(HuggingFaceEncoderModel):
         language_codes = list(language_mapping.keys())
 
         model_config = ModelConfig(
-            model_id=model_id,
-            revision=revision,
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            param=model_id_components.param,
             task=model_info.pipeline_tag,
             languages=[
                 language_mapping[tag]
@@ -985,7 +1015,11 @@ def get_end_of_reasoning_token(
     prompt = "What is your name?"
     if has_chat_template(tokeniser=tokeniser):
         templated_prompt = apply_chat_template(
-            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
+            conversation=[dict(role="user", content=prompt)],
+            tokeniser=tokeniser,
+            tokenise=False,
+            add_generation_prompt=True,
+            enable_thinking=True,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
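
Note: the extra keyword arguments passed to `apply_chat_template` above come from `tokenisation_utils`, whose diff is not included in this section. Below is a minimal sketch of what such a wrapper could forward to the Hugging Face chat-template API, assuming it delegates to `PreTrainedTokenizer.apply_chat_template` and passes `enable_thinking` through as a template variable; the function name is hypothetical.

from transformers import PreTrainedTokenizer


def apply_chat_template_sketch(
    conversation: list[dict[str, str]],
    tokeniser: PreTrainedTokenizer,
    tokenise: bool = False,
    add_generation_prompt: bool = True,
    enable_thinking: bool = True,
) -> str | list[int]:
    """Illustrative wrapper around the Hugging Face chat-template API."""
    return tokeniser.apply_chat_template(
        conversation,
        tokenize=tokenise,
        add_generation_prompt=add_generation_prompt,
        # Extra kwargs are exposed to the Jinja template; reasoning-style templates
        # read `enable_thinking`, while other templates simply ignore it.
        enable_thinking=enable_thinking,
    )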
@@ -1063,7 +1097,7 @@ def get_custom_stop_tokens(
     model: "LLM",
     tokeniser: "PreTrainedTokenizer",
     model_id: str,
-    is_reasoning_model: bool,
+    generative_type: GenerativeType | None,
 ) -> list[str]:
     """Get the stop tokens for a generative model.
 
@@ -1074,9 +1108,8 @@
             The tokeniser.
         model_id:
             The model ID.
-        is_reasoning_model:
-            Whether the model is a reasoning model. This is used to determine the number
-            of generated tokens to allow before stopping the generation.
+        generative_type:
+            The generative type of the model.
 
     Returns:
         A list of stop tokens.
@@ -1086,12 +1119,18 @@
     prompt = "Hello"
     if has_chat_template(tokeniser=tokeniser):
         templated_prompt = apply_chat_template(
-            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
+            conversation=[dict(role="user", content=prompt)],
+            tokeniser=tokeniser,
+            tokenise=False,
+            add_generation_prompt=True,
+            enable_thinking=generative_type == GenerativeType.REASONING,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
 
-    max_tokens = REASONING_MAX_TOKENS if is_reasoning_model else 10
+    max_tokens = (
+        REASONING_MAX_TOKENS if generative_type == GenerativeType.REASONING else 10
+    )
     completion = (
         model.generate(
             prompts=[prompt],
euroeval/benchmarker.py CHANGED
@@ -19,7 +19,7 @@ from .constants import GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
-from .enums import Device, ModelType
+from .enums import Device, GenerativeType, ModelType
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
 from .generation import generate
@@ -79,6 +79,7 @@ class Benchmarker:
         api_base: str | None = None,
         api_version: str | None = None,
         gpu_memory_utilization: float = 0.9,
+        generative_type: GenerativeType | None = None,
         debug: bool = False,
         run_with_cli: bool = False,
         requires_safetensors: bool = False,
@@ -151,6 +152,10 @@ class Benchmarker:
                 is generative. A larger value will result in faster evaluation, but at
                 the risk of running out of GPU memory. Only reduce this if you are
                 running out of GPU memory. Defaults to 0.9.
+            generative_type:
+                The type of generative model to benchmark. Only relevant if the model is
+                generative. If not specified, then the type will be inferred based on
+                the tags of the model. Defaults to None.
             debug:
                 Whether to output debug information. Defaults to False.
             run_with_cli:
@@ -199,6 +204,7 @@ class Benchmarker:
             api_base=api_base,
             api_version=api_version,
             gpu_memory_utilization=gpu_memory_utilization,
+            generative_type=generative_type,
             debug=debug,
             run_with_cli=run_with_cli,
             requires_safetensors=requires_safetensors,
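
Note: the new `generative_type` argument can also be set programmatically. A minimal usage sketch, assuming `Benchmarker` is importable from the top-level `euroeval` package and that its `benchmark()` method accepts `model` and `dataset` keyword arguments; the model and dataset names below are placeholders.

from euroeval import Benchmarker
from euroeval.enums import GenerativeType

# Force the reasoning code path (REASONING_MAX_TOKENS budget, thinking-enabled
# chat templates) instead of relying on automatic detection.
benchmarker = Benchmarker(generative_type=GenerativeType.REASONING)
benchmarker.benchmark(model="some-org/some-model", dataset="angry-tweets")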
@@ -438,7 +444,7 @@ class Benchmarker:
 
         # Skip if the model type should not be benchmarked on this dataset
         model_type = model_config.model_type
-        allowed_model_types = dataset_config.task.allowed_model_types
+        allowed_model_types = dataset_config.allowed_model_types
         if model_type not in allowed_model_types:
             logger.debug(
                 f"Skipping benchmarking {model_id} on "
@@ -804,6 +810,7 @@ class Benchmarker:
             scores=scores,
             model_id=model_config.model_id,
             model_revision=model_config.revision,
+            model_param=model_config.param,
         )
 
         record = BenchmarkResult(
@@ -1108,6 +1115,8 @@ def initial_logging(
     model_id = model_config.model_id
     if model_config.revision and model_config.revision != "main":
         model_id += f"@{model_config.revision}"
+    if model_config.param is not None:
+        model_id += f"#{model_config.param}"
 
     split_type = "validation" if not benchmark_config.evaluate_test_split else "test"
     if model_config.task in GENERATIVE_PIPELINE_TAGS:
euroeval/cli.py CHANGED
@@ -4,7 +4,7 @@ import click
 
 from .benchmarker import Benchmarker
 from .dataset_configs import get_all_dataset_configs
-from .enums import Device
+from .enums import Device, GenerativeType
 from .languages import get_all_languages
 from .tasks import get_all_tasks
 
@@ -208,6 +208,14 @@ from .tasks import get_all_tasks
     help="Only allow loading models that have safetensors weights available",
     default=False,
 )
+@click.option(
+    "--generative-type",
+    type=click.Choice(["base", "instruction_tuned", "reasoning"]),
+    default=None,
+    show_default=True,
+    help="The type of generative model. Only relevant if the model is generative. If "
+    "not specified, the type will be inferred automatically.",
+)
 def benchmark(
     model: tuple[str],
     dataset: tuple[str],
@@ -234,6 +242,7 @@ def benchmark(
     gpu_memory_utilization: float,
     debug: bool,
     requires_safetensors: bool,
+    generative_type: str | None,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
     models = list(model)
@@ -244,6 +253,9 @@ def benchmark(
     tasks = None if len(task) == 0 else list(task)
     batch_size_int = int(batch_size)
     device = Device[device.upper()] if device is not None else None
+    generative_type_obj = (
+        GenerativeType[generative_type.upper()] if generative_type else None
+    )
 
     benchmarker = Benchmarker(
         language=languages,
@@ -268,6 +280,7 @@ def benchmark(
         api_base=api_base,
         api_version=api_version,
         gpu_memory_utilization=gpu_memory_utilization,
+        generative_type=generative_type_obj,
         debug=debug,
         run_with_cli=True,
         requires_safetensors=requires_safetensors,
euroeval/constants.py CHANGED
@@ -15,7 +15,7 @@ MAX_CONTEXT_LENGTH = 8_192
 
 # We need to raise the amount of tokens generated for reasoning models, to give them
 # time to think
-REASONING_MAX_TOKENS = 32_768
+REASONING_MAX_TOKENS = 8_192
 
 
 # The Hugging Face Hub pipeline tags used to classify models as generative
euroeval/data_models.py CHANGED
@@ -118,14 +118,14 @@ class Task:
             log probabilities for the generated tokens. Defaults to False.
         requires_logprobs (optional):
             Whether the task requires log probabilities. Implies `uses_logprobs`.
-        allowed_model_types (optional):
+        default_allowed_model_types (optional):
             A list of model types that are allowed to be evaluated on this task.
             Defaults to all model types being allowed.
-        allowed_generative_types (optional):
+        default_allowed_generative_types (optional):
             A list of generative model types that are allowed to be evaluated on this
             task. If None, all generative model types are allowed. Only relevant if
             `allowed_model_types` includes generative models.
-        allow_invalid_model_outputs (optional):
+        default_allow_invalid_model_outputs (optional):
             Whether to allow invalid model outputs. This is only relevant for generative
             models on classification tasks, where the model may generate an output
             which is not one of the allowed labels. If True, the model output will be
@@ -144,17 +144,17 @@ class Task:
     uses_structured_output: bool = False
     uses_logprobs: bool = False
     requires_logprobs: bool = False
-    allowed_model_types: list[ModelType] = field(
+    default_allowed_model_types: list[ModelType] = field(
         default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
     )
-    allowed_generative_types: list[GenerativeType] = field(
+    default_allowed_generative_types: list[GenerativeType] = field(
         default_factory=lambda: [
             GenerativeType.BASE,
             GenerativeType.INSTRUCTION_TUNED,
             GenerativeType.REASONING,
         ]
     )
-    allow_invalid_model_outputs: bool = True
+    default_allow_invalid_model_outputs: bool = True
 
     def __post_init__(self) -> None:
         """Post-initialisation checks."""
@@ -225,6 +225,9 @@ class BenchmarkConfig:
             Whether the benchmark is being run with the CLI.
         requires_safetensors:
             Whether to only allow models that use the safetensors format.
+        generative_type:
+            The type of generative model to benchmark. Only relevant if the model is
+            generative.
     """
 
     model_languages: list[Language]
@@ -251,6 +254,7 @@ class BenchmarkConfig:
     debug: bool
     run_with_cli: bool
     requires_safetensors: bool
+    generative_type: GenerativeType | None
 
 
 class BenchmarkConfigParams(pydantic.BaseModel):
@@ -280,6 +284,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     api_base: str | None
     api_version: str | None
     gpu_memory_utilization: float
+    generative_type: GenerativeType | None
     debug: bool
     run_with_cli: bool
     requires_safetensors: bool
@@ -402,6 +407,21 @@ class DatasetConfig:
             to a 1:1 mapping between the labels and themselves. If None then the mapping
             will be set to the default mapping for the task and language. Defaults to
             None.
+        _allowed_model_types (optional):
+            A list of model types that are allowed to be evaluated on this dataset.
+            Defaults to the one for the task.
+        _allowed_generative_types (optional):
+            A list of generative model types that are allowed to be evaluated on this
+            dataset. If None, all generative model types are allowed. Only relevant if
+            `allowed_model_types` includes generative models. Defaults to the one for
+            the task.
+        _allow_invalid_model_outputs (optional):
+            Whether to allow invalid model outputs. This is only relevant for
+            generative models on classification tasks, where the model may generate an
+            output which is not one of the allowed labels. If True, the model output
+            will be mapped to the closest valid label. If False, the model output will
+            be considered incorrect and the evaluation will be aborted. Defaults to
+            the one for the task.
         splits (optional):
             The names of the splits in the dataset. If not provided, defaults to
             ["train", "val", "test"].
@@ -423,6 +443,9 @@ class DatasetConfig:
     _max_generated_tokens: int | None = None
     _labels: list[str] | None = None
     _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
+    _allowed_model_types: list[ModelType] | None = None
+    _allowed_generative_types: list[GenerativeType] | None = None
+    _allow_invalid_model_outputs: bool | None = None
     splits: list[str] = field(default_factory=lambda: ["train", "val", "test"])
     bootstrap_samples: bool = True
     unofficial: bool = False
@@ -502,6 +525,33 @@ class DatasetConfig:
         else:
             return prompt_config.default_prompt_label_mapping
 
+    @property
+    def allowed_model_types(self) -> list[ModelType]:
+        """A list of model types that are allowed to be evaluated on this dataset."""
+        return (
+            self._allowed_model_types
+            if self._allowed_model_types is not None
+            else self.task.default_allowed_model_types
+        )
+
+    @property
+    def allowed_generative_types(self) -> list[GenerativeType]:
+        """A list of generative model types that are allowed on this dataset."""
+        return (
+            self._allowed_generative_types
+            if self._allowed_generative_types is not None
+            else self.task.default_allowed_generative_types
+        )
+
+    @property
+    def allow_invalid_model_outputs(self) -> bool:
+        """Whether to allow invalid model outputs."""
+        return (
+            self._allow_invalid_model_outputs
+            if self._allow_invalid_model_outputs is not None
+            else self.task.default_allow_invalid_model_outputs
+        )
+
     @property
     def id2label(self) -> dict[int, str]:
         """The mapping from ID to label."""
@@ -573,6 +623,8 @@ class ModelConfig:
             The ID of the model.
         revision:
             The revision of the model.
+        param:
+            The parameter of the model, or None if the model has no parameters.
         task:
             The task that the model was trained on.
         languages:
@@ -594,6 +646,7 @@ class ModelConfig:
 
     model_id: str
     revision: str
+    param: str | None
     task: str
     languages: list[Language]
     inference_backend: "InferenceBackend"
@@ -707,3 +760,21 @@ class PromptConfig:
     default_prompt_template: str
     default_instruction_prompt: str
     default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]
+
+
+@dataclass
+class ModelIdComponents:
+    """A model ID split into its components.
+
+    Attributes:
+        model_id:
+            The main model ID without revision or parameters.
+        revision:
+            The revision of the model, if any.
+        param:
+            The parameter of the model, if any.
+    """
+
+    model_id: str
+    revision: str
+    param: str | None
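
Note: the `split_model_id` helper that returns these components lives in `euroeval/utils.py`, whose body is not shown in this diff. Based on how `initial_logging` reassembles the ID (`@` for the revision, `#` for the parameter) and on the removed `(model_id, "main")` fallback, a minimal sketch of the parsing it presumably performs; the function name is hypothetical.

from euroeval.data_models import ModelIdComponents


def split_model_id_sketch(model_id: str) -> ModelIdComponents:
    """Illustrative parser for IDs of the form 'org/model@revision#param'."""
    param: str | None = None
    if "#" in model_id:
        model_id, param = model_id.split("#", maxsplit=1)
    revision = "main"  # the default revision used by the removed inline parsing
    if "@" in model_id:
        model_id, revision = model_id.split("@", maxsplit=1)
    return ModelIdComponents(model_id=model_id, revision=revision, param=param)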
euroeval/dataset_configs/__init__.py CHANGED
@@ -15,6 +15,7 @@ from .icelandic import *  # noqa: F403
 from .italian import *  # noqa: F403
 from .latvian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
+from .polish import *  # noqa: F403
 from .portuguese import *  # noqa: F403
 from .spanish import *  # noqa: F403
 from .swedish import *  # noqa: F403
euroeval/dataset_configs/danish.py CHANGED
@@ -1,6 +1,7 @@
 """All Danish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import DA
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -149,6 +150,19 @@ GOLDENSWAG_DA_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_DA_CONFIG = DatasetConfig(
+    name="winogrande-da",
+    pretty_name="the Danish common-sense reasoning dataset Winogrande-da, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-da",
+    task=COMMON_SENSE,
+    languages=[DA],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
     name="european-values-situational-da",
     pretty_name="the Danish version of the European values evaluation dataset, where "
euroeval/dataset_configs/dutch.py CHANGED
@@ -1,6 +1,7 @@
 """All Dutch dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import NL
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -142,6 +143,19 @@ GOLDENSWAG_NL_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_NL_CONFIG = DatasetConfig(
+    name="winogrande-nl",
+    pretty_name="the Dutch common-sense reasoning dataset Winogrande-nl, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-nl",
+    task=COMMON_SENSE,
+    languages=[NL],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_NL_CONFIG = DatasetConfig(
     name="european-values-situational-nl",
     pretty_name="the Dutch version of the European values evaluation dataset, where "
euroeval/dataset_configs/english.py CHANGED
@@ -1,6 +1,7 @@
 """All English dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import EN
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -80,6 +81,15 @@ EUROPEAN_VALUES_EN_CONFIG = DatasetConfig(
 
 ### Unofficial datasets ###
 
+XQUAD_EN_CONFIG = DatasetConfig(
+    name="xquad-en",
+    pretty_name="the English version of the reading comprehension dataset XQuAD",
+    huggingface_id="EuroEval/xquad-en",
+    task=RC,
+    languages=[EN],
+    unofficial=True,
+)
+
 ARC_CONFIG = DatasetConfig(
     name="arc",
     pretty_name="the truncated version of the English knowledge dataset ARC",
@@ -117,6 +127,18 @@ MULTI_WIKI_QA_EN_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_CONFIG = DatasetConfig(
+    name="winogrande",
+    pretty_name="the English common-sense reasoning dataset Winogrande",
+    huggingface_id="EuroEval/winogrande-en",
+    task=COMMON_SENSE,
+    languages=[EN],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_EN_CONFIG = DatasetConfig(
     name="european-values-situational-en",
     pretty_name="the English version of the European values evaluation dataset, where "