ScandEval 16.12.0__py3-none-any.whl → 16.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. scandeval/async_utils.py +46 -0
  2. scandeval/benchmark_config_factory.py +26 -2
  3. scandeval/benchmark_modules/fresh.py +2 -1
  4. scandeval/benchmark_modules/hf.py +50 -12
  5. scandeval/benchmark_modules/litellm.py +25 -15
  6. scandeval/benchmark_modules/vllm.py +3 -3
  7. scandeval/benchmarker.py +15 -33
  8. scandeval/cli.py +2 -4
  9. scandeval/constants.py +5 -0
  10. scandeval/custom_dataset_configs.py +152 -0
  11. scandeval/data_loading.py +87 -31
  12. scandeval/data_models.py +396 -225
  13. scandeval/dataset_configs/__init__.py +51 -25
  14. scandeval/dataset_configs/albanian.py +1 -1
  15. scandeval/dataset_configs/belarusian.py +47 -0
  16. scandeval/dataset_configs/bulgarian.py +1 -1
  17. scandeval/dataset_configs/catalan.py +1 -1
  18. scandeval/dataset_configs/croatian.py +1 -1
  19. scandeval/dataset_configs/danish.py +3 -2
  20. scandeval/dataset_configs/dutch.py +7 -6
  21. scandeval/dataset_configs/english.py +4 -3
  22. scandeval/dataset_configs/estonian.py +8 -7
  23. scandeval/dataset_configs/faroese.py +1 -1
  24. scandeval/dataset_configs/finnish.py +5 -4
  25. scandeval/dataset_configs/french.py +6 -5
  26. scandeval/dataset_configs/german.py +4 -3
  27. scandeval/dataset_configs/greek.py +1 -1
  28. scandeval/dataset_configs/hungarian.py +1 -1
  29. scandeval/dataset_configs/icelandic.py +4 -3
  30. scandeval/dataset_configs/italian.py +4 -3
  31. scandeval/dataset_configs/latvian.py +2 -2
  32. scandeval/dataset_configs/lithuanian.py +1 -1
  33. scandeval/dataset_configs/norwegian.py +6 -5
  34. scandeval/dataset_configs/polish.py +4 -3
  35. scandeval/dataset_configs/portuguese.py +5 -4
  36. scandeval/dataset_configs/romanian.py +2 -2
  37. scandeval/dataset_configs/serbian.py +1 -1
  38. scandeval/dataset_configs/slovene.py +1 -1
  39. scandeval/dataset_configs/spanish.py +4 -3
  40. scandeval/dataset_configs/swedish.py +4 -3
  41. scandeval/dataset_configs/ukrainian.py +1 -1
  42. scandeval/generation_utils.py +6 -6
  43. scandeval/metrics/llm_as_a_judge.py +1 -1
  44. scandeval/metrics/pipeline.py +1 -1
  45. scandeval/model_cache.py +34 -4
  46. scandeval/prompt_templates/linguistic_acceptability.py +9 -0
  47. scandeval/prompt_templates/multiple_choice.py +9 -0
  48. scandeval/prompt_templates/named_entity_recognition.py +21 -0
  49. scandeval/prompt_templates/reading_comprehension.py +10 -0
  50. scandeval/prompt_templates/sentiment_classification.py +11 -0
  51. scandeval/string_utils.py +157 -0
  52. scandeval/task_group_utils/sequence_classification.py +2 -5
  53. scandeval/task_group_utils/token_classification.py +2 -4
  54. scandeval/utils.py +6 -323
  55. scandeval-16.13.0.dist-info/METADATA +334 -0
  56. scandeval-16.13.0.dist-info/RECORD +94 -0
  57. scandeval-16.12.0.dist-info/METADATA +0 -667
  58. scandeval-16.12.0.dist-info/RECORD +0 -90
  59. {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
  60. {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
  61. {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/data_models.py CHANGED
@@ -1,7 +1,10 @@
 """Data models used in EuroEval."""

 import collections.abc as c
+import importlib.metadata
+import importlib.util
 import json
+import logging
 import re
 import typing as t
 from copy import deepcopy
@@ -12,7 +15,7 @@ import pydantic
 import torch
 from transformers.generation.configuration_utils import GenerationConfig

-from .constants import ATTENTION_BACKENDS
+from .constants import ATTENTION_BACKENDS, MAX_NUMBER_OF_LOGGING_LANGUAGES
 from .enums import Device, GenerativeType, ModelType, TaskGroup
 from .exceptions import InvalidBenchmark
 from .languages import (
@@ -24,14 +27,30 @@ from .languages import (
     PORTUGUESE,
     Language,
 )
+from .logging_utils import log_once
 from .metrics.base import Metric
 from .types import ScoreDict
-from .utils import get_package_version

 if t.TYPE_CHECKING:
     from .enums import InferenceBackend


+def get_package_version(package_name: str) -> str | None:
+    """Get the version of a package.
+
+    Args:
+        package_name:
+            The name of the package.
+
+    Returns:
+        The version of the package, or None if the package is not installed.
+    """
+    try:
+        return importlib.metadata.version(package_name)
+    except importlib.metadata.PackageNotFoundError:
+        return None
+
+
 @dataclass
 class PromptConfig:
     """Configuration for task-specific prompting across languages.
@@ -80,8 +99,9 @@ class Task:
         default_max_generated_tokens:
             The default maximum number of tokens to generate when benchmarking the task
             using few-shot evaluation.
-        default_labels:
-            The default labels for datasets using this task.
+        default_labels (optional):
+            The default labels for datasets using this task. Can be None if the labels
+            should be set manually in the dataset configs. Defaults to an empty tuple.
         requires_zero_shot (optional):
             Whether to only allow zero-shot evaluation for this task. If True, the
             task will not be evaluated using few-shot examples.
@@ -118,7 +138,7 @@ class Task:
     metrics: c.Sequence[Metric]
     default_num_few_shot_examples: int
     default_max_generated_tokens: int
-    default_labels: c.Sequence[str] | None
+    default_labels: c.Sequence[str] | None = tuple()
     requires_zero_shot: bool = False
     uses_structured_output: bool = False
     uses_logprobs: bool = False
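The practical effect of giving `default_labels` a default is that `Task` definitions which previously had to pass `default_labels=None` explicitly can now omit the field, with an empty tuple meaning the labels are set per dataset config. A self-contained sketch of the dataclass mechanics (`MiniTask` is a reduced stand-in, not the real `Task`):

import collections.abc as c
from dataclasses import dataclass

@dataclass
class MiniTask:
    # Reduced stand-in for scandeval's Task, keeping only the relevant field.
    name: str
    default_labels: c.Sequence[str] | None = tuple()

# Before: default_labels was a required field; now it can be omitted...
ner_like = MiniTask(name="named-entity-recognition")
assert ner_like.default_labels == tuple()

# ...or still set explicitly where the task has canonical labels.
sent_like = MiniTask(
    name="sentiment", default_labels=("negative", "neutral", "positive")
)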
@@ -144,133 +164,362 @@ class Task:
         return hash(self.name)


-@dataclass
 class DatasetConfig:
-    """Configuration for a dataset.
+    """Configuration for a dataset."""
+
+    def __init__(
+        self,
+        task: Task,
+        languages: c.Sequence[Language],
+        name: str | None = None,
+        pretty_name: str | None = None,
+        source: str | dict[str, str] | None = None,
+        prompt_prefix: str | None = None,
+        prompt_template: str | None = None,
+        instruction_prompt: str | None = None,
+        num_few_shot_examples: int | None = None,
+        max_generated_tokens: int | None = None,
+        labels: c.Sequence[str] | None = None,
+        prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None,
+        allowed_model_types: c.Sequence[ModelType] | None = None,
+        allowed_generative_types: c.Sequence[GenerativeType] | None = None,
+        allow_invalid_model_outputs: bool | None = None,
+        train_split: str | None = "train",
+        val_split: str | None = "val",
+        test_split: str = "test",
+        bootstrap_samples: bool = True,
+        unofficial: bool = False,
+        _prompt_prefix: str | None = None,
+        _prompt_template: str | None = None,
+        _instruction_prompt: str | None = None,
+        _num_few_shot_examples: int | None = None,
+        _max_generated_tokens: int | None = None,
+        _labels: c.Sequence[str] | None = None,
+        _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None,
+        _allowed_model_types: c.Sequence[ModelType] | None = None,
+        _allowed_generative_types: c.Sequence[GenerativeType] | None = None,
+        _allow_invalid_model_outputs: bool | None = None,
+        _logging_string: str | None = None,
+    ) -> None:
+        """Initialise a DatasetConfig object.

-    Attributes:
-        name:
-            The name of the dataset. Must be lower case with no spaces.
-        pretty_name:
-            A longer prettier name for the dataset, which allows cases and spaces. Used
-            for logging.
-        source:
-            The source of the dataset, which can be a Hugging Face ID or a dictionary
-            with keys "train", "val" and "test" mapping to local CSV file paths.
-        task:
-            The task of the dataset.
-        languages:
-            The ISO 639-1 language codes of the entries in the dataset.
-        id2label:
-            The mapping from ID to label.
-        label2id:
-            The mapping from label to ID.
-        num_labels:
-            The number of labels in the dataset.
-        _prompt_prefix (optional):
-            The prefix to use in the few-shot prompt. Defaults to the template for the
-            task and language.
-        _prompt_template (optional):
-            The template for the prompt to use when benchmarking the dataset using
-            few-shot evaluation. Defaults to the template for the task and language.
-        _instruction_prompt (optional):
-            The prompt to use when benchmarking the dataset using instruction-based
-            evaluation. Defaults to the template for the task and language.
-        _num_few_shot_examples (optional):
-            The number of examples to use when benchmarking the dataset using few-shot
-            evaluation. For a classification task, these will be drawn evenly from
-            each label. Defaults to the template for the task and language.
-        _max_generated_tokens (optional):
-            The maximum number of tokens to generate when benchmarking the dataset
-            using few-shot evaluation. Defaults to the template for the task and
-            language.
-        _labels (optional):
-            The labels in the dataset. Defaults to the template for the task and
-            language.
-        _prompt_label_mapping (optional):
-            A mapping from the labels to another phrase which is used as a substitute
-            for the label in few-shot evaluation. If "auto" then the mapping will be set
-            to a 1:1 mapping between the labels and themselves. If None then the mapping
-            will be set to the default mapping for the task and language. Defaults to
-            None.
-        _allowed_model_types (optional):
-            A list of model types that are allowed to be evaluated on this dataset.
-            Defaults to the one for the task.
-        _allowed_generative_types (optional):
-            A list of generative model types that are allowed to be evaluated on this
-            dataset. If None, all generative model types are allowed. Only relevant if
-            `allowed_model_types` includes generative models. Defaults to the one for
-            the task.
-        _allow_invalid_model_outputs (optional):
-            Whether to allow invalid model outputs. This is only relevant for
-            generative models on classification tasks, where the model may generate an
-            output which is not one of the allowed labels. If True, the model output
-            will be mapped to the closest valid label. If False, the model output will
-            be considered incorrect and the evaluation will be aborted. Defaults to
-            the one for the task.
-        _logging_string (optional):
-            The string used to describe evaluation on the dataset in logging. If not
-            provided, a default string will be generated, based on the pretty name. Only
-            use this if the default string is not suitable.
-        splits (optional):
-            The names of the splits in the dataset. If not provided, defaults to
-            ["train", "val", "test"].
-        bootstrap_samples (optional):
-            Whether to bootstrap the dataset samples. Defaults to True.
-        unofficial (optional):
-            Whether the dataset is unofficial. Defaults to False.
-    """
+        Args:
+            task:
+                The task of the dataset.
+            languages:
+                The ISO 639-1 language codes of the entries in the dataset.
+            name (optional):
+                The name of the dataset. Must be lower case with no spaces. Can be None
+                if and only if the dataset config resides directly in the Hugging Face
+                dataset repo. Defaults to None.
+            pretty_name (optional):
+                A longer prettier name for the dataset, which allows cases and spaces.
+                Used for logging. Can be None if and only if the dataset config resides
+                directly in the Hugging Face dataset repo. Defaults to None.
+            source (optional):
+                The source of the dataset, which can be a Hugging Face ID or a
+                dictionary with keys "train", "val" and "test" mapping to local CSV file
+                paths. Can be None if and only if the dataset config resides directly in
+                the Hugging Face dataset repo. Defaults to None.
+            prompt_prefix (optional):
+                The prefix to use in the few-shot prompt. Defaults to the template for
+                the task and language.
+            prompt_template (optional):
+                The template for the prompt to use when benchmarking the dataset using
+                few-shot evaluation. Defaults to the template for the task and language.
+            instruction_prompt (optional):
+                The prompt to use when benchmarking the dataset using instruction-based
+                evaluation. Defaults to the template for the task and language.
+            num_few_shot_examples (optional):
+                The number of examples to use when benchmarking the dataset using
+                few-shot evaluation. For a classification task, these will be drawn
+                evenly from each label. Defaults to the template for the task and
+                language.
+            max_generated_tokens (optional):
+                The maximum number of tokens to generate when benchmarking the dataset
+                using few-shot evaluation. Defaults to the template for the task and
+                language.
+            labels (optional):
+                The labels in the dataset. Defaults to the template for the task and
+                language.
+            prompt_label_mapping (optional):
+                A mapping from the labels to another phrase which is used as a
+                substitute for the label in few-shot evaluation. If "auto" then the
+                mapping will be set to a 1:1 mapping between the labels and themselves.
+                If None then the mapping will be set to the default mapping for the task
+                and language. Defaults to None.
+            allowed_model_types (optional):
+                A list of model types that are allowed to be evaluated on this dataset.
+                Defaults to the one for the task.
+            allowed_generative_types (optional):
+                A list of generative model types that are allowed to be evaluated on
+                this dataset. If None, all generative model types are allowed. Only
+                relevant if `allowed_model_types` includes generative models. Defaults
+                to the one for the task.
+            allow_invalid_model_outputs (optional):
+                Whether to allow invalid model outputs. This is only relevant for
+                generative models on classification tasks, where the model may generate
+                an output which is not one of the allowed labels. If True, the model
+                output will be mapped to the closest valid label. If False, the model
+                output will be considered incorrect and the evaluation will be aborted.
+                Defaults to the one for the task.
+            train_split (optional):
+                The name of the split to use as the training set. Can be None if there
+                is no training split in the dataset. Defaults to "train".
+            val_split (optional):
+                The name of the split to use as the validation set. Can be None if there
+                is no validation split in the dataset. Defaults to "val".
+            test_split (optional):
+                The name of the split to use as the test set. Defaults to "test".
+            bootstrap_samples (optional):
+                Whether to bootstrap the dataset samples. Defaults to True.
+            unofficial (optional):
+                Whether the dataset is unofficial. Defaults to False.
+            _prompt_prefix (optional):
+                This argument is deprecated. Please use `prompt_prefix` instead.
+            _prompt_template (optional):
+                This argument is deprecated. Please use `prompt_template` instead.
+            _instruction_prompt (optional):
+                This argument is deprecated. Please use `instruction_prompt` instead.
+            _num_few_shot_examples (optional):
+                This argument is deprecated. Please use `num_few_shot_examples` instead.
+            _max_generated_tokens (optional):
+                This argument is deprecated. Please use `max_generated_tokens` instead.
+            _labels (optional):
+                This argument is deprecated. Please use `labels` instead.
+            _prompt_label_mapping (optional):
+                This argument is deprecated. Please use `prompt_label_mapping` instead.
+            _allowed_model_types (optional):
+                This argument is deprecated. Please use `allowed_model_types` instead.
+            _allowed_generative_types (optional):
+                This argument is deprecated. Please use `allowed_generative_types`
+                instead.
+            _allow_invalid_model_outputs (optional):
+                This argument is deprecated. Please use `allow_invalid_model_outputs`
+                instead.
+            _logging_string (optional):
+                This argument is deprecated. Please use `logging_string` instead.
+        """
+        # Deprecation warnings
+        if _prompt_prefix is not None:
+            log_once(
+                "The `_prompt_prefix` argument is deprecated. Please use "
+                "`prompt_prefix` instead.",
+                level=logging.WARNING,
+            )
+            prompt_prefix = _prompt_prefix
+        if _prompt_template is not None:
+            log_once(
+                "The `_prompt_template` argument is deprecated. Please use "
+                "`prompt_template` instead.",
+                level=logging.WARNING,
+            )
+            prompt_template = _prompt_template
+        if _instruction_prompt is not None:
+            log_once(
+                "The `_instruction_prompt` argument is deprecated. Please use "
+                "`instruction_prompt` instead.",
+                level=logging.WARNING,
+            )
+            instruction_prompt = _instruction_prompt
+        if _num_few_shot_examples is not None:
+            log_once(
+                "The `_num_few_shot_examples` argument is deprecated. Please use "
+                "`num_few_shot_examples` instead.",
+                level=logging.WARNING,
+            )
+            num_few_shot_examples = _num_few_shot_examples
+        if _max_generated_tokens is not None:
+            log_once(
+                "The `_max_generated_tokens` argument is deprecated. Please use "
+                "`max_generated_tokens` instead.",
+                level=logging.WARNING,
+            )
+            max_generated_tokens = _max_generated_tokens
+        if _labels is not None:
+            log_once(
+                "The `_labels` argument is deprecated. Please use `labels` instead.",
+                level=logging.WARNING,
+            )
+            labels = _labels
+        if _prompt_label_mapping is not None:
+            log_once(
+                "The `_prompt_label_mapping` argument is deprecated. Please use "
+                "`prompt_label_mapping` instead.",
+                level=logging.WARNING,
+            )
+            prompt_label_mapping = _prompt_label_mapping
+        if _allowed_model_types is not None:
+            log_once(
+                "The `_allowed_model_types` argument is deprecated. Please use "
+                "`allowed_model_types` instead.",
+                level=logging.WARNING,
+            )
+            allowed_model_types = _allowed_model_types
+        if _allowed_generative_types is not None:
+            log_once(
+                "The `_allowed_generative_types` argument is deprecated. Please use "
+                "`allowed_generative_types` instead.",
+                level=logging.WARNING,
+            )
+            allowed_generative_types = _allowed_generative_types
+        if _allow_invalid_model_outputs is not None:
+            log_once(
+                "The `_allow_invalid_model_outputs` argument is deprecated. Please use "
+                "`allow_invalid_model_outputs` instead.",
+                level=logging.WARNING,
+            )
+            allow_invalid_model_outputs = _allow_invalid_model_outputs
+        if _logging_string is not None:
+            log_once(
+                "The `_logging_string` argument is deprecated and is not used anymore. "
+                "Using it will have no effect.",
+                level=logging.WARNING,
+            )

-    name: str
-    pretty_name: str
-    source: str | dict[str, str]
-    task: Task
-    languages: c.Sequence[Language]
-    _prompt_prefix: str | None = None
-    _prompt_template: str | None = None
-    _instruction_prompt: str | None = None
-    _num_few_shot_examples: int | None = None
-    _max_generated_tokens: int | None = None
-    _labels: c.Sequence[str] | None = None
-    _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
-    _allowed_model_types: c.Sequence[ModelType] | None = None
-    _allowed_generative_types: c.Sequence[GenerativeType] | None = None
-    _allow_invalid_model_outputs: bool | None = None
-    _logging_string: str | None = None
-    splits: c.Sequence[str] = field(default_factory=lambda: ["train", "val", "test"])
-    bootstrap_samples: bool = True
-    unofficial: bool = False
+        self._name = name
+        self._pretty_name = pretty_name
+        self._source = source
+        self.task = task
+        self.languages = languages
+
+        template = self.task.template_dict.get(self.main_language)
+        self.prompt_prefix = (
+            prompt_prefix
+            if prompt_prefix is not None
+            else template.default_prompt_prefix
+            if template is not None
+            else ""
+        )
+        self.prompt_template = (
+            prompt_template
+            if prompt_template is not None
+            else template.default_prompt_template
+            if template is not None
+            else ""
+        )
+        self.instruction_prompt = (
+            instruction_prompt
+            if instruction_prompt is not None
+            else template.default_instruction_prompt
+            if template is not None
+            else ""
+        )
+        self.num_few_shot_examples = (
+            num_few_shot_examples
+            if num_few_shot_examples is not None
+            else self.task.default_num_few_shot_examples
+        )
+        self.max_generated_tokens = (
+            max_generated_tokens
+            if max_generated_tokens is not None
+            else self.task.default_max_generated_tokens
+        )
+        self.labels = (
+            labels if labels is not None else self.task.default_labels or list()
+        )
+        if prompt_label_mapping is None:
+            prompt_label_mapping = (
+                template.default_prompt_label_mapping
+                if template is not None
+                else dict()
+            )
+        self.prompt_label_mapping = (
+            {label: label for label in self.labels}
+            if prompt_label_mapping == "auto"
+            else prompt_label_mapping
+        )
+        self.allowed_model_types = (
+            allowed_model_types
+            if allowed_model_types is not None
+            else self.task.default_allowed_model_types
+        )
+        self.allowed_generative_types = (
+            allowed_generative_types
+            if allowed_generative_types is not None
+            else self.task.default_allowed_generative_types
+        )
+        self.allow_invalid_model_outputs = (
+            allow_invalid_model_outputs
+            if allow_invalid_model_outputs is not None
+            else self.task.default_allow_invalid_model_outputs
+        )
+        self.train_split = train_split
+        self.val_split = val_split
+        self.test_split = test_split
+        self.bootstrap_samples = bootstrap_samples
+        self.unofficial = unofficial

     @property
-    def main_language(self) -> Language:
-        """Get the main language of the dataset.
+    def name(self) -> str:
+        """The name of the dataset.

         Returns:
-            The main language.
+            The name of the dataset.
         """
-        match len(self.languages):
-            case 0:
-                raise InvalidBenchmark(
-                    f"Dataset {self.name!r} must have at least one language."
-                )
-            case 1:
-                return self.languages[0]
-            case _:
-                if ENGLISH in self.languages:
-                    return ENGLISH
-                elif NORWEGIAN in self.languages:
-                    return NORWEGIAN
-                elif PORTUGUESE in self.languages:
-                    return PORTUGUESE
-                else:
-                    return self.languages[0]
+        if self._name is None:
+            raise ValueError("The name of the dataset is not set!")
+        return self._name
+
+    @name.setter
+    def name(self, value: str) -> None:
+        """Set the name of the dataset.
+
+        Args:
+            value:
+                The new name of the dataset.
+        """
+        self._name = value
+
+    @property
+    def pretty_name(self) -> str:
+        """The pretty name of the dataset.
+
+        Returns:
+            The pretty name of the dataset.
+        """
+        if self._pretty_name is None:
+            raise ValueError("The pretty name of the dataset is not set!")
+        return self._pretty_name
+
+    @pretty_name.setter
+    def pretty_name(self, value: str) -> None:
+        """Set the pretty name of the dataset.
+
+        Args:
+            value:
+                The new pretty name of the dataset.
+        """
+        self._pretty_name = value
+
+    @property
+    def source(self) -> str | dict[str, str]:
+        """The source of the dataset.
+
+        Returns:
+            The source of the dataset.
+        """
+        if self._source is None:
+            raise ValueError("The source of the dataset is not set!")
+        return self._source
+
+    @source.setter
+    def source(self, value: str | dict[str, str]) -> None:
+        """Set the source of the dataset.
+
+        Args:
+            value:
+                The new source of the dataset.
+        """
+        self._source = value

     @property
     def logging_string(self) -> str:
-        """The string used to describe evaluation on the dataset in logging."""
-        if self._logging_string is not None:
-            return self._logging_string
+        """The string used to describe evaluation on the dataset in logging.

+        Returns:
+            The logging string.
+        """
         truncated_str = (
             "truncated version of the "
             if isinstance(self.source, str) and self.source.endswith("-mini")
@@ -294,126 +543,48 @@ class DatasetConfig:
         if PORTUGUESE in self.languages and EUROPEAN_PORTUGUESE in self.languages:
             logging_languages.remove(EUROPEAN_PORTUGUESE)

-        if len(logging_languages) > 1:
+        if len(logging_languages) > MAX_NUMBER_OF_LOGGING_LANGUAGES:
+            languages_str = ""
+        elif len(logging_languages) > 1:
             languages_str = (
                 ", ".join([lang.name for lang in logging_languages[:-1]])
                 + f" and {logging_languages[-1].name}"
+                + " "
             )
         else:
-            languages_str = logging_languages[0].name
+            languages_str = logging_languages[0].name + " "

         task_str = self.task.name.replace("-", " ")
         dataset_name_str = (
             self.pretty_name or self.name.replace("-", " ").replace("_", " ").title()
         )
         return (
-            f"the {truncated_str}{languages_str} {task_str} dataset {dataset_name_str}"
-        )
-
-    @property
-    def prompt_prefix(self) -> str:
-        """The prefix to use in the few-shot prompt."""
-        prompt_config = self.task.template_dict[self.main_language]
-        prompt_prefix = (
-            prompt_config.default_prompt_prefix
-            if self._prompt_prefix is None
-            else self._prompt_prefix
-        )
-        return prompt_prefix
-
-    @property
-    def prompt_template(self) -> str:
-        """The template used during few-shot evaluation."""
-        prompt_config = self.task.template_dict[self.main_language]
-        prompt_template = (
-            prompt_config.default_prompt_template
-            if self._prompt_template is None
-            else self._prompt_template
-        )
-        return prompt_template
-
-    @property
-    def instruction_prompt(self) -> str:
-        """The prompt to use when evaluating instruction-tuned models."""
-        prompt_config = self.task.template_dict[self.main_language]
-        instruction_prompt = (
-            prompt_config.default_instruction_prompt
-            if self._instruction_prompt is None
-            else self._instruction_prompt
-        )
-        return instruction_prompt
-
-    @property
-    def num_few_shot_examples(self) -> int:
-        """The number of few-shot examples to use."""
-        return (
-            self._num_few_shot_examples
-            if self._num_few_shot_examples is not None
-            else self.task.default_num_few_shot_examples
-        )
-
-    @property
-    def max_generated_tokens(self) -> int:
-        """The maximum number of tokens to generate when evaluating a model."""
-        return (
-            self._max_generated_tokens
-            if self._max_generated_tokens is not None
-            else self.task.default_max_generated_tokens
-        )
-
-    @property
-    def labels(self) -> c.Sequence[str]:
-        """The labels in the dataset."""
-        if self._labels is not None:
-            return self._labels
-        elif self.task.default_labels is not None:
-            return self.task.default_labels
-        else:
-            raise ValueError(
-                f"Labels must be specified for dataset {self.name!r} with the "
-                f"attribute `_labels`, as the task {self.task.name!r} does not have "
-                "default labels."
-            )
-
-    @property
-    def prompt_label_mapping(self) -> dict[str, str]:
-        """Mapping from English labels to localised labels."""
-        if self._prompt_label_mapping == "auto":
-            return {label: label for label in self.labels}
-        elif self._prompt_label_mapping is not None:
-            return self._prompt_label_mapping
-        prompt_config = self.task.template_dict[self.main_language]
-        if prompt_config.default_prompt_label_mapping == "auto":
-            return {label: label for label in self.labels}
-        else:
-            return prompt_config.default_prompt_label_mapping
-
-    @property
-    def allowed_model_types(self) -> c.Sequence[ModelType]:
-        """A list of model types that are allowed to be evaluated on this dataset."""
-        return (
-            self._allowed_model_types
-            if self._allowed_model_types is not None
-            else self.task.default_allowed_model_types
+            f"the {truncated_str}{languages_str}{task_str} dataset {dataset_name_str}"
         )

     @property
-    def allowed_generative_types(self) -> c.Sequence[GenerativeType]:
-        """A list of generative model types that are allowed on this dataset."""
-        return (
-            self._allowed_generative_types
-            if self._allowed_generative_types is not None
-            else self.task.default_allowed_generative_types
-        )
+    def main_language(self) -> Language:
+        """Get the main language of the dataset.

-    @property
-    def allow_invalid_model_outputs(self) -> bool:
-        """Whether to allow invalid model outputs."""
-        return (
-            self._allow_invalid_model_outputs
-            if self._allow_invalid_model_outputs is not None
-            else self.task.default_allow_invalid_model_outputs
-        )
+        Returns:
+            The main language.
+        """
+        match len(self.languages):
+            case 0:
+                raise InvalidBenchmark(
+                    f"Dataset {self.name!r} must have at least one language."
+                )
+            case 1:
+                return self.languages[0]
+            case _:
+                if ENGLISH in self.languages:
+                    return ENGLISH
+                elif NORWEGIAN in self.languages:
+                    return NORWEGIAN
+                elif PORTUGUESE in self.languages:
+                    return PORTUGUESE
+                else:
+                    return self.languages[0]

     @property
     def id2label(self) -> "HashableDict":
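The `logging_string` change caps how many languages are spelled out: beyond `MAX_NUMBER_OF_LOGGING_LANGUAGES` the language list is dropped entirely, and the trailing space has moved into `languages_str` itself (hence `{languages_str}{task_str}` replacing `{languages_str} {task_str}`). A standalone sketch of the new branching, assuming a cap of 5 (the actual constant lives in scandeval/constants.py and its value is not shown in this diff):

MAX_NUMBER_OF_LOGGING_LANGUAGES = 5  # assumed value, for illustration only

def build_languages_str(language_names: list[str]) -> str:
    """Mirror of the languages_str branching in DatasetConfig.logging_string."""
    if len(language_names) > MAX_NUMBER_OF_LOGGING_LANGUAGES:
        # Too many languages: omit the list from the logging string entirely.
        return ""
    elif len(language_names) > 1:
        # E.g. "Danish, Swedish and Norwegian " (note the trailing space).
        return ", ".join(language_names[:-1]) + f" and {language_names[-1]}" + " "
    else:
        return language_names[0] + " "

assert build_languages_str(["Danish"]) == "Danish "
assert build_languages_str(["Danish", "Swedish"]) == "Danish and Swedish "
assert build_languages_str([f"Language {i}" for i in range(10)]) == ""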