ScandEval 16.11.0__py3-none-any.whl → 16.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. scandeval/__init__.py +0 -9
  2. scandeval/async_utils.py +46 -0
  3. scandeval/benchmark_config_factory.py +31 -2
  4. scandeval/benchmark_modules/fresh.py +2 -1
  5. scandeval/benchmark_modules/hf.py +76 -23
  6. scandeval/benchmark_modules/litellm.py +33 -15
  7. scandeval/benchmark_modules/vllm.py +97 -44
  8. scandeval/benchmarker.py +29 -33
  9. scandeval/cli.py +11 -0
  10. scandeval/constants.py +36 -2
  11. scandeval/custom_dataset_configs.py +152 -0
  12. scandeval/data_loading.py +87 -31
  13. scandeval/data_models.py +405 -224
  14. scandeval/dataset_configs/__init__.py +51 -25
  15. scandeval/dataset_configs/albanian.py +1 -1
  16. scandeval/dataset_configs/belarusian.py +47 -0
  17. scandeval/dataset_configs/bulgarian.py +1 -1
  18. scandeval/dataset_configs/catalan.py +1 -1
  19. scandeval/dataset_configs/croatian.py +1 -1
  20. scandeval/dataset_configs/danish.py +3 -2
  21. scandeval/dataset_configs/dutch.py +16 -5
  22. scandeval/dataset_configs/english.py +4 -3
  23. scandeval/dataset_configs/estonian.py +8 -7
  24. scandeval/dataset_configs/faroese.py +1 -1
  25. scandeval/dataset_configs/finnish.py +5 -4
  26. scandeval/dataset_configs/french.py +6 -5
  27. scandeval/dataset_configs/german.py +4 -3
  28. scandeval/dataset_configs/greek.py +1 -1
  29. scandeval/dataset_configs/hungarian.py +1 -1
  30. scandeval/dataset_configs/icelandic.py +4 -3
  31. scandeval/dataset_configs/italian.py +4 -3
  32. scandeval/dataset_configs/latvian.py +2 -2
  33. scandeval/dataset_configs/lithuanian.py +1 -1
  34. scandeval/dataset_configs/norwegian.py +6 -5
  35. scandeval/dataset_configs/polish.py +4 -3
  36. scandeval/dataset_configs/portuguese.py +5 -4
  37. scandeval/dataset_configs/romanian.py +2 -2
  38. scandeval/dataset_configs/serbian.py +1 -1
  39. scandeval/dataset_configs/slovene.py +1 -1
  40. scandeval/dataset_configs/spanish.py +4 -3
  41. scandeval/dataset_configs/swedish.py +4 -3
  42. scandeval/dataset_configs/ukrainian.py +1 -1
  43. scandeval/generation_utils.py +6 -6
  44. scandeval/metrics/__init__.py +1 -0
  45. scandeval/metrics/bias.py +237 -0
  46. scandeval/metrics/huggingface.py +2 -1
  47. scandeval/metrics/llm_as_a_judge.py +1 -1
  48. scandeval/metrics/pipeline.py +1 -1
  49. scandeval/model_cache.py +34 -4
  50. scandeval/prompt_templates/linguistic_acceptability.py +9 -0
  51. scandeval/prompt_templates/multiple_choice.py +9 -0
  52. scandeval/prompt_templates/named_entity_recognition.py +21 -0
  53. scandeval/prompt_templates/reading_comprehension.py +10 -0
  54. scandeval/prompt_templates/sentiment_classification.py +11 -0
  55. scandeval/string_utils.py +157 -0
  56. scandeval/task_group_utils/sequence_classification.py +2 -5
  57. scandeval/task_group_utils/token_classification.py +2 -4
  58. scandeval/tasks.py +22 -0
  59. scandeval/tokenisation_utils.py +12 -1
  60. scandeval/utils.py +13 -383
  61. scandeval-16.13.0.dist-info/METADATA +334 -0
  62. scandeval-16.13.0.dist-info/RECORD +94 -0
  63. scandeval-16.11.0.dist-info/METADATA +0 -649
  64. scandeval-16.11.0.dist-info/RECORD +0 -89
  65. {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
  66. {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
  67. {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/data_models.py CHANGED
@@ -1,7 +1,10 @@
 """Data models used in EuroEval."""
 
 import collections.abc as c
+import importlib.metadata
+import importlib.util
 import json
+import logging
 import re
 import typing as t
 from copy import deepcopy
@@ -12,6 +15,7 @@ import pydantic
 import torch
 from transformers.generation.configuration_utils import GenerationConfig
 
+from .constants import ATTENTION_BACKENDS, MAX_NUMBER_OF_LOGGING_LANGUAGES
 from .enums import Device, GenerativeType, ModelType, TaskGroup
 from .exceptions import InvalidBenchmark
 from .languages import (
@@ -23,14 +27,30 @@ from .languages import (
     PORTUGUESE,
     Language,
 )
+from .logging_utils import log_once
 from .metrics.base import Metric
 from .types import ScoreDict
-from .utils import get_package_version
 
 if t.TYPE_CHECKING:
     from .enums import InferenceBackend
 
 
+def get_package_version(package_name: str) -> str | None:
+    """Get the version of a package.
+
+    Args:
+        package_name:
+            The name of the package.
+
+    Returns:
+        The version of the package, or None if the package is not installed.
+    """
+    try:
+        return importlib.metadata.version(package_name)
+    except importlib.metadata.PackageNotFoundError:
+        return None
+
+
 @dataclass
 class PromptConfig:
     """Configuration for task-specific prompting across languages.
@@ -79,8 +99,9 @@ class Task:
         default_max_generated_tokens:
             The default maximum number of tokens to generate when benchmarking the task
            using few-shot evaluation.
-        default_labels:
-            The default labels for datasets using this task.
+        default_labels (optional):
+            The default labels for datasets using this task. Can be None if the labels
+            should be set manually in the dataset configs. Defaults to an empty tuple.
         requires_zero_shot (optional):
             Whether to only allow zero-shot evaluation for this task. If True, the
             task will not be evaluated using few-shot examples.
@@ -117,7 +138,7 @@ class Task:
     metrics: c.Sequence[Metric]
     default_num_few_shot_examples: int
     default_max_generated_tokens: int
-    default_labels: c.Sequence[str] | None
+    default_labels: c.Sequence[str] | None = tuple()
     requires_zero_shot: bool = False
     uses_structured_output: bool = False
     uses_logprobs: bool = False
@@ -143,133 +164,362 @@ class Task:
         return hash(self.name)
 
 
-@dataclass
 class DatasetConfig:
-    """Configuration for a dataset.
+    """Configuration for a dataset."""
+
+    def __init__(
+        self,
+        task: Task,
+        languages: c.Sequence[Language],
+        name: str | None = None,
+        pretty_name: str | None = None,
+        source: str | dict[str, str] | None = None,
+        prompt_prefix: str | None = None,
+        prompt_template: str | None = None,
+        instruction_prompt: str | None = None,
+        num_few_shot_examples: int | None = None,
+        max_generated_tokens: int | None = None,
+        labels: c.Sequence[str] | None = None,
+        prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None,
+        allowed_model_types: c.Sequence[ModelType] | None = None,
+        allowed_generative_types: c.Sequence[GenerativeType] | None = None,
+        allow_invalid_model_outputs: bool | None = None,
+        train_split: str | None = "train",
+        val_split: str | None = "val",
+        test_split: str = "test",
+        bootstrap_samples: bool = True,
+        unofficial: bool = False,
+        _prompt_prefix: str | None = None,
+        _prompt_template: str | None = None,
+        _instruction_prompt: str | None = None,
+        _num_few_shot_examples: int | None = None,
+        _max_generated_tokens: int | None = None,
+        _labels: c.Sequence[str] | None = None,
+        _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None,
+        _allowed_model_types: c.Sequence[ModelType] | None = None,
+        _allowed_generative_types: c.Sequence[GenerativeType] | None = None,
+        _allow_invalid_model_outputs: bool | None = None,
+        _logging_string: str | None = None,
+    ) -> None:
+        """Initialise a DatasetConfig object.
 
-    Attributes:
-        name:
-            The name of the dataset. Must be lower case with no spaces.
-        pretty_name:
-            A longer prettier name for the dataset, which allows cases and spaces. Used
-            for logging.
-        source:
-            The source of the dataset, which can be a Hugging Face ID or a dictionary
-            with keys "train", "val" and "test" mapping to local CSV file paths.
-        task:
-            The task of the dataset.
-        languages:
-            The ISO 639-1 language codes of the entries in the dataset.
-        id2label:
-            The mapping from ID to label.
-        label2id:
-            The mapping from label to ID.
-        num_labels:
-            The number of labels in the dataset.
-        _prompt_prefix (optional):
-            The prefix to use in the few-shot prompt. Defaults to the template for the
-            task and language.
-        _prompt_template (optional):
-            The template for the prompt to use when benchmarking the dataset using
-            few-shot evaluation. Defaults to the template for the task and language.
-        _instruction_prompt (optional):
-            The prompt to use when benchmarking the dataset using instruction-based
-            evaluation. Defaults to the template for the task and language.
-        _num_few_shot_examples (optional):
-            The number of examples to use when benchmarking the dataset using few-shot
-            evaluation. For a classification task, these will be drawn evenly from
-            each label. Defaults to the template for the task and language.
-        _max_generated_tokens (optional):
-            The maximum number of tokens to generate when benchmarking the dataset
-            using few-shot evaluation. Defaults to the template for the task and
-            language.
-        _labels (optional):
-            The labels in the dataset. Defaults to the template for the task and
-            language.
-        _prompt_label_mapping (optional):
-            A mapping from the labels to another phrase which is used as a substitute
-            for the label in few-shot evaluation. If "auto" then the mapping will be set
-            to a 1:1 mapping between the labels and themselves. If None then the mapping
-            will be set to the default mapping for the task and language. Defaults to
-            None.
-        _allowed_model_types (optional):
-            A list of model types that are allowed to be evaluated on this dataset.
-            Defaults to the one for the task.
-        _allowed_generative_types (optional):
-            A list of generative model types that are allowed to be evaluated on this
-            dataset. If None, all generative model types are allowed. Only relevant if
-            `allowed_model_types` includes generative models. Defaults to the one for
-            the task.
-        _allow_invalid_model_outputs (optional):
-            Whether to allow invalid model outputs. This is only relevant for
-            generative models on classification tasks, where the model may generate an
-            output which is not one of the allowed labels. If True, the model output
-            will be mapped to the closest valid label. If False, the model output will
-            be considered incorrect and the evaluation will be aborted. Defaults to
-            the one for the task.
-        _logging_string (optional):
-            The string used to describe evaluation on the dataset in logging. If not
-            provided, a default string will be generated, based on the pretty name. Only
-            use this if the default string is not suitable.
-        splits (optional):
-            The names of the splits in the dataset. If not provided, defaults to
-            ["train", "val", "test"].
-        bootstrap_samples (optional):
-            Whether to bootstrap the dataset samples. Defaults to True.
-        unofficial (optional):
-            Whether the dataset is unofficial. Defaults to False.
-    """
+        Args:
+            task:
+                The task of the dataset.
+            languages:
+                The ISO 639-1 language codes of the entries in the dataset.
+            name (optional):
+                The name of the dataset. Must be lower case with no spaces. Can be None
+                if and only if the dataset config resides directly in the Hugging Face
+                dataset repo. Defaults to None.
+            pretty_name (optional):
+                A longer prettier name for the dataset, which allows cases and spaces.
+                Used for logging. Can be None if and only if the dataset config resides
+                directly in the Hugging Face dataset repo. Defaults to None.
+            source (optional):
+                The source of the dataset, which can be a Hugging Face ID or a
+                dictionary with keys "train", "val" and "test" mapping to local CSV file
+                paths. Can be None if and only if the dataset config resides directly in
+                the Hugging Face dataset repo. Defaults to None.
+            prompt_prefix (optional):
+                The prefix to use in the few-shot prompt. Defaults to the template for
+                the task and language.
+            prompt_template (optional):
+                The template for the prompt to use when benchmarking the dataset using
+                few-shot evaluation. Defaults to the template for the task and language.
+            instruction_prompt (optional):
+                The prompt to use when benchmarking the dataset using instruction-based
+                evaluation. Defaults to the template for the task and language.
+            num_few_shot_examples (optional):
+                The number of examples to use when benchmarking the dataset using
+                few-shot evaluation. For a classification task, these will be drawn
+                evenly from each label. Defaults to the template for the task and
+                language.
+            max_generated_tokens (optional):
+                The maximum number of tokens to generate when benchmarking the dataset
+                using few-shot evaluation. Defaults to the template for the task and
+                language.
+            labels (optional):
+                The labels in the dataset. Defaults to the template for the task and
+                language.
+            prompt_label_mapping (optional):
+                A mapping from the labels to another phrase which is used as a
+                substitute for the label in few-shot evaluation. If "auto" then the
+                mapping will be set to a 1:1 mapping between the labels and themselves.
+                If None then the mapping will be set to the default mapping for the task
+                and language. Defaults to None.
+            allowed_model_types (optional):
+                A list of model types that are allowed to be evaluated on this dataset.
+                Defaults to the one for the task.
+            allowed_generative_types (optional):
+                A list of generative model types that are allowed to be evaluated on
+                this dataset. If None, all generative model types are allowed. Only
+                relevant if `allowed_model_types` includes generative models. Defaults
+                to the one for the task.
+            allow_invalid_model_outputs (optional):
+                Whether to allow invalid model outputs. This is only relevant for
+                generative models on classification tasks, where the model may generate
+                an output which is not one of the allowed labels. If True, the model
+                output will be mapped to the closest valid label. If False, the model
+                output will be considered incorrect and the evaluation will be aborted.
+                Defaults to the one for the task.
+            train_split (optional):
+                The name of the split to use as the training set. Can be None if there
+                is no training split in the dataset. Defaults to "train".
+            val_split (optional):
+                The name of the split to use as the validation set. Can be None if there
+                is no validation split in the dataset. Defaults to "val".
+            test_split (optional):
+                The name of the split to use as the test set. Defaults to "test".
+            bootstrap_samples (optional):
+                Whether to bootstrap the dataset samples. Defaults to True.
+            unofficial (optional):
+                Whether the dataset is unofficial. Defaults to False.
+            _prompt_prefix (optional):
+                This argument is deprecated. Please use `prompt_prefix` instead.
+            _prompt_template (optional):
+                This argument is deprecated. Please use `prompt_template` instead.
+            _instruction_prompt (optional):
+                This argument is deprecated. Please use `instruction_prompt` instead.
+            _num_few_shot_examples (optional):
+                This argument is deprecated. Please use `num_few_shot_examples` instead.
+            _max_generated_tokens (optional):
+                This argument is deprecated. Please use `max_generated_tokens` instead.
+            _labels (optional):
+                This argument is deprecated. Please use `labels` instead.
+            _prompt_label_mapping (optional):
+                This argument is deprecated. Please use `prompt_label_mapping` instead.
+            _allowed_model_types (optional):
+                This argument is deprecated. Please use `allowed_model_types` instead.
+            _allowed_generative_types (optional):
+                This argument is deprecated. Please use `allowed_generative_types`
+                instead.
+            _allow_invalid_model_outputs (optional):
+                This argument is deprecated. Please use `allow_invalid_model_outputs`
+                instead.
+            _logging_string (optional):
+                This argument is deprecated. Please use `logging_string` instead.
+        """
+        # Deprecation warnings
+        if _prompt_prefix is not None:
+            log_once(
+                "The `_prompt_prefix` argument is deprecated. Please use "
+                "`prompt_prefix` instead.",
+                level=logging.WARNING,
+            )
+            prompt_prefix = _prompt_prefix
+        if _prompt_template is not None:
+            log_once(
+                "The `_prompt_template` argument is deprecated. Please use "
+                "`prompt_template` instead.",
+                level=logging.WARNING,
+            )
+            prompt_template = _prompt_template
+        if _instruction_prompt is not None:
+            log_once(
+                "The `_instruction_prompt` argument is deprecated. Please use "
+                "`instruction_prompt` instead.",
+                level=logging.WARNING,
+            )
+            instruction_prompt = _instruction_prompt
+        if _num_few_shot_examples is not None:
+            log_once(
+                "The `_num_few_shot_examples` argument is deprecated. Please use "
+                "`num_few_shot_examples` instead.",
+                level=logging.WARNING,
+            )
+            num_few_shot_examples = _num_few_shot_examples
+        if _max_generated_tokens is not None:
+            log_once(
+                "The `_max_generated_tokens` argument is deprecated. Please use "
+                "`max_generated_tokens` instead.",
+                level=logging.WARNING,
+            )
+            max_generated_tokens = _max_generated_tokens
+        if _labels is not None:
+            log_once(
+                "The `_labels` argument is deprecated. Please use `labels` instead.",
+                level=logging.WARNING,
+            )
+            labels = _labels
+        if _prompt_label_mapping is not None:
+            log_once(
+                "The `_prompt_label_mapping` argument is deprecated. Please use "
+                "`prompt_label_mapping` instead.",
+                level=logging.WARNING,
+            )
+            prompt_label_mapping = _prompt_label_mapping
+        if _allowed_model_types is not None:
+            log_once(
+                "The `_allowed_model_types` argument is deprecated. Please use "
+                "`allowed_model_types` instead.",
+                level=logging.WARNING,
+            )
+            allowed_model_types = _allowed_model_types
+        if _allowed_generative_types is not None:
+            log_once(
+                "The `_allowed_generative_types` argument is deprecated. Please use "
+                "`allowed_generative_types` instead.",
+                level=logging.WARNING,
+            )
+            allowed_generative_types = _allowed_generative_types
+        if _allow_invalid_model_outputs is not None:
+            log_once(
+                "The `_allow_invalid_model_outputs` argument is deprecated. Please use "
+                "`allow_invalid_model_outputs` instead.",
+                level=logging.WARNING,
+            )
+            allow_invalid_model_outputs = _allow_invalid_model_outputs
+        if _logging_string is not None:
+            log_once(
+                "The `_logging_string` argument is deprecated and is not used anymore. "
+                "Using it will have no effect.",
+                level=logging.WARNING,
+            )
 
-    name: str
-    pretty_name: str
-    source: str | dict[str, str]
-    task: Task
-    languages: c.Sequence[Language]
-    _prompt_prefix: str | None = None
-    _prompt_template: str | None = None
-    _instruction_prompt: str | None = None
-    _num_few_shot_examples: int | None = None
-    _max_generated_tokens: int | None = None
-    _labels: c.Sequence[str] | None = None
-    _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
-    _allowed_model_types: c.Sequence[ModelType] | None = None
-    _allowed_generative_types: c.Sequence[GenerativeType] | None = None
-    _allow_invalid_model_outputs: bool | None = None
-    _logging_string: str | None = None
-    splits: c.Sequence[str] = field(default_factory=lambda: ["train", "val", "test"])
-    bootstrap_samples: bool = True
-    unofficial: bool = False
+        self._name = name
+        self._pretty_name = pretty_name
+        self._source = source
+        self.task = task
+        self.languages = languages
+
+        template = self.task.template_dict.get(self.main_language)
+        self.prompt_prefix = (
+            prompt_prefix
+            if prompt_prefix is not None
+            else template.default_prompt_prefix
+            if template is not None
+            else ""
+        )
+        self.prompt_template = (
+            prompt_template
+            if prompt_template is not None
+            else template.default_prompt_template
+            if template is not None
+            else ""
+        )
+        self.instruction_prompt = (
+            instruction_prompt
+            if instruction_prompt is not None
+            else template.default_instruction_prompt
+            if template is not None
+            else ""
+        )
+        self.num_few_shot_examples = (
+            num_few_shot_examples
+            if num_few_shot_examples is not None
+            else self.task.default_num_few_shot_examples
+        )
+        self.max_generated_tokens = (
+            max_generated_tokens
+            if max_generated_tokens is not None
+            else self.task.default_max_generated_tokens
+        )
+        self.labels = (
+            labels if labels is not None else self.task.default_labels or list()
+        )
+        if prompt_label_mapping is None:
+            prompt_label_mapping = (
+                template.default_prompt_label_mapping
+                if template is not None
+                else dict()
+            )
+        self.prompt_label_mapping = (
+            {label: label for label in self.labels}
+            if prompt_label_mapping == "auto"
+            else prompt_label_mapping
+        )
+        self.allowed_model_types = (
+            allowed_model_types
+            if allowed_model_types is not None
+            else self.task.default_allowed_model_types
+        )
+        self.allowed_generative_types = (
+            allowed_generative_types
+            if allowed_generative_types is not None
+            else self.task.default_allowed_generative_types
+        )
+        self.allow_invalid_model_outputs = (
+            allow_invalid_model_outputs
+            if allow_invalid_model_outputs is not None
+            else self.task.default_allow_invalid_model_outputs
+        )
+        self.train_split = train_split
+        self.val_split = val_split
+        self.test_split = test_split
+        self.bootstrap_samples = bootstrap_samples
+        self.unofficial = unofficial
 
     @property
-    def main_language(self) -> Language:
-        """Get the main language of the dataset.
+    def name(self) -> str:
+        """The name of the dataset.
 
         Returns:
-            The main language.
+            The name of the dataset.
         """
-        match len(self.languages):
-            case 0:
-                raise InvalidBenchmark(
-                    f"Dataset {self.name!r} must have at least one language."
-                )
-            case 1:
-                return self.languages[0]
-            case _:
-                if ENGLISH in self.languages:
-                    return ENGLISH
-                elif NORWEGIAN in self.languages:
-                    return NORWEGIAN
-                elif PORTUGUESE in self.languages:
-                    return PORTUGUESE
-                else:
-                    return self.languages[0]
+        if self._name is None:
+            raise ValueError("The name of the dataset is not set!")
+        return self._name
+
+    @name.setter
+    def name(self, value: str) -> None:
+        """Set the name of the dataset.
+
+        Args:
+            value:
+                The new name of the dataset.
+        """
+        self._name = value
+
+    @property
+    def pretty_name(self) -> str:
+        """The pretty name of the dataset.
+
+        Returns:
+            The pretty name of the dataset.
+        """
+        if self._pretty_name is None:
+            raise ValueError("The pretty name of the dataset is not set!")
+        return self._pretty_name
+
+    @pretty_name.setter
+    def pretty_name(self, value: str) -> None:
+        """Set the pretty name of the dataset.
+
+        Args:
+            value:
+                The new pretty name of the dataset.
+        """
+        self._pretty_name = value
+
+    @property
+    def source(self) -> str | dict[str, str]:
+        """The source of the dataset.
+
+        Returns:
+            The source of the dataset.
+        """
+        if self._source is None:
+            raise ValueError("The source of the dataset is not set!")
+        return self._source
+
+    @source.setter
+    def source(self, value: str | dict[str, str]) -> None:
+        """Set the source of the dataset.
+
+        Args:
+            value:
+                The new source of the dataset.
+        """
+        self._source = value
 
     @property
     def logging_string(self) -> str:
-        """The string used to describe evaluation on the dataset in logging."""
-        if self._logging_string is not None:
-            return self._logging_string
+        """The string used to describe evaluation on the dataset in logging.
 
+        Returns:
+            The logging string.
+        """
         truncated_str = (
             "truncated version of the "
             if isinstance(self.source, str) and self.source.endswith("-mini")
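Note: this hunk replaces the `@dataclass`-based `DatasetConfig`, whose underscore-prefixed fields were resolved lazily through properties, with an explicit `__init__` whose public keyword arguments are resolved eagerly against the task and language templates. A hedged construction sketch; the `SENT` task and `DANISH` language constants are assumptions for illustration, not verified exports:

from scandeval.data_models import DatasetConfig
from scandeval.languages import DANISH  # assumed language constant
from scandeval.tasks import SENT  # hypothetical sentiment task constant

config = DatasetConfig(
    task=SENT,
    languages=[DANISH],
    name="my-sentiment-dataset",
    pretty_name="My Sentiment Dataset",
    source={"train": "train.csv", "val": "val.csv", "test": "test.csv"},
    num_few_shot_examples=12,  # overrides the task default
)

# Deprecated underscore-prefixed keywords are still accepted: each one logs a
# one-time warning via log_once and is forwarded to its replacement argument.
legacy = DatasetConfig(
    task=SENT,
    languages=[DANISH],
    name="legacy-dataset",
    pretty_name="Legacy Dataset",
    source="some-org/some-hf-id",
    _labels=["negative", "positive"],
)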
@@ -293,126 +543,48 @@ class DatasetConfig:
         if PORTUGUESE in self.languages and EUROPEAN_PORTUGUESE in self.languages:
             logging_languages.remove(EUROPEAN_PORTUGUESE)
 
-        if len(logging_languages) > 1:
+        if len(logging_languages) > MAX_NUMBER_OF_LOGGING_LANGUAGES:
+            languages_str = ""
+        elif len(logging_languages) > 1:
             languages_str = (
                 ", ".join([lang.name for lang in logging_languages[:-1]])
                 + f" and {logging_languages[-1].name}"
+                + " "
             )
         else:
-            languages_str = logging_languages[0].name
+            languages_str = logging_languages[0].name + " "
 
         task_str = self.task.name.replace("-", " ")
         dataset_name_str = (
             self.pretty_name or self.name.replace("-", " ").replace("_", " ").title()
         )
         return (
-            f"the {truncated_str}{languages_str} {task_str} dataset {dataset_name_str}"
-        )
-
-    @property
-    def prompt_prefix(self) -> str:
-        """The prefix to use in the few-shot prompt."""
-        prompt_config = self.task.template_dict[self.main_language]
-        prompt_prefix = (
-            prompt_config.default_prompt_prefix
-            if self._prompt_prefix is None
-            else self._prompt_prefix
-        )
-        return prompt_prefix
-
-    @property
-    def prompt_template(self) -> str:
-        """The template used during few-shot evaluation."""
-        prompt_config = self.task.template_dict[self.main_language]
-        prompt_template = (
-            prompt_config.default_prompt_template
-            if self._prompt_template is None
-            else self._prompt_template
-        )
-        return prompt_template
-
-    @property
-    def instruction_prompt(self) -> str:
-        """The prompt to use when evaluating instruction-tuned models."""
-        prompt_config = self.task.template_dict[self.main_language]
-        instruction_prompt = (
-            prompt_config.default_instruction_prompt
-            if self._instruction_prompt is None
-            else self._instruction_prompt
-        )
-        return instruction_prompt
-
-    @property
-    def num_few_shot_examples(self) -> int:
-        """The number of few-shot examples to use."""
-        return (
-            self._num_few_shot_examples
-            if self._num_few_shot_examples is not None
-            else self.task.default_num_few_shot_examples
-        )
-
-    @property
-    def max_generated_tokens(self) -> int:
-        """The maximum number of tokens to generate when evaluating a model."""
-        return (
-            self._max_generated_tokens
-            if self._max_generated_tokens is not None
-            else self.task.default_max_generated_tokens
-        )
-
-    @property
-    def labels(self) -> c.Sequence[str]:
-        """The labels in the dataset."""
-        if self._labels is not None:
-            return self._labels
-        elif self.task.default_labels is not None:
-            return self.task.default_labels
-        else:
-            raise ValueError(
-                f"Labels must be specified for dataset {self.name!r} with the "
-                f"attribute `_labels`, as the task {self.task.name!r} does not have "
-                "default labels."
-            )
-
-    @property
-    def prompt_label_mapping(self) -> dict[str, str]:
-        """Mapping from English labels to localised labels."""
-        if self._prompt_label_mapping == "auto":
-            return {label: label for label in self.labels}
-        elif self._prompt_label_mapping is not None:
-            return self._prompt_label_mapping
-        prompt_config = self.task.template_dict[self.main_language]
-        if prompt_config.default_prompt_label_mapping == "auto":
-            return {label: label for label in self.labels}
-        else:
-            return prompt_config.default_prompt_label_mapping
-
-    @property
-    def allowed_model_types(self) -> c.Sequence[ModelType]:
-        """A list of model types that are allowed to be evaluated on this dataset."""
-        return (
-            self._allowed_model_types
-            if self._allowed_model_types is not None
-            else self.task.default_allowed_model_types
+            f"the {truncated_str}{languages_str}{task_str} dataset {dataset_name_str}"
         )
 
     @property
-    def allowed_generative_types(self) -> c.Sequence[GenerativeType]:
-        """A list of generative model types that are allowed on this dataset."""
-        return (
-            self._allowed_generative_types
-            if self._allowed_generative_types is not None
-            else self.task.default_allowed_generative_types
-        )
+    def main_language(self) -> Language:
+        """Get the main language of the dataset.
 
-    @property
-    def allow_invalid_model_outputs(self) -> bool:
-        """Whether to allow invalid model outputs."""
-        return (
-            self._allow_invalid_model_outputs
-            if self._allow_invalid_model_outputs is not None
-            else self.task.default_allow_invalid_model_outputs
-        )
+        Returns:
+            The main language.
+        """
+        match len(self.languages):
+            case 0:
+                raise InvalidBenchmark(
+                    f"Dataset {self.name!r} must have at least one language."
+                )
+            case 1:
+                return self.languages[0]
+            case _:
+                if ENGLISH in self.languages:
+                    return ENGLISH
+                elif NORWEGIAN in self.languages:
+                    return NORWEGIAN
+                elif PORTUGUESE in self.languages:
+                    return PORTUGUESE
+                else:
+                    return self.languages[0]
 
     @property
     def id2label(self) -> "HashableDict":
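Note: `name`, `pretty_name` and `source` are now lazy properties backed by private fields, so a config can be constructed without them (e.g. when the config resides directly in the Hugging Face dataset repo) and have them filled in later. A small behavioural sketch, using the same assumed constants as the previous example:

from scandeval.data_models import DatasetConfig
from scandeval.languages import DANISH  # assumed language constant
from scandeval.tasks import SENT  # hypothetical task constant

config = DatasetConfig(task=SENT, languages=[DANISH])
try:
    config.name  # not set yet, so the property raises
except ValueError as err:
    print(err)  # "The name of the dataset is not set!"

config.name = "filled-in-later"  # the setter stores the value
assert config.name == "filled-in-later"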
@@ -517,6 +689,9 @@ class BenchmarkConfig:
             faster evaluation, but at the risk of running out of GPU memory. Only reduce
             this if you are running out of GPU memory. Only relevant if the model is
             generative.
+        attention_backend:
+            The attention backend to use for vLLM. Defaults to FLASHINFER. Only
+            relevant if the model is generative.
         requires_safetensors:
             Whether to only allow models that use the safetensors format.
         generative_type:
@@ -553,6 +728,9 @@ class BenchmarkConfig:
     few_shot: bool
     num_iterations: int
     gpu_memory_utilization: float
+    attention_backend: t.Literal[
+        *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+    ]
     requires_safetensors: bool
     generative_type: GenerativeType | None
     download_only: bool
@@ -601,6 +779,9 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     requires_safetensors: bool
     download_only: bool
     gpu_memory_utilization: float
+    attention_backend: t.Literal[
+        *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+    ]
     generative_type: GenerativeType | None
     custom_datasets_file: Path
     force: bool
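Note: both `BenchmarkConfig` and `BenchmarkConfigParams` type the new `attention_backend` field by unpacking the `ATTENTION_BACKENDS` constant into `typing.Literal`, a pattern that requires Python 3.11+ and, as the inline comment shows, a type-checker suppression. A self-contained sketch of the pattern, with made-up backend names rather than the real `ATTENTION_BACKENDS` tuple:

import typing as t

# Stand-in for scandeval.constants.ATTENTION_BACKENDS; the real values may differ.
ATTENTION_BACKENDS = ("FLASHINFER", "FLASH_ATTN", "TORCH_SDPA")

# Star-unpacking inside a subscript (Python 3.11+) expands the tuple's members
# into the allowed Literal values.
AttentionBackend = t.Literal[*ATTENTION_BACKENDS]

def validate_backend(backend: str) -> str:
    # Runtime check mirroring what the static Literal type expresses.
    if backend not in ATTENTION_BACKENDS:
        raise ValueError(f"Unknown attention backend: {backend!r}")
    return backend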