EuroEval 15.4.2__py3-none-any.whl → 15.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (54)
  1. euroeval/__init__.py +2 -2
  2. euroeval/benchmark_modules/base.py +3 -2
  3. euroeval/benchmark_modules/fresh.py +8 -6
  4. euroeval/benchmark_modules/hf.py +44 -33
  5. euroeval/benchmark_modules/litellm.py +314 -120
  6. euroeval/benchmark_modules/vllm.py +99 -59
  7. euroeval/benchmarker.py +52 -21
  8. euroeval/callbacks.py +2 -2
  9. euroeval/constants.py +9 -2
  10. euroeval/data_models.py +258 -44
  11. euroeval/dataset_configs/__init__.py +61 -0
  12. euroeval/dataset_configs/danish.py +120 -0
  13. euroeval/dataset_configs/dutch.py +123 -0
  14. euroeval/dataset_configs/english.py +88 -0
  15. euroeval/dataset_configs/faroese.py +53 -0
  16. euroeval/dataset_configs/french.py +83 -0
  17. euroeval/dataset_configs/german.py +91 -0
  18. euroeval/dataset_configs/icelandic.py +148 -0
  19. euroeval/dataset_configs/italian.py +81 -0
  20. euroeval/dataset_configs/norwegian.py +178 -0
  21. euroeval/dataset_configs/spanish.py +78 -0
  22. euroeval/dataset_configs/swedish.py +100 -0
  23. euroeval/exceptions.py +10 -10
  24. euroeval/finetuning.py +6 -10
  25. euroeval/generation.py +1 -0
  26. euroeval/human_evaluation.py +2 -2
  27. euroeval/languages.py +20 -13
  28. euroeval/model_cache.py +1 -1
  29. euroeval/model_loading.py +1 -12
  30. euroeval/prompt_templates/__init__.py +8 -0
  31. euroeval/prompt_templates/linguistic_acceptability.py +112 -0
  32. euroeval/prompt_templates/multiple_choice.py +97 -0
  33. euroeval/prompt_templates/named_entity_recognition.py +257 -0
  34. euroeval/prompt_templates/reading_comprehension.py +118 -0
  35. euroeval/prompt_templates/sentiment_classification.py +137 -0
  36. euroeval/prompt_templates/summarization.py +97 -0
  37. euroeval/speed_benchmark.py +1 -1
  38. euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
  39. euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
  40. euroeval/{task_utils → task_group_utils}/sequence_classification.py +45 -10
  41. euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
  42. euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
  43. euroeval/tasks.py +54 -0
  44. euroeval/tokenization_utils.py +343 -0
  45. euroeval/types.py +3 -1
  46. euroeval/utils.py +5 -254
  47. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/METADATA +31 -9
  48. euroeval-15.6.0.dist-info/RECORD +59 -0
  49. euroeval/dataset_configs.py +0 -2408
  50. euroeval-15.4.2.dist-info/RECORD +0 -40
  51. /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
  52. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
  53. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
  54. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
euroeval/data_models.py CHANGED
@@ -10,10 +10,9 @@ from dataclasses import dataclass, field
  import pydantic
  import torch

- from euroeval.utils import get_package_version
-
  from .enums import Device, InferenceBackend, ModelType, TaskGroup
  from .types import ScoreDict
+ from .utils import get_package_version


  @dataclass
@@ -55,44 +54,107 @@ class MetricConfig:


  @dataclass
- class Task:
-     """A dataset task.
+ class Language:
+     """A benchmarkable language.

      Attributes:
+         code:
+             The ISO 639-1 language code of the language.
          name:
-             The name of the task.
-         task_group:
-             The task group of the task.
-         metrics:
-             The metrics used to evaluate the task.
+             The name of the language.
+         and_separator (optional):
+             The word 'and' in the language.
+         or_separator (optional):
+             The word 'or' in the language.
      """

+     code: str
      name: str
-     task_group: TaskGroup
-     metrics: list[MetricConfig]
+     _and_separator: str | None = field(repr=False, default=None)
+     _or_separator: str | None = field(repr=False, default=None)

      def __hash__(self) -> int:
-         """Return a hash of the task."""
-         return hash(self.name)
+         """Return a hash of the language."""
+         return hash(self.code)
+
+     @property
+     def and_separator(self) -> str:
+         """Get the word 'and' in the language.
+
+         Returns:
+             The word 'and' in the language.
+
+         Raises:
+             NotImplementedError:
+                 If `and_separator` is `None`.
+         """
+         if not self._and_separator:
+             raise NotImplementedError(
+                 f"Separator for the word 'and' has not been defined for {self.name}."
+             )
+         return self._and_separator
+
+     @and_separator.setter
+     def and_separator(self, value: str | None) -> None:
+         self._and_separator = value
+
+     @property
+     def or_separator(self) -> str:
+         """Get the word 'or' in the language.
+
+         Returns:
+             The word 'or' in the language.
+
+         Raises:
+             NotImplementedError:
+                 If `or_separator` is `None`.
+         """
+         if not self._or_separator:
+             raise NotImplementedError(
+                 f"Separator for the word 'or' has not been defined for {self.name}."
+             )
+         return self._or_separator
+
+     @or_separator.setter
+     def or_separator(self, value: str | None) -> None:
+         self._or_separator = value


  @dataclass
- class Language:
-     """A benchmarkable language.
+ class Task:
+     """A dataset task.

      Attributes:
-         code:
-             The ISO 639-1 language code of the language.
          name:
-             The name of the language.
+             The name of the task.
+         task_group:
+             The task group of the task.
+         template_dict:
+             The template dictionary for the task, from language to prompt template.
+         metrics:
+             The metrics used to evaluate the task.
+         default_num_few_shot_examples:
+             The default number of examples to use when benchmarking the task using
+             few-shot evaluation. For a classification task, these will be drawn evenly
+             from each label.
+         default_max_generated_tokens:
+             The default maximum number of tokens to generate when benchmarking the task
+             using few-shot evaluation.
+         default_labels:
+             The default labels for datasets using this task.
      """

-     code: str
      name: str
+     task_group: TaskGroup
+     template_dict: dict["Language", "PromptConfig"]
+     metrics: list[MetricConfig]
+     default_num_few_shot_examples: int
+     default_max_generated_tokens: int
+     default_labels: list[str]

      def __hash__(self) -> int:
-         """Return a hash of the language."""
-         return hash(self.code)
+         """Return a hash of the task."""
+         return hash(self.name)


  @dataclass
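
Note: `Language` now carries lazy `and_separator`/`or_separator` accessors that fail loudly when a language has no separator defined. A minimal standalone sketch of that behaviour (the `DA`/`FO` values here are illustrative, not the package's own definitions):

```python
from dataclasses import dataclass, field


@dataclass
class Language:
    """Stripped-down copy of the class above, getter only."""

    code: str
    name: str
    _and_separator: str | None = field(repr=False, default=None)

    @property
    def and_separator(self) -> str:
        # Mirrors the diff: an unset separator raises instead of returning None.
        if not self._and_separator:
            raise NotImplementedError(
                f"Separator for the word 'and' has not been defined for {self.name}."
            )
        return self._and_separator


DA = Language(code="da", name="Danish", _and_separator="og")
print(DA.and_separator)  # -> og

FO = Language(code="fo", name="Faroese")
try:
    FO.and_separator
except NotImplementedError as exc:
    print(exc)  # -> Separator for the word 'and' has not been defined for Faroese.
```
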
@@ -304,26 +366,30 @@ class DatasetConfig:
              The mapping from label to ID.
          num_labels:
              The number of labels in the dataset.
-         prompt_template:
+         _prompt_prefix (optional):
+             The prefix to use in the few-shot prompt. Defaults to the template for the
+             task and language.
+         _prompt_template (optional):
              The template for the prompt to use when benchmarking the dataset using
-             few-shot evaluation.
-         max_generated_tokens:
-             The maximum number of tokens to generate when benchmarking the dataset
-             using few-shot evaluation.
-         prompt_prefix:
-             The prefix to use in the few-shot prompt.
-         num_few_shot_examples:
+             few-shot evaluation. Defaults to the template for the task and language.
+         _instruction_prompt (optional):
+             The prompt to use when benchmarking the dataset using instruction-based
+             evaluation. Defaults to the template for the task and language.
+         _num_few_shot_examples (optional):
              The number of examples to use when benchmarking the dataset using few-shot
              evaluation. For a classification task, these will be drawn evenly from
-             each label.
-         instruction_prompt:
-             The prompt to use when benchmarking the dataset using instruction-based
-             evaluation.
-         labels (optional):
-             The labels in the dataset. Defaults to an empty list.
-         prompt_label_mapping (optional):
+             each label. Defaults to the template for the task and language.
+         _max_generated_tokens (optional):
+             The maximum number of tokens to generate when benchmarking the dataset
+             using few-shot evaluation. Defaults to the template for the task and
+             language.
+         _labels (optional):
+             The labels in the dataset. Defaults to the template for the task and
+             language.
+         _prompt_label_mapping (optional):
              A mapping from the labels to another phrase which is used as a substitute
-             for the label in few-shot evaluation. Defaults to an empty dictionary.
+             for the label in few-shot evaluation. Defaults to the template for the task
+             and language.
          unofficial (optional):
              Whether the dataset is unofficial. Defaults to False.
      """
@@ -333,15 +399,93 @@ class DatasetConfig:
      huggingface_id: str
      task: Task
      languages: list[Language]
-     prompt_template: str
-     max_generated_tokens: int
-     prompt_prefix: str
-     num_few_shot_examples: int
-     instruction_prompt: str
-     labels: list[str] = field(default_factory=list)
-     prompt_label_mapping: dict[str, str] = field(default_factory=dict)
+     _prompt_prefix: str | None = None
+     _prompt_template: str | None = None
+     _instruction_prompt: str | None = None
+     _num_few_shot_examples: int | None = None
+     _max_generated_tokens: int | None = None
+     _labels: list[str] | None = None
+     _prompt_label_mapping: dict[str, str] | None = None
      unofficial: bool = False

+     @property
+     def prompt_prefix(self) -> str:
+         """The prefix to use in the few-shot prompt."""
+         main_language = self.languages[0]
+         prompt_config = self.task.template_dict[main_language]
+         prompt_prefix = (
+             prompt_config.default_prompt_prefix
+             if self._prompt_prefix is None
+             else self._prompt_prefix
+         )
+         prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
+         return prompt_prefix
+
+     @property
+     def prompt_template(self) -> str:
+         """The template used during few-shot evaluation."""
+         main_language = self.languages[0]
+         prompt_config = self.task.template_dict[main_language]
+         prompt_template = (
+             prompt_config.default_prompt_template
+             if self._prompt_template is None
+             else self._prompt_template
+         )
+         prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
+         return prompt_template
+
+     @property
+     def instruction_prompt(self) -> str:
+         """The prompt to use when evaluating instruction-tuned models."""
+         main_language = self.languages[0]
+         prompt_config = self.task.template_dict[main_language]
+         instruction_prompt = (
+             prompt_config.default_instruction_prompt
+             if self._instruction_prompt is None
+             else self._instruction_prompt
+         )
+         instruction_prompt = instruction_prompt.replace(
+             "{labels_str}", self._labels_str
+         )
+         return instruction_prompt
+
+     @property
+     def num_few_shot_examples(self) -> int:
+         """The number of few-shot examples to use."""
+         return (
+             self._num_few_shot_examples
+             if self._num_few_shot_examples is not None
+             else self.task.default_num_few_shot_examples
+         )
+
+     @property
+     def max_generated_tokens(self) -> int:
+         """The maximum number of tokens to generate when evaluating a model."""
+         return (
+             self._max_generated_tokens
+             if self._max_generated_tokens is not None
+             else self.task.default_max_generated_tokens
+         )
+
+     @property
+     def labels(self) -> list[str]:
+         """The labels in the dataset."""
+         return self._labels if self._labels is not None else self.task.default_labels
+
+     @property
+     def prompt_label_mapping(self) -> dict[str, str]:
+         """Mapping from English labels to localised labels."""
+         if self._prompt_label_mapping is not None:
+             return self._prompt_label_mapping
+
+         main_language = self.languages[0]
+         prompt_config = self.task.template_dict[main_language]
+
+         if prompt_config.default_prompt_label_mapping == "auto":
+             return {label: label for label in self.labels}
+         else:
+             return prompt_config.default_prompt_label_mapping
+
      @property
      def id2label(self) -> dict[int, str]:
          """The mapping from ID to label."""
@@ -361,6 +505,48 @@ class DatasetConfig:
          """Return a hash of the dataset configuration."""
          return hash(self.name)

+     @property
+     def _labels_str(self) -> str:
+         """Converts a set of labels to a natural string, in the specified language.
+
+         If the task is NER, we separate using 'and' and use the mapped labels instead of
+         the BIO NER labels.
+
+         Args:
+             language: The language to be used when converting the labels.
+
+         Returns:
+             The natural string representation of the labels in specified language.
+
+         Raises:
+             NotImplementedError:
+                 If `and_separator` or `or_separator` are `None`, see `Language`.
+
+         Example:
+             >>> get_labels_str(language=DA)
+             "'a', 'b', 'c' eller 'd'"
+         """
+         main_language = self.languages[0]
+
+         if self.task.task_group == TaskGroup.TOKEN_CLASSIFICATION:
+             sep_word = main_language.and_separator
+         else:
+             sep_word = main_language.or_separator
+
+         # Convert labels to single-quoted labels - and remove duplicates
+         quoted_labels = [
+             f"'{label}'" for label in set(self.prompt_label_mapping.values())
+         ]
+
+         if not quoted_labels:
+             return ""
+         elif len(quoted_labels) == 1:
+             return quoted_labels[0]
+         elif len(quoted_labels) == 2:
+             return f"{quoted_labels[0]} {sep_word} {quoted_labels[1]}"
+         else:
+             return f"{', '.join(quoted_labels[:-1])} {sep_word} {quoted_labels[-1]}"
+

  @dataclass
  class ModelConfig:
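
Note: `_labels_str` reduces to a quote-and-join over the mapped labels, using the language's 'and' word for NER and its 'or' word otherwise. A standalone sketch of just the joining step (the real code deduplicates via `set()`, so its ordering is not guaranteed; `dict.fromkeys` is used here to keep the demo deterministic):

```python
def labels_str(labels: list[str], sep_word: str) -> str:
    # Quote each label and drop duplicates while preserving order.
    quoted = [f"'{label}'" for label in dict.fromkeys(labels)]
    if not quoted:
        return ""
    if len(quoted) == 1:
        return quoted[0]
    # Two or more labels: comma-separate all but the last, then the separator word.
    return f"{', '.join(quoted[:-1])} {sep_word} {quoted[-1]}"


print(labels_str(["a", "b", "c", "d"], sep_word="eller"))
# -> 'a', 'b', 'c' eller 'd'   (matches the docstring example above)
```
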
@@ -477,3 +663,31 @@ class HFModelInfo:
      pipeline_tag: str
      tags: list[str]
      adapter_base_model_id: str | None
+
+
+ @dataclass
+ class PromptConfig:
+     """Configuration for task-specific prompting across languages.
+
+     Defines the prompt templates needed for evaluating a specific task in a given
+     language.
+
+     Attributes:
+         default_prompt_prefix:
+             The default prefix to use in the few-shot prompt.
+         default_prompt_template:
+             The default template for the prompt to use when benchmarking the dataset
+             using few-shot evaluation.
+         default_instruction_prompt:
+             The default prompt to use when benchmarking the dataset using
+             instruction-based evaluation.
+         default_prompt_label_mapping:
+             The default mapping from the labels to another phrase which is used as a
+             substitute for the label in few-shot evaluation. If set to "auto", the
+             mapping will be set to a 1:1 mapping between the labels and themselves.
+     """
+
+     default_prompt_prefix: str
+     default_prompt_template: str
+     default_instruction_prompt: str
+     default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]
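
Note: the new `PromptConfig` is what `Task.template_dict` maps each language to. A sketch of how the `"auto"` sentinel resolves inside `DatasetConfig.prompt_label_mapping` (the template strings below are invented placeholders):

```python
import typing as t
from dataclasses import dataclass


@dataclass
class PromptConfig:
    default_prompt_prefix: str
    default_prompt_template: str
    default_instruction_prompt: str
    default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]


config = PromptConfig(
    default_prompt_prefix="Documents and their labels: {labels_str}.",
    default_prompt_template="Document: {text}\nLabel: {label}",
    default_instruction_prompt="Classify the document as {labels_str}.",
    default_prompt_label_mapping="auto",
)

labels = ["positive", "negative"]
mapping = (
    {label: label for label in labels}  # "auto": identity mapping over the labels
    if config.default_prompt_label_mapping == "auto"
    else config.default_prompt_label_mapping
)
print(mapping)  # -> {'positive': 'positive', 'negative': 'negative'}
```
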
euroeval/dataset_configs/__init__.py ADDED
@@ -0,0 +1,61 @@
+ """All dataset configurations used in EuroEval."""
+
+ from ..data_models import DatasetConfig
+ from ..languages import get_all_languages
+ from ..tasks import SPEED
+ from .danish import *  # noqa: F403
+ from .dutch import *  # noqa: F403
+ from .english import *  # noqa: F403
+ from .faroese import *  # noqa: F403
+ from .french import *  # noqa: F403
+ from .german import *  # noqa: F403
+ from .icelandic import *  # noqa: F403
+ from .italian import *  # noqa: F403
+ from .norwegian import *  # noqa: F403
+ from .spanish import *  # noqa: F403
+ from .swedish import *  # noqa: F403
+
+
+ def get_all_dataset_configs() -> dict[str, DatasetConfig]:
+     """Get a mapping of all the dataset configurations.
+
+     Returns:
+         A mapping between names of datasets and their configurations.
+     """
+     dataset_configs = [
+         cfg for cfg in globals().values() if isinstance(cfg, DatasetConfig)
+     ]
+     assert len(dataset_configs) == len({cfg.name for cfg in dataset_configs}), (
+         "There are duplicate dataset configurations. Please ensure that each dataset "
+         "has a unique name."
+     )
+     return {cfg.name: cfg for cfg in dataset_configs}
+
+
+ def get_dataset_config(dataset_name: str) -> DatasetConfig:
+     """Get the dataset configuration for a dataset.
+
+     Args:
+         dataset_name:
+             The name of the dataset.
+
+     Returns:
+         The dataset configuration.
+
+     Raises:
+         ValueError:
+             If the dataset is not found.
+     """
+     dataset_configs = get_all_dataset_configs()
+     if dataset_name not in dataset_configs:
+         raise ValueError(f"No dataset config found for dataset {dataset_name}.")
+     return dataset_configs[dataset_name]
+
+
+ SPEED_CONFIG = DatasetConfig(
+     name="speed",
+     pretty_name="the speed estimation benchmark",
+     huggingface_id="",
+     task=SPEED,
+     languages=list(get_all_languages().values()),
+ )
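
Note: this package replaces the removed 2,408-line `euroeval/dataset_configs.py`, collecting every `DatasetConfig` constant from the per-language modules via `globals()`. Expected usage, assuming euroeval 15.6.0 is installed:

```python
from euroeval.dataset_configs import get_all_dataset_configs, get_dataset_config

configs = get_all_dataset_configs()  # one entry per *_CONFIG constant
print(len(configs))

cfg = get_dataset_config("angry-tweets")  # defined in dataset_configs/danish.py
print(cfg.pretty_name)

# get_dataset_config("does-not-exist")  # would raise ValueError
```
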
euroeval/dataset_configs/danish.py ADDED
@@ -0,0 +1,120 @@
+ """All Danish dataset configurations used in EuroEval."""
+
+ from ..data_models import DatasetConfig
+ from ..languages import DA
+ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+
+ ### Official datasets ###
+
+ ANGRY_TWEETS_CONFIG = DatasetConfig(
+     name="angry-tweets",
+     pretty_name="the truncated version of the Danish sentiment classification "
+     "dataset AngryTweets",
+     huggingface_id="EuroEval/angry-tweets-mini",
+     task=SENT,
+     languages=[DA],
+ )
+
+ SCALA_DA_CONFIG = DatasetConfig(
+     name="scala-da",
+     pretty_name="the Danish part of the linguistic acceptability dataset ScaLA",
+     huggingface_id="EuroEval/scala-da",
+     task=LA,
+     languages=[DA],
+ )
+
+ DANSK_CONFIG = DatasetConfig(
+     name="dansk",
+     pretty_name="the truncated version of the Danish named entity recognition "
+     "dataset DANSK",
+     huggingface_id="EuroEval/dansk-mini",
+     task=NER,
+     languages=[DA],
+ )
+
+ SCANDIQA_DA_CONFIG = DatasetConfig(
+     name="scandiqa-da",
+     pretty_name="the Danish part of the truncated version of the question answering "
+     "dataset ScandiQA",
+     huggingface_id="EuroEval/scandiqa-da-mini",
+     task=RC,
+     languages=[DA],
+ )
+
+ NORDJYLLAND_NEWS_CONFIG = DatasetConfig(
+     name="nordjylland-news",
+     pretty_name="the truncated version of the Danish summarisation dataset "
+     "Nordjylland News",
+     huggingface_id="EuroEval/nordjylland-news-mini",
+     task=SUMM,
+     languages=[DA],
+ )
+
+ DANSKE_TALEMAADER_CONFIG = DatasetConfig(
+     name="danske-talemaader",
+     pretty_name="the truncated version of the Danish knowledge dataset Danske "
+     "Talemåder",
+     huggingface_id="EuroEval/danske-talemaader",
+     task=KNOW,
+     languages=[DA],
+ )
+
+ DANISH_CITIZEN_TESTS_CONFIG = DatasetConfig(
+     name="danish-citizen-tests",
+     pretty_name="the Danish knowledge dataset Danish Citizen Tests",
+     huggingface_id="EuroEval/danish-citizen-tests-updated",
+     task=KNOW,
+     languages=[DA],
+ )
+
+ HELLASWAG_DA_CONFIG = DatasetConfig(
+     name="hellaswag-da",
+     pretty_name="the truncated version of the Danish common-sense reasoning dataset "
+     "HellaSwag-da, translated from the English HellaSwag dataset",
+     huggingface_id="EuroEval/hellaswag-da-mini",
+     task=COMMON_SENSE,
+     languages=[DA],
+ )
+
+
+ ### Unofficial datasets ###
+
+ DANE_CONFIG = DatasetConfig(
+     name="dane",
+     pretty_name="the truncated version of the Danish named entity recognition "
+     "dataset DaNE",
+     huggingface_id="EuroEval/dane-mini",
+     task=NER,
+     languages=[DA],
+     unofficial=True,
+ )
+
+ MMLU_DA_CONFIG = DatasetConfig(
+     name="mmlu-da",
+     pretty_name="the truncated version of the Danish knowledge dataset MMLU-da, "
+     "translated from the English MMLU dataset",
+     huggingface_id="EuroEval/mmlu-da-mini",
+     task=KNOW,
+     languages=[DA],
+     unofficial=True,
+ )
+
+ ARC_DA_CONFIG = DatasetConfig(
+     name="arc-da",
+     pretty_name="the truncated version of the Danish knowledge dataset ARC-da, "
+     "translated from the English ARC dataset",
+     huggingface_id="EuroEval/arc-da-mini",
+     task=KNOW,
+     languages=[DA],
+     unofficial=True,
+ )
+
+ BELEBELE_DA_CONFIG = DatasetConfig(
+     name="belebele-da",
+     pretty_name="the Danish multiple choice reading comprehension dataset BeleBele-da, "
+     "translated from the English BeleBele dataset",
+     huggingface_id="EuroEval/belebele-da-mini",
+     task=MCRC,
+     languages=[DA],
+     unofficial=True,
+ )
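
Note: none of these Danish configs set any prompt fields; everything resolves through the `SENT`/`NER`/etc. task templates for `DA`. A hedged sketch of listing the official Danish benchmark from the registry (assuming the package is installed; the printed names are examples, not an exhaustive list):

```python
from euroeval.dataset_configs import get_all_dataset_configs

official_danish = {
    name: cfg
    for name, cfg in get_all_dataset_configs().items()
    if not cfg.unofficial and any(lang.code == "da" for lang in cfg.languages)
}
print(sorted(official_danish))  # includes 'angry-tweets', 'dansk', 'scala-da', ...
```
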
euroeval/dataset_configs/dutch.py ADDED
@@ -0,0 +1,123 @@
+ """All Dutch dataset configurations used in EuroEval."""
+
+ from ..data_models import DatasetConfig
+ from ..languages import NL
+ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+
+ ### Official datasets ###
+
+ DUTCH_SOCIAL_CONFIG = DatasetConfig(
+     name="dutch-social",
+     pretty_name="the truncated version of the Dutch sentiment classification "
+     "dataset Dutch Social",
+     huggingface_id="EuroEval/dutch-social-mini",
+     task=SENT,
+     languages=[NL],
+ )
+
+ SCALA_NL_CONFIG = DatasetConfig(
+     name="scala-nl",
+     pretty_name="the Dutch part of the linguistic acceptability dataset ScaLA",
+     huggingface_id="EuroEval/scala-nl",
+     task=LA,
+     languages=[NL],
+ )
+
+ CONLL_NL_CONFIG = DatasetConfig(
+     name="conll-nl",
+     pretty_name="the Dutch part of the truncated version of the named entity "
+     "recognition dataset CoNLL 2002",
+     huggingface_id="EuroEval/conll-nl-mini",
+     task=NER,
+     languages=[NL],
+ )
+
+ SQUAD_NL_CONFIG = DatasetConfig(
+     name="squad-nl",
+     pretty_name="the truncated version of the Dutch reading comprehension dataset "
+     "SQuAD-nl, translated from the English SQuAD dataset",
+     huggingface_id="EuroEval/squad-nl-v2-mini",
+     task=RC,
+     languages=[NL],
+ )
+
+ WIKI_LINGUA_NL_CONFIG = DatasetConfig(
+     name="wiki-lingua-nl",
+     pretty_name="the Dutch part of the truncated version of the summarisation dataset "
+     "WikiLingua",
+     huggingface_id="EuroEval/wiki-lingua-nl-mini",
+     task=SUMM,
+     languages=[NL],
+ )
+
+ MMLU_NL_CONFIG = DatasetConfig(
+     name="mmlu-nl",
+     pretty_name="the truncated version of the Dutch knowledge dataset MMLU-nl, "
+     "translated from the English MMLU dataset",
+     huggingface_id="EuroEval/mmlu-nl-mini",
+     task=KNOW,
+     languages=[NL],
+ )
+
+ HELLASWAG_NL_CONFIG = DatasetConfig(
+     name="hellaswag-nl",
+     pretty_name="the truncated version of the Dutch common-sense reasoning dataset "
+     "HellaSwag-nl, translated from the English HellaSwag dataset",
+     huggingface_id="EuroEval/hellaswag-nl-mini",
+     task=COMMON_SENSE,
+     languages=[NL],
+ )
+
+
+ ### Unofficial datasets ###
+
+ DBRD_CONFIG = DatasetConfig(
+     name="dbrd",
+     pretty_name="the truncated version of the Dutch sentiment classification "
+     "dataset DBRD",
+     huggingface_id="EuroEval/dbrd-mini",
+     task=SENT,
+     languages=[NL],
+     _labels=["negative", "positive"],
+     _prompt_label_mapping=dict(positive="positief", negative="negatief"),
+     unofficial=True,
+ )
+
+ DUTCH_COLA_CONFIG = DatasetConfig(
+     name="dutch-cola",
+     pretty_name="the truncated version of the Dutch linguistic acceptability dataset "
+     "Dutch CoLA",
+     huggingface_id="EuroEval/dutch-cola",
+     task=LA,
+     languages=[NL],
+     unofficial=True,
+ )
+
+ DUTCH_COLA_FULL_CONFIG = DatasetConfig(
+     name="dutch-cola-full",
+     pretty_name="the Dutch linguistic acceptability dataset Dutch CoLA",
+     huggingface_id="EuroEval/dutch-cola-full",
+     task=LA,
+     languages=[NL],
+     unofficial=True,
+ )
+
+ ARC_NL_CONFIG = DatasetConfig(
+     name="arc-nl",
+     pretty_name="the truncated version of the Dutch knowledge dataset ARC-nl, "
+     "translated from the English ARC dataset",
+     huggingface_id="EuroEval/arc-nl-mini",
+     task=KNOW,
+     languages=[NL],
+     unofficial=True,
+ )
+
+ BELEBELE_NL_CONFIG = DatasetConfig(
+     name="belebele-nl",
+     pretty_name="the Dutch multiple choice reading comprehension dataset BeleBele-nl, "
+     "translated from the English BeleBele dataset",
+     huggingface_id="EuroEval/belebele-nl-mini",
+     task=MCRC,
+     languages=[NL],
+     unofficial=True,
+ )
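
Note: `dbrd` is the one config here that overrides the new private fields directly, and the public `DatasetConfig` properties surface those overrides, so the Dutch label names flow into `{labels_str}`. Expected behaviour, assuming the package is installed:

```python
from euroeval.dataset_configs import get_dataset_config

cfg = get_dataset_config("dbrd")
print(cfg.labels)                # -> ['negative', 'positive']     (from _labels)
print(cfg.prompt_label_mapping)  # -> {'positive': 'positief', 'negative': 'negatief'}
# Every other prompt field falls back to the Dutch SENT templates via the task.
```
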