EuroEval 15.5.0-py3-none-any.whl → 15.6.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (53)
  1. euroeval/benchmark_modules/base.py +3 -2
  2. euroeval/benchmark_modules/fresh.py +8 -6
  3. euroeval/benchmark_modules/hf.py +33 -31
  4. euroeval/benchmark_modules/litellm.py +120 -56
  5. euroeval/benchmark_modules/vllm.py +41 -26
  6. euroeval/benchmarker.py +23 -21
  7. euroeval/callbacks.py +2 -2
  8. euroeval/constants.py +1 -1
  9. euroeval/data_models.py +261 -42
  10. euroeval/dataset_configs/__init__.py +61 -0
  11. euroeval/dataset_configs/danish.py +120 -0
  12. euroeval/dataset_configs/dutch.py +123 -0
  13. euroeval/dataset_configs/english.py +88 -0
  14. euroeval/dataset_configs/faroese.py +54 -0
  15. euroeval/dataset_configs/french.py +83 -0
  16. euroeval/dataset_configs/german.py +91 -0
  17. euroeval/dataset_configs/icelandic.py +148 -0
  18. euroeval/dataset_configs/italian.py +81 -0
  19. euroeval/dataset_configs/norwegian.py +178 -0
  20. euroeval/dataset_configs/spanish.py +78 -0
  21. euroeval/dataset_configs/swedish.py +100 -0
  22. euroeval/exceptions.py +10 -10
  23. euroeval/finetuning.py +6 -10
  24. euroeval/generation.py +1 -0
  25. euroeval/human_evaluation.py +2 -2
  26. euroeval/languages.py +20 -13
  27. euroeval/model_cache.py +1 -1
  28. euroeval/model_loading.py +1 -12
  29. euroeval/prompt_templates/__init__.py +8 -0
  30. euroeval/prompt_templates/linguistic_acceptability.py +112 -0
  31. euroeval/prompt_templates/multiple_choice.py +97 -0
  32. euroeval/prompt_templates/named_entity_recognition.py +257 -0
  33. euroeval/prompt_templates/reading_comprehension.py +118 -0
  34. euroeval/prompt_templates/sentiment_classification.py +137 -0
  35. euroeval/prompt_templates/summarization.py +97 -0
  36. euroeval/speed_benchmark.py +1 -1
  37. euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
  38. euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
  39. euroeval/{task_utils → task_group_utils}/sequence_classification.py +1 -1
  40. euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
  41. euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
  42. euroeval/tasks.py +54 -0
  43. euroeval/tokenization_utils.py +343 -0
  44. euroeval/types.py +3 -1
  45. euroeval/utils.py +2 -347
  46. {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/METADATA +31 -9
  47. euroeval-15.6.1.dist-info/RECORD +59 -0
  48. euroeval/dataset_configs.py +0 -2408
  49. euroeval-15.5.0.dist-info/RECORD +0 -40
  50. /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
  51. {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/WHEEL +0 -0
  52. {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/entry_points.txt +0 -0
  53. {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmarker.py CHANGED
@@ -1,5 +1,6 @@
 """Class that benchmarks language models."""
 
+import contextlib
 import json
 import logging
 import re
@@ -13,7 +14,7 @@ from time import sleep
 from torch.distributed import destroy_process_group
 
 from .benchmark_config_factory import build_benchmark_config
-from .constants import GENERATIVE_PIPELINE_TAGS
+from .constants import GENERATIVE_DATASET_TASK_GROUPS, GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
@@ -399,9 +400,21 @@ class Benchmarker:
                     num_finished_benchmarks += 1
                     continue
 
+                # Skip if the model is an encoder model and the task is generative
+                task_is_generative = (
+                    dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
+                )
+                if model_config.model_type == ModelType.ENCODER and task_is_generative:
+                    logger.debug(
+                        f"Skipping benchmarking {model_id} on "
+                        f"{dataset_config.pretty_name}, as it is an encoder model and "
+                        "the task is generative."
+                    )
+                    continue
+
                 # We do not re-initialise generative models as their architecture is not
                 # customised to specific datasets
-                if model_config.task in GENERATIVE_PIPELINE_TAGS:
+                if model_config.model_type == ModelType.GENERATIVE:
                     initial_logging(
                         model_config=model_config,
                         dataset_config=dataset_config,
@@ -421,8 +434,8 @@
                         logger.info(e.message)
 
                         # Add the remaining number of benchmarks for the model to
-                        # our benchmark counter, since we're skipping the
-                        # rest of them
+                        # our benchmark counter, since we're skipping the rest of
+                        # them
                         num_finished_benchmarks += (
                             len(dataset_configs)
                             - dataset_configs.index(dataset_config)
@@ -447,31 +460,22 @@
                     raise benchmark_output_or_err
 
                 elif isinstance(benchmark_output_or_err, InvalidBenchmark):
-                    if benchmark_config.raise_errors:
-                        raise benchmark_output_or_err
-                    logger.info(
-                        f"{model_id} could not be benchmarked on "
-                        f"{dataset_config.pretty_name}. Skipping. The error message "
-                        f"raised was {benchmark_output_or_err.message!r}."
-                    )
+                    logger.info(benchmark_output_or_err.message)
                     num_finished_benchmarks += 1
                     continue
 
                 elif isinstance(benchmark_output_or_err, InvalidModel):
-                    if benchmark_config.raise_errors:
-                        raise benchmark_output_or_err
                     logger.info(benchmark_output_or_err.message)
 
-                    # Add the remaining number of benchmarks for the model to
-                    # our benchmark counter, since we're skipping the
-                    # rest of them
+                    # Add the remaining number of benchmarks for the model to our
+                    # benchmark counter, since we're skipping the rest of them
                     num_finished_benchmarks += (
                         len(dataset_configs) - dataset_configs.index(dataset_config) - 1
                     )
                     break
 
                 else:
-                    record = benchmark_output_or_err
+                    record: BenchmarkResult = benchmark_output_or_err
                     current_benchmark_results.append(record)
                     if benchmark_config.save_results:
                         record.append_to_results(results_path=self.results_path)
@@ -482,6 +486,7 @@
                     f"{total_benchmarks} benchmarks."
                 )
 
+            del loaded_model
             if benchmark_config.clear_model_cache:
                 clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
 
@@ -493,11 +498,8 @@
         # point and block the progress of another member of the process group. This
         # constraint has always been present, but this warning has only been added
         # since PyTorch 2.4 (function operator())
-        try:
+        with contextlib.suppress(AssertionError):
             destroy_process_group()
-        except AssertionError:
-            pass
-
         return current_benchmark_results
 
     def _get_updated_benchmark_config(
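The shutdown change in the last hunk swaps an explicit try/except for contextlib.suppress. As a minimal sketch (not EuroEval code), the two forms below behave identically: any AssertionError raised by destroy_process_group is silently ignored.

import contextlib

from torch.distributed import destroy_process_group

# Old form: catch and discard the AssertionError that destroy_process_group
# can raise during shutdown.
try:
    destroy_process_group()
except AssertionError:
    pass

# New form: contextlib.suppress expresses the same intent in a single block.
with contextlib.suppress(AssertionError):
    destroy_process_group()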
euroeval/callbacks.py CHANGED
@@ -5,8 +5,8 @@ from collections.abc import Sized
 
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-from transformers import TrainerControl, TrainerState, TrainingArguments
-from transformers.trainer_callback import ProgressCallback
+from transformers.trainer_callback import ProgressCallback, TrainerControl, TrainerState
+from transformers.training_args import TrainingArguments
 
 
 class NeverLeaveProgressCallback(ProgressCallback):
euroeval/constants.py CHANGED
@@ -51,7 +51,7 @@ TASK_GROUPS_USING_LOGPROBS = [
 
 # The number of top log probabilities to return for generative models. For several APIs
 # this is the maximum number of log probabilities that can be returned
-MAX_LOGPROBS = 10
+MAX_LOGPROBS = 8
 
 
 # We make sure to remove these metric attributes after each iteration, to avoid memory
euroeval/data_models.py CHANGED
@@ -54,44 +54,107 @@ class MetricConfig:
 
 
 @dataclass
-class Task:
-    """A dataset task.
+class Language:
+    """A benchmarkable language.
 
     Attributes:
+        code:
+            The ISO 639-1 language code of the language.
         name:
-            The name of the task.
-        task_group:
-            The task group of the task.
-        metrics:
-            The metrics used to evaluate the task.
+            The name of the language.
+        and_separator (optional):
+            The word 'and' in the language.
+        or_separator (optional):
+            The word 'or' in the language.
     """
 
+    code: str
     name: str
-    task_group: TaskGroup
-    metrics: list[MetricConfig]
+    _and_separator: str | None = field(repr=False, default=None)
+    _or_separator: str | None = field(repr=False, default=None)
 
     def __hash__(self) -> int:
-        """Return a hash of the task."""
-        return hash(self.name)
+        """Return a hash of the language."""
+        return hash(self.code)
+
+    @property
+    def and_separator(self) -> str:
+        """Get the word 'and' in the language.
+
+        Returns:
+            The word 'and' in the language.
+
+        Raises:
+            NotImplementedError:
+                If `and_separator` is `None`.
+        """
+        if not self._and_separator:
+            raise NotImplementedError(
+                f"Separator for the word 'and' has not been defined for {self.name}."
+            )
+        return self._and_separator
+
+    @and_separator.setter
+    def and_separator(self, value: str | None) -> None:
+        self._and_separator = value
+
+    @property
+    def or_separator(self) -> str:
+        """Get the word 'or' in the language.
+
+        Returns:
+            The word 'or' in the language.
+
+        Raises:
+            NotImplementedError:
+                If `or_separator` is `None`.
+        """
+        if not self._or_separator:
+            raise NotImplementedError(
+                f"Separator for the word 'or' has not been defined for {self.name}."
+            )
+        return self._or_separator
+
+    @or_separator.setter
+    def or_separator(self, value: str | None) -> None:
+        self._or_separator = value
 
 
 @dataclass
-class Language:
-    """A benchmarkable language.
+class Task:
+    """A dataset task.
 
     Attributes:
-        code:
-            The ISO 639-1 language code of the language.
         name:
-            The name of the language.
+            The name of the task.
+        task_group:
+            The task group of the task.
+        template_dict:
+            The template dictionary for the task, from language to prompt template.
+        metrics:
+            The metrics used to evaluate the task.
+        default_num_few_shot_examples:
+            The default number of examples to use when benchmarking the task using
+            few-shot evaluation. For a classification task, these will be drawn evenly
+            from each label.
+        default_max_generated_tokens:
+            The default maximum number of tokens to generate when benchmarking the task
+            using few-shot evaluation.
+        default_labels:
+            The default labels for datasets using this task.
     """
 
-    code: str
     name: str
+    task_group: TaskGroup
+    template_dict: dict["Language", "PromptConfig"]
+    metrics: list[MetricConfig]
+    default_num_few_shot_examples: int
+    default_max_generated_tokens: int
+    default_labels: list[str]
 
     def __hash__(self) -> int:
-        """Return a hash of the language."""
-        return hash(self.code)
+        """Return a hash of the task."""
+        return hash(self.name)
 
 
 @dataclass
@@ -303,26 +366,32 @@ class DatasetConfig:
             The mapping from label to ID.
         num_labels:
             The number of labels in the dataset.
-        prompt_template:
+        _prompt_prefix (optional):
+            The prefix to use in the few-shot prompt. Defaults to the template for the
+            task and language.
+        _prompt_template (optional):
             The template for the prompt to use when benchmarking the dataset using
-            few-shot evaluation.
-        max_generated_tokens:
-            The maximum number of tokens to generate when benchmarking the dataset
-            using few-shot evaluation.
-        prompt_prefix:
-            The prefix to use in the few-shot prompt.
-        num_few_shot_examples:
+            few-shot evaluation. Defaults to the template for the task and language.
+        _instruction_prompt (optional):
+            The prompt to use when benchmarking the dataset using instruction-based
+            evaluation. Defaults to the template for the task and language.
+        _num_few_shot_examples (optional):
            The number of examples to use when benchmarking the dataset using few-shot
            evaluation. For a classification task, these will be drawn evenly from
-            each label.
-        instruction_prompt:
-            The prompt to use when benchmarking the dataset using instruction-based
-            evaluation.
-        labels (optional):
-            The labels in the dataset. Defaults to an empty list.
-        prompt_label_mapping (optional):
+            each label. Defaults to the template for the task and language.
+        _max_generated_tokens (optional):
+            The maximum number of tokens to generate when benchmarking the dataset
+            using few-shot evaluation. Defaults to the template for the task and
+            language.
+        _labels (optional):
+            The labels in the dataset. Defaults to the template for the task and
+            language.
+        _prompt_label_mapping (optional):
            A mapping from the labels to another phrase which is used as a substitute
-            for the label in few-shot evaluation. Defaults to an empty dictionary.
+            for the label in few-shot evaluation. If "auto" then the mapping will be set
+            to a 1:1 mapping between the labels and themselves. If None then the mapping
+            will be set to the default mapping for the task and language. Defaults to
+            None.
        unofficial (optional):
            Whether the dataset is unofficial. Defaults to False.
    """
@@ -332,15 +401,95 @@ class DatasetConfig:
     huggingface_id: str
     task: Task
     languages: list[Language]
-    prompt_template: str
-    max_generated_tokens: int
-    prompt_prefix: str
-    num_few_shot_examples: int
-    instruction_prompt: str
-    labels: list[str] = field(default_factory=list)
-    prompt_label_mapping: dict[str, str] = field(default_factory=dict)
+    _prompt_prefix: str | None = None
+    _prompt_template: str | None = None
+    _instruction_prompt: str | None = None
+    _num_few_shot_examples: int | None = None
+    _max_generated_tokens: int | None = None
+    _labels: list[str] | None = None
+    _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
     unofficial: bool = False
 
+    @property
+    def prompt_prefix(self) -> str:
+        """The prefix to use in the few-shot prompt."""
+        main_language = self.languages[0]
+        prompt_config = self.task.template_dict[main_language]
+        prompt_prefix = (
+            prompt_config.default_prompt_prefix
+            if self._prompt_prefix is None
+            else self._prompt_prefix
+        )
+        prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
+        return prompt_prefix
+
+    @property
+    def prompt_template(self) -> str:
+        """The template used during few-shot evaluation."""
+        main_language = self.languages[0]
+        prompt_config = self.task.template_dict[main_language]
+        prompt_template = (
+            prompt_config.default_prompt_template
+            if self._prompt_template is None
+            else self._prompt_template
+        )
+        prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
+        return prompt_template
+
+    @property
+    def instruction_prompt(self) -> str:
+        """The prompt to use when evaluating instruction-tuned models."""
+        main_language = self.languages[0]
+        prompt_config = self.task.template_dict[main_language]
+        instruction_prompt = (
+            prompt_config.default_instruction_prompt
+            if self._instruction_prompt is None
+            else self._instruction_prompt
+        )
+        instruction_prompt = instruction_prompt.replace(
+            "{labels_str}", self._labels_str
+        )
+        return instruction_prompt
+
+    @property
+    def num_few_shot_examples(self) -> int:
+        """The number of few-shot examples to use."""
+        return (
+            self._num_few_shot_examples
+            if self._num_few_shot_examples is not None
+            else self.task.default_num_few_shot_examples
+        )
+
+    @property
+    def max_generated_tokens(self) -> int:
+        """The maximum number of tokens to generate when evaluating a model."""
+        return (
+            self._max_generated_tokens
+            if self._max_generated_tokens is not None
+            else self.task.default_max_generated_tokens
+        )
+
+    @property
+    def labels(self) -> list[str]:
+        """The labels in the dataset."""
+        return self._labels if self._labels is not None else self.task.default_labels
+
+    @property
+    def prompt_label_mapping(self) -> dict[str, str]:
+        """Mapping from English labels to localised labels."""
+        if self._prompt_label_mapping == "auto":
+            return {label: label for label in self.labels}
+        elif self._prompt_label_mapping is not None:
+            return self._prompt_label_mapping
+
+        main_language = self.languages[0]
+        prompt_config = self.task.template_dict[main_language]
+
+        if prompt_config.default_prompt_label_mapping == "auto":
+            return {label: label for label in self.labels}
+        else:
+            return prompt_config.default_prompt_label_mapping
+
     @property
     def id2label(self) -> dict[int, str]:
         """The mapping from ID to label."""
@@ -360,6 +509,48 @@ class DatasetConfig:
         """Return a hash of the dataset configuration."""
         return hash(self.name)
 
+    @property
+    def _labels_str(self) -> str:
+        """Converts a set of labels to a natural string, in the specified language.
+
+        If the task is NER, we separate using 'and' and use the mapped labels instead of
+        the BIO NER labels.
+
+        Args:
+            language: The language to be used when converting the labels.
+
+        Returns:
+            The natural string representation of the labels in specified language.
+
+        Raises:
+            NotImplementedError:
+                If `and_separator` or `or_separator` are `None`, see `Language`.
+
+        Example:
+            >>> get_labels_str(language=DA)
+            "'a', 'b', 'c' eller 'd'"
+        """
+        main_language = self.languages[0]
+
+        if self.task.task_group == TaskGroup.TOKEN_CLASSIFICATION:
+            sep_word = main_language.and_separator
+        else:
+            sep_word = main_language.or_separator
+
+        # Convert labels to single-quoted labels - and remove duplicates
+        quoted_labels = [
+            f"'{label}'" for label in set(self.prompt_label_mapping.values())
+        ]
+
+        if not quoted_labels:
+            return ""
+        elif len(quoted_labels) == 1:
+            return quoted_labels[0]
+        elif len(quoted_labels) == 2:
+            return f"{quoted_labels[0]} {sep_word} {quoted_labels[1]}"
+        else:
+            return f"{', '.join(quoted_labels[:-1])} {sep_word} {quoted_labels[-1]}"
+
 
 @dataclass
 class ModelConfig:
@@ -476,3 +667,31 @@ class HFModelInfo:
     pipeline_tag: str
     tags: list[str]
     adapter_base_model_id: str | None
+
+
+@dataclass
+class PromptConfig:
+    """Configuration for task-specific prompting across languages.
+
+    Defines the prompt templates needed for evaluating a specific task in a given
+    language.
+
+    Attributes:
+        default_prompt_prefix:
+            The default prefix to use in the few-shot prompt.
+        default_prompt_template:
+            The default template for the prompt to use when benchmarking the dataset
+            using few-shot evaluation.
+        default_instruction_prompt:
+            The default prompt to use when benchmarking the dataset using
+            instruction-based evaluation.
+        default_prompt_label_mapping:
+            The default mapping from the labels to another phrase which is used as a
+            substitute for the label in few-shot evaluation. If set to "auto", the
+            mapping will be set to a 1:1 mapping between the labels and themselves.
+    """
+
+    default_prompt_prefix: str
+    default_prompt_template: str
+    default_instruction_prompt: str
+    default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]
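To make the new fallback logic concrete, here is a hedged sketch (not code from the package) that wires the new classes together. The Danish wording, the empty metrics list and the import location of TaskGroup are illustrative assumptions; the property behaviour follows the definitions in the diff above.

# Hedged sketch: a DatasetConfig with no overrides falls back to the task-level
# PromptConfig. Strings and the TaskGroup import path are assumptions.
from euroeval.data_models import DatasetConfig, Language, PromptConfig, Task
from euroeval.enums import TaskGroup  # assumed location of TaskGroup

da = Language(code="da", name="Danish", _and_separator="og", _or_separator="eller")

sent_prompt_da = PromptConfig(
    default_prompt_prefix="Følgende er tweets og deres sentiment, som kan være {labels_str}.",
    default_prompt_template="Tweet: {text}\nSentiment: {label}",
    default_instruction_prompt="Tweet: {text}\n\nKlassificér sentimentet i tweetet. Svar kun med {labels_str}.",
    default_prompt_label_mapping={"positive": "positiv", "neutral": "neutral", "negative": "negativ"},
)

sentiment = Task(
    name="sentiment-classification",
    task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
    template_dict={da: sent_prompt_da},
    metrics=[],  # left empty to keep the sketch short
    default_num_few_shot_examples=12,
    default_max_generated_tokens=5,
    default_labels=["positive", "neutral", "negative"],
)

config = DatasetConfig(
    name="angry-tweets",
    pretty_name="the Danish sentiment classification dataset AngryTweets",
    huggingface_id="EuroEval/angry-tweets-mini",
    task=sentiment,
    languages=[da],
)

# No private _prompt_* fields are set, so every property resolves to the task
# defaults, with "{labels_str}" expanded to the quoted, localised labels joined
# by the language's or_separator, e.g. "'positiv', 'neutral' eller 'negativ'".
print(config.prompt_prefix)
print(config.num_few_shot_examples)  # -> 12, from the task default
print(config.prompt_label_mapping)   # -> the Danish mapping from the PromptConfig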
euroeval/dataset_configs/__init__.py ADDED
@@ -0,0 +1,61 @@
+"""All dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import get_all_languages
+from ..tasks import SPEED
+from .danish import *  # noqa: F403
+from .dutch import *  # noqa: F403
+from .english import *  # noqa: F403
+from .faroese import *  # noqa: F403
+from .french import *  # noqa: F403
+from .german import *  # noqa: F403
+from .icelandic import *  # noqa: F403
+from .italian import *  # noqa: F403
+from .norwegian import *  # noqa: F403
+from .spanish import *  # noqa: F403
+from .swedish import *  # noqa: F403
+
+
+def get_all_dataset_configs() -> dict[str, DatasetConfig]:
+    """Get a mapping of all the dataset configurations.
+
+    Returns:
+        A mapping between names of datasets and their configurations.
+    """
+    dataset_configs = [
+        cfg for cfg in globals().values() if isinstance(cfg, DatasetConfig)
+    ]
+    assert len(dataset_configs) == len({cfg.name for cfg in dataset_configs}), (
+        "There are duplicate dataset configurations. Please ensure that each dataset "
+        "has a unique name."
+    )
+    return {cfg.name: cfg for cfg in dataset_configs}
+
+
+def get_dataset_config(dataset_name: str) -> DatasetConfig:
+    """Get the dataset configuration for a dataset.
+
+    Args:
+        dataset_name:
+            The name of the dataset.
+
+    Returns:
+        The dataset configuration.
+
+    Raises:
+        ValueError:
+            If the dataset is not found.
+    """
+    dataset_configs = get_all_dataset_configs()
+    if dataset_name not in dataset_configs:
+        raise ValueError(f"No dataset config found for dataset {dataset_name}.")
+    return dataset_configs[dataset_name]
+
+
+SPEED_CONFIG = DatasetConfig(
+    name="speed",
+    pretty_name="the speed estimation benchmark",
+    huggingface_id="",
+    task=SPEED,
+    languages=list(get_all_languages().values()),
+)
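The two helpers above replace the old monolithic euroeval/dataset_configs.py module that is removed further down in this diff. A brief usage sketch; the printed values are illustrative only.

from euroeval.dataset_configs import get_all_dataset_configs, get_dataset_config

all_configs = get_all_dataset_configs()
print(len(all_configs))  # total number of registered DatasetConfig objects

angry_tweets = get_dataset_config("angry-tweets")
print(angry_tweets.huggingface_id)  # -> "EuroEval/angry-tweets-mini"
print([lang.code for lang in angry_tweets.languages])  # -> ["da"]

# Unknown names raise a ValueError rather than returning None.
try:
    get_dataset_config("not-a-dataset")
except ValueError as e:
    print(e)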
euroeval/dataset_configs/danish.py ADDED
@@ -0,0 +1,120 @@
+"""All Danish dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import DA
+from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+ANGRY_TWEETS_CONFIG = DatasetConfig(
+    name="angry-tweets",
+    pretty_name="the truncated version of the Danish sentiment classification "
+    "dataset AngryTweets",
+    huggingface_id="EuroEval/angry-tweets-mini",
+    task=SENT,
+    languages=[DA],
+)
+
+SCALA_DA_CONFIG = DatasetConfig(
+    name="scala-da",
+    pretty_name="the Danish part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-da",
+    task=LA,
+    languages=[DA],
+)
+
+DANSK_CONFIG = DatasetConfig(
+    name="dansk",
+    pretty_name="the truncated version of the Danish named entity recognition "
+    "dataset DANSK",
+    huggingface_id="EuroEval/dansk-mini",
+    task=NER,
+    languages=[DA],
+)
+
+SCANDIQA_DA_CONFIG = DatasetConfig(
+    name="scandiqa-da",
+    pretty_name="the Danish part of the truncated version of the question answering "
+    "dataset ScandiQA",
+    huggingface_id="EuroEval/scandiqa-da-mini",
+    task=RC,
+    languages=[DA],
+)
+
+NORDJYLLAND_NEWS_CONFIG = DatasetConfig(
+    name="nordjylland-news",
+    pretty_name="the truncated version of the Danish summarisation dataset "
+    "Nordjylland News",
+    huggingface_id="EuroEval/nordjylland-news-mini",
+    task=SUMM,
+    languages=[DA],
+)
+
+DANSKE_TALEMAADER_CONFIG = DatasetConfig(
+    name="danske-talemaader",
+    pretty_name="the truncated version of the Danish knowledge dataset Danske "
+    "Talemåder",
+    huggingface_id="EuroEval/danske-talemaader",
+    task=KNOW,
+    languages=[DA],
+)
+
+DANISH_CITIZEN_TESTS_CONFIG = DatasetConfig(
+    name="danish-citizen-tests",
+    pretty_name="the Danish knowledge dataset Danish Citizen Tests",
+    huggingface_id="EuroEval/danish-citizen-tests-updated",
+    task=KNOW,
+    languages=[DA],
+)
+
+HELLASWAG_DA_CONFIG = DatasetConfig(
+    name="hellaswag-da",
+    pretty_name="the truncated version of the Danish common-sense reasoning dataset "
+    "HellaSwag-da, translated from the English HellaSwag dataset",
+    huggingface_id="EuroEval/hellaswag-da-mini",
+    task=COMMON_SENSE,
+    languages=[DA],
+)
+
+
+### Unofficial datasets ###
+
+DANE_CONFIG = DatasetConfig(
+    name="dane",
+    pretty_name="the truncated version of the Danish named entity recognition "
+    "dataset DaNE",
+    huggingface_id="EuroEval/dane-mini",
+    task=NER,
+    languages=[DA],
+    unofficial=True,
+)
+
+MMLU_DA_CONFIG = DatasetConfig(
+    name="mmlu-da",
+    pretty_name="the truncated version of the Danish knowledge dataset MMLU-da, "
+    "translated from the English MMLU dataset",
+    huggingface_id="EuroEval/mmlu-da-mini",
+    task=KNOW,
+    languages=[DA],
+    unofficial=True,
+)
+
+ARC_DA_CONFIG = DatasetConfig(
+    name="arc-da",
+    pretty_name="the truncated version of the Danish knowledge dataset ARC-da, "
+    "translated from the English ARC dataset",
+    huggingface_id="EuroEval/arc-da-mini",
+    task=KNOW,
+    languages=[DA],
+    unofficial=True,
+)
+
+BELEBELE_DA_CONFIG = DatasetConfig(
+    name="belebele-da",
+    pretty_name="the Danish multiple choice reading comprehension dataset BeleBele-da, "
+    "translated from the English BeleBele dataset",
+    huggingface_id="EuroEval/belebele-da-mini",
+    task=MCRC,
+    languages=[DA],
+    unofficial=True,
+)
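Because every DatasetConfig above is a plain module-level constant that get_all_dataset_configs() collects via globals(), the official Danish suite can be recovered by filtering, as in this hedged sketch (the printed names are indicative only).

from euroeval.dataset_configs import get_all_dataset_configs

# Keep configs whose language list includes Danish and which are not marked
# unofficial; note that the language-agnostic "speed" config also matches.
official_danish = {
    name: cfg
    for name, cfg in get_all_dataset_configs().items()
    if any(lang.code == "da" for lang in cfg.languages) and not cfg.unofficial
}
print(sorted(official_danish))
# e.g. ['angry-tweets', 'danish-citizen-tests', 'danske-talemaader', ...]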