EuroEval 15.4.2-py3-none-any.whl → 15.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval has been flagged as possibly problematic.
- euroeval/__init__.py +2 -2
- euroeval/benchmark_modules/base.py +3 -2
- euroeval/benchmark_modules/fresh.py +8 -6
- euroeval/benchmark_modules/hf.py +44 -33
- euroeval/benchmark_modules/litellm.py +314 -120
- euroeval/benchmark_modules/vllm.py +99 -59
- euroeval/benchmarker.py +52 -21
- euroeval/callbacks.py +2 -2
- euroeval/constants.py +9 -2
- euroeval/data_models.py +258 -44
- euroeval/dataset_configs/__init__.py +61 -0
- euroeval/dataset_configs/danish.py +120 -0
- euroeval/dataset_configs/dutch.py +123 -0
- euroeval/dataset_configs/english.py +88 -0
- euroeval/dataset_configs/faroese.py +53 -0
- euroeval/dataset_configs/french.py +83 -0
- euroeval/dataset_configs/german.py +91 -0
- euroeval/dataset_configs/icelandic.py +148 -0
- euroeval/dataset_configs/italian.py +81 -0
- euroeval/dataset_configs/norwegian.py +178 -0
- euroeval/dataset_configs/spanish.py +78 -0
- euroeval/dataset_configs/swedish.py +100 -0
- euroeval/exceptions.py +10 -10
- euroeval/finetuning.py +6 -10
- euroeval/generation.py +1 -0
- euroeval/human_evaluation.py +2 -2
- euroeval/languages.py +20 -13
- euroeval/model_cache.py +1 -1
- euroeval/model_loading.py +1 -12
- euroeval/prompt_templates/__init__.py +8 -0
- euroeval/prompt_templates/linguistic_acceptability.py +112 -0
- euroeval/prompt_templates/multiple_choice.py +97 -0
- euroeval/prompt_templates/named_entity_recognition.py +257 -0
- euroeval/prompt_templates/reading_comprehension.py +118 -0
- euroeval/prompt_templates/sentiment_classification.py +137 -0
- euroeval/prompt_templates/summarization.py +97 -0
- euroeval/speed_benchmark.py +1 -1
- euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
- euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
- euroeval/{task_utils → task_group_utils}/sequence_classification.py +45 -10
- euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
- euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
- euroeval/tasks.py +54 -0
- euroeval/tokenization_utils.py +343 -0
- euroeval/types.py +3 -1
- euroeval/utils.py +5 -254
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/METADATA +31 -9
- euroeval-15.6.0.dist-info/RECORD +59 -0
- euroeval/dataset_configs.py +0 -2408
- euroeval-15.4.2.dist-info/RECORD +0 -40
- /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
euroeval/data_models.py
CHANGED
@@ -10,10 +10,9 @@ from dataclasses import dataclass, field
 import pydantic
 import torch
 
-from euroeval.utils import get_package_version
-
 from .enums import Device, InferenceBackend, ModelType, TaskGroup
 from .types import ScoreDict
+from .utils import get_package_version
 
 
 @dataclass
@@ -55,44 +54,107 @@ class MetricConfig:
 
 
 @dataclass
-class …
-    """A …
+class Language:
+    """A benchmarkable language.
 
     Attributes:
+        code:
+            The ISO 639-1 language code of the language.
         name:
-            The name of the …
-        …
-            The …
-        …
-            The …
+            The name of the language.
+        and_separator (optional):
+            The word 'and' in the language.
+        or_separator (optional):
+            The word 'or' in the language.
     """
 
+    code: str
     name: str
-    …
-    …
+    _and_separator: str | None = field(repr=False, default=None)
+    _or_separator: str | None = field(repr=False, default=None)
 
     def __hash__(self) -> int:
-        """Return a hash of the …
-        return hash(self.…
+        """Return a hash of the language."""
+        return hash(self.code)
+
+    @property
+    def and_separator(self) -> str:
+        """Get the word 'and' in the language.
+
+        Returns:
+            The word 'and' in the language.
+
+        Raises:
+            NotImplementedError:
+                If `and_separator` is `None`.
+        """
+        if not self._and_separator:
+            raise NotImplementedError(
+                f"Separator for the word 'and' has not been defined for {self.name}."
+            )
+        return self._and_separator
+
+    @and_separator.setter
+    def and_separator(self, value: str | None) -> None:
+        self._and_separator = value
+
+    @property
+    def or_separator(self) -> str:
+        """Get the word 'or' in the language.
+
+        Returns:
+            The word 'or' in the language.
+
+        Raises:
+            NotImplementedError:
+                If `or_separator` is `None`.
+        """
+        if not self._or_separator:
+            raise NotImplementedError(
+                f"Separator for the word 'or' has not been defined for {self.name}."
+            )
+        return self._or_separator
+
+    @or_separator.setter
+    def or_separator(self, value: str | None) -> None:
+        self._or_separator = value
 
 
 @dataclass
-class …
-    """A …
+class Task:
+    """A dataset task.
 
     Attributes:
-        code:
-            The ISO 639-1 language code of the language.
         name:
-            The name of the …
+            The name of the task.
+        task_group:
+            The task group of the task.
+        template_dict:
+            The template dictionary for the task, from language to prompt template.
+        metrics:
+            The metrics used to evaluate the task.
+        default_num_few_shot_examples:
+            The default number of examples to use when benchmarking the task using
+            few-shot evaluation. For a classification task, these will be drawn evenly
+            from each label.
+        default_max_generated_tokens:
+            The default maximum number of tokens to generate when benchmarking the task
+            using few-shot evaluation.
+        default_labels:
+            The default labels for datasets using this task.
     """
 
-    code: str
     name: str
+    task_group: TaskGroup
+    template_dict: dict["Language", "PromptConfig"]
+    metrics: list[MetricConfig]
+    default_num_few_shot_examples: int
+    default_max_generated_tokens: int
+    default_labels: list[str]
 
     def __hash__(self) -> int:
-        """Return a hash of the …
-        return hash(self.…
+        """Return a hash of the task."""
+        return hash(self.name)
 
 
 @dataclass
@@ -304,26 +366,30 @@ class DatasetConfig:
             The mapping from label to ID.
         num_labels:
             The number of labels in the dataset.
-        …
+        _prompt_prefix (optional):
+            The prefix to use in the few-shot prompt. Defaults to the template for the
+            task and language.
+        _prompt_template (optional):
             The template for the prompt to use when benchmarking the dataset using
-            few-shot evaluation.
-        …
-            The …
-        …
-        …
-            The prefix to use in the few-shot prompt.
-        num_few_shot_examples:
+            few-shot evaluation. Defaults to the template for the task and language.
+        _instruction_prompt (optional):
+            The prompt to use when benchmarking the dataset using instruction-based
+            evaluation. Defaults to the template for the task and language.
+        _num_few_shot_examples (optional):
             The number of examples to use when benchmarking the dataset using few-shot
             evaluation. For a classification task, these will be drawn evenly from
-            each label.
-        …
-            The …
-            evaluation.
-        …
-        …
-        …
+            each label. Defaults to the template for the task and language.
+        _max_generated_tokens (optional):
+            The maximum number of tokens to generate when benchmarking the dataset
+            using few-shot evaluation. Defaults to the template for the task and
+            language.
+        _labels (optional):
+            The labels in the dataset. Defaults to the template for the task and
+            language.
+        _prompt_label_mapping (optional):
             A mapping from the labels to another phrase which is used as a substitute
-            for the label in few-shot evaluation. Defaults to …
+            for the label in few-shot evaluation. Defaults to the template for the task
+            and language.
         unofficial (optional):
             Whether the dataset is unofficial. Defaults to False.
     """
@@ -333,15 +399,93 @@ class DatasetConfig:
     huggingface_id: str
     task: Task
     languages: list[Language]
-    …
-    …
-    …
-    …
-    …
-    …
-    …
+    _prompt_prefix: str | None = None
+    _prompt_template: str | None = None
+    _instruction_prompt: str | None = None
+    _num_few_shot_examples: int | None = None
+    _max_generated_tokens: int | None = None
+    _labels: list[str] | None = None
+    _prompt_label_mapping: dict[str, str] | None = None
     unofficial: bool = False
 
+    @property
+    def prompt_prefix(self) -> str:
+        """The prefix to use in the few-shot prompt."""
+        main_language = self.languages[0]
+        prompt_config = self.task.template_dict[main_language]
+        prompt_prefix = (
+            prompt_config.default_prompt_prefix
+            if self._prompt_prefix is None
+            else self._prompt_prefix
+        )
+        prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
+        return prompt_prefix
+
+    @property
+    def prompt_template(self) -> str:
+        """The template used during few-shot evaluation."""
+        main_language = self.languages[0]
+        prompt_config = self.task.template_dict[main_language]
+        prompt_template = (
+            prompt_config.default_prompt_template
+            if self._prompt_template is None
+            else self._prompt_template
+        )
+        prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
+        return prompt_template
+
+    @property
+    def instruction_prompt(self) -> str:
+        """The prompt to use when evaluating instruction-tuned models."""
+        main_language = self.languages[0]
+        prompt_config = self.task.template_dict[main_language]
+        instruction_prompt = (
+            prompt_config.default_instruction_prompt
+            if self._instruction_prompt is None
+            else self._instruction_prompt
+        )
+        instruction_prompt = instruction_prompt.replace(
+            "{labels_str}", self._labels_str
+        )
+        return instruction_prompt
+
+    @property
+    def num_few_shot_examples(self) -> int:
+        """The number of few-shot examples to use."""
+        return (
+            self._num_few_shot_examples
+            if self._num_few_shot_examples is not None
+            else self.task.default_num_few_shot_examples
+        )
+
+    @property
+    def max_generated_tokens(self) -> int:
+        """The maximum number of tokens to generate when evaluating a model."""
+        return (
+            self._max_generated_tokens
+            if self._max_generated_tokens is not None
+            else self.task.default_max_generated_tokens
+        )
+
+    @property
+    def labels(self) -> list[str]:
+        """The labels in the dataset."""
+        return self._labels if self._labels is not None else self.task.default_labels
+
+    @property
+    def prompt_label_mapping(self) -> dict[str, str]:
+        """Mapping from English labels to localised labels."""
+        if self._prompt_label_mapping is not None:
+            return self._prompt_label_mapping
+
+        main_language = self.languages[0]
+        prompt_config = self.task.template_dict[main_language]
+
+        if prompt_config.default_prompt_label_mapping == "auto":
+            return {label: label for label in self.labels}
+        else:
+            return prompt_config.default_prompt_label_mapping
+
     @property
     def id2label(self) -> dict[int, str]:
         """The mapping from ID to label."""
@@ -361,6 +505,48 @@ class DatasetConfig:
         """Return a hash of the dataset configuration."""
         return hash(self.name)
 
+    @property
+    def _labels_str(self) -> str:
+        """Converts a set of labels to a natural string, in the specified language.
+
+        If the task is NER, we separate using 'and' and use the mapped labels instead of
+        the BIO NER labels.
+
+        Args:
+            language: The language to be used when converting the labels.
+
+        Returns:
+            The natural string representation of the labels in specified language.
+
+        Raises:
+            NotImplementedError:
+                If `and_separator` or `or_separator` are `None`, see `Language`.
+
+        Example:
+            >>> get_labels_str(language=DA)
+            "'a', 'b', 'c' eller 'd'"
+        """
+        main_language = self.languages[0]
+
+        if self.task.task_group == TaskGroup.TOKEN_CLASSIFICATION:
+            sep_word = main_language.and_separator
+        else:
+            sep_word = main_language.or_separator
+
+        # Convert labels to single-quoted labels - and remove duplicates
+        quoted_labels = [
+            f"'{label}'" for label in set(self.prompt_label_mapping.values())
+        ]
+
+        if not quoted_labels:
+            return ""
+        elif len(quoted_labels) == 1:
+            return quoted_labels[0]
+        elif len(quoted_labels) == 2:
+            return f"{quoted_labels[0]} {sep_word} {quoted_labels[1]}"
+        else:
+            return f"{', '.join(quoted_labels[:-1])} {sep_word} {quoted_labels[-1]}"
+
 
 @dataclass
 class ModelConfig:
@@ -477,3 +663,31 @@ class HFModelInfo:
     pipeline_tag: str
     tags: list[str]
     adapter_base_model_id: str | None
+
+
+@dataclass
+class PromptConfig:
+    """Configuration for task-specific prompting across languages.
+
+    Defines the prompt templates needed for evaluating a specific task in a given
+    language.
+
+    Attributes:
+        default_prompt_prefix:
+            The default prefix to use in the few-shot prompt.
+        default_prompt_template:
+            The default template for the prompt to use when benchmarking the dataset
+            using few-shot evaluation.
+        default_instruction_prompt:
+            The default prompt to use when benchmarking the dataset using
+            instruction-based evaluation.
+        default_prompt_label_mapping:
+            The default mapping from the labels to another phrase which is used as a
+            substitute for the label in few-shot evaluation. If set to "auto", the
+            mapping will be set to a 1:1 mapping between the labels and themselves.
+    """
+
+    default_prompt_prefix: str
+    default_prompt_template: str
+    default_instruction_prompt: str
+    default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]
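For context on how the new separator properties behave at runtime, here is a minimal sketch (not part of the diff; the `Language` instances below are illustrative stand-ins for the ones defined in euroeval/languages.py, and the separator values are assumptions):

from euroeval.data_models import Language

# Separators are optional private dataclass fields; the public properties
# fall back to raising when no separator has been defined.
da = Language(code="da", name="Danish", _and_separator="og", _or_separator="eller")
print(da.or_separator)  # "eller"

# A language constructed without separators raises as soon as a prompt
# template asks for one:
fo = Language(code="fo", name="Faroese")
try:
    fo.and_separator
except NotImplementedError as exc:
    print(exc)  # Separator for the word 'and' has not been defined for Faroese.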
euroeval/dataset_configs/__init__.py
ADDED
@@ -0,0 +1,61 @@
+"""All dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import get_all_languages
+from ..tasks import SPEED
+from .danish import *  # noqa: F403
+from .dutch import *  # noqa: F403
+from .english import *  # noqa: F403
+from .faroese import *  # noqa: F403
+from .french import *  # noqa: F403
+from .german import *  # noqa: F403
+from .icelandic import *  # noqa: F403
+from .italian import *  # noqa: F403
+from .norwegian import *  # noqa: F403
+from .spanish import *  # noqa: F403
+from .swedish import *  # noqa: F403
+
+
+def get_all_dataset_configs() -> dict[str, DatasetConfig]:
+    """Get a mapping of all the dataset configurations.
+
+    Returns:
+        A mapping between names of datasets and their configurations.
+    """
+    dataset_configs = [
+        cfg for cfg in globals().values() if isinstance(cfg, DatasetConfig)
+    ]
+    assert len(dataset_configs) == len({cfg.name for cfg in dataset_configs}), (
+        "There are duplicate dataset configurations. Please ensure that each dataset "
+        "has a unique name."
+    )
+    return {cfg.name: cfg for cfg in dataset_configs}
+
+
+def get_dataset_config(dataset_name: str) -> DatasetConfig:
+    """Get the dataset configuration for a dataset.
+
+    Args:
+        dataset_name:
+            The name of the dataset.
+
+    Returns:
+        The dataset configuration.
+
+    Raises:
+        ValueError:
+            If the dataset is not found.
+    """
+    dataset_configs = get_all_dataset_configs()
+    if dataset_name not in dataset_configs:
+        raise ValueError(f"No dataset config found for dataset {dataset_name}.")
+    return dataset_configs[dataset_name]
+
+
+SPEED_CONFIG = DatasetConfig(
+    name="speed",
+    pretty_name="the speed estimation benchmark",
+    huggingface_id="",
+    task=SPEED,
+    languages=list(get_all_languages().values()),
+)
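A minimal usage sketch of the new lookup helpers (not part of the diff); the dataset name comes from the Danish configurations below:

from euroeval.dataset_configs import get_all_dataset_configs, get_dataset_config

configs = get_all_dataset_configs()          # name -> DatasetConfig, collected from globals()
config = get_dataset_config("angry-tweets")  # defined in dataset_configs/danish.py
print(config.huggingface_id)                 # EuroEval/angry-tweets-mini

try:
    get_dataset_config("no-such-dataset")    # unknown names raise ValueError
except ValueError as exc:
    print(exc)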
euroeval/dataset_configs/danish.py
ADDED
@@ -0,0 +1,120 @@
+"""All Danish dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import DA
+from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+ANGRY_TWEETS_CONFIG = DatasetConfig(
+    name="angry-tweets",
+    pretty_name="the truncated version of the Danish sentiment classification "
+    "dataset AngryTweets",
+    huggingface_id="EuroEval/angry-tweets-mini",
+    task=SENT,
+    languages=[DA],
+)
+
+SCALA_DA_CONFIG = DatasetConfig(
+    name="scala-da",
+    pretty_name="the Danish part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-da",
+    task=LA,
+    languages=[DA],
+)
+
+DANSK_CONFIG = DatasetConfig(
+    name="dansk",
+    pretty_name="the truncated version of the Danish named entity recognition "
+    "dataset DANSK",
+    huggingface_id="EuroEval/dansk-mini",
+    task=NER,
+    languages=[DA],
+)
+
+SCANDIQA_DA_CONFIG = DatasetConfig(
+    name="scandiqa-da",
+    pretty_name="the Danish part of the truncated version of the question answering "
+    "dataset ScandiQA",
+    huggingface_id="EuroEval/scandiqa-da-mini",
+    task=RC,
+    languages=[DA],
+)
+
+NORDJYLLAND_NEWS_CONFIG = DatasetConfig(
+    name="nordjylland-news",
+    pretty_name="the truncated version of the Danish summarisation dataset "
+    "Nordjylland News",
+    huggingface_id="EuroEval/nordjylland-news-mini",
+    task=SUMM,
+    languages=[DA],
+)
+
+DANSKE_TALEMAADER_CONFIG = DatasetConfig(
+    name="danske-talemaader",
+    pretty_name="the truncated version of the Danish knowledge dataset Danske "
+    "Talemåder",
+    huggingface_id="EuroEval/danske-talemaader",
+    task=KNOW,
+    languages=[DA],
+)
+
+DANISH_CITIZEN_TESTS_CONFIG = DatasetConfig(
+    name="danish-citizen-tests",
+    pretty_name="the Danish knowledge dataset Danish Citizen Tests",
+    huggingface_id="EuroEval/danish-citizen-tests-updated",
+    task=KNOW,
+    languages=[DA],
+)
+
+HELLASWAG_DA_CONFIG = DatasetConfig(
+    name="hellaswag-da",
+    pretty_name="the truncated version of the Danish common-sense reasoning dataset "
+    "HellaSwag-da, translated from the English HellaSwag dataset",
+    huggingface_id="EuroEval/hellaswag-da-mini",
+    task=COMMON_SENSE,
+    languages=[DA],
+)
+
+
+### Unofficial datasets ###
+
+DANE_CONFIG = DatasetConfig(
+    name="dane",
+    pretty_name="the truncated version of the Danish named entity recognition "
+    "dataset DaNE",
+    huggingface_id="EuroEval/dane-mini",
+    task=NER,
+    languages=[DA],
+    unofficial=True,
+)
+
+MMLU_DA_CONFIG = DatasetConfig(
+    name="mmlu-da",
+    pretty_name="the truncated version of the Danish knowledge dataset MMLU-da, "
+    "translated from the English MMLU dataset",
+    huggingface_id="EuroEval/mmlu-da-mini",
+    task=KNOW,
+    languages=[DA],
+    unofficial=True,
+)
+
+ARC_DA_CONFIG = DatasetConfig(
+    name="arc-da",
+    pretty_name="the truncated version of the Danish knowledge dataset ARC-da, "
+    "translated from the English ARC dataset",
+    huggingface_id="EuroEval/arc-da-mini",
+    task=KNOW,
+    languages=[DA],
+    unofficial=True,
+)
+
+BELEBELE_DA_CONFIG = DatasetConfig(
+    name="belebele-da",
+    pretty_name="the Danish multiple choice reading comprehension dataset BeleBele-da, "
+    "translated from the English BeleBele dataset",
+    huggingface_id="EuroEval/belebele-da-mini",
+    task=MCRC,
+    languages=[DA],
+    unofficial=True,
+)
euroeval/dataset_configs/dutch.py
ADDED
@@ -0,0 +1,123 @@
+"""All Dutch dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import NL
+from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+
+### Official datasets ###
+
+DUTCH_SOCIAL_CONFIG = DatasetConfig(
+    name="dutch-social",
+    pretty_name="the truncated version of the Dutch sentiment classification "
+    "dataset Dutch Social",
+    huggingface_id="EuroEval/dutch-social-mini",
+    task=SENT,
+    languages=[NL],
+)
+
+SCALA_NL_CONFIG = DatasetConfig(
+    name="scala-nl",
+    pretty_name="the Dutch part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-nl",
+    task=LA,
+    languages=[NL],
+)
+
+CONLL_NL_CONFIG = DatasetConfig(
+    name="conll-nl",
+    pretty_name="the Dutch part of the truncated version of the named entity "
+    "recognition dataset CoNLL 2002",
+    huggingface_id="EuroEval/conll-nl-mini",
+    task=NER,
+    languages=[NL],
+)
+
+SQUAD_NL_CONFIG = DatasetConfig(
+    name="squad-nl",
+    pretty_name="the truncated version of the Dutch reading comprehension dataset "
+    "SQuAD-nl, translated from the English SQuAD dataset",
+    huggingface_id="EuroEval/squad-nl-v2-mini",
+    task=RC,
+    languages=[NL],
+)
+
+WIKI_LINGUA_NL_CONFIG = DatasetConfig(
+    name="wiki-lingua-nl",
+    pretty_name="the Dutch part of the truncated version of the summarisation dataset "
+    "WikiLingua",
+    huggingface_id="EuroEval/wiki-lingua-nl-mini",
+    task=SUMM,
+    languages=[NL],
+)
+
+MMLU_NL_CONFIG = DatasetConfig(
+    name="mmlu-nl",
+    pretty_name="the truncated version of the Dutch knowledge dataset MMLU-nl, "
+    "translated from the English MMLU dataset",
+    huggingface_id="EuroEval/mmlu-nl-mini",
+    task=KNOW,
+    languages=[NL],
+)
+
+HELLASWAG_NL_CONFIG = DatasetConfig(
+    name="hellaswag-nl",
+    pretty_name="the truncated version of the Dutch common-sense reasoning dataset "
+    "HellaSwag-nl, translated from the English HellaSwag dataset",
+    huggingface_id="EuroEval/hellaswag-nl-mini",
+    task=COMMON_SENSE,
+    languages=[NL],
+)
+
+
+### Unofficial datasets ###
+
+DBRD_CONFIG = DatasetConfig(
+    name="dbrd",
+    pretty_name="the truncated version of the Dutch sentiment classification "
+    "dataset DBRD",
+    huggingface_id="EuroEval/dbrd-mini",
+    task=SENT,
+    languages=[NL],
+    _labels=["negative", "positive"],
+    _prompt_label_mapping=dict(positive="positief", negative="negatief"),
+    unofficial=True,
+)
+
+DUTCH_COLA_CONFIG = DatasetConfig(
+    name="dutch-cola",
+    pretty_name="the truncated version of the Dutch linguistic acceptability dataset "
+    "Dutch CoLA",
+    huggingface_id="EuroEval/dutch-cola",
+    task=LA,
+    languages=[NL],
+    unofficial=True,
+)
+
+DUTCH_COLA_FULL_CONFIG = DatasetConfig(
+    name="dutch-cola-full",
+    pretty_name="the Dutch linguistic acceptability dataset Dutch CoLA",
+    huggingface_id="EuroEval/dutch-cola-full",
+    task=LA,
+    languages=[NL],
+    unofficial=True,
+)
+
+ARC_NL_CONFIG = DatasetConfig(
+    name="arc-nl",
+    pretty_name="the truncated version of the Dutch knowledge dataset ARC-nl, "
+    "translated from the English ARC dataset",
+    huggingface_id="EuroEval/arc-nl-mini",
+    task=KNOW,
+    languages=[NL],
+    unofficial=True,
+)
+
+BELEBELE_NL_CONFIG = DatasetConfig(
+    name="belebele-nl",
+    pretty_name="the Dutch multiple choice reading comprehension dataset BeleBele-nl, "
+    "translated from the English BeleBele dataset",
+    huggingface_id="EuroEval/belebele-nl-mini",
+    task=MCRC,
+    languages=[NL],
+    unofficial=True,
+)
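To illustrate the override/fallback pattern these configurations rely on: per-dataset underscore fields (such as DBRD's `_labels` above) take precedence, while the `DatasetConfig` properties from data_models.py otherwise fall back to the task-level defaults. A minimal sketch (not part of the diff):

from euroeval.dataset_configs import get_dataset_config

dbrd = get_dataset_config("dbrd")
print(dbrd.labels)                # ['negative', 'positive'] -- the dataset's own override
print(dbrd.prompt_label_mapping)  # {'positive': 'positief', 'negative': 'negatief'}

dutch_social = get_dataset_config("dutch-social")
print(dutch_social.labels)        # no override, so SENT.default_labels is used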