EuroEval: euroeval-16.0.0-py3-none-any.whl → euroeval-16.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
The registry has flagged this release of EuroEval as potentially problematic.
- euroeval/__init__.py +5 -0
- euroeval/benchmark_config_factory.py +6 -1
- euroeval/benchmark_modules/base.py +2 -0
- euroeval/benchmark_modules/fresh.py +7 -1
- euroeval/benchmark_modules/hf.py +26 -21
- euroeval/benchmark_modules/litellm.py +258 -131
- euroeval/benchmark_modules/vllm.py +120 -68
- euroeval/benchmarker.py +11 -2
- euroeval/cli.py +14 -1
- euroeval/constants.py +7 -1
- euroeval/data_models.py +95 -20
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +14 -3
- euroeval/dataset_configs/dutch.py +14 -0
- euroeval/dataset_configs/english.py +22 -0
- euroeval/dataset_configs/estonian.py +15 -7
- euroeval/dataset_configs/finnish.py +14 -0
- euroeval/dataset_configs/french.py +14 -0
- euroeval/dataset_configs/german.py +23 -0
- euroeval/dataset_configs/italian.py +14 -0
- euroeval/dataset_configs/latvian.py +14 -0
- euroeval/dataset_configs/norwegian.py +14 -0
- euroeval/dataset_configs/polish.py +126 -0
- euroeval/dataset_configs/portuguese.py +14 -0
- euroeval/dataset_configs/spanish.py +14 -0
- euroeval/dataset_configs/swedish.py +25 -0
- euroeval/enums.py +12 -0
- euroeval/generation.py +17 -8
- euroeval/generation_utils.py +102 -16
- euroeval/metrics/pipeline.py +51 -9
- euroeval/model_cache.py +13 -1
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +27 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +11 -0
- euroeval/prompt_templates/sentiment_classification.py +15 -0
- euroeval/prompt_templates/summarization.py +27 -1
- euroeval/scores.py +5 -0
- euroeval/task_group_utils/multiple_choice_classification.py +2 -2
- euroeval/task_group_utils/question_answering.py +29 -29
- euroeval/task_group_utils/sequence_classification.py +71 -81
- euroeval/task_group_utils/token_classification.py +17 -3
- euroeval/tasks.py +12 -10
- euroeval/{tokenization_utils.py → tokenisation_utils.py} +41 -25
- euroeval/utils.py +67 -3
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/METADATA +3 -1
- euroeval-16.1.0.dist-info/RECORD +70 -0
- euroeval-16.0.0.dist-info/RECORD +0 -69
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/WHEEL +0 -0
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/licenses/LICENSE +0 -0
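A diff like the one below can be reproduced locally with nothing but the standard library. A minimal sketch, assuming both wheels have already been downloaded (e.g. via `pip download euroeval==16.0.0 --no-deps` and the same for 16.1.0):

    import difflib
    import zipfile

    def wheel_sources(path: str) -> dict[str, list[str]]:
        """Map each Python file inside a wheel to its lines of text."""
        with zipfile.ZipFile(path) as wheel:
            return {
                name: wheel.read(name).decode("utf-8", errors="replace").splitlines()
                for name in wheel.namelist()
                if name.endswith(".py")
            }

    old = wheel_sources("euroeval-16.0.0-py3-none-any.whl")
    new = wheel_sources("euroeval-16.1.0-py3-none-any.whl")
    for name in sorted(set(old) | set(new)):
        # Files present on only one side are diffed against an empty file.
        for line in difflib.unified_diff(
            old.get(name, []), new.get(name, []), fromfile=name, tofile=name, lineterm=""
        ):
            print(line)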
euroeval/data_models.py
CHANGED
@@ -118,13 +118,19 @@ class Task:
             log probabilities for the generated tokens. Defaults to False.
         requires_logprobs (optional):
             Whether the task requires log probabilities. Implies `uses_logprobs`.
-        allowed_model_types (optional):
+        default_allowed_model_types (optional):
             A list of model types that are allowed to be evaluated on this task.
             Defaults to all model types being allowed.
-        allowed_generative_types (optional):
+        default_allowed_generative_types (optional):
             A list of generative model types that are allowed to be evaluated on this
             task. If None, all generative model types are allowed. Only relevant if
             `allowed_model_types` includes generative models.
+        default_allow_invalid_model_outputs (optional):
+            Whether to allow invalid model outputs. This is only relevant for generative
+            models on classification tasks, where the model may generate an output
+            which is not one of the allowed labels. If True, the model output will be
+            mapped to the closest valid label. If False, the model output will be
+            considered incorrect and the evaluation will be aborted. Defaults to True.
     """

     name: str

@@ -138,16 +144,17 @@ class Task:
     uses_structured_output: bool = False
     uses_logprobs: bool = False
     requires_logprobs: bool = False
-    allowed_model_types: list[ModelType] = field(
+    default_allowed_model_types: list[ModelType] = field(
         default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
     )
-    allowed_generative_types: list[GenerativeType] = field(
+    default_allowed_generative_types: list[GenerativeType] = field(
         default_factory=lambda: [
             GenerativeType.BASE,
             GenerativeType.INSTRUCTION_TUNED,
             GenerativeType.REASONING,
         ]
     )
+    default_allow_invalid_model_outputs: bool = True

     def __post_init__(self) -> None:
         """Post-initialisation checks."""
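The new `default_allow_invalid_model_outputs` flag documents a closest-valid-label fallback for generative models on classification tasks. A minimal sketch of that behaviour using stdlib fuzzy matching; the diff does not show EuroEval's actual matching logic, so treat this as an illustration only:

    import difflib

    def resolve_output(output: str, labels: list[str], allow_invalid: bool) -> str:
        """Map a generated output onto the allowed label set."""
        if output in labels:
            return output
        if not allow_invalid:
            # Mirrors the documented behaviour: invalid outputs abort the evaluation.
            raise ValueError(f"Invalid model output: {output!r}")
        # Otherwise map to the closest valid label.
        return difflib.get_close_matches(output, labels, n=1, cutoff=0.0)[0]

    print(resolve_output("positiv", ["positive", "negative", "neutral"], True))
    # -> 'positive'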
@@ -218,6 +225,9 @@ class BenchmarkConfig:
             Whether the benchmark is being run with the CLI.
         requires_safetensors:
             Whether to only allow models that use the safetensors format.
+        generative_type:
+            The type of generative model to benchmark. Only relevant if the model is
+            generative.
     """

     model_languages: list[Language]

@@ -244,6 +254,7 @@ class BenchmarkConfig:
     debug: bool
     run_with_cli: bool
     requires_safetensors: bool
+    generative_type: GenerativeType | None


 class BenchmarkConfigParams(pydantic.BaseModel):

@@ -273,6 +284,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     api_base: str | None
     api_version: str | None
     gpu_memory_utilization: float
+    generative_type: GenerativeType | None
     debug: bool
     run_with_cli: bool
     requires_safetensors: bool
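`generative_type` now flows through both `BenchmarkConfig` and the pydantic `BenchmarkConfigParams`, so it should be settable wherever the other benchmark parameters are (the diff also touches `cli.py`, suggesting a matching CLI flag). A hedged sketch: the keyword comes from the fields above, but the `Benchmarker` constructor accepting it as a plain string is an assumption, not confirmed by this diff:

    from euroeval import Benchmarker

    # Pin the generative type (base / instruction-tuned / reasoning) instead of
    # letting EuroEval infer it from the model. Keyword and value format are
    # assumed from the new BenchmarkConfigParams field.
    benchmarker = Benchmarker(generative_type="reasoning")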
@@ -395,6 +407,21 @@ class DatasetConfig:
             to a 1:1 mapping between the labels and themselves. If None then the mapping
             will be set to the default mapping for the task and language. Defaults to
             None.
+        _allowed_model_types (optional):
+            A list of model types that are allowed to be evaluated on this dataset.
+            Defaults to the one for the task.
+        _allowed_generative_types (optional):
+            A list of generative model types that are allowed to be evaluated on this
+            dataset. If None, all generative model types are allowed. Only relevant if
+            `allowed_model_types` includes generative models. Defaults to the one for
+            the task.
+        _allow_invalid_model_outputs (optional):
+            Whether to allow invalid model outputs. This is only relevant for
+            generative models on classification tasks, where the model may generate an
+            output which is not one of the allowed labels. If True, the model output
+            will be mapped to the closest valid label. If False, the model output will
+            be considered incorrect and the evaluation will be aborted. Defaults to
+            the one for the task.
         splits (optional):
             The names of the splits in the dataset. If not provided, defaults to
             ["train", "val", "test"].
@@ -416,6 +443,9 @@ class DatasetConfig:
     _max_generated_tokens: int | None = None
     _labels: list[str] | None = None
     _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
+    _allowed_model_types: list[ModelType] | None = None
+    _allowed_generative_types: list[GenerativeType] | None = None
+    _allow_invalid_model_outputs: bool | None = None
     splits: list[str] = field(default_factory=lambda: ["train", "val", "test"])
     bootstrap_samples: bool = True
     unofficial: bool = False

@@ -430,7 +460,6 @@ class DatasetConfig:
             if self._prompt_prefix is None
             else self._prompt_prefix
         )
-        prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
         return prompt_prefix

     @property

@@ -443,7 +472,6 @@ class DatasetConfig:
             if self._prompt_template is None
             else self._prompt_template
         )
-        prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
         return prompt_template

     @property

@@ -456,9 +484,6 @@ class DatasetConfig:
             if self._instruction_prompt is None
             else self._instruction_prompt
         )
-        instruction_prompt = instruction_prompt.replace(
-            "{labels_str}", self._labels_str
-        )
         return instruction_prompt

     @property
@@ -500,6 +525,33 @@ class DatasetConfig:
         else:
             return prompt_config.default_prompt_label_mapping

+    @property
+    def allowed_model_types(self) -> list[ModelType]:
+        """A list of model types that are allowed to be evaluated on this dataset."""
+        return (
+            self._allowed_model_types
+            if self._allowed_model_types is not None
+            else self.task.default_allowed_model_types
+        )
+
+    @property
+    def allowed_generative_types(self) -> list[GenerativeType]:
+        """A list of generative model types that are allowed on this dataset."""
+        return (
+            self._allowed_generative_types
+            if self._allowed_generative_types is not None
+            else self.task.default_allowed_generative_types
+        )
+
+    @property
+    def allow_invalid_model_outputs(self) -> bool:
+        """Whether to allow invalid model outputs."""
+        return (
+            self._allow_invalid_model_outputs
+            if self._allow_invalid_model_outputs is not None
+            else self.task.default_allow_invalid_model_outputs
+        )
+
     @property
     def id2label(self) -> dict[int, str]:
         """The mapping from ID to label."""
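All three properties implement one pattern: a dataset-level `_`-prefixed override wins, and otherwise the value falls back to the task-level `default_*` field. A self-contained sketch of that pattern with simplified types (plain strings instead of EuroEval's enums):

    from dataclasses import dataclass, field

    @dataclass
    class Task:
        default_allowed_model_types: list[str] = field(
            default_factory=lambda: ["encoder", "generative"]
        )

    @dataclass
    class DatasetConfig:
        task: Task
        _allowed_model_types: list[str] | None = None

        @property
        def allowed_model_types(self) -> list[str]:
            # Dataset override wins; otherwise fall back to the task default.
            if self._allowed_model_types is not None:
                return self._allowed_model_types
            return self.task.default_allowed_model_types

    # A dataset restricted to generative models, as the Winogrande configs below are.
    config = DatasetConfig(task=Task(), _allowed_model_types=["generative"])
    assert config.allowed_model_types == ["generative"]
    assert DatasetConfig(task=Task()).allowed_model_types == ["encoder", "generative"]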
@@ -519,15 +571,16 @@ class DatasetConfig:
         """Return a hash of the dataset configuration."""
         return hash(self.name)

-    @property
-    def _labels_str(self) -> str:
+    def get_labels_str(self, labels: list[str] | None = None) -> str:
         """Converts a set of labels to a natural string, in the specified language.

         If the task is NER, we separate using 'and' and use the mapped labels instead of
         the BIO NER labels.

         Args:
-
+            labels (optional):
+                The labels to convert to a natural string. If None, uses all the labels
+                in the dataset. Defaults to None.

         Returns:
             The natural string representation of the labels in specified language.

@@ -539,16 +592,17 @@ class DatasetConfig:
         else:
             sep_word = main_language.or_separator

-
-
-
-
-
-
-
+        if labels is None:
+            labels = list()
+            for english_label in self.labels:
+                if english_label not in self.prompt_label_mapping:
+                    continue
+                label = self.prompt_label_mapping[english_label]
+                if label not in labels:
+                    labels.append(label)

         # Convert labels to single-quoted labels - and remove duplicates
-        quoted_labels = [f"'{label}'" for label in
+        quoted_labels = [f"'{label}'" for label in labels]

         if not quoted_labels:
             return ""
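`get_labels_str` joins single-quoted labels with a language-specific separator word (`or_separator` in the context above, 'and' for NER). A self-contained sketch of that formatting, with the separator hard-coded to English 'or' where EuroEval reads it from the language object:

    def labels_str(labels: list[str], sep_word: str = "or") -> str:
        """Render labels as a natural string: ['a', 'b'] -> "'a' or 'b'"."""
        quoted = [f"'{label}'" for label in labels]
        if not quoted:
            return ""
        if len(quoted) == 1:
            return quoted[0]
        return ", ".join(quoted[:-1]) + f" {sep_word} " + quoted[-1]

    assert labels_str(["a", "b"]) == "'a' or 'b'"
    assert labels_str(["a", "b", "c"]) == "'a', 'b' or 'c'"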
@@ -569,6 +623,8 @@ class ModelConfig:
             The ID of the model.
         revision:
             The revision of the model.
+        param:
+            The parameter of the model, or None if the model has no parameters.
         task:
             The task that the model was trained on.
         languages:

@@ -590,6 +646,7 @@ class ModelConfig:

     model_id: str
     revision: str
+    param: str | None
     task: str
     languages: list[Language]
     inference_backend: "InferenceBackend"
@@ -703,3 +760,21 @@ class PromptConfig:
     default_prompt_template: str
     default_instruction_prompt: str
     default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]
+
+
+@dataclass
+class ModelIdComponents:
+    """A model ID split into its components.
+
+    Attributes:
+        model_id:
+            The main model ID without revision or parameters.
+        revision:
+            The revision of the model, if any.
+        param:
+            The parameter of the model, if any.
+    """
+
+    model_id: str
+    revision: str
+    param: str | None
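The new `ModelIdComponents` dataclass pairs with the `param` field added to `ModelConfig`, suggesting that model IDs can now carry a parameter alongside a revision. The parser itself is not shown in this diff, so the separators below ('@' for the revision, '#' for the parameter) and the 'main' default are assumptions purely for illustration:

    from euroeval.data_models import ModelIdComponents  # dataclass added above

    def split_model_id(raw: str) -> ModelIdComponents:
        """Hypothetical splitter; EuroEval's real syntax may differ."""
        param: str | None = None
        revision = "main"
        if "#" in raw:  # assumed parameter separator
            raw, param = raw.rsplit("#", 1)
        if "@" in raw:  # assumed revision separator
            raw, revision = raw.rsplit("@", 1)
        return ModelIdComponents(model_id=raw, revision=revision, param=param)

    print(split_model_id("org/model@main#8b"))
    # -> ModelIdComponents(model_id='org/model', revision='main', param='8b')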
euroeval/dataset_configs/__init__.py
CHANGED

@@ -15,6 +15,7 @@ from .icelandic import *  # noqa: F403
 from .italian import *  # noqa: F403
 from .latvian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
+from .polish import *  # noqa: F403
 from .portuguese import *  # noqa: F403
 from .spanish import *  # noqa: F403
 from .swedish import *  # noqa: F403
euroeval/dataset_configs/danish.py
CHANGED

@@ -1,6 +1,7 @@
 """All Danish dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import DA
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -84,7 +85,6 @@ EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
 )


@@ -150,6 +150,19 @@ GOLDENSWAG_DA_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_DA_CONFIG = DatasetConfig(
+    name="winogrande-da",
+    pretty_name="the Danish common-sense reasoning dataset Winogrande-da, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-da",
+    task=COMMON_SENSE,
+    languages=[DA],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
     name="european-values-situational-da",
     pretty_name="the Danish version of the European values evaluation dataset, where "

@@ -159,7 +172,6 @@ EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
     unofficial=True,
 )

@@ -172,6 +184,5 @@ EUROPEAN_VALUES_COMPLETIONS_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
     unofficial=True,
 )
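Every Winogrande config added in this release follows the Danish one above: a binary 'a'/'b' choice restricted to generative models via `_allowed_model_types`. Running one should then work like any other EuroEval dataset; a sketch using EuroEval's documented Python API (the model ID is a placeholder):

    from euroeval import Benchmarker

    benchmarker = Benchmarker()
    benchmarker.benchmark(model="<huggingface-model-id>", dataset="winogrande-da")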
euroeval/dataset_configs/dutch.py
CHANGED

@@ -1,6 +1,7 @@
 """All Dutch dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import NL
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -142,6 +143,19 @@ GOLDENSWAG_NL_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_NL_CONFIG = DatasetConfig(
+    name="winogrande-nl",
+    pretty_name="the Dutch common-sense reasoning dataset Winogrande-nl, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-nl",
+    task=COMMON_SENSE,
+    languages=[NL],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_NL_CONFIG = DatasetConfig(
     name="european-values-situational-nl",
     pretty_name="the Dutch version of the European values evaluation dataset, where "
euroeval/dataset_configs/english.py
CHANGED

@@ -1,6 +1,7 @@
 """All English dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import EN
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -80,6 +81,15 @@ EUROPEAN_VALUES_EN_CONFIG = DatasetConfig(

 ### Unofficial datasets ###

+XQUAD_EN_CONFIG = DatasetConfig(
+    name="xquad-en",
+    pretty_name="the English version of the reading comprehension dataset XQuAD",
+    huggingface_id="EuroEval/xquad-en",
+    task=RC,
+    languages=[EN],
+    unofficial=True,
+)
+
 ARC_CONFIG = DatasetConfig(
     name="arc",
     pretty_name="the truncated version of the English knowledge dataset ARC",

@@ -117,6 +127,18 @@ MULTI_WIKI_QA_EN_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_CONFIG = DatasetConfig(
+    name="winogrande",
+    pretty_name="the English common-sense reasoning dataset Winogrande",
+    huggingface_id="EuroEval/winogrande-en",
+    task=COMMON_SENSE,
+    languages=[EN],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_EN_CONFIG = DatasetConfig(
     name="european-values-situational-en",
     pretty_name="the English version of the European values evaluation dataset, where "
euroeval/dataset_configs/estonian.py
CHANGED

@@ -47,13 +47,12 @@ ERR_NEWS_CONFIG = DatasetConfig(
     languages=[ET],
 )

-
-    name="
-    pretty_name="the Estonian knowledge
-    huggingface_id="EuroEval/
+TRIVIA_ET_CONFIG = DatasetConfig(
+    name="trivia-et",
+    pretty_name="the Estonian knowledge dataset Trivia-et",
+    huggingface_id="EuroEval/trivia-et",
     task=KNOW,
     languages=[ET],
-    _labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"],
 )

 WINOGRANDE_ET_CONFIG = DatasetConfig(

@@ -82,8 +81,7 @@ EUROPEAN_VALUES_ET_CONFIG = DatasetConfig(
     _instruction_prompt="{text}",
 )

-
-### Unofficial datasets ###
+### Unofficial datasets ###

 SCALA_ET_CONFIG = DatasetConfig(
     name="scala-et",

@@ -93,3 +91,13 @@ SCALA_ET_CONFIG = DatasetConfig(
     languages=[ET],
     unofficial=True,
 )
+
+EXAM_ET_CONFIG = DatasetConfig(
+    name="exam-et",
+    pretty_name="the Estonian knowledge assessment dataset Exam-et",
+    huggingface_id="EuroEval/exam-et",
+    task=KNOW,
+    languages=[ET],
+    _labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"],
+    unofficial=True,
+)
euroeval/dataset_configs/finnish.py
CHANGED

@@ -1,6 +1,7 @@
 """All Finnish dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import FI
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, LA, MCRC, NER, RC, SENT, SUMM

@@ -101,6 +102,19 @@ GOLDENSWAG_FI_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_FI_CONFIG = DatasetConfig(
+    name="winogrande-fi",
+    pretty_name="the Finnish common-sense reasoning dataset Winogrande-fi, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-fi",
+    task=COMMON_SENSE,
+    languages=[FI],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_FI_CONFIG = DatasetConfig(
     name="european-values-situational-fi",
     pretty_name="the Finnish version of the European values evaluation dataset, where "
euroeval/dataset_configs/french.py
CHANGED

@@ -1,6 +1,7 @@
 """All French dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import FR
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -113,6 +114,19 @@ GOLDENSWAG_FR_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_FR_CONFIG = DatasetConfig(
+    name="winogrande-fr",
+    pretty_name="the French common-sense reasoning dataset Winogrande-fr, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-fr",
+    task=COMMON_SENSE,
+    languages=[FR],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_FR_CONFIG = DatasetConfig(
     name="european-values-situational-fr",
     pretty_name="the French version of the European values evaluation dataset, where "
euroeval/dataset_configs/german.py
CHANGED

@@ -1,6 +1,7 @@
 """All German dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import DE
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -81,6 +82,15 @@ EUROPEAN_VALUES_DE_CONFIG = DatasetConfig(

 ### Unofficial datasets ###

+XQUAD_DE_CONFIG = DatasetConfig(
+    name="xquad-de",
+    pretty_name="the German version of the reading comprehension dataset XQuAD",
+    huggingface_id="EuroEval/xquad-de",
+    task=RC,
+    languages=[DE],
+    unofficial=True,
+)
+
 ARC_DE_CONFIG = DatasetConfig(
     name="arc-de",
     pretty_name="the truncated version of the German knowledge dataset ARC-de, "

@@ -121,6 +131,19 @@ GOLDENSWAG_DE_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_DE_CONFIG = DatasetConfig(
+    name="winogrande-de",
+    pretty_name="the German common-sense reasoning dataset Winogrande-de, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-de",
+    task=COMMON_SENSE,
+    languages=[DE],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_DE_CONFIG = DatasetConfig(
     name="european-values-situational-de",
     pretty_name="the German version of the European values evaluation dataset, where "
euroeval/dataset_configs/italian.py
CHANGED

@@ -1,6 +1,7 @@
 """All Italian dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import IT
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -121,6 +122,19 @@ GOLDENSWAG_IT_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_IT_CONFIG = DatasetConfig(
+    name="winogrande-it",
+    pretty_name="the Italian common-sense reasoning dataset Winogrande-it, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-it",
+    task=COMMON_SENSE,
+    languages=[IT],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_IT_CONFIG = DatasetConfig(
     name="european-values-situational-it",
     pretty_name="the Italian version of the European values evaluation dataset, "
euroeval/dataset_configs/latvian.py
CHANGED

@@ -1,6 +1,7 @@
 """All Latvian dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import LV
 from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM

@@ -79,3 +80,16 @@ WIKIANN_LV_CONFIG = DatasetConfig(
     languages=[LV],
     unofficial=True,
 )
+
+WINOGRANDE_LV_CONFIG = DatasetConfig(
+    name="winogrande-lv",
+    pretty_name="the Latvian common-sense reasoning dataset Winogrande-lv, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-lv",
+    task=COMMON_SENSE,
+    languages=[LV],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
euroeval/dataset_configs/norwegian.py
CHANGED

@@ -1,6 +1,7 @@
 """All Norwegian dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import NB, NN, NO
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -216,6 +217,19 @@ MULTI_WIKI_QA_NN_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_NO_CONFIG = DatasetConfig(
+    name="winogrande-no",
+    pretty_name="the Norwegian common-sense reasoning dataset Winogrande-no, "
+    "translated from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-no",
+    task=COMMON_SENSE,
+    languages=[NB, NN, NO],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_NO_CONFIG = DatasetConfig(
     name="european-values-situational-no",
     pretty_name="the Norwegian version of the European values evaluation dataset, "
|