EuroEval 16.0.0-py3-none-any.whl → 16.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (51)
  1. euroeval/__init__.py +5 -0
  2. euroeval/benchmark_config_factory.py +6 -1
  3. euroeval/benchmark_modules/base.py +2 -0
  4. euroeval/benchmark_modules/fresh.py +7 -1
  5. euroeval/benchmark_modules/hf.py +26 -21
  6. euroeval/benchmark_modules/litellm.py +258 -131
  7. euroeval/benchmark_modules/vllm.py +120 -68
  8. euroeval/benchmarker.py +11 -2
  9. euroeval/cli.py +14 -1
  10. euroeval/constants.py +7 -1
  11. euroeval/data_models.py +95 -20
  12. euroeval/dataset_configs/__init__.py +1 -0
  13. euroeval/dataset_configs/danish.py +14 -3
  14. euroeval/dataset_configs/dutch.py +14 -0
  15. euroeval/dataset_configs/english.py +22 -0
  16. euroeval/dataset_configs/estonian.py +15 -7
  17. euroeval/dataset_configs/finnish.py +14 -0
  18. euroeval/dataset_configs/french.py +14 -0
  19. euroeval/dataset_configs/german.py +23 -0
  20. euroeval/dataset_configs/italian.py +14 -0
  21. euroeval/dataset_configs/latvian.py +14 -0
  22. euroeval/dataset_configs/norwegian.py +14 -0
  23. euroeval/dataset_configs/polish.py +126 -0
  24. euroeval/dataset_configs/portuguese.py +14 -0
  25. euroeval/dataset_configs/spanish.py +14 -0
  26. euroeval/dataset_configs/swedish.py +25 -0
  27. euroeval/enums.py +12 -0
  28. euroeval/generation.py +17 -8
  29. euroeval/generation_utils.py +102 -16
  30. euroeval/metrics/pipeline.py +51 -9
  31. euroeval/model_cache.py +13 -1
  32. euroeval/prompt_templates/linguistic_acceptability.py +9 -0
  33. euroeval/prompt_templates/multiple_choice.py +27 -1
  34. euroeval/prompt_templates/named_entity_recognition.py +20 -0
  35. euroeval/prompt_templates/reading_comprehension.py +11 -0
  36. euroeval/prompt_templates/sentiment_classification.py +15 -0
  37. euroeval/prompt_templates/summarization.py +27 -1
  38. euroeval/scores.py +5 -0
  39. euroeval/task_group_utils/multiple_choice_classification.py +2 -2
  40. euroeval/task_group_utils/question_answering.py +29 -29
  41. euroeval/task_group_utils/sequence_classification.py +71 -81
  42. euroeval/task_group_utils/token_classification.py +17 -3
  43. euroeval/tasks.py +12 -10
  44. euroeval/{tokenization_utils.py → tokenisation_utils.py} +41 -25
  45. euroeval/utils.py +67 -3
  46. {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/METADATA +3 -1
  47. euroeval-16.1.0.dist-info/RECORD +70 -0
  48. euroeval-16.0.0.dist-info/RECORD +0 -69
  49. {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/WHEEL +0 -0
  50. {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/entry_points.txt +0 -0
  51. {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/licenses/LICENSE +0 -0
euroeval/data_models.py CHANGED
@@ -118,13 +118,19 @@ class Task:
             log probabilities for the generated tokens. Defaults to False.
         requires_logprobs (optional):
             Whether the task requires log probabilities. Implies `uses_logprobs`.
-        allowed_model_types (optional):
+        default_allowed_model_types (optional):
             A list of model types that are allowed to be evaluated on this task.
             Defaults to all model types being allowed.
-        allowed_generative_types (optional):
+        default_allowed_generative_types (optional):
             A list of generative model types that are allowed to be evaluated on this
             task. If None, all generative model types are allowed. Only relevant if
             `allowed_model_types` includes generative models.
+        default_allow_invalid_model_outputs (optional):
+            Whether to allow invalid model outputs. This is only relevant for generative
+            models on classification tasks, where the model may generate an output
+            which is not one of the allowed labels. If True, the model output will be
+            mapped to the closest valid label. If False, the model output will be
+            considered incorrect and the evaluation will be aborted. Defaults to True.
     """

     name: str
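The "closest valid label" behaviour described in the new `default_allow_invalid_model_outputs` docstring can be pinned down with a small self-contained sketch. This illustrates the described behaviour only, using `difflib` from the standard library; the helper name is hypothetical and EuroEval's actual matching logic may differ:

import difflib

def map_to_closest_label(output: str, labels: list[str]) -> str | None:
    """Hypothetical sketch: map a raw model output to the nearest allowed label."""
    # cutoff=0.0 always picks the best match, mirroring "mapped to the closest
    # valid label" rather than rejecting dissimilar outputs.
    matches = difflib.get_close_matches(
        output.strip().lower(), [label.lower() for label in labels], n=1, cutoff=0.0
    )
    return matches[0] if matches else None

print(map_to_closest_label("Positive!", ["positive", "negative"]))  # -> positive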
@@ -138,16 +144,17 @@ class Task:
     uses_structured_output: bool = False
     uses_logprobs: bool = False
     requires_logprobs: bool = False
-    allowed_model_types: list[ModelType] = field(
+    default_allowed_model_types: list[ModelType] = field(
         default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
     )
-    allowed_generative_types: list[GenerativeType] = field(
+    default_allowed_generative_types: list[GenerativeType] = field(
         default_factory=lambda: [
             GenerativeType.BASE,
             GenerativeType.INSTRUCTION_TUNED,
             GenerativeType.REASONING,
         ]
     )
+    default_allow_invalid_model_outputs: bool = True

     def __post_init__(self) -> None:
         """Post-initialisation checks."""
@@ -218,6 +225,9 @@ class BenchmarkConfig:
             Whether the benchmark is being run with the CLI.
         requires_safetensors:
             Whether to only allow models that use the safetensors format.
+        generative_type:
+            The type of generative model to benchmark. Only relevant if the model is
+            generative.
     """

     model_languages: list[Language]
@@ -244,6 +254,7 @@
     debug: bool
     run_with_cli: bool
     requires_safetensors: bool
+    generative_type: GenerativeType | None


 class BenchmarkConfigParams(pydantic.BaseModel):
@@ -273,6 +284,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     api_base: str | None
     api_version: str | None
     gpu_memory_utilization: float
+    generative_type: GenerativeType | None
     debug: bool
     run_with_cli: bool
     requires_safetensors: bool
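Since `generative_type` now flows through both `BenchmarkConfigParams` and `BenchmarkConfig`, it should be settable wherever benchmark parameters are passed in, letting the user force a model to be treated as, say, a reasoning model. A hedged sketch of what that could look like from Python; the keyword mirrors the new config field, while the import path, the exact `Benchmarker` signature and the model ID are assumptions not confirmed by this diff:

from euroeval import Benchmarker
from euroeval.enums import GenerativeType  # assumed import path

# Force the model to be evaluated as a reasoning model instead of relying on
# auto-detection. The keyword mirrors the new `generative_type` config field.
benchmarker = Benchmarker(generative_type=GenerativeType.REASONING)
benchmarker(model="some-org/some-model")  # hypothetical model ID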
@@ -395,6 +407,21 @@ class DatasetConfig:
             to a 1:1 mapping between the labels and themselves. If None then the mapping
             will be set to the default mapping for the task and language. Defaults to
             None.
+        _allowed_model_types (optional):
+            A list of model types that are allowed to be evaluated on this dataset.
+            Defaults to the one for the task.
+        _allowed_generative_types (optional):
+            A list of generative model types that are allowed to be evaluated on this
+            dataset. If None, all generative model types are allowed. Only relevant if
+            `allowed_model_types` includes generative models. Defaults to the one for
+            the task.
+        _allow_invalid_model_outputs (optional):
+            Whether to allow invalid model outputs. This is only relevant for
+            generative models on classification tasks, where the model may generate an
+            output which is not one of the allowed labels. If True, the model output
+            will be mapped to the closest valid label. If False, the model output will
+            be considered incorrect and the evaluation will be aborted. Defaults to
+            the one for the task.
         splits (optional):
             The names of the splits in the dataset. If not provided, defaults to
             ["train", "val", "test"].
@@ -416,6 +443,9 @@
     _max_generated_tokens: int | None = None
     _labels: list[str] | None = None
     _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
+    _allowed_model_types: list[ModelType] | None = None
+    _allowed_generative_types: list[GenerativeType] | None = None
+    _allow_invalid_model_outputs: bool | None = None
     splits: list[str] = field(default_factory=lambda: ["train", "val", "test"])
     bootstrap_samples: bool = True
     unofficial: bool = False
@@ -430,7 +460,6 @@
             if self._prompt_prefix is None
             else self._prompt_prefix
         )
-        prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
         return prompt_prefix

     @property
@@ -443,7 +472,6 @@
             if self._prompt_template is None
             else self._prompt_template
         )
-        prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
         return prompt_template

     @property
@@ -456,9 +484,6 @@
             if self._instruction_prompt is None
             else self._instruction_prompt
         )
-        instruction_prompt = instruction_prompt.replace(
-            "{labels_str}", self._labels_str
-        )
         return instruction_prompt

     @property
@@ -500,6 +525,33 @@
         else:
             return prompt_config.default_prompt_label_mapping

+    @property
+    def allowed_model_types(self) -> list[ModelType]:
+        """A list of model types that are allowed to be evaluated on this dataset."""
+        return (
+            self._allowed_model_types
+            if self._allowed_model_types is not None
+            else self.task.default_allowed_model_types
+        )
+
+    @property
+    def allowed_generative_types(self) -> list[GenerativeType]:
+        """A list of generative model types that are allowed on this dataset."""
+        return (
+            self._allowed_generative_types
+            if self._allowed_generative_types is not None
+            else self.task.default_allowed_generative_types
+        )
+
+    @property
+    def allow_invalid_model_outputs(self) -> bool:
+        """Whether to allow invalid model outputs."""
+        return (
+            self._allow_invalid_model_outputs
+            if self._allow_invalid_model_outputs is not None
+            else self.task.default_allow_invalid_model_outputs
+        )
+
     @property
     def id2label(self) -> dict[int, str]:
         """The mapping from ID to label."""
@@ -519,15 +571,16 @@
         """Return a hash of the dataset configuration."""
         return hash(self.name)

-    @property
-    def _labels_str(self) -> str:
+    def get_labels_str(self, labels: list[str] | None = None) -> str:
         """Converts a set of labels to a natural string, in the specified language.

         If the task is NER, we separate using 'and' and use the mapped labels instead of
         the BIO NER labels.

         Args:
-            language: The language to be used when converting the labels.
+            labels (optional):
+                The labels to convert to a natural string. If None, uses all the labels
+                in the dataset. Defaults to None.

         Returns:
             The natural string representation of the labels in specified language.
@@ -539,16 +592,17 @@
         else:
             sep_word = main_language.or_separator

-        local_labels: list[str] = []
-        for label in self.labels:
-            if label not in self.prompt_label_mapping:
-                continue
-            local_label = self.prompt_label_mapping[label]
-            if local_label not in local_labels:
-                local_labels.append(local_label)
+        if labels is None:
+            labels = list()
+            for english_label in self.labels:
+                if english_label not in self.prompt_label_mapping:
+                    continue
+                label = self.prompt_label_mapping[english_label]
+                if label not in labels:
+                    labels.append(label)

         # Convert labels to single-quoted labels - and remove duplicates
-        quoted_labels = [f"'{label}'" for label in local_labels]
+        quoted_labels = [f"'{label}'" for label in labels]

         if not quoted_labels:
             return ""
@@ -569,6 +623,8 @@ class ModelConfig:
             The ID of the model.
         revision:
             The revision of the model.
+        param:
+            The parameter of the model, or None if the model has no parameters.
         task:
             The task that the model was trained on.
         languages:
@@ -590,6 +646,7 @@

     model_id: str
     revision: str
+    param: str | None
     task: str
     languages: list[Language]
     inference_backend: "InferenceBackend"
@@ -703,3 +760,21 @@
     default_prompt_template: str
     default_instruction_prompt: str
     default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]
+
+
+@dataclass
+class ModelIdComponents:
+    """A model ID split into its components.
+
+    Attributes:
+        model_id:
+            The main model ID without revision or parameters.
+        revision:
+            The revision of the model, if any.
+        param:
+            The parameter of the model, if any.
+    """
+
+    model_id: str
+    revision: str
+    param: str | None
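Together with the new `param` field on `ModelConfig`, `ModelIdComponents` suggests that a model ID can now carry an optional parameter alongside its revision. A hypothetical parser for illustration only; the `@revision`/`#param` delimiter syntax and the "main" default are assumptions, not confirmed by this diff:

from dataclasses import dataclass

@dataclass
class ModelIdComponents:  # mirrors the dataclass added in this release
    model_id: str
    revision: str
    param: str | None

def split_model_id(raw: str) -> ModelIdComponents:
    # Assumed syntax "model_id@revision#param"; the delimiters are illustrative.
    base, _, param = raw.partition("#")
    model_id, _, revision = base.partition("@")
    return ModelIdComponents(
        model_id=model_id,
        revision=revision or "main",  # assumed default revision
        param=param or None,
    )

print(split_model_id("org/model@v1.0#8b"))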
euroeval/dataset_configs/__init__.py CHANGED
@@ -15,6 +15,7 @@ from .icelandic import * # noqa: F403
 from .italian import *  # noqa: F403
 from .latvian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
+from .polish import *  # noqa: F403
 from .portuguese import *  # noqa: F403
 from .spanish import *  # noqa: F403
 from .swedish import *  # noqa: F403
euroeval/dataset_configs/danish.py CHANGED
@@ -1,6 +1,7 @@
 """All Danish dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import DA
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -84,7 +85,6 @@ EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
 )


@@ -150,6 +150,19 @@ GOLDENSWAG_DA_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_DA_CONFIG = DatasetConfig(
+    name="winogrande-da",
+    pretty_name="the Danish common-sense reasoning dataset Winogrande-da, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-da",
+    task=COMMON_SENSE,
+    languages=[DA],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
     name="european-values-situational-da",
     pretty_name="the Danish version of the European values evaluation dataset, where "
@@ -159,7 +172,6 @@ EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
     unofficial=True,
 )

@@ -172,6 +184,5 @@ EUROPEAN_VALUES_COMPLETIONS_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["test"],
     bootstrap_samples=False,
-    _instruction_prompt="{text}",
     unofficial=True,
 )
euroeval/dataset_configs/dutch.py CHANGED
@@ -1,6 +1,7 @@
 """All Dutch dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import NL
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -142,6 +143,19 @@ GOLDENSWAG_NL_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_NL_CONFIG = DatasetConfig(
+    name="winogrande-nl",
+    pretty_name="the Dutch common-sense reasoning dataset Winogrande-nl, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-nl",
+    task=COMMON_SENSE,
+    languages=[NL],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_NL_CONFIG = DatasetConfig(
     name="european-values-situational-nl",
     pretty_name="the Dutch version of the European values evaluation dataset, where "
euroeval/dataset_configs/english.py CHANGED
@@ -1,6 +1,7 @@
 """All English dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import EN
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -80,6 +81,15 @@ EUROPEAN_VALUES_EN_CONFIG = DatasetConfig(

 ### Unofficial datasets ###

+XQUAD_EN_CONFIG = DatasetConfig(
+    name="xquad-en",
+    pretty_name="the English version of the reading comprehension dataset XQuAD",
+    huggingface_id="EuroEval/xquad-en",
+    task=RC,
+    languages=[EN],
+    unofficial=True,
+)
+
 ARC_CONFIG = DatasetConfig(
     name="arc",
     pretty_name="the truncated version of the English knowledge dataset ARC",
@@ -117,6 +127,18 @@ MULTI_WIKI_QA_EN_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_CONFIG = DatasetConfig(
+    name="winogrande",
+    pretty_name="the English common-sense reasoning dataset Winogrande",
+    huggingface_id="EuroEval/winogrande-en",
+    task=COMMON_SENSE,
+    languages=[EN],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_EN_CONFIG = DatasetConfig(
     name="european-values-situational-en",
     pretty_name="the English version of the European values evaluation dataset, where "
euroeval/dataset_configs/estonian.py CHANGED
@@ -47,13 +47,12 @@ ERR_NEWS_CONFIG = DatasetConfig(
     languages=[ET],
 )

-EXAM_ET_CONFIG = DatasetConfig(
-    name="exam-et",
-    pretty_name="the Estonian knowledge assessment dataset Exam-et",
-    huggingface_id="EuroEval/exam-et",
+TRIVIA_ET_CONFIG = DatasetConfig(
+    name="trivia-et",
+    pretty_name="the Estonian knowledge dataset Trivia-et",
+    huggingface_id="EuroEval/trivia-et",
     task=KNOW,
     languages=[ET],
-    _labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"],
 )

 WINOGRANDE_ET_CONFIG = DatasetConfig(
@@ -82,8 +81,7 @@ EUROPEAN_VALUES_ET_CONFIG = DatasetConfig(
     _instruction_prompt="{text}",
 )

-
-### Unofficial datasets ###
+### Unofficial datasets ###

 SCALA_ET_CONFIG = DatasetConfig(
     name="scala-et",
@@ -93,3 +91,13 @@ SCALA_ET_CONFIG = DatasetConfig(
     languages=[ET],
     unofficial=True,
 )
+
+EXAM_ET_CONFIG = DatasetConfig(
+    name="exam-et",
+    pretty_name="the Estonian knowledge assessment dataset Exam-et",
+    huggingface_id="EuroEval/exam-et",
+    task=KNOW,
+    languages=[ET],
+    _labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"],
+    unofficial=True,
+)
euroeval/dataset_configs/finnish.py CHANGED
@@ -1,6 +1,7 @@
 """All Finnish dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import FI
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, LA, MCRC, NER, RC, SENT, SUMM

@@ -101,6 +102,19 @@ GOLDENSWAG_FI_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_FI_CONFIG = DatasetConfig(
+    name="winogrande-fi",
+    pretty_name="the Finnish common-sense reasoning dataset Winogrande-fi, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-fi",
+    task=COMMON_SENSE,
+    languages=[FI],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_FI_CONFIG = DatasetConfig(
     name="european-values-situational-fi",
     pretty_name="the Finnish version of the European values evaluation dataset, where "
euroeval/dataset_configs/french.py CHANGED
@@ -1,6 +1,7 @@
 """All French dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import FR
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -113,6 +114,19 @@ GOLDENSWAG_FR_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_FR_CONFIG = DatasetConfig(
+    name="winogrande-fr",
+    pretty_name="the French common-sense reasoning dataset Winogrande-fr, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-fr",
+    task=COMMON_SENSE,
+    languages=[FR],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_FR_CONFIG = DatasetConfig(
     name="european-values-situational-fr",
     pretty_name="the French version of the European values evaluation dataset, where "
euroeval/dataset_configs/german.py CHANGED
@@ -1,6 +1,7 @@
 """All German dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import DE
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -81,6 +82,15 @@ EUROPEAN_VALUES_DE_CONFIG = DatasetConfig(

 ### Unofficial datasets ###

+XQUAD_DE_CONFIG = DatasetConfig(
+    name="xquad-de",
+    pretty_name="the German version of the reading comprehension dataset XQuAD",
+    huggingface_id="EuroEval/xquad-de",
+    task=RC,
+    languages=[DE],
+    unofficial=True,
+)
+
 ARC_DE_CONFIG = DatasetConfig(
     name="arc-de",
     pretty_name="the truncated version of the German knowledge dataset ARC-de, "
@@ -121,6 +131,19 @@ GOLDENSWAG_DE_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_DE_CONFIG = DatasetConfig(
+    name="winogrande-de",
+    pretty_name="the German common-sense reasoning dataset Winogrande-de, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-de",
+    task=COMMON_SENSE,
+    languages=[DE],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_DE_CONFIG = DatasetConfig(
     name="european-values-situational-de",
     pretty_name="the German version of the European values evaluation dataset, where "
euroeval/dataset_configs/italian.py CHANGED
@@ -1,6 +1,7 @@
 """All Italian dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import IT
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -121,6 +122,19 @@ GOLDENSWAG_IT_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_IT_CONFIG = DatasetConfig(
+    name="winogrande-it",
+    pretty_name="the Italian common-sense reasoning dataset Winogrande-it, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-it",
+    task=COMMON_SENSE,
+    languages=[IT],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_IT_CONFIG = DatasetConfig(
     name="european-values-situational-it",
     pretty_name="the Italian version of the European values evaluation dataset, "
euroeval/dataset_configs/latvian.py CHANGED
@@ -1,6 +1,7 @@
 """All Latvian dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import LV
 from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM

@@ -79,3 +80,16 @@ WIKIANN_LV_CONFIG = DatasetConfig(
     languages=[LV],
     unofficial=True,
 )
+
+WINOGRANDE_LV_CONFIG = DatasetConfig(
+    name="winogrande-lv",
+    pretty_name="the Latvian common-sense reasoning dataset Winogrande-lv, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-lv",
+    task=COMMON_SENSE,
+    languages=[LV],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
euroeval/dataset_configs/norwegian.py CHANGED
@@ -1,6 +1,7 @@
 """All Norwegian dataset configurations used in EuroEval."""

 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import NB, NN, NO
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

@@ -216,6 +217,19 @@ MULTI_WIKI_QA_NN_CONFIG = DatasetConfig(
     unofficial=True,
 )

+WINOGRANDE_NO_CONFIG = DatasetConfig(
+    name="winogrande-no",
+    pretty_name="the Norwegian common-sense reasoning dataset Winogrande-no, "
+    "translated from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-no",
+    task=COMMON_SENSE,
+    languages=[NB, NN, NO],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_NO_CONFIG = DatasetConfig(
     name="european-values-situational-no",
     pretty_name="the Norwegian version of the European values evaluation dataset, "