EuroEval 15.7.0__py3-none-any.whl → 15.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/benchmark_config_factory.py +1 -1
- euroeval/benchmark_modules/litellm.py +27 -258
- euroeval/benchmark_modules/vllm.py +14 -304
- euroeval/benchmarker.py +14 -11
- euroeval/data_models.py +3 -1
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/dutch.py +5 -16
- euroeval/dataset_configs/finnish.py +11 -9
- euroeval/generation_utils.py +346 -0
- euroeval/languages.py +1 -1
- euroeval/scores.py +7 -1
- euroeval/task_group_utils/sequence_classification.py +46 -11
- euroeval/tokenization_utils.py +50 -14
- {euroeval-15.7.0.dist-info → euroeval-15.7.2.dist-info}/METADATA +1 -1
- {euroeval-15.7.0.dist-info → euroeval-15.7.2.dist-info}/RECORD +18 -17
- {euroeval-15.7.0.dist-info → euroeval-15.7.2.dist-info}/WHEEL +0 -0
- {euroeval-15.7.0.dist-info → euroeval-15.7.2.dist-info}/entry_points.txt +0 -0
- {euroeval-15.7.0.dist-info → euroeval-15.7.2.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -3,11 +3,9 @@
 import collections.abc as c
 import contextlib
 import importlib.util
-import itertools as it
 import json
 import logging
 import os
-import random
 import re
 import sys
 import typing as t

@@ -56,6 +54,7 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
+from ..generation_utils import apply_prompt, extract_few_shot_examples
 from ..languages import get_all_languages
 from ..task_group_utils import (
     question_answering,
@@ -258,14 +257,22 @@ class VLLMModel(HuggingFaceEncoderModel):
         )

         if self.benchmark_config.few_shot:
-            few_shot_examples =
-                dataset=dataset,
+            few_shot_examples = extract_few_shot_examples(
+                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
             )
         else:
             few_shot_examples = list()

         dataset["test"] = dataset["test"].map(
-            partial(
+            partial(
+                apply_prompt,
+                few_shot_examples=few_shot_examples,
+                model_config=self.model_config,
+                dataset_config=self.dataset_config,
+                instruction_model=self.buffer["instruction_model"],
+                always_populate_text_field=True,
+                tokenizer=self._tokenizer,
+            ),
             batched=True,
             load_from_cache_file=False,
             keep_in_memory=True,
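The two helpers imported in the hunk above replace the private _extract_few_shot_examples and _apply_prompt methods removed later in this file, and live in the new euroeval/generation_utils.py module (+346 lines in this release). Their definitions are not part of this diff; judging only from the call sites shown here, their signatures are presumably along these lines (a hypothetical sketch, not the actual source):

    # Hypothetical sketch inferred from the vllm.py call sites above; the real
    # module may differ in parameter names, defaults and return types.
    def extract_few_shot_examples(
        dataset: "DatasetDict", dataset_config: "DatasetConfig", itr_idx: int
    ) -> list[dict]:
        """Pick the few-shot examples for iteration itr_idx from the train split."""
        ...

    def apply_prompt(
        examples: dict,
        few_shot_examples: list[dict],
        model_config: "ModelConfig",
        dataset_config: "DatasetConfig",
        instruction_model: bool,
        always_populate_text_field: bool,
        tokenizer: "PreTrainedTokenizerBase",
    ) -> dict:
        """Format a batch of examples into prompts, mirroring the keyword
        arguments passed via partial() in the hunk above."""
        ...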
@@ -439,6 +446,7 @@ class VLLMModel(HuggingFaceEncoderModel):
             logger.info(
                 "Prompts are too long, so truncating them and trying again..."
             )
+            logger.debug(f"The error message was: {str(e)}")
             tokenized_prompts = self._tokenizer(
                 text=prompts,
                 truncation=True,

@@ -499,7 +507,6 @@ class VLLMModel(HuggingFaceEncoderModel):

         # Sanity check
         if len(completions) != len(prompts):
-            breakpoint()
             raise InvalidBenchmark(
                 f"Expected {len(prompts):,} completions, but got {len(completions):,}."
             )
@@ -615,303 +622,6 @@ class VLLMModel(HuggingFaceEncoderModel):

         return model_config

-    def _extract_few_shot_examples(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> list[dict[str, t.Any]]:
-        """Extract few-shot examples from a dataset.
-
-        This will always extract the examples from the training split.
-
-        We ensure that the few-shot examples are unique by picking them one at a time.
-
-        Args:
-            dataset:
-                The dataset to extract the few-shot examples from.
-            task:
-                The task that is being benchmarked.
-            itr_idx:
-                The index of the dataset in the iterator.
-
-        Returns:
-            The few-shot examples.
-        """
-        random_seed = 4242 + itr_idx
-        num_few_shots = self.dataset_config.num_few_shot_examples
-        few_shot_examples: list[dict[str, t.Any]] = list()
-        shuffled_train = dataset["train"].shuffle(seed=random_seed)
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                labels = it.cycle(self.dataset_config.labels)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: x["label"].lower() == label.lower()
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TEXT_TO_TEXT:
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-                labels = it.cycle(
-                    [
-                        label.lower()
-                        for label in self.dataset_config.labels
-                        if label.lower().startswith("b-")
-                    ]
-                )
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: label in [tag.lower() for tag in x["labels"]]
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["tokens"] != example["tokens"]
-                    )
-
-            case TaskGroup.QUESTION_ANSWERING:
-                # Locate the maximum number of tokens that constitutes a short example
-                for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
-                    train_with_short_examples = dataset["train"].filter(
-                        lambda example: len(example["context"]) < max_num_tokens
-                    )
-                    num_short_examples = len(train_with_short_examples)
-                    if num_short_examples >= self.dataset_config.num_few_shot_examples:
-                        break
-                else:
-                    raise InvalidBenchmark(
-                        "Could not find enough short examples for few-shot learning."
-                    )
-
-                shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["context"] != example["context"]
-                    )
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        random.seed(random_seed)
-        random.shuffle(few_shot_examples)
-        return few_shot_examples
-
-    def _apply_prompt(
-        self,
-        examples: dict[str, t.Any],
-        few_shot_examples: list[dict[str, t.Any]],
-        task: Task,
-    ) -> dict[str, t.Any]:
-        """Apply prompt template to an example, potentially with few-shot examples.
-
-        Args:
-            examples:
-                The examples to apply the few-shot examples to.
-            few_shot_examples:
-                The few-shot examples to apply.
-            task:
-                The task that is being benchmarked.
-
-        Returns:
-            The example with the few-shot examples applied.
-        """
-
-        def create_prompt(**kwargs: str) -> tuple[str, str]:
-            """Create a prompt from the given keyword arguments.
-
-            Args:
-                kwargs:
-                    The keyword arguments to use in the prompt.
-
-            Returns:
-                A pair (prompt, label), where "label" is an empty string if the model is
-                not instruction tuned (as in this case it is included in the prompt).
-            """
-            label_key = "label" if "label" in kwargs else "target_text"
-            label = kwargs.pop(label_key)
-            assert label is not None, (
-                f"Found a None label for the prompt: {kwargs}. This should not happen."
-            )
-            label_mapping = self.dataset_config.prompt_label_mapping
-            label = label_mapping.get(label, label)
-            if self.buffer["instruction_model"]:
-                prompt = self.dataset_config.instruction_prompt.format(**kwargs)
-                return prompt, label
-            else:
-                kwargs[label_key] = label
-                return self.dataset_config.prompt_template.format(**kwargs), ""
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        label=example["label"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), label="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TEXT_TO_TEXT:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        target_text=example["target_text"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), target_text="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-
-                def create_label(example: dict) -> str:
-                    prompt_labels = self.dataset_config.prompt_label_mapping.values()
-                    labels: dict[str, list[str]] = {
-                        prompt_label: list() for prompt_label in prompt_labels
-                    }
-                    for token, label in zip(example["tokens"], example["labels"]):
-                        label = label.lower()
-                        if label == "o":
-                            continue
-                        prompt_label = self.dataset_config.prompt_label_mapping[label]
-                        if label.startswith("b-"):
-                            labels[prompt_label].append(token)
-                        elif label.startswith("i-"):
-                            labels[prompt_label][-1] += " " + token
-                    return json.dumps(labels, ensure_ascii=False)
-
-                few_shot_sections = [
-                    create_prompt(
-                        text=" ".join(example["tokens"]).replace("\n", " ").strip(),
-                        label=create_label(example=example),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=" ".join(tokens).replace("\n", " ").strip(), label=""
-                    )
-                    for tokens in examples["tokens"]
-                ]
-
-            case TaskGroup.QUESTION_ANSWERING:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["context"].replace("\n", " ").strip(),
-                        question=example["question"].replace("\n", " ").strip(),
-                        label=example["answers"]["text"][0].replace("\n", " "),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=context.replace("\n", " ").strip(),
-                        question=question.replace("\n", " ").strip(),
-                        label="",
-                    )
-                    for context, question in zip(
-                        examples["context"], examples["question"]
-                    )
-                ]
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        if self.buffer["instruction_model"]:
-            few_shot_messages = [
-                dict(role=role, content=content)
-                for prompt, label in few_shot_sections
-                for role, content in [("user", prompt), ("assistant", label)]
-            ]
-
-            messages_list = [
-                few_shot_messages + [dict(role="user", content=prompt)]
-                for prompt, _ in new_sections
-            ]
-
-            # Pick the chat template that matches the language of the dataset, if such a
-            # template exists
-            chat_template: str | None = None
-            if isinstance(self._tokenizer.chat_template, dict):
-                language_codes = [
-                    language.code for language in self.dataset_config.languages
-                ]
-                for name, candidate_template in self._tokenizer.chat_template.items():
-                    if name.lower() in language_codes:
-                        chat_template = candidate_template
-                        log_once(
-                            f"Using the {name!r} chat template for the tokenizer for "
-                            f"model {self.model_config.model_id!r}.",
-                            level=logging.DEBUG,
-                        )
-                        break
-
-            texts = [
-                self._tokenizer.apply_chat_template(
-                    conversation=messages,
-                    tokenize=False,
-                    add_generation_prompt=True,
-                    chat_template=chat_template,
-                )
-                for messages in messages_list
-            ]
-
-            examples["text"] = texts
-
-        else:
-            prompt_prefix = ""
-            if self.dataset_config.prompt_prefix:
-                prompt_prefix = self.dataset_config.prompt_prefix + "\n\n"
-
-            few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
-            if few_shot_prompt:
-                few_shot_prompt += "\n\n"
-
-            examples["text"] = [
-                prompt_prefix + few_shot_prompt + new_prompt
-                for new_prompt, _ in new_sections
-            ]
-
-        return examples
-
     @property
     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.
@@ -1087,7 +797,7 @@ def load_model_and_tokenizer(
             enable_lora=model_config.adapter_base_model_id is not None,
             max_lora_rank=256,
         )
-    except (ValueError, OSError) as e:
+    except (RuntimeError, ValueError, OSError) as e:
         if "awaiting a review from the repo authors" in str(e):
             raise InvalidModel(
                 f"The model {model_id!r} is awaiting a review from the repository "
euroeval/benchmarker.py
CHANGED

@@ -372,15 +372,7 @@ class Benchmarker:

         current_benchmark_results: list[BenchmarkResult] = list()
         for model_id in model_ids:
-            try:
-                model_config = get_model_config(
-                    model_id=model_id, benchmark_config=benchmark_config
-                )
-            except InvalidModel as e:
-                logger.info(e.message)
-                num_finished_benchmarks += len(dataset_configs)
-                continue
-
+            model_config: ModelConfig | None = None
             loaded_model: BenchmarkModule | None = None
             for dataset_config in dataset_configs:
                 # Skip if we have already benchmarked this model on this dataset and
@@ -394,12 +386,22 @@ class Benchmarker:
                 ):
                     logger.debug(
                         f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it "
-                        "
+                        f"{dataset_config.pretty_name}, as it has already been "
+                        "benchmarked."
                     )
                     num_finished_benchmarks += 1
                     continue

+                if model_config is None:
+                    try:
+                        model_config = get_model_config(
+                            model_id=model_id, benchmark_config=benchmark_config
+                        )
+                    except InvalidModel as e:
+                        logger.info(e.message)
+                        num_finished_benchmarks += len(dataset_configs)
+                        continue
+
                 # Skip if the model is an encoder model and the task is generative
                 task_is_generative = (
                     dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
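Together with the previous hunk, this makes the model configuration lookup lazy: get_model_config now runs at most once per model, and only once a dataset actually needs benchmarking, rather than unconditionally at the top of the model loop. A minimal generic sketch of the same lazy-initialisation pattern (all names here are hypothetical stand-ins, not EuroEval functions):

    def run_all(work_items, already_done, load_config, process):
        config = None
        for item in work_items:
            if already_done(item):      # cheap skip checks run before any expensive work
                continue
            if config is None:          # first item that actually needs the config
                config = load_config()  # expensive and possibly failing lookup, done on demand
            process(item, config)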
@@ -774,6 +776,7 @@ class Benchmarker:
                 metric_configs=dataset_config.task.metrics,
                 scores=scores,
                 model_id=model_config.model_id,
+                model_revision=model_config.revision,
             )

             record = BenchmarkResult(
euroeval/data_models.py
CHANGED

@@ -531,7 +531,9 @@ class DatasetConfig:

         # Convert labels to single-quoted labels - and remove duplicates
         quoted_labels = [
-            f"'{label}'"
+            f"'{self.prompt_label_mapping[label]}'"
+            for label in set(self.labels)
+            if label in self.prompt_label_mapping
         ]

         if not quoted_labels:
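With this change, the quoted label list is built from the prompt-facing labels rather than the raw dataset labels, and labels without an entry in prompt_label_mapping are skipped. A rough illustration, reusing the Dutch sentiment mapping that appears further down in this diff (the values here are illustrative only):

    labels = ["positive", "negative"]
    prompt_label_mapping = {"positive": "positief", "negative": "negatief"}

    # Roughly what 15.7.0 produced: the raw labels quoted directly,
    # e.g. "'positive'" and "'negative'" (set order is unspecified)
    old_quoted = [f"'{label}'" for label in set(labels)]

    # What 15.7.2 produces: each label mapped to its prompt form first,
    # e.g. "'positief'" and "'negatief'", skipping unmapped labels
    new_quoted = [
        f"'{prompt_label_mapping[label]}'"
        for label in set(labels)
        if label in prompt_label_mapping
    ]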
euroeval/dataset_configs/__init__.py
CHANGED

@@ -7,6 +7,7 @@ from .danish import * # noqa: F403
 from .dutch import * # noqa: F403
 from .english import * # noqa: F403
 from .faroese import * # noqa: F403
+from .finnish import * # noqa: F403
 from .french import * # noqa: F403
 from .german import * # noqa: F403
 from .icelandic import * # noqa: F403
euroeval/dataset_configs/dutch.py
CHANGED

@@ -6,13 +6,14 @@ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM

 ### Official datasets ###

-
-    name="
+DBRD_CONFIG = DatasetConfig(
+    name="dbrd",
     pretty_name="the truncated version of the Dutch sentiment classification "
-    "dataset
-    huggingface_id="EuroEval/
+    "dataset DBRD",
+    huggingface_id="EuroEval/dbrd-mini",
     task=SENT,
     languages=[NL],
+    _labels=["negative", "positive"],
 )

 SCALA_NL_CONFIG = DatasetConfig(

@@ -71,18 +72,6 @@ HELLASWAG_NL_CONFIG = DatasetConfig(

 ### Unofficial datasets ###

-DBRD_CONFIG = DatasetConfig(
-    name="dbrd",
-    pretty_name="the truncated version of the Dutch sentiment classification "
-    "dataset DBRD",
-    huggingface_id="EuroEval/dbrd-mini",
-    task=SENT,
-    languages=[NL],
-    _labels=["negative", "positive"],
-    _prompt_label_mapping=dict(positive="positief", negative="negatief"),
-    unofficial=True,
-)
-
 DUTCH_COLA_CONFIG = DatasetConfig(
     name="dutch-cola",
     pretty_name="the truncated version of the Dutch linguistic acceptability dataset "
euroeval/dataset_configs/finnish.py
CHANGED

@@ -2,7 +2,7 @@

 from ..data_models import DatasetConfig
 from ..languages import FI
-from ..tasks import
+from ..tasks import LA, NER, RC, SENT, SUMM

 ### Official datasets ###

@@ -40,14 +40,16 @@ XLSUM_FI_CONFIG = DatasetConfig(
     languages=[FI],
 )

-
-
-
-
-
-
-
-
+# TODO: Include when this issue has been resolved:
+# https://github.com/EuroEval/EuroEval/issues/158#issuecomment-2846664885
+# HELLASWAG_FI_CONFIG = DatasetConfig(
+#     name="hellaswag-fi",
+#     pretty_name="the truncated version of the Finnish common-sense reasoning dataset "
+#     "HellaSwag-fi, translated from the English HellaSwag dataset",
+#     huggingface_id="EuroEval/hellaswag-fi-mini",
+#     task=COMMON_SENSE,
+#     languages=[FI],
+# )

 SCALA_FI_CONFIG = DatasetConfig(
     name="scala-fi",