EuroEval 15.7.0 py3-none-any.whl → 15.7.2 py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

@@ -3,11 +3,9 @@
 import collections.abc as c
 import contextlib
 import importlib.util
-import itertools as it
 import json
 import logging
 import os
-import random
 import re
 import sys
 import typing as t
@@ -56,6 +54,7 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
+from ..generation_utils import apply_prompt, extract_few_shot_examples
 from ..languages import get_all_languages
 from ..task_group_utils import (
     question_answering,
@@ -258,14 +257,22 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
 
         if self.benchmark_config.few_shot:
-            few_shot_examples = self._extract_few_shot_examples(
-                dataset=dataset, task=task, itr_idx=itr_idx
+            few_shot_examples = extract_few_shot_examples(
+                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
             )
         else:
             few_shot_examples = list()
 
         dataset["test"] = dataset["test"].map(
-            partial(self._apply_prompt, few_shot_examples=few_shot_examples, task=task),
+            partial(
+                apply_prompt,
+                few_shot_examples=few_shot_examples,
+                model_config=self.model_config,
+                dataset_config=self.dataset_config,
+                instruction_model=self.buffer["instruction_model"],
+                always_populate_text_field=True,
+                tokenizer=self._tokenizer,
+            ),
             batched=True,
             load_from_cache_file=False,
             keep_in_memory=True,
@@ -439,6 +446,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 logger.info(
                     "Prompts are too long, so truncating them and trying again..."
                 )
+                logger.debug(f"The error message was: {str(e)}")
                 tokenized_prompts = self._tokenizer(
                     text=prompts,
                     truncation=True,
@@ -499,7 +507,6 @@ class VLLMModel(HuggingFaceEncoderModel):
 
         # Sanity check
         if len(completions) != len(prompts):
-            breakpoint()
             raise InvalidBenchmark(
                 f"Expected {len(prompts):,} completions, but got {len(completions):,}."
             )
@@ -615,303 +622,6 @@ class VLLMModel(HuggingFaceEncoderModel):
 
         return model_config
 
-    def _extract_few_shot_examples(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> list[dict[str, t.Any]]:
-        """Extract few-shot examples from a dataset.
-
-        This will always extract the examples from the training split.
-
-        We ensure that the few-shot examples are unique by picking them one at a time.
-
-        Args:
-            dataset:
-                The dataset to extract the few-shot examples from.
-            task:
-                The task that is being benchmarked.
-            itr_idx:
-                The index of the dataset in the iterator.
-
-        Returns:
-            The few-shot examples.
-        """
-        random_seed = 4242 + itr_idx
-        num_few_shots = self.dataset_config.num_few_shot_examples
-        few_shot_examples: list[dict[str, t.Any]] = list()
-        shuffled_train = dataset["train"].shuffle(seed=random_seed)
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                labels = it.cycle(self.dataset_config.labels)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: x["label"].lower() == label.lower()
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TEXT_TO_TEXT:
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-                labels = it.cycle(
-                    [
-                        label.lower()
-                        for label in self.dataset_config.labels
-                        if label.lower().startswith("b-")
-                    ]
-                )
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: label in [tag.lower() for tag in x["labels"]]
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["tokens"] != example["tokens"]
-                    )
-
-            case TaskGroup.QUESTION_ANSWERING:
-                # Locate the maximum number of tokens that constitutes a short example
-                for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
-                    train_with_short_examples = dataset["train"].filter(
-                        lambda example: len(example["context"]) < max_num_tokens
-                    )
-                    num_short_examples = len(train_with_short_examples)
-                    if num_short_examples >= self.dataset_config.num_few_shot_examples:
-                        break
-                else:
-                    raise InvalidBenchmark(
-                        "Could not find enough short examples for few-shot learning."
-                    )
-
-                shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["context"] != example["context"]
-                    )
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        random.seed(random_seed)
-        random.shuffle(few_shot_examples)
-        return few_shot_examples
-
-    def _apply_prompt(
-        self,
-        examples: dict[str, t.Any],
-        few_shot_examples: list[dict[str, t.Any]],
-        task: Task,
-    ) -> dict[str, t.Any]:
-        """Apply prompt template to an example, potentially with few-shot examples.
-
-        Args:
-            examples:
-                The examples to apply the few-shot examples to.
-            few_shot_examples:
-                The few-shot examples to apply.
-            task:
-                The task that is being benchmarked.
-
-        Returns:
-            The example with the few-shot examples applied.
-        """
-
-        def create_prompt(**kwargs: str) -> tuple[str, str]:
-            """Create a prompt from the given keyword arguments.
-
-            Args:
-                kwargs:
-                    The keyword arguments to use in the prompt.
-
-            Returns:
-                A pair (prompt, label), where "label" is an empty string if the model is
-                not instruction tuned (as in this case it is included in the prompt).
-            """
-            label_key = "label" if "label" in kwargs else "target_text"
-            label = kwargs.pop(label_key)
-            assert label is not None, (
-                f"Found a None label for the prompt: {kwargs}. This should not happen."
-            )
-            label_mapping = self.dataset_config.prompt_label_mapping
-            label = label_mapping.get(label, label)
-            if self.buffer["instruction_model"]:
-                prompt = self.dataset_config.instruction_prompt.format(**kwargs)
-                return prompt, label
-            else:
-                kwargs[label_key] = label
-                return self.dataset_config.prompt_template.format(**kwargs), ""
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        label=example["label"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), label="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TEXT_TO_TEXT:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        target_text=example["target_text"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), target_text="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-
-                def create_label(example: dict) -> str:
-                    prompt_labels = self.dataset_config.prompt_label_mapping.values()
-                    labels: dict[str, list[str]] = {
-                        prompt_label: list() for prompt_label in prompt_labels
-                    }
-                    for token, label in zip(example["tokens"], example["labels"]):
-                        label = label.lower()
-                        if label == "o":
-                            continue
-                        prompt_label = self.dataset_config.prompt_label_mapping[label]
-                        if label.startswith("b-"):
-                            labels[prompt_label].append(token)
-                        elif label.startswith("i-"):
-                            labels[prompt_label][-1] += " " + token
-                    return json.dumps(labels, ensure_ascii=False)
-
-                few_shot_sections = [
-                    create_prompt(
-                        text=" ".join(example["tokens"]).replace("\n", " ").strip(),
-                        label=create_label(example=example),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=" ".join(tokens).replace("\n", " ").strip(), label=""
-                    )
-                    for tokens in examples["tokens"]
-                ]
-
-            case TaskGroup.QUESTION_ANSWERING:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["context"].replace("\n", " ").strip(),
-                        question=example["question"].replace("\n", " ").strip(),
-                        label=example["answers"]["text"][0].replace("\n", " "),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=context.replace("\n", " ").strip(),
-                        question=question.replace("\n", " ").strip(),
-                        label="",
-                    )
-                    for context, question in zip(
-                        examples["context"], examples["question"]
-                    )
-                ]
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        if self.buffer["instruction_model"]:
-            few_shot_messages = [
-                dict(role=role, content=content)
-                for prompt, label in few_shot_sections
-                for role, content in [("user", prompt), ("assistant", label)]
-            ]
-
-            messages_list = [
-                few_shot_messages + [dict(role="user", content=prompt)]
-                for prompt, _ in new_sections
-            ]
-
-            # Pick the chat template that matches the language of the dataset, if such a
-            # template exists
-            chat_template: str | None = None
-            if isinstance(self._tokenizer.chat_template, dict):
-                language_codes = [
-                    language.code for language in self.dataset_config.languages
-                ]
-                for name, candidate_template in self._tokenizer.chat_template.items():
-                    if name.lower() in language_codes:
-                        chat_template = candidate_template
-                        log_once(
-                            f"Using the {name!r} chat template for the tokenizer for "
-                            f"model {self.model_config.model_id!r}.",
-                            level=logging.DEBUG,
-                        )
-                        break
-
-            texts = [
-                self._tokenizer.apply_chat_template(
-                    conversation=messages,
-                    tokenize=False,
-                    add_generation_prompt=True,
-                    chat_template=chat_template,
-                )
-                for messages in messages_list
-            ]
-
-            examples["text"] = texts
-
-        else:
-            prompt_prefix = ""
-            if self.dataset_config.prompt_prefix:
-                prompt_prefix = self.dataset_config.prompt_prefix + "\n\n"
-
-            few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
-            if few_shot_prompt:
-                few_shot_prompt += "\n\n"
-
-            examples["text"] = [
-                prompt_prefix + few_shot_prompt + new_prompt
-                for new_prompt, _ in new_sections
-            ]
-
-        return examples
-
     @property
     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.
@@ -1087,7 +797,7 @@ def load_model_and_tokenizer(
             enable_lora=model_config.adapter_base_model_id is not None,
             max_lora_rank=256,
         )
-    except (ValueError, OSError) as e:
+    except (RuntimeError, ValueError, OSError) as e:
         if "awaiting a review from the repo authors" in str(e):
             raise InvalidModel(
                 f"The model {model_id!r} is awaiting a review from the repository "
euroeval/benchmarker.py CHANGED
@@ -372,15 +372,7 @@ class Benchmarker:
 
         current_benchmark_results: list[BenchmarkResult] = list()
         for model_id in model_ids:
-            try:
-                model_config = get_model_config(
-                    model_id=model_id, benchmark_config=benchmark_config
-                )
-            except InvalidModel as e:
-                logger.info(e.message)
-                num_finished_benchmarks += len(dataset_configs)
-                continue
-
+            model_config: ModelConfig | None = None
             loaded_model: BenchmarkModule | None = None
             for dataset_config in dataset_configs:
                 # Skip if we have already benchmarked this model on this dataset and
@@ -394,12 +386,22 @@
                 ):
                     logger.debug(
                         f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it "
-                        "has already been benchmarked."
+                        f"{dataset_config.pretty_name}, as it has already been "
+                        "benchmarked."
                     )
                     num_finished_benchmarks += 1
                     continue
 
+                if model_config is None:
+                    try:
+                        model_config = get_model_config(
+                            model_id=model_id, benchmark_config=benchmark_config
+                        )
+                    except InvalidModel as e:
+                        logger.info(e.message)
+                        num_finished_benchmarks += len(dataset_configs)
+                        continue
+
 
                 # Skip if the model is an encoder model and the task is generative
                 task_is_generative = (
@@ -774,6 +776,7 @@ class Benchmarker:
             metric_configs=dataset_config.task.metrics,
             scores=scores,
             model_id=model_config.model_id,
+            model_revision=model_config.revision,
         )
 
         record = BenchmarkResult(
euroeval/data_models.py CHANGED
@@ -531,7 +531,9 @@ class DatasetConfig:
 
         # Convert labels to single-quoted labels - and remove duplicates
        quoted_labels = [
-            f"'{label}'" for label in set(self.prompt_label_mapping.values())
+            f"'{self.prompt_label_mapping[label]}'"
+            for label in set(self.labels)
+            if label in self.prompt_label_mapping
         ]
 
         if not quoted_labels:
@@ -7,6 +7,7 @@ from .danish import *  # noqa: F403
 from .dutch import *  # noqa: F403
 from .english import *  # noqa: F403
 from .faroese import *  # noqa: F403
+from .finnish import *  # noqa: F403
 from .french import *  # noqa: F403
 from .german import *  # noqa: F403
 from .icelandic import *  # noqa: F403
@@ -6,13 +6,14 @@ from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
 ### Official datasets ###
 
-DUTCH_SOCIAL_CONFIG = DatasetConfig(
-    name="dutch-social",
+DBRD_CONFIG = DatasetConfig(
+    name="dbrd",
     pretty_name="the truncated version of the Dutch sentiment classification "
-    "dataset Dutch Social",
-    huggingface_id="EuroEval/dutch-social-mini",
+    "dataset DBRD",
+    huggingface_id="EuroEval/dbrd-mini",
     task=SENT,
     languages=[NL],
+    _labels=["negative", "positive"],
 )
 
 SCALA_NL_CONFIG = DatasetConfig(
@@ -71,18 +72,6 @@ HELLASWAG_NL_CONFIG = DatasetConfig(
 
 ### Unofficial datasets ###
 
-DBRD_CONFIG = DatasetConfig(
-    name="dbrd",
-    pretty_name="the truncated version of the Dutch sentiment classification "
-    "dataset DBRD",
-    huggingface_id="EuroEval/dbrd-mini",
-    task=SENT,
-    languages=[NL],
-    _labels=["negative", "positive"],
-    _prompt_label_mapping=dict(positive="positief", negative="negatief"),
-    unofficial=True,
-)
-
 DUTCH_COLA_CONFIG = DatasetConfig(
     name="dutch-cola",
     pretty_name="the truncated version of the Dutch linguistic acceptability dataset "
@@ -2,7 +2,7 @@
 
 from ..data_models import DatasetConfig
 from ..languages import FI
-from ..tasks import COMMON_SENSE, LA, NER, RC, SENT, SUMM
+from ..tasks import LA, NER, RC, SENT, SUMM
 
 ### Official datasets ###
 
@@ -40,14 +40,16 @@ XLSUM_FI_CONFIG = DatasetConfig(
     languages=[FI],
 )
 
-HELLASWAG_FI_CONFIG = DatasetConfig(
-    name="hellaswag-fi",
-    pretty_name="the truncated version of the Finnish common-sense reasoning dataset "
-    "HellaSwag-fi, translated from the English HellaSwag dataset",
-    huggingface_id="EuroEval/hellaswag-fi-mini",
-    task=COMMON_SENSE,
-    languages=[FI],
-)
+# TODO: Include when this issue has been resolved:
+# https://github.com/EuroEval/EuroEval/issues/158#issuecomment-2846664885
+# HELLASWAG_FI_CONFIG = DatasetConfig(
+#     name="hellaswag-fi",
+#     pretty_name="the truncated version of the Finnish common-sense reasoning dataset "
+#     "HellaSwag-fi, translated from the English HellaSwag dataset",
+#     huggingface_id="EuroEval/hellaswag-fi-mini",
+#     task=COMMON_SENSE,
+#     languages=[FI],
+# )
 
 SCALA_FI_CONFIG = DatasetConfig(
     name="scala-fi",