EuroEval 15.4.2__py3-none-any.whl → 15.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (54)
  1. euroeval/__init__.py +2 -2
  2. euroeval/benchmark_modules/base.py +3 -2
  3. euroeval/benchmark_modules/fresh.py +8 -6
  4. euroeval/benchmark_modules/hf.py +44 -33
  5. euroeval/benchmark_modules/litellm.py +314 -120
  6. euroeval/benchmark_modules/vllm.py +99 -59
  7. euroeval/benchmarker.py +52 -21
  8. euroeval/callbacks.py +2 -2
  9. euroeval/constants.py +9 -2
  10. euroeval/data_models.py +258 -44
  11. euroeval/dataset_configs/__init__.py +61 -0
  12. euroeval/dataset_configs/danish.py +120 -0
  13. euroeval/dataset_configs/dutch.py +123 -0
  14. euroeval/dataset_configs/english.py +88 -0
  15. euroeval/dataset_configs/faroese.py +53 -0
  16. euroeval/dataset_configs/french.py +83 -0
  17. euroeval/dataset_configs/german.py +91 -0
  18. euroeval/dataset_configs/icelandic.py +148 -0
  19. euroeval/dataset_configs/italian.py +81 -0
  20. euroeval/dataset_configs/norwegian.py +178 -0
  21. euroeval/dataset_configs/spanish.py +78 -0
  22. euroeval/dataset_configs/swedish.py +100 -0
  23. euroeval/exceptions.py +10 -10
  24. euroeval/finetuning.py +6 -10
  25. euroeval/generation.py +1 -0
  26. euroeval/human_evaluation.py +2 -2
  27. euroeval/languages.py +20 -13
  28. euroeval/model_cache.py +1 -1
  29. euroeval/model_loading.py +1 -12
  30. euroeval/prompt_templates/__init__.py +8 -0
  31. euroeval/prompt_templates/linguistic_acceptability.py +112 -0
  32. euroeval/prompt_templates/multiple_choice.py +97 -0
  33. euroeval/prompt_templates/named_entity_recognition.py +257 -0
  34. euroeval/prompt_templates/reading_comprehension.py +118 -0
  35. euroeval/prompt_templates/sentiment_classification.py +137 -0
  36. euroeval/prompt_templates/summarization.py +97 -0
  37. euroeval/speed_benchmark.py +1 -1
  38. euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
  39. euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
  40. euroeval/{task_utils → task_group_utils}/sequence_classification.py +45 -10
  41. euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
  42. euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
  43. euroeval/tasks.py +54 -0
  44. euroeval/tokenization_utils.py +343 -0
  45. euroeval/types.py +3 -1
  46. euroeval/utils.py +5 -254
  47. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/METADATA +31 -9
  48. euroeval-15.6.0.dist-info/RECORD +59 -0
  49. euroeval/dataset_configs.py +0 -2408
  50. euroeval-15.4.2.dist-info/RECORD +0 -40
  51. /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
  52. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
  53. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
  54. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
euroeval/prompt_templates/sentiment_classification.py
@@ -0,0 +1,137 @@
+"""Templates for the Sentiment Analysis task."""
+
+from ..data_models import PromptConfig
+from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+
+SENT_TEMPLATES = {
+    DA: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="neutral", negative="negativ"
+        ),
+        default_prompt_prefix="Følgende er dokumenter og deres sentiment, som kan være "
+        "{labels_str}.",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassificer sentimentet i "
+        "dokumentet. Svar kun med {labels_str}, og intet andet.",
+    ),
+    DE: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="neutral", negative="negativ"
+        ),
+        default_prompt_prefix="Nachfolgend finden Sie Dokumente und ihre Bewertung, "
+        "die {labels_str} sein kann.",
+        default_prompt_template="Dokument: {text}\nStimmung: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassifizieren Sie die "
+        "Stimmung im Dokument. Antworten Sie mit {labels_str}, und nichts anderes.",
+    ),
+    EN: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positive", neutral="neutral", negative="negative"
+        ),
+        default_prompt_prefix="The following are documents and their sentiment, which "
+        "can be {labels_str}.",
+        default_prompt_template="Document: {text}\nSentiment: {label}",
+        default_instruction_prompt="Document: {text}\n\nClassify the sentiment in the "
+        "document. Answer with {labels_str}, and nothing else.",
+    ),
+    ES: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positivo", neutral="neutral", negative="negativo"
+        ),
+        default_prompt_prefix="A continuación se muestran los documentos y su "
+        "sentimiento, que puede ser {labels_str}.",
+        default_prompt_template="Documento: {text}\nSentimiento: {label}",
+        default_instruction_prompt="Documento: {text}\n\nClasifica el sentimiento del "
+        "documento. Responde con {labels_str}, y nada más.",
+    ),
+    FO: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positivt", neutral="neutralt", negative="negativt"
+        ),
+        default_prompt_prefix="Niðanfyri eru skjøl og teirra kenslur, sum kunnu vera "
+        "{labels_str}.",
+        default_prompt_template="Skjal: {text}\nKensla: {label}",
+        default_instruction_prompt="Skjal: {text}\n\nFlokka kensluna í skjalinum. "
+        "Svara við {labels_str}, og einki annað.",
+    ),
+    FR: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positif", neutral="neutre", negative="négatif"
+        ),
+        default_prompt_prefix="Les documents suivants sont accompagnés de leur "
+        "sentiment, qui peut être {labels_str}.",
+        default_prompt_template="Document: {text}\nSentiment: {label}",
+        default_instruction_prompt="Document: {text}\n\nClassez le sentiment dans le "
+        "document. Répondez par {labels_str}, et rien d'autre.",
+    ),
+    IS: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="jákvætt", neutral="hlutlaust", negative="neikvætt"
+        ),
+        default_prompt_prefix="Eftirfarandi eru skjöl og viðhorf þeirra, sem geta "
+        "verið {labels_str}.",
+        default_prompt_template="Skjal: {text}\nViðhorf: {label}",
+        default_instruction_prompt="Skjal: {text}\n\nFlokkaðu viðhorfið í skjalinu. "
+        "Svaraðu með {labels_str}, og ekkert annað.",
+    ),
+    IT: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positivo", neutral="neutro", negative="negativo"
+        ),
+        default_prompt_prefix="Di seguito sono riportati i documenti e il loro "
+        "sentiment, che può essere {labels_str}.",
+        default_prompt_template="Documento: {text}\nSentimento: {label}",
+        default_instruction_prompt="Documento: {text}\n\nClassificare il sentiment del "
+        "documento. Rispondere con {labels_str}, e nient'altro.",
+    ),
+    NB: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="nøytral", negative="negativ"
+        ),
+        default_prompt_prefix="Her følger dokumenter og deres sentiment, som kan være "
+        "{labels_str}",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
+        "teksten. Svar med {labels_str}, og ikke noe annet.",
+    ),
+    NL: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positief", neutral="neutraal", negative="negatief"
+        ),
+        default_prompt_prefix="Hieronder volgen documenten en hun sentiment, dat "
+        "{labels_str} kan zijn.",
+        default_prompt_template="Document: {text}\nSentiment: {label}",
+        default_instruction_prompt="Document: {text}\n\nClassificeer het sentiment in "
+        "het document. Antwoord met {labels_str}, en verder niets.",
+    ),
+    NN: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="nøytral", negative="negativ"
+        ),
+        default_prompt_prefix="Her følger dokumenter og deres sentiment, som kan være "
+        "{labels_str}",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
+        "teksten. Svar med {labels_str}, og ikke noe annet.",
+    ),
+    NO: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="nøytral", negative="negativ"
+        ),
+        default_prompt_prefix="Her følger dokumenter og deres sentiment, som kan være "
+        "{labels_str}",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
+        "teksten. Svar med {labels_str}, og ikke noe annet.",
+    ),
+    SV: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="neutral", negative="negativ"
+        ),
+        default_prompt_prefix="Nedan följer dokument och deras sentiment, som kan vara "
+        "{labels_str}.",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassificera känslan i "
+        "dokumentet. Svara med {labels_str}, och inget annat.",
+    ),
+}
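For orientation only, not part of the diff: each PromptConfig above bundles a few-shot prefix, a per-example template and an instruction-style prompt. Below is a minimal sketch of how the English sentiment template could be rendered into a few-shot prompt, assuming the placeholders are filled with plain str.format and that {labels_str} is a comma-joined list of the mapped label names; the actual assembly logic inside EuroEval may differ.

# Illustrative sketch only -- the template strings are taken from the diff
# above, but the assembly steps below are hypothetical, not EuroEval code.
prefix = "The following are documents and their sentiment, which can be {labels_str}."
template = "Document: {text}\nSentiment: {label}"
label_mapping = dict(positive="positive", neutral="neutral", negative="negative")

labels_str = ", ".join(label_mapping.values())  # assumption: comma-joined labels
few_shot = [("I loved this film.", "positive"), ("The service was awful.", "negative")]
new_doc = "The plot was fine, nothing special."

# Prefix, then the few-shot examples, then the unanswered example.
prompt = prefix.format(labels_str=labels_str) + "\n\n"
prompt += "\n\n".join(template.format(text=doc, label=lab) for doc, lab in few_shot)
prompt += "\n\n" + template.format(text=new_doc, label="").rstrip()
print(prompt)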
euroeval/prompt_templates/summarization.py
@@ -0,0 +1,97 @@
+"""Templates for the Summarization task."""
+
+from ..data_models import PromptConfig
+from ..languages import DA, DE, EN, ES, FR, IS, IT, NB, NL, NN, NO, SV
+
+# TODO: Missing Faroese
+SUMM_TEMPLATES = {
+    DA: PromptConfig(
+        default_prompt_prefix="Følgende er dokumenter med tilhørende resuméer.",
+        default_prompt_template="Dokument: {text}\nResumé: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv et resumé af ovenstående "
+        "dokument.",
+        default_prompt_label_mapping=dict(),
+    ),
+    DE: PromptConfig(
+        default_prompt_prefix="Nachstehend finden Sie Dokumente mit zugehörigen "
+        "Zusammenfassungen.",
+        default_prompt_template="Dokument: {text}\nZusammenfassung: {target_text}",
+        default_instruction_prompt="Nachrichtenartikel: {text}\n\nSchreiben Sie eine "
+        "Zusammenfassung des oben genannten Dokuments.",
+        default_prompt_label_mapping=dict(),
+    ),
+    EN: PromptConfig(
+        default_prompt_prefix="The following are documents with accompanying "
+        "summaries.",
+        default_prompt_template="Document: {text}\nSummary: {target_text}",
+        default_instruction_prompt="Document: {text}\n\nWrite a summary of the above "
+        "document.",
+        default_prompt_label_mapping=dict(),
+    ),
+    ES: PromptConfig(
+        default_prompt_prefix="A continuación se presentan documentos con resúmenes "
+        "adjuntos.",
+        default_prompt_template="Documento: {text}\nResumen: {target_text}",
+        default_instruction_prompt="Documento: {text}\n\nEscriba un resumen del "
+        "documento anterior.",
+        default_prompt_label_mapping=dict(),
+    ),
+    FR: PromptConfig(
+        default_prompt_prefix="Les documents suivants sont accompagnés d'un résumé.",
+        default_prompt_template="Document: {text}\nRésumé: {target_text}",
+        default_instruction_prompt="Document: {text}\n\nRédigez un résumé du "
+        "document ci-dessus.",
+        default_prompt_label_mapping=dict(),
+    ),
+    IS: PromptConfig(
+        default_prompt_prefix="Eftirfarandi eru skjöl með meðfylgjandi samantektum.",
+        default_prompt_template="Skjal: {text}\nSamantekt: {target_text}",
+        default_instruction_prompt="Skjal: {text}\n\nSkrifaðu samantekt á ofangreindu "
+        "skjali.",
+        default_prompt_label_mapping=dict(),
+    ),
+    IT: PromptConfig(
+        default_prompt_prefix="Di seguito sono riportati i documenti con le relative "
+        "sintesi.",
+        default_prompt_template="Documento: {text}\nSintesi: {target_text}",
+        default_instruction_prompt="Documento: {text}\n\nScrivete una sintesi del "
+        "documento di cui sopra.",
+        default_prompt_label_mapping=dict(),
+    ),
+    NB: PromptConfig(
+        default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
+        default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv et sammendrag av "
+        "dokumentet ovenfor.",
+        default_prompt_label_mapping=dict(),
+    ),
+    NL: PromptConfig(
+        default_prompt_prefix="Hieronder volgen documenten met bijbehorende "
+        "samenvattingen.",
+        default_prompt_template="Document: {text}\nSamenvatting: {target_text}",
+        default_instruction_prompt="Document: {text}\n\nSchrijf een samenvatting van "
+        "het bovenstaande document.",
+        default_prompt_label_mapping=dict(),
+    ),
+    NN: PromptConfig(
+        default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
+        default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv et sammendrag av "
+        "dokumentet ovenfor.",
+        default_prompt_label_mapping=dict(),
+    ),
+    NO: PromptConfig(
+        default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
+        default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv et sammendrag av "
+        "dokumentet ovenfor.",
+        default_prompt_label_mapping=dict(),
+    ),
+    SV: PromptConfig(
+        default_prompt_prefix="Nedan följer dokument med tillhörande sammanfattningar.",
+        default_prompt_template="Dokument: {text}\nSammanfattning: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv en sammanfattning av "
+        "ovanstående dokument.",
+        default_prompt_label_mapping=dict(),
+    ),
+}
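Also for illustration and not from the package: the summarization templates carry no label mapping and use {target_text} for the reference summary. For instruction-tuned models the default_instruction_prompt would typically be sent as a single user message, roughly like the hypothetical sketch below.

# Hypothetical rendering of the Danish instruction prompt as one chat message;
# only the template string comes from the diff above.
instruction = "Dokument: {text}\n\nSkriv et resumé af ovenstående dokument."
document = "Regeringen fremlagde i dag sin nye klimaplan ..."
messages = [{"role": "user", "content": instruction.format(text=document)}]
print(messages)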
euroeval/speed_benchmark.py
@@ -4,7 +4,7 @@ import logging
 
 import pyinfer
 from tqdm.auto import tqdm
-from transformers import AutoTokenizer
+from transformers.models.auto.tokenization_auto import AutoTokenizer
 
 from .benchmark_modules import (
     BenchmarkModule,
euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py
@@ -8,7 +8,9 @@ from collections import defaultdict
 
 import numpy as np
 from datasets import Dataset
-from transformers import BatchEncoding, PreTrainedTokenizer, Trainer
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import BatchEncoding
+from transformers.trainer import Trainer
 
 if t.TYPE_CHECKING:
     from ..types import Labels, Predictions
@@ -19,12 +21,12 @@ logger = logging.getLogger("euroeval")
 class MultipleChoiceClassificationTrainer(Trainer):
     """Trainer subclass for question answering tasks."""
 
-    def evaluate(
+    def evaluate(  # type: ignore[override]
         self,
         eval_dataset: "Dataset | None" = None,
         ignore_keys: list[str] | None = None,
         metric_key_prefix: str = "eval",
-    ) -> dict[str, float] | None:
+    ) -> dict[str, float]:
         """Evaluate the model on the given dataset.
 
         Args:
@@ -54,22 +56,28 @@ class MultipleChoiceClassificationTrainer(Trainer):
             metric_key_prefix=metric_key_prefix,
         )
 
+        predictions = output.predictions
+        assert isinstance(predictions, np.ndarray)
+
+        metrics = output.metrics
+        assert metrics is not None
+
         if metric_key_prefix == "test":
             preds_and_labels = postprocess_predictions_and_labels(
-                predictions=output.predictions, dataset=eval_dataset
+                predictions=predictions, dataset=eval_dataset
             )
-            output.metrics.update(self.compute_metrics(preds_and_labels))
+            assert self.compute_metrics is not None
+            new_metrics = self.compute_metrics(preds_and_labels)  # type: ignore[arg-type]
+            metrics.update(new_metrics)
 
         # Prefix all keys with metric_key_prefix + '_'
-        for key in list(output.metrics.keys()):
+        for key in list(metrics.keys()):
             if not key.startswith(f"{metric_key_prefix}_"):
-                output.metrics[f"{metric_key_prefix}_{key}"] = output.metrics.pop(
-                    key
-                )
+                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
 
         # Only the main node log the results by default
         if self.args.should_log:
-            self.log(output.metrics)
+            self.log(metrics)
 
         self.control = self.callback_handler.on_evaluate(
             self.args,
@@ -77,7 +85,7 @@
             self.control,  # type: ignore[has-type]
             output.metrics,
         )
-        return output.metrics
+        return metrics
 
 
 def prepare_examples(
euroeval/{task_utils → task_group_utils}/question_answering.py
@@ -8,25 +8,22 @@ from collections import defaultdict
 import evaluate
 import numpy as np
 from evaluate import EvaluationModule
-from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from transformers.trainer import Trainer
 
 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
-from ..utils import (
-    get_special_token_metadata,
-    raise_if_model_output_contains_nan_values,
-)
+from ..tokenization_utils import get_special_token_metadata
+from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
     import torch.nn as nn
     from datasets.arrow_dataset import Dataset
-    from transformers import (
-        EvalPrediction,
-        PreTrainedModel,
-        TrainerCallback,
-        TrainingArguments,
-    )
+    from transformers.modeling_utils import PreTrainedModel
     from transformers.tokenization_utils_base import BatchEncoding
+    from transformers.trainer_callback import TrainerCallback
+    from transformers.trainer_utils import EvalPrediction
+    from transformers.training_args import TrainingArguments
 
     from ..types import Labels, Predictions
 
@@ -47,7 +44,7 @@ class QuestionAnsweringTrainer(Trainer):
         callbacks: "list[TrainerCallback]",
         data_collator: "c.Callable",
     ) -> None:
-        """Initialize the trainer."""
+        """Initialise the trainer."""
        super().__init__(
            model=model,
            processing_class=processing_class,
@@ -68,13 +65,13 @@ class QuestionAnsweringTrainer(Trainer):
         # Set the label names
         self.label_names = ["start_positions", "end_positions"]
 
-    def evaluate(
+    def evaluate(  # type: ignore[override]
         self,
         eval_dataset: "Dataset | None" = None,
         orig_eval_dataset: "Dataset | None" = None,
         ignore_keys: list[str] | None = None,
         metric_key_prefix: str = "eval",
-    ) -> dict[str, float] | None:
+    ) -> dict[str, float]:
         """Evaluate the model on the given dataset.
 
         Args:
@@ -113,33 +110,39 @@ class QuestionAnsweringTrainer(Trainer):
         finally:
             self.compute_metrics = compute_metrics
 
+        predictions = output.predictions
+        assert isinstance(predictions, tuple)
+
+        metrics = output.metrics
+        assert metrics is not None
+
         if orig_eval_dataset is not None:
             preds_and_labels = postprocess_predictions_and_labels(
-                predictions=output.predictions,
+                predictions=predictions,  # type: ignore[arg-type]
                 dataset=orig_eval_dataset,
                 prepared_dataset=eval_dataset,
                 cls_token_index=self.cls_token_id,
             )
-            output.metrics.update(self.compute_metrics(preds_and_labels))
+            assert self.compute_metrics is not None
+            new_metrics = self.compute_metrics(preds_and_labels)  # type: ignore[arg-type]
+            metrics.update(new_metrics)
 
         # Prefix all keys with metric_key_prefix + '_'
-        for key in list(output.metrics.keys()):
+        for key in list(metrics.keys()):
             if not key.startswith(f"{metric_key_prefix}_"):
-                output.metrics[f"{metric_key_prefix}_{key}"] = output.metrics.pop(
-                    key
-                )
+                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
 
         # Only the main node log the results by default
        if self.args.should_log:
-            self.log(output.metrics)
+            self.log(metrics)
 
        self.control = self.callback_handler.on_evaluate(
            self.args,
            self.state,
            self.control,  # type: ignore[has-type]
-            output.metrics,
+            metrics,
        )
-        return output.metrics
+        return metrics
 
 
 def compute_metrics(
@@ -472,7 +475,7 @@ def prepare_test_examples(
 
 
 def postprocess_predictions_and_labels(
-    predictions: list,
+    predictions: tuple[np.ndarray, np.ndarray],
     dataset: "Dataset",
     prepared_dataset: "Dataset",
     cls_token_index: int,
@@ -492,9 +495,7 @@ def postprocess_predictions_and_labels(
     Returns:
         The postprocessed predictions and labels.
     """
-    # Extract the logits from the predictions
-    all_start_logits = predictions[0]
-    all_end_logits = predictions[1]
+    all_start_logits, all_end_logits = predictions
 
     # Build a map from an example to its corresponding features, being the blocks of
     # text from the context that we're feeding into the model. An example can have
@@ -507,7 +508,7 @@ def postprocess_predictions_and_labels(
         features_per_example[example_index].append(i)
 
     # Loop over all the examples
-    predictions = list()
+    prediction_list: list[dict[str, t.Any]] = list()
     labels = list()
     for example_index, example in enumerate(dataset):
         # Extract the best valid answer associated with the current example
@@ -530,7 +531,7 @@ def postprocess_predictions_and_labels(
         )
 
         # Add the answer to the list of predictions
-        predictions.append(prediction)
+        prediction_list.append(prediction)
 
         # Create the associated reference dictionary, to be added to the list of
         # references
@@ -545,7 +546,7 @@ def postprocess_predictions_and_labels(
         # Add the answer and label to the list of predictions and labels, respectively
         labels.append(label)
 
-    return predictions, labels
+    return prediction_list, labels
 
 
 def find_best_answer(
euroeval/{task_utils → task_group_utils}/sequence_classification.py
@@ -10,10 +10,11 @@ import numpy as np
 from evaluate import EvaluationModule
 
 from ..data_models import BenchmarkConfig, GenerativeModelOutput
+from ..exceptions import InvalidBenchmark
 from ..utils import log_once, raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
-    from transformers import EvalPrediction
+    from transformers.trainer_utils import EvalPrediction
 
     from ..data_models import DatasetConfig
     from ..types import Labels, Predictions
@@ -110,6 +111,7 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: GenerativeModelOutput,
     dataset_config: "DatasetConfig",
+    first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
     """Extract the predicted labels from the generated output.
 
@@ -121,13 +123,19 @@
             The raw generated output of the model.
         dataset_config:
             The configuration of the dataset.
+        first_label_token_mapping:
+            A mapping from labels to the first token in each label, or alternatively a
+            Boolean value indicating whether the model should output scores (if the
+            mapping is outputted then the model will always output scores).
 
     Returns:
         The predicted labels.
     """
     if model_output.scores is not None:
         return get_closest_logprobs_labels(
-            generation_logprobs=model_output.scores, dataset_config=dataset_config
+            generation_logprobs=model_output.scores,
+            dataset_config=dataset_config,
+            first_label_token_mapping=first_label_token_mapping,
         )
     else:
         return get_closest_word_edit_labels(
@@ -138,6 +146,7 @@ def extract_labels_from_generation(
 def get_closest_logprobs_labels(
     generation_logprobs: list[list[list[tuple[str, float]]]],
     dataset_config: "DatasetConfig",
+    first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
     """Get the labels with the highest predicted logprob value.
 
@@ -152,6 +161,10 @@
            (batch_size, num_tokens, num_logprobs).
        dataset_config:
            The configuration of the dataset.
+        first_label_token_mapping:
+            A mapping from labels to the first token in each label, or alternatively a
+            Boolean value indicating whether the model should output scores (if the
+            mapping is outputted then the model will always output scores).
 
     Returns:
         The predicted labels.
@@ -185,11 +198,29 @@
             generated_label = "".join(previously_generated_labels) + generated_label
 
             # Get the candidate labels that starts with the generated label
-            candidate_output_labels = {
-                candidate_label
-                for candidate_label in candidate_labels
-                if candidate_label.startswith(generated_label)
-            }
+            if isinstance(first_label_token_mapping, dict):
+                if any(
+                    candidate_label not in first_label_token_mapping
+                    for candidate_label in candidate_labels
+                ):
+                    raise InvalidBenchmark(
+                        "There is a label not present in the first label token "
+                        "mapping - this should never happen! Please report this "
+                        "issue to the EuroEval team at "
+                        "github.com/EuroEval/EuroEval/issues."
+                    )
+
+                candidate_output_labels = {
+                    candidate_label
+                    for candidate_label in candidate_labels
+                    if generated_label == first_label_token_mapping[candidate_label]
+                }
+            else:
+                candidate_output_labels = {
+                    candidate_label
+                    for candidate_label in candidate_labels
+                    if candidate_label.startswith(generated_label)
+                }
 
             # If we can uniquely determine the output label, we break the loop. If
             # there are multiple possible labels then we store the current one, and
@@ -206,7 +237,7 @@
                 else:
                     output_label = candidate_output_labels.pop()
                     candidate_output_labels.add(output_label)
-                    log_once(
+                    raise InvalidBenchmark(
                         "Multiple candidate labels found for the generated label "
                         f"{generated_label!r}: {candidate_output_labels}. Since "
                         "this is not the first generated label, we cannot "
@@ -214,9 +245,13 @@
                        f"forced to use the arbitrary {output_label!r} as the "
                        "output label, potentially resulting in worse performance. "
                        "Please report this issue to the EuroEval team at "
-                        "github.com/EuroEval/EuroEval/issues.",
-                        level=logging.WARNING,
+                        "github.com/EuroEval/EuroEval/issues."
                     )
+            elif len(candidate_output_labels) == 0:
+                logger.debug(
+                    f"No candidate label found for the generated label "
+                    f"{generated_label!r}. The generated label is thus ignored."
+                )
 
         if output_label is not None:
             output_labels.append(output_label)
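The behavioural change in get_closest_logprobs_labels above: when first_label_token_mapping is a dict, a generated token must match the stored first token of a candidate label exactly, a label missing from the mapping is now a hard InvalidBenchmark error, and an ambiguous non-first token also raises instead of only warning; without the dict, the old prefix match is kept. The following is a standalone, illustrative-only sketch of the selection step (the function name and the example "first token" values are made up, not taken from the package):

# Illustrative reimplementation of the candidate-selection step shown in the
# diff above; not the package's actual function, and the example first tokens
# below are invented rather than produced by a real tokenizer.
def select_candidates(
    generated_label: str,
    candidate_labels: list[str],
    first_label_token_mapping: dict[str, str] | bool,
) -> set[str]:
    if isinstance(first_label_token_mapping, dict):
        # New behaviour: exact match against the pre-computed first token of
        # each candidate label.
        return {
            label
            for label in candidate_labels
            if generated_label == first_label_token_mapping[label]
        }
    # Old behaviour: any label that starts with the generated text matches.
    return {label for label in candidate_labels if label.startswith(generated_label)}

labels = ["positiv", "negativ", "neutral"]
mapping = {"positiv": "posit", "negativ": "negat", "neutral": "neut"}
print(select_candidates("posit", labels, mapping))  # {'positiv'}
print(select_candidates("ne", labels, False))       # {'negativ', 'neutral'}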
euroeval/{task_utils → task_group_utils}/text_to_text.py
@@ -17,7 +17,7 @@ from ..utils import (
 )
 
 if t.TYPE_CHECKING:
-    from transformers import EvalPrediction
+    from transformers.trainer_utils import EvalPrediction
 
     from ..types import Labels, Predictions
 
euroeval/{task_utils → task_group_utils}/token_classification.py
@@ -9,14 +9,15 @@ import demjson3
 import evaluate
 import numpy as np
 from evaluate import EvaluationModule
-from transformers import PreTrainedTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
 
 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
-    from transformers import BatchEncoding, EvalPrediction
+    from transformers.tokenization_utils_base import BatchEncoding
+    from transformers.trainer_utils import EvalPrediction
 
     from ..types import Labels, Predictions
 