EuroEval 15.5.0__py3-none-any.whl → 15.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53)
  1. euroeval/benchmark_modules/base.py +3 -2
  2. euroeval/benchmark_modules/fresh.py +8 -6
  3. euroeval/benchmark_modules/hf.py +33 -31
  4. euroeval/benchmark_modules/litellm.py +120 -56
  5. euroeval/benchmark_modules/vllm.py +41 -26
  6. euroeval/benchmarker.py +23 -21
  7. euroeval/callbacks.py +2 -2
  8. euroeval/constants.py +1 -1
  9. euroeval/data_models.py +261 -42
  10. euroeval/dataset_configs/__init__.py +61 -0
  11. euroeval/dataset_configs/danish.py +120 -0
  12. euroeval/dataset_configs/dutch.py +123 -0
  13. euroeval/dataset_configs/english.py +88 -0
  14. euroeval/dataset_configs/faroese.py +54 -0
  15. euroeval/dataset_configs/french.py +83 -0
  16. euroeval/dataset_configs/german.py +91 -0
  17. euroeval/dataset_configs/icelandic.py +148 -0
  18. euroeval/dataset_configs/italian.py +81 -0
  19. euroeval/dataset_configs/norwegian.py +178 -0
  20. euroeval/dataset_configs/spanish.py +78 -0
  21. euroeval/dataset_configs/swedish.py +100 -0
  22. euroeval/exceptions.py +10 -10
  23. euroeval/finetuning.py +6 -10
  24. euroeval/generation.py +1 -0
  25. euroeval/human_evaluation.py +2 -2
  26. euroeval/languages.py +20 -13
  27. euroeval/model_cache.py +1 -1
  28. euroeval/model_loading.py +1 -12
  29. euroeval/prompt_templates/__init__.py +8 -0
  30. euroeval/prompt_templates/linguistic_acceptability.py +112 -0
  31. euroeval/prompt_templates/multiple_choice.py +97 -0
  32. euroeval/prompt_templates/named_entity_recognition.py +257 -0
  33. euroeval/prompt_templates/reading_comprehension.py +118 -0
  34. euroeval/prompt_templates/sentiment_classification.py +137 -0
  35. euroeval/prompt_templates/summarization.py +97 -0
  36. euroeval/speed_benchmark.py +1 -1
  37. euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
  38. euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
  39. euroeval/{task_utils → task_group_utils}/sequence_classification.py +1 -1
  40. euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
  41. euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
  42. euroeval/tasks.py +54 -0
  43. euroeval/tokenization_utils.py +343 -0
  44. euroeval/types.py +3 -1
  45. euroeval/utils.py +2 -347
  46. {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/METADATA +31 -9
  47. euroeval-15.6.1.dist-info/RECORD +59 -0
  48. euroeval/dataset_configs.py +0 -2408
  49. euroeval-15.5.0.dist-info/RECORD +0 -40
  50. /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
  51. {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/WHEEL +0 -0
  52. {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/entry_points.txt +0 -0
  53. {euroeval-15.5.0.dist-info → euroeval-15.6.1.dist-info}/licenses/LICENSE +0 -0
euroeval/prompt_templates/sentiment_classification.py ADDED
@@ -0,0 +1,137 @@
+ """Templates for the Sentiment Analysis task."""
+
+ from ..data_models import PromptConfig
+ from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+
+ SENT_TEMPLATES = {
+     DA: PromptConfig(
+         default_prompt_label_mapping=dict(
+             positive="positiv", neutral="neutral", negative="negativ"
+         ),
+         default_prompt_prefix="Følgende er dokumenter og deres sentiment, som kan være "
+         "{labels_str}.",
+         default_prompt_template="Dokument: {text}\nSentiment: {label}",
+         default_instruction_prompt="Dokument: {text}\n\nKlassificer sentimentet i "
+         "dokumentet. Svar kun med {labels_str}, og intet andet.",
+     ),
+     DE: PromptConfig(
+         default_prompt_label_mapping=dict(
+             positive="positiv", neutral="neutral", negative="negativ"
+         ),
+         default_prompt_prefix="Nachfolgend finden Sie Dokumente und ihre Bewertung, "
+         "die {labels_str} sein kann.",
+         default_prompt_template="Dokument: {text}\nStimmung: {label}",
+         default_instruction_prompt="Dokument: {text}\n\nKlassifizieren Sie die "
+         "Stimmung im Dokument. Antworten Sie mit {labels_str}, und nichts anderes.",
+     ),
+     EN: PromptConfig(
+         default_prompt_label_mapping=dict(
+             positive="positive", neutral="neutral", negative="negative"
+         ),
+         default_prompt_prefix="The following are documents and their sentiment, which "
+         "can be {labels_str}.",
+         default_prompt_template="Document: {text}\nSentiment: {label}",
+         default_instruction_prompt="Document: {text}\n\nClassify the sentiment in the "
+         "document. Answer with {labels_str}, and nothing else.",
+     ),
+     ES: PromptConfig(
+         default_prompt_label_mapping=dict(
+             positive="positivo", neutral="neutral", negative="negativo"
+         ),
+         default_prompt_prefix="A continuación se muestran los documentos y su "
+         "sentimiento, que puede ser {labels_str}.",
+         default_prompt_template="Documento: {text}\nSentimiento: {label}",
+         default_instruction_prompt="Documento: {text}\n\nClasifica el sentimiento del "
+         "documento. Responde con {labels_str}, y nada más.",
+     ),
+     FO: PromptConfig(
+         default_prompt_label_mapping=dict(
+             positive="positivt", neutral="neutralt", negative="negativt"
+         ),
+         default_prompt_prefix="Niðanfyri eru skjøl og teirra kenslur, sum kunnu vera "
+         "{labels_str}.",
+         default_prompt_template="Skjal: {text}\nKensla: {label}",
+         default_instruction_prompt="Skjal: {text}\n\nFlokka kensluna í skjalinum. "
+         "Svara við {labels_str}, og einki annað.",
+     ),
+     FR: PromptConfig(
+         default_prompt_label_mapping=dict(
+             positive="positif", neutral="neutre", negative="négatif"
+         ),
+         default_prompt_prefix="Les documents suivants sont accompagnés de leur "
+         "sentiment, qui peut être {labels_str}.",
+         default_prompt_template="Document: {text}\nSentiment: {label}",
+         default_instruction_prompt="Document: {text}\n\nClassez le sentiment dans le "
+         "document. Répondez par {labels_str}, et rien d'autre.",
+     ),
+     IS: PromptConfig(
+         default_prompt_label_mapping=dict(
+             positive="jákvætt", neutral="hlutlaust", negative="neikvætt"
+         ),
+         default_prompt_prefix="Eftirfarandi eru skjöl og viðhorf þeirra, sem geta "
+         "verið {labels_str}.",
+         default_prompt_template="Skjal: {text}\nViðhorf: {label}",
+         default_instruction_prompt="Skjal: {text}\n\nFlokkaðu viðhorfið í skjalinu. "
+         "Svaraðu með {labels_str}, og ekkert annað.",
+     ),
+     IT: PromptConfig(
+         default_prompt_label_mapping=dict(
+             positive="positivo", neutral="neutro", negative="negativo"
+         ),
+         default_prompt_prefix="Di seguito sono riportati i documenti e il loro "
+         "sentiment, che può essere {labels_str}.",
+         default_prompt_template="Documento: {text}\nSentimento: {label}",
+         default_instruction_prompt="Documento: {text}\n\nClassificare il sentiment del "
+         "documento. Rispondere con {labels_str}, e nient'altro.",
+     ),
+     NB: PromptConfig(
+         default_prompt_label_mapping=dict(
+             positive="positiv", neutral="nøytral", negative="negativ"
+         ),
+         default_prompt_prefix="Her følger dokumenter og deres sentiment, som kan være "
+         "{labels_str}",
+         default_prompt_template="Dokument: {text}\nSentiment: {label}",
+         default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
+         "teksten. Svar med {labels_str}, og ikke noe annet.",
+     ),
+     NL: PromptConfig(
+         default_prompt_label_mapping=dict(
+             positive="positief", neutral="neutraal", negative="negatief"
+         ),
+         default_prompt_prefix="Hieronder volgen documenten en hun sentiment, dat "
+         "{labels_str} kan zijn.",
+         default_prompt_template="Document: {text}\nSentiment: {label}",
+         default_instruction_prompt="Document: {text}\n\nClassificeer het sentiment in "
+         "het document. Antwoord met {labels_str}, en verder niets.",
+     ),
+     NN: PromptConfig(
+         default_prompt_label_mapping=dict(
+             positive="positiv", neutral="nøytral", negative="negativ"
+         ),
+         default_prompt_prefix="Her følger dokumenter og deres sentiment, som kan være "
+         "{labels_str}",
+         default_prompt_template="Dokument: {text}\nSentiment: {label}",
+         default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
+         "teksten. Svar med {labels_str}, og ikke noe annet.",
+     ),
+     NO: PromptConfig(
+         default_prompt_label_mapping=dict(
+             positive="positiv", neutral="nøytral", negative="negativ"
+         ),
+         default_prompt_prefix="Her følger dokumenter og deres sentiment, som kan være "
+         "{labels_str}",
+         default_prompt_template="Dokument: {text}\nSentiment: {label}",
+         default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
+         "teksten. Svar med {labels_str}, og ikke noe annet.",
+     ),
+     SV: PromptConfig(
+         default_prompt_label_mapping=dict(
+             positive="positiv", neutral="neutral", negative="negativ"
+         ),
+         default_prompt_prefix="Nedan följer dokument och deras sentiment, som kan vara "
+         "{labels_str}.",
+         default_prompt_template="Dokument: {text}\nSentiment: {label}",
+         default_instruction_prompt="Dokument: {text}\n\nKlassificera känslan i "
+         "dokumentet. Svara med {labels_str}, och inget annat.",
+     ),
+ }
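
For context, here is a minimal sketch of how one of these PromptConfig entries could be turned into a rendered few-shot prompt. It reuses only the Danish strings from the new file above; the way {labels_str} is joined, the helper names and the example documents are illustrative assumptions, not EuroEval's actual rendering code.

# Minimal sketch: rendering the Danish sentiment templates into a few-shot prompt.
# The strings are copied from SENT_TEMPLATES[DA] above; everything else is assumed.
label_mapping = {"positive": "positiv", "neutral": "neutral", "negative": "negativ"}
prompt_prefix = "Følgende er dokumenter og deres sentiment, som kan være {labels_str}."
prompt_template = "Dokument: {text}\nSentiment: {label}"

# Assumed joining rule for the {labels_str} placeholder (quoted, comma-separated).
labels_str = ", ".join(f"'{label}'" for label in label_mapping.values())

few_shot_examples = [
    {"text": "Fantastisk service og hurtig levering.", "label": "positiv"},
    {"text": "Pakken kom aldrig frem.", "label": "negativ"},
]
new_document = "Maden var helt okay, hverken god eller dårlig."

sections = [prompt_prefix.format(labels_str=labels_str)]
sections += [prompt_template.format(**example) for example in few_shot_examples]
sections.append(prompt_template.format(text=new_document, label="").rstrip())
print("\n\n".join(sections))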
euroeval/prompt_templates/summarization.py ADDED
@@ -0,0 +1,97 @@
+ """Templates for the Summarization task."""
+
+ from ..data_models import PromptConfig
+ from ..languages import DA, DE, EN, ES, FR, IS, IT, NB, NL, NN, NO, SV
+
+ # TODO: Missing Faroese
+ SUMM_TEMPLATES = {
+     DA: PromptConfig(
+         default_prompt_prefix="Følgende er dokumenter med tilhørende resuméer.",
+         default_prompt_template="Dokument: {text}\nResumé: {target_text}",
+         default_instruction_prompt="Dokument: {text}\n\nSkriv et resumé af ovenstående "
+         "dokument.",
+         default_prompt_label_mapping=dict(),
+     ),
+     DE: PromptConfig(
+         default_prompt_prefix="Nachstehend finden Sie Dokumente mit zugehörigen "
+         "Zusammenfassungen.",
+         default_prompt_template="Dokument: {text}\nZusammenfassung: {target_text}",
+         default_instruction_prompt="Nachrichtenartikel: {text}\n\nSchreiben Sie eine "
+         "Zusammenfassung des oben genannten Dokuments.",
+         default_prompt_label_mapping=dict(),
+     ),
+     EN: PromptConfig(
+         default_prompt_prefix="The following are documents with accompanying "
+         "summaries.",
+         default_prompt_template="Document: {text}\nSummary: {target_text}",
+         default_instruction_prompt="Document: {text}\n\nWrite a summary of the above "
+         "document.",
+         default_prompt_label_mapping=dict(),
+     ),
+     ES: PromptConfig(
+         default_prompt_prefix="A continuación se presentan documentos con resúmenes "
+         "adjuntos.",
+         default_prompt_template="Documento: {text}\nResumen: {target_text}",
+         default_instruction_prompt="Documento: {text}\n\nEscriba un resumen del "
+         "documento anterior.",
+         default_prompt_label_mapping=dict(),
+     ),
+     FR: PromptConfig(
+         default_prompt_prefix="Les documents suivants sont accompagnés d'un résumé.",
+         default_prompt_template="Document: {text}\nRésumé: {target_text}",
+         default_instruction_prompt="Document: {text}\n\nRédigez un résumé du "
+         "document ci-dessus.",
+         default_prompt_label_mapping=dict(),
+     ),
+     IS: PromptConfig(
+         default_prompt_prefix="Eftirfarandi eru skjöl með meðfylgjandi samantektum.",
+         default_prompt_template="Skjal: {text}\nSamantekt: {target_text}",
+         default_instruction_prompt="Skjal: {text}\n\nSkrifaðu samantekt á ofangreindu "
+         "skjali.",
+         default_prompt_label_mapping=dict(),
+     ),
+     IT: PromptConfig(
+         default_prompt_prefix="Di seguito sono riportati i documenti con le relative "
+         "sintesi.",
+         default_prompt_template="Documento: {text}\nSintesi: {target_text}",
+         default_instruction_prompt="Documento: {text}\n\nScrivete una sintesi del "
+         "documento di cui sopra.",
+         default_prompt_label_mapping=dict(),
+     ),
+     NB: PromptConfig(
+         default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
+         default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
+         default_instruction_prompt="Dokument: {text}\n\nSkriv et sammendrag av "
+         "dokumentet ovenfor.",
+         default_prompt_label_mapping=dict(),
+     ),
+     NL: PromptConfig(
+         default_prompt_prefix="Hieronder volgen documenten met bijbehorende "
+         "samenvattingen.",
+         default_prompt_template="Document: {text}\nSamenvatting: {target_text}",
+         default_instruction_prompt="Document: {text}\n\nSchrijf een samenvatting van "
+         "het bovenstaande document.",
+         default_prompt_label_mapping=dict(),
+     ),
+     NN: PromptConfig(
+         default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
+         default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
+         default_instruction_prompt="Dokument: {text}\n\nSkriv et sammendrag av "
+         "dokumentet ovenfor.",
+         default_prompt_label_mapping=dict(),
+     ),
+     NO: PromptConfig(
+         default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
+         default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
+         default_instruction_prompt="Dokument: {text}\n\nSkriv et sammendrag av "
+         "dokumentet ovenfor.",
+         default_prompt_label_mapping=dict(),
+     ),
+     SV: PromptConfig(
+         default_prompt_prefix="Nedan följer dokument med tillhörande sammanfattningar.",
+         default_prompt_template="Dokument: {text}\nSammanfattning: {target_text}",
+         default_instruction_prompt="Dokument: {text}\n\nSkriv en sammanfattning av "
+         "ovanstående dokument.",
+         default_prompt_label_mapping=dict(),
+     ),
+ }
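
Since each language entry has to expose the same placeholders, a quick consistency check is straightforward. The sketch below assumes euroeval 15.6.1 is installed and that PromptConfig keeps the attribute names shown in the diff.

# Sanity-check sketch: every summarization template should contain the placeholders
# that prompt rendering relies on ({text} and {target_text}).
from euroeval.prompt_templates import SUMM_TEMPLATES

for language, config in SUMM_TEMPLATES.items():
    assert "{text}" in config.default_prompt_template, language
    assert "{target_text}" in config.default_prompt_template, language
    assert "{text}" in config.default_instruction_prompt, language
    print(f"{language}: placeholders OK")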
@@ -4,7 +4,7 @@ import logging
 
  import pyinfer
  from tqdm.auto import tqdm
- from transformers import AutoTokenizer
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
 
  from .benchmark_modules import (
      BenchmarkModule,
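
This hunk only swaps a top-level transformers import for the fully qualified submodule path. A small check, assuming transformers is installed, shows that the two spellings resolve to the same class, so the change is behaviour-neutral and mainly helps static analysis:

# Both import styles refer to the same AutoTokenizer class.
from transformers import AutoTokenizer as top_level_auto_tokenizer
from transformers.models.auto.tokenization_auto import AutoTokenizer

assert AutoTokenizer is top_level_auto_tokenizer
print(AutoTokenizer.__module__)  # transformers.models.auto.tokenization_auto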
@@ -8,7 +8,9 @@ from collections import defaultdict
 
  import numpy as np
  from datasets import Dataset
- from transformers import BatchEncoding, PreTrainedTokenizer, Trainer
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.tokenization_utils_base import BatchEncoding
+ from transformers.trainer import Trainer
 
  if t.TYPE_CHECKING:
      from ..types import Labels, Predictions
@@ -19,12 +21,12 @@ logger = logging.getLogger("euroeval")
  class MultipleChoiceClassificationTrainer(Trainer):
      """Trainer subclass for question answering tasks."""
 
-     def evaluate(
+     def evaluate(  # type: ignore[override]
          self,
          eval_dataset: "Dataset | None" = None,
          ignore_keys: list[str] | None = None,
          metric_key_prefix: str = "eval",
-     ) -> dict[str, float] | None:
+     ) -> dict[str, float]:
          """Evaluate the model on the given dataset.
 
          Args:
@@ -54,22 +56,28 @@ class MultipleChoiceClassificationTrainer(Trainer):
      metric_key_prefix=metric_key_prefix,
  )
 
+ predictions = output.predictions
+ assert isinstance(predictions, np.ndarray)
+
+ metrics = output.metrics
+ assert metrics is not None
+
  if metric_key_prefix == "test":
      preds_and_labels = postprocess_predictions_and_labels(
-         predictions=output.predictions, dataset=eval_dataset
+         predictions=predictions, dataset=eval_dataset
      )
-     output.metrics.update(self.compute_metrics(preds_and_labels))
+     assert self.compute_metrics is not None
+     new_metrics = self.compute_metrics(preds_and_labels)  # type: ignore[arg-type]
+     metrics.update(new_metrics)
 
  # Prefix all keys with metric_key_prefix + '_'
- for key in list(output.metrics.keys()):
+ for key in list(metrics.keys()):
      if not key.startswith(f"{metric_key_prefix}_"):
-         output.metrics[f"{metric_key_prefix}_{key}"] = output.metrics.pop(
-             key
-         )
+         metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
 
  # Only the main node log the results by default
  if self.args.should_log:
-     self.log(output.metrics)
+     self.log(metrics)
 
  self.control = self.callback_handler.on_evaluate(
      self.args,
@@ -77,7 +85,7 @@ class MultipleChoiceClassificationTrainer(Trainer):
      self.control,  # type: ignore[has-type]
      output.metrics,
  )
- return output.metrics
+ return metrics
 
 
  def prepare_examples(
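
The evaluate() override now copies output.metrics into a local metrics dict (asserting it is not None) before prefixing and logging, which is what lets the return type tighten to dict[str, float]. A standalone sketch of the prefixing step follows; the function name and signature are illustrative, not part of EuroEval's API.

def prefix_metric_keys(metrics: dict[str, float], prefix: str) -> dict[str, float]:
    # Ensure every metric key carries the split prefix (e.g. "test_") exactly once.
    for key in list(metrics.keys()):
        if not key.startswith(f"{prefix}_"):
            metrics[f"{prefix}_{key}"] = metrics.pop(key)
    return metrics


print(prefix_metric_keys({"mcc": 0.5, "test_macro_f1": 0.7}, prefix="test"))
# {'test_macro_f1': 0.7, 'test_mcc': 0.5}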
@@ -8,25 +8,22 @@ from collections import defaultdict
  import evaluate
  import numpy as np
  from evaluate import EvaluationModule
- from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
  from transformers.trainer import Trainer
 
  from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
- from ..utils import (
-     get_special_token_metadata,
-     raise_if_model_output_contains_nan_values,
- )
+ from ..tokenization_utils import get_special_token_metadata
+ from ..utils import raise_if_model_output_contains_nan_values
 
  if t.TYPE_CHECKING:
      import torch.nn as nn
      from datasets.arrow_dataset import Dataset
-     from transformers import (
-         EvalPrediction,
-         PreTrainedModel,
-         TrainerCallback,
-         TrainingArguments,
-     )
+     from transformers.modeling_utils import PreTrainedModel
      from transformers.tokenization_utils_base import BatchEncoding
+     from transformers.trainer_callback import TrainerCallback
+     from transformers.trainer_utils import EvalPrediction
+     from transformers.training_args import TrainingArguments
 
      from ..types import Labels, Predictions
 
@@ -47,7 +44,7 @@ class QuestionAnsweringTrainer(Trainer):
      callbacks: "list[TrainerCallback]",
      data_collator: "c.Callable",
  ) -> None:
-     """Initialize the trainer."""
+     """Initialise the trainer."""
      super().__init__(
          model=model,
          processing_class=processing_class,
@@ -68,13 +65,13 @@ class QuestionAnsweringTrainer(Trainer):
      # Set the label names
      self.label_names = ["start_positions", "end_positions"]
 
-     def evaluate(
+     def evaluate(  # type: ignore[override]
          self,
          eval_dataset: "Dataset | None" = None,
         orig_eval_dataset: "Dataset | None" = None,
          ignore_keys: list[str] | None = None,
          metric_key_prefix: str = "eval",
-     ) -> dict[str, float] | None:
+     ) -> dict[str, float]:
          """Evaluate the model on the given dataset.
 
          Args:
@@ -113,33 +110,39 @@ class QuestionAnsweringTrainer(Trainer):
  finally:
      self.compute_metrics = compute_metrics
 
+ predictions = output.predictions
+ assert isinstance(predictions, tuple)
+
+ metrics = output.metrics
+ assert metrics is not None
+
  if orig_eval_dataset is not None:
      preds_and_labels = postprocess_predictions_and_labels(
-         predictions=output.predictions,
+         predictions=predictions,  # type: ignore[arg-type]
          dataset=orig_eval_dataset,
          prepared_dataset=eval_dataset,
          cls_token_index=self.cls_token_id,
      )
-     output.metrics.update(self.compute_metrics(preds_and_labels))
+     assert self.compute_metrics is not None
+     new_metrics = self.compute_metrics(preds_and_labels)  # type: ignore[arg-type]
+     metrics.update(new_metrics)
 
  # Prefix all keys with metric_key_prefix + '_'
- for key in list(output.metrics.keys()):
+ for key in list(metrics.keys()):
      if not key.startswith(f"{metric_key_prefix}_"):
-         output.metrics[f"{metric_key_prefix}_{key}"] = output.metrics.pop(
-             key
-         )
+         metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
 
  # Only the main node log the results by default
  if self.args.should_log:
-     self.log(output.metrics)
+     self.log(metrics)
 
  self.control = self.callback_handler.on_evaluate(
      self.args,
      self.state,
      self.control,  # type: ignore[has-type]
-     output.metrics,
+     metrics,
  )
- return output.metrics
+ return metrics
 
 
  def compute_metrics(
@@ -472,7 +475,7 @@
 
 
  def postprocess_predictions_and_labels(
-     predictions: list,
+     predictions: tuple[np.ndarray, np.ndarray],
      dataset: "Dataset",
      prepared_dataset: "Dataset",
      cls_token_index: int,
@@ -492,9 +495,7 @@ def postprocess_predictions_and_labels(
  Returns:
      The postprocessed predictions and labels.
  """
- # Extract the logits from the predictions
- all_start_logits = predictions[0]
- all_end_logits = predictions[1]
+ all_start_logits, all_end_logits = predictions
 
  # Build a map from an example to its corresponding features, being the blocks of
  # text from the context that we're feeding into the model. An example can have
@@ -507,7 +508,7 @@
  features_per_example[example_index].append(i)
 
  # Loop over all the examples
- predictions = list()
+ prediction_list: list[dict[str, t.Any]] = list()
  labels = list()
  for example_index, example in enumerate(dataset):
      # Extract the best valid answer associated with the current example
@@ -530,7 +531,7 @@
  )
 
  # Add the answer to the list of predictions
- predictions.append(prediction)
+ prediction_list.append(prediction)
 
  # Create the associated reference dictionary, to be added to the list of
  # references
@@ -545,7 +546,7 @@
  # Add the answer and label to the list of predictions and labels, respectively
  labels.append(label)
 
- return predictions, labels
+ return prediction_list, labels
 
 
  def find_best_answer(
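
The reworked postprocessing treats output.predictions as a (start_logits, end_logits) tuple and returns SQuAD-style prediction and reference dicts. The sketch below only illustrates those data shapes; the ids, answer texts and the choice of the squad_v2 metric are assumptions, not necessarily what EuroEval loads internally.

# Illustrative shapes only: SQuAD-style predictions and references as produced by
# the postprocessing step, scored with a SQuAD-style metric from `evaluate`.
import evaluate

prediction_list = [
    {"id": "ex-0", "prediction_text": "København", "no_answer_probability": 0.0},
    {"id": "ex-1", "prediction_text": "", "no_answer_probability": 1.0},
]
labels = [
    {"id": "ex-0", "answers": {"text": ["København"], "answer_start": [17]}},
    {"id": "ex-1", "answers": {"text": [], "answer_start": []}},
]

squad_metric = evaluate.load("squad_v2")  # assumed metric; EuroEval's may differ
print(squad_metric.compute(predictions=prediction_list, references=labels))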
@@ -14,7 +14,7 @@ from ..exceptions import InvalidBenchmark
  from ..utils import log_once, raise_if_model_output_contains_nan_values
 
  if t.TYPE_CHECKING:
-     from transformers import EvalPrediction
+     from transformers.trainer_utils import EvalPrediction
 
      from ..data_models import DatasetConfig
      from ..types import Labels, Predictions
@@ -17,7 +17,7 @@ from ..utils import (
  )
 
  if t.TYPE_CHECKING:
-     from transformers import EvalPrediction
+     from transformers.trainer_utils import EvalPrediction
 
      from ..types import Labels, Predictions
 
@@ -9,14 +9,15 @@ import demjson3
  import evaluate
  import numpy as np
  from evaluate import EvaluationModule
- from transformers import PreTrainedTokenizer
+ from transformers.tokenization_utils import PreTrainedTokenizer
 
  from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
  from ..exceptions import InvalidBenchmark
  from ..utils import raise_if_model_output_contains_nan_values
 
  if t.TYPE_CHECKING:
-     from transformers import BatchEncoding, EvalPrediction
+     from transformers.tokenization_utils_base import BatchEncoding
+     from transformers.trainer_utils import EvalPrediction
 
      from ..types import Labels, Predictions
 
euroeval/tasks.py CHANGED
@@ -2,6 +2,14 @@
 
  from .data_models import MetricConfig, Task
  from .enums import TaskGroup
+ from .prompt_templates import (
+     LA_TEMPLATES,
+     MULTIPLE_CHOICE_TEMPLATES,
+     NER_TEMPLATES,
+     RC_TEMPLATES,
+     SENT_TEMPLATES,
+     SUMM_TEMPLATES,
+ )
 
 
  def get_all_tasks() -> dict[str, Task]:
@@ -16,6 +24,7 @@ def get_all_tasks() -> dict[str, Task]:
  LA = Task(
      name="linguistic-acceptability",
      task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
+     template_dict=LA_TEMPLATES,
      metrics=[
          MetricConfig(
              name="mcc",
@@ -31,12 +40,16 @@ LA = Task(
          compute_kwargs=dict(average="macro"),
      ),
  ],
+ default_num_few_shot_examples=12,
+ default_max_generated_tokens=5,
+ default_labels=["correct", "incorrect"],
  )
 
 
  NER = Task(
      name="named-entity-recognition",
      task_group=TaskGroup.TOKEN_CLASSIFICATION,
+     template_dict=NER_TEMPLATES,
      metrics=[
          MetricConfig(
              name="micro_f1_no_misc",
@@ -51,12 +64,26 @@ NER = Task(
          results_key="overall_f1",
      ),
  ],
+ default_num_few_shot_examples=8,
+ default_max_generated_tokens=128,
+ default_labels=[
+     "o",
+     "b-loc",
+     "i-loc",
+     "b-org",
+     "i-org",
+     "b-per",
+     "i-per",
+     "b-misc",
+     "i-misc",
+ ],
  )
 
 
  RC = Task(
      name="reading-comprehension",
      task_group=TaskGroup.QUESTION_ANSWERING,
+     template_dict=RC_TEMPLATES,
      metrics=[
          MetricConfig(
              name="f1",
@@ -73,12 +100,16 @@ RC = Task(
          postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:.2f}%"),
      ),
  ],
+ default_num_few_shot_examples=4,
+ default_max_generated_tokens=32,
+ default_labels=["start_positions", "end_positions"],
  )
 
 
  SENT = Task(
      name="sentiment-classification",
      task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
+     template_dict=SENT_TEMPLATES,
      metrics=[
          MetricConfig(
              name="mcc",
@@ -94,12 +125,16 @@ SENT = Task(
          compute_kwargs=dict(average="macro"),
      ),
  ],
+ default_num_few_shot_examples=12,
+ default_max_generated_tokens=5,
+ default_labels=["positive", "neutral", "negative"],
  )
 
 
  SUMM = Task(
      name="summarization",
      task_group=TaskGroup.TEXT_TO_TEXT,
+     template_dict=SUMM_TEMPLATES,
      metrics=[
          MetricConfig(
              name="bertscore",
@@ -117,12 +152,16 @@ SUMM = Task(
          results_key="rougeL",
      ),
  ],
+ default_num_few_shot_examples=1,
+ default_max_generated_tokens=256,
+ default_labels=[],
  )
 
 
  KNOW = Task(
      name="knowledge",
      task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+     template_dict=MULTIPLE_CHOICE_TEMPLATES,
      metrics=[
          MetricConfig(
              name="mcc",
@@ -137,12 +176,16 @@ KNOW = Task(
          results_key="accuracy",
      ),
  ],
+ default_num_few_shot_examples=5,
+ default_max_generated_tokens=5,
+ default_labels=["a", "b", "c", "d"],
  )
 
 
  MCRC = Task(
      name="multiple-choice-reading-comprehension",
      task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+     template_dict=MULTIPLE_CHOICE_TEMPLATES,
      metrics=[
          MetricConfig(
              name="mcc",
@@ -157,12 +200,16 @@ MCRC = Task(
          results_key="accuracy",
      ),
  ],
+ default_num_few_shot_examples=5,
+ default_max_generated_tokens=5,
+ default_labels=["a", "b", "c", "d"],
  )
 
 
  COMMON_SENSE = Task(
      name="common-sense-reasoning",
      task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+     template_dict=MULTIPLE_CHOICE_TEMPLATES,
      metrics=[
          MetricConfig(
              name="mcc",
@@ -177,12 +224,16 @@ COMMON_SENSE = Task(
          results_key="accuracy",
      ),
  ],
+ default_num_few_shot_examples=5,
+ default_max_generated_tokens=5,
+ default_labels=["a", "b", "c", "d"],
  )
 
 
  SPEED = Task(
      name="speed",
      task_group=TaskGroup.SPEED,
+     template_dict={},
      metrics=[
          MetricConfig(
              name="speed",
@@ -199,4 +250,7 @@ SPEED = Task(
          postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
      ),
  ],
+ default_num_few_shot_examples=0,
+ default_max_generated_tokens=5,
+ default_labels=[],
  )
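
With the new template_dict and per-task defaults in place, the task configuration can be inspected directly. A short sketch, assuming euroeval 15.6.1 is installed and that Task exposes these fields as attributes:

from euroeval.languages import DA
from euroeval.tasks import get_all_tasks

# List each task's few-shot, generation and label defaults, and whether it ships
# a Danish prompt template.
for name, task in get_all_tasks().items():
    print(
        f"{name}: few-shot={task.default_num_few_shot_examples}, "
        f"max-gen-tokens={task.default_max_generated_tokens}, "
        f"labels={task.default_labels}, "
        f"has-danish-template={DA in task.template_dict}"
    )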