EuroEval 15.5.0__py3-none-any.whl → 15.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/benchmark_modules/base.py +3 -2
- euroeval/benchmark_modules/fresh.py +8 -6
- euroeval/benchmark_modules/hf.py +33 -31
- euroeval/benchmark_modules/litellm.py +120 -56
- euroeval/benchmark_modules/vllm.py +41 -26
- euroeval/benchmarker.py +23 -21
- euroeval/callbacks.py +2 -2
- euroeval/constants.py +1 -1
- euroeval/data_models.py +257 -42
- euroeval/dataset_configs/__init__.py +61 -0
- euroeval/dataset_configs/danish.py +120 -0
- euroeval/dataset_configs/dutch.py +123 -0
- euroeval/dataset_configs/english.py +88 -0
- euroeval/dataset_configs/faroese.py +53 -0
- euroeval/dataset_configs/french.py +83 -0
- euroeval/dataset_configs/german.py +91 -0
- euroeval/dataset_configs/icelandic.py +148 -0
- euroeval/dataset_configs/italian.py +81 -0
- euroeval/dataset_configs/norwegian.py +178 -0
- euroeval/dataset_configs/spanish.py +78 -0
- euroeval/dataset_configs/swedish.py +100 -0
- euroeval/exceptions.py +10 -10
- euroeval/finetuning.py +6 -10
- euroeval/generation.py +1 -0
- euroeval/human_evaluation.py +2 -2
- euroeval/languages.py +20 -13
- euroeval/model_cache.py +1 -1
- euroeval/model_loading.py +1 -12
- euroeval/prompt_templates/__init__.py +8 -0
- euroeval/prompt_templates/linguistic_acceptability.py +112 -0
- euroeval/prompt_templates/multiple_choice.py +97 -0
- euroeval/prompt_templates/named_entity_recognition.py +257 -0
- euroeval/prompt_templates/reading_comprehension.py +118 -0
- euroeval/prompt_templates/sentiment_classification.py +137 -0
- euroeval/prompt_templates/summarization.py +97 -0
- euroeval/speed_benchmark.py +1 -1
- euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
- euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
- euroeval/{task_utils → task_group_utils}/sequence_classification.py +1 -1
- euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
- euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
- euroeval/tasks.py +54 -0
- euroeval/tokenization_utils.py +343 -0
- euroeval/types.py +3 -1
- euroeval/utils.py +2 -347
- {euroeval-15.5.0.dist-info → euroeval-15.6.0.dist-info}/METADATA +30 -9
- euroeval-15.6.0.dist-info/RECORD +59 -0
- euroeval/dataset_configs.py +0 -2408
- euroeval-15.5.0.dist-info/RECORD +0 -40
- /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
- {euroeval-15.5.0.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
- {euroeval-15.5.0.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.5.0.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
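Taken together, the file list above is a restructuring of the package: the monolithic euroeval/dataset_configs.py (2,408 lines removed) is split into per-language modules under euroeval/dataset_configs/, the default prompts move into a new euroeval/prompt_templates/ package, the task_utils package is renamed to task_group_utils, and tokenizer helpers are pulled out of utils.py into a new tokenization_utils.py. A minimal sketch of how the new layout is consumed (the import targets come from the diff below; the exact attribute access is an assumption):

# Sketch only, assuming the public layout implied by the diff below.
from euroeval.prompt_templates import SENT_TEMPLATES  # new package in 15.6.0
from euroeval.languages import DA

# Each entry maps a language to a PromptConfig holding that language's default prompts.
danish = SENT_TEMPLATES[DA]
print(danish.default_prompt_template)  # "Dokument: {text}\nSentiment: {label}"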
euroeval/prompt_templates/sentiment_classification.py
ADDED

@@ -0,0 +1,137 @@
+"""Templates for the Sentiment Analysis task."""
+
+from ..data_models import PromptConfig
+from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+
+SENT_TEMPLATES = {
+    DA: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="neutral", negative="negativ"
+        ),
+        default_prompt_prefix="Følgende er dokumenter og deres sentiment, som kan være "
+        "{labels_str}.",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassificer sentimentet i "
+        "dokumentet. Svar kun med {labels_str}, og intet andet.",
+    ),
+    DE: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="neutral", negative="negativ"
+        ),
+        default_prompt_prefix="Nachfolgend finden Sie Dokumente und ihre Bewertung, "
+        "die {labels_str} sein kann.",
+        default_prompt_template="Dokument: {text}\nStimmung: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassifizieren Sie die "
+        "Stimmung im Dokument. Antworten Sie mit {labels_str}, und nichts anderes.",
+    ),
+    EN: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positive", neutral="neutral", negative="negative"
+        ),
+        default_prompt_prefix="The following are documents and their sentiment, which "
+        "can be {labels_str}.",
+        default_prompt_template="Document: {text}\nSentiment: {label}",
+        default_instruction_prompt="Document: {text}\n\nClassify the sentiment in the "
+        "document. Answer with {labels_str}, and nothing else.",
+    ),
+    ES: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positivo", neutral="neutral", negative="negativo"
+        ),
+        default_prompt_prefix="A continuación se muestran los documentos y su "
+        "sentimiento, que puede ser {labels_str}.",
+        default_prompt_template="Documento: {text}\nSentimiento: {label}",
+        default_instruction_prompt="Documento: {text}\n\nClasifica el sentimiento del "
+        "documento. Responde con {labels_str}, y nada más.",
+    ),
+    FO: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positivt", neutral="neutralt", negative="negativt"
+        ),
+        default_prompt_prefix="Niðanfyri eru skjøl og teirra kenslur, sum kunnu vera "
+        "{labels_str}.",
+        default_prompt_template="Skjal: {text}\nKensla: {label}",
+        default_instruction_prompt="Skjal: {text}\n\nFlokka kensluna í skjalinum. "
+        "Svara við {labels_str}, og einki annað.",
+    ),
+    FR: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positif", neutral="neutre", negative="négatif"
+        ),
+        default_prompt_prefix="Les documents suivants sont accompagnés de leur "
+        "sentiment, qui peut être {labels_str}.",
+        default_prompt_template="Document: {text}\nSentiment: {label}",
+        default_instruction_prompt="Document: {text}\n\nClassez le sentiment dans le "
+        "document. Répondez par {labels_str}, et rien d'autre.",
+    ),
+    IS: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="jákvætt", neutral="hlutlaust", negative="neikvætt"
+        ),
+        default_prompt_prefix="Eftirfarandi eru skjöl og viðhorf þeirra, sem geta "
+        "verið {labels_str}.",
+        default_prompt_template="Skjal: {text}\nViðhorf: {label}",
+        default_instruction_prompt="Skjal: {text}\n\nFlokkaðu viðhorfið í skjalinu. "
+        "Svaraðu með {labels_str}, og ekkert annað.",
+    ),
+    IT: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positivo", neutral="neutro", negative="negativo"
+        ),
+        default_prompt_prefix="Di seguito sono riportati i documenti e il loro "
+        "sentiment, che può essere {labels_str}.",
+        default_prompt_template="Documento: {text}\nSentimento: {label}",
+        default_instruction_prompt="Documento: {text}\n\nClassificare il sentiment del "
+        "documento. Rispondere con {labels_str}, e nient'altro.",
+    ),
+    NB: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="nøytral", negative="negativ"
+        ),
+        default_prompt_prefix="Her følger dokumenter og deres sentiment, som kan være "
+        "{labels_str}",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
+        "teksten. Svar med {labels_str}, og ikke noe annet.",
+    ),
+    NL: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positief", neutral="neutraal", negative="negatief"
+        ),
+        default_prompt_prefix="Hieronder volgen documenten en hun sentiment, dat "
+        "{labels_str} kan zijn.",
+        default_prompt_template="Document: {text}\nSentiment: {label}",
+        default_instruction_prompt="Document: {text}\n\nClassificeer het sentiment in "
+        "het document. Antwoord met {labels_str}, en verder niets.",
+    ),
+    NN: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="nøytral", negative="negativ"
+        ),
+        default_prompt_prefix="Her følger dokumenter og deres sentiment, som kan være "
+        "{labels_str}",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
+        "teksten. Svar med {labels_str}, og ikke noe annet.",
+    ),
+    NO: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="nøytral", negative="negativ"
+        ),
+        default_prompt_prefix="Her følger dokumenter og deres sentiment, som kan være "
+        "{labels_str}",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
+        "teksten. Svar med {labels_str}, og ikke noe annet.",
+    ),
+    SV: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="neutral", negative="negativ"
+        ),
+        default_prompt_prefix="Nedan följer dokument och deras sentiment, som kan vara "
+        "{labels_str}.",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassificera känslan i "
+        "dokumentet. Svara med {labels_str}, och inget annat.",
+    ),
+}
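Each PromptConfig above carries a few-shot prefix, a per-example template, an instruction-tuned prompt, and a mapping of the canonical labels into the target language. A rough sketch of how these fields combine into a few-shot prompt; the label separator and the render_example helper are illustrative assumptions, not EuroEval's actual rendering code:

from euroeval.languages import DA
from euroeval.prompt_templates import SENT_TEMPLATES

config = SENT_TEMPLATES[DA]

# "positiv, neutral, negativ"; the exact separator used by EuroEval is an assumption.
labels_str = ", ".join(config.default_prompt_label_mapping.values())
prefix = config.default_prompt_prefix.format(labels_str=labels_str)


def render_example(text: str, label: str) -> str:
    """Fill the few-shot template with one document and its mapped label."""
    mapped_label = config.default_prompt_label_mapping[label]
    return config.default_prompt_template.format(text=text, label=mapped_label)


prompt = prefix + "\n\n" + render_example("Jeg elsker denne film!", "positive")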
euroeval/prompt_templates/summarization.py
ADDED

@@ -0,0 +1,97 @@
+"""Templates for the Summarization task."""
+
+from ..data_models import PromptConfig
+from ..languages import DA, DE, EN, ES, FR, IS, IT, NB, NL, NN, NO, SV
+
+# TODO: Missing Faroese
+SUMM_TEMPLATES = {
+    DA: PromptConfig(
+        default_prompt_prefix="Følgende er dokumenter med tilhørende resuméer.",
+        default_prompt_template="Dokument: {text}\nResumé: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv et resumé af ovenstående "
+        "dokument.",
+        default_prompt_label_mapping=dict(),
+    ),
+    DE: PromptConfig(
+        default_prompt_prefix="Nachstehend finden Sie Dokumente mit zugehörigen "
+        "Zusammenfassungen.",
+        default_prompt_template="Dokument: {text}\nZusammenfassung: {target_text}",
+        default_instruction_prompt="Nachrichtenartikel: {text}\n\nSchreiben Sie eine "
+        "Zusammenfassung des oben genannten Dokuments.",
+        default_prompt_label_mapping=dict(),
+    ),
+    EN: PromptConfig(
+        default_prompt_prefix="The following are documents with accompanying "
+        "summaries.",
+        default_prompt_template="Document: {text}\nSummary: {target_text}",
+        default_instruction_prompt="Document: {text}\n\nWrite a summary of the above "
+        "document.",
+        default_prompt_label_mapping=dict(),
+    ),
+    ES: PromptConfig(
+        default_prompt_prefix="A continuación se presentan documentos con resúmenes "
+        "adjuntos.",
+        default_prompt_template="Documento: {text}\nResumen: {target_text}",
+        default_instruction_prompt="Documento: {text}\n\nEscriba un resumen del "
+        "documento anterior.",
+        default_prompt_label_mapping=dict(),
+    ),
+    FR: PromptConfig(
+        default_prompt_prefix="Les documents suivants sont accompagnés d'un résumé.",
+        default_prompt_template="Document: {text}\nRésumé: {target_text}",
+        default_instruction_prompt="Document: {text}\n\nRédigez un résumé du "
+        "document ci-dessus.",
+        default_prompt_label_mapping=dict(),
+    ),
+    IS: PromptConfig(
+        default_prompt_prefix="Eftirfarandi eru skjöl með meðfylgjandi samantektum.",
+        default_prompt_template="Skjal: {text}\nSamantekt: {target_text}",
+        default_instruction_prompt="Skjal: {text}\n\nSkrifaðu samantekt á ofangreindu "
+        "skjali.",
+        default_prompt_label_mapping=dict(),
+    ),
+    IT: PromptConfig(
+        default_prompt_prefix="Di seguito sono riportati i documenti con le relative "
+        "sintesi.",
+        default_prompt_template="Documento: {text}\nSintesi: {target_text}",
+        default_instruction_prompt="Documento: {text}\n\nScrivete una sintesi del "
+        "documento di cui sopra.",
+        default_prompt_label_mapping=dict(),
+    ),
+    NB: PromptConfig(
+        default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
+        default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv et sammendrag av "
+        "dokumentet ovenfor.",
+        default_prompt_label_mapping=dict(),
+    ),
+    NL: PromptConfig(
+        default_prompt_prefix="Hieronder volgen documenten met bijbehorende "
+        "samenvattingen.",
+        default_prompt_template="Document: {text}\nSamenvatting: {target_text}",
+        default_instruction_prompt="Document: {text}\n\nSchrijf een samenvatting van "
+        "het bovenstaande document.",
+        default_prompt_label_mapping=dict(),
+    ),
+    NN: PromptConfig(
+        default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
+        default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv et sammendrag av "
+        "dokumentet ovenfor.",
+        default_prompt_label_mapping=dict(),
+    ),
+    NO: PromptConfig(
+        default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
+        default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv et sammendrag av "
+        "dokumentet ovenfor.",
+        default_prompt_label_mapping=dict(),
+    ),
+    SV: PromptConfig(
+        default_prompt_prefix="Nedan följer dokument med tillhörande sammanfattningar.",
+        default_prompt_template="Dokument: {text}\nSammanfattning: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv en sammanfattning av "
+        "ovanstående dokument.",
+        default_prompt_label_mapping=dict(),
+    ),
+}
euroeval/speed_benchmark.py
CHANGED
euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py
CHANGED

@@ -8,7 +8,9 @@ from collections import defaultdict
 
 import numpy as np
 from datasets import Dataset
-from transformers import
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import BatchEncoding
+from transformers.trainer import Trainer
 
 if t.TYPE_CHECKING:
     from ..types import Labels, Predictions

@@ -19,12 +21,12 @@ logger = logging.getLogger("euroeval")
 class MultipleChoiceClassificationTrainer(Trainer):
     """Trainer subclass for question answering tasks."""
 
-    def evaluate(
+    def evaluate(  # type: ignore[override]
         self,
         eval_dataset: "Dataset | None" = None,
         ignore_keys: list[str] | None = None,
         metric_key_prefix: str = "eval",
-    ) -> dict[str, float]
+    ) -> dict[str, float]:
         """Evaluate the model on the given dataset.
 
         Args:

@@ -54,22 +56,28 @@ class MultipleChoiceClassificationTrainer(Trainer):
             metric_key_prefix=metric_key_prefix,
         )
 
+        predictions = output.predictions
+        assert isinstance(predictions, np.ndarray)
+
+        metrics = output.metrics
+        assert metrics is not None
+
         if metric_key_prefix == "test":
             preds_and_labels = postprocess_predictions_and_labels(
-                predictions=
+                predictions=predictions, dataset=eval_dataset
             )
-
+            assert self.compute_metrics is not None
+            new_metrics = self.compute_metrics(preds_and_labels)  # type: ignore[arg-type]
+            metrics.update(new_metrics)
 
         # Prefix all keys with metric_key_prefix + '_'
-        for key in list(
+        for key in list(metrics.keys()):
             if not key.startswith(f"{metric_key_prefix}_"):
-
-                key
-                )
+                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
 
         # Only the main node log the results by default
         if self.args.should_log:
-            self.log(
+            self.log(metrics)
 
         self.control = self.callback_handler.on_evaluate(
             self.args,

@@ -77,7 +85,7 @@ class MultipleChoiceClassificationTrainer(Trainer):
             self.control,  # type: ignore[has-type]
             output.metrics,
         )
-        return
+        return metrics
 
 
 def prepare_examples(
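The rewritten evaluate() now pulls the predictions and metrics out of the prediction-loop output, asserts their types, recomputes the task metrics on the test split, and prefixes any unprefixed keys before logging. The key-prefixing idiom in isolation, with made-up values for illustration:

metrics = {"mcc": 0.42, "eval_loss": 0.73}
metric_key_prefix = "eval"

# Rename e.g. "mcc" -> "eval_mcc" in place, leaving already-prefixed keys alone.
for key in list(metrics.keys()):
    if not key.startswith(f"{metric_key_prefix}_"):
        metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

assert metrics == {"eval_loss": 0.73, "eval_mcc": 0.42}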
euroeval/{task_utils → task_group_utils}/question_answering.py
CHANGED

@@ -8,25 +8,22 @@ from collections import defaultdict
 import evaluate
 import numpy as np
 from evaluate import EvaluationModule
-from transformers import PreTrainedTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from transformers.trainer import Trainer
 
 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
-from ..
-
-    raise_if_model_output_contains_nan_values,
-)
+from ..tokenization_utils import get_special_token_metadata
+from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
     import torch.nn as nn
     from datasets.arrow_dataset import Dataset
-    from transformers import
-        EvalPrediction,
-        PreTrainedModel,
-        TrainerCallback,
-        TrainingArguments,
-    )
+    from transformers.modeling_utils import PreTrainedModel
     from transformers.tokenization_utils_base import BatchEncoding
+    from transformers.trainer_callback import TrainerCallback
+    from transformers.trainer_utils import EvalPrediction
+    from transformers.training_args import TrainingArguments
 
     from ..types import Labels, Predictions
 

@@ -47,7 +44,7 @@ class QuestionAnsweringTrainer(Trainer):
         callbacks: "list[TrainerCallback]",
         data_collator: "c.Callable",
     ) -> None:
-        """
+        """Initialise the trainer."""
         super().__init__(
             model=model,
             processing_class=processing_class,

@@ -68,13 +65,13 @@ class QuestionAnsweringTrainer(Trainer):
         # Set the label names
         self.label_names = ["start_positions", "end_positions"]
 
-    def evaluate(
+    def evaluate(  # type: ignore[override]
         self,
         eval_dataset: "Dataset | None" = None,
         orig_eval_dataset: "Dataset | None" = None,
         ignore_keys: list[str] | None = None,
         metric_key_prefix: str = "eval",
-    ) -> dict[str, float]
+    ) -> dict[str, float]:
         """Evaluate the model on the given dataset.
 
         Args:

@@ -113,33 +110,39 @@ class QuestionAnsweringTrainer(Trainer):
         finally:
             self.compute_metrics = compute_metrics
 
+        predictions = output.predictions
+        assert isinstance(predictions, tuple)
+
+        metrics = output.metrics
+        assert metrics is not None
+
         if orig_eval_dataset is not None:
             preds_and_labels = postprocess_predictions_and_labels(
-                predictions=
+                predictions=predictions,  # type: ignore[arg-type]
                 dataset=orig_eval_dataset,
                 prepared_dataset=eval_dataset,
                 cls_token_index=self.cls_token_id,
             )
-
+            assert self.compute_metrics is not None
+            new_metrics = self.compute_metrics(preds_and_labels)  # type: ignore[arg-type]
+            metrics.update(new_metrics)
 
         # Prefix all keys with metric_key_prefix + '_'
-        for key in list(
+        for key in list(metrics.keys()):
             if not key.startswith(f"{metric_key_prefix}_"):
-
-                key
-                )
+                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
 
         # Only the main node log the results by default
         if self.args.should_log:
-            self.log(
+            self.log(metrics)
 
         self.control = self.callback_handler.on_evaluate(
             self.args,
             self.state,
             self.control,  # type: ignore[has-type]
-
+            metrics,
         )
-        return
+        return metrics
 
 
 def compute_metrics(

@@ -472,7 +475,7 @@ def prepare_test_examples(
 
 
 def postprocess_predictions_and_labels(
-    predictions:
+    predictions: tuple[np.ndarray, np.ndarray],
     dataset: "Dataset",
     prepared_dataset: "Dataset",
     cls_token_index: int,

@@ -492,9 +495,7 @@ def postprocess_predictions_and_labels(
     Returns:
         The postprocessed predictions and labels.
     """
-
-    all_start_logits = predictions[0]
-    all_end_logits = predictions[1]
+    all_start_logits, all_end_logits = predictions
 
     # Build a map from an example to its corresponding features, being the blocks of
     # text from the context that we're feeding into the model. An example can have

@@ -507,7 +508,7 @@ def postprocess_predictions_and_labels(
         features_per_example[example_index].append(i)
 
     # Loop over all the examples
-
+    prediction_list: list[dict[str, t.Any]] = list()
     labels = list()
     for example_index, example in enumerate(dataset):
         # Extract the best valid answer associated with the current example

@@ -530,7 +531,7 @@ def postprocess_predictions_and_labels(
         )
 
         # Add the answer to the list of predictions
-
+        prediction_list.append(prediction)
 
         # Create the associated reference dictionary, to be added to the list of
         # references

@@ -545,7 +546,7 @@ def postprocess_predictions_and_labels(
         # Add the answer and label to the list of predictions and labels, respectively
         labels.append(label)
 
-    return
+    return prediction_list, labels
 
 
 def find_best_answer(
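postprocess_predictions_and_labels now takes the (start_logits, end_logits) tuple explicitly and still relies on a mapping from each original example to the overlapping features it was split into. A minimal sketch of that mapping, using a made-up prepared dataset rather than EuroEval's real data structures:

from collections import defaultdict

# Hypothetical prepared dataset: each feature remembers the id of its source example.
prepared_dataset = [{"id": "q0"}, {"id": "q0"}, {"id": "q1"}]
example_ids = ["q0", "q1"]

id_to_index = {example_id: idx for idx, example_id in enumerate(example_ids)}
features_per_example: dict[int, list[int]] = defaultdict(list)
for i, feature in enumerate(prepared_dataset):
    features_per_example[id_to_index[feature["id"]]].append(i)

# Example "q0" was split into features 0 and 1; example "q1" fits in feature 2.
assert dict(features_per_example) == {0: [0, 1], 1: [2]}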
@@ -14,7 +14,7 @@ from ..exceptions import InvalidBenchmark
 from ..utils import log_once, raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
-    from transformers import EvalPrediction
+    from transformers.trainer_utils import EvalPrediction
 
     from ..data_models import DatasetConfig
     from ..types import Labels, Predictions
euroeval/{task_utils → task_group_utils}/token_classification.py
CHANGED

@@ -9,14 +9,15 @@ import demjson3
 import evaluate
 import numpy as np
 from evaluate import EvaluationModule
-from transformers import PreTrainedTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
 
 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
-    from transformers import BatchEncoding
+    from transformers.tokenization_utils_base import BatchEncoding
+    from transformers.trainer_utils import EvalPrediction
 
     from ..types import Labels, Predictions
 
euroeval/tasks.py
CHANGED
@@ -2,6 +2,14 @@
 
 from .data_models import MetricConfig, Task
 from .enums import TaskGroup
+from .prompt_templates import (
+    LA_TEMPLATES,
+    MULTIPLE_CHOICE_TEMPLATES,
+    NER_TEMPLATES,
+    RC_TEMPLATES,
+    SENT_TEMPLATES,
+    SUMM_TEMPLATES,
+)
 
 
 def get_all_tasks() -> dict[str, Task]:

@@ -16,6 +24,7 @@ def get_all_tasks() -> dict[str, Task]:
 LA = Task(
     name="linguistic-acceptability",
     task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
+    template_dict=LA_TEMPLATES,
     metrics=[
         MetricConfig(
             name="mcc",

@@ -31,12 +40,16 @@ LA = Task(
             compute_kwargs=dict(average="macro"),
         ),
     ],
+    default_num_few_shot_examples=12,
+    default_max_generated_tokens=5,
+    default_labels=["correct", "incorrect"],
 )
 
 
 NER = Task(
     name="named-entity-recognition",
     task_group=TaskGroup.TOKEN_CLASSIFICATION,
+    template_dict=NER_TEMPLATES,
     metrics=[
         MetricConfig(
             name="micro_f1_no_misc",

@@ -51,12 +64,26 @@ NER = Task(
             results_key="overall_f1",
         ),
     ],
+    default_num_few_shot_examples=8,
+    default_max_generated_tokens=128,
+    default_labels=[
+        "o",
+        "b-loc",
+        "i-loc",
+        "b-org",
+        "i-org",
+        "b-per",
+        "i-per",
+        "b-misc",
+        "i-misc",
+    ],
 )
 
 
 RC = Task(
     name="reading-comprehension",
     task_group=TaskGroup.QUESTION_ANSWERING,
+    template_dict=RC_TEMPLATES,
     metrics=[
         MetricConfig(
             name="f1",

@@ -73,12 +100,16 @@ RC = Task(
             postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:.2f}%"),
         ),
     ],
+    default_num_few_shot_examples=4,
+    default_max_generated_tokens=32,
+    default_labels=["start_positions", "end_positions"],
 )
 
 
 SENT = Task(
     name="sentiment-classification",
     task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
+    template_dict=SENT_TEMPLATES,
     metrics=[
         MetricConfig(
             name="mcc",

@@ -94,12 +125,16 @@ SENT = Task(
             compute_kwargs=dict(average="macro"),
         ),
     ],
+    default_num_few_shot_examples=12,
+    default_max_generated_tokens=5,
+    default_labels=["positive", "neutral", "negative"],
 )
 
 
 SUMM = Task(
     name="summarization",
     task_group=TaskGroup.TEXT_TO_TEXT,
+    template_dict=SUMM_TEMPLATES,
     metrics=[
         MetricConfig(
             name="bertscore",

@@ -117,12 +152,16 @@ SUMM = Task(
             results_key="rougeL",
         ),
     ],
+    default_num_few_shot_examples=1,
+    default_max_generated_tokens=256,
+    default_labels=[],
 )
 
 
 KNOW = Task(
     name="knowledge",
     task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[
         MetricConfig(
             name="mcc",

@@ -137,12 +176,16 @@ KNOW = Task(
             results_key="accuracy",
         ),
     ],
+    default_num_few_shot_examples=5,
+    default_max_generated_tokens=5,
+    default_labels=["a", "b", "c", "d"],
 )
 
 
 MCRC = Task(
     name="multiple-choice-reading-comprehension",
     task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[
         MetricConfig(
             name="mcc",

@@ -157,12 +200,16 @@ MCRC = Task(
             results_key="accuracy",
         ),
     ],
+    default_num_few_shot_examples=5,
+    default_max_generated_tokens=5,
+    default_labels=["a", "b", "c", "d"],
 )
 
 
 COMMON_SENSE = Task(
     name="common-sense-reasoning",
     task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[
         MetricConfig(
             name="mcc",

@@ -177,12 +224,16 @@ COMMON_SENSE = Task(
             results_key="accuracy",
         ),
     ],
+    default_num_few_shot_examples=5,
+    default_max_generated_tokens=5,
+    default_labels=["a", "b", "c", "d"],
 )
 
 
 SPEED = Task(
     name="speed",
     task_group=TaskGroup.SPEED,
+    template_dict={},
     metrics=[
         MetricConfig(
             name="speed",

@@ -199,4 +250,7 @@ SPEED = Task(
             postprocessing_fn=lambda raw_score: (raw_score, f"{raw_score:,.0f}"),
         ),
     ],
+    default_num_few_shot_examples=0,
+    default_max_generated_tokens=5,
+    default_labels=[],
 )
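With template_dict and the default_* fields attached to each Task, per-task defaults no longer have to be repeated in every dataset config. A small sketch of reading them off the module-level task objects; the attribute names are taken from the diff above, and the printed values are the defaults it adds:

from euroeval.languages import DA
from euroeval.tasks import SENT, SUMM

print(SENT.default_num_few_shot_examples)  # 12
print(SENT.default_max_generated_tokens)   # 5
print(SENT.default_labels)                 # ["positive", "neutral", "negative"]

# template_dict maps language objects to that task's default PromptConfig.
print(SENT.template_dict[DA].default_instruction_prompt)
print(SUMM.default_max_generated_tokens)   # 256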