EuroEval 15.4.2__py3-none-any.whl → 15.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +2 -2
- euroeval/benchmark_modules/base.py +3 -2
- euroeval/benchmark_modules/fresh.py +8 -6
- euroeval/benchmark_modules/hf.py +44 -33
- euroeval/benchmark_modules/litellm.py +314 -120
- euroeval/benchmark_modules/vllm.py +99 -59
- euroeval/benchmarker.py +52 -21
- euroeval/callbacks.py +2 -2
- euroeval/constants.py +9 -2
- euroeval/data_models.py +258 -44
- euroeval/dataset_configs/__init__.py +61 -0
- euroeval/dataset_configs/danish.py +120 -0
- euroeval/dataset_configs/dutch.py +123 -0
- euroeval/dataset_configs/english.py +88 -0
- euroeval/dataset_configs/faroese.py +53 -0
- euroeval/dataset_configs/french.py +83 -0
- euroeval/dataset_configs/german.py +91 -0
- euroeval/dataset_configs/icelandic.py +148 -0
- euroeval/dataset_configs/italian.py +81 -0
- euroeval/dataset_configs/norwegian.py +178 -0
- euroeval/dataset_configs/spanish.py +78 -0
- euroeval/dataset_configs/swedish.py +100 -0
- euroeval/exceptions.py +10 -10
- euroeval/finetuning.py +6 -10
- euroeval/generation.py +1 -0
- euroeval/human_evaluation.py +2 -2
- euroeval/languages.py +20 -13
- euroeval/model_cache.py +1 -1
- euroeval/model_loading.py +1 -12
- euroeval/prompt_templates/__init__.py +8 -0
- euroeval/prompt_templates/linguistic_acceptability.py +112 -0
- euroeval/prompt_templates/multiple_choice.py +97 -0
- euroeval/prompt_templates/named_entity_recognition.py +257 -0
- euroeval/prompt_templates/reading_comprehension.py +118 -0
- euroeval/prompt_templates/sentiment_classification.py +137 -0
- euroeval/prompt_templates/summarization.py +97 -0
- euroeval/speed_benchmark.py +1 -1
- euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
- euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
- euroeval/{task_utils → task_group_utils}/sequence_classification.py +45 -10
- euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
- euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
- euroeval/tasks.py +54 -0
- euroeval/tokenization_utils.py +343 -0
- euroeval/types.py +3 -1
- euroeval/utils.py +5 -254
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/METADATA +31 -9
- euroeval-15.6.0.dist-info/RECORD +59 -0
- euroeval/dataset_configs.py +0 -2408
- euroeval-15.4.2.dist-info/RECORD +0 -40
- /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
euroeval/prompt_templates/sentiment_classification.py ADDED

@@ -0,0 +1,137 @@
+"""Templates for the Sentiment Analysis task."""
+
+from ..data_models import PromptConfig
+from ..languages import DA, DE, EN, ES, FO, FR, IS, IT, NB, NL, NN, NO, SV
+
+SENT_TEMPLATES = {
+    DA: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="neutral", negative="negativ"
+        ),
+        default_prompt_prefix="Følgende er dokumenter og deres sentiment, som kan være "
+        "{labels_str}.",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassificer sentimentet i "
+        "dokumentet. Svar kun med {labels_str}, og intet andet.",
+    ),
+    DE: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="neutral", negative="negativ"
+        ),
+        default_prompt_prefix="Nachfolgend finden Sie Dokumente und ihre Bewertung, "
+        "die {labels_str} sein kann.",
+        default_prompt_template="Dokument: {text}\nStimmung: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassifizieren Sie die "
+        "Stimmung im Dokument. Antworten Sie mit {labels_str}, und nichts anderes.",
+    ),
+    EN: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positive", neutral="neutral", negative="negative"
+        ),
+        default_prompt_prefix="The following are documents and their sentiment, which "
+        "can be {labels_str}.",
+        default_prompt_template="Document: {text}\nSentiment: {label}",
+        default_instruction_prompt="Document: {text}\n\nClassify the sentiment in the "
+        "document. Answer with {labels_str}, and nothing else.",
+    ),
+    ES: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positivo", neutral="neutral", negative="negativo"
+        ),
+        default_prompt_prefix="A continuación se muestran los documentos y su "
+        "sentimiento, que puede ser {labels_str}.",
+        default_prompt_template="Documento: {text}\nSentimiento: {label}",
+        default_instruction_prompt="Documento: {text}\n\nClasifica el sentimiento del "
+        "documento. Responde con {labels_str}, y nada más.",
+    ),
+    FO: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positivt", neutral="neutralt", negative="negativt"
+        ),
+        default_prompt_prefix="Niðanfyri eru skjøl og teirra kenslur, sum kunnu vera "
+        "{labels_str}.",
+        default_prompt_template="Skjal: {text}\nKensla: {label}",
+        default_instruction_prompt="Skjal: {text}\n\nFlokka kensluna í skjalinum. "
+        "Svara við {labels_str}, og einki annað.",
+    ),
+    FR: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positif", neutral="neutre", negative="négatif"
+        ),
+        default_prompt_prefix="Les documents suivants sont accompagnés de leur "
+        "sentiment, qui peut être {labels_str}.",
+        default_prompt_template="Document: {text}\nSentiment: {label}",
+        default_instruction_prompt="Document: {text}\n\nClassez le sentiment dans le "
+        "document. Répondez par {labels_str}, et rien d'autre.",
+    ),
+    IS: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="jákvætt", neutral="hlutlaust", negative="neikvætt"
+        ),
+        default_prompt_prefix="Eftirfarandi eru skjöl og viðhorf þeirra, sem geta "
+        "verið {labels_str}.",
+        default_prompt_template="Skjal: {text}\nViðhorf: {label}",
+        default_instruction_prompt="Skjal: {text}\n\nFlokkaðu viðhorfið í skjalinu. "
+        "Svaraðu með {labels_str}, og ekkert annað.",
+    ),
+    IT: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positivo", neutral="neutro", negative="negativo"
+        ),
+        default_prompt_prefix="Di seguito sono riportati i documenti e il loro "
+        "sentiment, che può essere {labels_str}.",
+        default_prompt_template="Documento: {text}\nSentimento: {label}",
+        default_instruction_prompt="Documento: {text}\n\nClassificare il sentiment del "
+        "documento. Rispondere con {labels_str}, e nient'altro.",
+    ),
+    NB: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="nøytral", negative="negativ"
+        ),
+        default_prompt_prefix="Her følger dokumenter og deres sentiment, som kan være "
+        "{labels_str}",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
+        "teksten. Svar med {labels_str}, og ikke noe annet.",
+    ),
+    NL: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positief", neutral="neutraal", negative="negatief"
+        ),
+        default_prompt_prefix="Hieronder volgen documenten en hun sentiment, dat "
+        "{labels_str} kan zijn.",
+        default_prompt_template="Document: {text}\nSentiment: {label}",
+        default_instruction_prompt="Document: {text}\n\nClassificeer het sentiment in "
+        "het document. Antwoord met {labels_str}, en verder niets.",
+    ),
+    NN: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="nøytral", negative="negativ"
+        ),
+        default_prompt_prefix="Her følger dokumenter og deres sentiment, som kan være "
+        "{labels_str}",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
+        "teksten. Svar med {labels_str}, og ikke noe annet.",
+    ),
+    NO: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="nøytral", negative="negativ"
+        ),
+        default_prompt_prefix="Her følger dokumenter og deres sentiment, som kan være "
+        "{labels_str}",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassifiser følelsen i "
+        "teksten. Svar med {labels_str}, og ikke noe annet.",
+    ),
+    SV: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positiv", neutral="neutral", negative="negativ"
+        ),
+        default_prompt_prefix="Nedan följer dokument och deras sentiment, som kan vara "
+        "{labels_str}.",
+        default_prompt_template="Dokument: {text}\nSentiment: {label}",
+        default_instruction_prompt="Dokument: {text}\n\nKlassificera känslan i "
+        "dokumentet. Svara med {labels_str}, och inget annat.",
+    ),
+}
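For orientation, here is a minimal sketch of how the fields of one of these PromptConfig entries could be combined into a few-shot prompt. The build_few_shot_prompt helper and the example data are hypothetical and not part of EuroEval's API; the real prompt construction lives elsewhere in the package.

# Hypothetical sketch, not EuroEval's actual prompt builder: combine the EN
# sentiment config's prefix, template and label mapping into a few-shot prompt.
en_config = {
    "prefix": "The following are documents and their sentiment, which can be {labels_str}.",
    "template": "Document: {text}\nSentiment: {label}",
    "label_mapping": {"positive": "positive", "neutral": "neutral", "negative": "negative"},
}


def build_few_shot_prompt(
    config: dict, examples: list[tuple[str, str]], new_text: str
) -> str:
    """Join the prefix, labelled examples and the unlabelled target document."""
    labels_str = ", ".join(config["label_mapping"].values())
    parts = [config["prefix"].format(labels_str=labels_str)]
    for text, label in examples:
        parts.append(
            config["template"].format(text=text, label=config["label_mapping"][label])
        )
    # Leave the label empty for the document the model should classify.
    parts.append(config["template"].format(text=new_text, label="").rstrip())
    return "\n\n".join(parts)


print(build_few_shot_prompt(en_config, [("Great product!", "positive")], "Not worth it."))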
euroeval/prompt_templates/summarization.py ADDED

@@ -0,0 +1,97 @@
+"""Templates for the Summarization task."""
+
+from ..data_models import PromptConfig
+from ..languages import DA, DE, EN, ES, FR, IS, IT, NB, NL, NN, NO, SV
+
+# TODO: Missing Faroese
+SUMM_TEMPLATES = {
+    DA: PromptConfig(
+        default_prompt_prefix="Følgende er dokumenter med tilhørende resuméer.",
+        default_prompt_template="Dokument: {text}\nResumé: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv et resumé af ovenstående "
+        "dokument.",
+        default_prompt_label_mapping=dict(),
+    ),
+    DE: PromptConfig(
+        default_prompt_prefix="Nachstehend finden Sie Dokumente mit zugehörigen "
+        "Zusammenfassungen.",
+        default_prompt_template="Dokument: {text}\nZusammenfassung: {target_text}",
+        default_instruction_prompt="Nachrichtenartikel: {text}\n\nSchreiben Sie eine "
+        "Zusammenfassung des oben genannten Dokuments.",
+        default_prompt_label_mapping=dict(),
+    ),
+    EN: PromptConfig(
+        default_prompt_prefix="The following are documents with accompanying "
+        "summaries.",
+        default_prompt_template="Document: {text}\nSummary: {target_text}",
+        default_instruction_prompt="Document: {text}\n\nWrite a summary of the above "
+        "document.",
+        default_prompt_label_mapping=dict(),
+    ),
+    ES: PromptConfig(
+        default_prompt_prefix="A continuación se presentan documentos con resúmenes "
+        "adjuntos.",
+        default_prompt_template="Documento: {text}\nResumen: {target_text}",
+        default_instruction_prompt="Documento: {text}\n\nEscriba un resumen del "
+        "documento anterior.",
+        default_prompt_label_mapping=dict(),
+    ),
+    FR: PromptConfig(
+        default_prompt_prefix="Les documents suivants sont accompagnés d'un résumé.",
+        default_prompt_template="Document: {text}\nRésumé: {target_text}",
+        default_instruction_prompt="Document: {text}\n\nRédigez un résumé du "
+        "document ci-dessus.",
+        default_prompt_label_mapping=dict(),
+    ),
+    IS: PromptConfig(
+        default_prompt_prefix="Eftirfarandi eru skjöl með meðfylgjandi samantektum.",
+        default_prompt_template="Skjal: {text}\nSamantekt: {target_text}",
+        default_instruction_prompt="Skjal: {text}\n\nSkrifaðu samantekt á ofangreindu "
+        "skjali.",
+        default_prompt_label_mapping=dict(),
+    ),
+    IT: PromptConfig(
+        default_prompt_prefix="Di seguito sono riportati i documenti con le relative "
+        "sintesi.",
+        default_prompt_template="Documento: {text}\nSintesi: {target_text}",
+        default_instruction_prompt="Documento: {text}\n\nScrivete una sintesi del "
+        "documento di cui sopra.",
+        default_prompt_label_mapping=dict(),
+    ),
+    NB: PromptConfig(
+        default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
+        default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv et sammendrag av "
+        "dokumentet ovenfor.",
+        default_prompt_label_mapping=dict(),
+    ),
+    NL: PromptConfig(
+        default_prompt_prefix="Hieronder volgen documenten met bijbehorende "
+        "samenvattingen.",
+        default_prompt_template="Document: {text}\nSamenvatting: {target_text}",
+        default_instruction_prompt="Document: {text}\n\nSchrijf een samenvatting van "
+        "het bovenstaande document.",
+        default_prompt_label_mapping=dict(),
+    ),
+    NN: PromptConfig(
+        default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
+        default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv et sammendrag av "
+        "dokumentet ovenfor.",
+        default_prompt_label_mapping=dict(),
+    ),
+    NO: PromptConfig(
+        default_prompt_prefix="Nedenfor følger dokumenter med tilhørende sammendrag.",
+        default_prompt_template="Dokument: {text}\nSammendrag: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv et sammendrag av "
+        "dokumentet ovenfor.",
+        default_prompt_label_mapping=dict(),
+    ),
+    SV: PromptConfig(
+        default_prompt_prefix="Nedan följer dokument med tillhörande sammanfattningar.",
+        default_prompt_template="Dokument: {text}\nSammanfattning: {target_text}",
+        default_instruction_prompt="Dokument: {text}\n\nSkriv en sammanfattning av "
+        "ovanstående dokument.",
+        default_prompt_label_mapping=dict(),
+    ),
+}
euroeval/speed_benchmark.py CHANGED
euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py CHANGED

@@ -8,7 +8,9 @@ from collections import defaultdict
 
 import numpy as np
 from datasets import Dataset
-from transformers import
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import BatchEncoding
+from transformers.trainer import Trainer
 
 if t.TYPE_CHECKING:
     from ..types import Labels, Predictions
@@ -19,12 +21,12 @@ logger = logging.getLogger("euroeval")
 
 class MultipleChoiceClassificationTrainer(Trainer):
     """Trainer subclass for question answering tasks."""
 
-    def evaluate(
+    def evaluate(  # type: ignore[override]
         self,
         eval_dataset: "Dataset | None" = None,
         ignore_keys: list[str] | None = None,
         metric_key_prefix: str = "eval",
-    ) -> dict[str, float]
+    ) -> dict[str, float]:
         """Evaluate the model on the given dataset.
 
         Args:
@@ -54,22 +56,28 @@ class MultipleChoiceClassificationTrainer(Trainer):
             metric_key_prefix=metric_key_prefix,
         )
 
+        predictions = output.predictions
+        assert isinstance(predictions, np.ndarray)
+
+        metrics = output.metrics
+        assert metrics is not None
+
         if metric_key_prefix == "test":
             preds_and_labels = postprocess_predictions_and_labels(
-                predictions=
+                predictions=predictions, dataset=eval_dataset
             )
-
+            assert self.compute_metrics is not None
+            new_metrics = self.compute_metrics(preds_and_labels)  # type: ignore[arg-type]
+            metrics.update(new_metrics)
 
         # Prefix all keys with metric_key_prefix + '_'
-        for key in list(
+        for key in list(metrics.keys()):
             if not key.startswith(f"{metric_key_prefix}_"):
-
-                key
-            )
+                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
 
         # Only the main node log the results by default
         if self.args.should_log:
-            self.log(
+            self.log(metrics)
 
         self.control = self.callback_handler.on_evaluate(
             self.args,
@@ -77,7 +85,7 @@ class MultipleChoiceClassificationTrainer(Trainer):
             self.control,  # type: ignore[has-type]
             output.metrics,
         )
-        return
+        return metrics
 
 
 def prepare_examples(
euroeval/{task_utils → task_group_utils}/question_answering.py CHANGED

@@ -8,25 +8,22 @@ from collections import defaultdict
 import evaluate
 import numpy as np
 from evaluate import EvaluationModule
-from transformers import PreTrainedTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from transformers.trainer import Trainer
 
 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
-from ..
-
-    raise_if_model_output_contains_nan_values,
-)
+from ..tokenization_utils import get_special_token_metadata
+from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
     import torch.nn as nn
     from datasets.arrow_dataset import Dataset
-    from transformers import
-        EvalPrediction,
-        PreTrainedModel,
-        TrainerCallback,
-        TrainingArguments,
-    )
+    from transformers.modeling_utils import PreTrainedModel
     from transformers.tokenization_utils_base import BatchEncoding
+    from transformers.trainer_callback import TrainerCallback
+    from transformers.trainer_utils import EvalPrediction
+    from transformers.training_args import TrainingArguments
 
     from ..types import Labels, Predictions
 
@@ -47,7 +44,7 @@ class QuestionAnsweringTrainer(Trainer):
         callbacks: "list[TrainerCallback]",
         data_collator: "c.Callable",
     ) -> None:
-        """
+        """Initialise the trainer."""
         super().__init__(
             model=model,
             processing_class=processing_class,
@@ -68,13 +65,13 @@ class QuestionAnsweringTrainer(Trainer):
         # Set the label names
         self.label_names = ["start_positions", "end_positions"]
 
-    def evaluate(
+    def evaluate(  # type: ignore[override]
         self,
         eval_dataset: "Dataset | None" = None,
         orig_eval_dataset: "Dataset | None" = None,
         ignore_keys: list[str] | None = None,
         metric_key_prefix: str = "eval",
-    ) -> dict[str, float]
+    ) -> dict[str, float]:
         """Evaluate the model on the given dataset.
 
         Args:
@@ -113,33 +110,39 @@ class QuestionAnsweringTrainer(Trainer):
         finally:
             self.compute_metrics = compute_metrics
 
+        predictions = output.predictions
+        assert isinstance(predictions, tuple)
+
+        metrics = output.metrics
+        assert metrics is not None
+
         if orig_eval_dataset is not None:
             preds_and_labels = postprocess_predictions_and_labels(
-                predictions=
+                predictions=predictions,  # type: ignore[arg-type]
                 dataset=orig_eval_dataset,
                 prepared_dataset=eval_dataset,
                 cls_token_index=self.cls_token_id,
             )
-
+            assert self.compute_metrics is not None
+            new_metrics = self.compute_metrics(preds_and_labels)  # type: ignore[arg-type]
+            metrics.update(new_metrics)
 
         # Prefix all keys with metric_key_prefix + '_'
-        for key in list(
+        for key in list(metrics.keys()):
             if not key.startswith(f"{metric_key_prefix}_"):
-
-                key
-            )
+                metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
 
         # Only the main node log the results by default
         if self.args.should_log:
-            self.log(
+            self.log(metrics)
 
         self.control = self.callback_handler.on_evaluate(
             self.args,
             self.state,
             self.control,  # type: ignore[has-type]
-
+            metrics,
         )
-        return
+        return metrics
 
 
 def compute_metrics(
@@ -472,7 +475,7 @@ def prepare_test_examples(
 
 
 def postprocess_predictions_and_labels(
-    predictions:
+    predictions: tuple[np.ndarray, np.ndarray],
     dataset: "Dataset",
     prepared_dataset: "Dataset",
    cls_token_index: int,
@@ -492,9 +495,7 @@ def postprocess_predictions_and_labels(
     Returns:
         The postprocessed predictions and labels.
     """
-
-    all_start_logits = predictions[0]
-    all_end_logits = predictions[1]
+    all_start_logits, all_end_logits = predictions
 
     # Build a map from an example to its corresponding features, being the blocks of
     # text from the context that we're feeding into the model. An example can have
@@ -507,7 +508,7 @@ def postprocess_predictions_and_labels(
         features_per_example[example_index].append(i)
 
     # Loop over all the examples
-
+    prediction_list: list[dict[str, t.Any]] = list()
    labels = list()
    for example_index, example in enumerate(dataset):
        # Extract the best valid answer associated with the current example
@@ -530,7 +531,7 @@ def postprocess_predictions_and_labels(
         )
 
         # Add the answer to the list of predictions
-
+        prediction_list.append(prediction)
 
         # Create the associated reference dictionary, to be added to the list of
         # references
@@ -545,7 +546,7 @@ def postprocess_predictions_and_labels(
         # Add the answer and label to the list of predictions and labels, respectively
         labels.append(label)
 
-    return
+    return prediction_list, labels
 
 
 def find_best_answer(
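The tuple unpacked into all_start_logits and all_end_logits above feeds the usual extractive-QA span selection. Below is a generic, self-contained sketch of that selection step; it illustrates the standard technique and is not EuroEval's find_best_answer implementation.

# Generic extractive-QA span selection (illustrative only): pick the
# (start, end) token pair with the highest combined logit for one feature.
import numpy as np


def best_span(
    start_logits: np.ndarray, end_logits: np.ndarray, max_answer_length: int = 30
) -> tuple[int, int]:
    """Return the (start, end) token indices with the highest combined score."""
    best_pair = (0, 0)
    best_score = -np.inf
    # Restrict to the top-scoring candidates on each side for efficiency.
    for start in np.argsort(start_logits)[-20:]:
        for end in np.argsort(end_logits)[-20:]:
            if start <= end < start + max_answer_length:
                score = start_logits[start] + end_logits[end]
                if score > best_score:
                    best_score, best_pair = score, (int(start), int(end))
    return best_pair


start_logits = np.random.randn(128)
end_logits = np.random.randn(128)
print(best_span(start_logits, end_logits))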
euroeval/{task_utils → task_group_utils}/sequence_classification.py CHANGED

@@ -10,10 +10,11 @@ import numpy as np
 from evaluate import EvaluationModule
 
 from ..data_models import BenchmarkConfig, GenerativeModelOutput
+from ..exceptions import InvalidBenchmark
 from ..utils import log_once, raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
-    from transformers import EvalPrediction
+    from transformers.trainer_utils import EvalPrediction
 
     from ..data_models import DatasetConfig
     from ..types import Labels, Predictions
@@ -110,6 +111,7 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: GenerativeModelOutput,
     dataset_config: "DatasetConfig",
+    first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
     """Extract the predicted labels from the generated output.
 
@@ -121,13 +123,19 @@ def extract_labels_from_generation(
             The raw generated output of the model.
         dataset_config:
             The configuration of the dataset.
+        first_label_token_mapping:
+            A mapping from labels to the first token in each label, or alternatively a
+            Boolean value indicating whether the model should output scores (if the
+            mapping is outputted then the model will always output scores).
 
     Returns:
         The predicted labels.
     """
     if model_output.scores is not None:
         return get_closest_logprobs_labels(
-            generation_logprobs=model_output.scores,
+            generation_logprobs=model_output.scores,
+            dataset_config=dataset_config,
+            first_label_token_mapping=first_label_token_mapping,
         )
     else:
         return get_closest_word_edit_labels(
@@ -138,6 +146,7 @@ def extract_labels_from_generation(
 def get_closest_logprobs_labels(
     generation_logprobs: list[list[list[tuple[str, float]]]],
     dataset_config: "DatasetConfig",
+    first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
     """Get the labels with the highest predicted logprob value.
 
@@ -152,6 +161,10 @@ def get_closest_logprobs_labels(
             (batch_size, num_tokens, num_logprobs).
         dataset_config:
             The configuration of the dataset.
+        first_label_token_mapping:
+            A mapping from labels to the first token in each label, or alternatively a
+            Boolean value indicating whether the model should output scores (if the
+            mapping is outputted then the model will always output scores).
 
     Returns:
         The predicted labels.
@@ -185,11 +198,29 @@ def get_closest_logprobs_labels(
             generated_label = "".join(previously_generated_labels) + generated_label
 
             # Get the candidate labels that starts with the generated label
-
-
-
-
-
+            if isinstance(first_label_token_mapping, dict):
+                if any(
+                    candidate_label not in first_label_token_mapping
+                    for candidate_label in candidate_labels
+                ):
+                    raise InvalidBenchmark(
+                        "There is a label not present in the first label token "
+                        "mapping - this should never happen! Please report this "
+                        "issue to the EuroEval team at "
+                        "github.com/EuroEval/EuroEval/issues."
+                    )
+
+                candidate_output_labels = {
+                    candidate_label
+                    for candidate_label in candidate_labels
+                    if generated_label == first_label_token_mapping[candidate_label]
+                }
+            else:
+                candidate_output_labels = {
+                    candidate_label
+                    for candidate_label in candidate_labels
+                    if candidate_label.startswith(generated_label)
+                }
 
             # If we can uniquely determine the output label, we break the loop. If
             # there are multiple possible labels then we store the current one, and
@@ -206,7 +237,7 @@ def get_closest_logprobs_labels(
                 else:
                     output_label = candidate_output_labels.pop()
                    candidate_output_labels.add(output_label)
-
+                    raise InvalidBenchmark(
                        "Multiple candidate labels found for the generated label "
                        f"{generated_label!r}: {candidate_output_labels}. Since "
                        "this is not the first generated label, we cannot "
@@ -214,9 +245,13 @@ def get_closest_logprobs_labels(
                         f"forced to use the arbitrary {output_label!r} as the "
                         "output label, potentially resulting in worse performance. "
                         "Please report this issue to the EuroEval team at "
-                        "github.com/EuroEval/EuroEval/issues."
-                        level=logging.WARNING,
+                        "github.com/EuroEval/EuroEval/issues."
                     )
+            elif len(candidate_output_labels) == 0:
+                logger.debug(
+                    f"No candidate label found for the generated label "
+                    f"{generated_label!r}. The generated label is thus ignored."
+                )
 
         if output_label is not None:
             output_labels.append(output_label)
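The core of the new candidate-label filtering can be exercised in isolation. The sketch below mirrors the two branches added above, exact first-token matching when a mapping is available and prefix matching otherwise, using toy data and omitting EuroEval's error handling; the filter_candidates helper is hypothetical.

# Standalone sketch of the candidate-label filtering (toy data, no error handling).
def filter_candidates(
    generated_label: str,
    candidate_labels: list[str],
    first_label_token_mapping: dict[str, str] | bool,
) -> set[str]:
    if isinstance(first_label_token_mapping, dict):
        # Exact match against the first token of each candidate label.
        return {
            label
            for label in candidate_labels
            if generated_label == first_label_token_mapping[label]
        }
    # Fallback: any candidate label that starts with the generated text.
    return {label for label in candidate_labels if label.startswith(generated_label)}


mapping = {"positive": "pos", "negative": "neg"}
print(filter_candidates("pos", ["positive", "negative"], mapping))  # {'positive'}
print(filter_candidates("posi", ["positive", "negative"], True))  # {'positive'}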
euroeval/{task_utils → task_group_utils}/token_classification.py CHANGED

@@ -9,14 +9,15 @@ import demjson3
 import evaluate
 import numpy as np
 from evaluate import EvaluationModule
-from transformers import PreTrainedTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
 
 from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
-    from transformers import BatchEncoding
+    from transformers.tokenization_utils_base import BatchEncoding
+    from transformers.trainer_utils import EvalPrediction
 
     from ..types import Labels, Predictions
 
|