EuroEval 15.10.1__py3-none-any.whl → 15.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +7 -0
- euroeval/benchmark_config_factory.py +7 -0
- euroeval/benchmark_modules/base.py +29 -29
- euroeval/benchmark_modules/fresh.py +31 -19
- euroeval/benchmark_modules/hf.py +27 -23
- euroeval/benchmark_modules/litellm.py +50 -30
- euroeval/benchmark_modules/vllm.py +22 -26
- euroeval/benchmarker.py +8 -1
- euroeval/callbacks.py +17 -13
- euroeval/cli.py +10 -0
- euroeval/data_loading.py +10 -5
- euroeval/data_models.py +9 -40
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/english.py +13 -4
- euroeval/dataset_configs/norwegian.py +8 -0
- euroeval/dataset_configs/portuguese.py +74 -0
- euroeval/dataset_configs/spanish.py +4 -3
- euroeval/finetuning.py +9 -8
- euroeval/generation.py +27 -8
- euroeval/human_evaluation.py +14 -13
- euroeval/languages.py +1 -2
- euroeval/metrics.py +452 -0
- euroeval/prompt_templates/linguistic_acceptability.py +9 -1
- euroeval/prompt_templates/multiple_choice.py +9 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -1
- euroeval/prompt_templates/sentiment_classification.py +11 -1
- euroeval/prompt_templates/summarization.py +8 -1
- euroeval/scores.py +14 -19
- euroeval/speed_benchmark.py +6 -7
- euroeval/task_group_utils/multiple_choice_classification.py +6 -4
- euroeval/task_group_utils/question_answering.py +5 -28
- euroeval/task_group_utils/sequence_classification.py +6 -30
- euroeval/task_group_utils/text_to_text.py +19 -34
- euroeval/task_group_utils/token_classification.py +18 -30
- euroeval/tasks.py +11 -136
- euroeval/types.py +6 -4
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/METADATA +10 -10
- euroeval-15.12.0.dist-info/RECORD +63 -0
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/licenses/LICENSE +1 -1
- euroeval-15.10.1.dist-info/RECORD +0 -61
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/WHEEL +0 -0
- {euroeval-15.10.1.dist-info → euroeval-15.12.0.dist-info}/entry_points.txt +0 -0
euroeval/prompt_templates/multiple_choice.py
CHANGED

@@ -1,7 +1,7 @@
 """Templates for all multiple choice tasks."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 # TODO: Missing Faroese
 MULTIPLE_CHOICE_TEMPLATES = {
@@ -36,6 +36,14 @@ MULTIPLE_CHOICE_TEMPLATES = {
         "usando solo {labels_str}, y nada más.",
         default_prompt_label_mapping="auto",
     ),
+    PT: PromptConfig(
+        default_prompt_prefix="As seguintes são perguntas de escolha múltipla "
+        "(com respostas).",
+        default_prompt_template="Pergunta: {text}\nResposta: {label}",
+        default_instruction_prompt="Pergunta: {text}\n\nResponde à pergunta "
+        "acima usando só {labels_str}, e nada mais.",
+        default_prompt_label_mapping="auto",
+    ),
     FI: PromptConfig(
         default_prompt_prefix="Seuraavat ovat monivalintakysymyksiä (vastauksineen).",
         default_prompt_template="Kysymys: {text}\nVastaus: {label}",
euroeval/prompt_templates/named_entity_recognition.py
CHANGED

@@ -1,7 +1,7 @@
 """Templates for the Named Entity Recognition task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 NER_TEMPLATES = {
     DA: PromptConfig(
@@ -80,6 +80,25 @@ NER_TEMPLATES = {
         "claves {labels_str}. Los valores deben ser listas de las "
         "entidades nombradas de ese tipo, exactamente como aparecen en la oración.",
     ),
+    PT: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "pessoa",
+            "i-per": "pessoa",
+            "b-loc": "local",
+            "i-loc": "local",
+            "b-org": "organização",
+            "i-org": "organização",
+            "b-misc": "diverso",
+            "i-misc": "diverso",
+        },
+        default_prompt_prefix="Seguem-se frases e dicionários JSON com as entidades "
+        "mencionadas presentes na frase indicada.",
+        default_prompt_template="Frase: {text}\nEntidades mencionadas: {label}",
+        default_instruction_prompt="Frase: {text}\n\nIdentifica as entidades "
+        "mencionadas na frase. Deves devolver um dicionário JSON com as chaves "
+        "{labels_str}. Os valores devem ser listas contendo as entidades "
+        "mencionadas desse tipo, tal como ocorrem na frase.",
+    ),
     FI: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "henkilö",
euroeval/prompt_templates/sentiment_classification.py
CHANGED

@@ -1,7 +1,7 @@
 """Templates for the Sentiment Analysis task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FO, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 SENT_TEMPLATES = {
     DA: PromptConfig(
@@ -44,6 +44,16 @@ SENT_TEMPLATES = {
         default_instruction_prompt="Documento: {text}\n\nClasifica el sentimiento del "
         "documento. Responde con {labels_str}, y nada más.",
     ),
+    PT: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="positivo", neutral="neutro", negative="negativo"
+        ),
+        default_prompt_prefix="Abaixo encontras documentos e os seus "
+        "sentimentos correspondentes, que podem ser {labels_str}.",
+        default_prompt_template="Documento: {text}\nSentimento: {label}",
+        default_instruction_prompt="Documento: {text}\n\nClassifica o "
+        "sentimento do documento. Responde apenas com {labels_str}.",
+    ),
     FI: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="positiivinen", neutral="neutrali", negative="negatiivinen"
euroeval/prompt_templates/summarization.py
CHANGED

@@ -1,7 +1,7 @@
 """Templates for the Summarization task."""
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, SV
+from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, PT, SV
 
 # TODO: Missing Faroese
 SUMM_TEMPLATES = {
@@ -36,6 +36,13 @@ SUMM_TEMPLATES = {
         "documento anterior.",
         default_prompt_label_mapping=dict(),
     ),
+    PT: PromptConfig(
+        default_prompt_prefix="Abaixo encontras documentos com resumos associados.",
+        default_prompt_template="Documento: {text}\nResumo: {target_text}",
+        default_instruction_prompt="Documento: {text}\n\nEscreve um resumo do "
+        "documento anterior.",
+        default_prompt_label_mapping=dict(),
+    ),
     FI: PromptConfig(
         default_prompt_prefix="Seuraavassa on artikkeleita ja niihin liittyviä "
         "tiivistelmiä.",
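All four template files above follow the same pattern: each language key maps to a PromptConfig whose fields hold the few-shot prefix, the per-example template, and the instruction-tuned prompt. The sketch below shows how such a config could be expanded into a few-shot prompt; the PromptConfigSketch class, the example documents, and the joining logic are illustrative assumptions, not EuroEval's actual prompt-building code.

```python
# Illustrative only: a stand-in for euroeval.data_models.PromptConfig, filled with the
# new Portuguese (PT) sentiment fields from the diff above.
from dataclasses import dataclass


@dataclass
class PromptConfigSketch:
    default_prompt_prefix: str
    default_prompt_template: str
    default_instruction_prompt: str


pt_sentiment = PromptConfigSketch(
    default_prompt_prefix="Abaixo encontras documentos e os seus "
    "sentimentos correspondentes, que podem ser {labels_str}.",
    default_prompt_template="Documento: {text}\nSentimento: {label}",
    default_instruction_prompt="Documento: {text}\n\nClassifica o "
    "sentimento do documento. Responde apenas com {labels_str}.",
)

labels_str = "'positivo', 'neutro' ou 'negativo'"
few_shot = [("Adorei o filme.", "positivo"), ("Foi uma perda de tempo.", "negativo")]

# Few-shot prompt: prefix, labelled examples, then the unlabelled document to classify.
parts = [pt_sentiment.default_prompt_prefix.format(labels_str=labels_str)]
parts += [
    pt_sentiment.default_prompt_template.format(text=text, label=label)
    for text, label in few_shot
]
parts.append(pt_sentiment.default_prompt_template.format(text="Estava ok.", label=""))
print("\n\n".join(parts))
```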
euroeval/scores.py
CHANGED
@@ -7,7 +7,7 @@ import warnings
 import numpy as np
 
 if t.TYPE_CHECKING:
-    from .
+    from .metrics import Metric
     from .types import ScoreDict
 
 logger = logging.getLogger("euroeval")
@@ -15,7 +15,7 @@ logger = logging.getLogger("euroeval")
 
 def log_scores(
     dataset_name: str,
-
+    metrics: list["Metric"],
     scores: list[dict[str, float]],
     model_id: str,
     model_revision: str,
@@ -25,7 +25,7 @@
     Args:
         dataset_name:
             Name of the dataset.
-
+        metrics:
             List of metrics to log.
         scores:
             The scores that are to be logged. This is a list of dictionaries full of
@@ -46,19 +46,19 @@
     logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")
 
     total_dict: dict[str, float] = dict()
-    for
-        test_score, test_se = aggregate_scores(scores=scores,
-        test_score, test_score_str =
-        test_se, test_se_str =
-        total_dict[f"test_{
-        total_dict[f"test_{
-        logger.info(f"{
+    for metric in metrics:
+        test_score, test_se = aggregate_scores(scores=scores, metric=metric)
+        test_score, test_score_str = metric.postprocessing_fn(test_score)
+        test_se, test_se_str = metric.postprocessing_fn(test_se)
+        total_dict[f"test_{metric.name}"] = test_score
+        total_dict[f"test_{metric.name}_se"] = test_se
+        logger.info(f"{metric.pretty_name}: {test_score_str} ± {test_se_str}")
 
     return dict(raw=scores, total=total_dict)
 
 
 def aggregate_scores(
-    scores: list[dict[str, float]],
+    scores: list[dict[str, float]], metric: "Metric"
 ) -> tuple[float, float]:
     """Helper function to compute the mean with confidence intervals.
 
@@ -66,9 +66,8 @@ def aggregate_scores(
         scores:
             Dictionary with the names of the metrics as keys, of the form
             "<split>_<metric_name>", such as "val_f1", and values the metric values.
-
-            The
-            metric from `scores`.
+        metric:
+            The metric, which is used to collect the correct metric from `scores`.
 
     Returns:
         A pair of floats, containing the score and the radius of its 95% confidence
@@ -78,11 +77,7 @@
         warnings.simplefilter("ignore")
 
     test_scores = [
-
-            dct[metric_config.name]
-            if metric_config.name in dct
-            else dct[f"test_{metric_config.name}"]
-        )
+        dct[metric.name] if metric.name in dct else dct[f"test_{metric.name}"]
         for dct in scores
     ]
     test_score = np.mean(test_scores).item()
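The scores.py changes replace per-metric configurations with full metric objects that carry a `name`, a `pretty_name`, and a `postprocessing_fn`. Below is a minimal standalone sketch of that aggregation flow, assuming a toy metric class and a normal-approximation 95% confidence interval; EuroEval's real Metric class and CI estimator may differ.

```python
from __future__ import annotations

import numpy as np


class ToyMetric:
    """Illustrative stand-in for the Metric objects in euroeval.metrics (assumption)."""

    def __init__(self, name: str, pretty_name: str) -> None:
        self.name = name
        self.pretty_name = pretty_name

    def postprocessing_fn(self, value: float) -> tuple[float, str]:
        # Report scores as percentages, mirroring the post-processing used in log_scores.
        return 100 * value, f"{100 * value:.2f}%"


def aggregate(scores: list[dict[str, float]], metric: ToyMetric) -> tuple[float, float]:
    # Mean score plus the radius of an approximate 95% confidence interval.
    values = [dct.get(metric.name, dct.get(f"test_{metric.name}")) for dct in scores]
    mean = float(np.mean(values))
    se = float(np.std(values, ddof=1) / np.sqrt(len(values))) if len(values) > 1 else 0.0
    return mean, 1.96 * se


metric = ToyMetric(name="f1", pretty_name="Macro-averaged F1")
scores = [{"test_f1": 0.81}, {"test_f1": 0.79}, {"test_f1": 0.83}]
test_score, test_ci = aggregate(scores, metric)
print(f"{metric.pretty_name}: {metric.postprocessing_fn(test_score)[1]} "
      f"± {metric.postprocessing_fn(test_ci)[1]}")
```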
euroeval/speed_benchmark.py
CHANGED
@@ -1,21 +1,20 @@
 """Benchmarking model inference speed."""
 
 import logging
+import typing as t
 
 import pyinfer
 from tqdm.auto import tqdm
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
-from .benchmark_modules import (
-    BenchmarkModule,
-    HuggingFaceEncoderModel,
-    LiteLLMModel,
-    VLLMModel,
-)
-from .data_models import BenchmarkConfig
+from .benchmark_modules import HuggingFaceEncoderModel, LiteLLMModel, VLLMModel
 from .exceptions import InvalidBenchmark
 from .utils import clear_memory
 
+if t.TYPE_CHECKING:
+    from .benchmark_modules import BenchmarkModule
+    from .data_models import BenchmarkConfig
+
 logger = logging.getLogger("euroeval")
 
 
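The speed benchmark module now imports BenchmarkModule and BenchmarkConfig only for type checking. A minimal sketch of this `if TYPE_CHECKING:` pattern, with placeholder names standing in for the heavy imports:

```python
# Imports needed only for annotations are skipped at runtime and referenced as strings.
# `some_heavy_package` / `HeavyModel` are placeholders, not real EuroEval dependencies.
import typing as t

if t.TYPE_CHECKING:  # evaluated by static type checkers, never at runtime
    from some_heavy_package import HeavyModel


def benchmark_speed(model: "HeavyModel", num_runs: int = 10) -> float:
    """String annotations keep the heavy import out of the runtime import graph."""
    total_time = 0.0
    for _ in range(num_runs):
        total_time += 0.001  # placeholder for timing one forward pass of `model`
    return total_time / num_runs
```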
euroeval/task_group_utils/multiple_choice_classification.py
CHANGED

@@ -7,14 +7,15 @@ import typing as t
 from collections import defaultdict
 
 import numpy as np
-from datasets import Dataset
-from transformers.tokenization_utils import PreTrainedTokenizer
-from transformers.tokenization_utils_base import BatchEncoding
 from transformers.trainer import Trainer
 
 from ..exceptions import InvalidBenchmark
 
 if t.TYPE_CHECKING:
+    from datasets import Dataset
+    from transformers.tokenization_utils import PreTrainedTokenizer
+    from transformers.tokenization_utils_base import BatchEncoding
+
     from ..types import Labels, Predictions
 
 logger = logging.getLogger("euroeval")
@@ -147,7 +148,8 @@ def postprocess_predictions_and_labels(
 
     Args:
         predictions:
-            The model predictions, of shape (num_examples, 2)
+            The model predictions, of shape (num_examples, 2), corresponding to the
+            False/True probabilities for each example.
         dataset:
             The dataset containing the examples.
 
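The clarified docstring says that the `(num_examples, 2)` prediction array holds the probabilities of the False/True options per example, so the chosen option is the argmax over the last axis. A small illustration with made-up numbers:

```python
import numpy as np

predictions = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6]])  # columns: P(False), P(True)
chosen = predictions.argmax(axis=-1).astype(bool)
print(chosen)  # [False  True  True]
```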
euroeval/task_group_utils/question_answering.py
CHANGED

@@ -5,13 +5,10 @@ import logging
 import typing as t
 from collections import defaultdict
 
-import evaluate
 import numpy as np
-from evaluate import EvaluationModule
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from transformers.trainer import Trainer
 
-from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
 from ..tokenization_utils import get_special_token_metadata
 from ..utils import raise_if_model_output_contains_nan_values
@@ -26,6 +23,7 @@ if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
     from transformers.training_args import TrainingArguments
 
+    from ..data_models import DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 logger = logging.getLogger("euroeval")
@@ -151,7 +149,6 @@ class QuestionAnsweringTrainer(Trainer):
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
-    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
 
@@ -161,8 +158,6 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
-        benchmark_config:
-            The configuration of the benchmark.
 
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
@@ -178,17 +173,6 @@ def compute_metrics(
     assert not isinstance(model_outputs, tuple)
     raise_if_model_output_contains_nan_values(model_output=model_outputs)
 
-    metrics = {
-        metric_cfg.name: (
-            evaluate.load(
-                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
-            )
-            if metric_cfg.huggingface_id != ""
-            else None
-        )
-        for metric_cfg in dataset_config.task.metrics
-    }
-
     model_output_dtype = np.asarray(model_outputs).dtype
     if model_output_dtype in [np.float16, np.float32, np.float64]:
         predictions = np.asarray(model_outputs).argmax(axis=-1)
@@ -196,20 +180,13 @@
         predictions = model_outputs
 
     results: dict[str, float] = dict()
-    for
-        metric =
-        assert isinstance(metric, EvaluationModule)
-        score_dict: dict[str, float] | None = metric.compute(
-            predictions=predictions, references=labels, **cfg.compute_kwargs
-        )
+    for metric in dataset_config.task.metrics:
+        score: float | None = metric(predictions=predictions, references=labels)
 
         # The metric returns None if we are running on multi-GPU and the current
        # process is not the main process
-        if
-
-            if isinstance(scores, list):
-                scores = sum(scores) / len(scores)
-            results[cfg.name] = scores
+        if score is not None:
+            results[metric.name] = score
 
     return results
 
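The compute_metrics refactor drops the per-call `evaluate.load` and instead calls each metric object in `dataset_config.task.metrics` directly. A minimal sketch of that callable-metric pattern, using an illustrative ExactMatchMetric rather than EuroEval's actual euroeval.metrics classes:

```python
from __future__ import annotations


class ExactMatchMetric:
    name = "exact_match"
    pretty_name = "Exact match"

    def __call__(self, predictions: list, references: list) -> float | None:
        if len(predictions) != len(references):
            return None  # mirrors the "metric returned None" case the loop skips
        return sum(p == r for p, r in zip(predictions, references)) / len(references)


task_metrics = [ExactMatchMetric()]
predictions = ["Paris", "Oslo", "Rome"]
references = ["Paris", "Bergen", "Rome"]

results: dict[str, float] = {}
for metric in task_metrics:
    score = metric(predictions=predictions, references=references)
    if score is not None:
        results[metric.name] = score
print(results)  # {'exact_match': 0.666...}
```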
euroeval/task_group_utils/sequence_classification.py
CHANGED

@@ -4,19 +4,16 @@ import logging
 import re
 import typing as t
 
-import evaluate
 import Levenshtein
 import numpy as np
-from evaluate import EvaluationModule
 
-from ..data_models import BenchmarkConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
 from ..utils import log_once, raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
 
-    from ..data_models import DatasetConfig
+    from ..data_models import DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 
@@ -26,7 +23,6 @@ logger = logging.getLogger("euroeval")
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
-    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
 
@@ -36,8 +32,6 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
-        benchmark_config:
-            The configuration of the benchmark.
 
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
@@ -51,17 +45,6 @@ def compute_metrics(
     if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
         model_outputs = model_outputs[0]
 
-    metrics = {
-        metric_cfg.name: (
-            evaluate.load(
-                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
-            )
-            if metric_cfg.huggingface_id != ""
-            else None
-        )
-        for metric_cfg in dataset_config.task.metrics
-    }
-
     model_output_dtype = np.asarray(model_outputs).dtype
     if model_output_dtype in [np.float16, np.float32, np.float64]:
         predictions = np.asarray(model_outputs).argmax(axis=-1)
@@ -89,27 +72,20 @@
     ]
 
     results: dict[str, float] = dict()
-    for
-        metric =
-        assert isinstance(metric, EvaluationModule)
-        score_dict: dict[str, float] | None = metric.compute(
-            predictions=predictions, references=label_ids, **cfg.compute_kwargs
-        )
+    for metric in dataset_config.task.metrics:
+        score: float | None = metric(predictions=predictions, references=label_ids)
 
         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
-        if
-
-            if isinstance(scores, list):
-                scores = sum(scores) / len(scores)
-            results[cfg.name] = scores
+        if score is not None:
+            results[metric.name] = score
 
     return results
 
 
 def extract_labels_from_generation(
     input_batch: dict[str, list],
-    model_output: GenerativeModelOutput,
+    model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
     first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
euroeval/task_group_utils/text_to_text.py
CHANGED

@@ -3,18 +3,17 @@
 import logging
 import typing as t
 
-import evaluate
 import numpy as np
-from evaluate import EvaluationModule
 
 from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
-from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
-from ..
+from ..metrics import HuggingFaceMetric
+from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
 
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 
@@ -51,17 +50,6 @@ def compute_metrics(
     assert not isinstance(model_outputs, tuple)
     raise_if_model_output_contains_nan_values(model_output=model_outputs)
 
-    metrics = {
-        metric_cfg.name: (
-            evaluate.load(
-                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
-            )
-            if metric_cfg.huggingface_id != ""
-            else None
-        )
-        for metric_cfg in dataset_config.task.metrics
-    }
-
     model_output_dtype = np.asarray(model_outputs).dtype
     output_is_prob = model_output_dtype in [np.float16, np.float32, np.float64]
     if output_is_prob:
@@ -70,21 +58,18 @@
         predictions = model_outputs
 
     results: dict[str, float] = dict()
-    for
-        metric = metrics[cfg.name]
-        assert isinstance(metric, EvaluationModule)
-
+    for metric in dataset_config.task.metrics:
         # Some metrics can be computed on hardware accelerators. In this case we
         # start by setting the device to the same device as the model
-        if
-
+        if (
+            isinstance(metric, HuggingFaceMetric)
+            and metric.compute_kwargs.get("device", None) == "auto"
+        ):
+            metric.compute_kwargs["device"] = benchmark_config.device.type
 
         while True:
             try:
-
-                score_dict: dict[str, float] | None = metric.compute(
-                    predictions=predictions, references=labels, **cfg.compute_kwargs
-                )
+                score: float | None = metric(predictions=predictions, references=labels)
                 break
             except Exception as e:
                 oom_error = [
@@ -95,11 +80,14 @@
                 if not any(error in str(e) for error in oom_error):
                     raise InvalidBenchmark(str(e))
 
-                if
-
+                if (
+                    isinstance(metric, HuggingFaceMetric)
+                    and metric.compute_kwargs.get("device", "cpu") != "cpu"
+                ):
+                    metric.compute_kwargs["device"] = "cpu"
                     logger.debug(
                         "Out of memory error occurred during the computation of "
-                        f"the metric {
+                        f"the metric {metric.pretty_name}. Moving the computation to "
                         "the CPU."
                     )
                 else:
@@ -109,17 +97,14 @@
                     if hasattr(metric, attribute):
                         logger.debug(
                             f"Deleting the {attribute!r} attribute of the metric "
-                            f"{
+                            f"{metric.pretty_name} to free up memory."
                         )
                         delattr(metric, attribute)
 
         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
-        if
-
-            if isinstance(scores, list):
-                scores = sum(scores) / len(scores)
-            results[cfg.name] = scores
+        if score is not None:
+            results[metric.name] = score
 
     return results
 
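The text-to-text changes add an out-of-memory fallback: a HuggingFaceMetric configured with device="auto" is first pointed at the benchmark's device, and if the computation raises an OOM error its compute_kwargs are rewritten to the CPU and the call is retried. A contrived sketch of that control flow, with FakeOOMMetric standing in for a real metric (it is not part of EuroEval):

```python
class FakeOOMMetric:
    name = "fake"
    pretty_name = "Fake metric"

    def __init__(self) -> None:
        self.compute_kwargs = {"device": "auto"}

    def __call__(self, predictions, references):
        if self.compute_kwargs.get("device") not in (None, "cpu"):
            raise RuntimeError("CUDA out of memory")  # simulate an accelerator OOM
        return 1.0


metric = FakeOOMMetric()
if metric.compute_kwargs.get("device", None) == "auto":
    metric.compute_kwargs["device"] = "cuda"  # pretend the benchmark runs on a GPU

while True:
    try:
        score = metric(predictions=[], references=[])
        break
    except Exception as e:
        if "out of memory" not in str(e):
            raise
        if metric.compute_kwargs.get("device", "cpu") != "cpu":
            metric.compute_kwargs["device"] = "cpu"  # fall back and retry on the CPU
        else:
            raise

print(score)  # 1.0, computed on the second (CPU) attempt
```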
euroeval/task_group_utils/token_classification.py
CHANGED

@@ -6,19 +6,17 @@ import typing as t
 from copy import deepcopy
 
 import demjson3
-import evaluate
 import numpy as np
-from evaluate import EvaluationModule
-from transformers.tokenization_utils import PreTrainedTokenizer
 
-from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
 from ..exceptions import InvalidBenchmark
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
+    from transformers.tokenization_utils import PreTrainedTokenizer
     from transformers.tokenization_utils_base import BatchEncoding
     from transformers.trainer_utils import EvalPrediction
 
+    from ..data_models import DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 
@@ -29,7 +27,6 @@ def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     has_misc_tags: bool,
     dataset_config: "DatasetConfig",
-    benchmark_config: "BenchmarkConfig",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
 
@@ -41,8 +38,6 @@
             Whether the dataset has MISC tags.
         dataset_config:
             The configuration of the dataset.
-        benchmark_config:
-            The configuration of the benchmark.
 
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
@@ -55,17 +50,6 @@
     if isinstance(model_outputs, tuple) and len(model_outputs) == 2:
         model_outputs = model_outputs[0]
 
-    metrics = {
-        metric_cfg.name: (
-            evaluate.load(
-                path=metric_cfg.huggingface_id, cache_dir=benchmark_config.cache_dir
-            )
-            if metric_cfg.huggingface_id != ""
-            else None
-        )
-        for metric_cfg in dataset_config.task.metrics
-    }
-
     predictions: list[list[str]]
     if not isinstance(model_outputs[0][0], str):
         raw_predictions: list[list[int]] = np.argmax(model_outputs, axis=-1).tolist()
@@ -145,11 +129,14 @@
         all(ner_tag == "o" for ner_tag in label_list) for label_list in labels
     )
     if predictions_all_zero and labels_all_zero:
-
+        micro_f1_score: float | None = 1.0
     else:
-        metric =
-
-
+        metric = next(
+            metric
+            for metric in dataset_config.task.metrics
+            if metric.name == "micro_f1"
+        )
+        micro_f1_score = metric(predictions=predictions, references=list(labels))
 
     # Compute the metrics without MISC tags
     # We manually set the F1 metric to be 100% if both the labels and the models
@@ -163,21 +150,22 @@
         all(ner_tag == "o" for ner_tag in label_list) for label_list in labels_no_misc
     )
     if predictions_no_misc_all_zero and labels_no_misc_all_zero:
-
+        micro_f1_no_misc_score: float | None = 1.0
     else:
-        metric =
-
-
+        metric = next(
+            metric
+            for metric in dataset_config.task.metrics
+            if metric.name == "micro_f1_no_misc"
+        )
+        micro_f1_no_misc_score = metric(
             predictions=predictions_no_misc, references=labels_no_misc
         )
 
     # Raise error if the metrics are invalid
-    if
+    if micro_f1_score is None or micro_f1_no_misc_score is None:
         raise InvalidBenchmark("The predictions and labels are not of the same length.")
 
-    return dict(
-        micro_f1_no_misc=results_no_misc["overall_f1"], micro_f1=results["overall_f1"]
-    )
+    return dict(micro_f1_no_misc=micro_f1_no_misc_score, micro_f1=micro_f1_score)
 
 
 def extract_labels_from_generation(
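The token-classification code now picks the micro-F1 metrics out of the task's metric list by name with `next(...)`. A small illustration, using SimpleNamespace stand-ins for the metric objects; note that `next` raises StopIteration if no metric with the requested name is registered.

```python
from types import SimpleNamespace

task_metrics = [
    SimpleNamespace(name="micro_f1", pretty_name="Micro-averaged F1"),
    SimpleNamespace(name="micro_f1_no_misc", pretty_name="Micro-averaged F1 without MISC tags"),
]

micro_f1 = next(metric for metric in task_metrics if metric.name == "micro_f1")
print(micro_f1.pretty_name)  # Micro-averaged F1
```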