EuroEval 16.3.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval has been flagged as possibly problematic by the registry.
- euroeval/__init__.py +9 -2
- euroeval/benchmark_config_factory.py +51 -50
- euroeval/benchmark_modules/base.py +9 -21
- euroeval/benchmark_modules/fresh.py +2 -1
- euroeval/benchmark_modules/hf.py +101 -71
- euroeval/benchmark_modules/litellm.py +115 -53
- euroeval/benchmark_modules/vllm.py +107 -92
- euroeval/benchmarker.py +144 -121
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +86 -8
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +80 -29
- euroeval/data_models.py +338 -330
- euroeval/dataset_configs/__init__.py +12 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +55 -93
- euroeval/dataset_configs/dutch.py +48 -87
- euroeval/dataset_configs/english.py +45 -77
- euroeval/dataset_configs/estonian.py +42 -34
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -69
- euroeval/dataset_configs/french.py +39 -75
- euroeval/dataset_configs/german.py +45 -82
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -79
- euroeval/dataset_configs/latvian.py +28 -35
- euroeval/dataset_configs/lithuanian.py +28 -26
- euroeval/dataset_configs/norwegian.py +72 -115
- euroeval/dataset_configs/polish.py +33 -61
- euroeval/dataset_configs/portuguese.py +33 -66
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/spanish.py +42 -77
- euroeval/dataset_configs/swedish.py +52 -90
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +24 -17
- euroeval/generation.py +15 -14
- euroeval/generation_utils.py +8 -8
- euroeval/languages.py +395 -323
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +21 -6
- euroeval/metrics/llm_as_a_judge.py +6 -4
- euroeval/metrics/pipeline.py +17 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +17 -19
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +99 -42
- euroeval/prompt_templates/multiple_choice.py +102 -38
- euroeval/prompt_templates/named_entity_recognition.py +172 -51
- euroeval/prompt_templates/reading_comprehension.py +119 -42
- euroeval/prompt_templates/sentiment_classification.py +110 -40
- euroeval/prompt_templates/summarization.py +85 -40
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +11 -10
- euroeval/speed_benchmark.py +5 -6
- euroeval/task_group_utils/multiple_choice_classification.py +2 -4
- euroeval/task_group_utils/question_answering.py +24 -16
- euroeval/task_group_utils/sequence_classification.py +48 -35
- euroeval/task_group_utils/text_to_text.py +19 -9
- euroeval/task_group_utils/token_classification.py +21 -17
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +33 -22
- euroeval/types.py +10 -9
- euroeval/utils.py +35 -149
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +196 -39
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.3.0.dist-info/RECORD +0 -71
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/task_group_utils/question_answering.py CHANGED

@@ -1,7 +1,6 @@
 """Utility functions related to the question-answering task group."""
 
 import collections.abc as c
-import logging
 import typing as t
 from collections import defaultdict
 
@@ -26,8 +25,6 @@ if t.TYPE_CHECKING:
     from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
-logger = logging.getLogger("euroeval")
-
 
 class QuestionAnsweringTrainer(Trainer):
     """Trainer subclass for question answering tasks."""
@@ -40,7 +37,7 @@ class QuestionAnsweringTrainer(Trainer):
         train_dataset: "Dataset",
         eval_dataset: "Dataset",
         compute_metrics: "c.Callable[[EvalPrediction], dict[str, float]]",
-        callbacks: "
+        callbacks: "c.Sequence[TrainerCallback]",
         data_collator: "c.Callable",
         **kwargs,
     ) -> None:
@@ -70,7 +67,7 @@ class QuestionAnsweringTrainer(Trainer):
         self,
         eval_dataset: "Dataset | None" = None,
         orig_eval_dataset: "Dataset | None" = None,
-        ignore_keys:
+        ignore_keys: c.Sequence[str] | None = None,
         metric_key_prefix: str = "eval",
     ) -> dict[str, float]:
         """Evaluate the model on the given dataset.
@@ -206,7 +203,7 @@ def compute_metrics(
 
 def extract_labels_from_generation(
     input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-) ->
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.
 
     Args:
@@ -268,8 +265,11 @@ def prepare_train_examples(
     max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
     stride = tokeniser.model_max_length // 4
-
-
+    stride = min(
+        stride,
+        tokeniser.model_max_length - stride - max_question_tokens - num_special_tokens,
+    )
+    stride = max(stride, 0)
     max_length = tokeniser.model_max_length - stride
 
     # Tokenise our examples with truncation and padding, but keep the overflows using a
@@ -338,9 +338,17 @@ def prepare_train_examples(
             end_char = start_char + len(answers["text"][0])
 
             # Start token index of the current span in the text.
-
-
-            token_start_index
+            try:
+                token_start_index = 0
+                while sequence_ids[token_start_index] != 1:
+                    token_start_index += 1
+
+            # If it turns out that we cannot find the context in the span, then we
+            # treat this as an impossible case
+            except IndexError:
+                tokenised_examples.start_positions.append(cls_index)
+                tokenised_examples.end_positions.append(cls_index)
+                continue
 
             # End token index of the current span in the text.
             token_end_index = len(input_ids) - 1
@@ -472,7 +480,7 @@ def postprocess_predictions_and_labels(
     dataset: "Dataset",
     prepared_dataset: "Dataset",
     cls_token_index: int,
-) -> tuple[
+) -> tuple[c.Sequence[dict], c.Sequence[dict]]:
     """Postprocess the predictions and labels, to allow easier metric computation.
 
     Args:
@@ -553,7 +561,7 @@ def find_best_answer(
     all_start_logits: np.ndarray,
     all_end_logits: np.ndarray,
     prepared_dataset: "Dataset",
-    feature_indices:
+    feature_indices: c.Sequence[int],
     context: str,
     max_answer_length: int,
     num_best_logits: int,
@@ -586,7 +594,7 @@ def find_best_answer(
         The best answer for the example.
     """
     # Loop through all the features associated to the current example
-    valid_answers = list()
+    valid_answers: list[dict] = list()
     for feature_index in feature_indices:
         # Get the features associated with the current example
         features = prepared_dataset[feature_index]
@@ -627,12 +635,12 @@
 def find_valid_answers(
     start_logits: np.ndarray,
     end_logits: np.ndarray,
-    offset_mapping:
+    offset_mapping: c.Sequence[tuple[int, int]],
     context: str,
     max_answer_length: int,
     num_best_logits: int,
     min_null_score: float,
-) ->
+) -> c.Sequence[dict]:
     """Find the valid answers from the start and end indexes.
 
     Args:
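The stride clamp added to `prepare_train_examples` is worth spelling out: the stride (the overlap between overflowing context windows) is capped so that the question, the special tokens, and the overlap still fit within the model's maximum length, and it is floored at zero. A minimal standalone sketch of that arithmetic (not EuroEval's actual function; the variable names simply mirror the hunk above):

def clamp_stride(
    model_max_length: int, max_question_tokens: int, num_special_tokens: int
) -> int:
    """Return a stride that leaves room for the question and special tokens."""
    stride = model_max_length // 4
    stride = min(
        stride,
        model_max_length - stride - max_question_tokens - num_special_tokens,
    )
    # Never return a negative stride, even for very long questions
    return max(stride, 0)

# A 512-token model with a 100-token question keeps the default stride of 128,
# while a 400-token question would otherwise yield a negative stride and is clamped to 0.
assert clamp_stride(512, 100, 2) == 128
assert clamp_stride(512, 400, 2) == 0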
euroeval/task_group_utils/sequence_classification.py CHANGED

@@ -1,5 +1,6 @@
 """Utility functions related to the sequence-classification task group."""
 
+import collections.abc as c
 import logging
 import re
 import typing as t
@@ -19,13 +20,15 @@ if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
     from transformers.trainer_utils import EvalPrediction
 
-    from ..data_models import
+    from ..data_models import (
+        BenchmarkConfig,
+        DatasetConfig,
+        GenerativeModelOutput,
+        ModelConfig,
+    )
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
@@ -106,8 +109,9 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
+    model_config: "ModelConfig",
     first_label_token_mapping: dict[str, str] | bool,
-) ->
+) -> c.Sequence[str]:
     """Extract the predicted labels from the generated output.
 
     Args:
@@ -118,6 +122,8 @@ def extract_labels_from_generation(
             The raw generated output of the model.
         dataset_config:
             The configuration of the dataset.
+        model_config:
+            The configuration of the model.
         first_label_token_mapping:
             A mapping from labels to the first token in each label, or alternatively a
             Boolean value indicating whether the model should output scores (if the
@@ -167,6 +173,7 @@ def extract_labels_from_generation(
     )
 
     new_predicted_labels: list[str] = list()
+    num_predictions_being_very_off = 0
     for idx, predicted_label in enumerate(model_output.sequences):
         # If the prediction includes a boxed answer, use that instead of the full
         # generation
@@ -199,42 +206,48 @@
         # word edit distance to the predicted label (if invalid model outputs are
         # allowed), or we raise an error
         if min(edit_distances) >= 1000:
-
-            logger.warning(
-                "No candidate labels found for the predicted label "
-                f"{predicted_label!r}, out of the candidate labels "
-                f"{sample_candidate_labels[idx]}. This likely means that the model "
-                "output is completely off, but since invalid model outputs are "
-                "allowed for this task, we will use the closest candidate label "
-                f"({best_candidate_label})) as the output label. If you see this "
-                "warning very often, please report this issue to the EuroEval "
-                "team at github.com/EuroEval/EuroEval/issues."
-            )
-            logger.debug(
-                "The candidate labels were extracted from the prompt: "
-                f"{input_batch['text'][idx]!r}."
-            )
-        else:
-            raise InvalidBenchmark(
-                "No candidate labels found for the predicted label "
-                f"{predicted_label!r}, out of the candidate labels "
-                f"{sample_candidate_labels[idx]}. This likely means that the model "
-                "output is completely off, and we cannot extract any labels from "
-                "it. Please check the model output and the candidate labels. The "
-                "candidate labels were extracted from the prompt: "
-                f"{input_batch['text'][idx]!r}."
-            )
+            num_predictions_being_very_off += 1
 
         new_predicted_labels.append(best_candidate_label)
 
+    if num_predictions_being_very_off > 0:
+        if dataset_config.allow_invalid_model_outputs:
+            log_msg = (
+                "No candidate labels found for the predicted label in "
+                f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+                f"of the samples with the model {model_config.model_id!r}. This "
+                "likely means that the model were completely off in these cases, "
+                "but since invalid model outputs are allowed for this task, we used "
+                "the closest candidate labels as the output labels."
+            )
+            level = logging.DEBUG
+            if num_predictions_being_very_off / len(model_output.sequences) > 0.5:
+                log_msg += (
+                    " Since this happened for most of the model's predictions, please "
+                    "report this issue to the EuroEval team at "
+                    "github.com/EuroEval/EuroEval/issues."
+                )
+                level = logging.WARNING
+            log_once(log_msg, level=level)
+        else:
+            raise InvalidBenchmark(
+                "No candidate labels found for the predicted label in "
+                f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+                "of the samples. This likely means that the model were completely "
+                "off in these cases. Since this task does not allow invalid model "
+                "outputs, we have to abort the evaluation. Please re-run the "
+                "evaluation with the `--debug` flag (or `debug=True` if you're using "
+                "the `Benchmarker` API) to see the precise model outputs."
+            )
+
     return new_predicted_labels
 
 
 def get_closest_logprobs_labels(
-    generation_logprobs:
+    generation_logprobs: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]],
     first_label_token_mapping: dict[str, str] | t.Literal[True],
-    candidate_labels:
-) ->
+    candidate_labels: c.Sequence[c.Sequence[str]],
+) -> c.Sequence[str] | None:
    """Get the labels with the highest predicted logprob value.
 
    In case a candidate label is split into multiple tokens, we only use the first
@@ -355,7 +368,7 @@ get_closest_logprobs_labels
                    "be determined. This means that using logprobs to extract the "
                    "labels is not reliable, and we will instead fall back to "
                    "extracting the labels using word edit distance.",
-                    level=logging.
+                    level=logging.DEBUG,
                )
            else:
                log_once(
@@ -363,7 +376,7 @@ get_closest_logprobs_labels
                    "means that using logprobs to extract the labels is not reliable, "
                    "and we will instead fall back to extracting the labels using "
                    "word edit distance.",
-                    level=logging.
+                    level=logging.DEBUG,
                )
                return None
 
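The refactor above swaps a per-sample warning for a single aggregated message: mismatched predictions are only counted inside the loop, and one log line (or one exception) summarises them afterwards. A minimal sketch of that pattern using the standard `logging` module (illustrative only; EuroEval's `log_once` helper and its exact arguments are not reproduced here):

import logging

logger = logging.getLogger(__name__)


def summarise_invalid_predictions(
    num_off: int, num_total: int, allow_invalid: bool
) -> None:
    """Log (or raise) once per batch instead of once per sample."""
    if num_off == 0:
        return
    message = f"No candidate label matched in {num_off:,}/{num_total:,} samples."
    if not allow_invalid:
        raise ValueError(message)
    # Escalate to WARNING only when most of the predictions were off
    level = logging.WARNING if num_off / num_total > 0.5 else logging.DEBUG
    logger.log(level, message)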
euroeval/task_group_utils/text_to_text.py CHANGED

@@ -1,5 +1,6 @@
 """Utility functions related to the text-to-text task group."""
 
+import collections.abc as c
 import logging
 import typing as t
 
@@ -7,6 +8,7 @@ import numpy as np
 
 from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
 from ..metrics import HuggingFaceMetric
 from ..utils import raise_if_model_output_contains_nan_values
 
@@ -18,9 +20,6 @@ if t.TYPE_CHECKING:
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
@@ -44,6 +43,10 @@ def compute_metrics(
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
         values.
+
+    Raises:
+        InvalidBenchmark:
+            If the metric computation fails.
     """
     model_outputs, labels = model_outputs_and_labels
 
@@ -72,7 +75,7 @@ def compute_metrics(
         ):
             metric.compute_kwargs["device"] = benchmark_config.device.type
 
-
+        for _ in range(num_attempts := 5):
            try:
                score: float | None = metric(
                    predictions=predictions,
@@ -96,21 +99,28 @@ def compute_metrics(
                    and metric.compute_kwargs.get("device", "cpu") != "cpu"
                ):
                    metric.compute_kwargs["device"] = "cpu"
-
+                    log(
                        "Out of memory error occurred during the computation of "
                        f"the metric {metric.pretty_name}. Moving the computation to "
-                        "the CPU."
+                        "the CPU.",
+                        level=logging.DEBUG,
                    )
                else:
                    raise InvalidBenchmark(str(e)) from e
            finally:
                for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
                    if hasattr(metric, attribute):
-
+                        log(
                            f"Deleting the {attribute!r} attribute of the metric "
-                            f"{metric.pretty_name} to free up memory."
+                            f"{metric.pretty_name} to free up memory.",
+                            level=logging.DEBUG,
                        )
                        delattr(metric, attribute)
+        else:
+            raise InvalidBenchmark(
+                f"Could not compute the metric {metric.pretty_name} after "
+                f"{num_attempts} attempts due to out of memory errors."
+            )
 
         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
@@ -122,7 +132,7 @@ def compute_metrics(
 
 def extract_labels_from_generation(
     input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-) ->
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.
 
     Args:
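The `compute_metrics` change bounds the retry behaviour: the metric call is attempted a fixed number of times, an out-of-memory failure moves the computation to the CPU, and Python's `for ... else` clause raises only when every attempt has been exhausted. A small sketch of that control flow under assumed names (hypothetical `compute` callable, built-in `MemoryError` standing in for the accelerator OOM error; this is not EuroEval's API):

from collections.abc import Callable


def compute_with_retries(
    compute: Callable[..., float], kwargs: dict, num_attempts: int = 5
) -> float:
    """Try the metric a bounded number of times, moving to CPU after an OOM."""
    for _ in range(num_attempts):
        try:
            return compute(**kwargs)
        except MemoryError:
            # Fall back to CPU and try again on the next iteration
            kwargs["device"] = "cpu"
    else:
        # The else branch of a for loop runs when the loop finishes without break,
        # i.e. when no attempt returned successfully
        raise RuntimeError(f"Metric failed after {num_attempts} attempts due to OOM.")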
euroeval/task_group_utils/token_classification.py CHANGED

@@ -1,5 +1,6 @@
 """Utility functions related to the token-classification task group."""
 
+import collections.abc as c
 import logging
 import typing as t
 from copy import deepcopy
@@ -7,6 +8,7 @@ from copy import deepcopy
 import numpy as np
 
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
 from ..utils import (
     extract_json_dict_from_string,
     raise_if_model_output_contains_nan_values,
@@ -22,9 +24,6 @@ if t.TYPE_CHECKING:
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     has_misc_tags: bool,
@@ -61,7 +60,9 @@ def compute_metrics(
 
     predictions: list[list[str]]
     if not isinstance(model_outputs[0][0], str):
-        raw_predictions:
+        raw_predictions: c.Sequence[c.Sequence[int]] = np.argmax(
+            model_outputs, axis=-1
+        ).tolist()
 
         # Remove ignored index (special tokens)
         predictions = [
@@ -191,7 +192,7 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
-) ->
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.
 
     Args:
@@ -216,17 +217,19 @@
         prompt_label_mapping = dataset_config.prompt_label_mapping
         for prompt_tag_name, named_entities in prediction_dict.items():
             if not isinstance(named_entities, list):
-
+                log(
                     "The model produced an invalid format for the named entities. "
-                    f"Expected a list but got {type(named_entities)}. Skipping."
+                    f"Expected a list but got {type(named_entities)}. Skipping.",
+                    level=logging.DEBUG,
                 )
                 continue
             try:
                 named_entities = [str(ne) for ne in named_entities]
             except Exception:
-
+                log(
                     "The model produced an invalid format for the named entities. "
-                    f"Expected a list of strings but got {named_entities}. Skipping."
+                    f"Expected a list of strings but got {named_entities}. Skipping.",
+                    level=logging.DEBUG,
                )
                continue
            try:
@@ -236,9 +239,10 @@
                    if prompt_tag == prompt_tag_name
                ][0]
            except IndexError:
-
+                log(
                    "The model produced an invalid prompt tag name, "
-                    f"{prompt_tag_name}. Skipping."
+                    f"{prompt_tag_name}. Skipping.",
+                    level=logging.DEBUG,
                )
                continue
 
@@ -283,8 +287,8 @@ def tokenize_and_align_labels(
     # tokeniser is of a "fast" variant then this can be accessed through the
     # `word_ids` method. Otherwise, we have to extract it manually.
     all_labels: list[list[int]] = list()
-    labels:
-    word_ids:
+    labels: c.Sequence[str]
+    word_ids: c.Sequence[int | None]
     for i, labels in enumerate(examples["labels"]):
         # Try to get the word IDs from the tokeniser
         try:
@@ -294,10 +298,10 @@
         # IDs manually
         except ValueError:
             # Get the list of words in the document
-            words:
+            words: c.Sequence[str] = examples["tokens"][i]
 
             # Get the list of token IDs in the document
-            tok_ids:
+            tok_ids: c.Sequence[int] = tokenized_inputs.input_ids[i]
 
             # Decode the token IDs
             tokens = tokeniser.convert_ids_to_tokens(tok_ids)
@@ -390,8 +394,8 @@
 
 
 def handle_unk_tokens(
-    tokeniser: "PreTrainedTokenizer", tokens: list[str], words:
-) ->
+    tokeniser: "PreTrainedTokenizer", tokens: list[str], words: c.Sequence[str]
+) -> c.Sequence[str]:
     """Replace unknown tokens in the tokens with the corresponding word.
 
     Args:
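The token-classification changes route malformed model output through the new `log` helper at DEBUG level instead of a module-level logger, skipping bad entries rather than failing. A rough sketch of that defensive parsing loop (simplified; the real function also maps prompt tag names back to NER labels):

import logging

logger = logging.getLogger(__name__)


def parse_named_entities(prediction: dict[str, object]) -> dict[str, list[str]]:
    """Keep only well-formed tag -> list-of-strings entries, skipping the rest."""
    parsed: dict[str, list[str]] = {}
    for tag_name, named_entities in prediction.items():
        if not isinstance(named_entities, list):
            # The model produced something other than a list for this tag; skip it
            logger.debug(
                "Invalid format for named entities: expected a list but got "
                f"{type(named_entities)}. Skipping."
            )
            continue
        parsed[tag_name] = [str(entity) for entity in named_entities]
    return parsed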
euroeval/tasks.py CHANGED

@@ -5,12 +5,14 @@ from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import Task
 from .enums import GenerativeType, ModelType, TaskGroup
 from .prompt_templates import (
+    CLASSIFICATION_TEMPLATES,
     LA_TEMPLATES,
     MULTIPLE_CHOICE_TEMPLATES,
     NER_TEMPLATES,
     RC_TEMPLATES,
     SENT_TEMPLATES,
     SUMM_TEMPLATES,
+    TOKEN_CLASSIFICATION_TEMPLATES,
 )
 
 
@@ -20,7 +22,11 @@ def get_all_tasks() -> dict[str, Task]:
     Returns:
         A mapping between names of dataset tasks and their configurations.
     """
-    return {
+    return {
+        cfg.name: cfg
+        for cfg in globals().values()
+        if isinstance(cfg, Task) and cfg != SPEED
+    }
 
 
 LA = Task(
@@ -159,3 +165,40 @@ SPEED = Task(
     default_max_generated_tokens=5,
     default_labels=[],
 )
+
+
+# Used for custom datasets
+
+TEXT_CLASSIFICATION = Task(
+    name="classification",
+    task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
+    template_dict=CLASSIFICATION_TEMPLATES,
+    metrics=[m.mcc_metric, m.macro_f1_metric],
+    default_num_few_shot_examples=12,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=None,
+    uses_logprobs=True,
+)
+
+TOKEN_CLASSIFICATION = Task(
+    name="token-classification",
+    task_group=TaskGroup.TOKEN_CLASSIFICATION,
+    template_dict=TOKEN_CLASSIFICATION_TEMPLATES,
+    metrics=[m.micro_f1_metric],
+    default_num_few_shot_examples=8,
+    default_max_generated_tokens=128,
+    default_labels=None,
+    uses_structured_output=True,
+)
+
+MULTIPLE_CHOICE = Task(
+    name="multiple-choice",
+    task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
+    metrics=[m.mcc_metric, m.accuracy_metric],
+    default_num_few_shot_examples=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=None,
+    default_allowed_model_types=[ModelType.GENERATIVE],
+    uses_logprobs=True,
+)
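`get_all_tasks` now builds its mapping by introspecting the module namespace, so the three new custom-dataset tasks are picked up without touching the function itself. A minimal reproduction of that registry pattern with a stand-in `Task` dataclass (illustrative only, not the EuroEval class):

from dataclasses import dataclass


@dataclass(frozen=True)
class Task:
    name: str


LA = Task(name="linguistic-acceptability")
SPEED = Task(name="speed")
TEXT_CLASSIFICATION = Task(name="classification")


def get_all_tasks() -> dict[str, Task]:
    """Collect every module-level Task instance, excluding the internal SPEED task."""
    return {
        cfg.name: cfg
        for cfg in globals().values()
        if isinstance(cfg, Task) and cfg != SPEED
    }


assert "speed" not in get_all_tasks()
assert "classification" in get_all_tasks()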