EuroEval 16.2.2-py3-none-any.whl → 16.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +7 -4
- euroeval/benchmark_config_factory.py +0 -4
- euroeval/benchmark_modules/base.py +3 -16
- euroeval/benchmark_modules/fresh.py +5 -2
- euroeval/benchmark_modules/hf.py +107 -66
- euroeval/benchmark_modules/litellm.py +103 -55
- euroeval/benchmark_modules/vllm.py +155 -82
- euroeval/benchmarker.py +184 -129
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +1 -1
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +14 -11
- euroeval/data_models.py +12 -4
- euroeval/dataset_configs/__init__.py +3 -0
- euroeval/dataset_configs/czech.py +79 -0
- euroeval/dataset_configs/danish.py +10 -13
- euroeval/dataset_configs/dutch.py +0 -3
- euroeval/dataset_configs/english.py +0 -3
- euroeval/dataset_configs/estonian.py +11 -1
- euroeval/dataset_configs/finnish.py +0 -3
- euroeval/dataset_configs/french.py +0 -3
- euroeval/dataset_configs/german.py +0 -3
- euroeval/dataset_configs/italian.py +0 -3
- euroeval/dataset_configs/latvian.py +2 -4
- euroeval/dataset_configs/lithuanian.py +68 -0
- euroeval/dataset_configs/norwegian.py +0 -3
- euroeval/dataset_configs/polish.py +0 -3
- euroeval/dataset_configs/portuguese.py +0 -3
- euroeval/dataset_configs/slovak.py +60 -0
- euroeval/dataset_configs/spanish.py +0 -3
- euroeval/dataset_configs/swedish.py +10 -15
- euroeval/finetuning.py +21 -15
- euroeval/generation.py +10 -10
- euroeval/generation_utils.py +2 -3
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +10 -6
- euroeval/metrics/llm_as_a_judge.py +5 -3
- euroeval/metrics/pipeline.py +22 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +11 -14
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/linguistic_acceptability.py +30 -3
- euroeval/prompt_templates/multiple_choice.py +34 -1
- euroeval/prompt_templates/named_entity_recognition.py +71 -11
- euroeval/prompt_templates/reading_comprehension.py +41 -3
- euroeval/prompt_templates/sentiment_classification.py +34 -1
- euroeval/prompt_templates/summarization.py +26 -6
- euroeval/scores.py +7 -7
- euroeval/speed_benchmark.py +3 -5
- euroeval/task_group_utils/multiple_choice_classification.py +0 -3
- euroeval/task_group_utils/question_answering.py +0 -3
- euroeval/task_group_utils/sequence_classification.py +43 -31
- euroeval/task_group_utils/text_to_text.py +17 -8
- euroeval/task_group_utils/token_classification.py +10 -9
- euroeval/tokenisation_utils.py +22 -20
- euroeval/utils.py +30 -147
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA +182 -61
- euroeval-16.4.0.dist-info/RECORD +75 -0
- euroeval-16.2.2.dist-info/RECORD +0 -70
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/scores.py
CHANGED
@@ -6,12 +6,12 @@ import warnings
 
 import numpy as np
 
+from .logging_utils import log
+
 if t.TYPE_CHECKING:
     from .metrics import Metric
     from .types import ScoreDict
 
-logger = logging.getLogger("euroeval")
-
 
 def log_scores(
     dataset_name: str,
@@ -48,9 +48,8 @@ def log_scores(
     if model_param is not None:
         model_id += f"#{model_param}"
 
-    logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")
-
     total_dict: dict[str, float] = dict()
+    all_log_strs: list[str] = [f"Finished benchmarking {model_id} on {dataset_name}."]
     for metric in metrics:
         test_score, test_se = aggregate_scores(scores=scores, metric=metric)
         test_score, test_score_str = metric.postprocessing_fn(test_score)
@@ -58,11 +57,12 @@ def log_scores(
         total_dict[f"test_{metric.name}"] = test_score
         total_dict[f"test_{metric.name}_se"] = test_se
         log_str = (
-            f"{metric.pretty_name}: {test_score_str} ± {test_se_str}"
+            f"- {metric.pretty_name}: {test_score_str} ± {test_se_str}"
             if not np.isnan(test_se)
-            else f"{metric.pretty_name}: {test_score_str}"
+            else f"- {metric.pretty_name}: {test_score_str}"
         )
-
+        all_log_strs.append(log_str)
+    log("\n".join(all_log_strs), level=logging.INFO)
 
     return dict(raw=scores, total=total_dict)
 
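The module-level logger = logging.getLogger("euroeval") objects removed in this and the following files are superseded by helpers from the new euroeval/logging_utils.py module (+250 lines, whose contents are not shown in this diff). Judging only from the call sites visible here, log(message, level=...) and log_once(message, level=...), a minimal sketch of what such helpers could look like is given below; the actual 16.4.0 implementation may well differ.

    # Hypothetical sketch of the logging_utils helpers, inferred from the call
    # sites in this diff only; not the actual EuroEval implementation.
    import logging

    logger = logging.getLogger("euroeval")
    _seen_messages: set[str] = set()


    def log(message: str, level: int = logging.INFO) -> None:
        """Log `message` on the shared 'euroeval' logger at the given level."""
        logger.log(level, message)


    def log_once(message: str, level: int = logging.INFO) -> None:
        """Log `message` only the first time it is seen, to avoid repeated output."""
        if message not in _seen_messages:
            _seen_messages.add(message)
            logger.log(level, message)
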
euroeval/speed_benchmark.py
CHANGED
@@ -4,19 +4,17 @@ import logging
 import typing as t
 
 import pyinfer
-from tqdm.auto import tqdm
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
 from .benchmark_modules import HuggingFaceEncoderModel, LiteLLMModel, VLLMModel
 from .exceptions import InvalidBenchmark
+from .logging_utils import get_pbar, log
 from .utils import clear_memory
 
 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def benchmark_speed(
     model: "BenchmarkModule", benchmark_config: "BenchmarkConfig"
@@ -33,7 +31,7 @@ def benchmark_speed(
         Dictionary of scores.
     """
     scores: list[dict[str, float]] = list()
-    for idx in tqdm(
+    for idx in get_pbar(
         iterable=range(benchmark_config.num_iterations),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -41,7 +39,7 @@ def benchmark_speed(
         itr_scores = benchmark_speed_single_iteration(model=model, itr_idx=idx)
         clear_memory()
         scores.append(itr_scores)
-
+        log(f"Scores for iteration {idx}: {itr_scores}", level=logging.DEBUG)
     return scores
 
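get_pbar also comes from the new logging_utils module. Given the keyword arguments used above (iterable, desc, disable), it is presumably a thin wrapper around tqdm that centralises progress-bar configuration; a minimal sketch under that assumption:

    # Hypothetical sketch of get_pbar, assuming it simply forwards to tqdm;
    # the real helper may add extra styling or logging integration.
    import typing as t

    from tqdm.auto import tqdm


    def get_pbar(iterable: t.Iterable, desc: str, disable: bool = False) -> tqdm:
        """Return a tqdm progress bar over `iterable` with shared settings."""
        return tqdm(iterable=iterable, desc=desc, disable=disable)
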
euroeval/task_group_utils/multiple_choice_classification.py
CHANGED
@@ -1,7 +1,6 @@
 """Utility functions related to the multiple-choice classification task group."""
 
 import hashlib
-import logging
 import re
 import typing as t
 from collections import defaultdict
@@ -18,8 +17,6 @@ if t.TYPE_CHECKING:
 
     from ..types import Labels, Predictions
 
-logger = logging.getLogger("euroeval")
-
 
 class MultipleChoiceClassificationTrainer(Trainer):
     """Trainer subclass for multiple-choice classification tasks."""
 
euroeval/task_group_utils/question_answering.py
CHANGED
@@ -1,7 +1,6 @@
 """Utility functions related to the question-answering task group."""
 
 import collections.abc as c
-import logging
 import typing as t
 from collections import defaultdict
 
@@ -26,8 +25,6 @@ if t.TYPE_CHECKING:
     from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
-logger = logging.getLogger("euroeval")
-
 
 class QuestionAnsweringTrainer(Trainer):
     """Trainer subclass for question answering tasks."""
 
euroeval/task_group_utils/sequence_classification.py
CHANGED
@@ -19,13 +19,15 @@ if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
     from transformers.trainer_utils import EvalPrediction
 
-    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
+    from ..data_models import (
+        BenchmarkConfig,
+        DatasetConfig,
+        GenerativeModelOutput,
+        ModelConfig,
+    )
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
@@ -106,6 +108,7 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
+    model_config: "ModelConfig",
     first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
     """Extract the predicted labels from the generated output.
@@ -118,6 +121,8 @@ def extract_labels_from_generation(
             The raw generated output of the model.
         dataset_config:
             The configuration of the dataset.
+        model_config:
+            The configuration of the model.
         first_label_token_mapping:
             A mapping from labels to the first token in each label, or alternatively a
             Boolean value indicating whether the model should output scores (if the
@@ -167,6 +172,7 @@ def extract_labels_from_generation(
     )
 
     new_predicted_labels: list[str] = list()
+    num_predictions_being_very_off = 0
     for idx, predicted_label in enumerate(model_output.sequences):
         # If the prediction includes a boxed answer, use that instead of the full
         # generation
@@ -199,34 +205,40 @@ def extract_labels_from_generation(
         # word edit distance to the predicted label (if invalid model outputs are
         # allowed), or we raise an error
         if min(edit_distances) >= 1000:
-
-                logger.warning(
-                    "No candidate labels found for the predicted label "
-                    f"{predicted_label!r}, out of the candidate labels "
-                    f"{sample_candidate_labels[idx]}. This likely means that the model "
-                    "output is completely off, but since invalid model outputs are "
-                    "allowed for this task, we will use the closest candidate label "
-                    f"({best_candidate_label})) as the output label. If you see this "
-                    "warning very often, please report this issue to the EuroEval "
-                    "team at github.com/EuroEval/EuroEval/issues."
-                )
-                logger.debug(
-                    "The candidate labels were extracted from the prompt: "
-                    f"{input_batch['text'][idx]!r}."
-                )
-            else:
-                raise InvalidBenchmark(
-                    "No candidate labels found for the predicted label "
-                    f"{predicted_label!r}, out of the candidate labels "
-                    f"{sample_candidate_labels[idx]}. This likely means that the model "
-                    "output is completely off, and we cannot extract any labels from "
-                    "it. Please check the model output and the candidate labels. The "
-                    "candidate labels were extracted from the prompt: "
-                    f"{input_batch['text'][idx]!r}."
-                )
+            num_predictions_being_very_off += 1
 
         new_predicted_labels.append(best_candidate_label)
 
+    if num_predictions_being_very_off > 0:
+        if dataset_config.allow_invalid_model_outputs:
+            log_msg = (
+                "No candidate labels found for the predicted label in "
+                f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+                f"of the samples with the model {model_config.model_id!r}. This "
+                "likely means that the model were completely off in these cases, "
+                "but since invalid model outputs are allowed for this task, we used "
+                "the closest candidate labels as the output labels."
+            )
+            level = logging.DEBUG
+            if num_predictions_being_very_off / len(model_output.sequences) > 0.5:
+                log_msg += (
+                    " Since this happened for most of the model's predictions, please "
+                    "report this issue to the EuroEval team at "
+                    "github.com/EuroEval/EuroEval/issues."
+                )
+                level = logging.WARNING
+            log_once(log_msg, level=level)
+        else:
+            raise InvalidBenchmark(
+                "No candidate labels found for the predicted label in "
+                f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+                "of the samples. This likely means that the model were completely "
+                "off in these cases. Since this task does not allow invalid model "
+                "outputs, we have to abort the evaluation. Please re-run the "
+                "evaluation with the `--debug` flag (or `debug=True` if you're using "
+                "the `Benchmarker` API) to see the precise model outputs."
+            )
+
     return new_predicted_labels
 
 
@@ -355,7 +367,7 @@ def get_closest_logprobs_labels(
                     "be determined. This means that using logprobs to extract the "
                     "labels is not reliable, and we will instead fall back to "
                     "extracting the labels using word edit distance.",
-                    level=logging.
+                    level=logging.DEBUG,
                 )
             else:
                 log_once(
@@ -363,7 +375,7 @@ def get_closest_logprobs_labels(
                     "means that using logprobs to extract the labels is not reliable, "
                     "and we will instead fall back to extracting the labels using "
                     "word edit distance.",
-                    level=logging.
+                    level=logging.DEBUG,
                 )
             return None
 
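The per-sample warnings in extract_labels_from_generation are thus replaced by a single aggregated message, and the new model_config parameter lets that message name the offending model. The fallback itself still picks the candidate label closest to the raw generation; as a rough, self-contained illustration of that idea (EuroEval's own code uses word edit distance, the snippet below uses difflib similarity instead):

    # Standalone illustration of the "closest candidate label" fallback; this is
    # not EuroEval's implementation, which is based on word edit distance.
    import difflib


    def closest_candidate(predicted: str, candidates: list[str]) -> str:
        """Return the candidate label most similar to the predicted text."""
        scored = [
            (difflib.SequenceMatcher(a=predicted.lower(), b=label.lower()).ratio(), label)
            for label in candidates
        ]
        return max(scored)[1]


    print(closest_candidate("positiv", ["positive", "negative", "neutral"]))  # positive
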
euroeval/task_group_utils/text_to_text.py
CHANGED
@@ -7,6 +7,7 @@ import numpy as np
 
 from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
 from ..metrics import HuggingFaceMetric
 from ..utils import raise_if_model_output_contains_nan_values
 
@@ -18,9 +19,6 @@ if t.TYPE_CHECKING:
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
@@ -44,6 +42,10 @@ def compute_metrics(
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
         values.
+
+    Raises:
+        InvalidBenchmark:
+            If the metric computation fails.
     """
     model_outputs, labels = model_outputs_and_labels
 
@@ -72,7 +74,7 @@ def compute_metrics(
         ):
             metric.compute_kwargs["device"] = benchmark_config.device.type
 
-
+        for _ in range(num_attempts := 5):
             try:
                 score: float | None = metric(
                     predictions=predictions,
@@ -96,21 +98,28 @@ def compute_metrics(
                     and metric.compute_kwargs.get("device", "cpu") != "cpu"
                 ):
                     metric.compute_kwargs["device"] = "cpu"
-
+                    log(
                         "Out of memory error occurred during the computation of "
                         f"the metric {metric.pretty_name}. Moving the computation to "
-                        "the CPU."
+                        "the CPU.",
+                        level=logging.DEBUG,
                     )
                 else:
                     raise InvalidBenchmark(str(e)) from e
             finally:
                 for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
                     if hasattr(metric, attribute):
-
+                        log(
                             f"Deleting the {attribute!r} attribute of the metric "
-                            f"{metric.pretty_name} to free up memory."
+                            f"{metric.pretty_name} to free up memory.",
+                            level=logging.DEBUG,
                         )
                         delattr(metric, attribute)
+        else:
+            raise InvalidBenchmark(
+                f"Could not compute the metric {metric.pretty_name} after "
+                f"{num_attempts} attempts due to out of memory errors."
+            )
 
         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
 
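The retry loop added to compute_metrics above relies on Python's for/else construct: the else branch only runs when the loop finishes without hitting a break, i.e. when every attempt failed (the successful path presumably breaks out of the loop inside the try block, outside the lines shown in this hunk). A generic, self-contained sketch of the same pattern:

    # Generic for/else retry pattern, analogous to the compute_metrics change;
    # attempt() is a stand-in for the actual metric computation.
    import random


    def attempt() -> float:
        """Stand-in computation that fails roughly half the time."""
        if random.random() < 0.5:
            raise MemoryError("simulated out-of-memory error")
        return 42.0


    for _ in range(num_attempts := 5):
        try:
            score = attempt()
            break  # success: leave the loop, so the else branch is skipped
        except MemoryError:
            continue  # failed attempt: try again
    else:
        # Reached only if the loop never breaks, i.e. all attempts failed
        raise RuntimeError(f"Could not compute the score after {num_attempts} attempts.")

    print(score)
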
euroeval/task_group_utils/token_classification.py
CHANGED
@@ -7,6 +7,7 @@ from copy import deepcopy
 import numpy as np
 
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
 from ..utils import (
     extract_json_dict_from_string,
     raise_if_model_output_contains_nan_values,
@@ -22,9 +23,6 @@ if t.TYPE_CHECKING:
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     has_misc_tags: bool,
@@ -216,17 +214,19 @@ def extract_labels_from_generation(
     prompt_label_mapping = dataset_config.prompt_label_mapping
     for prompt_tag_name, named_entities in prediction_dict.items():
         if not isinstance(named_entities, list):
-
+            log(
                 "The model produced an invalid format for the named entities. "
-                f"Expected a list but got {type(named_entities)}. Skipping."
+                f"Expected a list but got {type(named_entities)}. Skipping.",
+                level=logging.DEBUG,
             )
             continue
         try:
             named_entities = [str(ne) for ne in named_entities]
         except Exception:
-
+            log(
                 "The model produced an invalid format for the named entities. "
-                f"Expected a list of strings but got {named_entities}. Skipping."
+                f"Expected a list of strings but got {named_entities}. Skipping.",
+                level=logging.DEBUG,
             )
             continue
         try:
@@ -236,9 +236,10 @@ def extract_labels_from_generation(
                 if prompt_tag == prompt_tag_name
             ][0]
         except IndexError:
-
+            log(
                 "The model produced an invalid prompt tag name, "
-                f"{prompt_tag_name}. Skipping."
+                f"{prompt_tag_name}. Skipping.",
+                level=logging.DEBUG,
             )
             continue
 
euroeval/tokenisation_utils.py
CHANGED
@@ -5,11 +5,11 @@ import re
 import typing as t
 
 import torch
-from transformers import MistralCommonTokenizer
+from transformers.tokenization_mistral_common import MistralCommonTokenizer
 
 from .enums import GenerativeType
 from .exceptions import InvalidModel
-from .
+from .logging_utils import log, log_once
 
 if t.TYPE_CHECKING:
     from transformers.tokenization_utils import PreTrainedTokenizer
@@ -18,9 +18,6 @@ if t.TYPE_CHECKING:
     from .data_models import DatasetConfig, ModelConfig
 
 
-logger = logging.getLogger("euroeval")
-
-
 def get_special_token_metadata(tokeniser: "PreTrainedTokenizerBase") -> dict:
     """Get the special token metadata for a tokeniser.
 
@@ -182,7 +179,7 @@ def get_bos_token(
             "The model does not have a beginning-of-sequence token. Please ensure that "
             "this has been set in the tokeniser's configuration. Using no BOS token."
             " This may lead to unexpected behavior in the model.",
-            level=logging.
+            level=logging.WARNING,
         )
         return None, None
 
@@ -223,14 +220,14 @@ def get_eos_token(
             "The model does not have an end-of-sequence token. Please ensure that this "
             "has been set in the tokeniser's configuration. Using no EOS token. This "
            "may lead to unexpected behavior in the model.",
-            level=logging.
+            level=logging.WARNING,
        )
        return None, None
 
    log_once(
        f"End-of-sequence token was not set, but detected it as {eos_token!r} with "
        f"ID {eos_token_id}.",
-        level=logging.
+        level=logging.WARNING,
    )
    return eos_token, eos_token_id
 
@@ -306,7 +303,7 @@ def get_pad_token(
            "Could not identify a padding token for the model. Please ensure that "
            "this has been set in the tokeniser's configuration. Using no padding "
            "token. This may lead to unexpected behavior in the model.",
-            level=logging.
+            level=logging.WARNING,
        )
        return None, None
 
@@ -358,12 +355,16 @@ def get_end_of_chat_token_ids(
            x_token_index = idx
            break
    else:
-
+        log(
+            "Could not locate the end-of-chat token for the model.", level=logging.DEBUG
+        )
        return None
 
    end_of_chat_tokens = token_ids[x_token_index + 1 :]
    if len(end_of_chat_tokens) == 0:
-
+        log(
+            "Could not locate the end-of-chat token for the model.", level=logging.DEBUG
+        )
        return None
 
    log_once(
@@ -506,7 +507,8 @@ def get_first_label_token_mapping(
        log_once(
            "We will not use logprobs with the model since the first tokens of the "
            "labels are not distinct. The first tokens for the labels "
-            f"{local_labels} are {first_tokens}"
+            f"{local_labels} are {first_tokens}",
+            level=logging.DEBUG,
        )
        return False
 
@@ -521,7 +523,14 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
    Returns:
        Whether the tokeniser has a chat template.
    """
-    if hasattr(tokeniser, "chat_template"):
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        log_once(
+            "The tokeniser is a Mistral tokeniser, so assuming that the model is "
+            "instruction tuned.",
+            level=logging.DEBUG,
+        )
+        return True
+    elif hasattr(tokeniser, "chat_template"):
        has_template = tokeniser.chat_template is not None
        if has_template:
            log_once(
@@ -530,13 +539,6 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
                level=logging.DEBUG,
            )
        return has_template
-    elif isinstance(tokeniser, MistralCommonTokenizer):
-        log_once(
-            "The tokeniser is a Mistral tokeniser, so assuming that the model is "
-            "instruction tuned.",
-            level=logging.DEBUG,
-        )
-        return True
    else:
        log_once(
            "We cannot find a chat template for the tokeniser, so assuming that the "
|