EuroEval 16.2.2-py3-none-any.whl → 16.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +7 -4
- euroeval/benchmark_config_factory.py +0 -4
- euroeval/benchmark_modules/base.py +3 -16
- euroeval/benchmark_modules/fresh.py +5 -2
- euroeval/benchmark_modules/hf.py +107 -66
- euroeval/benchmark_modules/litellm.py +103 -55
- euroeval/benchmark_modules/vllm.py +155 -82
- euroeval/benchmarker.py +184 -129
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +1 -1
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +14 -11
- euroeval/data_models.py +12 -4
- euroeval/dataset_configs/__init__.py +3 -0
- euroeval/dataset_configs/czech.py +79 -0
- euroeval/dataset_configs/danish.py +10 -13
- euroeval/dataset_configs/dutch.py +0 -3
- euroeval/dataset_configs/english.py +0 -3
- euroeval/dataset_configs/estonian.py +11 -1
- euroeval/dataset_configs/finnish.py +0 -3
- euroeval/dataset_configs/french.py +0 -3
- euroeval/dataset_configs/german.py +0 -3
- euroeval/dataset_configs/italian.py +0 -3
- euroeval/dataset_configs/latvian.py +2 -4
- euroeval/dataset_configs/lithuanian.py +68 -0
- euroeval/dataset_configs/norwegian.py +0 -3
- euroeval/dataset_configs/polish.py +0 -3
- euroeval/dataset_configs/portuguese.py +0 -3
- euroeval/dataset_configs/slovak.py +60 -0
- euroeval/dataset_configs/spanish.py +0 -3
- euroeval/dataset_configs/swedish.py +10 -15
- euroeval/finetuning.py +21 -15
- euroeval/generation.py +10 -10
- euroeval/generation_utils.py +2 -3
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +10 -6
- euroeval/metrics/llm_as_a_judge.py +5 -3
- euroeval/metrics/pipeline.py +22 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +11 -14
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/linguistic_acceptability.py +30 -3
- euroeval/prompt_templates/multiple_choice.py +34 -1
- euroeval/prompt_templates/named_entity_recognition.py +71 -11
- euroeval/prompt_templates/reading_comprehension.py +41 -3
- euroeval/prompt_templates/sentiment_classification.py +34 -1
- euroeval/prompt_templates/summarization.py +26 -6
- euroeval/scores.py +7 -7
- euroeval/speed_benchmark.py +3 -5
- euroeval/task_group_utils/multiple_choice_classification.py +0 -3
- euroeval/task_group_utils/question_answering.py +0 -3
- euroeval/task_group_utils/sequence_classification.py +43 -31
- euroeval/task_group_utils/text_to_text.py +17 -8
- euroeval/task_group_utils/token_classification.py +10 -9
- euroeval/tokenisation_utils.py +22 -20
- euroeval/utils.py +30 -147
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA +182 -61
- euroeval-16.4.0.dist-info/RECORD +75 -0
- euroeval-16.2.2.dist-info/RECORD +0 -70
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/dataset_configs/swedish.py
CHANGED

@@ -1,7 +1,6 @@
 """All Swedish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import SV
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -33,11 +32,11 @@ SUC3_CONFIG = DatasetConfig(
     languages=[SV],
 )
 
-
-    name="
-    pretty_name="the
-    "dataset
-    huggingface_id="EuroEval/
+MULTI_WIKI_QA_SV_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-sv",
+    pretty_name="the truncated version of the Swedish part of the reading "
+    "comprehension dataset MultiWikiQA",
+    huggingface_id="EuroEval/multi-wiki-qa-sv-mini",
     task=RC,
     languages=[SV],
 )
@@ -111,11 +110,11 @@ BELEBELE_SV_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
-
-    name="
-    pretty_name="the
-    "
-    huggingface_id="EuroEval/
+SCANDIQA_SV_CONFIG = DatasetConfig(
+    name="scandiqa-sv",
+    pretty_name="the Swedish part of the truncated version of the question answering "
+    "dataset ScandiQA",
+    huggingface_id="EuroEval/scandiqa-sv-mini",
     task=RC,
     languages=[SV],
     unofficial=True,
@@ -138,9 +137,7 @@ WINOGRANDE_SV_CONFIG = DatasetConfig(
     huggingface_id="EuroEval/winogrande-sv",
     task=COMMON_SENSE,
     languages=[SV],
-    splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 
@@ -176,7 +173,5 @@ SKOLPROV_CONFIG = DatasetConfig(
     huggingface_id="EuroEval/skolprov",
     task=KNOW,
     languages=[SV],
-    splits=["train", "test"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )

euroeval/finetuning.py
CHANGED
@@ -6,7 +6,6 @@ import typing as t
 from functools import partial
 
 import torch
-from tqdm.auto import tqdm
 from transformers.trainer_callback import (
     EarlyStoppingCallback,
     PrinterCallback,
@@ -18,13 +17,9 @@ from transformers.training_args import OptimizerNames, TrainingArguments
 from .callbacks import NeverLeaveProgressCallback
 from .enums import DataType
 from .exceptions import InvalidBenchmark, NaNValueInModelOutput
+from .logging_utils import block_terminal_output, get_pbar, log, log_once
 from .model_loading import load_model
-from .utils import (
-    block_terminal_output,
-    clear_memory,
-    enforce_reproducibility,
-    log_once,
-)
+from .utils import clear_memory, enforce_reproducibility
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -32,8 +27,6 @@ if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def finetune(
     model: "BenchmarkModule",
@@ -58,6 +51,10 @@ def finetune(
 
     Returns:
         A list of dicts containing the scores for each metric for each iteration.
+
+    Raises:
+        InvalidBenchmark:
+            If the benchmark could not be completed.
     """
     # Set the data type to use for the model weights
     using_cuda = benchmark_config.device == torch.device("cuda")
@@ -70,7 +67,7 @@ def finetune(
 
     bs: int = benchmark_config.batch_size
     scores: list[dict[str, float]] = list()
-    for idx in tqdm(
+    for idx in get_pbar(
         iterable=range(benchmark_config.num_iterations),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -80,7 +77,7 @@ def finetune(
         model_already_initialized = idx == 0
 
         # Run a loop here to deal with automatic reduction of batch size
-
+        for _ in range(num_attempts := 10):
             # Clear GPU memory
             if not model_already_initialized:
                 try:
@@ -112,7 +109,10 @@ def finetune(
             )
 
             scores.append(itr_scores)
-
+            log(
+                f"Test scores for iteration {idx}: {itr_scores}",
+                level=logging.DEBUG,
+            )
 
             break
 
@@ -123,9 +123,10 @@ def finetune(
                 if dtype != DataType.FP32:
                     dtype = DataType.FP32
                     model_already_initialized = False
-
+                    log(
                         "NaN value detected in model outputs while using mixed "
-                        "precision. Retrying with full fp32 precision."
+                        "precision. Retrying with full fp32 precision.",
+                        level=logging.DEBUG,
                     )
                 else:
                     raise InvalidBenchmark(
@@ -151,7 +152,12 @@ def finetune(
                 model_already_initialized = False
 
                 bs //= 2
-
+                log(f"Reduced batch size to {bs}", level=logging.DEBUG)
+
+        else:
+            raise InvalidBenchmark(
+                f"Could not benchmark the model after {num_attempts} attempts!"
+            )
 
     return scores
 

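For illustration, here is a minimal, self-contained sketch (not taken from the package) of the batch-size back-off pattern that finetune() now uses: the batch size is halved on every failed attempt, and the loop gives up after a fixed number of attempts instead of retrying forever. The OutOfMemory exception and flaky_train callable are hypothetical stand-ins for the CUDA errors and training step in the real code.

# Hypothetical stand-in for the out-of-memory errors caught in the real code.
class OutOfMemory(Exception):
    pass


def run_with_backoff(train_once, batch_size: int, num_attempts: int = 10) -> dict:
    """Halve the batch size on failure, giving up after `num_attempts` attempts."""
    for _ in range(num_attempts):
        try:
            return train_once(batch_size)
        except OutOfMemory:
            batch_size //= 2
    raise RuntimeError(f"Could not benchmark the model after {num_attempts} attempts!")


def flaky_train(bs: int) -> dict:
    # Pretend that batches larger than 8 do not fit in GPU memory.
    if bs > 8:
        raise OutOfMemory
    return {"batch_size": bs, "accuracy": 0.5}


print(run_with_backoff(flaky_train, batch_size=32))  # {'batch_size': 8, 'accuracy': 0.5}
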
euroeval/generation.py
CHANGED
@@ -11,12 +11,13 @@ from tqdm.auto import tqdm
 
 from .enums import BatchingPreference, TaskGroup
 from .exceptions import InvalidBenchmark
+from .logging_utils import get_pbar, log, log_once
 from .model_cache import (
     ModelCache,
     load_cached_model_outputs,
     split_dataset_into_cached_and_non_cached,
 )
-from .utils import clear_memory
+from .utils import clear_memory
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -29,8 +30,6 @@ if t.TYPE_CHECKING:
         ModelConfig,
     )
 
-logger = logging.getLogger("euroeval")
-
 
 def generate(
     model: "BenchmarkModule",
@@ -78,7 +77,7 @@ def generate(
     )
 
     scores: list[dict[str, float]] = list()
-    for idx in tqdm(
+    for idx in get_pbar(
         iterable=range(len(datasets)),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -90,7 +89,7 @@ def generate(
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
         )
-
+        log(f"Test scores for iteration {idx}: {test_scores}", level=logging.DEBUG)
         scores.append(test_scores)
         clear_memory()
 
@@ -142,14 +141,14 @@ def generate_single_iteration(
     itr: t.Iterable
     match model.batching_preference:
         case BatchingPreference.SINGLE_SAMPLE:
-            itr = tqdm(iterable=non_cached_dataset)
+            itr = get_pbar(iterable=non_cached_dataset)
         case BatchingPreference.ALL_AT_ONCE:
             itr = [non_cached_dataset[:]]
         case _:
             num_batches = len(non_cached_dataset) // benchmark_config.batch_size
             if len(non_cached_dataset) % benchmark_config.batch_size != 0:
                 num_batches += 1
-            itr = tqdm(
+            itr = get_pbar(
                 iterable=mit.batched(
                     iterable=non_cached_dataset, n=benchmark_config.batch_size
                 ),
@@ -297,7 +296,7 @@ def debug_log(
                 + "\n"
                 + "\t".join(labels)
             )
-
+            log("\n\n".join(log_msgs), level=logging.DEBUG)
             return
 
         case (
@@ -347,6 +346,7 @@ def debug_log(
         if labels[idx]:
            data_to_log["Label"] = labels[idx]
         data_to_log |= {key.capitalize(): batch[key][idx] for key in metadata_keys}
-
-            "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items())
+        log(
+            "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items()),
+            level=logging.DEBUG,
+        )

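As a side note, the batching branch in generate_single_iteration above computes the number of batches with a remainder check and then iterates over fixed-size chunks via mit.batched. A short sketch (not from the package) of the same arithmetic:

import more_itertools as mit

samples = list(range(10))
batch_size = 4

# Floor division, plus one extra batch when the dataset size is not a multiple of
# the batch size.
num_batches = len(samples) // batch_size
if len(samples) % batch_size != 0:
    num_batches += 1

batches = [list(batch) for batch in mit.batched(iterable=samples, n=batch_size)]
assert len(batches) == num_batches == 3
print(batches)  # [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
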
euroeval/generation_utils.py
CHANGED
@@ -9,8 +9,9 @@ import typing as t
 
 from .enums import GenerativeType, TaskGroup
 from .exceptions import InvalidBenchmark, InvalidModel
+from .logging_utils import log_once
 from .tokenisation_utils import apply_chat_template
-from .utils import extract_multiple_choice_labels
+from .utils import extract_multiple_choice_labels
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -18,8 +19,6 @@ if t.TYPE_CHECKING:
 
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def extract_few_shot_examples(
     dataset: "DatasetDict",

euroeval/logging_utils.py
ADDED

@@ -0,0 +1,250 @@
+"""Utility functions related to logging."""
+
+import datetime as dt
+import logging
+import os
+import sys
+import warnings
+from io import TextIOWrapper
+
+import litellm
+from datasets.utils import disable_progress_bars as disable_datasets_progress_bars
+from evaluate import disable_progress_bar as disable_evaluate_progress_bar
+from huggingface_hub.utils.tqdm import (
+    disable_progress_bars as disable_hf_hub_progress_bars,
+)
+from termcolor import colored
+from tqdm.auto import tqdm
+from transformers import logging as tf_logging
+
+from .caching_utils import cache_arguments
+
+logger = logging.getLogger("euroeval")
+
+
+def get_pbar(*tqdm_args, **tqdm_kwargs) -> tqdm:
+    """Get a progress bar for vLLM with custom hard-coded arguments.
+
+    Args:
+        *tqdm_args:
+            Positional arguments to pass to tqdm.
+        **tqdm_kwargs:
+            Additional keyword arguments to pass to tqdm.
+
+    Returns:
+        A tqdm progress bar.
+    """
+    tqdm_kwargs = dict(colour="yellow", ascii="—▰", leave=False) | tqdm_kwargs
+    tqdm_kwargs["desc"] = colored(
+        text=tqdm_kwargs.get("desc", "Processing"), color="light_yellow"
+    )
+    return tqdm(*tqdm_args, **tqdm_kwargs)
+
+
+def log(message: str, level: int, colour: str | None = None) -> None:
+    """Log a message.
+
+    Args:
+        message:
+            The message to log.
+        level:
+            The logging level. Defaults to logging.INFO.
+        colour:
+            The colour to use for the message. If None, a default colour will be used
+            based on the logging level.
+
+    Raises:
+        ValueError:
+            If the logging level is invalid.
+    """
+    match level:
+        case logging.DEBUG:
+            message = colored(
+                text=(
+                    "[DEBUG] "
+                    + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                    + f" · {message}"
+                ),
+                color=colour or "light_blue",
+            )
+            logger.debug(message)
+        case logging.INFO:
+            if colour is not None:
+                message = colored(text=message, color=colour)
+            logger.info(message)
+        case logging.WARNING:
+            message = colored(text=message, color=colour or "light_red")
+            logger.warning(message)
+        case logging.ERROR:
+            message = colored(text=message, color=colour or "red")
+            logger.error(message)
+        case logging.CRITICAL:
+            message = colored(text=message, color=colour or "red")
+            logger.critical(message)
+        case _:
+            raise ValueError(f"Invalid logging level: {level}")
+
+
+@cache_arguments("message")
+def log_once(message: str, level: int = logging.INFO, prefix: str = "") -> None:
+    """Log a message once.
+
+    This is ensured by caching the "message" argument and only logging it the first time
+    this function is called with that message.
+
+    Args:
+        message:
+            The message to log.
+        level:
+            The logging level. Defaults to logging.INFO.
+        prefix:
+            A prefix to add to the message, which is not considered when determining if
+            the message has been logged before.
+    """
+    log(message=prefix + message, level=level)
+
+
+def block_terminal_output() -> None:
+    """Blocks libraries from writing output to the terminal.
+
+    This filters warnings from some libraries, sets the logging level to ERROR for some
+    libraries, disabled tokeniser progress bars when using Hugging Face tokenisers, and
+    disables most of the logging from the `transformers` library.
+    """
+    if os.getenv("FULL_LOG") == "1":
+        return
+
+    # Ignore miscellaneous warnings
+    warnings.filterwarnings("ignore", category=UserWarning)
+    warnings.filterwarnings("ignore", category=FutureWarning)
+    logging.getLogger("absl").setLevel(logging.CRITICAL)
+
+    # Disable matplotlib logging
+    logging.getLogger("matplotlib.font_manager").setLevel(logging.CRITICAL)
+
+    # Disable PyTorch logging
+    logging.getLogger("torch.utils.cpp_extension").setLevel(logging.CRITICAL)
+    warnings.filterwarnings(action="ignore", module="torch*")
+    os.environ["TORCH_LOGS"] = "-all"
+
+    # Disable huggingface_hub logging
+    logging.getLogger("huggingface_hub").setLevel(logging.CRITICAL)
+    disable_hf_hub_progress_bars()
+
+    # Disable LiteLLM logging
+    logging.getLogger("LiteLLM").setLevel(logging.CRITICAL)
+    logging.getLogger("LiteLLM Router").setLevel(logging.CRITICAL)
+    logging.getLogger("LiteLLM Proxy").setLevel(logging.CRITICAL)
+    logging.getLogger("openai").setLevel(logging.CRITICAL)
+    logging.getLogger("httpx").setLevel(logging.CRITICAL)
+    litellm.suppress_debug_info = True
+
+    # Disable vLLM logging
+    logging.getLogger("vllm").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm.engine.llm_engine").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm.transformers_utils.tokenizer").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm.core.scheduler").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm.model_executor.weight_utils").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
+    logging.getLogger("mistral_common.tokens.tokenizers.tekken").setLevel(
+        logging.CRITICAL
+    )
+    os.environ["LOG_LEVEL"] = "CRITICAL"
+    os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
+
+    # Disable flashinfer logging
+    os.environ["FLASHINFER_LOGGING_LEVEL"] = "CRITICAL"
+
+    # Disable datasets logging
+    logging.getLogger("datasets").setLevel(logging.CRITICAL)
+    logging.getLogger("filelock").setLevel(logging.CRITICAL)
+    disable_datasets_progress_bars()
+
+    # Disable evaluate logging
+    warnings.filterwarnings("ignore", module="seqeval*")
+    disable_evaluate_progress_bar()
+
+    # Disable most of the `transformers` logging
+    tf_logging._default_log_level = logging.CRITICAL
+    tf_logging.set_verbosity(logging.CRITICAL)
+    logging.getLogger("transformers.trainer").setLevel(logging.CRITICAL)
+    logging.getLogger("accelerate").setLevel(logging.CRITICAL)
+
+
+class no_terminal_output:
+    """Context manager that suppresses all terminal output."""
+
+    def __init__(self, disable: bool = False) -> None:
+        """Initialise the context manager.
+
+        Args:
+            disable:
+                If True, this context manager does nothing.
+        """
+        self.disable = disable
+        self.nothing_file: TextIOWrapper | None = None
+        self._cpp_stdout_file: int | None = None
+        self._cpp_stderr_file: int | None = None
+        try:
+            self._cpp_stdout_file = os.dup(sys.stdout.fileno())
+            self._cpp_stderr_file = os.dup(sys.stderr.fileno())
+        except OSError:
+            self._log_windows_warning()
+
+    def _log_windows_warning(self) -> None:
+        """Log a warning about Windows not supporting blocking terminal output."""
+        log_once(
+            "Your operating system (probably Windows) does not support blocking "
+            "terminal output, so expect more messy output - sorry!",
+            level=logging.WARNING,
+        )
+
+    def __enter__(self) -> None:
+        """Suppress all terminal output."""
+        if not self.disable:
+            self.nothing_file = open(os.devnull, "w")
+            try:
+                os.dup2(fd=self.nothing_file.fileno(), fd2=sys.stdout.fileno())
+                os.dup2(fd=self.nothing_file.fileno(), fd2=sys.stderr.fileno())
+            except OSError:
+                self._log_windows_warning()
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: type[BaseException] | None,
+    ) -> None:
+        """Re-enable terminal output."""
+        if not self.disable:
+            if self.nothing_file is not None:
+                self.nothing_file.close()
+            try:
+                if self._cpp_stdout_file is not None:
+                    os.dup2(fd=self._cpp_stdout_file, fd2=sys.stdout.fileno())
+                if self._cpp_stderr_file is not None:
+                    os.dup2(fd=self._cpp_stderr_file, fd2=sys.stderr.fileno())
+            except OSError:
+                self._log_windows_warning()
+
+
+def adjust_logging_level(verbose: bool, ignore_testing: bool = False) -> int:
+    """Adjust the logging level based on verbosity.
+
+    Args:
+        verbose:
+            Whether to output additional output.
+        ignore_testing:
+            Whether to ignore the testing flag.
+
+    Returns:
+        The logging level that was set.
+    """
+    if hasattr(sys, "_called_from_test") and not ignore_testing:
+        logging_level = logging.CRITICAL
+    elif verbose:
+        logging_level = logging.DEBUG
+    else:
+        logging_level = logging.INFO
+    logger.setLevel(logging_level)
+    return logging_level

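The new logging_utils module centralises EuroEval's progress bars and coloured logging. Below is a hypothetical usage sketch, assuming EuroEval 16.4.0 is installed; the names and signatures are taken from the added file above.

import logging

from euroeval.logging_utils import adjust_logging_level, get_pbar, log, log_once

adjust_logging_level(verbose=True)  # sets the "euroeval" logger to DEBUG

log("Starting a demo run", level=logging.INFO)
log_once("Cached message, only emitted the first time", level=logging.WARNING)
log_once("Cached message, only emitted the first time", level=logging.WARNING)  # skipped

for _ in get_pbar(iterable=range(3), desc="Demo"):
    pass  # yellow tqdm bar that is cleared on completion (leave=False)
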
euroeval/metrics/base.py
CHANGED
@@ -2,7 +2,6 @@
 
 import abc
 import collections.abc as c
-import logging
 import typing as t
 
 if t.TYPE_CHECKING:
@@ -10,8 +9,6 @@ if t.TYPE_CHECKING:
 
     from ..data_models import BenchmarkConfig, DatasetConfig
 
-logger: logging.Logger = logging.getLogger("euroeval")
-
 
 class Metric(abc.ABC):
     """Abstract base class for all metrics."""

euroeval/metrics/huggingface.py
CHANGED
@@ -1,7 +1,6 @@
 """All the Hugging Face metrics used in EuroEval."""
 
 import collections.abc as c
-import logging
 import typing as t
 from pathlib import Path
 
@@ -9,7 +8,7 @@ import evaluate
 import numpy as np
 from datasets import DownloadConfig
 
-from ..
+from ..logging_utils import no_terminal_output
 from .base import Metric
 
 if t.TYPE_CHECKING:
@@ -18,8 +17,6 @@ if t.TYPE_CHECKING:
 
     from ..data_models import BenchmarkConfig, DatasetConfig
 
-logger: logging.Logger = logging.getLogger("euroeval")
-
 
 class HuggingFaceMetric(Metric):
     """A metric which is implemented in the `evaluate` package.
@@ -126,7 +123,7 @@ class HuggingFaceMetric(Metric):
 
         assert self.metric is not None
 
-        with
+        with no_terminal_output(disable=benchmark_config.verbose):
             results = self.metric.compute(
                 predictions=predictions, references=references, **self.compute_kwargs
             )
@@ -145,6 +142,13 @@ class HuggingFaceMetric(Metric):
 
         return score
 
+    def __del__(self) -> None:
+        """Clean up the metric from memory."""
+        if self.metric is not None:
+            if self.metric.writer is not None:
+                self.metric.writer.close()
+            del self.metric
+
 
 mcc_metric = HuggingFaceMetric(
     name="mcc",
@@ -197,7 +201,7 @@ bert_score_metric = HuggingFaceMetric(
     huggingface_id="bertscore",
     results_key="f1",
     compute_kwargs=dict(
-        model_type="microsoft/mdeberta-v3-base", device="
+        model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
     ),
 )
 
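The updated HuggingFaceMetric wraps metric.compute in the new no_terminal_output context manager unless verbose mode is enabled. Below is a rough, standard-library-only sketch of the same idea (not the package's implementation): contextlib.redirect_stdout/redirect_stderr only capture Python-level writes, whereas the real context manager redirects the OS-level file descriptors and therefore also silences C/C++ extensions.

import contextlib
import io

import evaluate  # pip install evaluate

metric = evaluate.load("accuracy")
with contextlib.redirect_stdout(io.StringIO()), contextlib.redirect_stderr(io.StringIO()):
    results = metric.compute(predictions=[0, 1, 1], references=[0, 1, 0])
print(results)  # {'accuracy': 0.666...}
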
euroeval/metrics/llm_as_a_judge.py
CHANGED

@@ -8,6 +8,7 @@ from pathlib import Path
 from pydantic import BaseModel, Field
 
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
 from ..model_cache import ModelCache
 from ..utils import extract_json_dict_from_string
 from .base import Metric
@@ -17,8 +18,6 @@ if t.TYPE_CHECKING:
 
     from ..data_models import BenchmarkConfig, DatasetConfig
 
-logger: logging.Logger = logging.getLogger("euroeval")
-
 
 class LLMAsAJudgeMetric(Metric):
     """Use an LLM to judge the quality of the predictions."""
@@ -190,7 +189,10 @@ class LLMAsAJudgeMetric(Metric):
         # Calculate the scores using the scoring function
         scores = [self.scoring_fn(output) for output in outputs]
         if not scores:
-
+            log(
+                f"No scores were calculated for {self.pretty_name}.",
+                level=logging.WARNING,
+            )
             return None
         return sum(scores) / len(scores)
 

euroeval/metrics/pipeline.py
CHANGED
@@ -11,6 +11,7 @@ import numpy as np
 from scipy.special import expit as sigmoid
 
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log, no_terminal_output
 from ..utils import unscramble
 from .base import Metric
 
@@ -20,8 +21,6 @@ if t.TYPE_CHECKING:
 
     from ..data_models import BenchmarkConfig, DatasetConfig
 
-logger: logging.Logger = logging.getLogger("euroeval")
-
 
 T = t.TypeVar("T", bound=int | float | str | bool)
 
@@ -121,16 +120,22 @@ class PipelineMetric(Metric):
             The calculated metric score, or None if the score should be ignored.
         """
         if self.pipeline is None:
-            self.pipeline = self._download_pipeline(
+            self.pipeline = self._download_pipeline(
+                cache_dir=benchmark_config.cache_dir
+            )
         if self.preprocessing_fn is not None:
             predictions = self.preprocessing_fn(
                 predictions=predictions, dataset=dataset
             )
         return self.pipeline_scoring_function(self.pipeline, predictions)
 
-    def _download_pipeline(self) -> "Pipeline":
+    def _download_pipeline(self, cache_dir: str) -> "Pipeline":
         """Download the scikit-learn pipeline from the given URL.
 
+        Args:
+            cache_dir:
+                The directory to use for caching the downloaded pipeline.
+
         Returns:
             The downloaded scikit-learn pipeline.
 
@@ -138,10 +143,13 @@ class PipelineMetric(Metric):
             InvalidBenchmark:
                 If the loading of the pipeline fails for any reason.
         """
-
-
-
-
+        log(f"Loading pipeline from {self.pipeline_repo}...", level=logging.DEBUG)
+        with no_terminal_output():
+            folder_path = hf_hub.HfApi(
+                token=unscramble("XbjeOLhwebEaSaDUMqqaPaPIhgOcyOfDpGnX_")
+            ).snapshot_download(
+                repo_id=self.pipeline_repo, repo_type="model", cache_dir=cache_dir
+            )
         model_path = Path(folder_path, self.pipeline_file_name)
         try:
             with model_path.open(mode="rb") as f:
@@ -150,7 +158,7 @@ class PipelineMetric(Metric):
             raise InvalidBenchmark(
                 f"Failed to load pipeline from {self.pipeline_repo!r}: {e}"
             ) from e
-
+        log(f"Successfully loaded pipeline: {pipeline}", level=logging.DEBUG)
         return pipeline
 
 
@@ -191,6 +199,11 @@ def european_values_preprocessing_fn(
             for idx, choice in idx_to_choice.items()
             if choice is not None
         }
+        if prediction not in idx_to_choice:
+            raise InvalidBenchmark(
+                f"The prediction {prediction} is not a valid index for the "
+                f"question with choices {idx_to_choice}."
+            )
         integer_prediction = idx_to_choice[prediction]
         integer_predictions.append(integer_prediction)
 
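The _download_pipeline change above threads the benchmark's cache_dir through to snapshot_download, so the scikit-learn pipeline is fetched once and reused from the cache afterwards. A hedged sketch of the same download-and-unpickle flow, using a hypothetical public repository and file name (the real code authenticates with an unscrambled token and uses the repository configured on the metric):

import pickle
from pathlib import Path

from huggingface_hub import snapshot_download

folder_path = snapshot_download(
    repo_id="some-org/some-sklearn-pipeline",  # hypothetical repository id
    repo_type="model",
    cache_dir=Path.home() / ".cache" / "euroeval",  # reused on subsequent runs
)
with Path(folder_path, "pipeline.pkl").open(mode="rb") as f:  # hypothetical file name
    pipeline = pickle.load(f)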