EuroEval 16.2.2-py3-none-any.whl → 16.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (65)
  1. euroeval/__init__.py +7 -4
  2. euroeval/benchmark_config_factory.py +0 -4
  3. euroeval/benchmark_modules/base.py +3 -16
  4. euroeval/benchmark_modules/fresh.py +5 -2
  5. euroeval/benchmark_modules/hf.py +107 -66
  6. euroeval/benchmark_modules/litellm.py +103 -55
  7. euroeval/benchmark_modules/vllm.py +155 -82
  8. euroeval/benchmarker.py +184 -129
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +1 -1
  12. euroeval/constants.py +9 -0
  13. euroeval/data_loading.py +14 -11
  14. euroeval/data_models.py +12 -4
  15. euroeval/dataset_configs/__init__.py +3 -0
  16. euroeval/dataset_configs/czech.py +79 -0
  17. euroeval/dataset_configs/danish.py +10 -13
  18. euroeval/dataset_configs/dutch.py +0 -3
  19. euroeval/dataset_configs/english.py +0 -3
  20. euroeval/dataset_configs/estonian.py +11 -1
  21. euroeval/dataset_configs/finnish.py +0 -3
  22. euroeval/dataset_configs/french.py +0 -3
  23. euroeval/dataset_configs/german.py +0 -3
  24. euroeval/dataset_configs/italian.py +0 -3
  25. euroeval/dataset_configs/latvian.py +2 -4
  26. euroeval/dataset_configs/lithuanian.py +68 -0
  27. euroeval/dataset_configs/norwegian.py +0 -3
  28. euroeval/dataset_configs/polish.py +0 -3
  29. euroeval/dataset_configs/portuguese.py +0 -3
  30. euroeval/dataset_configs/slovak.py +60 -0
  31. euroeval/dataset_configs/spanish.py +0 -3
  32. euroeval/dataset_configs/swedish.py +10 -15
  33. euroeval/finetuning.py +21 -15
  34. euroeval/generation.py +10 -10
  35. euroeval/generation_utils.py +2 -3
  36. euroeval/logging_utils.py +250 -0
  37. euroeval/metrics/base.py +0 -3
  38. euroeval/metrics/huggingface.py +10 -6
  39. euroeval/metrics/llm_as_a_judge.py +5 -3
  40. euroeval/metrics/pipeline.py +22 -9
  41. euroeval/metrics/speed.py +0 -3
  42. euroeval/model_cache.py +11 -14
  43. euroeval/model_config.py +4 -5
  44. euroeval/model_loading.py +3 -0
  45. euroeval/prompt_templates/linguistic_acceptability.py +30 -3
  46. euroeval/prompt_templates/multiple_choice.py +34 -1
  47. euroeval/prompt_templates/named_entity_recognition.py +71 -11
  48. euroeval/prompt_templates/reading_comprehension.py +41 -3
  49. euroeval/prompt_templates/sentiment_classification.py +34 -1
  50. euroeval/prompt_templates/summarization.py +26 -6
  51. euroeval/scores.py +7 -7
  52. euroeval/speed_benchmark.py +3 -5
  53. euroeval/task_group_utils/multiple_choice_classification.py +0 -3
  54. euroeval/task_group_utils/question_answering.py +0 -3
  55. euroeval/task_group_utils/sequence_classification.py +43 -31
  56. euroeval/task_group_utils/text_to_text.py +17 -8
  57. euroeval/task_group_utils/token_classification.py +10 -9
  58. euroeval/tokenisation_utils.py +22 -20
  59. euroeval/utils.py +30 -147
  60. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA +182 -61
  61. euroeval-16.4.0.dist-info/RECORD +75 -0
  62. euroeval-16.2.2.dist-info/RECORD +0 -70
  63. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
  64. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
  65. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/dataset_configs/swedish.py CHANGED
@@ -1,7 +1,6 @@
 """All Swedish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import SV
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -33,11 +32,11 @@ SUC3_CONFIG = DatasetConfig(
     languages=[SV],
 )
 
-SCANDIQA_SV_CONFIG = DatasetConfig(
-    name="scandiqa-sv",
-    pretty_name="the Swedish part of the truncated version of the question answering "
-    "dataset ScandiQA",
-    huggingface_id="EuroEval/scandiqa-sv-mini",
+MULTI_WIKI_QA_SV_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-sv",
+    pretty_name="the truncated version of the Swedish part of the reading "
+    "comprehension dataset MultiWikiQA",
+    huggingface_id="EuroEval/multi-wiki-qa-sv-mini",
     task=RC,
     languages=[SV],
 )
@@ -111,11 +110,11 @@ BELEBELE_SV_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
-MULTI_WIKI_QA_SV_CONFIG = DatasetConfig(
-    name="multi-wiki-qa-sv",
-    pretty_name="the truncated version of the Swedish part of the reading "
-    "comprehension dataset MultiWikiQA",
-    huggingface_id="EuroEval/multi-wiki-qa-sv-mini",
+SCANDIQA_SV_CONFIG = DatasetConfig(
+    name="scandiqa-sv",
+    pretty_name="the Swedish part of the truncated version of the question answering "
+    "dataset ScandiQA",
+    huggingface_id="EuroEval/scandiqa-sv-mini",
     task=RC,
     languages=[SV],
     unofficial=True,
@@ -138,9 +137,7 @@ WINOGRANDE_SV_CONFIG = DatasetConfig(
     huggingface_id="EuroEval/winogrande-sv",
     task=COMMON_SENSE,
     languages=[SV],
-    splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 
@@ -176,7 +173,5 @@ SKOLPROV_CONFIG = DatasetConfig(
    huggingface_id="EuroEval/skolprov",
     task=KNOW,
     languages=[SV],
-    splits=["train", "test"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
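
Editor's note: with this change, multi-wiki-qa-sv replaces scandiqa-sv as the official Swedish reading-comprehension dataset (ScandiQA stays available but is now marked unofficial), and the Winogrande and Skolprov configs drop their explicit splits and _allowed_model_types overrides. A hedged usage sketch for selecting the renamed dataset; the Benchmarker call below is assumed from the EuroEval documentation and may differ in detail:

# Hypothetical usage sketch (not part of this diff): selecting the renamed
# Swedish reading-comprehension dataset via its `name` field.
from euroeval import Benchmarker

benchmarker = Benchmarker()
benchmarker.benchmark(
    model="AI-Sweden-Models/roberta-large-1160k",  # placeholder model id
    dataset="multi-wiki-qa-sv",  # previously "scandiqa-sv" was the official RC dataset
)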
euroeval/finetuning.py CHANGED
@@ -6,7 +6,6 @@ import typing as t
 from functools import partial
 
 import torch
-from tqdm.auto import tqdm
 from transformers.trainer_callback import (
     EarlyStoppingCallback,
     PrinterCallback,
@@ -18,13 +17,9 @@ from transformers.training_args import OptimizerNames, TrainingArguments
 from .callbacks import NeverLeaveProgressCallback
 from .enums import DataType
 from .exceptions import InvalidBenchmark, NaNValueInModelOutput
+from .logging_utils import block_terminal_output, get_pbar, log, log_once
 from .model_loading import load_model
-from .utils import (
-    block_terminal_output,
-    clear_memory,
-    enforce_reproducibility,
-    log_once,
-)
+from .utils import clear_memory, enforce_reproducibility
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -32,8 +27,6 @@ if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def finetune(
     model: "BenchmarkModule",
@@ -58,6 +51,10 @@
 
     Returns:
         A list of dicts containing the scores for each metric for each iteration.
+
+    Raises:
+        InvalidBenchmark:
+            If the benchmark could not be completed.
     """
     # Set the data type to use for the model weights
     using_cuda = benchmark_config.device == torch.device("cuda")
@@ -70,7 +67,7 @@
 
     bs: int = benchmark_config.batch_size
     scores: list[dict[str, float]] = list()
-    for idx in tqdm(
+    for idx in get_pbar(
         iterable=range(benchmark_config.num_iterations),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -80,7 +77,7 @@
         model_already_initialized = idx == 0
 
         # Run a loop here to deal with automatic reduction of batch size
-        while True:
+        for _ in range(num_attempts := 10):
             # Clear GPU memory
             if not model_already_initialized:
                 try:
@@ -112,7 +109,10 @@
                 )
 
                 scores.append(itr_scores)
-                logger.debug(f"Test scores for iteration {idx}: {itr_scores}")
+                log(
+                    f"Test scores for iteration {idx}: {itr_scores}",
+                    level=logging.DEBUG,
+                )
 
                 break
 
@@ -123,9 +123,10 @@
                 if dtype != DataType.FP32:
                     dtype = DataType.FP32
                     model_already_initialized = False
-                    logger.debug(
+                    log(
                         "NaN value detected in model outputs while using mixed "
-                        "precision. Retrying with full fp32 precision."
+                        "precision. Retrying with full fp32 precision.",
+                        level=logging.DEBUG,
                     )
                 else:
                     raise InvalidBenchmark(
@@ -151,7 +152,12 @@
                 model_already_initialized = False
 
                 bs //= 2
-                logger.debug(f"Reduced batch size to {bs}")
+                log(f"Reduced batch size to {bs}", level=logging.DEBUG)
+
+        else:
+            raise InvalidBenchmark(
+                f"Could not benchmark the model after {num_attempts} attempts!"
+            )
 
     return scores
 
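Editor's note: the retry loop in finetune is now bounded: `for _ in range(num_attempts := 10)` replaces `while True`, and the new `else` clause raises InvalidBenchmark only when every attempt has been used up, since a successful iteration leaves the loop with `break`. A minimal, self-contained sketch of this for/else retry pattern (illustrative only, not the package's code):

# Python's `for ... else` runs the `else` block only when the loop was not
# exited via `break`, which is exactly the "all retries exhausted" case.
def run_with_retries(attempt, max_attempts: int = 10) -> object:
    for _ in range(max_attempts):
        try:
            result = attempt()
        except MemoryError:
            continue  # e.g. halve the batch size here, then retry
        break  # success, so the `else` clause below is skipped
    else:
        raise RuntimeError(f"Could not finish after {max_attempts} attempts!")
    return result

print(run_with_retries(lambda: 42))
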
euroeval/generation.py CHANGED
@@ -11,12 +11,13 @@ from tqdm.auto import tqdm
 
 from .enums import BatchingPreference, TaskGroup
 from .exceptions import InvalidBenchmark
+from .logging_utils import get_pbar, log, log_once
 from .model_cache import (
     ModelCache,
     load_cached_model_outputs,
     split_dataset_into_cached_and_non_cached,
 )
-from .utils import clear_memory, log_once
+from .utils import clear_memory
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -29,8 +30,6 @@
         ModelConfig,
     )
 
-logger = logging.getLogger("euroeval")
-
 
 def generate(
     model: "BenchmarkModule",
@@ -78,7 +77,7 @@
     )
 
     scores: list[dict[str, float]] = list()
-    for idx in tqdm(
+    for idx in get_pbar(
         iterable=range(len(datasets)),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -90,7 +89,7 @@
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
         )
-        logger.debug(f"Test scores for iteration {idx}: {test_scores}")
+        log(f"Test scores for iteration {idx}: {test_scores}", level=logging.DEBUG)
        scores.append(test_scores)
         clear_memory()
 
@@ -142,14 +141,14 @@
     itr: t.Iterable
     match model.batching_preference:
         case BatchingPreference.SINGLE_SAMPLE:
-            itr = tqdm(iterable=non_cached_dataset, leave=False)
+            itr = get_pbar(iterable=non_cached_dataset)
         case BatchingPreference.ALL_AT_ONCE:
             itr = [non_cached_dataset[:]]
         case _:
             num_batches = len(non_cached_dataset) // benchmark_config.batch_size
             if len(non_cached_dataset) % benchmark_config.batch_size != 0:
                 num_batches += 1
-            itr = tqdm(
+            itr = get_pbar(
                 iterable=mit.batched(
                     iterable=non_cached_dataset, n=benchmark_config.batch_size
                 ),
@@ -297,7 +296,7 @@ def debug_log(
                     + "\n"
                     + "\t".join(labels)
                 )
-            logger.info("\n\n".join(log_msgs))
+            log("\n\n".join(log_msgs), level=logging.DEBUG)
             return
 
         case (
@@ -347,6 +346,7 @@
            if labels[idx]:
                data_to_log["Label"] = labels[idx]
            data_to_log |= {key.capitalize(): batch[key][idx] for key in metadata_keys}
-           logger.info(
-               "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items())
+           log(
+               "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items()),
+               level=logging.DEBUG,
            )
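
Editor's note: in generate_single_iteration, batching now goes through get_pbar as well, and the batch count is a ceiling division over benchmark_config.batch_size with the chunks produced by mit.batched (more_itertools). A standalone sketch of that chunking arithmetic, outside EuroEval:

# Illustrative only: the same ceiling-division batch count and chunking as above,
# applied to a toy list with more_itertools directly.
import more_itertools as mit

samples = list(range(10))
batch_size = 4

num_batches = len(samples) // batch_size
if len(samples) % batch_size != 0:
    num_batches += 1  # ceiling division: 10 items / 4 per batch -> 3 batches

batches = list(mit.batched(samples, batch_size))
assert len(batches) == num_batches
assert batches[-1] == (8, 9)  # the final batch is allowed to be smaller
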
euroeval/generation_utils.py CHANGED
@@ -9,8 +9,9 @@ import typing as t
 
 from .enums import GenerativeType, TaskGroup
 from .exceptions import InvalidBenchmark, InvalidModel
+from .logging_utils import log_once
 from .tokenisation_utils import apply_chat_template
-from .utils import extract_multiple_choice_labels, log_once
+from .utils import extract_multiple_choice_labels
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -18,8 +19,6 @@ if t.TYPE_CHECKING:
 
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def extract_few_shot_examples(
     dataset: "DatasetDict",
euroeval/logging_utils.py ADDED
@@ -0,0 +1,250 @@
+"""Utility functions related to logging."""
+
+import datetime as dt
+import logging
+import os
+import sys
+import warnings
+from io import TextIOWrapper
+
+import litellm
+from datasets.utils import disable_progress_bars as disable_datasets_progress_bars
+from evaluate import disable_progress_bar as disable_evaluate_progress_bar
+from huggingface_hub.utils.tqdm import (
+    disable_progress_bars as disable_hf_hub_progress_bars,
+)
+from termcolor import colored
+from tqdm.auto import tqdm
+from transformers import logging as tf_logging
+
+from .caching_utils import cache_arguments
+
+logger = logging.getLogger("euroeval")
+
+
+def get_pbar(*tqdm_args, **tqdm_kwargs) -> tqdm:
+    """Get a progress bar for vLLM with custom hard-coded arguments.
+
+    Args:
+        *tqdm_args:
+            Positional arguments to pass to tqdm.
+        **tqdm_kwargs:
+            Additional keyword arguments to pass to tqdm.
+
+    Returns:
+        A tqdm progress bar.
+    """
+    tqdm_kwargs = dict(colour="yellow", ascii="—▰", leave=False) | tqdm_kwargs
+    tqdm_kwargs["desc"] = colored(
+        text=tqdm_kwargs.get("desc", "Processing"), color="light_yellow"
+    )
+    return tqdm(*tqdm_args, **tqdm_kwargs)
+
+
+def log(message: str, level: int, colour: str | None = None) -> None:
+    """Log a message.
+
+    Args:
+        message:
+            The message to log.
+        level:
+            The logging level. Defaults to logging.INFO.
+        colour:
+            The colour to use for the message. If None, a default colour will be used
+            based on the logging level.
+
+    Raises:
+        ValueError:
+            If the logging level is invalid.
+    """
+    match level:
+        case logging.DEBUG:
+            message = colored(
+                text=(
+                    "[DEBUG] "
+                    + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                    + f" · {message}"
+                ),
+                color=colour or "light_blue",
+            )
+            logger.debug(message)
+        case logging.INFO:
+            if colour is not None:
+                message = colored(text=message, color=colour)
+            logger.info(message)
+        case logging.WARNING:
+            message = colored(text=message, color=colour or "light_red")
+            logger.warning(message)
+        case logging.ERROR:
+            message = colored(text=message, color=colour or "red")
+            logger.error(message)
+        case logging.CRITICAL:
+            message = colored(text=message, color=colour or "red")
+            logger.critical(message)
+        case _:
+            raise ValueError(f"Invalid logging level: {level}")
+
+
+@cache_arguments("message")
+def log_once(message: str, level: int = logging.INFO, prefix: str = "") -> None:
+    """Log a message once.
+
+    This is ensured by caching the "message" argument and only logging it the first time
+    this function is called with that message.
+
+    Args:
+        message:
+            The message to log.
+        level:
+            The logging level. Defaults to logging.INFO.
+        prefix:
+            A prefix to add to the message, which is not considered when determining if
+            the message has been logged before.
+    """
+    log(message=prefix + message, level=level)
+
+
+def block_terminal_output() -> None:
+    """Blocks libraries from writing output to the terminal.
+
+    This filters warnings from some libraries, sets the logging level to ERROR for some
+    libraries, disabled tokeniser progress bars when using Hugging Face tokenisers, and
+    disables most of the logging from the `transformers` library.
+    """
+    if os.getenv("FULL_LOG") == "1":
+        return
+
+    # Ignore miscellaneous warnings
+    warnings.filterwarnings("ignore", category=UserWarning)
+    warnings.filterwarnings("ignore", category=FutureWarning)
+    logging.getLogger("absl").setLevel(logging.CRITICAL)
+
+    # Disable matplotlib logging
+    logging.getLogger("matplotlib.font_manager").setLevel(logging.CRITICAL)
+
+    # Disable PyTorch logging
+    logging.getLogger("torch.utils.cpp_extension").setLevel(logging.CRITICAL)
+    warnings.filterwarnings(action="ignore", module="torch*")
+    os.environ["TORCH_LOGS"] = "-all"
+
+    # Disable huggingface_hub logging
+    logging.getLogger("huggingface_hub").setLevel(logging.CRITICAL)
+    disable_hf_hub_progress_bars()
+
+    # Disable LiteLLM logging
+    logging.getLogger("LiteLLM").setLevel(logging.CRITICAL)
+    logging.getLogger("LiteLLM Router").setLevel(logging.CRITICAL)
+    logging.getLogger("LiteLLM Proxy").setLevel(logging.CRITICAL)
+    logging.getLogger("openai").setLevel(logging.CRITICAL)
+    logging.getLogger("httpx").setLevel(logging.CRITICAL)
+    litellm.suppress_debug_info = True
+
+    # Disable vLLM logging
+    logging.getLogger("vllm").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm.engine.llm_engine").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm.transformers_utils.tokenizer").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm.core.scheduler").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm.model_executor.weight_utils").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
+    logging.getLogger("mistral_common.tokens.tokenizers.tekken").setLevel(
+        logging.CRITICAL
+    )
+    os.environ["LOG_LEVEL"] = "CRITICAL"
+    os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
+
+    # Disable flashinfer logging
+    os.environ["FLASHINFER_LOGGING_LEVEL"] = "CRITICAL"
+
+    # Disable datasets logging
+    logging.getLogger("datasets").setLevel(logging.CRITICAL)
+    logging.getLogger("filelock").setLevel(logging.CRITICAL)
+    disable_datasets_progress_bars()
+
+    # Disable evaluate logging
+    warnings.filterwarnings("ignore", module="seqeval*")
+    disable_evaluate_progress_bar()
+
+    # Disable most of the `transformers` logging
+    tf_logging._default_log_level = logging.CRITICAL
+    tf_logging.set_verbosity(logging.CRITICAL)
+    logging.getLogger("transformers.trainer").setLevel(logging.CRITICAL)
+    logging.getLogger("accelerate").setLevel(logging.CRITICAL)
+
+
+class no_terminal_output:
+    """Context manager that suppresses all terminal output."""
+
+    def __init__(self, disable: bool = False) -> None:
+        """Initialise the context manager.
+
+        Args:
+            disable:
+                If True, this context manager does nothing.
+        """
+        self.disable = disable
+        self.nothing_file: TextIOWrapper | None = None
+        self._cpp_stdout_file: int | None = None
+        self._cpp_stderr_file: int | None = None
+        try:
+            self._cpp_stdout_file = os.dup(sys.stdout.fileno())
+            self._cpp_stderr_file = os.dup(sys.stderr.fileno())
+        except OSError:
+            self._log_windows_warning()
+
+    def _log_windows_warning(self) -> None:
+        """Log a warning about Windows not supporting blocking terminal output."""
+        log_once(
+            "Your operating system (probably Windows) does not support blocking "
+            "terminal output, so expect more messy output - sorry!",
+            level=logging.WARNING,
+        )
+
+    def __enter__(self) -> None:
+        """Suppress all terminal output."""
+        if not self.disable:
+            self.nothing_file = open(os.devnull, "w")
+            try:
+                os.dup2(fd=self.nothing_file.fileno(), fd2=sys.stdout.fileno())
+                os.dup2(fd=self.nothing_file.fileno(), fd2=sys.stderr.fileno())
+            except OSError:
+                self._log_windows_warning()
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_val: BaseException | None,
+        exc_tb: type[BaseException] | None,
+    ) -> None:
+        """Re-enable terminal output."""
+        if not self.disable:
+            if self.nothing_file is not None:
+                self.nothing_file.close()
+            try:
+                if self._cpp_stdout_file is not None:
+                    os.dup2(fd=self._cpp_stdout_file, fd2=sys.stdout.fileno())
+                if self._cpp_stderr_file is not None:
+                    os.dup2(fd=self._cpp_stderr_file, fd2=sys.stderr.fileno())
+            except OSError:
+                self._log_windows_warning()
+
+
+def adjust_logging_level(verbose: bool, ignore_testing: bool = False) -> int:
+    """Adjust the logging level based on verbosity.
+
+    Args:
+        verbose:
+            Whether to output additional output.
+        ignore_testing:
+            Whether to ignore the testing flag.
+
+    Returns:
+        The logging level that was set.
+    """
+    if hasattr(sys, "_called_from_test") and not ignore_testing:
+        logging_level = logging.CRITICAL
+    elif verbose:
+        logging_level = logging.DEBUG
+    else:
+        logging_level = logging.INFO
+    logger.setLevel(logging_level)
+    return logging_level
euroeval/metrics/base.py CHANGED
@@ -2,7 +2,6 @@
 
 import abc
 import collections.abc as c
-import logging
 import typing as t
 
 if t.TYPE_CHECKING:
@@ -10,8 +9,6 @@ if t.TYPE_CHECKING:
 
     from ..data_models import BenchmarkConfig, DatasetConfig
 
-logger: logging.Logger = logging.getLogger("euroeval")
-
 
 class Metric(abc.ABC):
     """Abstract base class for all metrics."""
euroeval/metrics/huggingface.py CHANGED
@@ -1,7 +1,6 @@
 """All the Hugging Face metrics used in EuroEval."""
 
 import collections.abc as c
-import logging
 import typing as t
 from pathlib import Path
 
@@ -9,7 +8,7 @@ import evaluate
 import numpy as np
 from datasets import DownloadConfig
 
-from ..utils import HiddenPrints
+from ..logging_utils import no_terminal_output
 from .base import Metric
 
 if t.TYPE_CHECKING:
@@ -18,8 +17,6 @@
 
     from ..data_models import BenchmarkConfig, DatasetConfig
 
-logger: logging.Logger = logging.getLogger("euroeval")
-
 
 class HuggingFaceMetric(Metric):
     """A metric which is implemented in the `evaluate` package.
@@ -126,7 +123,7 @@
 
         assert self.metric is not None
 
-        with HiddenPrints():
+        with no_terminal_output(disable=benchmark_config.verbose):
             results = self.metric.compute(
                 predictions=predictions, references=references, **self.compute_kwargs
             )
@@ -145,6 +142,13 @@
 
         return score
 
+    def __del__(self) -> None:
+        """Clean up the metric from memory."""
+        if self.metric is not None:
+            if self.metric.writer is not None:
+                self.metric.writer.close()
+            del self.metric
+
 
 mcc_metric = HuggingFaceMetric(
     name="mcc",
@@ -197,7 +201,7 @@ bert_score_metric = HuggingFaceMetric(
     huggingface_id="bertscore",
     results_key="f1",
     compute_kwargs=dict(
-        model_type="microsoft/mdeberta-v3-base", device="cpu", batch_size=16
+        model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
     ),
 )
 
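Editor's note: besides the logging swap, this file gains a few behavioural changes: compute suppresses console output only when verbose mode is off, a __del__ hook closes the underlying evaluate metric's writer when the metric object is discarded, and BERTScore now runs with device="auto" and batch_size=1 instead of CPU with batch size 16. For context, a standalone sketch of the evaluate API that HuggingFaceMetric wraps (illustrative only; EuroEval itself routes the call through no_terminal_output as shown above):

# Illustrative only: load an `evaluate` metric once and compute it on toy data.
# "matthews_correlation" is the same metric family as EuroEval's `mcc_metric`.
import evaluate

mcc = evaluate.load("matthews_correlation")
results = mcc.compute(predictions=[0, 1, 1, 0], references=[0, 1, 0, 0])
print(round(results["matthews_correlation"], 3))  # 0.577 for this toy example
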
euroeval/metrics/llm_as_a_judge.py CHANGED
@@ -8,6 +8,7 @@ from pathlib import Path
 from pydantic import BaseModel, Field
 
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
 from ..model_cache import ModelCache
 from ..utils import extract_json_dict_from_string
 from .base import Metric
@@ -17,8 +18,6 @@
 
     from ..data_models import BenchmarkConfig, DatasetConfig
 
-logger: logging.Logger = logging.getLogger("euroeval")
-
 
 class LLMAsAJudgeMetric(Metric):
     """Use an LLM to judge the quality of the predictions."""
@@ -190,7 +189,10 @@
         # Calculate the scores using the scoring function
         scores = [self.scoring_fn(output) for output in outputs]
         if not scores:
-            logger.warning(f"No scores were calculated for {self.pretty_name}.")
+            log(
+                f"No scores were calculated for {self.pretty_name}.",
+                level=logging.WARNING,
+            )
             return None
         return sum(scores) / len(scores)
 
euroeval/metrics/pipeline.py CHANGED
@@ -11,6 +11,7 @@ import numpy as np
 from scipy.special import expit as sigmoid
 
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log, no_terminal_output
 from ..utils import unscramble
 from .base import Metric
 
@@ -20,8 +21,6 @@
 
     from ..data_models import BenchmarkConfig, DatasetConfig
 
-logger: logging.Logger = logging.getLogger("euroeval")
-
 
 T = t.TypeVar("T", bound=int | float | str | bool)
 
@@ -121,16 +120,22 @@
             The calculated metric score, or None if the score should be ignored.
         """
         if self.pipeline is None:
-            self.pipeline = self._download_pipeline()
+            self.pipeline = self._download_pipeline(
+                cache_dir=benchmark_config.cache_dir
+            )
         if self.preprocessing_fn is not None:
             predictions = self.preprocessing_fn(
                 predictions=predictions, dataset=dataset
             )
         return self.pipeline_scoring_function(self.pipeline, predictions)
 
-    def _download_pipeline(self) -> "Pipeline":
+    def _download_pipeline(self, cache_dir: str) -> "Pipeline":
         """Download the scikit-learn pipeline from the given URL.
 
+        Args:
+            cache_dir:
+                The directory to use for caching the downloaded pipeline.
+
         Returns:
             The downloaded scikit-learn pipeline.
 
@@ -138,10 +143,13 @@
             InvalidBenchmark:
                 If the loading of the pipeline fails for any reason.
         """
-        logger.debug(f"Loading pipeline from {self.pipeline_repo}...")
-        folder_path = hf_hub.HfApi(
-            token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_")
-        ).snapshot_download(repo_id=self.pipeline_repo, repo_type="model")
+        log(f"Loading pipeline from {self.pipeline_repo}...", level=logging.DEBUG)
+        with no_terminal_output():
+            folder_path = hf_hub.HfApi(
+                token=unscramble("XbjeOLhwebEaSaDUMqqaPaPIhgOcyOfDpGnX_")
+            ).snapshot_download(
+                repo_id=self.pipeline_repo, repo_type="model", cache_dir=cache_dir
+            )
         model_path = Path(folder_path, self.pipeline_file_name)
         try:
             with model_path.open(mode="rb") as f:
@@ -150,7 +158,7 @@
             raise InvalidBenchmark(
                 f"Failed to load pipeline from {self.pipeline_repo!r}: {e}"
             ) from e
-        logger.debug(f"Successfully loaded pipeline: {pipeline}")
+        log(f"Successfully loaded pipeline: {pipeline}", level=logging.DEBUG)
         return pipeline
 
 
@@ -191,6 +199,11 @@ def european_values_preprocessing_fn(
         for idx, choice in idx_to_choice.items()
        if choice is not None
    }
+    if prediction not in idx_to_choice:
+        raise InvalidBenchmark(
+            f"The prediction {prediction} is not a valid index for the "
+            f"question with choices {idx_to_choice}."
+        )
     integer_prediction = idx_to_choice[prediction]
     integer_predictions.append(integer_prediction)
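
Editor's note: _download_pipeline now forwards the benchmark's cache directory to the Hugging Face Hub download and silences the hub's console output, while european_values_preprocessing_fn fails fast on prediction indices that have no corresponding choice. A standalone sketch of the download-and-unpickle pattern involved; the repo id and file name below are placeholders, not the ones EuroEval actually uses:

# Illustrative sketch only: fetch a model repo snapshot into an explicit cache
# directory, then unpickle a scikit-learn pipeline from it. Placeholder names.
import pickle
from pathlib import Path

from huggingface_hub import snapshot_download

folder = snapshot_download(
    repo_id="some-user/some-sklearn-pipeline",  # placeholder, not EuroEval's repo
    repo_type="model",
    cache_dir="/tmp/hf-cache",  # the diff forwards benchmark_config.cache_dir here
)
with Path(folder, "pipeline.pkl").open(mode="rb") as f:  # placeholder file name
    pipeline = pickle.load(f)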