EuroEval 16.3.0-py3-none-any.whl → 16.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78)
  1. euroeval/__init__.py +9 -2
  2. euroeval/benchmark_config_factory.py +51 -50
  3. euroeval/benchmark_modules/base.py +9 -21
  4. euroeval/benchmark_modules/fresh.py +2 -1
  5. euroeval/benchmark_modules/hf.py +101 -71
  6. euroeval/benchmark_modules/litellm.py +115 -53
  7. euroeval/benchmark_modules/vllm.py +107 -92
  8. euroeval/benchmarker.py +144 -121
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +86 -8
  12. euroeval/constants.py +9 -0
  13. euroeval/data_loading.py +80 -29
  14. euroeval/data_models.py +338 -330
  15. euroeval/dataset_configs/__init__.py +12 -3
  16. euroeval/dataset_configs/bulgarian.py +56 -0
  17. euroeval/dataset_configs/czech.py +75 -0
  18. euroeval/dataset_configs/danish.py +55 -93
  19. euroeval/dataset_configs/dutch.py +48 -87
  20. euroeval/dataset_configs/english.py +45 -77
  21. euroeval/dataset_configs/estonian.py +42 -34
  22. euroeval/dataset_configs/faroese.py +19 -60
  23. euroeval/dataset_configs/finnish.py +36 -69
  24. euroeval/dataset_configs/french.py +39 -75
  25. euroeval/dataset_configs/german.py +45 -82
  26. euroeval/dataset_configs/greek.py +64 -0
  27. euroeval/dataset_configs/icelandic.py +54 -91
  28. euroeval/dataset_configs/italian.py +42 -79
  29. euroeval/dataset_configs/latvian.py +28 -35
  30. euroeval/dataset_configs/lithuanian.py +28 -26
  31. euroeval/dataset_configs/norwegian.py +72 -115
  32. euroeval/dataset_configs/polish.py +33 -61
  33. euroeval/dataset_configs/portuguese.py +33 -66
  34. euroeval/dataset_configs/serbian.py +64 -0
  35. euroeval/dataset_configs/slovak.py +55 -0
  36. euroeval/dataset_configs/spanish.py +42 -77
  37. euroeval/dataset_configs/swedish.py +52 -90
  38. euroeval/dataset_configs/ukrainian.py +64 -0
  39. euroeval/exceptions.py +1 -1
  40. euroeval/finetuning.py +24 -17
  41. euroeval/generation.py +15 -14
  42. euroeval/generation_utils.py +8 -8
  43. euroeval/languages.py +395 -323
  44. euroeval/logging_utils.py +250 -0
  45. euroeval/metrics/base.py +0 -3
  46. euroeval/metrics/huggingface.py +21 -6
  47. euroeval/metrics/llm_as_a_judge.py +6 -4
  48. euroeval/metrics/pipeline.py +17 -9
  49. euroeval/metrics/speed.py +0 -3
  50. euroeval/model_cache.py +17 -19
  51. euroeval/model_config.py +4 -5
  52. euroeval/model_loading.py +3 -0
  53. euroeval/prompt_templates/__init__.py +2 -0
  54. euroeval/prompt_templates/classification.py +206 -0
  55. euroeval/prompt_templates/linguistic_acceptability.py +99 -42
  56. euroeval/prompt_templates/multiple_choice.py +102 -38
  57. euroeval/prompt_templates/named_entity_recognition.py +172 -51
  58. euroeval/prompt_templates/reading_comprehension.py +119 -42
  59. euroeval/prompt_templates/sentiment_classification.py +110 -40
  60. euroeval/prompt_templates/summarization.py +85 -40
  61. euroeval/prompt_templates/token_classification.py +279 -0
  62. euroeval/scores.py +11 -10
  63. euroeval/speed_benchmark.py +5 -6
  64. euroeval/task_group_utils/multiple_choice_classification.py +2 -4
  65. euroeval/task_group_utils/question_answering.py +24 -16
  66. euroeval/task_group_utils/sequence_classification.py +48 -35
  67. euroeval/task_group_utils/text_to_text.py +19 -9
  68. euroeval/task_group_utils/token_classification.py +21 -17
  69. euroeval/tasks.py +44 -1
  70. euroeval/tokenisation_utils.py +33 -22
  71. euroeval/types.py +10 -9
  72. euroeval/utils.py +35 -149
  73. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +196 -39
  74. euroeval-16.5.0.dist-info/RECORD +81 -0
  75. euroeval-16.3.0.dist-info/RECORD +0 -71
  76. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
  77. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
  78. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py CHANGED
@@ -21,7 +21,8 @@ if os.getenv("FULL_LOG") != "1":
     os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
 
 # Set up logging
-fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
+# fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
+fmt = colored("%(message)s", "light_yellow")
 logging.basicConfig(
     level=logging.CRITICAL if hasattr(sys, "_called_from_test") else logging.INFO,
     format=fmt,
@@ -50,7 +51,13 @@ import importlib.metadata  # noqa: E402
 from dotenv import load_dotenv  # noqa: E402
 
 from .benchmarker import Benchmarker  # noqa: E402
-from .utils import block_terminal_output  # noqa: E402
+from .data_models import DatasetConfig  # noqa: E402
+from .logging_utils import block_terminal_output  # noqa: E402
+from .tasks import (  # noqa: E402
+    MULTIPLE_CHOICE,
+    TEXT_CLASSIFICATION,
+    TOKEN_CLASSIFICATION,
+)
 
 # Block unwanted terminal outputs. This blocks way more than the above, but since it
 # relies on importing from the `utils` module, external modules are already imported
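
Besides the simplified log format, the new imports suggest that `DatasetConfig` and the task constants `MULTIPLE_CHOICE`, `TEXT_CLASSIFICATION` and `TOKEN_CLASSIFICATION` are now meant to be usable from the package root alongside `Benchmarker`. A minimal sketch of what that would look like, assuming the names imported above are indeed intended for re-export (this diff shows the imports, not an `__all__` declaration):

    # Sketch only: assumes the names imported in euroeval/__init__.py above are
    # available from the package root.
    from euroeval import Benchmarker, DatasetConfig, MULTIPLE_CHOICE

    # DatasetConfig instances and Task constants can then be handed to the
    # benchmarking entry points that accept them (see the factory changes below).
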
euroeval/benchmark_config_factory.py CHANGED
@@ -1,23 +1,20 @@
 """Factory class for creating dataset configurations."""
 
-import logging
+import collections.abc as c
 import sys
 import typing as t
 
 import torch
 
-from .data_models import BenchmarkConfig, BenchmarkConfigParams
+from .data_models import BenchmarkConfig, BenchmarkConfigParams, DatasetConfig, Task
 from .dataset_configs import get_all_dataset_configs
 from .enums import Device
 from .exceptions import InvalidBenchmark
 from .languages import get_all_languages
-from .tasks import SPEED, get_all_tasks
+from .tasks import get_all_tasks
 
 if t.TYPE_CHECKING:
-    from .data_models import Language, Task
-
-
-logger = logging.getLogger("euroeval")
+    from .data_models import Language
 
 
 def build_benchmark_config(
@@ -44,7 +41,7 @@ def build_benchmark_config(
         default_language_codes=language_codes,
     )
 
-    tasks, datasets = prepare_tasks_and_datasets(
+    dataset_configs = prepare_dataset_configs(
         task=benchmark_config_params.task,
         dataset=benchmark_config_params.dataset,
         dataset_languages=dataset_languages,
@@ -53,8 +50,7 @@ def build_benchmark_config(
     return BenchmarkConfig(
         model_languages=model_languages,
         dataset_languages=dataset_languages,
-        tasks=tasks,
-        datasets=datasets,
+        datasets=dataset_configs,
         batch_size=benchmark_config_params.batch_size,
         raise_errors=benchmark_config_params.raise_errors,
         cache_dir=benchmark_config_params.cache_dir,
@@ -84,7 +80,9 @@ def build_benchmark_config(
     )
 
 
-def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
+def get_correct_language_codes(
+    language_codes: str | c.Sequence[str],
+) -> c.Sequence[str]:
     """Get correct language code(s).
 
     Args:
@@ -105,7 +103,7 @@ def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
     elif isinstance(language_codes, str):
         languages = [language_codes]
     else:
-        languages = language_codes
+        languages = list(language_codes)
 
     # If `languages` contains 'no' then also include 'nb' and 'nn'. Conversely, if
     # either 'nb' or 'nn' are specified then also include 'no'.
@@ -118,8 +116,9 @@ def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
 
 
 def prepare_languages(
-    language_codes: str | list[str] | None, default_language_codes: list[str]
-) -> list["Language"]:
+    language_codes: str | c.Sequence[str] | None,
+    default_language_codes: c.Sequence[str],
+) -> c.Sequence["Language"]:
     """Prepare language(s) for benchmarking.
 
     Args:
@@ -137,7 +136,7 @@ def prepare_languages(
     language_mapping = get_all_languages()
 
     # Create the list `languages_str` of language codes to use for models or datasets
-    languages_str: list[str]
+    languages_str: c.Sequence[str]
    if language_codes is None:
         languages_str = default_language_codes
     elif isinstance(language_codes, str):
@@ -154,12 +153,12 @@ def prepare_languages(
     return prepared_languages
 
 
-def prepare_tasks_and_datasets(
-    task: str | list[str] | None,
-    dataset_languages: list["Language"],
-    dataset: str | list[str] | None,
-) -> tuple[list["Task"], list[str]]:
-    """Prepare task(s) and dataset(s) for benchmarking.
+def prepare_dataset_configs(
+    task: "str | Task | c.Sequence[str | Task] | None",
+    dataset_languages: c.Sequence["Language"],
+    dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None",
+) -> c.Sequence["DatasetConfig"]:
+    """Prepare dataset config(s) for benchmarking.
 
     Args:
         task:
@@ -172,56 +171,58 @@ def prepare_tasks_and_datasets(
         included, limited by the `task` and `dataset_languages` parameters.
 
     Returns:
-        The prepared tasks and datasets.
+        The prepared dataset configs.
 
     Raises:
         InvalidBenchmark:
             If the task or dataset is not found in the benchmark tasks or datasets.
     """
-    # Create a dictionary that maps benchmark tasks to their associated benchmark
-    # task objects, and a dictionary that maps dataset names to their associated
-    # dataset configuration objects
-    task_mapping = get_all_tasks()
-    all_dataset_configs = get_all_dataset_configs()
-
     # Create the list of dataset tasks
+    task_mapping = get_all_tasks()
     try:
         if task is None:
-            tasks = [t for t in task_mapping.values() if t != SPEED]
+            tasks = None
         elif isinstance(task, str):
             tasks = [task_mapping[task]]
+        elif isinstance(task, Task):
+            tasks = [task]
         else:
-            tasks = [task_mapping[t] for t in task]
+            tasks = [task_mapping[t] if isinstance(t, str) else t for t in task]
     except KeyError as e:
         raise InvalidBenchmark(f"Task {e} not found in the benchmark tasks.") from e
 
-    all_official_datasets = [
-        dataset_name
-        for dataset_name, dataset_config in all_dataset_configs.items()
+    # Create the list of dataset configs
+    all_dataset_configs = get_all_dataset_configs()
+    all_official_dataset_configs: c.Sequence[DatasetConfig] = [
+        dataset_config
+        for dataset_config in all_dataset_configs.values()
         if not dataset_config.unofficial
     ]
-    if dataset is None:
-        dataset = all_official_datasets
-    elif isinstance(dataset, str):
-        dataset = [dataset]
-
-    all_datasets = list(all_dataset_configs.keys())
-    invalid_datasets = set(dataset) - set(all_datasets)
-    if invalid_datasets:
+    try:
+        if dataset is None:
+            datasets = all_official_dataset_configs
+        elif isinstance(dataset, str):
+            datasets = [all_dataset_configs[dataset]]
+        elif isinstance(dataset, DatasetConfig):
+            datasets = [dataset]
+        else:
+            datasets = [
+                all_dataset_configs[d] if isinstance(d, str) else d for d in dataset
+            ]
+    except KeyError as e:
         raise InvalidBenchmark(
-            f"Dataset(s) {', '.join(invalid_datasets)} not found in the benchmark "
-            "datasets."
-        )
+            f"Dataset {e} not found in the benchmark datasets."
+        ) from e
 
+    # Filter the dataset configs based on the specified tasks and languages
     datasets = [
-        dataset_name
-        for dataset_name, dataset_config in all_dataset_configs.items()
-        if dataset_name in dataset
-        and dataset_config.task in tasks
-        and set(dataset_config.languages).intersection(dataset_languages)
+        ds
+        for ds in datasets
+        if (tasks is None or ds.task in tasks)
+        and any(lang in dataset_languages for lang in ds.languages)
     ]
 
-    return tasks, datasets
+    return datasets
 
 
 def prepare_device(device: Device | None) -> torch.device:
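
The rewritten helper now accepts tasks and datasets either as names or as `Task`/`DatasetConfig` objects (or sequences mixing both), and it returns the filtered `DatasetConfig` objects directly instead of a `(tasks, dataset_names)` tuple; an unknown name surfaces as an `InvalidBenchmark` raised from the underlying `KeyError`. A minimal usage sketch, assuming only that the "da" language code is registered in `get_all_languages()`:

    # Illustrative sketch of driving the new prepare_dataset_configs; the "da"
    # language code is an assumption used only for illustration.
    from euroeval.benchmark_config_factory import prepare_dataset_configs
    from euroeval.languages import get_all_languages

    danish = get_all_languages()["da"]
    configs = prepare_dataset_configs(
        task=None,                  # or a task name, a Task, or a sequence of either
        dataset=None,               # None keeps every official dataset
        dataset_languages=[danish],
    )
    print(len(configs), "dataset configs selected")

Note that `task=None` no longer expands to "every task except SPEED": the task filter is simply skipped, so the dataset selection and languages alone decide what is kept.
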
euroeval/benchmark_modules/base.py CHANGED
@@ -3,24 +3,22 @@
 import collections.abc as c
 import logging
 import re
-import sys
 import typing as t
 from abc import ABC, abstractmethod
 from functools import cached_property, partial
 
 from datasets import Dataset, DatasetDict
 from torch import nn
-from tqdm.auto import tqdm
 
 from ..enums import TaskGroup
 from ..exceptions import InvalidBenchmark, NeedsEnvironmentVariable, NeedsExtraInstalled
+from ..logging_utils import get_pbar, log_once
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,
     token_classification,
 )
-from ..utils import log_once
 
 if t.TYPE_CHECKING:
     from transformers.tokenization_utils import PreTrainedTokenizer
@@ -36,8 +34,6 @@ if t.TYPE_CHECKING:
     from ..enums import BatchingPreference, GenerativeType
     from ..types import ComputeMetricsFunction, ExtractLabelsFunction
 
-logger = logging.getLogger("euroeval")
-
 
 class BenchmarkModule(ABC):
     """Abstract class for a benchmark module.
@@ -56,7 +52,7 @@ class BenchmarkModule(ABC):
     fresh_model: bool
     batching_preference: "BatchingPreference"
     high_priority: bool
-    allowed_params: dict[re.Pattern, list[str]] = {re.compile(r".*"): []}
+    allowed_params: dict[re.Pattern, c.Sequence[str]] = {re.compile(r".*"): []}
 
     def __init__(
         self,
@@ -87,20 +83,12 @@ class BenchmarkModule(ABC):
 
     def _log_metadata(self) -> None:
         """Log the metadata of the model."""
-        # Set logging level based on verbosity
-        if hasattr(sys, "_called_from_test"):
-            logging_level = logging.CRITICAL
-        elif self.benchmark_config.verbose:
-            logging_level = logging.DEBUG
-        else:
-            logging_level = logging.INFO
-        logger.setLevel(logging_level)
-
-        logging_msg: str = ""
+        model_id = self.model_config.model_id
+        logging_msg: str = ""
         if self.num_params < 0:
-            logging_msg += "The model has an unknown number of parameters, "
+            logging_msg += f"The model {model_id} has an unknown number of parameters, "
         else:
-            logging_msg += f"The model has {self.num_params:,} parameters, "
+            logging_msg += f"The model {model_id} has {self.num_params:,} parameters, "
         if self.vocab_size < 0:
             logging_msg += "an unknown vocabulary size, "
         else:
@@ -179,7 +167,7 @@ class BenchmarkModule(ABC):
 
     @property
     @abstractmethod
-    def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
+    def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.
 
         Returns:
@@ -253,7 +241,7 @@ class BenchmarkModule(ABC):
 
     def prepare_datasets(
         self, datasets: list[DatasetDict], task: "Task"
-    ) -> list[DatasetDict]:
+    ) -> c.Sequence[DatasetDict]:
         """Prepare the datasets for the model.
 
         This includes things like tokenisation.
@@ -273,7 +261,7 @@ class BenchmarkModule(ABC):
             tasks.
         """
         for idx, dataset in enumerate(
-            tqdm(iterable=datasets, desc="Preparing datasets")
+            get_pbar(iterable=datasets, desc="Preparing datasets")
         ):
             prepared_dataset = self.prepare_dataset(
                 dataset=dataset, task=task, itr_idx=idx
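
Module-level logger configuration has moved out of this file, and the progress bar is now obtained through `get_pbar` from the new `logging_utils` module instead of calling `tqdm` directly. The call site above only requires something that accepts `iterable` and `desc` keyword arguments and yields the items; a hypothetical stand-in under that assumption (the real implementation in euroeval/logging_utils.py is not part of this excerpt):

    # Hypothetical stand-in for get_pbar; the actual implementation lives in
    # euroeval/logging_utils.py and is not shown in this diff.
    import typing as t

    from tqdm.auto import tqdm

    def get_pbar(iterable: t.Iterable, desc: str, **tqdm_kwargs: t.Any) -> tqdm:
        """Return a progress bar over `iterable`, centralising tqdm configuration."""
        return tqdm(iterable=iterable, desc=desc, **tqdm_kwargs)
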
euroeval/benchmark_modules/fresh.py CHANGED
@@ -27,7 +27,8 @@ from ..exceptions import (
     NeedsExtraInstalled,
 )
 from ..generation_utils import raise_if_wrong_params
-from ..utils import block_terminal_output, create_model_cache_dir, get_hf_token
+from ..logging_utils import block_terminal_output
+from ..utils import create_model_cache_dir, get_hf_token
 from .hf import (
     HuggingFaceEncoderModel,
     align_model_and_tokeniser,