EuroEval 16.4.0-py3-none-any.whl → 16.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (71)
  1. euroeval/__init__.py +6 -0
  2. euroeval/benchmark_config_factory.py +51 -46
  3. euroeval/benchmark_modules/base.py +6 -5
  4. euroeval/benchmark_modules/hf.py +2 -9
  5. euroeval/benchmark_modules/litellm.py +14 -12
  6. euroeval/benchmark_modules/vllm.py +17 -10
  7. euroeval/benchmarker.py +61 -44
  8. euroeval/caching_utils.py +1 -1
  9. euroeval/cli.py +86 -8
  10. euroeval/constants.py +3 -0
  11. euroeval/data_loading.py +78 -30
  12. euroeval/data_models.py +326 -326
  13. euroeval/dataset_configs/__init__.py +10 -3
  14. euroeval/dataset_configs/bulgarian.py +56 -0
  15. euroeval/dataset_configs/czech.py +25 -29
  16. euroeval/dataset_configs/danish.py +51 -88
  17. euroeval/dataset_configs/dutch.py +48 -86
  18. euroeval/dataset_configs/english.py +45 -76
  19. euroeval/dataset_configs/estonian.py +36 -38
  20. euroeval/dataset_configs/faroese.py +19 -60
  21. euroeval/dataset_configs/finnish.py +36 -68
  22. euroeval/dataset_configs/french.py +39 -74
  23. euroeval/dataset_configs/german.py +45 -81
  24. euroeval/dataset_configs/greek.py +64 -0
  25. euroeval/dataset_configs/icelandic.py +54 -91
  26. euroeval/dataset_configs/italian.py +42 -78
  27. euroeval/dataset_configs/latvian.py +28 -34
  28. euroeval/dataset_configs/lithuanian.py +22 -26
  29. euroeval/dataset_configs/norwegian.py +72 -114
  30. euroeval/dataset_configs/polish.py +33 -60
  31. euroeval/dataset_configs/portuguese.py +33 -65
  32. euroeval/dataset_configs/serbian.py +64 -0
  33. euroeval/dataset_configs/slovak.py +19 -24
  34. euroeval/dataset_configs/spanish.py +42 -76
  35. euroeval/dataset_configs/swedish.py +48 -84
  36. euroeval/dataset_configs/ukrainian.py +64 -0
  37. euroeval/exceptions.py +1 -1
  38. euroeval/finetuning.py +3 -2
  39. euroeval/generation.py +5 -4
  40. euroeval/generation_utils.py +6 -5
  41. euroeval/languages.py +395 -323
  42. euroeval/metrics/huggingface.py +14 -3
  43. euroeval/metrics/llm_as_a_judge.py +1 -1
  44. euroeval/model_cache.py +6 -5
  45. euroeval/model_loading.py +1 -1
  46. euroeval/prompt_templates/__init__.py +2 -0
  47. euroeval/prompt_templates/classification.py +206 -0
  48. euroeval/prompt_templates/linguistic_acceptability.py +82 -43
  49. euroeval/prompt_templates/multiple_choice.py +81 -41
  50. euroeval/prompt_templates/named_entity_recognition.py +125 -44
  51. euroeval/prompt_templates/reading_comprehension.py +92 -43
  52. euroeval/prompt_templates/sentiment_classification.py +91 -43
  53. euroeval/prompt_templates/summarization.py +64 -39
  54. euroeval/prompt_templates/token_classification.py +279 -0
  55. euroeval/scores.py +4 -3
  56. euroeval/speed_benchmark.py +2 -1
  57. euroeval/task_group_utils/multiple_choice_classification.py +2 -1
  58. euroeval/task_group_utils/question_answering.py +24 -13
  59. euroeval/task_group_utils/sequence_classification.py +5 -4
  60. euroeval/task_group_utils/text_to_text.py +2 -1
  61. euroeval/task_group_utils/token_classification.py +11 -8
  62. euroeval/tasks.py +44 -1
  63. euroeval/tokenisation_utils.py +19 -10
  64. euroeval/types.py +10 -9
  65. euroeval/utils.py +6 -3
  66. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
  67. euroeval-16.5.0.dist-info/RECORD +81 -0
  68. euroeval-16.4.0.dist-info/RECORD +0 -75
  69. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
  70. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
  71. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py CHANGED
@@ -51,7 +51,13 @@ import importlib.metadata # noqa: E402
  from dotenv import load_dotenv # noqa: E402

  from .benchmarker import Benchmarker # noqa: E402
+ from .data_models import DatasetConfig # noqa: E402
  from .logging_utils import block_terminal_output # noqa: E402
+ from .tasks import ( # noqa: E402
+     MULTIPLE_CHOICE,
+     TEXT_CLASSIFICATION,
+     TOKEN_CLASSIFICATION,
+ )

  # Block unwanted terminal outputs. This blocks way more than the above, but since it
  # relies on importing from the `utils` module, external modules are already imported
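The `__init__.py` change above re-exports `DatasetConfig` and the `MULTIPLE_CHOICE`, `TEXT_CLASSIFICATION` and `TOKEN_CLASSIFICATION` task objects at the package root. A minimal sketch of the resulting import surface (illustrative only, not part of the diff):

from euroeval import (
    MULTIPLE_CHOICE,
    TEXT_CLASSIFICATION,
    TOKEN_CLASSIFICATION,
    Benchmarker,
    DatasetConfig,
)

# The re-exported task objects can be passed straight to `Benchmarker.benchmark`
# (see the euroeval/benchmarker.py diff below), e.g. `task=TEXT_CLASSIFICATION`.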
euroeval/benchmark_config_factory.py CHANGED
@@ -1,19 +1,20 @@
  """Factory class for creating dataset configurations."""

+ import collections.abc as c
  import sys
  import typing as t

  import torch

- from .data_models import BenchmarkConfig, BenchmarkConfigParams
+ from .data_models import BenchmarkConfig, BenchmarkConfigParams, DatasetConfig, Task
  from .dataset_configs import get_all_dataset_configs
  from .enums import Device
  from .exceptions import InvalidBenchmark
  from .languages import get_all_languages
- from .tasks import SPEED, get_all_tasks
+ from .tasks import get_all_tasks

  if t.TYPE_CHECKING:
-     from .data_models import Language, Task
+     from .data_models import Language


  def build_benchmark_config(
@@ -40,7 +41,7 @@ def build_benchmark_config(
          default_language_codes=language_codes,
      )

-     tasks, datasets = prepare_tasks_and_datasets(
+     dataset_configs = prepare_dataset_configs(
          task=benchmark_config_params.task,
          dataset=benchmark_config_params.dataset,
          dataset_languages=dataset_languages,
@@ -49,8 +50,7 @@ def build_benchmark_config(
      return BenchmarkConfig(
          model_languages=model_languages,
          dataset_languages=dataset_languages,
-         tasks=tasks,
-         datasets=datasets,
+         datasets=dataset_configs,
          batch_size=benchmark_config_params.batch_size,
          raise_errors=benchmark_config_params.raise_errors,
          cache_dir=benchmark_config_params.cache_dir,
@@ -80,7 +80,9 @@ def build_benchmark_config(
      )


- def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
+ def get_correct_language_codes(
+     language_codes: str | c.Sequence[str],
+ ) -> c.Sequence[str]:
      """Get correct language code(s).

      Args:
@@ -101,7 +103,7 @@ def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
      elif isinstance(language_codes, str):
          languages = [language_codes]
      else:
-         languages = language_codes
+         languages = list(language_codes)

      # If `languages` contains 'no' then also include 'nb' and 'nn'. Conversely, if
      # either 'nb' or 'nn' are specified then also include 'no'.
@@ -114,8 +116,9 @@ def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:


  def prepare_languages(
-     language_codes: str | list[str] | None, default_language_codes: list[str]
- ) -> list["Language"]:
+     language_codes: str | c.Sequence[str] | None,
+     default_language_codes: c.Sequence[str],
+ ) -> c.Sequence["Language"]:
      """Prepare language(s) for benchmarking.

      Args:
@@ -133,7 +136,7 @@ def prepare_languages(
      language_mapping = get_all_languages()

      # Create the list `languages_str` of language codes to use for models or datasets
-     languages_str: list[str]
+     languages_str: c.Sequence[str]
      if language_codes is None:
          languages_str = default_language_codes
      elif isinstance(language_codes, str):
@@ -150,12 +153,12 @@ def prepare_languages(
      return prepared_languages


- def prepare_tasks_and_datasets(
-     task: str | list[str] | None,
-     dataset_languages: list["Language"],
-     dataset: str | list[str] | None,
- ) -> tuple[list["Task"], list[str]]:
-     """Prepare task(s) and dataset(s) for benchmarking.
+ def prepare_dataset_configs(
+     task: "str | Task | c.Sequence[str | Task] | None",
+     dataset_languages: c.Sequence["Language"],
+     dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None",
+ ) -> c.Sequence["DatasetConfig"]:
+     """Prepare dataset config(s) for benchmarking.

      Args:
          task:
@@ -168,56 +171,58 @@ def prepare_dataset_configs(
              included, limited by the `task` and `dataset_languages` parameters.

      Returns:
-         The prepared tasks and datasets.
+         The prepared dataset configs.

      Raises:
          InvalidBenchmark:
              If the task or dataset is not found in the benchmark tasks or datasets.
      """
-     # Create a dictionary that maps benchmark tasks to their associated benchmark
-     # task objects, and a dictionary that maps dataset names to their associated
-     # dataset configuration objects
-     task_mapping = get_all_tasks()
-     all_dataset_configs = get_all_dataset_configs()
-
      # Create the list of dataset tasks
+     task_mapping = get_all_tasks()
      try:
          if task is None:
-             tasks = [t for t in task_mapping.values() if t != SPEED]
+             tasks = None
          elif isinstance(task, str):
              tasks = [task_mapping[task]]
+         elif isinstance(task, Task):
+             tasks = [task]
          else:
-             tasks = [task_mapping[t] for t in task]
+             tasks = [task_mapping[t] if isinstance(t, str) else t for t in task]
      except KeyError as e:
          raise InvalidBenchmark(f"Task {e} not found in the benchmark tasks.") from e

-     all_official_datasets = [
-         dataset_name
-         for dataset_name, dataset_config in all_dataset_configs.items()
+     # Create the list of dataset configs
+     all_dataset_configs = get_all_dataset_configs()
+     all_official_dataset_configs: c.Sequence[DatasetConfig] = [
+         dataset_config
+         for dataset_config in all_dataset_configs.values()
          if not dataset_config.unofficial
      ]
-     if dataset is None:
-         dataset = all_official_datasets
-     elif isinstance(dataset, str):
-         dataset = [dataset]
-
-     all_datasets = list(all_dataset_configs.keys())
-     invalid_datasets = set(dataset) - set(all_datasets)
-     if invalid_datasets:
+     try:
+         if dataset is None:
+             datasets = all_official_dataset_configs
+         elif isinstance(dataset, str):
+             datasets = [all_dataset_configs[dataset]]
+         elif isinstance(dataset, DatasetConfig):
+             datasets = [dataset]
+         else:
+             datasets = [
+                 all_dataset_configs[d] if isinstance(d, str) else d for d in dataset
+             ]
+     except KeyError as e:
          raise InvalidBenchmark(
-             f"Dataset(s) {', '.join(invalid_datasets)} not found in the benchmark "
-             "datasets."
-         )
+             f"Dataset {e} not found in the benchmark datasets."
+         ) from e

+     # Filter the dataset configs based on the specified tasks and languages
      datasets = [
-         dataset_name
-         for dataset_name, dataset_config in all_dataset_configs.items()
-         if dataset_name in dataset
-         and dataset_config.task in tasks
-         and set(dataset_config.languages).intersection(dataset_languages)
+         ds
+         for ds in datasets
+         if (tasks is None or ds.task in tasks)
+         and any(lang in dataset_languages for lang in ds.languages)
      ]

-     return tasks, datasets
+     return datasets


  def prepare_device(device: Device | None) -> torch.device:
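For orientation, a hedged sketch of how the reworked helper can be called; `prepare_dataset_configs`, `get_all_tasks` and `get_all_languages` are internal APIs taken from the imports in the diff above, and the slicing of the task mapping is purely illustrative:

from euroeval.benchmark_config_factory import prepare_dataset_configs
from euroeval.languages import get_all_languages
from euroeval.tasks import get_all_tasks

all_tasks = get_all_tasks()          # mapping from task name to Task object
all_languages = get_all_languages()  # mapping from language code to Language object

# Task objects (or a mix of names and objects) are accepted directly, and the
# function now returns DatasetConfig objects instead of plain dataset names.
configs = prepare_dataset_configs(
    task=list(all_tasks.values())[:1],
    dataset=None,  # None selects all official datasets for the chosen tasks/languages
    dataset_languages=list(all_languages.values()),
)
print(f"Selected {len(configs)} dataset configs")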
euroeval/benchmark_modules/base.py CHANGED
@@ -52,7 +52,7 @@ class BenchmarkModule(ABC):
      fresh_model: bool
      batching_preference: "BatchingPreference"
      high_priority: bool
-     allowed_params: dict[re.Pattern, list[str]] = {re.compile(r".*"): []}
+     allowed_params: dict[re.Pattern, c.Sequence[str]] = {re.compile(r".*"): []}

      def __init__(
          self,
@@ -83,11 +83,12 @@ class BenchmarkModule(ABC):

      def _log_metadata(self) -> None:
          """Log the metadata of the model."""
+         model_id = self.model_config.model_id
          logging_msg: str = " ↳ "
          if self.num_params < 0:
-             logging_msg += "The model has an unknown number of parameters, "
+             logging_msg += f"The model {model_id} has an unknown number of parameters, "
          else:
-             logging_msg += f"The model has {self.num_params:,} parameters, "
+             logging_msg += f"The model {model_id} has {self.num_params:,} parameters, "
          if self.vocab_size < 0:
              logging_msg += "an unknown vocabulary size, "
          else:
@@ -166,7 +167,7 @@ class BenchmarkModule(ABC):

      @property
      @abstractmethod
-     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
+     def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
          """The data collator used to prepare samples during finetuning.

          Returns:
@@ -240,7 +241,7 @@ class BenchmarkModule(ABC):

      def prepare_datasets(
          self, datasets: list[DatasetDict], task: "Task"
-     ) -> list[DatasetDict]:
+     ) -> c.Sequence[DatasetDict]:
          """Prepare the datasets for the model.

          This includes things like tokenisation.
euroeval/benchmark_modules/hf.py CHANGED
@@ -267,7 +267,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
          return model_max_length

      @property
-     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
+     def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
          """The data collator used to prepare samples during finetuning.

          Returns:
@@ -775,15 +775,8 @@ def get_model_repo_info(
              level=logging.DEBUG,
          )
          return None
-     except (RepositoryNotFoundError, HFValidationError):
+     except (RepositoryNotFoundError, HFValidationError, HfHubHTTPError):
          return None
-     except HfHubHTTPError as e:
-         if "unauthorized" in str(e).lower():
-             raise InvalidModel(
-                 "It seems like your specified Hugging Face API key is invalid. "
-                 "Please double-check your API key."
-             ) from e
-         raise InvalidModel(str(e)) from e
      except (OSError, RequestException) as e:
          if internet_connection_available():
              errors.append(e)
euroeval/benchmark_modules/litellm.py CHANGED
@@ -310,7 +310,7 @@ class LiteLLMModel(BenchmarkModule):
              InvalidBenchmark:
                  If the inputs do not contain either 'messages' or 'text' keys.
          """
-         model_inputs: list[list[litellm.AllMessageValues] | str]
+         model_inputs: c.Sequence[c.Sequence[litellm.AllMessageValues] | str]
          if "messages" in inputs:
              model_inputs = inputs["messages"]
          elif "text" in inputs:
@@ -331,9 +331,9 @@ class LiteLLMModel(BenchmarkModule):
          )

          all_responses: dict[int, "ModelResponse"] = {}
-         inputs_to_run: list[tuple[int, list[litellm.AllMessageValues] | str]] = list(
-             enumerate(model_inputs)
-         )
+         inputs_to_run: c.Sequence[
+             tuple[int, c.Sequence[litellm.AllMessageValues] | str]
+         ] = list(enumerate(model_inputs))
          for attempt in range(num_attempts := 10):
              if not inputs_to_run:
                  break
@@ -540,7 +540,7 @@ class LiteLLMModel(BenchmarkModule):
              )
              ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
              keys_and_their_types = {
-                 tag_name: (list[str], ...) for tag_name in ner_tag_names
+                 tag_name: (c.Sequence[str], ...) for tag_name in ner_tag_names
              }
              pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
              generation_kwargs["response_format"] = pydantic_class
@@ -686,9 +686,11 @@ class LiteLLMModel(BenchmarkModule):
      async def _generate_async(
          self,
          model_id: str,
-         inputs: list[list[litellm.AllMessageValues] | str],
+         inputs: c.Sequence[c.Sequence[litellm.AllMessageValues] | str],
          **generation_kwargs,
-     ) -> tuple[list[tuple[int, "ModelResponse"]], list[tuple[int, Exception]]]:
+     ) -> tuple[
+         c.Sequence[tuple[int, "ModelResponse"]], c.Sequence[tuple[int, Exception]]
+     ]:
          """Generate outputs from the model asynchronously.

          Args:
@@ -789,7 +791,7 @@ class LiteLLMModel(BenchmarkModule):

      @staticmethod
      def _create_model_output(
-         model_responses: list["ModelResponse"], model_id: str
+         model_responses: c.Sequence["ModelResponse"], model_id: str
      ) -> GenerativeModelOutput:
          """Create a GenerativeModelOutput object from a list of ModelResponse objects.

@@ -863,7 +865,7 @@ class LiteLLMModel(BenchmarkModule):
                  )
                  continue

-             logprobs_list: list[list[tuple[str, float]]]
+             logprobs_list: c.Sequence[c.Sequence[tuple[str, float]]]
              if isinstance(logprobs_obj, ChoiceLogprobs):
                  logprobs_list = [
                      [
@@ -1159,7 +1161,7 @@ class LiteLLMModel(BenchmarkModule):
          return -1

      @property
-     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
+     def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
          """The data collator used to prepare samples during finetuning.

          Returns:
@@ -1545,7 +1547,7 @@ class LiteLLMModel(BenchmarkModule):
          # First attempt is a test run with a single conversation to handle errors
          # quickly. We repeat this multiple times to deal with different types of
          # errors, and stop if we get a successful response.
-         test_input: list[litellm.AllMessageValues] | str
+         test_input: c.Sequence[litellm.AllMessageValues] | str
          if self.generative_type == GenerativeType.BASE:
              test_input = "Test message"
          else:
@@ -1604,7 +1606,7 @@ def try_download_ollama_model(model_id: str) -> bool:
      )

      try:
-         downloaded_ollama_models: list[str] = [
+         downloaded_ollama_models: c.Sequence[str] = [
              model_obj.model
              for model_obj in ollama.list().models
              if model_obj.model is not None
euroeval/benchmark_modules/vllm.py CHANGED
@@ -416,12 +416,18 @@ class VLLMModel(HuggingFaceEncoderModel):
                  json=structured_generation_schema
              )
          elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
-             structured_outputs = StructuredOutputsParams(
-                 choice=[
-                     self.dataset_config.prompt_label_mapping[label]
-                     for label in self.dataset_config.labels
+             choice_labels = [
+                 self.dataset_config.prompt_label_mapping[label]
+                 for label in self.dataset_config.labels
+             ]
+             if "first_label_token_mapping" in self.buffer and isinstance(
+                 self.buffer["first_label_token_mapping"], dict
+             ):
+                 choice_labels = [
+                     self.buffer["first_label_token_mapping"][label]
+                     for label in choice_labels
                  ]
-             )
+             structured_outputs = StructuredOutputsParams(choice=choice_labels)
              log_once(
                  "Using structured generation with the choices: "
                  f"{structured_outputs.choice!r}.",
@@ -452,7 +458,7 @@ class VLLMModel(HuggingFaceEncoderModel):

          # If any of the prompts are empty then we need to replace them with a BOS token
          # so that the vLLM model can generate from them
-         prompts: list[str] = inputs["text"]
+         prompts: c.Sequence[str] = inputs["text"]
          if any(len(prompt) == 0 for prompt in prompts):
              log("Found empty prompts, replacing with BOS token.", level=logging.DEBUG)
              prompts = [
@@ -556,13 +562,14 @@ class VLLMModel(HuggingFaceEncoderModel):
          )

          # Parse the raw model outputs
-         completion_ids: list[list[int]] = [
+         completion_ids: c.Sequence[c.Sequence[int]] = [
              list(output.outputs[0].token_ids) for output in raw_outputs
          ]
          completions = self._tokeniser.batch_decode(
              sequences=[
                  torch.LongTensor(completion_id) for completion_id in completion_ids
-             ]
+             ],
+             skip_special_tokens=True,
          )
          if (
              self.end_of_reasoning_token is not None
@@ -608,7 +615,7 @@ class VLLMModel(HuggingFaceEncoderModel):

          # Add logprobs scores to the output
          if self.buffer["first_label_token_mapping"]:
-             scores: list[list[list[tuple[str, float]]]] = [
+             scores: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]] = [
                  [
                      [
                          (obj.decoded_token or "", obj.logprob)
@@ -719,7 +726,7 @@ class VLLMModel(HuggingFaceEncoderModel):
          return model_config

      @property
-     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
+     def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
          """The data collator used to prepare samples during finetuning.

          Returns:
euroeval/benchmarker.py CHANGED
@@ -1,5 +1,6 @@
  """Class that benchmarks language models."""

+ import collections.abc as c
  import contextlib
  import datetime as dt
  import json
@@ -38,7 +39,7 @@ from .utils import (

  if t.TYPE_CHECKING:
      from .benchmark_modules import BenchmarkModule
-     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
+     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig, Task


  class Benchmarker:
@@ -62,11 +63,11 @@ class Benchmarker:
          self,
          progress_bar: bool = True,
          save_results: bool = True,
-         task: str | list[str] | None = None,
-         dataset: list[str] | str | None = None,
-         language: str | list[str] = "all",
-         model_language: str | list[str] | None = None,
-         dataset_language: str | list[str] | None = None,
+         task: "str | Task | c.Sequence[str | Task] | None" = None,
+         dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None" = None,
+         language: str | c.Sequence[str] = "all",
+         model_language: str | c.Sequence[str] | None = None,
+         dataset_language: str | c.Sequence[str] | None = None,
          device: Device | None = None,
          batch_size: int = 32,
          raise_errors: bool = False,
@@ -176,6 +177,8 @@ class Benchmarker:
              ValueError:
                  If both `task` and `dataset` are specified, or if `download_only`
                  is True and we have no internet connection.
+             ImportError:
+                 If `hf_transfer` is enabled but not installed.
          """
          if task is not None and dataset is not None:
              raise ValueError("Only one of `task` and `dataset` can be specified.")
@@ -236,13 +239,13 @@ class Benchmarker:
          )

          # Initialise variable storing model lists, so we only have to fetch it once
-         self._model_lists: dict[str, list[str]] | None = None
+         self._model_lists: dict[str, c.Sequence[str]] | None = None

          self.results_path = Path.cwd() / "euroeval_benchmark_results.jsonl"
          adjust_logging_level(verbose=self.benchmark_config.verbose)

      @property
-     def benchmark_results(self) -> list[BenchmarkResult]:
+     def benchmark_results(self) -> c.Sequence[BenchmarkResult]:
          """The benchmark results.

          Returns:
@@ -320,14 +323,14 @@ class Benchmarker:

      def benchmark(
          self,
-         model: list[str] | str,
-         task: str | list[str] | None = None,
-         dataset: list[str] | str | None = None,
+         model: c.Sequence[str] | str,
+         task: "str | Task | c.Sequence[str | Task] | None" = None,
+         dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None" = None,
          progress_bar: bool | None = None,
          save_results: bool | None = None,
-         language: str | list[str] | None = None,
-         model_language: str | list[str] | None = None,
-         dataset_language: str | list[str] | None = None,
+         language: str | c.Sequence[str] | None = None,
+         model_language: str | c.Sequence[str] | None = None,
+         dataset_language: str | c.Sequence[str] | None = None,
          device: Device | None = None,
          batch_size: int | None = None,
          raise_errors: bool | None = None,
@@ -347,7 +350,7 @@ class Benchmarker:
          force: bool | None = None,
          verbose: bool | None = None,
          debug: bool | None = None,
-     ) -> list[BenchmarkResult]:
+     ) -> c.Sequence[BenchmarkResult]:
          """Benchmarks models on datasets.

          Args:
@@ -605,9 +608,7 @@ class Benchmarker:
              clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)

          model_ids = self._prepare_model_ids(model_id=model)
-         dataset_configs = prepare_dataset_configs(
-             dataset_names=benchmark_config.datasets
-         )
+         dataset_configs = benchmark_config.datasets

          # Get all the model configs
          model_configs: list[ModelConfig] = list()
@@ -625,27 +626,40 @@ class Benchmarker:
              log(e.message, level=logging.ERROR)

          # Create a dictionary that takes each model config to the dataset configs that
-         # we need to benchmark the model on. Here we remove the datasets that the model
-         # has already been benchmarked on, or datasets that the model cannot be
-         # benchmarked on.
-         model_config_to_dataset_configs: dict[ModelConfig, list[DatasetConfig]] = {
+         # we need to benchmark the model on. We initially include all the relevant
+         # datasets for each model.
+         model_config_to_dataset_configs: dict[
+             ModelConfig, c.Sequence[DatasetConfig]
+         ] = {
              model_config: [
                  dataset_config
                  for dataset_config in dataset_configs
-                 if (
-                     benchmark_config.force
-                     or not model_has_been_benchmarked(
-                         model_config=model_config,
-                         dataset_config=dataset_config,
-                         benchmark_config=benchmark_config,
-                         benchmark_results=self.benchmark_results,
-                     )
-                 )
-                 and model_config.model_type in dataset_config.allowed_model_types
+                 if model_config.model_type in dataset_config.allowed_model_types
              ]
              for model_config in model_configs
          }

+         # Initialise the current benchmark results with all the ones that we have cached
+         # on disk already (can be none), and remove those datasets from the mapping
+         current_benchmark_results: list[BenchmarkResult] = list()
+         for (
+             model_config,
+             model_dataset_configs,
+         ) in model_config_to_dataset_configs.items():
+             new_model_dataset_configs: list[DatasetConfig] = list()
+             for dataset_config in model_dataset_configs:
+                 benchmark_record = get_record(
+                     model_config=model_config,
+                     dataset_config=dataset_config,
+                     benchmark_config=benchmark_config,
+                     benchmark_results=self.benchmark_results,
+                 )
+                 if benchmark_record is not None and not benchmark_config.force:
+                     current_benchmark_results.append(benchmark_record)
+                 else:
+                     new_model_dataset_configs.append(dataset_config)
+             model_config_to_dataset_configs[model_config] = new_model_dataset_configs
+
          total_benchmarks = sum(
              len(dataset_configs)
              for dataset_configs in model_config_to_dataset_configs.values()
@@ -656,10 +670,9 @@ class Benchmarker:
                  "benchmarked on all the selected datasets.",
                  level=logging.INFO,
              )
-             return list()
+             return current_benchmark_results

          num_finished_benchmarks = 0
-         current_benchmark_results: list[BenchmarkResult] = list()
          benchmark_params_to_revert: dict[str, t.Any] = dict()
          for model_config in model_configs:
              if not model_config_to_dataset_configs[model_config]:
@@ -809,7 +822,9 @@ class Benchmarker:
          if benchmark_config.clear_model_cache:
              clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)

-         log(f"Completed {num_finished_benchmarks:,} benchmarks.\n", level=logging.INFO)
+         log(
+             f"\nCompleted {num_finished_benchmarks:,} benchmarks.\n", level=logging.INFO
+         )

          # This avoids the following warning at the end of the benchmarking:
          # Warning: WARNING: process group has NOT been destroyed before we destruct
@@ -823,7 +838,7 @@ class Benchmarker:
              destroy_process_group()
          return current_benchmark_results

-     def _prepare_model_ids(self, model_id: list[str] | str) -> list[str]:
+     def _prepare_model_ids(self, model_id: c.Sequence[str] | str) -> c.Sequence[str]:
          """Prepare the model ID(s) to be benchmarked.

          Args:
@@ -1020,13 +1035,13 @@ class Benchmarker:
          return self.benchmark(*args, **kwds)


- def model_has_been_benchmarked(
+ def get_record(
      model_config: "ModelConfig",
      dataset_config: "DatasetConfig",
      benchmark_config: "BenchmarkConfig",
-     benchmark_results: list[BenchmarkResult],
- ) -> bool:
-     """Checks whether a model has already been benchmarked on a dataset.
+     benchmark_results: c.Sequence[BenchmarkResult],
+ ) -> BenchmarkResult | None:
+     """Get the benchmark record for a given model and dataset.

      Args:
          model_config:
@@ -1039,7 +1054,7 @@ def model_has_been_benchmarked(
              The benchmark results.

      Returns:
-         Whether the model has already been evaluated on the dataset.
+         The benchmark record, or None if no such record exists.
      """
      for record in benchmark_results:
          model_id_components = split_model_id(model_id=record.model)
@@ -1064,8 +1079,8 @@ def model_has_been_benchmarked(
              and same_split
              and same_num_shots
          ):
-             return True
-     return False
+             return record
+     return None


  def clear_model_cache_fn(cache_dir: str) -> None:
@@ -1086,7 +1101,9 @@ def clear_model_cache_fn(cache_dir: str) -> None:
              rmtree(sub_model_dir)


- def prepare_dataset_configs(dataset_names: list[str]) -> list["DatasetConfig"]:
+ def prepare_dataset_configs(
+     dataset_names: c.Sequence[str],
+ ) -> c.Sequence["DatasetConfig"]:
      """Prepare the dataset configuration(s) to be benchmarked.

      Args:
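Together with the factory changes above, the public `Benchmarker` API now accepts `Task` and `DatasetConfig` objects directly. A hedged sketch of the new calling convention (the model id is a placeholder, not taken from the diff):

from euroeval import Benchmarker, TEXT_CLASSIFICATION

benchmarker = Benchmarker(progress_bar=False, save_results=False)

# `task` accepts a Task object (or a sequence mixing task names and Task objects),
# and `dataset` likewise accepts DatasetConfig objects; only one of the two may be
# given at a time.
results = benchmarker.benchmark(
    model="my-org/my-model",  # placeholder model id
    task=TEXT_CLASSIFICATION,
)
print(f"Collected {len(results)} benchmark results")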
euroeval/caching_utils.py CHANGED
@@ -54,7 +54,7 @@ def cache_arguments(
              key = args + tuple(kwargs[k] for k in sorted(kwargs.keys()))
          else:
              func_params = func.__code__.co_varnames
-             key_items: list[t.Any] = []
+             key_items: list[t.Any] = list()
              for arg_name in arguments:
                  if arg_name in kwargs:
                      key_items.append(kwargs[arg_name])