EuroEval 16.3.0-py3-none-any.whl → 16.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (78)
  1. euroeval/__init__.py +9 -2
  2. euroeval/benchmark_config_factory.py +51 -50
  3. euroeval/benchmark_modules/base.py +9 -21
  4. euroeval/benchmark_modules/fresh.py +2 -1
  5. euroeval/benchmark_modules/hf.py +101 -71
  6. euroeval/benchmark_modules/litellm.py +115 -53
  7. euroeval/benchmark_modules/vllm.py +107 -92
  8. euroeval/benchmarker.py +144 -121
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +86 -8
  12. euroeval/constants.py +9 -0
  13. euroeval/data_loading.py +80 -29
  14. euroeval/data_models.py +338 -330
  15. euroeval/dataset_configs/__init__.py +12 -3
  16. euroeval/dataset_configs/bulgarian.py +56 -0
  17. euroeval/dataset_configs/czech.py +75 -0
  18. euroeval/dataset_configs/danish.py +55 -93
  19. euroeval/dataset_configs/dutch.py +48 -87
  20. euroeval/dataset_configs/english.py +45 -77
  21. euroeval/dataset_configs/estonian.py +42 -34
  22. euroeval/dataset_configs/faroese.py +19 -60
  23. euroeval/dataset_configs/finnish.py +36 -69
  24. euroeval/dataset_configs/french.py +39 -75
  25. euroeval/dataset_configs/german.py +45 -82
  26. euroeval/dataset_configs/greek.py +64 -0
  27. euroeval/dataset_configs/icelandic.py +54 -91
  28. euroeval/dataset_configs/italian.py +42 -79
  29. euroeval/dataset_configs/latvian.py +28 -35
  30. euroeval/dataset_configs/lithuanian.py +28 -26
  31. euroeval/dataset_configs/norwegian.py +72 -115
  32. euroeval/dataset_configs/polish.py +33 -61
  33. euroeval/dataset_configs/portuguese.py +33 -66
  34. euroeval/dataset_configs/serbian.py +64 -0
  35. euroeval/dataset_configs/slovak.py +55 -0
  36. euroeval/dataset_configs/spanish.py +42 -77
  37. euroeval/dataset_configs/swedish.py +52 -90
  38. euroeval/dataset_configs/ukrainian.py +64 -0
  39. euroeval/exceptions.py +1 -1
  40. euroeval/finetuning.py +24 -17
  41. euroeval/generation.py +15 -14
  42. euroeval/generation_utils.py +8 -8
  43. euroeval/languages.py +395 -323
  44. euroeval/logging_utils.py +250 -0
  45. euroeval/metrics/base.py +0 -3
  46. euroeval/metrics/huggingface.py +21 -6
  47. euroeval/metrics/llm_as_a_judge.py +6 -4
  48. euroeval/metrics/pipeline.py +17 -9
  49. euroeval/metrics/speed.py +0 -3
  50. euroeval/model_cache.py +17 -19
  51. euroeval/model_config.py +4 -5
  52. euroeval/model_loading.py +3 -0
  53. euroeval/prompt_templates/__init__.py +2 -0
  54. euroeval/prompt_templates/classification.py +206 -0
  55. euroeval/prompt_templates/linguistic_acceptability.py +99 -42
  56. euroeval/prompt_templates/multiple_choice.py +102 -38
  57. euroeval/prompt_templates/named_entity_recognition.py +172 -51
  58. euroeval/prompt_templates/reading_comprehension.py +119 -42
  59. euroeval/prompt_templates/sentiment_classification.py +110 -40
  60. euroeval/prompt_templates/summarization.py +85 -40
  61. euroeval/prompt_templates/token_classification.py +279 -0
  62. euroeval/scores.py +11 -10
  63. euroeval/speed_benchmark.py +5 -6
  64. euroeval/task_group_utils/multiple_choice_classification.py +2 -4
  65. euroeval/task_group_utils/question_answering.py +24 -16
  66. euroeval/task_group_utils/sequence_classification.py +48 -35
  67. euroeval/task_group_utils/text_to_text.py +19 -9
  68. euroeval/task_group_utils/token_classification.py +21 -17
  69. euroeval/tasks.py +44 -1
  70. euroeval/tokenisation_utils.py +33 -22
  71. euroeval/types.py +10 -9
  72. euroeval/utils.py +35 -149
  73. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +196 -39
  74. euroeval-16.5.0.dist-info/RECORD +81 -0
  75. euroeval-16.3.0.dist-info/RECORD +0 -71
  76. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
  77. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
  78. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/finetuning.py CHANGED
@@ -1,12 +1,12 @@
 """Functions related to the finetuning of models."""
 
+import collections.abc as c
 import logging
 import sys
 import typing as t
 from functools import partial
 
 import torch
-from tqdm.auto import tqdm
 from transformers.trainer_callback import (
     EarlyStoppingCallback,
     PrinterCallback,
@@ -18,13 +18,9 @@ from transformers.training_args import OptimizerNames, TrainingArguments
 from .callbacks import NeverLeaveProgressCallback
 from .enums import DataType
 from .exceptions import InvalidBenchmark, NaNValueInModelOutput
+from .logging_utils import block_terminal_output, get_pbar, log, log_once
 from .model_loading import load_model
-from .utils import (
-    block_terminal_output,
-    clear_memory,
-    enforce_reproducibility,
-    log_once,
-)
+from .utils import clear_memory, enforce_reproducibility
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -32,16 +28,14 @@ if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def finetune(
     model: "BenchmarkModule",
-    datasets: list["DatasetDict"],
+    datasets: c.Sequence["DatasetDict"],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
-) -> list[dict[str, float]]:
+) -> c.Sequence[dict[str, float]]:
     """Evaluate a model on a dataset through finetuning.
 
     Args:
@@ -58,6 +52,10 @@ def finetune(
 
     Returns:
         A list of dicts containing the scores for each metric for each iteration.
+
+    Raises:
+        InvalidBenchmark:
+            If the benchmark could not be completed.
     """
     # Set the data type to use for the model weights
     using_cuda = benchmark_config.device == torch.device("cuda")
@@ -70,7 +68,7 @@
 
     bs: int = benchmark_config.batch_size
     scores: list[dict[str, float]] = list()
-    for idx in tqdm(
+    for idx in get_pbar(
         iterable=range(benchmark_config.num_iterations),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -80,7 +78,7 @@
         model_already_initialized = idx == 0
 
         # Run a loop here to deal with automatic reduction of batch size
-        while True:
+        for _ in range(num_attempts := 10):
             # Clear GPU memory
             if not model_already_initialized:
                 try:
@@ -112,7 +110,7 @@
                 )
 
                 scores.append(itr_scores)
-                logger.debug(f"Test scores for iteration {idx}: {itr_scores}")
+                log(
+                    f"Test scores for iteration {idx}: {itr_scores}",
+                    level=logging.DEBUG,
+                )
 
                 break
 
@@ -123,9 +124,10 @@
                 if dtype != DataType.FP32:
                     dtype = DataType.FP32
                     model_already_initialized = False
-                    logger.debug(
+                    log(
                         "NaN value detected in model outputs while using mixed "
-                        "precision. Retrying with full fp32 precision."
+                        "precision. Retrying with full fp32 precision.",
+                        level=logging.DEBUG,
                     )
                 else:
                     raise InvalidBenchmark(
@@ -151,7 +153,12 @@
                 model_already_initialized = False
 
                 bs //= 2
-                logger.debug(f"Reduced batch size to {bs}")
+                log(f"Reduced batch size to {bs}", level=logging.DEBUG)
+
+        else:
+            raise InvalidBenchmark(
+                f"Could not benchmark the model after {num_attempts} attempts!"
+            )
 
     return scores
 
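The finetuning hunks above swap the unbounded `while True:` retry loop for a bounded `for _ in range(num_attempts := 10): ... else:` loop, so a run that keeps failing now raises InvalidBenchmark instead of retrying forever. Below is a minimal, self-contained sketch of that for/else retry pattern; the names (run_iteration, run_with_retries, BenchmarkError) are illustrative stand-ins, not EuroEval's own API.

# Sketch of the bounded-retry pattern adopted in finetune(); illustrative names only.
class BenchmarkError(Exception):
    """Raised when the benchmark cannot be completed."""


def run_iteration(batch_size: int) -> dict[str, float]:
    """Pretend to run one finetuning iteration; fails for large batch sizes."""
    if batch_size > 8:
        raise MemoryError("out of memory")
    return {"accuracy": 0.9}


def run_with_retries(batch_size: int) -> dict[str, float]:
    scores: dict[str, float] | None = None
    for _ in range(num_attempts := 10):
        try:
            scores = run_iteration(batch_size)
            break
        except MemoryError:
            # Mirror the diff above: halve the batch size and retry
            batch_size //= 2
    else:
        # The for-loop's else branch only runs if the loop never hit `break`,
        # i.e. every attempt failed.
        raise BenchmarkError(
            f"Could not benchmark the model after {num_attempts} attempts!"
        )
    return scores


print(run_with_retries(batch_size=32))  # succeeds on the third attempt at batch size 8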
euroeval/generation.py CHANGED
@@ -1,5 +1,6 @@
 """Functions related to text generation of models."""
 
+import collections.abc as c
 import logging
 import sys
 import typing as t
@@ -11,12 +12,13 @@ from tqdm.auto import tqdm
 
 from .enums import BatchingPreference, TaskGroup
 from .exceptions import InvalidBenchmark
+from .logging_utils import get_pbar, log, log_once
 from .model_cache import (
     ModelCache,
     load_cached_model_outputs,
     split_dataset_into_cached_and_non_cached,
 )
-from .utils import clear_memory, log_once
+from .utils import clear_memory
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -29,16 +31,14 @@ if t.TYPE_CHECKING:
         ModelConfig,
     )
 
-logger = logging.getLogger("euroeval")
-
 
 def generate(
     model: "BenchmarkModule",
-    datasets: list["DatasetDict"],
+    datasets: c.Sequence["DatasetDict"],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
-) -> list[dict[str, float]]:
+) -> c.Sequence[dict[str, float]]:
     """Evaluate a model on a dataset through generation.
 
     Args:
@@ -78,7 +78,7 @@ def generate(
     )
 
     scores: list[dict[str, float]] = list()
-    for idx in tqdm(
+    for idx in get_pbar(
         iterable=range(len(datasets)),
         desc="Benchmarking",
         disable=not benchmark_config.progress_bar,
@@ -90,7 +90,7 @@
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
         )
-        logger.debug(f"Test scores for iteration {idx}: {test_scores}")
+        log(f"Test scores for iteration {idx}: {test_scores}", level=logging.DEBUG)
         scores.append(test_scores)
         clear_memory()
 
@@ -142,14 +142,14 @@
     itr: t.Iterable
     match model.batching_preference:
         case BatchingPreference.SINGLE_SAMPLE:
-            itr = tqdm(iterable=non_cached_dataset, leave=False)
+            itr = get_pbar(iterable=non_cached_dataset)
         case BatchingPreference.ALL_AT_ONCE:
            itr = [non_cached_dataset[:]]
         case _:
            num_batches = len(non_cached_dataset) // benchmark_config.batch_size
            if len(non_cached_dataset) % benchmark_config.batch_size != 0:
                num_batches += 1
-            itr = tqdm(
+            itr = get_pbar(
                iterable=mit.batched(
                    iterable=non_cached_dataset, n=benchmark_config.batch_size
                ),
@@ -254,7 +254,7 @@
 def debug_log(
     batch: dict[str, t.Any],
     model_output: "GenerativeModelOutput",
-    extracted_labels: list[dict | str | list[str]],
+    extracted_labels: c.Sequence[dict | str | c.Sequence[str]],
     dataset_config: "DatasetConfig",
 ) -> None:
     """Log inputs and outputs for debugging purposes.
@@ -297,7 +297,7 @@
                     + "\n"
                     + "\t".join(labels)
                 )
-            logger.info("\n\n".join(log_msgs))
+            log("\n\n".join(log_msgs), level=logging.DEBUG)
             return
 
         case (
@@ -332,7 +332,7 @@
     else:
         input_texts = batch["text"]
 
-    metadata_keys: list[str] = [
+    metadata_keys: c.Sequence[str] = [
         key
         for key in batch.keys()
         if key not in ["text", "messages", "label", "labels", "target_text"]
@@ -347,6 +347,7 @@
         if labels[idx]:
             data_to_log["Label"] = labels[idx]
         data_to_log |= {key.capitalize(): batch[key][idx] for key in metadata_keys}
-        logger.info(
-            "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items())
+        log(
+            "\n".join(f"{key}: {value!r}" for key, value in data_to_log.items()),
+            level=logging.DEBUG,
         )
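Both files above route progress bars through get_pbar and debug output through log from the new euroeval/logging_utils.py module (added in this release with +250 lines; its contents are not shown in this diff). The stand-ins below are only a guess at wrappers that would satisfy the call sites visible in these hunks (keyword arguments iterable, desc and disable for get_pbar, and a message plus level for log); the real implementations will differ.

# Hypothetical stand-ins consistent with the call sites above; not EuroEval's actual code.
import collections.abc as c
import logging
import typing as t

from tqdm.auto import tqdm

logger = logging.getLogger("euroeval")


def log(message: str, level: int = logging.INFO) -> None:
    """Log a message through the shared 'euroeval' logger."""
    logger.log(level=level, msg=message)


def get_pbar(
    iterable: c.Iterable[t.Any], desc: str | None = None, disable: bool = False
) -> "tqdm":
    """Wrap an iterable in a tqdm progress bar with shared defaults.

    The diff drops leave=False at the call sites, so a wrapper like this would
    presumably set it (and any other house defaults) in one place.
    """
    return tqdm(iterable=iterable, desc=desc, disable=disable, leave=False)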
euroeval/generation_utils.py CHANGED
@@ -1,5 +1,6 @@
 """Utility functions related to generative models."""
 
+import collections.abc as c
 import itertools as it
 import json
 import logging
@@ -9,8 +10,9 @@ import typing as t
 
 from .enums import GenerativeType, TaskGroup
 from .exceptions import InvalidBenchmark, InvalidModel
+from .logging_utils import log_once
 from .tokenisation_utils import apply_chat_template
-from .utils import extract_multiple_choice_labels, log_once
+from .utils import extract_multiple_choice_labels
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -18,15 +20,13 @@ if t.TYPE_CHECKING:
 
     from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
-logger = logging.getLogger("euroeval")
-
 
 def extract_few_shot_examples(
     dataset: "DatasetDict",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
     itr_idx: int,
-) -> list[dict[str, t.Any]]:
+) -> c.Sequence[dict[str, t.Any]]:
     """Extract few-shot examples from a dataset.
 
     This will always extract the examples from the training split.
@@ -79,7 +79,7 @@
                     lambda example: len(example["text"]) < max_num_tokens
                 )
                 num_short_examples = len(train_with_short_examples)
-                if num_short_examples >= dataset_config.num_few_shot_examples:
+                if num_short_examples >= num_few_shots:
                     break
             else:
                 raise InvalidBenchmark(
@@ -144,7 +144,7 @@
                     lambda example: len(example["context"]) < max_num_tokens
                 )
                 num_short_examples = len(train_with_short_examples)
-                if num_short_examples >= dataset_config.num_few_shot_examples:
+                if num_short_examples >= num_few_shots:
                     break
             else:
                 raise InvalidBenchmark(
@@ -171,7 +171,7 @@
 
 def apply_prompt(
     examples: dict[str, t.Any],
-    few_shot_examples: list[dict[str, t.Any]],
+    few_shot_examples: c.Sequence[dict[str, t.Any]],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     generative_type: GenerativeType | None,
@@ -432,7 +432,7 @@
 
 
 def raise_if_wrong_params(
-    model_config: "ModelConfig", allowed_params: dict[re.Pattern, list[str]]
+    model_config: "ModelConfig", allowed_params: dict[re.Pattern, c.Sequence[str]]
 ) -> None:
     """Raise an error if the model configuration has invalid parameters.
 
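A pattern running through all three files is the switch from concrete list[...] annotations to the read-only collections.abc.Sequence protocol, imported as c. A small illustration of why the broader annotation is useful, written with a hypothetical helper rather than EuroEval's own functions:

import collections.abc as c


def mean_accuracy(scores: c.Sequence[dict[str, float]]) -> float:
    """Average the 'accuracy' entry over a read-only sequence of score dicts."""
    return sum(score["accuracy"] for score in scores) / len(scores)


# Accepts lists as before, but now also tuples and other sequence types,
# and signals that the function will not mutate its argument.
print(mean_accuracy([{"accuracy": 0.8}, {"accuracy": 0.9}]))   # 0.85
print(mean_accuracy(({"accuracy": 0.8}, {"accuracy": 0.9})))   # 0.85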