EuroEval 15.16.0-py3-none-any.whl → 16.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (64)
  1. euroeval/__init__.py +8 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +199 -139
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +19 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +73 -23
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +35 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +90 -20
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +276 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/model_cache.py +13 -1
  41. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  42. euroeval/prompt_templates/multiple_choice.py +23 -2
  43. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  44. euroeval/prompt_templates/reading_comprehension.py +42 -2
  45. euroeval/prompt_templates/sentiment_classification.py +46 -2
  46. euroeval/prompt_templates/summarization.py +24 -4
  47. euroeval/scores.py +7 -2
  48. euroeval/speed_benchmark.py +6 -6
  49. euroeval/task_group_utils/multiple_choice_classification.py +19 -8
  50. euroeval/task_group_utils/question_answering.py +35 -28
  51. euroeval/task_group_utils/sequence_classification.py +128 -42
  52. euroeval/task_group_utils/text_to_text.py +7 -3
  53. euroeval/task_group_utils/token_classification.py +59 -73
  54. euroeval/tasks.py +33 -6
  55. euroeval/tokenization_utils.py +294 -207
  56. euroeval/utils.py +150 -35
  57. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
  58. euroeval-16.0.1.dist-info/RECORD +69 -0
  59. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
  60. euroeval/human_evaluation.py +0 -738
  61. euroeval/metrics.py +0 -470
  62. euroeval-15.16.0.dist-info/RECORD +0 -63
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
  64. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
euroeval/prompt_templates/summarization.py CHANGED
@@ -1,10 +1,15 @@
  """Templates for the Summarization task."""
 
+ import typing as t
+
  from ..data_models import PromptConfig
- from ..languages import DA, DE, EN, ES, FI, FR, IS, IT, NB, NL, NN, NO, PT, SV
+ from ..languages import DA, DE, EN, ES, ET, FI, FR, IS, IT, LV, NB, NL, NN, NO, PT, SV
+
+ if t.TYPE_CHECKING:
+ from ..data_models import Language
 
  # TODO: Missing Faroese
- SUMM_TEMPLATES = {
+ SUMM_TEMPLATES: dict["Language", PromptConfig] = {
  DA: PromptConfig(
  default_prompt_prefix="Følgende er dokumenter med tilhørende resuméer.",
  default_prompt_template="Dokument: {text}\nResumé: {target_text}",
@@ -32,8 +37,14 @@ SUMM_TEMPLATES = {
  default_prompt_prefix="A continuación se presentan documentos con resúmenes "
  "adjuntos.",
  default_prompt_template="Documento: {text}\nResumen: {target_text}",
- default_instruction_prompt="Documento: {text}\n\nEscriba un resumen del
- "documento anterior.",
+ default_instruction_prompt="Documento: {text}\n\n",
+ default_prompt_label_mapping=dict(),
+ ),
+ ET: PromptConfig(
+ default_prompt_prefix="Allpool on dokumendid koos kokkuvõtetega.",
+ default_prompt_template="Dokument: {text}\nKokkuvõte: {target_text}",
+ default_instruction_prompt="Dokument: {text}\n\nKoosta ülaltoodud dokumendi "
+ "kokkuvõte.",
  default_prompt_label_mapping=dict(),
  ),
  PT: PromptConfig(
@@ -58,6 +69,15 @@ SUMM_TEMPLATES = {
  "document ci-dessus.",
  default_prompt_label_mapping=dict(),
  ),
+ LV: PromptConfig(
+ default_prompt_prefix="Tālāk ir dokumenti ar pievienotām kopsavilkumiem.",
+ default_prompt_template="Dokuments: {text}\nKopsavilkums: {target_text}",
+ default_instruction_prompt=(
+ "Dokuments: {text}\n\n"
+ "Uzrakstiet kopsavilkumu par iepriekš minēto dokumentu."
+ ),
+ default_prompt_label_mapping=dict(),
+ ),
  IS: PromptConfig(
  default_prompt_prefix="Eftirfarandi eru skjöl með meðfylgjandi samantektum.",
  default_prompt_template="Skjal: {text}\nSamantekt: {target_text}",
euroeval/scores.py CHANGED
@@ -52,7 +52,12 @@ def log_scores(
  test_se, test_se_str = metric.postprocessing_fn(test_se)
  total_dict[f"test_{metric.name}"] = test_score
  total_dict[f"test_{metric.name}_se"] = test_se
- logger.info(f"{metric.pretty_name}: {test_score_str} ± {test_se_str}")
+ log_str = (
+ f"{metric.pretty_name}: {test_score_str} ± {test_se_str}"
+ if not np.isnan(test_se)
+ else f"{metric.pretty_name}: {test_score_str}"
+ )
+ logger.info(log_str)
 
  return dict(raw=scores, total=total_dict)
 
@@ -84,7 +89,7 @@ def aggregate_scores(
 
  if len(test_scores) > 1:
  sample_std = np.std(test_scores, ddof=1)
- test_se = sample_std / np.sqrt(len(test_scores))
+ test_se = (sample_std / np.sqrt(len(test_scores))).item()
  else:
  test_se = np.nan
 
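
Note on the aggregate_scores change: the standard-error formula itself is unchanged (sample standard deviation over the iteration scores divided by the square root of the number of iterations), but the added .item() call turns the NumPy scalar into a plain Python float, presumably so the stored score serialises cleanly. A small sketch with made-up scores:

import numpy as np

test_scores = [0.71, 0.74, 0.69, 0.73]  # hypothetical per-iteration scores

sample_std = np.std(test_scores, ddof=1)                    # sample standard deviation
test_se = (sample_std / np.sqrt(len(test_scores))).item()   # standard error, as a plain float

print(type(test_se).__name__, round(test_se, 4))            # float 0.0111
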
euroeval/speed_benchmark.py CHANGED
@@ -59,7 +59,7 @@ def benchmark_speed_single_iteration(
  Returns:
  A dictionary containing the scores for the current iteration.
  """
- gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)
+ gpt2_tokeniser = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)
 
  base_doc = "Document which contains roughly 10 tokens. "
  multiplier = 10 * (1 + itr_idx)
@@ -74,11 +74,11 @@ def benchmark_speed_single_iteration(
  model.generate(inputs=dict(text=[doc]))
 
  def encoder_predict(doc: str) -> None:
- tokenizer = model.get_tokenizer()
+ tokeniser = model.get_tokeniser()
  pytorch_model = model.get_pytorch_module()
  inputs = {
  key: tensor.to(pytorch_model.device)
- for key, tensor in tokenizer(
+ for key, tensor in tokeniser(
  text=[doc], truncation=True, return_tensors="pt"
  ).items()
  }
@@ -102,21 +102,21 @@ def benchmark_speed_single_iteration(
  speed_scores = pyinfer.InferenceReport(
  model=predict, inputs=doc, n_seconds=3
  ).run(print_report=False)
- num_gpt2_tokens = len(gpt2_tokenizer([doc], truncation=True)["input_ids"][0])
+ num_gpt2_tokens = len(gpt2_tokeniser([doc], truncation=True)["input_ids"][0])
  gpt2_tokens_per_second = speed_scores["Infer(p/sec)"] * num_gpt2_tokens
 
  speed_scores_short = pyinfer.InferenceReport(
  model=predict, inputs=short_doc, n_seconds=3
  ).run(print_report=False)
  num_gpt2_tokens_short = len(
- gpt2_tokenizer([short_doc], truncation=True)["input_ids"][0]
+ gpt2_tokeniser([short_doc], truncation=True)["input_ids"][0]
  )
  gpt2_tokens_per_second_short = (
  speed_scores_short["Infer(p/sec)"] * num_gpt2_tokens_short
  )
 
  except (RuntimeError, ValueError, IndexError) as e:
- raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}")
+ raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}") from e
 
  return dict(
  test_speed=gpt2_tokens_per_second, test_speed_short=gpt2_tokens_per_second_short
euroeval/task_group_utils/multiple_choice_classification.py CHANGED
@@ -94,15 +94,15 @@ class MultipleChoiceClassificationTrainer(Trainer):
 
 
  def prepare_examples(
- examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
+ examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
  ) -> "BatchEncoding":
  """Prepare the features.
 
  Args:
  examples:
  The examples to prepare.
- tokenizer:
- The tokenizer to use to prepare the examples.
+ tokeniser:
+ The tokeniser to use to prepare the examples.
 
  Returns:
  The prepared examples.
@@ -110,12 +110,23 @@ def prepare_examples(
  doc: str = examples["text"][0]
  sections = doc.split("\n")
 
- choice_idxs = [
+ candidate_choice_idxs = [
  idx
  for idx, section in enumerate(sections)
- if re.match(pattern=r"^[a-e]\. ", string=section) is not None
+ if re.match(pattern=r"^[a-z0-9]+\. ", string=section) is not None
  ]
- choices = [sections[idx] for idx in choice_idxs]
+
+ # Sometimes the question itself starts with a letter or number followed by a dot, We
+ # want to ignore these cases, and focus on the final contingent block of at least
+ # two choices.
+ choice_idxs: list[int] = list()
+ for idx in reversed(candidate_choice_idxs):
+ if len(choice_idxs) < 2 or (
+ len(choice_idxs) >= 2 and idx == choice_idxs[-1] - 1
+ ):
+ choice_idxs.append(idx)
+
+ choices = [sections[idx] for idx in reversed(choice_idxs)]
 
  # Check that the choices are present, and that all of them are at the end
  assert len(choices) > 0, "No choices found in the document."
@@ -127,7 +138,7 @@ def prepare_examples(
  question_idx = min(choice_idxs) - 2 # -2 to remove the 'Choices:' line
  context_and_question = "\n".join(sections[: question_idx + 1]).strip()
 
- new_examples = tokenizer(
+ new_examples = tokeniser(
  text=[context_and_question] * len(choices),
  text_pair=[choice[3:] for choice in choices],
  padding=True,
@@ -135,7 +146,7 @@ def prepare_examples(
  )
  new_examples["label"] = [
  int(choice.startswith(f"{letter}. ") and letter == examples["label"][0])
- for letter, choice in zip("abcde", choices)
+ for letter, choice in zip("abcdefghijklmnopqrstuvwxyz", choices)
  ]
  new_examples["id"] = [hashlib.md5(string=doc.encode()).hexdigest()] * len(choices)
  return new_examples
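
Note on the rewritten choice detection: it no longer assumes the letters a–e only. It collects every line matching ^[a-z0-9]+\. and then, scanning backwards, keeps only the final contiguous block of such lines, so a question that itself starts with "1. " is not mistaken for a choice. A condensed, self-contained sketch of that logic (the example document is made up, and the condition is a logically equivalent simplification of the one in the diff):

import re

def trailing_choice_indices(sections: list[str]) -> list[int]:
    # Indices of all lines that look like a choice, e.g. "a. ..." or "2. ..."
    candidate_idxs = [
        idx
        for idx, section in enumerate(sections)
        if re.match(r"^[a-z0-9]+\. ", section) is not None
    ]
    # Walk backwards and keep only the final contiguous block of choice lines
    choice_idxs: list[int] = []
    for idx in reversed(candidate_idxs):
        if len(choice_idxs) < 2 or idx == choice_idxs[-1] - 1:
            choice_idxs.append(idx)
    return list(reversed(choice_idxs))

doc = "1. What is the capital of France?\nChoices:\na. Paris\nb. Berlin\nc. Rome"
print(trailing_choice_indices(doc.split("\n")))  # [2, 3, 4]; the question on line 0 is ignored
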
euroeval/task_group_utils/question_answering.py CHANGED
@@ -23,7 +23,7 @@ if t.TYPE_CHECKING:
  from transformers.trainer_utils import EvalPrediction
  from transformers.training_args import TrainingArguments
 
- from ..data_models import DatasetConfig, GenerativeModelOutput
+ from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
  from ..types import Labels, Predictions
 
  logger = logging.getLogger("euroeval")
@@ -57,7 +57,7 @@ class QuestionAnsweringTrainer(Trainer):
  **kwargs,
  )
 
- # Get the CLS token id for the tokenizer
+ # Get the CLS token id for the tokeniser
  if self.tokenizer is not None:
  assert isinstance(self.tokenizer, PreTrainedTokenizerBase)
  special_token_metadata = get_special_token_metadata(self.tokenizer)
@@ -149,6 +149,7 @@ class QuestionAnsweringTrainer(Trainer):
  def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
+ benchmark_config: "BenchmarkConfig",
  dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics needed for evaluation.
@@ -159,6 +160,8 @@ def compute_metrics(
  contains the true labels.
  dataset_config:
  The configuration of the dataset.
+ benchmark_config:
+ The configuration of the benchmark.
  dataset:
  The dataset used for evaluation. This is only used in case any additional
  metadata is used to compute the metrics.
@@ -186,7 +189,11 @@ def compute_metrics(
  results: dict[str, float] = dict()
  for metric in dataset_config.task.metrics:
  score: float | None = metric(
- predictions=predictions, references=labels, dataset=dataset
+ predictions=predictions,
+ references=labels,
+ dataset=dataset,
+ dataset_config=dataset_config,
+ benchmark_config=benchmark_config,
  )
 
  # The metric returns None if we are running on multi-GPU and the current
@@ -221,15 +228,15 @@ def extract_labels_from_generation(
 
 
  def prepare_train_examples(
- examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
+ examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
  ) -> "BatchEncoding":
  """Prepare the features for training.
 
  Args:
  examples:
  The examples to prepare.
- tokenizer:
- The tokenizer to use to prepare the examples.
+ tokeniser:
+ The tokeniser to use to prepare the examples.
 
  Returns:
  The prepared examples.
@@ -239,15 +246,15 @@ def prepare_train_examples(
  # take a lots of space). So we remove that left whitespace
  examples["question"] = [q.lstrip() for q in examples["question"]]
 
- # Extract special token metadata from the tokenizer
- special_token_metadata = get_special_token_metadata(tokenizer=tokenizer)
+ # Extract special token metadata from the tokeniser
+ special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
  has_cls_token = special_token_metadata["has_cls_token"]
  has_sep_token = special_token_metadata["has_sep_token"]
  cls_token_id = special_token_metadata["cls_token_id"]
  cls_token = special_token_metadata["cls_token"]
  sep_token = special_token_metadata["sep_token"]
 
- # If the tokenizer is not adding special tokens, then we add them manually
+ # If the tokeniser is not adding special tokens, then we add them manually
  if not has_cls_token and not has_sep_token:
  examples["question"] = [
  f"{cls_token}{q}{sep_token}" for q in examples["question"]
@@ -258,18 +265,18 @@ def prepare_train_examples(
  # split into several features. Since we are always keeping the question tokens, we
  # need to make sure that the stride does not exceed the resulting maximum context
  # length.
- max_question_tokens = max(len(tokenizer(q).input_ids) for q in examples["question"])
+ max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
  num_special_tokens = int(has_cls_token) + int(has_sep_token)
- stride = tokenizer.model_max_length // 4
- max_length = tokenizer.model_max_length - stride
+ stride = tokeniser.model_max_length // 4
+ max_length = tokeniser.model_max_length - stride
  stride = min(stride, max_length - max_question_tokens - num_special_tokens)
- max_length = tokenizer.model_max_length - stride
+ max_length = tokeniser.model_max_length - stride
 
  # Tokenize our examples with truncation and padding, but keep the overflows using a
  # stride. This results in one example possible giving several features when a
  # context is long, each of those features having a context that overlaps a bit the
  # context of the previous feature.
- tokenized_examples = tokenizer(
+ tokenized_examples = tokeniser(
  text=examples["question"],
  text_pair=examples["context"],
  truncation="only_second",
@@ -306,9 +313,9 @@ def prepare_train_examples(
  sequence_ids = tokenized_examples.sequence_ids(i)
 
  # Manually ensure that the special tokens are set to None in `sequence_ids`
- for special_token in tokenizer.special_tokens_map.keys():
- if hasattr(tokenizer, f"{special_token}_id"):
- special_token_id = getattr(tokenizer, f"{special_token}_id")
+ for special_token in tokeniser.special_tokens_map.keys():
+ if hasattr(tokeniser, f"{special_token}_id"):
+ special_token_id = getattr(tokeniser, f"{special_token}_id")
  if special_token_id is not None:
  sequence_ids = [
  None if token_id == special_token_id else seq_id
@@ -373,15 +380,15 @@ def prepare_train_examples(
 
 
  def prepare_test_examples(
- examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
+ examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
  ) -> "BatchEncoding":
  """Prepare test examples.
 
  Args:
  examples:
  Dictionary of test examples.
- tokenizer:
- The tokenizer used to preprocess the examples.
+ tokeniser:
+ The tokeniser used to preprocess the examples.
 
  Returns:
  The prepared test examples.
@@ -391,14 +398,14 @@ def prepare_test_examples(
  # take a lots of space). So we remove that left whitespace
  examples["question"] = [q.lstrip() for q in examples["question"]]
 
- # Extract special token metadata from the tokenizer
- special_token_metadata = get_special_token_metadata(tokenizer=tokenizer)
+ # Extract special token metadata from the tokeniser
+ special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
  has_cls_token = special_token_metadata["has_cls_token"]
  has_sep_token = special_token_metadata["has_sep_token"]
  cls_token = special_token_metadata["cls_token"]
  sep_token = special_token_metadata["sep_token"]
 
- # If the tokenizer is not adding special tokens, then we add them manually
+ # If the tokeniser is not adding special tokens, then we add them manually
  if not has_cls_token and not has_sep_token:
  examples["question"] = [
  f"{cls_token}{q}{sep_token}" for q in examples["question"]
@@ -409,18 +416,18 @@ def prepare_test_examples(
  # split into several features. Since we are always keeping the question tokens, we
  # need to make sure that the stride does not exceed the resulting maximum context
  # length.
- max_question_tokens = max(len(tokenizer(q).input_ids) for q in examples["question"])
+ max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
  num_special_tokens = int(has_cls_token) + int(has_sep_token)
- stride = tokenizer.model_max_length // 4
- max_length = tokenizer.model_max_length - stride
+ stride = tokeniser.model_max_length // 4
+ max_length = tokeniser.model_max_length - stride
  stride = min(stride, max_length - max_question_tokens - num_special_tokens)
- max_length = tokenizer.model_max_length - stride
+ max_length = tokeniser.model_max_length - stride
 
  # Tokenize our examples with truncation and maybe padding, but keep the overflows
  # using a stride. This results in one example possible giving several features when
  # a context is long, each of those features having a context that overlaps a bit
  # the context of the previous feature.
- tokenized_examples = tokenizer(
+ tokenized_examples = tokeniser(
  text=examples["question"],
  text_pair=examples["context"],
  truncation="only_second",
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -7,14 +7,19 @@ import typing as t
  import Levenshtein
  import numpy as np
 
+ from ..enums import TaskGroup
  from ..exceptions import InvalidBenchmark
- from ..utils import log_once, raise_if_model_output_contains_nan_values
+ from ..utils import (
+ extract_multiple_choice_labels,
+ log_once,
+ raise_if_model_output_contains_nan_values,
+ )
 
  if t.TYPE_CHECKING:
  from datasets.arrow_dataset import Dataset
  from transformers.trainer_utils import EvalPrediction
 
- from ..data_models import DatasetConfig, GenerativeModelOutput
+ from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
  from ..types import Labels, Predictions
 
 
@@ -24,6 +29,7 @@ logger = logging.getLogger("euroeval")
  def compute_metrics(
  model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
  dataset_config: "DatasetConfig",
+ benchmark_config: "BenchmarkConfig",
  dataset: "Dataset",
  ) -> dict[str, float]:
  """Compute the metrics needed for evaluation.
@@ -34,6 +40,8 @@ def compute_metrics(
  contains the true labels.
  dataset_config:
  The configuration of the dataset.
+ benchmark_config:
+ The configuration of the benchmark.
  dataset:
  The dataset used for evaluation. This is only used in case any additional
  metadata is used to compute the metrics.
@@ -79,7 +87,11 @@ def compute_metrics(
  results: dict[str, float] = dict()
  for metric in dataset_config.task.metrics:
  score: float | None = metric(
- predictions=predictions, references=label_ids, dataset=dataset
+ predictions=predictions,
+ references=label_ids,
+ dataset=dataset,
+ dataset_config=dataset_config,
+ benchmark_config=benchmark_config,
  )
 
  # The metric returns None if we are running on multi-GPU and the current
@@ -113,7 +125,28 @@ def extract_labels_from_generation(
 
  Returns:
  The predicted labels.
+
+ Raises:
+ InvalidBenchmark:
+ If the task requires log probabilities, but the model did not output them,
+ or if the model outputted log probabilities but the first label token
+ mapping is not provided.
  """
+ # Get the candidate labels, which are the labels that the model can predict
+ default_labels = [
+ dataset_config.prompt_label_mapping[lbl]
+ for lbl in dataset_config.id2label.values()
+ ]
+ if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+ sample_candidate_labels = [
+ extract_multiple_choice_labels(
+ prompt=prompt, candidate_labels=default_labels
+ )
+ for prompt in input_batch["prompt"]
+ ]
+ else:
+ sample_candidate_labels = [default_labels] * len(input_batch["prompt"])
+
  if model_output.scores is not None:
  if first_label_token_mapping is False:
  raise InvalidBenchmark(
@@ -122,38 +155,85 @@ def extract_labels_from_generation(
  )
  labels = get_closest_logprobs_labels(
  generation_logprobs=model_output.scores,
- dataset_config=dataset_config,
  first_label_token_mapping=first_label_token_mapping,
+ candidate_labels=sample_candidate_labels,
  )
  if labels is not None:
  return labels
+ elif dataset_config.task.requires_logprobs:
+ raise InvalidBenchmark(
+ "This task requires the model to output logprobs, and this model "
+ "does not seem to be able to do that. Skipping the evaluation."
+ )
 
- candidate_labels = [
- dataset_config.prompt_label_mapping[lbl]
- for lbl in dataset_config.id2label.values()
- ]
  new_predicted_labels: list[str] = list()
- for predicted_label in model_output.sequences:
+ for idx, predicted_label in enumerate(model_output.sequences):
  # If the prediction includes a boxed answer, use that instead of the full
  # generation
  if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
  predicted_label = m.group(1)
 
- # Pick the label with the smallest word edit distance to the predicted label
+ # We set the word edit distance weights such that we heavily penalise insertions
+ # and substitutions, so that we don't just insert the correct label, but that we
+ # want the model to have included the correct label in its output.
+ insertion_weight = 1000
+ deletion_weight = 1
+ substitution_weight = 1000
+
+ # Compute the word edit distances between the predicted label and all candidate
+ # labels
  edit_distances = [
- Levenshtein.distance(s1=predicted_label.lower(), s2=candidate_label.lower())
- for candidate_label in candidate_labels
+ Levenshtein.distance(
+ s1=predicted_label.lower(),
+ s2=candidate_label.lower(),
+ weights=(insertion_weight, deletion_weight, substitution_weight),
+ )
+ for candidate_label in sample_candidate_labels[idx]
  ]
- predicted_label = candidate_labels[np.argmin(edit_distances).item()]
- new_predicted_labels.append(predicted_label)
+
+ best_candidate_label = sample_candidate_labels[idx][
+ np.argmin(edit_distances).item()
+ ]
+
+ # If no candidate labels were found, we either pick the label with the smallest
+ # word edit distance to the predicted label (if invalid model outputs are
+ # allowed), or we raise an error
+ if min(edit_distances) > 100:
+ if dataset_config.task.allow_invalid_model_outputs:
+ logger.warning(
+ "No candidate labels found for the predicted label "
+ f"{predicted_label!r}, out of the candidate labels "
+ f"{sample_candidate_labels[idx]}. This likely means that the model "
+ "output is completely off, but since invalid model outputs are "
+ "allowed for this task, we will use the closest candidate label "
+ f"({best_candidate_label})) as the output label. If you see this "
+ "warning very often, please report this issue to the EuroEval "
+ "team at github.com/EuroEval/EuroEval/issues."
+ )
+ logger.debug(
+ "The candidate labels were extracted from the prompt: "
+ f"{input_batch['text'][idx]!r}."
+ )
+ else:
+ raise InvalidBenchmark(
+ "No candidate labels found for the predicted label "
+ f"{predicted_label!r}, out of the candidate labels "
+ f"{sample_candidate_labels[idx]}. This likely means that the model "
+ "output is completely off, and we cannot extract any labels from "
+ "it. Please check the model output and the candidate labels. The "
+ "candidate labels were extracted from the prompt: "
+ f"{input_batch['text'][idx]!r}."
+ )
+
+ new_predicted_labels.append(best_candidate_label)
 
  return new_predicted_labels
 
 
  def get_closest_logprobs_labels(
  generation_logprobs: list[list[list[tuple[str, float]]]],
- dataset_config: "DatasetConfig",
  first_label_token_mapping: dict[str, str] | t.Literal[True],
+ candidate_labels: list[list[str]],
  ) -> list[str] | None:
  """Get the labels with the highest predicted logprob value.
 
@@ -166,11 +246,11 @@ def get_closest_logprobs_labels(
  generation_logprobs:
  The logprobs of the generated tokens, for all samples in the batch. Of shape
  (batch_size, num_tokens, num_logprobs).
- dataset_config:
- The configuration of the dataset.
  first_label_token_mapping:
  A mapping from labels to the first token in each label, or alternatively a
  `True` value indicating that the model should output logprobs.
+ candidate_labels:
+ The candidate labels for each sample in the batch.
 
  Returns:
  The predicted labels, or None if labels could not be extracted.
@@ -179,19 +259,11 @@ def get_closest_logprobs_labels(
  InvalidBenchmark:
  If no candidate label can be found for any of the generated labels.
  """
- english_labels = list(dataset_config.id2label.values())
- english2local = dataset_config.prompt_label_mapping
- candidate_labels = [english2local[lbl].lower() for lbl in english_labels]
-
  output_labels: list[str] = list()
- for sample in generation_logprobs:
+ for idx, sample in enumerate(generation_logprobs):
  for logprob_list in sample:
  generated_labels = [
- re.sub(
- pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
- repl="",
- string=label.lower(),
- )
+ re.sub(pattern=r"^[^a-zæøåüöä0-9]+$", repl="", string=label.lower())
  for label, _ in logprob_list
  ]
  generated_labels = [label for label in generated_labels if label != ""]
@@ -206,7 +278,7 @@ def get_closest_logprobs_labels(
  if isinstance(first_label_token_mapping, dict):
  if any(
  candidate_label not in first_label_token_mapping
- for candidate_label in candidate_labels
+ for candidate_label in candidate_labels[idx]
  ):
  raise InvalidBenchmark(
  "There is a label not present in the first label token "
@@ -217,16 +289,28 @@ def get_closest_logprobs_labels(
 
  candidate_output_labels = {
  candidate_label
- for candidate_label in candidate_labels
+ for candidate_label in candidate_labels[idx]
  if generated_label == first_label_token_mapping[candidate_label]
  }
  else:
  candidate_output_labels = {
  candidate_label
- for candidate_label in candidate_labels
+ for candidate_label in candidate_labels[idx]
  if candidate_label.startswith(generated_label)
  }
 
+ # If the generated label is a numeral (e.g., "1", "2", "3") and there is
+ # a matching candidate label, we only keep the full match
+ if re.match(r"^\d+$", generated_label) and any(
+ candidate_label == generated_label
+ for candidate_label in candidate_output_labels
+ ):
+ candidate_output_labels = {
+ candidate_label
+ for candidate_label in candidate_output_labels
+ if candidate_label == generated_label
+ }
+
  # If we can uniquely determine the output label, we break the loop.
  if len(candidate_output_labels) == 1:
  output_label = candidate_output_labels.pop()
@@ -257,16 +341,18 @@ def get_closest_logprobs_labels(
  elif len(candidate_output_labels) == 0:
  candidate_output_labels_starting_with_generated_label = [
  candidate_label
- for candidate_label in candidate_labels
+ for candidate_label in candidate_labels[idx]
  if candidate_label.startswith(generated_label)
  ]
  if candidate_output_labels_starting_with_generated_label:
  log_once(
  f"No candidate label found for the generated label "
- f"{generated_label!r}. This means that using logprobs to "
- "extract the labels is not reliable, and we will instead "
- "fall back to extracting the labels using word edit "
- "distance.",
+ f"{generated_label!r}, but there are candidate labels "
+ f"starting with it: "
+ f"{candidate_output_labels_starting_with_generated_label}. "
+ "This means that the first label token mapping is not "
+ "reliable, and we will instead fall back to extracting "
+ "the labels using word edit distance.",
  level=logging.DEBUG,
  )
  return None
@@ -291,18 +377,18 @@ def get_closest_logprobs_labels(
  if len(sample) == 0:
  log_once(
  "The model outputted an empty string, so no candidate labels could "
- f"be determined. Using {candidate_labels[0]!r} as the output "
- "label.",
- level=logging.DEBUG,
+ "be determined. Using the first label, "
+ f"{candidate_labels[idx][0]!r}, as the output label.",
+ level=logging.INFO,
  )
  else:
  log_once(
  "Could not find a candidate label for any of the generated "
- f"labels in the sample {sample}. Using {candidate_labels[0]!r} "
- "as the output label.",
- level=logging.DEBUG,
+ f"labels in the sample {sample}. Using the first label, "
+ f"{candidate_labels[idx][0]!r}, as the output label.",
+ level=logging.INFO,
  )
- output_labels.append(candidate_labels[0])
+ output_labels.append(candidate_labels[idx][0])
 
  assert len(output_labels) == len(generation_logprobs)
  return output_labels
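
Note on the weighted edit distance introduced above: deletions are cheap while insertions and substitutions cost 1000, so a candidate label only gets a low distance when it already appears essentially verbatim inside the model output; the min(edit_distances) > 100 check then flags outputs that contain no candidate at all. A small sketch using the Levenshtein package, with a made-up model output and made-up labels:

import Levenshtein

predicted = "The sentiment of the review is clearly positive."  # hypothetical model output
candidates = ["positive", "negative", "neutral"]                # hypothetical candidate labels

# (insertion, deletion, substitution) weights: only deletions are cheap
weights = (1000, 1, 1000)

edit_distances = [
    Levenshtein.distance(predicted.lower(), candidate.lower(), weights=weights)
    for candidate in candidates
]
print(edit_distances)
# "positive" occurs verbatim in the output, so its distance is just the number of
# deleted surrounding characters (about 40); the other labels need at least one
# insertion or substitution and land well above the 100 threshold.
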