EuroEval 15.16.0__py3-none-any.whl → 16.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (63)
  1. euroeval/__init__.py +3 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +161 -114
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +13 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +53 -7
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +38 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +46 -14
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +234 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  41. euroeval/prompt_templates/multiple_choice.py +23 -2
  42. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  43. euroeval/prompt_templates/reading_comprehension.py +42 -2
  44. euroeval/prompt_templates/sentiment_classification.py +46 -2
  45. euroeval/prompt_templates/summarization.py +24 -4
  46. euroeval/scores.py +7 -2
  47. euroeval/speed_benchmark.py +6 -6
  48. euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  49. euroeval/task_group_utils/question_answering.py +35 -28
  50. euroeval/task_group_utils/sequence_classification.py +96 -23
  51. euroeval/task_group_utils/text_to_text.py +7 -3
  52. euroeval/task_group_utils/token_classification.py +47 -75
  53. euroeval/tasks.py +31 -6
  54. euroeval/tokenization_utils.py +295 -207
  55. euroeval/utils.py +118 -34
  56. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +11 -14
  57. euroeval-16.0.0.dist-info/RECORD +69 -0
  58. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
  59. euroeval/human_evaluation.py +0 -738
  60. euroeval/metrics.py +0 -470
  61. euroeval-15.16.0.dist-info/RECORD +0 -63
  62. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/scores.py CHANGED
@@ -52,7 +52,12 @@ def log_scores(
         test_se, test_se_str = metric.postprocessing_fn(test_se)
         total_dict[f"test_{metric.name}"] = test_score
         total_dict[f"test_{metric.name}_se"] = test_se
-        logger.info(f"{metric.pretty_name}: {test_score_str} ± {test_se_str}")
+        log_str = (
+            f"{metric.pretty_name}: {test_score_str} ± {test_se_str}"
+            if not np.isnan(test_se)
+            else f"{metric.pretty_name}: {test_score_str}"
+        )
+        logger.info(log_str)

     return dict(raw=scores, total=total_dict)

@@ -84,7 +89,7 @@ def aggregate_scores(

     if len(test_scores) > 1:
         sample_std = np.std(test_scores, ddof=1)
-        test_se = sample_std / np.sqrt(len(test_scores))
+        test_se = (sample_std / np.sqrt(len(test_scores))).item()
     else:
         test_se = np.nan

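Taken together, the two scores.py changes make score aggregation more robust: the standard error becomes a plain Python float via .item(), and the "± se" suffix is only logged when a standard error actually exists. A minimal standalone sketch with hypothetical scores (not EuroEval's API):

    import numpy as np

    test_scores = [0.71, 0.74, 0.69]  # hypothetical per-iteration scores
    if len(test_scores) > 1:
        sample_std = np.std(test_scores, ddof=1)
        # .item() turns the NumPy scalar into a plain float, e.g. for JSON output
        test_se = (sample_std / np.sqrt(len(test_scores))).item()
    else:
        test_se = np.nan  # a single iteration has no standard error

    test_score = float(np.mean(test_scores))
    log_str = (
        f"Accuracy: {test_score:.2%} ± {test_se:.2%}"
        if not np.isnan(test_se)
        else f"Accuracy: {test_score:.2%}"
    )
    print(log_str)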

euroeval/speed_benchmark.py CHANGED
@@ -59,7 +59,7 @@ def benchmark_speed_single_iteration(
     Returns:
         A dictionary containing the scores for the current iteration.
     """
-    gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)
+    gpt2_tokeniser = AutoTokenizer.from_pretrained("gpt2", trust_remote_code=True)

     base_doc = "Document which contains roughly 10 tokens. "
     multiplier = 10 * (1 + itr_idx)
@@ -74,11 +74,11 @@ def benchmark_speed_single_iteration(
             model.generate(inputs=dict(text=[doc]))

         def encoder_predict(doc: str) -> None:
-            tokenizer = model.get_tokenizer()
+            tokeniser = model.get_tokeniser()
             pytorch_model = model.get_pytorch_module()
             inputs = {
                 key: tensor.to(pytorch_model.device)
-                for key, tensor in tokenizer(
+                for key, tensor in tokeniser(
                     text=[doc], truncation=True, return_tensors="pt"
                 ).items()
             }
@@ -102,21 +102,21 @@ def benchmark_speed_single_iteration(
         speed_scores = pyinfer.InferenceReport(
             model=predict, inputs=doc, n_seconds=3
         ).run(print_report=False)
-        num_gpt2_tokens = len(gpt2_tokenizer([doc], truncation=True)["input_ids"][0])
+        num_gpt2_tokens = len(gpt2_tokeniser([doc], truncation=True)["input_ids"][0])
         gpt2_tokens_per_second = speed_scores["Infer(p/sec)"] * num_gpt2_tokens

         speed_scores_short = pyinfer.InferenceReport(
             model=predict, inputs=short_doc, n_seconds=3
         ).run(print_report=False)
         num_gpt2_tokens_short = len(
-            gpt2_tokenizer([short_doc], truncation=True)["input_ids"][0]
+            gpt2_tokeniser([short_doc], truncation=True)["input_ids"][0]
         )
         gpt2_tokens_per_second_short = (
             speed_scores_short["Infer(p/sec)"] * num_gpt2_tokens_short
         )

     except (RuntimeError, ValueError, IndexError) as e:
-        raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}")
+        raise InvalidBenchmark(f"Speed benchmark failed with error: {e!r}") from e

     return dict(
         test_speed=gpt2_tokens_per_second, test_speed_short=gpt2_tokens_per_second_short
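The speed benchmark itself is unchanged apart from the renames and the exception chaining: throughput is measured as inferences per second and then scaled by the GPT-2 token count of the benchmarked document. A rough sketch of that conversion, where the infer_per_sec value is a stand-in for pyinfer's "Infer(p/sec)" figure:

    from transformers import AutoTokenizer

    gpt2_tokeniser = AutoTokenizer.from_pretrained("gpt2")
    doc = "Document which contains roughly 10 tokens. " * 10
    num_gpt2_tokens = len(gpt2_tokeniser([doc], truncation=True)["input_ids"][0])

    infer_per_sec = 4.2  # hypothetical throughput in documents per second
    gpt2_tokens_per_second = infer_per_sec * num_gpt2_tokens
    print(f"{gpt2_tokens_per_second:.0f} GPT-2 tokens per second")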

euroeval/task_group_utils/multiple_choice_classification.py CHANGED
@@ -94,15 +94,15 @@ class MultipleChoiceClassificationTrainer(Trainer):


 def prepare_examples(
-    examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare the features.

     Args:
         examples:
             The examples to prepare.
-        tokenizer:
-            The tokenizer to use to prepare the examples.
+        tokeniser:
+            The tokeniser to use to prepare the examples.

     Returns:
         The prepared examples.
@@ -110,11 +110,22 @@ def prepare_examples(
     doc: str = examples["text"][0]
     sections = doc.split("\n")

-    choice_idxs = [
+    candidate_choice_idxs = [
         idx
         for idx, section in enumerate(sections)
-        if re.match(pattern=r"^[a-e]\. ", string=section) is not None
+        if re.match(pattern=r"^[a-z0-9]+\. ", string=section) is not None
     ]
+
+    # Sometimes the question itself starts with a letter or number followed by a dot, We
+    # want to ignore these cases, and focus on the final contingent block of at least
+    # two choices.
+    choice_idxs: list[int] = list()
+    for idx in reversed(candidate_choice_idxs):
+        if len(choice_idxs) < 2 or (
+            len(choice_idxs) >= 2 and idx == choice_idxs[-1] - 1
+        ):
+            choice_idxs.append(idx)
+
     choices = [sections[idx] for idx in choice_idxs]

     # Check that the choices are present, and that all of them are at the end
@@ -127,7 +138,7 @@ def prepare_examples(
     question_idx = min(choice_idxs) - 2  # -2 to remove the 'Choices:' line
     context_and_question = "\n".join(sections[: question_idx + 1]).strip()

-    new_examples = tokenizer(
+    new_examples = tokeniser(
         text=[context_and_question] * len(choices),
         text_pair=[choice[3:] for choice in choices],
         padding=True,
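The new choice-detection logic matches any letter-or-number prefix followed by a dot and then walks the candidate indices backwards, keeping only the final contiguous block, so a question that itself starts with something like "1. " is no longer mistaken for an answer option. A small self-contained illustration with hypothetical prompt sections:

    import re

    sections = ["1. Which answer is correct?", "Choices:", "a. first", "b. second", "c. third"]

    candidate_choice_idxs = [
        idx
        for idx, section in enumerate(sections)
        if re.match(pattern=r"^[a-z0-9]+\. ", string=section) is not None
    ]  # -> [0, 2, 3, 4]

    # Keep only the final contiguous block of at least two choices
    choice_idxs: list[int] = list()
    for idx in reversed(candidate_choice_idxs):
        if len(choice_idxs) < 2 or (
            len(choice_idxs) >= 2 and idx == choice_idxs[-1] - 1
        ):
            choice_idxs.append(idx)

    print(choice_idxs)  # [4, 3, 2] -- the question line at index 0 is ignored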

euroeval/task_group_utils/question_answering.py CHANGED
@@ -23,7 +23,7 @@ if t.TYPE_CHECKING:
     from transformers.trainer_utils import EvalPrediction
     from transformers.training_args import TrainingArguments

-    from ..data_models import DatasetConfig, GenerativeModelOutput
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions

 logger = logging.getLogger("euroeval")
@@ -57,7 +57,7 @@ class QuestionAnsweringTrainer(Trainer):
             **kwargs,
         )

-        # Get the CLS token id for the tokenizer
+        # Get the CLS token id for the tokeniser
         if self.tokenizer is not None:
             assert isinstance(self.tokenizer, PreTrainedTokenizerBase)
             special_token_metadata = get_special_token_metadata(self.tokenizer)
@@ -149,6 +149,7 @@ class QuestionAnsweringTrainer(Trainer):
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     dataset: "Dataset",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
@@ -159,6 +160,8 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
+        benchmark_config:
+            The configuration of the benchmark.
         dataset:
             The dataset used for evaluation. This is only used in case any additional
             metadata is used to compute the metrics.
@@ -186,7 +189,11 @@ def compute_metrics(
     results: dict[str, float] = dict()
     for metric in dataset_config.task.metrics:
         score: float | None = metric(
-            predictions=predictions, references=labels, dataset=dataset
+            predictions=predictions,
+            references=labels,
+            dataset=dataset,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
         )

         # The metric returns None if we are running on multi-GPU and the current
@@ -221,15 +228,15 @@ def extract_labels_from_generation(


 def prepare_train_examples(
-    examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare the features for training.

     Args:
         examples:
             The examples to prepare.
-        tokenizer:
-            The tokenizer to use to prepare the examples.
+        tokeniser:
+            The tokeniser to use to prepare the examples.

     Returns:
         The prepared examples.
@@ -239,15 +246,15 @@ def prepare_train_examples(
     # take a lots of space). So we remove that left whitespace
     examples["question"] = [q.lstrip() for q in examples["question"]]

-    # Extract special token metadata from the tokenizer
-    special_token_metadata = get_special_token_metadata(tokenizer=tokenizer)
+    # Extract special token metadata from the tokeniser
+    special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
     has_cls_token = special_token_metadata["has_cls_token"]
     has_sep_token = special_token_metadata["has_sep_token"]
     cls_token_id = special_token_metadata["cls_token_id"]
     cls_token = special_token_metadata["cls_token"]
     sep_token = special_token_metadata["sep_token"]

-    # If the tokenizer is not adding special tokens, then we add them manually
+    # If the tokeniser is not adding special tokens, then we add them manually
     if not has_cls_token and not has_sep_token:
         examples["question"] = [
             f"{cls_token}{q}{sep_token}" for q in examples["question"]
@@ -258,18 +265,18 @@ def prepare_train_examples(
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
-    max_question_tokens = max(len(tokenizer(q).input_ids) for q in examples["question"])
+    max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
-    stride = tokenizer.model_max_length // 4
-    max_length = tokenizer.model_max_length - stride
+    stride = tokeniser.model_max_length // 4
+    max_length = tokeniser.model_max_length - stride
     stride = min(stride, max_length - max_question_tokens - num_special_tokens)
-    max_length = tokenizer.model_max_length - stride
+    max_length = tokeniser.model_max_length - stride

     # Tokenize our examples with truncation and padding, but keep the overflows using a
     # stride. This results in one example possible giving several features when a
     # context is long, each of those features having a context that overlaps a bit the
     # context of the previous feature.
-    tokenized_examples = tokenizer(
+    tokenized_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
         truncation="only_second",
@@ -306,9 +313,9 @@ def prepare_train_examples(
         sequence_ids = tokenized_examples.sequence_ids(i)

         # Manually ensure that the special tokens are set to None in `sequence_ids`
-        for special_token in tokenizer.special_tokens_map.keys():
-            if hasattr(tokenizer, f"{special_token}_id"):
-                special_token_id = getattr(tokenizer, f"{special_token}_id")
+        for special_token in tokeniser.special_tokens_map.keys():
+            if hasattr(tokeniser, f"{special_token}_id"):
+                special_token_id = getattr(tokeniser, f"{special_token}_id")
                 if special_token_id is not None:
                     sequence_ids = [
                         None if token_id == special_token_id else seq_id
@@ -373,15 +380,15 @@ def prepare_train_examples(


 def prepare_test_examples(
-    examples: "BatchEncoding", tokenizer: "PreTrainedTokenizer"
+    examples: "BatchEncoding", tokeniser: "PreTrainedTokenizer"
 ) -> "BatchEncoding":
     """Prepare test examples.

     Args:
         examples:
             Dictionary of test examples.
-        tokenizer:
-            The tokenizer used to preprocess the examples.
+        tokeniser:
+            The tokeniser used to preprocess the examples.

     Returns:
         The prepared test examples.
@@ -391,14 +398,14 @@ def prepare_test_examples(
     # take a lots of space). So we remove that left whitespace
     examples["question"] = [q.lstrip() for q in examples["question"]]

-    # Extract special token metadata from the tokenizer
-    special_token_metadata = get_special_token_metadata(tokenizer=tokenizer)
+    # Extract special token metadata from the tokeniser
+    special_token_metadata = get_special_token_metadata(tokeniser=tokeniser)
     has_cls_token = special_token_metadata["has_cls_token"]
     has_sep_token = special_token_metadata["has_sep_token"]
     cls_token = special_token_metadata["cls_token"]
     sep_token = special_token_metadata["sep_token"]

-    # If the tokenizer is not adding special tokens, then we add them manually
+    # If the tokeniser is not adding special tokens, then we add them manually
     if not has_cls_token and not has_sep_token:
         examples["question"] = [
             f"{cls_token}{q}{sep_token}" for q in examples["question"]
@@ -409,18 +416,18 @@ def prepare_test_examples(
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
-    max_question_tokens = max(len(tokenizer(q).input_ids) for q in examples["question"])
+    max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
-    stride = tokenizer.model_max_length // 4
-    max_length = tokenizer.model_max_length - stride
+    stride = tokeniser.model_max_length // 4
+    max_length = tokeniser.model_max_length - stride
     stride = min(stride, max_length - max_question_tokens - num_special_tokens)
-    max_length = tokenizer.model_max_length - stride
+    max_length = tokeniser.model_max_length - stride

     # Tokenize our examples with truncation and maybe padding, but keep the overflows
     # using a stride. This results in one example possible giving several features when
     # a context is long, each of those features having a context that overlaps a bit
     # the context of the previous feature.
-    tokenized_examples = tokenizer(
+    tokenized_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
         truncation="only_second",
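The stride/max-length computation in prepare_train_examples and prepare_test_examples is unchanged apart from the tokeniser rename; a worked example with hypothetical numbers makes the two-step adjustment easier to follow:

    # Hypothetical values, mirroring the arithmetic above
    model_max_length = 512
    max_question_tokens = 300
    num_special_tokens = 2

    stride = model_max_length // 4          # 128
    max_length = model_max_length - stride  # 384
    stride = min(stride, max_length - max_question_tokens - num_special_tokens)  # min(128, 82) = 82
    max_length = model_max_length - stride  # 430
    print(stride, max_length)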

euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -7,6 +7,7 @@ import typing as t
 import Levenshtein
 import numpy as np

+from ..enums import TaskGroup
 from ..exceptions import InvalidBenchmark
 from ..utils import log_once, raise_if_model_output_contains_nan_values

@@ -14,7 +15,7 @@ if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
     from transformers.trainer_utils import EvalPrediction

-    from ..data_models import DatasetConfig, GenerativeModelOutput
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions


@@ -24,6 +25,7 @@ logger = logging.getLogger("euroeval")
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     dataset: "Dataset",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
@@ -34,6 +36,8 @@ def compute_metrics(
             contains the true labels.
         dataset_config:
             The configuration of the dataset.
+        benchmark_config:
+            The configuration of the benchmark.
         dataset:
             The dataset used for evaluation. This is only used in case any additional
             metadata is used to compute the metrics.
@@ -79,7 +83,11 @@ def compute_metrics(
     results: dict[str, float] = dict()
     for metric in dataset_config.task.metrics:
         score: float | None = metric(
-            predictions=predictions, references=label_ids, dataset=dataset
+            predictions=predictions,
+            references=label_ids,
+            dataset=dataset,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
         )

         # The metric returns None if we are running on multi-GPU and the current
@@ -113,6 +121,12 @@ def extract_labels_from_generation(

     Returns:
         The predicted labels.
+
+    Raises:
+        InvalidBenchmark:
+            If the task requires log probabilities, but the model did not output them,
+            or if the model outputted log probabilities but the first label token
+            mapping is not provided.
     """
     if model_output.scores is not None:
         if first_label_token_mapping is False:
@@ -127,25 +141,74 @@
         )
         if labels is not None:
             return labels
+        elif dataset_config.task.requires_logprobs:
+            raise InvalidBenchmark(
+                "This task requires the model to output logprobs, and this model "
+                "does not seem to be able to do that. Skipping the evaluation."
+            )

+    # Get the candidate labels, which are the labels that the model can predict
     candidate_labels = [
         dataset_config.prompt_label_mapping[lbl]
         for lbl in dataset_config.id2label.values()
     ]
+
     new_predicted_labels: list[str] = list()
-    for predicted_label in model_output.sequences:
+    for idx, predicted_label in enumerate(model_output.sequences):
+        # Special case if we are doing multiple choice classification: we in this case
+        # dynamically change the candidate labels to the labels mentioned in the prompt
+        if dataset_config.task.task_group == TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+            prompt = input_batch["text"][idx]
+            sample_candidate_labels = [
+                candidate_label
+                for candidate_label in candidate_labels
+                if re.search(
+                    pattern=rf"\b{candidate_label}. ",
+                    string=prompt,
+                    flags=re.IGNORECASE,
+                )
+                is not None
+            ]
+        else:
+            sample_candidate_labels = candidate_labels
+
         # If the prediction includes a boxed answer, use that instead of the full
         # generation
         if (m := re.search(r"boxed\{(.*?)\}", predicted_label)) is not None:
             predicted_label = m.group(1)

-        # Pick the label with the smallest word edit distance to the predicted label
+        # We set the word edit distance weights such that we heavily penalise insertions
+        # and substitutions, so that we don't just insert the correct label, but that we
+        # want the model to have included the correct label in its output.
+        insertion_weight = 1000
+        deletion_weight = 1
+        substitution_weight = 1000
+
+        # Compute the word edit distances between the predicted label and all candidate
+        # labels
         edit_distances = [
-            Levenshtein.distance(s1=predicted_label.lower(), s2=candidate_label.lower())
-            for candidate_label in candidate_labels
+            Levenshtein.distance(
+                s1=predicted_label.lower(),
+                s2=candidate_label.lower(),
+                weights=(insertion_weight, deletion_weight, substitution_weight),
+            )
+            for candidate_label in sample_candidate_labels
         ]
-        predicted_label = candidate_labels[np.argmin(edit_distances).item()]
-        new_predicted_labels.append(predicted_label)
+
+        # If no candidate labels were found, we assume that something is wrong with the
+        # model output, and we raise an error
+        if min(edit_distances) > 100:
+            raise InvalidBenchmark(
+                "No candidate labels found for the predicted label "
+                f"{predicted_label!r}, out of the candidate labels "
+                f"{sample_candidate_labels}. This likely means that the model output "
+                "is completely off, and we cannot extract any labels from it. Please "
+                "check the model output and the candidate labels."
+            )
+
+        # Pick the label with the smallest word edit distance to the predicted label
+        best_candidate_label = sample_candidate_labels[np.argmin(edit_distances).item()]
+        new_predicted_labels.append(best_candidate_label)

     return new_predicted_labels

@@ -187,11 +250,7 @@ def get_closest_logprobs_labels(
     for sample in generation_logprobs:
         for logprob_list in sample:
             generated_labels = [
-                re.sub(
-                    pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$",
-                    repl="",
-                    string=label.lower(),
-                )
+                re.sub(pattern=r"^[^a-zæøåüöä0-9]+$", repl="", string=label.lower())
                 for label, _ in logprob_list
             ]
             generated_labels = [label for label in generated_labels if label != ""]
@@ -227,6 +286,18 @@ def get_closest_logprobs_labels(
                    if candidate_label.startswith(generated_label)
                }

+                # If the generated label is a numeral (e.g., "1", "2", "3") and there is
+                # a matching candidate label, we only keep the full match
+                if re.match(r"^\d+$", generated_label) and any(
+                    candidate_label == generated_label
+                    for candidate_label in candidate_output_labels
+                ):
+                    candidate_output_labels = {
+                        candidate_label
+                        for candidate_label in candidate_output_labels
+                        if candidate_label == generated_label
+                    }
+
                # If we can uniquely determine the output label, we break the loop.
                if len(candidate_output_labels) == 1:
                    output_label = candidate_output_labels.pop()
@@ -263,10 +334,12 @@ def get_closest_logprobs_labels(
                if candidate_output_labels_starting_with_generated_label:
                    log_once(
                        f"No candidate label found for the generated label "
-                        f"{generated_label!r}. This means that using logprobs to "
-                        "extract the labels is not reliable, and we will instead "
-                        "fall back to extracting the labels using word edit "
-                        "distance.",
+                        f"{generated_label!r}, but there are candidate labels "
+                        f"starting with it: "
+                        f"{candidate_output_labels_starting_with_generated_label}. "
+                        "This means that the first label token mapping is not "
+                        "reliable, and we will instead fall back to extracting "
+                        "the labels using word edit distance.",
                        level=logging.DEBUG,
                    )
                    return None
@@ -291,16 +364,16 @@ def get_closest_logprobs_labels(
        if len(sample) == 0:
            log_once(
                "The model outputted an empty string, so no candidate labels could "
-                f"be determined. Using {candidate_labels[0]!r} as the output "
-                "label.",
-                level=logging.DEBUG,
+                f"be determined. Using the first label, {candidate_labels[0]!r}, "
+                "as the output label.",
+                level=logging.INFO,
            )
        else:
            log_once(
                "Could not find a candidate label for any of the generated "
-                f"labels in the sample {sample}. Using {candidate_labels[0]!r} "
-                "as the output label.",
-                level=logging.DEBUG,
+                f"labels in the sample {sample}. Using the first label, "
+                f"{candidate_labels[0]!r}, as the output label.",
+                level=logging.INFO,
            )
        output_labels.append(candidate_labels[0])

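The weighted edit distance above is the most significant change in this file: with insertions and substitutions costing 1000 and deletions costing 1, a candidate label only scores low when it actually appears, in order, inside the model output. A hedged sketch using the same Levenshtein package the module imports (hypothetical strings):

    import Levenshtein

    weights = (1000, 1, 1000)  # (insertion, deletion, substitution)
    prediction = "the answer is: positive"

    # "positive" occurs inside the prediction, so only cheap deletions are needed
    print(Levenshtein.distance(prediction, "positive", weights=weights))  # 15

    # "negative" is not contained in the prediction, so at least one expensive
    # insertion/substitution is required and the distance jumps well above 1000
    print(Levenshtein.distance(prediction, "negative", weights=weights))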

euroeval/task_group_utils/text_to_text.py CHANGED
@@ -75,7 +75,11 @@ def compute_metrics(
        while True:
            try:
                score: float | None = metric(
-                    predictions=predictions, references=labels, dataset=dataset
+                    predictions=predictions,
+                    references=labels,
+                    dataset=dataset,
+                    dataset_config=dataset_config,
+                    benchmark_config=benchmark_config,
                )
                break
            except Exception as e:
@@ -85,7 +89,7 @@ def compute_metrics(
                    "MPS backend out of memory",
                ]
                if not any(error in str(e) for error in oom_error):
-                    raise InvalidBenchmark(str(e))
+                    raise InvalidBenchmark(str(e)) from e

                if (
                    isinstance(metric, HuggingFaceMetric)
@@ -98,7 +102,7 @@
                        "the CPU."
                    )
                else:
-                    raise InvalidBenchmark(str(e))
+                    raise InvalidBenchmark(str(e)) from e
            finally:
                for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
                    if hasattr(metric, attribute):
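A pattern that recurs throughout this release is re-raising wrapped errors with `raise ... from e`. A minimal sketch of what the chaining buys; the exception class here is a stand-in, not the one from euroeval.exceptions:

    class InvalidBenchmark(Exception):
        """Stand-in for euroeval.exceptions.InvalidBenchmark."""

    try:
        try:
            raise RuntimeError("CUDA out of memory")
        except RuntimeError as e:
            # Chaining keeps the original error as __cause__, so the underlying
            # runtime/OOM error still shows up in the traceback
            raise InvalidBenchmark(str(e)) from e
    except InvalidBenchmark as err:
        assert isinstance(err.__cause__, RuntimeError)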