EuroEval 16.0.1-py3-none-any.whl → 16.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of EuroEval might be problematic.

Files changed (48)
  1. euroeval/benchmark_config_factory.py +6 -1
  2. euroeval/benchmark_modules/base.py +2 -0
  3. euroeval/benchmark_modules/fresh.py +7 -1
  4. euroeval/benchmark_modules/hf.py +26 -21
  5. euroeval/benchmark_modules/litellm.py +258 -131
  6. euroeval/benchmark_modules/vllm.py +79 -40
  7. euroeval/benchmarker.py +11 -2
  8. euroeval/cli.py +14 -1
  9. euroeval/constants.py +1 -1
  10. euroeval/data_models.py +77 -6
  11. euroeval/dataset_configs/__init__.py +1 -0
  12. euroeval/dataset_configs/danish.py +14 -0
  13. euroeval/dataset_configs/dutch.py +14 -0
  14. euroeval/dataset_configs/english.py +22 -0
  15. euroeval/dataset_configs/estonian.py +15 -7
  16. euroeval/dataset_configs/finnish.py +14 -0
  17. euroeval/dataset_configs/french.py +14 -0
  18. euroeval/dataset_configs/german.py +23 -0
  19. euroeval/dataset_configs/italian.py +14 -0
  20. euroeval/dataset_configs/latvian.py +14 -0
  21. euroeval/dataset_configs/norwegian.py +14 -0
  22. euroeval/dataset_configs/polish.py +126 -0
  23. euroeval/dataset_configs/portuguese.py +14 -0
  24. euroeval/dataset_configs/spanish.py +14 -0
  25. euroeval/dataset_configs/swedish.py +25 -0
  26. euroeval/enums.py +12 -0
  27. euroeval/generation.py +17 -8
  28. euroeval/generation_utils.py +65 -11
  29. euroeval/metrics/pipeline.py +1 -1
  30. euroeval/prompt_templates/linguistic_acceptability.py +9 -0
  31. euroeval/prompt_templates/multiple_choice.py +27 -1
  32. euroeval/prompt_templates/named_entity_recognition.py +20 -0
  33. euroeval/prompt_templates/reading_comprehension.py +11 -0
  34. euroeval/prompt_templates/sentiment_classification.py +15 -0
  35. euroeval/prompt_templates/summarization.py +27 -1
  36. euroeval/scores.py +5 -0
  37. euroeval/task_group_utils/question_answering.py +29 -29
  38. euroeval/task_group_utils/sequence_classification.py +11 -34
  39. euroeval/task_group_utils/token_classification.py +3 -3
  40. euroeval/tasks.py +4 -4
  41. euroeval/{tokenization_utils.py → tokenisation_utils.py} +50 -28
  42. euroeval/utils.py +36 -3
  43. {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/METADATA +1 -1
  44. euroeval-16.1.1.dist-info/RECORD +70 -0
  45. euroeval-16.0.1.dist-info/RECORD +0 -69
  46. {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/WHEEL +0 -0
  47. {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/entry_points.txt +0 -0
  48. {euroeval-16.0.1.dist-info → euroeval-16.1.1.dist-info}/licenses/LICENSE +0 -0
euroeval/prompt_templates/linguistic_acceptability.py CHANGED
@@ -19,6 +19,7 @@ from ..languages import (
     NL,
     NN,
     NO,
+    PL,
     PT,
     SV,
 )
@@ -67,6 +68,14 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
         default_instruction_prompt="Lause: {text}\n\nOtsusta, kas lause on "
         "grammatiliselt õige või mitte. Vasta {labels_str}, ja mitte midagi muud.",
     ),
+    PL: PromptConfig(
+        default_prompt_label_mapping=dict(correct="tak", incorrect="nie"),
+        default_prompt_prefix="Poniżej znajdują się teksty i czy są "
+        "gramatycznie poprawne.",
+        default_prompt_template="Tekst: {text}\nGramatycznie poprawny: {label}",
+        default_instruction_prompt="Tekst: {text}\n\nOkreśl czy tekst jest "
+        "gramatycznie poprawny czy nie. Odpowiedz {labels_str}, i nic więcej.",
+    ),
     PT: PromptConfig(
         default_prompt_label_mapping=dict(correct="sim", incorrect="não"),
         default_prompt_prefix="Seguem-se abaixo textos e se são "
euroeval/prompt_templates/multiple_choice.py CHANGED
@@ -3,7 +3,25 @@
 import typing as t
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, ET, FI, FR, IS, IT, LV, NB, NL, NN, NO, PT, SV
+from ..languages import (
+    DA,
+    DE,
+    EN,
+    ES,
+    ET,
+    FI,
+    FR,
+    IS,
+    IT,
+    LV,
+    NB,
+    NL,
+    NN,
+    NO,
+    PL,
+    PT,
+    SV,
+)
 
 if t.TYPE_CHECKING:
     from ..data_models import Language
@@ -123,6 +141,14 @@ MULTIPLE_CHOICE_TEMPLATES: dict["Language", PromptConfig] = {
         "{labels_str}, og ikke noe annet.",
         default_prompt_label_mapping="auto",
     ),
+    PL: PromptConfig(
+        default_prompt_prefix="Poniżej znajdują się pytania wielokrotnego wyboru "
+        "(z odpowiedziami).",
+        default_prompt_template="Pytanie: {text}\nOdpowiedź: {label}",
+        default_instruction_prompt="Pytanie: {text}\n\nOdpowiedz na powyższe pytanie, "
+        "odpowiadając {labels_str}, i nic więcej.",
+        default_prompt_label_mapping="auto",
+    ),
     SV: PromptConfig(
         default_prompt_prefix="Följande är flervalsfrågor (med svar).",
         default_prompt_template="Fråga: {text}\nSvar: {label}",
euroeval/prompt_templates/named_entity_recognition.py CHANGED
@@ -19,6 +19,7 @@ from ..languages import (
     NL,
     NN,
     NO,
+    PL,
     PT,
     SV,
 )
@@ -336,6 +337,25 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
         "Verdiene skal være lister over de navngitte enhetene "
         "av den typen, akkurat som de vises i frasen.",
     ),
+    PL: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "osoba",
+            "i-per": "osoba",
+            "b-loc": "lokalizacja",
+            "i-loc": "lokalizacja",
+            "b-org": "organizacja",
+            "i-org": "organizacja",
+            "b-misc": "różne",
+            "i-misc": "różne",
+        },
+        default_prompt_prefix="Poniżej znajdują się zdania i słowniki JSON z nazwanymi "
+        "jednostkami występującymi w danym zdaniu.",
+        default_prompt_template="Zdanie: {text}\nNazwane jednostki: {label}",
+        default_instruction_prompt="Zdanie: {text}\n\nZidentyfikuj nazwane jednostki "
+        "w zdaniu. Powinieneś wypisać to jako słownik JSON z kluczami "
+        "{labels_str}. Wartości powinny być listami nazwanych jednostek "
+        "tego typu, dokładnie tak jak pojawiają się w zdaniu.",
+    ),
     SV: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "person",
euroeval/prompt_templates/reading_comprehension.py CHANGED
@@ -19,6 +19,7 @@ from ..languages import (
     NL,
     NN,
     NO,
+    PL,
     PT,
     SV,
 )
@@ -157,6 +158,16 @@ RC_TEMPLATES: dict["Language", PromptConfig] = {
         "teksten ovenfor med maks 3 ord.\n\nSpørsmål: {question}",
         default_prompt_label_mapping=dict(),
     ),
+    PL: PromptConfig(
+        default_prompt_prefix=(
+            "Poniżej znajdują się teksty z towarzyszącymi pytaniami i odpowiedziami."
+        ),
+        default_prompt_template="Tekst: {text}\nPytanie: {question}\nOdpowiedź w "
+        "maksymalnie 3 słowach: {label}",
+        default_instruction_prompt="Tekst: {text}\n\nOdpowiedz na następujące pytanie "
+        "dotyczące powyższego tekstu w maksymalnie 3 słowach.\n\nPytanie: {question}",
+        default_prompt_label_mapping=dict(),
+    ),
     PT: PromptConfig(
         default_prompt_prefix="Os textos que se seguem são acompanhados de perguntas "
         "e respostas.",
euroeval/prompt_templates/sentiment_classification.py CHANGED
@@ -19,6 +19,7 @@ from ..languages import (
     NL,
     NN,
     NO,
+    PL,
     PT,
     SV,
 )
@@ -78,6 +79,20 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
         "meelestatuse järgi. Võimalikud vastused: {labels_str}. Muud vastused "
         "ei ole lubatud.",
     ),
+    PL: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="pozytywny", neutral="neutralny", negative="negatywny"
+        ),
+        default_prompt_prefix=(
+            "Poniżej znajdują się dokumenty i ich sentyment, który może być "
+            "{labels_str}."
+        ),
+        default_prompt_template="Dokument: {text}\nSentyment: {label}",
+        default_instruction_prompt=(
+            "Dokument: {text}\n\nKlasyfikuj sentyment w dokumencie. "
+            "Odpowiedz z {labels_str}, i nic więcej."
+        ),
+    ),
     PT: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="positivo", neutral="neutro", negative="negativo"
euroeval/prompt_templates/summarization.py CHANGED
@@ -3,7 +3,25 @@
 import typing as t
 
 from ..data_models import PromptConfig
-from ..languages import DA, DE, EN, ES, ET, FI, FR, IS, IT, LV, NB, NL, NN, NO, PT, SV
+from ..languages import (
+    DA,
+    DE,
+    EN,
+    ES,
+    ET,
+    FI,
+    FR,
+    IS,
+    IT,
+    LV,
+    NB,
+    NL,
+    NN,
+    NO,
+    PL,
+    PT,
+    SV,
+)
 
 if t.TYPE_CHECKING:
     from ..data_models import Language
@@ -122,6 +140,14 @@ SUMM_TEMPLATES: dict["Language", PromptConfig] = {
         "dokumentet ovenfor.",
         default_prompt_label_mapping=dict(),
     ),
+    PL: PromptConfig(
+        default_prompt_prefix="Poniżej znajdują się artykuły z towarzyszącymi "
+        "streszczeniami.",
+        default_prompt_template="Artykuł: {text}\nStreszczenie: {target_text}",
+        default_instruction_prompt="Artykuł: {text}\n\nNapisz streszczenie "
+        "powyższego artykułu.",
+        default_prompt_label_mapping=dict(),
+    ),
     SV: PromptConfig(
         default_prompt_prefix="Nedan följer dokument med tillhörande sammanfattningar.",
         default_prompt_template="Dokument: {text}\nSammanfattning: {target_text}",
euroeval/scores.py CHANGED
@@ -19,6 +19,7 @@ def log_scores(
     scores: list[dict[str, float]],
     model_id: str,
     model_revision: str,
+    model_param: str | None,
 ) -> "ScoreDict":
     """Log the scores.
 
@@ -34,6 +35,8 @@
             The model ID of the model that was evaluated.
         model_revision:
             The revision of the model.
+        model_param:
+            The model parameter, if any.
 
     Returns:
         A dictionary with keys 'raw_scores' and 'total', with 'raw_scores' being
@@ -42,6 +45,8 @@
     """
     if model_revision and model_revision != "main":
         model_id += f"@{model_revision}"
+    if model_param is not None:
+        model_id += f"#{model_param}"
 
     logger.info(f"Finished evaluation of {model_id} on {dataset_name}.")
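The net effect of the scores.py change is that an optional model parameter is appended to the logged model ID with a "#" separator, alongside the existing "@revision" suffix. A self-contained sketch of that composition logic (the example IDs and values are made up):

# Sketch of the identifier composition added above; example values are invented,
# only the "@revision" / "#param" suffix logic mirrors the diff.
def compose_logged_model_id(
    model_id: str, model_revision: str, model_param: str | None
) -> str:
    if model_revision and model_revision != "main":
        model_id += f"@{model_revision}"
    if model_param is not None:
        model_id += f"#{model_param}"
    return model_id

print(compose_logged_model_id("my-org/my-model", "v2.0", None))   # my-org/my-model@v2.0
print(compose_logged_model_id("my-org/my-model", "main", "slow")) # my-org/my-model#slow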
euroeval/task_group_utils/question_answering.py CHANGED
@@ -10,7 +10,7 @@ from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 from transformers.trainer import Trainer
 
 from ..exceptions import InvalidBenchmark
-from ..tokenization_utils import get_special_token_metadata
+from ..tokenisation_utils import get_special_token_metadata
 from ..utils import raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
@@ -261,7 +261,7 @@ def prepare_train_examples(
     ]
     examples["context"] = [f"{c}{sep_token}" for c in examples["context"]]
 
-    # Set the stride used during tokenization, when the context is long enough to be
+    # Set the stride used during tokenisation, when the context is long enough to be
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
@@ -272,11 +272,11 @@
     stride = min(stride, max_length - max_question_tokens - num_special_tokens)
     max_length = tokeniser.model_max_length - stride
 
-    # Tokenize our examples with truncation and padding, but keep the overflows using a
+    # Tokenise our examples with truncation and padding, but keep the overflows using a
     # stride. This results in one example possible giving several features when a
     # context is long, each of those features having a context that overlaps a bit the
     # context of the previous feature.
-    tokenized_examples = tokeniser(
+    tokenised_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
         truncation="only_second",
@@ -290,27 +290,27 @@
     # Since one example might give us several features if it has a long context, we
     # need a map from a feature to its corresponding example. This key gives us just
     # that
-    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+    sample_mapping = tokenised_examples.pop("overflow_to_sample_mapping")
 
     # The offset mappings will give us a map from token to character position in the
     # original context. This will help us compute the start_positions and
     # end_positions.
-    offset_mapping = tokenized_examples.pop("offset_mapping")
+    offset_mapping = tokenised_examples.pop("offset_mapping")
 
     # Initialise the start- and end positions of the answers
-    tokenized_examples["start_positions"] = list()
-    tokenized_examples["end_positions"] = list()
+    tokenised_examples["start_positions"] = list()
+    tokenised_examples["end_positions"] = list()
 
     for i, offsets in enumerate(offset_mapping):
         # Get the input IDs for the current example
-        input_ids = tokenized_examples.input_ids[i]
+        input_ids = tokenised_examples.input_ids[i]
 
         # We will label impossible answers with the index of the CLS token
         cls_index = input_ids.index(cls_token_id)
 
         # Grab the sequence corresponding to that example (to know what is the context
         # and what is the question).
-        sequence_ids = tokenized_examples.sequence_ids(i)
+        sequence_ids = tokenised_examples.sequence_ids(i)
 
         # Manually ensure that the special tokens are set to None in `sequence_ids`
         for special_token in tokeniser.special_tokens_map.keys():
@@ -329,8 +329,8 @@
 
         # If no answers are given, set the cls_index as answer.
         if len(answers["answer_start"]) == 0:
-            tokenized_examples.start_positions.append(cls_index)
-            tokenized_examples.end_positions.append(cls_index)
+            tokenised_examples.start_positions.append(cls_index)
+            tokenised_examples.end_positions.append(cls_index)
 
         else:
             # Start/end character index of the answer in the text.
@@ -353,8 +353,8 @@
                 offsets[token_start_index][0] <= start_char
                 and offsets[token_end_index][1] >= end_char
             ):
-                tokenized_examples.start_positions.append(cls_index)
-                tokenized_examples.end_positions.append(cls_index)
+                tokenised_examples.start_positions.append(cls_index)
+                tokenised_examples.end_positions.append(cls_index)
 
             # Otherwise move the token_start_index and token_end_index to the two ends
             # of the answer. Note: we could go after the last offset if the answer is
@@ -366,17 +366,17 @@
                 ):
                     token_start_index += 1
                 token_start_index -= 1
-                tokenized_examples.start_positions.append(token_start_index)
+                tokenised_examples.start_positions.append(token_start_index)
                 while (
                     token_start_index <= token_end_index
                     and offsets[token_end_index][1] >= end_char
                 ):
                     token_end_index -= 1
                 token_end_index += 1
-                tokenized_examples.end_positions.append(token_end_index)
+                tokenised_examples.end_positions.append(token_end_index)
                 assert token_end_index >= token_start_index
 
-    return tokenized_examples
+    return tokenised_examples
 
 
 def prepare_test_examples(
@@ -394,7 +394,7 @@
         The prepared test examples.
     """
     # Some of the questions have lots of whitespace on the left, which is not useful
-    # and will make the truncation of the context fail (the tokenized question will
+    # and will make the truncation of the context fail (the tokenised question will
     # take a lots of space). So we remove that left whitespace
     examples["question"] = [q.lstrip() for q in examples["question"]]
 
@@ -412,7 +412,7 @@
     ]
     examples["context"] = [f"{c}{sep_token}" for c in examples["context"]]
 
-    # Set the stride used during tokenization, when the context is long enough to be
+    # Set the stride used during tokenisation, when the context is long enough to be
     # split into several features. Since we are always keeping the question tokens, we
     # need to make sure that the stride does not exceed the resulting maximum context
     # length.
@@ -423,11 +423,11 @@
     stride = min(stride, max_length - max_question_tokens - num_special_tokens)
     max_length = tokeniser.model_max_length - stride
 
-    # Tokenize our examples with truncation and maybe padding, but keep the overflows
+    # Tokenise our examples with truncation and maybe padding, but keep the overflows
     # using a stride. This results in one example possible giving several features when
     # a context is long, each of those features having a context that overlaps a bit
     # the context of the previous feature.
-    tokenized_examples = tokeniser(
+    tokenised_examples = tokeniser(
         text=examples["question"],
         text_pair=examples["context"],
         truncation="only_second",
@@ -441,30 +441,30 @@
     # Since one example might give us several features if it has a long context, we
     # need a map from a feature to its corresponding example. This key gives us just
     # that.
-    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
+    sample_mapping = tokenised_examples.pop("overflow_to_sample_mapping")
 
     # We keep the id that gave us this feature and we will store the offset mappings.
-    tokenized_examples["id"] = list()
+    tokenised_examples["id"] = list()
 
-    for i in range(len(tokenized_examples.input_ids)):
+    for i in range(len(tokenised_examples.input_ids)):
         # Grab the sequence corresponding to that example (to know what is the context
         # and what is the question).
-        sequence_ids = tokenized_examples.sequence_ids(i)
+        sequence_ids = tokenised_examples.sequence_ids(i)
         context_index = 1
 
         # One example can give several spans, this is the index of the example
         # containing this span of text.
         sample_index = sample_mapping[i]
-        tokenized_examples.id.append(examples["id"][sample_index])
+        tokenised_examples.id.append(examples["id"][sample_index])
 
         # Set to (-1, -1) the offset_mapping that are not part of the context so it's
         # easy to determine if a token position is part of the context or not.
-        tokenized_examples.offset_mapping[i] = [
+        tokenised_examples.offset_mapping[i] = [
             (o if sequence_ids[k] == context_index else (-1, -1))
-            for k, o in enumerate(tokenized_examples.offset_mapping[i])
+            for k, o in enumerate(tokenised_examples.offset_mapping[i])
         ]
 
-    return tokenized_examples
+    return tokenised_examples
 
 
 def postprocess_predictions_and_labels(
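
The prepare_train_examples and prepare_test_examples functions above rely on Hugging Face's overflow-and-stride tokenisation, where one long context is split into several overlapping features. A minimal standalone illustration of that pattern (the checkpoint name is only an example; any fast tokeniser works):

# Standalone illustration of overflow/stride tokenisation with a Hugging Face fast
# tokeniser; the checkpoint name is only an example.
from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("distilbert-base-uncased")

encoded = tokeniser(
    text=["What is the capital of Denmark?"],
    text_pair=["Copenhagen is the capital and most populous city of Denmark. " * 60],
    truncation="only_second",        # only the context is ever truncated
    max_length=128,
    stride=32,                       # overlap between consecutive context windows
    return_overflowing_tokens=True,  # a long context yields several features
    return_offsets_mapping=True,     # token -> character spans in the original text
    padding="max_length",
)

# Each feature maps back to the example it came from, as in the code above.
print(encoded["overflow_to_sample_mapping"])  # e.g. [0, 0, 0, ...]
print(len(encoded["input_ids"]))              # number of overlapping features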
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -198,8 +198,8 @@ def extract_labels_from_generation(
         # If no candidate labels were found, we either pick the label with the smallest
         # word edit distance to the predicted label (if invalid model outputs are
         # allowed), or we raise an error
-        if min(edit_distances) > 100:
-            if dataset_config.task.allow_invalid_model_outputs:
+        if min(edit_distances) >= 1000:
+            if dataset_config.allow_invalid_model_outputs:
                 logger.warning(
                     "No candidate labels found for the predicted label "
                     f"{predicted_label!r}, out of the candidate labels "
@@ -296,19 +296,7 @@ def get_closest_logprobs_labels(
             candidate_output_labels = {
                 candidate_label
                 for candidate_label in candidate_labels[idx]
-                if candidate_label.startswith(generated_label)
-            }
-
-            # If the generated label is a numeral (e.g., "1", "2", "3") and there is
-            # a matching candidate label, we only keep the full match
-            if re.match(r"^\d+$", generated_label) and any(
-                candidate_label == generated_label
-                for candidate_label in candidate_output_labels
-            ):
-                candidate_output_labels = {
-                    candidate_label
-                    for candidate_label in candidate_output_labels
-                    if candidate_label == generated_label
+                if candidate_label.startswith(generated_label.strip())
             }
 
             # If we can uniquely determine the output label, we break the loop.
@@ -357,19 +345,6 @@
                     )
                     return None
 
-            # If we did not find any candidate label for any of the generated labels, we
-            # assume that something is wrong with the model output, and we fall back to
-            # using word edit distance to extract the labels
-            else:
-                log_once(
-                    f"No candidate label found for any of the generated labels "
-                    f"{generated_labels}. This means that using logprobs to extract "
-                    "the labels is not reliable, and we will instead fall back to "
-                    "extracting the labels using word edit distance.",
-                    level=logging.DEBUG,
-                )
-                return None
-
             if output_label is not None:
                 output_labels.append(output_label)
                 break
@@ -377,18 +352,20 @@
         if len(sample) == 0:
             log_once(
                 "The model outputted an empty string, so no candidate labels could "
-                "be determined. Using the first label, "
-                f"{candidate_labels[idx][0]!r}, as the output label.",
+                "be determined. This means that using logprobs to extract the "
+                "labels is not reliable, and we will instead fall back to "
+                "extracting the labels using word edit distance.",
                 level=logging.INFO,
             )
         else:
             log_once(
-                "Could not find a candidate label for any of the generated "
-                f"labels in the sample {sample}. Using the first label, "
-                f"{candidate_labels[idx][0]!r}, as the output label.",
+                "No candidate label found for any of the generated labels, which "
+                "means that using logprobs to extract the labels is not reliable, "
+                "and we will instead fall back to extracting the labels using "
+                "word edit distance.",
                 level=logging.INFO,
             )
-        output_labels.append(candidate_labels[idx][0])
+        return None
 
     assert len(output_labels) == len(generation_logprobs)
     return output_labels
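
The changed branches above now fall back to word edit distance whenever logprob-based label extraction is unreliable, instead of defaulting to the first candidate label. A generic sketch of such a fallback, not EuroEval's actual implementation, picking the candidate label closest to the generated text:

# Generic sketch of an edit-distance fallback (not EuroEval's implementation): pick
# the candidate label closest to whatever the model generated.
def edit_distance(a: str, b: str) -> int:
    """Plain Levenshtein distance via dynamic programming."""
    previous = list(range(len(b) + 1))
    for i, char_a in enumerate(a, start=1):
        current = [i]
        for j, char_b in enumerate(b, start=1):
            current.append(
                min(
                    previous[j] + 1,                       # deletion
                    current[j - 1] + 1,                    # insertion
                    previous[j - 1] + (char_a != char_b),  # substitution
                )
            )
        previous = current
    return previous[-1]

def closest_label(predicted: str, candidate_labels: list[str]) -> str:
    cleaned = predicted.strip().lower()
    return min(candidate_labels, key=lambda label: edit_distance(cleaned, label))

print(closest_label(" posityvny", ["pozytywny", "neutralny", "negatywny"]))  # pozytywny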
euroeval/task_group_utils/token_classification.py CHANGED
@@ -273,7 +273,7 @@ def tokenize_and_align_labels(
     Returns:
         A dictionary containing the tokenized data as well as labels.
     """
-    # Tokenize the texts. We use the `is_split_into_words` argument here because
+    # Tokenise the texts. We use the `is_split_into_words` argument here because
     # the texts in our dataset are lists of words (with a label for each word)
     tokenized_inputs = tokeniser(
         examples["tokens"], is_split_into_words=True, truncation=True, padding=True
@@ -396,7 +396,7 @@ def handle_unk_tokens(
 
     Args:
         tokeniser:
-            The tokeniser used to tokenize the words.
+            The tokeniser used to tokenise the words.
        tokens:
            The list of tokens.
        words:
@@ -423,7 +423,7 @@ def handle_unk_tokens(
         # Fetch the word
         word = words[word_idx]
 
-        # Tokenize the word, which is now a list containing at least one UNK token
+        # Tokenise the word, which is now a list containing at least one UNK token
         tokens_with_unk = tokeniser.convert_ids_to_tokens(
             tokeniser.encode(word, add_special_tokens=False)
         )
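
The tokenize_and_align_labels function above tokenises pre-split words and aligns word-level labels to sub-word tokens. A minimal standalone sketch of that pattern using a fast tokeniser's word_ids (the checkpoint, words, and labels are examples only):

# Standalone sketch of tokenising pre-split words and aligning word-level labels to
# sub-word tokens via `word_ids`; checkpoint and labels are examples only.
from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("distilbert-base-uncased")

words = ["Maria", "Kowalska", "works", "at", "Allegro", "in", "Poznań"]
word_labels = ["B-PER", "I-PER", "O", "O", "B-ORG", "O", "B-LOC"]

encoding = tokeniser(words, is_split_into_words=True, truncation=True)

aligned_labels = []
for word_idx in encoding.word_ids(batch_index=0):
    # Special tokens have no word index; sub-word pieces inherit their word's label.
    aligned_labels.append("IGNORE" if word_idx is None else word_labels[word_idx])

tokens = tokeniser.convert_ids_to_tokens(encoding["input_ids"])
print(list(zip(tokens, aligned_labels)))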
euroeval/tasks.py CHANGED
@@ -88,7 +88,7 @@ SUMM = Task(
     default_num_few_shot_examples=1,
     default_max_generated_tokens=256,
     default_labels=[],
-    allowed_model_types=[ModelType.GENERATIVE],
+    default_allowed_model_types=[ModelType.GENERATIVE],
 )
 
 
@@ -136,14 +136,14 @@ EUROPEAN_VALUES = Task(
     default_num_few_shot_examples=0,
     default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
-    allowed_model_types=[ModelType.GENERATIVE],
-    allowed_generative_types=[
+    default_allowed_model_types=[ModelType.GENERATIVE],
+    default_allowed_generative_types=[
         GenerativeType.INSTRUCTION_TUNED,
         GenerativeType.REASONING,
     ],
     requires_zero_shot=True,
     uses_logprobs=True,
-    allow_invalid_model_outputs=False,
+    default_allow_invalid_model_outputs=False,
 )
 
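
The renames in tasks.py (allowed_model_types → default_allowed_model_types, and so on), together with the switch from dataset_config.task.allow_invalid_model_outputs to dataset_config.allow_invalid_model_outputs seen earlier, suggest that these task-level settings now act as defaults that individual dataset configs can override. The dataclasses below are a hypothetical sketch of that pattern, not EuroEval's actual data model:

# Hypothetical sketch of the "task default, dataset override" pattern implied by the
# `default_` renames; field names beyond those visible in the diff are invented.
from dataclasses import dataclass, field

@dataclass
class Task:
    name: str
    default_allowed_model_types: list[str] = field(default_factory=lambda: ["generative"])
    default_allow_invalid_model_outputs: bool = True

@dataclass
class DatasetConfig:
    name: str
    task: Task
    allow_invalid_model_outputs: bool | None = None  # None -> fall back to the task default

    @property
    def resolved_allow_invalid_model_outputs(self) -> bool:
        if self.allow_invalid_model_outputs is not None:
            return self.allow_invalid_model_outputs
        return self.task.default_allow_invalid_model_outputs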