EuroEval 16.3.0-py3-none-any.whl → 16.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (78)
  1. euroeval/__init__.py +9 -2
  2. euroeval/benchmark_config_factory.py +51 -50
  3. euroeval/benchmark_modules/base.py +9 -21
  4. euroeval/benchmark_modules/fresh.py +2 -1
  5. euroeval/benchmark_modules/hf.py +101 -71
  6. euroeval/benchmark_modules/litellm.py +115 -53
  7. euroeval/benchmark_modules/vllm.py +107 -92
  8. euroeval/benchmarker.py +144 -121
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +86 -8
  12. euroeval/constants.py +9 -0
  13. euroeval/data_loading.py +80 -29
  14. euroeval/data_models.py +338 -330
  15. euroeval/dataset_configs/__init__.py +12 -3
  16. euroeval/dataset_configs/bulgarian.py +56 -0
  17. euroeval/dataset_configs/czech.py +75 -0
  18. euroeval/dataset_configs/danish.py +55 -93
  19. euroeval/dataset_configs/dutch.py +48 -87
  20. euroeval/dataset_configs/english.py +45 -77
  21. euroeval/dataset_configs/estonian.py +42 -34
  22. euroeval/dataset_configs/faroese.py +19 -60
  23. euroeval/dataset_configs/finnish.py +36 -69
  24. euroeval/dataset_configs/french.py +39 -75
  25. euroeval/dataset_configs/german.py +45 -82
  26. euroeval/dataset_configs/greek.py +64 -0
  27. euroeval/dataset_configs/icelandic.py +54 -91
  28. euroeval/dataset_configs/italian.py +42 -79
  29. euroeval/dataset_configs/latvian.py +28 -35
  30. euroeval/dataset_configs/lithuanian.py +28 -26
  31. euroeval/dataset_configs/norwegian.py +72 -115
  32. euroeval/dataset_configs/polish.py +33 -61
  33. euroeval/dataset_configs/portuguese.py +33 -66
  34. euroeval/dataset_configs/serbian.py +64 -0
  35. euroeval/dataset_configs/slovak.py +55 -0
  36. euroeval/dataset_configs/spanish.py +42 -77
  37. euroeval/dataset_configs/swedish.py +52 -90
  38. euroeval/dataset_configs/ukrainian.py +64 -0
  39. euroeval/exceptions.py +1 -1
  40. euroeval/finetuning.py +24 -17
  41. euroeval/generation.py +15 -14
  42. euroeval/generation_utils.py +8 -8
  43. euroeval/languages.py +395 -323
  44. euroeval/logging_utils.py +250 -0
  45. euroeval/metrics/base.py +0 -3
  46. euroeval/metrics/huggingface.py +21 -6
  47. euroeval/metrics/llm_as_a_judge.py +6 -4
  48. euroeval/metrics/pipeline.py +17 -9
  49. euroeval/metrics/speed.py +0 -3
  50. euroeval/model_cache.py +17 -19
  51. euroeval/model_config.py +4 -5
  52. euroeval/model_loading.py +3 -0
  53. euroeval/prompt_templates/__init__.py +2 -0
  54. euroeval/prompt_templates/classification.py +206 -0
  55. euroeval/prompt_templates/linguistic_acceptability.py +99 -42
  56. euroeval/prompt_templates/multiple_choice.py +102 -38
  57. euroeval/prompt_templates/named_entity_recognition.py +172 -51
  58. euroeval/prompt_templates/reading_comprehension.py +119 -42
  59. euroeval/prompt_templates/sentiment_classification.py +110 -40
  60. euroeval/prompt_templates/summarization.py +85 -40
  61. euroeval/prompt_templates/token_classification.py +279 -0
  62. euroeval/scores.py +11 -10
  63. euroeval/speed_benchmark.py +5 -6
  64. euroeval/task_group_utils/multiple_choice_classification.py +2 -4
  65. euroeval/task_group_utils/question_answering.py +24 -16
  66. euroeval/task_group_utils/sequence_classification.py +48 -35
  67. euroeval/task_group_utils/text_to_text.py +19 -9
  68. euroeval/task_group_utils/token_classification.py +21 -17
  69. euroeval/tasks.py +44 -1
  70. euroeval/tokenisation_utils.py +33 -22
  71. euroeval/types.py +10 -9
  72. euroeval/utils.py +35 -149
  73. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +196 -39
  74. euroeval-16.5.0.dist-info/RECORD +81 -0
  75. euroeval-16.3.0.dist-info/RECORD +0 -71
  76. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
  77. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
  78. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/task_group_utils/question_answering.py CHANGED
@@ -1,7 +1,6 @@
 """Utility functions related to the question-answering task group."""
 
 import collections.abc as c
-import logging
 import typing as t
 from collections import defaultdict
 
@@ -26,8 +25,6 @@ if t.TYPE_CHECKING:
     from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
-logger = logging.getLogger("euroeval")
-
 
 class QuestionAnsweringTrainer(Trainer):
     """Trainer subclass for question answering tasks."""
@@ -40,7 +37,7 @@ class QuestionAnsweringTrainer(Trainer):
         train_dataset: "Dataset",
         eval_dataset: "Dataset",
         compute_metrics: "c.Callable[[EvalPrediction], dict[str, float]]",
-        callbacks: "list[TrainerCallback]",
+        callbacks: "c.Sequence[TrainerCallback]",
         data_collator: "c.Callable",
         **kwargs,
     ) -> None:
@@ -70,7 +67,7 @@ class QuestionAnsweringTrainer(Trainer):
         self,
         eval_dataset: "Dataset | None" = None,
         orig_eval_dataset: "Dataset | None" = None,
-        ignore_keys: list[str] | None = None,
+        ignore_keys: c.Sequence[str] | None = None,
         metric_key_prefix: str = "eval",
     ) -> dict[str, float]:
         """Evaluate the model on the given dataset.
@@ -206,7 +203,7 @@ def compute_metrics(
 
 
 def extract_labels_from_generation(
     input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-) -> list[t.Any]:
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.
 
     Args:
@@ -268,8 +265,11 @@ def prepare_train_examples(
     max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
     stride = tokeniser.model_max_length // 4
-    max_length = tokeniser.model_max_length - stride
-    stride = min(stride, max_length - max_question_tokens - num_special_tokens)
+    stride = min(
+        stride,
+        tokeniser.model_max_length - stride - max_question_tokens - num_special_tokens,
+    )
+    stride = max(stride, 0)
     max_length = tokeniser.model_max_length - stride
 
     # Tokenise our examples with truncation and padding, but keep the overflows using a
@@ -338,9 +338,17 @@ def prepare_train_examples(
             end_char = start_char + len(answers["text"][0])
 
             # Start token index of the current span in the text.
-            token_start_index = 0
-            while sequence_ids[token_start_index] != 1:
-                token_start_index += 1
+            try:
+                token_start_index = 0
+                while sequence_ids[token_start_index] != 1:
+                    token_start_index += 1
+
+            # If it turns out that we cannot find the context in the span, then we
+            # treat this as an impossible case
+            except IndexError:
+                tokenised_examples.start_positions.append(cls_index)
+                tokenised_examples.end_positions.append(cls_index)
+                continue
 
             # End token index of the current span in the text.
             token_end_index = len(input_ids) - 1
@@ -472,7 +480,7 @@ def postprocess_predictions_and_labels(
     dataset: "Dataset",
     prepared_dataset: "Dataset",
     cls_token_index: int,
-) -> tuple[list[dict], list[dict]]:
+) -> tuple[c.Sequence[dict], c.Sequence[dict]]:
     """Postprocess the predictions and labels, to allow easier metric computation.
 
     Args:
@@ -553,7 +561,7 @@ def find_best_answer(
     all_start_logits: np.ndarray,
     all_end_logits: np.ndarray,
     prepared_dataset: "Dataset",
-    feature_indices: list[int],
+    feature_indices: c.Sequence[int],
     context: str,
     max_answer_length: int,
     num_best_logits: int,
@@ -586,7 +594,7 @@ def find_best_answer(
         The best answer for the example.
     """
     # Loop through all the features associated to the current example
-    valid_answers = list()
+    valid_answers: list[dict] = list()
    for feature_index in feature_indices:
         # Get the features associated with the current example
         features = prepared_dataset[feature_index]
@@ -627,12 +635,12 @@
 def find_valid_answers(
     start_logits: np.ndarray,
     end_logits: np.ndarray,
-    offset_mapping: list[tuple[int, int]],
+    offset_mapping: c.Sequence[tuple[int, int]],
     context: str,
     max_answer_length: int,
     num_best_logits: int,
     min_null_score: float,
-) -> list[dict]:
+) -> c.Sequence[dict]:
     """Find the valid answers from the start and end indexes.
 
     Args:
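
The stride change above is easier to see with concrete numbers. A minimal sketch of the new clamping logic, using made-up values (a 512-token model limit and an unusually long question) rather than anything from a real run:

# Sketch of the stride clamping in prepare_train_examples; the numbers are illustrative.
model_max_length = 512
max_question_tokens = 420  # hypothetical, unusually long questions
num_special_tokens = 2     # e.g. [CLS] and [SEP]

stride = model_max_length // 4  # 128
stride = min(
    stride,
    model_max_length - stride - max_question_tokens - num_special_tokens,  # -38 here
)
stride = max(stride, 0)  # the previous code could carry a negative stride forward
max_length = model_max_length - stride
print(stride, max_length)  # 0 512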
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -1,5 +1,6 @@
 """Utility functions related to the sequence-classification task group."""
 
+import collections.abc as c
 import logging
 import re
 import typing as t
@@ -19,13 +20,15 @@ if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
     from transformers.trainer_utils import EvalPrediction
 
-    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
+    from ..data_models import (
+        BenchmarkConfig,
+        DatasetConfig,
+        GenerativeModelOutput,
+        ModelConfig,
+    )
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
@@ -106,8 +109,9 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
+    model_config: "ModelConfig",
     first_label_token_mapping: dict[str, str] | bool,
-) -> list[str]:
+) -> c.Sequence[str]:
     """Extract the predicted labels from the generated output.
 
     Args:
@@ -118,6 +122,8 @@ def extract_labels_from_generation(
             The raw generated output of the model.
         dataset_config:
             The configuration of the dataset.
+        model_config:
+            The configuration of the model.
         first_label_token_mapping:
             A mapping from labels to the first token in each label, or alternatively a
             Boolean value indicating whether the model should output scores (if the
@@ -167,6 +173,7 @@ def extract_labels_from_generation(
     )
 
     new_predicted_labels: list[str] = list()
+    num_predictions_being_very_off = 0
     for idx, predicted_label in enumerate(model_output.sequences):
         # If the prediction includes a boxed answer, use that instead of the full
         # generation
@@ -199,42 +206,48 @@ def extract_labels_from_generation(
         # word edit distance to the predicted label (if invalid model outputs are
         # allowed), or we raise an error
         if min(edit_distances) >= 1000:
-            if dataset_config.allow_invalid_model_outputs:
-                logger.warning(
-                    "No candidate labels found for the predicted label "
-                    f"{predicted_label!r}, out of the candidate labels "
-                    f"{sample_candidate_labels[idx]}. This likely means that the model "
-                    "output is completely off, but since invalid model outputs are "
-                    "allowed for this task, we will use the closest candidate label "
-                    f"({best_candidate_label})) as the output label. If you see this "
-                    "warning very often, please report this issue to the EuroEval "
-                    "team at github.com/EuroEval/EuroEval/issues."
-                )
-                logger.debug(
-                    "The candidate labels were extracted from the prompt: "
-                    f"{input_batch['text'][idx]!r}."
-                )
-            else:
-                raise InvalidBenchmark(
-                    "No candidate labels found for the predicted label "
-                    f"{predicted_label!r}, out of the candidate labels "
-                    f"{sample_candidate_labels[idx]}. This likely means that the model "
-                    "output is completely off, and we cannot extract any labels from "
-                    "it. Please check the model output and the candidate labels. The "
-                    "candidate labels were extracted from the prompt: "
-                    f"{input_batch['text'][idx]!r}."
-                )
+            num_predictions_being_very_off += 1
 
         new_predicted_labels.append(best_candidate_label)
 
+    if num_predictions_being_very_off > 0:
+        if dataset_config.allow_invalid_model_outputs:
+            log_msg = (
+                "No candidate labels found for the predicted label in "
+                f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+                f"of the samples with the model {model_config.model_id!r}. This "
+                "likely means that the model were completely off in these cases, "
+                "but since invalid model outputs are allowed for this task, we used "
+                "the closest candidate labels as the output labels."
+            )
+            level = logging.DEBUG
+            if num_predictions_being_very_off / len(model_output.sequences) > 0.5:
+                log_msg += (
+                    " Since this happened for most of the model's predictions, please "
+                    "report this issue to the EuroEval team at "
+                    "github.com/EuroEval/EuroEval/issues."
+                )
+                level = logging.WARNING
+            log_once(log_msg, level=level)
+        else:
+            raise InvalidBenchmark(
+                "No candidate labels found for the predicted label in "
+                f"{num_predictions_being_very_off:,}/{len(model_output.sequences):,} "
+                "of the samples. This likely means that the model were completely "
+                "off in these cases. Since this task does not allow invalid model "
+                "outputs, we have to abort the evaluation. Please re-run the "
+                "evaluation with the `--debug` flag (or `debug=True` if you're using "
+                "the `Benchmarker` API) to see the precise model outputs."
+            )
+
     return new_predicted_labels
 
 
 def get_closest_logprobs_labels(
-    generation_logprobs: list[list[list[tuple[str, float]]]],
+    generation_logprobs: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]],
     first_label_token_mapping: dict[str, str] | t.Literal[True],
-    candidate_labels: list[list[str]],
-) -> list[str] | None:
+    candidate_labels: c.Sequence[c.Sequence[str]],
+) -> c.Sequence[str] | None:
     """Get the labels with the highest predicted logprob value.
 
     In case a candidate label is split into multiple tokens, we only use the first
@@ -355,7 +368,7 @@ def get_closest_logprobs_labels(
                     "be determined. This means that using logprobs to extract the "
                     "labels is not reliable, and we will instead fall back to "
                     "extracting the labels using word edit distance.",
-                    level=logging.INFO,
+                    level=logging.DEBUG,
                 )
             else:
                 log_once(
@@ -363,7 +376,7 @@
                     "means that using logprobs to extract the labels is not reliable, "
                     "and we will instead fall back to extracting the labels using "
                     "word edit distance.",
-                    level=logging.INFO,
+                    level=logging.DEBUG,
                 )
             return None
 
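
The sequence-classification change replaces a per-sample warning with a single aggregated message after the loop. A standalone sketch of that pattern, using the standard logging module instead of EuroEval's log_once helper (the function name and thresholds here are only illustrative):

import logging

logger = logging.getLogger("example")

def summarise_off_predictions(num_off: int, num_total: int) -> None:
    # Hypothetical helper mirroring the aggregation in the diff: stay quiet at
    # DEBUG level unless most predictions were off, then escalate to WARNING.
    if num_off == 0:
        return
    msg = f"No candidate labels found for {num_off:,}/{num_total:,} of the samples."
    level = logging.DEBUG
    if num_off / num_total > 0.5:
        msg += " This happened for most predictions, so it may be worth reporting."
        level = logging.WARNING
    logger.log(level, msg)

summarise_off_predictions(num_off=3, num_total=128)    # logged at DEBUG
summarise_off_predictions(num_off=100, num_total=128)  # logged at WARNING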
euroeval/task_group_utils/text_to_text.py CHANGED
@@ -1,5 +1,6 @@
 """Utility functions related to the text-to-text task group."""
 
+import collections.abc as c
 import logging
 import typing as t
 
@@ -7,6 +8,7 @@ import numpy as np
 
 from ..constants import METRIC_ATTRIBUTES_TAKING_UP_MEMORY
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
 from ..metrics import HuggingFaceMetric
 from ..utils import raise_if_model_output_contains_nan_values
 
@@ -18,9 +20,6 @@ if t.TYPE_CHECKING:
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     dataset_config: "DatasetConfig",
@@ -44,6 +43,10 @@ def compute_metrics(
     Returns:
         A dictionary with the names of the metrics as keys and the metric values as
         values.
+
+    Raises:
+        InvalidBenchmark:
+            If the metric computation fails.
     """
     model_outputs, labels = model_outputs_and_labels
 
@@ -72,7 +75,7 @@ def compute_metrics(
         ):
             metric.compute_kwargs["device"] = benchmark_config.device.type
 
-        while True:
+        for _ in range(num_attempts := 5):
             try:
                 score: float | None = metric(
                     predictions=predictions,
@@ -96,21 +99,28 @@ def compute_metrics(
                     and metric.compute_kwargs.get("device", "cpu") != "cpu"
                 ):
                     metric.compute_kwargs["device"] = "cpu"
-                    logger.debug(
+                    log(
                         "Out of memory error occurred during the computation of "
                         f"the metric {metric.pretty_name}. Moving the computation to "
-                        "the CPU."
+                        "the CPU.",
+                        level=logging.DEBUG,
                     )
                 else:
                     raise InvalidBenchmark(str(e)) from e
             finally:
                 for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
                     if hasattr(metric, attribute):
-                        logger.debug(
+                        log(
                             f"Deleting the {attribute!r} attribute of the metric "
-                            f"{metric.pretty_name} to free up memory."
+                            f"{metric.pretty_name} to free up memory.",
+                            level=logging.DEBUG,
                         )
                         delattr(metric, attribute)
+        else:
+            raise InvalidBenchmark(
+                f"Could not compute the metric {metric.pretty_name} after "
+                f"{num_attempts} attempts due to out of memory errors."
+            )
 
         # The metric returns None if we are running on multi-GPU and the current
         # process is not the main process
@@ -122,7 +132,7 @@
 
 def extract_labels_from_generation(
     input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-) -> list[t.Any]:
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.
 
     Args:
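
The retry change in compute_metrics swaps an unbounded while True for a bounded for loop whose else branch raises once the attempts run out. A generic sketch of that for/else pattern, not tied to any EuroEval API:

def compute_with_retries(compute, num_attempts: int = 5) -> float:
    for _ in range(num_attempts):
        try:
            score = compute()
        except MemoryError:
            continue  # in the diff: move the metric computation to the CPU and retry
        break  # success, leave the retry loop
    else:
        # Only reached when the loop finished without a break, i.e. every attempt failed
        raise RuntimeError(f"Gave up after {num_attempts} attempts.")
    return score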
euroeval/task_group_utils/token_classification.py CHANGED
@@ -1,5 +1,6 @@
 """Utility functions related to the token-classification task group."""
 
+import collections.abc as c
 import logging
 import typing as t
 from copy import deepcopy
@@ -7,6 +8,7 @@ from copy import deepcopy
 import numpy as np
 
 from ..exceptions import InvalidBenchmark
+from ..logging_utils import log
 from ..utils import (
     extract_json_dict_from_string,
     raise_if_model_output_contains_nan_values,
@@ -22,9 +24,6 @@ if t.TYPE_CHECKING:
     from ..types import Labels, Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     has_misc_tags: bool,
@@ -61,7 +60,9 @@ def compute_metrics(
 
     predictions: list[list[str]]
     if not isinstance(model_outputs[0][0], str):
-        raw_predictions: list[list[int]] = np.argmax(model_outputs, axis=-1).tolist()
+        raw_predictions: c.Sequence[c.Sequence[int]] = np.argmax(
+            model_outputs, axis=-1
+        ).tolist()
 
         # Remove ignored index (special tokens)
         predictions = [
@@ -191,7 +192,7 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: "GenerativeModelOutput",
    dataset_config: "DatasetConfig",
-) -> list[t.Any]:
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.
 
     Args:
@@ -216,17 +217,19 @@ def extract_labels_from_generation(
         prompt_label_mapping = dataset_config.prompt_label_mapping
         for prompt_tag_name, named_entities in prediction_dict.items():
             if not isinstance(named_entities, list):
-                logger.debug(
+                log(
                     "The model produced an invalid format for the named entities. "
-                    f"Expected a list but got {type(named_entities)}. Skipping."
+                    f"Expected a list but got {type(named_entities)}. Skipping.",
+                    level=logging.DEBUG,
                 )
                 continue
             try:
                 named_entities = [str(ne) for ne in named_entities]
             except Exception:
-                logger.debug(
+                log(
                     "The model produced an invalid format for the named entities. "
-                    f"Expected a list of strings but got {named_entities}. Skipping."
+                    f"Expected a list of strings but got {named_entities}. Skipping.",
+                    level=logging.DEBUG,
                 )
                 continue
             try:
@@ -236,9 +239,10 @@
                     if prompt_tag == prompt_tag_name
                 ][0]
             except IndexError:
-                logger.debug(
+                log(
                     "The model produced an invalid prompt tag name, "
-                    f"{prompt_tag_name}. Skipping."
+                    f"{prompt_tag_name}. Skipping.",
+                    level=logging.DEBUG,
                 )
                 continue
 
@@ -283,8 +287,8 @@ def tokenize_and_align_labels(
     # tokeniser is of a "fast" variant then this can be accessed through the
     # `word_ids` method. Otherwise, we have to extract it manually.
     all_labels: list[list[int]] = list()
-    labels: list[str]
-    word_ids: list[int | None]
+    labels: c.Sequence[str]
+    word_ids: c.Sequence[int | None]
     for i, labels in enumerate(examples["labels"]):
         # Try to get the word IDs from the tokeniser
         try:
@@ -294,10 +298,10 @@
         # IDs manually
         except ValueError:
             # Get the list of words in the document
-            words: list[str] = examples["tokens"][i]
+            words: c.Sequence[str] = examples["tokens"][i]
 
             # Get the list of token IDs in the document
-            tok_ids: list[int] = tokenized_inputs.input_ids[i]
+            tok_ids: c.Sequence[int] = tokenized_inputs.input_ids[i]
 
             # Decode the token IDs
             tokens = tokeniser.convert_ids_to_tokens(tok_ids)
@@ -390,8 +394,8 @@
 
 
 def handle_unk_tokens(
-    tokeniser: "PreTrainedTokenizer", tokens: list[str], words: list[str]
-) -> list[str]:
+    tokeniser: "PreTrainedTokenizer", tokens: list[str], words: c.Sequence[str]
+) -> c.Sequence[str]:
     """Replace unknown tokens in the tokens with the corresponding word.
 
     Args:
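
A recurring change across these four modules is swapping list[...] annotations for collections.abc.Sequence[...]. A small illustration of why that is the looser, caller-friendly contract; the function and values here are hypothetical, not taken from the package:

import collections.abc as c

def non_null_word_ids(word_ids: c.Sequence[int | None]) -> list[int]:
    # Read-only access only, so any sequence type is acceptable as input,
    # while the return value is a fresh list the caller may mutate.
    return [w for w in word_ids if w is not None]

non_null_word_ids([0, 0, None, 1])   # a list works
non_null_word_ids((0, 0, None, 1))   # so does a tuple, which list[...] would reject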
euroeval/tasks.py CHANGED
@@ -5,12 +5,14 @@ from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import Task
 from .enums import GenerativeType, ModelType, TaskGroup
 from .prompt_templates import (
+    CLASSIFICATION_TEMPLATES,
     LA_TEMPLATES,
     MULTIPLE_CHOICE_TEMPLATES,
     NER_TEMPLATES,
     RC_TEMPLATES,
     SENT_TEMPLATES,
     SUMM_TEMPLATES,
+    TOKEN_CLASSIFICATION_TEMPLATES,
 )
 
 
@@ -20,7 +22,11 @@ def get_all_tasks() -> dict[str, Task]:
     Returns:
         A mapping between names of dataset tasks and their configurations.
     """
-    return {cfg.name: cfg for cfg in globals().values() if isinstance(cfg, Task)}
+    return {
+        cfg.name: cfg
+        for cfg in globals().values()
+        if isinstance(cfg, Task) and cfg != SPEED
+    }
 
 
 LA = Task(
@@ -159,3 +165,40 @@ SPEED = Task(
     default_max_generated_tokens=5,
     default_labels=[],
 )
+
+
+# Used for custom datasets
+
+TEXT_CLASSIFICATION = Task(
+    name="classification",
+    task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
+    template_dict=CLASSIFICATION_TEMPLATES,
+    metrics=[m.mcc_metric, m.macro_f1_metric],
+    default_num_few_shot_examples=12,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=None,
+    uses_logprobs=True,
+)
+
+TOKEN_CLASSIFICATION = Task(
+    name="token-classification",
+    task_group=TaskGroup.TOKEN_CLASSIFICATION,
+    template_dict=TOKEN_CLASSIFICATION_TEMPLATES,
+    metrics=[m.micro_f1_metric],
+    default_num_few_shot_examples=8,
+    default_max_generated_tokens=128,
+    default_labels=None,
+    uses_structured_output=True,
+)
+
+MULTIPLE_CHOICE = Task(
+    name="multiple-choice",
+    task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
+    metrics=[m.mcc_metric, m.accuracy_metric],
+    default_num_few_shot_examples=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=None,
+    default_allowed_model_types=[ModelType.GENERATIVE],
+    uses_logprobs=True,
+)
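
A quick way to see the effect of the tasks.py changes, assuming only what the diff shows (the new task names and the exclusion of SPEED from get_all_tasks); the rest of the registry contents are not asserted here:

from euroeval.tasks import SPEED, get_all_tasks

tasks = get_all_tasks()
assert "classification" in tasks        # new TEXT_CLASSIFICATION task
assert "token-classification" in tasks  # new TOKEN_CLASSIFICATION task
assert "multiple-choice" in tasks       # new MULTIPLE_CHOICE task
assert SPEED.name not in tasks          # SPEED is now filtered out of the mapping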