EuroEval 16.4.0-py3-none-any.whl → 16.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (71)
  1. euroeval/__init__.py +6 -0
  2. euroeval/benchmark_config_factory.py +51 -46
  3. euroeval/benchmark_modules/base.py +6 -5
  4. euroeval/benchmark_modules/hf.py +2 -9
  5. euroeval/benchmark_modules/litellm.py +14 -12
  6. euroeval/benchmark_modules/vllm.py +17 -10
  7. euroeval/benchmarker.py +61 -44
  8. euroeval/caching_utils.py +1 -1
  9. euroeval/cli.py +86 -8
  10. euroeval/constants.py +3 -0
  11. euroeval/data_loading.py +78 -30
  12. euroeval/data_models.py +326 -326
  13. euroeval/dataset_configs/__init__.py +10 -3
  14. euroeval/dataset_configs/bulgarian.py +56 -0
  15. euroeval/dataset_configs/czech.py +25 -29
  16. euroeval/dataset_configs/danish.py +51 -88
  17. euroeval/dataset_configs/dutch.py +48 -86
  18. euroeval/dataset_configs/english.py +45 -76
  19. euroeval/dataset_configs/estonian.py +36 -38
  20. euroeval/dataset_configs/faroese.py +19 -60
  21. euroeval/dataset_configs/finnish.py +36 -68
  22. euroeval/dataset_configs/french.py +39 -74
  23. euroeval/dataset_configs/german.py +45 -81
  24. euroeval/dataset_configs/greek.py +64 -0
  25. euroeval/dataset_configs/icelandic.py +54 -91
  26. euroeval/dataset_configs/italian.py +42 -78
  27. euroeval/dataset_configs/latvian.py +28 -34
  28. euroeval/dataset_configs/lithuanian.py +22 -26
  29. euroeval/dataset_configs/norwegian.py +72 -114
  30. euroeval/dataset_configs/polish.py +33 -60
  31. euroeval/dataset_configs/portuguese.py +33 -65
  32. euroeval/dataset_configs/serbian.py +64 -0
  33. euroeval/dataset_configs/slovak.py +19 -24
  34. euroeval/dataset_configs/spanish.py +42 -76
  35. euroeval/dataset_configs/swedish.py +48 -84
  36. euroeval/dataset_configs/ukrainian.py +64 -0
  37. euroeval/exceptions.py +1 -1
  38. euroeval/finetuning.py +3 -2
  39. euroeval/generation.py +5 -4
  40. euroeval/generation_utils.py +6 -5
  41. euroeval/languages.py +395 -323
  42. euroeval/metrics/huggingface.py +14 -3
  43. euroeval/metrics/llm_as_a_judge.py +1 -1
  44. euroeval/model_cache.py +6 -5
  45. euroeval/model_loading.py +1 -1
  46. euroeval/prompt_templates/__init__.py +2 -0
  47. euroeval/prompt_templates/classification.py +206 -0
  48. euroeval/prompt_templates/linguistic_acceptability.py +82 -43
  49. euroeval/prompt_templates/multiple_choice.py +81 -41
  50. euroeval/prompt_templates/named_entity_recognition.py +125 -44
  51. euroeval/prompt_templates/reading_comprehension.py +92 -43
  52. euroeval/prompt_templates/sentiment_classification.py +91 -43
  53. euroeval/prompt_templates/summarization.py +64 -39
  54. euroeval/prompt_templates/token_classification.py +279 -0
  55. euroeval/scores.py +4 -3
  56. euroeval/speed_benchmark.py +2 -1
  57. euroeval/task_group_utils/multiple_choice_classification.py +2 -1
  58. euroeval/task_group_utils/question_answering.py +24 -13
  59. euroeval/task_group_utils/sequence_classification.py +5 -4
  60. euroeval/task_group_utils/text_to_text.py +2 -1
  61. euroeval/task_group_utils/token_classification.py +11 -8
  62. euroeval/tasks.py +44 -1
  63. euroeval/tokenisation_utils.py +19 -10
  64. euroeval/types.py +10 -9
  65. euroeval/utils.py +6 -3
  66. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
  67. euroeval-16.5.0.dist-info/RECORD +81 -0
  68. euroeval-16.4.0.dist-info/RECORD +0 -75
  69. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
  70. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
  71. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/task_group_utils/question_answering.py CHANGED
@@ -37,7 +37,7 @@ class QuestionAnsweringTrainer(Trainer):
         train_dataset: "Dataset",
         eval_dataset: "Dataset",
         compute_metrics: "c.Callable[[EvalPrediction], dict[str, float]]",
-        callbacks: "list[TrainerCallback]",
+        callbacks: "c.Sequence[TrainerCallback]",
         data_collator: "c.Callable",
         **kwargs,
     ) -> None:
@@ -67,7 +67,7 @@ class QuestionAnsweringTrainer(Trainer):
         self,
         eval_dataset: "Dataset | None" = None,
         orig_eval_dataset: "Dataset | None" = None,
-        ignore_keys: list[str] | None = None,
+        ignore_keys: c.Sequence[str] | None = None,
         metric_key_prefix: str = "eval",
     ) -> dict[str, float]:
         """Evaluate the model on the given dataset.
@@ -203,7 +203,7 @@ def compute_metrics(

 def extract_labels_from_generation(
     input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-) -> list[t.Any]:
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.

     Args:
@@ -265,8 +265,11 @@ def prepare_train_examples(
     max_question_tokens = max(len(tokeniser(q).input_ids) for q in examples["question"])
     num_special_tokens = int(has_cls_token) + int(has_sep_token)
     stride = tokeniser.model_max_length // 4
-    max_length = tokeniser.model_max_length - stride
-    stride = min(stride, max_length - max_question_tokens - num_special_tokens)
+    stride = min(
+        stride,
+        tokeniser.model_max_length - stride - max_question_tokens - num_special_tokens,
+    )
+    stride = max(stride, 0)
     max_length = tokeniser.model_max_length - stride

     # Tokenise our examples with truncation and padding, but keep the overflows using a
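A note on the stride hunk above: the old code fixed max_length before clamping, so an unusually long question could drive the computed stride negative; the new code bounds the stride by the room actually left for the question and special tokens, and floors it at zero. A rough worked example of the new arithmetic, a sketch using invented numbers (model_max_length of 512, a 400-token question, two special tokens):

# Illustrative sketch only; the variable names mirror the diff, the numbers are made up.
model_max_length = 512
max_question_tokens = 400   # hypothetical, unusually long question
num_special_tokens = 2      # e.g. [CLS] + [SEP]

stride = model_max_length // 4                # 128
stride = min(
    stride,
    model_max_length - stride - max_question_tokens - num_special_tokens,
)                                             # min(128, 512 - 128 - 400 - 2) = -18
stride = max(stride, 0)                       # clamped to 0 instead of going negative
max_length = model_max_length - stride        # 512
print(stride, max_length)                     # 0 512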
@@ -335,9 +338,17 @@ def prepare_train_examples(
             end_char = start_char + len(answers["text"][0])

             # Start token index of the current span in the text.
-            token_start_index = 0
-            while sequence_ids[token_start_index] != 1:
-                token_start_index += 1
+            try:
+                token_start_index = 0
+                while sequence_ids[token_start_index] != 1:
+                    token_start_index += 1
+
+            # If it turns out that we cannot find the context in the span, then we
+            # treat this as an impossible case
+            except IndexError:
+                tokenised_examples.start_positions.append(cls_index)
+                tokenised_examples.end_positions.append(cls_index)
+                continue

             # End token index of the current span in the text.
             token_end_index = len(input_ids) - 1
@@ -469,7 +480,7 @@ def postprocess_predictions_and_labels(
     dataset: "Dataset",
     prepared_dataset: "Dataset",
     cls_token_index: int,
-) -> tuple[list[dict], list[dict]]:
+) -> tuple[c.Sequence[dict], c.Sequence[dict]]:
     """Postprocess the predictions and labels, to allow easier metric computation.

     Args:
@@ -550,7 +561,7 @@ def find_best_answer(
     all_start_logits: np.ndarray,
     all_end_logits: np.ndarray,
     prepared_dataset: "Dataset",
-    feature_indices: list[int],
+    feature_indices: c.Sequence[int],
     context: str,
     max_answer_length: int,
     num_best_logits: int,
@@ -583,7 +594,7 @@ def find_best_answer(
         The best answer for the example.
     """
     # Loop through all the features associated to the current example
-    valid_answers = list()
+    valid_answers: list[dict] = list()
     for feature_index in feature_indices:
         # Get the features associated with the current example
         features = prepared_dataset[feature_index]
@@ -624,12 +635,12 @@ def find_best_answer(
 def find_valid_answers(
     start_logits: np.ndarray,
     end_logits: np.ndarray,
-    offset_mapping: list[tuple[int, int]],
+    offset_mapping: c.Sequence[tuple[int, int]],
     context: str,
     max_answer_length: int,
     num_best_logits: int,
     min_null_score: float,
-) -> list[dict]:
+) -> c.Sequence[dict]:
     """Find the valid answers from the start and end indexes.

     Args:
euroeval/task_group_utils/sequence_classification.py CHANGED
@@ -1,5 +1,6 @@
 """Utility functions related to the sequence-classification task group."""

+import collections.abc as c
 import logging
 import re
 import typing as t
@@ -110,7 +111,7 @@ def extract_labels_from_generation(
     dataset_config: "DatasetConfig",
     model_config: "ModelConfig",
     first_label_token_mapping: dict[str, str] | bool,
-) -> list[str]:
+) -> c.Sequence[str]:
     """Extract the predicted labels from the generated output.

     Args:
@@ -243,10 +244,10 @@ def extract_labels_from_generation(


 def get_closest_logprobs_labels(
-    generation_logprobs: list[list[list[tuple[str, float]]]],
+    generation_logprobs: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]],
     first_label_token_mapping: dict[str, str] | t.Literal[True],
-    candidate_labels: list[list[str]],
-) -> list[str] | None:
+    candidate_labels: c.Sequence[c.Sequence[str]],
+) -> c.Sequence[str] | None:
     """Get the labels with the highest predicted logprob value.

     In case a candidate label is split into multiple tokens, we only use the first
euroeval/task_group_utils/text_to_text.py CHANGED
@@ -1,5 +1,6 @@
 """Utility functions related to the text-to-text task group."""

+import collections.abc as c
 import logging
 import typing as t

@@ -131,7 +132,7 @@ def compute_metrics(

 def extract_labels_from_generation(
     input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-) -> list[t.Any]:
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.

     Args:
euroeval/task_group_utils/token_classification.py CHANGED
@@ -1,5 +1,6 @@
 """Utility functions related to the token-classification task group."""

+import collections.abc as c
 import logging
 import typing as t
 from copy import deepcopy
@@ -59,7 +60,9 @@ def compute_metrics(

     predictions: list[list[str]]
     if not isinstance(model_outputs[0][0], str):
-        raw_predictions: list[list[int]] = np.argmax(model_outputs, axis=-1).tolist()
+        raw_predictions: c.Sequence[c.Sequence[int]] = np.argmax(
+            model_outputs, axis=-1
+        ).tolist()

         # Remove ignored index (special tokens)
         predictions = [
@@ -189,7 +192,7 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: "GenerativeModelOutput",
     dataset_config: "DatasetConfig",
-) -> list[t.Any]:
+) -> c.Sequence[t.Any]:
     """Extract the predicted labels from the generated output.

     Args:
@@ -284,8 +287,8 @@ def tokenize_and_align_labels(
     # tokeniser is of a "fast" variant then this can be accessed through the
     # `word_ids` method. Otherwise, we have to extract it manually.
     all_labels: list[list[int]] = list()
-    labels: list[str]
-    word_ids: list[int | None]
+    labels: c.Sequence[str]
+    word_ids: c.Sequence[int | None]
     for i, labels in enumerate(examples["labels"]):
         # Try to get the word IDs from the tokeniser
         try:
@@ -295,10 +298,10 @@ def tokenize_and_align_labels(
         # IDs manually
         except ValueError:
             # Get the list of words in the document
-            words: list[str] = examples["tokens"][i]
+            words: c.Sequence[str] = examples["tokens"][i]

             # Get the list of token IDs in the document
-            tok_ids: list[int] = tokenized_inputs.input_ids[i]
+            tok_ids: c.Sequence[int] = tokenized_inputs.input_ids[i]

             # Decode the token IDs
             tokens = tokeniser.convert_ids_to_tokens(tok_ids)
@@ -391,8 +394,8 @@ def tokenize_and_align_labels(


 def handle_unk_tokens(
-    tokeniser: "PreTrainedTokenizer", tokens: list[str], words: list[str]
-) -> list[str]:
+    tokeniser: "PreTrainedTokenizer", tokens: list[str], words: c.Sequence[str]
+) -> c.Sequence[str]:
     """Replace unknown tokens in the tokens with the corresponding word.

     Args:
euroeval/tasks.py CHANGED
@@ -5,12 +5,14 @@ from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import Task
 from .enums import GenerativeType, ModelType, TaskGroup
 from .prompt_templates import (
+    CLASSIFICATION_TEMPLATES,
     LA_TEMPLATES,
     MULTIPLE_CHOICE_TEMPLATES,
     NER_TEMPLATES,
     RC_TEMPLATES,
     SENT_TEMPLATES,
     SUMM_TEMPLATES,
+    TOKEN_CLASSIFICATION_TEMPLATES,
 )


@@ -20,7 +22,11 @@ def get_all_tasks() -> dict[str, Task]:
     Returns:
         A mapping between names of dataset tasks and their configurations.
     """
-    return {cfg.name: cfg for cfg in globals().values() if isinstance(cfg, Task)}
+    return {
+        cfg.name: cfg
+        for cfg in globals().values()
+        if isinstance(cfg, Task) and cfg != SPEED
+    }


 LA = Task(
@@ -159,3 +165,40 @@ SPEED = Task(
     default_max_generated_tokens=5,
     default_labels=[],
 )
+
+
+# Used for custom datasets
+
+TEXT_CLASSIFICATION = Task(
+    name="classification",
+    task_group=TaskGroup.SEQUENCE_CLASSIFICATION,
+    template_dict=CLASSIFICATION_TEMPLATES,
+    metrics=[m.mcc_metric, m.macro_f1_metric],
+    default_num_few_shot_examples=12,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=None,
+    uses_logprobs=True,
+)
+
+TOKEN_CLASSIFICATION = Task(
+    name="token-classification",
+    task_group=TaskGroup.TOKEN_CLASSIFICATION,
+    template_dict=TOKEN_CLASSIFICATION_TEMPLATES,
+    metrics=[m.micro_f1_metric],
+    default_num_few_shot_examples=8,
+    default_max_generated_tokens=128,
+    default_labels=None,
+    uses_structured_output=True,
+)
+
+MULTIPLE_CHOICE = Task(
+    name="multiple-choice",
+    task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
+    metrics=[m.mcc_metric, m.accuracy_metric],
+    default_num_few_shot_examples=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=None,
+    default_allowed_model_types=[ModelType.GENERATIVE],
+    uses_logprobs=True,
+)
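The get_all_tasks change and the three new Task objects above (marked in the source as being for custom datasets) change what the task registry exposes: the SPEED task is now filtered out, while classification, token-classification and multiple-choice become available task names. A minimal usage sketch under those assumptions (the import path follows the euroeval/tasks.py header above):

# Hypothetical sketch of the registry after this release.
from euroeval.tasks import SPEED, get_all_tasks

tasks = get_all_tasks()

# The speed benchmark is excluded from the registry by the new filter ...
assert SPEED not in tasks.values()

# ... while the custom-dataset tasks are registered under the names shown in the diff.
for name in ("classification", "token-classification", "multiple-choice"):
    print(name, tasks[name].task_group)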
euroeval/tokenisation_utils.py CHANGED
@@ -1,5 +1,6 @@
 """Utility functions related to tokenisation."""

+import collections.abc as c
 import logging
 import re
 import typing as t
@@ -71,7 +72,7 @@ def get_special_token_metadata(tokeniser: "PreTrainedTokenizerBase") -> dict:


 def should_prompts_be_stripped(
-    labels_to_be_generated: list[str], tokeniser: "PreTrainedTokenizer"
+    labels_to_be_generated: c.Sequence[str], tokeniser: "PreTrainedTokenizer"
 ) -> bool:
     """Determine if we should strip the prompts for few-shot evaluation.

@@ -110,7 +111,7 @@ def should_prompts_be_stripped(


 def should_prefix_space_be_added_to_labels(
-    labels_to_be_generated: list[str], tokeniser: "PreTrainedTokenizer"
+    labels_to_be_generated: c.Sequence[str], tokeniser: "PreTrainedTokenizer"
 ) -> bool:
     """Determine if we should add a prefix space to the labels.

@@ -317,7 +318,7 @@ def get_pad_token(

 def get_end_of_chat_token_ids(
     tokeniser: "PreTrainedTokenizer", generative_type: GenerativeType | None
-) -> list[int] | None:
+) -> c.Sequence[int] | None:
     """Get the end token ID for chat models.

     This is only relevant for tokenisers with a chat template.
@@ -433,13 +434,19 @@ def get_first_label_token_mapping(

     # Tokenise some text containing each label, which we will use to extract the
     # first token of each label
-    all_tokens: list[list[str]]
+    all_tokens: c.Sequence[c.Sequence[str]]
     if not has_chat_template(tokeniser=tokeniser):
         add_prefix_space = should_prefix_space_be_added_to_labels(
             labels_to_be_generated=local_labels, tokeniser=tokeniser
         )
         all_tokens = [
-            tokeniser.tokenize(text=f" {label}" if add_prefix_space else label)
+            [
+                tokeniser.decode(token_id)
+                for token_id in tokeniser.encode(
+                    text=f" {label}" if add_prefix_space else label,
+                    add_special_tokens=False,
+                )
+            ]
             for label in local_labels
         ]
     else:
@@ -466,7 +473,7 @@ def get_first_label_token_mapping(
         all_tokens = [
             [
                 re.sub(
-                    pattern=r"^[^a-zæøåüöä0-9]+|[^a-zæøåüöä0-9]+$",
+                    pattern=r"^[^a-zæøåüöä0-9 ]+|[^a-zæøåüöä0-9 ]+$",
                     repl="",
                     string=token.lower(),
                 )
@@ -478,11 +485,13 @@ def get_first_label_token_mapping(
     # Extract the first token of each label
     first_tokens: list[str] = list()
     for token_list, label in zip(all_tokens, local_labels):
-        matching_tokens = [tok for tok in token_list if tok and label.startswith(tok)]
+        matching_tokens = [
+            tok for tok in token_list if tok and label.startswith(tok.strip())
+        ]
         if not matching_tokens:
             if log_metadata:
                 log_once(
-                    f"No matching token found in token_list for label '{label}', so "
+                    f"No matching token found in token_list for label {label!r}, so "
                    "we will not use logprobs with the model.",
                     level=logging.DEBUG,
                 )
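The three hunks above rework how the first token of each label is found: label token IDs are now decoded one at a time (rather than using tokeniser.tokenize), the clean-up regex no longer strips spaces, and the decoded piece is stripped before the startswith comparison. A small sketch of why the strip matters, using an arbitrary Hugging Face tokenizer (gpt2 here; the exact split is tokenizer-dependent):

# Illustrative only: shows the decode-per-token approach and the strip() in the match.
from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("gpt2")
label = "positive"

# Decode each token ID of " positive" individually, as the new branch does.
token_ids = tokeniser.encode(" positive", add_special_tokens=False)
tokens = [tokeniser.decode(token_id) for token_id in token_ids]
print(tokens)  # e.g. [' positive'] -- the decoded piece keeps its leading space

# 'positive'.startswith(' positive') is False, so the old check would find no match;
# stripping the token first, as in the new code, recovers the match.
matching = [tok for tok in tokens if tok and label.startswith(tok.strip())]
print(matching)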
@@ -549,12 +558,12 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:


 def apply_chat_template(
-    conversation: list[dict[str, str]],
+    conversation: c.Sequence[dict[str, str]],
     tokeniser: "PreTrainedTokenizer",
     tokenise: bool,
     add_generation_prompt: bool,
     **extra_kwargs,
-) -> str | list[int]:
+) -> str | c.Sequence[int]:
     """Apply the chat template to a prompt.

     Args:
euroeval/types.py CHANGED
@@ -1,5 +1,6 @@
 """Types used throughout the project."""

+import collections.abc as c
 import typing as t

 from transformers.trainer_utils import EvalPrediction
@@ -10,9 +11,9 @@ if t.TYPE_CHECKING:

     from .data_models import BenchmarkConfig, GenerativeModelOutput

-ScoreDict: t.TypeAlias = dict[str, dict[str, float] | list[dict[str, float]]]
-Predictions: t.TypeAlias = "NDArray | list[str] | list[list[str]]"
-Labels: t.TypeAlias = "NDArray | list[str] | list[list[str]]"
+ScoreDict: t.TypeAlias = dict[str, dict[str, float] | c.Sequence[dict[str, float]]]
+Predictions: t.TypeAlias = "NDArray | c.Sequence[str] | c.Sequence[c.Sequence[str]]"
+Labels: t.TypeAlias = "NDArray | c.Sequence[str] | c.Sequence[c.Sequence[str]]"


 class ComputeMetricsFunction(t.Protocol):
@@ -22,8 +23,8 @@ class ComputeMetricsFunction(t.Protocol):
         self,
         model_outputs_and_labels: EvalPrediction
         | tuple[
-            "NDArray | list[str] | list[list[str]]",
-            "NDArray | list[str] | list[list[str]]",
+            "NDArray | c.Sequence[str] | c.Sequence[c.Sequence[str]]",
+            "NDArray | c.Sequence[str] | c.Sequence[c.Sequence[str]]",
         ],
         dataset: "Dataset",
         benchmark_config: "BenchmarkConfig",
@@ -48,7 +49,7 @@ class ExtractLabelsFunction(t.Protocol):

     def __call__(
         self, input_batch: dict[str, list], model_output: "GenerativeModelOutput"
-    ) -> list[str]:
+    ) -> c.Sequence[str]:
         """Extract the labels from the generated output.

         Args:
@@ -63,7 +64,7 @@ class ExtractLabelsFunction(t.Protocol):
         ...


-def is_list_of_int(x: object) -> t.TypeGuard[list[int]]:
+def is_list_of_int(x: object) -> t.TypeGuard[c.Sequence[int]]:
     """Check if an object is a list of integers.

     Args:
@@ -76,7 +77,7 @@ def is_list_of_int(x: object) -> t.TypeGuard[list[int]]:
     return isinstance(x, list) and all(isinstance(i, int) for i in x)


-def is_list_of_list_of_int(x: object) -> t.TypeGuard[list[list[int]]]:
+def is_list_of_list_of_int(x: object) -> t.TypeGuard[c.Sequence[c.Sequence[int]]]:
     """Check if an object is a list of list of integers.

     Args:
@@ -93,7 +94,7 @@ def is_list_of_list_of_int(x: object) -> t.TypeGuard[list[list[int]]]:
     )


-def is_list_of_str(x: object) -> t.TypeGuard[list[str]]:
+def is_list_of_str(x: object) -> t.TypeGuard[c.Sequence[str]]:
     """Check if an object is a list of integers.

     Args:
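The is_list_of_* helpers above still check for a concrete list at runtime but now narrow to c.Sequence for type checkers. A minimal usage sketch (the function and inputs are invented; the import path follows the euroeval/types.py header above):

# Hypothetical usage of the narrowing helpers shown in the diff.
from euroeval.types import is_list_of_int, is_list_of_str

def first_element(value: object) -> int | str | None:
    # Inside each branch, type checkers treat `value` as a Sequence of the right type.
    if is_list_of_int(value) and value:
        return value[0]
    if is_list_of_str(value) and value:
        return value[0]
    return None

print(first_element([1, 2, 3]), first_element(["a", "b"]), first_element(3.14))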
euroeval/utils.py CHANGED
@@ -1,6 +1,7 @@
 """Utility functions to be used in other scripts."""

 import asyncio
+import collections.abc as c
 import gc
 import importlib
 import importlib.metadata
@@ -142,7 +143,9 @@ def enforce_reproducibility(seed: int = 4242) -> np.random.Generator:
     return rng


-def get_class_by_name(class_name: str | list[str], module_name: str) -> t.Type | None:
+def get_class_by_name(
+    class_name: str | c.Sequence[str], module_name: str
+) -> t.Type | None:
     """Get a class by its name.

     Args:
@@ -421,8 +424,8 @@ def get_hf_token(api_key: str | None) -> str | bool:


 def extract_multiple_choice_labels(
-    prompt: str, candidate_labels: list[str]
-) -> list[str]:
+    prompt: str, candidate_labels: c.Sequence[str]
+) -> c.Sequence[str]:
     """Extract multiple choice labels from a prompt.

     Args: