EuroEval 15.16.0__py3-none-any.whl → 16.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)
  1. euroeval/__init__.py +8 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +199 -139
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +19 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +73 -23
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +35 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +90 -20
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +276 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/model_cache.py +13 -1
  41. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  42. euroeval/prompt_templates/multiple_choice.py +23 -2
  43. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  44. euroeval/prompt_templates/reading_comprehension.py +42 -2
  45. euroeval/prompt_templates/sentiment_classification.py +46 -2
  46. euroeval/prompt_templates/summarization.py +24 -4
  47. euroeval/scores.py +7 -2
  48. euroeval/speed_benchmark.py +6 -6
  49. euroeval/task_group_utils/multiple_choice_classification.py +19 -8
  50. euroeval/task_group_utils/question_answering.py +35 -28
  51. euroeval/task_group_utils/sequence_classification.py +128 -42
  52. euroeval/task_group_utils/text_to_text.py +7 -3
  53. euroeval/task_group_utils/token_classification.py +59 -73
  54. euroeval/tasks.py +33 -6
  55. euroeval/tokenization_utils.py +294 -207
  56. euroeval/utils.py +150 -35
  57. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
  58. euroeval-16.0.1.dist-info/RECORD +69 -0
  59. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
  60. euroeval/human_evaluation.py +0 -738
  61. euroeval/metrics.py +0 -470
  62. euroeval-15.16.0.dist-info/RECORD +0 -63
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
  64. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
@@ -75,7 +75,11 @@ def compute_metrics(
     while True:
         try:
             score: float | None = metric(
-                predictions=predictions, references=labels, dataset=dataset
+                predictions=predictions,
+                references=labels,
+                dataset=dataset,
+                dataset_config=dataset_config,
+                benchmark_config=benchmark_config,
             )
             break
         except Exception as e:
@@ -85,7 +89,7 @@ def compute_metrics(
                 "MPS backend out of memory",
             ]
             if not any(error in str(e) for error in oom_error):
-                raise InvalidBenchmark(str(e))
+                raise InvalidBenchmark(str(e)) from e
 
             if (
                 isinstance(metric, HuggingFaceMetric)
@@ -98,7 +102,7 @@ def compute_metrics(
                     "the CPU."
                 )
             else:
-                raise InvalidBenchmark(str(e))
+                raise InvalidBenchmark(str(e)) from e
         finally:
             for attribute in METRIC_ATTRIBUTES_TAKING_UP_MEMORY:
                 if hasattr(metric, attribute):
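
The metric call site above now passes `dataset_config` and `benchmark_config` alongside the predictions and references. Below is a minimal sketch of a metric callable compatible with that calling convention; the class name and signature are illustrative assumptions, not EuroEval's actual `Metric` base class from `euroeval/metrics/base.py`.

```python
# Minimal sketch of a metric callable matching the call site above.
# The class and its signature are assumptions for illustration only;
# EuroEval's real Metric base class (euroeval/metrics/base.py) may differ.
import typing as t


class DummyAccuracyMetric:
    name = "dummy_accuracy"

    def __call__(
        self,
        predictions: t.Sequence[t.Any],
        references: t.Sequence[t.Any],
        dataset: t.Any = None,
        dataset_config: t.Any = None,
        benchmark_config: t.Any = None,
    ) -> float | None:
        # The extra config objects are accepted (and may simply be ignored)
        # so that every metric shares the same calling convention.
        if not references:
            return None
        correct = sum(p == r for p, r in zip(predictions, references))
        return correct / len(references)


score = DummyAccuracyMetric()(predictions=["a", "b", "b"], references=["a", "b", "c"])
print(score)  # 0.666...
```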
euroeval/task_group_utils/token_classification.py CHANGED
@@ -1,15 +1,16 @@
 """Utility functions related to the token-classification task group."""
 
 import logging
-import re
 import typing as t
 from copy import deepcopy
 
-import demjson3
 import numpy as np
 
 from ..exceptions import InvalidBenchmark
-from ..utils import raise_if_model_output_contains_nan_values
+from ..utils import (
+    extract_json_dict_from_string,
+    raise_if_model_output_contains_nan_values,
+)
 
 if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
@@ -17,7 +18,7 @@ if t.TYPE_CHECKING:
     from transformers.tokenization_utils_base import BatchEncoding
     from transformers.trainer_utils import EvalPrediction
 
-    from ..data_models import DatasetConfig, GenerativeModelOutput
+    from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
     from ..types import Labels, Predictions
 
 
@@ -28,6 +29,7 @@ def compute_metrics(
     model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
     has_misc_tags: bool,
     dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     dataset: "Dataset",
 ) -> dict[str, float]:
     """Compute the metrics needed for evaluation.
@@ -40,6 +42,8 @@ def compute_metrics(
             Whether the dataset has MISC tags.
         dataset_config:
             The configuration of the dataset.
+        benchmark_config:
+            The configuration of the benchmark.
         dataset:
             The dataset used for evaluation. This is only used in case any additional
             metadata is used to compute the metrics.
@@ -142,7 +146,11 @@ def compute_metrics(
         if metric.name == "micro_f1"
     )
     micro_f1_score = metric(
-        predictions=predictions, references=list(labels), dataset=dataset
+        predictions=predictions,
+        references=list(labels),
+        dataset=dataset,
+        dataset_config=dataset_config,
+        benchmark_config=benchmark_config,
     )
 
     # Compute the metrics without MISC tags
@@ -165,7 +173,11 @@ def compute_metrics(
         if metric.name == "micro_f1_no_misc"
     )
     micro_f1_no_misc_score = metric(
-        predictions=predictions_no_misc, references=labels_no_misc, dataset=dataset
+        predictions=predictions_no_misc,
+        references=labels_no_misc,
+        dataset=dataset,
+        dataset_config=dataset_config,
+        benchmark_config=benchmark_config,
    )
 
     # Raise error if the metrics are invalid
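
For datasets with MISC tags, the code above computes micro-F1 twice: once on the raw label sequences and once after the MISC entities have been stripped. A hedged sketch of the stripping step, assuming the lowercase BIO tags used by the NER task defaults (`b-misc`, `i-misc`) and `o` as the outside tag; the helper name is hypothetical.

```python
# Hedged sketch: deriving the "no MISC" label sequences before re-running the
# micro-F1 metric. The helper name and the exact outside tag ("o") are
# assumptions for illustration; EuroEval's own implementation may differ.
def strip_misc_tags(sequences: list[list[str]]) -> list[list[str]]:
    """Replace b-misc/i-misc tags with the outside tag."""
    return [
        ["o" if tag.lower() in ("b-misc", "i-misc") else tag for tag in seq]
        for seq in sequences
    ]


labels = [["b-per", "i-per", "o", "b-misc"]]
predictions = [["b-per", "i-per", "b-misc", "b-misc"]]
labels_no_misc = strip_misc_tags(labels)
predictions_no_misc = strip_misc_tags(predictions)
print(labels_no_misc)       # [['b-per', 'i-per', 'o', 'o']]
print(predictions_no_misc)  # [['b-per', 'i-per', 'o', 'o']]
```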
@@ -194,55 +206,29 @@ extract_labels_from_generation(
     Returns:
         The predicted labels.
     """
-    raw_predictions = model_output.sequences
-
-    # Attempt to extract the JSON dictionary from the predictions
-    json_regex = r"\{[^{}]+?\}"
-    json_matches = [
-        re.search(pattern=json_regex, string=raw_prediction, flags=re.DOTALL)
-        or raw_prediction
-        for raw_prediction in raw_predictions
-    ]
-    raw_predictions = [
-        json_match.group() if isinstance(json_match, re.Match) else json_match
-        for json_match in json_matches
-    ]
-
     tokens = input_batch["tokens"]
     predicted_labels: list[list[str]] = [["o"] * len(token_ids) for token_ids in tokens]
-    for idx, raw_prediction in enumerate(raw_predictions):
-        try:
-            json_output = demjson3.decode(txt=raw_prediction)
-            if not isinstance(json_output, dict):
-                logger.debug(
-                    "The model output is not a JSON dictionary, so cannot parse "
-                    f"it. Skipping. Here is the output: {raw_prediction}"
-                )
-                continue
-            elif not all(isinstance(key, str) for key in json_output.keys()):
+    for idx, raw_prediction in enumerate(model_output.sequences):
+        prediction_dict = extract_json_dict_from_string(s=raw_prediction)
+        if prediction_dict is None:
+            continue
+
+        prompt_label_mapping = dataset_config.prompt_label_mapping
+        for prompt_tag_name, named_entities in prediction_dict.items():
+            if not isinstance(named_entities, list):
                 logger.debug(
-                    "The model output is not a JSON dictionary with string keys, "
-                    "so cannot parse it. Skipping. Here is the output: "
-                    f"{raw_prediction}"
+                    "The model produced an invalid format for the named entities. "
+                    f"Expected a list but got {type(named_entities)}. Skipping."
                 )
                 continue
-            elif not all(isinstance(value, list) for value in json_output.values()):
+            try:
+                named_entities = [str(ne) for ne in named_entities]
+            except Exception:
                 logger.debug(
-                    "The model output is not a JSON dictionary with list values, "
-                    "so cannot parse it. Skipping. Here is the output: "
-                    f"{raw_prediction}"
+                    "The model produced an invalid format for the named entities. "
+                    f"Expected a list of strings but got {named_entities}. Skipping."
                 )
                 continue
-            prediction_dict: dict[str, list[str]] = json_output
-        except demjson3.JSONDecodeError:
-            logger.debug(
-                "The model output is not valid JSON, so cannot parse it. Skipping. "
-                f"Here is the output: {raw_prediction!r}"
-            )
-            continue
-
-        prompt_label_mapping = dataset_config.prompt_label_mapping
-        for prompt_tag_name, named_entities in prediction_dict.items():
             try:
                 tag_name = [
                     tag[2:]
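
The bespoke regex-plus-`demjson3` parsing above is replaced by a single call to `extract_json_dict_from_string` from `euroeval/utils.py`. That helper's implementation is not part of this diff, so the sketch below only illustrates the behaviour the call site relies on (return the parsed dict, or `None` when no valid JSON object with string keys can be found); treat the details as assumptions.

```python
# Hedged sketch of a JSON-dict extraction helper in the spirit of
# `extract_json_dict_from_string` (the real implementation lives in
# euroeval/utils.py and is not shown in this diff, so details here are assumed).
import json
import re


def extract_json_dict_from_string(s: str) -> dict | None:
    """Return the first JSON object found in `s`, or None if parsing fails."""
    match = re.search(r"\{.*?\}", s, flags=re.DOTALL)
    if match is None:
        return None
    try:
        parsed = json.loads(match.group())
    except json.JSONDecodeError:
        return None
    # Only dictionaries with string keys are useful for NER label extraction
    if not isinstance(parsed, dict) or not all(isinstance(k, str) for k in parsed):
        return None
    return parsed


output = 'Sure! Here are the entities: {"person": ["Anna"], "location": ["Oslo"]}'
print(extract_json_dict_from_string(output))
# {'person': ['Anna'], 'location': ['Oslo']}
```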
@@ -272,15 +258,15 @@ def extract_labels_from_generation(
 
 
 def tokenize_and_align_labels(
-    examples: dict, tokenizer: "PreTrainedTokenizer", label2id: dict[str, int]
+    examples: dict, tokeniser: "PreTrainedTokenizer", label2id: dict[str, int]
 ) -> "BatchEncoding":
     """Tokenise all texts and align the labels with them.
 
     Args:
         examples:
             The examples to be tokenised.
-        tokenizer:
-            A pretrained tokenizer.
+        tokeniser:
+            A pretrained tokeniser.
         label2id:
             A dictionary that converts NER tags to IDs.
 
@@ -289,22 +275,22 @@ def tokenize_and_align_labels(
     """
     # Tokenize the texts. We use the `is_split_into_words` argument here because
     # the texts in our dataset are lists of words (with a label for each word)
-    tokenized_inputs = tokenizer(
+    tokenized_inputs = tokeniser(
         examples["tokens"], is_split_into_words=True, truncation=True, padding=True
     )
 
     # Extract a mapping between all the tokens and their corresponding word. If the
-    # tokenizer is of a "fast" variant then this can be accessed through the
+    # tokeniser is of a "fast" variant then this can be accessed through the
     # `word_ids` method. Otherwise, we have to extract it manually.
     all_labels: list[list[int]] = list()
     labels: list[str]
     word_ids: list[int | None]
     for i, labels in enumerate(examples["labels"]):
-        # Try to get the word IDs from the tokenizer
+        # Try to get the word IDs from the tokeniser
        try:
             word_ids = tokenized_inputs.word_ids(batch_index=i)
 
-        # If the tokenizer is not of a "fast" variant, we have to extract the word
+        # If the tokeniser is not of a "fast" variant, we have to extract the word
         # IDs manually
         except ValueError:
             # Get the list of words in the document
@@ -314,7 +300,7 @@ def tokenize_and_align_labels(
             tok_ids: list[int] = tokenized_inputs.input_ids[i]
 
             # Decode the token IDs
-            tokens = tokenizer.convert_ids_to_tokens(tok_ids)
+            tokens = tokeniser.convert_ids_to_tokens(tok_ids)
             assert isinstance(tokens, list)
 
             # Remove prefixes from the tokens
@@ -326,14 +312,14 @@ def tokenize_and_align_labels(
                     tokens[tok_idx] = tok[len(prefix) :]
 
             # Replace UNK tokens with the correct word
-            tokens = handle_unk_tokens(tokenizer=tokenizer, tokens=tokens, words=words)
+            tokens = handle_unk_tokens(tokeniser=tokeniser, tokens=tokens, words=words)
 
-            # Get list of special tokens. Some tokenizers do not record these
+            # Get list of special tokens. Some tokenisers do not record these
             # properly, which is why we convert the values to their indices and
             # then back to strings
             sp_toks = [
-                tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids(sp_tok))
-                for sp_tok in tokenizer.special_tokens_map.values()
+                tokeniser.convert_ids_to_tokens(tokeniser.convert_tokens_to_ids(sp_tok))
+                for sp_tok in tokeniser.special_tokens_map.values()
             ]
 
             # Replace special tokens with `None`
@@ -357,7 +343,7 @@ def tokenize_and_align_labels(
             if len(word_idxs) != len(token_idxs):
                 raise InvalidBenchmark(
                     "The tokens could not be aligned with the words during manual "
-                    "word-token alignment. It seems that the tokenizer is neither "
+                    "word-token alignment. It seems that the tokeniser is neither "
                     "of the fast variant nor of a SentencePiece/WordPiece variant."
                 )
 
@@ -387,9 +373,9 @@ def tokenize_and_align_labels(
                 label = labels[word_id]
                 try:
                     label_id = label2id[label.lower()]
-                except KeyError:
+                except KeyError as e:
                     msg = f"The label {label} was not found in the model's config."
-                    raise InvalidBenchmark(msg)
+                    raise InvalidBenchmark(msg) from e
                 label_ids.append(label_id)
 
             # For the other tokens in a word, we set the label to -100
@@ -404,13 +390,13 @@ def tokenize_and_align_labels(
 
 
 def handle_unk_tokens(
-    tokenizer: "PreTrainedTokenizer", tokens: list[str], words: list[str]
+    tokeniser: "PreTrainedTokenizer", tokens: list[str], words: list[str]
 ) -> list[str]:
     """Replace unknown tokens in the tokens with the corresponding word.
 
     Args:
-        tokenizer:
-            The tokenizer used to tokenize the words.
+        tokeniser:
+            The tokeniser used to tokenize the words.
         tokens:
             The list of tokens.
         words:
@@ -420,15 +406,15 @@ def handle_unk_tokens(
         The list of tokens with unknown tokens replaced by the corresponding word.
     """
     # Locate the token indices of the unknown tokens
-    token_unk_idxs = [i for i, tok in enumerate(tokens) if tok == tokenizer.unk_token]
+    token_unk_idxs = [i for i, tok in enumerate(tokens) if tok == tokeniser.unk_token]
 
     # Locate the word indices of the words which contain an unknown token
     word_unk_idxs = [
         i
         for i, word in enumerate(words)
-        if tokenizer.unk_token
-        in tokenizer.convert_ids_to_tokens(
-            tokenizer.encode(word, add_special_tokens=False)
+        if tokeniser.unk_token
+        in tokeniser.convert_ids_to_tokens(
+            tokeniser.encode(word, add_special_tokens=False)
         )
     ]
 
@@ -438,8 +424,8 @@ def handle_unk_tokens(
         word = words[word_idx]
 
         # Tokenize the word, which is now a list containing at least one UNK token
-        tokens_with_unk = tokenizer.convert_ids_to_tokens(
-            tokenizer.encode(word, add_special_tokens=False)
+        tokens_with_unk = tokeniser.convert_ids_to_tokens(
+            tokeniser.encode(word, add_special_tokens=False)
         )
 
         # Iterate over the tokens in the word
@@ -448,10 +434,10 @@ def handle_unk_tokens(
             # of the content of this token from the word. The result of the `word`
             # variable will be the content of the UNK token.
            # NOTE: This is a bit hacky and not bulletproof. For instance, if the
-            # word is "1925-1950" and the tokenizer splits it into ["[UNK]", "-",
+            # word is "1925-1950" and the tokeniser splits it into ["[UNK]", "-",
             # "19", "50"], then the result will be 2519 instead of 1925. This
             # happens almost never, however, so we can live with it.
-            if possible_unk_token != tokenizer.unk_token:
+            if possible_unk_token != tokeniser.unk_token:
                 word = word.replace(possible_unk_token, "", 1)
 
             # Replace the token with the word
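
Apart from the British-English rename of `tokenizer` to `tokeniser`, the alignment logic itself is unchanged: only the first sub-token of each word receives the word's label ID, while special tokens and continuation sub-tokens get `-100` so the loss ignores them. A self-contained sketch of that step, with a hard-coded `word_ids` list standing in for what a fast tokeniser would return:

```python
# Hedged sketch of the word/label alignment step: only the first sub-token of
# each word keeps the word's label ID; special tokens and continuation
# sub-tokens get -100 so they are ignored by the loss. The `word_ids` list
# below is hard-coded for illustration (a fast tokeniser would provide it).
label2id = {"o": 0, "b-per": 1, "i-per": 2}
labels = ["b-per", "i-per", "o"]          # one label per word
word_ids = [None, 0, 0, 1, 2, None]       # None marks special tokens

label_ids, previous_word_id = [], None
for word_id in word_ids:
    if word_id is None:
        label_ids.append(-100)            # special token
    elif word_id != previous_word_id:
        label_ids.append(label2id[labels[word_id].lower()])
    else:
        label_ids.append(-100)            # continuation sub-token
    previous_word_id = word_id

print(label_ids)  # [-100, 1, -100, 2, 0, -100]
```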
euroeval/tasks.py CHANGED
@@ -1,8 +1,9 @@
 """All benchmarks tasks used in EuroEval."""
 
 from . import metrics as m
+from .constants import NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
 from .data_models import Task
-from .enums import TaskGroup
+from .enums import GenerativeType, ModelType, TaskGroup
 from .prompt_templates import (
     LA_TEMPLATES,
     MULTIPLE_CHOICE_TEMPLATES,
@@ -28,8 +29,9 @@ LA = Task(
     template_dict=LA_TEMPLATES,
     metrics=[m.mcc_metric, m.macro_f1_metric],
     default_num_few_shot_examples=12,
-    default_max_generated_tokens=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["correct", "incorrect"],
+    uses_logprobs=True,
 )
 
 
@@ -51,6 +53,7 @@ NER = Task(
         "b-misc",
         "i-misc",
     ],
+    uses_structured_output=True,
 )
 
 
@@ -71,8 +74,9 @@ SENT = Task(
     template_dict=SENT_TEMPLATES,
     metrics=[m.mcc_metric, m.macro_f1_metric],
     default_num_few_shot_examples=12,
-    default_max_generated_tokens=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["positive", "neutral", "negative"],
+    uses_logprobs=True,
 )
 
 
@@ -84,6 +88,7 @@ SUMM = Task(
     default_num_few_shot_examples=1,
     default_max_generated_tokens=256,
     default_labels=[],
+    allowed_model_types=[ModelType.GENERATIVE],
 )
 
 
@@ -93,8 +98,9 @@ KNOW = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    uses_logprobs=True,
 )
 
 
@@ -104,8 +110,9 @@ MCRC = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    uses_logprobs=True,
 )
 
 
@@ -115,8 +122,28 @@ COMMON_SENSE = Task(
     template_dict=MULTIPLE_CHOICE_TEMPLATES,
     metrics=[m.mcc_metric, m.accuracy_metric],
     default_num_few_shot_examples=5,
-    default_max_generated_tokens=5,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    uses_logprobs=True,
+)
+
+
+EUROPEAN_VALUES = Task(
+    name="european-values",
+    task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+    template_dict=MULTIPLE_CHOICE_TEMPLATES,
+    metrics=[m.european_values_metric],
+    default_num_few_shot_examples=0,
+    default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
+    default_labels=["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"],
+    allowed_model_types=[ModelType.GENERATIVE],
+    allowed_generative_types=[
+        GenerativeType.INSTRUCTION_TUNED,
+        GenerativeType.REASONING,
+    ],
+    requires_zero_shot=True,
+    uses_logprobs=True,
+    allow_invalid_model_outputs=False,
 )
 
 
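
The `Task` definitions now carry several new fields (`uses_logprobs`, `uses_structured_output`, `allowed_model_types`, `allowed_generative_types`, `requires_zero_shot`, `allow_invalid_model_outputs`), and the hard-coded 5-token generation limit is replaced by `NUM_GENERATION_TOKENS_FOR_CLASSIFICATION` from `euroeval/constants.py`. The sketch below is a guess at how such fields could be modelled; the real definition lives in `euroeval/data_models.py`, the constant's value is not shown in this diff, and the defaults and enum members here are assumptions.

```python
# Hedged sketch of the new Task fields as a plain dataclass. Field names follow
# the keyword arguments used above, but defaults, types, and the remaining
# fields (templates, metrics, ...) are assumptions; see euroeval/data_models.py
# for the real definition.
from dataclasses import dataclass, field
from enum import Enum, auto


class ModelType(Enum):
    ENCODER = auto()
    GENERATIVE = auto()


class GenerativeType(Enum):
    BASE = auto()
    INSTRUCTION_TUNED = auto()
    REASONING = auto()


@dataclass
class Task:
    name: str
    default_max_generated_tokens: int  # e.g. NUM_GENERATION_TOKENS_FOR_CLASSIFICATION
    default_labels: list[str] = field(default_factory=list)
    allowed_model_types: list[ModelType] = field(default_factory=lambda: list(ModelType))
    allowed_generative_types: list[GenerativeType] = field(
        default_factory=lambda: list(GenerativeType)
    )
    requires_zero_shot: bool = False          # evaluate without few-shot examples
    uses_logprobs: bool = False               # score labels via token log-probabilities
    uses_structured_output: bool = False      # ask generative models for JSON output
    allow_invalid_model_outputs: bool = True  # tolerate unparsable generations


european_values = Task(
    name="european-values",
    default_max_generated_tokens=8,  # placeholder; real value comes from constants.py
    default_labels=list("abcdefghijk"),
    allowed_model_types=[ModelType.GENERATIVE],
    allowed_generative_types=[GenerativeType.INSTRUCTION_TUNED, GenerativeType.REASONING],
    requires_zero_shot=True,
    uses_logprobs=True,
    allow_invalid_model_outputs=False,
)
print(european_values)
```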