EuroEval 15.16.0__py3-none-any.whl → 16.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63)
  1. euroeval/__init__.py +3 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +161 -114
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +13 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +53 -7
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +38 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +46 -14
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +234 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  41. euroeval/prompt_templates/multiple_choice.py +23 -2
  42. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  43. euroeval/prompt_templates/reading_comprehension.py +42 -2
  44. euroeval/prompt_templates/sentiment_classification.py +46 -2
  45. euroeval/prompt_templates/summarization.py +24 -4
  46. euroeval/scores.py +7 -2
  47. euroeval/speed_benchmark.py +6 -6
  48. euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  49. euroeval/task_group_utils/question_answering.py +35 -28
  50. euroeval/task_group_utils/sequence_classification.py +96 -23
  51. euroeval/task_group_utils/text_to_text.py +7 -3
  52. euroeval/task_group_utils/token_classification.py +47 -75
  53. euroeval/tasks.py +31 -6
  54. euroeval/tokenization_utils.py +295 -207
  55. euroeval/utils.py +118 -34
  56. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +11 -14
  57. euroeval-16.0.0.dist-info/RECORD +69 -0
  58. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
  59. euroeval/human_evaluation.py +0 -738
  60. euroeval/metrics.py +0 -470
  61. euroeval-15.16.0.dist-info/RECORD +0 -63
  62. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/task_group_utils/token_classification.py CHANGED
@@ -1,15 +1,16 @@
  """Utility functions related to the token-classification task group."""

  import logging
- import re
  import typing as t
  from copy import deepcopy

- import demjson3
  import numpy as np

  from ..exceptions import InvalidBenchmark
- from ..utils import raise_if_model_output_contains_nan_values
+ from ..utils import (
+     extract_json_dict_from_string,
+     raise_if_model_output_contains_nan_values,
+ )

  if t.TYPE_CHECKING:
      from datasets.arrow_dataset import Dataset
@@ -17,7 +18,7 @@ if t.TYPE_CHECKING:
      from transformers.tokenization_utils_base import BatchEncoding
      from transformers.trainer_utils import EvalPrediction

-     from ..data_models import DatasetConfig, GenerativeModelOutput
+     from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
      from ..types import Labels, Predictions


@@ -28,6 +29,7 @@ def compute_metrics(
      model_outputs_and_labels: "tuple[Predictions, Labels] | EvalPrediction",
      has_misc_tags: bool,
      dataset_config: "DatasetConfig",
+     benchmark_config: "BenchmarkConfig",
      dataset: "Dataset",
  ) -> dict[str, float]:
      """Compute the metrics needed for evaluation.
@@ -40,6 +42,8 @@
              Whether the dataset has MISC tags.
          dataset_config:
              The configuration of the dataset.
+         benchmark_config:
+             The configuration of the benchmark.
          dataset:
              The dataset used for evaluation. This is only used in case any additional
              metadata is used to compute the metrics.
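
Note: callers of compute_metrics now have to pass the benchmark configuration alongside the dataset configuration. A rough sketch of a call against the new signature (the variable names below are placeholders, not part of this diff):

    from euroeval.task_group_utils.token_classification import compute_metrics

    scores = compute_metrics(
        model_outputs_and_labels=(predictions, labels),  # predictions and gold labels
        has_misc_tags=True,                              # dataset contains MISC tags
        dataset_config=dataset_config,                   # DatasetConfig (placeholder)
        benchmark_config=benchmark_config,               # BenchmarkConfig, new in 16.0.0
        dataset=dataset,                                 # the evaluation Dataset
    )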
@@ -142,7 +146,11 @@
          if metric.name == "micro_f1"
      )
      micro_f1_score = metric(
-         predictions=predictions, references=list(labels), dataset=dataset
+         predictions=predictions,
+         references=list(labels),
+         dataset=dataset,
+         dataset_config=dataset_config,
+         benchmark_config=benchmark_config,
      )

      # Compute the metrics without MISC tags
@@ -165,7 +173,11 @@
          if metric.name == "micro_f1_no_misc"
      )
      micro_f1_no_misc_score = metric(
-         predictions=predictions_no_misc, references=labels_no_misc, dataset=dataset
+         predictions=predictions_no_misc,
+         references=labels_no_misc,
+         dataset=dataset,
+         dataset_config=dataset_config,
+         benchmark_config=benchmark_config,
      )

      # Raise error if the metrics are invalid
@@ -194,51 +206,11 @@ def extract_labels_from_generation(
      Returns:
          The predicted labels.
      """
-     raw_predictions = model_output.sequences
-
-     # Attempt to extract the JSON dictionary from the predictions
-     json_regex = r"\{[^{}]+?\}"
-     json_matches = [
-         re.search(pattern=json_regex, string=raw_prediction, flags=re.DOTALL)
-         or raw_prediction
-         for raw_prediction in raw_predictions
-     ]
-     raw_predictions = [
-         json_match.group() if isinstance(json_match, re.Match) else json_match
-         for json_match in json_matches
-     ]
-
      tokens = input_batch["tokens"]
      predicted_labels: list[list[str]] = [["o"] * len(token_ids) for token_ids in tokens]
-     for idx, raw_prediction in enumerate(raw_predictions):
-         try:
-             json_output = demjson3.decode(txt=raw_prediction)
-             if not isinstance(json_output, dict):
-                 logger.debug(
-                     "The model output is not a JSON dictionary, so cannot parse "
-                     f"it. Skipping. Here is the output: {raw_prediction}"
-                 )
-                 continue
-             elif not all(isinstance(key, str) for key in json_output.keys()):
-                 logger.debug(
-                     "The model output is not a JSON dictionary with string keys, "
-                     "so cannot parse it. Skipping. Here is the output: "
-                     f"{raw_prediction}"
-                 )
-                 continue
-             elif not all(isinstance(value, list) for value in json_output.values()):
-                 logger.debug(
-                     "The model output is not a JSON dictionary with list values, "
-                     "so cannot parse it. Skipping. Here is the output: "
-                     f"{raw_prediction}"
-                 )
-                 continue
-             prediction_dict: dict[str, list[str]] = json_output
-         except demjson3.JSONDecodeError:
-             logger.debug(
-                 "The model output is not valid JSON, so cannot parse it. Skipping. "
-                 f"Here is the output: {raw_prediction!r}"
-             )
+     for idx, raw_prediction in enumerate(model_output.sequences):
+         prediction_dict = extract_json_dict_from_string(s=raw_prediction)
+         if prediction_dict is None:
              continue

          prompt_label_mapping = dataset_config.prompt_label_mapping
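
Note: the new helper extract_json_dict_from_string lives in euroeval/utils.py, which is not part of this excerpt. A minimal sketch of what it presumably does, reconstructed from the demjson3-based logic it replaces above (the actual implementation in 16.0.0 may differ):

    import re
    import typing as t

    import demjson3


    def extract_json_dict_from_string(s: str) -> dict[str, t.Any] | None:
        """Extract the first JSON dictionary from a string, or None if parsing fails."""
        # Grab the first {...} block if present, otherwise try the whole string
        match = re.search(r"\{[^{}]+?\}", s, flags=re.DOTALL)
        candidate = match.group() if match is not None else s

        # Decode leniently, mirroring the removed demjson3-based parsing
        try:
            decoded = demjson3.decode(txt=candidate)
        except demjson3.JSONDecodeError:
            return None

        # Only accept a dictionary with string keys and list values, as the
        # NER label extraction above requires
        if not isinstance(decoded, dict):
            return None
        if not all(isinstance(key, str) for key in decoded):
            return None
        if not all(isinstance(value, list) for value in decoded.values()):
            return None
        return decoded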
@@ -272,15 +244,15 @@


  def tokenize_and_align_labels(
-     examples: dict, tokenizer: "PreTrainedTokenizer", label2id: dict[str, int]
+     examples: dict, tokeniser: "PreTrainedTokenizer", label2id: dict[str, int]
  ) -> "BatchEncoding":
      """Tokenise all texts and align the labels with them.

      Args:
          examples:
              The examples to be tokenised.
-         tokenizer:
-             A pretrained tokenizer.
+         tokeniser:
+             A pretrained tokeniser.
          label2id:
              A dictionary that converts NER tags to IDs.

@@ -289,22 +261,22 @@ def tokenize_and_align_labels(
      """
      # Tokenize the texts. We use the `is_split_into_words` argument here because
      # the texts in our dataset are lists of words (with a label for each word)
-     tokenized_inputs = tokenizer(
+     tokenized_inputs = tokeniser(
          examples["tokens"], is_split_into_words=True, truncation=True, padding=True
      )

      # Extract a mapping between all the tokens and their corresponding word. If the
-     # tokenizer is of a "fast" variant then this can be accessed through the
+     # tokeniser is of a "fast" variant then this can be accessed through the
      # `word_ids` method. Otherwise, we have to extract it manually.
      all_labels: list[list[int]] = list()
      labels: list[str]
      word_ids: list[int | None]
      for i, labels in enumerate(examples["labels"]):
-         # Try to get the word IDs from the tokenizer
+         # Try to get the word IDs from the tokeniser
          try:
              word_ids = tokenized_inputs.word_ids(batch_index=i)

-         # If the tokenizer is not of a "fast" variant, we have to extract the word
+         # If the tokeniser is not of a "fast" variant, we have to extract the word
          # IDs manually
          except ValueError:
              # Get the list of words in the document
@@ -314,7 +286,7 @@ def tokenize_and_align_labels(
              tok_ids: list[int] = tokenized_inputs.input_ids[i]

              # Decode the token IDs
-             tokens = tokenizer.convert_ids_to_tokens(tok_ids)
+             tokens = tokeniser.convert_ids_to_tokens(tok_ids)
              assert isinstance(tokens, list)

              # Remove prefixes from the tokens
@@ -326,14 +298,14 @@ def tokenize_and_align_labels(
                      tokens[tok_idx] = tok[len(prefix) :]

              # Replace UNK tokens with the correct word
-             tokens = handle_unk_tokens(tokenizer=tokenizer, tokens=tokens, words=words)
+             tokens = handle_unk_tokens(tokeniser=tokeniser, tokens=tokens, words=words)

-             # Get list of special tokens. Some tokenizers do not record these
+             # Get list of special tokens. Some tokenisers do not record these
              # properly, which is why we convert the values to their indices and
              # then back to strings
              sp_toks = [
-                 tokenizer.convert_ids_to_tokens(tokenizer.convert_tokens_to_ids(sp_tok))
-                 for sp_tok in tokenizer.special_tokens_map.values()
+                 tokeniser.convert_ids_to_tokens(tokeniser.convert_tokens_to_ids(sp_tok))
+                 for sp_tok in tokeniser.special_tokens_map.values()
              ]

              # Replace special tokens with `None`
@@ -357,7 +329,7 @@ def tokenize_and_align_labels(
              if len(word_idxs) != len(token_idxs):
                  raise InvalidBenchmark(
                      "The tokens could not be aligned with the words during manual "
-                     "word-token alignment. It seems that the tokenizer is neither "
+                     "word-token alignment. It seems that the tokeniser is neither "
                      "of the fast variant nor of a SentencePiece/WordPiece variant."
                  )

@@ -387,9 +359,9 @@ def tokenize_and_align_labels(
                  label = labels[word_id]
                  try:
                      label_id = label2id[label.lower()]
-                 except KeyError:
+                 except KeyError as e:
                      msg = f"The label {label} was not found in the model's config."
-                     raise InvalidBenchmark(msg)
+                     raise InvalidBenchmark(msg) from e
                  label_ids.append(label_id)

              # For the other tokens in a word, we set the label to -100
@@ -404,13 +376,13 @@


  def handle_unk_tokens(
-     tokenizer: "PreTrainedTokenizer", tokens: list[str], words: list[str]
+     tokeniser: "PreTrainedTokenizer", tokens: list[str], words: list[str]
  ) -> list[str]:
      """Replace unknown tokens in the tokens with the corresponding word.

      Args:
-         tokenizer:
-             The tokenizer used to tokenize the words.
+         tokeniser:
+             The tokeniser used to tokenize the words.
          tokens:
              The list of tokens.
          words:
@@ -420,15 +392,15 @@ def handle_unk_tokens(
          The list of tokens with unknown tokens replaced by the corresponding word.
      """
      # Locate the token indices of the unknown tokens
-     token_unk_idxs = [i for i, tok in enumerate(tokens) if tok == tokenizer.unk_token]
+     token_unk_idxs = [i for i, tok in enumerate(tokens) if tok == tokeniser.unk_token]

      # Locate the word indices of the words which contain an unknown token
      word_unk_idxs = [
          i
          for i, word in enumerate(words)
-         if tokenizer.unk_token
-         in tokenizer.convert_ids_to_tokens(
-             tokenizer.encode(word, add_special_tokens=False)
+         if tokeniser.unk_token
+         in tokeniser.convert_ids_to_tokens(
+             tokeniser.encode(word, add_special_tokens=False)
          )
      ]

@@ -438,8 +410,8 @@ def handle_unk_tokens(
          word = words[word_idx]

          # Tokenize the word, which is now a list containing at least one UNK token
-         tokens_with_unk = tokenizer.convert_ids_to_tokens(
-             tokenizer.encode(word, add_special_tokens=False)
+         tokens_with_unk = tokeniser.convert_ids_to_tokens(
+             tokeniser.encode(word, add_special_tokens=False)
          )

          # Iterate over the tokens in the word
@@ -448,10 +420,10 @@ def handle_unk_tokens(
              # of the content of this token from the word. The result of the `word`
              # variable will be the content of the UNK token.
              # NOTE: This is a bit hacky and not bulletproof. For instance, if the
-             # word is "1925-1950" and the tokenizer splits it into ["[UNK]", "-",
+             # word is "1925-1950" and the tokeniser splits it into ["[UNK]", "-",
              # "19", "50"], then the result will be 2519 instead of 1925. This
              # happens almost never, however, so we can live with it.
-             if possible_unk_token != tokenizer.unk_token:
+             if possible_unk_token != tokeniser.unk_token:
                  word = word.replace(possible_unk_token, "", 1)

              # Replace the token with the word
euroeval/tasks.py CHANGED
@@ -2,7 +2,7 @@

  from . import metrics as m
  from .data_models import Task
- from .enums import TaskGroup
+ from .enums import GenerativeType, ModelType, TaskGroup
  from .prompt_templates import (
      LA_TEMPLATES,
      MULTIPLE_CHOICE_TEMPLATES,
@@ -28,8 +28,9 @@ LA = Task(
      template_dict=LA_TEMPLATES,
      metrics=[m.mcc_metric, m.macro_f1_metric],
      default_num_few_shot_examples=12,
-     default_max_generated_tokens=5,
+     default_max_generated_tokens=10,
      default_labels=["correct", "incorrect"],
+     uses_logprobs=True,
  )


@@ -51,6 +52,7 @@ NER = Task(
          "b-misc",
          "i-misc",
      ],
+     uses_structured_output=True,
  )


@@ -71,8 +73,9 @@ SENT = Task(
      template_dict=SENT_TEMPLATES,
      metrics=[m.mcc_metric, m.macro_f1_metric],
      default_num_few_shot_examples=12,
-     default_max_generated_tokens=5,
+     default_max_generated_tokens=10,
      default_labels=["positive", "neutral", "negative"],
+     uses_logprobs=True,
  )


@@ -84,6 +87,7 @@ SUMM = Task(
      default_num_few_shot_examples=1,
      default_max_generated_tokens=256,
      default_labels=[],
+     allowed_model_types=[ModelType.GENERATIVE],
  )


@@ -93,8 +97,9 @@ KNOW = Task(
      template_dict=MULTIPLE_CHOICE_TEMPLATES,
      metrics=[m.mcc_metric, m.accuracy_metric],
      default_num_few_shot_examples=5,
-     default_max_generated_tokens=5,
+     default_max_generated_tokens=10,
      default_labels=["a", "b", "c", "d"],
+     uses_logprobs=True,
  )


@@ -104,8 +109,9 @@ MCRC = Task(
      template_dict=MULTIPLE_CHOICE_TEMPLATES,
      metrics=[m.mcc_metric, m.accuracy_metric],
      default_num_few_shot_examples=5,
-     default_max_generated_tokens=5,
+     default_max_generated_tokens=10,
      default_labels=["a", "b", "c", "d"],
+     uses_logprobs=True,
  )


@@ -115,8 +121,27 @@ COMMON_SENSE = Task(
      template_dict=MULTIPLE_CHOICE_TEMPLATES,
      metrics=[m.mcc_metric, m.accuracy_metric],
      default_num_few_shot_examples=5,
-     default_max_generated_tokens=5,
+     default_max_generated_tokens=10,
      default_labels=["a", "b", "c", "d"],
+     uses_logprobs=True,
+ )
+
+
+ EUROPEAN_VALUES = Task(
+     name="european-values",
+     task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
+     template_dict=MULTIPLE_CHOICE_TEMPLATES,
+     metrics=[m.european_values_metric],
+     default_num_few_shot_examples=0,
+     default_max_generated_tokens=10,
+     default_labels=["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
+     allowed_model_types=[ModelType.GENERATIVE],
+     allowed_generative_types=[
+         GenerativeType.INSTRUCTION_TUNED,
+         GenerativeType.REASONING,
+     ],
+     requires_zero_shot=True,
+     uses_logprobs=True,
  )


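Note: the Task fields introduced in this release (uses_logprobs, uses_structured_output, allowed_model_types, allowed_generative_types and requires_zero_shot) are ordinary keyword arguments on Task, as the EUROPEAN_VALUES definition above shows. A hypothetical task definition using them, purely for illustration (the task name and field combination below are made up and not part of the package):

    from euroeval import metrics as m
    from euroeval.data_models import Task
    from euroeval.enums import GenerativeType, ModelType, TaskGroup
    from euroeval.prompt_templates import MULTIPLE_CHOICE_TEMPLATES

    MY_QUIZ = Task(
        name="my-quiz",  # hypothetical task name
        task_group=TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
        template_dict=MULTIPLE_CHOICE_TEMPLATES,
        metrics=[m.mcc_metric, m.accuracy_metric],
        default_num_few_shot_examples=5,
        default_max_generated_tokens=10,
        default_labels=["a", "b", "c", "d"],
        allowed_model_types=[ModelType.GENERATIVE],  # restrict to generative models
        allowed_generative_types=[GenerativeType.INSTRUCTION_TUNED],
        requires_zero_shot=True,  # presumably evaluated without few-shot examples
        uses_logprobs=True,  # presumably allows scoring labels via log-probabilities
    )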