EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (87)
  1. euroeval/__init__.py +32 -14
  2. euroeval/benchmark_config_factory.py +92 -180
  3. euroeval/benchmark_modules/base.py +49 -39
  4. euroeval/benchmark_modules/fresh.py +35 -21
  5. euroeval/benchmark_modules/hf.py +280 -244
  6. euroeval/benchmark_modules/litellm.py +752 -312
  7. euroeval/benchmark_modules/vllm.py +570 -268
  8. euroeval/benchmarker.py +651 -528
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +49 -38
  12. euroeval/constants.py +44 -25
  13. euroeval/data_loading.py +111 -55
  14. euroeval/data_models.py +490 -323
  15. euroeval/dataset_configs/__init__.py +26 -4
  16. euroeval/dataset_configs/bosnian.py +39 -0
  17. euroeval/dataset_configs/bulgarian.py +56 -0
  18. euroeval/dataset_configs/croatian.py +56 -0
  19. euroeval/dataset_configs/czech.py +75 -0
  20. euroeval/dataset_configs/danish.py +78 -50
  21. euroeval/dataset_configs/dutch.py +74 -44
  22. euroeval/dataset_configs/english.py +71 -36
  23. euroeval/dataset_configs/estonian.py +111 -0
  24. euroeval/dataset_configs/faroese.py +25 -18
  25. euroeval/dataset_configs/finnish.py +63 -26
  26. euroeval/dataset_configs/french.py +65 -32
  27. euroeval/dataset_configs/german.py +77 -36
  28. euroeval/dataset_configs/greek.py +64 -0
  29. euroeval/dataset_configs/icelandic.py +68 -57
  30. euroeval/dataset_configs/italian.py +68 -36
  31. euroeval/dataset_configs/latvian.py +87 -0
  32. euroeval/dataset_configs/lithuanian.py +64 -0
  33. euroeval/dataset_configs/norwegian.py +98 -72
  34. euroeval/dataset_configs/polish.py +96 -0
  35. euroeval/dataset_configs/portuguese.py +63 -40
  36. euroeval/dataset_configs/serbian.py +64 -0
  37. euroeval/dataset_configs/slovak.py +55 -0
  38. euroeval/dataset_configs/slovene.py +56 -0
  39. euroeval/dataset_configs/spanish.py +68 -34
  40. euroeval/dataset_configs/swedish.py +82 -41
  41. euroeval/dataset_configs/ukrainian.py +64 -0
  42. euroeval/enums.py +12 -6
  43. euroeval/exceptions.py +21 -1
  44. euroeval/finetuning.py +34 -26
  45. euroeval/generation.py +76 -41
  46. euroeval/generation_utils.py +169 -34
  47. euroeval/languages.py +1020 -188
  48. euroeval/logging_utils.py +268 -0
  49. euroeval/metrics/__init__.py +6 -0
  50. euroeval/metrics/base.py +85 -0
  51. euroeval/metrics/huggingface.py +216 -0
  52. euroeval/metrics/llm_as_a_judge.py +260 -0
  53. euroeval/metrics/pipeline.py +289 -0
  54. euroeval/metrics/speed.py +48 -0
  55. euroeval/model_cache.py +40 -21
  56. euroeval/model_config.py +4 -5
  57. euroeval/model_loading.py +3 -0
  58. euroeval/prompt_templates/__init__.py +2 -0
  59. euroeval/prompt_templates/classification.py +206 -0
  60. euroeval/prompt_templates/linguistic_acceptability.py +157 -22
  61. euroeval/prompt_templates/multiple_choice.py +159 -17
  62. euroeval/prompt_templates/named_entity_recognition.py +318 -21
  63. euroeval/prompt_templates/reading_comprehension.py +207 -16
  64. euroeval/prompt_templates/sentiment_classification.py +205 -22
  65. euroeval/prompt_templates/summarization.py +122 -22
  66. euroeval/prompt_templates/token_classification.py +279 -0
  67. euroeval/scores.py +20 -9
  68. euroeval/speed_benchmark.py +11 -12
  69. euroeval/task_group_utils/multiple_choice_classification.py +21 -12
  70. euroeval/task_group_utils/question_answering.py +101 -73
  71. euroeval/task_group_utils/sequence_classification.py +144 -61
  72. euroeval/task_group_utils/text_to_text.py +33 -12
  73. euroeval/task_group_utils/token_classification.py +86 -89
  74. euroeval/tasks.py +75 -16
  75. euroeval/tokenisation_utils.py +603 -0
  76. euroeval/types.py +17 -11
  77. euroeval/utils.py +332 -137
  78. euroeval-16.7.1.dist-info/METADATA +623 -0
  79. euroeval-16.7.1.dist-info/RECORD +84 -0
  80. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
  81. euroeval/human_evaluation.py +0 -737
  82. euroeval/metrics.py +0 -452
  83. euroeval/tokenization_utils.py +0 -498
  84. euroeval-15.12.0.dist-info/METADATA +0 -285
  85. euroeval-15.12.0.dist-info/RECORD +0 -63
  86. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
  87. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,27 +1,32 @@
 """Utility functions related to generative models."""
 
+import collections.abc as c
 import itertools as it
 import json
 import logging
 import random
+import re
 import typing as t
 
-from .enums import TaskGroup
-from .exceptions import InvalidBenchmark
-from .utils import log_once
+from .enums import GenerativeType, TaskGroup
+from .exceptions import InvalidBenchmark, InvalidModel
+from .logging_utils import log_once
+from .tokenisation_utils import apply_chat_template
+from .utils import extract_multiple_choice_labels
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
     from transformers.tokenization_utils import PreTrainedTokenizer
 
-    from .data_models import DatasetConfig, ModelConfig
-
-logger = logging.getLogger("euroeval")
+    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
 
 
 def extract_few_shot_examples(
-    dataset: "DatasetDict", dataset_config: "DatasetConfig", itr_idx: int
-) -> list[dict[str, t.Any]]:
+    dataset: "DatasetDict",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+    itr_idx: int,
+) -> c.Sequence[dict[str, t.Any]]:
     """Extract few-shot examples from a dataset.
 
     This will always extract the examples from the training split.
@@ -33,12 +38,32 @@ def extract_few_shot_examples(
             The dataset to extract the few-shot examples from.
         dataset_config:
             The dataset configuration.
+        benchmark_config:
+            The benchmark configuration.
         itr_idx:
             The index of the dataset in the iterator.
 
     Returns:
         The few-shot examples.
+
+    Raises:
+        InvalidBenchmark:
+            If there are not enough short examples for few-shot learning.
     """
+    if dataset_config.task.requires_zero_shot and benchmark_config.few_shot:
+        msg = (
+            "This task only allows zero-shot evaluation, so even though you have "
+            "requested few-shot evaluation "
+        )
+        if benchmark_config.run_with_cli:
+            msg += "(by not setting the --zero-shot flag), "
+        else:
+            msg += "(by setting the default `few_shot=True` argument), "
+        msg += "we will run the evaluation in zero-shot mode."
+        benchmark_config.few_shot = False
+        log_once(msg, level=logging.DEBUG)
+        return []
+
     random_seed = 4242 + itr_idx
     num_few_shots = dataset_config.num_few_shot_examples
     few_shot_examples: list[dict[str, t.Any]] = list()
@@ -54,7 +79,7 @@ def extract_few_shot_examples(
                     lambda example: len(example["text"]) < max_num_tokens
                 )
                 num_short_examples = len(train_with_short_examples)
-                if num_short_examples >= dataset_config.num_few_shot_examples:
+                if num_short_examples >= num_few_shots:
                     break
             else:
                 raise InvalidBenchmark(
@@ -63,12 +88,19 @@ def extract_few_shot_examples(
 
             shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
             labels = it.cycle(dataset_config.labels)
+            labels_with_no_samples: set[str] = set()
             while len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0:
+                if len(labels_with_no_samples) == len(dataset_config.labels):
+                    raise InvalidBenchmark(
+                        "Could not find enough examples for few-shot learning. "
+                        "Please check the dataset and the labels."
+                    )
                 label = next(labels)
                 possible_examples = shuffled_train.filter(
                     lambda x: x["label"].lower() == label.lower()
                 )
                 if len(possible_examples) == 0:
+                    labels_with_no_samples.add(label)
                     continue
                 example = possible_examples.select(range(1))[0]
                 few_shot_examples.append(example)
@@ -112,7 +144,7 @@ def extract_few_shot_examples(
                     lambda example: len(example["context"]) < max_num_tokens
                 )
                 num_short_examples = len(train_with_short_examples)
-                if num_short_examples >= dataset_config.num_few_shot_examples:
+                if num_short_examples >= num_few_shots:
                     break
             else:
                 raise InvalidBenchmark(
@@ -139,12 +171,12 @@ def extract_few_shot_examples(
 
 def apply_prompt(
     examples: dict[str, t.Any],
-    few_shot_examples: list[dict[str, t.Any]],
+    few_shot_examples: c.Sequence[dict[str, t.Any]],
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
-    instruction_model: bool,
+    generative_type: GenerativeType | None,
     always_populate_text_field: bool,
-    tokenizer: "PreTrainedTokenizer | None",
+    tokeniser: "PreTrainedTokenizer | None",
 ) -> dict[str, t.Any]:
     """Apply prompt template to an example, potentially with few-shot examples.
 
@@ -153,23 +185,29 @@ def apply_prompt(
             The examples to apply the few-shot examples to.
         few_shot_examples:
             The few-shot examples to apply.
+        model_config:
+            The model configuration.
         dataset_config:
             The dataset configuration.
-        instruction_model:
-            Whether the model is instruction-tuned.
+        generative_type:
+            The generative type of the model.
         always_populate_text_field:
             Whether to always populate the 'text' field in the examples, as opposed to
             the 'messages' field.
-        tokenizer:
-            The tokenizer to use for the model. If None, the tokenizer is not used.
+        tokeniser:
+            The tokeniser to use for the model. If None, the tokeniser is not used.
 
     Returns:
         The example with the few-shot examples applied.
     """
     # Sanity check
-    if instruction_model and always_populate_text_field and tokenizer is None:
+    if (
+        generative_type in {GenerativeType.INSTRUCTION_TUNED, GenerativeType.REASONING}
+        and always_populate_text_field
+        and tokeniser is None
+    ):
         raise ValueError(
-            "The `tokenizer` argument must be provided when the model is instruction "
+            "The `tokeniser` argument must be provided when the model is instruction "
             "tuned and when we are not just returning the raw messages."
         )
 
@@ -191,7 +229,10 @@ def apply_prompt(
         )
         label_mapping = dataset_config.prompt_label_mapping
        label = label_mapping.get(label, label)
-        if instruction_model:
+        if generative_type in {
+            GenerativeType.INSTRUCTION_TUNED,
+            GenerativeType.REASONING,
+        }:
             prompt = dataset_config.instruction_prompt.format(**kwargs)
             return prompt, label
         else:
@@ -199,18 +240,49 @@ def apply_prompt(
             return dataset_config.prompt_template.format(**kwargs), ""
 
     match dataset_config.task.task_group:
-        case (
-            TaskGroup.SEQUENCE_CLASSIFICATION | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-        ):
+        case TaskGroup.SEQUENCE_CLASSIFICATION:
+            labels_str = dataset_config.get_labels_str()
             few_shot_sections = [
                 create_prompt(
                     text=example["text"].replace("\n", " ").strip(),
                     label=example["label"].replace("\n", " ").strip(),
+                    labels_str=labels_str,
                 )
                 for example in few_shot_examples
             ]
             new_sections = [
-                create_prompt(text=text.replace("\n", " ").strip(), label="")
+                create_prompt(
+                    text=text.replace("\n", " ").strip(),
+                    label="",
+                    labels_str=labels_str,
+                )
+                for text in examples["text"]
+            ]
+
+        case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
+            few_shot_sections = [
+                create_prompt(
+                    text=example["text"].replace("\n", " ").strip(),
+                    label=example["label"].replace("\n", " ").strip(),
+                    labels_str=dataset_config.get_labels_str(
+                        labels=extract_multiple_choice_labels(
+                            prompt=example["text"],
+                            candidate_labels=dataset_config.labels,
+                        )
+                    ),
+                )
+                for example in few_shot_examples
+            ]
+            new_sections = [
+                create_prompt(
+                    text=text.replace("\n", " ").strip(),
+                    label="",
+                    labels_str=dataset_config.get_labels_str(
+                        labels=extract_multiple_choice_labels(
+                            prompt=text, candidate_labels=dataset_config.labels
+                        )
+                    ),
+                )
                 for text in examples["text"]
             ]
 
@@ -228,6 +300,7 @@ def apply_prompt(
             ]
 
         case TaskGroup.TOKEN_CLASSIFICATION:
+            labels_str = dataset_config.get_labels_str()
 
             def create_label(example: dict) -> str:
                 prompt_labels = dataset_config.prompt_label_mapping.values()
@@ -249,12 +322,15 @@ def apply_prompt(
                 create_prompt(
                     text=" ".join(example["tokens"]).replace("\n", " ").strip(),
                     label=create_label(example=example),
+                    labels_str=labels_str,
                 )
                 for example in few_shot_examples
             ]
             new_sections = [
                 create_prompt(
-                    text=" ".join(tokens).replace("\n", " ").strip(), label=""
+                    text=" ".join(tokens).replace("\n", " ").strip(),
+                    label="",
+                    labels_str=labels_str,
                 )
                 for tokens in examples["tokens"]
             ]
@@ -282,7 +358,7 @@ def apply_prompt(
                 f"Unsupported task group: {dataset_config.task.task_group}."
             )
 
-    if instruction_model:
+    if generative_type in {GenerativeType.INSTRUCTION_TUNED, GenerativeType.REASONING}:
         few_shot_messages = [
             dict(role=role, content=content)
             for prompt, label in few_shot_sections
@@ -296,33 +372,46 @@ def apply_prompt(
 
     if not always_populate_text_field:
         examples["messages"] = messages_list
-
     else:
-        assert tokenizer is not None
+        assert tokeniser is not None
 
         # Pick the chat template that matches the language of the dataset, if such a
         # template exists
         chat_template: str | None = None
-        if isinstance(tokenizer.chat_template, dict):
+        if hasattr(tokeniser, "chat_template") and isinstance(
+            tokeniser.chat_template, dict
+        ):
            language_codes = [
                language.code for language in dataset_config.languages
            ]
-            for name, candidate_template in tokenizer.chat_template.items():
+            for name, candidate_template in tokeniser.chat_template.items():
                if name.lower() in language_codes:
                    chat_template = candidate_template
                    log_once(
-                        f"Using the {name!r} chat template for the tokenizer for "
+                        f"Using the {name!r} chat template for the tokeniser for "
                        f"model {model_config.model_id!r}.",
                        level=logging.DEBUG,
                    )
                    break
 
+        # Custom chat template kwargs
+        chat_template_kwargs: dict[str, t.Any] = dict()
+        if model_config.param in {"low", "medium", "high"}:
+            chat_template_kwargs["reasoning_effort"] = model_config.param
+            log_once(
+                f"Set reasoning mode to {model_config.param!r}.",
+                level=logging.DEBUG,
+            )
+
        texts = [
-            tokenizer.apply_chat_template(
+            apply_chat_template(
                conversation=messages,
-                tokenize=False,
+                tokeniser=tokeniser,
+                tokenise=False,
                add_generation_prompt=True,
+                enable_thinking=(generative_type == GenerativeType.REASONING),
                chat_template=chat_template,
+                **chat_template_kwargs,
            )
            for messages in messages_list
        ]
@@ -332,7 +421,10 @@ def apply_prompt(
     else:
         prompt_prefix = ""
         if dataset_config.prompt_prefix:
-            prompt_prefix = dataset_config.prompt_prefix + "\n\n"
+            labels_str = dataset_config.get_labels_str()
+            prompt_prefix = (
+                dataset_config.prompt_prefix.format(labels_str=labels_str) + "\n\n"
+            )
 
         few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
         if few_shot_prompt:
@@ -343,4 +435,47 @@ def apply_prompt(
            for new_prompt, _ in new_sections
        ]
 
+    # Always add the final prompts without few-shot examples, too, for analysis
+    examples["prompt"] = [new_prompt for new_prompt, _ in new_sections]
+
     return examples
+
+
+def raise_if_wrong_params(
+    model_config: "ModelConfig", allowed_params: dict[re.Pattern, c.Sequence[str]]
+) -> None:
+    """Raise an error if the model configuration has invalid parameters.
+
+    Args:
+        model_config:
+            The model configuration.
+        allowed_params:
+            The allowed parameters for the model, being a dictionary mapping a regex
+            pattern matching the model ID to a list of allowed parameters for those
+            models.
+
+    Raises:
+        InvalidModel:
+            If the model configuration has invalid parameters.
+    """
+    # Do nothing if there are no parameters to check
+    if model_config.param is None:
+        return
+
+    # Make list of all allowed parameters for the model
+    all_allowed_params: set[str] = set()
+    for model_regex, allowed_params_list in allowed_params.items():
+        if re.fullmatch(pattern=model_regex, string=model_config.model_id):
+            all_allowed_params.update(allowed_params_list)
+
+    # Raise error if the parameter is not allowed
+    if model_config.param not in all_allowed_params:
+        msg = (
+            f"Invalid parameter {model_config.param!r} for model "
+            f"{model_config.model_id!r}."
+        )
+        if all_allowed_params:
+            msg += f" Allowed parameters are: {', '.join(all_allowed_params)}."
+        else:
+            msg += " No parameters are allowed."
+        raise InvalidModel(msg)
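
The hunk above appears to come from euroeval/generation_utils.py (its module docstring and its +169/−34 line counts match that entry in the file list), and it introduces the new raise_if_wrong_params helper. The snippet below is a hedged sketch of how that helper could be exercised, based only on the signature and docstring shown in the diff: the import path is assumed from the file list, the SimpleNamespace is a hypothetical stand-in for euroeval.data_models.ModelConfig, and the regex pattern and parameter values are illustrative rather than EuroEval's real allow-list.

import re
from types import SimpleNamespace

# Assumed import path, inferred from the file list above.
from euroeval.generation_utils import raise_if_wrong_params

# Hypothetical stand-in for ModelConfig; per the diff, the helper only reads
# the .param and .model_id attributes.
model_config = SimpleNamespace(model_id="gpt-5-mini", param="high")

# Illustrative allow-list: regex patterns over model IDs mapped to the
# parameters those models accept (here, a reasoning-effort-style parameter).
allowed_params: dict[re.Pattern, list[str]] = {
    re.compile(r"gpt-5.*"): ["low", "medium", "high"],
}

# Passes silently because "high" is allowed for IDs matching "gpt-5.*";
# an unrecognised parameter would raise euroeval.exceptions.InvalidModel.
raise_if_wrong_params(model_config=model_config, allowed_params=allowed_params)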