EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. euroeval/__init__.py +32 -14
  2. euroeval/benchmark_config_factory.py +92 -180
  3. euroeval/benchmark_modules/base.py +49 -39
  4. euroeval/benchmark_modules/fresh.py +35 -21
  5. euroeval/benchmark_modules/hf.py +280 -244
  6. euroeval/benchmark_modules/litellm.py +752 -312
  7. euroeval/benchmark_modules/vllm.py +570 -268
  8. euroeval/benchmarker.py +651 -528
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +49 -38
  12. euroeval/constants.py +44 -25
  13. euroeval/data_loading.py +111 -55
  14. euroeval/data_models.py +490 -323
  15. euroeval/dataset_configs/__init__.py +26 -4
  16. euroeval/dataset_configs/bosnian.py +39 -0
  17. euroeval/dataset_configs/bulgarian.py +56 -0
  18. euroeval/dataset_configs/croatian.py +56 -0
  19. euroeval/dataset_configs/czech.py +75 -0
  20. euroeval/dataset_configs/danish.py +78 -50
  21. euroeval/dataset_configs/dutch.py +74 -44
  22. euroeval/dataset_configs/english.py +71 -36
  23. euroeval/dataset_configs/estonian.py +111 -0
  24. euroeval/dataset_configs/faroese.py +25 -18
  25. euroeval/dataset_configs/finnish.py +63 -26
  26. euroeval/dataset_configs/french.py +65 -32
  27. euroeval/dataset_configs/german.py +77 -36
  28. euroeval/dataset_configs/greek.py +64 -0
  29. euroeval/dataset_configs/icelandic.py +68 -57
  30. euroeval/dataset_configs/italian.py +68 -36
  31. euroeval/dataset_configs/latvian.py +87 -0
  32. euroeval/dataset_configs/lithuanian.py +64 -0
  33. euroeval/dataset_configs/norwegian.py +98 -72
  34. euroeval/dataset_configs/polish.py +96 -0
  35. euroeval/dataset_configs/portuguese.py +63 -40
  36. euroeval/dataset_configs/serbian.py +64 -0
  37. euroeval/dataset_configs/slovak.py +55 -0
  38. euroeval/dataset_configs/slovene.py +56 -0
  39. euroeval/dataset_configs/spanish.py +68 -34
  40. euroeval/dataset_configs/swedish.py +82 -41
  41. euroeval/dataset_configs/ukrainian.py +64 -0
  42. euroeval/enums.py +12 -6
  43. euroeval/exceptions.py +21 -1
  44. euroeval/finetuning.py +34 -26
  45. euroeval/generation.py +76 -41
  46. euroeval/generation_utils.py +169 -34
  47. euroeval/languages.py +1020 -188
  48. euroeval/logging_utils.py +268 -0
  49. euroeval/metrics/__init__.py +6 -0
  50. euroeval/metrics/base.py +85 -0
  51. euroeval/metrics/huggingface.py +216 -0
  52. euroeval/metrics/llm_as_a_judge.py +260 -0
  53. euroeval/metrics/pipeline.py +289 -0
  54. euroeval/metrics/speed.py +48 -0
  55. euroeval/model_cache.py +40 -21
  56. euroeval/model_config.py +4 -5
  57. euroeval/model_loading.py +3 -0
  58. euroeval/prompt_templates/__init__.py +2 -0
  59. euroeval/prompt_templates/classification.py +206 -0
  60. euroeval/prompt_templates/linguistic_acceptability.py +157 -22
  61. euroeval/prompt_templates/multiple_choice.py +159 -17
  62. euroeval/prompt_templates/named_entity_recognition.py +318 -21
  63. euroeval/prompt_templates/reading_comprehension.py +207 -16
  64. euroeval/prompt_templates/sentiment_classification.py +205 -22
  65. euroeval/prompt_templates/summarization.py +122 -22
  66. euroeval/prompt_templates/token_classification.py +279 -0
  67. euroeval/scores.py +20 -9
  68. euroeval/speed_benchmark.py +11 -12
  69. euroeval/task_group_utils/multiple_choice_classification.py +21 -12
  70. euroeval/task_group_utils/question_answering.py +101 -73
  71. euroeval/task_group_utils/sequence_classification.py +144 -61
  72. euroeval/task_group_utils/text_to_text.py +33 -12
  73. euroeval/task_group_utils/token_classification.py +86 -89
  74. euroeval/tasks.py +75 -16
  75. euroeval/tokenisation_utils.py +603 -0
  76. euroeval/types.py +17 -11
  77. euroeval/utils.py +332 -137
  78. euroeval-16.7.1.dist-info/METADATA +623 -0
  79. euroeval-16.7.1.dist-info/RECORD +84 -0
  80. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
  81. euroeval/human_evaluation.py +0 -737
  82. euroeval/metrics.py +0 -452
  83. euroeval/tokenization_utils.py +0 -498
  84. euroeval-15.12.0.dist-info/METADATA +0 -285
  85. euroeval-15.12.0.dist-info/RECORD +0 -63
  86. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
  87. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,6 +1,6 @@
1
1
  """Freshly initialised encoder models."""
2
2
 
3
- import os
3
+ import re
4
4
  import typing as t
5
5
  from functools import cached_property
6
6
  from json import JSONDecodeError
@@ -26,10 +26,12 @@ from ..exceptions import (
26
26
  NeedsEnvironmentVariable,
27
27
  NeedsExtraInstalled,
28
28
  )
29
- from ..utils import block_terminal_output, create_model_cache_dir
29
+ from ..generation_utils import raise_if_wrong_params
30
+ from ..logging_utils import block_terminal_output
31
+ from ..utils import create_model_cache_dir, get_hf_token
30
32
  from .hf import (
31
33
  HuggingFaceEncoderModel,
32
- align_model_and_tokenizer,
34
+ align_model_and_tokeniser,
33
35
  setup_model_for_question_answering,
34
36
  )
35
37
 
@@ -45,12 +47,14 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
45
47
  """A freshly initialised encoder model."""
46
48
 
47
49
  fresh_model = True
50
+ allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}
48
51
 
49
52
  def __init__(
50
53
  self,
51
54
  model_config: "ModelConfig",
52
55
  dataset_config: "DatasetConfig",
53
56
  benchmark_config: "BenchmarkConfig",
57
+ log_metadata: bool = True,
54
58
  ) -> None:
55
59
  """Initialise the model.
56
60
 
@@ -61,23 +65,29 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
61
65
  The dataset configuration.
62
66
  benchmark_config:
63
67
  The benchmark configuration.
68
+ log_metadata:
69
+ Whether to log metadata about the model and the benchmark.
64
70
  """
71
+ raise_if_wrong_params(
72
+ model_config=model_config, allowed_params=self.allowed_params
73
+ )
74
+
65
75
  # This is already set when calling `super.__init__`, but we need it to get a
66
76
  # value from `self.model_max_length`, so we set it here as well.
67
77
  self.model_config = model_config
68
78
 
69
- model, tokenizer = load_model_and_tokenizer(
79
+ model, tokeniser = load_model_and_tokeniser(
70
80
  model_config=model_config,
71
81
  dataset_config=dataset_config,
72
82
  benchmark_config=benchmark_config,
73
83
  model_max_length=self.model_max_length,
74
84
  )
75
85
  self._model: "PreTrainedModel" = model
76
- self._tokenizer: "PreTrainedTokenizer" = tokenizer
86
+ self._tokeniser: "PreTrainedTokenizer" = tokeniser
77
87
 
78
- self._model, self._tokenizer = align_model_and_tokenizer(
88
+ self._model, self._tokeniser = align_model_and_tokeniser(
79
89
  model=self._model,
80
- tokenizer=self._tokenizer,
90
+ tokeniser=self._tokeniser,
81
91
  model_max_length=self.model_max_length,
82
92
  raise_errors=benchmark_config.raise_errors,
83
93
  )
@@ -88,6 +98,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
88
98
  model_config=model_config,
89
99
  dataset_config=dataset_config,
90
100
  benchmark_config=benchmark_config,
101
+ log_metadata=log_metadata,
91
102
  )
92
103
 
93
104
  @cached_property
@@ -180,9 +191,10 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
180
191
  """
181
192
  return ModelConfig(
182
193
  model_id=model_id,
194
+ revision="main",
195
+ param=None,
183
196
  task="fill-mask",
184
197
  languages=list(),
185
- revision="main",
186
198
  merge=False,
187
199
  inference_backend=InferenceBackend.TRANSFORMERS,
188
200
  model_type=ModelType.ENCODER,
@@ -194,13 +206,13 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
194
206
  )
195
207
 
196
208
 
197
- def load_model_and_tokenizer(
209
+ def load_model_and_tokeniser(
198
210
  model_config: "ModelConfig",
199
211
  dataset_config: "DatasetConfig",
200
212
  benchmark_config: "BenchmarkConfig",
201
213
  model_max_length: int,
202
214
  ) -> "tuple[PreTrainedModel, PreTrainedTokenizer]":
203
- """Load the model and tokenizer.
215
+ """Load the model and tokeniser.
204
216
 
205
217
  Args:
206
218
  model_config:
@@ -213,7 +225,7 @@ def load_model_and_tokenizer(
213
225
  The maximum context length of the model.
214
226
 
215
227
  Returns:
216
- The loaded model and tokenizer.
228
+ The loaded model and tokeniser.
217
229
  """
218
230
  config: "PretrainedConfig"
219
231
  block_terminal_output()
@@ -262,7 +274,7 @@ def load_model_and_tokenizer(
262
274
 
263
275
  config = AutoConfig.from_pretrained(
264
276
  real_model_id,
265
- token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
277
+ token=get_hf_token(api_key=benchmark_config.api_key),
266
278
  num_labels=len(id2label),
267
279
  id2label=id2label,
268
280
  label2id={label: id_ for id_, label in id2label.items()},
@@ -274,29 +286,31 @@ def load_model_and_tokenizer(
274
286
  if dataset_config.task.task_group == TaskGroup.QUESTION_ANSWERING:
275
287
  model = setup_model_for_question_answering(model=model)
276
288
 
277
- # Load the tokenizer. If the model is a subclass of a RoBERTa model then we
289
+ # Load the tokeniser. If the model is a subclass of a RoBERTa model then we
278
290
  # have to add a prefix space to the tokens, by the way the model is constructed
279
291
  prefix_models = ["Roberta", "GPT", "Deberta"]
280
292
  prefix = any(model_type in type(model).__name__ for model_type in prefix_models)
281
293
  try:
282
- tokenizer: "PreTrainedTokenizer" = AutoTokenizer.from_pretrained(
294
+ tokeniser: "PreTrainedTokenizer" = AutoTokenizer.from_pretrained(
283
295
  real_model_id,
284
296
  revision=model_config.revision,
285
- token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
297
+ token=get_hf_token(api_key=benchmark_config.api_key),
286
298
  add_prefix_space=prefix,
287
299
  cache_dir=model_config.model_cache_dir,
288
- use_fast=True,
300
+ use_fast=False if model_config.param == "slow-tokenizer" else True,
289
301
  verbose=False,
290
302
  trust_remote_code=benchmark_config.trust_remote_code,
291
303
  )
292
- except (JSONDecodeError, OSError):
293
- raise InvalidModel(f"Could not load tokenizer for model {real_model_id!r}.")
304
+ except (JSONDecodeError, OSError) as e:
305
+ raise InvalidModel(
306
+ f"Could not load tokeniser for model {real_model_id!r}."
307
+ ) from e
294
308
 
295
- model, tokenizer = align_model_and_tokenizer(
309
+ model, tokeniser = align_model_and_tokeniser(
296
310
  model=model,
297
- tokenizer=tokenizer,
311
+ tokeniser=tokeniser,
298
312
  model_max_length=model_max_length,
299
313
  raise_errors=benchmark_config.raise_errors,
300
314
  )
301
315
 
302
- return model, tokenizer
316
+ return model, tokeniser