EuroEval 15.15.0-py3-none-any.whl → 16.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (63)
  1. euroeval/__init__.py +3 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +323 -193
  7. euroeval/benchmark_modules/vllm.py +166 -112
  8. euroeval/benchmarker.py +59 -33
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +13 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +53 -7
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +38 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +8 -7
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +46 -14
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +234 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  41. euroeval/prompt_templates/multiple_choice.py +23 -2
  42. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  43. euroeval/prompt_templates/reading_comprehension.py +42 -2
  44. euroeval/prompt_templates/sentiment_classification.py +46 -2
  45. euroeval/prompt_templates/summarization.py +24 -4
  46. euroeval/scores.py +7 -2
  47. euroeval/speed_benchmark.py +6 -6
  48. euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  49. euroeval/task_group_utils/question_answering.py +35 -28
  50. euroeval/task_group_utils/sequence_classification.py +96 -23
  51. euroeval/task_group_utils/text_to_text.py +7 -3
  52. euroeval/task_group_utils/token_classification.py +47 -75
  53. euroeval/tasks.py +31 -6
  54. euroeval/tokenization_utils.py +295 -207
  55. euroeval/utils.py +118 -34
  56. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +12 -14
  57. euroeval-16.0.0.dist-info/RECORD +69 -0
  58. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
  59. euroeval/human_evaluation.py +0 -738
  60. euroeval/metrics.py +0 -468
  61. euroeval-15.15.0.dist-info/RECORD +0 -63
  62. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
  63. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py CHANGED
@@ -77,10 +77,6 @@ os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
 os.environ["OMP_NUM_THREADS"] = "1"
 
 
-# Disable a warning from Ray regarding the detection of the number of CPUs
-os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"
-
-
 # Avoid the "Cannot re-initialize CUDA in forked subprocess" error - see
 # https://github.com/vllm-project/vllm/issues/6152 for more
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@@ -100,9 +96,9 @@ os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
 os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
 
 
-# Use older version v0 of vLLM, as the newer one requires XGrammar as decoding backend,
-# but XGrammar does not support having a maximal amount of elements in lists
-os.environ["VLLM_USE_V1"] = "0"
+# Enable the newer vLLM V1 engine, which is faster and offers more compatibility with
+# newer models
+os.environ["VLLM_USE_V1"] = "1"
 
 
 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
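These defaults are ordinary environment variables applied at import time, so a calling script can inspect them, or override them before any vLLM engine is created. A minimal sketch, assuming EuroEval 16.0.0 is installed:

```python
import os

import euroeval  # noqa: F401  # applies the vLLM/HF environment defaults at import time

# The defaults are plain environment variables, so the calling process can
# inspect (or override) them before any model is loaded.
print(os.environ.get("VLLM_USE_V1"))  # "1" in 16.0.0 (was "0" in 15.x)
print(os.environ.get("VLLM_WORKER_MULTIPROC_METHOD"))  # "spawn"
```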
euroeval/benchmark_config_factory.py CHANGED
@@ -45,8 +45,7 @@ def build_benchmark_config(
     gpu_memory_utilization: float,
     debug: bool,
     run_with_cli: bool,
-    only_allow_safetensors: bool,
-    first_time: bool = False,
+    requires_safetensors: bool,
 ) -> BenchmarkConfig:
     """Create a benchmark configuration.
 
@@ -112,11 +111,8 @@ def build_benchmark_config(
            Whether to run the benchmark in debug mode.
        run_with_cli:
            Whether the benchmark is being run with the CLI.
-       only_allow_safetensors:
+       requires_safetensors:
            Whether to only allow evaluations of models stored as safetensors.
-       first_time:
-           Whether this is the first time the benchmark configuration is being created.
-           Defaults to False.
 
    Returns:
        The benchmark configuration.
@@ -163,7 +159,7 @@ def build_benchmark_config(
        gpu_memory_utilization=gpu_memory_utilization,
        debug=debug,
        run_with_cli=run_with_cli,
-       only_allow_safetensors=only_allow_safetensors,
+       requires_safetensors=requires_safetensors,
    )
 
 
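For downstream code that still passes the old keyword arguments, the change amounts to a rename plus one removed parameter. A hypothetical migration shim, not part of EuroEval, could look like this:

```python
import typing as t


def migrate_benchmark_kwargs(kwargs: dict[str, t.Any]) -> dict[str, t.Any]:
    """Hypothetical helper for callers upgrading from EuroEval 15.x to 16.0.0."""
    # `only_allow_safetensors` was renamed to `requires_safetensors`.
    if "only_allow_safetensors" in kwargs:
        kwargs["requires_safetensors"] = kwargs.pop("only_allow_safetensors")
    # `first_time` was removed from the signature altogether.
    kwargs.pop("first_time", None)
    return kwargs
```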
euroeval/benchmark_modules/base.py CHANGED
@@ -7,12 +7,12 @@ import typing as t
 from abc import ABC, abstractmethod
 from functools import cached_property, partial
 
-from datasets import DatasetDict
+from datasets import Dataset, DatasetDict
 from torch import nn
 from tqdm.auto import tqdm
 
 from ..enums import TaskGroup
-from ..exceptions import NeedsEnvironmentVariable, NeedsExtraInstalled
+from ..exceptions import InvalidBenchmark, NeedsEnvironmentVariable, NeedsExtraInstalled
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -61,6 +61,7 @@ class BenchmarkModule(ABC):
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the benchmark module.
 
@@ -71,12 +72,16 @@ class BenchmarkModule(ABC):
                The dataset configuration.
            benchmark_config:
                The benchmark configuration.
+           log_metadata:
+               Whether to log the metadata of the model.
        """
        self.model_config = model_config
        self.dataset_config = dataset_config
        self.benchmark_config = benchmark_config
+       self.log_metadata = log_metadata
        self.buffer: dict[str, t.Any] = dict()
-       self._log_metadata()
+       if self.log_metadata:
+           self._log_metadata()
 
    def _log_metadata(self) -> None:
        """Log the metadata of the model."""
@@ -117,16 +122,16 @@ class BenchmarkModule(ABC):
            f"{self.__class__.__name__}."
        )
 
-   def get_tokenizer(self) -> "PreTrainedTokenizer":
-       """Get the underlying tokenizer.
+   def get_tokeniser(self) -> "PreTrainedTokenizer":
+       """Get the underlying tokeniser.
 
        Returns:
-           The tokenizer.
+           The tokeniser.
        """
-       if hasattr(self, "_tokenizer"):
-           return self._tokenizer
+       if hasattr(self, "_tokeniser"):
+           return self._tokeniser
        raise NotImplementedError(
-           "The `get_tokenizer` method has not been implemented for "
+           "The `get_tokeniser` method has not been implemented for "
            f"{self.__class__.__name__}."
        )
 
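The base-class fallback means a concrete module does not have to override `get_tokeniser` at all; it only needs to store its tokeniser under the renamed `_tokeniser` attribute. A hypothetical subclass sketch, assuming the remaining abstract methods are implemented elsewhere and that `ModelConfig` exposes a `model_id` field:

```python
from transformers import AutoTokenizer, PreTrainedTokenizer

from euroeval.benchmark_modules.base import BenchmarkModule


class MyEncoderModule(BenchmarkModule):
    """Hypothetical module relying on the inherited `get_tokeniser` fallback."""

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # Storing the tokeniser under `_tokeniser` is enough for the inherited
        # `get_tokeniser` to return it instead of raising NotImplementedError.
        self._tokeniser: PreTrainedTokenizer = AutoTokenizer.from_pretrained(
            self.model_config.model_id
        )
```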
@@ -192,11 +197,13 @@ class BenchmarkModule(ABC):
                return partial(
                    sequence_classification.compute_metrics,
                    dataset_config=self.dataset_config,
+                   benchmark_config=self.benchmark_config,
                )
            case TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION:
                return partial(
                    sequence_classification.compute_metrics,
                    dataset_config=self.dataset_config,
+                   benchmark_config=self.benchmark_config,
                )
            case TaskGroup.TEXT_TO_TEXT:
                return partial(
@@ -209,11 +216,13 @@ class BenchmarkModule(ABC):
                    token_classification.compute_metrics,
                    has_misc_tags=self.buffer.get("has_misc_tags", True),
                    dataset_config=self.dataset_config,
+                   benchmark_config=self.benchmark_config,
                )
            case TaskGroup.QUESTION_ANSWERING:
                return partial(
                    question_answering.compute_metrics,
                    dataset_config=self.dataset_config,
+                   benchmark_config=self.benchmark_config,
                )
            case _:
                raise NotImplementedError(
@@ -255,6 +264,11 @@ class BenchmarkModule(ABC):
 
        Returns:
            The prepared datasets.
+
+       Raises:
+           InvalidBenchmark:
+               If the dataset does not have a 'train' split for token classification
+               tasks.
        """
        for idx, dataset in enumerate(
            tqdm(iterable=datasets, desc="Preparing datasets")
@@ -263,22 +277,24 @@ class BenchmarkModule(ABC):
                dataset=dataset, task=task, itr_idx=idx
            )
            if self.dataset_config.task.task_group == TaskGroup.TOKEN_CLASSIFICATION:
+               if "train" not in dataset:
+                   raise InvalidBenchmark(
+                       "The dataset does not have a 'train' split, which is required "
+                       "for token classification tasks."
+                   )
                labels_in_train: set[str] = {
                    tag for tag_list in dataset["train"]["labels"] for tag in tag_list
                }
                self.buffer["has_misc_tags"] = (
                    "B-MISC" in labels_in_train or "I-MISC" in labels_in_train
                )
-           datasets[idx] = DatasetDict(
-               dict(
-                   train=prepared_dataset["train"],
-                   val=prepared_dataset["val"],
-                   test=prepared_dataset["test"],
-                   original_train=dataset["train"],
-                   original_val=dataset["val"],
-                   original_test=dataset["test"],
-               )
-           )
+
+           datasets_dict: dict[str, Dataset] = dict()
+           for split_name, split in prepared_dataset.items():
+               datasets_dict[split_name] = split
+           for split_name, split in dataset.items():
+               datasets_dict[f"original_{split_name}"] = split
+           datasets[idx] = DatasetDict(datasets_dict)
        return datasets
 
    @abstractmethod
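The net effect of the `_prepare_datasets` refactor is that split names are no longer hard-coded to train/val/test: whatever splits the dataset actually ships with are kept, and their unprocessed counterparts are stored under an `original_` prefix. A standalone illustration of the resulting structure (not EuroEval code):

```python
from datasets import Dataset, DatasetDict

prepared = DatasetDict(
    {
        "val": Dataset.from_dict({"text": ["processed a"]}),
        "test": Dataset.from_dict({"text": ["processed b"]}),
    }
)
original = DatasetDict(
    {
        "val": Dataset.from_dict({"text": ["raw a"]}),
        "test": Dataset.from_dict({"text": ["raw b"]}),
    }
)

# Mirror the new logic: copy every prepared split, then every original split
# under an `original_` prefix, without assuming a fixed set of split names.
combined: dict[str, Dataset] = {name: split for name, split in prepared.items()}
combined |= {f"original_{name}": split for name, split in original.items()}
result = DatasetDict(combined)

print(sorted(result))  # ['original_test', 'original_val', 'test', 'val']
```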
euroeval/benchmark_modules/fresh.py CHANGED
@@ -1,6 +1,5 @@
 """Freshly initialised encoder models."""
 
-import os
 import typing as t
 from functools import cached_property
 from json import JSONDecodeError
@@ -26,10 +25,10 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
-from ..utils import block_terminal_output, create_model_cache_dir
+from ..utils import block_terminal_output, create_model_cache_dir, get_hf_token
 from .hf import (
     HuggingFaceEncoderModel,
-    align_model_and_tokenizer,
+    align_model_and_tokeniser,
     setup_model_for_question_answering,
 )
 
@@ -51,6 +50,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the model.
 
@@ -61,23 +61,25 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
                The dataset configuration.
            benchmark_config:
                The benchmark configuration.
+           log_metadata:
+               Whether to log metadata about the model and the benchmark.
        """
        # This is already set when calling `super.__init__`, but we need it to get a
        # value from `self.model_max_length`, so we set it here as well.
        self.model_config = model_config
 
-       model, tokenizer = load_model_and_tokenizer(
+       model, tokeniser = load_model_and_tokeniser(
           model_config=model_config,
           dataset_config=dataset_config,
           benchmark_config=benchmark_config,
           model_max_length=self.model_max_length,
       )
       self._model: "PreTrainedModel" = model
-      self._tokenizer: "PreTrainedTokenizer" = tokenizer
+      self._tokeniser: "PreTrainedTokenizer" = tokeniser
 
-      self._model, self._tokenizer = align_model_and_tokenizer(
+      self._model, self._tokeniser = align_model_and_tokeniser(
          model=self._model,
-         tokenizer=self._tokenizer,
+         tokeniser=self._tokeniser,
          model_max_length=self.model_max_length,
          raise_errors=benchmark_config.raise_errors,
      )
@@ -88,6 +90,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
            model_config=model_config,
            dataset_config=dataset_config,
            benchmark_config=benchmark_config,
+           log_metadata=log_metadata,
        )
 
    @cached_property
@@ -194,13 +197,13 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
        )
 
 
-def load_model_and_tokenizer(
+def load_model_and_tokeniser(
    model_config: "ModelConfig",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
    model_max_length: int,
 ) -> "tuple[PreTrainedModel, PreTrainedTokenizer]":
-   """Load the model and tokenizer.
+   """Load the model and tokeniser.
 
    Args:
        model_config:
@@ -213,7 +216,7 @@ def load_model_and_tokenizer(
            The maximum context length of the model.
 
    Returns:
-       The loaded model and tokenizer.
+       The loaded model and tokeniser.
    """
    config: "PretrainedConfig"
    block_terminal_output()
@@ -262,7 +265,7 @@ def load_model_and_tokenizer(
 
    config = AutoConfig.from_pretrained(
        real_model_id,
-       token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+       token=get_hf_token(api_key=benchmark_config.api_key),
        num_labels=len(id2label),
        id2label=id2label,
        label2id={label: id_ for id_, label in id2label.items()},
@@ -274,29 +277,31 @@ def load_model_and_tokenizer(
    if dataset_config.task.task_group == TaskGroup.QUESTION_ANSWERING:
        model = setup_model_for_question_answering(model=model)
 
-   # Load the tokenizer. If the model is a subclass of a RoBERTa model then we
+   # Load the tokeniser. If the model is a subclass of a RoBERTa model then we
    # have to add a prefix space to the tokens, by the way the model is constructed
    prefix_models = ["Roberta", "GPT", "Deberta"]
    prefix = any(model_type in type(model).__name__ for model_type in prefix_models)
    try:
-       tokenizer: "PreTrainedTokenizer" = AutoTokenizer.from_pretrained(
+       tokeniser: "PreTrainedTokenizer" = AutoTokenizer.from_pretrained(
           real_model_id,
           revision=model_config.revision,
-          token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+          token=get_hf_token(api_key=benchmark_config.api_key),
          add_prefix_space=prefix,
          cache_dir=model_config.model_cache_dir,
          use_fast=True,
          verbose=False,
          trust_remote_code=benchmark_config.trust_remote_code,
      )
-   except (JSONDecodeError, OSError):
-       raise InvalidModel(f"Could not load tokenizer for model {real_model_id!r}.")
+   except (JSONDecodeError, OSError) as e:
+       raise InvalidModel(
+           f"Could not load tokeniser for model {real_model_id!r}."
+       ) from e
 
-   model, tokenizer = align_model_and_tokenizer(
+   model, tokeniser = align_model_and_tokeniser(
       model=model,
-      tokenizer=tokenizer,
+      tokeniser=tokeniser,
      model_max_length=model_max_length,
      raise_errors=benchmark_config.raise_errors,
  )
 
-   return model, tokenizer
+   return model, tokeniser
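The new `get_hf_token` helper centralises the token-resolution expression it replaces. Its actual implementation lives in `euroeval/utils.py` and is not shown in this excerpt; a plausible reconstruction, inferred purely from the removed inline expression, would be:

```python
import os


def get_hf_token(api_key: str | None = None) -> str | bool:
    """Plausible sketch of the helper (the real implementation is in euroeval/utils.py).

    Prefer an explicitly supplied API key, then the HUGGINGFACE_API_KEY environment
    variable, and otherwise return True so that huggingface_hub falls back to any
    locally cached credentials.
    """
    return api_key or os.getenv("HUGGINGFACE_API_KEY") or True
```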