EuroEval 15.12.0__py3-none-any.whl → 16.7.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (87)
  1. euroeval/__init__.py +32 -14
  2. euroeval/benchmark_config_factory.py +92 -180
  3. euroeval/benchmark_modules/base.py +49 -39
  4. euroeval/benchmark_modules/fresh.py +35 -21
  5. euroeval/benchmark_modules/hf.py +280 -244
  6. euroeval/benchmark_modules/litellm.py +752 -312
  7. euroeval/benchmark_modules/vllm.py +570 -268
  8. euroeval/benchmarker.py +651 -528
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +49 -38
  12. euroeval/constants.py +44 -25
  13. euroeval/data_loading.py +111 -55
  14. euroeval/data_models.py +490 -323
  15. euroeval/dataset_configs/__init__.py +26 -4
  16. euroeval/dataset_configs/bosnian.py +39 -0
  17. euroeval/dataset_configs/bulgarian.py +56 -0
  18. euroeval/dataset_configs/croatian.py +56 -0
  19. euroeval/dataset_configs/czech.py +75 -0
  20. euroeval/dataset_configs/danish.py +78 -50
  21. euroeval/dataset_configs/dutch.py +74 -44
  22. euroeval/dataset_configs/english.py +71 -36
  23. euroeval/dataset_configs/estonian.py +111 -0
  24. euroeval/dataset_configs/faroese.py +25 -18
  25. euroeval/dataset_configs/finnish.py +63 -26
  26. euroeval/dataset_configs/french.py +65 -32
  27. euroeval/dataset_configs/german.py +77 -36
  28. euroeval/dataset_configs/greek.py +64 -0
  29. euroeval/dataset_configs/icelandic.py +68 -57
  30. euroeval/dataset_configs/italian.py +68 -36
  31. euroeval/dataset_configs/latvian.py +87 -0
  32. euroeval/dataset_configs/lithuanian.py +64 -0
  33. euroeval/dataset_configs/norwegian.py +98 -72
  34. euroeval/dataset_configs/polish.py +96 -0
  35. euroeval/dataset_configs/portuguese.py +63 -40
  36. euroeval/dataset_configs/serbian.py +64 -0
  37. euroeval/dataset_configs/slovak.py +55 -0
  38. euroeval/dataset_configs/slovene.py +56 -0
  39. euroeval/dataset_configs/spanish.py +68 -34
  40. euroeval/dataset_configs/swedish.py +82 -41
  41. euroeval/dataset_configs/ukrainian.py +64 -0
  42. euroeval/enums.py +12 -6
  43. euroeval/exceptions.py +21 -1
  44. euroeval/finetuning.py +34 -26
  45. euroeval/generation.py +76 -41
  46. euroeval/generation_utils.py +169 -34
  47. euroeval/languages.py +1020 -188
  48. euroeval/logging_utils.py +268 -0
  49. euroeval/metrics/__init__.py +6 -0
  50. euroeval/metrics/base.py +85 -0
  51. euroeval/metrics/huggingface.py +216 -0
  52. euroeval/metrics/llm_as_a_judge.py +260 -0
  53. euroeval/metrics/pipeline.py +289 -0
  54. euroeval/metrics/speed.py +48 -0
  55. euroeval/model_cache.py +40 -21
  56. euroeval/model_config.py +4 -5
  57. euroeval/model_loading.py +3 -0
  58. euroeval/prompt_templates/__init__.py +2 -0
  59. euroeval/prompt_templates/classification.py +206 -0
  60. euroeval/prompt_templates/linguistic_acceptability.py +157 -22
  61. euroeval/prompt_templates/multiple_choice.py +159 -17
  62. euroeval/prompt_templates/named_entity_recognition.py +318 -21
  63. euroeval/prompt_templates/reading_comprehension.py +207 -16
  64. euroeval/prompt_templates/sentiment_classification.py +205 -22
  65. euroeval/prompt_templates/summarization.py +122 -22
  66. euroeval/prompt_templates/token_classification.py +279 -0
  67. euroeval/scores.py +20 -9
  68. euroeval/speed_benchmark.py +11 -12
  69. euroeval/task_group_utils/multiple_choice_classification.py +21 -12
  70. euroeval/task_group_utils/question_answering.py +101 -73
  71. euroeval/task_group_utils/sequence_classification.py +144 -61
  72. euroeval/task_group_utils/text_to_text.py +33 -12
  73. euroeval/task_group_utils/token_classification.py +86 -89
  74. euroeval/tasks.py +75 -16
  75. euroeval/tokenisation_utils.py +603 -0
  76. euroeval/types.py +17 -11
  77. euroeval/utils.py +332 -137
  78. euroeval-16.7.1.dist-info/METADATA +623 -0
  79. euroeval-16.7.1.dist-info/RECORD +84 -0
  80. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
  81. euroeval/human_evaluation.py +0 -737
  82. euroeval/metrics.py +0 -452
  83. euroeval/tokenization_utils.py +0 -498
  84. euroeval-15.12.0.dist-info/METADATA +0 -285
  85. euroeval-15.12.0.dist-info/RECORD +0 -63
  86. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
  87. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
euroeval/data_models.py CHANGED
@@ -1,85 +1,61 @@
  """Data models used in EuroEval."""

+ import collections.abc as c
  import json
  import pathlib
  import re
  import typing as t
+ from copy import deepcopy
  from dataclasses import dataclass, field

  import pydantic
  import torch

- from .enums import Device, InferenceBackend, ModelType, TaskGroup
- from .metrics import Metric
+ from .enums import Device, GenerativeType, ModelType, TaskGroup
+ from .exceptions import InvalidBenchmark
+ from .languages import (
+     ENGLISH,
+     EUROPEAN_PORTUGUESE,
+     NORWEGIAN,
+     NORWEGIAN_BOKMÅL,
+     NORWEGIAN_NYNORSK,
+     PORTUGUESE,
+     Language,
+ )
+ from .metrics.base import Metric
  from .types import ScoreDict
  from .utils import get_package_version

+ if t.TYPE_CHECKING:
+     from .enums import InferenceBackend
+

  @dataclass
- class Language:
-     """A benchmarkable language.
+ class PromptConfig:
+     """Configuration for task-specific prompting across languages.
+
+     Defines the prompt templates needed for evaluating a specific task in a given
+     language.

      Attributes:
-         code:
-             The ISO 639-1 language code of the language.
-         name:
-             The name of the language.
-         and_separator (optional):
-             The word 'and' in the language.
-         or_separator (optional):
-             The word 'or' in the language.
+         default_prompt_prefix:
+             The default prefix to use in the few-shot prompt.
+         default_prompt_template:
+             The default template for the prompt to use when benchmarking the dataset
+             using few-shot evaluation.
+         default_instruction_prompt:
+             The default prompt to use when benchmarking the dataset using
+             instruction-based evaluation.
+         default_prompt_label_mapping:
+             The default mapping from the labels to another phrase which is used as a
+             substitute for the label in few-shot evaluation. If set to "auto", the
+             mapping will be set to a 1:1 mapping between the labels and themselves.
      """

-     code: str
-     name: str
-     _and_separator: str | None = field(repr=False, default=None)
-     _or_separator: str | None = field(repr=False, default=None)
-
-     def __hash__(self) -> int:
-         """Return a hash of the language."""
-         return hash(self.code)
-
-     @property
-     def and_separator(self) -> str:
-         """Get the word 'and' in the language.
-
-         Returns:
-             The word 'and' in the language.
-
-         Raises:
-             NotImplementedError:
-                 If `and_separator` is `None`.
-         """
-         if not self._and_separator:
-             raise NotImplementedError(
-                 f"Separator for the word 'and' has not been defined for {self.name}."
-             )
-         return self._and_separator
-
-     @and_separator.setter
-     def and_separator(self, value: str | None) -> None:
-         self._and_separator = value
-
-     @property
-     def or_separator(self) -> str:
-         """Get the word 'or' in the language.
-
-         Returns:
-             The word 'or' in the language.
-
-         Raises:
-             NotImplementedError:
-                 If `or_separator` is `None`.
-         """
-         if not self._or_separator:
-             raise NotImplementedError(
-                 f"Separator for the word 'or' has not been defined for {self.name}."
-             )
-         return self._or_separator
-
-     @or_separator.setter
-     def or_separator(self, value: str | None) -> None:
-         self._or_separator = value
+     default_prompt_prefix: str
+     default_prompt_template: str
+     default_instruction_prompt: str
+     default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]


  @dataclass
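
For orientation, here is a minimal sketch of how the relocated `PromptConfig` might be filled in for one language and task. Only the field names come from the diff above; the template strings and label mapping are illustrative placeholders, not values from the package.

    # Hypothetical values; only the PromptConfig field names are taken from the diff.
    from euroeval.data_models import PromptConfig

    sentiment_prompts = PromptConfig(
        default_prompt_prefix="The following are documents and their sentiment.",
        default_prompt_template="Document: {text}\nSentiment: {label}",
        default_instruction_prompt="Document: {text}\n\nClassify the sentiment of the document.",
        default_prompt_label_mapping={"positive": "positive", "neutral": "neutral", "negative": "negative"},
    )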
@@ -104,210 +80,68 @@ class Task:
              using few-shot evaluation.
          default_labels:
              The default labels for datasets using this task.
+         requires_zero_shot (optional):
+             Whether to only allow zero-shot evaluation for this task. If True, the
+             task will not be evaluated using few-shot examples.
+         uses_structured_output (optional):
+             Whether the task uses structured output. If True, the task will return
+             structured output (e.g., BIO tags for NER). Defaults to False.
+         uses_logprobs (optional):
+             Whether the task uses log probabilities. If True, the task will return
+             log probabilities for the generated tokens. Defaults to False.
+         requires_logprobs (optional):
+             Whether the task requires log probabilities. Implies `uses_logprobs`.
+         default_allowed_model_types (optional):
+             A list of model types that are allowed to be evaluated on this task.
+             Defaults to all model types being allowed.
+         default_allowed_generative_types (optional):
+             A list of generative model types that are allowed to be evaluated on this
+             task. If None, all generative model types are allowed. Only relevant if
+             `allowed_model_types` includes generative models.
+         default_allow_invalid_model_outputs (optional):
+             Whether to allow invalid model outputs. This is only relevant for generative
+             models on classification tasks, where the model may generate an output
+             which is not one of the allowed labels. If True, the model output will be
+             mapped to the closest valid label. If False, the model output will be
+             considered incorrect and the evaluation will be aborted. Defaults to True.
      """

+     model_config = pydantic.ConfigDict(
+         protected_namespaces=(), arbitrary_types_allowed=True
+     )
+
      name: str
      task_group: TaskGroup
-     template_dict: dict["Language", "PromptConfig"]
-     metrics: list[Metric]
+     template_dict: dict[Language, PromptConfig]
+     metrics: c.Sequence[Metric]
      default_num_few_shot_examples: int
      default_max_generated_tokens: int
-     default_labels: list[str]
+     default_labels: c.Sequence[str] | None
+     requires_zero_shot: bool = False
+     uses_structured_output: bool = False
+     uses_logprobs: bool = False
+     requires_logprobs: bool = False
+     default_allowed_model_types: c.Sequence[ModelType] = field(
+         default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
+     )
+     default_allowed_generative_types: c.Sequence[GenerativeType] = field(
+         default_factory=lambda: [
+             GenerativeType.BASE,
+             GenerativeType.INSTRUCTION_TUNED,
+             GenerativeType.REASONING,
+         ]
+     )
+     default_allow_invalid_model_outputs: bool = True
+
+     def __post_init__(self) -> None:
+         """Post-initialisation checks."""
+         self.uses_logprobs = self.uses_logprobs or self.requires_logprobs

      def __hash__(self) -> int:
          """Return a hash of the task."""
          return hash(self.name)


- @dataclass
- class BenchmarkConfig:
-     """General benchmarking configuration, across datasets and models.
-
-     Attributes:
-         model_languages:
-             The languages of the models to benchmark.
-         dataset_languages:
-             The languages of the datasets in the benchmark.
-         tasks:
-             The tasks benchmark the model(s) on.
-         datasets:
-             The datasets to benchmark on.
-         batch_size:
-             The batch size to use.
-         raise_errors:
-             Whether to raise errors instead of skipping them.
-         cache_dir:
-             Directory to store cached models and datasets.
-         api_key:
-             The API key to use for a given inference API.
-         force:
-             Whether to force the benchmark to run even if the results are already
-             cached.
-         progress_bar:
-             Whether to show a progress bar.
-         save_results:
-             Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
-         device:
-             The device to use for benchmarking.
-         verbose:
-             Whether to print verbose output.
-         trust_remote_code:
-             Whether to trust remote code when loading models from the Hugging Face Hub.
-         clear_model_cache:
-             Whether to clear the model cache after benchmarking each model.
-         evaluate_test_split:
-             Whether to evaluate on the test split.
-         few_shot:
-             Whether to only evaluate the model using few-shot evaluation. Only relevant
-             if the model is generative.
-         num_iterations:
-             The number of iterations each model should be evaluated for.
-         api_base:
-             The base URL for a given inference API. Only relevant if `model` refers to a
-             model on an inference API.
-         api_version:
-             The version of the API to use. Only relevant if `model` refers to a model on
-             an inference API.
-         gpu_memory_utilization:
-             The GPU memory utilization to use for vLLM. A larger value will result in
-             faster evaluation, but at the risk of running out of GPU memory. Only reduce
-             this if you are running out of GPU memory. Only relevant if the model is
-             generative.
-         debug:
-             Whether to run the benchmark in debug mode.
-         run_with_cli:
-             Whether the benchmark is being run with the CLI.
-         only_allow_safetensors:
-             Whether to only allow models that use the safetensors format.
-     """
-
-     model_languages: list[Language]
-     dataset_languages: list[Language]
-     tasks: list[Task]
-     datasets: list[str]
-     batch_size: int
-     raise_errors: bool
-     cache_dir: str
-     api_key: str | None
-     force: bool
-     progress_bar: bool
-     save_results: bool
-     device: torch.device
-     verbose: bool
-     trust_remote_code: bool
-     clear_model_cache: bool
-     evaluate_test_split: bool
-     few_shot: bool
-     num_iterations: int
-     api_base: str | None
-     api_version: str | None
-     gpu_memory_utilization: float
-     debug: bool
-     run_with_cli: bool
-     only_allow_safetensors: bool
-
-
- class BenchmarkConfigParams(pydantic.BaseModel):
-     """The parameters for the benchmark configuration."""
-
-     model_config = pydantic.ConfigDict(protected_namespaces=())
-
-     progress_bar: bool
-     save_results: bool
-     task: str | list[str] | None
-     dataset: str | list[str] | None
-     language: str | list[str]
-     model_language: str | list[str] | None
-     dataset_language: str | list[str] | None
-     device: Device | None
-     batch_size: int
-     raise_errors: bool
-     cache_dir: str
-     api_key: str | None
-     force: bool
-     verbose: bool
-     trust_remote_code: bool
-     clear_model_cache: bool
-     evaluate_test_split: bool
-     few_shot: bool
-     num_iterations: int
-     api_base: str | None
-     api_version: str | None
-     gpu_memory_utilization: float
-     debug: bool
-     run_with_cli: bool
-     only_allow_safetensors: bool
-
-
- class BenchmarkResult(pydantic.BaseModel):
-     """A benchmark result."""
-
-     dataset: str
-     task: str
-     dataset_languages: list[str]
-     model: str
-     results: ScoreDict
-     num_model_parameters: int
-     max_sequence_length: int
-     vocabulary_size: int
-     merge: bool
-     generative: bool
-     generative_type: str | None
-     few_shot: bool
-     validation_split: bool
-     euroeval_version: str | None = get_package_version("euroeval")
-     transformers_version: str | None = get_package_version("transformers")
-     torch_version: str | None = get_package_version("torch")
-     vllm_version: str | None = get_package_version("vllm")
-     outlines_version: str | None = get_package_version("outlines")
-
-     @classmethod
-     def from_dict(cls, config: dict) -> "BenchmarkResult":
-         """Create a benchmark result from a dictionary.
-
-         Args:
-             config:
-                 The configuration dictionary.
-
-         Returns:
-             The benchmark result.
-         """
-         # To be backwards compatible, we accept old results which changed the model
-         # name with parameters rather than adding them as explicit parameters
-         val_matches = re.search(r"\(.*val.*\)$", config["model"])
-         few_shot_matches = re.search(r"\(.*few-shot.*\)$", config["model"])
-         zero_shot_matches = re.search(r"\(.*zero-shot.*\)$", config["model"])
-         config["model"] = re.sub(
-             r"\(.*(few-shot|val).*\)$", "", config["model"]
-         ).strip()
-
-         if "merge" not in config:
-             config["merge"] = False
-         if "generative" not in config:
-             config["generative"] = (
-                 few_shot_matches is not None or zero_shot_matches is not None
-             )
-         if "generative_type" not in config:
-             config["generative_type"] = None
-         if "few_shot" not in config:
-             config["few_shot"] = zero_shot_matches is None
-         if "validation_split" not in config:
-             config["validation_split"] = val_matches is not None
-
-         return cls(**config)
-
-     def append_to_results(self, results_path: pathlib.Path) -> None:
-         """Append the benchmark result to the results file.
-
-         Args:
-             results_path:
-                 The path to the results file.
-         """
-         json_str = json.dumps(self.model_dump())
-         with results_path.open("a") as f:
-             f.write("\n" + json_str)
-
-
  @dataclass
  class DatasetConfig:
      """Configuration for a dataset.
@@ -318,8 +152,9 @@ class DatasetConfig:
          pretty_name:
              A longer prettier name for the dataset, which allows cases and spaces. Used
              for logging.
-         huggingface_id:
-             The Hugging Face ID of the dataset.
+         source:
+             The source of the dataset, which can be a Hugging Face ID or a dictionary
+             with keys "train", "val" and "test" mapping to local CSV file paths.
          task:
              The task of the dataset.
          languages:
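
The renamed `source` field accepts either of the forms described in the docstring; both values below are hypothetical.

    # A Hugging Face dataset ID ...
    source = "some-org/some-dataset"

    # ... or a mapping from split names to local CSV files.
    source = {
        "train": "data/train.csv",
        "val": "data/val.csv",
        "test": "data/test.csv",
    }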
@@ -356,63 +191,154 @@ class DatasetConfig:
              to a 1:1 mapping between the labels and themselves. If None then the mapping
              will be set to the default mapping for the task and language. Defaults to
              None.
+         _allowed_model_types (optional):
+             A list of model types that are allowed to be evaluated on this dataset.
+             Defaults to the one for the task.
+         _allowed_generative_types (optional):
+             A list of generative model types that are allowed to be evaluated on this
+             dataset. If None, all generative model types are allowed. Only relevant if
+             `allowed_model_types` includes generative models. Defaults to the one for
+             the task.
+         _allow_invalid_model_outputs (optional):
+             Whether to allow invalid model outputs. This is only relevant for
+             generative models on classification tasks, where the model may generate an
+             output which is not one of the allowed labels. If True, the model output
+             will be mapped to the closest valid label. If False, the model output will
+             be considered incorrect and the evaluation will be aborted. Defaults to
+             the one for the task.
+         _logging_string (optional):
+             The string used to describe evaluation on the dataset in logging. If not
+             provided, a default string will be generated, based on the pretty name. Only
+             use this if the default string is not suitable.
+         splits (optional):
+             The names of the splits in the dataset. If not provided, defaults to
+             ["train", "val", "test"].
+         bootstrap_samples (optional):
+             Whether to bootstrap the dataset samples. Defaults to True.
          unofficial (optional):
              Whether the dataset is unofficial. Defaults to False.
      """

      name: str
      pretty_name: str
-     huggingface_id: str
+     source: str | dict[str, str]
      task: Task
-     languages: list[Language]
+     languages: c.Sequence[Language]
      _prompt_prefix: str | None = None
      _prompt_template: str | None = None
      _instruction_prompt: str | None = None
      _num_few_shot_examples: int | None = None
      _max_generated_tokens: int | None = None
-     _labels: list[str] | None = None
+     _labels: c.Sequence[str] | None = None
      _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
+     _allowed_model_types: c.Sequence[ModelType] | None = None
+     _allowed_generative_types: c.Sequence[GenerativeType] | None = None
+     _allow_invalid_model_outputs: bool | None = None
+     _logging_string: str | None = None
+     splits: c.Sequence[str] = field(default_factory=lambda: ["train", "val", "test"])
+     bootstrap_samples: bool = True
      unofficial: bool = False

+     @property
+     def main_language(self) -> Language:
+         """Get the main language of the dataset.
+
+         Returns:
+             The main language.
+         """
+         match len(self.languages):
+             case 0:
+                 raise InvalidBenchmark(
+                     f"Dataset {self.name!r} must have at least one language."
+                 )
+             case 1:
+                 return self.languages[0]
+             case _:
+                 if ENGLISH in self.languages:
+                     return ENGLISH
+                 elif NORWEGIAN in self.languages:
+                     return NORWEGIAN
+                 elif PORTUGUESE in self.languages:
+                     return PORTUGUESE
+                 else:
+                     return self.languages[0]
+
+     @property
+     def logging_string(self) -> str:
+         """The string used to describe evaluation on the dataset in logging."""
+         if self._logging_string is not None:
+             return self._logging_string
+
+         truncated_str = (
+             "truncated version of the "
+             if isinstance(self.source, str) and self.source.endswith("-mini")
+             else ""
+         )
+
+         logging_languages = list(deepcopy(self.languages))
+         if len(self.languages) > 1:
+             if (
+                 NORWEGIAN_BOKMÅL in self.languages
+                 and NORWEGIAN_NYNORSK in self.languages
+                 and NORWEGIAN in self.languages
+             ):
+                 logging_languages.remove(NORWEGIAN_BOKMÅL)
+                 logging_languages.remove(NORWEGIAN_NYNORSK)
+             elif (
+                 NORWEGIAN_BOKMÅL in self.languages
+                 or NORWEGIAN_NYNORSK in self.languages
+             ) and NORWEGIAN in self.languages:
+                 logging_languages.remove(NORWEGIAN)
+             if PORTUGUESE in self.languages and EUROPEAN_PORTUGUESE in self.languages:
+                 logging_languages.remove(EUROPEAN_PORTUGUESE)
+
+         if len(logging_languages) > 1:
+             languages_str = (
+                 ", ".join([lang.name for lang in logging_languages[:-1]])
+                 + f" and {logging_languages[-1].name}"
+             )
+         else:
+             languages_str = logging_languages[0].name
+
+         task_str = self.task.name.replace("-", " ")
+         dataset_name_str = (
+             self.pretty_name or self.name.replace("-", " ").replace("_", " ").title()
+         )
+         return (
+             f"the {truncated_str}{languages_str} {task_str} dataset {dataset_name_str}"
+         )
+
      @property
      def prompt_prefix(self) -> str:
          """The prefix to use in the few-shot prompt."""
-         main_language = self.languages[0]
-         prompt_config = self.task.template_dict[main_language]
+         prompt_config = self.task.template_dict[self.main_language]
          prompt_prefix = (
              prompt_config.default_prompt_prefix
              if self._prompt_prefix is None
              else self._prompt_prefix
          )
-         prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
          return prompt_prefix

      @property
      def prompt_template(self) -> str:
          """The template used during few-shot evaluation."""
-         main_language = self.languages[0]
-         prompt_config = self.task.template_dict[main_language]
+         prompt_config = self.task.template_dict[self.main_language]
          prompt_template = (
              prompt_config.default_prompt_template
              if self._prompt_template is None
              else self._prompt_template
          )
-         prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
          return prompt_template

      @property
      def instruction_prompt(self) -> str:
          """The prompt to use when evaluating instruction-tuned models."""
-         main_language = self.languages[0]
-         prompt_config = self.task.template_dict[main_language]
+         prompt_config = self.task.template_dict[self.main_language]
          instruction_prompt = (
              prompt_config.default_instruction_prompt
              if self._instruction_prompt is None
              else self._instruction_prompt
          )
-         instruction_prompt = instruction_prompt.replace(
-             "{labels_str}", self._labels_str
-         )
          return instruction_prompt

      @property
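
The `main_language` preference order is easy to miss inside the `match` statement; the standalone restatement below (not part of the package) mirrors the visible branches using plain language names.

    def pick_main_language(languages: list[str]) -> str:
        """Mirror of DatasetConfig.main_language, using plain language names."""
        if not languages:
            raise ValueError("A dataset must have at least one language.")
        if len(languages) == 1:
            return languages[0]
        for preferred in ("English", "Norwegian", "Portuguese"):
            if preferred in languages:
                return preferred
        return languages[0]

    assert pick_main_language(["Norwegian", "English"]) == "English"
    assert pick_main_language(["Danish"]) == "Danish"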
@@ -434,9 +360,18 @@ class DatasetConfig:
          )

      @property
-     def labels(self) -> list[str]:
+     def labels(self) -> c.Sequence[str]:
          """The labels in the dataset."""
-         return self._labels if self._labels is not None else self.task.default_labels
+         if self._labels is not None:
+             return self._labels
+         elif self.task.default_labels is not None:
+             return self.task.default_labels
+         else:
+             raise ValueError(
+                 f"Labels must be specified for dataset {self.name!r} with the "
+                 f"attribute `_labels`, as the task {self.task.name!r} does not have "
+                 "default labels."
+             )

      @property
      def prompt_label_mapping(self) -> dict[str, str]:
@@ -445,24 +380,48 @@ class DatasetConfig:
              return {label: label for label in self.labels}
          elif self._prompt_label_mapping is not None:
              return self._prompt_label_mapping
-
-         main_language = self.languages[0]
-         prompt_config = self.task.template_dict[main_language]
-
+         prompt_config = self.task.template_dict[self.main_language]
          if prompt_config.default_prompt_label_mapping == "auto":
              return {label: label for label in self.labels}
          else:
              return prompt_config.default_prompt_label_mapping

      @property
-     def id2label(self) -> dict[int, str]:
+     def allowed_model_types(self) -> c.Sequence[ModelType]:
+         """A list of model types that are allowed to be evaluated on this dataset."""
+         return (
+             self._allowed_model_types
+             if self._allowed_model_types is not None
+             else self.task.default_allowed_model_types
+         )
+
+     @property
+     def allowed_generative_types(self) -> c.Sequence[GenerativeType]:
+         """A list of generative model types that are allowed on this dataset."""
+         return (
+             self._allowed_generative_types
+             if self._allowed_generative_types is not None
+             else self.task.default_allowed_generative_types
+         )
+
+     @property
+     def allow_invalid_model_outputs(self) -> bool:
+         """Whether to allow invalid model outputs."""
+         return (
+             self._allow_invalid_model_outputs
+             if self._allow_invalid_model_outputs is not None
+             else self.task.default_allow_invalid_model_outputs
+         )
+
+     @property
+     def id2label(self) -> "HashableDict":
          """The mapping from ID to label."""
-         return {idx: label for idx, label in enumerate(self.labels)}
+         return HashableDict({idx: label for idx, label in enumerate(self.labels)})

      @property
-     def label2id(self) -> dict[str, int]:
+     def label2id(self) -> "HashableDict":
          """The mapping from label to ID."""
-         return {label: i for i, label in enumerate(self.labels)}
+         return HashableDict({label: i for i, label in enumerate(self.labels)})

      @property
      def num_labels(self) -> int:
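
All three new properties resolve a dataset-level override against the task default in the same way; the standalone restatement below is only for illustration and is not part of the package.

    from typing import TypeVar

    T = TypeVar("T")

    def dataset_or_task_default(dataset_value: T | None, task_default: T) -> T:
        """Use the dataset-level value when set, otherwise the task-level default."""
        return dataset_value if dataset_value is not None else task_default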
@@ -473,36 +432,36 @@ class DatasetConfig:
          """Return a hash of the dataset configuration."""
          return hash(self.name)

-     @property
-     def _labels_str(self) -> str:
+     def get_labels_str(self, labels: c.Sequence[str] | None = None) -> str:
          """Converts a set of labels to a natural string, in the specified language.

          If the task is NER, we separate using 'and' and use the mapped labels instead of
          the BIO NER labels.

          Args:
-             language: The language to be used when converting the labels.
+             labels (optional):
+                 The labels to convert to a natural string. If None, uses all the labels
+                 in the dataset. Defaults to None.

          Returns:
              The natural string representation of the labels in specified language.
          """
-         main_language = self.languages[0]
-
          if self.task.task_group == TaskGroup.TOKEN_CLASSIFICATION:
-             sep_word = main_language.and_separator
+             sep_word = self.main_language.and_separator
          else:
-             sep_word = main_language.or_separator
+             sep_word = self.main_language.or_separator

-         local_labels: list[str] = []
-         for label in self.labels:
-             if label not in self.prompt_label_mapping:
-                 continue
-             local_label = self.prompt_label_mapping[label]
-             if local_label not in local_labels:
-                 local_labels.append(local_label)
+         if labels is None:
+             labels = list()
+             for english_label in self.labels:
+                 if english_label not in self.prompt_label_mapping:
+                     continue
+                 label = self.prompt_label_mapping[english_label]
+                 if label not in labels:
+                     labels.append(label)

          # Convert labels to single-quoted labels - and remove duplicates
-         quoted_labels = [f"'{label}'" for label in local_labels]
+         quoted_labels = [f"'{label}'" for label in labels]

          if not quoted_labels:
              return ""
@@ -514,6 +473,213 @@ class DatasetConfig:
          return f"{', '.join(quoted_labels[:-1])} {sep_word} {quoted_labels[-1]}"


+ @dataclass
+ class BenchmarkConfig:
+     """General benchmarking configuration, across datasets and models.
+
+     Attributes:
+         datasets:
+             The datasets to benchmark on.
+         finetuning_batch_size:
+             The batch size to use for finetuning.
+         raise_errors:
+             Whether to raise errors instead of skipping them.
+         cache_dir:
+             Directory to store cached models and datasets.
+         api_key:
+             The API key to use for a given inference API.
+         api_base:
+             The base URL for a given inference API. Only relevant if `model` refers to a
+             model on an inference API.
+         api_version:
+             The version of the API to use. Only relevant if `model` refers to a model on
+             an inference API.
+         progress_bar:
+             Whether to show a progress bar.
+         save_results:
+             Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
+         device:
+             The device to use for benchmarking.
+         trust_remote_code:
+             Whether to trust remote code when loading models from the Hugging Face Hub.
+         clear_model_cache:
+             Whether to clear the model cache after benchmarking each model.
+         evaluate_test_split:
+             Whether to evaluate on the test split.
+         few_shot:
+             Whether to only evaluate the model using few-shot evaluation. Only relevant
+             if the model is generative.
+         num_iterations:
+             The number of iterations each model should be evaluated for.
+         gpu_memory_utilization:
+             The GPU memory utilization to use for vLLM. A larger value will result in
+             faster evaluation, but at the risk of running out of GPU memory. Only reduce
+             this if you are running out of GPU memory. Only relevant if the model is
+             generative.
+         requires_safetensors:
+             Whether to only allow models that use the safetensors format.
+         generative_type:
+             The type of generative model to benchmark. Only relevant if the model is
+             generative.
+         download_only:
+             Whether to only download the models, metrics and datasets without
+             evaluating.
+         force:
+             Whether to force the benchmark to run even if the results are already
+             cached.
+         verbose:
+             Whether to print verbose output.
+         debug:
+             Whether to run the benchmark in debug mode.
+         run_with_cli:
+             Whether the benchmark is being run with the CLI.
+     """
+
+     datasets: c.Sequence[DatasetConfig]
+     languages: c.Sequence[Language]
+     finetuning_batch_size: int
+     raise_errors: bool
+     cache_dir: str
+     api_key: str | None
+     api_base: str | None
+     api_version: str | None
+     progress_bar: bool
+     save_results: bool
+     device: torch.device
+     trust_remote_code: bool
+     clear_model_cache: bool
+     evaluate_test_split: bool
+     few_shot: bool
+     num_iterations: int
+     gpu_memory_utilization: float
+     requires_safetensors: bool
+     generative_type: GenerativeType | None
+     download_only: bool
+     force: bool
+     verbose: bool
+     debug: bool
+     run_with_cli: bool
+
+     @property
+     def tasks(self) -> c.Sequence[Task]:
+         """Get the tasks in the benchmark configuration."""
+         return list({dataset_config.task for dataset_config in self.datasets})
+
+     def __post_init__(self) -> None:
+         """Post-initialisation checks."""
+         # Set dummy API key if it has not been set and we're benchmarking a model on an
+         # inference API
+         if self.api_key is None and self.api_base is not None:
+             self.api_key = "dummy"
+
+
+ class BenchmarkConfigParams(pydantic.BaseModel):
+     """The parameters for the benchmark configuration."""
+
+     model_config = pydantic.ConfigDict(
+         protected_namespaces=(), arbitrary_types_allowed=True
+     )
+
+     task: str | Task | c.Sequence[str | Task] | None
+     dataset: str | DatasetConfig | c.Sequence[str | DatasetConfig] | None
+     progress_bar: bool
+     save_results: bool
+     language: str | c.Sequence[str]
+     device: Device | None
+     finetuning_batch_size: int
+     raise_errors: bool
+     cache_dir: str
+     api_key: str | None
+     api_base: str | None
+     api_version: str | None
+     trust_remote_code: bool
+     clear_model_cache: bool
+     evaluate_test_split: bool
+     few_shot: bool
+     num_iterations: int
+     requires_safetensors: bool
+     download_only: bool
+     gpu_memory_utilization: float
+     generative_type: GenerativeType | None
+     force: bool
+     verbose: bool
+     debug: bool
+     run_with_cli: bool
+
+
+ class BenchmarkResult(pydantic.BaseModel):
+     """A benchmark result."""
+
+     dataset: str
+     task: str
+     languages: c.Sequence[str]
+     model: str
+     results: ScoreDict
+     num_model_parameters: int
+     max_sequence_length: int
+     vocabulary_size: int
+     merge: bool
+     generative: bool
+     generative_type: str | None
+     few_shot: bool
+     validation_split: bool
+     euroeval_version: str | None = get_package_version("euroeval")
+     transformers_version: str | None = get_package_version("transformers")
+     torch_version: str | None = get_package_version("torch")
+     vllm_version: str | None = get_package_version("vllm")
+     xgrammar_version: str | None = get_package_version("xgrammar")
+
+     @classmethod
+     def from_dict(cls, config: dict) -> "BenchmarkResult":
+         """Create a benchmark result from a dictionary.
+
+         Args:
+             config:
+                 The configuration dictionary.
+
+         Returns:
+             The benchmark result.
+         """
+         # To be backwards compatible, we accept old results which changed the model
+         # name with parameters rather than adding them as explicit parameters
+         val_matches = re.search(r"\(.*val.*\)$", config["model"])
+         few_shot_matches = re.search(r"\(.*few-shot.*\)$", config["model"])
+         zero_shot_matches = re.search(r"\(.*zero-shot.*\)$", config["model"])
+         config["model"] = re.sub(
+             r"\(.*(few-shot|val).*\)$", "", config["model"]
+         ).strip()
+
+         if "merge" not in config:
+             config["merge"] = False
+         if "generative" not in config:
+             config["generative"] = (
+                 few_shot_matches is not None or zero_shot_matches is not None
+             )
+         if "generative_type" not in config:
+             config["generative_type"] = None
+         if "few_shot" not in config:
+             config["few_shot"] = zero_shot_matches is None
+         if "validation_split" not in config:
+             config["validation_split"] = val_matches is not None
+
+         # Backwards compatibility
+         if "dataset_languages" in config:
+             config["languages"] = config.pop("dataset_languages")
+
+         return cls(**config)
+
+     def append_to_results(self, results_path: pathlib.Path) -> None:
+         """Append the benchmark result to the results file.
+
+         Args:
+             results_path:
+                 The path to the results file.
+         """
+         json_str = json.dumps(self.model_dump())
+         with results_path.open("a") as f:
+             f.write("\n" + json_str)
+
+
  @dataclass
  class ModelConfig:
      """Configuration for a model.
@@ -523,6 +689,8 @@ class ModelConfig:
              The ID of the model.
          revision:
              The revision of the model.
+         param:
+             The parameter of the model, or None if the model has no parameters.
          task:
              The task that the model was trained on.
          languages:
@@ -544,9 +712,10 @@ class ModelConfig:

      model_id: str
      revision: str
+     param: str | None
      task: str
-     languages: list[Language]
-     inference_backend: InferenceBackend
+     languages: c.Sequence[Language]
+     inference_backend: "InferenceBackend"
      merge: bool
      model_type: ModelType
      fresh: bool
@@ -573,7 +742,7 @@ class PreparedModelInputs:
              instead.
      """

-     texts: list[str] | None = None
+     texts: c.Sequence[str] | None = None
      input_ids: torch.Tensor | None = None
      attention_mask: torch.Tensor | None = None

@@ -591,8 +760,8 @@ class GenerativeModelOutput:
              token and its logprob. Can be None if the scores are not available.
      """

-     sequences: list[str]
-     scores: list[list[list[tuple[str, float]]]] | None = None
+     sequences: c.Sequence[str]
+     scores: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]] | None = None


  @dataclass
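
For the widened `scores` type above, one plausible shape (all values invented): the outer sequence has one entry per generated sequence, the next per generated token, and the innermost holds the candidate (token, logprob) pairs.

    from euroeval.data_models import GenerativeModelOutput

    output = GenerativeModelOutput(
        sequences=["positive"],
        scores=[                                           # one entry per sequence
            [                                              # one entry per generated token
                [("positive", -0.1), ("negative", -2.5)],  # candidate (token, logprob) pairs
            ],
        ],
    )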
@@ -609,7 +778,7 @@ class SingleGenerativeModelOutput:
      """

      sequence: str
-     scores: list[list[tuple[str, float]]] | None = None
+     scores: c.Sequence[c.Sequence[tuple[str, float]]] | None = None


  @dataclass
@@ -627,33 +796,31 @@ class HFModelInfo:
      """

      pipeline_tag: str
-     tags: list[str]
+     tags: c.Sequence[str]
      adapter_base_model_id: str | None


  @dataclass
- class PromptConfig:
-     """Configuration for task-specific prompting across languages.
-
-     Defines the prompt templates needed for evaluating a specific task in a given
-     language.
+ class ModelIdComponents:
+     """A model ID split into its components.

      Attributes:
-         default_prompt_prefix:
-             The default prefix to use in the few-shot prompt.
-         default_prompt_template:
-             The default template for the prompt to use when benchmarking the dataset
-             using few-shot evaluation.
-         default_instruction_prompt:
-             The default prompt to use when benchmarking the dataset using
-             instruction-based evaluation.
-         default_prompt_label_mapping:
-             The default mapping from the labels to another phrase which is used as a
-             substitute for the label in few-shot evaluation. If set to "auto", the
-             mapping will be set to a 1:1 mapping between the labels and themselves.
+         model_id:
+             The main model ID without revision or parameters.
+         revision:
+             The revision of the model, if any.
+         param:
+             The parameter of the model, if any.
      """

-     default_prompt_prefix: str
-     default_prompt_template: str
-     default_instruction_prompt: str
-     default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]
+     model_id: str
+     revision: str
+     param: str | None
+
+
+ class HashableDict(dict):
+     """A hashable dictionary."""
+
+     def __hash__(self) -> int:  # type: ignore[override]
+         """Return the hash of the dictionary."""
+         return hash(frozenset(self.items()))
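
`HashableDict` is what `id2label` and `label2id` now return, which makes those mappings hashable; one plausible use is caching, sketched below with a hypothetical helper that is not part of the package.

    from functools import lru_cache

    from euroeval.data_models import HashableDict

    @lru_cache(maxsize=None)
    def label_summary(id2label: HashableDict) -> str:
        """Render an id2label mapping once and cache the result."""
        return ", ".join(f"{idx}: {label}" for idx, label in sorted(id2label.items()))

    print(label_summary(HashableDict({0: "negative", 1: "positive"})))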