EuroEval 16.4.0__py3-none-any.whl → 16.5.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to one of the supported registries. It is provided for reference only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (71)
  1. euroeval/__init__.py +6 -0
  2. euroeval/benchmark_config_factory.py +51 -46
  3. euroeval/benchmark_modules/base.py +6 -5
  4. euroeval/benchmark_modules/hf.py +2 -9
  5. euroeval/benchmark_modules/litellm.py +14 -12
  6. euroeval/benchmark_modules/vllm.py +17 -10
  7. euroeval/benchmarker.py +61 -44
  8. euroeval/caching_utils.py +1 -1
  9. euroeval/cli.py +86 -8
  10. euroeval/constants.py +3 -0
  11. euroeval/data_loading.py +78 -30
  12. euroeval/data_models.py +326 -326
  13. euroeval/dataset_configs/__init__.py +10 -3
  14. euroeval/dataset_configs/bulgarian.py +56 -0
  15. euroeval/dataset_configs/czech.py +25 -29
  16. euroeval/dataset_configs/danish.py +51 -88
  17. euroeval/dataset_configs/dutch.py +48 -86
  18. euroeval/dataset_configs/english.py +45 -76
  19. euroeval/dataset_configs/estonian.py +36 -38
  20. euroeval/dataset_configs/faroese.py +19 -60
  21. euroeval/dataset_configs/finnish.py +36 -68
  22. euroeval/dataset_configs/french.py +39 -74
  23. euroeval/dataset_configs/german.py +45 -81
  24. euroeval/dataset_configs/greek.py +64 -0
  25. euroeval/dataset_configs/icelandic.py +54 -91
  26. euroeval/dataset_configs/italian.py +42 -78
  27. euroeval/dataset_configs/latvian.py +28 -34
  28. euroeval/dataset_configs/lithuanian.py +22 -26
  29. euroeval/dataset_configs/norwegian.py +72 -114
  30. euroeval/dataset_configs/polish.py +33 -60
  31. euroeval/dataset_configs/portuguese.py +33 -65
  32. euroeval/dataset_configs/serbian.py +64 -0
  33. euroeval/dataset_configs/slovak.py +19 -24
  34. euroeval/dataset_configs/spanish.py +42 -76
  35. euroeval/dataset_configs/swedish.py +48 -84
  36. euroeval/dataset_configs/ukrainian.py +64 -0
  37. euroeval/exceptions.py +1 -1
  38. euroeval/finetuning.py +3 -2
  39. euroeval/generation.py +5 -4
  40. euroeval/generation_utils.py +6 -5
  41. euroeval/languages.py +395 -323
  42. euroeval/metrics/huggingface.py +14 -3
  43. euroeval/metrics/llm_as_a_judge.py +1 -1
  44. euroeval/model_cache.py +6 -5
  45. euroeval/model_loading.py +1 -1
  46. euroeval/prompt_templates/__init__.py +2 -0
  47. euroeval/prompt_templates/classification.py +206 -0
  48. euroeval/prompt_templates/linguistic_acceptability.py +82 -43
  49. euroeval/prompt_templates/multiple_choice.py +81 -41
  50. euroeval/prompt_templates/named_entity_recognition.py +125 -44
  51. euroeval/prompt_templates/reading_comprehension.py +92 -43
  52. euroeval/prompt_templates/sentiment_classification.py +91 -43
  53. euroeval/prompt_templates/summarization.py +64 -39
  54. euroeval/prompt_templates/token_classification.py +279 -0
  55. euroeval/scores.py +4 -3
  56. euroeval/speed_benchmark.py +2 -1
  57. euroeval/task_group_utils/multiple_choice_classification.py +2 -1
  58. euroeval/task_group_utils/question_answering.py +24 -13
  59. euroeval/task_group_utils/sequence_classification.py +5 -4
  60. euroeval/task_group_utils/text_to_text.py +2 -1
  61. euroeval/task_group_utils/token_classification.py +11 -8
  62. euroeval/tasks.py +44 -1
  63. euroeval/tokenisation_utils.py +19 -10
  64. euroeval/types.py +10 -9
  65. euroeval/utils.py +6 -3
  66. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +194 -37
  67. euroeval-16.5.0.dist-info/RECORD +81 -0
  68. euroeval-16.4.0.dist-info/RECORD +0 -75
  69. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
  70. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
  71. {euroeval-16.4.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/data_models.py CHANGED
@@ -1,5 +1,6 @@
 """Data models used in EuroEval."""
 
+import collections.abc as c
 import json
 import pathlib
 import re
@@ -10,79 +11,42 @@ import pydantic
 import torch
 
 from .enums import Device, GenerativeType, ModelType, TaskGroup
+from .exceptions import InvalidBenchmark
+from .languages import ENGLISH, NORWEGIAN, PORTUGUESE, Language
+from .metrics.base import Metric
 from .types import ScoreDict
 from .utils import get_package_version
 
 if t.TYPE_CHECKING:
     from .enums import InferenceBackend
-    from .metrics import Metric
 
 
 @dataclass
-class Language:
-    """A benchmarkable language.
+class PromptConfig:
+    """Configuration for task-specific prompting across languages.
+
+    Defines the prompt templates needed for evaluating a specific task in a given
+    language.
 
     Attributes:
-        code:
-            The ISO 639-1 language code of the language.
-        name:
-            The name of the language.
-        and_separator (optional):
-            The word 'and' in the language.
-        or_separator (optional):
-            The word 'or' in the language.
+        default_prompt_prefix:
+            The default prefix to use in the few-shot prompt.
+        default_prompt_template:
+            The default template for the prompt to use when benchmarking the dataset
+            using few-shot evaluation.
+        default_instruction_prompt:
+            The default prompt to use when benchmarking the dataset using
+            instruction-based evaluation.
+        default_prompt_label_mapping:
+            The default mapping from the labels to another phrase which is used as a
+            substitute for the label in few-shot evaluation. If set to "auto", the
+            mapping will be set to a 1:1 mapping between the labels and themselves.
     """
 
-    code: str
-    name: str
-    _and_separator: str | None = field(repr=False, default=None)
-    _or_separator: str | None = field(repr=False, default=None)
-
-    def __hash__(self) -> int:
-        """Return a hash of the language."""
-        return hash(self.code)
-
-    @property
-    def and_separator(self) -> str:
-        """Get the word 'and' in the language.
-
-        Returns:
-            The word 'and' in the language.
-
-        Raises:
-            NotImplementedError:
-                If `and_separator` is `None`.
-        """
-        if not self._and_separator:
-            raise NotImplementedError(
-                f"Separator for the word 'and' has not been defined for {self.name}."
-            )
-        return self._and_separator
-
-    @and_separator.setter
-    def and_separator(self, value: str | None) -> None:
-        self._and_separator = value
-
-    @property
-    def or_separator(self) -> str:
-        """Get the word 'or' in the language.
-
-        Returns:
-            The word 'or' in the language.
-
-        Raises:
-            NotImplementedError:
-                If `or_separator` is `None`.
-        """
-        if not self._or_separator:
-            raise NotImplementedError(
-                f"Separator for the word 'or' has not been defined for {self.name}."
-            )
-        return self._or_separator
-
-    @or_separator.setter
-    def or_separator(self, value: str | None) -> None:
-        self._or_separator = value
+    default_prompt_prefix: str
+    default_prompt_template: str
+    default_instruction_prompt: str
+    default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]
 
 
 @dataclass
@@ -133,21 +97,25 @@ class Task:
             considered incorrect and the evaluation will be aborted. Defaults to True.
     """
 
+    model_config = pydantic.ConfigDict(
+        protected_namespaces=(), arbitrary_types_allowed=True
+    )
+
     name: str
     task_group: TaskGroup
-    template_dict: dict["Language", "PromptConfig"]
-    metrics: list["Metric"]
+    template_dict: dict[Language, PromptConfig]
+    metrics: c.Sequence[Metric]
     default_num_few_shot_examples: int
     default_max_generated_tokens: int
-    default_labels: list[str]
+    default_labels: c.Sequence[str] | None
     requires_zero_shot: bool = False
     uses_structured_output: bool = False
     uses_logprobs: bool = False
     requires_logprobs: bool = False
-    default_allowed_model_types: list[ModelType] = field(
+    default_allowed_model_types: c.Sequence[ModelType] = field(
         default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
     )
-    default_allowed_generative_types: list[GenerativeType] = field(
+    default_allowed_generative_types: c.Sequence[GenerativeType] = field(
         default_factory=lambda: [
             GenerativeType.BASE,
             GenerativeType.INSTRUCTION_TUNED,
@@ -165,205 +133,6 @@ class Task:
         return hash(self.name)
 
 
-@dataclass
-class BenchmarkConfig:
-    """General benchmarking configuration, across datasets and models.
-
-    Attributes:
-        tasks:
-            The tasks benchmark the model(s) on.
-        datasets:
-            The datasets to benchmark on.
-        model_languages:
-            The languages of the models to benchmark.
-        dataset_languages:
-            The languages of the datasets in the benchmark.
-        device:
-            The device to use for benchmarking.
-        batch_size:
-            The batch size to use.
-        raise_errors:
-            Whether to raise errors instead of skipping them.
-        cache_dir:
-            Directory to store cached models and datasets.
-        api_key:
-            The API key to use for a given inference API.
-        api_base:
-            The base URL for a given inference API. Only relevant if `model` refers to a
-            model on an inference API.
-        api_version:
-            The version of the API to use. Only relevant if `model` refers to a model on
-            an inference API.
-        progress_bar:
-            Whether to show a progress bar.
-        save_results:
-            Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
-        trust_remote_code:
-            Whether to trust remote code when loading models from the Hugging Face Hub.
-        clear_model_cache:
-            Whether to clear the model cache after benchmarking each model.
-        evaluate_test_split:
-            Whether to evaluate on the test split.
-        few_shot:
-            Whether to only evaluate the model using few-shot evaluation. Only relevant
-            if the model is generative.
-        num_iterations:
-            The number of iterations each model should be evaluated for.
-        gpu_memory_utilization:
-            The GPU memory utilization to use for vLLM. A larger value will result in
-            faster evaluation, but at the risk of running out of GPU memory. Only reduce
-            this if you are running out of GPU memory. Only relevant if the model is
-            generative.
-        requires_safetensors:
-            Whether to only allow models that use the safetensors format.
-        generative_type:
-            The type of generative model to benchmark. Only relevant if the model is
-            generative.
-        download_only:
-            Whether to only download the models, metrics and datasets without
-            evaluating.
-        force:
-            Whether to force the benchmark to run even if the results are already
-            cached.
-        verbose:
-            Whether to print verbose output.
-        debug:
-            Whether to run the benchmark in debug mode.
-        run_with_cli:
-            Whether the benchmark is being run with the CLI.
-    """
-
-    model_languages: list[Language]
-    dataset_languages: list[Language]
-    tasks: list[Task]
-    datasets: list[str]
-    batch_size: int
-    raise_errors: bool
-    cache_dir: str
-    api_key: str | None
-    api_base: str | None
-    api_version: str | None
-    progress_bar: bool
-    save_results: bool
-    device: torch.device
-    trust_remote_code: bool
-    clear_model_cache: bool
-    evaluate_test_split: bool
-    few_shot: bool
-    num_iterations: int
-    gpu_memory_utilization: float
-    requires_safetensors: bool
-    generative_type: GenerativeType | None
-    download_only: bool
-    force: bool
-    verbose: bool
-    debug: bool
-    run_with_cli: bool
-
-
-class BenchmarkConfigParams(pydantic.BaseModel):
-    """The parameters for the benchmark configuration."""
-
-    model_config = pydantic.ConfigDict(protected_namespaces=())
-
-    task: str | list[str] | None
-    dataset: str | list[str] | None
-    progress_bar: bool
-    save_results: bool
-    language: str | list[str]
-    model_language: str | list[str] | None
-    dataset_language: str | list[str] | None
-    device: Device | None
-    batch_size: int
-    raise_errors: bool
-    cache_dir: str
-    api_key: str | None
-    api_base: str | None
-    api_version: str | None
-    trust_remote_code: bool
-    clear_model_cache: bool
-    evaluate_test_split: bool
-    few_shot: bool
-    num_iterations: int
-    requires_safetensors: bool
-    download_only: bool
-    gpu_memory_utilization: float
-    generative_type: GenerativeType | None
-    force: bool
-    verbose: bool
-    debug: bool
-    run_with_cli: bool
-
-
-class BenchmarkResult(pydantic.BaseModel):
-    """A benchmark result."""
-
-    dataset: str
-    task: str
-    dataset_languages: list[str]
-    model: str
-    results: ScoreDict
-    num_model_parameters: int
-    max_sequence_length: int
-    vocabulary_size: int
-    merge: bool
-    generative: bool
-    generative_type: str | None
-    few_shot: bool
-    validation_split: bool
-    euroeval_version: str | None = get_package_version("euroeval")
-    transformers_version: str | None = get_package_version("transformers")
-    torch_version: str | None = get_package_version("torch")
-    vllm_version: str | None = get_package_version("vllm")
-    xgrammar_version: str | None = get_package_version("xgrammar")
-
-    @classmethod
-    def from_dict(cls, config: dict) -> "BenchmarkResult":
-        """Create a benchmark result from a dictionary.
-
-        Args:
-            config:
-                The configuration dictionary.
-
-        Returns:
-            The benchmark result.
-        """
-        # To be backwards compatible, we accept old results which changed the model
-        # name with parameters rather than adding them as explicit parameters
-        val_matches = re.search(r"\(.*val.*\)$", config["model"])
-        few_shot_matches = re.search(r"\(.*few-shot.*\)$", config["model"])
-        zero_shot_matches = re.search(r"\(.*zero-shot.*\)$", config["model"])
-        config["model"] = re.sub(
-            r"\(.*(few-shot|val).*\)$", "", config["model"]
-        ).strip()
-
-        if "merge" not in config:
-            config["merge"] = False
-        if "generative" not in config:
-            config["generative"] = (
-                few_shot_matches is not None or zero_shot_matches is not None
-            )
-        if "generative_type" not in config:
-            config["generative_type"] = None
-        if "few_shot" not in config:
-            config["few_shot"] = zero_shot_matches is None
-        if "validation_split" not in config:
-            config["validation_split"] = val_matches is not None
-
-        return cls(**config)
-
-    def append_to_results(self, results_path: pathlib.Path) -> None:
-        """Append the benchmark result to the results file.
-
-        Args:
-            results_path:
-                The path to the results file.
-        """
-        json_str = json.dumps(self.model_dump())
-        with results_path.open("a") as f:
-            f.write("\n" + json_str)
-
-
 @dataclass
 class DatasetConfig:
     """Configuration for a dataset.
@@ -374,8 +143,9 @@ class DatasetConfig:
         pretty_name:
             A longer prettier name for the dataset, which allows cases and spaces. Used
            for logging.
-        huggingface_id:
-            The Hugging Face ID of the dataset.
+        source:
+            The source of the dataset, which can be a Hugging Face ID or a dictionary
+            with keys "train", "val" and "test" mapping to local CSV file paths.
         task:
             The task of the dataset.
         languages:
@@ -427,6 +197,10 @@ class DatasetConfig:
             will be mapped to the closest valid label. If False, the model output will
             be considered incorrect and the evaluation will be aborted. Defaults to
             the one for the task.
+        _logging_string (optional):
+            The string used to describe evaluation on the dataset in logging. If not
+            provided, a default string will be generated, based on the pretty name. Only
+            use this if the default string is not suitable.
         splits (optional):
             The names of the splits in the dataset. If not provided, defaults to
             ["train", "val", "test"].
@@ -438,28 +212,77 @@
 
     name: str
     pretty_name: str
-    huggingface_id: str
+    source: str | dict[str, str]
     task: Task
-    languages: list[Language]
+    languages: c.Sequence[Language]
     _prompt_prefix: str | None = None
     _prompt_template: str | None = None
     _instruction_prompt: str | None = None
     _num_few_shot_examples: int | None = None
     _max_generated_tokens: int | None = None
-    _labels: list[str] | None = None
+    _labels: c.Sequence[str] | None = None
     _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
-    _allowed_model_types: list[ModelType] | None = None
-    _allowed_generative_types: list[GenerativeType] | None = None
+    _allowed_model_types: c.Sequence[ModelType] | None = None
+    _allowed_generative_types: c.Sequence[GenerativeType] | None = None
     _allow_invalid_model_outputs: bool | None = None
-    splits: list[str] = field(default_factory=lambda: ["train", "val", "test"])
+    _logging_string: str | None = None
+    splits: c.Sequence[str] = field(default_factory=lambda: ["train", "val", "test"])
     bootstrap_samples: bool = True
     unofficial: bool = False
 
+    @property
+    def main_language(self) -> Language:
+        """Get the main language of the dataset.
+
+        Returns:
+            The main language.
+        """
+        match len(self.languages):
+            case 0:
+                raise InvalidBenchmark(
+                    f"Dataset {self.name!r} must have at least one language."
+                )
+            case 1:
+                return self.languages[0]
+            case _:
+                if ENGLISH in self.languages:
+                    return ENGLISH
+                elif NORWEGIAN in self.languages:
+                    return NORWEGIAN
+                elif PORTUGUESE in self.languages:
+                    return PORTUGUESE
+                else:
+                    return self.languages[0]
+
+    @property
+    def logging_string(self) -> str:
+        """The string used to describe evaluation on the dataset in logging."""
+        if self._logging_string is not None:
+            return self._logging_string
+        truncated_str = (
+            "truncated version of the "
+            if isinstance(self.source, str) and self.source.endswith("-mini")
+            else ""
+        )
+        if len(self.languages) > 1:
+            languages_str = (
+                ", ".join([lang.name for lang in self.languages[:-1]])
+                + f" and {self.languages[-1].name}"
+            )
+        else:
+            languages_str = self.languages[0].name
+        task_str = self.task.name.replace("-", " ")
+        dataset_name_str = (
+            self.pretty_name or self.name.replace("-", " ").replace("_", " ").title()
+        )
+        return (
+            f"the {truncated_str}{languages_str} {task_str} dataset {dataset_name_str}"
+        )
+
     @property
     def prompt_prefix(self) -> str:
         """The prefix to use in the few-shot prompt."""
-        main_language = self.languages[0]
-        prompt_config = self.task.template_dict[main_language]
+        prompt_config = self.task.template_dict[self.main_language]
         prompt_prefix = (
             prompt_config.default_prompt_prefix
             if self._prompt_prefix is None
@@ -470,8 +293,7 @@ class DatasetConfig:
     @property
     def prompt_template(self) -> str:
         """The template used during few-shot evaluation."""
-        main_language = self.languages[0]
-        prompt_config = self.task.template_dict[main_language]
+        prompt_config = self.task.template_dict[self.main_language]
         prompt_template = (
             prompt_config.default_prompt_template
             if self._prompt_template is None
@@ -482,8 +304,7 @@ class DatasetConfig:
     @property
     def instruction_prompt(self) -> str:
         """The prompt to use when evaluating instruction-tuned models."""
-        main_language = self.languages[0]
-        prompt_config = self.task.template_dict[main_language]
+        prompt_config = self.task.template_dict[self.main_language]
         instruction_prompt = (
             prompt_config.default_instruction_prompt
             if self._instruction_prompt is None
@@ -510,9 +331,18 @@ class DatasetConfig:
         )
 
     @property
-    def labels(self) -> list[str]:
+    def labels(self) -> c.Sequence[str]:
         """The labels in the dataset."""
-        return self._labels if self._labels is not None else self.task.default_labels
+        if self._labels is not None:
+            return self._labels
+        elif self.task.default_labels is not None:
+            return self.task.default_labels
+        else:
+            raise ValueError(
+                f"Labels must be specified for dataset {self.name!r} with the "
+                f"attribute `_labels`, as the task {self.task.name!r} does not have "
+                "default labels."
+            )
 
     @property
     def prompt_label_mapping(self) -> dict[str, str]:
@@ -521,17 +351,14 @@
             return {label: label for label in self.labels}
         elif self._prompt_label_mapping is not None:
             return self._prompt_label_mapping
-
-        main_language = self.languages[0]
-        prompt_config = self.task.template_dict[main_language]
-
+        prompt_config = self.task.template_dict[self.main_language]
         if prompt_config.default_prompt_label_mapping == "auto":
             return {label: label for label in self.labels}
         else:
             return prompt_config.default_prompt_label_mapping
 
     @property
-    def allowed_model_types(self) -> list[ModelType]:
+    def allowed_model_types(self) -> c.Sequence[ModelType]:
         """A list of model types that are allowed to be evaluated on this dataset."""
         return (
             self._allowed_model_types
@@ -540,7 +367,7 @@ class DatasetConfig:
         )
 
     @property
-    def allowed_generative_types(self) -> list[GenerativeType]:
+    def allowed_generative_types(self) -> c.Sequence[GenerativeType]:
         """A list of generative model types that are allowed on this dataset."""
         return (
             self._allowed_generative_types
@@ -576,7 +403,7 @@ class DatasetConfig:
         """Return a hash of the dataset configuration."""
         return hash(self.name)
 
-    def get_labels_str(self, labels: list[str] | None = None) -> str:
+    def get_labels_str(self, labels: c.Sequence[str] | None = None) -> str:
         """Converts a set of labels to a natural string, in the specified language.
 
         If the task is NER, we separate using 'and' and use the mapped labels instead of
@@ -590,12 +417,10 @@ class DatasetConfig:
         Returns:
             The natural string representation of the labels in specified language.
         """
-        main_language = self.languages[0]
-
 
         if self.task.task_group == TaskGroup.TOKEN_CLASSIFICATION:
-            sep_word = main_language.and_separator
+            sep_word = self.main_language.and_separator
         else:
-            sep_word = main_language.or_separator
+            sep_word = self.main_language.or_separator
 
         if labels is None:
@@ -619,6 +444,209 @@ class DatasetConfig:
         return f"{', '.join(quoted_labels[:-1])} {sep_word} {quoted_labels[-1]}"
 
 
+@dataclass
+class BenchmarkConfig:
+    """General benchmarking configuration, across datasets and models.
+
+    Attributes:
+        datasets:
+            The datasets to benchmark on.
+        model_languages:
+            The languages of the models to benchmark.
+        dataset_languages:
+            The languages of the datasets in the benchmark.
+        batch_size:
+            The batch size to use.
+        raise_errors:
+            Whether to raise errors instead of skipping them.
+        cache_dir:
+            Directory to store cached models and datasets.
+        api_key:
+            The API key to use for a given inference API.
+        api_base:
+            The base URL for a given inference API. Only relevant if `model` refers to a
+            model on an inference API.
+        api_version:
+            The version of the API to use. Only relevant if `model` refers to a model on
+            an inference API.
+        progress_bar:
+            Whether to show a progress bar.
+        save_results:
+            Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
+        device:
+            The device to use for benchmarking.
+        trust_remote_code:
+            Whether to trust remote code when loading models from the Hugging Face Hub.
+        clear_model_cache:
+            Whether to clear the model cache after benchmarking each model.
+        evaluate_test_split:
+            Whether to evaluate on the test split.
+        few_shot:
+            Whether to only evaluate the model using few-shot evaluation. Only relevant
+            if the model is generative.
+        num_iterations:
+            The number of iterations each model should be evaluated for.
+        gpu_memory_utilization:
+            The GPU memory utilization to use for vLLM. A larger value will result in
+            faster evaluation, but at the risk of running out of GPU memory. Only reduce
+            this if you are running out of GPU memory. Only relevant if the model is
+            generative.
+        requires_safetensors:
+            Whether to only allow models that use the safetensors format.
+        generative_type:
+            The type of generative model to benchmark. Only relevant if the model is
+            generative.
+        download_only:
+            Whether to only download the models, metrics and datasets without
+            evaluating.
+        force:
+            Whether to force the benchmark to run even if the results are already
+            cached.
+        verbose:
+            Whether to print verbose output.
+        debug:
+            Whether to run the benchmark in debug mode.
+        run_with_cli:
+            Whether the benchmark is being run with the CLI.
+    """
+
+    datasets: c.Sequence[DatasetConfig]
+    model_languages: c.Sequence[Language]
+    dataset_languages: c.Sequence[Language]
+    batch_size: int
+    raise_errors: bool
+    cache_dir: str
+    api_key: str | None
+    api_base: str | None
+    api_version: str | None
+    progress_bar: bool
+    save_results: bool
+    device: torch.device
+    trust_remote_code: bool
+    clear_model_cache: bool
+    evaluate_test_split: bool
+    few_shot: bool
+    num_iterations: int
+    gpu_memory_utilization: float
+    requires_safetensors: bool
+    generative_type: GenerativeType | None
+    download_only: bool
+    force: bool
+    verbose: bool
+    debug: bool
+    run_with_cli: bool
+
+    @property
+    def tasks(self) -> c.Sequence[Task]:
+        """Get the tasks in the benchmark configuration."""
+        return list({dataset_config.task for dataset_config in self.datasets})
+
+
+class BenchmarkConfigParams(pydantic.BaseModel):
+    """The parameters for the benchmark configuration."""
+
+    model_config = pydantic.ConfigDict(
+        protected_namespaces=(), arbitrary_types_allowed=True
+    )
+
+    task: str | Task | c.Sequence[str | Task] | None
+    dataset: str | DatasetConfig | c.Sequence[str | DatasetConfig] | None
+    progress_bar: bool
+    save_results: bool
+    language: str | c.Sequence[str]
+    model_language: str | c.Sequence[str] | None
+    dataset_language: str | c.Sequence[str] | None
+    device: Device | None
+    batch_size: int
+    raise_errors: bool
+    cache_dir: str
+    api_key: str | None
+    api_base: str | None
+    api_version: str | None
+    trust_remote_code: bool
+    clear_model_cache: bool
+    evaluate_test_split: bool
+    few_shot: bool
+    num_iterations: int
+    requires_safetensors: bool
+    download_only: bool
+    gpu_memory_utilization: float
+    generative_type: GenerativeType | None
+    force: bool
+    verbose: bool
+    debug: bool
+    run_with_cli: bool
+
+
+class BenchmarkResult(pydantic.BaseModel):
+    """A benchmark result."""
+
+    dataset: str
+    task: str
+    dataset_languages: c.Sequence[str]
+    model: str
+    results: ScoreDict
+    num_model_parameters: int
+    max_sequence_length: int
+    vocabulary_size: int
+    merge: bool
+    generative: bool
+    generative_type: str | None
+    few_shot: bool
+    validation_split: bool
+    euroeval_version: str | None = get_package_version("euroeval")
+    transformers_version: str | None = get_package_version("transformers")
+    torch_version: str | None = get_package_version("torch")
+    vllm_version: str | None = get_package_version("vllm")
+    xgrammar_version: str | None = get_package_version("xgrammar")
+
+    @classmethod
+    def from_dict(cls, config: dict) -> "BenchmarkResult":
+        """Create a benchmark result from a dictionary.
+
+        Args:
+            config:
+                The configuration dictionary.
+
+        Returns:
+            The benchmark result.
+        """
+        # To be backwards compatible, we accept old results which changed the model
+        # name with parameters rather than adding them as explicit parameters
+        val_matches = re.search(r"\(.*val.*\)$", config["model"])
+        few_shot_matches = re.search(r"\(.*few-shot.*\)$", config["model"])
+        zero_shot_matches = re.search(r"\(.*zero-shot.*\)$", config["model"])
+        config["model"] = re.sub(
+            r"\(.*(few-shot|val).*\)$", "", config["model"]
+        ).strip()
+
+        if "merge" not in config:
+            config["merge"] = False
+        if "generative" not in config:
+            config["generative"] = (
+                few_shot_matches is not None or zero_shot_matches is not None
+            )
+        if "generative_type" not in config:
+            config["generative_type"] = None
+        if "few_shot" not in config:
+            config["few_shot"] = zero_shot_matches is None
+        if "validation_split" not in config:
+            config["validation_split"] = val_matches is not None
+
+        return cls(**config)
+
+    def append_to_results(self, results_path: pathlib.Path) -> None:
+        """Append the benchmark result to the results file.
+
+        Args:
+            results_path:
+                The path to the results file.
+        """
+        json_str = json.dumps(self.model_dump())
+        with results_path.open("a") as f:
+            f.write("\n" + json_str)
+
+
 @dataclass
 class ModelConfig:
     """Configuration for a model.
@@ -653,7 +681,7 @@ class ModelConfig:
     revision: str
     param: str | None
     task: str
-    languages: list[Language]
+    languages: c.Sequence[Language]
     inference_backend: "InferenceBackend"
     merge: bool
     model_type: ModelType
@@ -681,7 +709,7 @@ class PreparedModelInputs:
             instead.
     """
 
-    texts: list[str] | None = None
+    texts: c.Sequence[str] | None = None
     input_ids: torch.Tensor | None = None
     attention_mask: torch.Tensor | None = None
 
@@ -699,8 +727,8 @@ class GenerativeModelOutput:
             token and its logprob. Can be None if the scores are not available.
     """
 
-    sequences: list[str]
-    scores: list[list[list[tuple[str, float]]]] | None = None
+    sequences: c.Sequence[str]
+    scores: c.Sequence[c.Sequence[c.Sequence[tuple[str, float]]]] | None = None
 
 
 @dataclass
@@ -717,7 +745,7 @@ class SingleGenerativeModelOutput:
     """
 
     sequence: str
-    scores: list[list[tuple[str, float]]] | None = None
+    scores: c.Sequence[c.Sequence[tuple[str, float]]] | None = None
 
 
 @dataclass
@@ -735,38 +763,10 @@ class HFModelInfo:
     """
 
    pipeline_tag: str
-    tags: list[str]
+    tags: c.Sequence[str]
     adapter_base_model_id: str | None
 
 
-@dataclass
-class PromptConfig:
-    """Configuration for task-specific prompting across languages.
-
-    Defines the prompt templates needed for evaluating a specific task in a given
-    language.
-
-    Attributes:
-        default_prompt_prefix:
-            The default prefix to use in the few-shot prompt.
-        default_prompt_template:
-            The default template for the prompt to use when benchmarking the dataset
-            using few-shot evaluation.
-        default_instruction_prompt:
-            The default prompt to use when benchmarking the dataset using
-            instruction-based evaluation.
-        default_prompt_label_mapping:
-            The default mapping from the labels to another phrase which is used as a
-            substitute for the label in few-shot evaluation. If set to "auto", the
-            mapping will be set to a 1:1 mapping between the labels and themselves.
-    """
-
-    default_prompt_prefix: str
-    default_prompt_template: str
-    default_instruction_prompt: str
-    default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]
-
-
 @dataclass
 class ModelIdComponents:
     """A model ID split into its components.