EuroEval 15.15.0__py3-none-any.whl → 16.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (63)
  1. euroeval/__init__.py +3 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +323 -193
  7. euroeval/benchmark_modules/vllm.py +166 -112
  8. euroeval/benchmarker.py +59 -33
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +13 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +53 -7
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +38 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +8 -7
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +46 -14
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +234 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  41. euroeval/prompt_templates/multiple_choice.py +23 -2
  42. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  43. euroeval/prompt_templates/reading_comprehension.py +42 -2
  44. euroeval/prompt_templates/sentiment_classification.py +46 -2
  45. euroeval/prompt_templates/summarization.py +24 -4
  46. euroeval/scores.py +7 -2
  47. euroeval/speed_benchmark.py +6 -6
  48. euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  49. euroeval/task_group_utils/question_answering.py +35 -28
  50. euroeval/task_group_utils/sequence_classification.py +96 -23
  51. euroeval/task_group_utils/text_to_text.py +7 -3
  52. euroeval/task_group_utils/token_classification.py +47 -75
  53. euroeval/tasks.py +31 -6
  54. euroeval/tokenization_utils.py +295 -207
  55. euroeval/utils.py +118 -34
  56. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +12 -14
  57. euroeval-16.0.0.dist-info/RECORD +69 -0
  58. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
  59. euroeval/human_evaluation.py +0 -738
  60. euroeval/metrics.py +0 -468
  61. euroeval-15.15.0.dist-info/RECORD +0 -63
  62. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
  63. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmarker.py CHANGED
@@ -15,7 +15,7 @@ from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
  from torch.distributed import destroy_process_group

  from .benchmark_config_factory import build_benchmark_config
- from .constants import GENERATIVE_DATASET_TASK_GROUPS, GENERATIVE_PIPELINE_TAGS
+ from .constants import GENERATIVE_PIPELINE_TAGS
  from .data_loading import load_data
  from .data_models import BenchmarkConfigParams, BenchmarkResult
  from .dataset_configs import get_all_dataset_configs
@@ -81,7 +81,7 @@ class Benchmarker:
  gpu_memory_utilization: float = 0.9,
  debug: bool = False,
  run_with_cli: bool = False,
- only_allow_safetensors: bool = False,
+ requires_safetensors: bool = False,
  ) -> None:
  """Initialise the benchmarker.

@@ -156,7 +156,7 @@ class Benchmarker:
  run_with_cli:
  Whether the benchmarker is being run from the command-line interface.
  Defaults to False.
- only_allow_safetensors:
+ requires_safetensors:
  Whether to only allow models that use the safetensors format. Defaults
  to False.

@@ -201,11 +201,11 @@
  gpu_memory_utilization=gpu_memory_utilization,
  debug=debug,
  run_with_cli=run_with_cli,
- only_allow_safetensors=only_allow_safetensors,
+ requires_safetensors=requires_safetensors,
  )

  self.benchmark_config = build_benchmark_config(
- first_time=True, **self.benchmark_config_default_params.model_dump()
+ **self.benchmark_config_default_params.model_dump()
  )

  # Initialise variable storing model lists, so we only have to fetch it once
@@ -249,7 +249,7 @@
  evaluate_test_split: bool | None = None,
  few_shot: bool | None = None,
  num_iterations: int | None = None,
- only_allow_safetensors: bool | None = None,
+ requires_safetensors: bool | None = None,
  ) -> list[BenchmarkResult]:
  """Benchmarks models on datasets.

@@ -327,7 +327,7 @@
  to be used for power users, and scores will not be allowed on the
  leaderboards if this is changed. Defaults to the value specified when
  initialising the benchmarker.
- only_allow_safetensors:
+ requires_safetensors:
  Whether to only allow models that use the safetensors format. Defaults
  to the value specified when initialising the benchmarker.

@@ -361,7 +361,7 @@
  evaluate_test_split=evaluate_test_split,
  few_shot=few_shot,
  num_iterations=num_iterations,
- only_allow_safetensors=only_allow_safetensors,
+ requires_safetensors=requires_safetensors,
  )

  adjust_logging_level(verbose=benchmark_config.verbose)
@@ -379,9 +379,46 @@

  current_benchmark_results: list[BenchmarkResult] = list()
  for model_id in model_ids:
- model_config: ModelConfig | None = None
+ # Load the model configuration, or skip the model if it is invalid
+ try:
+ model_config = get_model_config(
+ model_id=model_id, benchmark_config=benchmark_config
+ )
+ except InvalidModel as e:
+ logger.info(e.message)
+ num_finished_benchmarks += len(dataset_configs)
+ continue
+
  loaded_model: BenchmarkModule | None = None
+ benchmark_params_to_revert: dict[str, t.Any] = dict()
  for dataset_config in dataset_configs:
+ # Revert any changes to the benchmark configuration made for the
+ # previous dataset
+ for param, value in benchmark_params_to_revert.items():
+ setattr(benchmark_config, param, value)
+ benchmark_params_to_revert = dict()
+
+ # Update the benchmark config if the dataset requires it
+ if (
+ "val" not in dataset_config.splits
+ and not benchmark_config.evaluate_test_split
+ ):
+ logger.debug(
+ "The dataset does not have a validation split, so even though "
+ "you requested evaluating the validation split (the default), "
+ "we will evaluate on the test split."
+ )
+ benchmark_params_to_revert["evaluate_test_split"] = False
+ benchmark_config.evaluate_test_split = True
+ if dataset_config.task.requires_zero_shot and benchmark_config.few_shot:
+ logger.debug(
+ "The task requires zero-shot evaluation, so even though you "
+ "requested few-shot evaluation (the default), we will evaluate "
+ "zero-shot."
+ )
+ benchmark_params_to_revert["few_shot"] = True
+ benchmark_config.few_shot = False
+
  # Skip if we have already benchmarked this model on this dataset and
  # we are not forcing the benchmark
  if not benchmark_config.force and model_has_been_benchmarked(
@@ -399,25 +436,14 @@
  num_finished_benchmarks += 1
  continue

- if model_config is None:
- try:
- model_config = get_model_config(
- model_id=model_id, benchmark_config=benchmark_config
- )
- except InvalidModel as e:
- logger.info(e.message)
- num_finished_benchmarks += len(dataset_configs)
- continue
-
- # Skip if the model is an encoder model and the task is generative
- task_is_generative = (
- dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
- )
- if model_config.model_type == ModelType.ENCODER and task_is_generative:
+ # Skip if the model type should not be benchmarked on this dataset
+ model_type = model_config.model_type
+ allowed_model_types = dataset_config.task.allowed_model_types
+ if model_type not in allowed_model_types:
  logger.debug(
  f"Skipping benchmarking {model_id} on "
- f"{dataset_config.pretty_name}, as it is an encoder model and "
- "the task is generative."
+ f"{dataset_config.pretty_name}, as it is of type {model_type}, "
+ f"and the only allowed model types are {allowed_model_types}."
  )
  continue

@@ -536,7 +562,7 @@
  api_version: str | None | None = None,
  debug: bool | None = None,
  run_with_cli: bool | None = None,
- only_allow_safetensors: bool | None = None,
+ requires_safetensors: bool | None = None,
  ) -> "BenchmarkConfig":
  """Get an updated benchmark configuration.

@@ -610,7 +636,7 @@
  run_with_cli:
  Whether the benchmarker is being run from the command-line interface.
  If None, then this value will not be updated.
- only_allow_safetensors:
+ requires_safetensors:
  Whether to only allow models that use the safetensors format. If None,
  then this value will not be updated.

@@ -667,8 +693,8 @@
  benchmark_config_params.debug = debug
  if run_with_cli is not None:
  benchmark_config_params.run_with_cli = run_with_cli
- if only_allow_safetensors is not None:
- benchmark_config_params.only_allow_safetensors = only_allow_safetensors
+ if requires_safetensors is not None:
+ benchmark_config_params.requires_safetensors = requires_safetensors

  return build_benchmark_config(**benchmark_config_params.model_dump())

@@ -858,7 +884,7 @@
  evaluate_test_split: bool | None = None,
  few_shot: bool | None = None,
  num_iterations: int | None = None,
- only_allow_safetensors: bool | None = None,
+ requires_safetensors: bool | None = None,
  ) -> list[BenchmarkResult]:
  """Benchmarks models on datasets.

@@ -936,7 +962,7 @@
  to be used for power users, and scores will not be allowed on the
  leaderboards if this is changed. Defaults to the value specified when
  initialising the benchmarker.
- only_allow_safetensors:
+ requires_safetensors:
  Whether to only allow models that use the safetensors format. Defaults
  to the value specified when initialising the benchmarker.

@@ -972,7 +998,7 @@
  evaluate_test_split=evaluate_test_split,
  few_shot=few_shot,
  num_iterations=num_iterations,
- only_allow_safetensors=only_allow_safetensors,
+ requires_safetensors=requires_safetensors,
  )

euroeval/cli.py CHANGED
@@ -203,7 +203,7 @@ from .tasks import get_all_tasks
  "relevant if the model is generative.",
  )
  @click.option(
- "--only-allow-safetensors",
+ "--requires-safetensors",
  is_flag=True,
  help="Only allow loading models that have safetensors weights available",
  default=False,
@@ -233,7 +233,7 @@ def benchmark(
  api_version: str | None,
  gpu_memory_utilization: float,
  debug: bool,
- only_allow_safetensors: bool,
+ requires_safetensors: bool,
  ) -> None:
  """Benchmark pretrained language models on language tasks."""
  models = list(model)
@@ -270,7 +270,7 @@ def benchmark(
  gpu_memory_utilization=gpu_memory_utilization,
  debug=debug,
  run_with_cli=True,
- only_allow_safetensors=only_allow_safetensors,
+ requires_safetensors=requires_safetensors,
  )

  # Perform the benchmark evaluation
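
The renamed CLI flag is a thin wrapper around the same keyword on the Python API. A minimal usage sketch, assuming the public Benchmarker entry point; the model keyword and the placeholder model ID are illustrative and not taken from this diff:

    from euroeval import Benchmarker

    # `requires_safetensors` replaces the old `only_allow_safetensors` keyword
    benchmarker = Benchmarker(requires_safetensors=True)

    # The `model` keyword is assumed from the upstream docs; any Hugging Face model ID fits here
    results = benchmarker.benchmark(model="<huggingface-model-id>")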
euroeval/constants.py CHANGED
@@ -1,7 +1,6 @@
  """Constants used throughout the project."""

  from .enums import TaskGroup
- from .tasks import NER

  # This is used as input to generative models; it cannot be a special token
  DUMMY_FILL_VALUE = 100
@@ -11,7 +10,7 @@ DUMMY_FILL_VALUE = 100
  # benchmark. We will still report the models' true maximum context length in the
  # metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
  # all tokens in the context.
- MAX_CONTEXT_LENGTH = 5_000
+ MAX_CONTEXT_LENGTH = 8_192


  # We need to raise the amount of tokens generated for reasoning models, to give them
@@ -37,21 +36,10 @@ GENERATIVE_DATASET_TASK_GROUPS = [TaskGroup.TEXT_TO_TEXT]
  LOCAL_MODELS_REQUIRED_FILES = ["config.json"]


- # Tasks where we use structured generation for generative models
- TASKS_USING_JSON = [NER]
-
-
- # Tasks where we use log probabilities for generative models, rather than the raw
- # completion
- TASK_GROUPS_USING_LOGPROBS = [
- TaskGroup.SEQUENCE_CLASSIFICATION,
- TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
- ]
-
-
  # The number of top log probabilities to return for generative models. For several APIs
  # this is the maximum number of log probabilities that can be returned
- MAX_LOGPROBS = 8
+ MAX_VLLM_LOGPROBS = 20
+ MAX_LITELLM_LOGPROBS = 8


  # We make sure to remove these metric attributes after each iteration, to avoid memory
@@ -77,3 +65,13 @@ REASONING_TOKENS = [
  # manually. We only use them as stop tokens if they actually appear in the model's
  # output
  CUSTOM_STOP_TOKENS = ["<sep>"]
+
+
+ # For classification tasks we force LiteLLM models to output a JSON dictionary with a
+ # single key and the values being restricted to the allowed labels. This is the key we
+ # use
+ LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
+
+
+ # These characters are stripped from JSON output when trying to identify the label
+ JSON_STRIP_CHARACTERS = ' {}\n\r":'
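
The last two constants above are easiest to read alongside a toy example. The helper below is hypothetical (it is not the library's actual parsing code); it only illustrates how a JSON-constrained classification completion could be reduced to a bare label using these constants:

    import json

    LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
    JSON_STRIP_CHARACTERS = ' {}\n\r":'

    def extract_label(raw_output: str) -> str:
        # Hypothetical helper: prefer proper JSON parsing, and fall back to stripping
        # JSON punctuation (and the key itself) if the completion is not valid JSON.
        try:
            return json.loads(raw_output)[LITELLM_CLASSIFICATION_OUTPUT_KEY]
        except (json.JSONDecodeError, KeyError):
            stripped = raw_output.strip(JSON_STRIP_CHARACTERS)
            stripped = stripped.removeprefix(LITELLM_CLASSIFICATION_OUTPUT_KEY)
            return stripped.strip(JSON_STRIP_CHARACTERS)

    print(extract_label('{"label": "positive"}'))  # -> positive
    print(extract_label('{"label": negative'))     # -> negative (truncated-JSON fallback)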
euroeval/data_loading.py CHANGED
@@ -12,6 +12,7 @@ from huggingface_hub.errors import HfHubHTTPError
  from numpy.random import Generator

  from .exceptions import HuggingFaceHubDown, InvalidBenchmark
+ from .tasks import EUROPEAN_VALUES
  from .utils import unscramble

  if t.TYPE_CHECKING:
@@ -48,40 +49,45 @@ def load_data(
  dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
  )

- if not benchmark_config.evaluate_test_split:
+ if not benchmark_config.evaluate_test_split and "val" in dataset:
  dataset["test"] = dataset["val"]

  # Remove empty examples from the datasets
  for text_feature in ["tokens", "text"]:
- if text_feature in dataset["train"].features:
- dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)
+ for split in dataset_config.splits:
+ if text_feature in dataset[split].features:
+ dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)

- # If we are testing then truncate the test set
- if hasattr(sys, "_called_from_test"):
+ # If we are testing then truncate the test set, unless we need the full set for
+ # evaluation
+ if hasattr(sys, "_called_from_test") and dataset_config.task != EUROPEAN_VALUES:
  dataset["test"] = dataset["test"].select(range(1))

- # Bootstrap the splits
- bootstrapped_splits: dict[str, list["Dataset"]] = dict()
- for split in ["train", "val", "test"]:
- bootstrap_indices = rng.integers(
- 0,
- len(dataset[split]),
- size=(benchmark_config.num_iterations, len(dataset[split])),
- )
- bootstrapped_splits[split] = [
- dataset[split].select(bootstrap_indices[idx])
+ # Bootstrap the splits, if applicable
+ if dataset_config.bootstrap_samples:
+ bootstrapped_splits: dict[str, list["Dataset"]] = dict()
+ for split in dataset_config.splits:
+ bootstrap_indices = rng.integers(
+ 0,
+ len(dataset[split]),
+ size=(benchmark_config.num_iterations, len(dataset[split])),
+ )
+ bootstrapped_splits[split] = [
+ dataset[split].select(bootstrap_indices[idx])
+ for idx in range(benchmark_config.num_iterations)
+ ]
+ datasets = [
+ DatasetDict(
+ {
+ split: bootstrapped_splits[split][idx]
+ for split in dataset_config.splits
+ }
+ )
  for idx in range(benchmark_config.num_iterations)
  ]
+ else:
+ datasets = [dataset] * benchmark_config.num_iterations

- datasets = [
- DatasetDict(
- {
- split: bootstrapped_splits[split][idx]
- for split in ["train", "val", "test"]
- }
- )
- for idx in range(benchmark_config.num_iterations)
- ]
  return datasets


@@ -113,7 +119,7 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
  requests.ConnectionError,
  requests.ReadTimeout,
  ):
- logger.warning(
+ logger.debug(
  f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
  )
  time.sleep(1)
@@ -126,11 +132,10 @@
  f"{num_attempts} attempts."
  )
  assert isinstance(dataset, DatasetDict) # type: ignore[used-before-def]
- required_keys = ["train", "val", "test"]
- missing_keys = [key for key in required_keys if key not in dataset]
+ missing_keys = [key for key in dataset_config.splits if key not in dataset]
  if missing_keys:
  raise InvalidBenchmark(
  "The dataset is missing the following required splits: "
  f"{', '.join(missing_keys)}"
  )
- return DatasetDict({key: dataset[key] for key in required_keys})
+ return DatasetDict({key: dataset[key] for key in dataset_config.splits})
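
The new bootstrap_samples branch resamples every split with replacement once per iteration, while non-bootstrapped datasets are simply repeated unchanged. A standalone sketch of that resampling step on toy data (not the loader itself):

    import numpy as np
    from datasets import Dataset

    rng = np.random.default_rng(seed=4242)
    split = Dataset.from_dict({"text": ["a", "b", "c", "d"], "label": [0, 1, 0, 1]})
    num_iterations = 3

    # One row of indices per iteration, each drawn with replacement from the split
    bootstrap_indices = rng.integers(0, len(split), size=(num_iterations, len(split)))
    bootstrapped = [split.select(bootstrap_indices[idx]) for idx in range(num_iterations)]
    print([resampled["text"] for resampled in bootstrapped])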
euroeval/data_models.py CHANGED
@@ -9,11 +9,14 @@ from dataclasses import dataclass, field
  import pydantic
  import torch

- from .enums import Device, InferenceBackend, ModelType, TaskGroup
- from .metrics import Metric
+ from .enums import Device, GenerativeType, ModelType, TaskGroup
  from .types import ScoreDict
  from .utils import get_package_version

+ if t.TYPE_CHECKING:
+ from .enums import InferenceBackend
+ from .metrics import Metric
+

  @dataclass
  class Language:
@@ -104,15 +107,51 @@ class Task:
  using few-shot evaluation.
  default_labels:
  The default labels for datasets using this task.
+ requires_zero_shot (optional):
+ Whether to only allow zero-shot evaluation for this task. If True, the
+ task will not be evaluated using few-shot examples.
+ uses_structured_output (optional):
+ Whether the task uses structured output. If True, the task will return
+ structured output (e.g., BIO tags for NER). Defaults to False.
+ uses_logprobs (optional):
+ Whether the task uses log probabilities. If True, the task will return
+ log probabilities for the generated tokens. Defaults to False.
+ requires_logprobs (optional):
+ Whether the task requires log probabilities. Implies `uses_logprobs`.
+ allowed_model_types (optional):
+ A list of model types that are allowed to be evaluated on this task.
+ Defaults to all model types being allowed.
+ allowed_generative_types (optional):
+ A list of generative model types that are allowed to be evaluated on this
+ task. If None, all generative model types are allowed. Only relevant if
+ `allowed_model_types` includes generative models.
  """

  name: str
  task_group: TaskGroup
  template_dict: dict["Language", "PromptConfig"]
- metrics: list[Metric]
+ metrics: list["Metric"]
  default_num_few_shot_examples: int
  default_max_generated_tokens: int
  default_labels: list[str]
+ requires_zero_shot: bool = False
+ uses_structured_output: bool = False
+ uses_logprobs: bool = False
+ requires_logprobs: bool = False
+ allowed_model_types: list[ModelType] = field(
+ default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
+ )
+ allowed_generative_types: list[GenerativeType] = field(
+ default_factory=lambda: [
+ GenerativeType.BASE,
+ GenerativeType.INSTRUCTION_TUNED,
+ GenerativeType.REASONING,
+ ]
+ )
+
+ def __post_init__(self) -> None:
+ """Post-initialisation checks."""
+ self.uses_logprobs = self.uses_logprobs or self.requires_logprobs

  def __hash__(self) -> int:
  """Return a hash of the task."""
@@ -177,7 +216,7 @@ class BenchmarkConfig:
  Whether to run the benchmark in debug mode.
  run_with_cli:
  Whether the benchmark is being run with the CLI.
- only_allow_safetensors:
+ requires_safetensors:
  Whether to only allow models that use the safetensors format.
  """

@@ -204,7 +243,7 @@
  gpu_memory_utilization: float
  debug: bool
  run_with_cli: bool
- only_allow_safetensors: bool
+ requires_safetensors: bool


  class BenchmarkConfigParams(pydantic.BaseModel):
@@ -236,7 +275,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
  gpu_memory_utilization: float
  debug: bool
  run_with_cli: bool
- only_allow_safetensors: bool
+ requires_safetensors: bool


  class BenchmarkResult(pydantic.BaseModel):
@@ -356,6 +395,11 @@ class DatasetConfig:
  to a 1:1 mapping between the labels and themselves. If None then the mapping
  will be set to the default mapping for the task and language. Defaults to
  None.
+ splits (optional):
+ The names of the splits in the dataset. If not provided, defaults to
+ ["train", "val", "test"].
+ bootstrap_samples (optional):
+ Whether to bootstrap the dataset samples. Defaults to True.
  unofficial (optional):
  Whether the dataset is unofficial. Defaults to False.
  """
@@ -372,6 +416,8 @@
  _max_generated_tokens: int | None = None
  _labels: list[str] | None = None
  _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
+ splits: list[str] = field(default_factory=lambda: ["train", "val", "test"])
+ bootstrap_samples: bool = True
  unofficial: bool = False

  @property
@@ -546,7 +592,7 @@
  revision: str
  task: str
  languages: list[Language]
- inference_backend: InferenceBackend
+ inference_backend: "InferenceBackend"
  merge: bool
  model_type: ModelType
  fresh: bool
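
The new Task fields interact in one non-obvious way: __post_init__ switches uses_logprobs on whenever requires_logprobs is set. A simplified stand-in dataclass (not the euroeval class itself, which also needs template_dict, metrics and the other fields above) makes that behaviour concrete:

    from dataclasses import dataclass, field
    from enum import Enum, auto

    class ModelType(Enum):
        ENCODER = auto()
        GENERATIVE = auto()

    @dataclass
    class TaskSketch:
        # Only the fields added in this release, with the defaults shown above
        name: str
        requires_zero_shot: bool = False
        uses_structured_output: bool = False
        uses_logprobs: bool = False
        requires_logprobs: bool = False
        allowed_model_types: list[ModelType] = field(
            default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
        )

        def __post_init__(self) -> None:
            self.uses_logprobs = self.uses_logprobs or self.requires_logprobs

    task = TaskSketch(name="example-task", requires_logprobs=True)
    print(task.uses_logprobs)  # True, even though only requires_logprobs was set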
euroeval/dataset_configs/__init__.py CHANGED
@@ -6,12 +6,14 @@ from ..tasks import SPEED
  from .danish import * # noqa: F403
  from .dutch import * # noqa: F403
  from .english import * # noqa: F403
+ from .estonian import * # noqa: F403
  from .faroese import * # noqa: F403
  from .finnish import * # noqa: F403
  from .french import * # noqa: F403
  from .german import * # noqa: F403
  from .icelandic import * # noqa: F403
  from .italian import * # noqa: F403
+ from .latvian import * # noqa: F403
  from .norwegian import * # noqa: F403
  from .portuguese import * # noqa: F403
  from .spanish import * # noqa: F403
euroeval/dataset_configs/danish.py CHANGED
@@ -2,7 +2,7 @@

  from ..data_models import DatasetConfig
  from ..languages import DA
- from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+ from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

  ### Official datasets ###

@@ -76,6 +76,17 @@ HELLASWAG_DA_CONFIG = DatasetConfig(
  languages=[DA],
  )

+ EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
+ name="european-values-da",
+ pretty_name="the Danish version of the European values evaluation dataset",
+ huggingface_id="EuroEval/european-values-da",
+ task=EUROPEAN_VALUES,
+ languages=[DA],
+ splits=["test"],
+ bootstrap_samples=False,
+ _instruction_prompt="{text}",
+ )
+

  ### Unofficial datasets ###

@@ -138,3 +149,29 @@ GOLDENSWAG_DA_CONFIG = DatasetConfig(
  languages=[DA],
  unofficial=True,
  )
+
+ EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
+ name="european-values-situational-da",
+ pretty_name="the Danish version of the European values evaluation dataset, where "
+ "the questions are phrased in a situational way",
+ huggingface_id="EuroEval/european-values-situational-da",
+ task=EUROPEAN_VALUES,
+ languages=[DA],
+ splits=["test"],
+ bootstrap_samples=False,
+ _instruction_prompt="{text}",
+ unofficial=True,
+ )
+
+ EUROPEAN_VALUES_COMPLETIONS_DA_CONFIG = DatasetConfig(
+ name="european-values-completions-da",
+ pretty_name="the Danish version of the European values evaluation dataset, where "
+ "the questions are phrased as sentence completions",
+ huggingface_id="EuroEval/european-values-completions-da",
+ task=EUROPEAN_VALUES,
+ languages=[DA],
+ splits=["test"],
+ bootstrap_samples=False,
+ _instruction_prompt="{text}",
+ unofficial=True,
+ )
euroeval/dataset_configs/dutch.py CHANGED
@@ -2,7 +2,7 @@

  from ..data_models import DatasetConfig
  from ..languages import NL
- from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+ from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

  ### Official datasets ###

@@ -69,6 +69,17 @@ HELLASWAG_NL_CONFIG = DatasetConfig(
  languages=[NL],
  )

+ EUROPEAN_VALUES_NL_CONFIG = DatasetConfig(
+ name="european-values-nl",
+ pretty_name="the Dutch version of the European values evaluation dataset",
+ huggingface_id="EuroEval/european-values-nl",
+ task=EUROPEAN_VALUES,
+ languages=[NL],
+ splits=["test"],
+ bootstrap_samples=False,
+ _instruction_prompt="{text}",
+ )
+

  ### Unofficial datasets ###

@@ -130,3 +141,29 @@ GOLDENSWAG_NL_CONFIG = DatasetConfig(
  languages=[NL],
  unofficial=True,
  )
+
+ EUROPEAN_VALUES_SITUATIONAL_NL_CONFIG = DatasetConfig(
+ name="european-values-situational-nl",
+ pretty_name="the Dutch version of the European values evaluation dataset, where "
+ "the questions are phrased in a situational way",
+ huggingface_id="EuroEval/european-values-situational-nl",
+ task=EUROPEAN_VALUES,
+ languages=[NL],
+ splits=["test"],
+ bootstrap_samples=False,
+ _instruction_prompt="{text}",
+ unofficial=True,
+ )
+
+ EUROPEAN_VALUES_COMPLETIONS_NL_CONFIG = DatasetConfig(
+ name="european-values-completions-nl",
+ pretty_name="the Dutch version of the European values evaluation dataset, where "
+ "the questions are phrased as sentence completions",
+ huggingface_id="EuroEval/european-values-completions-nl",
+ task=EUROPEAN_VALUES,
+ languages=[NL],
+ splits=["test"],
+ bootstrap_samples=False,
+ _instruction_prompt="{text}",
+ unofficial=True,
+ )
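
Each of the new European values configs is test-only (splits=["test"]) and evaluated without bootstrapping, but selecting one works like any other dataset. A sketch, where the dataset keyword is assumed from the upstream docs rather than from this diff:

    from euroeval import Benchmarker

    # "european-values-da" is the `name` of one of the configs added above
    Benchmarker().benchmark(model="<huggingface-model-id>", dataset="european-values-da")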