EuroEval 15.16.0-py3-none-any.whl → 16.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (64)
  1. euroeval/__init__.py +8 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +199 -139
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +19 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +73 -23
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +35 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +90 -20
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +276 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/model_cache.py +13 -1
  41. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  42. euroeval/prompt_templates/multiple_choice.py +23 -2
  43. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  44. euroeval/prompt_templates/reading_comprehension.py +42 -2
  45. euroeval/prompt_templates/sentiment_classification.py +46 -2
  46. euroeval/prompt_templates/summarization.py +24 -4
  47. euroeval/scores.py +7 -2
  48. euroeval/speed_benchmark.py +6 -6
  49. euroeval/task_group_utils/multiple_choice_classification.py +19 -8
  50. euroeval/task_group_utils/question_answering.py +35 -28
  51. euroeval/task_group_utils/sequence_classification.py +128 -42
  52. euroeval/task_group_utils/text_to_text.py +7 -3
  53. euroeval/task_group_utils/token_classification.py +59 -73
  54. euroeval/tasks.py +33 -6
  55. euroeval/tokenization_utils.py +294 -207
  56. euroeval/utils.py +150 -35
  57. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
  58. euroeval-16.0.1.dist-info/RECORD +69 -0
  59. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
  60. euroeval/human_evaluation.py +0 -738
  61. euroeval/metrics.py +0 -470
  62. euroeval-15.16.0.dist-info/RECORD +0 -63
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
  64. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmarker.py CHANGED
@@ -15,7 +15,7 @@ from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
from torch.distributed import destroy_process_group

from .benchmark_config_factory import build_benchmark_config
-from .constants import GENERATIVE_DATASET_TASK_GROUPS, GENERATIVE_PIPELINE_TAGS
+from .constants import GENERATIVE_PIPELINE_TAGS
from .data_loading import load_data
from .data_models import BenchmarkConfigParams, BenchmarkResult
from .dataset_configs import get_all_dataset_configs
@@ -81,7 +81,7 @@ class Benchmarker:
        gpu_memory_utilization: float = 0.9,
        debug: bool = False,
        run_with_cli: bool = False,
-        only_allow_safetensors: bool = False,
+        requires_safetensors: bool = False,
    ) -> None:
        """Initialise the benchmarker.

@@ -156,7 +156,7 @@ class Benchmarker:
            run_with_cli:
                Whether the benchmarker is being run from the command-line interface.
                Defaults to False.
-            only_allow_safetensors:
+            requires_safetensors:
                Whether to only allow models that use the safetensors format. Defaults
                to False.

@@ -201,11 +201,11 @@ class Benchmarker:
            gpu_memory_utilization=gpu_memory_utilization,
            debug=debug,
            run_with_cli=run_with_cli,
-            only_allow_safetensors=only_allow_safetensors,
+            requires_safetensors=requires_safetensors,
        )

        self.benchmark_config = build_benchmark_config(
-            first_time=True, **self.benchmark_config_default_params.model_dump()
+            **self.benchmark_config_default_params.model_dump()
        )

        # Initialise variable storing model lists, so we only have to fetch it once
@@ -249,7 +249,7 @@ class Benchmarker:
        evaluate_test_split: bool | None = None,
        few_shot: bool | None = None,
        num_iterations: int | None = None,
-        only_allow_safetensors: bool | None = None,
+        requires_safetensors: bool | None = None,
    ) -> list[BenchmarkResult]:
        """Benchmarks models on datasets.

@@ -327,7 +327,7 @@
                to be used for power users, and scores will not be allowed on the
                leaderboards if this is changed. Defaults to the value specified when
                initialising the benchmarker.
-            only_allow_safetensors:
+            requires_safetensors:
                Whether to only allow models that use the safetensors format. Defaults
                to the value specified when initialising the benchmarker.

@@ -361,7 +361,7 @@
            evaluate_test_split=evaluate_test_split,
            few_shot=few_shot,
            num_iterations=num_iterations,
-            only_allow_safetensors=only_allow_safetensors,
+            requires_safetensors=requires_safetensors,
        )

        adjust_logging_level(verbose=benchmark_config.verbose)
@@ -390,7 +390,35 @@
                continue

            loaded_model: BenchmarkModule | None = None
+            benchmark_params_to_revert: dict[str, t.Any] = dict()
            for dataset_config in dataset_configs:
+                # Revert any changes to the benchmark configuration made for the
+                # previous dataset
+                for param, value in benchmark_params_to_revert.items():
+                    setattr(benchmark_config, param, value)
+                benchmark_params_to_revert = dict()
+
+                # Update the benchmark config if the dataset requires it
+                if (
+                    "val" not in dataset_config.splits
+                    and not benchmark_config.evaluate_test_split
+                ):
+                    logger.debug(
+                        "The dataset does not have a validation split, so even though "
+                        "you requested evaluating the validation split (the default), "
+                        "we will evaluate on the test split."
+                    )
+                    benchmark_params_to_revert["evaluate_test_split"] = False
+                    benchmark_config.evaluate_test_split = True
+                if dataset_config.task.requires_zero_shot and benchmark_config.few_shot:
+                    logger.debug(
+                        "The task requires zero-shot evaluation, so even though you "
+                        "requested few-shot evaluation (the default), we will evaluate "
+                        "zero-shot."
+                    )
+                    benchmark_params_to_revert["few_shot"] = True
+                    benchmark_config.few_shot = False
+
                # Skip if we have already benchmarked this model on this dataset and
                # we are not forcing the benchmark
                if not benchmark_config.force and model_has_been_benchmarked(
@@ -408,15 +436,14 @@
                    num_finished_benchmarks += 1
                    continue

-                # Skip if the model is an encoder model and the task is generative
-                task_is_generative = (
-                    dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
-                )
-                if model_config.model_type == ModelType.ENCODER and task_is_generative:
+                # Skip if the model type should not be benchmarked on this dataset
+                model_type = model_config.model_type
+                allowed_model_types = dataset_config.task.allowed_model_types
+                if model_type not in allowed_model_types:
                    logger.debug(
                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it is an encoder model and "
-                        "the task is generative."
+                        f"{dataset_config.pretty_name}, as it is of type {model_type}, "
+                        f"and the only allowed model types are {allowed_model_types}."
                    )
                    continue

@@ -535,7 +562,7 @@
        api_version: str | None | None = None,
        debug: bool | None = None,
        run_with_cli: bool | None = None,
-        only_allow_safetensors: bool | None = None,
+        requires_safetensors: bool | None = None,
    ) -> "BenchmarkConfig":
        """Get an updated benchmark configuration.

@@ -609,7 +636,7 @@
            run_with_cli:
                Whether the benchmarker is being run from the command-line interface.
                If None, then this value will not be updated.
-            only_allow_safetensors:
+            requires_safetensors:
                Whether to only allow models that use the safetensors format. If None,
                then this value will not be updated.

@@ -666,8 +693,8 @@
            benchmark_config_params.debug = debug
        if run_with_cli is not None:
            benchmark_config_params.run_with_cli = run_with_cli
-        if only_allow_safetensors is not None:
-            benchmark_config_params.only_allow_safetensors = only_allow_safetensors
+        if requires_safetensors is not None:
+            benchmark_config_params.requires_safetensors = requires_safetensors

        return build_benchmark_config(**benchmark_config_params.model_dump())

@@ -857,7 +884,7 @@
        evaluate_test_split: bool | None = None,
        few_shot: bool | None = None,
        num_iterations: int | None = None,
-        only_allow_safetensors: bool | None = None,
+        requires_safetensors: bool | None = None,
    ) -> list[BenchmarkResult]:
        """Benchmarks models on datasets.

@@ -935,7 +962,7 @@
                to be used for power users, and scores will not be allowed on the
                leaderboards if this is changed. Defaults to the value specified when
                initialising the benchmarker.
-            only_allow_safetensors:
+            requires_safetensors:
                Whether to only allow models that use the safetensors format. Defaults
                to the value specified when initialising the benchmarker.

@@ -971,7 +998,7 @@
            evaluate_test_split=evaluate_test_split,
            few_shot=few_shot,
            num_iterations=num_iterations,
-            only_allow_safetensors=only_allow_safetensors,
+            requires_safetensors=requires_safetensors,
        )

euroeval/cli.py CHANGED
@@ -203,7 +203,7 @@ from .tasks import get_all_tasks
    "relevant if the model is generative.",
)
@click.option(
-    "--only-allow-safetensors",
+    "--requires-safetensors",
    is_flag=True,
    help="Only allow loading models that have safetensors weights available",
    default=False,
@@ -233,7 +233,7 @@ def benchmark(
    api_version: str | None,
    gpu_memory_utilization: float,
    debug: bool,
-    only_allow_safetensors: bool,
+    requires_safetensors: bool,
) -> None:
    """Benchmark pretrained language models on language tasks."""
    models = list(model)
@@ -270,7 +270,7 @@ def benchmark(
        gpu_memory_utilization=gpu_memory_utilization,
        debug=debug,
        run_with_cli=True,
-        only_allow_safetensors=only_allow_safetensors,
+        requires_safetensors=requires_safetensors,
    )

    # Perform the benchmark evaluation
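The net effect of the two files above is a plain rename: `only_allow_safetensors` becomes `requires_safetensors` on `Benchmarker.__init__` and `Benchmarker.benchmark`, and the CLI flag becomes `--requires-safetensors`, with unchanged semantics (only load models that ship safetensors weights). A minimal sketch of the new spelling; the model ID is purely illustrative, and the snippet assumes the `model` keyword of `benchmark` is otherwise unchanged:

from euroeval import Benchmarker

# Refuse models that do not ship safetensors weights
# (previously `only_allow_safetensors=True`)
benchmarker = Benchmarker(requires_safetensors=True)

# The per-call override was renamed in the same way
benchmarker.benchmark(model="example-org/example-model", requires_safetensors=True)

# CLI equivalent, assuming the console script is still called `euroeval`:
#   euroeval --model example-org/example-model --requires-safetensors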
euroeval/constants.py CHANGED
@@ -1,7 +1,6 @@
"""Constants used throughout the project."""

from .enums import TaskGroup
-from .tasks import NER

# This is used as input to generative models; it cannot be a special token
DUMMY_FILL_VALUE = 100
@@ -11,7 +10,7 @@ DUMMY_FILL_VALUE = 100
# benchmark. We will still report the models' true maximum context length in the
# metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
# all tokens in the context.
-MAX_CONTEXT_LENGTH = 5_000
+MAX_CONTEXT_LENGTH = 8_192


# We need to raise the amount of tokens generated for reasoning models, to give them
@@ -37,21 +36,10 @@ GENERATIVE_DATASET_TASK_GROUPS = [TaskGroup.TEXT_TO_TEXT]
LOCAL_MODELS_REQUIRED_FILES = ["config.json"]


-# Tasks where we use structured generation for generative models
-TASKS_USING_JSON = [NER]
-
-
-# Tasks where we use log probabilities for generative models, rather than the raw
-# completion
-TASK_GROUPS_USING_LOGPROBS = [
-    TaskGroup.SEQUENCE_CLASSIFICATION,
-    TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION,
-]
-
-
# The number of top log probabilities to return for generative models. For several APIs
# this is the maximum number of log probabilities that can be returned
-MAX_LOGPROBS = 8
+MAX_VLLM_LOGPROBS = 20
+MAX_LITELLM_LOGPROBS = 8


# We make sure to remove these metric attributes after each iteration, to avoid memory
@@ -77,3 +65,19 @@ REASONING_TOKENS = [
# manually. We only use them as stop tokens if they actually appear in the model's
# output
CUSTOM_STOP_TOKENS = ["<sep>"]
+
+
+# For classification tasks we force LiteLLM models to output a JSON dictionary with a
+# single key and the values being restricted to the allowed labels. This is the key we
+# use
+LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
+
+
+# These characters are stripped from JSON output when trying to identify the label
+JSON_STRIP_CHARACTERS = ' {}\n\r":'
+
+
+# The number of tokens we generate when evaluating generative models on classification
+# tasks. We also use this to determine whether we should store logprobs in the model
+# outputs (and cache).
+NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
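The three new constants describe how a label is recovered from the JSON that LiteLLM models are forced to emit on classification tasks. The snippet below is only an illustration of how the key and the strip characters could fit together; it is not the library's actual parsing code:

import json

LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
JSON_STRIP_CHARACTERS = ' {}\n\r":'


def extract_label(raw_output: str) -> str:
    """Parse the label from a (possibly truncated) JSON classification output."""
    try:
        return json.loads(raw_output)[LITELLM_CLASSIFICATION_OUTPUT_KEY]
    except (json.JSONDecodeError, KeyError, TypeError):
        # Fall back to stripping JSON syntax, e.g. for '{"label": "negativ'
        stripped = raw_output.strip(JSON_STRIP_CHARACTERS)
        stripped = stripped.removeprefix(LITELLM_CLASSIFICATION_OUTPUT_KEY)
        return stripped.strip(JSON_STRIP_CHARACTERS)


print(extract_label('{"label": "positive"}'))  # positive
print(extract_label('{"label": "negativ'))     # negativ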
euroeval/data_loading.py CHANGED
@@ -12,6 +12,7 @@ from huggingface_hub.errors import HfHubHTTPError
from numpy.random import Generator

from .exceptions import HuggingFaceHubDown, InvalidBenchmark
+from .tasks import EUROPEAN_VALUES
from .utils import unscramble

if t.TYPE_CHECKING:
@@ -48,40 +49,45 @@ def load_data(
        dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
    )

-    if not benchmark_config.evaluate_test_split:
+    if not benchmark_config.evaluate_test_split and "val" in dataset:
        dataset["test"] = dataset["val"]

    # Remove empty examples from the datasets
    for text_feature in ["tokens", "text"]:
-        if text_feature in dataset["train"].features:
-            dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)
+        for split in dataset_config.splits:
+            if text_feature in dataset[split].features:
+                dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)

-    # If we are testing then truncate the test set
-    if hasattr(sys, "_called_from_test"):
+    # If we are testing then truncate the test set, unless we need the full set for
+    # evaluation
+    if hasattr(sys, "_called_from_test") and dataset_config.task != EUROPEAN_VALUES:
        dataset["test"] = dataset["test"].select(range(1))

-    # Bootstrap the splits
-    bootstrapped_splits: dict[str, list["Dataset"]] = dict()
-    for split in ["train", "val", "test"]:
-        bootstrap_indices = rng.integers(
-            0,
-            len(dataset[split]),
-            size=(benchmark_config.num_iterations, len(dataset[split])),
-        )
-        bootstrapped_splits[split] = [
-            dataset[split].select(bootstrap_indices[idx])
+    # Bootstrap the splits, if applicable
+    if dataset_config.bootstrap_samples:
+        bootstrapped_splits: dict[str, list["Dataset"]] = dict()
+        for split in dataset_config.splits:
+            bootstrap_indices = rng.integers(
+                0,
+                len(dataset[split]),
+                size=(benchmark_config.num_iterations, len(dataset[split])),
+            )
+            bootstrapped_splits[split] = [
+                dataset[split].select(bootstrap_indices[idx])
+                for idx in range(benchmark_config.num_iterations)
+            ]
+        datasets = [
+            DatasetDict(
+                {
+                    split: bootstrapped_splits[split][idx]
+                    for split in dataset_config.splits
+                }
+            )
            for idx in range(benchmark_config.num_iterations)
        ]
+    else:
+        datasets = [dataset] * benchmark_config.num_iterations

-    datasets = [
-        DatasetDict(
-            {
-                split: bootstrapped_splits[split][idx]
-                for split in ["train", "val", "test"]
-            }
-        )
-        for idx in range(benchmark_config.num_iterations)
-    ]
    return datasets


@@ -113,7 +119,7 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
            requests.ConnectionError,
            requests.ReadTimeout,
        ):
-            logger.warning(
+            logger.debug(
                f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
            )
            time.sleep(1)
@@ -126,11 +132,10 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
            f"{num_attempts} attempts."
        )
    assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
-    required_keys = ["train", "val", "test"]
-    missing_keys = [key for key in required_keys if key not in dataset]
+    missing_keys = [key for key in dataset_config.splits if key not in dataset]
    if missing_keys:
        raise InvalidBenchmark(
            "The dataset is missing the following required splits: "
            f"{', '.join(missing_keys)}"
        )
-    return DatasetDict({key: dataset[key] for key in required_keys})
+    return DatasetDict({key: dataset[key] for key in dataset_config.splits})
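Bootstrapping is now skipped entirely when `bootstrap_samples` is False and otherwise only touches the splits listed in `dataset_config.splits`. A self-contained sketch of the resampling scheme itself, with plain lists standing in for the Hugging Face splits and an arbitrary seed:

import numpy as np

rng = np.random.default_rng(seed=4242)
test_split = ["ex0", "ex1", "ex2", "ex3", "ex4"]
num_iterations = 3

# One row of indices per iteration, drawn with replacement
bootstrap_indices = rng.integers(
    0, len(test_split), size=(num_iterations, len(test_split))
)
bootstrapped_splits = [
    [test_split[i] for i in bootstrap_indices[idx]] for idx in range(num_iterations)
]
print(bootstrapped_splits)  # three resampled versions of the same split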
euroeval/data_models.py CHANGED
@@ -9,11 +9,14 @@ from dataclasses import dataclass, field
import pydantic
import torch

-from .enums import Device, InferenceBackend, ModelType, TaskGroup
-from .metrics import Metric
+from .enums import Device, GenerativeType, ModelType, TaskGroup
from .types import ScoreDict
from .utils import get_package_version

+if t.TYPE_CHECKING:
+    from .enums import InferenceBackend
+    from .metrics import Metric
+

@dataclass
class Language:
@@ -104,15 +107,58 @@ class Task:
            using few-shot evaluation.
        default_labels:
            The default labels for datasets using this task.
+        requires_zero_shot (optional):
+            Whether to only allow zero-shot evaluation for this task. If True, the
+            task will not be evaluated using few-shot examples.
+        uses_structured_output (optional):
+            Whether the task uses structured output. If True, the task will return
+            structured output (e.g., BIO tags for NER). Defaults to False.
+        uses_logprobs (optional):
+            Whether the task uses log probabilities. If True, the task will return
+            log probabilities for the generated tokens. Defaults to False.
+        requires_logprobs (optional):
+            Whether the task requires log probabilities. Implies `uses_logprobs`.
+        allowed_model_types (optional):
+            A list of model types that are allowed to be evaluated on this task.
+            Defaults to all model types being allowed.
+        allowed_generative_types (optional):
+            A list of generative model types that are allowed to be evaluated on this
+            task. If None, all generative model types are allowed. Only relevant if
+            `allowed_model_types` includes generative models.
+        allow_invalid_model_outputs (optional):
+            Whether to allow invalid model outputs. This is only relevant for generative
+            models on classification tasks, where the model may generate an output
+            which is not one of the allowed labels. If True, the model output will be
+            mapped to the closest valid label. If False, the model output will be
+            considered incorrect and the evaluation will be aborted. Defaults to True.
    """

    name: str
    task_group: TaskGroup
    template_dict: dict["Language", "PromptConfig"]
-    metrics: list[Metric]
+    metrics: list["Metric"]
    default_num_few_shot_examples: int
    default_max_generated_tokens: int
    default_labels: list[str]
+    requires_zero_shot: bool = False
+    uses_structured_output: bool = False
+    uses_logprobs: bool = False
+    requires_logprobs: bool = False
+    allowed_model_types: list[ModelType] = field(
+        default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
+    )
+    allowed_generative_types: list[GenerativeType] = field(
+        default_factory=lambda: [
+            GenerativeType.BASE,
+            GenerativeType.INSTRUCTION_TUNED,
+            GenerativeType.REASONING,
+        ]
+    )
+    allow_invalid_model_outputs: bool = True
+
+    def __post_init__(self) -> None:
+        """Post-initialisation checks."""
+        self.uses_logprobs = self.uses_logprobs or self.requires_logprobs

    def __hash__(self) -> int:
        """Return a hash of the task."""
@@ -177,7 +223,7 @@ class BenchmarkConfig:
            Whether to run the benchmark in debug mode.
        run_with_cli:
            Whether the benchmark is being run with the CLI.
-        only_allow_safetensors:
+        requires_safetensors:
            Whether to only allow models that use the safetensors format.
    """
@@ -204,7 +250,7 @@ class BenchmarkConfig:
    gpu_memory_utilization: float
    debug: bool
    run_with_cli: bool
-    only_allow_safetensors: bool
+    requires_safetensors: bool


class BenchmarkConfigParams(pydantic.BaseModel):
@@ -236,7 +282,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
    gpu_memory_utilization: float
    debug: bool
    run_with_cli: bool
-    only_allow_safetensors: bool
+    requires_safetensors: bool


class BenchmarkResult(pydantic.BaseModel):
@@ -356,6 +402,11 @@ class DatasetConfig:
            to a 1:1 mapping between the labels and themselves. If None then the mapping
            will be set to the default mapping for the task and language. Defaults to
            None.
+        splits (optional):
+            The names of the splits in the dataset. If not provided, defaults to
+            ["train", "val", "test"].
+        bootstrap_samples (optional):
+            Whether to bootstrap the dataset samples. Defaults to True.
        unofficial (optional):
            Whether the dataset is unofficial. Defaults to False.
    """
@@ -372,6 +423,8 @@ class DatasetConfig:
    _max_generated_tokens: int | None = None
    _labels: list[str] | None = None
    _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
+    splits: list[str] = field(default_factory=lambda: ["train", "val", "test"])
+    bootstrap_samples: bool = True
    unofficial: bool = False

    @property
@@ -384,7 +437,6 @@ class DatasetConfig:
            if self._prompt_prefix is None
            else self._prompt_prefix
        )
-        prompt_prefix = prompt_prefix.replace("{labels_str}", self._labels_str)
        return prompt_prefix

    @property
@@ -397,7 +449,6 @@ class DatasetConfig:
            if self._prompt_template is None
            else self._prompt_template
        )
-        prompt_template = prompt_template.replace("{labels_str}", self._labels_str)
        return prompt_template

    @property
@@ -410,9 +461,6 @@ class DatasetConfig:
            if self._instruction_prompt is None
            else self._instruction_prompt
        )
-        instruction_prompt = instruction_prompt.replace(
-            "{labels_str}", self._labels_str
-        )
        return instruction_prompt

    @property
@@ -473,15 +521,16 @@ class DatasetConfig:
        """Return a hash of the dataset configuration."""
        return hash(self.name)

-    @property
-    def _labels_str(self) -> str:
+    def get_labels_str(self, labels: list[str] | None = None) -> str:
        """Converts a set of labels to a natural string, in the specified language.

        If the task is NER, we separate using 'and' and use the mapped labels instead of
        the BIO NER labels.

        Args:
-            language: The language to be used when converting the labels.
+            labels (optional):
+                The labels to convert to a natural string. If None, uses all the labels
+                in the dataset. Defaults to None.

        Returns:
            The natural string representation of the labels in specified language.
@@ -493,16 +542,17 @@ class DatasetConfig:
        else:
            sep_word = main_language.or_separator

-        local_labels: list[str] = []
-        for label in self.labels:
-            if label not in self.prompt_label_mapping:
-                continue
-            local_label = self.prompt_label_mapping[label]
-            if local_label not in local_labels:
-                local_labels.append(local_label)
+        if labels is None:
+            labels = list()
+            for english_label in self.labels:
+                if english_label not in self.prompt_label_mapping:
+                    continue
+                label = self.prompt_label_mapping[english_label]
+                if label not in labels:
+                    labels.append(label)

        # Convert labels to single-quoted labels - and remove duplicates
-        quoted_labels = [f"'{label}'" for label in local_labels]
+        quoted_labels = [f"'{label}'" for label in labels]

        if not quoted_labels:
            return ""
@@ -546,7 +596,7 @@ class ModelConfig:
    revision: str
    task: str
    languages: list[Language]
-    inference_backend: InferenceBackend
+    inference_backend: "InferenceBackend"
    merge: bool
    model_type: ModelType
    fresh: bool
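A trimmed-down, hypothetical mirror of the new `Task` flags (the real class also carries prompt templates, metrics and more) shows the one behavioural coupling added in `__post_init__`: requiring log probabilities implies using them.

from dataclasses import dataclass, field
from enum import Enum, auto


class ModelType(Enum):
    ENCODER = auto()
    GENERATIVE = auto()


@dataclass
class MiniTask:
    name: str
    requires_zero_shot: bool = False
    uses_logprobs: bool = False
    requires_logprobs: bool = False
    allowed_model_types: list[ModelType] = field(
        default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
    )

    def __post_init__(self) -> None:
        # Requiring logprobs only makes sense if the task also uses them
        self.uses_logprobs = self.uses_logprobs or self.requires_logprobs


task = MiniTask(name="toy-task", requires_zero_shot=True, requires_logprobs=True)
assert task.uses_logprobs  # switched on by requires_logprobs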
euroeval/dataset_configs/__init__.py CHANGED
@@ -6,12 +6,14 @@ from ..tasks import SPEED
from .danish import *  # noqa: F403
from .dutch import *  # noqa: F403
from .english import *  # noqa: F403
+from .estonian import *  # noqa: F403
from .faroese import *  # noqa: F403
from .finnish import *  # noqa: F403
from .french import *  # noqa: F403
from .german import *  # noqa: F403
from .icelandic import *  # noqa: F403
from .italian import *  # noqa: F403
+from .latvian import *  # noqa: F403
from .norwegian import *  # noqa: F403
from .portuguese import *  # noqa: F403
from .spanish import *  # noqa: F403
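With the Estonian and Latvian modules star-imported above, their dataset configurations join the registry alongside the existing languages. A hedged sketch of listing them, assuming `get_all_dataset_configs` still returns a mapping from dataset name to `DatasetConfig` and that `Language` objects expose a `code` attribute:

from euroeval.dataset_configs import get_all_dataset_configs

configs = get_all_dataset_configs()
new_language_datasets = sorted(
    name
    for name, config in configs.items()
    if any(language.code in {"et", "lv"} for language in config.languages)
)
print(new_language_datasets)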
euroeval/dataset_configs/danish.py CHANGED
@@ -2,7 +2,7 @@

from ..data_models import DatasetConfig
from ..languages import DA
-from ..tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SUMM
+from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM

### Official datasets ###

@@ -76,6 +76,16 @@ HELLASWAG_DA_CONFIG = DatasetConfig(
    languages=[DA],
)

+EUROPEAN_VALUES_DA_CONFIG = DatasetConfig(
+    name="european-values-da",
+    pretty_name="the Danish version of the European values evaluation dataset",
+    huggingface_id="EuroEval/european-values-da",
+    task=EUROPEAN_VALUES,
+    languages=[DA],
+    splits=["test"],
+    bootstrap_samples=False,
+)
+

### Unofficial datasets ###

@@ -138,3 +148,27 @@ GOLDENSWAG_DA_CONFIG = DatasetConfig(
    languages=[DA],
    unofficial=True,
)
+
+EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
+    name="european-values-situational-da",
+    pretty_name="the Danish version of the European values evaluation dataset, where "
+    "the questions are phrased in a situational way",
+    huggingface_id="EuroEval/european-values-situational-da",
+    task=EUROPEAN_VALUES,
+    languages=[DA],
+    splits=["test"],
+    bootstrap_samples=False,
+    unofficial=True,
+)
+
+EUROPEAN_VALUES_COMPLETIONS_DA_CONFIG = DatasetConfig(
+    name="european-values-completions-da",
+    pretty_name="the Danish version of the European values evaluation dataset, where "
+    "the questions are phrased as sentence completions",
+    huggingface_id="EuroEval/european-values-completions-da",
+    task=EUROPEAN_VALUES,
+    languages=[DA],
+    splits=["test"],
+    bootstrap_samples=False,
+    unofficial=True,
+)
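These European values configs are the first to use `splits=["test"]` and `bootstrap_samples=False`, which is exactly what the loader and benchmarker changes above cater for: with no validation split, evaluation quietly falls back to the test split. Running one should need nothing new from the caller; a sketch with a purely illustrative model ID, assuming `benchmark` keeps its `model` and `dataset` keywords:

from euroeval import Benchmarker

benchmarker = Benchmarker()
benchmarker.benchmark(
    model="example-org/example-model",
    dataset="european-values-da",
)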