ScandEval: scandeval-16.11.0-py3-none-any.whl → scandeval-16.13.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- scandeval/__init__.py +0 -9
- scandeval/async_utils.py +46 -0
- scandeval/benchmark_config_factory.py +31 -2
- scandeval/benchmark_modules/fresh.py +2 -1
- scandeval/benchmark_modules/hf.py +76 -23
- scandeval/benchmark_modules/litellm.py +33 -15
- scandeval/benchmark_modules/vllm.py +97 -44
- scandeval/benchmarker.py +29 -33
- scandeval/cli.py +11 -0
- scandeval/constants.py +36 -2
- scandeval/custom_dataset_configs.py +152 -0
- scandeval/data_loading.py +87 -31
- scandeval/data_models.py +405 -224
- scandeval/dataset_configs/__init__.py +51 -25
- scandeval/dataset_configs/albanian.py +1 -1
- scandeval/dataset_configs/belarusian.py +47 -0
- scandeval/dataset_configs/bulgarian.py +1 -1
- scandeval/dataset_configs/catalan.py +1 -1
- scandeval/dataset_configs/croatian.py +1 -1
- scandeval/dataset_configs/danish.py +3 -2
- scandeval/dataset_configs/dutch.py +16 -5
- scandeval/dataset_configs/english.py +4 -3
- scandeval/dataset_configs/estonian.py +8 -7
- scandeval/dataset_configs/faroese.py +1 -1
- scandeval/dataset_configs/finnish.py +5 -4
- scandeval/dataset_configs/french.py +6 -5
- scandeval/dataset_configs/german.py +4 -3
- scandeval/dataset_configs/greek.py +1 -1
- scandeval/dataset_configs/hungarian.py +1 -1
- scandeval/dataset_configs/icelandic.py +4 -3
- scandeval/dataset_configs/italian.py +4 -3
- scandeval/dataset_configs/latvian.py +2 -2
- scandeval/dataset_configs/lithuanian.py +1 -1
- scandeval/dataset_configs/norwegian.py +6 -5
- scandeval/dataset_configs/polish.py +4 -3
- scandeval/dataset_configs/portuguese.py +5 -4
- scandeval/dataset_configs/romanian.py +2 -2
- scandeval/dataset_configs/serbian.py +1 -1
- scandeval/dataset_configs/slovene.py +1 -1
- scandeval/dataset_configs/spanish.py +4 -3
- scandeval/dataset_configs/swedish.py +4 -3
- scandeval/dataset_configs/ukrainian.py +1 -1
- scandeval/generation_utils.py +6 -6
- scandeval/metrics/__init__.py +1 -0
- scandeval/metrics/bias.py +237 -0
- scandeval/metrics/huggingface.py +2 -1
- scandeval/metrics/llm_as_a_judge.py +1 -1
- scandeval/metrics/pipeline.py +1 -1
- scandeval/model_cache.py +34 -4
- scandeval/prompt_templates/linguistic_acceptability.py +9 -0
- scandeval/prompt_templates/multiple_choice.py +9 -0
- scandeval/prompt_templates/named_entity_recognition.py +21 -0
- scandeval/prompt_templates/reading_comprehension.py +10 -0
- scandeval/prompt_templates/sentiment_classification.py +11 -0
- scandeval/string_utils.py +157 -0
- scandeval/task_group_utils/sequence_classification.py +2 -5
- scandeval/task_group_utils/token_classification.py +2 -4
- scandeval/tasks.py +22 -0
- scandeval/tokenisation_utils.py +12 -1
- scandeval/utils.py +13 -383
- scandeval-16.13.0.dist-info/METADATA +334 -0
- scandeval-16.13.0.dist-info/RECORD +94 -0
- scandeval-16.11.0.dist-info/METADATA +0 -649
- scandeval-16.11.0.dist-info/RECORD +0 -89
- {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
- {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
- {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/data_models.py
CHANGED

@@ -1,7 +1,10 @@
 """Data models used in EuroEval."""
 
 import collections.abc as c
+import importlib.metadata
+import importlib.util
 import json
+import logging
 import re
 import typing as t
 from copy import deepcopy
@@ -12,6 +15,7 @@ import pydantic
 import torch
 from transformers.generation.configuration_utils import GenerationConfig
 
+from .constants import ATTENTION_BACKENDS, MAX_NUMBER_OF_LOGGING_LANGUAGES
 from .enums import Device, GenerativeType, ModelType, TaskGroup
 from .exceptions import InvalidBenchmark
 from .languages import (
@@ -23,14 +27,30 @@ from .languages import (
     PORTUGUESE,
     Language,
 )
+from .logging_utils import log_once
 from .metrics.base import Metric
 from .types import ScoreDict
-from .utils import get_package_version
 
 if t.TYPE_CHECKING:
     from .enums import InferenceBackend
 
 
+def get_package_version(package_name: str) -> str | None:
+    """Get the version of a package.
+
+    Args:
+        package_name:
+            The name of the package.
+
+    Returns:
+        The version of the package, or None if the package is not installed.
+    """
+    try:
+        return importlib.metadata.version(package_name)
+    except importlib.metadata.PackageNotFoundError:
+        return None
+
+
 @dataclass
 class PromptConfig:
     """Configuration for task-specific prompting across languages.
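Note: `get_package_version` is now vendored into `data_models.py` rather than imported from `scandeval/utils.py` (which shrinks by roughly 370 lines in this release, per the file list above). A minimal sketch of the behaviour it wraps; the package names below are illustrative, not taken from the diff:

import importlib.metadata

# importlib.metadata.version returns the installed distribution's version
# string and raises PackageNotFoundError when it is absent; the vendored
# helper simply converts that exception into None.
for pkg in ("torch", "definitely-not-installed"):  # illustrative names
    try:
        print(pkg, importlib.metadata.version(pkg))
    except importlib.metadata.PackageNotFoundError:
        print(pkg, None)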
@@ -79,8 +99,9 @@ class Task:
         default_max_generated_tokens:
             The default maximum number of tokens to generate when benchmarking the task
             using few-shot evaluation.
-        default_labels:
-            The default labels for datasets using this task.
+        default_labels (optional):
+            The default labels for datasets using this task. Can be None if the labels
+            should be set manually in the dataset configs. Defaults to an empty tuple.
         requires_zero_shot (optional):
             Whether to only allow zero-shot evaluation for this task. If True, the
             task will not be evaluated using few-shot examples.
@@ -117,7 +138,7 @@ class Task:
     metrics: c.Sequence[Metric]
     default_num_few_shot_examples: int
     default_max_generated_tokens: int
-    default_labels: c.Sequence[str] | None
+    default_labels: c.Sequence[str] | None = tuple()
     requires_zero_shot: bool = False
     uses_structured_output: bool = False
     uses_logprobs: bool = False
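The new `tuple()` default makes `default_labels` optional when defining tasks. A reduced sketch of why an immutable tuple works as a plain dataclass default (`MiniTask` is invented for illustration, not the real `Task` class):

import collections.abc as c
from dataclasses import dataclass


@dataclass
class MiniTask:
    name: str
    # tuple() is immutable, so it is safe as a shared default; a list here
    # would raise "mutable default ... use default_factory" at class creation.
    default_labels: c.Sequence[str] | None = tuple()


print(MiniTask(name="sentiment").default_labels)  # ()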
@@ -143,133 +164,362 @@ class Task:
         return hash(self.name)
 
 
-@dataclass
 class DatasetConfig:
-    """Configuration for a dataset.
+    """Configuration for a dataset."""
+
+    def __init__(
+        self,
+        task: Task,
+        languages: c.Sequence[Language],
+        name: str | None = None,
+        pretty_name: str | None = None,
+        source: str | dict[str, str] | None = None,
+        prompt_prefix: str | None = None,
+        prompt_template: str | None = None,
+        instruction_prompt: str | None = None,
+        num_few_shot_examples: int | None = None,
+        max_generated_tokens: int | None = None,
+        labels: c.Sequence[str] | None = None,
+        prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None,
+        allowed_model_types: c.Sequence[ModelType] | None = None,
+        allowed_generative_types: c.Sequence[GenerativeType] | None = None,
+        allow_invalid_model_outputs: bool | None = None,
+        train_split: str | None = "train",
+        val_split: str | None = "val",
+        test_split: str = "test",
+        bootstrap_samples: bool = True,
+        unofficial: bool = False,
+        _prompt_prefix: str | None = None,
+        _prompt_template: str | None = None,
+        _instruction_prompt: str | None = None,
+        _num_few_shot_examples: int | None = None,
+        _max_generated_tokens: int | None = None,
+        _labels: c.Sequence[str] | None = None,
+        _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None,
+        _allowed_model_types: c.Sequence[ModelType] | None = None,
+        _allowed_generative_types: c.Sequence[GenerativeType] | None = None,
+        _allow_invalid_model_outputs: bool | None = None,
+        _logging_string: str | None = None,
+    ) -> None:
+        """Initialise a DatasetConfig object.
 
-[… 72 removed lines not rendered in this diff view …]
+        Args:
+            task:
+                The task of the dataset.
+            languages:
+                The ISO 639-1 language codes of the entries in the dataset.
+            name (optional):
+                The name of the dataset. Must be lower case with no spaces. Can be None
+                if and only if the dataset config resides directly in the Hugging Face
+                dataset repo. Defaults to None.
+            pretty_name (optional):
+                A longer prettier name for the dataset, which allows cases and spaces.
+                Used for logging. Can be None if and only if the dataset config resides
+                directly in the Hugging Face dataset repo. Defaults to None.
+            source (optional):
+                The source of the dataset, which can be a Hugging Face ID or a
+                dictionary with keys "train", "val" and "test" mapping to local CSV file
+                paths. Can be None if and only if the dataset config resides directly in
+                the Hugging Face dataset repo. Defaults to None.
+            prompt_prefix (optional):
+                The prefix to use in the few-shot prompt. Defaults to the template for
+                the task and language.
+            prompt_template (optional):
+                The template for the prompt to use when benchmarking the dataset using
+                few-shot evaluation. Defaults to the template for the task and language.
+            instruction_prompt (optional):
+                The prompt to use when benchmarking the dataset using instruction-based
+                evaluation. Defaults to the template for the task and language.
+            num_few_shot_examples (optional):
+                The number of examples to use when benchmarking the dataset using
+                few-shot evaluation. For a classification task, these will be drawn
+                evenly from each label. Defaults to the template for the task and
+                language.
+            max_generated_tokens (optional):
+                The maximum number of tokens to generate when benchmarking the dataset
+                using few-shot evaluation. Defaults to the template for the task and
+                language.
+            labels (optional):
+                The labels in the dataset. Defaults to the template for the task and
+                language.
+            prompt_label_mapping (optional):
+                A mapping from the labels to another phrase which is used as a
+                substitute for the label in few-shot evaluation. If "auto" then the
+                mapping will be set to a 1:1 mapping between the labels and themselves.
+                If None then the mapping will be set to the default mapping for the task
+                and language. Defaults to None.
+            allowed_model_types (optional):
+                A list of model types that are allowed to be evaluated on this dataset.
+                Defaults to the one for the task.
+            allowed_generative_types (optional):
+                A list of generative model types that are allowed to be evaluated on
+                this dataset. If None, all generative model types are allowed. Only
+                relevant if `allowed_model_types` includes generative models. Defaults
+                to the one for the task.
+            allow_invalid_model_outputs (optional):
+                Whether to allow invalid model outputs. This is only relevant for
+                generative models on classification tasks, where the model may generate
+                an output which is not one of the allowed labels. If True, the model
+                output will be mapped to the closest valid label. If False, the model
+                output will be considered incorrect and the evaluation will be aborted.
+                Defaults to the one for the task.
+            train_split (optional):
+                The name of the split to use as the training set. Can be None if there
+                is no training split in the dataset. Defaults to "train".
+            val_split (optional):
+                The name of the split to use as the validation set. Can be None if there
+                is no validation split in the dataset. Defaults to "val".
+            test_split (optional):
+                The name of the split to use as the test set. Defaults to "test".
+            bootstrap_samples (optional):
+                Whether to bootstrap the dataset samples. Defaults to True.
+            unofficial (optional):
+                Whether the dataset is unofficial. Defaults to False.
+            _prompt_prefix (optional):
+                This argument is deprecated. Please use `prompt_prefix` instead.
+            _prompt_template (optional):
+                This argument is deprecated. Please use `prompt_template` instead.
+            _instruction_prompt (optional):
+                This argument is deprecated. Please use `instruction_prompt` instead.
+            _num_few_shot_examples (optional):
+                This argument is deprecated. Please use `num_few_shot_examples` instead.
+            _max_generated_tokens (optional):
+                This argument is deprecated. Please use `max_generated_tokens` instead.
+            _labels (optional):
+                This argument is deprecated. Please use `labels` instead.
+            _prompt_label_mapping (optional):
+                This argument is deprecated. Please use `prompt_label_mapping` instead.
+            _allowed_model_types (optional):
+                This argument is deprecated. Please use `allowed_model_types` instead.
+            _allowed_generative_types (optional):
+                This argument is deprecated. Please use `allowed_generative_types`
+                instead.
+            _allow_invalid_model_outputs (optional):
+                This argument is deprecated. Please use `allow_invalid_model_outputs`
+                instead.
+            _logging_string (optional):
+                This argument is deprecated and is not used anymore.
+        """
+        # Deprecation warnings
+        if _prompt_prefix is not None:
+            log_once(
+                "The `_prompt_prefix` argument is deprecated. Please use "
+                "`prompt_prefix` instead.",
+                level=logging.WARNING,
+            )
+            prompt_prefix = _prompt_prefix
+        if _prompt_template is not None:
+            log_once(
+                "The `_prompt_template` argument is deprecated. Please use "
+                "`prompt_template` instead.",
+                level=logging.WARNING,
+            )
+            prompt_template = _prompt_template
+        if _instruction_prompt is not None:
+            log_once(
+                "The `_instruction_prompt` argument is deprecated. Please use "
+                "`instruction_prompt` instead.",
+                level=logging.WARNING,
+            )
+            instruction_prompt = _instruction_prompt
+        if _num_few_shot_examples is not None:
+            log_once(
+                "The `_num_few_shot_examples` argument is deprecated. Please use "
+                "`num_few_shot_examples` instead.",
+                level=logging.WARNING,
+            )
+            num_few_shot_examples = _num_few_shot_examples
+        if _max_generated_tokens is not None:
+            log_once(
+                "The `_max_generated_tokens` argument is deprecated. Please use "
+                "`max_generated_tokens` instead.",
+                level=logging.WARNING,
+            )
+            max_generated_tokens = _max_generated_tokens
+        if _labels is not None:
+            log_once(
+                "The `_labels` argument is deprecated. Please use `labels` instead.",
+                level=logging.WARNING,
+            )
+            labels = _labels
+        if _prompt_label_mapping is not None:
+            log_once(
+                "The `_prompt_label_mapping` argument is deprecated. Please use "
+                "`prompt_label_mapping` instead.",
+                level=logging.WARNING,
+            )
+            prompt_label_mapping = _prompt_label_mapping
+        if _allowed_model_types is not None:
+            log_once(
+                "The `_allowed_model_types` argument is deprecated. Please use "
+                "`allowed_model_types` instead.",
+                level=logging.WARNING,
+            )
+            allowed_model_types = _allowed_model_types
+        if _allowed_generative_types is not None:
+            log_once(
+                "The `_allowed_generative_types` argument is deprecated. Please use "
+                "`allowed_generative_types` instead.",
+                level=logging.WARNING,
+            )
+            allowed_generative_types = _allowed_generative_types
+        if _allow_invalid_model_outputs is not None:
+            log_once(
+                "The `_allow_invalid_model_outputs` argument is deprecated. Please use "
+                "`allow_invalid_model_outputs` instead.",
+                level=logging.WARNING,
+            )
+            allow_invalid_model_outputs = _allow_invalid_model_outputs
+        if _logging_string is not None:
+            log_once(
+                "The `_logging_string` argument is deprecated and is not used anymore. "
+                "Using it will have no effect.",
+                level=logging.WARNING,
+            )
 
-[… 19 removed lines not rendered in this diff view …]
+        self._name = name
+        self._pretty_name = pretty_name
+        self._source = source
+        self.task = task
+        self.languages = languages
+
+        template = self.task.template_dict.get(self.main_language)
+        self.prompt_prefix = (
+            prompt_prefix
+            if prompt_prefix is not None
+            else template.default_prompt_prefix
+            if template is not None
+            else ""
+        )
+        self.prompt_template = (
+            prompt_template
+            if prompt_template is not None
+            else template.default_prompt_template
+            if template is not None
+            else ""
+        )
+        self.instruction_prompt = (
+            instruction_prompt
+            if instruction_prompt is not None
+            else template.default_instruction_prompt
+            if template is not None
+            else ""
+        )
+        self.num_few_shot_examples = (
+            num_few_shot_examples
+            if num_few_shot_examples is not None
+            else self.task.default_num_few_shot_examples
+        )
+        self.max_generated_tokens = (
+            max_generated_tokens
+            if max_generated_tokens is not None
+            else self.task.default_max_generated_tokens
+        )
+        self.labels = (
+            labels if labels is not None else self.task.default_labels or list()
+        )
+        if prompt_label_mapping is None:
+            prompt_label_mapping = (
+                template.default_prompt_label_mapping
+                if template is not None
+                else dict()
+            )
+        self.prompt_label_mapping = (
+            {label: label for label in self.labels}
+            if prompt_label_mapping == "auto"
+            else prompt_label_mapping
+        )
+        self.allowed_model_types = (
+            allowed_model_types
+            if allowed_model_types is not None
+            else self.task.default_allowed_model_types
+        )
+        self.allowed_generative_types = (
+            allowed_generative_types
+            if allowed_generative_types is not None
+            else self.task.default_allowed_generative_types
+        )
+        self.allow_invalid_model_outputs = (
+            allow_invalid_model_outputs
+            if allow_invalid_model_outputs is not None
+            else self.task.default_allow_invalid_model_outputs
+        )
+        self.train_split = train_split
+        self.val_split = val_split
+        self.test_split = test_split
+        self.bootstrap_samples = bootstrap_samples
+        self.unofficial = unofficial
 
     @property
-    def
-        """
+    def name(self) -> str:
+        """The name of the dataset.
 
         Returns:
-            The
+            The name of the dataset.
         """
-[… 16 removed lines not rendered in this diff view …]
+        if self._name is None:
+            raise ValueError("The name of the dataset is not set!")
+        return self._name
+
+    @name.setter
+    def name(self, value: str) -> None:
+        """Set the name of the dataset.
+
+        Args:
+            value:
+                The new name of the dataset.
+        """
+        self._name = value
+
+    @property
+    def pretty_name(self) -> str:
+        """The pretty name of the dataset.
+
+        Returns:
+            The pretty name of the dataset.
+        """
+        if self._pretty_name is None:
+            raise ValueError("The pretty name of the dataset is not set!")
+        return self._pretty_name
+
+    @pretty_name.setter
+    def pretty_name(self, value: str) -> None:
+        """Set the pretty name of the dataset.
+
+        Args:
+            value:
+                The new pretty name of the dataset.
+        """
+        self._pretty_name = value
+
+    @property
+    def source(self) -> str | dict[str, str]:
+        """The source of the dataset.
+
+        Returns:
+            The source of the dataset.
+        """
+        if self._source is None:
+            raise ValueError("The source of the dataset is not set!")
+        return self._source
+
+    @source.setter
+    def source(self, value: str | dict[str, str]) -> None:
+        """Set the source of the dataset.
+
+        Args:
+            value:
+                The new source of the dataset.
+        """
+        self._source = value
 
     @property
     def logging_string(self) -> str:
-        """The string used to describe evaluation on the dataset in logging.
-        if self._logging_string is not None:
-            return self._logging_string
+        """The string used to describe evaluation on the dataset in logging.
 
+        Returns:
+            The logging string.
+        """
         truncated_str = (
             "truncated version of the "
             if isinstance(self.source, str) and self.source.endswith("-mini")
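With `DatasetConfig` now a plain class instead of a dataclass, custom datasets are configured through public keyword arguments, while the old underscore-prefixed arguments keep working behind a deprecation warning. A hypothetical usage sketch; the dataset name, file paths, and the `DA`/`SENT` constants are assumptions, not taken from this diff:

from scandeval.data_models import DatasetConfig
from scandeval.languages import DA   # assumed Danish Language constant
from scandeval.tasks import SENT     # assumed sentiment-classification Task

config = DatasetConfig(
    task=SENT,
    languages=[DA],
    name="my-sentiment-dataset",
    pretty_name="My Sentiment Dataset",
    # a Hugging Face ID, or a dict of local CSV paths per split
    source={"train": "train.csv", "val": "val.csv", "test": "test.csv"},
    prompt_label_mapping="auto",  # map each label to itself in prompts
)
print(config.name, config.main_language.name)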
@@ -293,126 +543,48 @@ class DatasetConfig:
         if PORTUGUESE in self.languages and EUROPEAN_PORTUGUESE in self.languages:
             logging_languages.remove(EUROPEAN_PORTUGUESE)
 
-        if len(logging_languages) >
+        if len(logging_languages) > MAX_NUMBER_OF_LOGGING_LANGUAGES:
+            languages_str = ""
+        elif len(logging_languages) > 1:
             languages_str = (
                 ", ".join([lang.name for lang in logging_languages[:-1]])
                 + f" and {logging_languages[-1].name}"
+                + " "
             )
         else:
-            languages_str = logging_languages[0].name
+            languages_str = logging_languages[0].name + " "
 
         task_str = self.task.name.replace("-", " ")
         dataset_name_str = (
             self.pretty_name or self.name.replace("-", " ").replace("_", " ").title()
         )
         return (
-            f"the {truncated_str}{languages_str}
-        )
-
-    @property
-    def prompt_prefix(self) -> str:
-        """The prefix to use in the few-shot prompt."""
-        prompt_config = self.task.template_dict[self.main_language]
-        prompt_prefix = (
-            prompt_config.default_prompt_prefix
-            if self._prompt_prefix is None
-            else self._prompt_prefix
-        )
-        return prompt_prefix
-
-    @property
-    def prompt_template(self) -> str:
-        """The template used during few-shot evaluation."""
-        prompt_config = self.task.template_dict[self.main_language]
-        prompt_template = (
-            prompt_config.default_prompt_template
-            if self._prompt_template is None
-            else self._prompt_template
-        )
-        return prompt_template
-
-    @property
-    def instruction_prompt(self) -> str:
-        """The prompt to use when evaluating instruction-tuned models."""
-        prompt_config = self.task.template_dict[self.main_language]
-        instruction_prompt = (
-            prompt_config.default_instruction_prompt
-            if self._instruction_prompt is None
-            else self._instruction_prompt
-        )
-        return instruction_prompt
-
-    @property
-    def num_few_shot_examples(self) -> int:
-        """The number of few-shot examples to use."""
-        return (
-            self._num_few_shot_examples
-            if self._num_few_shot_examples is not None
-            else self.task.default_num_few_shot_examples
-        )
-
-    @property
-    def max_generated_tokens(self) -> int:
-        """The maximum number of tokens to generate when evaluating a model."""
-        return (
-            self._max_generated_tokens
-            if self._max_generated_tokens is not None
-            else self.task.default_max_generated_tokens
-        )
-
-    @property
-    def labels(self) -> c.Sequence[str]:
-        """The labels in the dataset."""
-        if self._labels is not None:
-            return self._labels
-        elif self.task.default_labels is not None:
-            return self.task.default_labels
-        else:
-            raise ValueError(
-                f"Labels must be specified for dataset {self.name!r} with the "
-                f"attribute `_labels`, as the task {self.task.name!r} does not have "
-                "default labels."
-            )
-
-    @property
-    def prompt_label_mapping(self) -> dict[str, str]:
-        """Mapping from English labels to localised labels."""
-        if self._prompt_label_mapping == "auto":
-            return {label: label for label in self.labels}
-        elif self._prompt_label_mapping is not None:
-            return self._prompt_label_mapping
-        prompt_config = self.task.template_dict[self.main_language]
-        if prompt_config.default_prompt_label_mapping == "auto":
-            return {label: label for label in self.labels}
-        else:
-            return prompt_config.default_prompt_label_mapping
-
-    @property
-    def allowed_model_types(self) -> c.Sequence[ModelType]:
-        """A list of model types that are allowed to be evaluated on this dataset."""
-        return (
-            self._allowed_model_types
-            if self._allowed_model_types is not None
-            else self.task.default_allowed_model_types
+            f"the {truncated_str}{languages_str}{task_str} dataset {dataset_name_str}"
         )
 
     @property
-    def
-        """
-        return (
-            self._allowed_generative_types
-            if self._allowed_generative_types is not None
-            else self.task.default_allowed_generative_types
-        )
+    def main_language(self) -> Language:
+        """Get the main language of the dataset.
 
-
-
-        """
-
-
-
-
-
+        Returns:
+            The main language.
+        """
+        match len(self.languages):
+            case 0:
+                raise InvalidBenchmark(
+                    f"Dataset {self.name!r} must have at least one language."
+                )
+            case 1:
+                return self.languages[0]
+            case _:
+                if ENGLISH in self.languages:
+                    return ENGLISH
+                elif NORWEGIAN in self.languages:
+                    return NORWEGIAN
+                elif PORTUGUESE in self.languages:
+                    return PORTUGUESE
+                else:
+                    return self.languages[0]
 
     @property
     def id2label(self) -> "HashableDict":
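The new `main_language` property replaces the repeated `template_dict[self.main_language]` lookups in the old per-field properties and now guards against empty language lists. A reduced sketch of the selection order, with plain strings standing in for `Language` objects:

# English, then Norwegian, then Portuguese, then the first listed language.
def pick_main_language(languages: list[str]) -> str:
    match len(languages):
        case 0:
            raise ValueError("at least one language is required")
        case 1:
            return languages[0]
        case _:
            for preferred in ("english", "norwegian", "portuguese"):
                if preferred in languages:
                    return preferred
            return languages[0]


print(pick_main_language(["danish", "norwegian"]))  # norwegian
print(pick_main_language(["danish", "faroese"]))    # danish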
@@ -517,6 +689,9 @@ class BenchmarkConfig:
             faster evaluation, but at the risk of running out of GPU memory. Only reduce
             this if you are running out of GPU memory. Only relevant if the model is
             generative.
+        attention_backend:
+            The attention backend to use for vLLM. Defaults to FLASHINFER. Only
+            relevant if the model is generative.
         requires_safetensors:
             Whether to only allow models that use the safetensors format.
         generative_type:
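How the new `attention_backend` option reaches vLLM is handled in `scandeval/benchmark_modules/vllm.py`, which is not part of this hunk. vLLM itself selects its backend via the `VLLM_ATTENTION_BACKEND` environment variable, so the wiring plausibly looks something like this sketch (the function name is invented):

import os


def apply_attention_backend(backend: str) -> None:
    # vLLM reads its attention backend (e.g. "FLASHINFER") from this
    # environment variable; it must be set before the engine is created.
    os.environ["VLLM_ATTENTION_BACKEND"] = backend


apply_attention_backend("FLASHINFER")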
@@ -553,6 +728,9 @@ class BenchmarkConfig:
     few_shot: bool
     num_iterations: int
     gpu_memory_utilization: float
+    attention_backend: t.Literal[
+        *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+    ]
     requires_safetensors: bool
     generative_type: GenerativeType | None
    download_only: bool
@@ -601,6 +779,9 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     requires_safetensors: bool
     download_only: bool
     gpu_memory_utilization: float
+    attention_backend: t.Literal[
+        *ATTENTION_BACKENDS  # pyrefly: ignore[invalid-literal]
+    ]
     generative_type: GenerativeType | None
     custom_datasets_file: Path
     force: bool