ScandEval 16.12.0__py3-none-any.whl → 16.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scandeval/async_utils.py +46 -0
- scandeval/benchmark_config_factory.py +26 -2
- scandeval/benchmark_modules/fresh.py +2 -1
- scandeval/benchmark_modules/hf.py +50 -12
- scandeval/benchmark_modules/litellm.py +25 -15
- scandeval/benchmark_modules/vllm.py +3 -3
- scandeval/benchmarker.py +15 -33
- scandeval/cli.py +2 -4
- scandeval/constants.py +5 -0
- scandeval/custom_dataset_configs.py +152 -0
- scandeval/data_loading.py +87 -31
- scandeval/data_models.py +396 -225
- scandeval/dataset_configs/__init__.py +51 -25
- scandeval/dataset_configs/albanian.py +1 -1
- scandeval/dataset_configs/belarusian.py +47 -0
- scandeval/dataset_configs/bulgarian.py +1 -1
- scandeval/dataset_configs/catalan.py +1 -1
- scandeval/dataset_configs/croatian.py +1 -1
- scandeval/dataset_configs/danish.py +3 -2
- scandeval/dataset_configs/dutch.py +7 -6
- scandeval/dataset_configs/english.py +4 -3
- scandeval/dataset_configs/estonian.py +8 -7
- scandeval/dataset_configs/faroese.py +1 -1
- scandeval/dataset_configs/finnish.py +5 -4
- scandeval/dataset_configs/french.py +6 -5
- scandeval/dataset_configs/german.py +4 -3
- scandeval/dataset_configs/greek.py +1 -1
- scandeval/dataset_configs/hungarian.py +1 -1
- scandeval/dataset_configs/icelandic.py +4 -3
- scandeval/dataset_configs/italian.py +4 -3
- scandeval/dataset_configs/latvian.py +2 -2
- scandeval/dataset_configs/lithuanian.py +1 -1
- scandeval/dataset_configs/norwegian.py +6 -5
- scandeval/dataset_configs/polish.py +4 -3
- scandeval/dataset_configs/portuguese.py +5 -4
- scandeval/dataset_configs/romanian.py +2 -2
- scandeval/dataset_configs/serbian.py +1 -1
- scandeval/dataset_configs/slovene.py +1 -1
- scandeval/dataset_configs/spanish.py +4 -3
- scandeval/dataset_configs/swedish.py +4 -3
- scandeval/dataset_configs/ukrainian.py +1 -1
- scandeval/generation_utils.py +6 -6
- scandeval/metrics/llm_as_a_judge.py +1 -1
- scandeval/metrics/pipeline.py +1 -1
- scandeval/model_cache.py +34 -4
- scandeval/prompt_templates/linguistic_acceptability.py +9 -0
- scandeval/prompt_templates/multiple_choice.py +9 -0
- scandeval/prompt_templates/named_entity_recognition.py +21 -0
- scandeval/prompt_templates/reading_comprehension.py +10 -0
- scandeval/prompt_templates/sentiment_classification.py +11 -0
- scandeval/string_utils.py +157 -0
- scandeval/task_group_utils/sequence_classification.py +2 -5
- scandeval/task_group_utils/token_classification.py +2 -4
- scandeval/utils.py +6 -323
- scandeval-16.13.0.dist-info/METADATA +334 -0
- scandeval-16.13.0.dist-info/RECORD +94 -0
- scandeval-16.12.0.dist-info/METADATA +0 -667
- scandeval-16.12.0.dist-info/RECORD +0 -90
- {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
- {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
- {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/data_models.py
CHANGED
@@ -1,7 +1,10 @@
 """Data models used in EuroEval."""
 
 import collections.abc as c
+import importlib.metadata
+import importlib.util
 import json
+import logging
 import re
 import typing as t
 from copy import deepcopy
@@ -12,7 +15,7 @@ import pydantic
 import torch
 from transformers.generation.configuration_utils import GenerationConfig
 
-from .constants import ATTENTION_BACKENDS
+from .constants import ATTENTION_BACKENDS, MAX_NUMBER_OF_LOGGING_LANGUAGES
 from .enums import Device, GenerativeType, ModelType, TaskGroup
 from .exceptions import InvalidBenchmark
 from .languages import (
@@ -24,14 +27,30 @@ from .languages import (
     PORTUGUESE,
     Language,
 )
+from .logging_utils import log_once
 from .metrics.base import Metric
 from .types import ScoreDict
-from .utils import get_package_version
 
 if t.TYPE_CHECKING:
     from .enums import InferenceBackend
 
 
+def get_package_version(package_name: str) -> str | None:
+    """Get the version of a package.
+
+    Args:
+        package_name:
+            The name of the package.
+
+    Returns:
+        The version of the package, or None if the package is not installed.
+    """
+    try:
+        return importlib.metadata.version(package_name)
+    except importlib.metadata.PackageNotFoundError:
+        return None
+
+
 @dataclass
 class PromptConfig:
     """Configuration for task-specific prompting across languages.
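
In 16.13.0 the `get_package_version` helper is defined directly in `data_models.py` rather than imported from `.utils`. A minimal usage sketch (the package names are illustrative):

    from scandeval.data_models import get_package_version

    print(get_package_version("torch"))        # e.g. "2.3.1" if torch is installed
    print(get_package_version("no-such-pkg"))  # None instead of an exception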
@@ -80,8 +99,9 @@ class Task:
         default_max_generated_tokens:
             The default maximum number of tokens to generate when benchmarking the task
             using few-shot evaluation.
-        default_labels:
-            The default labels for datasets using this task.
+        default_labels (optional):
+            The default labels for datasets using this task. Can be None if the labels
+            should be set manually in the dataset configs. Defaults to an empty tuple.
         requires_zero_shot (optional):
             Whether to only allow zero-shot evaluation for this task. If True, the
             task will not be evaluated using few-shot examples.
@@ -118,7 +138,7 @@ class Task:
     metrics: c.Sequence[Metric]
     default_num_few_shot_examples: int
     default_max_generated_tokens: int
-    default_labels: c.Sequence[str] | None
+    default_labels: c.Sequence[str] | None = tuple()
     requires_zero_shot: bool = False
     uses_structured_output: bool = False
     uses_logprobs: bool = False
@@ -144,133 +164,362 @@ class Task:
         return hash(self.name)
 
 
-@dataclass
 class DatasetConfig:
-    """Configuration for a dataset.
+    """Configuration for a dataset."""
+
+    def __init__(
+        self,
+        task: Task,
+        languages: c.Sequence[Language],
+        name: str | None = None,
+        pretty_name: str | None = None,
+        source: str | dict[str, str] | None = None,
+        prompt_prefix: str | None = None,
+        prompt_template: str | None = None,
+        instruction_prompt: str | None = None,
+        num_few_shot_examples: int | None = None,
+        max_generated_tokens: int | None = None,
+        labels: c.Sequence[str] | None = None,
+        prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None,
+        allowed_model_types: c.Sequence[ModelType] | None = None,
+        allowed_generative_types: c.Sequence[GenerativeType] | None = None,
+        allow_invalid_model_outputs: bool | None = None,
+        train_split: str | None = "train",
+        val_split: str | None = "val",
+        test_split: str = "test",
+        bootstrap_samples: bool = True,
+        unofficial: bool = False,
+        _prompt_prefix: str | None = None,
+        _prompt_template: str | None = None,
+        _instruction_prompt: str | None = None,
+        _num_few_shot_examples: int | None = None,
+        _max_generated_tokens: int | None = None,
+        _labels: c.Sequence[str] | None = None,
+        _prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None,
+        _allowed_model_types: c.Sequence[ModelType] | None = None,
+        _allowed_generative_types: c.Sequence[GenerativeType] | None = None,
+        _allow_invalid_model_outputs: bool | None = None,
+        _logging_string: str | None = None,
+    ) -> None:
+        """Initialise a DatasetConfig object.
 
-        [... 72 removed lines not rendered in the source diff view ...]
+        Args:
+            task:
+                The task of the dataset.
+            languages:
+                The ISO 639-1 language codes of the entries in the dataset.
+            name (optional):
+                The name of the dataset. Must be lower case with no spaces. Can be None
+                if and only if the dataset config resides directly in the Hugging Face
+                dataset repo. Defaults to None.
+            pretty_name (optional):
+                A longer prettier name for the dataset, which allows cases and spaces.
+                Used for logging. Can be None if and only if the dataset config resides
+                directly in the Hugging Face dataset repo. Defaults to None.
+            source (optional):
+                The source of the dataset, which can be a Hugging Face ID or a
+                dictionary with keys "train", "val" and "test" mapping to local CSV file
+                paths. Can be None if and only if the dataset config resides directly in
+                the Hugging Face dataset repo. Defaults to None.
+            prompt_prefix (optional):
+                The prefix to use in the few-shot prompt. Defaults to the template for
+                the task and language.
+            prompt_template (optional):
+                The template for the prompt to use when benchmarking the dataset using
+                few-shot evaluation. Defaults to the template for the task and language.
+            instruction_prompt (optional):
+                The prompt to use when benchmarking the dataset using instruction-based
+                evaluation. Defaults to the template for the task and language.
+            num_few_shot_examples (optional):
+                The number of examples to use when benchmarking the dataset using
+                few-shot evaluation. For a classification task, these will be drawn
+                evenly from each label. Defaults to the template for the task and
+                language.
+            max_generated_tokens (optional):
+                The maximum number of tokens to generate when benchmarking the dataset
+                using few-shot evaluation. Defaults to the template for the task and
+                language.
+            labels (optional):
+                The labels in the dataset. Defaults to the template for the task and
+                language.
+            prompt_label_mapping (optional):
+                A mapping from the labels to another phrase which is used as a
+                substitute for the label in few-shot evaluation. If "auto" then the
+                mapping will be set to a 1:1 mapping between the labels and themselves.
+                If None then the mapping will be set to the default mapping for the task
+                and language. Defaults to None.
+            allowed_model_types (optional):
+                A list of model types that are allowed to be evaluated on this dataset.
+                Defaults to the one for the task.
+            allowed_generative_types (optional):
+                A list of generative model types that are allowed to be evaluated on
+                this dataset. If None, all generative model types are allowed. Only
+                relevant if `allowed_model_types` includes generative models. Defaults
+                to the one for the task.
+            allow_invalid_model_outputs (optional):
+                Whether to allow invalid model outputs. This is only relevant for
+                generative models on classification tasks, where the model may generate
+                an output which is not one of the allowed labels. If True, the model
+                output will be mapped to the closest valid label. If False, the model
+                output will be considered incorrect and the evaluation will be aborted.
+                Defaults to the one for the task.
+            train_split (optional):
+                The name of the split to use as the training set. Can be None if there
+                is no training split in the dataset. Defaults to "train".
+            val_split (optional):
+                The name of the split to use as the validation set. Can be None if there
+                is no validation split in the dataset. Defaults to "val".
+            test_split (optional):
+                The name of the split to use as the test set. Defaults to "test".
+            bootstrap_samples (optional):
+                Whether to bootstrap the dataset samples. Defaults to True.
+            unofficial (optional):
+                Whether the dataset is unofficial. Defaults to False.
+            _prompt_prefix (optional):
+                This argument is deprecated. Please use `prompt_prefix` instead.
+            _prompt_template (optional):
+                This argument is deprecated. Please use `prompt_template` instead.
+            _instruction_prompt (optional):
+                This argument is deprecated. Please use `instruction_prompt` instead.
+            _num_few_shot_examples (optional):
+                This argument is deprecated. Please use `num_few_shot_examples` instead.
+            _max_generated_tokens (optional):
+                This argument is deprecated. Please use `max_generated_tokens` instead.
+            _labels (optional):
+                This argument is deprecated. Please use `labels` instead.
+            _prompt_label_mapping (optional):
+                This argument is deprecated. Please use `prompt_label_mapping` instead.
+            _allowed_model_types (optional):
+                This argument is deprecated. Please use `allowed_model_types` instead.
+            _allowed_generative_types (optional):
+                This argument is deprecated. Please use `allowed_generative_types`
+                instead.
+            _allow_invalid_model_outputs (optional):
+                This argument is deprecated. Please use `allow_invalid_model_outputs`
+                instead.
+            _logging_string (optional):
+                This argument is deprecated. Please use `logging_string` instead.
+        """
+        # Deprecation warnings
+        if _prompt_prefix is not None:
+            log_once(
+                "The `_prompt_prefix` argument is deprecated. Please use "
+                "`prompt_prefix` instead.",
+                level=logging.WARNING,
+            )
+            prompt_prefix = _prompt_prefix
+        if _prompt_template is not None:
+            log_once(
+                "The `_prompt_template` argument is deprecated. Please use "
+                "`prompt_template` instead.",
+                level=logging.WARNING,
+            )
+            prompt_template = _prompt_template
+        if _instruction_prompt is not None:
+            log_once(
+                "The `_instruction_prompt` argument is deprecated. Please use "
+                "`instruction_prompt` instead.",
+                level=logging.WARNING,
+            )
+            instruction_prompt = _instruction_prompt
+        if _num_few_shot_examples is not None:
+            log_once(
+                "The `_num_few_shot_examples` argument is deprecated. Please use "
+                "`num_few_shot_examples` instead.",
+                level=logging.WARNING,
+            )
+            num_few_shot_examples = _num_few_shot_examples
+        if _max_generated_tokens is not None:
+            log_once(
+                "The `_max_generated_tokens` argument is deprecated. Please use "
+                "`max_generated_tokens` instead.",
+                level=logging.WARNING,
+            )
+            max_generated_tokens = _max_generated_tokens
+        if _labels is not None:
+            log_once(
+                "The `_labels` argument is deprecated. Please use `labels` instead.",
+                level=logging.WARNING,
+            )
+            labels = _labels
+        if _prompt_label_mapping is not None:
+            log_once(
+                "The `_prompt_label_mapping` argument is deprecated. Please use "
+                "`prompt_label_mapping` instead.",
+                level=logging.WARNING,
+            )
+            prompt_label_mapping = _prompt_label_mapping
+        if _allowed_model_types is not None:
+            log_once(
+                "The `_allowed_model_types` argument is deprecated. Please use "
+                "`allowed_model_types` instead.",
+                level=logging.WARNING,
+            )
+            allowed_model_types = _allowed_model_types
+        if _allowed_generative_types is not None:
+            log_once(
+                "The `_allowed_generative_types` argument is deprecated. Please use "
+                "`allowed_generative_types` instead.",
+                level=logging.WARNING,
+            )
+            allowed_generative_types = _allowed_generative_types
+        if _allow_invalid_model_outputs is not None:
+            log_once(
+                "The `_allow_invalid_model_outputs` argument is deprecated. Please use "
+                "`allow_invalid_model_outputs` instead.",
+                level=logging.WARNING,
+            )
+            allow_invalid_model_outputs = _allow_invalid_model_outputs
+        if _logging_string is not None:
+            log_once(
+                "The `_logging_string` argument is deprecated and is not used anymore. "
+                "Using it will have no effect.",
+                level=logging.WARNING,
+            )
 
-        [... 19 removed lines not rendered in the source diff view ...]
+        self._name = name
+        self._pretty_name = pretty_name
+        self._source = source
+        self.task = task
+        self.languages = languages
+
+        template = self.task.template_dict.get(self.main_language)
+        self.prompt_prefix = (
+            prompt_prefix
+            if prompt_prefix is not None
+            else template.default_prompt_prefix
+            if template is not None
+            else ""
+        )
+        self.prompt_template = (
+            prompt_template
+            if prompt_template is not None
+            else template.default_prompt_template
+            if template is not None
+            else ""
+        )
+        self.instruction_prompt = (
+            instruction_prompt
+            if instruction_prompt is not None
+            else template.default_instruction_prompt
+            if template is not None
+            else ""
+        )
+        self.num_few_shot_examples = (
+            num_few_shot_examples
+            if num_few_shot_examples is not None
+            else self.task.default_num_few_shot_examples
+        )
+        self.max_generated_tokens = (
+            max_generated_tokens
+            if max_generated_tokens is not None
+            else self.task.default_max_generated_tokens
+        )
+        self.labels = (
+            labels if labels is not None else self.task.default_labels or list()
+        )
+        if prompt_label_mapping is None:
+            prompt_label_mapping = (
+                template.default_prompt_label_mapping
+                if template is not None
+                else dict()
+            )
+        self.prompt_label_mapping = (
+            {label: label for label in self.labels}
+            if prompt_label_mapping == "auto"
+            else prompt_label_mapping
+        )
+        self.allowed_model_types = (
+            allowed_model_types
+            if allowed_model_types is not None
+            else self.task.default_allowed_model_types
+        )
+        self.allowed_generative_types = (
+            allowed_generative_types
+            if allowed_generative_types is not None
+            else self.task.default_allowed_generative_types
+        )
+        self.allow_invalid_model_outputs = (
+            allow_invalid_model_outputs
+            if allow_invalid_model_outputs is not None
+            else self.task.default_allow_invalid_model_outputs
+        )
+        self.train_split = train_split
+        self.val_split = val_split
+        self.test_split = test_split
+        self.bootstrap_samples = bootstrap_samples
+        self.unofficial = unofficial
 
     @property
-    def …
-        """…
+    def name(self) -> str:
+        """The name of the dataset.
 
         Returns:
-            The …
+            The name of the dataset.
         """
-        [... 16 removed lines not rendered in the source diff view ...]
+        if self._name is None:
+            raise ValueError("The name of the dataset is not set!")
+        return self._name
+
+    @name.setter
+    def name(self, value: str) -> None:
+        """Set the name of the dataset.
+
+        Args:
+            value:
+                The new name of the dataset.
+        """
+        self._name = value
+
+    @property
+    def pretty_name(self) -> str:
+        """The pretty name of the dataset.
+
+        Returns:
+            The pretty name of the dataset.
+        """
+        if self._pretty_name is None:
+            raise ValueError("The pretty name of the dataset is not set!")
+        return self._pretty_name
+
+    @pretty_name.setter
+    def pretty_name(self, value: str) -> None:
+        """Set the pretty name of the dataset.
+
+        Args:
+            value:
+                The new pretty name of the dataset.
+        """
+        self._pretty_name = value
+
+    @property
+    def source(self) -> str | dict[str, str]:
+        """The source of the dataset.
+
+        Returns:
+            The source of the dataset.
+        """
+        if self._source is None:
+            raise ValueError("The source of the dataset is not set!")
+        return self._source
+
+    @source.setter
+    def source(self, value: str | dict[str, str]) -> None:
+        """Set the source of the dataset.
+
+        Args:
+            value:
+                The new source of the dataset.
+        """
+        self._source = value
 
     @property
     def logging_string(self) -> str:
-        """The string used to describe evaluation on the dataset in logging.
-        if self._logging_string is not None:
-            return self._logging_string
+        """The string used to describe evaluation on the dataset in logging.
 
+        Returns:
+            The logging string.
+        """
         truncated_str = (
             "truncated version of the "
             if isinstance(self.source, str) and self.source.endswith("-mini")
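
This hunk replaces the `@dataclass`-based `DatasetConfig` with an explicit `__init__` that accepts the public argument names directly, keeping the old underscore-prefixed names as deprecated aliases that warn via `log_once`. A hedged migration sketch (the task, language and label values are illustrative placeholders, not taken from this diff):

    from scandeval.data_models import DatasetConfig

    # Deprecated style: underscore arguments still work in 16.13.0 but log a warning.
    old_style = DatasetConfig(
        task=my_task,              # assumed: an existing Task instance
        languages=[my_language],   # assumed: an existing Language instance
        name="my-dataset",
        pretty_name="My Dataset",
        source="my-org/my-dataset",
        _labels=["positive", "negative"],
    )

    # Preferred style: same behaviour, no warning.
    new_style = DatasetConfig(
        task=my_task,
        languages=[my_language],
        name="my-dataset",
        pretty_name="My Dataset",
        source="my-org/my-dataset",
        labels=["positive", "negative"],
    )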
@@ -294,126 +543,48 @@ class DatasetConfig:
         if PORTUGUESE in self.languages and EUROPEAN_PORTUGUESE in self.languages:
             logging_languages.remove(EUROPEAN_PORTUGUESE)
 
-        if len(logging_languages) > …
+        if len(logging_languages) > MAX_NUMBER_OF_LOGGING_LANGUAGES:
+            languages_str = ""
+        elif len(logging_languages) > 1:
             languages_str = (
                 ", ".join([lang.name for lang in logging_languages[:-1]])
                 + f" and {logging_languages[-1].name}"
+                + " "
             )
         else:
-            languages_str = logging_languages[0].name
+            languages_str = logging_languages[0].name + " "
 
         task_str = self.task.name.replace("-", " ")
         dataset_name_str = (
             self.pretty_name or self.name.replace("-", " ").replace("_", " ").title()
         )
         return (
-            f"the {truncated_str}{languages_str}…
-        )
-
-    @property
-    def prompt_prefix(self) -> str:
-        """The prefix to use in the few-shot prompt."""
-        prompt_config = self.task.template_dict[self.main_language]
-        prompt_prefix = (
-            prompt_config.default_prompt_prefix
-            if self._prompt_prefix is None
-            else self._prompt_prefix
-        )
-        return prompt_prefix
-
-    @property
-    def prompt_template(self) -> str:
-        """The template used during few-shot evaluation."""
-        prompt_config = self.task.template_dict[self.main_language]
-        prompt_template = (
-            prompt_config.default_prompt_template
-            if self._prompt_template is None
-            else self._prompt_template
-        )
-        return prompt_template
-
-    @property
-    def instruction_prompt(self) -> str:
-        """The prompt to use when evaluating instruction-tuned models."""
-        prompt_config = self.task.template_dict[self.main_language]
-        instruction_prompt = (
-            prompt_config.default_instruction_prompt
-            if self._instruction_prompt is None
-            else self._instruction_prompt
-        )
-        return instruction_prompt
-
-    @property
-    def num_few_shot_examples(self) -> int:
-        """The number of few-shot examples to use."""
-        return (
-            self._num_few_shot_examples
-            if self._num_few_shot_examples is not None
-            else self.task.default_num_few_shot_examples
-        )
-
-    @property
-    def max_generated_tokens(self) -> int:
-        """The maximum number of tokens to generate when evaluating a model."""
-        return (
-            self._max_generated_tokens
-            if self._max_generated_tokens is not None
-            else self.task.default_max_generated_tokens
-        )
-
-    @property
-    def labels(self) -> c.Sequence[str]:
-        """The labels in the dataset."""
-        if self._labels is not None:
-            return self._labels
-        elif self.task.default_labels is not None:
-            return self.task.default_labels
-        else:
-            raise ValueError(
-                f"Labels must be specified for dataset {self.name!r} with the "
-                f"attribute `_labels`, as the task {self.task.name!r} does not have "
-                "default labels."
-            )
-
-    @property
-    def prompt_label_mapping(self) -> dict[str, str]:
-        """Mapping from English labels to localised labels."""
-        if self._prompt_label_mapping == "auto":
-            return {label: label for label in self.labels}
-        elif self._prompt_label_mapping is not None:
-            return self._prompt_label_mapping
-        prompt_config = self.task.template_dict[self.main_language]
-        if prompt_config.default_prompt_label_mapping == "auto":
-            return {label: label for label in self.labels}
-        else:
-            return prompt_config.default_prompt_label_mapping
-
-    @property
-    def allowed_model_types(self) -> c.Sequence[ModelType]:
-        """A list of model types that are allowed to be evaluated on this dataset."""
-        return (
-            self._allowed_model_types
-            if self._allowed_model_types is not None
-            else self.task.default_allowed_model_types
+            f"the {truncated_str}{languages_str}{task_str} dataset {dataset_name_str}"
         )
 
     @property
-    def …
-        """…
-        return (
-            self._allowed_generative_types
-            if self._allowed_generative_types is not None
-            else self.task.default_allowed_generative_types
-        )
+    def main_language(self) -> Language:
+        """Get the main language of the dataset.
 
-        [... 8 removed lines not rendered in the source diff view ...]
+        Returns:
+            The main language.
+        """
+        match len(self.languages):
+            case 0:
+                raise InvalidBenchmark(
+                    f"Dataset {self.name!r} must have at least one language."
+                )
+            case 1:
+                return self.languages[0]
+            case _:
+                if ENGLISH in self.languages:
+                    return ENGLISH
+                elif NORWEGIAN in self.languages:
+                    return NORWEGIAN
+                elif PORTUGUESE in self.languages:
+                    return PORTUGUESE
+                else:
+                    return self.languages[0]
 
     @property
     def id2label(self) -> "HashableDict":