ScandEval 16.12.0__py3-none-any.whl → 16.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scandeval/async_utils.py +46 -0
- scandeval/benchmark_config_factory.py +26 -2
- scandeval/benchmark_modules/fresh.py +2 -1
- scandeval/benchmark_modules/hf.py +50 -12
- scandeval/benchmark_modules/litellm.py +25 -15
- scandeval/benchmark_modules/vllm.py +3 -3
- scandeval/benchmarker.py +15 -33
- scandeval/cli.py +2 -4
- scandeval/constants.py +5 -0
- scandeval/custom_dataset_configs.py +152 -0
- scandeval/data_loading.py +87 -31
- scandeval/data_models.py +396 -225
- scandeval/dataset_configs/__init__.py +51 -25
- scandeval/dataset_configs/albanian.py +1 -1
- scandeval/dataset_configs/belarusian.py +47 -0
- scandeval/dataset_configs/bulgarian.py +1 -1
- scandeval/dataset_configs/catalan.py +1 -1
- scandeval/dataset_configs/croatian.py +1 -1
- scandeval/dataset_configs/danish.py +3 -2
- scandeval/dataset_configs/dutch.py +7 -6
- scandeval/dataset_configs/english.py +4 -3
- scandeval/dataset_configs/estonian.py +8 -7
- scandeval/dataset_configs/faroese.py +1 -1
- scandeval/dataset_configs/finnish.py +5 -4
- scandeval/dataset_configs/french.py +6 -5
- scandeval/dataset_configs/german.py +4 -3
- scandeval/dataset_configs/greek.py +1 -1
- scandeval/dataset_configs/hungarian.py +1 -1
- scandeval/dataset_configs/icelandic.py +4 -3
- scandeval/dataset_configs/italian.py +4 -3
- scandeval/dataset_configs/latvian.py +2 -2
- scandeval/dataset_configs/lithuanian.py +1 -1
- scandeval/dataset_configs/norwegian.py +6 -5
- scandeval/dataset_configs/polish.py +4 -3
- scandeval/dataset_configs/portuguese.py +5 -4
- scandeval/dataset_configs/romanian.py +2 -2
- scandeval/dataset_configs/serbian.py +1 -1
- scandeval/dataset_configs/slovene.py +1 -1
- scandeval/dataset_configs/spanish.py +4 -3
- scandeval/dataset_configs/swedish.py +4 -3
- scandeval/dataset_configs/ukrainian.py +1 -1
- scandeval/generation_utils.py +6 -6
- scandeval/metrics/llm_as_a_judge.py +1 -1
- scandeval/metrics/pipeline.py +1 -1
- scandeval/model_cache.py +34 -4
- scandeval/prompt_templates/linguistic_acceptability.py +9 -0
- scandeval/prompt_templates/multiple_choice.py +9 -0
- scandeval/prompt_templates/named_entity_recognition.py +21 -0
- scandeval/prompt_templates/reading_comprehension.py +10 -0
- scandeval/prompt_templates/sentiment_classification.py +11 -0
- scandeval/string_utils.py +157 -0
- scandeval/task_group_utils/sequence_classification.py +2 -5
- scandeval/task_group_utils/token_classification.py +2 -4
- scandeval/utils.py +6 -323
- scandeval-16.13.0.dist-info/METADATA +334 -0
- scandeval-16.13.0.dist-info/RECORD +94 -0
- scandeval-16.12.0.dist-info/METADATA +0 -667
- scandeval-16.12.0.dist-info/RECORD +0 -90
- {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
- {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
- {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/data_loading.py
CHANGED
@@ -9,14 +9,15 @@ import typing as t
 import requests
 from datasets import DatasetDict, load_dataset
 from datasets.exceptions import DatasetsError
-from huggingface_hub.errors import HfHubHTTPError
+from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError
 from numpy.random import Generator
 
 from .constants import SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
 from .logging_utils import log, no_terminal_output
+from .string_utils import unscramble
 from .tasks import EUROPEAN_VALUES
-from .utils import
+from .utils import get_hf_token
 
 if t.TYPE_CHECKING:
     from datasets import Dataset
@@ -47,15 +48,30 @@ def load_data(
             If the Hugging Face Hub is down.
     """
     dataset = load_raw_data(
-        dataset_config=dataset_config,
+        dataset_config=dataset_config,
+        cache_dir=benchmark_config.cache_dir,
+        api_key=benchmark_config.api_key,
     )
 
-    if
-
+    if (
+        not benchmark_config.evaluate_test_split
+        and dataset_config.val_split is not None
+    ):
+        dataset[dataset_config.test_split] = dataset[dataset_config.val_split]
+
+    splits = [
+        split
+        for split in [
+            dataset_config.train_split,
+            dataset_config.val_split,
+            dataset_config.test_split,
+        ]
+        if split is not None
+    ]
 
     # Remove empty examples from the datasets
     for text_feature in ["tokens", "text"]:
-        for split in
+        for split in splits:
             if text_feature in dataset[split].features:
                 dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)
 
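For context: the new `splits` comprehension above simply drops any split the config leaves unset, so the later loops only touch splits that actually exist. A minimal standalone sketch of that behaviour, with hypothetical split values standing in for the `DatasetConfig` attributes:

    # Hypothetical stand-ins for dataset_config.{train,val,test}_split
    train_split, val_split, test_split = "train", None, "test"

    # Unset (None) splits are dropped, mirroring the comprehension above
    splits = [
        split
        for split in [train_split, val_split, test_split]
        if split is not None
    ]
    assert splits == ["train", "test"]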
@@ -67,7 +83,7 @@ def load_data(
     # Bootstrap the splits, if applicable
     if dataset_config.bootstrap_samples:
         bootstrapped_splits: dict[str, c.Sequence["Dataset"]] = dict()
-        for split in
+        for split in splits:
             bootstrap_indices = rng.integers(
                 0,
                 len(dataset[split]),
@@ -81,7 +97,12 @@ def load_data(
             DatasetDict(  # type: ignore[no-matching-overload]
                 {
                     split: bootstrapped_splits[split][idx]
-                    for split in
+                    for split in [
+                        dataset_config.train_split,
+                        dataset_config.val_split,
+                        dataset_config.test_split,
+                    ]
+                    if split is not None
                 }
             )
             for idx in range(benchmark_config.num_iterations)
@@ -92,7 +113,9 @@ def load_data(
     return datasets
 
 
-def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
+def load_raw_data(
+    dataset_config: "DatasetConfig", cache_dir: str, api_key: str | None
+) -> "DatasetDict":
     """Load the raw dataset.
 
     Args:
@@ -100,6 +123,8 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
             The configuration for the dataset.
         cache_dir:
             The directory to cache the dataset.
+        api_key:
+            The API key to use as the Hugging Face token.
 
     Returns:
         The dataset.
@@ -125,16 +150,38 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
             FileNotFoundError,
             ConnectionError,
             DatasetsError,
+            RepositoryNotFoundError,
             requests.ConnectionError,
             requests.ReadTimeout,
-        )
-
-
-
-
-
-
-
+        ):
+            try:
+                with no_terminal_output():
+                    dataset = load_dataset(
+                        path=dataset_config.source.split("::")[0],
+                        name=(
+                            dataset_config.source.split("::")[1]
+                            if "::" in dataset_config.source
+                            else None
+                        ),
+                        cache_dir=cache_dir,
+                        token=get_hf_token(api_key=api_key),
+                    )
+                break
+            except (
+                FileNotFoundError,
+                ConnectionError,
+                DatasetsError,
+                RepositoryNotFoundError,
+                requests.ConnectionError,
+                requests.ReadTimeout,
+            ) as e:
+                log(
+                    f"Failed to load dataset {dataset_config.source!r}, due to "
+                    f"the following error: {e}. Retrying...",
+                    level=logging.DEBUG,
+                )
+                time.sleep(1)
+                continue
         except HfHubHTTPError:
             raise HuggingFaceHubDown()
     else:
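The `"::"` convention above lets a single source string carry both a Hub repo ID and an optional config name, and the new retry block reloads with a Hub token on recoverable errors. A minimal sketch of just the parsing step, using a hypothetical source string (a bare repo ID yields `name=None`):

    # Hypothetical "path::name" source string
    source = "owner/dataset::config-name"

    path = source.split("::")[0]
    name = source.split("::")[1] if "::" in source else None

    assert (path, name) == ("owner/dataset", "config-name")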
@@ -147,17 +194,22 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
     # Case where the dataset source is a dictionary with keys "train", "val" and "test",
     # with the values pointing to local CSV files
     else:
+        split_mapping = dict(
+            train=dataset_config.train_split,
+            val=dataset_config.val_split,
+            test=dataset_config.test_split,
+        )
         data_files = {
-
-            for
-            if
+            config_split: dataset_config.source[source_split]
+            for source_split, config_split in split_mapping.items()
+            if source_split in dataset_config.source and config_split is not None
         }
 
         # Get the file extension and ensure that all files have the same extension
         file_extensions = {
-
-            for
-            if
+            config_split: dataset_config.source[source_split].split(".")[-1]
+            for source_split, config_split in split_mapping.items()
+            if source_split in dataset_config.source and config_split is not None
        }
         if len(set(file_extensions.values())) != 1:
             raise InvalidBenchmark(
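The `split_mapping` added above ties the fixed "train"/"val"/"test" keys of a local source dictionary to the configured split names, keeping a file only when the source has it and the config defines the split. A standalone sketch with a hypothetical local source in place of `dataset_config.source`:

    # Hypothetical local source dict and configured split names
    source = {"train": "data/train.csv", "test": "data/test.csv"}
    train_split, val_split, test_split = "train", None, "test"

    split_mapping = dict(train=train_split, val=val_split, test=test_split)

    # Keep a file only if the source has it AND the config defines the split
    data_files = {
        config_split: source[source_split]
        for source_split, config_split in split_mapping.items()
        if source_split in source and config_split is not None
    }
    assert data_files == {"train": "data/train.csv", "test": "data/test.csv"}

    # All files must share one extension, mirroring the check in the hunk above
    file_extensions = {
        config_split: source[source_split].split(".")[-1]
        for source_split, config_split in split_mapping.items()
        if source_split in source and config_split is not None
    }
    assert len(set(file_extensions.values())) == 1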
@@ -182,11 +234,15 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
             path=file_extension, data_files=data_files, cache_dir=cache_dir
         )
 
-    assert isinstance(dataset, DatasetDict)
-
-
-
-
-
-
-
+    assert isinstance(dataset, DatasetDict)
+    return DatasetDict(  # pyrefly: ignore[no-matching-overload]
+        {
+            split: dataset[split]
+            for split in [
+                dataset_config.train_split,
+                dataset_config.val_split,
+                dataset_config.test_split,
+            ]
+            if split is not None
+        }
+    )
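The closing hunk applies the same None-filtering to the loaded `DatasetDict`, so unset splits never reach the caller. A runnable sketch using tiny in-memory datasets, with hypothetical split values in place of the `DatasetConfig` attributes:

    from datasets import Dataset, DatasetDict

    # Hypothetical stand-ins for the loaded dataset and configured splits
    dataset = DatasetDict(
        {
            "train": Dataset.from_dict({"text": ["a"]}),
            "test": Dataset.from_dict({"text": ["b"]}),
        }
    )
    train_split, val_split, test_split = "train", None, "test"

    # Keep only configured splits, mirroring the returned DatasetDict above
    filtered = DatasetDict(
        {
            split: dataset[split]
            for split in [train_split, val_split, test_split]
            if split is not None
        }
    )
    assert set(filtered.keys()) == {"train", "test"}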