ScandEval 16.11.0__py3-none-any.whl → 16.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scandeval/__init__.py +0 -9
- scandeval/async_utils.py +46 -0
- scandeval/benchmark_config_factory.py +31 -2
- scandeval/benchmark_modules/fresh.py +2 -1
- scandeval/benchmark_modules/hf.py +76 -23
- scandeval/benchmark_modules/litellm.py +33 -15
- scandeval/benchmark_modules/vllm.py +97 -44
- scandeval/benchmarker.py +29 -33
- scandeval/cli.py +11 -0
- scandeval/constants.py +36 -2
- scandeval/custom_dataset_configs.py +152 -0
- scandeval/data_loading.py +87 -31
- scandeval/data_models.py +405 -224
- scandeval/dataset_configs/__init__.py +51 -25
- scandeval/dataset_configs/albanian.py +1 -1
- scandeval/dataset_configs/belarusian.py +47 -0
- scandeval/dataset_configs/bulgarian.py +1 -1
- scandeval/dataset_configs/catalan.py +1 -1
- scandeval/dataset_configs/croatian.py +1 -1
- scandeval/dataset_configs/danish.py +3 -2
- scandeval/dataset_configs/dutch.py +16 -5
- scandeval/dataset_configs/english.py +4 -3
- scandeval/dataset_configs/estonian.py +8 -7
- scandeval/dataset_configs/faroese.py +1 -1
- scandeval/dataset_configs/finnish.py +5 -4
- scandeval/dataset_configs/french.py +6 -5
- scandeval/dataset_configs/german.py +4 -3
- scandeval/dataset_configs/greek.py +1 -1
- scandeval/dataset_configs/hungarian.py +1 -1
- scandeval/dataset_configs/icelandic.py +4 -3
- scandeval/dataset_configs/italian.py +4 -3
- scandeval/dataset_configs/latvian.py +2 -2
- scandeval/dataset_configs/lithuanian.py +1 -1
- scandeval/dataset_configs/norwegian.py +6 -5
- scandeval/dataset_configs/polish.py +4 -3
- scandeval/dataset_configs/portuguese.py +5 -4
- scandeval/dataset_configs/romanian.py +2 -2
- scandeval/dataset_configs/serbian.py +1 -1
- scandeval/dataset_configs/slovene.py +1 -1
- scandeval/dataset_configs/spanish.py +4 -3
- scandeval/dataset_configs/swedish.py +4 -3
- scandeval/dataset_configs/ukrainian.py +1 -1
- scandeval/generation_utils.py +6 -6
- scandeval/metrics/__init__.py +1 -0
- scandeval/metrics/bias.py +237 -0
- scandeval/metrics/huggingface.py +2 -1
- scandeval/metrics/llm_as_a_judge.py +1 -1
- scandeval/metrics/pipeline.py +1 -1
- scandeval/model_cache.py +34 -4
- scandeval/prompt_templates/linguistic_acceptability.py +9 -0
- scandeval/prompt_templates/multiple_choice.py +9 -0
- scandeval/prompt_templates/named_entity_recognition.py +21 -0
- scandeval/prompt_templates/reading_comprehension.py +10 -0
- scandeval/prompt_templates/sentiment_classification.py +11 -0
- scandeval/string_utils.py +157 -0
- scandeval/task_group_utils/sequence_classification.py +2 -5
- scandeval/task_group_utils/token_classification.py +2 -4
- scandeval/tasks.py +22 -0
- scandeval/tokenisation_utils.py +12 -1
- scandeval/utils.py +13 -383
- scandeval-16.13.0.dist-info/METADATA +334 -0
- scandeval-16.13.0.dist-info/RECORD +94 -0
- scandeval-16.11.0.dist-info/METADATA +0 -649
- scandeval-16.11.0.dist-info/RECORD +0 -89
- {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
- {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
- {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/custom_dataset_configs.py
ADDED
@@ -0,0 +1,152 @@
+"""Load custom dataset configs."""
+
+import importlib.util
+import logging
+from pathlib import Path
+from types import ModuleType
+
+from huggingface_hub import HfApi
+
+from .data_models import DatasetConfig
+from .logging_utils import log_once
+from .utils import get_hf_token
+
+
+def load_custom_datasets_module(custom_datasets_file: Path) -> ModuleType | None:
+    """Load the custom datasets module if it exists.
+
+    Args:
+        custom_datasets_file:
+            The path to the custom datasets module.
+
+    Raises:
+        RuntimeError:
+            If the custom datasets module cannot be loaded.
+    """
+    if custom_datasets_file.exists():
+        spec = importlib.util.spec_from_file_location(
+            name="custom_datasets_module", location=str(custom_datasets_file.resolve())
+        )
+        if spec is None:
+            log_once(
+                "Could not load the spec for the custom datasets file from "
+                f"{custom_datasets_file.resolve()}.",
+                level=logging.ERROR,
+            )
+            return None
+        module = importlib.util.module_from_spec(spec=spec)
+        if spec.loader is None:
+            log_once(
+                "Could not load the module for the custom datasets file from "
+                f"{custom_datasets_file.resolve()}.",
+                level=logging.ERROR,
+            )
+            return None
+        spec.loader.exec_module(module)
+        return module
+    return None
+
+
+def try_get_dataset_config_from_repo(
+    dataset_id: str, api_key: str | None, cache_dir: Path
+) -> DatasetConfig | None:
+    """Try to get a dataset config from a Hugging Face dataset repository.
+
+    Args:
+        dataset_id:
+            The ID of the dataset to get the config for.
+        api_key:
+            The Hugging Face API key to use to check if the repositories have custom
+            dataset configs.
+        cache_dir:
+            The directory to store the cache in.
+
+    Returns:
+        The dataset config if it exists, otherwise None.
+    """
+    # Check if the dataset ID is a Hugging Face dataset ID, abort if it isn't
+    token = get_hf_token(api_key=api_key)
+    hf_api = HfApi(token=token)
+    if not hf_api.repo_exists(repo_id=dataset_id, repo_type="dataset"):
+        return None
+
+    # Check if the repository has a euroeval_config.py file, abort if it doesn't
+    repo_files = hf_api.list_repo_files(
+        repo_id=dataset_id, repo_type="dataset", revision="main"
+    )
+    if "euroeval_config.py" not in repo_files:
+        log_once(
+            f"Dataset {dataset_id} does not have a euroeval_config.py file, so we "
+            "cannot load it. Skipping.",
+            level=logging.WARNING,
+        )
+        return None
+
+    # Fetch the euroeval_config.py file, abort if loading failed
+    external_config_path = cache_dir / "external_dataset_configs" / dataset_id
+    external_config_path.mkdir(parents=True, exist_ok=True)
+    hf_api.hf_hub_download(
+        repo_id=dataset_id,
+        repo_type="dataset",
+        filename="euroeval_config.py",
+        local_dir=external_config_path,
+        local_dir_use_symlinks=False,
+    )
+    module = load_custom_datasets_module(
+        custom_datasets_file=external_config_path / "euroeval_config.py"
+    )
+    if module is None:
+        return None
+
+    # Check that there is exactly one dataset config, abort if there isn't
+    repo_dataset_configs = [
+        cfg for cfg in vars(module).values() if isinstance(cfg, DatasetConfig)
+    ]
+    if not repo_dataset_configs:
+        return None  # Already warned the user in this case, so we just skip
+    elif len(repo_dataset_configs) > 1:
+        log_once(
+            f"Dataset {dataset_id} has multiple dataset configurations. Please ensure "
+            "that only a single DatasetConfig is defined in the `euroeval_config.py` "
+            "file.",
+            level=logging.WARNING,
+        )
+        return None
+
+    # Get the dataset split names
+    splits = [
+        split["name"]
+        for split in hf_api.dataset_info(repo_id=dataset_id).card_data.dataset_info[
+            "splits"
+        ]
+    ]
+    train_split_candidates = sorted(
+        [split for split in splits if "train" in split.lower()], key=len
+    )
+    val_split_candidates = sorted(
+        [split for split in splits if "val" in split.lower()], key=len
+    )
+    test_split_candidates = sorted(
+        [split for split in splits if "test" in split.lower()], key=len
+    )
+    train_split = train_split_candidates[0] if train_split_candidates else None
+    val_split = val_split_candidates[0] if val_split_candidates else None
+    test_split = test_split_candidates[0] if test_split_candidates else None
+    if test_split is None:
+        log_once(
+            f"Dataset {dataset_id} does not have a test split, so we cannot load it. "
+            "Please ensure that the dataset has a test split.",
+            level=logging.ERROR,
+        )
+        return None
+
+    # Set up the config with the repo information
+    repo_dataset_config = repo_dataset_configs[0]
+    repo_dataset_config.name = dataset_id
+    repo_dataset_config.pretty_name = dataset_id
+    repo_dataset_config.source = dataset_id
+    repo_dataset_config.train_split = train_split
+    repo_dataset_config.val_split = val_split
+    repo_dataset_config.test_split = test_split
+
+    return repo_dataset_config
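A note on the new module above: try_get_dataset_config_from_repo requires the repository's euroeval_config.py to define exactly one DatasetConfig at module level, and it then picks split names purely lexically, taking the shortest split name containing "train", "val" or "test" and rejecting the config when no test split is found. The following standalone sketch, using made-up split names, reproduces that selection logic:

# Standalone sketch of the split-name selection in try_get_dataset_config_from_repo.
# The split names below are made up for illustration.
splits = ["train", "train_extra", "validation", "test"]

train_candidates = sorted([s for s in splits if "train" in s.lower()], key=len)
val_candidates = sorted([s for s in splits if "val" in s.lower()], key=len)
test_candidates = sorted([s for s in splits if "test" in s.lower()], key=len)

print(train_candidates[0] if train_candidates else None)  # train
print(val_candidates[0] if val_candidates else None)      # validation
print(test_candidates[0] if test_candidates else None)    # test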
scandeval/data_loading.py
CHANGED
@@ -9,14 +9,15 @@ import typing as t
 import requests
 from datasets import DatasetDict, load_dataset
 from datasets.exceptions import DatasetsError
-from huggingface_hub.errors import HfHubHTTPError
+from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError
 from numpy.random import Generator
 
 from .constants import SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
 from .logging_utils import log, no_terminal_output
+from .string_utils import unscramble
 from .tasks import EUROPEAN_VALUES
-from .utils import
+from .utils import get_hf_token
 
 if t.TYPE_CHECKING:
     from datasets import Dataset
@@ -47,15 +48,30 @@ def load_data(
             If the Hugging Face Hub is down.
     """
     dataset = load_raw_data(
-        dataset_config=dataset_config,
+        dataset_config=dataset_config,
+        cache_dir=benchmark_config.cache_dir,
+        api_key=benchmark_config.api_key,
     )
 
-    if
-
+    if (
+        not benchmark_config.evaluate_test_split
+        and dataset_config.val_split is not None
+    ):
+        dataset[dataset_config.test_split] = dataset[dataset_config.val_split]
+
+    splits = [
+        split
+        for split in [
+            dataset_config.train_split,
+            dataset_config.val_split,
+            dataset_config.test_split,
+        ]
+        if split is not None
+    ]
 
     # Remove empty examples from the datasets
     for text_feature in ["tokens", "text"]:
-        for split in
+        for split in splits:
             if text_feature in dataset[split].features:
                 dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)
 
@@ -67,7 +83,7 @@ def load_data(
     # Bootstrap the splits, if applicable
     if dataset_config.bootstrap_samples:
         bootstrapped_splits: dict[str, c.Sequence["Dataset"]] = dict()
-        for split in
+        for split in splits:
             bootstrap_indices = rng.integers(
                 0,
                 len(dataset[split]),
@@ -81,7 +97,12 @@ def load_data(
             DatasetDict(  # type: ignore[no-matching-overload]
                 {
                     split: bootstrapped_splits[split][idx]
-                    for split in
+                    for split in [
+                        dataset_config.train_split,
+                        dataset_config.val_split,
+                        dataset_config.test_split,
+                    ]
+                    if split is not None
                 }
             )
             for idx in range(benchmark_config.num_iterations)
@@ -92,7 +113,9 @@ def load_data(
     return datasets
 
 
-def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
+def load_raw_data(
+    dataset_config: "DatasetConfig", cache_dir: str, api_key: str | None
+) -> "DatasetDict":
     """Load the raw dataset.
 
     Args:
@@ -100,6 +123,8 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
            The configuration for the dataset.
        cache_dir:
            The directory to cache the dataset.
+       api_key:
+           The API key to use as the Hugging Face token.
 
     Returns:
         The dataset.
@@ -125,16 +150,38 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
            FileNotFoundError,
            ConnectionError,
            DatasetsError,
+           RepositoryNotFoundError,
            requests.ConnectionError,
            requests.ReadTimeout,
-        )
-
-
-
-
-
-
-
+        ):
+            try:
+                with no_terminal_output():
+                    dataset = load_dataset(
+                        path=dataset_config.source.split("::")[0],
+                        name=(
+                            dataset_config.source.split("::")[1]
+                            if "::" in dataset_config.source
+                            else None
+                        ),
+                        cache_dir=cache_dir,
+                        token=get_hf_token(api_key=api_key),
+                    )
+                break
+            except (
+                FileNotFoundError,
+                ConnectionError,
+                DatasetsError,
+                RepositoryNotFoundError,
+                requests.ConnectionError,
+                requests.ReadTimeout,
+            ) as e:
+                log(
+                    f"Failed to load dataset {dataset_config.source!r}, due to "
+                    f"the following error: {e}. Retrying...",
+                    level=logging.DEBUG,
+                )
+                time.sleep(1)
+                continue
         except HfHubHTTPError:
             raise HuggingFaceHubDown()
         else:
@@ -147,17 +194,22 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
     # Case where the dataset source is a dictionary with keys "train", "val" and "test",
     # with the values pointing to local CSV files
     else:
+        split_mapping = dict(
+            train=dataset_config.train_split,
+            val=dataset_config.val_split,
+            test=dataset_config.test_split,
+        )
         data_files = {
-
-            for
-            if
+            config_split: dataset_config.source[source_split]
+            for source_split, config_split in split_mapping.items()
+            if source_split in dataset_config.source and config_split is not None
         }
 
         # Get the file extension and ensure that all files have the same extension
        file_extensions = {
-
-            for
-            if
+            config_split: dataset_config.source[source_split].split(".")[-1]
+            for source_split, config_split in split_mapping.items()
+            if source_split in dataset_config.source and config_split is not None
         }
         if len(set(file_extensions.values())) != 1:
             raise InvalidBenchmark(
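For context on the local-files branch above: split_mapping maps the fixed source keys "train", "val" and "test" to the config's own split names, and the comprehension drops any split that is missing from the source or set to None in the config. A standalone sketch with made-up paths and split names:

# Standalone sketch of the data_files construction above; paths and split names are illustrative.
source = {"train": "data/train.csv", "test": "data/test.csv"}
split_mapping = dict(train="train", val="val", test="test")

data_files = {
    config_split: source[source_split]
    for source_split, config_split in split_mapping.items()
    if source_split in source and config_split is not None
}
print(data_files)  # {'train': 'data/train.csv', 'test': 'data/test.csv'}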
@@ -182,11 +234,15 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
             path=file_extension, data_files=data_files, cache_dir=cache_dir
         )
 
-    assert isinstance(dataset, DatasetDict)
-
-
-
-
-
-
-
+    assert isinstance(dataset, DatasetDict)
+    return DatasetDict(  # pyrefly: ignore[no-matching-overload]
+        {
+            split: dataset[split]
+            for split in [
+                dataset_config.train_split,
+                dataset_config.val_split,
+                dataset_config.test_split,
+            ]
+            if split is not None
+        }
+    )