ScandEval 16.11.0__py3-none-any.whl → 16.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. scandeval/__init__.py +0 -9
  2. scandeval/async_utils.py +46 -0
  3. scandeval/benchmark_config_factory.py +31 -2
  4. scandeval/benchmark_modules/fresh.py +2 -1
  5. scandeval/benchmark_modules/hf.py +76 -23
  6. scandeval/benchmark_modules/litellm.py +33 -15
  7. scandeval/benchmark_modules/vllm.py +97 -44
  8. scandeval/benchmarker.py +29 -33
  9. scandeval/cli.py +11 -0
  10. scandeval/constants.py +36 -2
  11. scandeval/custom_dataset_configs.py +152 -0
  12. scandeval/data_loading.py +87 -31
  13. scandeval/data_models.py +405 -224
  14. scandeval/dataset_configs/__init__.py +51 -25
  15. scandeval/dataset_configs/albanian.py +1 -1
  16. scandeval/dataset_configs/belarusian.py +47 -0
  17. scandeval/dataset_configs/bulgarian.py +1 -1
  18. scandeval/dataset_configs/catalan.py +1 -1
  19. scandeval/dataset_configs/croatian.py +1 -1
  20. scandeval/dataset_configs/danish.py +3 -2
  21. scandeval/dataset_configs/dutch.py +16 -5
  22. scandeval/dataset_configs/english.py +4 -3
  23. scandeval/dataset_configs/estonian.py +8 -7
  24. scandeval/dataset_configs/faroese.py +1 -1
  25. scandeval/dataset_configs/finnish.py +5 -4
  26. scandeval/dataset_configs/french.py +6 -5
  27. scandeval/dataset_configs/german.py +4 -3
  28. scandeval/dataset_configs/greek.py +1 -1
  29. scandeval/dataset_configs/hungarian.py +1 -1
  30. scandeval/dataset_configs/icelandic.py +4 -3
  31. scandeval/dataset_configs/italian.py +4 -3
  32. scandeval/dataset_configs/latvian.py +2 -2
  33. scandeval/dataset_configs/lithuanian.py +1 -1
  34. scandeval/dataset_configs/norwegian.py +6 -5
  35. scandeval/dataset_configs/polish.py +4 -3
  36. scandeval/dataset_configs/portuguese.py +5 -4
  37. scandeval/dataset_configs/romanian.py +2 -2
  38. scandeval/dataset_configs/serbian.py +1 -1
  39. scandeval/dataset_configs/slovene.py +1 -1
  40. scandeval/dataset_configs/spanish.py +4 -3
  41. scandeval/dataset_configs/swedish.py +4 -3
  42. scandeval/dataset_configs/ukrainian.py +1 -1
  43. scandeval/generation_utils.py +6 -6
  44. scandeval/metrics/__init__.py +1 -0
  45. scandeval/metrics/bias.py +237 -0
  46. scandeval/metrics/huggingface.py +2 -1
  47. scandeval/metrics/llm_as_a_judge.py +1 -1
  48. scandeval/metrics/pipeline.py +1 -1
  49. scandeval/model_cache.py +34 -4
  50. scandeval/prompt_templates/linguistic_acceptability.py +9 -0
  51. scandeval/prompt_templates/multiple_choice.py +9 -0
  52. scandeval/prompt_templates/named_entity_recognition.py +21 -0
  53. scandeval/prompt_templates/reading_comprehension.py +10 -0
  54. scandeval/prompt_templates/sentiment_classification.py +11 -0
  55. scandeval/string_utils.py +157 -0
  56. scandeval/task_group_utils/sequence_classification.py +2 -5
  57. scandeval/task_group_utils/token_classification.py +2 -4
  58. scandeval/tasks.py +22 -0
  59. scandeval/tokenisation_utils.py +12 -1
  60. scandeval/utils.py +13 -383
  61. scandeval-16.13.0.dist-info/METADATA +334 -0
  62. scandeval-16.13.0.dist-info/RECORD +94 -0
  63. scandeval-16.11.0.dist-info/METADATA +0 -649
  64. scandeval-16.11.0.dist-info/RECORD +0 -89
  65. {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
  66. {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
  67. {scandeval-16.11.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/custom_dataset_configs.py ADDED
@@ -0,0 +1,152 @@
+ """Load custom dataset configs."""
+
+ import importlib.util
+ import logging
+ from pathlib import Path
+ from types import ModuleType
+
+ from huggingface_hub import HfApi
+
+ from .data_models import DatasetConfig
+ from .logging_utils import log_once
+ from .utils import get_hf_token
+
+
+ def load_custom_datasets_module(custom_datasets_file: Path) -> ModuleType | None:
+     """Load the custom datasets module if it exists.
+
+     Args:
+         custom_datasets_file:
+             The path to the custom datasets module.
+
+     Raises:
+         RuntimeError:
+             If the custom datasets module cannot be loaded.
+     """
+     if custom_datasets_file.exists():
+         spec = importlib.util.spec_from_file_location(
+             name="custom_datasets_module", location=str(custom_datasets_file.resolve())
+         )
+         if spec is None:
+             log_once(
+                 "Could not load the spec for the custom datasets file from "
+                 f"{custom_datasets_file.resolve()}.",
+                 level=logging.ERROR,
+             )
+             return None
+         module = importlib.util.module_from_spec(spec=spec)
+         if spec.loader is None:
+             log_once(
+                 "Could not load the module for the custom datasets file from "
+                 f"{custom_datasets_file.resolve()}.",
+                 level=logging.ERROR,
+             )
+             return None
+         spec.loader.exec_module(module)
+         return module
+     return None
+
+
+ def try_get_dataset_config_from_repo(
+     dataset_id: str, api_key: str | None, cache_dir: Path
+ ) -> DatasetConfig | None:
+     """Try to get a dataset config from a Hugging Face dataset repository.
+
+     Args:
+         dataset_id:
+             The ID of the dataset to get the config for.
+         api_key:
+             The Hugging Face API key to use to check if the repositories have custom
+             dataset configs.
+         cache_dir:
+             The directory to store the cache in.
+
+     Returns:
+         The dataset config if it exists, otherwise None.
+     """
+     # Check if the dataset ID is a Hugging Face dataset ID, abort if it isn't
+     token = get_hf_token(api_key=api_key)
+     hf_api = HfApi(token=token)
+     if not hf_api.repo_exists(repo_id=dataset_id, repo_type="dataset"):
+         return None
+
+     # Check if the repository has a euroeval_config.py file, abort if it doesn't
+     repo_files = hf_api.list_repo_files(
+         repo_id=dataset_id, repo_type="dataset", revision="main"
+     )
+     if "euroeval_config.py" not in repo_files:
+         log_once(
+             f"Dataset {dataset_id} does not have a euroeval_config.py file, so we "
+             "cannot load it. Skipping.",
+             level=logging.WARNING,
+         )
+         return None
+
+     # Fetch the euroeval_config.py file, abort if loading failed
+     external_config_path = cache_dir / "external_dataset_configs" / dataset_id
+     external_config_path.mkdir(parents=True, exist_ok=True)
+     hf_api.hf_hub_download(
+         repo_id=dataset_id,
+         repo_type="dataset",
+         filename="euroeval_config.py",
+         local_dir=external_config_path,
+         local_dir_use_symlinks=False,
+     )
+     module = load_custom_datasets_module(
+         custom_datasets_file=external_config_path / "euroeval_config.py"
+     )
+     if module is None:
+         return None
+
+     # Check that there is exactly one dataset config, abort if there isn't
+     repo_dataset_configs = [
+         cfg for cfg in vars(module).values() if isinstance(cfg, DatasetConfig)
+     ]
+     if not repo_dataset_configs:
+         return None  # Already warned the user in this case, so we just skip
+     elif len(repo_dataset_configs) > 1:
+         log_once(
+             f"Dataset {dataset_id} has multiple dataset configurations. Please ensure "
+             "that only a single DatasetConfig is defined in the `euroeval_config.py` "
+             "file.",
+             level=logging.WARNING,
+         )
+         return None
+
+     # Get the dataset split names
+     splits = [
+         split["name"]
+         for split in hf_api.dataset_info(repo_id=dataset_id).card_data.dataset_info[
+             "splits"
+         ]
+     ]
+     train_split_candidates = sorted(
+         [split for split in splits if "train" in split.lower()], key=len
+     )
+     val_split_candidates = sorted(
+         [split for split in splits if "val" in split.lower()], key=len
+     )
+     test_split_candidates = sorted(
+         [split for split in splits if "test" in split.lower()], key=len
+     )
+     train_split = train_split_candidates[0] if train_split_candidates else None
+     val_split = val_split_candidates[0] if val_split_candidates else None
+     test_split = test_split_candidates[0] if test_split_candidates else None
+     if test_split is None:
+         log_once(
+             f"Dataset {dataset_id} does not have a test split, so we cannot load it. "
+             "Please ensure that the dataset has a test split.",
+             level=logging.ERROR,
+         )
+         return None
+
+     # Set up the config with the repo information
+     repo_dataset_config = repo_dataset_configs[0]
+     repo_dataset_config.name = dataset_id
+     repo_dataset_config.pretty_name = dataset_id
+     repo_dataset_config.source = dataset_id
+     repo_dataset_config.train_split = train_split
+     repo_dataset_config.val_split = val_split
+     repo_dataset_config.test_split = test_split
+
+     return repo_dataset_config
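Note on the split-detection heuristic in the new module above: once a repository-provided DatasetConfig is found, the loader inspects the dataset's split names and, for each of "train", "val" and "test", picks the shortest split name containing that keyword, skipping the dataset entirely if no test split is found. A minimal standalone sketch of that selection rule (the split names below are made up for illustration):

# Illustration of the split-selection rule used in try_get_dataset_config_from_repo:
# among the available split names, the shortest one containing the keyword wins.
def pick_split(keyword: str, names: list[str]) -> str | None:
    candidates = sorted([name for name in names if keyword in name.lower()], key=len)
    return candidates[0] if candidates else None

splits = ["train", "train_extra", "validation", "test"]  # hypothetical split names
print(pick_split("train", splits))  # -> "train" (shortest match)
print(pick_split("val", splits))    # -> "validation"
print(pick_split("test", splits))   # -> "test"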
scandeval/data_loading.py CHANGED
@@ -9,14 +9,15 @@ import typing as t
  import requests
  from datasets import DatasetDict, load_dataset
  from datasets.exceptions import DatasetsError
- from huggingface_hub.errors import HfHubHTTPError
+ from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError
  from numpy.random import Generator
  
  from .constants import SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS
  from .exceptions import HuggingFaceHubDown, InvalidBenchmark
  from .logging_utils import log, no_terminal_output
+ from .string_utils import unscramble
  from .tasks import EUROPEAN_VALUES
- from .utils import unscramble
+ from .utils import get_hf_token
  
  if t.TYPE_CHECKING:
      from datasets import Dataset
@@ -47,15 +48,30 @@ def load_data(
              If the Hugging Face Hub is down.
      """
      dataset = load_raw_data(
-         dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
+         dataset_config=dataset_config,
+         cache_dir=benchmark_config.cache_dir,
+         api_key=benchmark_config.api_key,
      )
  
-     if not benchmark_config.evaluate_test_split and "val" in dataset:
-         dataset["test"] = dataset["val"]
+     if (
+         not benchmark_config.evaluate_test_split
+         and dataset_config.val_split is not None
+     ):
+         dataset[dataset_config.test_split] = dataset[dataset_config.val_split]
+
+     splits = [
+         split
+         for split in [
+             dataset_config.train_split,
+             dataset_config.val_split,
+             dataset_config.test_split,
+         ]
+         if split is not None
+     ]
  
      # Remove empty examples from the datasets
      for text_feature in ["tokens", "text"]:
-         for split in dataset_config.splits:
+         for split in splits:
              if text_feature in dataset[split].features:
                  dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)
  
@@ -67,7 +83,7 @@ def load_data(
      # Bootstrap the splits, if applicable
      if dataset_config.bootstrap_samples:
          bootstrapped_splits: dict[str, c.Sequence["Dataset"]] = dict()
-         for split in dataset_config.splits:
+         for split in splits:
              bootstrap_indices = rng.integers(
                  0,
                  len(dataset[split]),
@@ -81,7 +97,12 @@ def load_data(
              DatasetDict(  # type: ignore[no-matching-overload]
                  {
                      split: bootstrapped_splits[split][idx]
-                     for split in dataset_config.splits
+                     for split in [
+                         dataset_config.train_split,
+                         dataset_config.val_split,
+                         dataset_config.test_split,
+                     ]
+                     if split is not None
                  }
              )
              for idx in range(benchmark_config.num_iterations)
@@ -92,7 +113,9 @@ def load_data(
      return datasets
  
  
- def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
+ def load_raw_data(
+     dataset_config: "DatasetConfig", cache_dir: str, api_key: str | None
+ ) -> "DatasetDict":
      """Load the raw dataset.
  
      Args:
@@ -100,6 +123,8 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
              The configuration for the dataset.
          cache_dir:
              The directory to cache the dataset.
+         api_key:
+             The API key to use as the Hugging Face token.
  
      Returns:
          The dataset.
@@ -125,16 +150,38 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
                  FileNotFoundError,
                  ConnectionError,
                  DatasetsError,
+                 RepositoryNotFoundError,
                  requests.ConnectionError,
                  requests.ReadTimeout,
-             ) as e:
-                 log(
-                     f"Failed to load dataset {dataset_config.source!r}, due to "
-                     f"the following error: {e}. Retrying...",
-                     level=logging.DEBUG,
-                 )
-                 time.sleep(1)
-                 continue
+             ):
+                 try:
+                     with no_terminal_output():
+                         dataset = load_dataset(
+                             path=dataset_config.source.split("::")[0],
+                             name=(
+                                 dataset_config.source.split("::")[1]
+                                 if "::" in dataset_config.source
+                                 else None
+                             ),
+                             cache_dir=cache_dir,
+                             token=get_hf_token(api_key=api_key),
+                         )
+                     break
+                 except (
+                     FileNotFoundError,
+                     ConnectionError,
+                     DatasetsError,
+                     RepositoryNotFoundError,
+                     requests.ConnectionError,
+                     requests.ReadTimeout,
+                 ) as e:
+                     log(
+                         f"Failed to load dataset {dataset_config.source!r}, due to "
+                         f"the following error: {e}. Retrying...",
+                         level=logging.DEBUG,
+                     )
+                     time.sleep(1)
+                     continue
  
              except HfHubHTTPError:
                  raise HuggingFaceHubDown()
@@ -147,17 +194,22 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
      # Case where the dataset source is a dictionary with keys "train", "val" and "test",
      # with the values pointing to local CSV files
      else:
+         split_mapping = dict(
+             train=dataset_config.train_split,
+             val=dataset_config.val_split,
+             test=dataset_config.test_split,
+         )
          data_files = {
-             split: dataset_config.source[split]
-             for split in dataset_config.splits
-             if split in dataset_config.source
+             config_split: dataset_config.source[source_split]
+             for source_split, config_split in split_mapping.items()
+             if source_split in dataset_config.source and config_split is not None
          }
  
          # Get the file extension and ensure that all files have the same extension
          file_extensions = {
-             split: dataset_config.source[split].split(".")[-1]
-             for split in dataset_config.splits
-             if split in dataset_config.source
+             config_split: dataset_config.source[source_split].split(".")[-1]
+             for source_split, config_split in split_mapping.items()
+             if source_split in dataset_config.source and config_split is not None
          }
          if len(set(file_extensions.values())) != 1:
              raise InvalidBenchmark(
@@ -182,11 +234,15 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDi
              path=file_extension, data_files=data_files, cache_dir=cache_dir
          )
  
-     assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
-     missing_keys = [key for key in dataset_config.splits if key not in dataset]
-     if missing_keys:
-         raise InvalidBenchmark(
-             "The dataset is missing the following required splits: "
-             f"{', '.join(missing_keys)}"
-         )
-     return DatasetDict({key: dataset[key] for key in dataset_config.splits})  # type: ignore[no-matching-overload]
+     assert isinstance(dataset, DatasetDict)
+     return DatasetDict(  # pyrefly: ignore[no-matching-overload]
+         {
+             split: dataset[split]
+             for split in [
+                 dataset_config.train_split,
+                 dataset_config.val_split,
+                 dataset_config.test_split,
+             ]
+             if split is not None
+         }
+     )
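One behavioural detail of the reworked load_raw_data worth spelling out: a Hugging Face source string may carry an optional configuration name after a "::" separator, which is split off before the call to load_dataset (as seen in the retry path added above). A minimal sketch of that parsing, assuming the semantics implied by the diff (the part before "::" is the repo ID, the part after is the config name; the dataset ID below is made up):

# Sketch of the "<repo_id>::<config_name>" convention used in load_raw_data.
def parse_source(source: str) -> tuple[str, str | None]:
    path = source.split("::")[0]
    name = source.split("::")[1] if "::" in source else None
    return path, name

print(parse_source("my-org/my-dataset"))          # ('my-org/my-dataset', None)
print(parse_source("my-org/my-dataset::subset"))  # ('my-org/my-dataset', 'subset')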