ScandEval 16.12.0__py3-none-any.whl → 16.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. scandeval/async_utils.py +46 -0
  2. scandeval/benchmark_config_factory.py +26 -2
  3. scandeval/benchmark_modules/fresh.py +2 -1
  4. scandeval/benchmark_modules/hf.py +50 -12
  5. scandeval/benchmark_modules/litellm.py +25 -15
  6. scandeval/benchmark_modules/vllm.py +3 -3
  7. scandeval/benchmarker.py +15 -33
  8. scandeval/cli.py +2 -4
  9. scandeval/constants.py +5 -0
  10. scandeval/custom_dataset_configs.py +152 -0
  11. scandeval/data_loading.py +87 -31
  12. scandeval/data_models.py +396 -225
  13. scandeval/dataset_configs/__init__.py +51 -25
  14. scandeval/dataset_configs/albanian.py +1 -1
  15. scandeval/dataset_configs/belarusian.py +47 -0
  16. scandeval/dataset_configs/bulgarian.py +1 -1
  17. scandeval/dataset_configs/catalan.py +1 -1
  18. scandeval/dataset_configs/croatian.py +1 -1
  19. scandeval/dataset_configs/danish.py +3 -2
  20. scandeval/dataset_configs/dutch.py +7 -6
  21. scandeval/dataset_configs/english.py +4 -3
  22. scandeval/dataset_configs/estonian.py +8 -7
  23. scandeval/dataset_configs/faroese.py +1 -1
  24. scandeval/dataset_configs/finnish.py +5 -4
  25. scandeval/dataset_configs/french.py +6 -5
  26. scandeval/dataset_configs/german.py +4 -3
  27. scandeval/dataset_configs/greek.py +1 -1
  28. scandeval/dataset_configs/hungarian.py +1 -1
  29. scandeval/dataset_configs/icelandic.py +4 -3
  30. scandeval/dataset_configs/italian.py +4 -3
  31. scandeval/dataset_configs/latvian.py +2 -2
  32. scandeval/dataset_configs/lithuanian.py +1 -1
  33. scandeval/dataset_configs/norwegian.py +6 -5
  34. scandeval/dataset_configs/polish.py +4 -3
  35. scandeval/dataset_configs/portuguese.py +5 -4
  36. scandeval/dataset_configs/romanian.py +2 -2
  37. scandeval/dataset_configs/serbian.py +1 -1
  38. scandeval/dataset_configs/slovene.py +1 -1
  39. scandeval/dataset_configs/spanish.py +4 -3
  40. scandeval/dataset_configs/swedish.py +4 -3
  41. scandeval/dataset_configs/ukrainian.py +1 -1
  42. scandeval/generation_utils.py +6 -6
  43. scandeval/metrics/llm_as_a_judge.py +1 -1
  44. scandeval/metrics/pipeline.py +1 -1
  45. scandeval/model_cache.py +34 -4
  46. scandeval/prompt_templates/linguistic_acceptability.py +9 -0
  47. scandeval/prompt_templates/multiple_choice.py +9 -0
  48. scandeval/prompt_templates/named_entity_recognition.py +21 -0
  49. scandeval/prompt_templates/reading_comprehension.py +10 -0
  50. scandeval/prompt_templates/sentiment_classification.py +11 -0
  51. scandeval/string_utils.py +157 -0
  52. scandeval/task_group_utils/sequence_classification.py +2 -5
  53. scandeval/task_group_utils/token_classification.py +2 -4
  54. scandeval/utils.py +6 -323
  55. scandeval-16.13.0.dist-info/METADATA +334 -0
  56. scandeval-16.13.0.dist-info/RECORD +94 -0
  57. scandeval-16.12.0.dist-info/METADATA +0 -667
  58. scandeval-16.12.0.dist-info/RECORD +0 -90
  59. {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/WHEEL +0 -0
  60. {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/entry_points.txt +0 -0
  61. {scandeval-16.12.0.dist-info → scandeval-16.13.0.dist-info}/licenses/LICENSE +0 -0
scandeval/data_loading.py CHANGED
@@ -9,14 +9,15 @@ import typing as t
 import requests
 from datasets import DatasetDict, load_dataset
 from datasets.exceptions import DatasetsError
-from huggingface_hub.errors import HfHubHTTPError
+from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError
 from numpy.random import Generator
 
 from .constants import SUPPORTED_FILE_FORMATS_FOR_LOCAL_DATASETS
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark
 from .logging_utils import log, no_terminal_output
+from .string_utils import unscramble
 from .tasks import EUROPEAN_VALUES
-from .utils import unscramble
+from .utils import get_hf_token
 
 if t.TYPE_CHECKING:
     from datasets import Dataset
@@ -47,15 +48,30 @@ def load_data(
             If the Hugging Face Hub is down.
     """
     dataset = load_raw_data(
-        dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
+        dataset_config=dataset_config,
+        cache_dir=benchmark_config.cache_dir,
+        api_key=benchmark_config.api_key,
     )
 
-    if not benchmark_config.evaluate_test_split and "val" in dataset:
-        dataset["test"] = dataset["val"]
+    if (
+        not benchmark_config.evaluate_test_split
+        and dataset_config.val_split is not None
+    ):
+        dataset[dataset_config.test_split] = dataset[dataset_config.val_split]
+
+    splits = [
+        split
+        for split in [
+            dataset_config.train_split,
+            dataset_config.val_split,
+            dataset_config.test_split,
+        ]
+        if split is not None
+    ]
 
     # Remove empty examples from the datasets
     for text_feature in ["tokens", "text"]:
-        for split in dataset_config.splits:
+        for split in splits:
            if text_feature in dataset[split].features:
                dataset = dataset.filter(lambda x: len(x[text_feature]) > 0)
 
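For context, a minimal sketch of the new split handling in `load_data`, using `types.SimpleNamespace` stand-ins for the ScandEval config objects; the attribute names follow the diff above, everything else is illustrative:

```python
from types import SimpleNamespace

# Illustrative stand-ins for DatasetConfig and BenchmarkConfig.
dataset_config = SimpleNamespace(train_split="train", val_split="val", test_split="test")
benchmark_config = SimpleNamespace(evaluate_test_split=False)

dataset = {"train": ["a"], "val": ["b"], "test": ["c"]}

# When the test split is not being evaluated and a validation split exists,
# the validation data replaces the test data.
if not benchmark_config.evaluate_test_split and dataset_config.val_split is not None:
    dataset[dataset_config.test_split] = dataset[dataset_config.val_split]

# Only the configured (non-None) splits are processed further.
splits = [
    split
    for split in (
        dataset_config.train_split,
        dataset_config.val_split,
        dataset_config.test_split,
    )
    if split is not None
]

print(splits)           # ['train', 'val', 'test']
print(dataset["test"])  # ['b'] -- the validation examples
```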
@@ -67,7 +83,7 @@ def load_data(
     # Bootstrap the splits, if applicable
     if dataset_config.bootstrap_samples:
         bootstrapped_splits: dict[str, c.Sequence["Dataset"]] = dict()
-        for split in dataset_config.splits:
+        for split in splits:
             bootstrap_indices = rng.integers(
                 0,
                 len(dataset[split]),
@@ -81,7 +97,12 @@ def load_data(
             DatasetDict(  # type: ignore[no-matching-overload]
                 {
                     split: bootstrapped_splits[split][idx]
-                    for split in dataset_config.splits
+                    for split in [
+                        dataset_config.train_split,
+                        dataset_config.val_split,
+                        dataset_config.test_split,
+                    ]
+                    if split is not None
                 }
             )
             for idx in range(benchmark_config.num_iterations)
@@ -92,7 +113,9 @@ def load_data(
     return datasets
 
 
-def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
+def load_raw_data(
+    dataset_config: "DatasetConfig", cache_dir: str, api_key: str | None
+) -> "DatasetDict":
     """Load the raw dataset.
 
     Args:
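As a self-contained sketch of threading an optional API key through as a Hugging Face token, in the spirit of the new `api_key` parameter; `resolve_token` below is a hypothetical helper for illustration, not the package's `get_hf_token`:

```python
import os


def resolve_token(api_key: str | None) -> str | None:
    """Hypothetical helper: prefer an explicit key, fall back to the environment."""
    if api_key:
        return api_key
    return os.getenv("HUGGING_FACE_HUB_TOKEN") or None


print(resolve_token("hf_example_key"))  # 'hf_example_key'
print(resolve_token(None))              # whatever the environment provides, else None
```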
@@ -100,6 +123,8 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
             The configuration for the dataset.
         cache_dir:
             The directory to cache the dataset.
+        api_key:
+            The API key to use as the Hugging Face token.
 
     Returns:
         The dataset.
@@ -125,16 +150,38 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
                 FileNotFoundError,
                 ConnectionError,
                 DatasetsError,
+                RepositoryNotFoundError,
                 requests.ConnectionError,
                 requests.ReadTimeout,
-            ) as e:
-                log(
-                    f"Failed to load dataset {dataset_config.source!r}, due to "
-                    f"the following error: {e}. Retrying...",
-                    level=logging.DEBUG,
-                )
-                time.sleep(1)
-                continue
+            ):
+                try:
+                    with no_terminal_output():
+                        dataset = load_dataset(
+                            path=dataset_config.source.split("::")[0],
+                            name=(
+                                dataset_config.source.split("::")[1]
+                                if "::" in dataset_config.source
+                                else None
+                            ),
+                            cache_dir=cache_dir,
+                            token=get_hf_token(api_key=api_key),
+                        )
+                    break
+                except (
+                    FileNotFoundError,
+                    ConnectionError,
+                    DatasetsError,
+                    RepositoryNotFoundError,
+                    requests.ConnectionError,
+                    requests.ReadTimeout,
+                ) as e:
+                    log(
+                        f"Failed to load dataset {dataset_config.source!r}, due to "
+                        f"the following error: {e}. Retrying...",
+                        level=logging.DEBUG,
+                    )
+                    time.sleep(1)
+                    continue
 
             except HfHubHTTPError:
                 raise HuggingFaceHubDown()
@@ -147,17 +194,22 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
     # Case where the dataset source is a dictionary with keys "train", "val" and "test",
     # with the values pointing to local CSV files
     else:
+        split_mapping = dict(
+            train=dataset_config.train_split,
+            val=dataset_config.val_split,
+            test=dataset_config.test_split,
+        )
         data_files = {
-            split: dataset_config.source[split]
-            for split in dataset_config.splits
-            if split in dataset_config.source
+            config_split: dataset_config.source[source_split]
+            for source_split, config_split in split_mapping.items()
+            if source_split in dataset_config.source and config_split is not None
         }
 
         # Get the file extension and ensure that all files have the same extension
         file_extensions = {
-            split: dataset_config.source[split].split(".")[-1]
-            for split in dataset_config.splits
-            if split in dataset_config.source
+            config_split: dataset_config.source[source_split].split(".")[-1]
+            for source_split, config_split in split_mapping.items()
+            if source_split in dataset_config.source and config_split is not None
         }
         if len(set(file_extensions.values())) != 1:
             raise InvalidBenchmark(
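A minimal sketch of what the new `split_mapping` does for local-file sources, with toy values (the file names and split names here are illustrative):

```python
# A local-file source without a validation file, and a config with no val split.
source = {"train": "data/train.csv", "test": "data/test.csv"}
train_split, val_split, test_split = "train", None, "test"

split_mapping = dict(train=train_split, val=val_split, test=test_split)
data_files = {
    config_split: source[source_split]
    for source_split, config_split in split_mapping.items()
    if source_split in source and config_split is not None
}
print(data_files)  # {'train': 'data/train.csv', 'test': 'data/test.csv'}

# All files must share a single extension, mirroring the check that follows.
file_extensions = {path.split(".")[-1] for path in data_files.values()}
assert len(file_extensions) == 1
```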
@@ -182,11 +234,15 @@ def load_raw_data(dataset_config: "DatasetConfig", cache_dir: str) -> "DatasetDict":
             path=file_extension, data_files=data_files, cache_dir=cache_dir
         )
 
-    assert isinstance(dataset, DatasetDict)  # type: ignore[used-before-def]
-    missing_keys = [key for key in dataset_config.splits if key not in dataset]
-    if missing_keys:
-        raise InvalidBenchmark(
-            "The dataset is missing the following required splits: "
-            f"{', '.join(missing_keys)}"
-        )
-    return DatasetDict({key: dataset[key] for key in dataset_config.splits})  # type: ignore[no-matching-overload]
+    assert isinstance(dataset, DatasetDict)
+    return DatasetDict(  # pyrefly: ignore[no-matching-overload]
+        {
+            split: dataset[split]
+            for split in [
+                dataset_config.train_split,
+                dataset_config.val_split,
+                dataset_config.test_split,
+            ]
+            if split is not None
+        }
+    )
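Finally, a small sketch of the new return value: only the configured (non-None) splits survive into the returned `DatasetDict`, and the explicit missing-split check from 16.12.0 is gone. The toy data below is illustrative and assumes the `datasets` library is installed:

```python
from datasets import Dataset, DatasetDict

full = DatasetDict(
    {
        "train": Dataset.from_dict({"text": ["a"]}),
        "val": Dataset.from_dict({"text": ["b"]}),
        "test": Dataset.from_dict({"text": ["c"]}),
        "extra": Dataset.from_dict({"text": ["d"]}),  # not part of the config
    }
)

train_split, val_split, test_split = "train", None, "test"
filtered = DatasetDict(
    {
        split: full[split]
        for split in (train_split, val_split, test_split)
        if split is not None
    }
)
print(list(filtered))  # ['train', 'test']
```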