EuroEval 16.2.1-py3-none-any.whl → 16.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +4 -2
- euroeval/benchmark_modules/fresh.py +3 -1
- euroeval/benchmark_modules/hf.py +8 -4
- euroeval/benchmark_modules/litellm.py +5 -17
- euroeval/benchmark_modules/vllm.py +98 -30
- euroeval/benchmarker.py +291 -405
- euroeval/cli.py +1 -1
- euroeval/constants.py +3 -0
- euroeval/data_models.py +35 -35
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +0 -2
- euroeval/dataset_configs/dutch.py +0 -2
- euroeval/dataset_configs/english.py +0 -2
- euroeval/dataset_configs/finnish.py +0 -2
- euroeval/dataset_configs/french.py +0 -2
- euroeval/dataset_configs/german.py +0 -2
- euroeval/dataset_configs/italian.py +0 -2
- euroeval/dataset_configs/latvian.py +2 -3
- euroeval/dataset_configs/lithuanian.py +62 -0
- euroeval/dataset_configs/norwegian.py +0 -2
- euroeval/dataset_configs/polish.py +0 -2
- euroeval/dataset_configs/portuguese.py +0 -2
- euroeval/dataset_configs/spanish.py +0 -2
- euroeval/dataset_configs/swedish.py +0 -3
- euroeval/metrics/huggingface.py +1 -1
- euroeval/metrics/pipeline.py +5 -0
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +9 -0
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +10 -0
- euroeval/prompt_templates/sentiment_classification.py +11 -0
- euroeval/tokenisation_utils.py +8 -8
- euroeval/utils.py +10 -5
- {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/METADATA +181 -60
- euroeval-16.3.0.dist-info/RECORD +71 -0
- euroeval-16.2.1.dist-info/RECORD +0 -70
- {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/WHEEL +0 -0
- {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/licenses/LICENSE +0 -0
euroeval/cli.py
CHANGED

@@ -188,7 +188,7 @@ from .tasks import get_all_tasks
 )
 @click.option(
     "--gpu-memory-utilization",
-    default=0.
+    default=0.8,
     show_default=True,
     help="The GPU memory utilization to use for vLLM. A larger value will result in "
     "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "

euroeval/constants.py
CHANGED

@@ -50,9 +50,11 @@ METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
 # Hugging Face Hub tags used to classify models as merge models
 MERGE_TAGS = ["merge", "mergekit"]
 
+
 # The minimum required CUDA compute capability for using bfloat16 in vLLM
 VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
 
+
 # Used to detect whether a model is a reasoning model
 REASONING_TOKENS = [
     ("<think>", "</think>"),
@@ -60,6 +62,7 @@ REASONING_TOKENS = [
     ("<reasoning>", "</reasoning>"),
 ]
 
+
 # These tokens are sometimes used by models to indicate the end of a generated
 # response, but they do not use them as a proper EOS token, so we have to deal with them
 # manually. We only use them as stop tokens if they actually appear in the model's

euroeval/data_models.py
CHANGED

@@ -170,14 +170,16 @@ class BenchmarkConfig:
     """General benchmarking configuration, across datasets and models.
 
     Attributes:
-        model_languages:
-            The languages of the models to benchmark.
-        dataset_languages:
-            The languages of the datasets in the benchmark.
         tasks:
             The tasks benchmark the model(s) on.
         datasets:
             The datasets to benchmark on.
+        model_languages:
+            The languages of the models to benchmark.
+        dataset_languages:
+            The languages of the datasets in the benchmark.
+        device:
+            The device to use for benchmarking.
         batch_size:
             The batch size to use.
         raise_errors:
@@ -186,17 +188,16 @@
             Directory to store cached models and datasets.
         api_key:
             The API key to use for a given inference API.
-
-
-
+        api_base:
+            The base URL for a given inference API. Only relevant if `model` refers to a
+            model on an inference API.
+        api_version:
+            The version of the API to use. Only relevant if `model` refers to a model on
+            an inference API.
         progress_bar:
             Whether to show a progress bar.
         save_results:
             Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
-        device:
-            The device to use for benchmarking.
-        verbose:
-            Whether to print verbose output.
         trust_remote_code:
             Whether to trust remote code when loading models from the Hugging Face Hub.
         clear_model_cache:
@@ -208,21 +209,11 @@
             if the model is generative.
         num_iterations:
             The number of iterations each model should be evaluated for.
-        api_base:
-            The base URL for a given inference API. Only relevant if `model` refers to a
-            model on an inference API.
-        api_version:
-            The version of the API to use. Only relevant if `model` refers to a model on
-            an inference API.
         gpu_memory_utilization:
             The GPU memory utilization to use for vLLM. A larger value will result in
             faster evaluation, but at the risk of running out of GPU memory. Only reduce
             this if you are running out of GPU memory. Only relevant if the model is
             generative.
-        debug:
-            Whether to run the benchmark in debug mode.
-        run_with_cli:
-            Whether the benchmark is being run with the CLI.
         requires_safetensors:
             Whether to only allow models that use the safetensors format.
         generative_type:
@@ -231,6 +222,15 @@ class BenchmarkConfig:
         download_only:
             Whether to only download the models, metrics and datasets without
             evaluating.
+        force:
+            Whether to force the benchmark to run even if the results are already
+            cached.
+        verbose:
+            Whether to print verbose output.
+        debug:
+            Whether to run the benchmark in debug mode.
+        run_with_cli:
+            Whether the benchmark is being run with the CLI.
     """
 
     model_languages: list[Language]
@@ -241,24 +241,24 @@ class BenchmarkConfig:
     raise_errors: bool
     cache_dir: str
     api_key: str | None
-
+    api_base: str | None
+    api_version: str | None
     progress_bar: bool
     save_results: bool
     device: torch.device
-    verbose: bool
     trust_remote_code: bool
     clear_model_cache: bool
     evaluate_test_split: bool
     few_shot: bool
     num_iterations: int
-    api_base: str | None
-    api_version: str | None
     gpu_memory_utilization: float
-    debug: bool
-    run_with_cli: bool
     requires_safetensors: bool
     generative_type: GenerativeType | None
     download_only: bool
+    force: bool
+    verbose: bool
+    debug: bool
+    run_with_cli: bool
 
 
 class BenchmarkConfigParams(pydantic.BaseModel):
@@ -266,10 +266,10 @@ class BenchmarkConfigParams(pydantic.BaseModel):
 
     model_config = pydantic.ConfigDict(protected_namespaces=())
 
-    progress_bar: bool
-    save_results: bool
     task: str | list[str] | None
     dataset: str | list[str] | None
+    progress_bar: bool
+    save_results: bool
     language: str | list[str]
     model_language: str | list[str] | None
    dataset_language: str | list[str] | None
@@ -278,21 +278,21 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     raise_errors: bool
     cache_dir: str
     api_key: str | None
-
-
+    api_base: str | None
+    api_version: str | None
     trust_remote_code: bool
     clear_model_cache: bool
     evaluate_test_split: bool
     few_shot: bool
     num_iterations: int
-
-
+    requires_safetensors: bool
+    download_only: bool
     gpu_memory_utilization: float
     generative_type: GenerativeType | None
-
+    force: bool
+    verbose: bool
     debug: bool
     run_with_cli: bool
-    requires_safetensors: bool
 
 
 class BenchmarkResult(pydantic.BaseModel):

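Since the BenchmarkConfig attributes are reordered here (the API-related fields move up and force, verbose, debug and run_with_cli move to the end), positional construction would change meaning while keyword construction is unaffected. A minimal sketch for checking the new trailing field order, assuming BenchmarkConfig is still a plain dataclass as the bare annotations above suggest:

    import dataclasses

    from euroeval.data_models import BenchmarkConfig

    # Per the diff above, this should print ['force', 'verbose', 'debug', 'run_with_cli']
    # in 16.3.0 (assumes BenchmarkConfig is a dataclass).
    print([field.name for field in dataclasses.fields(BenchmarkConfig)][-4:])
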
euroeval/dataset_configs/__init__.py
CHANGED

@@ -14,6 +14,7 @@ from .german import * # noqa: F403
 from .icelandic import *  # noqa: F403
 from .italian import *  # noqa: F403
 from .latvian import *  # noqa: F403
+from .lithuanian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
 from .polish import *  # noqa: F403
 from .portuguese import *  # noqa: F403

euroeval/dataset_configs/danish.py
CHANGED

@@ -1,7 +1,6 @@
 """All Danish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import DA
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -159,7 +158,6 @@ WINOGRANDE_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/dutch.py
CHANGED

@@ -1,7 +1,6 @@
 """All Dutch dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import NL
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -152,7 +151,6 @@ WINOGRANDE_NL_CONFIG = DatasetConfig(
     languages=[NL],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/english.py
CHANGED

@@ -1,7 +1,6 @@
 """All English dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import EN
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -135,7 +134,6 @@ WINOGRANDE_CONFIG = DatasetConfig(
     languages=[EN],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/finnish.py
CHANGED

@@ -1,7 +1,6 @@
 """All Finnish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import FI
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -111,7 +110,6 @@ WINOGRANDE_FI_CONFIG = DatasetConfig(
     languages=[FI],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/french.py
CHANGED

@@ -1,7 +1,6 @@
 """All French dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import FR
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -123,7 +122,6 @@ WINOGRANDE_FR_CONFIG = DatasetConfig(
     languages=[FR],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/german.py
CHANGED

@@ -1,7 +1,6 @@
 """All German dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import DE
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -140,7 +139,6 @@ WINOGRANDE_DE_CONFIG = DatasetConfig(
     languages=[DE],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/italian.py
CHANGED

@@ -1,7 +1,6 @@
 """All Italian dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import IT
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -131,7 +130,6 @@ WINOGRANDE_IT_CONFIG = DatasetConfig(
     languages=[IT],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/latvian.py
CHANGED

@@ -1,7 +1,6 @@
 """All Latvian dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import LV
 from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT, SUMM
 
@@ -25,7 +24,8 @@ SCALA_LV_CONFIG = DatasetConfig(
 
 FULLSTACK_NER_LV_CONFIG = DatasetConfig(
     name="fullstack-ner-lv",
-    pretty_name="the truncated version of the
+    pretty_name="the truncated version of the Latvian named entity recognition "
+    "dataset FullStack-NER-lv",
     huggingface_id="EuroEval/fullstack-ner-lv-mini",
     task=NER,
     languages=[LV],
@@ -90,6 +90,5 @@ WINOGRANDE_LV_CONFIG = DatasetConfig(
     languages=[LV],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )

euroeval/dataset_configs/lithuanian.py
ADDED

@@ -0,0 +1,62 @@
+"""All Lithuanian dataset configurations used in EuroEval."""
+
+from ..data_models import DatasetConfig
+from ..languages import LT
+from ..tasks import COMMON_SENSE, KNOW, LA, NER, RC, SENT
+
+### Official datasets ###
+
+LITHUANIAN_EMOTIONS_CONFIG = DatasetConfig(
+    name="lithuanian-emotions",
+    pretty_name="the truncated version of the Lithuanian sentiment "
+    "classification dataset Lithuanian Emotions",
+    huggingface_id="EuroEval/lithuanian-emotions-mini",
+    task=SENT,
+    languages=[LT],
+)
+
+SCALA_LT_CONFIG = DatasetConfig(
+    name="scala-lt",
+    pretty_name="the Lithuanian part of the linguistic acceptability dataset ScaLA",
+    huggingface_id="EuroEval/scala-lt",
+    task=LA,
+    languages=[LT],
+)
+
+WIKIANN_LT_CONFIG = DatasetConfig(
+    name="wikiann-lt",
+    pretty_name="the truncated version of the Lithuanian part of the named entity "
+    "recognition dataset WikiANN",
+    huggingface_id="EuroEval/wikiann-lt-mini",
+    task=NER,
+    languages=[LT],
+)
+
+MULTI_WIKI_QA_LT_CONFIG = DatasetConfig(
+    name="multi-wiki-qa-lt",
+    pretty_name="the truncated version of the Lithuanian part of the reading "
+    "comprehension dataset MultiWikiQA",
+    huggingface_id="EuroEval/multi-wiki-qa-lt-mini",
+    task=RC,
+    languages=[LT],
+)
+
+LT_HISTORY_CONFIG = DatasetConfig(
+    name="lt-history",
+    pretty_name="the Lithuanian knowledge dataset LT-History",
+    huggingface_id="EuroEval/lt-history",
+    task=KNOW,
+    languages=[LT],
+    splits=["train", "test"],
+)
+
+WINOGRANDE_LT_CONFIG = DatasetConfig(
+    name="winogrande-lt",
+    pretty_name="the Lithuanian common-sense reasoning dataset Winogrande-lt, "
+    "translated from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-lt",
+    task=COMMON_SENSE,
+    languages=[LT],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+)

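With the new module re-exported from dataset_configs/__init__.py above, the Lithuanian datasets become selectable by their name fields. A hypothetical invocation (the --model and --dataset flags are assumed from EuroEval's CLI and are not part of this diff):

    euroeval --model <model-id> --dataset scala-lt
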
euroeval/dataset_configs/norwegian.py
CHANGED

@@ -1,7 +1,6 @@
 """All Norwegian dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import NB, NN, NO
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -226,7 +225,6 @@ WINOGRANDE_NO_CONFIG = DatasetConfig(
     languages=[NB, NN, NO],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/polish.py
CHANGED

@@ -1,7 +1,6 @@
 """All Polish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import PL
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, NER, RC, SENT, SUMM
 
@@ -64,7 +63,6 @@ WINOGRANDE_PL_CONFIG = DatasetConfig(
     languages=[PL],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
 )
 
 EUROPEAN_VALUES_PL_CONFIG = DatasetConfig(

euroeval/dataset_configs/portuguese.py
CHANGED

@@ -1,7 +1,6 @@
 """All Portuguese dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import PT
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -101,7 +100,6 @@ WINOGRANDE_PT_CONFIG = DatasetConfig(
     languages=[PT],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/spanish.py
CHANGED

@@ -1,7 +1,6 @@
 """All Spanish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import ES
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -129,7 +128,6 @@ WINOGRANDE_ES_CONFIG = DatasetConfig(
     languages=[ES],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 

euroeval/dataset_configs/swedish.py
CHANGED

@@ -1,7 +1,6 @@
 """All Swedish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import SV
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -140,7 +139,6 @@ WINOGRANDE_SV_CONFIG = DatasetConfig(
     languages=[SV],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 
@@ -177,6 +175,5 @@ SKOLPROV_CONFIG = DatasetConfig(
     task=KNOW,
     languages=[SV],
     splits=["train", "test"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )

euroeval/metrics/huggingface.py
CHANGED

@@ -197,7 +197,7 @@ bert_score_metric = HuggingFaceMetric(
     huggingface_id="bertscore",
     results_key="f1",
     compute_kwargs=dict(
-        model_type="microsoft/mdeberta-v3-base", device="
+        model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
     ),
 )
 

euroeval/metrics/pipeline.py
CHANGED

@@ -191,6 +191,11 @@ def european_values_preprocessing_fn(
            for idx, choice in idx_to_choice.items()
            if choice is not None
        }
+        if prediction not in idx_to_choice:
+            raise InvalidBenchmark(
+                f"The prediction {prediction} is not a valid index for the "
+                f"question with choices {idx_to_choice}."
+            )
        integer_prediction = idx_to_choice[prediction]
        integer_predictions.append(integer_prediction)
 

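To make the new guard concrete: an out-of-range prediction previously surfaced as a bare KeyError from the idx_to_choice lookup, whereas it now raises InvalidBenchmark with the offending value. A toy illustration in plain Python (the mapping and the stand-in exception class are invented, not EuroEval code):

    class InvalidBenchmark(Exception):
        """Stand-in for EuroEval's InvalidBenchmark exception."""

    idx_to_choice = {1: 10, 2: 20, 3: 30}  # hypothetical index -> choice mapping
    prediction = 5  # an index with no corresponding choice

    if prediction not in idx_to_choice:
        raise InvalidBenchmark(
            f"The prediction {prediction} is not a valid index for the "
            f"question with choices {idx_to_choice}."
        )
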
euroeval/prompt_templates/linguistic_acceptability.py
CHANGED

@@ -14,6 +14,7 @@ from ..languages import (
     FR,
     IS,
     IT,
+    LT,
     LV,
     NB,
     NL,
@@ -126,6 +127,14 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
         default_instruction_prompt="Frase: {text}\n\nStabilite se la frase è "
         "grammaticalmente corretta o meno. Rispondere con {labels_str}, e nient'altro.",
     ),
+    LT: PromptConfig(
+        default_prompt_label_mapping=dict(correct="taip", incorrect="ne"),
+        default_prompt_prefix="Toliau pateikti sakiniai ir ar jie yra gramatiškai "
+        "teisingi.",
+        default_prompt_template="Sakinys: {text}\nGramatiškai teisingas: {label}",
+        default_instruction_prompt="Sakinys: {text}\n\nNustatykite, ar sakinys yra "
+        "gramatiškai teisingas, ar ne. Atsakykite su {labels_str}, ir nieko kito.",
+    ),
     LV: PromptConfig(
         default_prompt_label_mapping=dict(correct="jā", incorrect="nē"),
         default_prompt_prefix="Šie ir teikumi un to gramatiskie pareizumi.",

euroeval/prompt_templates/multiple_choice.py
CHANGED

@@ -13,6 +13,7 @@ from ..languages import (
     FR,
     IS,
     IT,
+    LT,
     LV,
     NB,
     NL,
@@ -105,6 +106,14 @@ MULTIPLE_CHOICE_TEMPLATES: dict["Language", PromptConfig] = {
         "precedente con {labels_str}, e nient'altro.",
         default_prompt_label_mapping="auto",
     ),
+    LT: PromptConfig(
+        default_prompt_prefix="Toliau pateikti daugiavariančiai klausimai "
+        "(su atsakymais).",
+        default_prompt_template="Klausimas: {text}\nAtsakymas: {label}",
+        default_instruction_prompt="Klausimas: {text}\n\nAtsakykite į aukščiau "
+        "pateiktą klausimą atsakydami {labels_str}, ir nieko daugiau.",
+        default_prompt_label_mapping="auto",
+    ),
     LV: PromptConfig(
         default_prompt_prefix="Tālāk seko jautājumi ar vairākām atbilžu izvēlēm "
         "(ar atbildēm).",

euroeval/prompt_templates/named_entity_recognition.py
CHANGED

@@ -14,6 +14,7 @@ from ..languages import (
     FR,
     IS,
     IT,
+    LT,
     LV,
     NB,
     NL,
@@ -241,6 +242,25 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
         "{labels_str}. I valori devono essere elenchi di entità "
         "nominate di quel tipo, esattamente come appaiono nella frase.",
     ),
+    LT: PromptConfig(
+        default_prompt_label_mapping={
+            "b-per": "asmuo",
+            "i-per": "asmuo",
+            "b-loc": "vieta",
+            "i-loc": "vieta",
+            "b-org": "organizacija",
+            "i-org": "organizacija",
+            "b-misc": "kita",
+            "i-misc": "kita",
+        },
+        default_prompt_prefix="Toliau pateikti sakiniai ir JSON žodynai su vardiniais "
+        "vienetais, kurie pateikiame sakinyje.",
+        default_prompt_template="Sakinys: {text}\nVardiniai vienetai: {label}",
+        default_instruction_prompt="Sakinys: {text}\n\nIdentifikuokite vardinius "
+        "vienetus sakinyje. Turėtumėte pateikti tai kaip JSON žodyną su raktais "
+        "{labels_str}. Reikšmės turi būti to tipo vardinių vienetų sąrašai, "
+        "tiksliai taip, kaip jie rodomi sakinyje.",
+    ),
     LV: PromptConfig(
         default_prompt_label_mapping={
             "b-per": "persona",

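For illustration, the few-shot default_prompt_template above pairs a sentence with a JSON dictionary keyed by the four Lithuanian label names from the mapping (asmuo, vieta, organizacija, kita). A hypothetical rendered example; the sentence and its entities are invented, not taken from the dataset:

    Sakinys: Jonas gyvena Vilniuje.
    Vardiniai vienetai: {"asmuo": ["Jonas"], "vieta": ["Vilniuje"], "organizacija": [], "kita": []}
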
euroeval/prompt_templates/reading_comprehension.py
CHANGED

@@ -14,6 +14,7 @@ from ..languages import (
     FR,
     IS,
     IT,
+    LT,
     LV,
     NB,
     NL,
@@ -116,6 +117,15 @@ RC_TEMPLATES: dict["Language", PromptConfig] = {
         "sul in un massimo di 3 parole.\n\nDomanda: {question}",
         default_prompt_label_mapping=dict(),
     ),
+    LT: PromptConfig(
+        default_prompt_prefix="Toliau pateikti tekstai su atitinkamais klausimais ir "
+        "atsakymais.",
+        default_prompt_template="Tekstas: {text}\nKlausimas: {question}\nAtsakykite ne "
+        "daugiau kaip 3 žodžiais: {label}",
+        default_instruction_prompt="Tekstas: {text}\n\nAtsakykite į šį klausimą apie "
+        "aukščiau pateiktą tekstą ne daugiau kaip 3 žodžiais.\n\nKlausimas: {question}",
+        default_prompt_label_mapping=dict(),
+    ),
     LV: PromptConfig(
         default_prompt_prefix="Turpmāk seko teksti ar atbilstošiem jautājumiem un "
         "atbildēm.",

euroeval/prompt_templates/sentiment_classification.py
CHANGED

@@ -14,6 +14,7 @@ from ..languages import (
     FR,
     IS,
     IT,
+    LT,
     LV,
     NB,
     NL,
@@ -153,6 +154,16 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
         default_instruction_prompt="Documento: {text}\n\nClassificare il sentiment del "
         "documento. Rispondere con {labels_str}, e nient'altro.",
     ),
+    LT: PromptConfig(
+        default_prompt_label_mapping=dict(
+            positive="teigiamas", neutral="neutralus", negative="neigiamas"
+        ),
+        default_prompt_prefix="Toliau pateikti dokumentai ir jų nuotaika, kuri "
+        "gali būti {labels_str}.",
+        default_prompt_template="Dokumentas: {text}\nNuotaika: {label}",
+        default_instruction_prompt="Dokumentas: {text}\n\nKlasifikuokite nuotaiką "
+        "dokumente. Atsakykite su {labels_str}, ir nieko kito.",
+    ),
     LV: PromptConfig(
         default_prompt_label_mapping=dict(
             positive="pozitīvs", neutral="neitrāls", negative="negatīvs"

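For illustration, the few-shot default_prompt_template above fills in as follows; the example document is invented, and "teigiamas" comes from the label mapping shown:

    # A minimal sketch of how the Lithuanian sentiment few-shot template renders.
    template = "Dokumentas: {text}\nNuotaika: {label}"
    print(template.format(text="Puikus filmas, labai rekomenduoju!", label="teigiamas"))
    # Dokumentas: Puikus filmas, labai rekomenduoju!
    # Nuotaika: teigiamas
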
euroeval/tokenisation_utils.py
CHANGED

@@ -521,7 +521,14 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
     Returns:
         Whether the tokeniser has a chat template.
     """
-    if
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        log_once(
+            "The tokeniser is a Mistral tokeniser, so assuming that the model is "
+            "instruction tuned.",
+            level=logging.DEBUG,
+        )
+        return True
+    elif hasattr(tokeniser, "chat_template"):
         has_template = tokeniser.chat_template is not None
         if has_template:
             log_once(
@@ -530,13 +537,6 @@ def has_chat_template(tokeniser: "PreTrainedTokenizer") -> bool:
                 level=logging.DEBUG,
             )
         return has_template
-    elif isinstance(tokeniser, MistralCommonTokenizer):
-        log_once(
-            "The tokeniser is a Mistral tokeniser, so assuming that the model is "
-            "instruction tuned.",
-            level=logging.DEBUG,
-        )
-        return True
     else:
         log_once(
             "We cannot find a chat template for the tokeniser, so assuming that the "