EuroEval 16.0.1-py3-none-any.whl → 16.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/benchmark_config_factory.py +6 -1
- euroeval/benchmark_modules/base.py +2 -0
- euroeval/benchmark_modules/fresh.py +7 -1
- euroeval/benchmark_modules/hf.py +26 -21
- euroeval/benchmark_modules/litellm.py +258 -131
- euroeval/benchmark_modules/vllm.py +79 -40
- euroeval/benchmarker.py +11 -2
- euroeval/cli.py +14 -1
- euroeval/constants.py +1 -1
- euroeval/data_models.py +77 -6
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +14 -0
- euroeval/dataset_configs/dutch.py +14 -0
- euroeval/dataset_configs/english.py +22 -0
- euroeval/dataset_configs/estonian.py +15 -7
- euroeval/dataset_configs/finnish.py +14 -0
- euroeval/dataset_configs/french.py +14 -0
- euroeval/dataset_configs/german.py +23 -0
- euroeval/dataset_configs/italian.py +14 -0
- euroeval/dataset_configs/latvian.py +14 -0
- euroeval/dataset_configs/norwegian.py +14 -0
- euroeval/dataset_configs/polish.py +126 -0
- euroeval/dataset_configs/portuguese.py +14 -0
- euroeval/dataset_configs/spanish.py +14 -0
- euroeval/dataset_configs/swedish.py +25 -0
- euroeval/enums.py +12 -0
- euroeval/generation.py +17 -8
- euroeval/generation_utils.py +58 -10
- euroeval/metrics/pipeline.py +1 -1
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +27 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +11 -0
- euroeval/prompt_templates/sentiment_classification.py +15 -0
- euroeval/prompt_templates/summarization.py +27 -1
- euroeval/scores.py +5 -0
- euroeval/task_group_utils/question_answering.py +29 -29
- euroeval/task_group_utils/sequence_classification.py +10 -33
- euroeval/task_group_utils/token_classification.py +3 -3
- euroeval/tasks.py +4 -4
- euroeval/{tokenization_utils.py → tokenisation_utils.py} +40 -23
- euroeval/utils.py +36 -3
- {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/METADATA +1 -1
- euroeval-16.1.0.dist-info/RECORD +70 -0
- euroeval-16.0.1.dist-info/RECORD +0 -69
- {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/WHEEL +0 -0
- {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py CHANGED

@@ -44,7 +44,11 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
-from ..generation_utils import
+from ..generation_utils import (
+    apply_prompt,
+    extract_few_shot_examples,
+    raise_if_wrong_params,
+)
 from ..languages import get_all_languages
 from ..task_group_utils import (
     question_answering,
@@ -52,7 +56,7 @@ from ..task_group_utils import (
     text_to_text,
     token_classification,
 )
-from ..tokenization_utils import (
+from ..tokenisation_utils import (
     apply_chat_template,
     get_bos_token,
     get_end_of_chat_token_ids,
@@ -69,6 +73,7 @@ from ..utils import (
     get_hf_token,
     get_min_cuda_compute_capability,
     log_once,
+    split_model_id,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
 
@@ -97,6 +102,7 @@ class VLLMModel(HuggingFaceEncoderModel):
     fresh_model = False
     batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = True
+    allowed_params = {re.compile(r".*"): ["thinking", "no-thinking"]}
 
     def __init__(
         self,
@@ -120,42 +126,46 @@ class VLLMModel(HuggingFaceEncoderModel):
         if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")
 
+        raise_if_wrong_params(
+            model_config=model_config, allowed_params=self.allowed_params
+        )
+
         model, tokeniser = load_model_and_tokeniser(
             model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: "LLM" = model
         self._tokeniser: "PreTrainedTokenizer" = tokeniser
+
+        # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
+        # to call the `__init__` method of the `BenchmarkModule` class.
+        super(HuggingFaceEncoderModel, self).__init__(
+            model_config=model_config,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
+        )
+
         self.end_of_reasoning_token = get_end_of_reasoning_token(
             model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
         )
         self.end_of_chat_token_ids = get_end_of_chat_token_ids(
-            tokeniser=self._tokeniser
+            tokeniser=self._tokeniser, generative_type=self.generative_type
         )
         self.custom_stop_tokens = get_custom_stop_tokens(
             model=self._model,
             tokeniser=self._tokeniser,
             model_id=model_config.model_id,
-
-        )
-
-        # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
-        # to call the `__init__` method of the `BenchmarkModule` class.
-        super(HuggingFaceEncoderModel, self).__init__(
-            model_config=model_config,
-            dataset_config=dataset_config,
-            benchmark_config=benchmark_config,
-            log_metadata=log_metadata,
+            generative_type=self.generative_type,
         )
 
         self.buffer |= dict(
-            instruction_model=has_chat_template(tokeniser=self._tokeniser),
             first_label_token_mapping=get_first_label_token_mapping(
                 dataset_config=self.dataset_config,
                 model_config=self.model_config,
                 tokeniser=self._tokeniser,
                 generative_type=self.generative_type,
                 log_metadata=self.log_metadata,
-            )
+            ),
         )
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
@@ -187,16 +197,36 @@ class VLLMModel(HuggingFaceEncoderModel):
             The generative type of the model, or None if it has not been set yet.
         """
         if not hasattr(self, "_tokeniser"):
+            log_once(
+                "The generative type of the model has not been set yet as the "
+                "tokeniser has not been loaded.",
+                level=logging.DEBUG,
+            )
             return None
-        elif self.end_of_reasoning_token is not None:
-            return GenerativeType.REASONING
+        elif self.benchmark_config.generative_type is not None:
+            type_ = self.benchmark_config.generative_type
+        elif self.model_config.param in {"thinking"}:
+            type_ = GenerativeType.REASONING
+        elif self.model_config.param in {"no-thinking"}:
+            type_ = GenerativeType.INSTRUCTION_TUNED
+        elif (
+            hasattr(self, "end_of_reasoning_token")
+            and self.end_of_reasoning_token is not None
+        ):
+            type_ = GenerativeType.REASONING
         elif (
             has_chat_template(tokeniser=self._tokeniser)
             or "instruct" in self.model_config.model_id.lower()
         ):
-            return GenerativeType.INSTRUCTION_TUNED
+            type_ = GenerativeType.INSTRUCTION_TUNED
         else:
-            return GenerativeType.BASE
+            type_ = GenerativeType.BASE
+        log_once(
+            f"Detected generative type {type_.name!r} for model "
+            f"{self.model_config.model_id!r}",
+            level=logging.DEBUG,
+        )
+        return type_
 
     @property
     def extract_labels_from_generation(self) -> ExtractLabelsFunction:
@@ -285,7 +315,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 few_shot_examples=few_shot_examples,
                 model_config=self.model_config,
                 dataset_config=self.dataset_config,
-
+                generative_type=self.generative_type,
                 always_populate_text_field=True,
                 tokeniser=self._tokeniser,
             ),
@@ -313,7 +343,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         """
         # Get stopping tokens
         stop_tokens: list[str] = self.custom_stop_tokens.copy()
-        if self.
+        if self.generative_type == GenerativeType.BASE:
             stop_tokens.append("\n\n")
         if self._tokeniser.pad_token_id is not None:
             assert isinstance(self._tokeniser.pad_token, str), (
@@ -430,9 +460,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         labels_to_be_generated = list(self.dataset_config.prompt_label_mapping.values())
         if len(labels_to_be_generated) == 0:
             labels_to_be_generated = ["negative", "positive"]
-        if not self.buffer.get(
-            "instruction_model", False
-        ) and should_prompts_be_stripped(
+        if self.generative_type == GenerativeType.BASE and should_prompts_be_stripped(
             labels_to_be_generated=labels_to_be_generated, tokeniser=self._tokeniser
         ):
             log_once(
@@ -590,9 +618,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         if using_api:
             return False
 
-        model_id, revision = (
-            model_id.split("@") if "@" in model_id else (model_id, "main")
-        )
+        model_id_components = split_model_id(model_id=model_id)
+        model_id = model_id_components.model_id
+        revision = model_id_components.revision
+
         model_info = get_model_repo_info(
             model_id=model_id, revision=revision, benchmark_config=benchmark_config
         )
@@ -616,11 +645,11 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The model configuration.
         """
-        model_id, revision = (
-            model_id.split("@") if "@" in model_id else (model_id, "main")
-        )
+        model_id_components = split_model_id(model_id=model_id)
         model_info = get_model_repo_info(
-            model_id=model_id, revision=revision, benchmark_config=benchmark_config
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            benchmark_config=benchmark_config,
         )
         if model_info is None:
             raise InvalidModel(f"The model {model_id!r} could not be found.")
@@ -629,8 +658,9 @@ class VLLMModel(HuggingFaceEncoderModel):
         language_codes = list(language_mapping.keys())
 
         model_config = ModelConfig(
-            model_id=model_id,
-            revision=revision,
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            param=model_id_components.param,
             task=model_info.pipeline_tag,
             languages=[
                 language_mapping[tag]
@@ -985,7 +1015,11 @@ def get_end_of_reasoning_token(
     prompt = "What is your name?"
     if has_chat_template(tokeniser=tokeniser):
         templated_prompt = apply_chat_template(
-            conversation=[dict(role="user", content=prompt)],
+            conversation=[dict(role="user", content=prompt)],
+            tokeniser=tokeniser,
+            tokenise=False,
+            add_generation_prompt=True,
+            enable_thinking=True,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
@@ -1063,7 +1097,7 @@ def get_custom_stop_tokens(
     model: "LLM",
     tokeniser: "PreTrainedTokenizer",
     model_id: str,
-    max_tokens: int,
+    generative_type: GenerativeType | None,
 ) -> list[str]:
     """Get the stop tokens for a generative model.
 

@@ -1074,9 +1108,8 @@ def get_custom_stop_tokens(
             The tokeniser.
         model_id:
             The model ID.
-        max_tokens:
-            The maximum number
-            of generated tokens to allow before stopping the generation.
+        generative_type:
+            The generative type of the model.
 
     Returns:
         A list of stop tokens.

@@ -1086,12 +1119,18 @@ def get_custom_stop_tokens(
     prompt = "Hello"
     if has_chat_template(tokeniser=tokeniser):
         templated_prompt = apply_chat_template(
-            conversation=[dict(role="user", content=prompt)],
+            conversation=[dict(role="user", content=prompt)],
+            tokeniser=tokeniser,
+            tokenise=False,
+            add_generation_prompt=True,
+            enable_thinking=generative_type == GenerativeType.REASONING,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
 
-    max_tokens =
+    max_tokens = (
+        REASONING_MAX_TOKENS if generative_type == GenerativeType.REASONING else 10
+    )
     completion = (
         model.generate(
             prompts=[prompt],
euroeval/benchmarker.py CHANGED

@@ -19,7 +19,7 @@ from .constants import GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
-from .enums import Device, ModelType
+from .enums import Device, GenerativeType, ModelType
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
 from .generation import generate

@@ -79,6 +79,7 @@ class Benchmarker:
         api_base: str | None = None,
         api_version: str | None = None,
         gpu_memory_utilization: float = 0.9,
+        generative_type: GenerativeType | None = None,
         debug: bool = False,
         run_with_cli: bool = False,
         requires_safetensors: bool = False,

@@ -151,6 +152,10 @@ class Benchmarker:
                 is generative. A larger value will result in faster evaluation, but at
                 the risk of running out of GPU memory. Only reduce this if you are
                 running out of GPU memory. Defaults to 0.9.
+            generative_type:
+                The type of generative model to benchmark. Only relevant if the model is
+                generative. If not specified, then the type will be inferred based on
+                the tags of the model. Defaults to None.
             debug:
                 Whether to output debug information. Defaults to False.
             run_with_cli:

@@ -199,6 +204,7 @@ class Benchmarker:
             api_base=api_base,
             api_version=api_version,
             gpu_memory_utilization=gpu_memory_utilization,
+            generative_type=generative_type,
             debug=debug,
             run_with_cli=run_with_cli,
             requires_safetensors=requires_safetensors,

@@ -438,7 +444,7 @@ class Benchmarker:
 
         # Skip if the model type should not be benchmarked on this dataset
         model_type = model_config.model_type
-        allowed_model_types = dataset_config.task.allowed_model_types
+        allowed_model_types = dataset_config.allowed_model_types
         if model_type not in allowed_model_types:
             logger.debug(
                 f"Skipping benchmarking {model_id} on "

@@ -804,6 +810,7 @@ class Benchmarker:
             scores=scores,
             model_id=model_config.model_id,
             model_revision=model_config.revision,
+            model_param=model_config.param,
         )
 
         record = BenchmarkResult(

@@ -1108,6 +1115,8 @@ def initial_logging(
     model_id = model_config.model_id
     if model_config.revision and model_config.revision != "main":
         model_id += f"@{model_config.revision}"
+    if model_config.param is not None:
+        model_id += f"#{model_config.param}"
 
     split_type = "validation" if not benchmark_config.evaluate_test_split else "test"
     if model_config.task in GENERATIVE_PIPELINE_TAGS:
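
For library users, the new `generative_type` argument flows straight from `Benchmarker.__init__` into `BenchmarkConfig`, overriding the automatic detection shown in the vllm.py section above. A minimal sketch, assuming the usual `Benchmarker` entry point and a placeholder model ID:

```python
from euroeval import Benchmarker
from euroeval.enums import GenerativeType

# Evaluate a model as a reasoning model, regardless of what would otherwise be
# inferred from its chat template, model ID or end-of-reasoning token.
benchmarker = Benchmarker(generative_type=GenerativeType.REASONING)
benchmarker.benchmark(model="my-org/my-reasoning-model")  # placeholder model ID
```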
euroeval/cli.py CHANGED

@@ -4,7 +4,7 @@ import click
 
 from .benchmarker import Benchmarker
 from .dataset_configs import get_all_dataset_configs
-from .enums import Device
+from .enums import Device, GenerativeType
 from .languages import get_all_languages
 from .tasks import get_all_tasks
 

@@ -208,6 +208,14 @@ from .tasks import get_all_tasks
     help="Only allow loading models that have safetensors weights available",
     default=False,
 )
+@click.option(
+    "--generative-type",
+    type=click.Choice(["base", "instruction_tuned", "reasoning"]),
+    default=None,
+    show_default=True,
+    help="The type of generative model. Only relevant if the model is generative. If "
+    "not specified, the type will be inferred automatically.",
+)
 def benchmark(
     model: tuple[str],
     dataset: tuple[str],

@@ -234,6 +242,7 @@ def benchmark(
     gpu_memory_utilization: float,
     debug: bool,
     requires_safetensors: bool,
+    generative_type: str | None,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
     models = list(model)

@@ -244,6 +253,9 @@ def benchmark(
     tasks = None if len(task) == 0 else list(task)
     batch_size_int = int(batch_size)
     device = Device[device.upper()] if device is not None else None
+    generative_type_obj = (
+        GenerativeType[generative_type.upper()] if generative_type else None
+    )
 
     benchmarker = Benchmarker(
         language=languages,

@@ -268,6 +280,7 @@ def benchmark(
         api_base=api_base,
         api_version=api_version,
         gpu_memory_utilization=gpu_memory_utilization,
+        generative_type=generative_type_obj,
         debug=debug,
         run_with_cli=True,
         requires_safetensors=requires_safetensors,
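
On the command line the same setting is exposed as `--generative-type`, with the choices `base`, `instruction_tuned` and `reasoning`, presumably invoked as something like `euroeval --model <model-id> --generative-type reasoning`. The string-to-enum conversion is exactly the one in the diff above:

```python
from euroeval.enums import GenerativeType

# What the CLI does with the option value before handing it to Benchmarker:
generative_type = "reasoning"
generative_type_obj = (
    GenerativeType[generative_type.upper()] if generative_type else None
)
assert generative_type_obj is GenerativeType.REASONING
```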
euroeval/constants.py CHANGED

@@ -15,7 +15,7 @@ MAX_CONTEXT_LENGTH = 8_192
 
 # We need to raise the amount of tokens generated for reasoning models, to give them
 # time to think
-REASONING_MAX_TOKENS =
+REASONING_MAX_TOKENS = 8_192
 
 
 # The Hugging Face Hub pipeline tags used to classify models as generative
|
euroeval/data_models.py
CHANGED
|
@@ -118,14 +118,14 @@ class Task:
|
|
|
118
118
|
log probabilities for the generated tokens. Defaults to False.
|
|
119
119
|
requires_logprobs (optional):
|
|
120
120
|
Whether the task requires log probabilities. Implies `uses_logprobs`.
|
|
121
|
-
|
|
121
|
+
default_allowed_model_types (optional):
|
|
122
122
|
A list of model types that are allowed to be evaluated on this task.
|
|
123
123
|
Defaults to all model types being allowed.
|
|
124
|
-
|
|
124
|
+
default_allowed_generative_types (optional):
|
|
125
125
|
A list of generative model types that are allowed to be evaluated on this
|
|
126
126
|
task. If None, all generative model types are allowed. Only relevant if
|
|
127
127
|
`allowed_model_types` includes generative models.
|
|
128
|
-
|
|
128
|
+
default_allow_invalid_model_outputs (optional):
|
|
129
129
|
Whether to allow invalid model outputs. This is only relevant for generative
|
|
130
130
|
models on classification tasks, where the model may generate an output
|
|
131
131
|
which is not one of the allowed labels. If True, the model output will be
|
|
@@ -144,17 +144,17 @@ class Task:
|
|
|
144
144
|
uses_structured_output: bool = False
|
|
145
145
|
uses_logprobs: bool = False
|
|
146
146
|
requires_logprobs: bool = False
|
|
147
|
-
|
|
147
|
+
default_allowed_model_types: list[ModelType] = field(
|
|
148
148
|
default_factory=lambda: [ModelType.ENCODER, ModelType.GENERATIVE]
|
|
149
149
|
)
|
|
150
|
-
|
|
150
|
+
default_allowed_generative_types: list[GenerativeType] = field(
|
|
151
151
|
default_factory=lambda: [
|
|
152
152
|
GenerativeType.BASE,
|
|
153
153
|
GenerativeType.INSTRUCTION_TUNED,
|
|
154
154
|
GenerativeType.REASONING,
|
|
155
155
|
]
|
|
156
156
|
)
|
|
157
|
-
|
|
157
|
+
default_allow_invalid_model_outputs: bool = True
|
|
158
158
|
|
|
159
159
|
def __post_init__(self) -> None:
|
|
160
160
|
"""Post-initialisation checks."""
|
|
@@ -225,6 +225,9 @@ class BenchmarkConfig:
|
|
|
225
225
|
Whether the benchmark is being run with the CLI.
|
|
226
226
|
requires_safetensors:
|
|
227
227
|
Whether to only allow models that use the safetensors format.
|
|
228
|
+
generative_type:
|
|
229
|
+
The type of generative model to benchmark. Only relevant if the model is
|
|
230
|
+
generative.
|
|
228
231
|
"""
|
|
229
232
|
|
|
230
233
|
model_languages: list[Language]
|
|
@@ -251,6 +254,7 @@ class BenchmarkConfig:
|
|
|
251
254
|
debug: bool
|
|
252
255
|
run_with_cli: bool
|
|
253
256
|
requires_safetensors: bool
|
|
257
|
+
generative_type: GenerativeType | None
|
|
254
258
|
|
|
255
259
|
|
|
256
260
|
class BenchmarkConfigParams(pydantic.BaseModel):
|
|
@@ -280,6 +284,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
|
|
|
280
284
|
api_base: str | None
|
|
281
285
|
api_version: str | None
|
|
282
286
|
gpu_memory_utilization: float
|
|
287
|
+
generative_type: GenerativeType | None
|
|
283
288
|
debug: bool
|
|
284
289
|
run_with_cli: bool
|
|
285
290
|
requires_safetensors: bool
|
|
@@ -402,6 +407,21 @@ class DatasetConfig:
|
|
|
402
407
|
to a 1:1 mapping between the labels and themselves. If None then the mapping
|
|
403
408
|
will be set to the default mapping for the task and language. Defaults to
|
|
404
409
|
None.
|
|
410
|
+
_allowed_model_types (optional):
|
|
411
|
+
A list of model types that are allowed to be evaluated on this dataset.
|
|
412
|
+
Defaults to the one for the task.
|
|
413
|
+
_allowed_generative_types (optional):
|
|
414
|
+
A list of generative model types that are allowed to be evaluated on this
|
|
415
|
+
dataset. If None, all generative model types are allowed. Only relevant if
|
|
416
|
+
`allowed_model_types` includes generative models. Defaults to the one for
|
|
417
|
+
the task.
|
|
418
|
+
_allow_invalid_model_outputs (optional):
|
|
419
|
+
Whether to allow invalid model outputs. This is only relevant for
|
|
420
|
+
generative models on classification tasks, where the model may generate an
|
|
421
|
+
output which is not one of the allowed labels. If True, the model output
|
|
422
|
+
will be mapped to the closest valid label. If False, the model output will
|
|
423
|
+
be considered incorrect and the evaluation will be aborted. Defaults to
|
|
424
|
+
the one for the task.
|
|
405
425
|
splits (optional):
|
|
406
426
|
The names of the splits in the dataset. If not provided, defaults to
|
|
407
427
|
["train", "val", "test"].
|
|
@@ -423,6 +443,9 @@ class DatasetConfig:
|
|
|
423
443
|
_max_generated_tokens: int | None = None
|
|
424
444
|
_labels: list[str] | None = None
|
|
425
445
|
_prompt_label_mapping: dict[str, str] | t.Literal["auto"] | None = None
|
|
446
|
+
_allowed_model_types: list[ModelType] | None = None
|
|
447
|
+
_allowed_generative_types: list[GenerativeType] | None = None
|
|
448
|
+
_allow_invalid_model_outputs: bool | None = None
|
|
426
449
|
splits: list[str] = field(default_factory=lambda: ["train", "val", "test"])
|
|
427
450
|
bootstrap_samples: bool = True
|
|
428
451
|
unofficial: bool = False
|
|
@@ -502,6 +525,33 @@ class DatasetConfig:
|
|
|
502
525
|
else:
|
|
503
526
|
return prompt_config.default_prompt_label_mapping
|
|
504
527
|
|
|
528
|
+
@property
|
|
529
|
+
def allowed_model_types(self) -> list[ModelType]:
|
|
530
|
+
"""A list of model types that are allowed to be evaluated on this dataset."""
|
|
531
|
+
return (
|
|
532
|
+
self._allowed_model_types
|
|
533
|
+
if self._allowed_model_types is not None
|
|
534
|
+
else self.task.default_allowed_model_types
|
|
535
|
+
)
|
|
536
|
+
|
|
537
|
+
@property
|
|
538
|
+
def allowed_generative_types(self) -> list[GenerativeType]:
|
|
539
|
+
"""A list of generative model types that are allowed on this dataset."""
|
|
540
|
+
return (
|
|
541
|
+
self._allowed_generative_types
|
|
542
|
+
if self._allowed_generative_types is not None
|
|
543
|
+
else self.task.default_allowed_generative_types
|
|
544
|
+
)
|
|
545
|
+
|
|
546
|
+
@property
|
|
547
|
+
def allow_invalid_model_outputs(self) -> bool:
|
|
548
|
+
"""Whether to allow invalid model outputs."""
|
|
549
|
+
return (
|
|
550
|
+
self._allow_invalid_model_outputs
|
|
551
|
+
if self._allow_invalid_model_outputs is not None
|
|
552
|
+
else self.task.default_allow_invalid_model_outputs
|
|
553
|
+
)
|
|
554
|
+
|
|
505
555
|
@property
|
|
506
556
|
def id2label(self) -> dict[int, str]:
|
|
507
557
|
"""The mapping from ID to label."""
|
|
@@ -573,6 +623,8 @@ class ModelConfig:
|
|
|
573
623
|
The ID of the model.
|
|
574
624
|
revision:
|
|
575
625
|
The revision of the model.
|
|
626
|
+
param:
|
|
627
|
+
The parameter of the model, or None if the model has no parameters.
|
|
576
628
|
task:
|
|
577
629
|
The task that the model was trained on.
|
|
578
630
|
languages:
|
|
@@ -594,6 +646,7 @@ class ModelConfig:
|
|
|
594
646
|
|
|
595
647
|
model_id: str
|
|
596
648
|
revision: str
|
|
649
|
+
param: str | None
|
|
597
650
|
task: str
|
|
598
651
|
languages: list[Language]
|
|
599
652
|
inference_backend: "InferenceBackend"
|
|
@@ -707,3 +760,21 @@ class PromptConfig:
|
|
|
707
760
|
default_prompt_template: str
|
|
708
761
|
default_instruction_prompt: str
|
|
709
762
|
default_prompt_label_mapping: dict[str, str] | t.Literal["auto"]
|
|
763
|
+
|
|
764
|
+
|
|
765
|
+
@dataclass
|
|
766
|
+
class ModelIdComponents:
|
|
767
|
+
"""A model ID split into its components.
|
|
768
|
+
|
|
769
|
+
Attributes:
|
|
770
|
+
model_id:
|
|
771
|
+
The main model ID without revision or parameters.
|
|
772
|
+
revision:
|
|
773
|
+
The revision of the model, if any.
|
|
774
|
+
param:
|
|
775
|
+
The parameter of the model, if any.
|
|
776
|
+
"""
|
|
777
|
+
|
|
778
|
+
model_id: str
|
|
779
|
+
revision: str
|
|
780
|
+
param: str | None
|
|
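
The new `ModelIdComponents` dataclass pairs with the `split_model_id` helper imported from `euroeval.utils` in the vllm.py section above. The release implies a `model-id@revision#param` convention: `@` already introduced the revision in 16.0.1, defaulting to "main" per the code it replaces, and `#` now introduces the parameter, as seen in `initial_logging`. Here is a hypothetical sketch of such a parser, with the dataclass inlined to keep the snippet self-contained; the real helper may differ in edge cases:

```python
from dataclasses import dataclass


@dataclass
class ModelIdComponents:
    """Mirror of the new dataclass above, inlined for self-containment."""

    model_id: str
    revision: str
    param: str | None


def split_model_id(model_id: str) -> ModelIdComponents:
    """Split `org/model@revision#param` into its components (assumed syntax)."""
    # Strip the `#param` suffix first, if present
    param: str | None = None
    if "#" in model_id:
        model_id, param = model_id.rsplit("#", 1)
    # Then the `@revision` suffix, defaulting to "main"
    revision = "main"
    if "@" in model_id:
        model_id, revision = model_id.rsplit("@", 1)
    return ModelIdComponents(model_id=model_id, revision=revision, param=param)


assert split_model_id("my-org/my-model@v1.0#thinking") == ModelIdComponents(
    model_id="my-org/my-model", revision="v1.0", param="thinking"
)
assert split_model_id("my-org/my-model") == ModelIdComponents(
    model_id="my-org/my-model", revision="main", param=None
)
```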
euroeval/dataset_configs/__init__.py CHANGED

@@ -15,6 +15,7 @@ from .icelandic import *  # noqa: F403
 from .italian import *  # noqa: F403
 from .latvian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
+from .polish import *  # noqa: F403
 from .portuguese import *  # noqa: F403
 from .spanish import *  # noqa: F403
 from .swedish import *  # noqa: F403
euroeval/dataset_configs/danish.py CHANGED

@@ -1,6 +1,7 @@
 """All Danish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import DA
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 

@@ -149,6 +150,19 @@ GOLDENSWAG_DA_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_DA_CONFIG = DatasetConfig(
+    name="winogrande-da",
+    pretty_name="the Danish common-sense reasoning dataset Winogrande-da, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-da",
+    task=COMMON_SENSE,
+    languages=[DA],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_DA_CONFIG = DatasetConfig(
     name="european-values-situational-da",
     pretty_name="the Danish version of the European values evaluation dataset, where "
euroeval/dataset_configs/dutch.py CHANGED

@@ -1,6 +1,7 @@
 """All Dutch dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import NL
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 

@@ -142,6 +143,19 @@ GOLDENSWAG_NL_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_NL_CONFIG = DatasetConfig(
+    name="winogrande-nl",
+    pretty_name="the Dutch common-sense reasoning dataset Winogrande-nl, translated "
+    "from the English Winogrande dataset",
+    huggingface_id="EuroEval/winogrande-nl",
+    task=COMMON_SENSE,
+    languages=[NL],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_NL_CONFIG = DatasetConfig(
     name="european-values-situational-nl",
     pretty_name="the Dutch version of the European values evaluation dataset, where "
euroeval/dataset_configs/english.py CHANGED

@@ -1,6 +1,7 @@
 """All English dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
+from ..enums import ModelType
 from ..languages import EN
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 

@@ -80,6 +81,15 @@ EUROPEAN_VALUES_EN_CONFIG = DatasetConfig(
 
 ### Unofficial datasets ###
 
+XQUAD_EN_CONFIG = DatasetConfig(
+    name="xquad-en",
+    pretty_name="the English version of the reading comprehension dataset XQuAD",
+    huggingface_id="EuroEval/xquad-en",
+    task=RC,
+    languages=[EN],
+    unofficial=True,
+)
+
 ARC_CONFIG = DatasetConfig(
     name="arc",
     pretty_name="the truncated version of the English knowledge dataset ARC",

@@ -117,6 +127,18 @@ MULTI_WIKI_QA_EN_CONFIG = DatasetConfig(
     unofficial=True,
 )
 
+WINOGRANDE_CONFIG = DatasetConfig(
+    name="winogrande",
+    pretty_name="the English common-sense reasoning dataset Winogrande",
+    huggingface_id="EuroEval/winogrande-en",
+    task=COMMON_SENSE,
+    languages=[EN],
+    splits=["train", "test"],
+    _labels=["a", "b"],
+    _allowed_model_types=[ModelType.GENERATIVE],
+    unofficial=True,
+)
+
 EUROPEAN_VALUES_SITUATIONAL_EN_CONFIG = DatasetConfig(
     name="european-values-situational-en",
     pretty_name="the English version of the European values evaluation dataset, where "