EuroEval 16.0.0-py3-none-any.whl → 16.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +5 -0
- euroeval/benchmark_config_factory.py +6 -1
- euroeval/benchmark_modules/base.py +2 -0
- euroeval/benchmark_modules/fresh.py +7 -1
- euroeval/benchmark_modules/hf.py +26 -21
- euroeval/benchmark_modules/litellm.py +258 -131
- euroeval/benchmark_modules/vllm.py +120 -68
- euroeval/benchmarker.py +11 -2
- euroeval/cli.py +14 -1
- euroeval/constants.py +7 -1
- euroeval/data_models.py +95 -20
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +14 -3
- euroeval/dataset_configs/dutch.py +14 -0
- euroeval/dataset_configs/english.py +22 -0
- euroeval/dataset_configs/estonian.py +15 -7
- euroeval/dataset_configs/finnish.py +14 -0
- euroeval/dataset_configs/french.py +14 -0
- euroeval/dataset_configs/german.py +23 -0
- euroeval/dataset_configs/italian.py +14 -0
- euroeval/dataset_configs/latvian.py +14 -0
- euroeval/dataset_configs/norwegian.py +14 -0
- euroeval/dataset_configs/polish.py +126 -0
- euroeval/dataset_configs/portuguese.py +14 -0
- euroeval/dataset_configs/spanish.py +14 -0
- euroeval/dataset_configs/swedish.py +25 -0
- euroeval/enums.py +12 -0
- euroeval/generation.py +17 -8
- euroeval/generation_utils.py +102 -16
- euroeval/metrics/pipeline.py +51 -9
- euroeval/model_cache.py +13 -1
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +27 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +11 -0
- euroeval/prompt_templates/sentiment_classification.py +15 -0
- euroeval/prompt_templates/summarization.py +27 -1
- euroeval/scores.py +5 -0
- euroeval/task_group_utils/multiple_choice_classification.py +2 -2
- euroeval/task_group_utils/question_answering.py +29 -29
- euroeval/task_group_utils/sequence_classification.py +71 -81
- euroeval/task_group_utils/token_classification.py +17 -3
- euroeval/tasks.py +12 -10
- euroeval/{tokenization_utils.py → tokenisation_utils.py} +41 -25
- euroeval/utils.py +67 -3
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/METADATA +3 -1
- euroeval-16.1.0.dist-info/RECORD +70 -0
- euroeval-16.0.0.dist-info/RECORD +0 -69
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/WHEEL +0 -0
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.0.0.dist-info → euroeval-16.1.0.dist-info}/licenses/LICENSE +0 -0

euroeval/benchmark_modules/vllm.py CHANGED

@@ -44,7 +44,11 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
-from ..generation_utils import
+from ..generation_utils import (
+    apply_prompt,
+    extract_few_shot_examples,
+    raise_if_wrong_params,
+)
 from ..languages import get_all_languages
 from ..task_group_utils import (
     question_answering,
@@ -52,7 +56,7 @@ from ..task_group_utils import (
     text_to_text,
     token_classification,
 )
-from ..tokenization_utils import (
+from ..tokenisation_utils import (
     apply_chat_template,
     get_bos_token,
     get_end_of_chat_token_ids,
@@ -69,6 +73,7 @@ from ..utils import (
     get_hf_token,
     get_min_cuda_compute_capability,
     log_once,
+    split_model_id,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
 
@@ -97,6 +102,7 @@ class VLLMModel(HuggingFaceEncoderModel):
     fresh_model = False
     batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = True
+    allowed_params = {re.compile(r".*"): ["thinking", "no-thinking"]}
 
     def __init__(
         self,
@@ -120,42 +126,46 @@ class VLLMModel(HuggingFaceEncoderModel):
         if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")
 
+        raise_if_wrong_params(
+            model_config=model_config, allowed_params=self.allowed_params
+        )
+
         model, tokeniser = load_model_and_tokeniser(
             model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: "LLM" = model
         self._tokeniser: "PreTrainedTokenizer" = tokeniser
+
+        # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
+        # to call the `__init__` method of the `BenchmarkModule` class.
+        super(HuggingFaceEncoderModel, self).__init__(
+            model_config=model_config,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
+        )
+
         self.end_of_reasoning_token = get_end_of_reasoning_token(
             model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
         )
         self.end_of_chat_token_ids = get_end_of_chat_token_ids(
-            tokeniser=self._tokeniser
+            tokeniser=self._tokeniser, generative_type=self.generative_type
         )
         self.custom_stop_tokens = get_custom_stop_tokens(
             model=self._model,
             tokeniser=self._tokeniser,
             model_id=model_config.model_id,
-
-        )
-
-        # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
-        # to call the `__init__` method of the `BenchmarkModule` class.
-        super(HuggingFaceEncoderModel, self).__init__(
-            model_config=model_config,
-            dataset_config=dataset_config,
-            benchmark_config=benchmark_config,
-            log_metadata=log_metadata,
+            generative_type=self.generative_type,
         )
 
         self.buffer |= dict(
-            instruction_model=has_chat_template(tokeniser=self._tokeniser),
             first_label_token_mapping=get_first_label_token_mapping(
                 dataset_config=self.dataset_config,
                 model_config=self.model_config,
                 tokeniser=self._tokeniser,
                 generative_type=self.generative_type,
                 log_metadata=self.log_metadata,
-            )
+            )
         )
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
@@ -187,16 +197,36 @@ class VLLMModel(HuggingFaceEncoderModel):
             The generative type of the model, or None if it has not been set yet.
         """
         if not hasattr(self, "_tokeniser"):
+            log_once(
+                "The generative type of the model has not been set yet as the "
+                "tokeniser has not been loaded.",
+                level=logging.DEBUG,
+            )
             return None
-        elif self.
-
+        elif self.benchmark_config.generative_type is not None:
+            type_ = self.benchmark_config.generative_type
+        elif self.model_config.param in {"thinking"}:
+            type_ = GenerativeType.REASONING
+        elif self.model_config.param in {"no-thinking"}:
+            type_ = GenerativeType.INSTRUCTION_TUNED
+        elif (
+            hasattr(self, "end_of_reasoning_token")
+            and self.end_of_reasoning_token is not None
+        ):
+            type_ = GenerativeType.REASONING
         elif (
             has_chat_template(tokeniser=self._tokeniser)
             or "instruct" in self.model_config.model_id.lower()
         ):
-
+            type_ = GenerativeType.INSTRUCTION_TUNED
         else:
-
+            type_ = GenerativeType.BASE
+        log_once(
+            f"Detected generative type {type_.name!r} for model "
+            f"{self.model_config.model_id!r}",
+            level=logging.DEBUG,
+        )
+        return type_
 
     @property
     def extract_labels_from_generation(self) -> ExtractLabelsFunction:
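The property above resolves the generative type through an explicit precedence chain. The following standalone function is a condensed restatement of that order for reference only; the parameter names are illustrative and this is not the code that ships in the wheel:

from euroeval.enums import GenerativeType

def resolve_generative_type(
    benchmark_override: GenerativeType | None,
    model_param: str | None,
    end_of_reasoning_token: str | None,
    has_chat_template: bool,
    model_id: str,
) -> GenerativeType:
    """Sketch of the precedence used by the new `generative_type` property."""
    if benchmark_override is not None:
        # 1. An explicit --generative-type / Benchmarker argument wins.
        return benchmark_override
    if model_param == "thinking":
        # 2. A `#thinking` suffix on the model ID forces reasoning mode.
        return GenerativeType.REASONING
    if model_param == "no-thinking":
        return GenerativeType.INSTRUCTION_TUNED
    if end_of_reasoning_token is not None:
        # 3. A detected end-of-reasoning token implies a reasoning model.
        return GenerativeType.REASONING
    if has_chat_template or "instruct" in model_id.lower():
        # 4. A chat template or "instruct" in the ID implies instruction tuning.
        return GenerativeType.INSTRUCTION_TUNED
    return GenerativeType.BASE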
@@ -285,7 +315,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 few_shot_examples=few_shot_examples,
                 model_config=self.model_config,
                 dataset_config=self.dataset_config,
-
+                generative_type=self.generative_type,
                 always_populate_text_field=True,
                 tokeniser=self._tokeniser,
             ),
@@ -313,7 +343,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         """
         # Get stopping tokens
         stop_tokens: list[str] = self.custom_stop_tokens.copy()
-        if self.
+        if self.generative_type == GenerativeType.BASE:
             stop_tokens.append("\n\n")
         if self._tokeniser.pad_token_id is not None:
             assert isinstance(self._tokeniser.pad_token, str), (
@@ -337,31 +367,6 @@ class VLLMModel(HuggingFaceEncoderModel):
         if end_of_chat_token:
             stop_tokens.append(end_of_chat_token)
 
-        structured_generation_schema = None
-        if self.dataset_config.task.uses_structured_output:
-            if self.generative_type == GenerativeType.REASONING:
-                log_once(
-                    f"The model {self.model_config.model_id!r} is a reasoning model "
-                    "and thus does not support structured generation, so we do not "
-                    "enable it.",
-                    level=logging.DEBUG,
-                )
-            else:
-                ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-                keys_and_their_types: dict[str, t.Any] = {
-                    tag_name: (conlist(str, max_length=5), ...)
-                    for tag_name in ner_tag_names
-                }
-                answer_format_class = create_model(
-                    "AnswerFormat", **keys_and_their_types
-                )
-                structured_generation_schema = answer_format_class.model_json_schema()
-                log_once(
-                    "Using structured generation with the JSON schema "
-                    f"{structured_generation_schema}",
-                    level=logging.DEBUG,
-                )
-
         # Get the mapping from labels to the first token in the label. We call this each
         # time we generate a new dataset since the dataset config can change
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
@@ -382,8 +387,29 @@ class VLLMModel(HuggingFaceEncoderModel):
                 "error was. Skipping this evaluation."
             )
 
-
-        if
+        structured_generation_schema = None
+        if (
+            self.dataset_config.task.uses_structured_output
+            or (self.dataset_config.task.uses_logprobs and self.dataset_config.labels)
+        ) and self.generative_type == GenerativeType.REASONING:
+            guided_decoding = None
+            logger.debug(
+                "The dataset uses structured output, but we are not using it as the "
+                "model is a reasoning model."
+            )
+        elif self.dataset_config.task.uses_structured_output:
+            ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+            keys_and_their_types: dict[str, t.Any] = {
+                tag_name: (conlist(str, max_length=5), ...)
+                for tag_name in ner_tag_names
+            }
+            answer_format_class = create_model("AnswerFormat", **keys_and_their_types)
+            structured_generation_schema = answer_format_class.model_json_schema()
+            log_once(
+                "Using structured generation with the JSON schema: "
+                f"{json.dumps(structured_generation_schema)}",
+                level=logging.DEBUG,
+            )
             guided_decoding = GuidedDecodingParams(json=structured_generation_schema)
         elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
             guided_decoding = GuidedDecodingParams(
@@ -392,8 +418,17 @@ class VLLMModel(HuggingFaceEncoderModel):
                     for label in self.dataset_config.labels
                 ]
             )
+            log_once(
+                "Using structured generation with the choices: "
+                f"{guided_decoding.choice!r}.",
+                level=logging.DEBUG,
+            )
         else:
             guided_decoding = None
+            log_once(
+                "Not using structured generation as the dataset does not require it.",
+                level=logging.DEBUG,
+            )
 
         # Define the parameters used for vLLM generation
         max_tokens: int = (
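For context, the `AnswerFormat` class built in the NER branch above is an ordinary dynamically created Pydantic model, and its JSON schema is what gets handed to vLLM via `GuidedDecodingParams(json=...)`, while classification datasets instead constrain decoding to the label set via `GuidedDecodingParams(choice=[...])`. A minimal, self-contained sketch of the schema construction, with made-up tag names standing in for `dataset_config.prompt_label_mapping.values()`:

import json
from pydantic import conlist, create_model

# Hypothetical NER tag names; in EuroEval these come from the dataset's
# prompt_label_mapping values.
ner_tag_names = ["person", "location", "organisation"]

# Each tag maps to a list of at most five strings, mirroring the diff above.
keys_and_their_types = {
    tag_name: (conlist(str, max_length=5), ...) for tag_name in ner_tag_names
}
AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)

# This JSON schema is what guided decoding is pointed at.
print(json.dumps(AnswerFormat.model_json_schema(), indent=2))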
@@ -425,9 +460,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         labels_to_be_generated = list(self.dataset_config.prompt_label_mapping.values())
         if len(labels_to_be_generated) == 0:
             labels_to_be_generated = ["negative", "positive"]
-        if
-            "instruction_model", False
-        ) and should_prompts_be_stripped(
+        if self.generative_type == GenerativeType.BASE and should_prompts_be_stripped(
             labels_to_be_generated=labels_to_be_generated, tokeniser=self._tokeniser
         ):
             log_once(
@@ -439,6 +472,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         # Generate sequences using vLLM
         input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
         num_attempts = 3
+        truncation_attempts = 0
         for _ in range(num_attempts):
             try:
                 raw_outputs = self._model.generate(
@@ -466,12 +500,19 @@ class VLLMModel(HuggingFaceEncoderModel):
                     "Prompts are too long, so truncating them and trying again..."
                 )
                 logger.debug(f"The error message was: {str(e)}")
+
+                # If we have already tried truncating the prompts a few times, then
+                # we truncate a bit more aggressively
+                extra_truncation = 50 * truncation_attempts
+                truncation_attempts += 1
+
                 tokenized_prompts = self._tokeniser(
                     text=prompts,
                     truncation=True,
                     max_length=max(
                         min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH)
-                        - max_tokens,
+                        - max_tokens
+                        - extra_truncation,
                         0,
                     ),
                 )
@@ -577,9 +618,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         if using_api:
             return False
 
-
-
-
+        model_id_components = split_model_id(model_id=model_id)
+        model_id = model_id_components.model_id
+        revision = model_id_components.revision
+
 
         model_info = get_model_repo_info(
             model_id=model_id, revision=revision, benchmark_config=benchmark_config
@@ -603,11 +645,11 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The model configuration.
         """
-        model_id, revision = (
-            model_id.split("@") if "@" in model_id else (model_id, "main")
-        )
+        model_id_components = split_model_id(model_id=model_id)
         model_info = get_model_repo_info(
-            model_id=model_id,
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            benchmark_config=benchmark_config,
         )
         if model_info is None:
             raise InvalidModel(f"The model {model_id!r} could not be found.")
@@ -616,8 +658,9 @@ class VLLMModel(HuggingFaceEncoderModel):
         language_codes = list(language_mapping.keys())
 
         model_config = ModelConfig(
-            model_id=model_id,
-            revision=revision,
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            param=model_id_components.param,
             task=model_info.pipeline_tag,
             languages=[
                 language_mapping[tag]
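`split_model_id` itself lives in `euroeval/utils.py`, which is not part of this excerpt. Judging from how its result is consumed here and in `benchmarker.py` below (model IDs of the form `model@revision#param`), it plausibly behaves like the sketch below; the class name, field names and defaults are assumptions rather than the actual implementation:

from dataclasses import dataclass

@dataclass
class ModelIdComponents:
    # Hypothetical container; the real helper may use a different structure.
    model_id: str
    revision: str
    param: str | None

def split_model_id(model_id: str) -> ModelIdComponents:
    """Split 'org/model@revision#param' into its components (sketch)."""
    param: str | None = None
    if "#" in model_id:
        model_id, param = model_id.rsplit("#", 1)
    revision = "main"
    if "@" in model_id:
        model_id, revision = model_id.rsplit("@", 1)
    return ModelIdComponents(model_id=model_id, revision=revision, param=param)

# e.g. split_model_id("Qwen/Qwen3-8B@main#no-thinking") would yield
# ModelIdComponents(model_id="Qwen/Qwen3-8B", revision="main", param="no-thinking")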
@@ -972,7 +1015,11 @@ def get_end_of_reasoning_token(
     prompt = "What is your name?"
     if has_chat_template(tokeniser=tokeniser):
         templated_prompt = apply_chat_template(
-            conversation=[dict(role="user", content=prompt)],
+            conversation=[dict(role="user", content=prompt)],
+            tokeniser=tokeniser,
+            tokenise=False,
+            add_generation_prompt=True,
+            enable_thinking=True,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
@@ -1050,7 +1097,7 @@ def get_custom_stop_tokens(
     model: "LLM",
     tokeniser: "PreTrainedTokenizer",
     model_id: str,
-
+    generative_type: GenerativeType | None,
 ) -> list[str]:
     """Get the stop tokens for a generative model.
 
@@ -1061,9 +1108,8 @@ def get_custom_stop_tokens(
             The tokeniser.
         model_id:
            The model ID.
-
-
-            of generated tokens to allow before stopping the generation.
+        generative_type:
+            The generative type of the model.
 
     Returns:
         A list of stop tokens.
@@ -1073,12 +1119,18 @@ def get_custom_stop_tokens(
     prompt = "Hello"
     if has_chat_template(tokeniser=tokeniser):
         templated_prompt = apply_chat_template(
-            conversation=[dict(role="user", content=prompt)],
+            conversation=[dict(role="user", content=prompt)],
+            tokeniser=tokeniser,
+            tokenise=False,
+            add_generation_prompt=True,
+            enable_thinking=generative_type == GenerativeType.REASONING,
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
 
-    max_tokens =
+    max_tokens = (
+        REASONING_MAX_TOKENS if generative_type == GenerativeType.REASONING else 10
+    )
     completion = (
         model.generate(
             prompts=[prompt],
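The `enable_thinking` flag threaded through `apply_chat_template` above corresponds to the switch that some chat templates (notably Qwen3-style ones) expose for toggling reasoning traces. With plain `transformers` the equivalent call looks roughly like the sketch below; the model name is only an example, and note that EuroEval's own wrapper spells the argument `tokenise` while Hugging Face uses `tokenize`:

from transformers import AutoTokenizer

# Example model; any model whose chat template honours `enable_thinking` works.
tokeniser = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")

prompt = tokeniser.apply_chat_template(
    [dict(role="user", content="Hello")],
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # extra kwargs are forwarded to the Jinja chat template
)
print(prompt)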

euroeval/benchmarker.py CHANGED

@@ -19,7 +19,7 @@ from .constants import GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
-from .enums import Device, ModelType
+from .enums import Device, GenerativeType, ModelType
 from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
 from .finetuning import finetune
 from .generation import generate
@@ -79,6 +79,7 @@ class Benchmarker:
         api_base: str | None = None,
         api_version: str | None = None,
         gpu_memory_utilization: float = 0.9,
+        generative_type: GenerativeType | None = None,
         debug: bool = False,
         run_with_cli: bool = False,
         requires_safetensors: bool = False,
@@ -151,6 +152,10 @@ class Benchmarker:
                 is generative. A larger value will result in faster evaluation, but at
                 the risk of running out of GPU memory. Only reduce this if you are
                 running out of GPU memory. Defaults to 0.9.
+            generative_type:
+                The type of generative model to benchmark. Only relevant if the model is
+                generative. If not specified, then the type will be inferred based on
+                the tags of the model. Defaults to None.
             debug:
                 Whether to output debug information. Defaults to False.
             run_with_cli:
@@ -199,6 +204,7 @@ class Benchmarker:
             api_base=api_base,
             api_version=api_version,
             gpu_memory_utilization=gpu_memory_utilization,
+            generative_type=generative_type,
             debug=debug,
             run_with_cli=run_with_cli,
             requires_safetensors=requires_safetensors,
@@ -438,7 +444,7 @@ class Benchmarker:
 
         # Skip if the model type should not be benchmarked on this dataset
         model_type = model_config.model_type
-        allowed_model_types = dataset_config.
+        allowed_model_types = dataset_config.allowed_model_types
         if model_type not in allowed_model_types:
             logger.debug(
                 f"Skipping benchmarking {model_id} on "
@@ -804,6 +810,7 @@ class Benchmarker:
             scores=scores,
             model_id=model_config.model_id,
             model_revision=model_config.revision,
+            model_param=model_config.param,
         )
 
         record = BenchmarkResult(
@@ -1108,6 +1115,8 @@ def initial_logging(
     model_id = model_config.model_id
     if model_config.revision and model_config.revision != "main":
         model_id += f"@{model_config.revision}"
+    if model_config.param is not None:
+        model_id += f"#{model_config.param}"
 
     split_type = "validation" if not benchmark_config.evaluate_test_split else "test"
     if model_config.task in GENERATIVE_PIPELINE_TAGS:

euroeval/cli.py CHANGED

@@ -4,7 +4,7 @@ import click
 
 from .benchmarker import Benchmarker
 from .dataset_configs import get_all_dataset_configs
-from .enums import Device
+from .enums import Device, GenerativeType
 from .languages import get_all_languages
 from .tasks import get_all_tasks
 
@@ -208,6 +208,14 @@ from .tasks import get_all_tasks
     help="Only allow loading models that have safetensors weights available",
     default=False,
 )
+@click.option(
+    "--generative-type",
+    type=click.Choice(["base", "instruction_tuned", "reasoning"]),
+    default=None,
+    show_default=True,
+    help="The type of generative model. Only relevant if the model is generative. If "
+    "not specified, the type will be inferred automatically.",
+)
 def benchmark(
     model: tuple[str],
     dataset: tuple[str],
@@ -234,6 +242,7 @@ def benchmark(
     gpu_memory_utilization: float,
     debug: bool,
     requires_safetensors: bool,
+    generative_type: str | None,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
     models = list(model)
@@ -244,6 +253,9 @@ def benchmark(
     tasks = None if len(task) == 0 else list(task)
     batch_size_int = int(batch_size)
     device = Device[device.upper()] if device is not None else None
+    generative_type_obj = (
+        GenerativeType[generative_type.upper()] if generative_type else None
+    )
 
     benchmarker = Benchmarker(
         language=languages,
@@ -268,6 +280,7 @@ def benchmark(
         api_base=api_base,
         api_version=api_version,
         gpu_memory_utilization=gpu_memory_utilization,
+        generative_type=generative_type_obj,
         debug=debug,
         run_with_cli=True,
         requires_safetensors=requires_safetensors,

euroeval/constants.py CHANGED

@@ -15,7 +15,7 @@ MAX_CONTEXT_LENGTH = 8_192
 
 # We need to raise the amount of tokens generated for reasoning models, to give them
 # time to think
-REASONING_MAX_TOKENS =
+REASONING_MAX_TOKENS = 8_192
 
 
 # The Hugging Face Hub pipeline tags used to classify models as generative
@@ -75,3 +75,9 @@ LITELLM_CLASSIFICATION_OUTPUT_KEY = "label"
 
 # These characters are stripped from JSON output when trying to identify the label
 JSON_STRIP_CHARACTERS = ' {}\n\r":'
+
+
+# The number of tokens we generate when evaluating generative models on classification
+# tasks. We also use this to determine whether we should store logprobs in the model
+# outputs (and cache).
+NUM_GENERATION_TOKENS_FOR_CLASSIFICATION = 10
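Taken together, the changes to `cli.py` and `benchmarker.py` above expose the new override in two ways: on the command line (e.g. something like `euroeval --model <model-id> --generative-type reasoning`, using the option added in this release) and through the Python API. A hypothetical Python invocation, assuming the usual `Benchmarker.benchmark` entry point and leaving the model ID as a placeholder:

from euroeval import Benchmarker
from euroeval.enums import GenerativeType

# Treat the model as a reasoning model instead of relying on auto-detection.
# All other Benchmarker arguments keep their defaults in this sketch.
benchmarker = Benchmarker(generative_type=GenerativeType.REASONING)
benchmarker.benchmark(model="<model-id>")  # model ID left as a placeholder

A per-model alternative introduced in the same release is the `#thinking` / `#no-thinking` suffix on the model ID, which `split_model_id` parses and the `generative_type` property honours, as shown in the `vllm.py` hunks above.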