EuroEval 15.4.2-py3-none-any.whl → 15.6.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +2 -2
- euroeval/benchmark_modules/base.py +3 -2
- euroeval/benchmark_modules/fresh.py +8 -6
- euroeval/benchmark_modules/hf.py +44 -33
- euroeval/benchmark_modules/litellm.py +314 -120
- euroeval/benchmark_modules/vllm.py +99 -59
- euroeval/benchmarker.py +52 -21
- euroeval/callbacks.py +2 -2
- euroeval/constants.py +9 -2
- euroeval/data_models.py +258 -44
- euroeval/dataset_configs/__init__.py +61 -0
- euroeval/dataset_configs/danish.py +120 -0
- euroeval/dataset_configs/dutch.py +123 -0
- euroeval/dataset_configs/english.py +88 -0
- euroeval/dataset_configs/faroese.py +53 -0
- euroeval/dataset_configs/french.py +83 -0
- euroeval/dataset_configs/german.py +91 -0
- euroeval/dataset_configs/icelandic.py +148 -0
- euroeval/dataset_configs/italian.py +81 -0
- euroeval/dataset_configs/norwegian.py +178 -0
- euroeval/dataset_configs/spanish.py +78 -0
- euroeval/dataset_configs/swedish.py +100 -0
- euroeval/exceptions.py +10 -10
- euroeval/finetuning.py +6 -10
- euroeval/generation.py +1 -0
- euroeval/human_evaluation.py +2 -2
- euroeval/languages.py +20 -13
- euroeval/model_cache.py +1 -1
- euroeval/model_loading.py +1 -12
- euroeval/prompt_templates/__init__.py +8 -0
- euroeval/prompt_templates/linguistic_acceptability.py +112 -0
- euroeval/prompt_templates/multiple_choice.py +97 -0
- euroeval/prompt_templates/named_entity_recognition.py +257 -0
- euroeval/prompt_templates/reading_comprehension.py +118 -0
- euroeval/prompt_templates/sentiment_classification.py +137 -0
- euroeval/prompt_templates/summarization.py +97 -0
- euroeval/speed_benchmark.py +1 -1
- euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
- euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
- euroeval/{task_utils → task_group_utils}/sequence_classification.py +45 -10
- euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
- euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
- euroeval/tasks.py +54 -0
- euroeval/tokenization_utils.py +343 -0
- euroeval/types.py +3 -1
- euroeval/utils.py +5 -254
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/METADATA +31 -9
- euroeval-15.6.0.dist-info/RECORD +59 -0
- euroeval/dataset_configs.py +0 -2408
- euroeval-15.4.2.dist-info/RECORD +0 -40
- /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py
CHANGED
@@ -1,6 +1,7 @@
 """Generative models using the vLLM inference framework."""
 
 import collections.abc as c
+import contextlib
 import importlib.util
 import itertools as it
 import json
@@ -20,15 +21,18 @@ from datasets import DatasetDict
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
 from tqdm.auto import tqdm
-from transformers import AutoConfig
+from transformers.models.auto.configuration_auto import AutoConfig
+from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers.tokenization_utils import PreTrainedTokenizer
+from transformers.trainer import Trainer
 from urllib3.exceptions import RequestError
 
 from ..constants import (
     GENERATIVE_PIPELINE_TAGS,
+    MAX_CONTEXT_LENGTH,
     MAX_LOGPROBS,
     MERGE_TAGS,
     REASONING_MAX_TOKENS,
-    TASK_GROUPS_USING_LOGPROBS,
     TASKS_USING_JSON,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
@@ -53,39 +57,39 @@ from ..exceptions import (
     NeedsExtraInstalled,
 )
 from ..languages import get_all_languages
-from ..task_utils import (
+from ..task_group_utils import (
     question_answering,
     sequence_classification,
     text_to_text,
     token_classification,
 )
+from ..tokenization_utils import (
+    get_bos_token,
+    get_end_of_chat_token_ids,
+    get_eos_token,
+    get_first_label_token_mapping,
+    should_prompts_be_stripped,
+)
 from ..types import ExtractLabelsFunction
 from ..utils import (
     clear_memory,
     create_model_cache_dir,
-    get_bos_token,
-    get_end_of_chat_token_ids,
-    get_eos_token,
     get_min_cuda_compute_capability,
     log_once,
-    should_prompts_be_stripped,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
 
 if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
     from vllm import LLM, RequestOutput, SamplingParams
+    from vllm.distributed.parallel_state import (
+        destroy_distributed_environment,
+        destroy_model_parallel,
+    )
     from vllm.lora.request import LoRARequest
 
-    try:
-        from vllm.model_executor.parallel_utils.parallel_state import (
-            destroy_model_parallel,
-        )
-    except ImportError:
-        from vllm.distributed.parallel_state import destroy_model_parallel
-
 if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
     from outlines.models.vllm import adapt_tokenizer
-    from outlines.processors import JSONLogitsProcessor
+    from outlines.processors.structured import JSONLogitsProcessor
 
 if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
     import ray
@@ -122,11 +126,8 @@ class VLLMModel(HuggingFaceEncoderModel):
         ):
             raise NeedsExtraInstalled(extra="generative")
 
-        output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
         model, tokenizer = load_model_and_tokenizer(
-            model_config=model_config,
-            benchmark_config=benchmark_config,
-            output_scores=output_scores,
+            model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: LLM = model
         self._tokenizer: PreTrainedTokenizer = tokenizer
@@ -142,8 +143,12 @@ class VLLMModel(HuggingFaceEncoderModel):
             benchmark_config=benchmark_config,
         )
 
-        self.buffer
-
+        self.buffer |= dict(
+            instruction_model=self._tokenizer.chat_template is not None,
+            first_label_token_mapping=get_first_label_token_mapping(
+                dataset_config=self.dataset_config, tokenizer=self._tokenizer
+            ),
+        )
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
                 repo_id=self.model_config.model_id,
@@ -154,6 +159,14 @@ class VLLMModel(HuggingFaceEncoderModel):
                 lora_name="adapter", lora_int_id=1, lora_path=adapter_path
             )
 
+    def __del__(self) -> None:
+        """Clean up the model and tokenizer."""
+        clear_vllm()
+        if hasattr(self, "_model"):
+            del self._model
+        if hasattr(self, "_tokenizer"):
+            del self._tokenizer
+
     @property
     def generative_type(self) -> GenerativeType | None:
         """Get the generative type of the model.
@@ -185,6 +198,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
                 return text_to_text.extract_labels_from_generation
@@ -327,7 +341,7 @@ class VLLMModel(HuggingFaceEncoderModel):
             pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
             logits_processor = JSONLogitsProcessor(
                 schema=pydantic_class,
-                tokenizer=adapt_tokenizer(tokenizer=self._tokenizer),  #
+                tokenizer=adapt_tokenizer(tokenizer=self._tokenizer),  # type: ignore
                 whitespace_pattern=r" ?",
             )
             log_once(
@@ -338,6 +352,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         else:
             logits_processor = None
 
+        # Get the mapping from labels to the first token in the label. We call this each
+        # time we generate a new dataset since the dataset config can change
+        self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
+            dataset_config=self.dataset_config, tokenizer=self._tokenizer
+        )
+
         # Define the parameters used for vLLM generation
         max_tokens: int = (
             REASONING_MAX_TOKENS
@@ -346,7 +366,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         sampling_params = SamplingParams(
             max_tokens=max_tokens,
-            logprobs=MAX_LOGPROBS if self.buffer["
+            logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
             temperature=0.0,
             stop=[stop_token for stop_token in stop_tokens if stop_token],
             logits_processors=[logits_processor] if logits_processor else None,
@@ -416,7 +436,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         completions = [completion.strip() for completion in completions]
 
         # Add logprobs scores to the output
-        if self.buffer["
+        if self.buffer["first_label_token_mapping"]:
             scores: list[list[list[tuple[str, float]]]] = [
                 [
                     [
@@ -846,7 +866,7 @@
 
 
 def load_model_and_tokenizer(
-    model_config: ModelConfig, benchmark_config: BenchmarkConfig, output_scores: bool
+    model_config: ModelConfig, benchmark_config: BenchmarkConfig
 ) -> "tuple[LLM, PreTrainedTokenizer]":
     """Load the model and tokenizer.
 
@@ -855,11 +875,9 @@ def load_model_and_tokenizer(
             The model configuration.
         benchmark_config:
             The benchmark configuration.
-        output_scores:
-            Whether to output scores.
 
     Returns:
-
+        A pair (model, tokenizer), with the loaded model and tokenizer
     """
     # Prefer base model ID if the model is an adapter - the adapter will be added on
     # during inference in this case
@@ -893,7 +911,27 @@ def load_model_and_tokenizer(
     if quantization == "awq" and importlib.util.find_spec("awq") is None:
         raise NeedsExtraInstalled(extra="quantization")
 
+    # Start with dtype being the "auto" vLLM dtype
     dtype: str | torch.dtype = "auto"
+
+    # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
+    if hf_model_config.torch_dtype == torch.float32:
+        if torch.cuda.is_bf16_supported():
+            logger.info(
+                "You are loading a model with dtype FP32, which we will convert to "
+                "BF16 as FP32 is not supported by vLLM and BF16 is supported by your "
+                "GPU."
+            )
+            dtype = torch.bfloat16
+        else:
+            logger.info(
+                "You are loading a model with dtype FP32, which we will convert to "
+                "FP16 as FP32 is not supported by vLLM and BF16 is not supported by "
+                "your GPU."
+            )
+            dtype = torch.float16
+
+    # If the model is a quantized model, we need to set the dtype to float16
     if quantization is not None and hf_model_config.torch_dtype != torch.float16:
         logger.info(
             "You are loading a quantized model with dtype "
@@ -902,6 +940,7 @@ def load_model_and_tokenizer(
         )
         dtype = torch.float16
 
+    # If the model is a bf16 model, we need to check the CUDA compute capability
     if hf_model_config.torch_dtype == torch.bfloat16:
         min_cuda_compute_capability = get_min_cuda_compute_capability()
         required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
@@ -940,29 +979,38 @@ def load_model_and_tokenizer(
     if len(true_max_model_len_candidates) > 0:
         true_max_model_len = min(true_max_model_len_candidates)
     else:
-        true_max_model_len =
+        true_max_model_len = MAX_CONTEXT_LENGTH
 
-
+    tokenizer = load_tokenizer(
+        model_id=model_config.model_id,
+        revision=model_config.revision,
+        adapter_base_model_id=model_config.adapter_base_model_id,
+        trust_remote_code=benchmark_config.trust_remote_code,
+        model_max_length=true_max_model_len,
+        model_cache_dir=model_config.model_cache_dir,
+        token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+    )
 
-
+    clear_vllm()
 
     try:
         model = LLM(
             model=model_id,
             tokenizer=model_id,
-            gpu_memory_utilization=0.
-            max_model_len=min(true_max_model_len,
+            gpu_memory_utilization=0.9,
+            max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
             download_dir=download_dir,
             trust_remote_code=benchmark_config.trust_remote_code,
             revision=revision,
             seed=4242,
-            distributed_executor_backend=
+            distributed_executor_backend=(
+                "ray" if torch.cuda.device_count() > 1 else "mp"
+            ),
             tensor_parallel_size=torch.cuda.device_count(),
             disable_custom_all_reduce=True,
             quantization=quantization,
             dtype=dtype,
             enforce_eager=True,
-            max_logprobs=MAX_LOGPROBS if output_scores else None,
             # TEMP: Prefix caching isn't supported with sliding window in vLLM yet,
             # so we disable it for now
             enable_prefix_caching=False,
@@ -988,16 +1036,6 @@ def load_model_and_tokenizer(
     model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
     model.config = hf_model_config
 
-    tokenizer = load_tokenizer(
-        model_id=model_config.model_id,
-        revision=model_config.revision,
-        adapter_base_model_id=model_config.adapter_base_model_id,
-        trust_remote_code=benchmark_config.trust_remote_code,
-        model_max_length=true_max_model_len,
-        model_cache_dir=model_config.model_cache_dir,
-        token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
-    )
-
     return model, tokenizer
 
 
@@ -1118,13 +1156,16 @@ def _run_engine_with_fixed_progress_bars(
 
 def clear_vllm() -> None:
     """Clear the GPU memory used by the vLLM model, enabling re-initialisation."""
-
+    with contextlib.suppress(ValueError):
         destroy_model_parallel()
-
-
-
+        destroy_distributed_environment()
+    if ray.is_initialized():
+        ray.shutdown()
+    with contextlib.suppress(AssertionError):
+        torch.distributed.destroy_process_group()
     if ray.is_initialized():
         ray.shutdown()
+    clear_memory()
 
 
 def get_end_of_reasoning_token_id(
@@ -1148,24 +1189,23 @@ def get_end_of_reasoning_token_id(
     if tokenizer.chat_template is None:
         prompt = "What is your name?"
     else:
-
+        templated_prompt = tokenizer.apply_chat_template(
             conversation=[dict(role="user", content="What is your name?")],
             add_generation_prompt=True,
             tokenize=False,
         )
-
+        assert isinstance(templated_prompt, str)
+        prompt = templated_prompt
 
     # Generate a completion and remove the BOS token from it, to not confuse it with the
     # potential reasoning token
-
-
-
-
-        use_tqdm=False,
-    )[0]
-    .outputs[0]
-    .text
+    model_output = model.generate(
+        prompts=[prompt],
+        sampling_params=SamplingParams(max_tokens=3, temperature=0.0),
+        use_tqdm=False,
     )
+    completion = model_output[0].outputs[0].text
+
     if tokenizer.bos_token is not None:
         if isinstance(tokenizer.bos_token, str):
             prompt = prompt.replace(tokenizer.bos_token, "").strip()
euroeval/benchmarker.py
CHANGED
@@ -1,5 +1,6 @@
 """Class that benchmarks language models."""
 
+import contextlib
 import json
 import logging
 import re
@@ -13,7 +14,7 @@ from time import sleep
 from torch.distributed import destroy_process_group
 
 from .benchmark_config_factory import build_benchmark_config
-from .constants import GENERATIVE_PIPELINE_TAGS
+from .constants import GENERATIVE_DATASET_TASK_GROUPS, GENERATIVE_PIPELINE_TAGS
 from .data_loading import load_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
@@ -366,14 +367,18 @@
             dataset_names=benchmark_config.datasets
         )
 
+        total_benchmarks = len(model_ids) * len(dataset_configs)
+        num_finished_benchmarks = 0
+
         current_benchmark_results: list[BenchmarkResult] = list()
-        for
+        for model_id in model_ids:
             try:
                 model_config = get_model_config(
-                    model_id=
+                    model_id=model_id, benchmark_config=benchmark_config
                 )
             except InvalidModel as e:
                 logger.info(e.message)
+                num_finished_benchmarks += len(dataset_configs)
                 continue
 
             loaded_model: BenchmarkModule | None = None
@@ -381,21 +386,35 @@
                 # Skip if we have already benchmarked this model on this dataset and
                 # we are not forcing the benchmark
                 if not benchmark_config.force and model_has_been_benchmarked(
-                    model_id=
+                    model_id=model_id,
                     dataset=dataset_config.name,
                     few_shot=benchmark_config.few_shot,
                     validation_split=not benchmark_config.evaluate_test_split,
                     benchmark_results=self.benchmark_results,
                 ):
                     logger.debug(
-                        f"Skipping benchmarking {
-                        " as it
+                        f"Skipping benchmarking {model_id} on "
+                        f"{dataset_config.pretty_name}, as it "
+                        "has already been benchmarked."
+                    )
+                    num_finished_benchmarks += 1
+                    continue
+
+                # Skip if the model is an encoder model and the task is generative
+                task_is_generative = (
+                    dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
+                )
+                if model_config.model_type == ModelType.ENCODER and task_is_generative:
+                    logger.debug(
+                        f"Skipping benchmarking {model_id} on "
+                        f"{dataset_config.pretty_name}, as it is an encoder model and "
+                        "the task is generative."
                     )
                     continue
 
                 # We do not re-initialise generative models as their architecture is not
                 # customised to specific datasets
-                if model_config.
+                if model_config.model_type == ModelType.GENERATIVE:
                     initial_logging(
                         model_config=model_config,
                         dataset_config=dataset_config,
@@ -413,6 +432,15 @@
                     if benchmark_config.raise_errors:
                         raise e
                     logger.info(e.message)
+
+                    # Add the remaining number of benchmarks for the model to
+                    # our benchmark counter, since we're skipping the rest of
+                    # them
+                    num_finished_benchmarks += (
+                        len(dataset_configs)
+                        - dataset_configs.index(dataset_config)
+                        - 1
+                    )
                     break
                 else:
                     loaded_model.dataset_config = dataset_config
@@ -432,27 +460,33 @@
                     raise benchmark_output_or_err
 
                 elif isinstance(benchmark_output_or_err, InvalidBenchmark):
-
-
-                    logger.info(
-                        f"{m_id} could not be benchmarked on "
-                        f"{dataset_config.pretty_name}. Skipping. The error message "
-                        f"raised was {benchmark_output_or_err.message!r}."
-                    )
+                    logger.info(benchmark_output_or_err.message)
+                    num_finished_benchmarks += 1
                     continue
 
                 elif isinstance(benchmark_output_or_err, InvalidModel):
-                    if benchmark_config.raise_errors:
-                        raise benchmark_output_or_err
                     logger.info(benchmark_output_or_err.message)
+
+                    # Add the remaining number of benchmarks for the model to our
+                    # benchmark counter, since we're skipping the rest of them
+                    num_finished_benchmarks += (
+                        len(dataset_configs) - dataset_configs.index(dataset_config) - 1
+                    )
                     break
 
                 else:
-                    record = benchmark_output_or_err
+                    record: BenchmarkResult = benchmark_output_or_err
                     current_benchmark_results.append(record)
                     if benchmark_config.save_results:
                         record.append_to_results(results_path=self.results_path)
 
+                    num_finished_benchmarks += 1
+                    logger.info(
+                        f"Finished {num_finished_benchmarks} out of "
+                        f"{total_benchmarks} benchmarks."
+                    )
+
+            del loaded_model
             if benchmark_config.clear_model_cache:
                 clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
 
@@ -464,11 +498,8 @@
         # point and block the progress of another member of the process group. This
        # constraint has always been present, but this warning has only been added
         # since PyTorch 2.4 (function operator())
-
+        with contextlib.suppress(AssertionError):
            destroy_process_group()
-        except AssertionError:
-            pass
-
        return current_benchmark_results

    def _get_updated_benchmark_config(
euroeval/callbacks.py
CHANGED
@@ -5,8 +5,8 @@ from collections.abc import Sized
 
 from torch.utils.data import DataLoader
 from tqdm.auto import tqdm
-from transformers import TrainerControl, TrainerState
-from transformers.
+from transformers.trainer_callback import ProgressCallback, TrainerControl, TrainerState
+from transformers.training_args import TrainingArguments
 
 
 class NeverLeaveProgressCallback(ProgressCallback):
euroeval/constants.py
CHANGED
@@ -7,6 +7,13 @@ from .tasks import NER
 DUMMY_FILL_VALUE = 100
 
 
+# This is the maximum allowed context length for models for the purpose of this
+# benchmark. We will still report the models' true maximum context length in the
+# metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
+# all tokens in the context.
+MAX_CONTEXT_LENGTH = 5_000
+
+
 # We need to raise the amount of tokens generated for reasoning models, to give them
 # time to think
 REASONING_MAX_TOKENS = 8_192
@@ -44,10 +51,10 @@ TASK_GROUPS_USING_LOGPROBS = [
 
 # The number of top log probabilities to return for generative models. For several APIs
 # this is the maximum number of log probabilities that can be returned
-MAX_LOGPROBS =
+MAX_LOGPROBS = 8
 
 
-# We make sure to remove these metric
+# We make sure to remove these metric attributes after each iteration, to avoid memory
 # leaks
 METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
 
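The new MAX_CONTEXT_LENGTH constant ties into the vllm.py hunk above, where the engine's context window is capped with min(true_max_model_len, MAX_CONTEXT_LENGTH). A tiny illustration, with an assumed model-advertised context length:

MAX_CONTEXT_LENGTH = 5_000

true_max_model_len = 131_072  # assumed value read from a model config
effective_max_model_len = min(true_max_model_len, MAX_CONTEXT_LENGTH)
print(effective_max_model_len)  # 5000, keeping vLLM's KV-cache allocation bounded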