EuroEval 15.4.1__py3-none-any.whl → 15.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval has been flagged as possibly problematic by the registry.
- euroeval/__init__.py +2 -2
- euroeval/benchmark_modules/hf.py +79 -39
- euroeval/benchmark_modules/litellm.py +204 -74
- euroeval/benchmark_modules/vllm.py +106 -42
- euroeval/benchmarker.py +35 -6
- euroeval/constants.py +11 -1
- euroeval/data_models.py +6 -2
- euroeval/dataset_configs.py +6 -6
- euroeval/task_utils/sequence_classification.py +70 -30
- euroeval/types.py +3 -3
- euroeval/utils.py +131 -32
- {euroeval-15.4.1.dist-info → euroeval-15.5.0.dist-info}/METADATA +6 -4
- {euroeval-15.4.1.dist-info → euroeval-15.5.0.dist-info}/RECORD +16 -16
- {euroeval-15.4.1.dist-info → euroeval-15.5.0.dist-info}/WHEEL +0 -0
- {euroeval-15.4.1.dist-info → euroeval-15.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.4.1.dist-info → euroeval-15.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py CHANGED

@@ -25,11 +25,12 @@ from urllib3.exceptions import RequestError
 
 from ..constants import (
     GENERATIVE_PIPELINE_TAGS,
+    MAX_CONTEXT_LENGTH,
     MAX_LOGPROBS,
     MERGE_TAGS,
     REASONING_MAX_TOKENS,
-    TASK_GROUPS_USING_LOGPROBS,
     TASKS_USING_JSON,
+    VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
 from ..data_models import (
     BenchmarkConfig,

@@ -65,6 +66,8 @@ from ..utils import (
     get_bos_token,
     get_end_of_chat_token_ids,
     get_eos_token,
+    get_first_label_token_mapping,
+    get_min_cuda_compute_capability,
     log_once,
     should_prompts_be_stripped,
 )

@@ -120,11 +123,8 @@ class VLLMModel(HuggingFaceEncoderModel):
         ):
             raise NeedsExtraInstalled(extra="generative")
 
-        output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
         model, tokenizer = load_model_and_tokenizer(
-            model_config=model_config,
-            benchmark_config=benchmark_config,
-            output_scores=output_scores,
+            model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: LLM = model
         self._tokenizer: PreTrainedTokenizer = tokenizer

@@ -140,11 +140,16 @@ class VLLMModel(HuggingFaceEncoderModel):
             benchmark_config=benchmark_config,
         )
 
-        self.buffer
-
+        self.buffer |= dict(
+            instruction_model=self._tokenizer.chat_template is not None,
+            first_label_token_mapping=get_first_label_token_mapping(
+                dataset_config=self.dataset_config, tokenizer=self._tokenizer
+            ),
+        )
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
                 repo_id=self.model_config.model_id,
+                revision=self.model_config.revision,
                 cache_dir=Path(self.model_config.model_cache_dir),
             )
             self.buffer["lora_request"] = LoRARequest(

@@ -182,6 +187,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
                 return text_to_text.extract_labels_from_generation

@@ -335,6 +341,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         else:
             logits_processor = None
 
+        # Get the mapping from labels to the first token in the label. We call this each
+        # time we generate a new dataset since the dataset config can change
+        self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
+            dataset_config=self.dataset_config, tokenizer=self._tokenizer
+        )
+
         # Define the parameters used for vLLM generation
         max_tokens: int = (
             REASONING_MAX_TOKENS
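The `get_first_label_token_mapping` helper referenced above lives in `euroeval/utils.py`, whose diff is not shown here; judging from its use in this file and in `sequence_classification.py`, it returns either a mapping from each candidate label to that label's first token, or a boolean flag controlling whether logprobs are requested at all. A rough sketch of how such a mapping could be built with a Hugging Face tokenizer (the function name and details below are illustrative, not EuroEval's actual implementation):

from transformers import AutoTokenizer


def build_first_label_token_mapping(labels: list[str], tokenizer) -> dict[str, str]:
    """Map each lower-cased label to the text of its first token (illustrative sketch)."""
    mapping: dict[str, str] = {}
    for label in labels:
        token_ids = tokenizer(label.lower(), add_special_tokens=False)["input_ids"]
        first_token = tokenizer.convert_ids_to_tokens(token_ids[:1])[0]
        # Strip common subword markers (SentencePiece "▁", byte-level BPE "Ġ")
        mapping[label.lower()] = first_token.lstrip("▁Ġ")
    return mapping


# Hypothetical usage with lower-cased sentiment labels
tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(build_first_label_token_mapping(["positive", "negative", "neutral"], tokenizer))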
@@ -343,7 +355,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         sampling_params = SamplingParams(
             max_tokens=max_tokens,
-            logprobs=MAX_LOGPROBS if self.buffer["
+            logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
             temperature=0.0,
             stop=[stop_token for stop_token in stop_tokens if stop_token],
             logits_processors=[logits_processor] if logits_processor else None,

@@ -373,12 +385,27 @@ class VLLMModel(HuggingFaceEncoderModel):
 
         # Generate sequences using vLLM
         input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
-
-
-
-
-
-
+        num_attempts = 3
+        for _ in range(num_attempts):
+            try:
+                raw_outputs = self._model.generate(
+                    prompts=prompts,
+                    sampling_params=sampling_params,
+                    use_tqdm=(not input_is_a_test),
+                    lora_request=self.buffer.get("lora_request"),
+                )
+                break
+            except TypeError as e:
+                logger.debug(
+                    f"Encountered error during vLLM generation: {str(e)}. Retrying..."
+                )
+                sleep(1)
+        else:
+            raise InvalidBenchmark(
+                f"Could not generate sequences after {num_attempts} attempts."
+            )
+
+        # Parse the raw model outputs
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
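The new generation call wraps `LLM.generate` in a retry loop that relies on Python's for/else semantics: the `else` branch runs only when the loop finishes without hitting `break`, i.e. when every attempt failed. A minimal standalone sketch of the same pattern (the function and exception names are illustrative, not EuroEval APIs):

import time


def call_with_retries(generate, num_attempts: int = 3):
    """Call `generate` up to `num_attempts` times, sleeping briefly between failures."""
    for _ in range(num_attempts):
        try:
            result = generate()
            break  # success: skip the for/else below
        except TypeError as exc:
            print(f"Encountered error during generation: {exc}. Retrying...")
            time.sleep(1)
    else:
        # Only reached when the loop was never broken, i.e. all attempts failed
        raise RuntimeError(f"Could not generate sequences after {num_attempts} attempts.")
    return result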
@@ -398,7 +425,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         completions = [completion.strip() for completion in completions]
 
         # Add logprobs scores to the output
-        if self.buffer["
+        if self.buffer["first_label_token_mapping"]:
             scores: list[list[list[tuple[str, float]]]] = [
                 [
                     [

@@ -828,7 +855,7 @@ class VLLMModel(HuggingFaceEncoderModel):
 
 
 def load_model_and_tokenizer(
-    model_config: ModelConfig, benchmark_config: BenchmarkConfig
+    model_config: ModelConfig, benchmark_config: BenchmarkConfig
 ) -> "tuple[LLM, PreTrainedTokenizer]":
     """Load the model and tokenizer.
 

@@ -837,22 +864,23 @@ def load_model_and_tokenizer(
             The model configuration.
         benchmark_config:
             The benchmark configuration.
-        output_scores:
-            Whether to output scores.
 
     Returns:
-
+        A pair (model, tokenizer), with the loaded model and tokenizer
     """
     # Prefer base model ID if the model is an adapter - the adapter will be added on
     # during inference in this case
     model_id = model_config.adapter_base_model_id or model_config.model_id
+    revision = (
+        model_config.revision if model_config.adapter_base_model_id is None else "main"
+    )
 
     hf_model_config = load_hf_model_config(
         model_id=model_id,
         num_labels=0,
         id2label=dict(),
         label2id=dict(),
-        revision=
+        revision=revision,
         model_cache_dir=model_config.model_cache_dir,
         api_key=benchmark_config.api_key,
         trust_remote_code=benchmark_config.trust_remote_code,

@@ -872,7 +900,27 @@ def load_model_and_tokenizer(
     if quantization == "awq" and importlib.util.find_spec("awq") is None:
         raise NeedsExtraInstalled(extra="quantization")
 
+    # Start with dtype being the "auto" vLLM dtype
     dtype: str | torch.dtype = "auto"
+
+    # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
+    if hf_model_config.torch_dtype == torch.float32:
+        if torch.cuda.is_bf16_supported():
+            logger.info(
+                "You are loading a model with dtype FP32, which we will convert to "
+                "BF16 as FP32 is not supported by vLLM and BF16 is supported by your "
+                "GPU."
+            )
+            dtype = torch.bfloat16
+        else:
+            logger.info(
+                "You are loading a model with dtype FP32, which we will convert to "
+                "FP16 as FP32 is not supported by vLLM and BF16 is not supported by "
+                "your GPU."
+            )
+            dtype = torch.float16
+
+    # If the model is a quantized model, we need to set the dtype to float16
     if quantization is not None and hf_model_config.torch_dtype != torch.float16:
         logger.info(
             "You are loading a quantized model with dtype "

@@ -881,6 +929,24 @@ def load_model_and_tokenizer(
         )
         dtype = torch.float16
 
+    # If the model is a bf16 model, we need to check the CUDA compute capability
+    if hf_model_config.torch_dtype == torch.bfloat16:
+        min_cuda_compute_capability = get_min_cuda_compute_capability()
+        required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
+
+        if min_cuda_compute_capability is not None:
+            if min_cuda_compute_capability < required_capability:
+                logger.info(
+                    "You are loading a model with "
+                    f"dtype {hf_model_config.torch_dtype}, "
+                    "which vLLM only supports for CUDA devices with"
+                    f"CUDA compute capability >={required_capability}. "
+                    "You are using one or more devices with "
+                    f"compute capability {min_cuda_compute_capability}. "
+                    "Setting dtype to float16 instead."
+                )
+                dtype = torch.float16
+
     if model_config.adapter_base_model_id is not None:
         download_dir = str(Path(model_config.model_cache_dir) / "base_model")
     else:
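Like `get_first_label_token_mapping`, the new `get_min_cuda_compute_capability` helper is defined in `euroeval/utils.py` and its body is not part of this diff. A plausible sketch, assuming it simply takes the minimum compute capability across the visible CUDA devices via PyTorch and returns None when no GPU is available:

import torch


def min_cuda_compute_capability() -> float | None:
    """Return the smallest compute capability among visible CUDA devices, or None.

    Illustrative sketch; the real helper in euroeval/utils.py may differ.
    """
    if not torch.cuda.is_available():
        return None
    capabilities = [
        torch.cuda.get_device_capability(device_idx)
        for device_idx in range(torch.cuda.device_count())
    ]
    # (major, minor) tuples become floats such as 8.0 or 7.5, matching the
    # VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0 constant added in constants.py
    return min(major + minor / 10 for major, minor in capabilities)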
@@ -902,7 +968,17 @@ def load_model_and_tokenizer(
     if len(true_max_model_len_candidates) > 0:
         true_max_model_len = min(true_max_model_len_candidates)
     else:
-        true_max_model_len =
+        true_max_model_len = MAX_CONTEXT_LENGTH
+
+    tokenizer = load_tokenizer(
+        model_id=model_config.model_id,
+        revision=model_config.revision,
+        adapter_base_model_id=model_config.adapter_base_model_id,
+        trust_remote_code=benchmark_config.trust_remote_code,
+        model_max_length=true_max_model_len,
+        model_cache_dir=model_config.model_cache_dir,
+        token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+    )
 
     clear_vllm()
 

@@ -913,10 +989,10 @@ def load_model_and_tokenizer(
         model=model_id,
         tokenizer=model_id,
         gpu_memory_utilization=0.95,
-        max_model_len=min(true_max_model_len,
+        max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
         download_dir=download_dir,
         trust_remote_code=benchmark_config.trust_remote_code,
-        revision=
+        revision=revision,
         seed=4242,
         distributed_executor_backend=executor_backend,
         tensor_parallel_size=torch.cuda.device_count(),

@@ -924,7 +1000,6 @@ def load_model_and_tokenizer(
         quantization=quantization,
         dtype=dtype,
         enforce_eager=True,
-        max_logprobs=MAX_LOGPROBS if output_scores else None,
         # TEMP: Prefix caching isn't supported with sliding window in vLLM yet,
         # so we disable it for now
         enable_prefix_caching=False,

@@ -950,16 +1025,6 @@ def load_model_and_tokenizer(
     model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
     model.config = hf_model_config
 
-    tokenizer = load_tokenizer(
-        model_id=model_config.model_id,
-        revision=model_config.revision,
-        adapter_base_model_id=model_config.adapter_base_model_id,
-        trust_remote_code=benchmark_config.trust_remote_code,
-        model_max_length=true_max_model_len,
-        model_cache_dir=model_config.model_cache_dir,
-        token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
-    )
-
     return model, tokenizer
 
 

@@ -994,6 +1059,7 @@ def load_tokenizer(
     Returns:
         The loaded tokenizer.
     """
+    revision = revision if adapter_base_model_id is None else "main"
     config = AutoConfig.from_pretrained(
         adapter_base_model_id or model_id,
         revision=revision,

@@ -1118,15 +1184,13 @@ def get_end_of_reasoning_token_id(
 
     # Generate a completion and remove the BOS token from it, to not confuse it with the
    # potential reasoning token
-
-
-
-
-        use_tqdm=False,
-    )[0]
-    .outputs[0]
-    .text
+    model_output = model.generate(
+        prompts=[prompt],
+        sampling_params=SamplingParams(max_tokens=3, temperature=0.0),
+        use_tqdm=False,
     )
+    completion = model_output[0].outputs[0].text
+
     if tokenizer.bos_token is not None:
         if isinstance(tokenizer.bos_token, str):
             prompt = prompt.replace(tokenizer.bos_token, "").strip()
euroeval/benchmarker.py CHANGED

@@ -366,14 +366,18 @@ class Benchmarker:
             dataset_names=benchmark_config.datasets
         )
 
+        total_benchmarks = len(model_ids) * len(dataset_configs)
+        num_finished_benchmarks = 0
+
         current_benchmark_results: list[BenchmarkResult] = list()
-        for
+        for model_id in model_ids:
             try:
                 model_config = get_model_config(
-                    model_id=
+                    model_id=model_id, benchmark_config=benchmark_config
                 )
             except InvalidModel as e:
                 logger.info(e.message)
+                num_finished_benchmarks += len(dataset_configs)
                 continue
 
             loaded_model: BenchmarkModule | None = None

@@ -381,16 +385,18 @@ class Benchmarker:
                 # Skip if we have already benchmarked this model on this dataset and
                 # we are not forcing the benchmark
                 if not benchmark_config.force and model_has_been_benchmarked(
-                    model_id=
+                    model_id=model_id,
                     dataset=dataset_config.name,
                     few_shot=benchmark_config.few_shot,
                     validation_split=not benchmark_config.evaluate_test_split,
                     benchmark_results=self.benchmark_results,
                 ):
                     logger.debug(
-                        f"Skipping benchmarking {
-                        " as it
+                        f"Skipping benchmarking {model_id} on "
+                        f"{dataset_config.pretty_name}, as it "
+                        "has already been benchmarked."
                     )
+                    num_finished_benchmarks += 1
                     continue
 
                 # We do not re-initialise generative models as their architecture is not

@@ -413,6 +419,15 @@ class Benchmarker:
                     if benchmark_config.raise_errors:
                         raise e
                     logger.info(e.message)
+
+                    # Add the remaining number of benchmarks for the model to
+                    # our benchmark counter, since we're skipping the
+                    # rest of them
+                    num_finished_benchmarks += (
+                        len(dataset_configs)
+                        - dataset_configs.index(dataset_config)
+                        - 1
+                    )
                     break
                 else:
                     loaded_model.dataset_config = dataset_config

@@ -435,16 +450,24 @@ class Benchmarker:
                     if benchmark_config.raise_errors:
                         raise benchmark_output_or_err
                     logger.info(
-                        f"{
+                        f"{model_id} could not be benchmarked on "
                         f"{dataset_config.pretty_name}. Skipping. The error message "
                         f"raised was {benchmark_output_or_err.message!r}."
                     )
+                    num_finished_benchmarks += 1
                     continue
 
                 elif isinstance(benchmark_output_or_err, InvalidModel):
                     if benchmark_config.raise_errors:
                         raise benchmark_output_or_err
                     logger.info(benchmark_output_or_err.message)
+
+                    # Add the remaining number of benchmarks for the model to
+                    # our benchmark counter, since we're skipping the
+                    # rest of them
+                    num_finished_benchmarks += (
+                        len(dataset_configs) - dataset_configs.index(dataset_config) - 1
+                    )
                     break
 
                 else:

@@ -453,6 +476,12 @@ class Benchmarker:
                     if benchmark_config.save_results:
                         record.append_to_results(results_path=self.results_path)
 
+                num_finished_benchmarks += 1
+                logger.info(
+                    f"Finished {num_finished_benchmarks} out of "
+                    f"{total_benchmarks} benchmarks."
+                )
+
             if benchmark_config.clear_model_cache:
                 clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
 
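Taken together, the additions in this file implement a simple progress counter over the model × dataset grid: skipped or failed combinations still advance the counter, and a model-level failure fast-forwards past that model's remaining datasets. A condensed sketch of the same bookkeeping pattern (the exception classes and `benchmark` callable are illustrative stand-ins, not EuroEval APIs):

class ModelError(Exception):
    """Stand-in for a failure that invalidates a model for all remaining datasets."""


class DatasetError(Exception):
    """Stand-in for a failure that only affects a single model/dataset combination."""


def run_grid(model_ids: list[str], dataset_configs: list[str], benchmark) -> None:
    """Run `benchmark(model_id, dataset_config)` over the full grid with a progress log."""
    total_benchmarks = len(model_ids) * len(dataset_configs)
    num_finished_benchmarks = 0
    for model_id in model_ids:
        for dataset_config in dataset_configs:
            try:
                benchmark(model_id, dataset_config)
            except ModelError:
                # Count this model's remaining datasets as handled and move on
                num_finished_benchmarks += (
                    len(dataset_configs) - dataset_configs.index(dataset_config)
                )
                break
            except DatasetError:
                num_finished_benchmarks += 1
                continue
            num_finished_benchmarks += 1
            print(
                f"Finished {num_finished_benchmarks} out of {total_benchmarks} benchmarks."
            )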
euroeval/constants.py CHANGED

@@ -7,6 +7,13 @@ from .tasks import NER
 DUMMY_FILL_VALUE = 100
 
 
+# This is the maximum allowed context length for models for the purpose of this
+# benchmark. We will still report the models' true maximum context length in the
+# metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
+# all tokens in the context.
+MAX_CONTEXT_LENGTH = 5_000
+
+
 # We need to raise the amount of tokens generated for reasoning models, to give them
 # time to think
 REASONING_MAX_TOKENS = 8_192

@@ -47,10 +54,13 @@ TASK_GROUPS_USING_LOGPROBS = [
 MAX_LOGPROBS = 10
 
 
-# We make sure to remove these metric
+# We make sure to remove these metric attributes after each iteration, to avoid memory
 # leaks
 METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
 
 
 # Hugging Face Hub tags used to classify models as merge models
 MERGE_TAGS = ["merge", "mergekit"]
+
+# The minimum required CUDA compute capability for using bfloat16 in vLLM
+VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
euroeval/data_models.py CHANGED

@@ -1,7 +1,6 @@
 """Data models used in EuroEval."""
 
 import collections.abc as c
-import importlib.metadata
 import json
 import pathlib
 import re

@@ -13,6 +12,7 @@ import torch
 
 from .enums import Device, InferenceBackend, ModelType, TaskGroup
 from .types import ScoreDict
+from .utils import get_package_version
 
 
 @dataclass

@@ -228,7 +228,11 @@ class BenchmarkResult(pydantic.BaseModel):
     generative_type: str | None
     few_shot: bool
     validation_split: bool
-    euroeval_version: str =
+    euroeval_version: str | None = get_package_version("euroeval")
+    transformers_version: str | None = get_package_version("transformers")
+    torch_version: str | None = get_package_version("torch")
+    vllm_version: str | None = get_package_version("vllm")
+    outlines_version: str | None = get_package_version("outlines")
 
     @classmethod
     def from_dict(cls, config: dict) -> "BenchmarkResult":
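`get_package_version` is another new helper in `euroeval/utils.py`, not shown in this diff. Given that it replaces the direct `importlib.metadata` usage removed above and that the new fields are typed `str | None`, it presumably amounts to something like the following sketch:

import importlib.metadata


def get_package_version(package_name: str) -> str | None:
    """Return the installed version of `package_name`, or None if it is not installed.

    Illustrative sketch; the real helper may differ.
    """
    try:
        return importlib.metadata.version(package_name)
    except importlib.metadata.PackageNotFoundError:
        return None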
euroeval/dataset_configs.py CHANGED

@@ -244,7 +244,7 @@ FOSENT_CONFIG = DatasetConfig(
 ALLOCINE_CONFIG = DatasetConfig(
     name="allocine",
     pretty_name="the truncated version of the French sentiment classification "
-    "dataset
+    "dataset AlloCiné",
     huggingface_id="EuroEval/allocine-mini",
     task=SENT,
     languages=[FR],

@@ -1467,9 +1467,9 @@ NORDJYLLAND_NEWS_CONFIG = DatasetConfig(
     max_generated_tokens=256,
 )
 
-
-    name="mlsum",
-    pretty_name="the truncated version of the German summarisation dataset MLSum",
+MLSUM_DE_CONFIG = DatasetConfig(
+    name="mlsum-de",
+    pretty_name="the truncated version of the German summarisation dataset MLSum-de",
     huggingface_id="EuroEval/mlsum-mini",
     task=SUMM,
     languages=[DE],

@@ -1484,7 +1484,7 @@ MLSUM_CONFIG = DatasetConfig(
 
 MLSUM_ES_CONFIG = DatasetConfig(
     name="mlsum-es",
-    pretty_name="the truncated version of the Spanish summarisation dataset MLSum",
+    pretty_name="the truncated version of the Spanish summarisation dataset MLSum-es",
     huggingface_id="EuroEval/mlsum-es-mini",
     task=SUMM,
     languages=[ES],

@@ -1643,7 +1643,7 @@ ORANGE_SUM_CONFIG = DatasetConfig(
 
 ILPOST_SUM_CONFIG = DatasetConfig(
     name="ilpost-sum",
-    pretty_name="the truncated version of the Italian summarisation dataset IlPost",
+    pretty_name="the truncated version of the Italian summarisation dataset IlPost-Sum",
     huggingface_id="EuroEval/ilpost-sum",
     task=SUMM,
     languages=[IT],
euroeval/task_utils/sequence_classification.py CHANGED

@@ -10,6 +10,7 @@ import numpy as np
 from evaluate import EvaluationModule
 
 from ..data_models import BenchmarkConfig, GenerativeModelOutput
+from ..exceptions import InvalidBenchmark
 from ..utils import log_once, raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:

@@ -110,6 +111,7 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: GenerativeModelOutput,
     dataset_config: "DatasetConfig",
+    first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
     """Extract the predicted labels from the generated output.
 

@@ -121,13 +123,19 @@ def extract_labels_from_generation(
             The raw generated output of the model.
         dataset_config:
             The configuration of the dataset.
+        first_label_token_mapping:
+            A mapping from labels to the first token in each label, or alternatively a
+            Boolean value indicating whether the model should output scores (if the
+            mapping is outputted then the model will always output scores).
 
     Returns:
         The predicted labels.
     """
     if model_output.scores is not None:
         return get_closest_logprobs_labels(
-            generation_logprobs=model_output.scores,
+            generation_logprobs=model_output.scores,
+            dataset_config=dataset_config,
+            first_label_token_mapping=first_label_token_mapping,
         )
     else:
         return get_closest_word_edit_labels(

@@ -138,6 +146,7 @@ def extract_labels_from_generation(
 def get_closest_logprobs_labels(
     generation_logprobs: list[list[list[tuple[str, float]]]],
     dataset_config: "DatasetConfig",
+    first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
     """Get the labels with the highest predicted logprob value.
 

@@ -152,6 +161,10 @@ def get_closest_logprobs_labels(
             (batch_size, num_tokens, num_logprobs).
         dataset_config:
             The configuration of the dataset.
+        first_label_token_mapping:
+            A mapping from labels to the first token in each label, or alternatively a
+            Boolean value indicating whether the model should output scores (if the
+            mapping is outputted then the model will always output scores).
 
     Returns:
         The predicted labels.

@@ -162,8 +175,7 @@ def get_closest_logprobs_labels(
     """
     english_labels = list(dataset_config.id2label.values())
     english2local = dataset_config.prompt_label_mapping
-
-    candidate_labels = local_labels + english_labels
+    candidate_labels = [english2local[lbl].lower() for lbl in english_labels]
 
     output_labels: list[str] = list()
     for sample in generation_logprobs:

@@ -182,38 +194,66 @@ def get_closest_logprobs_labels(
         # label, as the output label
         output_label: str | None = None
         previously_generated_labels: list[str] = list()
-        for generated_label in generated_labels:
+        for label_idx, generated_label in enumerate(generated_labels):
             generated_label = "".join(previously_generated_labels) + generated_label
 
-            # Get the candidate labels that
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Get the candidate labels that starts with the generated label
+            if isinstance(first_label_token_mapping, dict):
+                if any(
+                    candidate_label not in first_label_token_mapping
+                    for candidate_label in candidate_labels
+                ):
+                    raise InvalidBenchmark(
+                        "There is a label not present in the first label token "
+                        "mapping - this should never happen! Please report this "
+                        "issue to the EuroEval team at "
+                        "github.com/EuroEval/EuroEval/issues."
+                    )
+
+                candidate_output_labels = {
+                    candidate_label
+                    for candidate_label in candidate_labels
+                    if generated_label == first_label_token_mapping[candidate_label]
+                }
+            else:
+                candidate_output_labels = {
+                    candidate_label
+                    for candidate_label in candidate_labels
+                    if candidate_label.startswith(generated_label)
+                }
+
+            # If we can uniquely determine the output label, we break the loop. If
+            # there are multiple possible labels then we store the current one, and
+            # concatenate it with the next generated label. We can only do this if
+            # the current one is the first one, however, since we're using greedy
+            # sampling. In case this happens for a label that is not the first one,
+            # we warn the user.
+            if len(candidate_output_labels) == 1:
+                output_label = candidate_output_labels.pop()
+                break
+            elif len(candidate_output_labels) > 1:
+                if label_idx == 0:
                     previously_generated_labels.append(generated_label)
+                else:
+                    output_label = candidate_output_labels.pop()
+                    candidate_output_labels.add(output_label)
+                    raise InvalidBenchmark(
+                        "Multiple candidate labels found for the generated label "
+                        f"{generated_label!r}: {candidate_output_labels}. Since "
+                        "this is not the first generated label, we cannot "
+                        "concatenate it with the next generated label. We are thus "
+                        f"forced to use the arbitrary {output_label!r} as the "
+                        "output label, potentially resulting in worse performance. "
+                        "Please report this issue to the EuroEval team at "
+                        "github.com/EuroEval/EuroEval/issues."
+                    )
+            elif len(candidate_output_labels) == 0:
+                logger.debug(
+                    f"No candidate label found for the generated label "
+                    f"{generated_label!r}. The generated label is thus ignored."
+                )
 
         if output_label is not None:
-            output_label = english2local.get(output_label, output_label)
             output_labels.append(output_label)
             break
         else:
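To see what the mapping buys, consider lower-cased local labels "positiv", "negativ" and "neutral": with prefix matching, a first generated token of "ne" is compatible with both "negativ" and "neutral", so the code has to concatenate the next token before deciding, whereas an exact match against each label's known first token usually resolves the label in one step. A small illustrative reimplementation of just the matching rule above (the token strings in the example mapping are assumed, not taken from a real tokenizer):

def candidate_labels_for(
    generated_label: str,
    candidate_labels: list[str],
    first_label_token_mapping: dict[str, str] | bool,
) -> set[str]:
    """Return the candidate labels compatible with the generated (partial) label."""
    if isinstance(first_label_token_mapping, dict):
        # Exact match against each label's known first token
        return {
            label
            for label in candidate_labels
            if generated_label == first_label_token_mapping[label]
        }
    # Fallback: plain prefix matching on the label text
    return {label for label in candidate_labels if label.startswith(generated_label)}


labels = ["positiv", "negativ", "neutral"]
mapping = {"positiv": "pos", "negativ": "neg", "neutral": "neut"}  # assumed tokenisation
print(candidate_labels_for("neg", labels, mapping))  # {'negativ'}
print(candidate_labels_for("ne", labels, True))      # {'negativ', 'neutral'}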
euroeval/types.py CHANGED

@@ -8,9 +8,9 @@ if t.TYPE_CHECKING:
     from .data_models import GenerativeModelOutput
 
 
-ScoreDict = dict[str, dict[str, float] | list[dict[str, float]]]
-Predictions = NDArray | list[str] | list[list[str]]
-Labels = NDArray | list[str] | list[list[str]]
+ScoreDict: t.TypeAlias = dict[str, dict[str, float] | list[dict[str, float]]]
+Predictions: t.TypeAlias = NDArray | list[str] | list[list[str]]
+Labels: t.TypeAlias = NDArray | list[str] | list[list[str]]
 
 
 class ComputeMetricsFunction(t.Protocol):
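Annotating these module-level assignments with `t.TypeAlias` tells type checkers that the names are type aliases rather than ordinary variables, so they can safely be used in annotations elsewhere. A small self-contained illustration of the idiom, using a simplified `Predictions` alias without the NDArray branch:

import typing as t

# With the annotation, type checkers treat the name as a type alias rather than a
# module-level variable assignment.
Predictions: t.TypeAlias = list[str] | list[list[str]]


def count_predictions(predictions: Predictions) -> int:
    """Count the total number of predicted items, flattening one level if needed."""
    return sum(len(item) if isinstance(item, list) else 1 for item in predictions)


print(count_predictions(["a", "b"]))           # 2
print(count_predictions([["a", "b"], ["c"]]))  # 3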