EuroEval 15.8.2-py3-none-any.whl → 15.9.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +14 -0
- euroeval/benchmark_config_factory.py +0 -31
- euroeval/benchmark_modules/hf.py +26 -13
- euroeval/benchmark_modules/vllm.py +70 -2
- euroeval/benchmarker.py +0 -21
- euroeval/cli.py +0 -10
- euroeval/data_models.py +0 -5
- euroeval/exceptions.py +0 -22
- euroeval/human_evaluation.py +0 -1
- {euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/METADATA +3 -5
- {euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/RECORD +14 -14
- {euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/WHEEL +0 -0
- {euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/entry_points.txt +0 -0
- {euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED

@@ -3,6 +3,7 @@
 ### STAGE 1 ###
 ### Block unwanted terminal output that happens on importing external modules ###
 
+import importlib.util
 import logging
 import os
 import sys
@@ -27,6 +28,19 @@ logging.basicConfig(
 
 
 ### STAGE 2 ###
+### Check for incompatible packages ###
+
+# Throw informative error if `flash_attn` is installed ###
+if importlib.util.find_spec("flash_attn") is not None:
+    logging.critical(
+        "The `flash_attn` package is not supported by EuroEval, as it is now built "
+        "into the other packages and it conflicts with the other implementations. "
+        "Please uninstall it using `pip uninstall flash_attn` and try again."
+    )
+    sys.exit(1)
+
+
+### STAGE 3 ###
 ### Set the rest up ###
 
 import importlib.metadata  # noqa: E402
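The new import-time guard relies on `importlib.util.find_spec`, which looks a package up on `sys.path` without importing it, so the check cannot itself crash on a broken install. A minimal standalone sketch of the same pattern (independent of EuroEval's actual startup code):

    import importlib.util
    import sys

    # Detect an incompatible package without importing it.
    if importlib.util.find_spec("flash_attn") is not None:
        print("flash_attn is installed; please run `pip uninstall flash_attn`.", file=sys.stderr)
        sys.exit(1)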
euroeval/benchmark_config_factory.py
CHANGED

@@ -1,6 +1,5 @@
 """Factory class for creating dataset configurations."""
 
-import importlib.util
 import logging
 import sys
 import typing as t
@@ -13,7 +12,6 @@ from .enums import Device
 from .exceptions import InvalidBenchmark
 from .languages import get_all_languages
 from .tasks import SPEED, get_all_tasks
-from .utils import log_once
 
 if t.TYPE_CHECKING:
     from .data_models import Language, Task
@@ -38,7 +36,6 @@ def build_benchmark_config(
     force: bool,
     verbose: bool,
     trust_remote_code: bool,
-    use_flash_attention: bool | None,
     clear_model_cache: bool,
     evaluate_test_split: bool,
     few_shot: bool,
@@ -92,9 +89,6 @@ def build_benchmark_config(
             automatically set if `debug` is True.
         trust_remote_code:
             Whether to trust remote code when running the benchmark.
-        use_flash_attention:
-            Whether to use Flash Attention for the models. If None then it will be used
-            if it is available.
         clear_model_cache:
             Whether to clear the model cache before running the benchmark.
         evaluate_test_split:
@@ -135,30 +129,6 @@ def build_benchmark_config(
 
     torch_device = prepare_device(device=device)
 
-    if use_flash_attention is None:
-        if torch_device.type != "cuda":
-            use_flash_attention = False
-        elif (
-            importlib.util.find_spec("flash_attn") is None
-            and importlib.util.find_spec("vllm_flash_attn") is None
-        ):
-            use_flash_attention = False
-            if first_time and torch_device.type == "cuda":
-                message = (
-                    "Flash attention has not been installed, so this will not be used. "
-                    "To install it, run `pip install -U wheel && "
-                    "FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn "
-                    "--no-build-isolation`. Alternatively, you can disable this "
-                    "message by setting "
-                )
-                if run_with_cli:
-                    message += "the flag `--no-use-flash-attention`."
-                else:
-                    message += (
-                        "the argument `use_flash_attention=False` in the `Benchmarker`."
-                    )
-                log_once(message=message, level=logging.INFO)
-
     # Set variable with number of iterations
     if hasattr(sys, "_called_from_test"):
         num_iterations = 1
@@ -178,7 +148,6 @@ def build_benchmark_config(
         verbose=verbose or debug,
         device=torch_device,
         trust_remote_code=trust_remote_code,
-        use_flash_attention=use_flash_attention,
        clear_model_cache=clear_model_cache,
        evaluate_test_split=evaluate_test_split,
        few_shot=few_shot,
euroeval/benchmark_modules/hf.py
CHANGED

@@ -54,13 +54,11 @@ from ..enums import (
     TaskGroup,
 )
 from ..exceptions import (
-    HuggingFaceHubDown,
     InvalidBenchmark,
     InvalidModel,
     NeedsAdditionalArgument,
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
-    NoInternetConnection,
 )
 from ..languages import get_all_languages
 from ..task_group_utils import (
@@ -737,9 +735,10 @@ def get_model_repo_info(
         model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
 
     # If the model does not exist locally, then we get the model info from the Hugging
-    # Face Hub
+    # Face Hub, if possible
     if model_info is None:
         num_attempts = 3
+        errors: list[Exception] = list()
         for _ in range(num_attempts):
             try:
                 model_info = hf_api.model_info(
@@ -749,25 +748,37 @@ def get_model_repo_info(
             except (GatedRepoError, LocalTokenNotFoundError) as e:
                 try:
                     hf_whoami(token=token)
-                    logger.
+                    logger.debug(
                         f"Could not access the model {model_id} with the revision "
                         f"{revision}. The error was {str(e)!r}."
                     )
                     return None
                 except LocalTokenNotFoundError:
-
-
-
-
+                    logger.debug(
+                        f"Could not access the model {model_id} with the revision "
+                        f"{revision}. The error was {str(e)!r}. Please set the "
+                        "`HUGGINGFACE_API_KEY` environment variable or use the "
+                        "`--api-key` argument."
                     )
+                    return None
             except (RepositoryNotFoundError, HFValidationError):
                 return None
-            except (OSError, RequestException):
+            except (OSError, RequestException) as e:
                 if internet_connection_available():
+                    errors.append(e)
                     continue
-
+                logger.debug(
+                    "Could not access the Hugging Face Hub. Please check your internet "
+                    "connection."
+                )
+                return None
         else:
-
+            logger.debug(
+                f"Could not access model info for the model {model_id!r} from the "
+                f"Hugging Face Hub, after {num_attempts} attempts. The errors "
+                f"encountered were {errors!r}."
+            )
+            return None
 
     # Get all the Hugging Face repository tags for the model. If the model is an adapter
     # model, then we also get the tags for the base model
@@ -836,7 +847,8 @@ def get_model_repo_info(
             "Skipping since the `only_allow_safetensors` argument is set "
             "to `True`."
         )
-
+        logger.warning(msg)
+        return None
 
     # Also check base model if we are evaluating an adapter
     if base_model_id is not None:
@@ -856,7 +868,8 @@ def get_model_repo_info(
             " Skipping since the `only_allow_safetensors` argument is set "
             "to `True`."
         )
-
+        logging.warning(msg)
+        return None
 
     return HFModelInfo(
         pipeline_tag=pipeline_tag, tags=tags, adapter_base_model_id=base_model_id
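The reworked retry loop in `get_model_repo_info` collects the exception from each failed attempt and uses the `for`/`else` clause to log them all once the attempts are exhausted. A minimal standalone sketch of that pattern, with a hypothetical `fetch` callable standing in for the Hub call:

    import logging

    logger = logging.getLogger(__name__)

    def fetch_with_retries(fetch, num_attempts: int = 3):
        errors: list[Exception] = []
        for _ in range(num_attempts):
            try:
                return fetch()  # success exits the function immediately
            except OSError as e:
                errors.append(e)  # remember this failure and try again
        else:
            # Only reached when every attempt raised, since no `break` occurred
            logger.debug("Giving up after %d attempts; errors: %r", num_attempts, errors)
            return None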
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -84,7 +84,12 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
         destroy_distributed_environment,
         destroy_model_parallel,
     )
+    from vllm.inputs import PromptType
     from vllm.lora.request import LoRARequest
+    from vllm.model_executor.guided_decoding.guided_fields import GuidedDecodingRequest
+    from vllm.pooling_params import PoolingParams
+    from vllm.prompt_adapter.request import PromptAdapterRequest
+    from vllm.sampling_params import RequestOutputKind
 
 if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
     from outlines.models.vllm import adapt_tokenizer
@@ -451,7 +456,9 @@ class VLLMModel(HuggingFaceEncoderModel):
             text=prompts,
             truncation=True,
             max_length=max(
-                self._tokenizer.model_max_length
+                min(self._tokenizer.model_max_length, MAX_CONTEXT_LENGTH)
+                - max_tokens,
+                0,
             ),
         )
         prompts = self._tokenizer.batch_decode(
@@ -491,8 +498,19 @@ class VLLMModel(HuggingFaceEncoderModel):
             output.outputs[0].token_ids for output in raw_outputs
         ]
         if self.end_of_reasoning_token_id in completion_ids[0]:
+            # Find the latest index of the end of reasoning token and slice
+            # the token IDs to only include the tokens after it
             completion_ids = [
-                token_ids[
+                token_ids[
+                    max(
+                        [
+                            i
+                            for i, x in enumerate(token_ids)
+                            if x == self.end_of_reasoning_token_id
+                        ]
+                    )
+                    + 1 :
+                ]
                 if self.end_of_reasoning_token_id in token_ids
                 else token_ids
                 for token_ids in completion_ids
@@ -814,6 +832,9 @@ def load_model_and_tokenizer(
     )
 
     model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
+    model._validate_and_add_requests = MethodType(
+        _validate_and_add_requests_with_fixed_progress_bars, model
+    )
     model.config = hf_model_config
 
     return model, tokenizer
@@ -934,6 +955,53 @@ def _run_engine_with_fixed_progress_bars(
     return outputs
 
 
+def _validate_and_add_requests_with_fixed_progress_bars(
+    self: "LLM",
+    prompts: "PromptType | c.Sequence[PromptType]",
+    params: "SamplingParams | c.Sequence[SamplingParams] | PoolingParams | c.Sequence[PoolingParams]",  # noqa: E501
+    *,
+    use_tqdm: bool,
+    lora_request: "c.Sequence[LoRARequest] | LoRARequest | None",
+    prompt_adapter_request: "PromptAdapterRequest | None",
+    tokenization_kwargs: dict[str, t.Any] | None = None,
+    guided_options: "GuidedDecodingRequest | None" = None,
+    priority: list[int] | None = None,
+) -> None:
+    if isinstance(prompts, (str, dict)):
+        # Convert a single prompt to a list.
+        prompts = [prompts]
+
+    num_requests = len(prompts)
+    if isinstance(params, list) and len(params) != num_requests:
+        raise ValueError("The lengths of prompts and params must be the same.")
+    if isinstance(lora_request, list) and len(lora_request) != num_requests:
+        raise ValueError("The lengths of prompts and lora_request must be the same.")
+
+    for sp in params if isinstance(params, list) else (params,):
+        if isinstance(sp, SamplingParams):
+            self._add_guided_params(sp, guided_options)
+
+            # We only care about the final output
+            sp.output_kind = RequestOutputKind.FINAL_ONLY
+
+    # Add requests to the engine.
+    it = prompts
+    if use_tqdm:
+        it = tqdm(it, desc="Adding requests", leave=False)
+
+    for i, prompt in enumerate(it):
+        self._add_request(
+            prompt,
+            params[i] if isinstance(params, c.Sequence) else params,
+            tokenization_kwargs=tokenization_kwargs,
+            lora_request=lora_request[i]
+            if isinstance(lora_request, c.Sequence)
+            else lora_request,
+            prompt_adapter_request=prompt_adapter_request,
+            priority=priority[i] if priority else 0,
+        )
+
+
 def clear_vllm() -> None:
     """Clear the GPU memory used by the vLLM model, enabling re-initialisation."""
     with contextlib.suppress(ValueError):
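Both progress-bar fixes are attached by rebinding methods on the already-constructed `LLM` object with `types.MethodType`, so only that one instance is patched rather than the class. A minimal standalone sketch of this instance-level monkey-patching pattern (the `Greeter` class is purely illustrative, not EuroEval code):

    from types import MethodType

    class Greeter:
        def greet(self) -> str:
            return "hello"

    def quiet_greet(self: Greeter) -> str:
        # Replacement behaviour, analogous to the fixed-progress-bar wrappers above.
        return "hi"

    g = Greeter()
    g.greet = MethodType(quiet_greet, g)  # patch only this instance, not the class
    assert g.greet() == "hi"
    assert Greeter().greet() == "hello"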
euroeval/benchmarker.py
CHANGED

@@ -72,7 +72,6 @@ class Benchmarker:
         force: bool = False,
         verbose: bool = False,
         trust_remote_code: bool = False,
-        use_flash_attention: bool | None = None,
         clear_model_cache: bool = False,
         evaluate_test_split: bool = False,
         few_shot: bool = True,
@@ -129,9 +128,6 @@ class Benchmarker:
                 `debug` is True. Defaults to False.
             trust_remote_code:
                 Whether to trust remote code when loading models. Defaults to False.
-            use_flash_attention:
-                Whether to use Flash Attention. If None then it will be used if it is
-                installed and the model is a decoder model. Defaults to None.
             clear_model_cache:
                 Whether to clear the model cache after benchmarking each model.
                 Defaults to False.
@@ -190,7 +186,6 @@ class Benchmarker:
            force=force,
            verbose=verbose,
            trust_remote_code=trust_remote_code,
-           use_flash_attention=use_flash_attention,
            clear_model_cache=clear_model_cache,
            evaluate_test_split=evaluate_test_split,
            few_shot=few_shot,
@@ -243,7 +238,6 @@ class Benchmarker:
        force: bool | None = None,
        verbose: bool | None = None,
        trust_remote_code: bool | None = None,
-       use_flash_attention: bool | None = None,
        clear_model_cache: bool | None = None,
        evaluate_test_split: bool | None = None,
        few_shot: bool | None = None,
@@ -311,9 +305,6 @@ class Benchmarker:
            trust_remote_code:
                Whether to trust remote code when loading models. Defaults to the value
                specified when initialising the benchmarker.
-           use_flash_attention:
-               Whether to use Flash Attention. Defaults to the value specified when
-               initialising the benchmarker.
            clear_model_cache:
                Whether to clear the model cache after benchmarking each model. Defaults
                to the value specified when initialising the benchmarker.
@@ -359,7 +350,6 @@ class Benchmarker:
            force=force,
            verbose=verbose,
            trust_remote_code=trust_remote_code,
-           use_flash_attention=use_flash_attention,
            clear_model_cache=clear_model_cache,
            evaluate_test_split=evaluate_test_split,
            few_shot=few_shot,
@@ -531,7 +521,6 @@ class Benchmarker:
        force: bool | None = None,
        verbose: bool | None = None,
        trust_remote_code: bool | None = None,
-       use_flash_attention: bool | None | None = None,
        clear_model_cache: bool | None = None,
        evaluate_test_split: bool | None = None,
        few_shot: bool | None = None,
@@ -590,9 +579,6 @@ class Benchmarker:
            trust_remote_code:
                Whether to trust remote code when loading models. If None, then this
                value will not be updated.
-           use_flash_attention:
-               Whether to use Flash Attention. If None, then this value will not be
-               updated.
            clear_model_cache:
                Whether to clear the model cache after benchmarking each model. If None,
                then this value will not be updated.
@@ -658,8 +644,6 @@ class Benchmarker:
            benchmark_config_params.verbose = verbose
        if trust_remote_code is not None:
            benchmark_config_params.trust_remote_code = trust_remote_code
-       if use_flash_attention is not None:
-           benchmark_config_params.use_flash_attention = use_flash_attention
        if clear_model_cache is not None:
            benchmark_config_params.clear_model_cache = clear_model_cache
        if evaluate_test_split is not None:
@@ -863,7 +847,6 @@ class Benchmarker:
        force: bool | None = None,
        verbose: bool | None = None,
        trust_remote_code: bool | None = None,
-       use_flash_attention: bool | None = None,
        clear_model_cache: bool | None = None,
        evaluate_test_split: bool | None = None,
        few_shot: bool | None = None,
@@ -931,9 +914,6 @@ class Benchmarker:
            trust_remote_code:
                Whether to trust remote code when loading models. Defaults to the value
                specified when initialising the benchmarker.
-           use_flash_attention:
-               Whether to use Flash Attention. Defaults to the value specified when
-               initialising the benchmarker.
            clear_model_cache:
                Whether to clear the model cache after benchmarking each model. Defaults
                to the value specified when initialising the benchmarker.
@@ -981,7 +961,6 @@ class Benchmarker:
            force=force,
            verbose=verbose,
            trust_remote_code=trust_remote_code,
-           use_flash_attention=use_flash_attention,
            clear_model_cache=clear_model_cache,
            evaluate_test_split=evaluate_test_split,
            few_shot=few_shot,
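For callers, the practical consequence is that the `use_flash_attention` keyword no longer exists on `Benchmarker`, so passing it will now raise a `TypeError`. A minimal sketch of constructing the benchmarker after this change, assuming the top-level `Benchmarker` export works as in earlier releases and using only keyword arguments visible in the diff:

    from euroeval import Benchmarker

    # `use_flash_attention=...` must be dropped from existing call sites in 15.9.1.
    benchmarker = Benchmarker(
        trust_remote_code=False,
        clear_model_cache=False,
        evaluate_test_split=False,
        few_shot=True,
    )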
euroeval/cli.py
CHANGED

@@ -141,14 +141,6 @@ from .tasks import get_all_tasks
     help="""Whether to trust remote code. Only set this flag if you trust the supplier
     of the model.""",
 )
-@click.option(
-    "--use-flash-attention/--no-use-flash-attention",
-    default=None,
-    show_default=True,
-    help="""Whether to use Flash Attention. If not specified then the model will use
-    Flash Attention for generative models if a CUDA GPU is available and `flash-attn`
-    or `vllm-flash-attn` are installed.""",
-)
 @click.option(
     "--clear-model-cache/--no-clear-model-cache",
     default=False,
@@ -225,7 +217,6 @@ def benchmark(
     verbose: bool,
     device: str | None,
     trust_remote_code: bool,
-    use_flash_attention: bool | None,
     clear_model_cache: bool,
     evaluate_test_split: bool,
     few_shot: bool,
@@ -261,7 +252,6 @@ def benchmark(
         cache_dir=cache_dir,
         device=device,
         trust_remote_code=trust_remote_code,
-        use_flash_attention=use_flash_attention,
         clear_model_cache=clear_model_cache,
         evaluate_test_split=evaluate_test_split,
         few_shot=few_shot,
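The removed option used click's paired boolean flag syntax, where a single `--x/--no-x` declaration registers both flags and passes one boolean parameter to the command. A minimal standalone sketch of that pattern (not EuroEval's actual command, just the same flag style kept for `--clear-model-cache`):

    import click

    @click.command()
    @click.option(
        "--clear-model-cache/--no-clear-model-cache",
        default=False,
        show_default=True,
        help="Whether to clear the model cache after benchmarking each model.",
    )
    def benchmark(clear_model_cache: bool) -> None:
        click.echo(f"clear_model_cache={clear_model_cache}")

    if __name__ == "__main__":
        benchmark()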
euroeval/data_models.py
CHANGED

@@ -191,9 +191,6 @@ class BenchmarkConfig:
         Whether to print verbose output.
     trust_remote_code:
         Whether to trust remote code when loading models from the Hugging Face Hub.
-    use_flash_attention:
-        Whether to use Flash Attention. If None then this will be used for
-        generative models.
     clear_model_cache:
         Whether to clear the model cache after benchmarking each model.
     evaluate_test_split:
@@ -231,7 +228,6 @@ class BenchmarkConfig:
     device: torch.device
     verbose: bool
     trust_remote_code: bool
-    use_flash_attention: bool | None
     clear_model_cache: bool
     evaluate_test_split: bool
     few_shot: bool
@@ -263,7 +259,6 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     force: bool
     verbose: bool
     trust_remote_code: bool
-    use_flash_attention: bool | None
     clear_model_cache: bool
     evaluate_test_split: bool
     few_shot: bool
euroeval/exceptions.py
CHANGED

@@ -81,28 +81,6 @@ class NaNValueInModelOutput(Exception):
         super().__init__(self.message)
 
 
-class FlashAttentionNotInstalled(Exception):
-    """The `flash-attn` package has not been installed."""
-
-    def __init__(
-        self,
-        message: str = (
-            "The model you are trying to load requires Flash Attention. To use Flash "
-            "Attention, please install the `flash-attn` package, which can be done by "
-            "running `pip install -U wheel && FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE "
-            "pip install flash-attn --no-build-isolation`."
-        ),
-    ) -> None:
-        """Initialise the exception.
-
-        Args:
-            message:
-                The message to display.
-        """
-        self.message = message
-        super().__init__(self.message)
-
-
 class NeedsExtraInstalled(InvalidModel):
     """The evaluation requires extra to be installed."""
 
euroeval/human_evaluation.py
CHANGED

{euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.8.2
+Version: 15.9.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -62,12 +62,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
 Requires-Dist: outlines>=0.1.11; extra == 'all'
-Requires-Dist: vllm
+Requires-Dist: vllm>=0.9.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: outlines>=0.1.11; extra == 'generative'
-Requires-Dist: vllm
+Requires-Dist: vllm>=0.9.0; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test
@@ -97,8 +97,6 @@ ______________________________________________________________________
 
 - Dan Saattrup Nielsen ([@saattrupdan](https://github.com/saattrupdan),
   dan.nielsen@alexandra.dk)
-- Kenneth Enevoldsen ([@KennethEnevoldsen](https://github.com/KennethEnevoldsen),
-  kenneth.enevoldsen@cas.au.dk)
 
 
 ## Installation
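The vLLM dependency is now pinned to `vllm>=0.9.0` and guarded by an environment marker, so it is only pulled in on Linux and only when the `all` or `generative` extra is requested. A small sketch of how such a marked requirement is evaluated, using the third-party `packaging` library (an assumption for illustration, not something EuroEval itself ships):

    from packaging.requirements import Requirement

    req = Requirement(
        "vllm>=0.9.0; (platform_system == 'Linux') and extra == 'generative'"
    )
    print(req.specifier)  # >=0.9.0
    # The marker only holds on Linux when the 'generative' extra was requested.
    print(req.marker.evaluate({"platform_system": "Linux", "extra": "generative"}))   # True
    print(req.marker.evaluate({"platform_system": "Darwin", "extra": "generative"}))  # False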
{euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/RECORD
CHANGED

@@ -1,17 +1,17 @@
-euroeval/__init__.py,sha256=
-euroeval/benchmark_config_factory.py,sha256=
-euroeval/benchmarker.py,sha256=
+euroeval/__init__.py,sha256=jjInLLkd5IrDrwqag3U35g7SgzITBlFYllgofc-uQFg,3067
+euroeval/benchmark_config_factory.py,sha256=icTeT5C-bNCJmvSWFlxKdEpRboZN8OjwaHGu7JM-2xI,11158
+euroeval/benchmarker.py,sha256=wmgrYVS31PMhhrVienjaVHHyfnZAy51kUvC6OjooiOw,48047
 euroeval/callbacks.py,sha256=F1AJCLB8FJpxqYprwLi_PsH4Bc0x4lyR8UiTG-GlFLY,2452
-euroeval/cli.py,sha256=
+euroeval/cli.py,sha256=d8JztMi_RbpUlEBXidd6DQ-xeC-xhozf_qU6Vkzye20,8161
 euroeval/constants.py,sha256=p6kp_R6-Tq5LBvyXyT6Sa6N3SkjEElGS2LSZRBoQaYs,1985
 euroeval/data_loading.py,sha256=L_REtxefte5Ke4xE_Cz01zkfCyKlOYhSqT5ZXXulHPc,3992
-euroeval/data_models.py,sha256=
+euroeval/data_models.py,sha256=7nAGDpN58Y35Lt9JZE_y0y5iOYesw2htcwHc68MkBZU,22953
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
-euroeval/exceptions.py,sha256=
+euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
 euroeval/finetuning.py,sha256=uuaUxNQJb7TivPQuI1OYQ_MIKbD-6-7mpkobLKsDefQ,10667
 euroeval/generation.py,sha256=LSsskfLjIJ-c3gQxmr7eiAobPOm-5bU9vnR7uHQ7XmU,10745
 euroeval/generation_utils.py,sha256=zRsaOHcbhysbMa983BZXxfd-qMe4NYts-ZbQxfvNTK4,13310
-euroeval/human_evaluation.py,sha256=
+euroeval/human_evaluation.py,sha256=zqbbJkqm2Uymf-88PxM3R9vVRR8SZJlq3QrqWEoiVeE,27643
 euroeval/languages.py,sha256=LerXuRBAUYkQL6qSV-F82itAE4EgBGFBtzaGnJJZvOE,8555
 euroeval/model_cache.py,sha256=HgXTgn4RMBqIjKaTmYzxu0f4NIwbXx1XJFbvbITqy4E,8686
 euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
@@ -25,9 +25,9 @@ euroeval/utils.py,sha256=e83OnWc0GJn0Tn_vP3tbqh1DAbLy2ky-LnIlTEOKzKU,11410
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=LcG46I2O5wcvu_3T_irBY6VkUhWVPKifBhcP-ln93TA,10798
 euroeval/benchmark_modules/fresh.py,sha256=_LWmpqiNGGTA-NoVC0v3-fS1sraDS9n-pgKUzz89jVk,9919
-euroeval/benchmark_modules/hf.py,sha256=
+euroeval/benchmark_modules/hf.py,sha256=CoiaNakjhg6gm_5IbUUeevXQZebg2VrRLuhzEi2Hhrk,44617
 euroeval/benchmark_modules/litellm.py,sha256=SxSr_0C6b_jVavR3y9QyhfkCOP5-va4zijGfghFTArY,48362
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/vllm.py,sha256=rz_Xau5TGiFeb2VkdVpW_fYOfRCCvYrH0q9BGzCwZlo,42156
 euroeval/dataset_configs/__init__.py,sha256=kWKtlSAOY-olOQL3UtFqL6I3Tki3G3waMZSd2YChjCg,1895
 euroeval/dataset_configs/danish.py,sha256=MTt9EcriSer0QaFQ7_6evYxh-g9OPjroWegYdFpiKag,3395
 euroeval/dataset_configs/dutch.py,sha256=r21nxEvMmBkKqPXVW082batPsxJ9d0RB4DzngOTMJSk,3185
@@ -54,8 +54,8 @@ euroeval/task_group_utils/question_answering.py,sha256=kZBABJ_WYNTH4Xgo2jIvfx7iY
 euroeval/task_group_utils/sequence_classification.py,sha256=Yqx0pUhuHYmSkv1ZUfOndSLTvpr0lWCk19oYITfSjV4,13555
 euroeval/task_group_utils/text_to_text.py,sha256=Nu1_qRPLbboCd9Q5rxqY4fQFJ_aGXu80aWQqoTG1cYc,5047
 euroeval/task_group_utils/token_classification.py,sha256=3idWB81Fcx9UhTuk-gxMfXENrCBmiWBDUWdULXoIhpw,17863
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
-euroeval-15.
+euroeval-15.9.1.dist-info/METADATA,sha256=UkGmFcnarstFwD1J1eS6h3gbyxnucnaAVLnB5QhkdSo,13555
+euroeval-15.9.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.9.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.9.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.9.1.dist-info/RECORD,,
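Every module touched in this release gets a new row in RECORD, because each row stores the file's sha256 digest (urlsafe base64, padding stripped) and its size alongside the path, following the wheel RECORD convention. A small sketch of how such a row can be computed; the helper name is just for illustration:

    import base64
    import hashlib
    from pathlib import Path

    def record_row(path: Path) -> str:
        # "<path>,sha256=<urlsafe base64 digest without padding>,<size in bytes>"
        data = path.read_bytes()
        digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
        return f"{path.as_posix()},sha256={digest.decode()},{len(data)}"

    print(record_row(Path("euroeval/__init__.py")))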
{euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/WHEEL
File without changes

{euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/entry_points.txt
File without changes

{euroeval-15.8.2.dist-info → euroeval-15.9.1.dist-info}/licenses/LICENSE
File without changes