EuroEval 15.4.0__py3-none-any.whl → 15.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/benchmark_modules/hf.py +68 -37
- euroeval/benchmark_modules/vllm.py +47 -8
- euroeval/constants.py +3 -0
- euroeval/data_models.py +7 -2
- euroeval/dataset_configs.py +5 -5
- euroeval/generation.py +17 -3
- euroeval/task_utils/sequence_classification.py +35 -10
- euroeval/types.py +3 -3
- euroeval/utils.py +32 -29
- {euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/METADATA +4 -3
- {euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/RECORD +14 -14
- {euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/WHEEL +0 -0
- {euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/entry_points.txt +0 -0
- {euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/hf.py
CHANGED
@@ -20,6 +20,7 @@ from huggingface_hub.utils import (
     HFValidationError,
     LocalTokenNotFoundError,
 )
+from peft import PeftConfig
 from requests.exceptions import RequestException
 from torch import nn
 from transformers import (
@@ -34,6 +35,9 @@ from transformers import (
     Trainer,
 )
 from transformers.modelcard import TASK_MAPPING
+from transformers.models.auto.modeling_auto import (
+    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
+)
 from urllib3.exceptions import RequestError
 
 from ..constants import (
@@ -73,6 +77,7 @@ from ..utils import (
     get_class_by_name,
     get_eos_token,
     internet_connection_available,
+    log_once,
 )
 from .base import BenchmarkModule
 
@@ -727,53 +732,54 @@ def get_model_repo_info(
     # If the model does not exist locally, then we get the model info from the Hugging
     # Face Hub
     if model_info is None:
-        try:
-            model_info = hf_api.model_info(
-                repo_id=model_id, revision=revision, token=token
-            )
-        except (GatedRepoError, LocalTokenNotFoundError) as e:
+        num_attempts = 3
+        for _ in range(num_attempts):
             try:
-                hf_whoami(token=token)
-                logger.warning(
-                    f"Could not access the model {model_id} with the revision "
-                    f"{revision}. The error was {str(e)!r}."
+                model_info = hf_api.model_info(
+                    repo_id=model_id, revision=revision, token=token
                 )
+                break
+            except (GatedRepoError, LocalTokenNotFoundError) as e:
+                try:
+                    hf_whoami(token=token)
+                    logger.warning(
+                        f"Could not access the model {model_id} with the revision "
+                        f"{revision}. The error was {str(e)!r}."
+                    )
+                    return None
+                except LocalTokenNotFoundError:
+                    raise NeedsAdditionalArgument(
+                        cli_argument="--api-key",
+                        script_argument="api_key=<your-api-key>",
+                        run_with_cli=benchmark_config.run_with_cli,
+                    )
+            except (RepositoryNotFoundError, HFValidationError):
                 return None
-            except LocalTokenNotFoundError:
-                raise NeedsAdditionalArgument(
-                    cli_argument="--api-key",
-                    script_argument="api_key=<your-api-key>",
-                    run_with_cli=benchmark_config.run_with_cli,
-                )
-        except (RepositoryNotFoundError, HFValidationError):
-            return None
-        except (OSError, RequestException):
-            if internet_connection_available():
-                raise HuggingFaceHubDown()
-            else:
+            except (OSError, RequestException):
+                if internet_connection_available():
+                    continue
                 raise NoInternetConnection()
+        else:
+            raise HuggingFaceHubDown()
 
     # Get all the Hugging Face repository tags for the model. If the model is an adapter
     # model, then we also get the tags for the base model
     tags = model_info.tags or list()
-    has_base_model_tag = any(
-        tag.startswith("base_model:") and tag.count(":") == 1 for tag in tags
-    )
     base_model_id: str | None = None
-    …
-    …
-    …
-    …
+    has_adapter_config = model_info.siblings is not None and any(
+        sibling.rfilename == "adapter_config.json" for sibling in model_info.siblings
+    )
+    if has_adapter_config:
+        adapter_config = PeftConfig.from_pretrained(model_id, revision=revision)
+        base_model_id = adapter_config.base_model_name_or_path
+        log_once(
+            f"Model {model_id!r} identified as an adapter model, with base model "
+            f"{base_model_id!r}.",
+            level=logging.DEBUG,
        )
-    if …
-        base_model_id = [
-            tag.split(":")[1]
-            for tag in tags
-            if tag.startswith("base_model:") and tag.count(":") == 1
-        ][0]
+    if base_model_id is not None:
         base_model_info = hf_api.model_info(
             repo_id=base_model_id,
-            revision=revision,
             token=benchmark_config.api_key
             or os.getenv("HUGGINGFACE_API_KEY")
             or True,
@@ -781,12 +787,18 @@ get_model_repo_info(
         tags += base_model_info.tags or list()
     tags = list(set(tags))
 
+    # TEMP: This extends the `TASK_MAPPING` dictionary to include the missing
+    # 'image-text-to-text' pipeline tag. This will be added as part of `TASK_MAPPING`
+    # when this PR has been merged in and published:
+    # https://github.com/huggingface/transformers/pull/37107
+    TASK_MAPPING["image-text-to-text"] = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+
    # Get the pipeline tag for the model. If it is not specified, then we determine it
    # by checking the model's architecture as written in the model's Hugging Face config
    pipeline_tag = model_info.pipeline_tag
    if pipeline_tag is None:
        hf_config = load_hf_model_config(
-            model_id=model_id,
+            model_id=base_model_id or model_id,
            num_labels=0,
            id2label=dict(),
            label2id=dict(),
@@ -812,7 +824,6 @@ get_model_repo_info(
        pipeline_tag = "fill-mask"
 
    if benchmark_config.only_allow_safetensors:
-        # Check if any file ends with .safetensors
        repo_files = hf_api.list_repo_files(repo_id=model_id, revision=revision)
        has_safetensors = any(f.endswith(".safetensors") for f in repo_files)
        if not has_safetensors:
@@ -826,6 +837,26 @@ get_model_repo_info(
            )
            raise InvalidModel(msg)
 
+        # Also check base model if we are evaluating an adapter
+        if base_model_id is not None:
+            base_repo_files = hf_api.list_repo_files(repo_id=base_model_id)
+            base_has_safetensors = any(
+                f.endswith(".safetensors") for f in base_repo_files
+            )
+            if not base_has_safetensors:
+                msg = (
+                    f"Base model {base_model_id} does not have safetensors weights "
+                    "available."
+                )
+                if benchmark_config.run_with_cli:
+                    msg += " Skipping since the `--only-allow-safetensors` flag is set."
+                else:
+                    msg += (
+                        " Skipping since the `only_allow_safetensors` argument is set "
+                        "to `True`."
+                    )
+                raise InvalidModel(msg)
+
    return HFModelInfo(
        pipeline_tag=pipeline_tag, tags=tags, adapter_base_model_id=base_model_id
    )
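Note: the adapter handling above replaces the earlier `base_model:` tag heuristic with a check for an `adapter_config.json` file in the repository, reading the base model ID from the PEFT config. Below is a minimal standalone sketch of that detection idea; the repo ID in the usage comment is a hypothetical placeholder, not part of the package.

from huggingface_hub import HfApi
from peft import PeftConfig


def find_adapter_base_model(model_id: str, revision: str = "main") -> str | None:
    """Return the base model ID if `model_id` is a PEFT adapter repo, else None."""
    info = HfApi().model_info(repo_id=model_id, revision=revision)
    siblings = info.siblings or []
    if not any(s.rfilename == "adapter_config.json" for s in siblings):
        return None
    # Adapter repos record the model they were trained on in their PEFT config.
    adapter_config = PeftConfig.from_pretrained(model_id, revision=revision)
    return adapter_config.base_model_name_or_path


# Hypothetical usage:
# find_adapter_base_model("some-user/some-lora-adapter")  # -> a base model ID or None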
euroeval/benchmark_modules/vllm.py
CHANGED
@@ -30,6 +30,7 @@ from ..constants import (
     REASONING_MAX_TOKENS,
     TASK_GROUPS_USING_LOGPROBS,
     TASKS_USING_JSON,
+    VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
 from ..data_models import (
     BenchmarkConfig,
@@ -65,6 +66,7 @@ from ..utils import (
     get_bos_token,
     get_end_of_chat_token_ids,
     get_eos_token,
+    get_min_cuda_compute_capability,
     log_once,
     should_prompts_be_stripped,
 )
@@ -145,6 +147,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
                 repo_id=self.model_config.model_id,
+                revision=self.model_config.revision,
                 cache_dir=Path(self.model_config.model_cache_dir),
             )
             self.buffer["lora_request"] = LoRARequest(
@@ -373,12 +376,27 @@ class VLLMModel(HuggingFaceEncoderModel):
 
         # Generate sequences using vLLM
         input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
-        raw_outputs = self._model.generate(
-            prompts=prompts,
-            sampling_params=sampling_params,
-            use_tqdm=(not input_is_a_test),
-            lora_request=self.buffer.get("lora_request"),
-        )
+        num_attempts = 3
+        for _ in range(num_attempts):
+            try:
+                raw_outputs = self._model.generate(
+                    prompts=prompts,
+                    sampling_params=sampling_params,
+                    use_tqdm=(not input_is_a_test),
+                    lora_request=self.buffer.get("lora_request"),
+                )
+                break
+            except TypeError as e:
+                logger.debug(
+                    f"Encountered error during vLLM generation: {str(e)}. Retrying..."
+                )
+                sleep(1)
+        else:
+            raise InvalidBenchmark(
+                f"Could not generate sequences after {num_attempts} attempts."
+            )
+
+        # Parse the raw model outputs
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
@@ -846,13 +864,16 @@ def load_model_and_tokenizer(
     # Prefer base model ID if the model is an adapter - the adapter will be added on
     # during inference in this case
     model_id = model_config.adapter_base_model_id or model_config.model_id
+    revision = (
+        model_config.revision if model_config.adapter_base_model_id is None else "main"
+    )
 
     hf_model_config = load_hf_model_config(
         model_id=model_id,
         num_labels=0,
         id2label=dict(),
         label2id=dict(),
-        revision=model_config.revision,
+        revision=revision,
         model_cache_dir=model_config.model_cache_dir,
         api_key=benchmark_config.api_key,
         trust_remote_code=benchmark_config.trust_remote_code,
@@ -881,6 +902,23 @@ load_model_and_tokenizer(
     )
     dtype = torch.float16
 
+    if hf_model_config.torch_dtype == torch.bfloat16:
+        min_cuda_compute_capability = get_min_cuda_compute_capability()
+        required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
+
+        if min_cuda_compute_capability is not None:
+            if min_cuda_compute_capability < required_capability:
+                logger.info(
+                    "You are loading a model with "
+                    f"dtype {hf_model_config.torch_dtype}, "
+                    "which vLLM only supports for CUDA devices with"
+                    f"CUDA compute capability >={required_capability}. "
+                    "You are using one or more devices with "
+                    f"compute capability {min_cuda_compute_capability}. "
+                    "Setting dtype to float16 instead."
+                )
+                dtype = torch.float16
+
     if model_config.adapter_base_model_id is not None:
         download_dir = str(Path(model_config.model_cache_dir) / "base_model")
     else:
@@ -916,7 +954,7 @@ load_model_and_tokenizer(
         max_model_len=min(true_max_model_len, 5_000),
         download_dir=download_dir,
         trust_remote_code=benchmark_config.trust_remote_code,
-        revision=model_config.revision,
+        revision=revision,
         seed=4242,
         distributed_executor_backend=executor_backend,
         tensor_parallel_size=torch.cuda.device_count(),
@@ -994,6 +1032,7 @@ load_tokenizer(
     Returns:
         The loaded tokenizer.
     """
+    revision = revision if adapter_base_model_id is None else "main"
    config = AutoConfig.from_pretrained(
        adapter_base_model_id or model_id,
        revision=revision,
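Note: both retry loops introduced in this release (here and in `hf.py`) rely on Python's `for`/`else`: the `else` branch only runs when the loop finishes without hitting `break`, i.e. when every attempt failed. A small self-contained sketch of the pattern, with a placeholder callable standing in for the real vLLM generate call:

import logging
from time import sleep

logger = logging.getLogger("example")


def call_with_retries(fn, num_attempts: int = 3):
    """Retry a flaky callable; raise only if every attempt fails."""
    for _ in range(num_attempts):
        try:
            result = fn()  # placeholder for e.g. self._model.generate(...)
            break          # success: the for-else branch below is skipped
        except TypeError as e:
            logger.debug(f"Encountered error: {e}. Retrying...")
            sleep(1)
    else:
        # Reached only when the loop was never broken out of, i.e. all attempts failed.
        raise RuntimeError(f"Call failed after {num_attempts} attempts.")
    return result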
euroeval/constants.py
CHANGED
@@ -54,3 +54,6 @@ METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
 
 # Hugging Face Hub tags used to classify models as merge models
 MERGE_TAGS = ["merge", "mergekit"]
+
+# The minimum required CUDA compute capability for using bfloat16 in vLLM
+VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
euroeval/data_models.py
CHANGED
@@ -1,7 +1,6 @@
 """Data models used in EuroEval."""
 
 import collections.abc as c
-import importlib.metadata
 import json
 import pathlib
 import re
@@ -11,6 +10,8 @@ from dataclasses import dataclass, field
 import pydantic
 import torch
 
+from euroeval.utils import get_package_version
+
 from .enums import Device, InferenceBackend, ModelType, TaskGroup
 from .types import ScoreDict
 
@@ -228,7 +229,11 @@ class BenchmarkResult(pydantic.BaseModel):
     generative_type: str | None
     few_shot: bool
     validation_split: bool
-    euroeval_version: str = …
+    euroeval_version: str | None = get_package_version("euroeval")
+    transformers_version: str | None = get_package_version("transformers")
+    torch_version: str | None = get_package_version("torch")
+    vllm_version: str | None = get_package_version("vllm")
+    outlines_version: str | None = get_package_version("outlines")
 
     @classmethod
     def from_dict(cls, config: dict) -> "BenchmarkResult":
euroeval/dataset_configs.py
CHANGED
@@ -244,7 +244,7 @@ FOSENT_CONFIG = DatasetConfig(
 ALLOCINE_CONFIG = DatasetConfig(
     name="allocine",
     pretty_name="the truncated version of the French sentiment classification "
-    "dataset …
+    "dataset AlloCiné",
     huggingface_id="EuroEval/allocine-mini",
     task=SENT,
     languages=[FR],
@@ -1467,9 +1467,9 @@ NORDJYLLAND_NEWS_CONFIG = DatasetConfig(
     max_generated_tokens=256,
 )
 
-MLSUM_CONFIG = DatasetConfig(
-    name="mlsum",
-    pretty_name="the truncated version of the German summarisation dataset MLSum",
+MLSUM_DE_CONFIG = DatasetConfig(
+    name="mlsum-de",
+    pretty_name="the truncated version of the German summarisation dataset MLSum-de",
     huggingface_id="EuroEval/mlsum-mini",
     task=SUMM,
     languages=[DE],
@@ -1484,7 +1484,7 @@ MLSUM_CONFIG = DatasetConfig(
 
 MLSUM_ES_CONFIG = DatasetConfig(
     name="mlsum-es",
-    pretty_name="the truncated version of the Spanish summarisation dataset MLSum",
+    pretty_name="the truncated version of the Spanish summarisation dataset MLSum-es",
     huggingface_id="EuroEval/mlsum-es-mini",
     task=SUMM,
     languages=[ES],
euroeval/generation.py
CHANGED
@@ -20,7 +20,12 @@ from .model_cache import (
 from .utils import clear_memory
 
 if t.TYPE_CHECKING:
-    from .data_models import …
+    from .data_models import (
+        BenchmarkConfig,
+        DatasetConfig,
+        GenerativeModelOutput,
+        ModelConfig,
+    )
 
 logger = logging.getLogger("euroeval")
 
@@ -163,6 +168,7 @@ def generate_single_iteration(
     if benchmark_config.debug:
         debug_log(
             batch=batch,
+            model_output=model_output,
             extracted_labels=extracted_labels,  # type: ignore[arg-type]
             dataset_config=dataset_config,
         )
@@ -217,6 +223,7 @@
 
 def debug_log(
     batch: dict[str, t.Any],
+    model_output: "GenerativeModelOutput",
     extracted_labels: list[dict | str | list[str]],
     dataset_config: "DatasetConfig",
 ) -> None:
@@ -225,6 +232,8 @@ def debug_log(
     Args:
         batch:
             The batch of examples to evaluate on.
+        model_output:
+            The output of the model.
         extracted_labels:
             The extracted labels from the model output.
         dataset_config:
@@ -290,7 +299,12 @@
     else:
         input_texts = batch["text"]
 
-    for input_text, prediction, label in zip(…
+    for input_text, raw_output, prediction, label in zip(
+        input_texts, model_output.sequences, extracted_labels, labels
+    ):
         logger.info(
-            f"Input: '{input_text}'\…
+            f"Input: '{input_text}'\n"
+            f"Raw outout: '{raw_output}'\n"
+            f"Prediction: '{prediction}'\n"
+            f"Label: '{label}'"
         )
euroeval/task_utils/sequence_classification.py
CHANGED
@@ -162,9 +162,7 @@ def get_closest_logprobs_labels(
     """
     english_labels = list(dataset_config.id2label.values())
     english2local = dataset_config.prompt_label_mapping
-    candidate_labels = [
-        english2local[lbl].lower() for lbl in english_labels
-    ] + english_labels
+    candidate_labels = [english2local[lbl].lower() for lbl in english_labels]
 
     output_labels: list[str] = list()
     for sample in generation_logprobs:
@@ -179,21 +177,48 @@
         ]
         generated_labels = [label for label in generated_labels if label != ""]
 
-        # We want to use the first generated label which …
+        # We want to use the first generated label which contains a unique candidate
         # label, as the output label
         output_label: str | None = None
-        …
-        …
+        previously_generated_labels: list[str] = list()
+        for label_idx, generated_label in enumerate(generated_labels):
+            generated_label = "".join(previously_generated_labels) + generated_label
+
+            # Get the candidate labels that starts with the generated label
+            candidate_output_labels = {
                 candidate_label
                 for candidate_label in candidate_labels
                 if candidate_label.startswith(generated_label)
-            …
-            …
-            …
+            }
+
+            # If we can uniquely determine the output label, we break the loop. If
+            # there are multiple possible labels then we store the current one, and
+            # concatenate it with the next generated label. We can only do this if
+            # the current one is the first one, however, since we're using greedy
+            # sampling. In case this happens for a label that is not the first one,
+            # we warn the user.
+            if len(candidate_output_labels) == 1:
+                output_label = candidate_output_labels.pop()
                 break
+            elif len(candidate_output_labels) > 1:
+                if label_idx == 0:
+                    previously_generated_labels.append(generated_label)
+                else:
+                    output_label = candidate_output_labels.pop()
+                    candidate_output_labels.add(output_label)
+                    log_once(
+                        "Multiple candidate labels found for the generated label "
+                        f"{generated_label!r}: {candidate_output_labels}. Since "
+                        "this is not the first generated label, we cannot "
+                        "concatenate it with the next generated label. We are thus "
+                        f"forced to use the arbitrary {output_label!r} as the "
+                        "output label, potentially resulting in worse performance. "
+                        "Please report this issue to the EuroEval team at "
+                        "github.com/EuroEval/EuroEval/issues.",
+                        level=logging.WARNING,
+                    )
 
         if output_label is not None:
-            output_label = english2local.get(output_label, output_label)
             output_labels.append(output_label)
             break
         else:
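Note: the new loop above performs a greedy prefix match of the generated logprob tokens against the localised, lower-cased candidate labels, concatenating tokens until exactly one candidate matches. A rough illustration of that matching idea, using made-up labels and tokens rather than EuroEval's exact control flow:

def match_label(generated_tokens: list[str], candidate_labels: list[str]) -> str | None:
    """Concatenate tokens until exactly one candidate label starts with the prefix."""
    prefix = ""
    for token in generated_tokens:
        prefix += token
        matches = {label for label in candidate_labels if label.startswith(prefix)}
        if len(matches) == 1:
            return matches.pop()
        if not matches:
            return None  # no candidate matches the generated prefix
    return None  # still ambiguous after all tokens


# "ne" alone is ambiguous ("negative" vs "neutral"); adding "g" resolves it.
print(match_label(["ne", "g"], ["positive", "negative", "neutral"]))  # -> negative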
euroeval/types.py
CHANGED
@@ -8,9 +8,9 @@ if t.TYPE_CHECKING:
     from .data_models import GenerativeModelOutput
 
 
-ScoreDict = dict[str, dict[str, float] | list[dict[str, float]]]
-Predictions = NDArray | list[str] | list[list[str]]
-Labels = NDArray | list[str] | list[list[str]]
+ScoreDict: t.TypeAlias = dict[str, dict[str, float] | list[dict[str, float]]]
+Predictions: t.TypeAlias = NDArray | list[str] | list[list[str]]
+Labels: t.TypeAlias = NDArray | list[str] | list[list[str]]
 
 
 class ComputeMetricsFunction(t.Protocol):
euroeval/utils.py
CHANGED
@@ -2,11 +2,11 @@
 
 import gc
 import importlib
+import importlib.metadata
 import importlib.util
 import logging
 import os
 import random
-import re
 import sys
 import typing as t
 import warnings
@@ -16,7 +16,6 @@ from types import TracebackType
 
 import litellm
 import numpy as np
-import pkg_resources
 import requests
 import torch
 from datasets.utils import disable_progress_bar
@@ -84,33 +83,6 @@ def enforce_reproducibility(seed: int = 4242) -> np.random.Generator:
     return rng
 
 
-def is_module_installed(module: str) -> bool:
-    """Check if a module is installed.
-
-    This is used when dealing with spaCy models, as these are installed as separate
-    Python packages.
-
-    Args:
-        module:
-            The name of the module.
-
-    Returns:
-        Whether the module is installed or not.
-    """
-    # Get list of all modules, including their versions
-    installed_modules_with_versions = list(pkg_resources.working_set)
-
-    # Strip the module versions from the list of modules. Also make the modules lower
-    # case and replace dashes with underscores
-    installed_modules = [
-        re.sub("[0-9. ]", "", str(module)).lower().replace("-", "_")
-        for module in installed_modules_with_versions
-    ]
-
-    # Check if the module is installed by checking if the module name is in the list
-    return module.lower() in installed_modules
-
-
 def block_terminal_output() -> None:
     """Blocks libraries from writing output to the terminal.
 
@@ -206,6 +178,21 @@ def get_class_by_name(class_name: str | list[str], module_name: str) -> t.Type |
     return None
 
 
+def get_min_cuda_compute_capability() -> float | None:
+    """Gets the lowest cuda capability.
+
+    Returns:
+        Device capability as float, or None if CUDA is not available.
+    """
+    if not torch.cuda.is_available():
+        return None
+
+    device_range = range(torch.cuda.device_count())
+    capabilities = map(torch.cuda.get_device_capability, device_range)
+    major, minor = min(capabilities)
+    return float(f"{major}.{minor}")
+
+
 def kebab_to_pascal(kebab_string: str) -> str:
     """Converts a kebab-case string to PascalCase.
 
@@ -573,3 +560,19 @@ def log_once(message: str, level: int = logging.INFO) -> None:
             logger.critical(message)
         case _:
             raise ValueError(f"Invalid logging level: {level}")
+
+
+def get_package_version(package_name: str) -> str | None:
+    """Get the version of a package.
+
+    Args:
+        package_name:
+            The name of the package.
+
+    Returns:
+        The version of the package, or None if the package is not installed.
+    """
+    try:
+        return importlib.metadata.version(package_name)
+    except importlib.metadata.PackageNotFoundError:
+        return None
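Note: a short usage sketch of the two helpers added above; the printed values naturally depend on the environment the benchmark runs in:

from euroeval.utils import get_min_cuda_compute_capability, get_package_version

print(get_package_version("torch"))                     # e.g. "2.2.0", or None if missing
print(get_package_version("definitely-not-installed"))  # None
print(get_min_cuda_compute_capability())                # e.g. 8.0 on A100 GPUs, None without CUDA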
{euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.4.0
+Version: 15.4.2
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -42,6 +42,7 @@ Requires-Dist: more-itertools>=10.5.0
 Requires-Dist: numpy<2.0.0,>=1.23.0
 Requires-Dist: ollama>=0.4.7
 Requires-Dist: pandas>=2.2.0
+Requires-Dist: peft>=0.15.0
 Requires-Dist: protobuf~=3.20.0
 Requires-Dist: pydantic>=2.6.0
 Requires-Dist: pyinfer>=0.0.3
@@ -61,12 +62,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
 Requires-Dist: outlines>=0.1.11; extra == 'all'
-Requires-Dist: vllm…
+Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: outlines>=0.1.11; extra == 'generative'
-Requires-Dist: vllm…
+Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test
{euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/RECORD
CHANGED
@@ -3,14 +3,14 @@ euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHh
 euroeval/benchmarker.py,sha256=PIdqLPleLN3nml5Zb1g_dQaLzqxQhmgC8VuvD5yloV4,46524
 euroeval/callbacks.py,sha256=bThUUxOgkMuESUQ5rrFRoSumKV8vNw53CslIZTpkt54,2438
 euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
-euroeval/constants.py,sha256=…
+euroeval/constants.py,sha256=zL8dm7SEFpIgC2vaPhqzdKydVSWW-ZyMHenWPnNxWqQ,1681
 euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
-euroeval/data_models.py,sha256=…
-euroeval/dataset_configs.py,sha256=…
+euroeval/data_models.py,sha256=b4rOMdhoxkIPcnTQdwqq5iWaF6uia1OzAgdiOBvoGVM,14779
+euroeval/dataset_configs.py,sha256=C5Gnp95cBeCmmuRA8Rznt0c4gMOn8Pilk_kDCleDMjg,90640
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
 euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
-euroeval/generation.py,sha256=…
+euroeval/generation.py,sha256=dohSPYc4eASm5tJhNKfBlpJnellKG7nVeyx8yXXxMlE,10721
 euroeval/human_evaluation.py,sha256=5uOm8cZf5uy2jBPs-ih7g8ni-a3hUz8UiXVPh6PzUWw,27675
 euroeval/languages.py,sha256=d1SyG0KVtCAA_PYpFGZCgZcyVLIr7Q8uYKPxNw6WEBc,7909
 euroeval/model_cache.py,sha256=BhkyWrOhjskESbndy218LUv1ZiWRc48ScdH_42dKHtE,8275
@@ -19,22 +19,22 @@ euroeval/model_loading.py,sha256=ta07tMoSfK1kqjOynVXQA0vVrns6RzsCEE3g1_RGVVs,271
 euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
 euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
 euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
-euroeval/types.py,sha256=…
-euroeval/utils.py,sha256=…
+euroeval/types.py,sha256=5DIhaVyzH8RO9jdJfibX9pwbZviQwU35dMsfszD2Whs,2406
+euroeval/utils.py,sha256=CFjYMoKdcxLUEM-aF3pxf_3TnGWvGasjfb8pDMJVe9U,18772
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
 euroeval/benchmark_modules/fresh.py,sha256=k6bqDEnazRAX9ILVsRrzUTbkgNO4NcLCxHToCnLWV8M,9641
-euroeval/benchmark_modules/hf.py,sha256=…
+euroeval/benchmark_modules/hf.py,sha256=Typig7WDqOn_uGE24s_P_9PHvq-V0MrKGD7xbh0aYnk,43244
 euroeval/benchmark_modules/litellm.py,sha256=ZJ9dB683pXPHDf70OOJfmHn_y706xRYzstYLz2ytCKE,39784
-euroeval/benchmark_modules/vllm.py,sha256=…
+euroeval/benchmark_modules/vllm.py,sha256=O8-dcVkU2jgZer44EOeTC8E4d-xQjPDOXnoyzXxAToQ,46179
 euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
 euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
-euroeval/task_utils/sequence_classification.py,sha256=…
+euroeval/task_utils/sequence_classification.py,sha256=832iWpPR3CsnlBIYA976eN21WUFQLUmIlDxFIvOsROk,10266
 euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
 euroeval/task_utils/token_classification.py,sha256=aW2GGk-dqa7lioIsHirVgD8AMrQEAnVasmjEWQ4xu7w,17778
-euroeval-15.4.…
-euroeval-15.4.…
-euroeval-15.4.…
-euroeval-15.4.…
-euroeval-15.4.…
+euroeval-15.4.2.dist-info/METADATA,sha256=cvpyWIKPXNKn1Idv7w3C7z8MBVljmw50jBdskL_32oI,10752
+euroeval-15.4.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.4.2.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.4.2.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.4.2.dist-info/RECORD,,

{euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/WHEEL
File without changes

{euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/entry_points.txt
File without changes

{euroeval-15.4.0.dist-info → euroeval-15.4.2.dist-info}/licenses/LICENSE
File without changes