EuroEval 15.4.1-py3-none-any.whl → 15.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +2 -2
- euroeval/benchmark_modules/hf.py +79 -39
- euroeval/benchmark_modules/litellm.py +204 -74
- euroeval/benchmark_modules/vllm.py +106 -42
- euroeval/benchmarker.py +35 -6
- euroeval/constants.py +11 -1
- euroeval/data_models.py +6 -2
- euroeval/dataset_configs.py +6 -6
- euroeval/task_utils/sequence_classification.py +70 -30
- euroeval/types.py +3 -3
- euroeval/utils.py +131 -32
- {euroeval-15.4.1.dist-info → euroeval-15.5.0.dist-info}/METADATA +6 -4
- {euroeval-15.4.1.dist-info → euroeval-15.5.0.dist-info}/RECORD +16 -16
- {euroeval-15.4.1.dist-info → euroeval-15.5.0.dist-info}/WHEEL +0 -0
- {euroeval-15.4.1.dist-info → euroeval-15.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-15.4.1.dist-info → euroeval-15.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/utils.py
CHANGED
@@ -2,6 +2,7 @@
 
 import gc
 import importlib
+import importlib.metadata
 import importlib.util
 import logging
 import os
@@ -12,16 +13,13 @@ import typing as t
 import warnings
 from functools import cache
 from pathlib import Path
-from types import TracebackType
 
 import litellm
 import numpy as np
-import pkg_resources
 import requests
 import torch
 from datasets.utils import disable_progress_bar
 from requests.exceptions import RequestException
-from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
 from transformers import logging as tf_logging
 
 from .exceptions import InvalidModel, NaNValueInModelOutput
@@ -30,6 +28,11 @@ if importlib.util.find_spec("ray") is not None:
     import ray
 
 if t.TYPE_CHECKING:
+    from types import TracebackType
+
+    from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
+
+    from .data_models import DatasetConfig
     from .types import Predictions
 
 
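The imports above now live under `t.TYPE_CHECKING`, so they are evaluated only by static type checkers and never at runtime; this trims import time and, in the case of `from .data_models import DatasetConfig`, avoids a circular import. The trade-off is that such names must appear in annotations as strings, which is exactly what the `HiddenPrints` hunk below does with `"TracebackType"`. A minimal sketch of the pattern, using a hypothetical `heavy_module`:

```python
# Minimal sketch of the TYPE_CHECKING pattern; heavy_module/HeavyClass are
# hypothetical stand-ins for imports that are costly or circular at runtime.
import typing as t

if t.TYPE_CHECKING:
    # Evaluated only by type checkers (mypy, pyright), never at runtime.
    from heavy_module import HeavyClass


def process(obj: "HeavyClass") -> None:
    """The quoted annotation is resolved lazily, so no runtime import occurs."""
    print(obj)
```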
@@ -84,33 +87,6 @@ def enforce_reproducibility(seed: int = 4242) -> np.random.Generator:
     return rng
 
 
-def is_module_installed(module: str) -> bool:
-    """Check if a module is installed.
-
-    This is used when dealing with spaCy models, as these are installed as separate
-    Python packages.
-
-    Args:
-        module:
-            The name of the module.
-
-    Returns:
-        Whether the module is installed or not.
-    """
-    # Get list of all modules, including their versions
-    installed_modules_with_versions = list(pkg_resources.working_set)
-
-    # Strip the module versions from the list of modules. Also make the modules lower
-    # case and replace dashes with underscores
-    installed_modules = [
-        re.sub("[0-9. ]", "", str(module)).lower().replace("-", "_")
-        for module in installed_modules_with_versions
-    ]
-
-    # Check if the module is installed by checking if the module name is in the list
-    return module.lower() in installed_modules
-
-
 def block_terminal_output() -> None:
     """Blocks libraries from writing output to the terminal.
 
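The deleted `is_module_installed` helper was the last user of `pkg_resources`, which setuptools has deprecated in favour of the standard library. A sketch of the stdlib equivalents (these helpers are illustrative, not EuroEval code):

```python
# Sketch of stdlib replacements for the deleted pkg_resources-based check;
# these helper names are illustrative and not part of EuroEval.
import importlib.metadata
import importlib.util


def is_module_importable(module: str) -> bool:
    """Check whether a module can be imported, without importing it."""
    return importlib.util.find_spec(module) is not None


def is_package_installed(package: str) -> bool:
    """Check whether a distribution package is installed."""
    try:
        importlib.metadata.version(package)
        return True
    except importlib.metadata.PackageNotFoundError:
        return False
```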
@@ -206,6 +182,21 @@ def get_class_by_name(class_name: str | list[str], module_name: str) -> t.Type |
         return None
 
 
+def get_min_cuda_compute_capability() -> float | None:
+    """Gets the lowest cuda capability.
+
+    Returns:
+        Device capability as float, or None if CUDA is not available.
+    """
+    if not torch.cuda.is_available():
+        return None
+
+    device_range = range(torch.cuda.device_count())
+    capabilities = map(torch.cuda.get_device_capability, device_range)
+    major, minor = min(capabilities)
+    return float(f"{major}.{minor}")
+
+
 def kebab_to_pascal(kebab_string: str) -> str:
     """Converts a kebab-case string to PascalCase.
 
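`torch.cuda.get_device_capability` returns a `(major, minor)` tuple per device, and taking the `min` across all devices reports the weakest GPU, which is the one that matters when deciding whether a feature can be enabled everywhere. A hedged usage sketch; the 8.0 threshold (Ampere-class support for bfloat16) is illustrative, not taken from this diff:

```python
# Usage sketch for the new helper; the capability threshold is illustrative.
from euroeval.utils import get_min_cuda_compute_capability

capability = get_min_cuda_compute_capability()
if capability is None:
    print("CUDA is not available.")
elif capability >= 8.0:
    print(f"All GPUs are Ampere or newer (min capability {capability}).")
else:
    print(f"Weakest GPU has capability {capability}; some features may be off.")
```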
@@ -298,7 +289,7 @@ class HiddenPrints:
         self,
         exc_type: t.Type[BaseException],
         exc_val: BaseException,
-        exc_tb: TracebackType,
+        exc_tb: "TracebackType",
     ) -> None:
         """Exit the context manager."""
         sys.stdout.close()
@@ -368,7 +359,6 @@ def should_prompts_be_stripped(
     return strip_prompts
 
 
-# TODO: This is currently not used - maybe remove.
 def should_prefix_space_be_added_to_labels(
     labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
 ) -> bool:
@@ -573,3 +563,112 @@ def log_once(message: str, level: int = logging.INFO) -> None:
             logger.critical(message)
         case _:
             raise ValueError(f"Invalid logging level: {level}")
+
+
+def get_package_version(package_name: str) -> str | None:
+    """Get the version of a package.
+
+    Args:
+        package_name:
+            The name of the package.
+
+    Returns:
+        The version of the package, or None if the package is not installed.
+    """
+    try:
+        return importlib.metadata.version(package_name)
+    except importlib.metadata.PackageNotFoundError:
+        return None
+
+
+def get_first_label_token_mapping(
+    dataset_config: "DatasetConfig", tokenizer: "PreTrainedTokenizer | None"
+) -> dict[str, str] | bool:
+    """Check if the model should output scores.
+
+    Args:
+        dataset_config:
+            The dataset configuration.
+        tokenizer:
+            The tokenizer, or None if not available.
+
+    Returns:
+        A mapping from labels to the first token in each label, or alternatively a
+        Boolean value indicating whether the model should output scores (if the mapping
+        is outputted then the model will always output scores).
+    """
+    # Importing here to avoid circular imports
+    from .constants import TASK_GROUPS_USING_LOGPROBS
+
+    # If we do not have any tokenizer, then we cannot check if the model should output
+    # scores and we just assume it should if the dataset supports it
+    output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
+    if tokenizer is None:
+        if output_scores:
+            log_once(
+                "The model will output scores, since the dataset supports it and no "
+                "tokenizer is available.",
+                level=logging.DEBUG,
+            )
+        else:
+            log_once(
+                "The model will not output scores, since the dataset does not support "
+                "it and no tokenizer is available.",
+                level=logging.DEBUG,
+            )
+        return output_scores
+
+    # If there are labels associated with the dataset, and that the first token of each
+    # label is distinct, then we can safely use the logprobs
+    if output_scores and dataset_config.labels:
+        local_labels = [
+            dataset_config.prompt_label_mapping[label].strip()
+            for label in dataset_config.labels
+        ]
+
+        # Get the first token of each label, where we add a prefix space if needed
+        add_prefix_space = (
+            should_prefix_space_be_added_to_labels(
+                labels_to_be_generated=local_labels, tokenizer=tokenizer
+            )
+            and tokenizer.chat_template is None
+        )
+        first_tokens = [
+            tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)[0]
+            for label in local_labels
+        ]
+        first_tokens = [
+            re.sub(
+                pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", repl="", string=token.lower()
+            )
+            for token in first_tokens
+        ]
+
+        # Build a mapping from labels to the first token in each label if the first
+        # tokens are distinct
+        if len(first_tokens) == len(set(first_tokens)):
+            log_once(
+                "The model will output scores, since the first tokens of the labels "
+                "are distinct.",
+                level=logging.DEBUG,
+            )
+            return {
+                label: first_token
+                for label, first_token in zip(local_labels, first_tokens)
+            }
+        else:
+            log_once(
+                "The model will not output scores, since the first tokens of the "
+                "labels are not distinct. The first tokens for the labels "
+                f"{local_labels} are {first_tokens}"
+            )
+            return False
+
+    # Otherwise, we assume that the model should not output scores, to avoid potential
+    # evaluation errors. This will force the label extraction to rely on word edit
+    # distance instead of logprobs.
+    log_once(
+        "The model will not output scores, since the dataset does not have labels.",
+        level=logging.DEBUG,
+    )
+    return False
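The heart of `get_first_label_token_mapping` is the distinctness check: each label is reduced to its first token, lowercased and stripped of leading and trailing non-letter characters, and logprob-based label extraction is used only when those first tokens are all different; otherwise extraction falls back to word edit distance. A toy illustration of that check (the token strings are fabricated examples, not real tokenizer output):

```python
# Toy illustration of the first-token distinctness check; the token strings
# are fabricated examples, not output of a real tokenizer.
import re


def first_tokens_distinct(first_tokens: list[str]) -> bool:
    # Lowercase and strip non-letter characters, as in the diff above.
    cleaned = [
        re.sub(r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", "", token.lower())
        for token in first_tokens
    ]
    return len(cleaned) == len(set(cleaned))


print(first_tokens_distinct(["▁Pos", "▁Neg"]))  # True -> logprobs usable
print(first_tokens_distinct(["▁Pos", "▁pos"]))  # False -> edit distance fallback
```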
{euroeval-15.4.1.dist-info → euroeval-15.5.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.4.1
+Version: 15.5.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -37,11 +37,12 @@ Requires-Dist: demjson3>=3.0.6
 Requires-Dist: evaluate>=0.4.1
 Requires-Dist: huggingface-hub>=0.24.0
 Requires-Dist: levenshtein>=0.24.0
-Requires-Dist: litellm>=1.
+Requires-Dist: litellm>=1.63.0
 Requires-Dist: more-itertools>=10.5.0
 Requires-Dist: numpy<2.0.0,>=1.23.0
 Requires-Dist: ollama>=0.4.7
 Requires-Dist: pandas>=2.2.0
+Requires-Dist: peft>=0.15.0
 Requires-Dist: protobuf~=3.20.0
 Requires-Dist: pydantic>=2.6.0
 Requires-Dist: pyinfer>=0.0.3
@@ -61,12 +62,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
 Requires-Dist: outlines>=0.1.11; extra == 'all'
-Requires-Dist: vllm
+Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: outlines>=0.1.11; extra == 'generative'
-Requires-Dist: vllm
+Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test
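The `vllm` requirement is now pinned to `>=0.8.0` and gated behind the `platform_system == 'Linux'` environment marker, like the other GPU-only dependencies, so resolvers simply skip it on macOS and Windows. The extras themselves are installed the usual way, e.g.:

```
$ pip install "euroeval[generative]"
```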
@@ -217,6 +218,7 @@ Replace <name-of-script> with the specific script you wish to execute, e.g.,
 $ uv run src/scripts/create_allocine.py
 ```
 
+
 ## Special Thanks :pray:
 - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
   models on the leaderboards.
{euroeval-15.4.1.dist-info → euroeval-15.5.0.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
-euroeval/__init__.py,sha256=
+euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
 euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
-euroeval/benchmarker.py,sha256=
+euroeval/benchmarker.py,sha256=8Qt1NL7k5n-AfFrhR6139wmmsVS7CgRa-QjminH0d_c,47849
 euroeval/callbacks.py,sha256=bThUUxOgkMuESUQ5rrFRoSumKV8vNw53CslIZTpkt54,2438
 euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
-euroeval/constants.py,sha256=
+euroeval/constants.py,sha256=CJavEDvKLSKAC4uyz44sFrY1W1AnjUsxkXF63SoMjw4,1985
 euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
-euroeval/data_models.py,sha256=
-euroeval/dataset_configs.py,sha256=
+euroeval/data_models.py,sha256=QssdR_msDTmsp9yKe0cVba0iCpgBTFTOaOUn44o1cl8,14770
+euroeval/dataset_configs.py,sha256=6WiRW-VAAMIL6-1J6Nb6pCm6mf4I-oQ087zB0es3HHs,90644
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
 euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
@@ -19,22 +19,22 @@ euroeval/model_loading.py,sha256=ta07tMoSfK1kqjOynVXQA0vVrns6RzsCEE3g1_RGVVs,271
 euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
 euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
 euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
-euroeval/types.py,sha256=
-euroeval/utils.py,sha256=
+euroeval/types.py,sha256=5DIhaVyzH8RO9jdJfibX9pwbZviQwU35dMsfszD2Whs,2406
+euroeval/utils.py,sha256=bbq7WCcIrMKjBRaZ8EcnRpRAvL_F-tCxiL0We_po3QE,22397
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
 euroeval/benchmark_modules/fresh.py,sha256=k6bqDEnazRAX9ILVsRrzUTbkgNO4NcLCxHToCnLWV8M,9641
-euroeval/benchmark_modules/hf.py,sha256=
-euroeval/benchmark_modules/litellm.py,sha256=
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/hf.py,sha256=VcgWZmSZc4B3FgeUGC0eWQIRv97luU22-KijaBfuqU0,43602
+euroeval/benchmark_modules/litellm.py,sha256=pbTsq6Bb8cnFbdZMUSrUs-XlNAyaCIWNcEKKRIfprx8,45161
+euroeval/benchmark_modules/vllm.py,sha256=7AZrvcwHevrQbXvbjTCp4S6HpM0Obk6CIQLbmUWIn9s,47483
 euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
 euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
-euroeval/task_utils/sequence_classification.py,sha256=
+euroeval/task_utils/sequence_classification.py,sha256=JDZfiTj5RdwYwlhhTqVBj2mVdwmkoykZ6wJzEbWj0lo,12225
 euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
 euroeval/task_utils/token_classification.py,sha256=aW2GGk-dqa7lioIsHirVgD8AMrQEAnVasmjEWQ4xu7w,17778
-euroeval-15.4.1.dist-info/METADATA,sha256=
-euroeval-15.4.1.dist-info/WHEEL,sha256=
-euroeval-15.4.1.dist-info/entry_points.txt,sha256=
-euroeval-15.4.1.dist-info/licenses/LICENSE,sha256=
-euroeval-15.4.1.dist-info/RECORD,,
+euroeval-15.5.0.dist-info/METADATA,sha256=T48YoPuFBEFI5sxgUadzkD3tidIB3TA1mKEKsFuh7fs,10752
+euroeval-15.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.5.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.5.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.5.0.dist-info/RECORD,,
{euroeval-15.4.1.dist-info → euroeval-15.5.0.dist-info}/WHEEL
File without changes
{euroeval-15.4.1.dist-info → euroeval-15.5.0.dist-info}/entry_points.txt
File without changes
{euroeval-15.4.1.dist-info → euroeval-15.5.0.dist-info}/licenses/LICENSE
File without changes