EuroEval 16.1.1__py3-none-any.whl → 16.2.1__py3-none-any.whl
- euroeval/__init__.py +7 -6
- euroeval/benchmark_config_factory.py +41 -125
- euroeval/benchmark_modules/hf.py +31 -16
- euroeval/benchmark_modules/litellm.py +2 -0
- euroeval/benchmark_modules/vllm.py +24 -9
- euroeval/benchmarker.py +138 -16
- euroeval/cli.py +8 -0
- euroeval/data_models.py +5 -0
- euroeval/generation.py +3 -1
- euroeval/metrics/base.py +12 -0
- euroeval/metrics/huggingface.py +23 -2
- euroeval/prompt_templates/linguistic_acceptability.py +6 -5
- euroeval/prompt_templates/named_entity_recognition.py +3 -3
- euroeval/prompt_templates/sentiment_classification.py +5 -5
- euroeval/tasks.py +3 -0
- euroeval/tokenisation_utils.py +0 -6
- euroeval/types.py +2 -2
- euroeval/utils.py +77 -5
- {euroeval-16.1.1.dist-info → euroeval-16.2.1.dist-info}/METADATA +31 -7
- {euroeval-16.1.1.dist-info → euroeval-16.2.1.dist-info}/RECORD +23 -23
- {euroeval-16.1.1.dist-info → euroeval-16.2.1.dist-info}/WHEEL +0 -0
- {euroeval-16.1.1.dist-info → euroeval-16.2.1.dist-info}/entry_points.txt +0 -0
- {euroeval-16.1.1.dist-info → euroeval-16.2.1.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED
@@ -12,12 +12,13 @@ import warnings
 from termcolor import colored
 
 # Block specific warnings before importing anything else, as they can be noisy
-
-warnings.filterwarnings("ignore", category=
-
-logging.getLogger("
-logging.getLogger("
-
+if os.getenv("FULL_LOG") != "1":
+    warnings.filterwarnings("ignore", category=UserWarning)
+    warnings.filterwarnings("ignore", category=FutureWarning)
+    logging.getLogger("httpx").setLevel(logging.CRITICAL)
+    logging.getLogger("datasets").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm").setLevel(logging.CRITICAL)
+    os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
 
 # Set up logging
 fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
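The warning and logger suppression is now gated behind a `FULL_LOG` environment variable. A minimal sketch of how full log output could be restored, assuming the variable is read at import time as the added lines suggest:

```
import os

# Assumption: FULL_LOG is checked at import time, so it must be set before
# euroeval is imported for the warning filters and logger silencing to be skipped.
os.environ["FULL_LOG"] = "1"

import euroeval  # noqa: E402
```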
euroeval/benchmark_config_factory.py
CHANGED

@@ -6,9 +6,9 @@ import typing as t
 
 import torch
 
-from .data_models import BenchmarkConfig
+from .data_models import BenchmarkConfig, BenchmarkConfigParams
 from .dataset_configs import get_all_dataset_configs
-from .enums import Device
+from .enums import Device
 from .exceptions import InvalidBenchmark
 from .languages import get_all_languages
 from .tasks import SPEED, get_all_tasks
@@ -21,150 +21,66 @@ logger = logging.getLogger("euroeval")
 
 
 def build_benchmark_config(
-
-    save_results: bool,
-    task: str | list[str] | None,
-    dataset: str | list[str] | None,
-    language: str | list[str],
-    model_language: str | list[str] | None,
-    dataset_language: str | list[str] | None,
-    device: Device | None,
-    batch_size: int,
-    raise_errors: bool,
-    cache_dir: str,
-    api_key: str | None,
-    force: bool,
-    verbose: bool,
-    trust_remote_code: bool,
-    clear_model_cache: bool,
-    evaluate_test_split: bool,
-    few_shot: bool,
-    num_iterations: int,
-    api_base: str | None,
-    api_version: str | None,
-    gpu_memory_utilization: float,
-    generative_type: GenerativeType | None,
-    debug: bool,
-    run_with_cli: bool,
-    requires_safetensors: bool,
+    benchmark_config_params: BenchmarkConfigParams,
 ) -> BenchmarkConfig:
     """Create a benchmark configuration.
 
     Args:
-
-        save_results:
-            Whether to save the benchmark results to a file.
-        task:
-            The tasks to include for dataset. If None then datasets will not be
-            filtered based on their task.
-        dataset:
-            The datasets to include for task. If None then all datasets will be
-            included, limited by the `task` parameter.
-        language:
-            The language codes of the languages to include, both for models and
-            datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
-            to 'all' if all languages should be considered.
-        model_language:
-            The language codes of the languages to include for models. If None then
-            the `language` parameter will be used.
-        dataset_language:
-            The language codes of the languages to include for datasets. If None then
-            the `language` parameter will be used.
-        device:
-            The device to use for running the models. If None then the device will be
-            set automatically.
-        batch_size:
-            The batch size to use for running the models.
-        raise_errors:
-            Whether to raise errors when running the benchmark.
-        cache_dir:
-            The directory to use for caching the models.
-        api_key:
-            The API key to use for a given inference server.
-        force:
-            Whether to force the benchmark to run even if the results are already
-            cached.
-        verbose:
-            Whether to print verbose output when running the benchmark. This is
-            automatically set if `debug` is True.
-        trust_remote_code:
-            Whether to trust remote code when running the benchmark.
-        clear_model_cache:
-            Whether to clear the model cache before running the benchmark.
-        evaluate_test_split:
-            Whether to use the test split for the datasets.
-        few_shot:
-            Whether to use few-shot learning for the models.
-        num_iterations:
-            The number of iterations each model should be evaluated for.
-        api_base:
-            The base URL for a given inference API. Only relevant if `model` refers to a
-            model on an inference API.
-        api_version:
-            The version of the API to use for a given inference API.
-        gpu_memory_utilization:
-            The GPU memory utilization to use for vLLM. A larger value will result in
-            faster evaluation, but at the risk of running out of GPU memory. Only reduce
-            this if you are running out of GPU memory. Only relevant if the model is
-            generative.
-        generative_type:
-            The type of generative model. Only relevant if the model is generative. If
-            not specified, the type will be inferred automatically.
-        debug:
-            Whether to run the benchmark in debug mode.
-        run_with_cli:
-            Whether the benchmark is being run with the CLI.
-        requires_safetensors:
-            Whether to only allow evaluations of models stored as safetensors.
+        benchmark_config_params:
+            The parameters for creating the benchmark configuration.
 
     Returns:
         The benchmark configuration.
     """
-    language_codes = get_correct_language_codes(
+    language_codes = get_correct_language_codes(
+        language_codes=benchmark_config_params.language
+    )
     model_languages = prepare_languages(
-        language_codes=model_language,
+        language_codes=benchmark_config_params.model_language,
+        default_language_codes=language_codes,
     )
     dataset_languages = prepare_languages(
-        language_codes=dataset_language,
+        language_codes=benchmark_config_params.dataset_language,
+        default_language_codes=language_codes,
     )
 
     tasks, datasets = prepare_tasks_and_datasets(
-        task=task,
+        task=benchmark_config_params.task,
+        dataset=benchmark_config_params.dataset,
+        dataset_languages=dataset_languages,
     )
 
-    torch_device = prepare_device(device=device)
-
-    # Set variable with number of iterations
-    if hasattr(sys, "_called_from_test"):
-        num_iterations = 1
-
     return BenchmarkConfig(
         model_languages=model_languages,
         dataset_languages=dataset_languages,
         tasks=tasks,
         datasets=datasets,
-        batch_size=batch_size,
-        raise_errors=raise_errors,
-        cache_dir=cache_dir,
-        api_key=api_key,
-        force=force,
-        progress_bar=progress_bar,
-        save_results=save_results,
-        verbose=verbose or debug,
-        device=
-        trust_remote_code=trust_remote_code,
-        clear_model_cache=clear_model_cache,
-        evaluate_test_split=evaluate_test_split,
-        few_shot=few_shot,
-        num_iterations=
+        batch_size=benchmark_config_params.batch_size,
+        raise_errors=benchmark_config_params.raise_errors,
+        cache_dir=benchmark_config_params.cache_dir,
+        api_key=benchmark_config_params.api_key,
+        force=benchmark_config_params.force,
+        progress_bar=benchmark_config_params.progress_bar,
+        save_results=benchmark_config_params.save_results,
+        verbose=benchmark_config_params.verbose or benchmark_config_params.debug,
+        device=prepare_device(device=benchmark_config_params.device),
+        trust_remote_code=benchmark_config_params.trust_remote_code,
+        clear_model_cache=benchmark_config_params.clear_model_cache,
+        evaluate_test_split=benchmark_config_params.evaluate_test_split,
+        few_shot=benchmark_config_params.few_shot,
+        num_iterations=(
+            1
+            if hasattr(sys, "_called_from_test")
+            else benchmark_config_params.num_iterations
+        ),
+        api_base=benchmark_config_params.api_base,
+        api_version=benchmark_config_params.api_version,
+        gpu_memory_utilization=benchmark_config_params.gpu_memory_utilization,
+        generative_type=benchmark_config_params.generative_type,
+        debug=benchmark_config_params.debug,
+        run_with_cli=benchmark_config_params.run_with_cli,
+        requires_safetensors=benchmark_config_params.requires_safetensors,
+        download_only=benchmark_config_params.download_only,
     )
 
 
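This refactor collapses roughly twenty-five flat keyword arguments into a single validated params object. As a self-contained illustration of the pattern only (a toy pydantic model, not EuroEval's actual `BenchmarkConfigParams`, whose fields are defined in `euroeval/data_models.py`):

```
import pydantic


class ExampleParams(pydantic.BaseModel):
    """Toy stand-in for BenchmarkConfigParams, just to show the pattern."""

    language: list[str] = ["da"]
    batch_size: int = 32
    download_only: bool = False


def build_example_config(params: ExampleParams) -> dict:
    # A single validated object replaces a long, error-prone argument list.
    return {
        "language": params.language,
        "batch_size": params.batch_size,
        "download_only": params.download_only,
    }


print(build_example_config(ExampleParams(download_only=True)))
```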
euroeval/benchmark_modules/hf.py
CHANGED
@@ -146,21 +146,25 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         Returns:
             The number of parameters in the model.
         """
-
-
-        try:
-            repo_info = hf_api.model_info(
-                repo_id=self.model_config.adapter_base_model_id
-                or self.model_config.model_id,
-                revision=self.model_config.revision,
-            )
-        except (
-            RepositoryNotFoundError,
-            RevisionNotFoundError,
-            RequestException,
-            HFValidationError,
-        ):
+        # No need to try to use the API if we have no internet.
+        if not internet_connection_available():
             repo_info = None
+        else:
+            token = get_hf_token(api_key=self.benchmark_config.api_key)
+            hf_api = HfApi(token=token)
+            try:
+                repo_info = hf_api.model_info(
+                    repo_id=self.model_config.adapter_base_model_id
+                    or self.model_config.model_id,
+                    revision=self.model_config.revision,
+                )
+            except (
+                RepositoryNotFoundError,
+                RevisionNotFoundError,
+                RequestException,
+                HFValidationError,
+            ):
+                repo_info = None
 
         if (
             repo_info is not None
@@ -558,7 +562,7 @@ def load_model_and_tokeniser(
         The benchmark configuration
 
     Returns:
-
+        A pair (model, tokeniser), with the loaded model and tokeniser
     """
     config: "PretrainedConfig"
     block_terminal_output()
@@ -686,6 +690,7 @@ def load_model_and_tokeniser(
         model=model,
         model_id=model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
+        model_cache_dir=model_config.model_cache_dir,
     )
 
     return model, tokeniser
@@ -722,6 +727,11 @@ def get_model_repo_info(
     ):
         model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
 
+    # If we have not internet, and the model_id is not a directory for a local model
+    # we also just create a dummy model info object.
+    elif not internet_connection_available():
+        model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
+
     # If the model does not exist locally, then we get the model info from the Hugging
     # Face Hub, if possible
     if model_info is None:
@@ -867,7 +877,10 @@ def get_model_repo_info(
 
 
 def load_tokeniser(
-    model: "PreTrainedModel | None",
+    model: "PreTrainedModel | None",
+    model_id: str,
+    trust_remote_code: bool,
+    model_cache_dir: str,
 ) -> "PreTrainedTokenizer":
     """Load the tokeniser.
 
@@ -889,6 +902,7 @@ def load_tokeniser(
         trust_remote_code=trust_remote_code,
         padding_side="right",
         truncation_side="right",
+        cache_dir=model_cache_dir,
     )
 
     # If the model is a subclass of a certain model types then we have to add a prefix
@@ -999,6 +1013,7 @@ def load_hf_model_config(
         token=get_hf_token(api_key=api_key),
         trust_remote_code=trust_remote_code,
         cache_dir=model_cache_dir,
+        local_files_only=not internet_connection_available(),
     )
     if config.eos_token_id is not None and config.pad_token_id is None:
         if isinstance(config.eos_token_id, list):
euroeval/benchmark_modules/litellm.py
CHANGED

@@ -984,6 +984,7 @@ class LiteLLMModel(BenchmarkModule):
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
+            model_cache_dir=self.model_config.model_cache_dir,
         )
 
         if (
@@ -1066,6 +1067,7 @@ class LiteLLMModel(BenchmarkModule):
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
+            model_cache_dir=self.model_config.model_cache_dir,
         )
 
         all_max_lengths: list[int] = list()
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -72,7 +72,9 @@ from ..utils import (
     create_model_cache_dir,
     get_hf_token,
     get_min_cuda_compute_capability,
+    internet_connection_available,
     log_once,
+    resolve_model_path,
     split_model_id,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
@@ -146,7 +148,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
 
         self.end_of_reasoning_token = get_end_of_reasoning_token(
-            model=self._model, tokeniser=self._tokeniser,
+            model=self._model, tokeniser=self._tokeniser, model_config=model_config
         )
         self.end_of_chat_token_ids = get_end_of_chat_token_ids(
             tokeniser=self._tokeniser, generative_type=self.generative_type
@@ -834,10 +836,15 @@ def load_model_and_tokeniser(
 
     clear_vllm()
 
+    # if we do not have an internet connection we need to give the path to the folder
+    # that contains the model weights and config files, otherwise vLLM will try to
+    # download them regardless if they are already present in the download_dir
+    model_path = resolve_model_path(download_dir)
+
     try:
         model = LLM(
-            model=model_id,
-            tokenizer=model_id,
+            model=model_id if internet_connection_available() else model_path,
+            tokenizer=model_id if internet_connection_available() else model_path,
             gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
             max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
             download_dir=download_dir,
@@ -925,6 +932,7 @@ def load_tokeniser(
         cache_dir=model_cache_dir,
         token=token,
         trust_remote_code=trust_remote_code,
+        local_files_only=not internet_connection_available(),
     )
     num_retries = 5
     for _ in range(num_retries):
@@ -937,8 +945,10 @@ def load_tokeniser(
                 padding_side="left",
                 truncation_side="left",
                 model_max_length=model_max_length,
+                cache_dir=model_cache_dir,
                 config=config,
                 token=token,
+                local_files_only=not internet_connection_available(),
             )
             break
         except (json.JSONDecodeError, OSError, TypeError) as e:
@@ -996,7 +1006,7 @@ def clear_vllm() -> None:
 
 
 def get_end_of_reasoning_token(
-    model: "LLM", tokeniser: "PreTrainedTokenizer",
+    model: "LLM", tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
 ) -> str | None:
     """Get the end-of-reasoning token for a generative model.
 
@@ -1005,21 +1015,26 @@ def get_end_of_reasoning_token(
             The vLLM model.
         tokeniser:
             The tokeniser.
-
-            The model
+        model_config:
+            The model configuration.
 
     Returns:
         The end of reasoning token, or None if it could not be found.
     """
+    model_id = model_config.model_id
+
     # Create a prompt to check if the model uses the reasoning tokens
     prompt = "What is your name?"
     if has_chat_template(tokeniser=tokeniser):
+        extra_kwargs = dict()
+        if model_config.param in {"thinking", "no-thinking"}:
+            extra_kwargs["enable_thinking"] = model_config.param == "thinking"
         templated_prompt = apply_chat_template(
            conversation=[dict(role="user", content=prompt)],
            tokeniser=tokeniser,
            tokenise=False,
            add_generation_prompt=True,
-
+           **extra_kwargs,
        )
        assert isinstance(templated_prompt, str)
        prompt = templated_prompt
@@ -1042,8 +1057,8 @@ def get_end_of_reasoning_token(
     if not bor_reasoning_matches:
         log_once(
             f"The model {model_id!r} did not generate any beginning-of-reasoning "
-            "tokens in the prompt or the completion. Assuming the model is not "
-            "
+            "tokens in the prompt or the completion. Assuming the model is not a "
+            "reasoning model.",
             level=logging.DEBUG,
         )
         return None
euroeval/benchmarker.py
CHANGED
@@ -16,7 +16,7 @@ from torch.distributed import destroy_process_group
 
 from .benchmark_config_factory import build_benchmark_config
 from .constants import GENERATIVE_PIPELINE_TAGS
-from .data_loading import load_data
+from .data_loading import load_data, load_raw_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
 from .enums import Device, GenerativeType, ModelType
@@ -28,7 +28,12 @@ from .model_loading import load_model
 from .scores import log_scores
 from .speed_benchmark import benchmark_speed
 from .tasks import SPEED
-from .utils import
+from .utils import (
+    enforce_reproducibility,
+    get_package_version,
+    internet_connection_available,
+    log_once,
+)
 
 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
@@ -83,6 +88,7 @@ class Benchmarker:
         debug: bool = False,
         run_with_cli: bool = False,
         requires_safetensors: bool = False,
+        download_only: bool = False,
     ) -> None:
         """Initialise the benchmarker.
 
@@ -164,14 +170,26 @@ class Benchmarker:
             requires_safetensors:
                 Whether to only allow models that use the safetensors format. Defaults
                 to False.
+            download_only:
+                Whether to only download models and datasets without performing any
+                benchmarking. Defaults to False.
 
         Raises:
             ValueError:
-                If both `task` and `dataset` are specified
+                If both `task` and `dataset` are specified, or if `download_only`
+                is True and we have no internet connection.
         """
         if task is not None and dataset is not None:
             raise ValueError("Only one of `task` and `dataset` can be specified.")
 
+        if not internet_connection_available() and download_only:
+            msg = "It appears you do not have an internet connection, but "
+            if run_with_cli:
+                msg += "the --download-only flag was set."
+            else:
+                msg += "the argument `download_only` was set to True."
+            raise ValueError(msg)
+
         # Bail early if hf_transfer is enabled but not installed.
         if HF_HUB_ENABLE_HF_TRANSFER and get_package_version("hf_transfer") is None:
             raise ImportError(
@@ -205,13 +223,14 @@ class Benchmarker:
             api_version=api_version,
             gpu_memory_utilization=gpu_memory_utilization,
             generative_type=generative_type,
+            download_only=download_only,
             debug=debug,
             run_with_cli=run_with_cli,
             requires_safetensors=requires_safetensors,
         )
 
         self.benchmark_config = build_benchmark_config(
-
+            benchmark_config_params=self.benchmark_config_default_params
         )
 
         # Initialise variable storing model lists, so we only have to fetch it once
@@ -222,17 +241,82 @@ class Benchmarker:
 
     @property
     def benchmark_results(self) -> list[BenchmarkResult]:
-        """The benchmark results.
+        """The benchmark results.
+
+        Returns:
+            A list of benchmark results.
+
+        Raises:
+            ValueError:
+                If there is an error decoding a line in the results file.
+        """
         if self.results_path.exists():
+            benchmark_results: list[BenchmarkResult] = list()
             with self.results_path.open() as f:
-
-
-
-
-
+                for line in f:
+                    if line.strip():
+                        try:
+                            result_dict = json.loads(line.strip())
+                        except json.JSONDecodeError as e:
+                            raise ValueError(
+                                f"Error decoding JSON line: {line.strip()}"
+                            ) from e
+
+                        # Fix for older records
+                        has_old_raw_results = (
+                            "results" in result_dict
+                            and isinstance(result_dict["results"], dict)
+                            and "raw" in result_dict["results"]
+                            and isinstance(result_dict["results"]["raw"], dict)
+                            and "test" in result_dict["results"]["raw"]
+                        )
+                        if has_old_raw_results:
+                            result_dict["results"]["raw"] = result_dict["results"][
+                                "raw"
+                            ]["test"]
+
+                        result = BenchmarkResult.from_dict(result_dict)
+                        benchmark_results.append(result)
+            return benchmark_results
         else:
             return list()
 
+    def _download(
+        self,
+        dataset_config: "DatasetConfig",
+        model_config: "ModelConfig",
+        benchmark_config: "BenchmarkConfig",
+    ) -> None:
+        """Download data, metrics, and model for the given dataset, and model.
+
+        Args:
+            dataset_config: The configuration for the dataset.
+            model_config: The configuration for the model.
+            benchmark_config: The configuration for the benchmark.
+        """
+        log_once(f"Loading data for {dataset_config.pretty_name}", level=logging.INFO)
+        dataset = load_raw_data(
+            dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
+        )
+        del dataset
+
+        log_once(f"Loading model {model_config.model_id}", level=logging.INFO)
+        model = load_model(
+            model_config=model_config,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
+        )
+        del model
+
+        log_once(
+            f"Loading metrics for the '{dataset_config.task.name}' task",
+            level=logging.INFO,
+        )
+        for metric_name in dataset_config.task.metrics:
+            log_once(f"Loading metric {metric_name.name}", level=logging.DEBUG)
+            metric = metric_name.download(cache_dir=benchmark_config.cache_dir)
+            del metric
+
     def benchmark(
         self,
         model: list[str] | str,
@@ -256,6 +340,7 @@ class Benchmarker:
         few_shot: bool | None = None,
         num_iterations: int | None = None,
         requires_safetensors: bool | None = None,
+        download_only: bool | None = None,
     ) -> list[BenchmarkResult]:
         """Benchmarks models on datasets.
 
@@ -336,6 +421,9 @@ class Benchmarker:
             requires_safetensors:
                 Whether to only allow models that use the safetensors format. Defaults
                 to the value specified when initialising the benchmarker.
+            download_only:
+                Whether to only download the models without evaluating them. Defaults
+                to the value specified when initialising the benchmarker.
 
         Returns:
             A list of benchmark results.
@@ -368,6 +456,7 @@ class Benchmarker:
             few_shot=few_shot,
             num_iterations=num_iterations,
             requires_safetensors=requires_safetensors,
+            download_only=download_only,
         )
 
         adjust_logging_level(verbose=benchmark_config.verbose)
@@ -395,6 +484,28 @@ class Benchmarker:
                 num_finished_benchmarks += len(dataset_configs)
                 continue
 
+            if model_config.adapter_base_model_id:
+                open_issue_msg = (
+                    "If offline support is important to you, please "
+                    "consider opening an issue at https://github.com/EuroEval/EuroEval/issues."
+                )
+                if not internet_connection_available():
+                    raise InvalidModel(
+                        "Offline benchmarking of models with adapters is not currently "
+                        "supported. "
+                        f"An active internet connection is required. {open_issue_msg}"
+                    )
+                elif benchmark_config.download_only:
+                    log_once(
+                        "You are using download only mode with a model that includes "
+                        "an adapter. "
+                        "Please note: Offline benchmarking of adapter models is not "
+                        "currently supported. "
+                        "An internet connection will be required during evaluation. "
+                        f"{open_issue_msg}",
+                        level=logging.WARNING,
+                    )
+
             loaded_model: BenchmarkModule | None = None
             benchmark_params_to_revert: dict[str, t.Any] = dict()
             for dataset_config in dataset_configs:
@@ -569,6 +680,7 @@ class Benchmarker:
         debug: bool | None = None,
         run_with_cli: bool | None = None,
         requires_safetensors: bool | None = None,
+        download_only: bool | None = None,
     ) -> "BenchmarkConfig":
         """Get an updated benchmark configuration.
 
@@ -645,6 +757,12 @@ class Benchmarker:
             requires_safetensors:
                 Whether to only allow models that use the safetensors format. If None,
                 then this value will not be updated.
+            download_only:
+                Whether to only download the models without evaluating them. If None,
+                then this value will not be updated.
+            download_only:
+                Whether to only download models and datasets without performing any
+                benchmarking. If None, then this value will not be updated.
 
         Returns:
             The updated benchmark configuration.
@@ -701,8 +819,10 @@ class Benchmarker:
             benchmark_config_params.run_with_cli = run_with_cli
         if requires_safetensors is not None:
             benchmark_config_params.requires_safetensors = requires_safetensors
+        if download_only is not None:
+            benchmark_config_params.download_only = download_only
 
-        return build_benchmark_config(
+        return build_benchmark_config(benchmark_config_params=benchmark_config_params)
 
     def _prepare_model_ids(self, model_id: list[str] | str) -> list[str]:
         """Prepare the model ID(s) to be benchmarked.
@@ -813,17 +933,19 @@ class Benchmarker:
             model_param=model_config.param,
         )
 
+        model_id_to_be_stored = model_config.model_id
+        if model_config.revision != "main":
+            model_id_to_be_stored += f"@{model_config.revision}"
+        if model_config.param is not None:
+            model_id_to_be_stored += f"#{model_config.param}"
+
         record = BenchmarkResult(
             dataset=dataset_config.name,
             task=dataset_config.task.name,
             dataset_languages=[
                 language.code for language in dataset_config.languages
             ],
-            model=(
-                f"{model_config.model_id}@{model_config.revision}"
-                if model_config.revision and model_config.revision != "main"
-                else model_config.model_id
-            ),
+            model=model_id_to_be_stored,
             results=results,
             num_model_parameters=model.num_params,
             max_sequence_length=model.model_max_length,
euroeval/cli.py
CHANGED
@@ -216,6 +216,12 @@ from .tasks import get_all_tasks
     help="The type of generative model. Only relevant if the model is generative. If "
     "not specified, the type will be inferred automatically.",
 )
+@click.option(
+    "--download-only",
+    is_flag=True,
+    help="Only download the requested model weights and datasets, and exit.",
+    default=False,
+)
 def benchmark(
     model: tuple[str],
     dataset: tuple[str],
@@ -243,6 +249,7 @@ def benchmark(
     debug: bool,
     requires_safetensors: bool,
     generative_type: str | None,
+    download_only: bool,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
     models = list(model)
@@ -284,6 +291,7 @@ def benchmark(
         debug=debug,
         run_with_cli=True,
         requires_safetensors=requires_safetensors,
+        download_only=download_only,
     )
 
     # Perform the benchmark evaluation
euroeval/data_models.py
CHANGED
@@ -228,6 +228,9 @@ class BenchmarkConfig:
         generative_type:
             The type of generative model to benchmark. Only relevant if the model is
             generative.
+        download_only:
+            Whether to only download the models, metrics and datasets without
+            evaluating.
     """
 
     model_languages: list[Language]
@@ -255,6 +258,7 @@ class BenchmarkConfig:
     run_with_cli: bool
     requires_safetensors: bool
     generative_type: GenerativeType | None
+    download_only: bool
 
 
 class BenchmarkConfigParams(pydantic.BaseModel):
@@ -285,6 +289,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     api_version: str | None
     gpu_memory_utilization: float
     generative_type: GenerativeType | None
+    download_only: bool
     debug: bool
     run_with_cli: bool
     requires_safetensors: bool
euroeval/generation.py
CHANGED
@@ -243,7 +243,9 @@ def generate_single_iteration(
         ground_truth = []
 
     itr_scores: dict[str, float] = model.compute_metrics(
-        model_outputs_and_labels=(all_preds, ground_truth),
+        model_outputs_and_labels=(all_preds, ground_truth),
+        dataset=dataset,
+        benchmark_config=benchmark_config,
     )
 
     return itr_scores
euroeval/metrics/base.py
CHANGED
@@ -42,6 +42,18 @@ class Metric(abc.ABC):
             else lambda x: (100 * x, f"{x:.2%}")
         )
 
+    def download(self, cache_dir: str) -> "Metric":
+        """Initiates the download of the metric if needed.
+
+        Args:
+            cache_dir:
+                The directory where the metric will be downloaded to.
+
+        Returns:
+            The metric object itself.
+        """
+        return self
+
     @abc.abstractmethod
     def __call__(
         self,
euroeval/metrics/huggingface.py
CHANGED
@@ -3,9 +3,11 @@
 import collections.abc as c
 import logging
 import typing as t
+from pathlib import Path
 
 import evaluate
 import numpy as np
+from datasets import DownloadConfig
 
 from ..utils import HiddenPrints
 from .base import Metric
@@ -76,6 +78,23 @@ class HuggingFaceMetric(Metric):
         )
         self.metric: "EvaluationModule | None" = None
 
+    def download(self, cache_dir: str) -> "HuggingFaceMetric":
+        """Initiates the download of the metric if needed.
+
+        Args:
+            cache_dir:
+                The directory where the metric will be downloaded to.
+
+        Returns:
+            The metric object itself.
+        """
+        # Annoying but needed to make the metric download to a different cache dir
+        download_config = DownloadConfig(cache_dir=Path(cache_dir, "evaluate"))
+        self.metric = evaluate.load(
+            path=self.huggingface_id, download_config=download_config
+        )
+        return self
+
     def __call__(
         self,
         predictions: c.Sequence,
@@ -103,7 +122,9 @@ class HuggingFaceMetric(Metric):
             The calculated metric score, or None if the score should be ignored.
         """
         if self.metric is None:
-            self.
+            self.download(cache_dir=benchmark_config.cache_dir)
+
+        assert self.metric is not None
 
         with HiddenPrints():
             results = self.metric.compute(
@@ -176,7 +197,7 @@ bert_score_metric = HuggingFaceMetric(
     huggingface_id="bertscore",
     results_key="f1",
     compute_kwargs=dict(
-        model_type="microsoft/mdeberta-v3-base", device="
+        model_type="microsoft/mdeberta-v3-base", device="cpu", batch_size=16
    ),
 )
 
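The new `Metric.download` hook lets the benchmarker pre-fetch metric assets for offline runs. A standalone sketch of the same call the diff adds, pre-fetching the "bertscore" metric into a dedicated cache directory (the cache path is only an example, and loading "bertscore" assumes the optional `bert_score` package is installed):

```
from pathlib import Path

import evaluate
from datasets import DownloadConfig

# Example cache directory; the benchmarker would pass its own cache_dir here.
cache_dir = "/tmp/euroeval-cache"

# Mirror HuggingFaceMetric.download: route the metric download to a dedicated
# sub-directory of the cache, so it can later be used without internet access.
download_config = DownloadConfig(cache_dir=Path(cache_dir, "evaluate"))
metric = evaluate.load(path="bertscore", download_config=download_config)
```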
euroeval/prompt_templates/linguistic_acceptability.py
CHANGED

@@ -97,7 +97,7 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
         default_prompt_prefix="Hetta eru nakrir setningar og um teir eru mállæruliga "
         "rættir.",
         default_prompt_template="Setningur: {text}\nMállæruliga rættur: {label}",
-        default_instruction_prompt="Setningur: {text}\n\
+        default_instruction_prompt="Setningur: {text}\n\nGreindu hvort setningurin er "
         "mállæruliga rættur ella ikki. Svara við {labels_str}, og einki annað.",
     ),
     FR: PromptConfig(
@@ -111,11 +111,12 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
     ),
     IS: PromptConfig(
         default_prompt_label_mapping=dict(correct="já", incorrect="nei"),
-        default_prompt_prefix="
-        "málfræðilega réttar.",
+        default_prompt_prefix="Hér fyrir neðan eru setningar ásamt mati á því hvort "
+        "þær eru málfræðilega réttar.",
         default_prompt_template="Setning: {text}\nMálfræðilega rétt: {label}",
-        default_instruction_prompt="Setning: {text}\n\
-        "málfræðilega rétt
+        default_instruction_prompt="Setning: {text}\n\nGreindu hvort setningin er "
+        "málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún "
+        "er það ekki.",
     ),
     IT: PromptConfig(
         default_prompt_label_mapping=dict(correct="si", incorrect="no"),

euroeval/prompt_templates/named_entity_recognition.py
CHANGED

@@ -176,7 +176,7 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
         default_prompt_prefix="Her eru nakrir setningar og nakrar JSON orðabøkur við "
         "nevndar eindir, sum eru í setningunum.",
         default_prompt_template="Setningur: {text}\nNevndar eindir: {label}",
-        default_instruction_prompt="Setningur: {text}\n\
+        default_instruction_prompt="Setningur: {text}\n\nGreindu nevndu einingarnar í "
         "setningunni. Þú ættir að skila þessu sem JSON orðabók með lyklunum "
         "{labels_str}. Gildin ættu að vera listi yfir nevndu einingarnar af "
         "þeirri gerð, nákvæmlega eins og þær koma fram í setningunni.",
@@ -215,8 +215,8 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
         },
         default_prompt_prefix="Eftirfarandi eru setningar ásamt JSON lyklum með "
         "nefndum einingum sem koma fyrir í setningunum.",
-        default_prompt_template="Setning: {text}\
-        default_instruction_prompt="Setning: {text}\n\
+        default_prompt_template="Setning: {text}\nNafneiningar: {label}",
+        default_instruction_prompt="Setning: {text}\n\nGreindu nefndu einingarnar í "
         "setningunni. Þú ættir að skila þessu sem JSON orðabók með lyklunum "
         "{labels_str}. Gildin ættu að vera listi yfir nefndu "
         "einingarnar af þeirri gerð, nákvæmlega eins og þær koma fram í "

euroeval/prompt_templates/sentiment_classification.py
CHANGED

@@ -137,11 +137,11 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
         default_prompt_label_mapping=dict(
             positive="jákvætt", neutral="hlutlaust", negative="neikvætt"
         ),
-        default_prompt_prefix="
-        "verið
-        default_prompt_template="
-        default_instruction_prompt="
-        "Svaraðu með {labels_str}, og ekkert annað.",
+        default_prompt_prefix="Hér fyrir neðan eru textabrot ásamt lyndisgildi þeirra "
+        "sem getur verið 'jákvætt', 'hlutlaust' eða 'neikvætt'.",
+        default_prompt_template="Textabrot: {text}\nViðhorf: {label}",
+        default_instruction_prompt="Textabrot: {text}\n\nGreindu lyndið í "
+        "textabrotinu. Svaraðu með {labels_str}, og ekkert annað.",
     ),
     IT: PromptConfig(
         default_prompt_label_mapping=dict(
euroeval/tasks.py
CHANGED
@@ -100,6 +100,7 @@ KNOW = Task(
     default_num_few_shot_examples=5,
     default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    default_allowed_model_types=[ModelType.GENERATIVE],
     uses_logprobs=True,
 )
 
@@ -112,6 +113,7 @@ MCRC = Task(
     default_num_few_shot_examples=5,
     default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    default_allowed_model_types=[ModelType.GENERATIVE],
     uses_logprobs=True,
 )
 
@@ -124,6 +126,7 @@ COMMON_SENSE = Task(
     default_num_few_shot_examples=5,
     default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    default_allowed_model_types=[ModelType.GENERATIVE],
     uses_logprobs=True,
 )
 
euroeval/tokenisation_utils.py
CHANGED
@@ -551,7 +551,6 @@ def apply_chat_template(
     tokeniser: "PreTrainedTokenizer",
     tokenise: bool,
     add_generation_prompt: bool,
-    enable_thinking: bool,
     **extra_kwargs,
 ) -> str | list[int]:
     """Apply the chat template to a prompt.
@@ -568,10 +567,6 @@ def apply_chat_template(
             Whether to add a generation prompt at the end of the conversation. This is
             only relevant for regular Hugging Face tokenisers, as Mistral tokenisers
             always add a generation prompt.
-        enable_thinking:
-            Whether to enable special handling for reasoning models, such as adding
-            special tokens for thinking. This is only relevant for regular Hugging
-            Face tokenisers, as Mistral tokenisers always handle reasoning models.
         **extra_kwargs:
             Extra keyword arguments to pass to the tokeniser's `apply_chat_template`
             method. Only relevant for regular Hugging Face tokenisers.
@@ -601,7 +596,6 @@ def apply_chat_template(
         conversation=conversation,
         add_generation_prompt=add_generation_prompt,
         tokenize=tokenise,
-        enable_thinking=enable_thinking,
         **extra_kwargs,
     )
     return templated_prompt
euroeval/types.py
CHANGED
@@ -8,8 +8,7 @@ if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
     from numpy.typing import NDArray
 
-    from .data_models import GenerativeModelOutput
-
+    from .data_models import BenchmarkConfig, GenerativeModelOutput
 
 ScoreDict: t.TypeAlias = dict[str, dict[str, float] | list[dict[str, float]]]
 Predictions: t.TypeAlias = "NDArray | list[str] | list[list[str]]"
@@ -27,6 +26,7 @@ class ComputeMetricsFunction(t.Protocol):
             "NDArray | list[str] | list[list[str]]",
         ],
         dataset: "Dataset",
+        benchmark_config: "BenchmarkConfig",
     ) -> dict[str, float]:
         """Compute the metrics.
 
euroeval/utils.py
CHANGED
@@ -8,6 +8,7 @@ import logging
 import os
 import random
 import re
+import socket
 import sys
 import typing as t
 import warnings
@@ -18,10 +19,8 @@ import demjson3
 import huggingface_hub as hf_hub
 import litellm
 import numpy as np
-import requests
 import torch
 from datasets.utils import disable_progress_bar
-from requests.exceptions import RequestException
 from transformers import logging as tf_logging
 
 from .exceptions import InvalidBenchmark, InvalidModel, NaNValueInModelOutput
@@ -54,6 +53,68 @@ def create_model_cache_dir(cache_dir: str, model_id: str) -> str:
     return str(cache_dir_path)
 
 
+def resolve_model_path(download_dir: str) -> str:
+    """Resolve the path to the directory containing the model config files and weights.
+
+    Args:
+        download_dir:
+            The download directory
+
+    Returns:
+        The path to the model.
+    """
+    model_path = Path(download_dir)
+    # Get the 'path safe' version of the model id, which is the last dir in the path
+    model_id_path = model_path.name
+    # Hf hub `cache_dir` puts the files in models--`model_id_path`/snapshots
+    model_path = model_path / f"models--{model_id_path}" / "snapshots"
+    if not model_path.exists():
+        raise InvalidModel(
+            f"Attempted to load models from the {model_path} directory, "
+            "but it does not exist."
+        )
+
+    # Get all files in the model path
+    found_files = [
+        found_file for found_file in model_path.rglob("*") if found_file.is_file()
+    ]
+    if not found_files:
+        raise InvalidModel(f"No model files found at {model_path}")
+
+    # Make sure that there arent multiples of the files found
+    if len(found_files) == len(set(found_files)):
+        raise InvalidModel(
+            f"Found multiple model config files for {model_id_path.strip('models--')}"
+            f"at {model_path}"
+        )
+
+    # Check that found_files contains at least a 'config.json'
+    config_file = next(
+        (file for file in found_files if file.name == "config.json"), None
+    )
+    if config_file is None:
+        raise InvalidModel(
+            f"Missing required file 'config.json' for {model_id_path.strip('models--')}"
+            f"at {model_path}"
+        )
+    model_path = config_file.parent
+
+    # As a precaution we also check that all of the files are in the same directory
+    # if not we create a new dir with symlinks to all of the files from all snapshots
+    # this is especially useful for vllm where we can only specify one folder and e.g.,
+    # the safetensors version of the weights was added in an unmerged PR
+    if not all(
+        [found_file.parent == found_files[0].parent for found_file in found_files]
+    ):
+        new_model_path = model_path.parent / "model_files"
+        new_model_path.mkdir(exist_ok=True)
+        for found_file in found_files:
+            Path(new_model_path / found_file.name).symlink_to(found_file)
+        model_path = new_model_path
+
+    return str(model_path)
+
+
 def clear_memory() -> None:
     """Clears the memory of unused items."""
     for gc_generation in range(3):
@@ -91,6 +152,9 @@ def block_terminal_output() -> None:
     libraries, disabled tokeniser progress bars when using Hugging Face tokenisers, and
     disables most of the logging from the `transformers` library.
     """
+    if os.getenv("FULL_LOG") == "1":
+        return
+
     # Ignore miscellaneous warnings
     warnings.filterwarnings("ignore", category=UserWarning)
     warnings.filterwarnings("ignore", category=FutureWarning)
@@ -196,6 +260,7 @@ def get_min_cuda_compute_capability() -> float | None:
     return float(f"{major}.{minor}")
 
 
+@cache
 def internet_connection_available() -> bool:
     """Checks if internet connection is available by pinging google.com.
 
@@ -203,10 +268,17 @@ def internet_connection_available() -> bool:
         Whether or not internet connection is available.
     """
     try:
-
+        s = socket.create_connection(("1.1.1.1", 80))
+        s.close()
         return True
-
-
+    # a bit ugly but we dont want to actually import the pytest-socket exceptions
+    # we catch all exceptions and check if the name matches any known errors
+    except Exception as e:
+        pytest_socket_errors = ["SocketConnectBlockedError", "SocketBlockedError"]
+        if type(e).__name__ in pytest_socket_errors or isinstance(e, OSError):
+            return False
+        else:
+            raise e
 
 
 class HiddenPrints:
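The connectivity check now opens a raw socket instead of going through `requests`, and the result is memoised with `@cache`. A standalone sketch of the same approach (the probe address mirrors the one in the diff; the timeout is an added assumption, since the release does not show one):

```
import socket
from functools import cache


@cache
def internet_connection_available(timeout: float = 3.0) -> bool:
    """Return True if a TCP connection to a public resolver can be opened."""
    try:
        # 1.1.1.1:80 matches the probe used in the release; the timeout is an
        # assumption added here so the check cannot hang indefinitely.
        with socket.create_connection(("1.1.1.1", 80), timeout=timeout):
            return True
    except OSError:
        return False


print(internet_connection_available())
```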
{euroeval-16.1.1.dist-info → euroeval-16.2.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 16.
+Version: 16.2.1
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,13 +61,13 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
-Requires-Dist:
-Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: timm>=1.0.19; extra == 'all'
+Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
-Requires-Dist:
-Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: timm>=1.0.19; extra == 'generative'
+Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown
 
 <div align='center'>
@@ -152,13 +152,13 @@ model:
 ```
 >>> from euroeval import Benchmarker
 >>> benchmark = Benchmarker()
->>> benchmark(model="<model>")
+>>> benchmark(model="<model-id>")
 ```
 
 To benchmark on a specific task and/or language, you simply specify the `task` or
 `language` arguments, shown here with same example as above:
 ```
->>> benchmark(model="<model>", task="sentiment-classification", language="da")
+>>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
 ```
 
 If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
@@ -168,6 +168,30 @@ models on the Danish sentiment classification task:
 >>> benchmark(task="sentiment-classification", language="da")
 ```
 
+### Benchmarking in an Offline Environment
+If you need to benchmark in an offline environment, you need to download the models,
+datasets and metrics beforehand. This can be done by adding the `--download-only`
+argument, from the command line, or the `download_only` argument, if benchmarking from a
+script. For example to download the model you want and all of the Danish sentiment
+classification datasets:
+```
+$ euroeval --model <model-id> --task sentiment-classification --language da --download-only
+```
+
+Or from a script:
+```
+>>> benchmark(
+...     model="<model-id>",
+...     task="sentiment-classification",
+...     language="da",
+...     download_only=True,
+... )
+```
+
+Please note: Offline benchmarking of adapter models is not currently supported. An
+internet connection will be required during evaluation. If offline support is important
+to you, please consider [opening an issue](https://github.com/EuroEval/EuroEval/issues).
+
 ### Benchmarking from Docker
 A Dockerfile is provided in the repo, which can be downloaded and run, without needing
 to clone the repo and installing from source. This can be fetched programmatically by
{euroeval-16.1.1.dist-info → euroeval-16.2.1.dist-info}/RECORD
CHANGED

@@ -1,15 +1,15 @@
-euroeval/__init__.py,sha256=
-euroeval/benchmark_config_factory.py,sha256=
-euroeval/benchmarker.py,sha256=
+euroeval/__init__.py,sha256=mXTjuGrEE-1fIS9x28oJKg-gNGt4q7y2E74l330KEmY,3787
+euroeval/benchmark_config_factory.py,sha256=eOQsd9F4cJy8I7a3_lIKDZ5b5ukipIUqk0GZ3pyytwQ,8596
+euroeval/benchmarker.py,sha256=5l4p1ncq4VJX_bDjv2f8oBq2GETPtJmduGOnLAbWjF8,55762
 euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
-euroeval/cli.py,sha256=
+euroeval/cli.py,sha256=GOAWzdtasJfOvTuVQszu-T1T9GfQ_un-blOICO-y7g4,9316
 euroeval/constants.py,sha256=NN7kcwQdlDyyGFSrLjsL_qKVRyoRqZ9sKO5SjlgtRwA,2741
 euroeval/data_loading.py,sha256=F3fHyR7FoS_a1dx_DyqtcxdB-jxWwE3RCNRvWcp5z1c,4527
-euroeval/data_models.py,sha256=
+euroeval/data_models.py,sha256=9Sgrq6Ktg1ETXRJ0v4VA_amAPowGuB7fZtL-8RlDQn0,27766
 euroeval/enums.py,sha256=SeFek-Lre2Q5sxbP5svqjDZFZR2vlJhg9dkRH4JvU1g,3436
 euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
 euroeval/finetuning.py,sha256=G86pxxjOAgtcEWpyYDwYOV9pM7WG2Uu9fu7GdDso8dI,11426
-euroeval/generation.py,sha256=
+euroeval/generation.py,sha256=Va3EOmFzOMBNfI4fh3nW5qhhrM3CBT8_4MaLwVtsF_E,12528
 euroeval/generation_utils.py,sha256=d2_vylWXIeH4xIXgbsI5rN6dMt0zKp0zXExD6aOKWaA,18299
 euroeval/languages.py,sha256=G2cJI8lDT7eOFHxNR9opJ6zWjdxFDwm8P8HY_4WKFI4,33815
 euroeval/model_cache.py,sha256=h61cL_fy2Sd1sqYZis5lAWqvQIfQXXt_v8QZeftKNkg,9226
@@ -17,16 +17,16 @@ euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
 euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
 euroeval/scores.py,sha256=HQQqyjdgm853FZ_ifIdnSltKfBhsY7pOITov6F3Et5o,3165
 euroeval/speed_benchmark.py,sha256=3iz_bfJgAoJ9K2HNjufyrBMjHVT8PAjuY_NocBGwKe0,4044
-euroeval/tasks.py,sha256=
-euroeval/tokenisation_utils.py,sha256=
-euroeval/types.py,sha256=
-euroeval/utils.py,sha256=
+euroeval/tasks.py,sha256=EzEWFDo_0ffabBFiRu-mw80jENUioE8D_VEn_Dsv-F8,4703
+euroeval/tokenisation_utils.py,sha256=nLeF2cdZSm5PZiAcDTtxY82nUJ-or8VU8YxYLa167EM,21158
+euroeval/types.py,sha256=_iVy-RwiCGu9TNX2sfyJTdCvXy1akNGTCywAo-YpBqU,2815
+euroeval/utils.py,sha256=DRJW6wtmNpRtuHt03diWo3S5m3rdxoPEQpd-KWi7aGY,19255
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=mHF8XS6GGUXV-sJtxmI5WJBWPLMHuh-4Z4OWjC25x9Y,11566
 euroeval/benchmark_modules/fresh.py,sha256=TveSQiFBi3xXgCEQBdHwkUQ685PDkKW0y3G5Yt5rkeM,10655
-euroeval/benchmark_modules/hf.py,sha256=
-euroeval/benchmark_modules/litellm.py,sha256=
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/hf.py,sha256=XmkoDFzaJqnd_5mmUkqCaOgAdRPFs3KZKZZ0cr83TlM,44742
+euroeval/benchmark_modules/litellm.py,sha256=F3udd6NmhQOe3go_7rAcWg7mgZrNQpWWvLe-5U4E2RQ,64771
+euroeval/benchmark_modules/vllm.py,sha256=yLy8TCTnodu4NdTiO7XSdxuHX60AJ1-7p6J3e5h7-iA,43994
 euroeval/dataset_configs/__init__.py,sha256=uuIZmElpJV8iupo5oDj3TeQhBDRANdWpLKYFASLirHA,2046
 euroeval/dataset_configs/danish.py,sha256=QABfgI7m-0-5AimDXegp5ssDSLcM2VrAI_RWsinSZP4,5631
 euroeval/dataset_configs/dutch.py,sha256=63Ro2yFym5MuIDXf5953vUYenw9B0kZSCmZbXjdy4Rs,5517
@@ -45,17 +45,17 @@ euroeval/dataset_configs/portuguese.py,sha256=gQ054SdLQ5fkm4IAP6Mdh5RcPDJPDITcuy
 euroeval/dataset_configs/spanish.py,sha256=DvJlMK6OQg4qmxKzQA2IficlBMB7BafvxqIVuTKiZyw,4902
 euroeval/dataset_configs/swedish.py,sha256=YWHp7hbJ25o36csSg9uXaQCEJK1BPb7u2RQZiCe0lNs,5445
 euroeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,199
-euroeval/metrics/base.py,sha256=
-euroeval/metrics/huggingface.py,sha256=
+euroeval/metrics/base.py,sha256=HST2XeZrUQZV_vTiieePiaznEov3CIGzuVNIITtLsQc,2596
+euroeval/metrics/huggingface.py,sha256=iHKJnvOXRc_e8sxB2ff3WkfK64jXyn5KEnIxPyfD2fM,6522
 euroeval/metrics/llm_as_a_judge.py,sha256=YCUHWK3_bkMEYvL7Q79ZAK3V0M1m5rq5zJYdtMxa4fs,9686
 euroeval/metrics/pipeline.py,sha256=Wcan3eDWV7t4WRXMPWCCe_JsA-fZnIfZU2ESinbbL2I,10284
 euroeval/metrics/speed.py,sha256=tLna031y0SVzAv6lvXBxf8IOSiw9dvLlonky2zM3MnE,1369
 euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
-euroeval/prompt_templates/linguistic_acceptability.py,sha256=
+euroeval/prompt_templates/linguistic_acceptability.py,sha256=m23LrckohdnToQDsexdsW_5YyBfGTf5DTjiMI643F9A,8717
 euroeval/prompt_templates/multiple_choice.py,sha256=Q-8-ETqG-RZeLzR8v8WUBIN7djiNSfNpmYnZRUWcd84,6905
-euroeval/prompt_templates/named_entity_recognition.py,sha256=
+euroeval/prompt_templates/named_entity_recognition.py,sha256=HIX9EBkSIBl5JXceFtiZTdvzWr9YHM9-55D6bcjIyQ4,16436
 euroeval/prompt_templates/reading_comprehension.py,sha256=ogzmhiSZO6egrdxxQiWz6a0XMdC0vws-lg5yRKQoYV0,8730
-euroeval/prompt_templates/sentiment_classification.py,sha256=
+euroeval/prompt_templates/sentiment_classification.py,sha256=b3TvH26M77vwFfn577NlGVW881qfV7YSm-Xba_w98Fc,9504
 euroeval/prompt_templates/summarization.py,sha256=4Sqwj6C7yNfqj4FFFCseJMLDoSZ13aIOgY0SjIzzsNo,6593
 euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_group_utils/multiple_choice_classification.py,sha256=i5sidJGAXnENRoB6pOelyaUeGP1qoxwPSzD-F9RLwWk,7106
@@ -63,8 +63,8 @@ euroeval/task_group_utils/question_answering.py,sha256=eUczZntrC9lhCUQlwNQB49i-5
 euroeval/task_group_utils/sequence_classification.py,sha256=TAqZCoMQ9I-HFhMH35_J1mY2SQg95HUbXcgrBIyhgk0,16082
 euroeval/task_group_utils/text_to_text.py,sha256=7f4hGAs5WNJ9PmW1mLhjDMrPxrYAvw5axXsneiJop1w,4993
 euroeval/task_group_utils/token_classification.py,sha256=Yjai937ia1nZBMOWySqCXr_dA6WiVLGvmb4Hm_TU0Bg,17118
-euroeval-16.
-euroeval-16.
-euroeval-16.
-euroeval-16.
-euroeval-16.
+euroeval-16.2.1.dist-info/METADATA,sha256=brIXZ3x3MUf-ggNpKKC_4Lvrqem0MfKPrJ8DZJ5T3Iw,14590
+euroeval-16.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-16.2.1.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
+euroeval-16.2.1.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+euroeval-16.2.1.dist-info/RECORD,,

{euroeval-16.1.1.dist-info → euroeval-16.2.1.dist-info}/WHEEL
File without changes

{euroeval-16.1.1.dist-info → euroeval-16.2.1.dist-info}/entry_points.txt
File without changes

{euroeval-16.1.1.dist-info → euroeval-16.2.1.dist-info}/licenses/LICENSE
File without changes