EuroEval 16.1.1-py3-none-any.whl → 16.2.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


euroeval/__init__.py CHANGED
@@ -12,12 +12,13 @@ import warnings
12
12
  from termcolor import colored
13
13
 
14
14
  # Block specific warnings before importing anything else, as they can be noisy
15
- warnings.filterwarnings("ignore", category=UserWarning)
16
- warnings.filterwarnings("ignore", category=FutureWarning)
17
- logging.getLogger("httpx").setLevel(logging.CRITICAL)
18
- logging.getLogger("datasets").setLevel(logging.CRITICAL)
19
- logging.getLogger("vllm").setLevel(logging.CRITICAL)
20
- os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
15
+ if os.getenv("FULL_LOG") != "1":
16
+ warnings.filterwarnings("ignore", category=UserWarning)
17
+ warnings.filterwarnings("ignore", category=FutureWarning)
18
+ logging.getLogger("httpx").setLevel(logging.CRITICAL)
19
+ logging.getLogger("datasets").setLevel(logging.CRITICAL)
20
+ logging.getLogger("vllm").setLevel(logging.CRITICAL)
21
+ os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
21
22
 
22
23
  # Set up logging
23
24
  fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
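The warning and log suppression is now opt-out rather than unconditional. A minimal sketch of how a user could restore full output, assuming the `FULL_LOG` variable is read at import time exactly as in the gate above:

```python
# Sketch: opt out of EuroEval's default warning/log suppression. The variable
# must be set before euroeval is imported, since the gate above runs at import time.
import os

os.environ["FULL_LOG"] = "1"

import euroeval  # noqa: E402  # warnings and third-party logs stay enabled
```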
@@ -6,9 +6,9 @@ import typing as t
6
6
 
7
7
  import torch
8
8
 
9
- from .data_models import BenchmarkConfig
9
+ from .data_models import BenchmarkConfig, BenchmarkConfigParams
10
10
  from .dataset_configs import get_all_dataset_configs
11
- from .enums import Device, GenerativeType
11
+ from .enums import Device
12
12
  from .exceptions import InvalidBenchmark
13
13
  from .languages import get_all_languages
14
14
  from .tasks import SPEED, get_all_tasks
@@ -21,150 +21,66 @@ logger = logging.getLogger("euroeval")
21
21
 
22
22
 
23
23
  def build_benchmark_config(
24
- progress_bar: bool,
25
- save_results: bool,
26
- task: str | list[str] | None,
27
- dataset: str | list[str] | None,
28
- language: str | list[str],
29
- model_language: str | list[str] | None,
30
- dataset_language: str | list[str] | None,
31
- device: Device | None,
32
- batch_size: int,
33
- raise_errors: bool,
34
- cache_dir: str,
35
- api_key: str | None,
36
- force: bool,
37
- verbose: bool,
38
- trust_remote_code: bool,
39
- clear_model_cache: bool,
40
- evaluate_test_split: bool,
41
- few_shot: bool,
42
- num_iterations: int,
43
- api_base: str | None,
44
- api_version: str | None,
45
- gpu_memory_utilization: float,
46
- generative_type: GenerativeType | None,
47
- debug: bool,
48
- run_with_cli: bool,
49
- requires_safetensors: bool,
24
+ benchmark_config_params: BenchmarkConfigParams,
50
25
  ) -> BenchmarkConfig:
51
26
  """Create a benchmark configuration.
52
27
 
53
28
  Args:
54
- progress_bar:
55
- Whether to show a progress bar when running the benchmark.
56
- save_results:
57
- Whether to save the benchmark results to a file.
58
- task:
59
- The tasks to include for dataset. If None then datasets will not be
60
- filtered based on their task.
61
- dataset:
62
- The datasets to include for task. If None then all datasets will be
63
- included, limited by the `task` parameter.
64
- language:
65
- The language codes of the languages to include, both for models and
66
- datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
67
- to 'all' if all languages should be considered.
68
- model_language:
69
- The language codes of the languages to include for models. If None then
70
- the `language` parameter will be used.
71
- dataset_language:
72
- The language codes of the languages to include for datasets. If None then
73
- the `language` parameter will be used.
74
- device:
75
- The device to use for running the models. If None then the device will be
76
- set automatically.
77
- batch_size:
78
- The batch size to use for running the models.
79
- raise_errors:
80
- Whether to raise errors when running the benchmark.
81
- cache_dir:
82
- The directory to use for caching the models.
83
- api_key:
84
- The API key to use for a given inference server.
85
- force:
86
- Whether to force the benchmark to run even if the results are already
87
- cached.
88
- verbose:
89
- Whether to print verbose output when running the benchmark. This is
90
- automatically set if `debug` is True.
91
- trust_remote_code:
92
- Whether to trust remote code when running the benchmark.
93
- clear_model_cache:
94
- Whether to clear the model cache before running the benchmark.
95
- evaluate_test_split:
96
- Whether to use the test split for the datasets.
97
- few_shot:
98
- Whether to use few-shot learning for the models.
99
- num_iterations:
100
- The number of iterations each model should be evaluated for.
101
- api_base:
102
- The base URL for a given inference API. Only relevant if `model` refers to a
103
- model on an inference API.
104
- api_version:
105
- The version of the API to use for a given inference API.
106
- gpu_memory_utilization:
107
- The GPU memory utilization to use for vLLM. A larger value will result in
108
- faster evaluation, but at the risk of running out of GPU memory. Only reduce
109
- this if you are running out of GPU memory. Only relevant if the model is
110
- generative.
111
- generative_type:
112
- The type of generative model. Only relevant if the model is generative. If
113
- not specified, the type will be inferred automatically.
114
- debug:
115
- Whether to run the benchmark in debug mode.
116
- run_with_cli:
117
- Whether the benchmark is being run with the CLI.
118
- requires_safetensors:
119
- Whether to only allow evaluations of models stored as safetensors.
29
+ benchmark_config_params:
30
+ The parameters for creating the benchmark configuration.
120
31
 
121
32
  Returns:
122
33
  The benchmark configuration.
123
34
  """
124
- language_codes = get_correct_language_codes(language_codes=language)
35
+ language_codes = get_correct_language_codes(
36
+ language_codes=benchmark_config_params.language
37
+ )
125
38
  model_languages = prepare_languages(
126
- language_codes=model_language, default_language_codes=language_codes
39
+ language_codes=benchmark_config_params.model_language,
40
+ default_language_codes=language_codes,
127
41
  )
128
42
  dataset_languages = prepare_languages(
129
- language_codes=dataset_language, default_language_codes=language_codes
43
+ language_codes=benchmark_config_params.dataset_language,
44
+ default_language_codes=language_codes,
130
45
  )
131
46
 
132
47
  tasks, datasets = prepare_tasks_and_datasets(
133
- task=task, dataset=dataset, dataset_languages=dataset_languages
48
+ task=benchmark_config_params.task,
49
+ dataset=benchmark_config_params.dataset,
50
+ dataset_languages=dataset_languages,
134
51
  )
135
52
 
136
- torch_device = prepare_device(device=device)
137
-
138
- # Set variable with number of iterations
139
- if hasattr(sys, "_called_from_test"):
140
- num_iterations = 1
141
-
142
53
  return BenchmarkConfig(
143
54
  model_languages=model_languages,
144
55
  dataset_languages=dataset_languages,
145
56
  tasks=tasks,
146
57
  datasets=datasets,
147
- batch_size=batch_size,
148
- raise_errors=raise_errors,
149
- cache_dir=cache_dir,
150
- api_key=api_key,
151
- force=force,
152
- progress_bar=progress_bar,
153
- save_results=save_results,
154
- verbose=verbose or debug,
155
- device=torch_device,
156
- trust_remote_code=trust_remote_code,
157
- clear_model_cache=clear_model_cache,
158
- evaluate_test_split=evaluate_test_split,
159
- few_shot=few_shot,
160
- num_iterations=num_iterations,
161
- api_base=api_base,
162
- api_version=api_version,
163
- gpu_memory_utilization=gpu_memory_utilization,
164
- generative_type=generative_type,
165
- debug=debug,
166
- run_with_cli=run_with_cli,
167
- requires_safetensors=requires_safetensors,
58
+ batch_size=benchmark_config_params.batch_size,
59
+ raise_errors=benchmark_config_params.raise_errors,
60
+ cache_dir=benchmark_config_params.cache_dir,
61
+ api_key=benchmark_config_params.api_key,
62
+ force=benchmark_config_params.force,
63
+ progress_bar=benchmark_config_params.progress_bar,
64
+ save_results=benchmark_config_params.save_results,
65
+ verbose=benchmark_config_params.verbose or benchmark_config_params.debug,
66
+ device=prepare_device(device=benchmark_config_params.device),
67
+ trust_remote_code=benchmark_config_params.trust_remote_code,
68
+ clear_model_cache=benchmark_config_params.clear_model_cache,
69
+ evaluate_test_split=benchmark_config_params.evaluate_test_split,
70
+ few_shot=benchmark_config_params.few_shot,
71
+ num_iterations=(
72
+ 1
73
+ if hasattr(sys, "_called_from_test")
74
+ else benchmark_config_params.num_iterations
75
+ ),
76
+ api_base=benchmark_config_params.api_base,
77
+ api_version=benchmark_config_params.api_version,
78
+ gpu_memory_utilization=benchmark_config_params.gpu_memory_utilization,
79
+ generative_type=benchmark_config_params.generative_type,
80
+ debug=benchmark_config_params.debug,
81
+ run_with_cli=benchmark_config_params.run_with_cli,
82
+ requires_safetensors=benchmark_config_params.requires_safetensors,
83
+ download_only=benchmark_config_params.download_only,
168
84
  )
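The factory now receives the whole parameter set as a single pydantic object instead of roughly twenty-five keyword arguments. An illustrative toy model of the new call shape; the field names below are only a small subset of the real `BenchmarkConfigParams`:

```python
# Toy stand-in for BenchmarkConfigParams, purely to illustrate the call shape.
import pydantic


class TinyParams(pydantic.BaseModel):
    language: str | list[str] = "da"
    batch_size: int = 32
    download_only: bool = False


params = TinyParams(language=["da", "sv"], download_only=True)

# Before: build_benchmark_config(**params.model_dump())
# After:  build_benchmark_config(benchmark_config_params=params)
```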
169
85
 
170
86
 
@@ -146,21 +146,25 @@ class HuggingFaceEncoderModel(BenchmarkModule):
146
146
  Returns:
147
147
  The number of parameters in the model.
148
148
  """
149
- token = get_hf_token(api_key=self.benchmark_config.api_key)
150
- hf_api = HfApi(token=token)
151
- try:
152
- repo_info = hf_api.model_info(
153
- repo_id=self.model_config.adapter_base_model_id
154
- or self.model_config.model_id,
155
- revision=self.model_config.revision,
156
- )
157
- except (
158
- RepositoryNotFoundError,
159
- RevisionNotFoundError,
160
- RequestException,
161
- HFValidationError,
162
- ):
149
+ # No need to try to use the API if we have no internet.
150
+ if not internet_connection_available():
163
151
  repo_info = None
152
+ else:
153
+ token = get_hf_token(api_key=self.benchmark_config.api_key)
154
+ hf_api = HfApi(token=token)
155
+ try:
156
+ repo_info = hf_api.model_info(
157
+ repo_id=self.model_config.adapter_base_model_id
158
+ or self.model_config.model_id,
159
+ revision=self.model_config.revision,
160
+ )
161
+ except (
162
+ RepositoryNotFoundError,
163
+ RevisionNotFoundError,
164
+ RequestException,
165
+ HFValidationError,
166
+ ):
167
+ repo_info = None
164
168
 
165
169
  if (
166
170
  repo_info is not None
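The lookup above skips the Hub entirely when there is no connectivity and otherwise falls back to `None` on any lookup failure. A standalone sketch of the same pattern; `connection_available` is a hypothetical stand-in for EuroEval's `internet_connection_available`:

```python
# Standalone sketch of the offline-first repo lookup shown above.
import socket

from huggingface_hub import HfApi
from huggingface_hub.utils import RepositoryNotFoundError, RevisionNotFoundError


def connection_available(host: str = "1.1.1.1", port: int = 80) -> bool:
    """Cheap connectivity probe, mirroring the socket-based check in utils.py."""
    try:
        socket.create_connection((host, port), timeout=3).close()
        return True
    except OSError:
        return False


def fetch_repo_info(repo_id: str, revision: str = "main"):
    if not connection_available():
        return None  # offline: behave as if no repo metadata is available
    try:
        return HfApi().model_info(repo_id=repo_id, revision=revision)
    except (RepositoryNotFoundError, RevisionNotFoundError):
        return None
```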
@@ -558,7 +562,7 @@ def load_model_and_tokeniser(
558
562
  The benchmark configuration
559
563
 
560
564
  Returns:
561
- The loaded model and tokeniser.
565
+ A pair (model, tokeniser), with the loaded model and tokeniser
562
566
  """
563
567
  config: "PretrainedConfig"
564
568
  block_terminal_output()
@@ -686,6 +690,7 @@ def load_model_and_tokeniser(
686
690
  model=model,
687
691
  model_id=model_id,
688
692
  trust_remote_code=benchmark_config.trust_remote_code,
693
+ model_cache_dir=model_config.model_cache_dir,
689
694
  )
690
695
 
691
696
  return model, tokeniser
@@ -722,6 +727,11 @@ def get_model_repo_info(
722
727
  ):
723
728
  model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
724
729
 
730
+ # If we have no internet, and the model_id is not a directory for a local model
731
+ # we also just create a dummy model info object.
732
+ elif not internet_connection_available():
733
+ model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
734
+
725
735
  # If the model does not exist locally, then we get the model info from the Hugging
726
736
  # Face Hub, if possible
727
737
  if model_info is None:
@@ -867,7 +877,10 @@ def get_model_repo_info(
867
877
 
868
878
 
869
879
  def load_tokeniser(
870
- model: "PreTrainedModel | None", model_id: str, trust_remote_code: bool
880
+ model: "PreTrainedModel | None",
881
+ model_id: str,
882
+ trust_remote_code: bool,
883
+ model_cache_dir: str,
871
884
  ) -> "PreTrainedTokenizer":
872
885
  """Load the tokeniser.
873
886
 
@@ -889,6 +902,7 @@ def load_tokeniser(
889
902
  trust_remote_code=trust_remote_code,
890
903
  padding_side="right",
891
904
  truncation_side="right",
905
+ cache_dir=model_cache_dir,
892
906
  )
893
907
 
894
908
  # If the model is a subclass of a certain model types then we have to add a prefix
@@ -999,6 +1013,7 @@ def load_hf_model_config(
999
1013
  token=get_hf_token(api_key=api_key),
1000
1014
  trust_remote_code=trust_remote_code,
1001
1015
  cache_dir=model_cache_dir,
1016
+ local_files_only=not internet_connection_available(),
1002
1017
  )
1003
1018
  if config.eos_token_id is not None and config.pad_token_id is None:
1004
1019
  if isinstance(config.eos_token_id, list):
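Passing `cache_dir` together with `local_files_only` is what makes a later offline run work: everything is read from the per-model cache directory and no download is attempted. A hedged sketch with an illustrative model id and cache path:

```python
# Sketch of offline-aware loading with transformers; id and path are placeholders.
from transformers import AutoConfig, AutoTokenizer

offline = True  # e.g. `not internet_connection_available()`
model_id = "my-org/my-model"
cache_dir = "/data/euroeval_cache/model_cache/my-org--my-model"

config = AutoConfig.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    local_files_only=offline,  # fail fast instead of hitting the Hub
)
tokeniser = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=cache_dir,
    local_files_only=offline,
    padding_side="right",
    truncation_side="right",
)
```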
@@ -984,6 +984,7 @@ class LiteLLMModel(BenchmarkModule):
984
984
  model=None,
985
985
  model_id=model_id,
986
986
  trust_remote_code=self.benchmark_config.trust_remote_code,
987
+ model_cache_dir=self.model_config.model_cache_dir,
987
988
  )
988
989
 
989
990
  if (
@@ -1066,6 +1067,7 @@ class LiteLLMModel(BenchmarkModule):
1066
1067
  model=None,
1067
1068
  model_id=model_id,
1068
1069
  trust_remote_code=self.benchmark_config.trust_remote_code,
1070
+ model_cache_dir=self.model_config.model_cache_dir,
1069
1071
  )
1070
1072
 
1071
1073
  all_max_lengths: list[int] = list()
@@ -72,7 +72,9 @@ from ..utils import (
72
72
  create_model_cache_dir,
73
73
  get_hf_token,
74
74
  get_min_cuda_compute_capability,
75
+ internet_connection_available,
75
76
  log_once,
77
+ resolve_model_path,
76
78
  split_model_id,
77
79
  )
78
80
  from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
@@ -146,7 +148,7 @@ class VLLMModel(HuggingFaceEncoderModel):
146
148
  )
147
149
 
148
150
  self.end_of_reasoning_token = get_end_of_reasoning_token(
149
- model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
151
+ model=self._model, tokeniser=self._tokeniser, model_config=model_config
150
152
  )
151
153
  self.end_of_chat_token_ids = get_end_of_chat_token_ids(
152
154
  tokeniser=self._tokeniser, generative_type=self.generative_type
@@ -834,10 +836,15 @@ def load_model_and_tokeniser(
834
836
 
835
837
  clear_vllm()
836
838
 
839
+ # if we do not have an internet connection we need to give the path to the folder
840
+ # that contains the model weights and config files, otherwise vLLM will try to
841
+ # download them regardless if they are already present in the download_dir
842
+ model_path = resolve_model_path(download_dir)
843
+
837
844
  try:
838
845
  model = LLM(
839
- model=model_id,
840
- tokenizer=model_id,
846
+ model=model_id if internet_connection_available() else model_path,
847
+ tokenizer=model_id if internet_connection_available() else model_path,
841
848
  gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
842
849
  max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
843
850
  download_dir=download_dir,
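When offline, vLLM is given the resolved snapshot directory instead of the Hub id, so it loads the files that are already present under `download_dir`. A sketch with placeholder paths:

```python
# Sketch only; the id and paths are placeholders and vLLM must be installed.
from vllm import LLM

offline = True
model_id = "my-org/my-model"
download_dir = "/data/euroeval_cache/model_cache/my-org--my-model"
# e.g. what resolve_model_path(download_dir) would return:
local_path = download_dir + "/models--my-org--my-model/snapshots/<revision-hash>"

llm = LLM(
    model=model_id if not offline else local_path,
    tokenizer=model_id if not offline else local_path,
    download_dir=download_dir,
)
```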
@@ -925,6 +932,7 @@ def load_tokeniser(
925
932
  cache_dir=model_cache_dir,
926
933
  token=token,
927
934
  trust_remote_code=trust_remote_code,
935
+ local_files_only=not internet_connection_available(),
928
936
  )
929
937
  num_retries = 5
930
938
  for _ in range(num_retries):
@@ -937,8 +945,10 @@ def load_tokeniser(
937
945
  padding_side="left",
938
946
  truncation_side="left",
939
947
  model_max_length=model_max_length,
948
+ cache_dir=model_cache_dir,
940
949
  config=config,
941
950
  token=token,
951
+ local_files_only=not internet_connection_available(),
942
952
  )
943
953
  break
944
954
  except (json.JSONDecodeError, OSError, TypeError) as e:
@@ -996,7 +1006,7 @@ def clear_vllm() -> None:
996
1006
 
997
1007
 
998
1008
  def get_end_of_reasoning_token(
999
- model: "LLM", tokeniser: "PreTrainedTokenizer", model_id: str
1009
+ model: "LLM", tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
1000
1010
  ) -> str | None:
1001
1011
  """Get the end-of-reasoning token for a generative model.
1002
1012
 
@@ -1005,21 +1015,26 @@ def get_end_of_reasoning_token(
1005
1015
  The vLLM model.
1006
1016
  tokeniser:
1007
1017
  The tokeniser.
1008
- model_id:
1009
- The model ID.
1018
+ model_config:
1019
+ The model configuration.
1010
1020
 
1011
1021
  Returns:
1012
1022
  The end of reasoning token, or None if it could not be found.
1013
1023
  """
1024
+ model_id = model_config.model_id
1025
+
1014
1026
  # Create a prompt to check if the model uses the reasoning tokens
1015
1027
  prompt = "What is your name?"
1016
1028
  if has_chat_template(tokeniser=tokeniser):
1029
+ extra_kwargs = dict()
1030
+ if model_config.param in {"thinking", "no-thinking"}:
1031
+ extra_kwargs["enable_thinking"] = model_config.param == "thinking"
1017
1032
  templated_prompt = apply_chat_template(
1018
1033
  conversation=[dict(role="user", content=prompt)],
1019
1034
  tokeniser=tokeniser,
1020
1035
  tokenise=False,
1021
1036
  add_generation_prompt=True,
1022
- enable_thinking=True,
1037
+ **extra_kwargs,
1023
1038
  )
1024
1039
  assert isinstance(templated_prompt, str)
1025
1040
  prompt = templated_prompt
@@ -1042,8 +1057,8 @@ def get_end_of_reasoning_token(
1042
1057
  if not bor_reasoning_matches:
1043
1058
  log_once(
1044
1059
  f"The model {model_id!r} did not generate any beginning-of-reasoning "
1045
- "tokens in the prompt or the completion. Assuming the model is not "
1046
- "a reasoning model.",
1060
+ "tokens in the prompt or the completion. Assuming the model is not a "
1061
+ "reasoning model.",
1047
1062
  level=logging.DEBUG,
1048
1063
  )
1049
1064
  return None
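The `enable_thinking` flag is now only forwarded when the model was requested with a `#thinking` or `#no-thinking` parameter. A sketch of the same toggle against a plain Hugging Face tokeniser; whether the kwarg has any effect depends on the model's chat template (Qwen3-style templates honour it):

```python
# Sketch: conditionally forward `enable_thinking` to the chat template.
from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")  # illustrative model

model_param = "no-thinking"  # e.g. parsed from "my-org/my-model#no-thinking"
extra_kwargs = {}
if model_param in {"thinking", "no-thinking"}:
    extra_kwargs["enable_thinking"] = model_param == "thinking"

prompt = tokeniser.apply_chat_template(
    [{"role": "user", "content": "What is your name?"}],
    tokenize=False,
    add_generation_prompt=True,
    **extra_kwargs,
)
print(prompt)
```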
euroeval/benchmarker.py CHANGED
@@ -16,7 +16,7 @@ from torch.distributed import destroy_process_group
16
16
 
17
17
  from .benchmark_config_factory import build_benchmark_config
18
18
  from .constants import GENERATIVE_PIPELINE_TAGS
19
- from .data_loading import load_data
19
+ from .data_loading import load_data, load_raw_data
20
20
  from .data_models import BenchmarkConfigParams, BenchmarkResult
21
21
  from .dataset_configs import get_all_dataset_configs
22
22
  from .enums import Device, GenerativeType, ModelType
@@ -28,7 +28,12 @@ from .model_loading import load_model
28
28
  from .scores import log_scores
29
29
  from .speed_benchmark import benchmark_speed
30
30
  from .tasks import SPEED
31
- from .utils import enforce_reproducibility, get_package_version
31
+ from .utils import (
32
+ enforce_reproducibility,
33
+ get_package_version,
34
+ internet_connection_available,
35
+ log_once,
36
+ )
32
37
 
33
38
  if t.TYPE_CHECKING:
34
39
  from .benchmark_modules import BenchmarkModule
@@ -83,6 +88,7 @@ class Benchmarker:
83
88
  debug: bool = False,
84
89
  run_with_cli: bool = False,
85
90
  requires_safetensors: bool = False,
91
+ download_only: bool = False,
86
92
  ) -> None:
87
93
  """Initialise the benchmarker.
88
94
 
@@ -164,14 +170,26 @@ class Benchmarker:
164
170
  requires_safetensors:
165
171
  Whether to only allow models that use the safetensors format. Defaults
166
172
  to False.
173
+ download_only:
174
+ Whether to only download models and datasets without performing any
175
+ benchmarking. Defaults to False.
167
176
 
168
177
  Raises:
169
178
  ValueError:
170
- If both `task` and `dataset` are specified.
179
+ If both `task` and `dataset` are specified, or if `download_only`
180
+ is True and we have no internet connection.
171
181
  """
172
182
  if task is not None and dataset is not None:
173
183
  raise ValueError("Only one of `task` and `dataset` can be specified.")
174
184
 
185
+ if not internet_connection_available() and download_only:
186
+ msg = "It appears you do not have an internet connection, but "
187
+ if run_with_cli:
188
+ msg += "the --download-only flag was set."
189
+ else:
190
+ msg += "the argument `download_only` was set to True."
191
+ raise ValueError(msg)
192
+
175
193
  # Bail early if hf_transfer is enabled but not installed.
176
194
  if HF_HUB_ENABLE_HF_TRANSFER and get_package_version("hf_transfer") is None:
177
195
  raise ImportError(
@@ -205,13 +223,14 @@ class Benchmarker:
205
223
  api_version=api_version,
206
224
  gpu_memory_utilization=gpu_memory_utilization,
207
225
  generative_type=generative_type,
226
+ download_only=download_only,
208
227
  debug=debug,
209
228
  run_with_cli=run_with_cli,
210
229
  requires_safetensors=requires_safetensors,
211
230
  )
212
231
 
213
232
  self.benchmark_config = build_benchmark_config(
214
- **self.benchmark_config_default_params.model_dump()
233
+ benchmark_config_params=self.benchmark_config_default_params
215
234
  )
216
235
 
217
236
  # Initialise variable storing model lists, so we only have to fetch it once
@@ -222,17 +241,82 @@ class Benchmarker:
222
241
 
223
242
  @property
224
243
  def benchmark_results(self) -> list[BenchmarkResult]:
225
- """The benchmark results."""
244
+ """The benchmark results.
245
+
246
+ Returns:
247
+ A list of benchmark results.
248
+
249
+ Raises:
250
+ ValueError:
251
+ If there is an error decoding a line in the results file.
252
+ """
226
253
  if self.results_path.exists():
254
+ benchmark_results: list[BenchmarkResult] = list()
227
255
  with self.results_path.open() as f:
228
- return [
229
- BenchmarkResult.from_dict(json.loads(line))
230
- for line in f
231
- if line.strip()
232
- ]
256
+ for line in f:
257
+ if line.strip():
258
+ try:
259
+ result_dict = json.loads(line.strip())
260
+ except json.JSONDecodeError as e:
261
+ raise ValueError(
262
+ f"Error decoding JSON line: {line.strip()}"
263
+ ) from e
264
+
265
+ # Fix for older records
266
+ has_old_raw_results = (
267
+ "results" in result_dict
268
+ and isinstance(result_dict["results"], dict)
269
+ and "raw" in result_dict["results"]
270
+ and isinstance(result_dict["results"]["raw"], dict)
271
+ and "test" in result_dict["results"]["raw"]
272
+ )
273
+ if has_old_raw_results:
274
+ result_dict["results"]["raw"] = result_dict["results"][
275
+ "raw"
276
+ ]["test"]
277
+
278
+ result = BenchmarkResult.from_dict(result_dict)
279
+ benchmark_results.append(result)
280
+ return benchmark_results
233
281
  else:
234
282
  return list()
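The legacy-record handling above can be exercised on its own: older result lines keep raw scores under `results["raw"]["test"]`, newer ones store them directly under `results["raw"]`. A sketch reading such a JSON-lines file (the file name is illustrative):

```python
# Sketch of reading a results JSONL file with the old-format normalisation.
import json
from pathlib import Path

results_path = Path("euroeval_benchmark_results.jsonl")  # illustrative name

records: list[dict] = []
for line in results_path.read_text().splitlines():
    if not line.strip():
        continue
    record = json.loads(line)
    raw = record.get("results", {}).get("raw")
    if isinstance(raw, dict) and "test" in raw:
        record["results"]["raw"] = raw["test"]  # migrate an old-format record
    records.append(record)
```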
235
283
 
284
+ def _download(
285
+ self,
286
+ dataset_config: "DatasetConfig",
287
+ model_config: "ModelConfig",
288
+ benchmark_config: "BenchmarkConfig",
289
+ ) -> None:
290
+ """Download data, metrics, and model for the given dataset, and model.
291
+
292
+ Args:
293
+ dataset_config: The configuration for the dataset.
294
+ model_config: The configuration for the model.
295
+ benchmark_config: The configuration for the benchmark.
296
+ """
297
+ log_once(f"Loading data for {dataset_config.pretty_name}", level=logging.INFO)
298
+ dataset = load_raw_data(
299
+ dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
300
+ )
301
+ del dataset
302
+
303
+ log_once(f"Loading model {model_config.model_id}", level=logging.INFO)
304
+ model = load_model(
305
+ model_config=model_config,
306
+ dataset_config=dataset_config,
307
+ benchmark_config=benchmark_config,
308
+ )
309
+ del model
310
+
311
+ log_once(
312
+ f"Loading metrics for the '{dataset_config.task.name}' task",
313
+ level=logging.INFO,
314
+ )
315
+ for metric_name in dataset_config.task.metrics:
316
+ log_once(f"Loading metric {metric_name.name}", level=logging.DEBUG)
317
+ metric = metric_name.download(cache_dir=benchmark_config.cache_dir)
318
+ del metric
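Together with the new `download_only` flag, `_download` is what lets a later run execute without network access. A hypothetical end-to-end usage with a placeholder model id:

```python
# Hypothetical download-only pass: populate the cache now, evaluate offline later.
from euroeval import Benchmarker

benchmarker = Benchmarker(download_only=True)
benchmarker.benchmark(
    model="my-org/my-model",
    task="sentiment-classification",
    language="da",
)
```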
319
+
236
320
  def benchmark(
237
321
  self,
238
322
  model: list[str] | str,
@@ -256,6 +340,7 @@ class Benchmarker:
256
340
  few_shot: bool | None = None,
257
341
  num_iterations: int | None = None,
258
342
  requires_safetensors: bool | None = None,
343
+ download_only: bool | None = None,
259
344
  ) -> list[BenchmarkResult]:
260
345
  """Benchmarks models on datasets.
261
346
 
@@ -336,6 +421,9 @@ class Benchmarker:
336
421
  requires_safetensors:
337
422
  Whether to only allow models that use the safetensors format. Defaults
338
423
  to the value specified when initialising the benchmarker.
424
+ download_only:
425
+ Whether to only download the models without evaluating them. Defaults
426
+ to the value specified when initialising the benchmarker.
339
427
 
340
428
  Returns:
341
429
  A list of benchmark results.
@@ -368,6 +456,7 @@ class Benchmarker:
368
456
  few_shot=few_shot,
369
457
  num_iterations=num_iterations,
370
458
  requires_safetensors=requires_safetensors,
459
+ download_only=download_only,
371
460
  )
372
461
 
373
462
  adjust_logging_level(verbose=benchmark_config.verbose)
@@ -395,6 +484,28 @@ class Benchmarker:
395
484
  num_finished_benchmarks += len(dataset_configs)
396
485
  continue
397
486
 
487
+ if model_config.adapter_base_model_id:
488
+ open_issue_msg = (
489
+ "If offline support is important to you, please "
490
+ "consider opening an issue at https://github.com/EuroEval/EuroEval/issues."
491
+ )
492
+ if not internet_connection_available():
493
+ raise InvalidModel(
494
+ "Offline benchmarking of models with adapters is not currently "
495
+ "supported. "
496
+ f"An active internet connection is required. {open_issue_msg}"
497
+ )
498
+ elif benchmark_config.download_only:
499
+ log_once(
500
+ "You are using download only mode with a model that includes "
501
+ "an adapter. "
502
+ "Please note: Offline benchmarking of adapter models is not "
503
+ "currently supported. "
504
+ "An internet connection will be required during evaluation. "
505
+ f"{open_issue_msg}",
506
+ level=logging.WARNING,
507
+ )
508
+
398
509
  loaded_model: BenchmarkModule | None = None
399
510
  benchmark_params_to_revert: dict[str, t.Any] = dict()
400
511
  for dataset_config in dataset_configs:
@@ -569,6 +680,7 @@ class Benchmarker:
569
680
  debug: bool | None = None,
570
681
  run_with_cli: bool | None = None,
571
682
  requires_safetensors: bool | None = None,
683
+ download_only: bool | None = None,
572
684
  ) -> "BenchmarkConfig":
573
685
  """Get an updated benchmark configuration.
574
686
 
@@ -645,6 +757,12 @@ class Benchmarker:
645
757
  requires_safetensors:
646
758
  Whether to only allow models that use the safetensors format. If None,
647
759
  then this value will not be updated.
760
+ download_only:
761
+ Whether to only download the models without evaluating them. If None,
762
+ then this value will not be updated.
763
+ download_only:
764
+ Whether to only download models and datasets without performing any
765
+ benchmarking. If None, then this value will not be updated.
648
766
 
649
767
  Returns:
650
768
  The updated benchmark configuration.
@@ -701,8 +819,10 @@ class Benchmarker:
701
819
  benchmark_config_params.run_with_cli = run_with_cli
702
820
  if requires_safetensors is not None:
703
821
  benchmark_config_params.requires_safetensors = requires_safetensors
822
+ if download_only is not None:
823
+ benchmark_config_params.download_only = download_only
704
824
 
705
- return build_benchmark_config(**benchmark_config_params.model_dump())
825
+ return build_benchmark_config(benchmark_config_params=benchmark_config_params)
706
826
 
707
827
  def _prepare_model_ids(self, model_id: list[str] | str) -> list[str]:
708
828
  """Prepare the model ID(s) to be benchmarked.
@@ -813,17 +933,19 @@ class Benchmarker:
813
933
  model_param=model_config.param,
814
934
  )
815
935
 
936
+ model_id_to_be_stored = model_config.model_id
937
+ if model_config.revision != "main":
938
+ model_id_to_be_stored += f"@{model_config.revision}"
939
+ if model_config.param is not None:
940
+ model_id_to_be_stored += f"#{model_config.param}"
941
+
816
942
  record = BenchmarkResult(
817
943
  dataset=dataset_config.name,
818
944
  task=dataset_config.task.name,
819
945
  dataset_languages=[
820
946
  language.code for language in dataset_config.languages
821
947
  ],
822
- model=(
823
- f"{model_config.model_id}@{model_config.revision}"
824
- if model_config.revision and model_config.revision != "main"
825
- else model_config.model_id
826
- ),
948
+ model=model_id_to_be_stored,
827
949
  results=results,
828
950
  num_model_parameters=model.num_params,
829
951
  max_sequence_length=model.model_max_length,
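The stored model identifier now encodes both the revision and the model parameter. A small sketch of the resulting format:

```python
# Sketch of the identifier format written to the results: a non-default revision
# is appended with '@' and a model parameter with '#'.
def stored_model_id(model_id: str, revision: str = "main", param: str | None = None) -> str:
    out = model_id
    if revision != "main":
        out += f"@{revision}"
    if param is not None:
        out += f"#{param}"
    return out


print(stored_model_id("my-org/my-model", revision="v1.0", param="thinking"))
# -> my-org/my-model@v1.0#thinking
```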
euroeval/cli.py CHANGED
@@ -216,6 +216,12 @@ from .tasks import get_all_tasks
216
216
  help="The type of generative model. Only relevant if the model is generative. If "
217
217
  "not specified, the type will be inferred automatically.",
218
218
  )
219
+ @click.option(
220
+ "--download-only",
221
+ is_flag=True,
222
+ help="Only download the requested model weights and datasets, and exit.",
223
+ default=False,
224
+ )
219
225
  def benchmark(
220
226
  model: tuple[str],
221
227
  dataset: tuple[str],
@@ -243,6 +249,7 @@ def benchmark(
243
249
  debug: bool,
244
250
  requires_safetensors: bool,
245
251
  generative_type: str | None,
252
+ download_only: bool,
246
253
  ) -> None:
247
254
  """Benchmark pretrained language models on language tasks."""
248
255
  models = list(model)
@@ -284,6 +291,7 @@ def benchmark(
284
291
  debug=debug,
285
292
  run_with_cli=True,
286
293
  requires_safetensors=requires_safetensors,
294
+ download_only=download_only,
287
295
  )
288
296
 
289
297
  # Perform the benchmark evaluation
euroeval/data_models.py CHANGED
@@ -228,6 +228,9 @@ class BenchmarkConfig:
228
228
  generative_type:
229
229
  The type of generative model to benchmark. Only relevant if the model is
230
230
  generative.
231
+ download_only:
232
+ Whether to only download the models, metrics and datasets without
233
+ evaluating.
231
234
  """
232
235
 
233
236
  model_languages: list[Language]
@@ -255,6 +258,7 @@ class BenchmarkConfig:
255
258
  run_with_cli: bool
256
259
  requires_safetensors: bool
257
260
  generative_type: GenerativeType | None
261
+ download_only: bool
258
262
 
259
263
 
260
264
  class BenchmarkConfigParams(pydantic.BaseModel):
@@ -285,6 +289,7 @@ class BenchmarkConfigParams(pydantic.BaseModel):
285
289
  api_version: str | None
286
290
  gpu_memory_utilization: float
287
291
  generative_type: GenerativeType | None
292
+ download_only: bool
288
293
  debug: bool
289
294
  run_with_cli: bool
290
295
  requires_safetensors: bool
euroeval/generation.py CHANGED
@@ -243,7 +243,9 @@ def generate_single_iteration(
243
243
  ground_truth = []
244
244
 
245
245
  itr_scores: dict[str, float] = model.compute_metrics(
246
- model_outputs_and_labels=(all_preds, ground_truth), dataset=dataset
246
+ model_outputs_and_labels=(all_preds, ground_truth),
247
+ dataset=dataset,
248
+ benchmark_config=benchmark_config,
247
249
  )
248
250
 
249
251
  return itr_scores
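`generate_single_iteration` now threads the benchmark configuration into `compute_metrics`, matching the updated `ComputeMetricsFunction` protocol in `types.py`. An illustrative callable with the same shape (types loosened for brevity):

```python
# Illustrative callable matching the updated compute-metrics signature.
import typing as t


def compute_dummy_metrics(
    model_outputs_and_labels: tuple[t.Sequence, t.Sequence],
    dataset: t.Any,            # a datasets.Dataset in EuroEval
    benchmark_config: t.Any,   # the BenchmarkConfig, e.g. for its cache_dir
) -> dict[str, float]:
    predictions, labels = model_outputs_and_labels
    correct = sum(pred == label for pred, label in zip(predictions, labels))
    return {"accuracy": correct / max(len(labels), 1)}
```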
euroeval/metrics/base.py CHANGED
@@ -42,6 +42,18 @@ class Metric(abc.ABC):
42
42
  else lambda x: (100 * x, f"{x:.2%}")
43
43
  )
44
44
 
45
+ def download(self, cache_dir: str) -> "Metric":
46
+ """Initiates the download of the metric if needed.
47
+
48
+ Args:
49
+ cache_dir:
50
+ The directory where the metric will be downloaded to.
51
+
52
+ Returns:
53
+ The metric object itself.
54
+ """
55
+ return self
56
+
45
57
  @abc.abstractmethod
46
58
  def __call__(
47
59
  self,
@@ -3,9 +3,11 @@
3
3
  import collections.abc as c
4
4
  import logging
5
5
  import typing as t
6
+ from pathlib import Path
6
7
 
7
8
  import evaluate
8
9
  import numpy as np
10
+ from datasets import DownloadConfig
9
11
 
10
12
  from ..utils import HiddenPrints
11
13
  from .base import Metric
@@ -76,6 +78,23 @@ class HuggingFaceMetric(Metric):
76
78
  )
77
79
  self.metric: "EvaluationModule | None" = None
78
80
 
81
+ def download(self, cache_dir: str) -> "HuggingFaceMetric":
82
+ """Initiates the download of the metric if needed.
83
+
84
+ Args:
85
+ cache_dir:
86
+ The directory where the metric will be downloaded to.
87
+
88
+ Returns:
89
+ The metric object itself.
90
+ """
91
+ # Annoying but needed to make the metric download to a different cache dir
92
+ download_config = DownloadConfig(cache_dir=Path(cache_dir, "evaluate"))
93
+ self.metric = evaluate.load(
94
+ path=self.huggingface_id, download_config=download_config
95
+ )
96
+ return self
97
+
79
98
  def __call__(
80
99
  self,
81
100
  predictions: c.Sequence,
@@ -103,7 +122,9 @@ class HuggingFaceMetric(Metric):
103
122
  The calculated metric score, or None if the score should be ignored.
104
123
  """
105
124
  if self.metric is None:
106
- self.metric = evaluate.load(path=self.huggingface_id)
125
+ self.download(cache_dir=benchmark_config.cache_dir)
126
+
127
+ assert self.metric is not None
107
128
 
108
129
  with HiddenPrints():
109
130
  results = self.metric.compute(
@@ -176,7 +197,7 @@ bert_score_metric = HuggingFaceMetric(
176
197
  huggingface_id="bertscore",
177
198
  results_key="f1",
178
199
  compute_kwargs=dict(
179
- model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
200
+ model_type="microsoft/mdeberta-v3-base", device="cpu", batch_size=16
180
201
  ),
181
202
  )
182
203
 
@@ -97,7 +97,7 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
97
97
  default_prompt_prefix="Hetta eru nakrir setningar og um teir eru mállæruliga "
98
98
  "rættir.",
99
99
  default_prompt_template="Setningur: {text}\nMállæruliga rættur: {label}",
100
- default_instruction_prompt="Setningur: {text}\n\nGreinið hvort setningurin er "
100
+ default_instruction_prompt="Setningur: {text}\n\nGreindu hvort setningurin er "
101
101
  "mállæruliga rættur ella ikki. Svara við {labels_str}, og einki annað.",
102
102
  ),
103
103
  FR: PromptConfig(
@@ -111,11 +111,12 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
111
111
  ),
112
112
  IS: PromptConfig(
113
113
  default_prompt_label_mapping=dict(correct="já", incorrect="nei"),
114
- default_prompt_prefix="Eftirfarandi eru setningar og hvort þær eru "
115
- "málfræðilega réttar.",
114
+ default_prompt_prefix="Hér fyrir neðan eru setningar ásamt mati á því hvort "
115
+ "þær eru málfræðilega réttar.",
116
116
  default_prompt_template="Setning: {text}\nMálfræðilega rétt: {label}",
117
- default_instruction_prompt="Setning: {text}\n\nGreinið hvort setningin er "
118
- "málfræðilega rétt eða ekki. Svaraðu með {labels_str}, og ekkert annað.",
117
+ default_instruction_prompt="Setning: {text}\n\nGreindu hvort setningin er "
118
+ "málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún "
119
+ "er það ekki.",
119
120
  ),
120
121
  IT: PromptConfig(
121
122
  default_prompt_label_mapping=dict(correct="si", incorrect="no"),
@@ -176,7 +176,7 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
176
176
  default_prompt_prefix="Her eru nakrir setningar og nakrar JSON orðabøkur við "
177
177
  "nevndar eindir, sum eru í setningunum.",
178
178
  default_prompt_template="Setningur: {text}\nNevndar eindir: {label}",
179
- default_instruction_prompt="Setningur: {text}\n\nGreinið nevndu einingarnar í "
179
+ default_instruction_prompt="Setningur: {text}\n\nGreindu nevndu einingarnar í "
180
180
  "setningunni. Þú ættir að skila þessu sem JSON orðabók með lyklunum "
181
181
  "{labels_str}. Gildin ættu að vera listi yfir nevndu einingarnar af "
182
182
  "þeirri gerð, nákvæmlega eins og þær koma fram í setningunni.",
@@ -215,8 +215,8 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
215
215
  },
216
216
  default_prompt_prefix="Eftirfarandi eru setningar ásamt JSON lyklum með "
217
217
  "nefndum einingum sem koma fyrir í setningunum.",
218
- default_prompt_template="Setning: {text}\nNefndar einingar: {label}",
219
- default_instruction_prompt="Setning: {text}\n\nGreinið nefndu einingarnar í "
218
+ default_prompt_template="Setning: {text}\nNafneiningar: {label}",
219
+ default_instruction_prompt="Setning: {text}\n\nGreindu nefndu einingarnar í "
220
220
  "setningunni. Þú ættir að skila þessu sem JSON orðabók með lyklunum "
221
221
  "{labels_str}. Gildin ættu að vera listi yfir nefndu "
222
222
  "einingarnar af þeirri gerð, nákvæmlega eins og þær koma fram í "
@@ -137,11 +137,11 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
137
137
  default_prompt_label_mapping=dict(
138
138
  positive="jákvætt", neutral="hlutlaust", negative="neikvætt"
139
139
  ),
140
- default_prompt_prefix="Eftirfarandi eru skjöl og viðhorf þeirra, sem geta "
141
- "verið {labels_str}.",
142
- default_prompt_template="Skjal: {text}\nViðhorf: {label}",
143
- default_instruction_prompt="Skjal: {text}\n\nFlokkaðu viðhorfið í skjalinu. "
144
- "Svaraðu með {labels_str}, og ekkert annað.",
140
+ default_prompt_prefix="Hér fyrir neðan eru textabrot ásamt lyndisgildi þeirra "
141
+ "sem getur verið 'jákvætt', 'hlutlaust' eða 'neikvætt'.",
142
+ default_prompt_template="Textabrot: {text}\nViðhorf: {label}",
143
+ default_instruction_prompt="Textabrot: {text}\n\nGreindu lyndið í "
144
+ "textabrotinu. Svaraðu með {labels_str}, og ekkert annað.",
145
145
  ),
146
146
  IT: PromptConfig(
147
147
  default_prompt_label_mapping=dict(
euroeval/tasks.py CHANGED
@@ -100,6 +100,7 @@ KNOW = Task(
100
100
  default_num_few_shot_examples=5,
101
101
  default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
102
102
  default_labels=["a", "b", "c", "d"],
103
+ default_allowed_model_types=[ModelType.GENERATIVE],
103
104
  uses_logprobs=True,
104
105
  )
105
106
 
@@ -112,6 +113,7 @@ MCRC = Task(
112
113
  default_num_few_shot_examples=5,
113
114
  default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
114
115
  default_labels=["a", "b", "c", "d"],
116
+ default_allowed_model_types=[ModelType.GENERATIVE],
115
117
  uses_logprobs=True,
116
118
  )
117
119
 
@@ -124,6 +126,7 @@ COMMON_SENSE = Task(
124
126
  default_num_few_shot_examples=5,
125
127
  default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
126
128
  default_labels=["a", "b", "c", "d"],
129
+ default_allowed_model_types=[ModelType.GENERATIVE],
127
130
  uses_logprobs=True,
128
131
  )
129
132
 
@@ -551,7 +551,6 @@ def apply_chat_template(
551
551
  tokeniser: "PreTrainedTokenizer",
552
552
  tokenise: bool,
553
553
  add_generation_prompt: bool,
554
- enable_thinking: bool,
555
554
  **extra_kwargs,
556
555
  ) -> str | list[int]:
557
556
  """Apply the chat template to a prompt.
@@ -568,10 +567,6 @@ def apply_chat_template(
568
567
  Whether to add a generation prompt at the end of the conversation. This is
569
568
  only relevant for regular Hugging Face tokenisers, as Mistral tokenisers
570
569
  always add a generation prompt.
571
- enable_thinking:
572
- Whether to enable special handling for reasoning models, such as adding
573
- special tokens for thinking. This is only relevant for regular Hugging
574
- Face tokenisers, as Mistral tokenisers always handle reasoning models.
575
570
  **extra_kwargs:
576
571
  Extra keyword arguments to pass to the tokeniser's `apply_chat_template`
577
572
  method. Only relevant for regular Hugging Face tokenisers.
@@ -601,7 +596,6 @@ def apply_chat_template(
601
596
  conversation=conversation,
602
597
  add_generation_prompt=add_generation_prompt,
603
598
  tokenize=tokenise,
604
- enable_thinking=enable_thinking,
605
599
  **extra_kwargs,
606
600
  )
607
601
  return templated_prompt
euroeval/types.py CHANGED
@@ -8,8 +8,7 @@ if t.TYPE_CHECKING:
8
8
  from datasets.arrow_dataset import Dataset
9
9
  from numpy.typing import NDArray
10
10
 
11
- from .data_models import GenerativeModelOutput
12
-
11
+ from .data_models import BenchmarkConfig, GenerativeModelOutput
13
12
 
14
13
  ScoreDict: t.TypeAlias = dict[str, dict[str, float] | list[dict[str, float]]]
15
14
  Predictions: t.TypeAlias = "NDArray | list[str] | list[list[str]]"
@@ -27,6 +26,7 @@ class ComputeMetricsFunction(t.Protocol):
27
26
  "NDArray | list[str] | list[list[str]]",
28
27
  ],
29
28
  dataset: "Dataset",
29
+ benchmark_config: "BenchmarkConfig",
30
30
  ) -> dict[str, float]:
31
31
  """Compute the metrics.
32
32
 
euroeval/utils.py CHANGED
@@ -8,6 +8,7 @@ import logging
8
8
  import os
9
9
  import random
10
10
  import re
11
+ import socket
11
12
  import sys
12
13
  import typing as t
13
14
  import warnings
@@ -18,10 +19,8 @@ import demjson3
18
19
  import huggingface_hub as hf_hub
19
20
  import litellm
20
21
  import numpy as np
21
- import requests
22
22
  import torch
23
23
  from datasets.utils import disable_progress_bar
24
- from requests.exceptions import RequestException
25
24
  from transformers import logging as tf_logging
26
25
 
27
26
  from .exceptions import InvalidBenchmark, InvalidModel, NaNValueInModelOutput
@@ -54,6 +53,68 @@ def create_model_cache_dir(cache_dir: str, model_id: str) -> str:
54
53
  return str(cache_dir_path)
55
54
 
56
55
 
56
+ def resolve_model_path(download_dir: str) -> str:
57
+ """Resolve the path to the directory containing the model config files and weights.
58
+
59
+ Args:
60
+ download_dir:
61
+ The download directory
62
+
63
+ Returns:
64
+ The path to the model.
65
+ """
66
+ model_path = Path(download_dir)
67
+ # Get the 'path safe' version of the model id, which is the last dir in the path
68
+ model_id_path = model_path.name
69
+ # Hf hub `cache_dir` puts the files in models--`model_id_path`/snapshots
70
+ model_path = model_path / f"models--{model_id_path}" / "snapshots"
71
+ if not model_path.exists():
72
+ raise InvalidModel(
73
+ f"Attempted to load models from the {model_path} directory, "
74
+ "but it does not exist."
75
+ )
76
+
77
+ # Get all files in the model path
78
+ found_files = [
79
+ found_file for found_file in model_path.rglob("*") if found_file.is_file()
80
+ ]
81
+ if not found_files:
82
+ raise InvalidModel(f"No model files found at {model_path}")
83
+
84
+ # Make sure that there arent multiples of the files found
85
+ if len(found_files) == len(set(found_files)):
86
+ raise InvalidModel(
87
+ f"Found multiple model config files for {model_id_path.strip('models--')}"
88
+ f"at {model_path}"
89
+ )
90
+
91
+ # Check that found_files contains at least a 'config.json'
92
+ config_file = next(
93
+ (file for file in found_files if file.name == "config.json"), None
94
+ )
95
+ if config_file is None:
96
+ raise InvalidModel(
97
+ f"Missing required file 'config.json' for {model_id_path.strip('models--')}"
98
+ f"at {model_path}"
99
+ )
100
+ model_path = config_file.parent
101
+
102
+ # As a precaution we also check that all of the files are in the same directory
103
+ # if not we create a new dir with symlinks to all of the files from all snapshots
104
+ # this is especially useful for vllm where we can only specify one folder and e.g.,
105
+ # the safetensors version of the weights was added in an unmerged PR
106
+ if not all(
107
+ [found_file.parent == found_files[0].parent for found_file in found_files]
108
+ ):
109
+ new_model_path = model_path.parent / "model_files"
110
+ new_model_path.mkdir(exist_ok=True)
111
+ for found_file in found_files:
112
+ Path(new_model_path / found_file.name).symlink_to(found_file)
113
+ model_path = new_model_path
114
+
115
+ return str(model_path)
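`resolve_model_path` assumes the Hugging Face Hub cache layout inside the per-model download directory, i.e. `models--<safe-model-id>/snapshots/<revision>/…`, and returns the snapshot folder that contains `config.json`. A sketch of that layout with illustrative paths:

```python
# Sketch of the cache layout resolve_model_path expects (paths are illustrative).
from pathlib import Path

download_dir = Path("/data/euroeval_cache/model_cache/my-org--my-model")
snapshots = download_dir / f"models--{download_dir.name}" / "snapshots"
# e.g. snapshots / "0123abcd" / "config.json" is the file the function looks for;
# its parent directory is what gets handed to vLLM when running offline.
print(snapshots)
```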
116
+
117
+
57
118
  def clear_memory() -> None:
58
119
  """Clears the memory of unused items."""
59
120
  for gc_generation in range(3):
@@ -91,6 +152,9 @@ def block_terminal_output() -> None:
91
152
  libraries, disabled tokeniser progress bars when using Hugging Face tokenisers, and
92
153
  disables most of the logging from the `transformers` library.
93
154
  """
155
+ if os.getenv("FULL_LOG") == "1":
156
+ return
157
+
94
158
  # Ignore miscellaneous warnings
95
159
  warnings.filterwarnings("ignore", category=UserWarning)
96
160
  warnings.filterwarnings("ignore", category=FutureWarning)
@@ -196,6 +260,7 @@ def get_min_cuda_compute_capability() -> float | None:
196
260
  return float(f"{major}.{minor}")
197
261
 
198
262
 
263
+ @cache
199
264
  def internet_connection_available() -> bool:
200
265
  """Checks if internet connection is available by pinging google.com.
201
266
 
@@ -203,10 +268,17 @@ def internet_connection_available() -> bool:
203
268
  Whether or not internet connection is available.
204
269
  """
205
270
  try:
206
- requests.get("https://www.google.com")
271
+ s = socket.create_connection(("1.1.1.1", 80))
272
+ s.close()
207
273
  return True
208
- except RequestException:
209
- return False
274
+ # a bit ugly but we dont want to actually import the pytest-socket exceptions
275
+ # we catch all exceptions and check if the name matches any known errors
276
+ except Exception as e:
277
+ pytest_socket_errors = ["SocketConnectBlockedError", "SocketBlockedError"]
278
+ if type(e).__name__ in pytest_socket_errors or isinstance(e, OSError):
279
+ return False
280
+ else:
281
+ raise e
210
282
 
211
283
 
212
284
  class HiddenPrints:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: EuroEval
3
- Version: 16.1.1
3
+ Version: 16.2.1
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,13 +61,13 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
61
61
  Provides-Extra: all
62
62
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
63
63
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
64
- Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'all'
65
- Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
64
+ Requires-Dist: timm>=1.0.19; extra == 'all'
65
+ Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'all'
66
66
  Provides-Extra: generative
67
67
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
68
68
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
69
- Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'generative'
70
- Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
69
+ Requires-Dist: timm>=1.0.19; extra == 'generative'
70
+ Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
71
71
  Description-Content-Type: text/markdown
72
72
 
73
73
  <div align='center'>
@@ -152,13 +152,13 @@ model:
152
152
  ```
153
153
  >>> from euroeval import Benchmarker
154
154
  >>> benchmark = Benchmarker()
155
- >>> benchmark(model="<model>")
155
+ >>> benchmark(model="<model-id>")
156
156
  ```
157
157
 
158
158
  To benchmark on a specific task and/or language, you simply specify the `task` or
159
159
  `language` arguments, shown here with same example as above:
160
160
  ```
161
- >>> benchmark(model="<model>", task="sentiment-classification", language="da")
161
+ >>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
162
162
  ```
163
163
 
164
164
  If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
@@ -168,6 +168,30 @@ models on the Danish sentiment classification task:
168
168
  >>> benchmark(task="sentiment-classification", language="da")
169
169
  ```
170
170
 
171
+ ### Benchmarking in an Offline Environment
172
+ If you need to benchmark in an offline environment, you need to download the models,
173
+ datasets and metrics beforehand. This can be done by adding the `--download-only`
174
+ argument, from the command line, or the `download_only` argument, if benchmarking from a
175
+ script. For example to download the model you want and all of the Danish sentiment
176
+ classification datasets:
177
+ ```
178
+ $ euroeval --model <model-id> --task sentiment-classification --language da --download-only
179
+ ```
180
+
181
+ Or from a script:
182
+ ```
183
+ >>> benchmark(
184
+ ... model="<model-id>",
185
+ ... task="sentiment-classification",
186
+ ... language="da",
187
+ ... download_only=True,
188
+ ... )
189
+ ```
190
+
191
+ Please note: Offline benchmarking of adapter models is not currently supported. An
192
+ internet connection will be required during evaluation. If offline support is important
193
+ to you, please consider [opening an issue](https://github.com/EuroEval/EuroEval/issues).
194
+
171
195
  ### Benchmarking from Docker
172
196
  A Dockerfile is provided in the repo, which can be downloaded and run, without needing
173
197
  to clone the repo and installing from source. This can be fetched programmatically by
@@ -1,15 +1,15 @@
1
- euroeval/__init__.py,sha256=8jqSCcDWvwwNb1guPi8cLAekPSOX9V8DpRx_v3-c19E,3730
2
- euroeval/benchmark_config_factory.py,sha256=NzNSiqix4hlVXk3xnyzdg2WDxomkectf97UWdVS3POo,11667
3
- euroeval/benchmarker.py,sha256=JkhvYxhVpQPcWmDLzwnB8Yy6tTqj3yfDWTefklbI7RM,50355
1
+ euroeval/__init__.py,sha256=mXTjuGrEE-1fIS9x28oJKg-gNGt4q7y2E74l330KEmY,3787
2
+ euroeval/benchmark_config_factory.py,sha256=eOQsd9F4cJy8I7a3_lIKDZ5b5ukipIUqk0GZ3pyytwQ,8596
3
+ euroeval/benchmarker.py,sha256=5l4p1ncq4VJX_bDjv2f8oBq2GETPtJmduGOnLAbWjF8,55762
4
4
  euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
5
- euroeval/cli.py,sha256=wUGetj9Ld4wkS872ZOfYqHIJMh58o8L2MDi78wU5nxI,9099
5
+ euroeval/cli.py,sha256=GOAWzdtasJfOvTuVQszu-T1T9GfQ_un-blOICO-y7g4,9316
6
6
  euroeval/constants.py,sha256=NN7kcwQdlDyyGFSrLjsL_qKVRyoRqZ9sKO5SjlgtRwA,2741
7
7
  euroeval/data_loading.py,sha256=F3fHyR7FoS_a1dx_DyqtcxdB-jxWwE3RCNRvWcp5z1c,4527
8
- euroeval/data_models.py,sha256=S-PATp4F1wBwvra6wtjlJFXxZbZB_vEpJHXcdTTKA70,27593
8
+ euroeval/data_models.py,sha256=9Sgrq6Ktg1ETXRJ0v4VA_amAPowGuB7fZtL-8RlDQn0,27766
9
9
  euroeval/enums.py,sha256=SeFek-Lre2Q5sxbP5svqjDZFZR2vlJhg9dkRH4JvU1g,3436
10
10
  euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
11
11
  euroeval/finetuning.py,sha256=G86pxxjOAgtcEWpyYDwYOV9pM7WG2Uu9fu7GdDso8dI,11426
12
- euroeval/generation.py,sha256=MSrd0oIkoqwKsCOaIkY2CFF_urXLOfNR1OO5nMvcCpY,12476
12
+ euroeval/generation.py,sha256=Va3EOmFzOMBNfI4fh3nW5qhhrM3CBT8_4MaLwVtsF_E,12528
13
13
  euroeval/generation_utils.py,sha256=d2_vylWXIeH4xIXgbsI5rN6dMt0zKp0zXExD6aOKWaA,18299
14
14
  euroeval/languages.py,sha256=G2cJI8lDT7eOFHxNR9opJ6zWjdxFDwm8P8HY_4WKFI4,33815
15
15
  euroeval/model_cache.py,sha256=h61cL_fy2Sd1sqYZis5lAWqvQIfQXXt_v8QZeftKNkg,9226
@@ -17,16 +17,16 @@ euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
17
17
  euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
18
18
  euroeval/scores.py,sha256=HQQqyjdgm853FZ_ifIdnSltKfBhsY7pOITov6F3Et5o,3165
19
19
  euroeval/speed_benchmark.py,sha256=3iz_bfJgAoJ9K2HNjufyrBMjHVT8PAjuY_NocBGwKe0,4044
20
- euroeval/tasks.py,sha256=3qEOBAMmfeqgXqlGkCKzQ-s0Yw-0-jPRgFZ97EZCFng,4535
21
- euroeval/tokenisation_utils.py,sha256=e2H86vhSVfz5gx6GmzoBJwLZLG6sf3GEcoCGmvJBQLc,21505
22
- euroeval/types.py,sha256=SCKOALV_-F1PAIwQ7qHNdSF1Uy29TSu9nIc1NYJGUUs,2754
23
- euroeval/utils.py,sha256=c0tFw1IXZIqgLU4EfY_k28iJ1ZlCZ_oFoKZH2sGCKYg,16499
20
+ euroeval/tasks.py,sha256=EzEWFDo_0ffabBFiRu-mw80jENUioE8D_VEn_Dsv-F8,4703
21
+ euroeval/tokenisation_utils.py,sha256=nLeF2cdZSm5PZiAcDTtxY82nUJ-or8VU8YxYLa167EM,21158
22
+ euroeval/types.py,sha256=_iVy-RwiCGu9TNX2sfyJTdCvXy1akNGTCywAo-YpBqU,2815
23
+ euroeval/utils.py,sha256=DRJW6wtmNpRtuHt03diWo3S5m3rdxoPEQpd-KWi7aGY,19255
24
24
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
25
25
  euroeval/benchmark_modules/base.py,sha256=mHF8XS6GGUXV-sJtxmI5WJBWPLMHuh-4Z4OWjC25x9Y,11566
26
26
  euroeval/benchmark_modules/fresh.py,sha256=TveSQiFBi3xXgCEQBdHwkUQ685PDkKW0y3G5Yt5rkeM,10655
27
- euroeval/benchmark_modules/hf.py,sha256=oBjVumnSM9PW7ZocQwCGLKpbeGFWLN_71DBotxZo1aY,44038
28
- euroeval/benchmark_modules/litellm.py,sha256=6EKjHnUoPCpuupISZHXqZsXLG8tyiA1-G12a5C6L8MM,64629
29
- euroeval/benchmark_modules/vllm.py,sha256=sYFdVzB9CZX6_sGI4xghDyXoVn6I95_nbeFUWeSMXcc,43132
27
+ euroeval/benchmark_modules/hf.py,sha256=XmkoDFzaJqnd_5mmUkqCaOgAdRPFs3KZKZZ0cr83TlM,44742
28
+ euroeval/benchmark_modules/litellm.py,sha256=F3udd6NmhQOe3go_7rAcWg7mgZrNQpWWvLe-5U4E2RQ,64771
29
+ euroeval/benchmark_modules/vllm.py,sha256=yLy8TCTnodu4NdTiO7XSdxuHX60AJ1-7p6J3e5h7-iA,43994
30
30
  euroeval/dataset_configs/__init__.py,sha256=uuIZmElpJV8iupo5oDj3TeQhBDRANdWpLKYFASLirHA,2046
31
31
  euroeval/dataset_configs/danish.py,sha256=QABfgI7m-0-5AimDXegp5ssDSLcM2VrAI_RWsinSZP4,5631
32
32
  euroeval/dataset_configs/dutch.py,sha256=63Ro2yFym5MuIDXf5953vUYenw9B0kZSCmZbXjdy4Rs,5517
@@ -45,17 +45,17 @@ euroeval/dataset_configs/portuguese.py,sha256=gQ054SdLQ5fkm4IAP6Mdh5RcPDJPDITcuy
45
45
  euroeval/dataset_configs/spanish.py,sha256=DvJlMK6OQg4qmxKzQA2IficlBMB7BafvxqIVuTKiZyw,4902
46
46
  euroeval/dataset_configs/swedish.py,sha256=YWHp7hbJ25o36csSg9uXaQCEJK1BPb7u2RQZiCe0lNs,5445
47
47
  euroeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,199
48
- euroeval/metrics/base.py,sha256=4vnRIPfKUwTNe0ZVm5YC2jQNecwchGUpN6nAH5cX0PM,2288
49
- euroeval/metrics/huggingface.py,sha256=b_Z_FUELQcmK7HeJh0zlAZs3pim1uNHnFLu7nvlZ4_A,5824
48
+ euroeval/metrics/base.py,sha256=HST2XeZrUQZV_vTiieePiaznEov3CIGzuVNIITtLsQc,2596
49
+ euroeval/metrics/huggingface.py,sha256=iHKJnvOXRc_e8sxB2ff3WkfK64jXyn5KEnIxPyfD2fM,6522
50
50
  euroeval/metrics/llm_as_a_judge.py,sha256=YCUHWK3_bkMEYvL7Q79ZAK3V0M1m5rq5zJYdtMxa4fs,9686
51
51
  euroeval/metrics/pipeline.py,sha256=Wcan3eDWV7t4WRXMPWCCe_JsA-fZnIfZU2ESinbbL2I,10284
52
52
  euroeval/metrics/speed.py,sha256=tLna031y0SVzAv6lvXBxf8IOSiw9dvLlonky2zM3MnE,1369
53
53
  euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
54
- euroeval/prompt_templates/linguistic_acceptability.py,sha256=pRR1QBnYt5DnfxQp6dw1OYFZfIct-1R9pfdgPGpjoco,8667
54
+ euroeval/prompt_templates/linguistic_acceptability.py,sha256=m23LrckohdnToQDsexdsW_5YyBfGTf5DTjiMI643F9A,8717
55
55
  euroeval/prompt_templates/multiple_choice.py,sha256=Q-8-ETqG-RZeLzR8v8WUBIN7djiNSfNpmYnZRUWcd84,6905
56
- euroeval/prompt_templates/named_entity_recognition.py,sha256=LT7J6Y9rUCJFimpnwujBZq_V5buSmXHJteIXbTOoaCE,16442
56
+ euroeval/prompt_templates/named_entity_recognition.py,sha256=HIX9EBkSIBl5JXceFtiZTdvzWr9YHM9-55D6bcjIyQ4,16436
57
57
  euroeval/prompt_templates/reading_comprehension.py,sha256=ogzmhiSZO6egrdxxQiWz6a0XMdC0vws-lg5yRKQoYV0,8730
58
- euroeval/prompt_templates/sentiment_classification.py,sha256=BwnTpSdsAN_rL693ImgtKIRc5T_2G6ptWW0jCdC02NQ,9454
58
+ euroeval/prompt_templates/sentiment_classification.py,sha256=b3TvH26M77vwFfn577NlGVW881qfV7YSm-Xba_w98Fc,9504
59
59
  euroeval/prompt_templates/summarization.py,sha256=4Sqwj6C7yNfqj4FFFCseJMLDoSZ13aIOgY0SjIzzsNo,6593
60
60
  euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
61
61
  euroeval/task_group_utils/multiple_choice_classification.py,sha256=i5sidJGAXnENRoB6pOelyaUeGP1qoxwPSzD-F9RLwWk,7106
@@ -63,8 +63,8 @@ euroeval/task_group_utils/question_answering.py,sha256=eUczZntrC9lhCUQlwNQB49i-5
63
63
  euroeval/task_group_utils/sequence_classification.py,sha256=TAqZCoMQ9I-HFhMH35_J1mY2SQg95HUbXcgrBIyhgk0,16082
64
64
  euroeval/task_group_utils/text_to_text.py,sha256=7f4hGAs5WNJ9PmW1mLhjDMrPxrYAvw5axXsneiJop1w,4993
65
65
  euroeval/task_group_utils/token_classification.py,sha256=Yjai937ia1nZBMOWySqCXr_dA6WiVLGvmb4Hm_TU0Bg,17118
66
- euroeval-16.1.1.dist-info/METADATA,sha256=gyqd2PPeT0vv_ye9nnfqv-0DlpejquzqcftBwpwnH7Y,13729
67
- euroeval-16.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
68
- euroeval-16.1.1.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
69
- euroeval-16.1.1.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
70
- euroeval-16.1.1.dist-info/RECORD,,
66
+ euroeval-16.2.1.dist-info/METADATA,sha256=brIXZ3x3MUf-ggNpKKC_4Lvrqem0MfKPrJ8DZJ5T3Iw,14590
67
+ euroeval-16.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
68
+ euroeval-16.2.1.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
69
+ euroeval-16.2.1.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
70
+ euroeval-16.2.1.dist-info/RECORD,,