EuroEval 16.1.0-py3-none-any.whl → 16.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

euroeval/__init__.py CHANGED
@@ -12,12 +12,13 @@ import warnings
12
12
  from termcolor import colored
13
13
 
14
14
  # Block specific warnings before importing anything else, as they can be noisy
15
- warnings.filterwarnings("ignore", category=UserWarning)
16
- warnings.filterwarnings("ignore", category=FutureWarning)
17
- logging.getLogger("httpx").setLevel(logging.CRITICAL)
18
- logging.getLogger("datasets").setLevel(logging.CRITICAL)
19
- logging.getLogger("vllm").setLevel(logging.CRITICAL)
20
- os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
15
+ if os.getenv("FULL_LOG") != "1":
16
+ warnings.filterwarnings("ignore", category=UserWarning)
17
+ warnings.filterwarnings("ignore", category=FutureWarning)
18
+ logging.getLogger("httpx").setLevel(logging.CRITICAL)
19
+ logging.getLogger("datasets").setLevel(logging.CRITICAL)
20
+ logging.getLogger("vllm").setLevel(logging.CRITICAL)
21
+ os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
21
22
 
22
23
  # Set up logging
23
24
  fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
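Given the guard above (and the matching early return added to `block_terminal_output` in `euroeval/utils.py` later in this diff), full warnings and third-party logging can presumably be restored by setting `FULL_LOG=1` before EuroEval is imported. A minimal sketch, assuming the variable is set before the first import:

```
import os

# Assumption: the FULL_LOG check happens at import time, so the variable must be
# set before euroeval is imported (or exported in the shell as FULL_LOG=1).
os.environ["FULL_LOG"] = "1"

import euroeval  # noqa: E402
```
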
@@ -47,6 +47,7 @@ def build_benchmark_config(
47
47
  debug: bool,
48
48
  run_with_cli: bool,
49
49
  requires_safetensors: bool,
50
+ download_only: bool,
50
51
  ) -> BenchmarkConfig:
51
52
  """Create a benchmark configuration.
52
53
 
@@ -117,6 +118,8 @@ def build_benchmark_config(
117
118
  Whether the benchmark is being run with the CLI.
118
119
  requires_safetensors:
119
120
  Whether to only allow evaluations of models stored as safetensors.
121
+ download_only:
122
+ Whether to only download the requested model weights and datasets.
120
123
 
121
124
  Returns:
122
125
  The benchmark configuration.
@@ -165,6 +168,7 @@ def build_benchmark_config(
165
168
  debug=debug,
166
169
  run_with_cli=run_with_cli,
167
170
  requires_safetensors=requires_safetensors,
171
+ download_only=download_only,
168
172
  )
169
173
 
170
174
 
@@ -146,21 +146,25 @@ class HuggingFaceEncoderModel(BenchmarkModule):
146
146
  Returns:
147
147
  The number of parameters in the model.
148
148
  """
149
- token = get_hf_token(api_key=self.benchmark_config.api_key)
150
- hf_api = HfApi(token=token)
151
- try:
152
- repo_info = hf_api.model_info(
153
- repo_id=self.model_config.adapter_base_model_id
154
- or self.model_config.model_id,
155
- revision=self.model_config.revision,
156
- )
157
- except (
158
- RepositoryNotFoundError,
159
- RevisionNotFoundError,
160
- RequestException,
161
- HFValidationError,
162
- ):
149
+ # No need to try to use the API if we have no internet.
150
+ if not internet_connection_available():
163
151
  repo_info = None
152
+ else:
153
+ token = get_hf_token(api_key=self.benchmark_config.api_key)
154
+ hf_api = HfApi(token=token)
155
+ try:
156
+ repo_info = hf_api.model_info(
157
+ repo_id=self.model_config.adapter_base_model_id
158
+ or self.model_config.model_id,
159
+ revision=self.model_config.revision,
160
+ )
161
+ except (
162
+ RepositoryNotFoundError,
163
+ RevisionNotFoundError,
164
+ RequestException,
165
+ HFValidationError,
166
+ ):
167
+ repo_info = None
164
168
 
165
169
  if (
166
170
  repo_info is not None
@@ -558,7 +562,7 @@ def load_model_and_tokeniser(
558
562
  The benchmark configuration
559
563
 
560
564
  Returns:
561
- The loaded model and tokeniser.
565
+ A pair (model, tokeniser) with the loaded model and tokeniser.
562
566
  """
563
567
  config: "PretrainedConfig"
564
568
  block_terminal_output()
@@ -686,6 +690,7 @@ def load_model_and_tokeniser(
686
690
  model=model,
687
691
  model_id=model_id,
688
692
  trust_remote_code=benchmark_config.trust_remote_code,
693
+ model_cache_dir=model_config.model_cache_dir,
689
694
  )
690
695
 
691
696
  return model, tokeniser
@@ -722,6 +727,11 @@ def get_model_repo_info(
722
727
  ):
723
728
  model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
724
729
 
730
+ # If we have no internet and the model_id is not a directory for a local model,
731
+ # we also just create a dummy model info object.
732
+ elif not internet_connection_available():
733
+ model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
734
+
725
735
  # If the model does not exist locally, then we get the model info from the Hugging
726
736
  # Face Hub, if possible
727
737
  if model_info is None:
@@ -867,7 +877,10 @@ def get_model_repo_info(
867
877
 
868
878
 
869
879
  def load_tokeniser(
870
- model: "PreTrainedModel | None", model_id: str, trust_remote_code: bool
880
+ model: "PreTrainedModel | None",
881
+ model_id: str,
882
+ trust_remote_code: bool,
883
+ model_cache_dir: str,
871
884
  ) -> "PreTrainedTokenizer":
872
885
  """Load the tokeniser.
873
886
 
@@ -889,6 +902,7 @@ def load_tokeniser(
889
902
  trust_remote_code=trust_remote_code,
890
903
  padding_side="right",
891
904
  truncation_side="right",
905
+ cache_dir=model_cache_dir,
892
906
  )
893
907
 
894
908
  # If the model is a subclass of a certain model types then we have to add a prefix
@@ -999,6 +1013,7 @@ def load_hf_model_config(
999
1013
  token=get_hf_token(api_key=api_key),
1000
1014
  trust_remote_code=trust_remote_code,
1001
1015
  cache_dir=model_cache_dir,
1016
+ local_files_only=not internet_connection_available(),
1002
1017
  )
1003
1018
  if config.eos_token_id is not None and config.pad_token_id is None:
1004
1019
  if isinstance(config.eos_token_id, list):
@@ -984,6 +984,7 @@ class LiteLLMModel(BenchmarkModule):
984
984
  model=None,
985
985
  model_id=model_id,
986
986
  trust_remote_code=self.benchmark_config.trust_remote_code,
987
+ model_cache_dir=self.model_config.model_cache_dir,
987
988
  )
988
989
 
989
990
  if (
@@ -1066,6 +1067,7 @@ class LiteLLMModel(BenchmarkModule):
1066
1067
  model=None,
1067
1068
  model_id=model_id,
1068
1069
  trust_remote_code=self.benchmark_config.trust_remote_code,
1070
+ model_cache_dir=self.model_config.model_cache_dir,
1069
1071
  )
1070
1072
 
1071
1073
  all_max_lengths: list[int] = list()
@@ -72,7 +72,9 @@ from ..utils import (
72
72
  create_model_cache_dir,
73
73
  get_hf_token,
74
74
  get_min_cuda_compute_capability,
75
+ internet_connection_available,
75
76
  log_once,
77
+ resolve_model_path,
76
78
  split_model_id,
77
79
  )
78
80
  from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
@@ -146,7 +148,7 @@ class VLLMModel(HuggingFaceEncoderModel):
146
148
  )
147
149
 
148
150
  self.end_of_reasoning_token = get_end_of_reasoning_token(
149
- model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
151
+ model=self._model, tokeniser=self._tokeniser, model_config=model_config
150
152
  )
151
153
  self.end_of_chat_token_ids = get_end_of_chat_token_ids(
152
154
  tokeniser=self._tokeniser, generative_type=self.generative_type
@@ -834,10 +836,15 @@ def load_model_and_tokeniser(
834
836
 
835
837
  clear_vllm()
836
838
 
839
+ # If we do not have an internet connection, we need to give the path to the folder
840
+ # that contains the model weights and config files; otherwise vLLM will try to
841
+ # download them regardless of whether they are already present in the download_dir.
842
+ model_path = resolve_model_path(download_dir)
843
+
837
844
  try:
838
845
  model = LLM(
839
- model=model_id,
840
- tokenizer=model_id,
846
+ model=model_id if internet_connection_available() else model_path,
847
+ tokenizer=model_id if internet_connection_available() else model_path,
841
848
  gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
842
849
  max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
843
850
  download_dir=download_dir,
@@ -925,6 +932,7 @@ def load_tokeniser(
925
932
  cache_dir=model_cache_dir,
926
933
  token=token,
927
934
  trust_remote_code=trust_remote_code,
935
+ local_files_only=not internet_connection_available(),
928
936
  )
929
937
  num_retries = 5
930
938
  for _ in range(num_retries):
@@ -937,8 +945,10 @@ def load_tokeniser(
937
945
  padding_side="left",
938
946
  truncation_side="left",
939
947
  model_max_length=model_max_length,
948
+ cache_dir=model_cache_dir,
940
949
  config=config,
941
950
  token=token,
951
+ local_files_only=not internet_connection_available(),
942
952
  )
943
953
  break
944
954
  except (json.JSONDecodeError, OSError, TypeError) as e:
@@ -996,7 +1006,7 @@ def clear_vllm() -> None:
996
1006
 
997
1007
 
998
1008
  def get_end_of_reasoning_token(
999
- model: "LLM", tokeniser: "PreTrainedTokenizer", model_id: str
1009
+ model: "LLM", tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
1000
1010
  ) -> str | None:
1001
1011
  """Get the end-of-reasoning token for a generative model.
1002
1012
 
@@ -1005,21 +1015,26 @@ def get_end_of_reasoning_token(
1005
1015
  The vLLM model.
1006
1016
  tokeniser:
1007
1017
  The tokeniser.
1008
- model_id:
1009
- The model ID.
1018
+ model_config:
1019
+ The model configuration.
1010
1020
 
1011
1021
  Returns:
1012
1022
  The end of reasoning token, or None if it could not be found.
1013
1023
  """
1024
+ model_id = model_config.model_id
1025
+
1014
1026
  # Create a prompt to check if the model uses the reasoning tokens
1015
1027
  prompt = "What is your name?"
1016
1028
  if has_chat_template(tokeniser=tokeniser):
1029
+ extra_kwargs = dict()
1030
+ if model_config.param in {"thinking", "no-thinking"}:
1031
+ extra_kwargs["enable_thinking"] = model_config.param == "thinking"
1017
1032
  templated_prompt = apply_chat_template(
1018
1033
  conversation=[dict(role="user", content=prompt)],
1019
1034
  tokeniser=tokeniser,
1020
1035
  tokenise=False,
1021
1036
  add_generation_prompt=True,
1022
- enable_thinking=True,
1037
+ **extra_kwargs,
1023
1038
  )
1024
1039
  assert isinstance(templated_prompt, str)
1025
1040
  prompt = templated_prompt
@@ -1042,8 +1057,8 @@ def get_end_of_reasoning_token(
1042
1057
  if not bor_reasoning_matches:
1043
1058
  log_once(
1044
1059
  f"The model {model_id!r} did not generate any beginning-of-reasoning "
1045
- "tokens in the prompt or the completion. Assuming the model is not "
1046
- "a reasoning model.",
1060
+ "tokens in the prompt or the completion. Assuming the model is not a "
1061
+ "reasoning model.",
1047
1062
  level=logging.DEBUG,
1048
1063
  )
1049
1064
  return None
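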
euroeval/benchmarker.py CHANGED
@@ -16,7 +16,7 @@ from torch.distributed import destroy_process_group
16
16
 
17
17
  from .benchmark_config_factory import build_benchmark_config
18
18
  from .constants import GENERATIVE_PIPELINE_TAGS
19
- from .data_loading import load_data
19
+ from .data_loading import load_data, load_raw_data
20
20
  from .data_models import BenchmarkConfigParams, BenchmarkResult
21
21
  from .dataset_configs import get_all_dataset_configs
22
22
  from .enums import Device, GenerativeType, ModelType
@@ -28,7 +28,12 @@ from .model_loading import load_model
28
28
  from .scores import log_scores
29
29
  from .speed_benchmark import benchmark_speed
30
30
  from .tasks import SPEED
31
- from .utils import enforce_reproducibility, get_package_version
31
+ from .utils import (
32
+ enforce_reproducibility,
33
+ get_package_version,
34
+ internet_connection_available,
35
+ log_once,
36
+ )
32
37
 
33
38
  if t.TYPE_CHECKING:
34
39
  from .benchmark_modules import BenchmarkModule
@@ -83,6 +88,7 @@ class Benchmarker:
83
88
  debug: bool = False,
84
89
  run_with_cli: bool = False,
85
90
  requires_safetensors: bool = False,
91
+ download_only: bool = False,
86
92
  ) -> None:
87
93
  """Initialise the benchmarker.
88
94
 
@@ -164,14 +170,26 @@ class Benchmarker:
164
170
  requires_safetensors:
165
171
  Whether to only allow models that use the safetensors format. Defaults
166
172
  to False.
173
+ download_only:
174
+ Whether to only download models and datasets without performing any
175
+ benchmarking. Defaults to False.
167
176
 
168
177
  Raises:
169
178
  ValueError:
170
- If both `task` and `dataset` are specified.
179
+ If both `task` and `dataset` are specified, or if `download_only`
180
+ is True and we have no internet connection.
171
181
  """
172
182
  if task is not None and dataset is not None:
173
183
  raise ValueError("Only one of `task` and `dataset` can be specified.")
174
184
 
185
+ if not internet_connection_available() and download_only:
186
+ msg = "It appears you do not have an internet connection, but "
187
+ if run_with_cli:
188
+ msg += "the --download-only flag was set."
189
+ else:
190
+ msg += "the argument `download_only` was set to True."
191
+ raise ValueError(msg)
192
+
175
193
  # Bail early if hf_transfer is enabled but not installed.
176
194
  if HF_HUB_ENABLE_HF_TRANSFER and get_package_version("hf_transfer") is None:
177
195
  raise ImportError(
@@ -222,17 +240,82 @@ class Benchmarker:
222
240
 
223
241
  @property
224
242
  def benchmark_results(self) -> list[BenchmarkResult]:
225
- """The benchmark results."""
243
+ """The benchmark results.
244
+
245
+ Returns:
246
+ A list of benchmark results.
247
+
248
+ Raises:
249
+ ValueError:
250
+ If there is an error decoding a line in the results file.
251
+ """
226
252
  if self.results_path.exists():
253
+ benchmark_results: list[BenchmarkResult] = list()
227
254
  with self.results_path.open() as f:
228
- return [
229
- BenchmarkResult.from_dict(json.loads(line))
230
- for line in f
231
- if line.strip()
232
- ]
255
+ for line in f:
256
+ if line.strip():
257
+ try:
258
+ result_dict = json.loads(line.strip())
259
+ except json.JSONDecodeError as e:
260
+ raise ValueError(
261
+ f"Error decoding JSON line: {line.strip()}"
262
+ ) from e
263
+
264
+ # Fix for older records
265
+ has_old_raw_results = (
266
+ "results" in result_dict
267
+ and isinstance(result_dict["results"], dict)
268
+ and "raw" in result_dict["results"]
269
+ and isinstance(result_dict["results"]["raw"], dict)
270
+ and "test" in result_dict["results"]["raw"]
271
+ )
272
+ if has_old_raw_results:
273
+ result_dict["results"]["raw"] = result_dict["results"][
274
+ "raw"
275
+ ]["test"]
276
+
277
+ result = BenchmarkResult.from_dict(result_dict)
278
+ benchmark_results.append(result)
279
+ return benchmark_results
233
280
  else:
234
281
  return list()
235
282
 
283
+ def _download(
284
+ self,
285
+ dataset_config: "DatasetConfig",
286
+ model_config: "ModelConfig",
287
+ benchmark_config: "BenchmarkConfig",
288
+ ) -> None:
289
+ """Download data, metrics, and model for the given dataset and model.
290
+
291
+ Args:
292
+ dataset_config: The configuration for the dataset.
293
+ model_config: The configuration for the model.
294
+ benchmark_config: The configuration for the benchmark.
295
+ """
296
+ log_once(f"Loading data for {dataset_config.pretty_name}", level=logging.INFO)
297
+ dataset = load_raw_data(
298
+ dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
299
+ )
300
+ del dataset
301
+
302
+ log_once(f"Loading model {model_config.model_id}", level=logging.INFO)
303
+ model = load_model(
304
+ model_config=model_config,
305
+ dataset_config=dataset_config,
306
+ benchmark_config=benchmark_config,
307
+ )
308
+ del model
309
+
310
+ log_once(
311
+ f"Loading metrics for the '{dataset_config.task.name}' task",
312
+ level=logging.INFO,
313
+ )
314
+ for metric_name in dataset_config.task.metrics:
315
+ log_once(f"Loading metric {metric_name.name}", level=logging.DEBUG)
316
+ metric = metric_name.download(cache_dir=benchmark_config.cache_dir)
317
+ del metric
318
+
236
319
  def benchmark(
237
320
  self,
238
321
  model: list[str] | str,
@@ -336,6 +419,9 @@ class Benchmarker:
336
419
  requires_safetensors:
337
420
  Whether to only allow models that use the safetensors format. Defaults
338
421
  to the value specified when initialising the benchmarker.
422
+ download_only:
423
+ Whether to only download the models without evaluating them. Defaults
424
+ to the value specified when initialising the benchmarker.
339
425
 
340
426
  Returns:
341
427
  A list of benchmark results.
@@ -395,6 +481,28 @@ class Benchmarker:
395
481
  num_finished_benchmarks += len(dataset_configs)
396
482
  continue
397
483
 
484
+ if model_config.adapter_base_model_id:
485
+ open_issue_msg = (
486
+ "If offline support is important to you, please "
487
+ "consider opening an issue at https://github.com/EuroEval/EuroEval/issues."
488
+ )
489
+ if not internet_connection_available():
490
+ raise InvalidModel(
491
+ "Offline benchmarking of models with adapters is not currently "
492
+ "supported. "
493
+ f"An active internet connection is required. {open_issue_msg}"
494
+ )
495
+ elif benchmark_config.download_only:
496
+ log_once(
497
+ "You are using download-only mode with a model that includes "
498
+ "an adapter. "
499
+ "Please note: Offline benchmarking of adapter models is not "
500
+ "currently supported. "
501
+ "An internet connection will be required during evaluation. "
502
+ f"{open_issue_msg}",
503
+ level=logging.WARNING,
504
+ )
505
+
398
506
  loaded_model: BenchmarkModule | None = None
399
507
  benchmark_params_to_revert: dict[str, t.Any] = dict()
400
508
  for dataset_config in dataset_configs:
@@ -645,6 +753,9 @@ class Benchmarker:
645
753
  requires_safetensors:
646
754
  Whether to only allow models that use the safetensors format. If None,
647
755
  then this value will not be updated.
756
+ download_only:
757
+ Whether to only download the models without evaluating them. If None,
758
+ then this value will not be updated.
648
759
 
649
760
  Returns:
650
761
  The updated benchmark configuration.
@@ -813,17 +924,19 @@ class Benchmarker:
813
924
  model_param=model_config.param,
814
925
  )
815
926
 
927
+ model_id_to_be_stored = model_config.model_id
928
+ if model_config.revision != "main":
929
+ model_id_to_be_stored += f"@{model_config.revision}"
930
+ if model_config.param is not None:
931
+ model_id_to_be_stored += f"#{model_config.param}"
932
+
816
933
  record = BenchmarkResult(
817
934
  dataset=dataset_config.name,
818
935
  task=dataset_config.task.name,
819
936
  dataset_languages=[
820
937
  language.code for language in dataset_config.languages
821
938
  ],
822
- model=(
823
- f"{model_config.model_id}@{model_config.revision}"
824
- if model_config.revision and model_config.revision != "main"
825
- else model_config.model_id
826
- ),
939
+ model=model_id_to_be_stored,
827
940
  results=results,
828
941
  num_model_parameters=model.num_params,
829
942
  max_sequence_length=model.model_max_length,
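For reference, the record naming above folds the revision and parameter into the stored model ID; a minimal sketch with purely illustrative values:

```
# Illustrative reconstruction of the naming scheme (the IDs are hypothetical).
model_id, revision, param = "org/model", "v2", "thinking"
stored = model_id
if revision != "main":
    stored += f"@{revision}"
if param is not None:
    stored += f"#{param}"
print(stored)  # org/model@v2#thinking
```
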
euroeval/cli.py CHANGED
@@ -216,6 +216,12 @@ from .tasks import get_all_tasks
216
216
  help="The type of generative model. Only relevant if the model is generative. If "
217
217
  "not specified, the type will be inferred automatically.",
218
218
  )
219
+ @click.option(
220
+ "--download-only",
221
+ is_flag=True,
222
+ help="Only download the requested model weights and datasets, and exit.",
223
+ default=False,
224
+ )
219
225
  def benchmark(
220
226
  model: tuple[str],
221
227
  dataset: tuple[str],
@@ -243,6 +249,7 @@ def benchmark(
243
249
  debug: bool,
244
250
  requires_safetensors: bool,
245
251
  generative_type: str | None,
252
+ download_only: bool,
246
253
  ) -> None:
247
254
  """Benchmark pretrained language models on language tasks."""
248
255
  models = list(model)
@@ -284,6 +291,7 @@ def benchmark(
284
291
  debug=debug,
285
292
  run_with_cli=True,
286
293
  requires_safetensors=requires_safetensors,
294
+ download_only=download_only,
287
295
  )
288
296
 
289
297
  # Perform the benchmark evaluation
euroeval/data_models.py CHANGED
@@ -228,6 +228,9 @@ class BenchmarkConfig:
228
228
  generative_type:
229
229
  The type of generative model to benchmark. Only relevant if the model is
230
230
  generative.
231
+ download_only:
232
+ Whether to only download the models, metrics and datasets without
233
+ evaluating.
231
234
  """
232
235
 
233
236
  model_languages: list[Language]
@@ -255,6 +258,7 @@ class BenchmarkConfig:
255
258
  run_with_cli: bool
256
259
  requires_safetensors: bool
257
260
  generative_type: GenerativeType | None
261
+ download_only: bool
258
262
 
259
263
 
260
264
  class BenchmarkConfigParams(pydantic.BaseModel):
euroeval/generation.py CHANGED
@@ -243,7 +243,9 @@ def generate_single_iteration(
243
243
  ground_truth = []
244
244
 
245
245
  itr_scores: dict[str, float] = model.compute_metrics(
246
- model_outputs_and_labels=(all_preds, ground_truth), dataset=dataset
246
+ model_outputs_and_labels=(all_preds, ground_truth),
247
+ dataset=dataset,
248
+ benchmark_config=benchmark_config,
247
249
  )
248
250
 
249
251
  return itr_scores
@@ -202,7 +202,7 @@ def apply_prompt(
202
202
  """
203
203
  # Sanity check
204
204
  if (
205
- generative_type == GenerativeType.INSTRUCTION_TUNED
205
+ generative_type in {GenerativeType.INSTRUCTION_TUNED, GenerativeType.REASONING}
206
206
  and always_populate_text_field
207
207
  and tokeniser is None
208
208
  ):
@@ -229,7 +229,10 @@ def apply_prompt(
229
229
  )
230
230
  label_mapping = dataset_config.prompt_label_mapping
231
231
  label = label_mapping.get(label, label)
232
- if generative_type == GenerativeType.INSTRUCTION_TUNED:
232
+ if generative_type in {
233
+ GenerativeType.INSTRUCTION_TUNED,
234
+ GenerativeType.REASONING,
235
+ }:
233
236
  prompt = dataset_config.instruction_prompt.format(**kwargs)
234
237
  return prompt, label
235
238
  else:
@@ -355,7 +358,7 @@ def apply_prompt(
355
358
  f"Unsupported task group: {dataset_config.task.task_group}."
356
359
  )
357
360
 
358
- if generative_type == GenerativeType.INSTRUCTION_TUNED:
361
+ if generative_type in {GenerativeType.INSTRUCTION_TUNED, GenerativeType.REASONING}:
359
362
  few_shot_messages = [
360
363
  dict(role=role, content=content)
361
364
  for prompt, label in few_shot_sections
@@ -408,7 +411,10 @@ def apply_prompt(
408
411
  else:
409
412
  prompt_prefix = ""
410
413
  if dataset_config.prompt_prefix:
411
- prompt_prefix = dataset_config.prompt_prefix + "\n\n"
414
+ labels_str = dataset_config.get_labels_str()
415
+ prompt_prefix = (
416
+ dataset_config.prompt_prefix.format(labels_str=labels_str) + "\n\n"
417
+ )
412
418
 
413
419
  few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
414
420
  if few_shot_prompt:
euroeval/metrics/base.py CHANGED
@@ -42,6 +42,18 @@ class Metric(abc.ABC):
42
42
  else lambda x: (100 * x, f"{x:.2%}")
43
43
  )
44
44
 
45
+ def download(self, cache_dir: str) -> "Metric":
46
+ """Initiates the download of the metric if needed.
47
+
48
+ Args:
49
+ cache_dir:
50
+ The directory where the metric will be downloaded to.
51
+
52
+ Returns:
53
+ The metric object itself.
54
+ """
55
+ return self
56
+
45
57
  @abc.abstractmethod
46
58
  def __call__(
47
59
  self,
@@ -3,9 +3,11 @@
3
3
  import collections.abc as c
4
4
  import logging
5
5
  import typing as t
6
+ from pathlib import Path
6
7
 
7
8
  import evaluate
8
9
  import numpy as np
10
+ from datasets import DownloadConfig
9
11
 
10
12
  from ..utils import HiddenPrints
11
13
  from .base import Metric
@@ -76,6 +78,23 @@ class HuggingFaceMetric(Metric):
76
78
  )
77
79
  self.metric: "EvaluationModule | None" = None
78
80
 
81
+ def download(self, cache_dir: str) -> "HuggingFaceMetric":
82
+ """Initiates the download of the metric if needed.
83
+
84
+ Args:
85
+ cache_dir:
86
+ The directory where the metric will be downloaded to.
87
+
88
+ Returns:
89
+ The metric object itself.
90
+ """
91
+ # Annoying but needed to make the metric download to a different cache dir
92
+ download_config = DownloadConfig(cache_dir=Path(cache_dir, "evaluate"))
93
+ self.metric = evaluate.load(
94
+ path=self.huggingface_id, download_config=download_config
95
+ )
96
+ return self
97
+
79
98
  def __call__(
80
99
  self,
81
100
  predictions: c.Sequence,
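The new `download` hook is what the benchmarker's download-only path calls for each task metric. A minimal pre-fetching sketch, assuming the `euroeval.metrics.huggingface` module path listed in the wheel's RECORD and an illustrative cache path (`bert_score_metric` appears in a later hunk of this file):

```
from euroeval.metrics.huggingface import bert_score_metric

# Fetches the underlying `evaluate` metric into "<cache_dir>/evaluate" so that a
# later offline run can load it without network access (cache path illustrative).
bert_score_metric.download(cache_dir="/path/to/euroeval_cache")
```
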
@@ -103,7 +122,9 @@ class HuggingFaceMetric(Metric):
103
122
  The calculated metric score, or None if the score should be ignored.
104
123
  """
105
124
  if self.metric is None:
106
- self.metric = evaluate.load(path=self.huggingface_id)
125
+ self.download(cache_dir=benchmark_config.cache_dir)
126
+
127
+ assert self.metric is not None
107
128
 
108
129
  with HiddenPrints():
109
130
  results = self.metric.compute(
@@ -176,7 +197,7 @@ bert_score_metric = HuggingFaceMetric(
176
197
  huggingface_id="bertscore",
177
198
  results_key="f1",
178
199
  compute_kwargs=dict(
179
- model_type="microsoft/mdeberta-v3-base", device="auto", batch_size=1
200
+ model_type="microsoft/mdeberta-v3-base", device="cpu", batch_size=16
180
201
  ),
181
202
  )
182
203
 
@@ -97,7 +97,7 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
97
97
  default_prompt_prefix="Hetta eru nakrir setningar og um teir eru mállæruliga "
98
98
  "rættir.",
99
99
  default_prompt_template="Setningur: {text}\nMállæruliga rættur: {label}",
100
- default_instruction_prompt="Setningur: {text}\n\nGreinið hvort setningurin er "
100
+ default_instruction_prompt="Setningur: {text}\n\nGreindu hvort setningurin er "
101
101
  "mállæruliga rættur ella ikki. Svara við {labels_str}, og einki annað.",
102
102
  ),
103
103
  FR: PromptConfig(
@@ -111,11 +111,12 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
111
111
  ),
112
112
  IS: PromptConfig(
113
113
  default_prompt_label_mapping=dict(correct="já", incorrect="nei"),
114
- default_prompt_prefix="Eftirfarandi eru setningar og hvort þær eru "
115
- "málfræðilega réttar.",
114
+ default_prompt_prefix="Hér fyrir neðan eru setningar ásamt mati á því hvort "
115
+ "þær eru málfræðilega réttar.",
116
116
  default_prompt_template="Setning: {text}\nMálfræðilega rétt: {label}",
117
- default_instruction_prompt="Setning: {text}\n\nGreinið hvort setningin er "
118
- "málfræðilega rétt eða ekki. Svaraðu með {labels_str}, og ekkert annað.",
117
+ default_instruction_prompt="Setning: {text}\n\nGreindu hvort setningin er "
118
+ "málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún "
119
+ "er það ekki.",
119
120
  ),
120
121
  IT: PromptConfig(
121
122
  default_prompt_label_mapping=dict(correct="si", incorrect="no"),
@@ -176,7 +176,7 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
176
176
  default_prompt_prefix="Her eru nakrir setningar og nakrar JSON orðabøkur við "
177
177
  "nevndar eindir, sum eru í setningunum.",
178
178
  default_prompt_template="Setningur: {text}\nNevndar eindir: {label}",
179
- default_instruction_prompt="Setningur: {text}\n\nGreinið nevndu einingarnar í "
179
+ default_instruction_prompt="Setningur: {text}\n\nGreindu nevndu einingarnar í "
180
180
  "setningunni. Þú ættir að skila þessu sem JSON orðabók með lyklunum "
181
181
  "{labels_str}. Gildin ættu að vera listi yfir nevndu einingarnar af "
182
182
  "þeirri gerð, nákvæmlega eins og þær koma fram í setningunni.",
@@ -215,8 +215,8 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
215
215
  },
216
216
  default_prompt_prefix="Eftirfarandi eru setningar ásamt JSON lyklum með "
217
217
  "nefndum einingum sem koma fyrir í setningunum.",
218
- default_prompt_template="Setning: {text}\nNefndar einingar: {label}",
219
- default_instruction_prompt="Setning: {text}\n\nGreinið nefndu einingarnar í "
218
+ default_prompt_template="Setning: {text}\nNafneiningar: {label}",
219
+ default_instruction_prompt="Setning: {text}\n\nGreindu nefndu einingarnar í "
220
220
  "setningunni. Þú ættir að skila þessu sem JSON orðabók með lyklunum "
221
221
  "{labels_str}. Gildin ættu að vera listi yfir nefndu "
222
222
  "einingarnar af þeirri gerð, nákvæmlega eins og þær koma fram í "
@@ -137,11 +137,11 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
137
137
  default_prompt_label_mapping=dict(
138
138
  positive="jákvætt", neutral="hlutlaust", negative="neikvætt"
139
139
  ),
140
- default_prompt_prefix="Eftirfarandi eru skjöl og viðhorf þeirra, sem geta "
141
- "verið {labels_str}.",
142
- default_prompt_template="Skjal: {text}\nViðhorf: {label}",
143
- default_instruction_prompt="Skjal: {text}\n\nFlokkaðu viðhorfið í skjalinu. "
144
- "Svaraðu með {labels_str}, og ekkert annað.",
140
+ default_prompt_prefix="Hér fyrir neðan eru textabrot ásamt lyndisgildi þeirra "
141
+ "sem getur verið 'jákvætt', 'hlutlaust' eða 'neikvætt'.",
142
+ default_prompt_template="Textabrot: {text}\nViðhorf: {label}",
143
+ default_instruction_prompt="Textabrot: {text}\n\nGreindu lyndið í "
144
+ "textabrotinu. Svaraðu með {labels_str}, og ekkert annað.",
145
145
  ),
146
146
  IT: PromptConfig(
147
147
  default_prompt_label_mapping=dict(
@@ -198,7 +198,7 @@ def extract_labels_from_generation(
198
198
  # If no candidate labels were found, we either pick the label with the smallest
199
199
  # word edit distance to the predicted label (if invalid model outputs are
200
200
  # allowed), or we raise an error
201
- if min(edit_distances) > 100:
201
+ if min(edit_distances) >= 1000:
202
202
  if dataset_config.allow_invalid_model_outputs:
203
203
  logger.warning(
204
204
  "No candidate labels found for the predicted label "
euroeval/tasks.py CHANGED
@@ -100,6 +100,7 @@ KNOW = Task(
100
100
  default_num_few_shot_examples=5,
101
101
  default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
102
102
  default_labels=["a", "b", "c", "d"],
103
+ default_allowed_model_types=[ModelType.GENERATIVE],
103
104
  uses_logprobs=True,
104
105
  )
105
106
 
@@ -112,6 +113,7 @@ MCRC = Task(
112
113
  default_num_few_shot_examples=5,
113
114
  default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
114
115
  default_labels=["a", "b", "c", "d"],
116
+ default_allowed_model_types=[ModelType.GENERATIVE],
115
117
  uses_logprobs=True,
116
118
  )
117
119
 
@@ -124,6 +126,7 @@ COMMON_SENSE = Task(
124
126
  default_num_few_shot_examples=5,
125
127
  default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
126
128
  default_labels=["a", "b", "c", "d"],
129
+ default_allowed_model_types=[ModelType.GENERATIVE],
127
130
  uses_logprobs=True,
128
131
  )
129
132
 
@@ -339,13 +339,18 @@ def get_end_of_chat_token_ids(
339
339
  return None
340
340
 
341
341
  user_message: dict[str, str] = dict(role="user", content="X")
342
- token_ids = apply_chat_template(
343
- conversation=[user_message],
344
- tokeniser=tokeniser,
345
- tokenise=True,
346
- add_generation_prompt=False,
347
- enable_thinking=generative_type == GenerativeType.REASONING,
348
- )
342
+ try:
343
+ token_ids = apply_chat_template(
344
+ conversation=[user_message],
345
+ tokeniser=tokeniser,
346
+ tokenise=True,
347
+ add_generation_prompt=False,
348
+ enable_thinking=generative_type == GenerativeType.REASONING,
349
+ )
350
+ except InvalidModel as e:
351
+ if "does not have a chat template" in str(e):
352
+ return None
353
+ raise e
349
354
  assert isinstance(token_ids, list)
350
355
 
351
356
  for idx, token in enumerate(tokeniser.convert_ids_to_tokens(token_ids)):
@@ -546,7 +551,6 @@ def apply_chat_template(
546
551
  tokeniser: "PreTrainedTokenizer",
547
552
  tokenise: bool,
548
553
  add_generation_prompt: bool,
549
- enable_thinking: bool,
550
554
  **extra_kwargs,
551
555
  ) -> str | list[int]:
552
556
  """Apply the chat template to a prompt.
@@ -563,10 +567,6 @@ def apply_chat_template(
563
567
  Whether to add a generation prompt at the end of the conversation. This is
564
568
  only relevant for regular Hugging Face tokenisers, as Mistral tokenisers
565
569
  always add a generation prompt.
566
- enable_thinking:
567
- Whether to enable special handling for reasoning models, such as adding
568
- special tokens for thinking. This is only relevant for regular Hugging
569
- Face tokenisers, as Mistral tokenisers always handle reasoning models.
570
570
  **extra_kwargs:
571
571
  Extra keyword arguments to pass to the tokeniser's `apply_chat_template`
572
572
  method. Only relevant for regular Hugging Face tokenisers.
@@ -596,7 +596,6 @@ def apply_chat_template(
596
596
  conversation=conversation,
597
597
  add_generation_prompt=add_generation_prompt,
598
598
  tokenize=tokenise,
599
- enable_thinking=enable_thinking,
600
599
  **extra_kwargs,
601
600
  )
602
601
  return templated_prompt
euroeval/types.py CHANGED
@@ -8,8 +8,7 @@ if t.TYPE_CHECKING:
8
8
  from datasets.arrow_dataset import Dataset
9
9
  from numpy.typing import NDArray
10
10
 
11
- from .data_models import GenerativeModelOutput
12
-
11
+ from .data_models import BenchmarkConfig, GenerativeModelOutput
13
12
 
14
13
  ScoreDict: t.TypeAlias = dict[str, dict[str, float] | list[dict[str, float]]]
15
14
  Predictions: t.TypeAlias = "NDArray | list[str] | list[list[str]]"
@@ -27,6 +26,7 @@ class ComputeMetricsFunction(t.Protocol):
27
26
  "NDArray | list[str] | list[list[str]]",
28
27
  ],
29
28
  dataset: "Dataset",
29
+ benchmark_config: "BenchmarkConfig",
30
30
  ) -> dict[str, float]:
31
31
  """Compute the metrics.
32
32
 
euroeval/utils.py CHANGED
@@ -8,6 +8,7 @@ import logging
8
8
  import os
9
9
  import random
10
10
  import re
11
+ import socket
11
12
  import sys
12
13
  import typing as t
13
14
  import warnings
@@ -18,10 +19,8 @@ import demjson3
18
19
  import huggingface_hub as hf_hub
19
20
  import litellm
20
21
  import numpy as np
21
- import requests
22
22
  import torch
23
23
  from datasets.utils import disable_progress_bar
24
- from requests.exceptions import RequestException
25
24
  from transformers import logging as tf_logging
26
25
 
27
26
  from .exceptions import InvalidBenchmark, InvalidModel, NaNValueInModelOutput
@@ -54,6 +53,68 @@ def create_model_cache_dir(cache_dir: str, model_id: str) -> str:
54
53
  return str(cache_dir_path)
55
54
 
56
55
 
56
+ def resolve_model_path(download_dir: str) -> str:
57
+ """Resolve the path to the directory containing the model config files and weights.
58
+
59
+ Args:
60
+ download_dir:
61
+ The download directory
62
+
63
+ Returns:
64
+ The path to the model.
65
+ """
66
+ model_path = Path(download_dir)
67
+ # Get the 'path safe' version of the model id, which is the last dir in the path
68
+ model_id_path = model_path.name
69
+ # Hf hub `cache_dir` puts the files in models--`model_id_path`/snapshots
70
+ model_path = model_path / f"models--{model_id_path}" / "snapshots"
71
+ if not model_path.exists():
72
+ raise InvalidModel(
73
+ f"Attempted to load models from the {model_path} directory, "
74
+ "but it does not exist."
75
+ )
76
+
77
+ # Get all files in the model path
78
+ found_files = [
79
+ found_file for found_file in model_path.rglob("*") if found_file.is_file()
80
+ ]
81
+ if not found_files:
82
+ raise InvalidModel(f"No model files found at {model_path}")
83
+
84
+ # Make sure that there aren't multiple copies of the same file across snapshots
85
+ if len(found_files) != len({found_file.name for found_file in found_files}):
86
+ raise InvalidModel(
87
+ f"Found multiple copies of the model files for {model_id_path} "
88
+ f"at {model_path}"
89
+ )
90
+
91
+ # Check that found_files contains at least a 'config.json'
92
+ config_file = next(
93
+ (file for file in found_files if file.name == "config.json"), None
94
+ )
95
+ if config_file is None:
96
+ raise InvalidModel(
97
+ f"Missing required file 'config.json' for {model_id_path} "
98
+ f"at {model_path}"
99
+ )
100
+ model_path = config_file.parent
101
+
102
+ # As a precaution we also check that all of the files are in the same directory;
103
+ # if not, we create a new dir with symlinks to all of the files from all snapshots.
104
+ # This is especially useful for vLLM, where we can only specify one folder and e.g.
105
+ # the safetensors version of the weights was added in an unmerged PR.
106
+ if not all(
107
+ [found_file.parent == found_files[0].parent for found_file in found_files]
108
+ ):
109
+ new_model_path = model_path.parent / "model_files"
110
+ new_model_path.mkdir(exist_ok=True)
111
+ for found_file in found_files:
112
+ Path(new_model_path / found_file.name).symlink_to(found_file)
113
+ model_path = new_model_path
114
+
115
+ return str(model_path)
116
+
117
+
57
118
  def clear_memory() -> None:
58
119
  """Clears the memory of unused items."""
59
120
  for gc_generation in range(3):
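For context, the `resolve_model_path` helper added above assumes the standard Hugging Face Hub cache layout inside the per-model download directory; a minimal usage sketch with illustrative paths:

```
from euroeval.utils import resolve_model_path

# Expected layout (illustrative):
#   <download_dir>/models--<path-safe-model-id>/snapshots/<revision>/config.json
# Returns the snapshot directory containing config.json, or a "model_files"
# directory of symlinks when the files are spread across several snapshots.
local_path = resolve_model_path(download_dir="/path/to/model_cache/org--model")
print(local_path)
```
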
@@ -91,6 +152,9 @@ def block_terminal_output() -> None:
91
152
  libraries, disabled tokeniser progress bars when using Hugging Face tokenisers, and
92
153
  disables most of the logging from the `transformers` library.
93
154
  """
155
+ if os.getenv("FULL_LOG") == "1":
156
+ return
157
+
94
158
  # Ignore miscellaneous warnings
95
159
  warnings.filterwarnings("ignore", category=UserWarning)
96
160
  warnings.filterwarnings("ignore", category=FutureWarning)
@@ -196,6 +260,7 @@ def get_min_cuda_compute_capability() -> float | None:
196
260
  return float(f"{major}.{minor}")
197
261
 
198
262
 
263
+ @cache
199
264
  def internet_connection_available() -> bool:
200
265
  """Checks if internet connection is available by pinging google.com.
201
266
 
@@ -203,10 +268,17 @@ def internet_connection_available() -> bool:
203
268
  Whether or not internet connection is available.
204
269
  """
205
270
  try:
206
- requests.get("https://www.google.com")
271
+ s = socket.create_connection(("1.1.1.1", 80))
272
+ s.close()
207
273
  return True
208
- except RequestException:
209
- return False
274
+ # A bit ugly, but we don't want to actually import the pytest-socket exceptions;
275
+ # we catch all exceptions and check if the name matches any known errors.
276
+ except Exception as e:
277
+ pytest_socket_errors = ["SocketConnectBlockedError", "SocketBlockedError"]
278
+ if type(e).__name__ in pytest_socket_errors or isinstance(e, OSError):
279
+ return False
280
+ else:
281
+ raise e
210
282
 
211
283
 
212
284
  class HiddenPrints:
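Since the connectivity check is now wrapped in `@cache`, the socket probe runs at most once per process; a small sketch of the resulting behaviour:

```
from euroeval.utils import internet_connection_available

# The first call opens a socket to 1.1.1.1; later calls reuse the cached result,
# so flaky connectivity mid-run does not flip the answer within one process.
first = internet_connection_available()
second = internet_connection_available()
assert first == second
```
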
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: EuroEval
3
- Version: 16.1.0
3
+ Version: 16.2.0
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,13 +61,13 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
61
61
  Provides-Extra: all
62
62
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
63
63
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
64
- Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'all'
65
- Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
64
+ Requires-Dist: timm>=1.0.19; extra == 'all'
65
+ Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'all'
66
66
  Provides-Extra: generative
67
67
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
68
68
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
69
- Requires-Dist: flashinfer-python>=0.3.1; (platform_system == 'Linux') and extra == 'generative'
70
- Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
69
+ Requires-Dist: timm>=1.0.19; extra == 'generative'
70
+ Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
71
71
  Description-Content-Type: text/markdown
72
72
 
73
73
  <div align='center'>
@@ -152,13 +152,13 @@ model:
152
152
  ```
153
153
  >>> from euroeval import Benchmarker
154
154
  >>> benchmark = Benchmarker()
155
- >>> benchmark(model="<model>")
155
+ >>> benchmark(model="<model-id>")
156
156
  ```
157
157
 
158
158
  To benchmark on a specific task and/or language, you simply specify the `task` or
159
159
  `language` arguments, shown here with same example as above:
160
160
  ```
161
- >>> benchmark(model="<model>", task="sentiment-classification", language="da")
161
+ >>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
162
162
  ```
163
163
 
164
164
  If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
@@ -168,6 +168,30 @@ models on the Danish sentiment classification task:
168
168
  >>> benchmark(task="sentiment-classification", language="da")
169
169
  ```
170
170
 
171
+ ### Benchmarking in an Offline Environment
172
+ If you need to benchmark in an offline environment, you need to download the models,
173
+ datasets and metrics beforehand. This can be done by adding the `--download-only`
174
+ flag on the command line, or the `download_only` argument if benchmarking from a
175
+ script. For example, to download the model you want and all of the Danish sentiment
176
+ classification datasets:
177
+ ```
178
+ $ euroeval --model <model-id> --task sentiment-classification --language da --download-only
179
+ ```
180
+
181
+ Or from a script:
182
+ ```
183
+ >>> benchmark(
184
+ ... model="<model-id>",
185
+ ... task="sentiment-classification",
186
+ ... language="da",
187
+ ... download_only=True,
188
+ ... )
189
+ ```
190
+
191
+ Please note: Offline benchmarking of adapter models is not currently supported. An
192
+ internet connection will be required during evaluation. If offline support is important
193
+ to you, please consider [opening an issue](https://github.com/EuroEval/EuroEval/issues).
194
+
171
195
  ### Benchmarking from Docker
172
196
  A Dockerfile is provided in the repo, which can be downloaded and run, without needing
173
197
  to clone the repo and installing from source. This can be fetched programmatically by
@@ -1,32 +1,32 @@
1
- euroeval/__init__.py,sha256=8jqSCcDWvwwNb1guPi8cLAekPSOX9V8DpRx_v3-c19E,3730
2
- euroeval/benchmark_config_factory.py,sha256=NzNSiqix4hlVXk3xnyzdg2WDxomkectf97UWdVS3POo,11667
3
- euroeval/benchmarker.py,sha256=JkhvYxhVpQPcWmDLzwnB8Yy6tTqj3yfDWTefklbI7RM,50355
1
+ euroeval/__init__.py,sha256=mXTjuGrEE-1fIS9x28oJKg-gNGt4q7y2E74l330KEmY,3787
2
+ euroeval/benchmark_config_factory.py,sha256=NcdxQkGrstsprdz1QW3XrgS8B65uEP5SqxFJoL8zEEk,11831
3
+ euroeval/benchmarker.py,sha256=I82iVGwlRJ9BQ02u_bt5ngN-ZzWEJT2ReCrqXgh6lx4,55285
4
4
  euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
5
- euroeval/cli.py,sha256=wUGetj9Ld4wkS872ZOfYqHIJMh58o8L2MDi78wU5nxI,9099
5
+ euroeval/cli.py,sha256=GOAWzdtasJfOvTuVQszu-T1T9GfQ_un-blOICO-y7g4,9316
6
6
  euroeval/constants.py,sha256=NN7kcwQdlDyyGFSrLjsL_qKVRyoRqZ9sKO5SjlgtRwA,2741
7
7
  euroeval/data_loading.py,sha256=F3fHyR7FoS_a1dx_DyqtcxdB-jxWwE3RCNRvWcp5z1c,4527
8
- euroeval/data_models.py,sha256=S-PATp4F1wBwvra6wtjlJFXxZbZB_vEpJHXcdTTKA70,27593
8
+ euroeval/data_models.py,sha256=LNioJFW231RSSKZx7WIs46Xxs0KWgb7ElRyyULHSEzQ,27742
9
9
  euroeval/enums.py,sha256=SeFek-Lre2Q5sxbP5svqjDZFZR2vlJhg9dkRH4JvU1g,3436
10
10
  euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
11
11
  euroeval/finetuning.py,sha256=G86pxxjOAgtcEWpyYDwYOV9pM7WG2Uu9fu7GdDso8dI,11426
12
- euroeval/generation.py,sha256=MSrd0oIkoqwKsCOaIkY2CFF_urXLOfNR1OO5nMvcCpY,12476
13
- euroeval/generation_utils.py,sha256=OtEXLhI6L1vlbC768dH3xzj0qkokz43m0vswGKrRmBA,18061
12
+ euroeval/generation.py,sha256=Va3EOmFzOMBNfI4fh3nW5qhhrM3CBT8_4MaLwVtsF_E,12528
13
+ euroeval/generation_utils.py,sha256=d2_vylWXIeH4xIXgbsI5rN6dMt0zKp0zXExD6aOKWaA,18299
14
14
  euroeval/languages.py,sha256=G2cJI8lDT7eOFHxNR9opJ6zWjdxFDwm8P8HY_4WKFI4,33815
15
15
  euroeval/model_cache.py,sha256=h61cL_fy2Sd1sqYZis5lAWqvQIfQXXt_v8QZeftKNkg,9226
16
16
  euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
17
17
  euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
18
18
  euroeval/scores.py,sha256=HQQqyjdgm853FZ_ifIdnSltKfBhsY7pOITov6F3Et5o,3165
19
19
  euroeval/speed_benchmark.py,sha256=3iz_bfJgAoJ9K2HNjufyrBMjHVT8PAjuY_NocBGwKe0,4044
20
- euroeval/tasks.py,sha256=3qEOBAMmfeqgXqlGkCKzQ-s0Yw-0-jPRgFZ97EZCFng,4535
21
- euroeval/tokenisation_utils.py,sha256=jRIi9m8XmGh3LeZna47AWmJI9U9m4ojXQynQTe7kzWc,21344
22
- euroeval/types.py,sha256=SCKOALV_-F1PAIwQ7qHNdSF1Uy29TSu9nIc1NYJGUUs,2754
23
- euroeval/utils.py,sha256=c0tFw1IXZIqgLU4EfY_k28iJ1ZlCZ_oFoKZH2sGCKYg,16499
20
+ euroeval/tasks.py,sha256=EzEWFDo_0ffabBFiRu-mw80jENUioE8D_VEn_Dsv-F8,4703
21
+ euroeval/tokenisation_utils.py,sha256=nLeF2cdZSm5PZiAcDTtxY82nUJ-or8VU8YxYLa167EM,21158
22
+ euroeval/types.py,sha256=_iVy-RwiCGu9TNX2sfyJTdCvXy1akNGTCywAo-YpBqU,2815
23
+ euroeval/utils.py,sha256=DRJW6wtmNpRtuHt03diWo3S5m3rdxoPEQpd-KWi7aGY,19255
24
24
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
25
25
  euroeval/benchmark_modules/base.py,sha256=mHF8XS6GGUXV-sJtxmI5WJBWPLMHuh-4Z4OWjC25x9Y,11566
26
26
  euroeval/benchmark_modules/fresh.py,sha256=TveSQiFBi3xXgCEQBdHwkUQ685PDkKW0y3G5Yt5rkeM,10655
27
- euroeval/benchmark_modules/hf.py,sha256=oBjVumnSM9PW7ZocQwCGLKpbeGFWLN_71DBotxZo1aY,44038
28
- euroeval/benchmark_modules/litellm.py,sha256=6EKjHnUoPCpuupISZHXqZsXLG8tyiA1-G12a5C6L8MM,64629
29
- euroeval/benchmark_modules/vllm.py,sha256=sYFdVzB9CZX6_sGI4xghDyXoVn6I95_nbeFUWeSMXcc,43132
27
+ euroeval/benchmark_modules/hf.py,sha256=XmkoDFzaJqnd_5mmUkqCaOgAdRPFs3KZKZZ0cr83TlM,44742
28
+ euroeval/benchmark_modules/litellm.py,sha256=F3udd6NmhQOe3go_7rAcWg7mgZrNQpWWvLe-5U4E2RQ,64771
29
+ euroeval/benchmark_modules/vllm.py,sha256=yLy8TCTnodu4NdTiO7XSdxuHX60AJ1-7p6J3e5h7-iA,43994
30
30
  euroeval/dataset_configs/__init__.py,sha256=uuIZmElpJV8iupo5oDj3TeQhBDRANdWpLKYFASLirHA,2046
31
31
  euroeval/dataset_configs/danish.py,sha256=QABfgI7m-0-5AimDXegp5ssDSLcM2VrAI_RWsinSZP4,5631
32
32
  euroeval/dataset_configs/dutch.py,sha256=63Ro2yFym5MuIDXf5953vUYenw9B0kZSCmZbXjdy4Rs,5517
@@ -45,26 +45,26 @@ euroeval/dataset_configs/portuguese.py,sha256=gQ054SdLQ5fkm4IAP6Mdh5RcPDJPDITcuy
45
45
  euroeval/dataset_configs/spanish.py,sha256=DvJlMK6OQg4qmxKzQA2IficlBMB7BafvxqIVuTKiZyw,4902
46
46
  euroeval/dataset_configs/swedish.py,sha256=YWHp7hbJ25o36csSg9uXaQCEJK1BPb7u2RQZiCe0lNs,5445
47
47
  euroeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,199
48
- euroeval/metrics/base.py,sha256=4vnRIPfKUwTNe0ZVm5YC2jQNecwchGUpN6nAH5cX0PM,2288
49
- euroeval/metrics/huggingface.py,sha256=b_Z_FUELQcmK7HeJh0zlAZs3pim1uNHnFLu7nvlZ4_A,5824
48
+ euroeval/metrics/base.py,sha256=HST2XeZrUQZV_vTiieePiaznEov3CIGzuVNIITtLsQc,2596
49
+ euroeval/metrics/huggingface.py,sha256=iHKJnvOXRc_e8sxB2ff3WkfK64jXyn5KEnIxPyfD2fM,6522
50
50
  euroeval/metrics/llm_as_a_judge.py,sha256=YCUHWK3_bkMEYvL7Q79ZAK3V0M1m5rq5zJYdtMxa4fs,9686
51
51
  euroeval/metrics/pipeline.py,sha256=Wcan3eDWV7t4WRXMPWCCe_JsA-fZnIfZU2ESinbbL2I,10284
52
52
  euroeval/metrics/speed.py,sha256=tLna031y0SVzAv6lvXBxf8IOSiw9dvLlonky2zM3MnE,1369
53
53
  euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
54
- euroeval/prompt_templates/linguistic_acceptability.py,sha256=pRR1QBnYt5DnfxQp6dw1OYFZfIct-1R9pfdgPGpjoco,8667
54
+ euroeval/prompt_templates/linguistic_acceptability.py,sha256=m23LrckohdnToQDsexdsW_5YyBfGTf5DTjiMI643F9A,8717
55
55
  euroeval/prompt_templates/multiple_choice.py,sha256=Q-8-ETqG-RZeLzR8v8WUBIN7djiNSfNpmYnZRUWcd84,6905
56
- euroeval/prompt_templates/named_entity_recognition.py,sha256=LT7J6Y9rUCJFimpnwujBZq_V5buSmXHJteIXbTOoaCE,16442
56
+ euroeval/prompt_templates/named_entity_recognition.py,sha256=HIX9EBkSIBl5JXceFtiZTdvzWr9YHM9-55D6bcjIyQ4,16436
57
57
  euroeval/prompt_templates/reading_comprehension.py,sha256=ogzmhiSZO6egrdxxQiWz6a0XMdC0vws-lg5yRKQoYV0,8730
58
- euroeval/prompt_templates/sentiment_classification.py,sha256=BwnTpSdsAN_rL693ImgtKIRc5T_2G6ptWW0jCdC02NQ,9454
58
+ euroeval/prompt_templates/sentiment_classification.py,sha256=b3TvH26M77vwFfn577NlGVW881qfV7YSm-Xba_w98Fc,9504
59
59
  euroeval/prompt_templates/summarization.py,sha256=4Sqwj6C7yNfqj4FFFCseJMLDoSZ13aIOgY0SjIzzsNo,6593
60
60
  euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
61
61
  euroeval/task_group_utils/multiple_choice_classification.py,sha256=i5sidJGAXnENRoB6pOelyaUeGP1qoxwPSzD-F9RLwWk,7106
62
62
  euroeval/task_group_utils/question_answering.py,sha256=eUczZntrC9lhCUQlwNQB49i-5Ei12cdRnrfq4pE-T7Y,27750
63
- euroeval/task_group_utils/sequence_classification.py,sha256=qWUUrh4X4jK2XfUzP4aoPDoJhVJifrnDEaaw_F48hig,16080
63
+ euroeval/task_group_utils/sequence_classification.py,sha256=TAqZCoMQ9I-HFhMH35_J1mY2SQg95HUbXcgrBIyhgk0,16082
64
64
  euroeval/task_group_utils/text_to_text.py,sha256=7f4hGAs5WNJ9PmW1mLhjDMrPxrYAvw5axXsneiJop1w,4993
65
65
  euroeval/task_group_utils/token_classification.py,sha256=Yjai937ia1nZBMOWySqCXr_dA6WiVLGvmb4Hm_TU0Bg,17118
66
- euroeval-16.1.0.dist-info/METADATA,sha256=pYdW0IZwY8vatTA55EERxBK1kMaQuGhqzNys5xiSqsM,13729
67
- euroeval-16.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
68
- euroeval-16.1.0.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
69
- euroeval-16.1.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
70
- euroeval-16.1.0.dist-info/RECORD,,
66
+ euroeval-16.2.0.dist-info/METADATA,sha256=GQ1C9avsX8wl0Hcj3wmXvziveGDFWUT2aUrhhjIDzwc,14590
67
+ euroeval-16.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
68
+ euroeval-16.2.0.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
69
+ euroeval-16.2.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
70
+ euroeval-16.2.0.dist-info/RECORD,,