EuroEval 15.4.0__py3-none-any.whl → 15.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

euroeval/benchmark_modules/hf.py CHANGED
@@ -20,6 +20,7 @@ from huggingface_hub.utils import (
     HFValidationError,
     LocalTokenNotFoundError,
 )
+from peft import PeftConfig
 from requests.exceptions import RequestException
 from torch import nn
 from transformers import (
@@ -34,6 +35,9 @@ from transformers import (
     Trainer,
 )
 from transformers.modelcard import TASK_MAPPING
+from transformers.models.auto.modeling_auto import (
+    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
+)
 from urllib3.exceptions import RequestError
 
 from ..constants import (
@@ -73,6 +77,7 @@ from ..utils import (
     get_class_by_name,
     get_eos_token,
     internet_connection_available,
+    log_once,
 )
 from .base import BenchmarkModule
 
@@ -727,53 +732,54 @@ def get_model_repo_info(
     # If the model does not exist locally, then we get the model info from the Hugging
     # Face Hub
     if model_info is None:
-        try:
-            model_info = hf_api.model_info(
-                repo_id=model_id, revision=revision, token=token
-            )
-        except (GatedRepoError, LocalTokenNotFoundError) as e:
+        num_attempts = 3
+        for _ in range(num_attempts):
             try:
-                hf_whoami(token=token)
-                logger.warning(
-                    f"Could not access the model {model_id} with the revision "
-                    f"{revision}. The error was {str(e)!r}."
+                model_info = hf_api.model_info(
+                    repo_id=model_id, revision=revision, token=token
                 )
+                break
+            except (GatedRepoError, LocalTokenNotFoundError) as e:
+                try:
+                    hf_whoami(token=token)
+                    logger.warning(
+                        f"Could not access the model {model_id} with the revision "
+                        f"{revision}. The error was {str(e)!r}."
+                    )
+                    return None
+                except LocalTokenNotFoundError:
+                    raise NeedsAdditionalArgument(
+                        cli_argument="--api-key",
+                        script_argument="api_key=<your-api-key>",
+                        run_with_cli=benchmark_config.run_with_cli,
+                    )
+            except (RepositoryNotFoundError, HFValidationError):
                 return None
-            except LocalTokenNotFoundError:
-                raise NeedsAdditionalArgument(
-                    cli_argument="--api-key",
-                    script_argument="api_key=<your-api-key>",
-                    run_with_cli=benchmark_config.run_with_cli,
-                )
-        except (RepositoryNotFoundError, HFValidationError):
-            return None
-        except (OSError, RequestException):
-            if internet_connection_available():
-                raise HuggingFaceHubDown()
-            else:
+            except (OSError, RequestException):
+                if internet_connection_available():
+                    continue
                 raise NoInternetConnection()
+        else:
+            raise HuggingFaceHubDown()
 
     # Get all the Hugging Face repository tags for the model. If the model is an adapter
     # model, then we also get the tags for the base model
     tags = model_info.tags or list()
-    has_base_model_tag = any(
-        tag.startswith("base_model:") and tag.count(":") == 1 for tag in tags
-    )
     base_model_id: str | None = None
-    if has_base_model_tag:
-        has_adapter_config = model_info.siblings is not None and any(
-            sibling.rfilename == "adapter_config.json"
-            for sibling in model_info.siblings
+    has_adapter_config = model_info.siblings is not None and any(
+        sibling.rfilename == "adapter_config.json" for sibling in model_info.siblings
+    )
+    if has_adapter_config:
+        adapter_config = PeftConfig.from_pretrained(model_id, revision=revision)
+        base_model_id = adapter_config.base_model_name_or_path
+        log_once(
+            f"Model {model_id!r} identified as an adapter model, with base model "
+            f"{base_model_id!r}.",
+            level=logging.DEBUG,
        )
-        if has_adapter_config:
-            base_model_id = [
-                tag.split(":")[1]
-                for tag in tags
-                if tag.startswith("base_model:") and tag.count(":") == 1
-            ][0]
+    if base_model_id is not None:
        base_model_info = hf_api.model_info(
            repo_id=base_model_id,
-            revision=revision,
            token=benchmark_config.api_key
            or os.getenv("HUGGINGFACE_API_KEY")
            or True,
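The Hub lookup above now uses Python's for/else retry idiom: `break` on success, `continue` on a retryable error, and the loop's `else` branch, which only runs when no `break` fired, raises once the attempts are exhausted. A minimal self-contained sketch of the idiom, with a hypothetical `flaky_call` standing in for `hf_api.model_info`:

```python
# Minimal sketch of the for/else retry idiom used in the hunk above.
# `flaky_call` is a hypothetical stand-in for `hf_api.model_info`.
import typing as t


def fetch_with_retries(flaky_call: t.Callable[[], object], num_attempts: int = 3) -> object:
    for _ in range(num_attempts):
        try:
            result = flaky_call()
            break  # success, so the else branch below is skipped
        except ConnectionError:
            continue  # transient failure, so try again
    else:
        # Runs only if the loop finished without a break, i.e. every attempt failed
        raise RuntimeError(f"All {num_attempts} attempts failed.")
    return result
```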
@@ -781,12 +787,18 @@ get_model_repo_info(
     tags += base_model_info.tags or list()
     tags = list(set(tags))
 
+    # TEMP: This extends the `TASK_MAPPING` dictionary to include the missing
+    # 'image-text-to-text' pipeline tag. This will be added as part of `TASK_MAPPING`
+    # when this PR has been merged in and published:
+    # https://github.com/huggingface/transformers/pull/37107
+    TASK_MAPPING["image-text-to-text"] = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+
     # Get the pipeline tag for the model. If it is not specified, then we determine it
     # by checking the model's architecture as written in the model's Hugging Face config
     pipeline_tag = model_info.pipeline_tag
     if pipeline_tag is None:
         hf_config = load_hf_model_config(
-            model_id=model_id,
+            model_id=base_model_id or model_id,
             num_labels=0,
             id2label=dict(),
             label2id=dict(),
@@ -812,7 +824,6 @@
         pipeline_tag = "fill-mask"
 
     if benchmark_config.only_allow_safetensors:
-        # Check if any file ends with .safetensors
         repo_files = hf_api.list_repo_files(repo_id=model_id, revision=revision)
         has_safetensors = any(f.endswith(".safetensors") for f in repo_files)
         if not has_safetensors:
@@ -826,6 +837,26 @@
             )
             raise InvalidModel(msg)
 
+        # Also check base model if we are evaluating an adapter
+        if base_model_id is not None:
+            base_repo_files = hf_api.list_repo_files(repo_id=base_model_id)
+            base_has_safetensors = any(
+                f.endswith(".safetensors") for f in base_repo_files
+            )
+            if not base_has_safetensors:
+                msg = (
+                    f"Base model {base_model_id} does not have safetensors weights "
+                    "available."
+                )
+                if benchmark_config.run_with_cli:
+                    msg += " Skipping since the `--only-allow-safetensors` flag is set."
+                else:
+                    msg += (
+                        " Skipping since the `only_allow_safetensors` argument is set "
+                        "to `True`."
+                    )
+                raise InvalidModel(msg)
+
     return HFModelInfo(
         pipeline_tag=pipeline_tag, tags=tags, adapter_base_model_id=base_model_id
     )
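Adapter repositories are now detected from the presence of an `adapter_config.json` sibling, and the base model is resolved via PEFT rather than parsed out of `base_model:` Hub tags. A sketch of that resolution, with an illustrative repository id:

```python
# Sketch of the adapter-to-base-model resolution used above. The repo id is
# illustrative; any Hub repository containing an adapter_config.json works.
from peft import PeftConfig

adapter_config = PeftConfig.from_pretrained("some-user/some-lora-adapter")
print(adapter_config.base_model_name_or_path)  # e.g. "meta-llama/Llama-3.1-8B"
```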
euroeval/benchmark_modules/vllm.py CHANGED
@@ -30,6 +30,7 @@ from ..constants import (
     REASONING_MAX_TOKENS,
     TASK_GROUPS_USING_LOGPROBS,
     TASKS_USING_JSON,
+    VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
 from ..data_models import (
     BenchmarkConfig,
@@ -65,6 +66,7 @@ from ..utils import (
     get_bos_token,
     get_end_of_chat_token_ids,
     get_eos_token,
+    get_min_cuda_compute_capability,
     log_once,
     should_prompts_be_stripped,
 )
@@ -145,6 +147,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
                 repo_id=self.model_config.model_id,
+                revision=self.model_config.revision,
                 cache_dir=Path(self.model_config.model_cache_dir),
             )
             self.buffer["lora_request"] = LoRARequest(
@@ -373,12 +376,27 @@
 
         # Generate sequences using vLLM
         input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
-        raw_outputs = self._model.generate(
-            prompts=prompts,
-            sampling_params=sampling_params,
-            use_tqdm=(not input_is_a_test),
-            lora_request=self.buffer.get("lora_request"),
-        )
+        num_attempts = 3
+        for _ in range(num_attempts):
+            try:
+                raw_outputs = self._model.generate(
+                    prompts=prompts,
+                    sampling_params=sampling_params,
+                    use_tqdm=(not input_is_a_test),
+                    lora_request=self.buffer.get("lora_request"),
+                )
+                break
+            except TypeError as e:
+                logger.debug(
+                    f"Encountered error during vLLM generation: {str(e)}. Retrying..."
+                )
+                sleep(1)
+        else:
+            raise InvalidBenchmark(
+                f"Could not generate sequences after {num_attempts} attempts."
+            )
+
+        # Parse the raw model outputs
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
@@ -846,13 +864,16 @@ load_model_and_tokenizer(
     # Prefer base model ID if the model is an adapter - the adapter will be added on
     # during inference in this case
     model_id = model_config.adapter_base_model_id or model_config.model_id
+    revision = (
+        model_config.revision if model_config.adapter_base_model_id is None else "main"
+    )
 
     hf_model_config = load_hf_model_config(
         model_id=model_id,
         num_labels=0,
         id2label=dict(),
         label2id=dict(),
-        revision=model_config.revision,
+        revision=revision,
         model_cache_dir=model_config.model_cache_dir,
         api_key=benchmark_config.api_key,
         trust_remote_code=benchmark_config.trust_remote_code,
@@ -881,6 +902,23 @@
         )
         dtype = torch.float16
 
+    if hf_model_config.torch_dtype == torch.bfloat16:
+        min_cuda_compute_capability = get_min_cuda_compute_capability()
+        required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
+
+        if min_cuda_compute_capability is not None:
+            if min_cuda_compute_capability < required_capability:
+                logger.info(
+                    "You are loading a model with "
+                    f"dtype {hf_model_config.torch_dtype}, "
+                    "which vLLM only supports for CUDA devices with "
+                    f"CUDA compute capability >={required_capability}. "
+                    "You are using one or more devices with "
+                    f"compute capability {min_cuda_compute_capability}. "
+                    "Setting dtype to float16 instead."
+                )
+                dtype = torch.float16
+
     if model_config.adapter_base_model_id is not None:
         download_dir = str(Path(model_config.model_cache_dir) / "base_model")
     else:
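The bfloat16 fallback above keys off CUDA compute capabilities, which PyTorch reports per device as a `(major, minor)` tuple. A small sketch of the underlying check (device examples illustrative):

```python
# Sketch of the capability check behind the bfloat16 fallback above.
# torch.cuda.get_device_capability returns a (major, minor) tuple, e.g.
# (8, 0) on an A100 or (7, 5) on a T4; vLLM needs 8.0+ for bfloat16.
import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability(0)
    supports_bf16 = float(f"{major}.{minor}") >= 8.0
```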
@@ -916,7 +954,7 @@
         max_model_len=min(true_max_model_len, 5_000),
         download_dir=download_dir,
         trust_remote_code=benchmark_config.trust_remote_code,
-        revision=model_config.revision,
+        revision=revision,
         seed=4242,
         distributed_executor_backend=executor_backend,
         tensor_parallel_size=torch.cuda.device_count(),
@@ -994,6 +1032,7 @@
     Returns:
         The loaded tokenizer.
     """
+    revision = revision if adapter_base_model_id is None else "main"
     config = AutoConfig.from_pretrained(
         adapter_base_model_id or model_id,
         revision=revision,
euroeval/constants.py CHANGED
@@ -54,3 +54,6 @@ METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
 
 # Hugging Face Hub tags used to classify models as merge models
 MERGE_TAGS = ["merge", "mergekit"]
+
+# The minimum required CUDA compute capability for using bfloat16 in vLLM
+VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
euroeval/data_models.py CHANGED
@@ -1,7 +1,6 @@
 """Data models used in EuroEval."""
 
 import collections.abc as c
-import importlib.metadata
 import json
 import pathlib
 import re
@@ -11,6 +10,8 @@ from dataclasses import dataclass, field
 import pydantic
 import torch
 
+from euroeval.utils import get_package_version
+
 from .enums import Device, InferenceBackend, ModelType, TaskGroup
 from .types import ScoreDict
 
@@ -228,7 +229,11 @@ class BenchmarkResult(pydantic.BaseModel):
     generative_type: str | None
     few_shot: bool
     validation_split: bool
-    euroeval_version: str = importlib.metadata.version("euroeval")
+    euroeval_version: str | None = get_package_version("euroeval")
+    transformers_version: str | None = get_package_version("transformers")
+    torch_version: str | None = get_package_version("torch")
+    vllm_version: str | None = get_package_version("vllm")
+    outlines_version: str | None = get_package_version("outlines")
 
     @classmethod
     def from_dict(cls, config: dict) -> "BenchmarkResult":
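`BenchmarkResult` now records the versions of the main dependencies next to the EuroEval version, so a stored result can be traced back to the stack that produced it. An illustrative extract of a serialised result after this change (all version values hypothetical):

```python
# Illustrative extract of a serialised BenchmarkResult; every version value
# below is hypothetical, and optional dependencies that are not installed
# are recorded as None.
record = {
    "euroeval_version": "15.4.2",
    "transformers_version": "4.50.0",
    "torch_version": "2.6.0",
    "vllm_version": None,
    "outlines_version": "0.1.11",
}
```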
euroeval/dataset_configs.py CHANGED
@@ -244,7 +244,7 @@ FOSENT_CONFIG = DatasetConfig(
 ALLOCINE_CONFIG = DatasetConfig(
     name="allocine",
     pretty_name="the truncated version of the French sentiment classification "
-    "dataset Allocine",
+    "dataset AlloCiné",
     huggingface_id="EuroEval/allocine-mini",
     task=SENT,
     languages=[FR],
@@ -1467,9 +1467,9 @@ NORDJYLLAND_NEWS_CONFIG = DatasetConfig(
     max_generated_tokens=256,
 )
 
-MLSUM_CONFIG = DatasetConfig(
-    name="mlsum",
-    pretty_name="the truncated version of the German summarisation dataset MLSum",
+MLSUM_DE_CONFIG = DatasetConfig(
+    name="mlsum-de",
+    pretty_name="the truncated version of the German summarisation dataset MLSum-de",
     huggingface_id="EuroEval/mlsum-mini",
     task=SUMM,
     languages=[DE],
@@ -1484,7 +1484,7 @@ MLSUM_CONFIG = DatasetConfig(
 
 MLSUM_ES_CONFIG = DatasetConfig(
     name="mlsum-es",
-    pretty_name="the truncated version of the Spanish summarisation dataset MLSum",
+    pretty_name="the truncated version of the Spanish summarisation dataset MLSum-es",
     huggingface_id="EuroEval/mlsum-es-mini",
     task=SUMM,
     languages=[ES],
euroeval/generation.py CHANGED
@@ -20,7 +20,12 @@ from .model_cache import (
 from .utils import clear_memory
 
 if t.TYPE_CHECKING:
-    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
+    from .data_models import (
+        BenchmarkConfig,
+        DatasetConfig,
+        GenerativeModelOutput,
+        ModelConfig,
+    )
 
 logger = logging.getLogger("euroeval")
 
@@ -163,6 +168,7 @@ def generate_single_iteration(
     if benchmark_config.debug:
         debug_log(
             batch=batch,
+            model_output=model_output,
             extracted_labels=extracted_labels,  # type: ignore[arg-type]
             dataset_config=dataset_config,
         )
@@ -217,6 +223,7 @@ def generate_single_iteration(
 
 def debug_log(
     batch: dict[str, t.Any],
+    model_output: "GenerativeModelOutput",
     extracted_labels: list[dict | str | list[str]],
     dataset_config: "DatasetConfig",
 ) -> None:
@@ -225,6 +232,8 @@ def debug_log(
     Args:
         batch:
             The batch of examples to evaluate on.
+        model_output:
+            The output of the model.
         extracted_labels:
             The extracted labels from the model output.
         dataset_config:
@@ -290,7 +299,12 @@ def debug_log(
     else:
         input_texts = batch["text"]
 
-    for input_text, prediction, label in zip(input_texts, extracted_labels, labels):
+    for input_text, raw_output, prediction, label in zip(
+        input_texts, model_output.sequences, extracted_labels, labels
+    ):
         logger.info(
-            f"Input: '{input_text}'\nPrediction: '{prediction}'\nLabel: '{label}'"
+            f"Input: '{input_text}'\n"
+            f"Raw output: '{raw_output}'\n"
+            f"Prediction: '{prediction}'\n"
+            f"Label: '{label}'"
         )
euroeval/task_utils/sequence_classification.py CHANGED
@@ -162,9 +162,7 @@ def get_closest_logprobs_labels(
     """
     english_labels = list(dataset_config.id2label.values())
     english2local = dataset_config.prompt_label_mapping
-    candidate_labels = [
-        english2local[lbl].lower() for lbl in english_labels
-    ] + english_labels
+    candidate_labels = [english2local[lbl].lower() for lbl in english_labels]
 
     output_labels: list[str] = list()
     for sample in generation_logprobs:
@@ -179,21 +177,48 @@
             ]
             generated_labels = [label for label in generated_labels if label != ""]
 
-            # We want to use the first generated label which starts with a candidate
+            # We want to use the first generated label which contains a unique candidate
             # label, as the output label
             output_label: str | None = None
-            for generated_label in generated_labels:
-                candidate_output_labels = [
+            previously_generated_labels: list[str] = list()
+            for label_idx, generated_label in enumerate(generated_labels):
+                generated_label = "".join(previously_generated_labels) + generated_label
+
+                # Get the candidate labels that start with the generated label
+                candidate_output_labels = {
                     candidate_label
                     for candidate_label in candidate_labels
                     if candidate_label.startswith(generated_label)
-                ]
-                if candidate_output_labels:
-                    output_label = candidate_output_labels[0]
+                }
+
+                # If we can uniquely determine the output label, we break the loop. If
+                # there are multiple possible labels then we store the current one, and
+                # concatenate it with the next generated label. We can only do this if
+                # the current one is the first one, however, since we're using greedy
+                # sampling. In case this happens for a label that is not the first one,
+                # we warn the user.
+                if len(candidate_output_labels) == 1:
+                    output_label = candidate_output_labels.pop()
                     break
+                elif len(candidate_output_labels) > 1:
+                    if label_idx == 0:
+                        previously_generated_labels.append(generated_label)
+                    else:
+                        output_label = candidate_output_labels.pop()
+                        candidate_output_labels.add(output_label)
+                        log_once(
+                            "Multiple candidate labels found for the generated label "
+                            f"{generated_label!r}: {candidate_output_labels}. Since "
+                            "this is not the first generated label, we cannot "
+                            "concatenate it with the next generated label. We are thus "
+                            f"forced to use the arbitrary {output_label!r} as the "
+                            "output label, potentially resulting in worse performance. "
+                            "Please report this issue to the EuroEval team at "
+                            "github.com/EuroEval/EuroEval/issues.",
+                            level=logging.WARNING,
+                        )
 
             if output_label is not None:
-                output_label = english2local.get(output_label, output_label)
                 output_labels.append(output_label)
                 break
         else:
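The reworked matching no longer falls back to the English labels, and instead concatenates ambiguous prefixes with the next generated token until a unique candidate remains. A simplified worked sketch (the labels are illustrative, and unlike the real code this version buffers ambiguous prefixes at every step rather than only for the first generated label):

```python
# Simplified sketch of the prefix-matching logic above, with illustrative
# labels. Unlike the real code, this version buffers ambiguous prefixes at
# every step instead of only for the first generated label.
candidate_labels = ["positiv", "negativ", "neutral"]


def match(generated_tokens: list[str]) -> str | None:
    buffer = ""
    for token in generated_tokens:
        buffer += token
        candidates = {c for c in candidate_labels if c.startswith(buffer)}
        if len(candidates) == 1:
            return candidates.pop()  # unique prefix match found
    return None


print(match(["ne", "g"]))  # "negativ" -- "ne" alone matches two candidates
print(match(["pos"]))      # "positiv"
```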
euroeval/types.py CHANGED
@@ -8,9 +8,9 @@ if t.TYPE_CHECKING:
     from .data_models import GenerativeModelOutput
 
 
-ScoreDict = dict[str, dict[str, float] | list[dict[str, float]]]
-Predictions = NDArray | list[str] | list[list[str]]
-Labels = NDArray | list[str] | list[list[str]]
+ScoreDict: t.TypeAlias = dict[str, dict[str, float] | list[dict[str, float]]]
+Predictions: t.TypeAlias = NDArray | list[str] | list[list[str]]
+Labels: t.TypeAlias = NDArray | list[str] | list[list[str]]
 
 
 class ComputeMetricsFunction(t.Protocol):
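The aliases themselves are unchanged; the explicit `t.TypeAlias` annotation (PEP 613) just marks the assignments as type aliases for type checkers. A minimal illustration:

```python
# Minimal illustration of an explicit type alias (PEP 613): the annotation
# tells type checkers the assignment defines an alias, not a plain value.
import typing as t

UserId: t.TypeAlias = int | str


def lookup(user_id: UserId) -> None: ...
```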
euroeval/utils.py CHANGED
@@ -2,11 +2,11 @@
 
 import gc
 import importlib
+import importlib.metadata
 import importlib.util
 import logging
 import os
 import random
-import re
 import sys
 import typing as t
 import warnings
@@ -16,7 +16,6 @@ from types import TracebackType
 
 import litellm
 import numpy as np
-import pkg_resources
 import requests
 import torch
 from datasets.utils import disable_progress_bar
@@ -84,33 +83,6 @@ def enforce_reproducibility(seed: int = 4242) -> np.random.Generator:
     return rng
 
 
-def is_module_installed(module: str) -> bool:
-    """Check if a module is installed.
-
-    This is used when dealing with spaCy models, as these are installed as separate
-    Python packages.
-
-    Args:
-        module:
-            The name of the module.
-
-    Returns:
-        Whether the module is installed or not.
-    """
-    # Get list of all modules, including their versions
-    installed_modules_with_versions = list(pkg_resources.working_set)
-
-    # Strip the module versions from the list of modules. Also make the modules lower
-    # case and replace dashes with underscores
-    installed_modules = [
-        re.sub("[0-9. ]", "", str(module)).lower().replace("-", "_")
-        for module in installed_modules_with_versions
-    ]
-
-    # Check if the module is installed by checking if the module name is in the list
-    return module.lower() in installed_modules
-
-
 def block_terminal_output() -> None:
     """Blocks libraries from writing output to the terminal.
 
@@ -206,6 +178,21 @@ def get_class_by_name(class_name: str | list[str], module_name: str) -> t.Type |
     return None
 
 
+def get_min_cuda_compute_capability() -> float | None:
+    """Gets the lowest CUDA compute capability among the available devices.
+
+    Returns:
+        The device capability as a float, or None if CUDA is not available.
+    """
+    if not torch.cuda.is_available():
+        return None
+
+    device_range = range(torch.cuda.device_count())
+    capabilities = map(torch.cuda.get_device_capability, device_range)
+    major, minor = min(capabilities)
+    return float(f"{major}.{minor}")
+
+
 def kebab_to_pascal(kebab_string: str) -> str:
     """Converts a kebab-case string to PascalCase.
 
@@ -573,3 +560,19 @@ def log_once(message: str, level: int = logging.INFO) -> None:
             logger.critical(message)
         case _:
             raise ValueError(f"Invalid logging level: {level}")
+
+
+def get_package_version(package_name: str) -> str | None:
+    """Get the version of a package.
+
+    Args:
+        package_name:
+            The name of the package.
+
+    Returns:
+        The version of the package, or None if the package is not installed.
+    """
+    try:
+        return importlib.metadata.version(package_name)
+    except importlib.metadata.PackageNotFoundError:
+        return None
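A usage sketch for the two helpers added above (all printed values illustrative):

```python
# Usage sketch for the new helpers; the printed values are illustrative.
from euroeval.utils import get_min_cuda_compute_capability, get_package_version

print(get_min_cuda_compute_capability())  # e.g. 8.0 on an A100 machine, None on CPU
print(get_package_version("vllm"))        # e.g. "0.8.0", or None if not installed
```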
euroeval-15.4.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.4.0
+Version: 15.4.2
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -42,6 +42,7 @@ Requires-Dist: more-itertools>=10.5.0
 Requires-Dist: numpy<2.0.0,>=1.23.0
 Requires-Dist: ollama>=0.4.7
 Requires-Dist: pandas>=2.2.0
+Requires-Dist: peft>=0.15.0
 Requires-Dist: protobuf~=3.20.0
 Requires-Dist: pydantic>=2.6.0
 Requires-Dist: pyinfer>=0.0.3
@@ -61,12 +62,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
 Requires-Dist: outlines>=0.1.11; extra == 'all'
-Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: outlines>=0.1.11; extra == 'generative'
-Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test
euroeval-15.4.2.dist-info/RECORD CHANGED
@@ -3,14 +3,14 @@ euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHh
 euroeval/benchmarker.py,sha256=PIdqLPleLN3nml5Zb1g_dQaLzqxQhmgC8VuvD5yloV4,46524
 euroeval/callbacks.py,sha256=bThUUxOgkMuESUQ5rrFRoSumKV8vNw53CslIZTpkt54,2438
 euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
-euroeval/constants.py,sha256=9iXe26WAigL9RYob3PhsB5c0dr11wCeRxrEfm_ssynM,1562
+euroeval/constants.py,sha256=zL8dm7SEFpIgC2vaPhqzdKydVSWW-ZyMHenWPnNxWqQ,1681
 euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
-euroeval/data_models.py,sha256=4ZY9x2pINlRywTzYxxtrYG7qXMNdod5I9XBOlTJYT8E,14495
-euroeval/dataset_configs.py,sha256=bjMUXvaEtTpo1Eql_mIRCG3K_lB2DZRdPWEAwR5N4ig,90627
+euroeval/data_models.py,sha256=b4rOMdhoxkIPcnTQdwqq5iWaF6uia1OzAgdiOBvoGVM,14779
+euroeval/dataset_configs.py,sha256=C5Gnp95cBeCmmuRA8Rznt0c4gMOn8Pilk_kDCleDMjg,90640
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
 euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
-euroeval/generation.py,sha256=UZ9nmKl4rbNBhW41iwpgw_tqfsEfe1UhOnjGudz9GWs,10382
+euroeval/generation.py,sha256=dohSPYc4eASm5tJhNKfBlpJnellKG7nVeyx8yXXxMlE,10721
 euroeval/human_evaluation.py,sha256=5uOm8cZf5uy2jBPs-ih7g8ni-a3hUz8UiXVPh6PzUWw,27675
 euroeval/languages.py,sha256=d1SyG0KVtCAA_PYpFGZCgZcyVLIr7Q8uYKPxNw6WEBc,7909
 euroeval/model_cache.py,sha256=BhkyWrOhjskESbndy218LUv1ZiWRc48ScdH_42dKHtE,8275
@@ -19,22 +19,22 @@ euroeval/model_loading.py,sha256=ta07tMoSfK1kqjOynVXQA0vVrns6RzsCEE3g1_RGVVs,271
 euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
 euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
 euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
-euroeval/types.py,sha256=xvBn0eNynqAqwL7CGEgVFb_lCD9SdHUMvxJo7OXRfls,2367
-euroeval/utils.py,sha256=MkiVI-0KmK4ilKJTTfYAynKaPDOzW1WjyRdZsYmnoIg,18803
+euroeval/types.py,sha256=5DIhaVyzH8RO9jdJfibX9pwbZviQwU35dMsfszD2Whs,2406
+euroeval/utils.py,sha256=CFjYMoKdcxLUEM-aF3pxf_3TnGWvGasjfb8pDMJVe9U,18772
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
 euroeval/benchmark_modules/fresh.py,sha256=k6bqDEnazRAX9ILVsRrzUTbkgNO4NcLCxHToCnLWV8M,9641
-euroeval/benchmark_modules/hf.py,sha256=YeaaP_YGAlKG5G1KFq0bFOFWv42eH_zfmhuW3FAXjAA,41726
+euroeval/benchmark_modules/hf.py,sha256=Typig7WDqOn_uGE24s_P_9PHvq-V0MrKGD7xbh0aYnk,43244
 euroeval/benchmark_modules/litellm.py,sha256=ZJ9dB683pXPHDf70OOJfmHn_y706xRYzstYLz2ytCKE,39784
-euroeval/benchmark_modules/vllm.py,sha256=5N2ytLR9cZIcPeza-ERQWwyvehDd0F1FUvXY3cKu4Oo,44519
+euroeval/benchmark_modules/vllm.py,sha256=O8-dcVkU2jgZer44EOeTC8E4d-xQjPDOXnoyzXxAToQ,46179
 euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
 euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
-euroeval/task_utils/sequence_classification.py,sha256=FrkvFzxFSnZoXThgpQqvJCIy3_YemyqZFQ1L-YdMMiw,8527
+euroeval/task_utils/sequence_classification.py,sha256=832iWpPR3CsnlBIYA976eN21WUFQLUmIlDxFIvOsROk,10266
 euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
 euroeval/task_utils/token_classification.py,sha256=aW2GGk-dqa7lioIsHirVgD8AMrQEAnVasmjEWQ4xu7w,17778
-euroeval-15.4.0.dist-info/METADATA,sha256=HfNWsANdb8TJAyK__QPBhs7O5qsQp9G_gPlhVVNuK9c,10724
-euroeval-15.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-euroeval-15.4.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
-euroeval-15.4.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
-euroeval-15.4.0.dist-info/RECORD,,
+euroeval-15.4.2.dist-info/METADATA,sha256=cvpyWIKPXNKn1Idv7w3C7z8MBVljmw50jBdskL_32oI,10752
+euroeval-15.4.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.4.2.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.4.2.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.4.2.dist-info/RECORD,,