EuroEval: 15.4.1 (py3-none-any.whl) → 15.5.0 (py3-none-any.whl)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic. Click here for more details.

euroeval/utils.py CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  import gc
4
4
  import importlib
5
+ import importlib.metadata
5
6
  import importlib.util
6
7
  import logging
7
8
  import os
@@ -12,16 +13,13 @@ import typing as t
12
13
  import warnings
13
14
  from functools import cache
14
15
  from pathlib import Path
15
- from types import TracebackType
16
16
 
17
17
  import litellm
18
18
  import numpy as np
19
- import pkg_resources
20
19
  import requests
21
20
  import torch
22
21
  from datasets.utils import disable_progress_bar
23
22
  from requests.exceptions import RequestException
24
- from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
25
23
  from transformers import logging as tf_logging
26
24
 
27
25
  from .exceptions import InvalidModel, NaNValueInModelOutput
@@ -30,6 +28,11 @@ if importlib.util.find_spec("ray") is not None:
30
28
  import ray
31
29
 
32
30
  if t.TYPE_CHECKING:
31
+ from types import TracebackType
32
+
33
+ from transformers import PreTrainedTokenizer, PreTrainedTokenizerBase
34
+
35
+ from .data_models import DatasetConfig
33
36
  from .types import Predictions
34
37
 
35
38
 
@@ -84,33 +87,6 @@ def enforce_reproducibility(seed: int = 4242) -> np.random.Generator:
84
87
  return rng
85
88
 
86
89
 
87
- def is_module_installed(module: str) -> bool:
88
- """Check if a module is installed.
89
-
90
- This is used when dealing with spaCy models, as these are installed as separate
91
- Python packages.
92
-
93
- Args:
94
- module:
95
- The name of the module.
96
-
97
- Returns:
98
- Whether the module is installed or not.
99
- """
100
- # Get list of all modules, including their versions
101
- installed_modules_with_versions = list(pkg_resources.working_set)
102
-
103
- # Strip the module versions from the list of modules. Also make the modules lower
104
- # case and replace dashes with underscores
105
- installed_modules = [
106
- re.sub("[0-9. ]", "", str(module)).lower().replace("-", "_")
107
- for module in installed_modules_with_versions
108
- ]
109
-
110
- # Check if the module is installed by checking if the module name is in the list
111
- return module.lower() in installed_modules
112
-
113
-
114
90
  def block_terminal_output() -> None:
115
91
  """Blocks libraries from writing output to the terminal.
116
92
 
@@ -206,6 +182,21 @@ def get_class_by_name(class_name: str | list[str], module_name: str) -> t.Type |
206
182
  return None
207
183
 
208
184
 
185
+ def get_min_cuda_compute_capability() -> float | None:
186
+ """Gets the lowest cuda capability.
187
+
188
+ Returns:
189
+ Device capability as float, or None if CUDA is not available.
190
+ """
191
+ if not torch.cuda.is_available():
192
+ return None
193
+
194
+ device_range = range(torch.cuda.device_count())
195
+ capabilities = map(torch.cuda.get_device_capability, device_range)
196
+ major, minor = min(capabilities)
197
+ return float(f"{major}.{minor}")
198
+
199
+
209
200
  def kebab_to_pascal(kebab_string: str) -> str:
210
201
  """Converts a kebab-case string to PascalCase.
211
202
 
@@ -298,7 +289,7 @@ class HiddenPrints:
298
289
  self,
299
290
  exc_type: t.Type[BaseException],
300
291
  exc_val: BaseException,
301
- exc_tb: TracebackType,
292
+ exc_tb: "TracebackType",
302
293
  ) -> None:
303
294
  """Exit the context manager."""
304
295
  sys.stdout.close()
@@ -368,7 +359,6 @@ def should_prompts_be_stripped(
368
359
  return strip_prompts
369
360
 
370
361
 
371
- # TODO: This is currently not used - maybe remove.
372
362
  def should_prefix_space_be_added_to_labels(
373
363
  labels_to_be_generated: list[str], tokenizer: "PreTrainedTokenizer"
374
364
  ) -> bool:
@@ -573,3 +563,112 @@ def log_once(message: str, level: int = logging.INFO) -> None:
573
563
  logger.critical(message)
574
564
  case _:
575
565
  raise ValueError(f"Invalid logging level: {level}")
566
+
567
+
568
+ def get_package_version(package_name: str) -> str | None:
569
+ """Get the version of a package.
570
+
571
+ Args:
572
+ package_name:
573
+ The name of the package.
574
+
575
+ Returns:
576
+ The version of the package, or None if the package is not installed.
577
+ """
578
+ try:
579
+ return importlib.metadata.version(package_name)
580
+ except importlib.metadata.PackageNotFoundError:
581
+ return None
582
+
583
+
584
+ def get_first_label_token_mapping(
585
+ dataset_config: "DatasetConfig", tokenizer: "PreTrainedTokenizer | None"
586
+ ) -> dict[str, str] | bool:
587
+ """Check if the model should output scores.
588
+
589
+ Args:
590
+ dataset_config:
591
+ The dataset configuration.
592
+ tokenizer:
593
+ The tokenizer, or None if not available.
594
+
595
+ Returns:
596
+ A mapping from labels to the first token in each label, or alternatively a
597
+ Boolean value indicating whether the model should output scores (if the mapping
598
+ is outputted then the model will always output scores).
599
+ """
600
+ # Importing here to avoid circular imports
601
+ from .constants import TASK_GROUPS_USING_LOGPROBS
602
+
603
+ # If we do not have any tokenizer, then we cannot check if the model should output
604
+ # scores and we just assume it should if the dataset supports it
605
+ output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
606
+ if tokenizer is None:
607
+ if output_scores:
608
+ log_once(
609
+ "The model will output scores, since the dataset supports it and no "
610
+ "tokenizer is available.",
611
+ level=logging.DEBUG,
612
+ )
613
+ else:
614
+ log_once(
615
+ "The model will not output scores, since the dataset does not support "
616
+ "it and no tokenizer is available.",
617
+ level=logging.DEBUG,
618
+ )
619
+ return output_scores
620
+
621
+ # If there are labels associated with the dataset, and that the first token of each
622
+ # label is distinct, then we can safely use the logprobs
623
+ if output_scores and dataset_config.labels:
624
+ local_labels = [
625
+ dataset_config.prompt_label_mapping[label].strip()
626
+ for label in dataset_config.labels
627
+ ]
628
+
629
+ # Get the first token of each label, where we add a prefix space if needed
630
+ add_prefix_space = (
631
+ should_prefix_space_be_added_to_labels(
632
+ labels_to_be_generated=local_labels, tokenizer=tokenizer
633
+ )
634
+ and tokenizer.chat_template is None
635
+ )
636
+ first_tokens = [
637
+ tokenizer.tokenize(text=f" {label}" if add_prefix_space else label)[0]
638
+ for label in local_labels
639
+ ]
640
+ first_tokens = [
641
+ re.sub(
642
+ pattern=r"^[^a-zæøåüöä]+|[^a-zæøåüöä]+$", repl="", string=token.lower()
643
+ )
644
+ for token in first_tokens
645
+ ]
646
+
647
+ # Build a mapping from labels to the first token in each label if the first
648
+ # tokens are distinct
649
+ if len(first_tokens) == len(set(first_tokens)):
650
+ log_once(
651
+ "The model will output scores, since the first tokens of the labels "
652
+ "are distinct.",
653
+ level=logging.DEBUG,
654
+ )
655
+ return {
656
+ label: first_token
657
+ for label, first_token in zip(local_labels, first_tokens)
658
+ }
659
+ else:
660
+ log_once(
661
+ "The model will not output scores, since the first tokens of the "
662
+ "labels are not distinct. The first tokens for the labels "
663
+ f"{local_labels} are {first_tokens}"
664
+ )
665
+ return False
666
+
667
+ # Otherwise, we assume that the model should not output scores, to avoid potential
668
+ # evaluation errors. This will force the label extraction to rely on word edit
669
+ # distance instead of logprobs.
670
+ log_once(
671
+ "The model will not output scores, since the dataset does not have labels.",
672
+ level=logging.DEBUG,
673
+ )
674
+ return False
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: EuroEval
3
- Version: 15.4.1
3
+ Version: 15.5.0
4
4
  Summary: The robust European language model benchmark.
5
5
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
6
6
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -37,11 +37,12 @@ Requires-Dist: demjson3>=3.0.6
37
37
  Requires-Dist: evaluate>=0.4.1
38
38
  Requires-Dist: huggingface-hub>=0.24.0
39
39
  Requires-Dist: levenshtein>=0.24.0
40
- Requires-Dist: litellm>=1.61.13
40
+ Requires-Dist: litellm>=1.63.0
41
41
  Requires-Dist: more-itertools>=10.5.0
42
42
  Requires-Dist: numpy<2.0.0,>=1.23.0
43
43
  Requires-Dist: ollama>=0.4.7
44
44
  Requires-Dist: pandas>=2.2.0
45
+ Requires-Dist: peft>=0.15.0
45
46
  Requires-Dist: protobuf~=3.20.0
46
47
  Requires-Dist: pydantic>=2.6.0
47
48
  Requires-Dist: pyinfer>=0.0.3
@@ -61,12 +62,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
61
62
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
62
63
  Requires-Dist: gradio>=4.26.0; extra == 'all'
63
64
  Requires-Dist: outlines>=0.1.11; extra == 'all'
64
- Requires-Dist: vllm!=0.8.1,>=0.8.0; (platform_system == 'Linux') and extra == 'all'
65
+ Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'all'
65
66
  Provides-Extra: generative
66
67
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
67
68
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
68
69
  Requires-Dist: outlines>=0.1.11; extra == 'generative'
69
- Requires-Dist: vllm!=0.8.1,>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
70
+ Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
70
71
  Provides-Extra: human-evaluation
71
72
  Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
72
73
  Provides-Extra: test
@@ -217,6 +218,7 @@ Replace <name-of-script> with the specific script you wish to execute, e.g.,
217
218
  $ uv run src/scripts/create_allocine.py
218
219
  ```
219
220
 
221
+
220
222
  ## Special Thanks :pray:
221
223
  - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
222
224
  models on the leaderboards.
@@ -1,12 +1,12 @@
1
- euroeval/__init__.py,sha256=l3V3ybiCj0I193jvn8wS9VK4UEc9ajiOq4SojChH6Xs,2615
1
+ euroeval/__init__.py,sha256=NiT6S4II1YpnNl5KFHDNogE-rvVkOHQy5pR483eq_do,2581
2
2
  euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
3
- euroeval/benchmarker.py,sha256=PIdqLPleLN3nml5Zb1g_dQaLzqxQhmgC8VuvD5yloV4,46524
3
+ euroeval/benchmarker.py,sha256=8Qt1NL7k5n-AfFrhR6139wmmsVS7CgRa-QjminH0d_c,47849
4
4
  euroeval/callbacks.py,sha256=bThUUxOgkMuESUQ5rrFRoSumKV8vNw53CslIZTpkt54,2438
5
5
  euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
6
- euroeval/constants.py,sha256=9iXe26WAigL9RYob3PhsB5c0dr11wCeRxrEfm_ssynM,1562
6
+ euroeval/constants.py,sha256=CJavEDvKLSKAC4uyz44sFrY1W1AnjUsxkXF63SoMjw4,1985
7
7
  euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
8
- euroeval/data_models.py,sha256=4ZY9x2pINlRywTzYxxtrYG7qXMNdod5I9XBOlTJYT8E,14495
9
- euroeval/dataset_configs.py,sha256=bjMUXvaEtTpo1Eql_mIRCG3K_lB2DZRdPWEAwR5N4ig,90627
8
+ euroeval/data_models.py,sha256=QssdR_msDTmsp9yKe0cVba0iCpgBTFTOaOUn44o1cl8,14770
9
+ euroeval/dataset_configs.py,sha256=6WiRW-VAAMIL6-1J6Nb6pCm6mf4I-oQ087zB0es3HHs,90644
10
10
  euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
11
11
  euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
12
12
  euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
@@ -19,22 +19,22 @@ euroeval/model_loading.py,sha256=ta07tMoSfK1kqjOynVXQA0vVrns6RzsCEE3g1_RGVVs,271
19
19
  euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
20
20
  euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
21
21
  euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
22
- euroeval/types.py,sha256=xvBn0eNynqAqwL7CGEgVFb_lCD9SdHUMvxJo7OXRfls,2367
23
- euroeval/utils.py,sha256=MkiVI-0KmK4ilKJTTfYAynKaPDOzW1WjyRdZsYmnoIg,18803
22
+ euroeval/types.py,sha256=5DIhaVyzH8RO9jdJfibX9pwbZviQwU35dMsfszD2Whs,2406
23
+ euroeval/utils.py,sha256=bbq7WCcIrMKjBRaZ8EcnRpRAvL_F-tCxiL0We_po3QE,22397
24
24
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
25
25
  euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
26
26
  euroeval/benchmark_modules/fresh.py,sha256=k6bqDEnazRAX9ILVsRrzUTbkgNO4NcLCxHToCnLWV8M,9641
27
- euroeval/benchmark_modules/hf.py,sha256=YeaaP_YGAlKG5G1KFq0bFOFWv42eH_zfmhuW3FAXjAA,41726
28
- euroeval/benchmark_modules/litellm.py,sha256=ZJ9dB683pXPHDf70OOJfmHn_y706xRYzstYLz2ytCKE,39784
29
- euroeval/benchmark_modules/vllm.py,sha256=5N2ytLR9cZIcPeza-ERQWwyvehDd0F1FUvXY3cKu4Oo,44519
27
+ euroeval/benchmark_modules/hf.py,sha256=VcgWZmSZc4B3FgeUGC0eWQIRv97luU22-KijaBfuqU0,43602
28
+ euroeval/benchmark_modules/litellm.py,sha256=pbTsq6Bb8cnFbdZMUSrUs-XlNAyaCIWNcEKKRIfprx8,45161
29
+ euroeval/benchmark_modules/vllm.py,sha256=7AZrvcwHevrQbXvbjTCp4S6HpM0Obk6CIQLbmUWIn9s,47483
30
30
  euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
31
31
  euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
32
32
  euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
33
- euroeval/task_utils/sequence_classification.py,sha256=bIsbAj123hEyW40QeSUW8Dpc2SyI3ZPCGexapr9qqjw,9826
33
+ euroeval/task_utils/sequence_classification.py,sha256=JDZfiTj5RdwYwlhhTqVBj2mVdwmkoykZ6wJzEbWj0lo,12225
34
34
  euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
35
35
  euroeval/task_utils/token_classification.py,sha256=aW2GGk-dqa7lioIsHirVgD8AMrQEAnVasmjEWQ4xu7w,17778
36
- euroeval-15.4.1.dist-info/METADATA,sha256=OdTP-FAbbF9vUV3OTeV5Y-B6P7FXN2bAalG903ny8hU,10740
37
- euroeval-15.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
38
- euroeval-15.4.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
39
- euroeval-15.4.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
40
- euroeval-15.4.1.dist-info/RECORD,,
36
+ euroeval-15.5.0.dist-info/METADATA,sha256=T48YoPuFBEFI5sxgUadzkD3tidIB3TA1mKEKsFuh7fs,10752
37
+ euroeval-15.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
38
+ euroeval-15.5.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
39
+ euroeval-15.5.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
40
+ euroeval-15.5.0.dist-info/RECORD,,