EuroEval 15.4.1-py3-none-any.whl → 15.4.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

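euroeval/benchmark_modules/hf.py CHANGED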

@@ -20,6 +20,7 @@ from huggingface_hub.utils import (
     HFValidationError,
     LocalTokenNotFoundError,
 )
+from peft import PeftConfig
 from requests.exceptions import RequestException
 from torch import nn
 from transformers import (
@@ -34,6 +35,9 @@ from transformers import (
     Trainer,
 )
 from transformers.modelcard import TASK_MAPPING
+from transformers.models.auto.modeling_auto import (
+    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
+)
 from urllib3.exceptions import RequestError

 from ..constants import (
@@ -73,6 +77,7 @@ from ..utils import (
     get_class_by_name,
     get_eos_token,
     internet_connection_available,
+    log_once,
 )
 from .base import BenchmarkModule

@@ -727,53 +732,54 @@ def get_model_repo_info(
     # If the model does not exist locally, then we get the model info from the Hugging
     # Face Hub
     if model_info is None:
-        try:
-            model_info = hf_api.model_info(
-                repo_id=model_id, revision=revision, token=token
-            )
-        except (GatedRepoError, LocalTokenNotFoundError) as e:
+        num_attempts = 3
+        for _ in range(num_attempts):
             try:
-                hf_whoami(token=token)
-                logger.warning(
-                    f"Could not access the model {model_id} with the revision "
-                    f"{revision}. The error was {str(e)!r}."
+                model_info = hf_api.model_info(
+                    repo_id=model_id, revision=revision, token=token
                 )
+                break
+            except (GatedRepoError, LocalTokenNotFoundError) as e:
+                try:
+                    hf_whoami(token=token)
+                    logger.warning(
+                        f"Could not access the model {model_id} with the revision "
+                        f"{revision}. The error was {str(e)!r}."
+                    )
+                    return None
+                except LocalTokenNotFoundError:
+                    raise NeedsAdditionalArgument(
+                        cli_argument="--api-key",
+                        script_argument="api_key=<your-api-key>",
+                        run_with_cli=benchmark_config.run_with_cli,
+                    )
+            except (RepositoryNotFoundError, HFValidationError):
                 return None
-            except LocalTokenNotFoundError:
-                raise NeedsAdditionalArgument(
-                    cli_argument="--api-key",
-                    script_argument="api_key=<your-api-key>",
-                    run_with_cli=benchmark_config.run_with_cli,
-                )
-        except (RepositoryNotFoundError, HFValidationError):
-            return None
-        except (OSError, RequestException):
-            if internet_connection_available():
-                raise HuggingFaceHubDown()
-            else:
+            except (OSError, RequestException):
+                if internet_connection_available():
+                    continue
                 raise NoInternetConnection()
+        else:
+            raise HuggingFaceHubDown()

     # Get all the Hugging Face repository tags for the model. If the model is an adapter
     # model, then we also get the tags for the base model
     tags = model_info.tags or list()
-    has_base_model_tag = any(
-        tag.startswith("base_model:") and tag.count(":") == 1 for tag in tags
-    )
     base_model_id: str | None = None
-    if has_base_model_tag:
-        has_adapter_config = model_info.siblings is not None and any(
-            sibling.rfilename == "adapter_config.json"
-            for sibling in model_info.siblings
+    has_adapter_config = model_info.siblings is not None and any(
+        sibling.rfilename == "adapter_config.json" for sibling in model_info.siblings
+    )
+    if has_adapter_config:
+        adapter_config = PeftConfig.from_pretrained(model_id, revision=revision)
+        base_model_id = adapter_config.base_model_name_or_path
+        log_once(
+            f"Model {model_id!r} identified as an adapter model, with base model "
+            f"{base_model_id!r}.",
+            level=logging.DEBUG,
        )
-        if has_adapter_config:
-            base_model_id = [
-                tag.split(":")[1]
-                for tag in tags
-                if tag.startswith("base_model:") and tag.count(":") == 1
-            ][0]
+    if base_model_id is not None:
         base_model_info = hf_api.model_info(
             repo_id=base_model_id,
-            revision=revision,
             token=benchmark_config.api_key
             or os.getenv("HUGGINGFACE_API_KEY")
             or True,
@@ -781,12 +787,18 @@ def get_model_repo_info(
         tags += base_model_info.tags or list()
         tags = list(set(tags))

+    # TEMP: This extends the `TASK_MAPPING` dictionary to include the missing
+    # 'image-text-to-text' pipeline tag. This will be added as part of `TASK_MAPPING`
+    # when this PR has been merged in and published:
+    # https://github.com/huggingface/transformers/pull/37107
+    TASK_MAPPING["image-text-to-text"] = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+
     # Get the pipeline tag for the model. If it is not specified, then we determine it
     # by checking the model's architecture as written in the model's Hugging Face config
     pipeline_tag = model_info.pipeline_tag
     if pipeline_tag is None:
         hf_config = load_hf_model_config(
-            model_id=model_id,
+            model_id=base_model_id or model_id,
             num_labels=0,
             id2label=dict(),
             label2id=dict(),
@@ -812,7 +824,6 @@ def get_model_repo_info(
             pipeline_tag = "fill-mask"

     if benchmark_config.only_allow_safetensors:
-        # Check if any file ends with .safetensors
         repo_files = hf_api.list_repo_files(repo_id=model_id, revision=revision)
         has_safetensors = any(f.endswith(".safetensors") for f in repo_files)
         if not has_safetensors:
@@ -826,6 +837,26 @@ def get_model_repo_info(
             )
             raise InvalidModel(msg)

+        # Also check base model if we are evaluating an adapter
+        if base_model_id is not None:
+            base_repo_files = hf_api.list_repo_files(repo_id=base_model_id)
+            base_has_safetensors = any(
+                f.endswith(".safetensors") for f in base_repo_files
+            )
+            if not base_has_safetensors:
+                msg = (
+                    f"Base model {base_model_id} does not have safetensors weights "
+                    "available."
+                )
+                if benchmark_config.run_with_cli:
+                    msg += " Skipping since the `--only-allow-safetensors` flag is set."
+                else:
+                    msg += (
+                        " Skipping since the `only_allow_safetensors` argument is set "
+                        "to `True`."
+                    )
+                raise InvalidModel(msg)
+
     return HFModelInfo(
         pipeline_tag=pipeline_tag, tags=tags, adapter_base_model_id=base_model_id
     )
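For context on the `TASK_MAPPING` patch above: when a model exposes no `pipeline_tag` on the Hugging Face Hub, the code falls back to deriving the tag from the architectures listed in the model's config, which is why the missing 'image-text-to-text' entry matters. A minimal sketch of that idea (not EuroEval's actual implementation; the helper name below is hypothetical):

from transformers import AutoConfig
from transformers.modelcard import TASK_MAPPING
from transformers.models.auto.modeling_auto import (
    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
)

# Apply the same temporary patch as in the diff above.
TASK_MAPPING["image-text-to-text"] = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES


def guess_pipeline_tag(model_id: str) -> str | None:
    """Infer a pipeline tag from the architectures in a model's Hub config."""
    config = AutoConfig.from_pretrained(model_id)
    architectures = config.architectures or []
    for tag, architecture_names in TASK_MAPPING.items():
        # Each mapping relates model types to architecture class names,
        # e.g. {"llama": "LlamaForCausalLM"}; tuple-valued entries are ignored here.
        if any(arch in architecture_names.values() for arch in architectures):
            return tag
    return None


euroeval/benchmark_modules/vllm.py CHANGED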
@@ -30,6 +30,7 @@ from ..constants import (
     REASONING_MAX_TOKENS,
     TASK_GROUPS_USING_LOGPROBS,
     TASKS_USING_JSON,
+    VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
 from ..data_models import (
     BenchmarkConfig,
@@ -65,6 +66,7 @@ from ..utils import (
     get_bos_token,
     get_end_of_chat_token_ids,
     get_eos_token,
+    get_min_cuda_compute_capability,
     log_once,
     should_prompts_be_stripped,
 )
@@ -145,6 +147,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
                 repo_id=self.model_config.model_id,
+                revision=self.model_config.revision,
                 cache_dir=Path(self.model_config.model_cache_dir),
             )
             self.buffer["lora_request"] = LoRARequest(
@@ -373,12 +376,27 @@ class VLLMModel(HuggingFaceEncoderModel):

         # Generate sequences using vLLM
         input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
-        raw_outputs = self._model.generate(
-            prompts=prompts,
-            sampling_params=sampling_params,
-            use_tqdm=(not input_is_a_test),
-            lora_request=self.buffer.get("lora_request"),
-        )
+        num_attempts = 3
+        for _ in range(num_attempts):
+            try:
+                raw_outputs = self._model.generate(
+                    prompts=prompts,
+                    sampling_params=sampling_params,
+                    use_tqdm=(not input_is_a_test),
+                    lora_request=self.buffer.get("lora_request"),
+                )
+                break
+            except TypeError as e:
+                logger.debug(
+                    f"Encountered error during vLLM generation: {str(e)}. Retrying..."
+                )
+                sleep(1)
+        else:
+            raise InvalidBenchmark(
+                f"Could not generate sequences after {num_attempts} attempts."
+            )
+
+        # Parse the raw model outputs
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
@@ -846,13 +864,16 @@ def load_model_and_tokenizer(
     # Prefer base model ID if the model is an adapter - the adapter will be added on
     # during inference in this case
     model_id = model_config.adapter_base_model_id or model_config.model_id
+    revision = (
+        model_config.revision if model_config.adapter_base_model_id is None else "main"
+    )

     hf_model_config = load_hf_model_config(
         model_id=model_id,
         num_labels=0,
         id2label=dict(),
         label2id=dict(),
-        revision=model_config.revision,
+        revision=revision,
         model_cache_dir=model_config.model_cache_dir,
         api_key=benchmark_config.api_key,
         trust_remote_code=benchmark_config.trust_remote_code,
@@ -881,6 +902,23 @@ def load_model_and_tokenizer(
         )
         dtype = torch.float16

+    if hf_model_config.torch_dtype == torch.bfloat16:
+        min_cuda_compute_capability = get_min_cuda_compute_capability()
+        required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
+
+        if min_cuda_compute_capability is not None:
+            if min_cuda_compute_capability < required_capability:
+                logger.info(
+                    "You are loading a model with "
+                    f"dtype {hf_model_config.torch_dtype}, "
+                    "which vLLM only supports for CUDA devices with"
+                    f"CUDA compute capability >={required_capability}. "
+                    "You are using one or more devices with "
+                    f"compute capability {min_cuda_compute_capability}. "
+                    "Setting dtype to float16 instead."
+                )
+                dtype = torch.float16
+
     if model_config.adapter_base_model_id is not None:
         download_dir = str(Path(model_config.model_cache_dir) / "base_model")
     else:
@@ -916,7 +954,7 @@ def load_model_and_tokenizer(
         max_model_len=min(true_max_model_len, 5_000),
         download_dir=download_dir,
         trust_remote_code=benchmark_config.trust_remote_code,
-        revision=model_config.revision,
+        revision=revision,
         seed=4242,
         distributed_executor_backend=executor_backend,
         tensor_parallel_size=torch.cuda.device_count(),
@@ -994,6 +1032,7 @@ def load_tokenizer(
     Returns:
         The loaded tokenizer.
     """
+    revision = revision if adapter_base_model_id is None else "main"
     config = AutoConfig.from_pretrained(
         adapter_base_model_id or model_id,
         revision=revision,
euroeval/constants.py CHANGED
@@ -54,3 +54,6 @@ METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]

 # Hugging Face Hub tags used to classify models as merge models
 MERGE_TAGS = ["merge", "mergekit"]
+
+# The minimum required CUDA compute capability for using bfloat16 in vLLM
+VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
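For reference, compute capability 8.0 corresponds to NVIDIA's Ampere generation (e.g. the A100), the first with native bfloat16 tensor-core support; older GPUs such as the T4 (7.5) or V100 (7.0) fall below this threshold, which is why the vLLM loader above switches those devices to float16.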
euroeval/data_models.py CHANGED
@@ -1,7 +1,6 @@
 """Data models used in EuroEval."""

 import collections.abc as c
-import importlib.metadata
 import json
 import pathlib
 import re
@@ -11,6 +10,8 @@ from dataclasses import dataclass, field
 import pydantic
 import torch

+from euroeval.utils import get_package_version
+
 from .enums import Device, InferenceBackend, ModelType, TaskGroup
 from .types import ScoreDict

@@ -228,7 +229,11 @@ class BenchmarkResult(pydantic.BaseModel):
     generative_type: str | None
     few_shot: bool
     validation_split: bool
-    euroeval_version: str = importlib.metadata.version("euroeval")
+    euroeval_version: str | None = get_package_version("euroeval")
+    transformers_version: str | None = get_package_version("transformers")
+    torch_version: str | None = get_package_version("torch")
+    vllm_version: str | None = get_package_version("vllm")
+    outlines_version: str | None = get_package_version("outlines")

     @classmethod
     def from_dict(cls, config: dict) -> "BenchmarkResult":
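euroeval/dataset_configs.py CHANGED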
@@ -244,7 +244,7 @@ FOSENT_CONFIG = DatasetConfig(
 ALLOCINE_CONFIG = DatasetConfig(
     name="allocine",
     pretty_name="the truncated version of the French sentiment classification "
-    "dataset Allocine",
+    "dataset AlloCiné",
     huggingface_id="EuroEval/allocine-mini",
     task=SENT,
     languages=[FR],
@@ -1467,9 +1467,9 @@ NORDJYLLAND_NEWS_CONFIG = DatasetConfig(
     max_generated_tokens=256,
 )

-MLSUM_CONFIG = DatasetConfig(
-    name="mlsum",
-    pretty_name="the truncated version of the German summarisation dataset MLSum",
+MLSUM_DE_CONFIG = DatasetConfig(
+    name="mlsum-de",
+    pretty_name="the truncated version of the German summarisation dataset MLSum-de",
     huggingface_id="EuroEval/mlsum-mini",
     task=SUMM,
     languages=[DE],
@@ -1484,7 +1484,7 @@ MLSUM_CONFIG = DatasetConfig(

 MLSUM_ES_CONFIG = DatasetConfig(
     name="mlsum-es",
-    pretty_name="the truncated version of the Spanish summarisation dataset MLSum",
+    pretty_name="the truncated version of the Spanish summarisation dataset MLSum-es",
     huggingface_id="EuroEval/mlsum-es-mini",
     task=SUMM,
     languages=[ES],
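euroeval/task_utils/sequence_classification.py CHANGED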
@@ -162,8 +162,7 @@ def get_closest_logprobs_labels(
     """
     english_labels = list(dataset_config.id2label.values())
     english2local = dataset_config.prompt_label_mapping
-    local_labels = [english2local[lbl].lower() for lbl in english_labels]
-    candidate_labels = local_labels + english_labels
+    candidate_labels = [english2local[lbl].lower() for lbl in english_labels]

     output_labels: list[str] = list()
     for sample in generation_logprobs:
@@ -182,38 +181,44 @@ def get_closest_logprobs_labels(
             # label, as the output label
             output_label: str | None = None
             previously_generated_labels: list[str] = list()
-            for generated_label in generated_labels:
+            for label_idx, generated_label in enumerate(generated_labels):
                 generated_label = "".join(previously_generated_labels) + generated_label

-                # Get the candidate labels that contain the generated label
-                candidate_output_labels = [
+                # Get the candidate labels that starts with the generated label
+                candidate_output_labels = {
                     candidate_label
                     for candidate_label in candidate_labels
-                    if generated_label in candidate_label
-                ]
-
-                # If we can uniquely determine the output label, we break the loop.
-                # Since we have both the original local labels as well as the English
-                # versions, we want to have 0 or 1 candidate labels from each set. This
-                # means that ["positive", "positiv"] is fine as they're both referencing
-                # the same label, but ["negativ", "neutral"] is not. In the bad case we
-                # cannot use the scores and we fall back to using the
-                # candidate label with the highest edit distance.
-                at_most_one_english_label = (
-                    len(set(candidate_output_labels).intersection(english_labels)) <= 1
-                )
-                at_most_one_local_label = (
-                    len(set(candidate_output_labels).intersection(local_labels)) <= 1
-                )
-                if candidate_output_labels:
-                    if at_most_one_english_label and at_most_one_local_label:
-                        output_label = candidate_output_labels[0]
-                        break
-                    else:
+                    if candidate_label.startswith(generated_label)
+                }
+
+                # If we can uniquely determine the output label, we break the loop. If
+                # there are multiple possible labels then we store the current one, and
+                # concatenate it with the next generated label. We can only do this if
+                # the current one is the first one, however, since we're using greedy
+                # sampling. In case this happens for a label that is not the first one,
+                # we warn the user.
+                if len(candidate_output_labels) == 1:
+                    output_label = candidate_output_labels.pop()
+                    break
+                elif len(candidate_output_labels) > 1:
+                    if label_idx == 0:
                         previously_generated_labels.append(generated_label)
+                    else:
+                        output_label = candidate_output_labels.pop()
+                        candidate_output_labels.add(output_label)
+                        log_once(
+                            "Multiple candidate labels found for the generated label "
+                            f"{generated_label!r}: {candidate_output_labels}. Since "
+                            "this is not the first generated label, we cannot "
+                            "concatenate it with the next generated label. We are thus "
+                            f"forced to use the arbitrary {output_label!r} as the "
+                            "output label, potentially resulting in worse performance. "
+                            "Please report this issue to the EuroEval team at "
+                            "github.com/EuroEval/EuroEval/issues.",
+                            level=logging.WARNING,
+                        )

             if output_label is not None:
-                output_label = english2local.get(output_label, output_label)
                 output_labels.append(output_label)
                 break
         else:
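A small, self-contained illustration of the new prefix-matching behaviour above (the labels and helper below are hypothetical, not taken from a EuroEval dataset):

candidate_labels = ["positiv", "negativ", "neutral"]


def matching_labels(generated: str) -> set[str]:
    """Return the candidate labels that start with the generated text."""
    return {label for label in candidate_labels if label.startswith(generated)}


assert matching_labels("p") == {"positiv"}             # unique prefix: label found
assert matching_labels("n") == {"negativ", "neutral"}  # ambiguous: append the next token
assert matching_labels("ne" + "g") == {"negativ"}      # unique after concatenation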
euroeval/types.py CHANGED
@@ -8,9 +8,9 @@ if t.TYPE_CHECKING:
     from .data_models import GenerativeModelOutput


-ScoreDict = dict[str, dict[str, float] | list[dict[str, float]]]
-Predictions = NDArray | list[str] | list[list[str]]
-Labels = NDArray | list[str] | list[list[str]]
+ScoreDict: t.TypeAlias = dict[str, dict[str, float] | list[dict[str, float]]]
+Predictions: t.TypeAlias = NDArray | list[str] | list[list[str]]
+Labels: t.TypeAlias = NDArray | list[str] | list[list[str]]


 class ComputeMetricsFunction(t.Protocol):
euroeval/utils.py CHANGED
@@ -2,11 +2,11 @@

 import gc
 import importlib
+import importlib.metadata
 import importlib.util
 import logging
 import os
 import random
-import re
 import sys
 import typing as t
 import warnings
@@ -16,7 +16,6 @@ from types import TracebackType

 import litellm
 import numpy as np
-import pkg_resources
 import requests
 import torch
 from datasets.utils import disable_progress_bar
@@ -84,33 +83,6 @@ def enforce_reproducibility(seed: int = 4242) -> np.random.Generator:
     return rng


-def is_module_installed(module: str) -> bool:
-    """Check if a module is installed.
-
-    This is used when dealing with spaCy models, as these are installed as separate
-    Python packages.
-
-    Args:
-        module:
-            The name of the module.
-
-    Returns:
-        Whether the module is installed or not.
-    """
-    # Get list of all modules, including their versions
-    installed_modules_with_versions = list(pkg_resources.working_set)
-
-    # Strip the module versions from the list of modules. Also make the modules lower
-    # case and replace dashes with underscores
-    installed_modules = [
-        re.sub("[0-9. ]", "", str(module)).lower().replace("-", "_")
-        for module in installed_modules_with_versions
-    ]
-
-    # Check if the module is installed by checking if the module name is in the list
-    return module.lower() in installed_modules
-
-
 def block_terminal_output() -> None:
     """Blocks libraries from writing output to the terminal.

@@ -206,6 +178,21 @@ def get_class_by_name(class_name: str | list[str], module_name: str) -> t.Type |
     return None


+def get_min_cuda_compute_capability() -> float | None:
+    """Gets the lowest cuda capability.
+
+    Returns:
+        Device capability as float, or None if CUDA is not available.
+    """
+    if not torch.cuda.is_available():
+        return None
+
+    device_range = range(torch.cuda.device_count())
+    capabilities = map(torch.cuda.get_device_capability, device_range)
+    major, minor = min(capabilities)
+    return float(f"{major}.{minor}")
+
+
 def kebab_to_pascal(kebab_string: str) -> str:
     """Converts a kebab-case string to PascalCase.

@@ -573,3 +560,19 @@ def log_once(message: str, level: int = logging.INFO) -> None:
             logger.critical(message)
         case _:
             raise ValueError(f"Invalid logging level: {level}")
+
+
+def get_package_version(package_name: str) -> str | None:
+    """Get the version of a package.
+
+    Args:
+        package_name:
+            The name of the package.
+
+    Returns:
+        The version of the package, or None if the package is not installed.
+    """
+    try:
+        return importlib.metadata.version(package_name)
+    except importlib.metadata.PackageNotFoundError:
+        return None
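A quick usage sketch for the two helpers added above (the printed values are illustrative; actual output depends on the installed packages and available GPUs):

from euroeval.utils import get_min_cuda_compute_capability, get_package_version

print(get_package_version("transformers"))   # e.g. "4.50.0", or None if not installed
print(get_package_version("not-a-package"))  # None
print(get_min_cuda_compute_capability())     # e.g. 8.0 on an A100, or None without CUDA


euroeval-15.4.2.dist-info/METADATA CHANGED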
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 15.4.1
+Version: 15.4.2
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -42,6 +42,7 @@ Requires-Dist: more-itertools>=10.5.0
 Requires-Dist: numpy<2.0.0,>=1.23.0
 Requires-Dist: ollama>=0.4.7
 Requires-Dist: pandas>=2.2.0
+Requires-Dist: peft>=0.15.0
 Requires-Dist: protobuf~=3.20.0
 Requires-Dist: pydantic>=2.6.0
 Requires-Dist: pyinfer>=0.0.3
@@ -61,12 +62,12 @@ Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == '
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: gradio>=4.26.0; extra == 'all'
 Requires-Dist: outlines>=0.1.11; extra == 'all'
-Requires-Dist: vllm!=0.8.1,>=0.8.0; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: outlines>=0.1.11; extra == 'generative'
-Requires-Dist: vllm!=0.8.1,>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: vllm==0.8.0; (platform_system == 'Linux') and extra == 'generative'
 Provides-Extra: human-evaluation
 Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
 Provides-Extra: test
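euroeval-15.4.2.dist-info/RECORD CHANGED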
@@ -3,10 +3,10 @@ euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHh
 euroeval/benchmarker.py,sha256=PIdqLPleLN3nml5Zb1g_dQaLzqxQhmgC8VuvD5yloV4,46524
 euroeval/callbacks.py,sha256=bThUUxOgkMuESUQ5rrFRoSumKV8vNw53CslIZTpkt54,2438
 euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
-euroeval/constants.py,sha256=9iXe26WAigL9RYob3PhsB5c0dr11wCeRxrEfm_ssynM,1562
+euroeval/constants.py,sha256=zL8dm7SEFpIgC2vaPhqzdKydVSWW-ZyMHenWPnNxWqQ,1681
 euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
-euroeval/data_models.py,sha256=4ZY9x2pINlRywTzYxxtrYG7qXMNdod5I9XBOlTJYT8E,14495
-euroeval/dataset_configs.py,sha256=bjMUXvaEtTpo1Eql_mIRCG3K_lB2DZRdPWEAwR5N4ig,90627
+euroeval/data_models.py,sha256=b4rOMdhoxkIPcnTQdwqq5iWaF6uia1OzAgdiOBvoGVM,14779
+euroeval/dataset_configs.py,sha256=C5Gnp95cBeCmmuRA8Rznt0c4gMOn8Pilk_kDCleDMjg,90640
 euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
 euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
 euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
@@ -19,22 +19,22 @@ euroeval/model_loading.py,sha256=ta07tMoSfK1kqjOynVXQA0vVrns6RzsCEE3g1_RGVVs,271
 euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
 euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
 euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
-euroeval/types.py,sha256=xvBn0eNynqAqwL7CGEgVFb_lCD9SdHUMvxJo7OXRfls,2367
-euroeval/utils.py,sha256=MkiVI-0KmK4ilKJTTfYAynKaPDOzW1WjyRdZsYmnoIg,18803
+euroeval/types.py,sha256=5DIhaVyzH8RO9jdJfibX9pwbZviQwU35dMsfszD2Whs,2406
+euroeval/utils.py,sha256=CFjYMoKdcxLUEM-aF3pxf_3TnGWvGasjfb8pDMJVe9U,18772
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
 euroeval/benchmark_modules/fresh.py,sha256=k6bqDEnazRAX9ILVsRrzUTbkgNO4NcLCxHToCnLWV8M,9641
-euroeval/benchmark_modules/hf.py,sha256=YeaaP_YGAlKG5G1KFq0bFOFWv42eH_zfmhuW3FAXjAA,41726
+euroeval/benchmark_modules/hf.py,sha256=Typig7WDqOn_uGE24s_P_9PHvq-V0MrKGD7xbh0aYnk,43244
 euroeval/benchmark_modules/litellm.py,sha256=ZJ9dB683pXPHDf70OOJfmHn_y706xRYzstYLz2ytCKE,39784
-euroeval/benchmark_modules/vllm.py,sha256=5N2ytLR9cZIcPeza-ERQWwyvehDd0F1FUvXY3cKu4Oo,44519
+euroeval/benchmark_modules/vllm.py,sha256=O8-dcVkU2jgZer44EOeTC8E4d-xQjPDOXnoyzXxAToQ,46179
 euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
 euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
-euroeval/task_utils/sequence_classification.py,sha256=bIsbAj123hEyW40QeSUW8Dpc2SyI3ZPCGexapr9qqjw,9826
+euroeval/task_utils/sequence_classification.py,sha256=832iWpPR3CsnlBIYA976eN21WUFQLUmIlDxFIvOsROk,10266
 euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
 euroeval/task_utils/token_classification.py,sha256=aW2GGk-dqa7lioIsHirVgD8AMrQEAnVasmjEWQ4xu7w,17778
-euroeval-15.4.1.dist-info/METADATA,sha256=OdTP-FAbbF9vUV3OTeV5Y-B6P7FXN2bAalG903ny8hU,10740
-euroeval-15.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-euroeval-15.4.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
-euroeval-15.4.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
-euroeval-15.4.1.dist-info/RECORD,,
+euroeval-15.4.2.dist-info/METADATA,sha256=cvpyWIKPXNKn1Idv7w3C7z8MBVljmw50jBdskL_32oI,10752
+euroeval-15.4.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-15.4.2.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+euroeval-15.4.2.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+euroeval-15.4.2.dist-info/RECORD,,