EuroEval 15.3.1-py3-none-any.whl → 15.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


euroeval/__init__.py CHANGED
@@ -14,6 +14,7 @@ warnings.filterwarnings("ignore", category=UserWarning)
  logging.getLogger("httpx").setLevel(logging.CRITICAL)
  logging.getLogger("datasets").setLevel(logging.CRITICAL)
  logging.getLogger("vllm").setLevel(logging.CRITICAL)
+ logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
 
  # Set up logging
  fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
@@ -66,6 +67,16 @@ os.environ["OMP_NUM_THREADS"] = "1"
  os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"
 
 
+ # Avoid the "Cannot re-initialize CUDA in forked subprocess" error - see
+ # https://github.com/vllm-project/vllm/issues/6152 for more
+ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+
+ # Use older version v0 of vLLM, as the newer one requires XGrammar as decoding backend,
+ # but XGrammar does not support having a maximal amount of elements in lists
+ os.environ["VLLM_USE_V1"] = "0"
+
+
  # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
  # former and LiteLLM uses the latter
  if os.getenv("HUGGINGFACE_API_KEY"):
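The two new environment variables only take effect if they are in place before vLLM initialises, so a downstream script that manages its own environment would need to mirror the same ordering. A minimal sketch of that setup (illustrative, not part of the package):

```python
import os

# Mirror of the settings added to euroeval/__init__.py; set them before vLLM
# is imported or initialised for the first time.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"  # avoid CUDA re-init in forked workers
os.environ["VLLM_USE_V1"] = "0"  # stay on the v0 engine rather than the XGrammar-based v1 engine
```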
euroeval/benchmark_config_factory.py CHANGED
@@ -12,7 +12,7 @@ from .dataset_configs import get_all_dataset_configs
  from .enums import Device
  from .exceptions import InvalidBenchmark
  from .languages import get_all_languages
- from .tasks import get_all_tasks
+ from .tasks import SPEED, get_all_tasks
  from .utils import log_once
 
  if t.TYPE_CHECKING:
@@ -294,7 +294,7 @@ def prepare_tasks_and_datasets(
      # Create the list of dataset tasks
      try:
          if task is None:
-             tasks = list(task_mapping.values())
+             tasks = [t for t in task_mapping.values() if t != SPEED]
          elif isinstance(task, str):
              tasks = [task_mapping[task]]
          else:
euroeval/benchmark_modules/hf.py CHANGED
@@ -224,8 +224,6 @@ class HuggingFaceEncoderModel(BenchmarkModule):
              "max_position_embeddings",
              "max_sequence_length",
              "model_max_length",
-             "sliding_window",
-             "sliding_window_size",
              "n_positions",
          ]
          for candidate_config_max_length in candidate_config_max_lengths:
@@ -804,7 +802,7 @@ def get_model_repo_info(
      generative_class_names = [
          class_name
          for tag in GENERATIVE_PIPELINE_TAGS
-         for class_name in TASK_MAPPING[tag].values()
+         for class_name in TASK_MAPPING.get(tag, dict()).values()
      ]
      if class_names is not None and any(
          class_name in generative_class_names for class_name in class_names
@@ -1023,6 +1021,7 @@ def setup_model_for_question_answering(model: "PreTrainedModel") -> "PreTrainedM
      """
      # Get the models' token type embedding children, if they exist
      children = get_children_of_module(name="model", module=model)
+     assert isinstance(children, dict)
 
      # If the model has token type embeddings then get them
      if children:
euroeval/benchmark_modules/litellm.py CHANGED
@@ -12,6 +12,7 @@ from functools import cached_property, partial
  from time import sleep
 
  import litellm
+ import ollama
  from datasets import DatasetDict
  from huggingface_hub import HfApi
  from huggingface_hub.errors import (
@@ -31,6 +32,7 @@ from litellm.exceptions import (
  )
  from litellm.types.utils import ModelResponse
  from requests.exceptions import RequestException
+ from tqdm.auto import tqdm
  from transformers import Trainer
 
  from ..constants import (
@@ -39,7 +41,13 @@ from ..constants import (
      TASK_GROUPS_USING_LOGPROBS,
      TASKS_USING_JSON,
  )
- from ..data_models import BenchmarkConfig, GenerativeModelOutput, ModelConfig, Task
+ from ..data_models import (
+     BenchmarkConfig,
+     DatasetConfig,
+     GenerativeModelOutput,
+     ModelConfig,
+     Task,
+ )
  from ..enums import (
      BatchingPreference,
      GenerativeType,
@@ -49,6 +57,7 @@ from ..enums import (
  )
  from ..exceptions import (
      InvalidBenchmark,
+     InvalidModel,
      NeedsAdditionalArgument,
      NeedsEnvironmentVariable,
      NeedsExtraInstalled,
@@ -60,7 +69,7 @@ from ..task_utils import (
      token_classification,
  )
  from ..types import ExtractLabelsFunction
- from ..utils import create_model_cache_dir
+ from ..utils import create_model_cache_dir, log_once
  from .base import BenchmarkModule
  from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokenizer
 
@@ -136,6 +145,34 @@ class LiteLLMModel(BenchmarkModule):
      batching_preference = BatchingPreference.SINGLE_SAMPLE
      high_priority = False
 
+     def __init__(
+         self,
+         model_config: ModelConfig,
+         dataset_config: DatasetConfig,
+         benchmark_config: BenchmarkConfig,
+     ) -> None:
+         """Initialise the model.
+
+         Args:
+             model_config:
+                 The model configuration.
+             dataset_config:
+                 The dataset configuration.
+             benchmark_config:
+                 The benchmark configuration.
+         """
+         # Detect whether the model is an Ollama model, as we need to extract metadata
+         # differently for these models
+         self.is_ollama = model_config.model_id.startswith(
+             "ollama/"
+         ) or model_config.model_id.startswith("ollama_chat/")
+
+         super().__init__(
+             model_config=model_config,
+             dataset_config=dataset_config,
+             benchmark_config=benchmark_config,
+         )
+
      @property
      def generative_type(self) -> GenerativeType | None:
          """Get the generative type of the model.
@@ -269,10 +306,24 @@ class LiteLLMModel(BenchmarkModule):
          Returns:
              The number of parameters in the model.
          """
+         # Start by trying out the regex mapping, and use the value if it matches
          for key, value in NUM_PARAMS_MAPPING.items():
              if re.fullmatch(pattern=key, string=self.model_config.model_id) is not None:
                  return value
 
+         # If it is an Ollama model then we can get the number of parameters from the
+         # Ollama Python SDK
+         if self.is_ollama:
+             ollama_model_id = self.model_config.model_id.split("/")[-1]
+             model_info = ollama.show(ollama_model_id).modelinfo
+             if model_info is not None:
+                 num_params = model_info.get("general.parameter_count")
+                 if num_params is not None:
+                     return int(num_params)
+
+         # If it is a model accessed through the Hugging Face inference API then we can
+         # get the number of parameters from the Hugging Face model configuration from
+         # the Hugging Face Hub
          if self.model_config.model_id.startswith("huggingface/"):
              model_id = self.model_config.model_id.split(sep="/", maxsplit=1)[-1]
              if HuggingFaceEncoderModel.model_exists(
@@ -329,10 +380,14 @@ class LiteLLMModel(BenchmarkModule):
          Returns:
              The vocabulary size of the model.
          """
+         # Start by trying out the regex mapping, and use the value if it matches
          for key, value in VOCAB_SIZE_MAPPING.items():
              if re.fullmatch(pattern=key, string=self.model_config.model_id) is not None:
                  return value
 
+         # If it is a model accessed through the Hugging Face inference API then we can
+         # get the vocabulary size from the Hugging Face model configuration from the
+         # Hugging Face Hub
          if self.model_config.model_id.startswith("huggingface/"):
              model_id = self.model_config.model_id.split(sep="/", maxsplit=1)[-1]
              if HuggingFaceEncoderModel.model_exists(
@@ -379,10 +434,40 @@ class LiteLLMModel(BenchmarkModule):
          Returns:
              The maximum length of the model.
          """
+         # Start by trying out the regex mapping, and use the value if it matches
          for key, value in MODEL_MAX_LENGTH_MAPPING.items():
              if re.fullmatch(pattern=key, string=self.model_config.model_id) is not None:
                  return value
 
+         # If it is an Ollama model then we can get the maximum length from the Ollama
+         # Python SDK
+         if self.is_ollama:
+             ollama_model_id = self.model_config.model_id.split("/")[-1]
+             model_info = ollama.show(ollama_model_id).modelinfo
+             if model_info is not None:
+                 context_length_keys = [
+                     key for key in model_info.keys() if "context_length" in key.lower()
+                 ]
+                 if context_length_keys:
+                     context_length = model_info[context_length_keys[0]]
+                     if context_length is not None:
+                         log_once(
+                             f"Detected context length key {context_length_keys[0]!r} "
+                             f"for Ollama model {ollama_model_id!r}",
+                             level=logging.DEBUG,
+                         )
+                         return int(context_length)
+                 else:
+                     log_once(
+                         f"Tried to get the maximum length of the Ollama model "
+                         f"{ollama_model_id!r}, but could not find a context length. "
+                         f"The model info was {model_info}. Returning -1",
+                         level=logging.DEBUG,
+                     )
+
+         # If it is a model accessed through the Hugging Face inference API then we can
+         # get the maximum length from the Hugging Face model configuration from the
+         # Hugging Face Hub
          if self.model_config.model_id.startswith("huggingface/"):
              model_id = self.model_config.model_id.split(sep="/", maxsplit=1)[-1]
              if HuggingFaceEncoderModel.model_exists(
@@ -523,6 +608,43 @@ class LiteLLMModel(BenchmarkModule):
          if model_id in litellm.model_list:
              return True
 
+         # If it is an Ollama model then try to download it
+         if model_id.startswith("ollama/") or model_id.startswith("ollama_chat/"):
+             ollama_model_id = model_id.split("/")[-1]
+             downloaded_ollama_models: list[str] = [
+                 model_obj.model
+                 for model_obj in ollama.list().models
+                 if model_obj.model is not None
+             ]
+             if ollama_model_id not in downloaded_ollama_models:
+                 try:
+                     response = ollama.pull(model=ollama_model_id, stream=True)
+                     with tqdm(
+                         desc=f"Downloading {ollama_model_id}",
+                         unit_scale=True,
+                         unit="B",
+                         leave=False,
+                     ) as pbar:
+                         for status in response:
+                             if status.total is not None:
+                                 pbar.total = status.total
+                             if status.completed is not None:
+                                 pbar.update(status.completed - pbar.n)
+                 except ollama.ResponseError as e:
+                     if "file does not exist" in str(e).lower():
+                         return False
+                     else:
+                         raise InvalidModel(
+                             f"Failed to download Ollama model {ollama_model_id}. The "
+                             f"error message was: {e}"
+                         )
+             else:
+                 log_once(
+                     f"Ollama model {ollama_model_id!r} already downloaded, so skipping "
+                     "download.",
+                     level=logging.DEBUG,
+                 )
+
          num_attempts = 10
          for _ in range(num_attempts):
              try:
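For readers less familiar with the Ollama SDK calls used above, a minimal sketch of the same metadata lookups, assuming a locally running Ollama server (the model name is only an example):

```python
import ollama

# Hypothetical model name; any locally pulled Ollama model works here.
ollama_model_id = "llama3.2"

# The same lookups the new LiteLLMModel code performs: parameter count and
# context length are plain key/value pairs in the `modelinfo` mapping.
model_info = ollama.show(ollama_model_id).modelinfo or {}
num_params = model_info.get("general.parameter_count")
context_keys = [key for key in model_info if "context_length" in key.lower()]
max_length = int(model_info[context_keys[0]]) if context_keys else -1

print(f"{num_params=} {max_length=}")
```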
euroeval/benchmark_modules/vllm.py CHANGED
@@ -73,7 +73,6 @@ from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_conf
  if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
      from vllm import LLM, RequestOutput, SamplingParams
      from vllm.lora.request import LoRARequest
-     from vllm.sampling_params import GuidedDecodingParams
 
      try:
          from vllm.model_executor.parallel_utils.parallel_state import (
@@ -82,6 +81,10 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
      except ImportError:
          from vllm.distributed.parallel_state import destroy_model_parallel
 
+ if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
+     from outlines.models.vllm import adapt_tokenizer
+     from outlines.processors import JSONLogitsProcessor
+
  if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
      import ray
 
@@ -319,12 +322,18 @@ class VLLMModel(HuggingFaceEncoderModel):
                  for tag_name in ner_tag_names
              }
              pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
-             schema = pydantic_class.model_json_schema()
-             guided_decoding = GuidedDecodingParams(
-                 json=schema, backend="outlines", whitespace_pattern=r" ?"
+             logits_processor = JSONLogitsProcessor(
+                 schema=pydantic_class,
+                 tokenizer=adapt_tokenizer(tokenizer=self._tokenizer),  # type: ignore
+                 whitespace_pattern=r" ?",
+             )
+             log_once(
+                 "Using structured generation with the schema "
+                 f"{pydantic_class.model_json_schema()}",
+                 level=logging.DEBUG,
              )
          else:
-             guided_decoding = None
+             logits_processor = None
 
          # Define the parameters used for vLLM generation
          max_tokens: int = (
@@ -337,7 +346,7 @@ class VLLMModel(HuggingFaceEncoderModel):
              logprobs=MAX_LOGPROBS if self.buffer["output_scores"] else None,
              temperature=0.0,
              stop=[stop_token for stop_token in stop_tokens if stop_token],
-             guided_decoding=guided_decoding,
+             logits_processors=[logits_processor] if logits_processor else None,
          )
 
          # If any of the prompts are empty then we need to replace them with a BOS token
@@ -881,8 +890,6 @@ def load_model_and_tokenizer(
          "max_position_embeddings",
          "max_sequence_length",
          "model_max_length",
-         "sliding_window",
-         "sliding_window_size",
          "n_positions",
      ]
      true_max_model_len_candidates: list[int] = list()
@@ -1087,7 +1094,8 @@ def get_end_of_reasoning_token_id(
  """Get the end of reasoning token ID for a generative model.
 
  This assumes that the reasoning token is of the form <X> and that the end of
- reasoning token is </X> (for X being any string without spaces).
+ reasoning token is </X> (for X being any string without spaces). We disallow the
+ reasoning token to be the same as the beginning-of-sentence token.
 
  Args:
      model:
@@ -1106,6 +1114,7 @@ def get_end_of_reasoning_token_id(
      add_generation_prompt=True,
      tokenize=False,
  )
+ assert isinstance(prompt, str)
 
  # Generate a completion and remove the BOS token from it, to not confuse it with the
  # potential reasoning token
@@ -1119,11 +1128,18 @@ def get_end_of_reasoning_token_id(
      .text
  )
  if tokenizer.bos_token is not None:
-     completion = completion.replace(tokenizer.bos_token, "").strip()
+     if isinstance(tokenizer.bos_token, str):
+         prompt = prompt.replace(tokenizer.bos_token, "").strip()
+         completion = completion.replace(tokenizer.bos_token, "").strip()
+     elif isinstance(tokenizer.bos_token, list):
+         for bos_token in tokenizer.bos_token:
+             prompt = prompt.replace(bos_token, "").strip()
+             completion = completion.replace(bos_token, "").strip()
 
  # If it doesn't contain a reasoning token, we can't find the end of reasoning token
- match = re.search(pattern=r"<\w+>", string=completion)
- if match is None:
+ prompt_match = re.search(pattern=r"<\w+>", string=prompt)
+ completion_match = re.search(pattern=r"<\w+>", string=completion)
+ if completion_match is None and prompt_match is None:
      log_once(
          message=(
              "Could not find a reasoning token, so assuming the model is not a "
@@ -1135,7 +1151,11 @@ def get_end_of_reasoning_token_id(
 
  # Check that the found reasoning token and its associated end-of-reasoning tokens
  # are both special tokens
- reasoning_token = match.group()
+ elif completion_match is not None:
+     reasoning_token = completion_match.group()
+ else:
+     assert prompt_match is not None
+     reasoning_token = prompt_match.group()
  end_of_reasoning_token = f"</{reasoning_token[1:-1]}>"
  special_tokens = [
      decoder_token.content
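A small illustration (not from the package) of the reasoning-token convention that the docstring of `get_end_of_reasoning_token_id` describes: the first `<X>`-shaped token found in the prompt or completion determines the expected end-of-reasoning token `</X>`:

```python
import re

# Example completion from a hypothetical reasoning model.
completion = "<think> First I weigh the options... </think> The answer is (b)."

match = re.search(r"<\w+>", completion)
if match is not None:
    reasoning_token = match.group()                         # "<think>"
    end_of_reasoning_token = f"</{reasoning_token[1:-1]}>"  # "</think>"
    print(reasoning_token, end_of_reasoning_token)
```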
euroeval/benchmarker.py CHANGED
@@ -709,7 +709,7 @@ class Benchmarker:
 
          if dataset_config.task == SPEED:
              scores = benchmark_speed(
-                 model=model, benchmark_config=self.benchmark_config
+                 model=model, benchmark_config=benchmark_config
              )
 
          else:
@@ -727,7 +727,7 @@ class Benchmarker:
                  datasets=prepared_datasets,
                  model_config=model_config,
                  dataset_config=dataset_config,
-                 benchmark_config=self.benchmark_config,
+                 benchmark_config=benchmark_config,
              )
          else:
              scores = finetune(
euroeval/constants.py CHANGED
@@ -13,7 +13,13 @@ REASONING_MAX_TOKENS = 8_192
 
 
  # The Hugging Face Hub pipeline tags used to classify models as generative
- GENERATIVE_PIPELINE_TAGS = ["text-generation", "text2text-generation"]
+ GENERATIVE_PIPELINE_TAGS = [
+     "text-generation",
+     "text2text-generation",
+     "image-text-to-text",
+     "audio-text-to-text",
+     "video-text-to-text",
+ ]
 
 
  # Used to disallow non-generative models to be evaluated on these task groups
euroeval/data_loading.py CHANGED
@@ -8,6 +8,7 @@ from datasets import Dataset, DatasetDict, load_dataset
  from datasets.exceptions import DatasetsError
  from huggingface_hub.errors import HfHubHTTPError
  from numpy.random import Generator
+ from requests import ReadTimeout
 
  from .data_models import BenchmarkConfig, DatasetConfig
  from .exceptions import HuggingFaceHubDown, InvalidBenchmark
@@ -47,7 +48,7 @@ def load_data(
                  token=unscramble("HjccJFhIozVymqXDVqTUTXKvYhZMTbfIjMxG_"),
              )
              break
-         except (FileNotFoundError, DatasetsError, ConnectionError):
+         except (FileNotFoundError, DatasetsError, ConnectionError, ReadTimeout):
              logger.warning(
                  f"Failed to load dataset {dataset_config.huggingface_id!r}. Retrying..."
              )
euroeval/dataset_configs.py CHANGED
@@ -1,7 +1,22 @@
  """All dataset configurations used in EuroEval."""
 
  from .data_models import DatasetConfig
- from .languages import DA, DE, EN, FO, FR, IS, IT, NB, NL, NN, NO, SV, get_all_languages
+ from .languages import (
+     DA,
+     DE,
+     EN,
+     ES,
+     FO,
+     FR,
+     IS,
+     IT,
+     NB,
+     NL,
+     NN,
+     NO,
+     SV,
+     get_all_languages,
+ )
  from .tasks import COMMON_SENSE, KNOW, LA, MCRC, NER, RC, SENT, SPEED, SUMM
 
 
@@ -265,6 +280,25 @@ SENTIPOLC_CONFIG = DatasetConfig(
  )
 
 
+ SENTIMENT_HEADLINES_CONFIG = DatasetConfig(
+     name="sentiment-headlines-es",
+     pretty_name="the truncated version of the Spanish sentiment headlines dataset",
+     huggingface_id="EuroEval/sentiment-headlines-es",
+     task=SENT,
+     languages=[ES],
+     labels=["negative", "neutral", "positive"],
+     prompt_prefix="Lo siguiente son reseñas y su sentimiento, que puede ser "
+     "'positivo', 'neutral' o 'negativo'.",
+     prompt_template="Texto: {text}\nSentimiento: {label}",
+     prompt_label_mapping=dict(
+         positive="positivo", neutral="neutral", negative="negativo"
+     ),
+     instruction_prompt="Texto: {text}\n\nClasifica el sentimiento de la reseña. "
+     "Responde con 'positivo', 'neutral' o 'negativo', y nada más.",
+     num_few_shot_examples=12,
+     max_generated_tokens=5,
+ )
+
  ### NAMED ENTITY RECOGNITION DATASETS ###
 
  SUC3_CONFIG = DatasetConfig(
@@ -817,6 +851,45 @@ MULTINERD_IT_CONFIG = DatasetConfig(
      max_generated_tokens=128,
  )
 
+ CONLL_ES_CONFIG = DatasetConfig(
+     name="conll-es",
+     pretty_name="the Spanish part of the truncated version of the named entity "
+     "recognition dataset CoNLL 2002",
+     huggingface_id="EuroEval/conll-es-mini",
+     task=NER,
+     languages=[ES],
+     labels=[
+         "o",
+         "b-loc",
+         "i-loc",
+         "b-org",
+         "i-org",
+         "b-per",
+         "i-per",
+         "b-misc",
+         "i-misc",
+     ],
+     prompt_prefix="Lo siguiente son oraciones y diccionarios JSON con las entidades "
+     "nombradas que aparecen en la oración dada.",
+     prompt_template="Oración: {text}\nEntidades nombradas: {label}",
+     prompt_label_mapping={
+         "b-per": "persona",
+         "i-per": "persona",
+         "b-loc": "lugar",
+         "i-loc": "lugar",
+         "b-org": "organización",
+         "i-org": "organización",
+         "b-misc": "misceláneo",
+         "i-misc": "misceláneo",
+     },
+     instruction_prompt="Oración: {text}\n\nIdentifica las entidades nombradas en la "
+     "oración. Debes producir esto como un diccionario JSON con las claves 'persona', "
+     "'lugar', 'organización' y 'misceláneo'. Los valores deben ser listas de las "
+     "entidades nombradas de ese tipo, exactamente como aparecen en la oración.",
+     num_few_shot_examples=8,
+     max_generated_tokens=128,
+     unofficial=True,
+ )
 
  ### LINGUISTIC ACCEPTABILITY DATASETS ###
 
@@ -1029,6 +1102,22 @@ SCALA_IT_CONFIG = DatasetConfig(
      max_generated_tokens=5,
  )
 
+ SCALA_ES_CONFIG = DatasetConfig(
+     name="scala-es",
+     pretty_name="the Spanish part of the linguistic acceptability dataset ScaLA",
+     huggingface_id="EuroEval/scala-es",
+     task=LA,
+     languages=[ES],
+     labels=["incorrect", "correct"],
+     prompt_prefix="Lo siguiente son textos y si son gramaticalmente correctos.",
+     prompt_template="Texto: {text}\nGramaticalmente correcto: {label}",
+     prompt_label_mapping=dict(correct="sí", incorrect="no"),
+     instruction_prompt="Texto: {text}\n\nDetermina si el texto es gramaticalmente "
+     "correcto o no. Responde con 'sí' si el texto es correcto, y 'no' si no lo es.",
+     num_few_shot_examples=12,
+     max_generated_tokens=5,
+ )
+
  DUTCH_COLA_CONFIG = DatasetConfig(
      name="dutch-cola",
      pretty_name="the truncated version of the Dutch linguistic acceptability dataset "
@@ -1326,6 +1415,41 @@ FQUAD_CONFIG = DatasetConfig(
      max_generated_tokens=32,
  )
 
+ XQUAD_ES_CONFIG = DatasetConfig(
+     name="xquad-es",
+     pretty_name="the Spanish version of the XQuAD reading comprehension dataset",
+     huggingface_id="EuroEval/xquad-es",
+     task=RC,
+     languages=[ES],
+     labels=["start_positions", "end_positions"],
+     prompt_prefix="A continuación se presentan textos con sus preguntas y respuestas "
+     "correspondientes.",
+     prompt_template="Texto: {text}\nPregunta: {question}\nRespuesta en máximo 3 "
+     "palabras: {label}",
+     instruction_prompt="Texto: {text}\n\nResponda la siguiente pregunta sobre el "
+     "texto anterior en máximo 3 palabras.\n\nPregunta: {question}",
+     num_few_shot_examples=4,
+     max_generated_tokens=32,
+     unofficial=True,
+ )
+
+ MLQA_ES_CONFIG = DatasetConfig(
+     name="mlqa-es",
+     pretty_name="the Spanish version of the MLQA reading comprehension dataset",
+     huggingface_id="EuroEval/mlqa-es",
+     task=RC,
+     languages=[ES],
+     labels=["start_positions", "end_positions"],
+     prompt_prefix="A continuación se presentan textos con sus preguntas y respuestas "
+     "correspondientes.",
+     prompt_template="Texto: {text}\nPregunta: {question}\nRespuesta en máximo 3 "
+     "palabras: {label}",
+     instruction_prompt="Texto: {text}\n\nResponda la siguiente pregunta sobre el "
+     "texto anterior en máximo 3 palabras.\n\nPregunta: {question}",
+     num_few_shot_examples=4,
+     max_generated_tokens=32,
+ )
+
  ### SUMMARIZATION DATASETS ###
 
  NORDJYLLAND_NEWS_CONFIG = DatasetConfig(
@@ -1358,6 +1482,19 @@ MLSUM_CONFIG = DatasetConfig(
      max_generated_tokens=256,
  )
 
+ MLSUM_ES_CONFIG = DatasetConfig(
+     name="mlsum-es",
+     pretty_name="the truncated version of the Spanish summarisation dataset MLSum",
+     huggingface_id="EuroEval/mlsum-es-mini",
+     task=SUMM,
+     languages=[ES],
+     prompt_prefix="Los siguientes son artículos de noticias con sus resúmenes.",
+     prompt_template="Artículo: {text}\nResumen: {target_text}",
+     instruction_prompt="Artículo: {text}\n\nEscribe un resumen del artículo anterior.",
+     num_few_shot_examples=1,
+     max_generated_tokens=256,
+ )
+
  RRN_CONFIG = DatasetConfig(
      name="rrn",
      pretty_name="the truncated version of the Icelandic summarisation dataset "
@@ -1745,6 +1882,23 @@ MMLU_IT_CONFIG = DatasetConfig(
      max_generated_tokens=5,
  )
 
+ MMLU_ES_CONFIG = DatasetConfig(
+     name="mmlu-es",
+     pretty_name="the truncated version of the Spanish knowledge dataset MMLU-es, "
+     "translated from the English MMLU dataset",
+     huggingface_id="EuroEval/mmlu-es-mini",
+     task=KNOW,
+     languages=[ES],
+     labels=["a", "b", "c", "d"],
+     prompt_prefix="Las siguientes son preguntas de opción múltiple (con respuestas).",
+     prompt_template="Pregunta: {text}\nRespuesta: {label}",
+     prompt_label_mapping=dict(a="a", b="b", c="c", d="d"),
+     instruction_prompt="Pregunta: {text}\n\nResponda la pregunta anterior usando "
+     "solo 'a', 'b', 'c' o 'd', y nada más.",
+     num_few_shot_examples=5,
+     max_generated_tokens=5,
+ )
+
  ARC_DA_CONFIG = DatasetConfig(
      name="arc-da",
      pretty_name="the truncated version of the Danish knowledge dataset ARC-da, "
@@ -1870,6 +2024,23 @@ ARC_CONFIG = DatasetConfig(
      unofficial=True,
  )
 
+ HELLASWAG_ES_CONFIG = DatasetConfig(
+     name="hellaswag-es",
+     pretty_name="the truncated version of the Spanish common-sense reasoning dataset "
+     "HellaSwag-es, translated from the English HellaSwag dataset",
+     huggingface_id="EuroEval/hellaswag-es-mini",
+     task=COMMON_SENSE,
+     languages=[ES],
+     labels=["a", "b", "c", "d"],
+     prompt_prefix="Las siguientes son preguntas de opción múltiple (con respuestas).",
+     prompt_template="Pregunta: {text}\nRespuesta: {label}",
+     prompt_label_mapping=dict(a="a", b="b", c="c", d="d"),
+     instruction_prompt="Pregunta: {text}\n\nResponda la pregunta anterior usando solo "
+     "'a', 'b', 'c' o 'd', y nada más.",
+     num_few_shot_examples=5,
+     max_generated_tokens=5,
+ )
+
  # TODO: Faroese knowledge
 
 
euroeval/task_utils/token_classification.py CHANGED
@@ -1,18 +1,18 @@
  """Utility functions related to the token-classification task group."""
 
- import importlib.util
  import logging
  import re
  import typing as t
  from copy import deepcopy
 
+ import demjson3
  import evaluate
  import numpy as np
  from evaluate import EvaluationModule
  from transformers import PreTrainedTokenizer
 
  from ..data_models import BenchmarkConfig, DatasetConfig, GenerativeModelOutput
- from ..exceptions import InvalidBenchmark, NeedsExtraInstalled
+ from ..exceptions import InvalidBenchmark
  from ..utils import raise_if_model_output_contains_nan_values
 
  if t.TYPE_CHECKING:
@@ -20,9 +20,6 @@ if t.TYPE_CHECKING:
 
      from ..types import Labels, Predictions
 
- if importlib.util.find_spec("demjson3") is not None:
-     import demjson3
-
 
  logger = logging.getLogger("euroeval")
 
@@ -201,13 +198,10 @@ def extract_labels_from_generation(
      Returns:
          The predicted labels.
      """
-     if importlib.util.find_spec("demjson3") is None:
-         raise NeedsExtraInstalled(extra="generative")
-
      raw_predictions = model_output.sequences
 
      # Attempt to extract the JSON dictionary from the predictions
-     json_regex = r"\{.+?\}"
+     json_regex = r"\{[^{}]+?\}"
      json_matches = [
          re.search(pattern=json_regex, string=raw_prediction, flags=re.DOTALL)
          or raw_prediction
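To see what the tightened JSON regex changes in practice, here is a small comparison (not from the package) of the old and new patterns on an output containing nested braces:

```python
import re

output = 'Entities: {"persona": ["Ana"], "extra": {"nota": "x"}} plus trailing text'

old = re.search(r"\{.+?\}", output, flags=re.DOTALL)
new = re.search(r"\{[^{}]+?\}", output, flags=re.DOTALL)

# The old non-greedy pattern stops at the first "}", which here falls inside the
# nested object and yields an unbalanced fragment; the new pattern only ever
# matches a brace-free (flat) JSON object.
print(old.group())  # {"persona": ["Ana"], "extra": {"nota": "x"}
print(new.group())  # {"nota": "x"}
```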
euroeval/utils.py CHANGED
@@ -141,6 +141,7 @@ def block_terminal_output() -> None:
      logging.getLogger("vllm.transformers_utils.tokenizer").setLevel(logging.CRITICAL)
      logging.getLogger("vllm.core.scheduler").setLevel(logging.CRITICAL)
      logging.getLogger("vllm.model_executor.weight_utils").setLevel(logging.CRITICAL)
+     logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
      logging.getLogger("httpx").setLevel(logging.CRITICAL)
      logging.getLogger("ray._private.worker").setLevel(logging.CRITICAL)
      logging.getLogger("matplotlib.font_manager").setLevel(logging.CRITICAL)
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 15.3.1
+ Version: 15.4.0
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -33,12 +33,14 @@ Requires-Dist: accelerate>=0.34.2
  Requires-Dist: bert-score>=0.3.13
  Requires-Dist: click>=8.1.3
  Requires-Dist: datasets>=2.15.0
+ Requires-Dist: demjson3>=3.0.6
  Requires-Dist: evaluate>=0.4.1
  Requires-Dist: huggingface-hub>=0.24.0
  Requires-Dist: levenshtein>=0.24.0
  Requires-Dist: litellm>=1.61.13
  Requires-Dist: more-itertools>=10.5.0
  Requires-Dist: numpy<2.0.0,>=1.23.0
+ Requires-Dist: ollama>=0.4.7
  Requires-Dist: pandas>=2.2.0
  Requires-Dist: protobuf~=3.20.0
  Requires-Dist: pydantic>=2.6.0
@@ -52,19 +54,19 @@ Requires-Dist: seqeval>=1.2.2
  Requires-Dist: setuptools>=75.8.2
  Requires-Dist: tenacity>=9.0.0
  Requires-Dist: termcolor>=2.0.0
- Requires-Dist: torch>=2.3.0
- Requires-Dist: transformers>=4.47.0
+ Requires-Dist: torch>=2.6.0
+ Requires-Dist: transformers>=4.50.0
  Provides-Extra: all
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
- Requires-Dist: demjson3>=3.0.6; extra == 'all'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
  Requires-Dist: gradio>=4.26.0; extra == 'all'
- Requires-Dist: vllm<0.6.5,>=0.6.3; (platform_system == 'Linux') and extra == 'all'
+ Requires-Dist: outlines>=0.1.11; extra == 'all'
+ Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'all'
  Provides-Extra: generative
  Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
- Requires-Dist: demjson3>=3.0.6; extra == 'generative'
  Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
- Requires-Dist: vllm<0.6.5,>=0.6.3; (platform_system == 'Linux') and extra == 'generative'
+ Requires-Dist: outlines>=0.1.11; extra == 'generative'
+ Requires-Dist: vllm>=0.8.0; (platform_system == 'Linux') and extra == 'generative'
  Provides-Extra: human-evaluation
  Requires-Dist: gradio>=4.26.0; extra == 'human-evaluation'
  Provides-Extra: test
@@ -202,6 +204,19 @@ argument. This could for instance be `--model <model-id> --task
  sentiment-classification`.
 
 
+ ### Reproducing the datasets
+ All datasets used in this project are generated using the scripts located in the [src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script with the following command
+
+ ```shell
+ $ uv run src/scripts/<name-of-script>.py
+ ```
+
+ Replace <name-of-script> with the specific script you wish to execute, e.g.,
+
+ ```shell
+ $ uv run src/scripts/create_allocine.py
+ ```
+
  ## Special Thanks :pray:
  - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
    models on the leaderboards.
@@ -1,12 +1,12 @@
- euroeval/__init__.py,sha256=3od9_ucHlILSbe4WCR8k5PbeorvmUr-VjOKXJ01I0fA,2165
- euroeval/benchmark_config_factory.py,sha256=pi4Lu--ySKZRd9ItG6VKS6BPLis64vL-7UE99VSXq5Y,12534
- euroeval/benchmarker.py,sha256=__DdnOvI9CNpgqPT1hsTl0GZFTyQ6KRfiQowCuh36sc,46534
+ euroeval/__init__.py,sha256=l3V3ybiCj0I193jvn8wS9VK4UEc9ajiOq4SojChH6Xs,2615
+ euroeval/benchmark_config_factory.py,sha256=JCjJS2pjtiuQ6tpwZ_DJFvNzwdbZu5YdJcHhFz-q6eU,12562
+ euroeval/benchmarker.py,sha256=PIdqLPleLN3nml5Zb1g_dQaLzqxQhmgC8VuvD5yloV4,46524
  euroeval/callbacks.py,sha256=bThUUxOgkMuESUQ5rrFRoSumKV8vNw53CslIZTpkt54,2438
  euroeval/cli.py,sha256=EMB6g6kRvxIqlfYLSoMzwLAtEd-fqXipo4A_HTkhjkA,8575
- euroeval/constants.py,sha256=qFrm3cRT6UlnTXfHUmxqZsr0SBsGskjV1qrUlnAW-aw,1473
- euroeval/data_loading.py,sha256=RoatBJMpGurP_y5O3KrEvly8Z_yYEapQnnMZ_tWWrlc,3272
+ euroeval/constants.py,sha256=9iXe26WAigL9RYob3PhsB5c0dr11wCeRxrEfm_ssynM,1562
+ euroeval/data_loading.py,sha256=7xXdoFSvEDzpw1FNR8E8YV4c9Vy86hlU5-qLm9RUejE,3318
  euroeval/data_models.py,sha256=4ZY9x2pINlRywTzYxxtrYG7qXMNdod5I9XBOlTJYT8E,14495
- euroeval/dataset_configs.py,sha256=Cj3McxA0JTC7RKzXofzpJfmIhoXAfF756f_1SZUaPlw,84391
+ euroeval/dataset_configs.py,sha256=bjMUXvaEtTpo1Eql_mIRCG3K_lB2DZRdPWEAwR5N4ig,90627
  euroeval/enums.py,sha256=L9LcNeruuhHvze9vKRogXY9vonRzoBqDzWSP6hxKQ7A,3195
  euroeval/exceptions.py,sha256=0U_MV-plENJCw2O8NM1RmADkfVxoT2QiFkL-XdTgIZg,5821
  euroeval/finetuning.py,sha256=_lDKlILpHwZ3KR_1S4v7yEbwo8czGAHP7zjUy8Q_Q-8,10701
@@ -20,21 +20,21 @@ euroeval/scores.py,sha256=OL1MPVSgBySc9gMGeZBnj_j6-EvpDtEOwjO12IgeP6o,2899
  euroeval/speed_benchmark.py,sha256=tDjQHsahdEI68IIYlI7CViQXlLbFzzzUrk2bEGpgS6k,3950
  euroeval/tasks.py,sha256=93qVhRf5eegXE3zUI0hpFBQarnHUpTQLyN5bBR0DYnc,5418
  euroeval/types.py,sha256=xvBn0eNynqAqwL7CGEgVFb_lCD9SdHUMvxJo7OXRfls,2367
- euroeval/utils.py,sha256=K4z2IQilLJo6Cf8bzM46PYTaylDv6bYi7FRbHTbZulE,18736
+ euroeval/utils.py,sha256=MkiVI-0KmK4ilKJTTfYAynKaPDOzW1WjyRdZsYmnoIg,18803
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
  euroeval/benchmark_modules/base.py,sha256=Kmg4rS3yawMUs_TQUHTeZyoxYdOx3lkgGe2iYa-LhbM,10741
  euroeval/benchmark_modules/fresh.py,sha256=k6bqDEnazRAX9ILVsRrzUTbkgNO4NcLCxHToCnLWV8M,9641
- euroeval/benchmark_modules/hf.py,sha256=n3VIUA7XOOTgbSMkmYp5S06iJV0kp7aMq8YzRb0EDLw,41741
- euroeval/benchmark_modules/litellm.py,sha256=uMPzUjTU54UHDmBImzWUFCGUupKvZNQN-2u0c8UaM3s,34488
- euroeval/benchmark_modules/vllm.py,sha256=cw7onFYXQ66cr2c4WTB90VYtQYc47lkwz6A25FW8sBs,43444
+ euroeval/benchmark_modules/hf.py,sha256=YeaaP_YGAlKG5G1KFq0bFOFWv42eH_zfmhuW3FAXjAA,41726
+ euroeval/benchmark_modules/litellm.py,sha256=ZJ9dB683pXPHDf70OOJfmHn_y706xRYzstYLz2ytCKE,39784
+ euroeval/benchmark_modules/vllm.py,sha256=5N2ytLR9cZIcPeza-ERQWwyvehDd0F1FUvXY3cKu4Oo,44519
  euroeval/task_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
  euroeval/task_utils/multiple_choice_classification.py,sha256=WnW_unOTPdfKd64-C5M18rZdYNB9QNfqq8Pca29XEdw,5877
  euroeval/task_utils/question_answering.py,sha256=G01s11JcQ7UxeBcKaCO3k0DL4zkVmEb7SxUyZS6T7Ns,27303
  euroeval/task_utils/sequence_classification.py,sha256=FrkvFzxFSnZoXThgpQqvJCIy3_YemyqZFQ1L-YdMMiw,8527
  euroeval/task_utils/text_to_text.py,sha256=DdLruAO4D9Iv5aAXx40la3X3pKbKLUn0-ViBJkMKsTI,5698
- euroeval/task_utils/token_classification.py,sha256=yT1YvZzmqNaVSRZ67BvyURhlkgTm3ltWPft4HxodZAE,17983
- euroeval-15.3.1.dist-info/METADATA,sha256=elF7s_zt2tj9Hl1EMMDfNoMtskYK5Xh9i-N36vvzfQs,10263
- euroeval-15.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- euroeval-15.3.1.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
- euroeval-15.3.1.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
- euroeval-15.3.1.dist-info/RECORD,,
+ euroeval/task_utils/token_classification.py,sha256=aW2GGk-dqa7lioIsHirVgD8AMrQEAnVasmjEWQ4xu7w,17778
+ euroeval-15.4.0.dist-info/METADATA,sha256=HfNWsANdb8TJAyK__QPBhs7O5qsQp9G_gPlhVVNuK9c,10724
+ euroeval-15.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ euroeval-15.4.0.dist-info/entry_points.txt,sha256=tKQRxN0HX2mGtbZbZQdCRFUDZIecA_z4mZduueor3Ug,135
+ euroeval-15.4.0.dist-info/licenses/LICENSE,sha256=oZp5fpOSQ7w-vFui8QNwrBIosrO7cnpArItdbvn52Ao,1082
+ euroeval-15.4.0.dist-info/RECORD,,