EuroEval 16.3.0__py3-none-any.whl → 16.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of EuroEval might be problematic.

Files changed (64)
  1. euroeval/__init__.py +3 -2
  2. euroeval/benchmark_config_factory.py +0 -4
  3. euroeval/benchmark_modules/base.py +3 -16
  4. euroeval/benchmark_modules/fresh.py +2 -1
  5. euroeval/benchmark_modules/hf.py +99 -62
  6. euroeval/benchmark_modules/litellm.py +101 -41
  7. euroeval/benchmark_modules/vllm.py +91 -83
  8. euroeval/benchmarker.py +84 -78
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/constants.py +6 -0
  12. euroeval/data_loading.py +14 -11
  13. euroeval/data_models.py +12 -4
  14. euroeval/dataset_configs/__init__.py +2 -0
  15. euroeval/dataset_configs/czech.py +79 -0
  16. euroeval/dataset_configs/danish.py +10 -11
  17. euroeval/dataset_configs/dutch.py +0 -1
  18. euroeval/dataset_configs/english.py +0 -1
  19. euroeval/dataset_configs/estonian.py +11 -1
  20. euroeval/dataset_configs/finnish.py +0 -1
  21. euroeval/dataset_configs/french.py +0 -1
  22. euroeval/dataset_configs/german.py +0 -1
  23. euroeval/dataset_configs/italian.py +0 -1
  24. euroeval/dataset_configs/latvian.py +0 -1
  25. euroeval/dataset_configs/lithuanian.py +9 -3
  26. euroeval/dataset_configs/norwegian.py +0 -1
  27. euroeval/dataset_configs/polish.py +0 -1
  28. euroeval/dataset_configs/portuguese.py +0 -1
  29. euroeval/dataset_configs/slovak.py +60 -0
  30. euroeval/dataset_configs/spanish.py +0 -1
  31. euroeval/dataset_configs/swedish.py +10 -12
  32. euroeval/finetuning.py +21 -15
  33. euroeval/generation.py +10 -10
  34. euroeval/generation_utils.py +2 -3
  35. euroeval/logging_utils.py +250 -0
  36. euroeval/metrics/base.py +0 -3
  37. euroeval/metrics/huggingface.py +9 -5
  38. euroeval/metrics/llm_as_a_judge.py +5 -3
  39. euroeval/metrics/pipeline.py +17 -9
  40. euroeval/metrics/speed.py +0 -3
  41. euroeval/model_cache.py +11 -14
  42. euroeval/model_config.py +4 -5
  43. euroeval/model_loading.py +3 -0
  44. euroeval/prompt_templates/linguistic_acceptability.py +21 -3
  45. euroeval/prompt_templates/multiple_choice.py +25 -1
  46. euroeval/prompt_templates/named_entity_recognition.py +51 -11
  47. euroeval/prompt_templates/reading_comprehension.py +31 -3
  48. euroeval/prompt_templates/sentiment_classification.py +23 -1
  49. euroeval/prompt_templates/summarization.py +26 -6
  50. euroeval/scores.py +7 -7
  51. euroeval/speed_benchmark.py +3 -5
  52. euroeval/task_group_utils/multiple_choice_classification.py +0 -3
  53. euroeval/task_group_utils/question_answering.py +0 -3
  54. euroeval/task_group_utils/sequence_classification.py +43 -31
  55. euroeval/task_group_utils/text_to_text.py +17 -8
  56. euroeval/task_group_utils/token_classification.py +10 -9
  57. euroeval/tokenisation_utils.py +14 -12
  58. euroeval/utils.py +29 -146
  59. {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/METADATA +4 -4
  60. euroeval-16.4.0.dist-info/RECORD +75 -0
  61. euroeval-16.3.0.dist-info/RECORD +0 -71
  62. {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
  63. {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
  64. {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py

@@ -14,10 +14,9 @@ from time import sleep
 import torch
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
-from tqdm.auto import tqdm
-from transformers import MistralCommonTokenizer
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
+from transformers.tokenization_mistral_common import MistralCommonTokenizer
 from urllib3.exceptions import RequestError

 from ..constants import (
@@ -30,7 +29,7 @@ from ..constants import (
     REASONING_TOKENS,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
-from ..data_models import GenerativeModelOutput, ModelConfig
+from ..data_models import GenerativeModelOutput, HashableDict, ModelConfig
 from ..enums import (
     BatchingPreference,
     GenerativeType,
@@ -50,6 +49,7 @@ from ..generation_utils import (
     raise_if_wrong_params,
 )
 from ..languages import get_all_languages
+from ..logging_utils import get_pbar, log, log_once, no_terminal_output
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -73,7 +73,6 @@ from ..utils import (
     get_hf_token,
     get_min_cuda_compute_capability,
     internet_connection_available,
-    log_once,
     resolve_model_path,
     split_model_id,
 )
@@ -86,7 +85,7 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
         destroy_model_parallel,
     )
     from vllm.lora.request import LoRARequest
-    from vllm.sampling_params import GuidedDecodingParams
+    from vllm.sampling_params import StructuredOutputsParams

 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -95,8 +94,6 @@ if t.TYPE_CHECKING:

     from ..data_models import BenchmarkConfig, DatasetConfig, Task

-logger = logging.getLogger("euroeval")
-

 class VLLMModel(HuggingFaceEncoderModel):
     """A generative model using the vLLM inference framework."""
@@ -132,9 +129,10 @@ class VLLMModel(HuggingFaceEncoderModel):
             model_config=model_config, allowed_params=self.allowed_params
         )

-        model, tokeniser = load_model_and_tokeniser(
-            model_config=model_config, benchmark_config=benchmark_config
-        )
+        with no_terminal_output(disable=benchmark_config.verbose):
+            model, tokeniser = load_model_and_tokeniser(
+                model_config=model_config, benchmark_config=benchmark_config
+            )
         self._model: "LLM" = model
         self._tokeniser: "PreTrainedTokenizer" = tokeniser

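Model and tokeniser loading is now wrapped in `no_terminal_output(disable=benchmark_config.verbose)`, so vLLM's console output is suppressed unless the benchmark runs verbosely. The context manager itself lives in the new `euroeval/logging_utils.py`, which this diff does not show; the sketch below only illustrates the general pattern, not the actual implementation.

# Illustrative sketch only: a context manager that silences stdout/stderr unless
# `disable=True`. The real `no_terminal_output` in euroeval.logging_utils may work
# differently (e.g. also muting loggers or progress bars).
import contextlib
import io
import typing as t


@contextlib.contextmanager
def no_terminal_output(disable: bool = False) -> t.Iterator[None]:
    if disable:  # e.g. when the benchmark runs with the `verbose` flag
        yield
        return
    sink = io.StringIO()
    with contextlib.redirect_stdout(sink), contextlib.redirect_stderr(sink):
        yield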
@@ -245,6 +243,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    model_config=self.model_config,
                     first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
@@ -394,10 +393,11 @@ class VLLMModel(HuggingFaceEncoderModel):
             self.dataset_config.task.uses_structured_output
             or (self.dataset_config.task.uses_logprobs and self.dataset_config.labels)
         ) and self.generative_type == GenerativeType.REASONING:
-            guided_decoding = None
-            logger.debug(
+            structured_outputs = None
+            log(
                 "The dataset uses structured output, but we are not using it as the "
-                "model is a reasoning model."
+                "model is a reasoning model.",
+                level=logging.DEBUG,
             )
         elif self.dataset_config.task.uses_structured_output:
             ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
@@ -412,9 +412,11 @@ class VLLMModel(HuggingFaceEncoderModel):
                 f"{json.dumps(structured_generation_schema)}",
                 level=logging.DEBUG,
             )
-            guided_decoding = GuidedDecodingParams(json=structured_generation_schema)
+            structured_outputs = StructuredOutputsParams(
+                json=structured_generation_schema
+            )
         elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
-            guided_decoding = GuidedDecodingParams(
+            structured_outputs = StructuredOutputsParams(
                 choice=[
                     self.dataset_config.prompt_label_mapping[label]
                     for label in self.dataset_config.labels
@@ -422,11 +424,11 @@ class VLLMModel(HuggingFaceEncoderModel):
             )
             log_once(
                 "Using structured generation with the choices: "
-                f"{guided_decoding.choice!r}.",
+                f"{structured_outputs.choice!r}.",
                 level=logging.DEBUG,
             )
         else:
-            guided_decoding = None
+            structured_outputs = None
             log_once(
                 "Not using structured generation as the dataset does not require it.",
                 level=logging.DEBUG,
@@ -445,14 +447,14 @@ class VLLMModel(HuggingFaceEncoderModel):
             else None,
             temperature=0.0,
             stop=[stop_token for stop_token in stop_tokens if stop_token],
-            guided_decoding=guided_decoding,
+            structured_outputs=structured_outputs,
         )

         # If any of the prompts are empty then we need to replace them with a BOS token
         # so that the vLLM model can generate from them
         prompts: list[str] = inputs["text"]
         if any(len(prompt) == 0 for prompt in prompts):
-            logger.debug("Found empty prompts, replacing with BOS token.")
+            log("Found empty prompts, replacing with BOS token.", level=logging.DEBUG)
             prompts = [
                 prompt if len(prompt) > 0 else str(self._tokeniser.bos_token)
                 for prompt in prompts
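These hunks track vLLM's rename of its structured-decoding API: `GuidedDecodingParams` becomes `StructuredOutputsParams`, and the `guided_decoding=` argument of `SamplingParams` becomes `structured_outputs=`. A minimal sketch of the new plumbing, assuming a vLLM release that already exposes these names; the label values, schema, and model id below are placeholders, not values from EuroEval.

# Minimal sketch of the renamed vLLM structured-output API used above.
from vllm import LLM, SamplingParams
from vllm.sampling_params import StructuredOutputsParams

# Constrain generation to a fixed set of label strings (classification tasks) ...
label_constraint = StructuredOutputsParams(choice=["positive", "negative", "neutral"])
# ... or to a JSON schema (e.g. for named entity recognition); schema is a placeholder.
schema_constraint = StructuredOutputsParams(json={"type": "object", "properties": {}})

sampling_params = SamplingParams(
    max_tokens=32,
    temperature=0.0,
    structured_outputs=label_constraint,  # formerly `guided_decoding=...`
)
# llm = LLM(model="some/model-id")  # placeholder model id
# outputs = llm.generate(prompts=["The movie was great."], sampling_params=sampling_params)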
@@ -480,13 +482,14 @@ class VLLMModel(HuggingFaceEncoderModel):
                 raw_outputs = self._model.generate(
                     prompts=prompts,
                     sampling_params=sampling_params,
-                    use_tqdm=False if input_is_a_test else get_pbar_without_leave,
+                    use_tqdm=False if input_is_a_test else get_pbar,
                     lora_request=self.buffer.get("lora_request"),
                 )
                 break
             except TypeError as e:
-                logger.debug(
-                    f"Encountered error during vLLM generation: {str(e)}. Retrying..."
+                log(
+                    f"Encountered error during vLLM generation: {str(e)}. Retrying...",
+                    level=logging.DEBUG,
                 )
                 sleep(1)
             except ValueError as e:
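`LLM.generate` accepts a callable for `use_tqdm` that builds the progress bar, which is why a factory function can be passed here. The local `get_pbar_without_leave` helper (removed in the final hunk of this diff) is replaced by `get_pbar` from the new `euroeval/logging_utils.py`, whose implementation is not shown; a factory in the spirit of the removed helper would look roughly like this:

# Sketch of a tqdm factory suitable for vLLM's `use_tqdm` argument; it mirrors the
# removed `get_pbar_without_leave` helper rather than the new (unshown) `get_pbar`.
from tqdm.auto import tqdm


def transient_pbar(*args, **kwargs) -> tqdm:
    kwargs.pop("leave", None)  # drop any conflicting value before forcing leave=False
    return tqdm(*args, leave=False, **kwargs)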
@@ -498,10 +501,11 @@ class VLLMModel(HuggingFaceEncoderModel):
                     re.search(pattern, str(e), flags=re.IGNORECASE) is not None
                     for pattern in truncate_error_messages
                 ):
-                    logger.info(
-                        "Prompts are too long, so truncating them and trying again..."
+                    log(
+                        "Prompts are too long, so truncating them and trying again...",
+                        level=logging.WARNING,
                     )
-                    logger.debug(f"The error message was: {str(e)}")
+                    log(f"The error message was: {str(e)}", level=logging.DEBUG)

                     # If we have already tried truncating the prompts a few times, then
                     # we truncate a bit more aggressively
@@ -544,15 +548,16 @@ class VLLMModel(HuggingFaceEncoderModel):
                     f"{num_extra_outputs!r} extra outputs."
                 )
             else:
-                logger.debug(
+                log(
                     f"Filtered out {num_extra_outputs:,} extra outputs from the model, "
                     "which occured as we interupted the generation when we truncated "
-                    "the prompts."
+                    "the prompts.",
+                    level=logging.DEBUG,
                 )

         # Parse the raw model outputs
         completion_ids: list[list[int]] = [
-            output.outputs[0].token_ids for output in raw_outputs
+            list(output.outputs[0].token_ids) for output in raw_outputs
         ]
         completions = self._tokeniser.batch_decode(
             sequences=[
@@ -563,30 +568,29 @@ class VLLMModel(HuggingFaceEncoderModel):
             self.end_of_reasoning_token is not None
             and self.generative_type == GenerativeType.REASONING
         ):
+            num_samples_without_eor_token = 0
             for idx in range(len(completions)):
                 if self.end_of_reasoning_token in completions[idx]:
                     completions[idx] = completions[idx].split(
                         self.end_of_reasoning_token
                     )[-1]
-                elif self.benchmark_config.verbose:
-                    logger.warning(
-                        f"The model {self.model_config.model_id!r} is a reasoning "
-                        "model, but the generated output does not contain the end of "
-                        f"reasoning token ({self.end_of_reasoning_token!r}). Using "
-                        "an empty string as the prediction instead."
-                    )
-                    completions[idx] = ""
                 else:
-                    log_once(
-                        f"The model {self.model_config.model_id!r} is a reasoning "
-                        "model, but the generated output does not contain the end of "
-                        f"reasoning token ({self.end_of_reasoning_token!r}). Using "
-                        "an empty string as the prediction instead. Only showing "
-                        "this warning once - see all occurrences if you run with the "
-                        "`verbose` flag.",
-                        level=logging.WARNING,
-                    )
+                    num_samples_without_eor_token += 1
                     completions[idx] = ""
+            if num_samples_without_eor_token > 0:
+                log_once(
+                    f"The model {self.model_config.model_id!r} is a reasoning "
+                    "model, but the generated output did not contain the end of "
+                    f"reasoning token ({self.end_of_reasoning_token!r}) in "
+                    f"{num_samples_without_eor_token:,}/{len(completions):,} of "
+                    "the samples. Using an empty string for all these samples "
+                    "instead.",
+                    level=(
+                        logging.WARNING
+                        if num_samples_without_eor_token / len(completions) > 0.5
+                        else logging.DEBUG
+                    ),
+                )
         stop_token_pattern = re.compile(
             "|".join(re.escape(stop_token) for stop_token in stop_tokens)
         )
@@ -607,10 +611,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         scores: list[list[list[tuple[str, float]]]] = [
             [
                 [
-                    (obj.decoded_token, obj.logprob)
+                    (obj.decoded_token or "", obj.logprob)
                     for obj in token_logprobs_dict.values()
                 ]
-                for token_logprobs_dict in raw_output.outputs[0].logprobs
+                for token_logprobs_dict in raw_output.outputs[0].logprobs or list()
             ]
             for raw_output in raw_outputs
         ]
@@ -648,7 +652,13 @@ class VLLMModel(HuggingFaceEncoderModel):
         revision = model_id_components.revision

         model_info = get_model_repo_info(
-            model_id=model_id, revision=revision, benchmark_config=benchmark_config
+            model_id=model_id,
+            revision=revision,
+            api_key=benchmark_config.api_key,
+            cache_dir=benchmark_config.cache_dir,
+            trust_remote_code=benchmark_config.trust_remote_code,
+            requires_safetensors=benchmark_config.requires_safetensors,
+            run_with_cli=benchmark_config.run_with_cli,
         )
         return (
             model_info is not None
@@ -674,7 +684,11 @@ class VLLMModel(HuggingFaceEncoderModel):
         model_info = get_model_repo_info(
             model_id=model_id_components.model_id,
             revision=model_id_components.revision,
-            benchmark_config=benchmark_config,
+            api_key=benchmark_config.api_key,
+            cache_dir=benchmark_config.cache_dir,
+            trust_remote_code=benchmark_config.trust_remote_code,
+            requires_safetensors=benchmark_config.requires_safetensors,
+            run_with_cli=benchmark_config.run_with_cli,
         )
         if model_info is None:
             raise InvalidModel(f"The model {model_id!r} could not be found.")
@@ -751,8 +765,8 @@ def load_model_and_tokeniser(
     hf_model_config = load_hf_model_config(
         model_id=model_id,
         num_labels=0,
-        id2label=dict(),
-        label2id=dict(),
+        id2label=HashableDict(),
+        label2id=HashableDict(),
         revision=revision,
         model_cache_dir=model_config.model_cache_dir,
         api_key=benchmark_config.api_key,
@@ -779,32 +793,36 @@ def load_model_and_tokeniser(
     # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
     if hf_model_config.dtype == torch.float32:
         if torch.cuda.is_bf16_supported():
-            logger.info(
+            log(
                 "You are loading a model with dtype FP32, which we will convert to "
                 "BF16 as FP32 is not supported by vLLM and BF16 is supported by your "
-                "GPU."
+                "GPU.",
+                level=logging.WARNING,
             )
             dtype = torch.bfloat16
         else:
-            logger.info(
+            log(
                 "You are loading a model with dtype FP32, which we will convert to "
                 "FP16 as FP32 is not supported by vLLM and BF16 is not supported by "
-                "your GPU."
+                "your GPU.",
+                level=logging.WARNING,
             )
             dtype = torch.float16

     # If the model is a quantized model, we might need to change the dtype
     if quantization == "mxfp4" and hf_model_config.dtype is None:
         dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
-        logger.debug(
+        log(
             "You are loading a quantized model where `dtype` has not been set. "
-            f"Setting dtype to {dtype!r}."
+            f"Setting dtype to {dtype!r}.",
+            level=logging.DEBUG,
         )
     elif quantization is not None and hf_model_config.dtype != torch.float16:
-        logger.info(
+        log(
             "You are loading a quantized model with dtype "
             f"{hf_model_config.dtype}, which vLLM does not support. Setting "
-            "dtype to float16 instead."
+            "dtype to float16 instead.",
+            level=logging.WARNING,
         )
         dtype = torch.float16

@@ -815,12 +833,13 @@ def load_model_and_tokeniser(

     if min_cuda_compute_capability is not None:
         if min_cuda_compute_capability < required_capability:
-            logger.info(
+            log(
                 f"You are loading a model with dtype {hf_model_config.dtype}, "
                 "which vLLM only supports for CUDA devices with CUDA compute "
                 f"capability >={required_capability}. You are using one or more "
                 f"devices with compute capability {min_cuda_compute_capability}. "
-                "Setting dtype to float16 instead."
+                "Setting dtype to float16 instead.",
+                level=logging.WARNING,
             )
             dtype = torch.float16

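Taken together, the dtype handling in the two hunks above reduces to a small decision rule: FP32 checkpoints are downcast to BF16 when the GPU supports it and to FP16 otherwise, quantized checkpoints are forced onto a dtype vLLM accepts, and BF16 is further clamped to FP16 on GPUs below the required compute capability. A condensed restatement, not the actual EuroEval code (`resolve_vllm_dtype` and its arguments are illustrative):

# Condensed, illustrative restatement of the dtype fallbacks above.
import torch


def resolve_vllm_dtype(
    config_dtype: "torch.dtype | None", quantization: "str | None"
) -> "torch.dtype | None":
    bf16_ok = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    if config_dtype == torch.float32:
        # vLLM does not run FP32 models, so fall back to BF16 or FP16.
        return torch.bfloat16 if bf16_ok else torch.float16
    if quantization == "mxfp4" and config_dtype is None:
        # Quantized model without an explicit dtype: pick the best supported one.
        return torch.bfloat16 if bf16_ok else torch.float16
    if quantization is not None and config_dtype != torch.float16:
        # Other quantized models must run in FP16 under vLLM.
        return torch.float16
    # (The real code additionally clamps BF16 to FP16 on GPUs whose CUDA compute
    # capability is below VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY.)
    return config_dtype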
@@ -987,13 +1006,17 @@ def load_tokeniser(
                     f"Could not load tokeniser for model {model_id!r}. The error was "
                     f"{str(e)}."
                 ) from e
-            logger.debug(
+            log(
                 f"Could not load tokeniser for {model_id!r}. Falling back to "
-                f"{adapter_base_model_id!r}."
+                f"{adapter_base_model_id!r}.",
+                level=logging.DEBUG,
             )
             model_id = adapter_base_model_id
         except (TimeoutError, RequestError):
-            logger.info(f"Couldn't load tokeniser for {model_id!r}. Retrying.")
+            log(
+                f"Couldn't load tokeniser for {model_id!r}. Retrying.",
+                level=logging.WARNING,
+            )
             sleep(5)
             continue
         except (KeyError, ValueError) as e:
@@ -1192,32 +1215,17 @@ def get_custom_stop_tokens(
         if stop_token in prompt or stop_token in completion
     ]
     if stop_tokens:
-        logger.debug(
+        log(
             f"Found the following custom stop tokens for model {model_id!r}: "
-            f"{stop_tokens}."
+            f"{stop_tokens}.",
+            level=logging.DEBUG,
         )
     else:
-        logger.debug(f"Found no custom stop tokens for model {model_id!r}.")
+        log(f"Found no custom stop tokens for model {model_id!r}.", level=logging.DEBUG)

     return stop_tokens


-def get_pbar_without_leave(*tqdm_args, **tqdm_kwargs) -> tqdm:
-    """Get a progress bar for vLLM which disappears after completion.
-
-    Args:
-        *tqdm_args:
-            Positional arguments to pass to tqdm.
-        **tqdm_kwargs:
-            Additional keyword arguments to pass to tqdm.
-
-    Returns:
-        A tqdm progress bar.
-    """
-    tqdm_kwargs.pop("leave", None)  # Remove the 'leave' key if it exists
-    return tqdm(*tqdm_args, leave=False, **tqdm_kwargs)
-
-
 def get_vllm_tokenisation_params(
     tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
 ) -> dict[str, t.Any]: