EuroEval 15.4.2-py3-none-any.whl → 15.6.0-py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries, and is provided for informational purposes only.


Files changed (54)
  1. euroeval/__init__.py +2 -2
  2. euroeval/benchmark_modules/base.py +3 -2
  3. euroeval/benchmark_modules/fresh.py +8 -6
  4. euroeval/benchmark_modules/hf.py +44 -33
  5. euroeval/benchmark_modules/litellm.py +314 -120
  6. euroeval/benchmark_modules/vllm.py +99 -59
  7. euroeval/benchmarker.py +52 -21
  8. euroeval/callbacks.py +2 -2
  9. euroeval/constants.py +9 -2
  10. euroeval/data_models.py +258 -44
  11. euroeval/dataset_configs/__init__.py +61 -0
  12. euroeval/dataset_configs/danish.py +120 -0
  13. euroeval/dataset_configs/dutch.py +123 -0
  14. euroeval/dataset_configs/english.py +88 -0
  15. euroeval/dataset_configs/faroese.py +53 -0
  16. euroeval/dataset_configs/french.py +83 -0
  17. euroeval/dataset_configs/german.py +91 -0
  18. euroeval/dataset_configs/icelandic.py +148 -0
  19. euroeval/dataset_configs/italian.py +81 -0
  20. euroeval/dataset_configs/norwegian.py +178 -0
  21. euroeval/dataset_configs/spanish.py +78 -0
  22. euroeval/dataset_configs/swedish.py +100 -0
  23. euroeval/exceptions.py +10 -10
  24. euroeval/finetuning.py +6 -10
  25. euroeval/generation.py +1 -0
  26. euroeval/human_evaluation.py +2 -2
  27. euroeval/languages.py +20 -13
  28. euroeval/model_cache.py +1 -1
  29. euroeval/model_loading.py +1 -12
  30. euroeval/prompt_templates/__init__.py +8 -0
  31. euroeval/prompt_templates/linguistic_acceptability.py +112 -0
  32. euroeval/prompt_templates/multiple_choice.py +97 -0
  33. euroeval/prompt_templates/named_entity_recognition.py +257 -0
  34. euroeval/prompt_templates/reading_comprehension.py +118 -0
  35. euroeval/prompt_templates/sentiment_classification.py +137 -0
  36. euroeval/prompt_templates/summarization.py +97 -0
  37. euroeval/speed_benchmark.py +1 -1
  38. euroeval/{task_utils → task_group_utils}/multiple_choice_classification.py +19 -11
  39. euroeval/{task_utils → task_group_utils}/question_answering.py +31 -30
  40. euroeval/{task_utils → task_group_utils}/sequence_classification.py +45 -10
  41. euroeval/{task_utils → task_group_utils}/text_to_text.py +1 -1
  42. euroeval/{task_utils → task_group_utils}/token_classification.py +3 -2
  43. euroeval/tasks.py +54 -0
  44. euroeval/tokenization_utils.py +343 -0
  45. euroeval/types.py +3 -1
  46. euroeval/utils.py +5 -254
  47. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/METADATA +31 -9
  48. euroeval-15.6.0.dist-info/RECORD +59 -0
  49. euroeval/dataset_configs.py +0 -2408
  50. euroeval-15.4.2.dist-info/RECORD +0 -40
  51. /euroeval/{task_utils → task_group_utils}/__init__.py +0 -0
  52. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/WHEEL +0 -0
  53. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/entry_points.txt +0 -0
  54. {euroeval-15.4.2.dist-info → euroeval-15.6.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py CHANGED
@@ -1,6 +1,7 @@
  """Generative models using the vLLM inference framework."""

  import collections.abc as c
+ import contextlib
  import importlib.util
  import itertools as it
  import json
@@ -20,15 +21,18 @@ from datasets import DatasetDict
  from huggingface_hub import snapshot_download
  from pydantic import conlist, create_model
  from tqdm.auto import tqdm
- from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer, Trainer
+ from transformers.models.auto.configuration_auto import AutoConfig
+ from transformers.models.auto.tokenization_auto import AutoTokenizer
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.trainer import Trainer
  from urllib3.exceptions import RequestError

  from ..constants import (
      GENERATIVE_PIPELINE_TAGS,
+     MAX_CONTEXT_LENGTH,
      MAX_LOGPROBS,
      MERGE_TAGS,
      REASONING_MAX_TOKENS,
-     TASK_GROUPS_USING_LOGPROBS,
      TASKS_USING_JSON,
      VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
  )
@@ -53,39 +57,39 @@ from ..exceptions import (
      NeedsExtraInstalled,
  )
  from ..languages import get_all_languages
- from ..task_utils import (
+ from ..task_group_utils import (
      question_answering,
      sequence_classification,
      text_to_text,
      token_classification,
  )
+ from ..tokenization_utils import (
+     get_bos_token,
+     get_end_of_chat_token_ids,
+     get_eos_token,
+     get_first_label_token_mapping,
+     should_prompts_be_stripped,
+ )
  from ..types import ExtractLabelsFunction
  from ..utils import (
      clear_memory,
      create_model_cache_dir,
-     get_bos_token,
-     get_end_of_chat_token_ids,
-     get_eos_token,
      get_min_cuda_compute_capability,
      log_once,
-     should_prompts_be_stripped,
  )
  from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config

  if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
      from vllm import LLM, RequestOutput, SamplingParams
+     from vllm.distributed.parallel_state import (
+         destroy_distributed_environment,
+         destroy_model_parallel,
+     )
      from vllm.lora.request import LoRARequest

-     try:
-         from vllm.model_executor.parallel_utils.parallel_state import (
-             destroy_model_parallel,
-         )
-     except ImportError:
-         from vllm.distributed.parallel_state import destroy_model_parallel
-
  if t.TYPE_CHECKING or importlib.util.find_spec("outlines") is not None:
      from outlines.models.vllm import adapt_tokenizer
-     from outlines.processors import JSONLogitsProcessor
+     from outlines.processors.structured import JSONLogitsProcessor

  if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
      import ray
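Note: the tokenization helpers now live in the new euroeval/tokenization_utils.py module, and the task utilities moved from euroeval/task_utils/ to euroeval/task_group_utils/. A hedged sketch of the corresponding absolute imports for code outside the package, assuming these helpers are imported directly at all:

    # Before 15.6.0 the helpers lived in euroeval.utils / euroeval.task_utils:
    # from euroeval.utils import get_bos_token, get_eos_token
    # from euroeval.task_utils import sequence_classification

    # From 15.6.0 onwards:
    from euroeval.tokenization_utils import get_bos_token, get_eos_token
    from euroeval.task_group_utils import sequence_classification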
@@ -122,11 +126,8 @@ class VLLMModel(HuggingFaceEncoderModel):
          ):
              raise NeedsExtraInstalled(extra="generative")

-         output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
          model, tokenizer = load_model_and_tokenizer(
-             model_config=model_config,
-             benchmark_config=benchmark_config,
-             output_scores=output_scores,
+             model_config=model_config, benchmark_config=benchmark_config
          )
          self._model: LLM = model
          self._tokenizer: PreTrainedTokenizer = tokenizer
@@ -142,8 +143,12 @@ class VLLMModel(HuggingFaceEncoderModel):
              benchmark_config=benchmark_config,
          )

-         self.buffer["output_scores"] = output_scores
-         self.buffer["instruction_model"] = self._tokenizer.chat_template is not None
+         self.buffer |= dict(
+             instruction_model=self._tokenizer.chat_template is not None,
+             first_label_token_mapping=get_first_label_token_mapping(
+                 dataset_config=self.dataset_config, tokenizer=self._tokenizer
+             ),
+         )
          if self.model_config.adapter_base_model_id is not None:
              adapter_path = snapshot_download(
                  repo_id=self.model_config.model_id,
@@ -154,6 +159,14 @@ class VLLMModel(HuggingFaceEncoderModel):
              lora_name="adapter", lora_int_id=1, lora_path=adapter_path
          )

+     def __del__(self) -> None:
+         """Clean up the model and tokenizer."""
+         clear_vllm()
+         if hasattr(self, "_model"):
+             del self._model
+         if hasattr(self, "_tokenizer"):
+             del self._tokenizer
+
      @property
      def generative_type(self) -> GenerativeType | None:
          """Get the generative type of the model.
@@ -185,6 +198,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                  return partial(
                      sequence_classification.extract_labels_from_generation,
                      dataset_config=self.dataset_config,
+                     first_label_token_mapping=self.buffer["first_label_token_mapping"],
                  )
              case TaskGroup.TEXT_TO_TEXT:
                  return text_to_text.extract_labels_from_generation
@@ -327,7 +341,7 @@ class VLLMModel(HuggingFaceEncoderModel):
          pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
          logits_processor = JSONLogitsProcessor(
              schema=pydantic_class,
-             tokenizer=adapt_tokenizer(tokenizer=self._tokenizer), #  type: ignore
+             tokenizer=adapt_tokenizer(tokenizer=self._tokenizer), # type: ignore
              whitespace_pattern=r" ?",
          )
          log_once(
@@ -338,6 +352,12 @@ class VLLMModel(HuggingFaceEncoderModel):
          else:
              logits_processor = None

+         # Get the mapping from labels to the first token in the label. We call this each
+         # time we generate a new dataset since the dataset config can change
+         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
+             dataset_config=self.dataset_config, tokenizer=self._tokenizer
+         )
+
          # Define the parameters used for vLLM generation
          max_tokens: int = (
              REASONING_MAX_TOKENS
@@ -346,7 +366,7 @@ class VLLMModel(HuggingFaceEncoderModel):
          )
          sampling_params = SamplingParams(
              max_tokens=max_tokens,
-             logprobs=MAX_LOGPROBS if self.buffer["output_scores"] else None,
+             logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
              temperature=0.0,
              stop=[stop_token for stop_token in stop_tokens if stop_token],
              logits_processors=[logits_processor] if logits_processor else None,
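The first-label-token mapping ties each label of a classification dataset to the first token its tokenizer produces, which is what makes it possible to read the predicted label off the top logprobs of the first generated token; logprobs are only requested from vLLM when such a mapping exists. A minimal sketch of the idea, with a made-up mapping and helper (the actual structure returned by get_first_label_token_mapping may differ):

    # Hypothetical mapping for a sentiment task; the exact sub-tokens depend on the tokenizer.
    first_label_token_mapping = {"positive": "pos", "negative": "neg", "neutral": "neut"}

    def extract_label(top_logprobs: dict[str, float]) -> str | None:
        """Pick the label whose first token ranks highest among the returned logprobs."""
        for token, _ in sorted(top_logprobs.items(), key=lambda kv: -kv[1]):
            for label, first_token in first_label_token_mapping.items():
                if token.strip().lower().startswith(first_token):
                    return label
        return None

    print(extract_label({"pos": -0.1, "neg": -2.3, "the": -4.0}))  # -> "positive"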
@@ -416,7 +436,7 @@ class VLLMModel(HuggingFaceEncoderModel):
          completions = [completion.strip() for completion in completions]

          # Add logprobs scores to the output
-         if self.buffer["output_scores"]:
+         if self.buffer["first_label_token_mapping"]:
              scores: list[list[list[tuple[str, float]]]] = [
                  [
                      [
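For reference, the triple-nested scores type reads as: outer list over completions, middle list over generated token positions, inner list of (token, logprob) candidates for that position. A small illustrative value under that assumption:

    # scores[i][j] holds the top candidate tokens for the j-th generated
    # token of the i-th completion, together with their log-probabilities.
    scores_example: list[list[list[tuple[str, float]]]] = [
        [  # completion 0
            [("pos", -0.11), ("neg", -2.31), ("neut", -3.05)],  # first generated token
        ],
    ]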
@@ -846,7 +866,7 @@ class VLLMModel(HuggingFaceEncoderModel):


  def load_model_and_tokenizer(
-     model_config: ModelConfig, benchmark_config: BenchmarkConfig, output_scores: bool
+     model_config: ModelConfig, benchmark_config: BenchmarkConfig
  ) -> "tuple[LLM, PreTrainedTokenizer]":
      """Load the model and tokenizer.

@@ -855,11 +875,9 @@ def load_model_and_tokenizer(
          The model configuration.
      benchmark_config:
          The benchmark configuration.
-     output_scores:
-         Whether to output scores.

      Returns:
-         The loaded model and tokenizer.
+         A pair (model, tokenizer), with the loaded model and tokenizer
      """
      # Prefer base model ID if the model is an adapter - the adapter will be added on
      # during inference in this case
@@ -893,7 +911,27 @@ def load_model_and_tokenizer(
      if quantization == "awq" and importlib.util.find_spec("awq") is None:
          raise NeedsExtraInstalled(extra="quantization")

+     # Start with dtype being the "auto" vLLM dtype
      dtype: str | torch.dtype = "auto"
+
+     # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
+     if hf_model_config.torch_dtype == torch.float32:
+         if torch.cuda.is_bf16_supported():
+             logger.info(
+                 "You are loading a model with dtype FP32, which we will convert to "
+                 "BF16 as FP32 is not supported by vLLM and BF16 is supported by your "
+                 "GPU."
+             )
+             dtype = torch.bfloat16
+         else:
+             logger.info(
+                 "You are loading a model with dtype FP32, which we will convert to "
+                 "FP16 as FP32 is not supported by vLLM and BF16 is not supported by "
+                 "your GPU."
+             )
+             dtype = torch.float16
+
+     # If the model is a quantized model, we need to set the dtype to float16
      if quantization is not None and hf_model_config.torch_dtype != torch.float16:
          logger.info(
              "You are loading a quantized model with dtype "
@@ -902,6 +940,7 @@ def load_model_and_tokenizer(
          )
          dtype = torch.float16

+     # If the model is a bf16 model, we need to check the CUDA compute capability
      if hf_model_config.torch_dtype == torch.bfloat16:
          min_cuda_compute_capability = get_min_cuda_compute_capability()
          required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
@@ -940,29 +979,38 @@ def load_model_and_tokenizer(
      if len(true_max_model_len_candidates) > 0:
          true_max_model_len = min(true_max_model_len_candidates)
      else:
-         true_max_model_len = 5_000
+         true_max_model_len = MAX_CONTEXT_LENGTH

-     clear_vllm()
+     tokenizer = load_tokenizer(
+         model_id=model_config.model_id,
+         revision=model_config.revision,
+         adapter_base_model_id=model_config.adapter_base_model_id,
+         trust_remote_code=benchmark_config.trust_remote_code,
+         model_max_length=true_max_model_len,
+         model_cache_dir=model_config.model_cache_dir,
+         token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+     )

-     executor_backend = "ray" if torch.cuda.device_count() > 1 else "mp"
+     clear_vllm()

      try:
          model = LLM(
              model=model_id,
              tokenizer=model_id,
-             gpu_memory_utilization=0.95,
-             max_model_len=min(true_max_model_len, 5_000),
+             gpu_memory_utilization=0.9,
+             max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
              download_dir=download_dir,
              trust_remote_code=benchmark_config.trust_remote_code,
              revision=revision,
              seed=4242,
-             distributed_executor_backend=executor_backend,
+             distributed_executor_backend=(
+                 "ray" if torch.cuda.device_count() > 1 else "mp"
+             ),
              tensor_parallel_size=torch.cuda.device_count(),
              disable_custom_all_reduce=True,
              quantization=quantization,
              dtype=dtype,
              enforce_eager=True,
-             max_logprobs=MAX_LOGPROBS if output_scores else None,
              # TEMP: Prefix caching isn't supported with sliding window in vLLM yet,
              # so we disable it for now
              enable_prefix_caching=False,
@@ -988,16 +1036,6 @@ def load_model_and_tokenizer(
      model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
      model.config = hf_model_config

-     tokenizer = load_tokenizer(
-         model_id=model_config.model_id,
-         revision=model_config.revision,
-         adapter_base_model_id=model_config.adapter_base_model_id,
-         trust_remote_code=benchmark_config.trust_remote_code,
-         model_max_length=true_max_model_len,
-         model_cache_dir=model_config.model_cache_dir,
-         token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
-     )
-
      return model, tokenizer


@@ -1118,13 +1156,16 @@ def _run_engine_with_fixed_progress_bars(

  def clear_vllm() -> None:
      """Clear the GPU memory used by the vLLM model, enabling re-initialisation."""
-     try:
+     with contextlib.suppress(ValueError):
          destroy_model_parallel()
-     except ImportError:
-         pass
-     clear_memory()
+         destroy_distributed_environment()
+     if ray.is_initialized():
+         ray.shutdown()
+     with contextlib.suppress(AssertionError):
+         torch.distributed.destroy_process_group()
      if ray.is_initialized():
          ray.shutdown()
+     clear_memory()


  def get_end_of_reasoning_token_id(
@@ -1148,24 +1189,23 @@ def get_end_of_reasoning_token_id(
      if tokenizer.chat_template is None:
          prompt = "What is your name?"
      else:
-         prompt = tokenizer.apply_chat_template(
+         templated_prompt = tokenizer.apply_chat_template(
              conversation=[dict(role="user", content="What is your name?")],
              add_generation_prompt=True,
              tokenize=False,
          )
-         assert isinstance(prompt, str)
+         assert isinstance(templated_prompt, str)
+         prompt = templated_prompt

      # Generate a completion and remove the BOS token from it, to not confuse it with the
      # potential reasoning token
-     completion = (
-         model.generate(
-             prompts=[prompt],
-             sampling_params=SamplingParams(max_tokens=3, temperature=0.0),
-             use_tqdm=False,
-         )[0]
-         .outputs[0]
-         .text
+     model_output = model.generate(
+         prompts=[prompt],
+         sampling_params=SamplingParams(max_tokens=3, temperature=0.0),
+         use_tqdm=False,
      )
+     completion = model_output[0].outputs[0].text
+
      if tokenizer.bos_token is not None:
          if isinstance(tokenizer.bos_token, str):
              prompt = prompt.replace(tokenizer.bos_token, "").strip()
euroeval/benchmarker.py CHANGED
@@ -1,5 +1,6 @@
  """Class that benchmarks language models."""

+ import contextlib
  import json
  import logging
  import re
@@ -13,7 +14,7 @@ from time import sleep
  from torch.distributed import destroy_process_group

  from .benchmark_config_factory import build_benchmark_config
- from .constants import GENERATIVE_PIPELINE_TAGS
+ from .constants import GENERATIVE_DATASET_TASK_GROUPS, GENERATIVE_PIPELINE_TAGS
  from .data_loading import load_data
  from .data_models import BenchmarkConfigParams, BenchmarkResult
  from .dataset_configs import get_all_dataset_configs
@@ -366,14 +367,18 @@ class Benchmarker:
              dataset_names=benchmark_config.datasets
          )

+         total_benchmarks = len(model_ids) * len(dataset_configs)
+         num_finished_benchmarks = 0
+
          current_benchmark_results: list[BenchmarkResult] = list()
-         for m_id in model_ids:
+         for model_id in model_ids:
              try:
                  model_config = get_model_config(
-                     model_id=m_id, benchmark_config=benchmark_config
+                     model_id=model_id, benchmark_config=benchmark_config
                  )
              except InvalidModel as e:
                  logger.info(e.message)
+                 num_finished_benchmarks += len(dataset_configs)
                  continue

              loaded_model: BenchmarkModule | None = None
@@ -381,21 +386,35 @@ class Benchmarker:
                  # Skip if we have already benchmarked this model on this dataset and
                  # we are not forcing the benchmark
                  if not benchmark_config.force and model_has_been_benchmarked(
-                     model_id=m_id,
+                     model_id=model_id,
                      dataset=dataset_config.name,
                      few_shot=benchmark_config.few_shot,
                      validation_split=not benchmark_config.evaluate_test_split,
                      benchmark_results=self.benchmark_results,
                  ):
                      logger.debug(
-                         f"Skipping benchmarking {m_id} on {dataset_config.pretty_name},"
-                         " as it has already been benchmarked."
+                         f"Skipping benchmarking {model_id} on "
+                         f"{dataset_config.pretty_name}, as it "
+                         "has already been benchmarked."
+                     )
+                     num_finished_benchmarks += 1
+                     continue
+
+                 # Skip if the model is an encoder model and the task is generative
+                 task_is_generative = (
+                     dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
+                 )
+                 if model_config.model_type == ModelType.ENCODER and task_is_generative:
+                     logger.debug(
+                         f"Skipping benchmarking {model_id} on "
+                         f"{dataset_config.pretty_name}, as it is an encoder model and "
+                         "the task is generative."
                      )
                      continue

                  # We do not re-initialise generative models as their architecture is not
                  # customised to specific datasets
-                 if model_config.task in GENERATIVE_PIPELINE_TAGS:
+                 if model_config.model_type == ModelType.GENERATIVE:
                      initial_logging(
                          model_config=model_config,
                          dataset_config=dataset_config,
@@ -413,6 +432,15 @@ class Benchmarker:
                          if benchmark_config.raise_errors:
                              raise e
                          logger.info(e.message)
+
+                         # Add the remaining number of benchmarks for the model to
+                         # our benchmark counter, since we're skipping the rest of
+                         # them
+                         num_finished_benchmarks += (
+                             len(dataset_configs)
+                             - dataset_configs.index(dataset_config)
+                             - 1
+                         )
                          break
                  else:
                      loaded_model.dataset_config = dataset_config
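A quick worked example of the skip arithmetic above, with assumed numbers: with five datasets and a model that fails while on the third one, the two datasets that would have followed are added to the counter, i.e. 5 - 2 - 1 = 2:

    dataset_configs = ["ds-a", "ds-b", "ds-c", "ds-d", "ds-e"]  # hypothetical names
    dataset_config = "ds-c"  # the dataset the model failed on
    remaining = len(dataset_configs) - dataset_configs.index(dataset_config) - 1
    assert remaining == 2  # "ds-d" and "ds-e" are skipped for this model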
@@ -432,27 +460,33 @@ class Benchmarker:
                          raise benchmark_output_or_err

                  elif isinstance(benchmark_output_or_err, InvalidBenchmark):
-                     if benchmark_config.raise_errors:
-                         raise benchmark_output_or_err
-                     logger.info(
-                         f"{m_id} could not be benchmarked on "
-                         f"{dataset_config.pretty_name}. Skipping. The error message "
-                         f"raised was {benchmark_output_or_err.message!r}."
-                     )
+                     logger.info(benchmark_output_or_err.message)
+                     num_finished_benchmarks += 1
                      continue

                  elif isinstance(benchmark_output_or_err, InvalidModel):
-                     if benchmark_config.raise_errors:
-                         raise benchmark_output_or_err
                      logger.info(benchmark_output_or_err.message)
+
+                     # Add the remaining number of benchmarks for the model to our
+                     # benchmark counter, since we're skipping the rest of them
+                     num_finished_benchmarks += (
+                         len(dataset_configs) - dataset_configs.index(dataset_config) - 1
+                     )
                      break

                  else:
-                     record = benchmark_output_or_err
+                     record: BenchmarkResult = benchmark_output_or_err
                      current_benchmark_results.append(record)
                      if benchmark_config.save_results:
                          record.append_to_results(results_path=self.results_path)

+                 num_finished_benchmarks += 1
+                 logger.info(
+                     f"Finished {num_finished_benchmarks} out of "
+                     f"{total_benchmarks} benchmarks."
+                 )
+
+             del loaded_model
              if benchmark_config.clear_model_cache:
                  clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)

@@ -464,11 +498,8 @@ class Benchmarker:
          # point and block the progress of another member of the process group. This
          # constraint has always been present, but this warning has only been added
          # since PyTorch 2.4 (function operator())
-         try:
+         with contextlib.suppress(AssertionError):
              destroy_process_group()
-         except AssertionError:
-             pass
-
          return current_benchmark_results

      def _get_updated_benchmark_config(
euroeval/callbacks.py CHANGED
@@ -5,8 +5,8 @@ from collections.abc import Sized

  from torch.utils.data import DataLoader
  from tqdm.auto import tqdm
- from transformers import TrainerControl, TrainerState, TrainingArguments
- from transformers.trainer_callback import ProgressCallback
+ from transformers.trainer_callback import ProgressCallback, TrainerControl, TrainerState
+ from transformers.training_args import TrainingArguments


  class NeverLeaveProgressCallback(ProgressCallback):
euroeval/constants.py CHANGED
@@ -7,6 +7,13 @@ from .tasks import NER
  DUMMY_FILL_VALUE = 100


+ # This is the maximum allowed context length for models for the purpose of this
+ # benchmark. We will still report the models' true maximum context length in the
+ # metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
+ # all tokens in the context.
+ MAX_CONTEXT_LENGTH = 5_000
+
+
  # We need to raise the amount of tokens generated for reasoning models, to give them
  # time to think
  REASONING_MAX_TOKENS = 8_192
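This cap is what the vllm.py change above feeds into max_model_len, so evaluation never allocates KV-cache memory beyond 5,000 tokens regardless of the model's advertised window. A small sketch of how the cap combines with a model's native length (the native value is just an assumed example):

    MAX_CONTEXT_LENGTH = 5_000

    native_max_len = 131_072  # assumed: a long-context model's advertised window
    max_model_len = min(native_max_len, MAX_CONTEXT_LENGTH)
    assert max_model_len == 5_000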
@@ -44,10 +51,10 @@ TASK_GROUPS_USING_LOGPROBS = [

  # The number of top log probabilities to return for generative models. For several APIs
  # this is the maximum number of log probabilities that can be returned
- MAX_LOGPROBS = 10
+ MAX_LOGPROBS = 8


- # We make sure to remove these metric attributed after each iteration, to avoid memory
+ # We make sure to remove these metric attributes after each iteration, to avoid memory
  # leaks
  METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]