EuroEval 15.4.1__py3-none-any.whl → 15.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of EuroEval has been flagged as potentially problematic.

@@ -25,11 +25,12 @@ from urllib3.exceptions import RequestError
 
 from ..constants import (
     GENERATIVE_PIPELINE_TAGS,
+    MAX_CONTEXT_LENGTH,
     MAX_LOGPROBS,
     MERGE_TAGS,
     REASONING_MAX_TOKENS,
-    TASK_GROUPS_USING_LOGPROBS,
     TASKS_USING_JSON,
+    VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
 from ..data_models import (
     BenchmarkConfig,
@@ -65,6 +66,8 @@ from ..utils import (
     get_bos_token,
     get_end_of_chat_token_ids,
     get_eos_token,
+    get_first_label_token_mapping,
+    get_min_cuda_compute_capability,
     log_once,
     should_prompts_be_stripped,
 )
@@ -120,11 +123,8 @@ class VLLMModel(HuggingFaceEncoderModel):
         ):
             raise NeedsExtraInstalled(extra="generative")
 
-        output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
         model, tokenizer = load_model_and_tokenizer(
-            model_config=model_config,
-            benchmark_config=benchmark_config,
-            output_scores=output_scores,
+            model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: LLM = model
         self._tokenizer: PreTrainedTokenizer = tokenizer
@@ -140,11 +140,16 @@ class VLLMModel(HuggingFaceEncoderModel):
             benchmark_config=benchmark_config,
         )
 
-        self.buffer["output_scores"] = output_scores
-        self.buffer["instruction_model"] = self._tokenizer.chat_template is not None
+        self.buffer |= dict(
+            instruction_model=self._tokenizer.chat_template is not None,
+            first_label_token_mapping=get_first_label_token_mapping(
+                dataset_config=self.dataset_config, tokenizer=self._tokenizer
+            ),
+        )
         if self.model_config.adapter_base_model_id is not None:
             adapter_path = snapshot_download(
                 repo_id=self.model_config.model_id,
+                revision=self.model_config.revision,
                 cache_dir=Path(self.model_config.model_cache_dir),
             )
             self.buffer["lora_request"] = LoRARequest(
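
The get_first_label_token_mapping utility used above is not included in this diff. As a rough illustration of the idea, the sketch below (not the released implementation; the function name, signature and tokenizer are assumptions) maps each label to the text of its first token under a Hugging Face tokenizer, which is the dict[str, str] shape that the sequence classification hunks further down consume:

    from transformers import AutoTokenizer, PreTrainedTokenizer

    def build_first_label_token_mapping(
        labels: list[str], tokenizer: PreTrainedTokenizer
    ) -> dict[str, str]:
        """Map each label to the text of its first token under the given tokenizer."""
        mapping: dict[str, str] = {}
        for label in labels:
            token_ids = tokenizer(label, add_special_tokens=False).input_ids
            mapping[label] = tokenizer.decode([token_ids[0]]).strip()
        return mapping

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    print(build_first_label_token_mapping(["positiv", "negativ", "neutral"], tokenizer))

Such a mapping lets the logprobs of the very first generated token be matched back to a full label, which is what the first_label_token_mapping buffer entry is used for below.
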
@@ -182,6 +187,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
                 return text_to_text.extract_labels_from_generation
@@ -335,6 +341,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         else:
             logits_processor = None
 
+        # Get the mapping from labels to the first token in the label. We call this each
+        # time we generate a new dataset since the dataset config can change
+        self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
+            dataset_config=self.dataset_config, tokenizer=self._tokenizer
+        )
+
         # Define the parameters used for vLLM generation
         max_tokens: int = (
             REASONING_MAX_TOKENS
@@ -343,7 +355,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         sampling_params = SamplingParams(
             max_tokens=max_tokens,
-            logprobs=MAX_LOGPROBS if self.buffer["output_scores"] else None,
+            logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
             temperature=0.0,
             stop=[stop_token for stop_token in stop_tokens if stop_token],
             logits_processors=[logits_processor] if logits_processor else None,
@@ -373,12 +385,27 @@ class VLLMModel(HuggingFaceEncoderModel):
 
         # Generate sequences using vLLM
         input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
-        raw_outputs = self._model.generate(
-            prompts=prompts,
-            sampling_params=sampling_params,
-            use_tqdm=(not input_is_a_test),
-            lora_request=self.buffer.get("lora_request"),
-        )
+        num_attempts = 3
+        for _ in range(num_attempts):
+            try:
+                raw_outputs = self._model.generate(
+                    prompts=prompts,
+                    sampling_params=sampling_params,
+                    use_tqdm=(not input_is_a_test),
+                    lora_request=self.buffer.get("lora_request"),
+                )
+                break
+            except TypeError as e:
+                logger.debug(
+                    f"Encountered error during vLLM generation: {str(e)}. Retrying..."
+                )
+                sleep(1)
+        else:
+            raise InvalidBenchmark(
+                f"Could not generate sequences after {num_attempts} attempts."
+            )
+
+        # Parse the raw model outputs
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
@@ -398,7 +425,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         completions = [completion.strip() for completion in completions]
 
         # Add logprobs scores to the output
-        if self.buffer["output_scores"]:
+        if self.buffer["first_label_token_mapping"]:
             scores: list[list[list[tuple[str, float]]]] = [
                 [
                     [
@@ -828,7 +855,7 @@ class VLLMModel(HuggingFaceEncoderModel):
 
 
 def load_model_and_tokenizer(
-    model_config: ModelConfig, benchmark_config: BenchmarkConfig, output_scores: bool
+    model_config: ModelConfig, benchmark_config: BenchmarkConfig
 ) -> "tuple[LLM, PreTrainedTokenizer]":
     """Load the model and tokenizer.
 
@@ -837,22 +864,23 @@ def load_model_and_tokenizer(
             The model configuration.
         benchmark_config:
             The benchmark configuration.
-        output_scores:
-            Whether to output scores.
 
     Returns:
-        The loaded model and tokenizer.
+        A pair (model, tokenizer), with the loaded model and tokenizer
     """
     # Prefer base model ID if the model is an adapter - the adapter will be added on
     # during inference in this case
    model_id = model_config.adapter_base_model_id or model_config.model_id
+    revision = (
+        model_config.revision if model_config.adapter_base_model_id is None else "main"
+    )
 
     hf_model_config = load_hf_model_config(
         model_id=model_id,
         num_labels=0,
         id2label=dict(),
         label2id=dict(),
-        revision=model_config.revision,
+        revision=revision,
         model_cache_dir=model_config.model_cache_dir,
         api_key=benchmark_config.api_key,
         trust_remote_code=benchmark_config.trust_remote_code,
@@ -872,7 +900,27 @@ def load_model_and_tokenizer(
     if quantization == "awq" and importlib.util.find_spec("awq") is None:
         raise NeedsExtraInstalled(extra="quantization")
 
+    # Start with dtype being the "auto" vLLM dtype
     dtype: str | torch.dtype = "auto"
+
+    # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
+    if hf_model_config.torch_dtype == torch.float32:
+        if torch.cuda.is_bf16_supported():
+            logger.info(
+                "You are loading a model with dtype FP32, which we will convert to "
+                "BF16 as FP32 is not supported by vLLM and BF16 is supported by your "
+                "GPU."
+            )
+            dtype = torch.bfloat16
+        else:
+            logger.info(
+                "You are loading a model with dtype FP32, which we will convert to "
+                "FP16 as FP32 is not supported by vLLM and BF16 is not supported by "
+                "your GPU."
+            )
+            dtype = torch.float16
+
+    # If the model is a quantized model, we need to set the dtype to float16
     if quantization is not None and hf_model_config.torch_dtype != torch.float16:
         logger.info(
             "You are loading a quantized model with dtype "
@@ -881,6 +929,24 @@ def load_model_and_tokenizer(
         )
         dtype = torch.float16
 
+    # If the model is a bf16 model, we need to check the CUDA compute capability
+    if hf_model_config.torch_dtype == torch.bfloat16:
+        min_cuda_compute_capability = get_min_cuda_compute_capability()
+        required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
+
+        if min_cuda_compute_capability is not None:
+            if min_cuda_compute_capability < required_capability:
+                logger.info(
+                    "You are loading a model with "
+                    f"dtype {hf_model_config.torch_dtype}, "
+                    "which vLLM only supports for CUDA devices with"
+                    f"CUDA compute capability >={required_capability}. "
+                    "You are using one or more devices with "
+                    f"compute capability {min_cuda_compute_capability}. "
+                    "Setting dtype to float16 instead."
+                )
+                dtype = torch.float16
+
     if model_config.adapter_base_model_id is not None:
         download_dir = str(Path(model_config.model_cache_dir) / "base_model")
     else:
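
The get_min_cuda_compute_capability helper referenced above is likewise not part of this diff. A minimal sketch of what such a check can look like with standard PyTorch calls (the function name and return convention are assumptions, not the released code):

    import torch

    def min_cuda_compute_capability() -> float | None:
        """Return the lowest compute capability across visible CUDA devices, or None."""
        if not torch.cuda.is_available():
            return None
        capabilities = [
            major + minor / 10
            for major, minor in (
                torch.cuda.get_device_capability(idx)
                for idx in range(torch.cuda.device_count())
            )
        ]
        return min(capabilities)

Comparing that value against VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY (8.0, i.e. Ampere or newer) is what decides above whether a bf16 checkpoint is downcast to float16.
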
@@ -902,7 +968,17 @@ def load_model_and_tokenizer(
     if len(true_max_model_len_candidates) > 0:
         true_max_model_len = min(true_max_model_len_candidates)
     else:
-        true_max_model_len = 5_000
+        true_max_model_len = MAX_CONTEXT_LENGTH
+
+    tokenizer = load_tokenizer(
+        model_id=model_config.model_id,
+        revision=model_config.revision,
+        adapter_base_model_id=model_config.adapter_base_model_id,
+        trust_remote_code=benchmark_config.trust_remote_code,
+        model_max_length=true_max_model_len,
+        model_cache_dir=model_config.model_cache_dir,
+        token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+    )
 
     clear_vllm()
 
@@ -913,10 +989,10 @@ def load_model_and_tokenizer(
         model=model_id,
         tokenizer=model_id,
         gpu_memory_utilization=0.95,
-        max_model_len=min(true_max_model_len, 5_000),
+        max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
         download_dir=download_dir,
         trust_remote_code=benchmark_config.trust_remote_code,
-        revision=model_config.revision,
+        revision=revision,
         seed=4242,
         distributed_executor_backend=executor_backend,
         tensor_parallel_size=torch.cuda.device_count(),
@@ -924,7 +1000,6 @@ def load_model_and_tokenizer(
         quantization=quantization,
         dtype=dtype,
         enforce_eager=True,
-        max_logprobs=MAX_LOGPROBS if output_scores else None,
         # TEMP: Prefix caching isn't supported with sliding window in vLLM yet,
         # so we disable it for now
         enable_prefix_caching=False,
@@ -950,16 +1025,6 @@ def load_model_and_tokenizer(
     model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
     model.config = hf_model_config
 
-    tokenizer = load_tokenizer(
-        model_id=model_config.model_id,
-        revision=model_config.revision,
-        adapter_base_model_id=model_config.adapter_base_model_id,
-        trust_remote_code=benchmark_config.trust_remote_code,
-        model_max_length=true_max_model_len,
-        model_cache_dir=model_config.model_cache_dir,
-        token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
-    )
-
     return model, tokenizer
 
 
@@ -994,6 +1059,7 @@ def load_tokenizer(
     Returns:
         The loaded tokenizer.
     """
+    revision = revision if adapter_base_model_id is None else "main"
     config = AutoConfig.from_pretrained(
         adapter_base_model_id or model_id,
         revision=revision,
@@ -1118,15 +1184,13 @@ def get_end_of_reasoning_token_id(
 
     # Generate a completion and remove the BOS token from it, to not confuse it with the
    # potential reasoning token
-    completion = (
-        model.generate(
-            prompts=[prompt],
-            sampling_params=SamplingParams(max_tokens=3, temperature=0.0),
-            use_tqdm=False,
-        )[0]
-        .outputs[0]
-        .text
+    model_output = model.generate(
+        prompts=[prompt],
+        sampling_params=SamplingParams(max_tokens=3, temperature=0.0),
+        use_tqdm=False,
     )
+    completion = model_output[0].outputs[0].text
+
     if tokenizer.bos_token is not None:
         if isinstance(tokenizer.bos_token, str):
             prompt = prompt.replace(tokenizer.bos_token, "").strip()
euroeval/benchmarker.py CHANGED
@@ -366,14 +366,18 @@ class Benchmarker:
             dataset_names=benchmark_config.datasets
         )
 
+        total_benchmarks = len(model_ids) * len(dataset_configs)
+        num_finished_benchmarks = 0
+
         current_benchmark_results: list[BenchmarkResult] = list()
-        for m_id in model_ids:
+        for model_id in model_ids:
             try:
                 model_config = get_model_config(
-                    model_id=m_id, benchmark_config=benchmark_config
+                    model_id=model_id, benchmark_config=benchmark_config
                 )
             except InvalidModel as e:
                 logger.info(e.message)
+                num_finished_benchmarks += len(dataset_configs)
                 continue
 
             loaded_model: BenchmarkModule | None = None
@@ -381,16 +385,18 @@ class Benchmarker:
                 # Skip if we have already benchmarked this model on this dataset and
                 # we are not forcing the benchmark
                 if not benchmark_config.force and model_has_been_benchmarked(
-                    model_id=m_id,
+                    model_id=model_id,
                     dataset=dataset_config.name,
                     few_shot=benchmark_config.few_shot,
                     validation_split=not benchmark_config.evaluate_test_split,
                     benchmark_results=self.benchmark_results,
                 ):
                     logger.debug(
-                        f"Skipping benchmarking {m_id} on {dataset_config.pretty_name},"
-                        " as it has already been benchmarked."
+                        f"Skipping benchmarking {model_id} on "
+                        f"{dataset_config.pretty_name}, as it "
+                        "has already been benchmarked."
                     )
+                    num_finished_benchmarks += 1
                     continue
 
                 # We do not re-initialise generative models as their architecture is not
@@ -413,6 +419,15 @@ class Benchmarker:
                         if benchmark_config.raise_errors:
                             raise e
                         logger.info(e.message)
+
+                        # Add the remaining number of benchmarks for the model to
+                        # our benchmark counter, since we're skipping the
+                        # rest of them
+                        num_finished_benchmarks += (
+                            len(dataset_configs)
+                            - dataset_configs.index(dataset_config)
+                            - 1
+                        )
                         break
                 else:
                     loaded_model.dataset_config = dataset_config
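
To make the skip accounting above concrete, a small worked example (the dataset names are invented):

    dataset_configs = ["dataset-a", "dataset-b", "dataset-c", "dataset-d"]
    dataset_config = "dataset-b"  # the dataset at which the model failed to load
    remaining = len(dataset_configs) - dataset_configs.index(dataset_config) - 1
    print(remaining)  # 2, i.e. "dataset-c" and "dataset-d" are counted as skipped
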
@@ -435,16 +450,24 @@ class Benchmarker:
                     if benchmark_config.raise_errors:
                         raise benchmark_output_or_err
                     logger.info(
-                        f"{m_id} could not be benchmarked on "
+                        f"{model_id} could not be benchmarked on "
                         f"{dataset_config.pretty_name}. Skipping. The error message "
                         f"raised was {benchmark_output_or_err.message!r}."
                     )
+                    num_finished_benchmarks += 1
                    continue
 
                elif isinstance(benchmark_output_or_err, InvalidModel):
                    if benchmark_config.raise_errors:
                        raise benchmark_output_or_err
                    logger.info(benchmark_output_or_err.message)
+
+                    # Add the remaining number of benchmarks for the model to
+                    # our benchmark counter, since we're skipping the
+                    # rest of them
+                    num_finished_benchmarks += (
+                        len(dataset_configs) - dataset_configs.index(dataset_config) - 1
+                    )
                    break
 
                else:
@@ -453,6 +476,12 @@ class Benchmarker:
                 if benchmark_config.save_results:
                     record.append_to_results(results_path=self.results_path)
 
+                num_finished_benchmarks += 1
+                logger.info(
+                    f"Finished {num_finished_benchmarks} out of "
+                    f"{total_benchmarks} benchmarks."
+                )
+
             if benchmark_config.clear_model_cache:
                 clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
 
euroeval/constants.py CHANGED
@@ -7,6 +7,13 @@ from .tasks import NER
 DUMMY_FILL_VALUE = 100
 
 
+# This is the maximum allowed context length for models for the purpose of this
+# benchmark. We will still report the models' true maximum context length in the
+# metadata, but we won't use it for evaluation, as vLLM needs to allocate memory for
+# all tokens in the context.
+MAX_CONTEXT_LENGTH = 5_000
+
+
 # We need to raise the amount of tokens generated for reasoning models, to give them
 # time to think
 REASONING_MAX_TOKENS = 8_192
@@ -47,10 +54,13 @@ TASK_GROUPS_USING_LOGPROBS = [
 MAX_LOGPROBS = 10
 
 
-# We make sure to remove these metric attributed after each iteration, to avoid memory
+# We make sure to remove these metric attributes after each iteration, to avoid memory
 # leaks
 METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
 
 
 # Hugging Face Hub tags used to classify models as merge models
 MERGE_TAGS = ["merge", "mergekit"]
+
+# The minimum required CUDA compute capability for using bfloat16 in vLLM
+VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
euroeval/data_models.py CHANGED
@@ -1,7 +1,6 @@
 """Data models used in EuroEval."""
 
 import collections.abc as c
-import importlib.metadata
 import json
 import pathlib
 import re
@@ -13,6 +12,7 @@ import torch
 
 from .enums import Device, InferenceBackend, ModelType, TaskGroup
 from .types import ScoreDict
+from .utils import get_package_version
 
 
 @dataclass
@@ -228,7 +228,11 @@ class BenchmarkResult(pydantic.BaseModel):
     generative_type: str | None
     few_shot: bool
     validation_split: bool
-    euroeval_version: str = importlib.metadata.version("euroeval")
+    euroeval_version: str | None = get_package_version("euroeval")
+    transformers_version: str | None = get_package_version("transformers")
+    torch_version: str | None = get_package_version("torch")
+    vllm_version: str | None = get_package_version("vllm")
+    outlines_version: str | None = get_package_version("outlines")
 
     @classmethod
     def from_dict(cls, config: dict) -> "BenchmarkResult":
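
The get_package_version helper imported from .utils is not shown in this diff. Given the str | None annotations above, a minimal sketch of the behaviour they imply, assuming the helper simply wraps importlib.metadata (an assumption, not the released implementation):

    import importlib.metadata

    def get_package_version(package_name: str) -> str | None:
        """Return the installed version of a package, or None if it is not installed."""
        try:
            return importlib.metadata.version(package_name)
        except importlib.metadata.PackageNotFoundError:
            return None

Returning None instead of raising keeps optional backends such as vllm and outlines from breaking result serialisation when they are not installed.
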
@@ -244,7 +244,7 @@ FOSENT_CONFIG = DatasetConfig(
 ALLOCINE_CONFIG = DatasetConfig(
     name="allocine",
     pretty_name="the truncated version of the French sentiment classification "
-    "dataset Allocine",
+    "dataset AlloCiné",
     huggingface_id="EuroEval/allocine-mini",
     task=SENT,
     languages=[FR],
@@ -1467,9 +1467,9 @@ NORDJYLLAND_NEWS_CONFIG = DatasetConfig(
     max_generated_tokens=256,
 )
 
-MLSUM_CONFIG = DatasetConfig(
-    name="mlsum",
-    pretty_name="the truncated version of the German summarisation dataset MLSum",
+MLSUM_DE_CONFIG = DatasetConfig(
+    name="mlsum-de",
+    pretty_name="the truncated version of the German summarisation dataset MLSum-de",
     huggingface_id="EuroEval/mlsum-mini",
     task=SUMM,
     languages=[DE],
@@ -1484,7 +1484,7 @@ MLSUM_CONFIG = DatasetConfig(
 
 
 MLSUM_ES_CONFIG = DatasetConfig(
-    pretty_name="the truncated version of the Spanish summarisation dataset MLSum",
+    pretty_name="the truncated version of the Spanish summarisation dataset MLSum-es",
     huggingface_id="EuroEval/mlsum-es-mini",
     task=SUMM,
     languages=[ES],
@@ -1643,7 +1643,7 @@ ORANGE_SUM_CONFIG = DatasetConfig(
 
 ILPOST_SUM_CONFIG = DatasetConfig(
     name="ilpost-sum",
-    pretty_name="the truncated version of the Italian summarisation dataset IlPost",
+    pretty_name="the truncated version of the Italian summarisation dataset IlPost-Sum",
     huggingface_id="EuroEval/ilpost-sum",
     task=SUMM,
     languages=[IT],
@@ -10,6 +10,7 @@ import numpy as np
 from evaluate import EvaluationModule
 
 from ..data_models import BenchmarkConfig, GenerativeModelOutput
+from ..exceptions import InvalidBenchmark
 from ..utils import log_once, raise_if_model_output_contains_nan_values
 
 if t.TYPE_CHECKING:
@@ -110,6 +111,7 @@ def extract_labels_from_generation(
     input_batch: dict[str, list],
     model_output: GenerativeModelOutput,
     dataset_config: "DatasetConfig",
+    first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
     """Extract the predicted labels from the generated output.
 
@@ -121,13 +123,19 @@ def extract_labels_from_generation(
             The raw generated output of the model.
         dataset_config:
             The configuration of the dataset.
+        first_label_token_mapping:
+            A mapping from labels to the first token in each label, or alternatively a
+            Boolean value indicating whether the model should output scores (if the
+            mapping is outputted then the model will always output scores).
 
     Returns:
         The predicted labels.
     """
     if model_output.scores is not None:
         return get_closest_logprobs_labels(
-            generation_logprobs=model_output.scores, dataset_config=dataset_config
+            generation_logprobs=model_output.scores,
+            dataset_config=dataset_config,
+            first_label_token_mapping=first_label_token_mapping,
         )
     else:
         return get_closest_word_edit_labels(
@@ -138,6 +146,7 @@ def extract_labels_from_generation(
 def get_closest_logprobs_labels(
     generation_logprobs: list[list[list[tuple[str, float]]]],
     dataset_config: "DatasetConfig",
+    first_label_token_mapping: dict[str, str] | bool,
 ) -> list[str]:
     """Get the labels with the highest predicted logprob value.
 
@@ -152,6 +161,10 @@ def get_closest_logprobs_labels(
             (batch_size, num_tokens, num_logprobs).
         dataset_config:
             The configuration of the dataset.
+        first_label_token_mapping:
+            A mapping from labels to the first token in each label, or alternatively a
+            Boolean value indicating whether the model should output scores (if the
+            mapping is outputted then the model will always output scores).
 
     Returns:
         The predicted labels.
@@ -162,8 +175,7 @@ def get_closest_logprobs_labels(
     """
     english_labels = list(dataset_config.id2label.values())
     english2local = dataset_config.prompt_label_mapping
-    local_labels = [english2local[lbl].lower() for lbl in english_labels]
-    candidate_labels = local_labels + english_labels
+    candidate_labels = [english2local[lbl].lower() for lbl in english_labels]
 
     output_labels: list[str] = list()
     for sample in generation_logprobs:
@@ -182,38 +194,66 @@ def get_closest_logprobs_labels(
             # label, as the output label
             output_label: str | None = None
             previously_generated_labels: list[str] = list()
-            for generated_label in generated_labels:
+            for label_idx, generated_label in enumerate(generated_labels):
                 generated_label = "".join(previously_generated_labels) + generated_label
 
-                # Get the candidate labels that contain the generated label
-                candidate_output_labels = [
-                    candidate_label
-                    for candidate_label in candidate_labels
-                    if generated_label in candidate_label
-                ]
-
-                # If we can uniquely determine the output label, we break the loop.
-                # Since we have both the original local labels as well as the English
-                # versions, we want to have 0 or 1 candidate labels from each set. This
-                # means that ["positive", "positiv"] is fine as they're both referencing
-                # the same label, but ["negativ", "neutral"] is not. In the bad case we
-                # cannot use the scores and we fall back to using the
-                # candidate label with the highest edit distance.
-                at_most_one_english_label = (
-                    len(set(candidate_output_labels).intersection(english_labels)) <= 1
-                )
-                at_most_one_local_label = (
-                    len(set(candidate_output_labels).intersection(local_labels)) <= 1
-                )
-                if candidate_output_labels:
-                    if at_most_one_english_label and at_most_one_local_label:
-                        output_label = candidate_output_labels[0]
-                        break
-                    else:
+                # Get the candidate labels that starts with the generated label
+                if isinstance(first_label_token_mapping, dict):
+                    if any(
+                        candidate_label not in first_label_token_mapping
+                        for candidate_label in candidate_labels
+                    ):
+                        raise InvalidBenchmark(
+                            "There is a label not present in the first label token "
+                            "mapping - this should never happen! Please report this "
+                            "issue to the EuroEval team at "
+                            "github.com/EuroEval/EuroEval/issues."
+                        )
+
+                    candidate_output_labels = {
+                        candidate_label
+                        for candidate_label in candidate_labels
+                        if generated_label == first_label_token_mapping[candidate_label]
+                    }
+                else:
+                    candidate_output_labels = {
+                        candidate_label
+                        for candidate_label in candidate_labels
+                        if candidate_label.startswith(generated_label)
+                    }
+
+                # If we can uniquely determine the output label, we break the loop. If
+                # there are multiple possible labels then we store the current one, and
+                # concatenate it with the next generated label. We can only do this if
+                # the current one is the first one, however, since we're using greedy
+                # sampling. In case this happens for a label that is not the first one,
+                # we warn the user.
+                if len(candidate_output_labels) == 1:
+                    output_label = candidate_output_labels.pop()
+                    break
+                elif len(candidate_output_labels) > 1:
+                    if label_idx == 0:
                         previously_generated_labels.append(generated_label)
+                    else:
+                        output_label = candidate_output_labels.pop()
+                        candidate_output_labels.add(output_label)
+                        raise InvalidBenchmark(
+                            "Multiple candidate labels found for the generated label "
+                            f"{generated_label!r}: {candidate_output_labels}. Since "
+                            "this is not the first generated label, we cannot "
+                            "concatenate it with the next generated label. We are thus "
+                            f"forced to use the arbitrary {output_label!r} as the "
+                            "output label, potentially resulting in worse performance. "
+                            "Please report this issue to the EuroEval team at "
+                            "github.com/EuroEval/EuroEval/issues."
+                        )
+                elif len(candidate_output_labels) == 0:
+                    logger.debug(
+                        f"No candidate label found for the generated label "
+                        f"{generated_label!r}. The generated label is thus ignored."
+                    )
 
             if output_label is not None:
-                output_label = english2local.get(output_label, output_label)
                 output_labels.append(output_label)
                 break
         else:
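
A small self-contained illustration of the first-token matching above, simplified to the unique-match case (the labels and tokens are invented; in the benchmark the mapping comes from the tokenizer):

    first_label_token_mapping = {"positiv": "pos", "negativ": "neg", "neutral": "neut"}

    def resolve(generated_token: str) -> str | None:
        """Return the unique label whose first token equals the generated token, if any."""
        candidates = {
            label
            for label, first_token in first_label_token_mapping.items()
            if generated_token == first_token
        }
        return candidates.pop() if len(candidates) == 1 else None

    print(resolve("neg"))  # negativ
    print(resolve("foo"))  # None
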
euroeval/types.py CHANGED
@@ -8,9 +8,9 @@ if t.TYPE_CHECKING:
     from .data_models import GenerativeModelOutput
 
 
-ScoreDict = dict[str, dict[str, float] | list[dict[str, float]]]
-Predictions = NDArray | list[str] | list[list[str]]
-Labels = NDArray | list[str] | list[list[str]]
+ScoreDict: t.TypeAlias = dict[str, dict[str, float] | list[dict[str, float]]]
+Predictions: t.TypeAlias = NDArray | list[str] | list[list[str]]
+Labels: t.TypeAlias = NDArray | list[str] | list[list[str]]
 
 
 class ComputeMetricsFunction(t.Protocol):
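
For reference, the explicit t.TypeAlias annotations added above follow PEP 613: they tell static type checkers that each assignment defines a type alias rather than an ordinary module-level variable. A tiny standalone example of the same pattern (names are illustrative):

    import typing as t

    UserId: t.TypeAlias = int | str

    def lookup(user_id: UserId) -> None:
        print(f"Looking up {user_id!r}")

    lookup(42)
    lookup("alice")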