EuroEval 15.6.1-py3-none-any.whl → 15.7.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of EuroEval has been flagged as a potentially problematic release.

@@ -3,11 +3,9 @@
  import collections.abc as c
  import contextlib
  import importlib.util
- import itertools as it
  import json
  import logging
  import os
- import random
  import re
  import sys
  import typing as t
@@ -56,6 +54,7 @@ from ..exceptions import (
  NeedsEnvironmentVariable,
  NeedsExtraInstalled,
  )
+ from ..generation_utils import apply_prompt, extract_few_shot_examples
  from ..languages import get_all_languages
  from ..task_group_utils import (
  question_answering,
@@ -132,7 +131,7 @@ class VLLMModel(HuggingFaceEncoderModel):
  self._model: LLM = model
  self._tokenizer: PreTrainedTokenizer = tokenizer
  self.end_of_reasoning_token_id = get_end_of_reasoning_token_id(
- model=self._model, tokenizer=self._tokenizer
+ model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
  )

  # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
@@ -146,7 +145,10 @@ class VLLMModel(HuggingFaceEncoderModel):
  self.buffer |= dict(
  instruction_model=self._tokenizer.chat_template is not None,
  first_label_token_mapping=get_first_label_token_mapping(
- dataset_config=self.dataset_config, tokenizer=self._tokenizer
+ dataset_config=self.dataset_config,
+ model_config=self.model_config,
+ tokenizer=self._tokenizer,
+ generative_type=self.generative_type,
  ),
  )
  if self.model_config.adapter_base_model_id is not None:
@@ -255,14 +257,22 @@ class VLLMModel(HuggingFaceEncoderModel):
  )

  if self.benchmark_config.few_shot:
- few_shot_examples = self._extract_few_shot_examples(
- dataset=dataset, task=task, itr_idx=itr_idx
+ few_shot_examples = extract_few_shot_examples(
+ dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
  )
  else:
  few_shot_examples = list()

  dataset["test"] = dataset["test"].map(
- partial(self._apply_prompt, few_shot_examples=few_shot_examples, task=task),
+ partial(
+ apply_prompt,
+ few_shot_examples=few_shot_examples,
+ model_config=self.model_config,
+ dataset_config=self.dataset_config,
+ instruction_model=self.buffer["instruction_model"],
+ always_populate_text_field=True,
+ tokenizer=self._tokenizer,
+ ),
  batched=True,
  load_from_cache_file=False,
  keep_in_memory=True,
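Note on the refactor above: prompt construction now lives in module-level helpers in generation_utils, but the calling pattern is unchanged, namely a partially applied function mapped over the test split in batched mode. The sketch below illustrates that generic datasets pattern with a stand-in function and toy data; add_prefix and the column name are placeholders, not EuroEval code.

from functools import partial

from datasets import Dataset

def add_prefix(examples: dict, prefix: str) -> dict:
    # In batched mode, each value in `examples` is a list covering the whole batch.
    examples["text"] = [prefix + text for text in examples["text"]]
    return examples

ds = Dataset.from_dict({"text": ["first example", "second example"]})
ds = ds.map(partial(add_prefix, prefix=">> "), batched=True, load_from_cache_file=False)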
@@ -332,30 +342,40 @@ class VLLMModel(HuggingFaceEncoderModel):
  if end_of_chat_token:
  stop_tokens.append(end_of_chat_token)

+ logits_processor = None
  if self.dataset_config.task in TASKS_USING_JSON:
- ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
- keys_and_their_types: dict[str, t.Any] = {
- tag_name: (conlist(str, max_length=5), ...)
- for tag_name in ner_tag_names
- }
- pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
- logits_processor = JSONLogitsProcessor(
- schema=pydantic_class,
- tokenizer=adapt_tokenizer(tokenizer=self._tokenizer), # type: ignore
- whitespace_pattern=r" ?",
- )
- log_once(
- "Using structured generation with the schema "
- f"{pydantic_class.model_json_schema()}",
- level=logging.DEBUG,
- )
- else:
- logits_processor = None
+ if self.generative_type == GenerativeType.REASONING:
+ log_once(
+ f"The model {self.model_config.model_id!r} is a reasoning model "
+ "and thus does not support structured generation, so we do not "
+ "enable it.",
+ level=logging.DEBUG,
+ )
+ else:
+ ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+ keys_and_their_types: dict[str, t.Any] = {
+ tag_name: (conlist(str, max_length=5), ...)
+ for tag_name in ner_tag_names
+ }
+ pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+ logits_processor = JSONLogitsProcessor(
+ schema=pydantic_class,
+ tokenizer=adapt_tokenizer(tokenizer=self._tokenizer), # type: ignore
+ whitespace_pattern=r" ?",
+ )
+ log_once(
+ "Using structured generation with the JSON schema "
+ f"{pydantic_class.model_json_schema()}",
+ level=logging.DEBUG,
+ )

  # Get the mapping from labels to the first token in the label. We call this each
  # time we generate a new dataset since the dataset config can change
  self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
- dataset_config=self.dataset_config, tokenizer=self._tokenizer
+ dataset_config=self.dataset_config,
+ model_config=self.model_config,
+ tokenizer=self._tokenizer,
+ generative_type=self.generative_type,
  )

  # Define the parameters used for vLLM generation
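For context on the structured-generation branch above: the schema handed to JSONLogitsProcessor is an ordinary pydantic model assembled at runtime from the dataset's prompt labels. A minimal sketch of that construction, using hypothetical NER tag names in place of dataset_config.prompt_label_mapping:

import typing as t

from pydantic import conlist, create_model

ner_tag_names = ["person", "location"]  # hypothetical prompt labels
keys_and_their_types: dict[str, t.Any] = {
    tag_name: (conlist(str, max_length=5), ...) for tag_name in ner_tag_names
}
AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)

# The JSON schema that gets logged: each tag maps to a list of at most five strings.
print(AnswerFormat.model_json_schema())

The new reasoning-model branch skips this construction entirely, presumably because constraining the output to a JSON grammar would conflict with the model's free-form reasoning tokens.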
@@ -391,7 +411,10 @@ class VLLMModel(HuggingFaceEncoderModel):
  ) and should_prompts_be_stripped(
  labels_to_be_generated=labels_to_be_generated, tokenizer=self._tokenizer
  ):
- log_once(message="Stripping prompts.", level=logging.DEBUG)
+ log_once(
+ f"Stripping prompts for model {self.model_config.model_id!r}.",
+ level=logging.DEBUG,
+ )
  prompts = [prompt.strip() for prompt in prompts]

  # Generate sequences using vLLM
@@ -411,18 +434,65 @@ class VLLMModel(HuggingFaceEncoderModel):
  f"Encountered error during vLLM generation: {str(e)}. Retrying..."
  )
  sleep(1)
+ except ValueError as e:
+ # Truncate the prompts if they are too long for the model
+ truncate_error_messages = [
+ r"prompt \(length [0-9]+\) is longer than the maximum model length"
+ ]
+ if any(
+ re.search(pattern, str(e), flags=re.IGNORECASE) is not None
+ for pattern in truncate_error_messages
+ ):
+ logger.info(
+ "Prompts are too long, so truncating them and trying again..."
+ )
+ logger.debug(f"The error message was: {str(e)}")
+ tokenized_prompts = self._tokenizer(
+ text=prompts,
+ truncation=True,
+ max_length=max(
+ self._tokenizer.model_max_length - max_tokens, 0
+ ),
+ )
+ prompts = self._tokenizer.batch_decode(
+ sequences=tokenized_prompts.input_ids, skip_special_tokens=True
+ )
+ else:
+ raise InvalidBenchmark(
+ f"An error occurred during vLLM generation: {str(e)}"
+ )
  else:
  raise InvalidBenchmark(
  f"Could not generate sequences after {num_attempts} attempts."
  )

+ # When we shorten the prompts then some residual model outputs persist, so we
+ # need to filter these out
+ num_extra_outputs = len(raw_outputs) - len(prompts)
+ if num_extra_outputs > 0:
+ raw_outputs = raw_outputs[num_extra_outputs:]
+ if not all(
+ raw_output.prompt == prompt
+ for raw_output, prompt in zip(raw_outputs, prompts)
+ ):
+ raise InvalidBenchmark(
+ f"The prompts and the model outputs do not match. There were "
+ f"{num_extra_outputs!r} extra outputs."
+ )
+ else:
+ logger.debug(
+ f"Filtered out {num_extra_outputs:,} extra outputs from the model, "
+ "which occured as we interupted the generation when we truncated "
+ "the prompts."
+ )
+
  # Parse the raw model outputs
  completion_ids: list[list[int]] = [
  output.outputs[0].token_ids for output in raw_outputs
  ]
  if self.end_of_reasoning_token_id in completion_ids[0]:
  completion_ids = [
- token_ids[token_ids.index(self.end_of_reasoning_token_id) + 2 :]
+ token_ids[token_ids.index(self.end_of_reasoning_token_id) + 1 :]
  if self.end_of_reasoning_token_id in token_ids
  else token_ids
  for token_ids in completion_ids
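As a standalone illustration of the new ValueError fallback above: the idea is to cap each prompt at the model's maximum length minus the generation budget, then decode back to text and retry. The snippet below shows the same steps with a plain Hugging Face tokenizer; the model name and max_tokens value are placeholders, not values EuroEval uses.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model
prompts = ["a prompt that may be far too long for the model ..."]
max_tokens = 256  # tokens reserved for the generated continuation

encoded = tokenizer(
    text=prompts,
    truncation=True,
    max_length=max(tokenizer.model_max_length - max_tokens, 0),
)
truncated_prompts = tokenizer.batch_decode(
    sequences=encoded.input_ids, skip_special_tokens=True
)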
@@ -435,6 +505,12 @@ class VLLMModel(HuggingFaceEncoderModel):
  )
  completions = [completion.strip() for completion in completions]

+ # Sanity check
+ if len(completions) != len(prompts):
+ raise InvalidBenchmark(
+ f"Expected {len(prompts):,} completions, but got {len(completions):,}."
+ )
+
  # Add logprobs scores to the output
  if self.buffer["first_label_token_mapping"]:
  scores: list[list[list[tuple[str, float]]]] = [
@@ -546,302 +622,6 @@ class VLLMModel(HuggingFaceEncoderModel):

  return model_config

- def _extract_few_shot_examples(
- self, dataset: DatasetDict, task: Task, itr_idx: int
- ) -> list[dict[str, t.Any]]:
- """Extract few-shot examples from a dataset.
-
- This will always extract the examples from the training split.
-
- We ensure that the few-shot examples are unique by picking them one at a time.
-
- Args:
- dataset:
- The dataset to extract the few-shot examples from.
- task:
- The task that is being benchmarked.
- itr_idx:
- The index of the dataset in the iterator.
-
- Returns:
- The few-shot examples.
- """
- random_seed = 4242 + itr_idx
- num_few_shots = self.dataset_config.num_few_shot_examples
- few_shot_examples: list[dict[str, t.Any]] = list()
- shuffled_train = dataset["train"].shuffle(seed=random_seed)
-
- match task.task_group:
- case (
- TaskGroup.SEQUENCE_CLASSIFICATION
- | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
- ):
- labels = it.cycle(self.dataset_config.labels)
- while (
- len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
- ):
- label = next(labels)
- possible_examples = shuffled_train.filter(
- lambda x: x["label"].lower() == label.lower()
- )
- if len(possible_examples) == 0:
- continue
- example = possible_examples.select(range(1))[0]
- few_shot_examples.append(example)
- shuffled_train = shuffled_train.filter(
- lambda x: x["text"] != example["text"]
- )
-
- case TaskGroup.TEXT_TO_TEXT:
- while (
- len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
- ):
- example = shuffled_train.select(range(1))[0]
- few_shot_examples.append(example)
- shuffled_train = shuffled_train.filter(
- lambda x: x["text"] != example["text"]
- )
-
- case TaskGroup.TOKEN_CLASSIFICATION:
- labels = it.cycle(
- [
- label.lower()
- for label in self.dataset_config.labels
- if label.lower().startswith("b-")
- ]
- )
- while (
- len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
- ):
- label = next(labels)
- possible_examples = shuffled_train.filter(
- lambda x: label in [tag.lower() for tag in x["labels"]]
- )
- if len(possible_examples) == 0:
- continue
- example = possible_examples.select(range(1))[0]
- few_shot_examples.append(example)
- shuffled_train = shuffled_train.filter(
- lambda x: x["tokens"] != example["tokens"]
- )
-
- case TaskGroup.QUESTION_ANSWERING:
- # Locate the maximum number of tokens that constitutes a short example
- for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
- train_with_short_examples = dataset["train"].filter(
- lambda example: len(example["context"]) < max_num_tokens
- )
- num_short_examples = len(train_with_short_examples)
- if num_short_examples >= self.dataset_config.num_few_shot_examples:
- break
- else:
- raise InvalidBenchmark(
- "Could not find enough short examples for few-shot learning."
- )
-
- shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
- while (
- len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
- ):
- example = shuffled_train.select(range(1))[0]
- few_shot_examples.append(example)
- shuffled_train = shuffled_train.filter(
- lambda x: x["context"] != example["context"]
- )
-
- case _:
- raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
- random.seed(random_seed)
- random.shuffle(few_shot_examples)
- return few_shot_examples
-
- def _apply_prompt(
- self,
- examples: dict[str, t.Any],
- few_shot_examples: list[dict[str, t.Any]],
- task: Task,
- ) -> dict[str, t.Any]:
- """Apply prompt template to an example, potentially with few-shot examples.
-
- Args:
- examples:
- The examples to apply the few-shot examples to.
- few_shot_examples:
- The few-shot examples to apply.
- task:
- The task that is being benchmarked.
-
- Returns:
- The example with the few-shot examples applied.
- """
-
- def create_prompt(**kwargs: str) -> tuple[str, str]:
- """Create a prompt from the given keyword arguments.
-
- Args:
- kwargs:
- The keyword arguments to use in the prompt.
-
- Returns:
- A pair (prompt, label), where "label" is an empty string if the model is
- not instruction tuned (as in this case it is included in the prompt).
- """
- label_key = "label" if "label" in kwargs else "target_text"
- label = kwargs.pop(label_key)
- assert label is not None, (
- f"Found a None label for the prompt: {kwargs}. This should not happen."
- )
- label_mapping = self.dataset_config.prompt_label_mapping
- label = label_mapping.get(label, label)
- if self.buffer["instruction_model"]:
- prompt = self.dataset_config.instruction_prompt.format(**kwargs)
- return prompt, label
- else:
- kwargs[label_key] = label
- return self.dataset_config.prompt_template.format(**kwargs), ""
-
- match task.task_group:
- case (
- TaskGroup.SEQUENCE_CLASSIFICATION
- | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
- ):
- few_shot_sections = [
- create_prompt(
- text=example["text"].replace("\n", " ").strip(),
- label=example["label"].replace("\n", " ").strip(),
- )
- for example in few_shot_examples
- ]
- new_sections = [
- create_prompt(text=text.replace("\n", " ").strip(), label="")
- for text in examples["text"]
- ]
-
- case TaskGroup.TEXT_TO_TEXT:
- few_shot_sections = [
- create_prompt(
- text=example["text"].replace("\n", " ").strip(),
- target_text=example["target_text"].replace("\n", " ").strip(),
- )
- for example in few_shot_examples
- ]
- new_sections = [
- create_prompt(text=text.replace("\n", " ").strip(), target_text="")
- for text in examples["text"]
- ]
-
- case TaskGroup.TOKEN_CLASSIFICATION:
-
- def create_label(example: dict) -> str:
- prompt_labels = self.dataset_config.prompt_label_mapping.values()
- labels: dict[str, list[str]] = {
- prompt_label: list() for prompt_label in prompt_labels
- }
- for token, label in zip(example["tokens"], example["labels"]):
- label = label.lower()
- if label == "o":
- continue
- prompt_label = self.dataset_config.prompt_label_mapping[label]
- if label.startswith("b-"):
- labels[prompt_label].append(token)
- elif label.startswith("i-"):
- labels[prompt_label][-1] += " " + token
- return json.dumps(labels, ensure_ascii=False)
-
- few_shot_sections = [
- create_prompt(
- text=" ".join(example["tokens"]).replace("\n", " ").strip(),
- label=create_label(example=example),
- )
- for example in few_shot_examples
- ]
- new_sections = [
- create_prompt(
- text=" ".join(tokens).replace("\n", " ").strip(), label=""
- )
- for tokens in examples["tokens"]
- ]
-
- case TaskGroup.QUESTION_ANSWERING:
- few_shot_sections = [
- create_prompt(
- text=example["context"].replace("\n", " ").strip(),
- question=example["question"].replace("\n", " ").strip(),
- label=example["answers"]["text"][0].replace("\n", " "),
- )
- for example in few_shot_examples
- ]
- new_sections = [
- create_prompt(
- text=context.replace("\n", " ").strip(),
- question=question.replace("\n", " ").strip(),
- label="",
- )
- for context, question in zip(
- examples["context"], examples["question"]
- )
- ]
-
- case _:
- raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
- if self.buffer["instruction_model"]:
- few_shot_messages = [
- dict(role=role, content=content)
- for prompt, label in few_shot_sections
- for role, content in [("user", prompt), ("assistant", label)]
- ]
-
- messages_list = [
- few_shot_messages + [dict(role="user", content=prompt)]
- for prompt, _ in new_sections
- ]
-
- # Pick the chat template that matches the language of the dataset, if such a
- # template exists
- chat_template: str | None = None
- if isinstance(self._tokenizer.chat_template, dict):
- language_codes = [
- language.code for language in self.dataset_config.languages
- ]
- for name, candidate_template in self._tokenizer.chat_template.items():
- if name.lower() in language_codes:
- chat_template = candidate_template
- log_once(
- f"Using the {name!r} chat template for the tokenizer.",
- level=logging.DEBUG,
- )
- break
-
- texts = [
- self._tokenizer.apply_chat_template(
- conversation=messages,
- tokenize=False,
- add_generation_prompt=True,
- chat_template=chat_template,
- )
- for messages in messages_list
- ]
-
- examples["text"] = texts
-
- else:
- prompt_prefix = ""
- if self.dataset_config.prompt_prefix:
- prompt_prefix = self.dataset_config.prompt_prefix + "\n\n"
-
- few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
- if few_shot_prompt:
- few_shot_prompt += "\n\n"
-
- examples["text"] = [
- prompt_prefix + few_shot_prompt + new_prompt
- for new_prompt, _ in new_sections
- ]
-
- return examples
-
  @property
  def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
  """The data collator used to prepare samples during finetuning.
@@ -1169,7 +949,7 @@ def clear_vllm() -> None:


  def get_end_of_reasoning_token_id(
- model: "LLM", tokenizer: "PreTrainedTokenizer"
+ model: "LLM", tokenizer: "PreTrainedTokenizer", model_id: str
  ) -> int | None:
  """Get the end of reasoning token ID for a generative model.

@@ -1182,6 +962,8 @@ def get_end_of_reasoning_token_id(
  The vLLM model.
  tokenizer:
  The tokenizer.
+ model_id:
+ The model ID.

  Returns:
  The end of reasoning token ID, or None if it could not be found.
@@ -1220,10 +1002,8 @@ def get_end_of_reasoning_token_id(
  completion_match = re.search(pattern=r"<\w+>", string=completion)
  if completion_match is None and prompt_match is None:
  log_once(
- message=(
- "Could not find a reasoning token, so assuming the model is not a "
- "reasoning model."
- ),
+ f"Could not find a reasoning token for model {model_id!r}, so assuming "
+ "the model is not a reasoning model.",
  level=logging.DEBUG,
  )
  return None
@@ -1249,20 +1029,17 @@ def get_end_of_reasoning_token_id(
  or end_of_reasoning_token not in special_tokens
  ):
  log_once(
- message=(
- f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
- f"token {end_of_reasoning_token!r}, but one of them is not registered "
- "as a special token, so assuming it is not a real reasoning token."
- ),
+ f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+ f"token {end_of_reasoning_token!r} for model {model_id!r}, but one of "
+ "them is not registered as a special token, so assuming it is not a "
+ "real reasoning token.",
  level=logging.DEBUG,
  )
  return None

  log_once(
- message=(
- f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
- f"token {end_of_reasoning_token!r}."
- ),
+ f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+ f"token {end_of_reasoning_token!r} for model {model_id!r}.",
  level=logging.DEBUG,
  )

euroeval/benchmarker.py CHANGED
@@ -774,6 +774,7 @@ class Benchmarker:
  metric_configs=dataset_config.task.metrics,
  scores=scores,
  model_id=model_config.model_id,
+ model_revision=model_config.revision,
  )

  record = BenchmarkResult(
@@ -782,7 +783,11 @@
  dataset_languages=[
  language.code for language in dataset_config.languages
  ],
- model=model_config.model_id,
+ model=(
+ f"{model_config.model_id}@{model_config.revision}"
+ if model_config.revision and model_config.revision != "main"
+ else model_config.model_id
+ ),
  results=results,
  num_model_parameters=model.num_params,
  max_sequence_length=model.model_max_length,
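The model@revision formatting introduced above reappears in initial_logging further down; a hypothetical helper capturing the rule (not part of EuroEval's API) makes the behaviour explicit:

def format_model_identifier(model_id: str, revision: str | None) -> str:
    """Append the revision unless it is empty or the default 'main' branch."""
    if revision and revision != "main":
        return f"{model_id}@{revision}"
    return model_id

assert format_model_identifier("org/model", "main") == "org/model"
assert format_model_identifier("org/model", None) == "org/model"
assert format_model_identifier("org/model", "v1.2.3") == "org/model@v1.2.3"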
@@ -1076,6 +1081,10 @@ def initial_logging(
  benchmark_config:
  The general benchmark configuration.
  """
+ model_id = model_config.model_id
+ if model_config.revision and model_config.revision != "main":
+ model_id += f"@{model_config.revision}"
+
  split_type = "validation" if not benchmark_config.evaluate_test_split else "test"
  if model_config.task in GENERATIVE_PIPELINE_TAGS:
  if benchmark_config.few_shot:
@@ -1084,8 +1093,9 @@
  eval_type = "Zero-shot benchmarking"
  else:
  eval_type = "Benchmarking"
+
  logger.info(
- f"{eval_type} {model_config.model_id} on the {split_type} split of "
+ f"{eval_type} {model_id} on the {split_type} split of "
  f"{dataset_config.pretty_name}"
  )

@@ -1095,6 +1105,7 @@
  "meaning that the resulting evaluation will not be included in the "
  "official leaderboard."
  )
+
  if benchmark_config.debug:
  logger.info(
  "Running in debug mode. This will output additional information, as "
euroeval/constants.py CHANGED
@@ -16,7 +16,7 @@ MAX_CONTEXT_LENGTH = 5_000

  # We need to raise the amount of tokens generated for reasoning models, to give them
  # time to think
- REASONING_MAX_TOKENS = 8_192
+ REASONING_MAX_TOKENS = 32_768


  # The Hugging Face Hub pipeline tags used to classify models as generative