EuroEval 15.6.1-py3-none-any.whl → 15.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/benchmark_modules/litellm.py +148 -284
- euroeval/benchmark_modules/vllm.py +115 -338
- euroeval/benchmarker.py +13 -2
- euroeval/constants.py +1 -1
- euroeval/data_loading.py +48 -26
- euroeval/data_models.py +3 -9
- euroeval/dataset_configs/dutch.py +5 -16
- euroeval/dataset_configs/finnish.py +60 -0
- euroeval/generation_utils.py +346 -0
- euroeval/prompt_templates/linguistic_acceptability.py +9 -1
- euroeval/prompt_templates/multiple_choice.py +8 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -1
- euroeval/prompt_templates/reading_comprehension.py +11 -1
- euroeval/prompt_templates/sentiment_classification.py +11 -1
- euroeval/prompt_templates/summarization.py +9 -1
- euroeval/scores.py +7 -1
- euroeval/task_group_utils/sequence_classification.py +27 -32
- euroeval/task_group_utils/text_to_text.py +10 -27
- euroeval/tasks.py +1 -1
- euroeval/tokenization_utils.py +22 -6
- {euroeval-15.6.1.dist-info → euroeval-15.7.1.dist-info}/METADATA +14 -2
- {euroeval-15.6.1.dist-info → euroeval-15.7.1.dist-info}/RECORD +25 -23
- {euroeval-15.6.1.dist-info → euroeval-15.7.1.dist-info}/WHEEL +0 -0
- {euroeval-15.6.1.dist-info → euroeval-15.7.1.dist-info}/entry_points.txt +0 -0
- {euroeval-15.6.1.dist-info → euroeval-15.7.1.dist-info}/licenses/LICENSE +0 -0
@@ -3,11 +3,9 @@
 import collections.abc as c
 import contextlib
 import importlib.util
-import itertools as it
 import json
 import logging
 import os
-import random
 import re
 import sys
 import typing as t
@@ -56,6 +54,7 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
+from ..generation_utils import apply_prompt, extract_few_shot_examples
 from ..languages import get_all_languages
 from ..task_group_utils import (
     question_answering,
@@ -132,7 +131,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         self._model: LLM = model
         self._tokenizer: PreTrainedTokenizer = tokenizer
         self.end_of_reasoning_token_id = get_end_of_reasoning_token_id(
-            model=self._model, tokenizer=self._tokenizer
+            model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
         )

         # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
@@ -146,7 +145,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         self.buffer |= dict(
             instruction_model=self._tokenizer.chat_template is not None,
             first_label_token_mapping=get_first_label_token_mapping(
-                dataset_config=self.dataset_config,
+                dataset_config=self.dataset_config,
+                model_config=self.model_config,
+                tokenizer=self._tokenizer,
+                generative_type=self.generative_type,
             ),
         )
         if self.model_config.adapter_base_model_id is not None:
@@ -255,14 +257,22 @@ class VLLMModel(HuggingFaceEncoderModel):
         )

         if self.benchmark_config.few_shot:
-            few_shot_examples =
-                dataset=dataset,
+            few_shot_examples = extract_few_shot_examples(
+                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
             )
         else:
             few_shot_examples = list()

         dataset["test"] = dataset["test"].map(
-            partial(
+            partial(
+                apply_prompt,
+                few_shot_examples=few_shot_examples,
+                model_config=self.model_config,
+                dataset_config=self.dataset_config,
+                instruction_model=self.buffer["instruction_model"],
+                always_populate_text_field=True,
+                tokenizer=self._tokenizer,
+            ),
             batched=True,
             load_from_cache_file=False,
             keep_in_memory=True,
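The prompt construction above now goes through the shared apply_prompt helper, handed to datasets' map via functools.partial. A minimal illustration of that partial-plus-batched-map pattern, using a toy add_prefix function and a two-row dataset that are not part of EuroEval:

from functools import partial

from datasets import Dataset


def add_prefix(examples: dict, prefix: str) -> dict:
    # Batched map: `examples` is a dict mapping column names to lists of values.
    examples["text"] = [prefix + text for text in examples["text"]]
    return examples


dataset = Dataset.from_dict({"text": ["hello", "world"]})
dataset = dataset.map(partial(add_prefix, prefix=">> "), batched=True)
print(dataset["text"])  # ['>> hello', '>> world']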
@@ -332,30 +342,40 @@ class VLLMModel(HuggingFaceEncoderModel):
         if end_of_chat_token:
             stop_tokens.append(end_of_chat_token)

+        logits_processor = None
         if self.dataset_config.task in TASKS_USING_JSON:
-            [removed lines 336-353 not captured in this diff view]
+            if self.generative_type == GenerativeType.REASONING:
+                log_once(
+                    f"The model {self.model_config.model_id!r} is a reasoning model "
+                    "and thus does not support structured generation, so we do not "
+                    "enable it.",
+                    level=logging.DEBUG,
+                )
+            else:
+                ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+                keys_and_their_types: dict[str, t.Any] = {
+                    tag_name: (conlist(str, max_length=5), ...)
+                    for tag_name in ner_tag_names
+                }
+                pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+                logits_processor = JSONLogitsProcessor(
+                    schema=pydantic_class,
+                    tokenizer=adapt_tokenizer(tokenizer=self._tokenizer),  # type: ignore
+                    whitespace_pattern=r" ?",
+                )
+                log_once(
+                    "Using structured generation with the JSON schema "
+                    f"{pydantic_class.model_json_schema()}",
+                    level=logging.DEBUG,
+                )

         # Get the mapping from labels to the first token in the label. We call this each
         # time we generate a new dataset since the dataset config can change
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
-            dataset_config=self.dataset_config,
+            dataset_config=self.dataset_config,
+            model_config=self.model_config,
+            tokenizer=self._tokenizer,
+            generative_type=self.generative_type,
         )

         # Define the parameters used for vLLM generation
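For the structured-generation branch above, the JSON schema handed to the logits processor is built dynamically with pydantic. A small standalone sketch of just that schema-building step, with made-up NER tag names standing in for the dataset config's prompt label mapping:

import typing as t

from pydantic import conlist, create_model

# Hypothetical tag names; EuroEval takes these from the dataset's prompt label mapping.
ner_tag_names = ["person", "location", "organisation", "miscellaneous"]

# Each tag maps to a required list of at most five strings, mirroring the diff above.
keys_and_their_types: dict[str, t.Any] = {
    tag_name: (conlist(str, max_length=5), ...) for tag_name in ner_tag_names
}
AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)

# The resulting JSON schema is what gets passed to the JSON logits processor.
print(AnswerFormat.model_json_schema())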
@@ -391,7 +411,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         ) and should_prompts_be_stripped(
             labels_to_be_generated=labels_to_be_generated, tokenizer=self._tokenizer
         ):
-            log_once(
+            log_once(
+                f"Stripping prompts for model {self.model_config.model_id!r}.",
+                level=logging.DEBUG,
+            )
             prompts = [prompt.strip() for prompt in prompts]

         # Generate sequences using vLLM
@@ -411,18 +434,65 @@ class VLLMModel(HuggingFaceEncoderModel):
                     f"Encountered error during vLLM generation: {str(e)}. Retrying..."
                 )
                 sleep(1)
+            except ValueError as e:
+                # Truncate the prompts if they are too long for the model
+                truncate_error_messages = [
+                    r"prompt \(length [0-9]+\) is longer than the maximum model length"
+                ]
+                if any(
+                    re.search(pattern, str(e), flags=re.IGNORECASE) is not None
+                    for pattern in truncate_error_messages
+                ):
+                    logger.info(
+                        "Prompts are too long, so truncating them and trying again..."
+                    )
+                    logger.debug(f"The error message was: {str(e)}")
+                    tokenized_prompts = self._tokenizer(
+                        text=prompts,
+                        truncation=True,
+                        max_length=max(
+                            self._tokenizer.model_max_length - max_tokens, 0
+                        ),
+                    )
+                    prompts = self._tokenizer.batch_decode(
+                        sequences=tokenized_prompts.input_ids, skip_special_tokens=True
+                    )
+                else:
+                    raise InvalidBenchmark(
+                        f"An error occurred during vLLM generation: {str(e)}"
+                    )
         else:
             raise InvalidBenchmark(
                 f"Could not generate sequences after {num_attempts} attempts."
             )

+        # When we shorten the prompts then some residual model outputs persist, so we
+        # need to filter these out
+        num_extra_outputs = len(raw_outputs) - len(prompts)
+        if num_extra_outputs > 0:
+            raw_outputs = raw_outputs[num_extra_outputs:]
+            if not all(
+                raw_output.prompt == prompt
+                for raw_output, prompt in zip(raw_outputs, prompts)
+            ):
+                raise InvalidBenchmark(
+                    f"The prompts and the model outputs do not match. There were "
+                    f"{num_extra_outputs!r} extra outputs."
+                )
+            else:
+                logger.debug(
+                    f"Filtered out {num_extra_outputs:,} extra outputs from the model, "
+                    "which occured as we interupted the generation when we truncated "
+                    "the prompts."
+                )
+
         # Parse the raw model outputs
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
         if self.end_of_reasoning_token_id in completion_ids[0]:
             completion_ids = [
-                token_ids[token_ids.index(self.end_of_reasoning_token_id) +
+                token_ids[token_ids.index(self.end_of_reasoning_token_id) + 1 :]
                 if self.end_of_reasoning_token_id in token_ids
                 else token_ids
                 for token_ids in completion_ids
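The new ValueError handler retries after shortening the prompts. The core step, tokenising with truncation and decoding back to text so that prompt plus generation fits the context window, can be sketched in isolation as follows; the gpt2 tokenizer and the 256-token generation budget are illustrative stand-ins, not EuroEval's values:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative tokenizer
prompts = ["some very long prompt ..."] * 4
max_tokens = 256  # tokens reserved for the generation itself

# Cut every prompt down so that prompt + generation fits the model context.
encoded = tokenizer(
    text=prompts,
    truncation=True,
    max_length=max(tokenizer.model_max_length - max_tokens, 0),
)
prompts = tokenizer.batch_decode(encoded.input_ids, skip_special_tokens=True)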
@@ -435,6 +505,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         completions = [completion.strip() for completion in completions]

+        # Sanity check
+        if len(completions) != len(prompts):
+            raise InvalidBenchmark(
+                f"Expected {len(prompts):,} completions, but got {len(completions):,}."
+            )
+
         # Add logprobs scores to the output
         if self.buffer["first_label_token_mapping"]:
             scores: list[list[list[tuple[str, float]]]] = [
@@ -546,302 +622,6 @@ class VLLMModel(HuggingFaceEncoderModel):

         return model_config

-    def _extract_few_shot_examples(
-        self, dataset: DatasetDict, task: Task, itr_idx: int
-    ) -> list[dict[str, t.Any]]:
-        """Extract few-shot examples from a dataset.
-
-        This will always extract the examples from the training split.
-
-        We ensure that the few-shot examples are unique by picking them one at a time.
-
-        Args:
-            dataset:
-                The dataset to extract the few-shot examples from.
-            task:
-                The task that is being benchmarked.
-            itr_idx:
-                The index of the dataset in the iterator.
-
-        Returns:
-            The few-shot examples.
-        """
-        random_seed = 4242 + itr_idx
-        num_few_shots = self.dataset_config.num_few_shot_examples
-        few_shot_examples: list[dict[str, t.Any]] = list()
-        shuffled_train = dataset["train"].shuffle(seed=random_seed)
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                labels = it.cycle(self.dataset_config.labels)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: x["label"].lower() == label.lower()
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TEXT_TO_TEXT:
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["text"] != example["text"]
-                    )
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-                labels = it.cycle(
-                    [
-                        label.lower()
-                        for label in self.dataset_config.labels
-                        if label.lower().startswith("b-")
-                    ]
-                )
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    label = next(labels)
-                    possible_examples = shuffled_train.filter(
-                        lambda x: label in [tag.lower() for tag in x["labels"]]
-                    )
-                    if len(possible_examples) == 0:
-                        continue
-                    example = possible_examples.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["tokens"] != example["tokens"]
-                    )
-
-            case TaskGroup.QUESTION_ANSWERING:
-                # Locate the maximum number of tokens that constitutes a short example
-                for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
-                    train_with_short_examples = dataset["train"].filter(
-                        lambda example: len(example["context"]) < max_num_tokens
-                    )
-                    num_short_examples = len(train_with_short_examples)
-                    if num_short_examples >= self.dataset_config.num_few_shot_examples:
-                        break
-                else:
-                    raise InvalidBenchmark(
-                        "Could not find enough short examples for few-shot learning."
-                    )
-
-                shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
-                while (
-                    len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
-                ):
-                    example = shuffled_train.select(range(1))[0]
-                    few_shot_examples.append(example)
-                    shuffled_train = shuffled_train.filter(
-                        lambda x: x["context"] != example["context"]
-                    )
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        random.seed(random_seed)
-        random.shuffle(few_shot_examples)
-        return few_shot_examples
-
-    def _apply_prompt(
-        self,
-        examples: dict[str, t.Any],
-        few_shot_examples: list[dict[str, t.Any]],
-        task: Task,
-    ) -> dict[str, t.Any]:
-        """Apply prompt template to an example, potentially with few-shot examples.
-
-        Args:
-            examples:
-                The examples to apply the few-shot examples to.
-            few_shot_examples:
-                The few-shot examples to apply.
-            task:
-                The task that is being benchmarked.
-
-        Returns:
-            The example with the few-shot examples applied.
-        """
-
-        def create_prompt(**kwargs: str) -> tuple[str, str]:
-            """Create a prompt from the given keyword arguments.
-
-            Args:
-                kwargs:
-                    The keyword arguments to use in the prompt.
-
-            Returns:
-                A pair (prompt, label), where "label" is an empty string if the model is
-                not instruction tuned (as in this case it is included in the prompt).
-            """
-            label_key = "label" if "label" in kwargs else "target_text"
-            label = kwargs.pop(label_key)
-            assert label is not None, (
-                f"Found a None label for the prompt: {kwargs}. This should not happen."
-            )
-            label_mapping = self.dataset_config.prompt_label_mapping
-            label = label_mapping.get(label, label)
-            if self.buffer["instruction_model"]:
-                prompt = self.dataset_config.instruction_prompt.format(**kwargs)
-                return prompt, label
-            else:
-                kwargs[label_key] = label
-                return self.dataset_config.prompt_template.format(**kwargs), ""
-
-        match task.task_group:
-            case (
-                TaskGroup.SEQUENCE_CLASSIFICATION
-                | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
-            ):
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        label=example["label"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), label="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TEXT_TO_TEXT:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["text"].replace("\n", " ").strip(),
-                        target_text=example["target_text"].replace("\n", " ").strip(),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(text=text.replace("\n", " ").strip(), target_text="")
-                    for text in examples["text"]
-                ]
-
-            case TaskGroup.TOKEN_CLASSIFICATION:
-
-                def create_label(example: dict) -> str:
-                    prompt_labels = self.dataset_config.prompt_label_mapping.values()
-                    labels: dict[str, list[str]] = {
-                        prompt_label: list() for prompt_label in prompt_labels
-                    }
-                    for token, label in zip(example["tokens"], example["labels"]):
-                        label = label.lower()
-                        if label == "o":
-                            continue
-                        prompt_label = self.dataset_config.prompt_label_mapping[label]
-                        if label.startswith("b-"):
-                            labels[prompt_label].append(token)
-                        elif label.startswith("i-"):
-                            labels[prompt_label][-1] += " " + token
-                    return json.dumps(labels, ensure_ascii=False)
-
-                few_shot_sections = [
-                    create_prompt(
-                        text=" ".join(example["tokens"]).replace("\n", " ").strip(),
-                        label=create_label(example=example),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=" ".join(tokens).replace("\n", " ").strip(), label=""
-                    )
-                    for tokens in examples["tokens"]
-                ]
-
-            case TaskGroup.QUESTION_ANSWERING:
-                few_shot_sections = [
-                    create_prompt(
-                        text=example["context"].replace("\n", " ").strip(),
-                        question=example["question"].replace("\n", " ").strip(),
-                        label=example["answers"]["text"][0].replace("\n", " "),
-                    )
-                    for example in few_shot_examples
-                ]
-                new_sections = [
-                    create_prompt(
-                        text=context.replace("\n", " ").strip(),
-                        question=question.replace("\n", " ").strip(),
-                        label="",
-                    )
-                    for context, question in zip(
-                        examples["context"], examples["question"]
-                    )
-                ]
-
-            case _:
-                raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
-
-        if self.buffer["instruction_model"]:
-            few_shot_messages = [
-                dict(role=role, content=content)
-                for prompt, label in few_shot_sections
-                for role, content in [("user", prompt), ("assistant", label)]
-            ]
-
-            messages_list = [
-                few_shot_messages + [dict(role="user", content=prompt)]
-                for prompt, _ in new_sections
-            ]
-
-            # Pick the chat template that matches the language of the dataset, if such a
-            # template exists
-            chat_template: str | None = None
-            if isinstance(self._tokenizer.chat_template, dict):
-                language_codes = [
-                    language.code for language in self.dataset_config.languages
-                ]
-                for name, candidate_template in self._tokenizer.chat_template.items():
-                    if name.lower() in language_codes:
-                        chat_template = candidate_template
-                        log_once(
-                            f"Using the {name!r} chat template for the tokenizer.",
-                            level=logging.DEBUG,
-                        )
-                        break
-
-            texts = [
-                self._tokenizer.apply_chat_template(
-                    conversation=messages,
-                    tokenize=False,
-                    add_generation_prompt=True,
-                    chat_template=chat_template,
-                )
-                for messages in messages_list
-            ]
-
-            examples["text"] = texts
-
-        else:
-            prompt_prefix = ""
-            if self.dataset_config.prompt_prefix:
-                prompt_prefix = self.dataset_config.prompt_prefix + "\n\n"
-
-            few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
-            if few_shot_prompt:
-                few_shot_prompt += "\n\n"
-
-            examples["text"] = [
-                prompt_prefix + few_shot_prompt + new_prompt
-                for new_prompt, _ in new_sections
-            ]
-
-        return examples
-
     @property
     def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.
@@ -1169,7 +949,7 @@ def clear_vllm() -> None:


 def get_end_of_reasoning_token_id(
-    model: "LLM", tokenizer: "PreTrainedTokenizer"
+    model: "LLM", tokenizer: "PreTrainedTokenizer", model_id: str
 ) -> int | None:
     """Get the end of reasoning token ID for a generative model.

@@ -1182,6 +962,8 @@ def get_end_of_reasoning_token_id(
             The vLLM model.
         tokenizer:
             The tokenizer.
+        model_id:
+            The model ID.

     Returns:
         The end of reasoning token ID, or None if it could not be found.
@@ -1220,10 +1002,8 @@ def get_end_of_reasoning_token_id(
     completion_match = re.search(pattern=r"<\w+>", string=completion)
     if completion_match is None and prompt_match is None:
         log_once(
-            [removed lines 1223-1224 not captured in this diff view]
-                "reasoning model."
-            ),
+            f"Could not find a reasoning token for model {model_id!r}, so assuming "
+            "the model is not a reasoning model.",
             level=logging.DEBUG,
         )
         return None
@@ -1249,20 +1029,17 @@ def get_end_of_reasoning_token_id(
         or end_of_reasoning_token not in special_tokens
     ):
         log_once(
-            [removed lines 1252-1255 not captured in this diff view]
-            ),
+            f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+            f"token {end_of_reasoning_token!r} for model {model_id!r}, but one of "
+            "them is not registered as a special token, so assuming it is not a "
+            "real reasoning token.",
             level=logging.DEBUG,
         )
         return None

     log_once(
-        [removed lines 1262-1263 not captured in this diff view]
-            f"token {end_of_reasoning_token!r}."
-        ),
+        f"Detected reasoning token {reasoning_token!r} and end-of-reasoning "
+        f"token {end_of_reasoning_token!r} for model {model_id!r}.",
         level=logging.DEBUG,
     )

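The reworked log messages come from the same detection logic visible in the context lines: a "<token>"-style marker found with the <\w+> regex only counts as a reasoning token when the tokenizer registers it as a special token. A rough standalone illustration of that check, with an invented completion string and an arbitrary tokenizer:

import re

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative tokenizer
completion = "<think> reasoning goes here </think> The answer is 42."

match = re.search(pattern=r"<\w+>", string=completion)
if match is not None:
    candidate = match.group()
    # Only trust the marker if it is a registered special token of the tokenizer.
    is_reasoning_token = candidate in tokenizer.all_special_tokens
    print(candidate, is_reasoning_token)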
euroeval/benchmarker.py
CHANGED
@@ -774,6 +774,7 @@ class Benchmarker:
             metric_configs=dataset_config.task.metrics,
             scores=scores,
             model_id=model_config.model_id,
+            model_revision=model_config.revision,
         )

         record = BenchmarkResult(
@@ -782,7 +783,11 @@ class Benchmarker:
             dataset_languages=[
                 language.code for language in dataset_config.languages
             ],
-            model=
+            model=(
+                f"{model_config.model_id}@{model_config.revision}"
+                if model_config.revision and model_config.revision != "main"
+                else model_config.model_id
+            ),
             results=results,
             num_model_parameters=model.num_params,
             max_sequence_length=model.model_max_length,
@@ -1076,6 +1081,10 @@ def initial_logging(
         benchmark_config:
             The general benchmark configuration.
     """
+    model_id = model_config.model_id
+    if model_config.revision and model_config.revision != "main":
+        model_id += f"@{model_config.revision}"
+
    split_type = "validation" if not benchmark_config.evaluate_test_split else "test"
    if model_config.task in GENERATIVE_PIPELINE_TAGS:
        if benchmark_config.few_shot:
@@ -1084,8 +1093,9 @@ def initial_logging(
             eval_type = "Zero-shot benchmarking"
         else:
             eval_type = "Benchmarking"
+
         logger.info(
-            f"{eval_type} {
+            f"{eval_type} {model_id} on the {split_type} split of "
             f"{dataset_config.pretty_name}"
         )

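The benchmarker.py changes apply one convention in several places: the revision is appended to the model ID with an "@" separator, but only when it is set and differs from the default "main" branch. A tiny hypothetical helper capturing that rule:

def format_model_id(model_id: str, revision: str | None) -> str:
    # Append the revision only when it is set and is not the default branch.
    if revision and revision != "main":
        return f"{model_id}@{revision}"
    return model_id


assert format_model_id("org/model", "main") == "org/model"
assert format_model_id("org/model", None) == "org/model"
assert format_model_id("org/model", "abc1234") == "org/model@abc1234"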
@@ -1095,6 +1105,7 @@ def initial_logging(
             "meaning that the resulting evaluation will not be included in the "
             "official leaderboard."
         )
+
     if benchmark_config.debug:
         logger.info(
             "Running in debug mode. This will output additional information, as "
euroeval/constants.py
CHANGED
@@ -16,7 +16,7 @@ MAX_CONTEXT_LENGTH = 5_000

 # We need to raise the amount of tokens generated for reasoning models, to give them
 # time to think
-REASONING_MAX_TOKENS =
+REASONING_MAX_TOKENS = 32_768


 # The Hugging Face Hub pipeline tags used to classify models as generative
|