EuroEval 15.2.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (40)
  1. euroeval/__init__.py +72 -0
  2. euroeval/benchmark_config_factory.py +358 -0
  3. euroeval/benchmark_modules/__init__.py +7 -0
  4. euroeval/benchmark_modules/base.py +354 -0
  5. euroeval/benchmark_modules/fresh.py +286 -0
  6. euroeval/benchmark_modules/hf.py +1185 -0
  7. euroeval/benchmark_modules/litellm.py +905 -0
  8. euroeval/benchmark_modules/vllm.py +1171 -0
  9. euroeval/benchmarker.py +1074 -0
  10. euroeval/callbacks.py +72 -0
  11. euroeval/cli.py +281 -0
  12. euroeval/constants.py +50 -0
  13. euroeval/data_loading.py +96 -0
  14. euroeval/data_models.py +474 -0
  15. euroeval/dataset_configs.py +2001 -0
  16. euroeval/enums.py +144 -0
  17. euroeval/exceptions.py +191 -0
  18. euroeval/finetuning.py +324 -0
  19. euroeval/generation.py +296 -0
  20. euroeval/human_evaluation.py +737 -0
  21. euroeval/languages.py +200 -0
  22. euroeval/model_cache.py +253 -0
  23. euroeval/model_config.py +77 -0
  24. euroeval/model_loading.py +78 -0
  25. euroeval/scores.py +90 -0
  26. euroeval/speed_benchmark.py +124 -0
  27. euroeval/task_utils/__init__.py +1 -0
  28. euroeval/task_utils/multiple_choice_classification.py +176 -0
  29. euroeval/task_utils/question_answering.py +698 -0
  30. euroeval/task_utils/sequence_classification.py +237 -0
  31. euroeval/task_utils/text_to_text.py +150 -0
  32. euroeval/task_utils/token_classification.py +464 -0
  33. euroeval/tasks.py +202 -0
  34. euroeval/types.py +97 -0
  35. euroeval/utils.py +574 -0
  36. euroeval-15.2.0.dist-info/METADATA +234 -0
  37. euroeval-15.2.0.dist-info/RECORD +40 -0
  38. euroeval-15.2.0.dist-info/WHEEL +4 -0
  39. euroeval-15.2.0.dist-info/entry_points.txt +4 -0
  40. euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0
euroeval/benchmark_modules/vllm.py
@@ -0,0 +1,1171 @@
1
+ """Generative models using the vLLM inference framework."""
2
+
3
+ import collections.abc as c
4
+ import importlib.util
5
+ import itertools as it
6
+ import json
7
+ import logging
8
+ import os
9
+ import random
10
+ import re
11
+ import sys
12
+ import typing as t
13
+ from functools import partial
14
+ from pathlib import Path
15
+ from time import sleep
16
+ from types import MethodType
17
+
18
+ import torch
19
+ from datasets import DatasetDict
20
+ from huggingface_hub import snapshot_download
21
+ from pydantic import conlist, create_model
22
+ from tqdm.auto import tqdm
23
+ from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer, Trainer
24
+ from urllib3.exceptions import RequestError
25
+
26
+ from ..constants import (
27
+ GENERATIVE_PIPELINE_TAGS,
28
+ MAX_LOGPROBS,
29
+ MERGE_TAGS,
30
+ REASONING_MAX_TOKENS,
31
+ TASK_GROUPS_USING_LOGPROBS,
32
+ TASKS_USING_JSON,
33
+ )
34
+ from ..data_models import (
35
+ BenchmarkConfig,
36
+ DatasetConfig,
37
+ GenerativeModelOutput,
38
+ ModelConfig,
39
+ Task,
40
+ )
41
+ from ..enums import (
42
+ BatchingPreference,
43
+ GenerativeType,
44
+ InferenceBackend,
45
+ ModelType,
46
+ TaskGroup,
47
+ )
48
+ from ..exceptions import (
49
+ InvalidBenchmark,
50
+ InvalidModel,
51
+ NeedsEnvironmentVariable,
52
+ NeedsExtraInstalled,
53
+ )
54
+ from ..languages import get_all_languages
55
+ from ..task_utils import (
56
+ question_answering,
57
+ sequence_classification,
58
+ text_to_text,
59
+ token_classification,
60
+ )
61
+ from ..types import ExtractLabelsFunction
62
+ from ..utils import (
63
+ clear_memory,
64
+ create_model_cache_dir,
65
+ get_bos_token,
66
+ get_end_of_chat_token_ids,
67
+ get_eos_token,
68
+ log_once,
69
+ should_prompts_be_stripped,
70
+ )
71
+ from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
72
+
73
+ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
74
+ from vllm import LLM, RequestOutput, SamplingParams
75
+ from vllm.lora.request import LoRARequest
76
+ from vllm.sampling_params import GuidedDecodingParams
77
+
78
+ try:
79
+ from vllm.model_executor.parallel_utils.parallel_state import (
80
+ destroy_model_parallel,
81
+ )
82
+ except ImportError:
83
+ from vllm.distributed.parallel_state import destroy_model_parallel
84
+
85
+ if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
86
+ import ray
87
+
88
+ logger = logging.getLogger("euroeval")
89
+
90
+
91
+ class VLLMModel(HuggingFaceEncoderModel):
92
+ """A generative model using the vLLM inference framework."""
93
+
94
+ fresh_model = False
95
+ batching_preference = BatchingPreference.ALL_AT_ONCE
96
+ high_priority = True
97
+
98
+ def __init__(
99
+ self,
100
+ model_config: ModelConfig,
101
+ dataset_config: DatasetConfig,
102
+ benchmark_config: BenchmarkConfig,
103
+ ) -> None:
104
+ """Initialise the vLLM model.
105
+
106
+ Args:
107
+ model_config:
108
+ The model configuration.
109
+ dataset_config:
110
+ The dataset configuration.
111
+ benchmark_config:
112
+ The benchmark configuration.
113
+ """
114
+ if (
115
+ importlib.util.find_spec("vllm") is None
116
+ or importlib.util.find_spec("ray") is None
117
+ ):
118
+ raise NeedsExtraInstalled(extra="generative")
119
+
120
+ output_scores = dataset_config.task.task_group in TASK_GROUPS_USING_LOGPROBS
121
+ model, tokenizer = load_model_and_tokenizer(
122
+ model_config=model_config,
123
+ benchmark_config=benchmark_config,
124
+ output_scores=output_scores,
125
+ )
126
+ self._model: LLM = model
127
+ self._tokenizer: PreTrainedTokenizer = tokenizer
128
+ self.end_of_reasoning_token_id = get_end_of_reasoning_token_id(
129
+ model=self._model, tokenizer=self._tokenizer
130
+ )
131
+
132
+ # We specify `HuggingFaceEncoderModel` here instead of `VLLMModel`, as we want
133
+ # to call the `__init__` method of the `BenchmarkModule` class.
134
+ super(HuggingFaceEncoderModel, self).__init__(
135
+ model_config=model_config,
136
+ dataset_config=dataset_config,
137
+ benchmark_config=benchmark_config,
138
+ )
139
+
140
+ self.buffer["output_scores"] = output_scores
141
+ self.buffer["instruction_model"] = self._tokenizer.chat_template is not None
142
+ if self.model_config.adapter_base_model_id is not None:
143
+ adapter_path = snapshot_download(
144
+ repo_id=self.model_config.model_id,
145
+ cache_dir=Path(self.model_config.model_cache_dir),
146
+ )
147
+ self.buffer["lora_request"] = LoRARequest(
148
+ lora_name="adapter", lora_int_id=1, lora_path=adapter_path
149
+ )
150
+
151
+ @property
152
+ def generative_type(self) -> GenerativeType | None:
153
+ """Get the generative type of the model.
154
+
155
+ Returns:
156
+ The generative type of the model, or None if it has not been set yet.
157
+ """
158
+ if not hasattr(self, "_tokenizer"):
159
+ return None
160
+ elif self.end_of_reasoning_token_id is not None:
161
+ return GenerativeType.REASONING
162
+ elif self._tokenizer.chat_template is not None:
163
+ return GenerativeType.INSTRUCTION_TUNED
164
+ else:
165
+ return GenerativeType.BASE
166
+
167
+ @property
168
+ def extract_labels_from_generation(self) -> ExtractLabelsFunction:
169
+ """The function used to extract the labels from the generated output.
170
+
171
+ Returns:
172
+ The function used to extract the labels from the generated output.
173
+ """
174
+ match self.dataset_config.task.task_group:
175
+ case (
176
+ TaskGroup.SEQUENCE_CLASSIFICATION
177
+ | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
178
+ ):
179
+ return partial(
180
+ sequence_classification.extract_labels_from_generation,
181
+ dataset_config=self.dataset_config,
182
+ )
183
+ case TaskGroup.TEXT_TO_TEXT:
184
+ return text_to_text.extract_labels_from_generation
185
+ case TaskGroup.TOKEN_CLASSIFICATION:
186
+ return partial(
187
+ token_classification.extract_labels_from_generation,
188
+ dataset_config=self.dataset_config,
189
+ )
190
+ case TaskGroup.QUESTION_ANSWERING:
191
+ return question_answering.extract_labels_from_generation
192
+ case _:
193
+ raise NotImplementedError(
194
+ f"Unsupported task group: {self.dataset_config.task.task_group}."
195
+ )
196
+
197
+ def prepare_dataset(
198
+ self, dataset: DatasetDict, task: Task, itr_idx: int
199
+ ) -> DatasetDict:
200
+ """Prepare the dataset for the model.
201
+
202
+ This includes things like tokenisation.
203
+
204
+ Args:
205
+ dataset:
206
+ The dataset to prepare.
207
+ task:
208
+ The task to prepare the dataset for.
209
+ itr_idx:
210
+ The index of the dataset in the iterator.
211
+
212
+ Returns:
213
+ The prepared dataset.
214
+ """
215
+ if task.task_group == TaskGroup.QUESTION_ANSWERING:
216
+ dataset = dataset.map(
217
+ lambda examples: dict(
218
+ label=[
219
+ dict(
220
+ id=id,
221
+ answers=dict(
222
+ answer_start=answer_dct["answer_start"],
223
+ text=[
224
+ answer_text.lower()
225
+ for answer_text in answer_dct["text"]
226
+ ],
227
+ ),
228
+ )
229
+ for id, answer_dct in zip(examples["id"], examples["answers"])
230
+ ]
231
+ ),
232
+ batched=True,
233
+ load_from_cache_file=False,
234
+ keep_in_memory=True,
235
+ )
236
+
237
+ if self.benchmark_config.few_shot:
238
+ few_shot_examples = self._extract_few_shot_examples(
239
+ dataset=dataset, task=task, itr_idx=itr_idx
240
+ )
241
+ else:
242
+ few_shot_examples = list()
243
+
244
+ dataset["test"] = dataset["test"].map(
245
+ partial(self._apply_prompt, few_shot_examples=few_shot_examples, task=task),
246
+ batched=True,
247
+ load_from_cache_file=False,
248
+ keep_in_memory=True,
249
+ )
250
+
251
+ return dataset
252
+
253
+ def generate(self, inputs: dict) -> GenerativeModelOutput:
254
+ """Generate outputs from the model.
255
+
256
+ Args:
257
+ inputs:
258
+ A batch of inputs to pass through the model.
259
+
260
+ Returns:
261
+ The generated model outputs.
262
+ """
263
+ # Define which tokens to use as stopping criteria. We want to use the padding
264
+ # token, end-of-sentence token, and a double newline if the model isn't
265
+ # instruction tuned (since these separate the few-shot examples in the input in
266
+ # this case)
267
+ stop_tokens: list[str] = list()
268
+ if self.buffer["instruction_model"] is False:
269
+ stop_tokens.append("\n\n")
270
+ if self._tokenizer.pad_token_id is not None:
271
+ stop_tokens.append(self._tokenizer.pad_token)
272
+ if self._tokenizer.eos_token_id is not None:
273
+ stop_tokens.append(self._tokenizer.eos_token)
274
+ if self._tokenizer.pad_token_id is None:
275
+ self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
276
+ self._tokenizer.pad_token = self._tokenizer.eos_token
277
+ if (
278
+ self._tokenizer.bos_token_id is not None
279
+ and self._tokenizer.pad_token_id is None
280
+ ):
281
+ self._tokenizer.pad_token_id = self._tokenizer.bos_token_id
282
+ self._tokenizer.pad_token = self._tokenizer.bos_token
283
+ elif (
284
+ self._tokenizer.eos_token_id is not None
285
+ and self._tokenizer.pad_token_id is None
286
+ ):
287
+ self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
288
+ self._tokenizer.pad_token = self._tokenizer.eos_token
289
+ elif self._tokenizer.pad_token_id is None:
290
+ pad_token_candidates = ["<pad>", "[pad]", "<|endoftext|>", "<|im_end|>"]
291
+ pad_token_candidates.extend([c.upper() for c in pad_token_candidates])
292
+ for candidate in pad_token_candidates:
293
+ if candidate in self._tokenizer.get_vocab():
294
+ pad_token_id = self._tokenizer.get_vocab()[candidate]
295
+ self._tokenizer.pad_token = candidate
296
+ self._tokenizer.pad_token_id = pad_token_id
297
+ break
298
+ else:
299
+ raise InvalidModel(
300
+ "Could not find a suitable token to use as a padding token, since "
301
+ "the model does not have a BOS, EOS, or padding token, and does "
302
+ f"not have any of the following tokens in its vocabulary: "
303
+ f"{pad_token_candidates}."
304
+ )
305
+
306
+ assert self._tokenizer.pad_token_id is not None
307
+
308
+ # Add end of chat token as a stopping token, if it exists
309
+ end_of_chat_token_ids = get_end_of_chat_token_ids(tokenizer=self._tokenizer)
310
+ if end_of_chat_token_ids is not None:
311
+ end_of_chat_token = self._tokenizer.decode(end_of_chat_token_ids).strip()
312
+ if end_of_chat_token:
313
+ stop_tokens.append(end_of_chat_token)
314
+
315
+ if self.dataset_config.task in TASKS_USING_JSON:
316
+ ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
317
+ keys_and_their_types: dict[str, t.Any] = {
318
+ tag_name: (conlist(str, max_length=5), ...)
319
+ for tag_name in ner_tag_names
320
+ }
321
+ pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
322
+ schema = pydantic_class.model_json_schema()
323
+ guided_decoding = GuidedDecodingParams(
324
+ json=schema, backend="outlines", whitespace_pattern=r" ?"
325
+ )
326
+ else:
327
+ guided_decoding = None
328
+
329
+ # Define the parameters used for vLLM generation
330
+ max_tokens: int = (
331
+ REASONING_MAX_TOKENS
332
+ if self.generative_type == GenerativeType.REASONING
333
+ else self.dataset_config.max_generated_tokens
334
+ )
335
+ sampling_params = SamplingParams(
336
+ max_tokens=max_tokens,
337
+ logprobs=MAX_LOGPROBS if self.buffer["output_scores"] else None,
338
+ temperature=0.0,
339
+ stop=[stop_token for stop_token in stop_tokens if stop_token],
340
+ guided_decoding=guided_decoding,
341
+ )
342
+
343
+ # If any of the prompts are empty then we need to replace them with a BOS token
344
+ # so that the vLLM model can generate from them
345
+ prompts: list[str] = inputs["text"]
346
+ if any(len(prompt) == 0 for prompt in prompts):
347
+ logger.debug("Found empty prompts, replacing with BOS token.")
348
+ prompts = [
349
+ prompt if len(prompt) > 0 else str(self._tokenizer.bos_token)
350
+ for prompt in prompts
351
+ ]
352
+
353
+ # Strip the prompts if the model's tokeniser requires it
354
+ labels_to_be_generated = list(self.dataset_config.prompt_label_mapping.values())
355
+ if len(labels_to_be_generated) == 0:
356
+ labels_to_be_generated = ["negative", "positive"]
357
+ if not self.buffer.get(
358
+ "instruction_model", False
359
+ ) and should_prompts_be_stripped(
360
+ labels_to_be_generated=labels_to_be_generated, tokenizer=self._tokenizer
361
+ ):
362
+ log_once(message="Stripping prompts.", level=logging.DEBUG)
363
+ prompts = [prompt.strip() for prompt in prompts]
364
+
365
+ # Generate sequences using vLLM
366
+ input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
367
+ raw_outputs = self._model.generate(
368
+ prompts=prompts,
369
+ sampling_params=sampling_params,
370
+ use_tqdm=(not input_is_a_test),
371
+ lora_request=self.buffer.get("lora_request"),
372
+ )
373
+ completion_ids: list[list[int]] = [
374
+ output.outputs[0].token_ids for output in raw_outputs
375
+ ]
376
+ if self.end_of_reasoning_token_id in completion_ids[0]:
377
+ completion_ids = [
378
+ token_ids[token_ids.index(self.end_of_reasoning_token_id) + 2 :]
379
+ if self.end_of_reasoning_token_id in token_ids
380
+ else token_ids
381
+ for token_ids in completion_ids
382
+ ]
383
+ completions = self._tokenizer.batch_decode(
384
+ sequences=[
385
+ torch.LongTensor(completion_id) for completion_id in completion_ids
386
+ ],
387
+ skip_special_tokens=True,
388
+ )
389
+ completions = [completion.strip() for completion in completions]
390
+
391
+ # Add logprobs scores to the output
392
+ if self.buffer["output_scores"]:
393
+ scores: list[list[list[tuple[str, float]]]] = [
394
+ [
395
+ [
396
+ (obj.decoded_token, obj.logprob)
397
+ for obj in token_logprobs_dict.values()
398
+ ]
399
+ for token_logprobs_dict in raw_output.outputs[0].logprobs
400
+ ]
401
+ for raw_output in raw_outputs
402
+ ]
403
+ scores = [
404
+ score_list[
405
+ raw_output.outputs[0].token_ids.index(
406
+ self.end_of_reasoning_token_id
407
+ )
408
+ + 2 :
409
+ ]
410
+ if self.end_of_reasoning_token_id in raw_output.outputs[0].token_ids
411
+ else score_list
412
+ for raw_output, score_list in zip(raw_outputs, scores)
413
+ ]
414
+ output = GenerativeModelOutput(sequences=completions, scores=scores)
415
+ else:
416
+ output = GenerativeModelOutput(sequences=completions)
417
+
418
+ return output
419
+
420
+ @classmethod
421
+ def model_exists(
422
+ cls, model_id: str, benchmark_config: BenchmarkConfig
423
+ ) -> bool | NeedsExtraInstalled | NeedsEnvironmentVariable:
424
+ """Check if a model exists.
425
+
426
+ Args:
427
+ model_id:
428
+ The model ID.
429
+ benchmark_config:
430
+ The benchmark configuration.
431
+
432
+ Returns:
433
+ Whether the model exists, or an error describing why we cannot check
434
+ whether the model exists.
435
+ """
436
+ using_api = (
437
+ benchmark_config.api_base is not None
438
+ or benchmark_config.api_version is not None
439
+ )
440
+ if using_api:
441
+ return False
442
+
443
+ model_id, revision = (
444
+ model_id.split("@") if "@" in model_id else (model_id, "main")
445
+ )
446
+ model_info = get_model_repo_info(
447
+ model_id=model_id, revision=revision, benchmark_config=benchmark_config
448
+ )
449
+ return (
450
+ model_info is not None
451
+ and model_info.pipeline_tag in GENERATIVE_PIPELINE_TAGS
452
+ )
453
+
454
+ @classmethod
455
+ def get_model_config(
456
+ cls, model_id: str, benchmark_config: BenchmarkConfig
457
+ ) -> ModelConfig:
458
+ """Fetch the model configuration.
459
+
460
+ Args:
461
+ model_id:
462
+ The model ID.
463
+ benchmark_config:
464
+ The benchmark configuration.
465
+
466
+ Returns:
467
+ The model configuration.
468
+ """
469
+ model_id, revision = (
470
+ model_id.split("@") if "@" in model_id else (model_id, "main")
471
+ )
472
+ model_info = get_model_repo_info(
473
+ model_id=model_id, revision=revision, benchmark_config=benchmark_config
474
+ )
475
+ if model_info is None:
476
+ raise InvalidModel(f"The model {model_id!r} could not be found.")
477
+
478
+ language_mapping = get_all_languages()
479
+ language_codes = list(language_mapping.keys())
480
+
481
+ model_config = ModelConfig(
482
+ model_id=model_id,
483
+ revision=revision,
484
+ task=model_info.pipeline_tag,
485
+ languages=[
486
+ language_mapping[tag]
487
+ for tag in model_info.tags
488
+ if tag in language_codes
489
+ ],
490
+ merge=any(tag in model_info.tags for tag in MERGE_TAGS),
491
+ inference_backend=InferenceBackend.VLLM,
492
+ model_type=ModelType.GENERATIVE,
493
+ fresh=False,
494
+ model_cache_dir=create_model_cache_dir(
495
+ cache_dir=benchmark_config.cache_dir, model_id=model_id
496
+ ),
497
+ adapter_base_model_id=model_info.adapter_base_model_id,
498
+ )
499
+
500
+ return model_config
501
+
502
+ def _extract_few_shot_examples(
503
+ self, dataset: DatasetDict, task: Task, itr_idx: int
504
+ ) -> list[dict[str, t.Any]]:
505
+ """Extract few-shot examples from a dataset.
506
+
507
+ This will always extract the examples from the training split.
508
+
509
+ We ensure that the few-shot examples are unique by picking them one at a time.
510
+
511
+ Args:
512
+ dataset:
513
+ The dataset to extract the few-shot examples from.
514
+ task:
515
+ The task that is being benchmarked.
516
+ itr_idx:
517
+ The index of the dataset in the iterator.
518
+
519
+ Returns:
520
+ The few-shot examples.
521
+ """
522
+ random_seed = 4242 + itr_idx
523
+ num_few_shots = self.dataset_config.num_few_shot_examples
524
+ few_shot_examples: list[dict[str, t.Any]] = list()
525
+ shuffled_train = dataset["train"].shuffle(seed=random_seed)
526
+
527
+ match task.task_group:
528
+ case (
529
+ TaskGroup.SEQUENCE_CLASSIFICATION
530
+ | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
531
+ ):
532
+ labels = it.cycle(self.dataset_config.labels)
533
+ while (
534
+ len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
535
+ ):
536
+ label = next(labels)
537
+ possible_examples = shuffled_train.filter(
538
+ lambda x: x["label"].lower() == label.lower()
539
+ )
540
+ if len(possible_examples) == 0:
541
+ continue
542
+ example = possible_examples.select(range(1))[0]
543
+ few_shot_examples.append(example)
544
+ shuffled_train = shuffled_train.filter(
545
+ lambda x: x["text"] != example["text"]
546
+ )
547
+
548
+ case TaskGroup.TEXT_TO_TEXT:
549
+ while (
550
+ len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
551
+ ):
552
+ example = shuffled_train.select(range(1))[0]
553
+ few_shot_examples.append(example)
554
+ shuffled_train = shuffled_train.filter(
555
+ lambda x: x["text"] != example["text"]
556
+ )
557
+
558
+ case TaskGroup.TOKEN_CLASSIFICATION:
559
+ labels = it.cycle(
560
+ [
561
+ label.lower()
562
+ for label in self.dataset_config.labels
563
+ if label.lower().startswith("b-")
564
+ ]
565
+ )
566
+ while (
567
+ len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
568
+ ):
569
+ label = next(labels)
570
+ possible_examples = shuffled_train.filter(
571
+ lambda x: label in [tag.lower() for tag in x["labels"]]
572
+ )
573
+ if len(possible_examples) == 0:
574
+ continue
575
+ example = possible_examples.select(range(1))[0]
576
+ few_shot_examples.append(example)
577
+ shuffled_train = shuffled_train.filter(
578
+ lambda x: x["tokens"] != example["tokens"]
579
+ )
580
+
581
+ case TaskGroup.QUESTION_ANSWERING:
582
+ # Locate the maximum number of tokens that constitutes a short example
583
+ for max_num_tokens in [512, 1024, 2048, 4096, 8192]:
584
+ train_with_short_examples = dataset["train"].filter(
585
+ lambda example: len(example["context"]) < max_num_tokens
586
+ )
587
+ num_short_examples = len(train_with_short_examples)
588
+ if num_short_examples >= self.dataset_config.num_few_shot_examples:
589
+ break
590
+ else:
591
+ raise InvalidBenchmark(
592
+ "Could not find enough short examples for few-shot learning."
593
+ )
594
+
595
+ shuffled_train = train_with_short_examples.shuffle(seed=random_seed)
596
+ while (
597
+ len(few_shot_examples) < num_few_shots and len(shuffled_train) > 0
598
+ ):
599
+ example = shuffled_train.select(range(1))[0]
600
+ few_shot_examples.append(example)
601
+ shuffled_train = shuffled_train.filter(
602
+ lambda x: x["context"] != example["context"]
603
+ )
604
+
605
+ case _:
606
+ raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
607
+
608
+ random.seed(random_seed)
609
+ random.shuffle(few_shot_examples)
610
+ return few_shot_examples
611
+
612
+ def _apply_prompt(
613
+ self,
614
+ examples: dict[str, t.Any],
615
+ few_shot_examples: list[dict[str, t.Any]],
616
+ task: Task,
617
+ ) -> dict[str, t.Any]:
618
+ """Apply prompt template to an example, potentially with few-shot examples.
619
+
620
+ Args:
621
+ examples:
622
+ The examples to apply the few-shot examples to.
623
+ few_shot_examples:
624
+ The few-shot examples to apply.
625
+ task:
626
+ The task that is being benchmarked.
627
+
628
+ Returns:
629
+ The example with the few-shot examples applied.
630
+ """
631
+
632
+ def create_prompt(**kwargs: str) -> tuple[str, str]:
633
+ """Create a prompt from the given keyword arguments.
634
+
635
+ Args:
636
+ kwargs:
637
+ The keyword arguments to use in the prompt.
638
+
639
+ Returns:
640
+ A pair (prompt, label), where "label" is an empty string if the model is
641
+ not instruction tuned (since in that case the label is already part of the prompt).
642
+ """
643
+ label_key = "label" if "label" in kwargs else "target_text"
644
+ label = kwargs.pop(label_key)
645
+ assert label is not None, (
646
+ f"Found a None label for the prompt: {kwargs}. This should not happen."
647
+ )
648
+ label_mapping = self.dataset_config.prompt_label_mapping
649
+ label = label_mapping.get(label, label)
650
+ if self.buffer["instruction_model"]:
651
+ prompt = self.dataset_config.instruction_prompt.format(**kwargs)
652
+ return prompt, label
653
+ else:
654
+ kwargs[label_key] = label
655
+ return self.dataset_config.prompt_template.format(**kwargs), ""
656
+
657
+ match task.task_group:
658
+ case (
659
+ TaskGroup.SEQUENCE_CLASSIFICATION
660
+ | TaskGroup.MULTIPLE_CHOICE_CLASSIFICATION
661
+ ):
662
+ few_shot_sections = [
663
+ create_prompt(
664
+ text=example["text"].replace("\n", " ").strip(),
665
+ label=example["label"].replace("\n", " ").strip(),
666
+ )
667
+ for example in few_shot_examples
668
+ ]
669
+ new_sections = [
670
+ create_prompt(text=text.replace("\n", " ").strip(), label="")
671
+ for text in examples["text"]
672
+ ]
673
+
674
+ case TaskGroup.TEXT_TO_TEXT:
675
+ few_shot_sections = [
676
+ create_prompt(
677
+ text=example["text"].replace("\n", " ").strip(),
678
+ target_text=example["target_text"].replace("\n", " ").strip(),
679
+ )
680
+ for example in few_shot_examples
681
+ ]
682
+ new_sections = [
683
+ create_prompt(text=text.replace("\n", " ").strip(), target_text="")
684
+ for text in examples["text"]
685
+ ]
686
+
687
+ case TaskGroup.TOKEN_CLASSIFICATION:
688
+
689
+ def create_label(example: dict) -> str:
690
+ prompt_labels = self.dataset_config.prompt_label_mapping.values()
691
+ labels: dict[str, list[str]] = {
692
+ prompt_label: list() for prompt_label in prompt_labels
693
+ }
694
+ for token, label in zip(example["tokens"], example["labels"]):
695
+ label = label.lower()
696
+ if label == "o":
697
+ continue
698
+ prompt_label = self.dataset_config.prompt_label_mapping[label]
699
+ if label.startswith("b-"):
700
+ labels[prompt_label].append(token)
701
+ elif label.startswith("i-"):
702
+ labels[prompt_label][-1] += " " + token
703
+ return json.dumps(labels, ensure_ascii=False)
704
+
705
+ few_shot_sections = [
706
+ create_prompt(
707
+ text=" ".join(example["tokens"]).replace("\n", " ").strip(),
708
+ label=create_label(example=example),
709
+ )
710
+ for example in few_shot_examples
711
+ ]
712
+ new_sections = [
713
+ create_prompt(
714
+ text=" ".join(tokens).replace("\n", " ").strip(), label=""
715
+ )
716
+ for tokens in examples["tokens"]
717
+ ]
718
+
719
+ case TaskGroup.QUESTION_ANSWERING:
720
+ few_shot_sections = [
721
+ create_prompt(
722
+ text=example["context"].replace("\n", " ").strip(),
723
+ question=example["question"].replace("\n", " ").strip(),
724
+ label=example["answers"]["text"][0].replace("\n", " "),
725
+ )
726
+ for example in few_shot_examples
727
+ ]
728
+ new_sections = [
729
+ create_prompt(
730
+ text=context.replace("\n", " ").strip(),
731
+ question=question.replace("\n", " ").strip(),
732
+ label="",
733
+ )
734
+ for context, question in zip(
735
+ examples["context"], examples["question"]
736
+ )
737
+ ]
738
+
739
+ case _:
740
+ raise NotImplementedError(f"Unsupported task group: {task.task_group}.")
741
+
742
+ if self.buffer["instruction_model"]:
743
+ few_shot_messages = [
744
+ dict(role=role, content=content)
745
+ for prompt, label in few_shot_sections
746
+ for role, content in [("user", prompt), ("assistant", label)]
747
+ ]
748
+
749
+ messages_list = [
750
+ few_shot_messages + [dict(role="user", content=prompt)]
751
+ for prompt, _ in new_sections
752
+ ]
753
+
754
+ # Pick the chat template that matches the language of the dataset, if such a
755
+ # template exists
756
+ chat_template: str | None = None
757
+ if isinstance(self._tokenizer.chat_template, dict):
758
+ language_codes = [
759
+ language.code for language in self.dataset_config.languages
760
+ ]
761
+ for name, candidate_template in self._tokenizer.chat_template.items():
762
+ if name.lower() in language_codes:
763
+ chat_template = candidate_template
764
+ log_once(
765
+ f"Using the {name!r} chat template for the tokenizer.",
766
+ level=logging.DEBUG,
767
+ )
768
+ break
769
+
770
+ texts = [
771
+ self._tokenizer.apply_chat_template(
772
+ conversation=messages,
773
+ tokenize=False,
774
+ add_generation_prompt=True,
775
+ chat_template=chat_template,
776
+ )
777
+ for messages in messages_list
778
+ ]
779
+
780
+ examples["text"] = texts
781
+
782
+ else:
783
+ prompt_prefix = ""
784
+ if self.dataset_config.prompt_prefix:
785
+ prompt_prefix = self.dataset_config.prompt_prefix + "\n\n"
786
+
787
+ few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
788
+ if few_shot_prompt:
789
+ few_shot_prompt += "\n\n"
790
+
791
+ examples["text"] = [
792
+ prompt_prefix + few_shot_prompt + new_prompt
793
+ for new_prompt, _ in new_sections
794
+ ]
795
+
796
+ return examples
797
+
798
+ @property
799
+ def data_collator(self) -> c.Callable[[list[t.Any]], dict[str, t.Any]]:
800
+ """The data collator used to prepare samples during finetuning.
801
+
802
+ Returns:
803
+ The data collator.
804
+ """
805
+ raise NotImplementedError(
806
+ "The `data_collator` property has not been implemented for vLLM models."
807
+ )
808
+
809
+ @property
810
+ def trainer_class(self) -> t.Type["Trainer"]:
811
+ """The Trainer class to use for finetuning.
812
+
813
+ Returns:
814
+ The Trainer class.
815
+ """
816
+ raise NotImplementedError(
817
+ "The `trainer_class` property has not been implemented for vLLM models."
818
+ )
819
+
820
+
821
+ def load_model_and_tokenizer(
822
+ model_config: ModelConfig, benchmark_config: BenchmarkConfig, output_scores: bool
823
+ ) -> "tuple[LLM, PreTrainedTokenizer]":
824
+ """Load the model and tokenizer.
825
+
826
+ Args:
827
+ model_config:
828
+ The model configuration.
829
+ benchmark_config:
830
+ The benchmark configuration.
831
+ output_scores:
832
+ Whether to output scores.
833
+
834
+ Returns:
835
+ The loaded model and tokenizer.
836
+ """
837
+ # Prefer base model ID if the model is an adapter - the adapter will be added on
838
+ # during inference in this case
839
+ model_id = model_config.adapter_base_model_id or model_config.model_id
840
+
841
+ hf_model_config = load_hf_model_config(
842
+ model_id=model_id,
843
+ num_labels=0,
844
+ id2label=dict(),
845
+ label2id=dict(),
846
+ revision=model_config.revision,
847
+ model_cache_dir=model_config.model_cache_dir,
848
+ api_key=benchmark_config.api_key,
849
+ trust_remote_code=benchmark_config.trust_remote_code,
850
+ run_with_cli=benchmark_config.run_with_cli,
851
+ )
852
+
853
+ quantization = None
854
+ if hasattr(hf_model_config, "quantization_config"):
855
+ quantization = hf_model_config.quantization_config.get("quant_method")
856
+
857
+ # The quantised models require extra dependencies
858
+ if quantization == "gptq" and (
859
+ importlib.util.find_spec("auto_gptq") is None
860
+ or importlib.util.find_spec("optimum") is None
861
+ ):
862
+ raise NeedsExtraInstalled(extra="quantization")
863
+ if quantization == "awq" and importlib.util.find_spec("awq") is None:
864
+ raise NeedsExtraInstalled(extra="quantization")
865
+
866
+ dtype: str | torch.dtype = "auto"
867
+ if quantization is not None and hf_model_config.torch_dtype != torch.float16:
868
+ logger.info(
869
+ "You are loading a quantized model with dtype "
870
+ f"{hf_model_config.torch_dtype}, which vLLM does not support. Setting "
871
+ "dtype to float16 instead."
872
+ )
873
+ dtype = torch.float16
874
+
875
+ if model_config.adapter_base_model_id is not None:
876
+ download_dir = str(Path(model_config.model_cache_dir) / "base_model")
877
+ else:
878
+ download_dir = str(model_config.model_cache_dir)
879
+
880
+ potential_max_model_length_config_names = [
881
+ "max_position_embeddings",
882
+ "max_sequence_length",
883
+ "model_max_length",
884
+ "sliding_window",
885
+ "sliding_window_size",
886
+ "n_positions",
887
+ ]
888
+ true_max_model_len_candidates: list[int] = list()
889
+ for config_name in potential_max_model_length_config_names:
890
+ if hasattr(hf_model_config, config_name):
891
+ model_len = getattr(hf_model_config, config_name)
892
+ if model_len is not None:
893
+ true_max_model_len_candidates.append(model_len)
894
+
895
+ if len(true_max_model_len_candidates) > 0:
896
+ true_max_model_len = min(true_max_model_len_candidates)
897
+ else:
898
+ true_max_model_len = 5_000
899
+
900
+ clear_vllm()
901
+
902
+ executor_backend = "ray" if torch.cuda.device_count() > 1 else "mp"
903
+
904
+ try:
905
+ model = LLM(
906
+ model=model_id,
907
+ tokenizer=model_id,
908
+ gpu_memory_utilization=0.95,
909
+ max_model_len=min(true_max_model_len, 5_000),
910
+ download_dir=download_dir,
911
+ trust_remote_code=benchmark_config.trust_remote_code,
912
+ revision=model_config.revision,
913
+ seed=4242,
914
+ distributed_executor_backend=executor_backend,
915
+ tensor_parallel_size=torch.cuda.device_count(),
916
+ disable_custom_all_reduce=True,
917
+ quantization=quantization,
918
+ dtype=dtype,
919
+ enforce_eager=True,
920
+ max_logprobs=MAX_LOGPROBS if output_scores else None,
921
+ # TEMP: Prefix caching isn't supported with sliding window in vLLM yet,
922
+ # so we disable it for now
923
+ enable_prefix_caching=False,
924
+ enable_lora=model_config.adapter_base_model_id is not None,
925
+ max_lora_rank=256,
926
+ )
927
+ except (ValueError, OSError) as e:
928
+ if "awaiting a review from the repo authors" in str(e):
929
+ raise InvalidModel(
930
+ f"The model {model_id!r} is awaiting a review from the repository "
931
+ "authors. Please try again later."
932
+ )
933
+ elif "trust_remote_code" in str(e):
934
+ raise InvalidModel(
935
+ f"Loading the model {model_id!r} needs to trust remote code. "
936
+ "If you trust the suppliers of this model, then you can enable "
937
+ "this by setting the `--trust-remote-code` flag."
938
+ )
939
+ raise InvalidModel(
940
+ f"The model {model_id!r} could not be loaded. The error was {e!r}."
941
+ )
942
+
943
+ model._run_engine = MethodType(_run_engine_with_fixed_progress_bars, model)
944
+ model.config = hf_model_config
945
+
946
+ tokenizer = load_tokenizer(
947
+ model_id=model_config.model_id,
948
+ revision=model_config.revision,
949
+ adapter_base_model_id=model_config.adapter_base_model_id,
950
+ trust_remote_code=benchmark_config.trust_remote_code,
951
+ model_max_length=true_max_model_len,
952
+ model_cache_dir=model_config.model_cache_dir,
953
+ token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
954
+ )
955
+
956
+ return model, tokenizer
957
+
958
+
959
+ def load_tokenizer(
960
+ model_id: str,
961
+ revision: str,
962
+ adapter_base_model_id: str | None,
963
+ trust_remote_code: bool,
964
+ model_max_length: int,
965
+ model_cache_dir: str,
966
+ token: str | bool,
967
+ ) -> "PreTrainedTokenizer":
968
+ """Load the tokenizer.
969
+
970
+ Args:
971
+ model_id:
972
+ The model identifier.
973
+ revision:
974
+ The revision of the model.
975
+ adapter_base_model_id:
976
+ The base model ID for the adapter model. Can be None if the model is not an
977
+ adapter model.
978
+ trust_remote_code:
979
+ Whether to trust remote code.
980
+ model_max_length:
981
+ The maximum length of the model.
982
+ model_cache_dir:
983
+ The cache directory for the model.
984
+ token:
985
+ The Hugging Face API token.
986
+
987
+ Returns:
988
+ The loaded tokenizer.
989
+ """
990
+ config = AutoConfig.from_pretrained(
991
+ adapter_base_model_id or model_id,
992
+ revision=revision,
993
+ cache_dir=model_cache_dir,
994
+ token=token,
995
+ trust_remote_code=trust_remote_code,
996
+ )
997
+ num_retries = 5
998
+ for _ in range(num_retries):
999
+ try:
1000
+ tokenizer = AutoTokenizer.from_pretrained(
1001
+ model_id,
1002
+ use_fast=True,
1003
+ verbose=False,
1004
+ trust_remote_code=trust_remote_code,
1005
+ padding_side="left",
1006
+ truncation_side="left",
1007
+ model_max_length=model_max_length,
1008
+ config=config,
1009
+ token=token,
1010
+ )
1011
+ break
1012
+ except (json.JSONDecodeError, OSError, TypeError) as e:
1013
+ if adapter_base_model_id is None or model_id == adapter_base_model_id:
1014
+ raise InvalidModel(
1015
+ f"Could not load tokenizer for model {model_id!r}. The error was "
1016
+ f"{str(e)}."
1017
+ )
1018
+ logger.debug(
1019
+ f"Could not load tokenizer for {model_id!r}. Falling back to "
1020
+ f"{adapter_base_model_id!r}."
1021
+ )
1022
+ model_id = adapter_base_model_id
1023
+ except (TimeoutError, RequestError):
1024
+ logger.info(f"Couldn't load tokenizer for {model_id!r}. Retrying.")
1025
+ sleep(5)
1026
+ continue
1027
+ else:
1028
+ raise InvalidModel(
1029
+ f"Could not load tokenizer for model {model_id!r} after {num_retries} "
1030
+ "attempts."
1031
+ )
1032
+
1033
+ # Ensure that BOS, EOS and PAD tokens are set
1034
+ tokenizer.bos_token, tokenizer.bos_token_id = get_bos_token(tokenizer=tokenizer)
1035
+ tokenizer.eos_token, tokenizer.eos_token_id = get_eos_token(tokenizer=tokenizer)
1036
+ if tokenizer.pad_token_id is None:
1037
+ tokenizer.pad_token = tokenizer.eos_token
1038
+
1039
+ return tokenizer
1040
+
1041
+
1042
+ def _run_engine_with_fixed_progress_bars(
1043
+ self: "LLM", use_tqdm: bool
1044
+ ) -> list["RequestOutput"]:
1045
+ if use_tqdm:
1046
+ num_requests = self.llm_engine.get_num_unfinished_requests()
1047
+ pbar = tqdm(
1048
+ total=num_requests, leave=False, disable=hasattr(sys, "_called_from_test")
1049
+ )
1050
+ else:
1051
+ pbar = None
1052
+
1053
+ # Run the engine.
1054
+ outputs: list["RequestOutput"] = list()
1055
+ while self.llm_engine.has_unfinished_requests():
1056
+ step_outputs = self.llm_engine.step()
1057
+ for output in step_outputs:
1058
+ if output.finished:
1059
+ outputs.append(output)
1060
+ if pbar is not None:
1061
+ pbar.update(1)
1062
+
1063
+ if pbar is not None:
1064
+ pbar.close()
1065
+
1066
+ # Sort the outputs by request ID. This is necessary because some requests may
1067
+ # finish earlier than requests that were submitted before them.
1068
+ outputs = sorted(outputs, key=lambda x: int(x.request_id))
1069
+
1070
+ return outputs
1071
+
1072
+
1073
+ def clear_vllm() -> None:
1074
+ """Clear the GPU memory used by the vLLM model, enabling re-initialisation."""
1075
+ try:
1076
+ destroy_model_parallel()
1077
+ except ImportError:
1078
+ pass
1079
+ clear_memory()
1080
+ if ray.is_initialized():
1081
+ ray.shutdown()
1082
+
1083
+
1084
+ def get_end_of_reasoning_token_id(
1085
+ model: "LLM", tokenizer: "PreTrainedTokenizer"
1086
+ ) -> int | None:
1087
+ """Get the end of reasoning token ID for a generative model.
1088
+
1089
+ This assumes that the reasoning token is of the form <X> and that the end of
1090
+ reasoning token is </X> (for X being any string without spaces).
1091
+
1092
+ Args:
1093
+ model:
1094
+ The vLLM model.
1095
+ tokenizer:
1096
+ The tokenizer.
1097
+
1098
+ Returns:
1099
+ The end of reasoning token ID, or None if it could not be found.
1100
+ """
1101
+ if tokenizer.chat_template is None:
1102
+ prompt = "What is your name?"
1103
+ else:
1104
+ prompt = tokenizer.apply_chat_template(
1105
+ conversation=[dict(role="user", content="What is your name?")],
1106
+ add_generation_prompt=True,
1107
+ tokenize=False,
1108
+ )
1109
+
1110
+ # Generate a completion and remove the BOS token from it, so that it is not
1111
+ # mistaken for a potential reasoning token
1112
+ completion = (
1113
+ model.generate(
1114
+ prompts=[prompt],
1115
+ sampling_params=SamplingParams(max_tokens=3, temperature=0.0),
1116
+ use_tqdm=False,
1117
+ )[0]
1118
+ .outputs[0]
1119
+ .text
1120
+ )
1121
+ if tokenizer.bos_token is not None:
1122
+ completion = completion.replace(tokenizer.bos_token, "").strip()
1123
+
1124
+ # If it doesn't contain a reasoning token, we can't find the end of reasoning token
1125
+ match = re.search(pattern=r"<\w+>", string=completion)
1126
+ if match is None:
1127
+ log_once(
1128
+ message=(
1129
+ "Could not find a reasoning token, so assuming the model is not a "
1130
+ "reasoning model."
1131
+ ),
1132
+ level=logging.DEBUG,
1133
+ )
1134
+ return None
1135
+
1136
+ # Check that the found reasoning token and its associated end-of-reasoning tokens
1137
+ # are both special tokens
1138
+ reasoning_token = match.group()
1139
+ end_of_reasoning_token = f"</{reasoning_token[1:-1]}>"
1140
+ special_tokens = [
1141
+ decoder_token.content
1142
+ for decoder_token in tokenizer.added_tokens_decoder.values()
1143
+ ]
1144
+ special_tokens.extend(
1145
+ [encoder_token for encoder_token in tokenizer.added_tokens_encoder.keys()]
1146
+ )
1147
+ special_tokens.extend(tokenizer.all_special_tokens)
1148
+ if (
1149
+ reasoning_token not in special_tokens
1150
+ or end_of_reasoning_token not in special_tokens
1151
+ ):
1152
+ log_once(
1153
+ message=(
1154
+ f"Detected reasoning token {reasoning_token!r} and end of reasoning "
1155
+ f"token {end_of_reasoning_token!r}, but one of them is not registered "
1156
+ "as a special token, so assuming it is not a real reasoning token."
1157
+ ),
1158
+ level=logging.DEBUG,
1159
+ )
1160
+ return None
1161
+
1162
+ log_once(
1163
+ message=f"Detected reasoning token {reasoning_token!r}.", level=logging.DEBUG
1164
+ )
1165
+
1166
+ # Encode the end of reasoning token and return its ID
1167
+ end_of_reasoning_token_id = tokenizer.encode(
1168
+ text=end_of_reasoning_token, add_special_tokens=False
1169
+ )[0]
1170
+
1171
+ return end_of_reasoning_token_id
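
Two pieces of the listing above are easier to follow when pulled out into a standalone sketch. The snippet below is not part of the package, and the NER tag names and the probe completion are invented for illustration: part (a) rebuilds the Pydantic "AnswerFormat" schema that VLLMModel.generate() hands to vLLM's guided JSON decoding, and part (b) shows the regex heuristic that get_end_of_reasoning_token_id() uses to derive an end-of-reasoning token such as </think> from a marker like <think>.

import re

from pydantic import conlist, create_model

# (a) Guided JSON decoding: one list-of-strings field (at most five entries) per
# NER tag, mirroring the create_model("AnswerFormat", ...) call in generate().
ner_tag_names = ["person", "location", "organization"]  # hypothetical tag names
fields = {tag: (conlist(str, max_length=5), ...) for tag in ner_tag_names}
AnswerFormat = create_model("AnswerFormat", **fields)
schema = AnswerFormat.model_json_schema()
print(sorted(schema["properties"]))  # ['location', 'organization', 'person']

# (b) Reasoning-token detection: look for an <X> marker in a short probe
# completion and derive the matching </X> end-of-reasoning token from it.
completion = "<think>\nOkay, the user wants to know my name."  # made-up output
match = re.search(pattern=r"<\w+>", string=completion)
if match is not None:
    reasoning_token = match.group()                         # "<think>"
    end_of_reasoning_token = f"</{reasoning_token[1:-1]}>"  # "</think>"
    print(reasoning_token, end_of_reasoning_token)

In the module itself, the schema from (a) is passed to GuidedDecodingParams(json=schema, backend="outlines", whitespace_pattern=r" ?"), and the token pair from (b) is only trusted if both the marker and its closing counterpart are registered as special tokens in the tokenizer.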