vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/core/scheduler.py ADDED
@@ -0,0 +1,1163 @@
1
+ import enum
2
+ import os
3
+ import random
4
+ import time
5
+ from collections import deque
6
+ from dataclasses import dataclass, field
7
+ from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union
8
+
9
+ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
10
+ from vllm.core.interfaces import AllocStatus, BlockSpaceManager
11
+ from vllm.core.policy import Policy, PolicyFactory
12
+ from vllm.logger import init_logger
13
+ from vllm.lora.request import LoRARequest
14
+ from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
15
+ SequenceGroupMetadata, SequenceStatus)
16
+ from vllm.utils import merge_dicts
17
+
18
+ logger = init_logger(__name__)
19
+
20
+ # Test-only. If configured, decode requests are artificially
21
+ # preempted with probability ARTIFICIAL_PREEMPTION_PROB.
22
+ ENABLE_ARTIFICIAL_PREEMPT = bool(
23
+ os.getenv("VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT", False)) # noqa
24
+ ARTIFICIAL_PREEMPTION_PROB = 0.5
25
+ ARTIFICIAL_PREEMPTION_MAX_CNT = 500
26
+
27
+
28
+ class PreemptionMode(enum.Enum):
29
+ """Preemption modes.
30
+
31
+ 1. Swapping: Swap out the blocks of the preempted sequences to CPU memory
32
+ and swap them back in when the sequences are resumed.
33
+ 2. Recomputation: Discard the blocks of the preempted sequences and
34
+ recompute them when the sequences are resumed, treating the sequences as
35
+ new prompts.
36
+ """
37
+ SWAP = enum.auto()
38
+ RECOMPUTE = enum.auto()
39
+
40
+
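# Illustrative sketch: how these two modes are chosen by `_preempt()` later in
# this file -- recomputation for single-sequence groups, swapping for
# multi-sequence groups such as beam search (`mode` is a hypothetical local):
#
#     if seq_group.get_max_num_running_seqs() == 1:
#         mode = PreemptionMode.RECOMPUTE  # discard blocks, re-prefill later
#     else:
#         mode = PreemptionMode.SWAP       # move blocks to CPU memory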
41
+ @dataclass
42
+ class SchedulingBudget:
43
+ """The available slots for scheduling.
44
+
45
+ TODO(sang): Right now, the budget is request_id-aware, meaning it can ignore
46
+ budget updates from the same request_id. This is because, in the normal
47
+ scheduling path, we update RUNNING num_seqs ahead of time, so it could be
48
+ updated more than once when scheduling RUNNING requests. Since this won't
49
+ happen if we only have chunked prefill scheduling, we can remove this
50
+ feature from the API when chunked prefill is enabled by default.
51
+ """
52
+ token_budget: int
53
+ max_num_seqs: int
54
+ _request_ids_num_batched_tokens: Set[str] = field(default_factory=set)
55
+ _request_ids_num_curr_seqs: Set[str] = field(default_factory=set)
56
+ _num_batched_tokens: int = 0
57
+ _num_curr_seqs: int = 0
58
+
59
+ def can_schedule(self, *, num_new_tokens: int, num_new_seqs: int):
60
+ assert num_new_tokens != 0
61
+ assert num_new_seqs != 0
62
+ return (self.num_batched_tokens + num_new_tokens <= self.token_budget
63
+ and self.num_curr_seqs + num_new_seqs <= self.max_num_seqs)
64
+
65
+ def remaining_token_budget(self):
66
+ return self.token_budget - self.num_batched_tokens
67
+
68
+ def add_num_batched_tokens(self, req_id: str, num_batched_tokens: int):
69
+ if req_id in self._request_ids_num_batched_tokens:
70
+ return
71
+
72
+ self._request_ids_num_batched_tokens.add(req_id)
73
+ self._num_batched_tokens += num_batched_tokens
74
+
75
+ def subtract_num_batched_tokens(self, req_id: str,
76
+ num_batched_tokens: int):
77
+ if req_id in self._request_ids_num_batched_tokens:
78
+ self._request_ids_num_batched_tokens.remove(req_id)
79
+ self._num_batched_tokens -= num_batched_tokens
80
+
81
+ def add_num_seqs(self, req_id: str, num_curr_seqs: int):
82
+ if req_id in self._request_ids_num_curr_seqs:
83
+ return
84
+
85
+ self._request_ids_num_curr_seqs.add(req_id)
86
+ self._num_curr_seqs += num_curr_seqs
87
+
88
+ def subtract_num_seqs(self, req_id: str, num_curr_seqs: int):
89
+ if req_id in self._request_ids_num_curr_seqs:
90
+ self._request_ids_num_curr_seqs.remove(req_id)
91
+ self._num_curr_seqs -= num_curr_seqs
92
+
93
+ @property
94
+ def num_batched_tokens(self):
95
+ return self._num_batched_tokens
96
+
97
+ @property
98
+ def num_curr_seqs(self):
99
+ return self._num_curr_seqs
100
+
101
+
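# Illustrative sketch of the request_id-aware accounting described in the
# docstring above: repeated updates for the same request_id are ignored
# (values are hypothetical):
#
#     budget = SchedulingBudget(token_budget=2048, max_num_seqs=256)
#     budget.add_num_batched_tokens("req-0", 512)
#     budget.add_num_batched_tokens("req-0", 512)  # no-op: same request_id
#     assert budget.num_batched_tokens == 512
#     assert budget.remaining_token_budget() == 1536
#     assert budget.can_schedule(num_new_tokens=1024, num_new_seqs=1)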
102
+ @dataclass
103
+ class ScheduledSequenceGroup:
104
+ # A sequence group that's scheduled.
105
+ seq_group: SequenceGroup
106
+ # The total chunk size (number of tokens) to process for next iteration.
107
+ # 1 for decoding. Same as prompt tokens for prefill, but if prefill is
108
+ # chunked, it can be smaller than that.
109
+ token_chunk_size: int
110
+
111
+
112
+ @dataclass
113
+ class SchedulerOutputs:
114
+ """The scheduling decision made from a scheduler."""
115
+ # Scheduled sequence groups.
116
+ scheduled_seq_groups: Iterable[ScheduledSequenceGroup]
117
+ # Number of prefill groups scheduled.
118
+ num_prefill_groups: int
119
+ # Total number of batched tokens.
120
+ num_batched_tokens: int
121
+ # Blocks to swap in. Dict of CPU -> GPU block number.
122
+ blocks_to_swap_in: Dict[int, int]
123
+ # Blocks to swap out. Dict of GPU -> CPU block number.
124
+ blocks_to_swap_out: Dict[int, int]
125
+ # Blocks to copy. Source to a list of dest blocks.
126
+ blocks_to_copy: Dict[int, List[int]]
127
+ # Sequence groups that are going to be ignored.
128
+ ignored_seq_groups: List[SequenceGroup]
129
+ # The number of slots for lookahead decoding.
130
+ num_lookahead_slots: int
131
+ # The number of requests in the running queue
132
+ running_queue_size: int
133
+
134
+ def __post_init__(self):
135
+ # Swap in and swap out should never happen at the same time.
136
+ assert not (self.blocks_to_swap_in and self.blocks_to_swap_out)
137
+
138
+ self.num_loras: int = len(self.lora_requests)
139
+ if self.num_loras > 0:
140
+ self._sort_by_lora_ids()
141
+
142
+ def is_empty(self) -> bool:
143
+ # NOTE: We do not consider the ignored sequence groups.
144
+ return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
145
+ and not self.blocks_to_swap_out and not self.blocks_to_copy)
146
+
147
+ def _sort_by_lora_ids(self):
148
+ self.scheduled_seq_groups = sorted(
149
+ self.scheduled_seq_groups,
150
+ key=lambda g: (g.seq_group.lora_int_id, g.seq_group.request_id))
151
+
152
+ @property
153
+ def lora_requests(self) -> Set[LoRARequest]:
154
+ return {
155
+ g.seq_group.lora_request
156
+ for g in self.scheduled_seq_groups
157
+ if g.seq_group.lora_request is not None
158
+ }
159
+
160
+
161
+ @dataclass
162
+ class SchedulerRunningOutputs:
163
+ """The requests that are scheduled from a running queue.
164
+
165
+ Could contain prefills (possibly chunked) or decodes. If there is not
166
+ enough memory, requests can be preempted (for recompute) or swapped out.
167
+ """
168
+ # Selected sequences that are running and in a decoding phase.
169
+ decode_seq_groups: List[SequenceGroup]
170
+ # Selected sequences that are running and in a prefill phase.
171
+ # I.e., it means the prefill has been chunked.
172
+ prefill_seq_groups: List[SequenceGroup]
173
+ # The preempted sequences.
174
+ preempted: List[SequenceGroup]
175
+ # Sequences that are swapped out.
176
+ swapped_out: List[SequenceGroup]
177
+ # The blocks to swap out.
178
+ blocks_to_swap_out: Dict[int, int]
179
+ # The blocks to copy.
180
+ blocks_to_copy: Dict[int, List[int]]
181
+ # The number of slots for lookahead decoding.
182
+ num_lookahead_slots: int
183
+
184
+ @classmethod
185
+ def create_empty(cls) -> "SchedulerRunningOutputs":
186
+ return SchedulerRunningOutputs(
187
+ decode_seq_groups=[],
188
+ prefill_seq_groups=[],
189
+ preempted=[],
190
+ swapped_out=[],
191
+ blocks_to_swap_out={},
192
+ blocks_to_copy={},
193
+ num_lookahead_slots=0,
194
+ )
195
+
196
+
197
+ @dataclass
198
+ class SchedulerSwappedInOutputs:
199
+ """The requests that are scheduled from a swap queue.
200
+
201
+ Could contain prefills (possibly chunked) or decodes.
202
+ """
203
+ # Selected sequences that are going to be swapped in and are in a
204
+ # decoding phase.
205
+ decode_seq_groups: List[SequenceGroup]
206
+ # Selected sequences that are going to be swapped in and in a prefill
207
+ # phase. I.e., it means the prefill has been chunked.
208
+ prefill_seq_groups: List[SequenceGroup]
209
+ # The blocks to swap in.
210
+ blocks_to_swap_in: Dict[int, int]
211
+ # The blocks to copy.
212
+ blocks_to_copy: Dict[int, List[int]]
213
+ # The number of slots for lookahead decoding.
214
+ num_lookahead_slots: int
215
+ # Infeasible sequence groups.
216
+ infeasible_seq_groups: List[SequenceGroup]
217
+
218
+ @classmethod
219
+ def create_empty(cls) -> "SchedulerSwappedInOutputs":
220
+ return SchedulerSwappedInOutputs(
221
+ decode_seq_groups=[],
222
+ prefill_seq_groups=[],
223
+ blocks_to_swap_in={},
224
+ blocks_to_copy={},
225
+ num_lookahead_slots=0,
226
+ infeasible_seq_groups=[],
227
+ )
228
+
229
+
230
+ @dataclass
231
+ class SchedulerPrefillOutputs:
232
+ """The requests that are scheduled from a waiting queue.
233
+
234
+ Could contain fresh prefill requests or preempted requests that need
235
+ to be recomputed from scratch.
236
+ """
237
+ # Selected sequences for prefill.
238
+ seq_groups: List[SequenceGroup]
239
+ # Ignored sequence groups.
240
+ ignored_seq_groups: List[SequenceGroup]
241
+ num_lookahead_slots: int
242
+
243
+ @classmethod
244
+ def create_empty(cls) -> "SchedulerPrefillOutputs":
245
+ return SchedulerPrefillOutputs(
246
+ seq_groups=[],
247
+ ignored_seq_groups=[],
248
+ num_lookahead_slots=0,
249
+ )
250
+
251
+
252
+ class Scheduler:
253
+
254
+ def __init__(
255
+ self,
256
+ scheduler_config: SchedulerConfig,
257
+ cache_config: CacheConfig,
258
+ lora_config: Optional[LoRAConfig],
259
+ ) -> None:
260
+ self.scheduler_config = scheduler_config
261
+ self.cache_config = cache_config
262
+ # Note for LoRA scheduling: the current policy is extremely
263
+ # simple and NOT fair. It can lead to starvation of some
264
+ # LoRAs. This should be improved in the future.
265
+ self.lora_config = lora_config
266
+
267
+ if self.scheduler_config.chunked_prefill_enabled:
268
+ self.prompt_limit = self.scheduler_config.max_model_len
269
+ else:
270
+ self.prompt_limit = min(
271
+ self.scheduler_config.max_model_len,
272
+ self.scheduler_config.max_num_batched_tokens)
273
+
274
+ BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class(
275
+ version="v2" if self.scheduler_config.
276
+ use_v2_block_manager else "v1")
277
+
278
+ # Create the block space manager.
279
+ self.block_manager = BlockSpaceManagerImpl(
280
+ block_size=self.cache_config.block_size,
281
+ num_gpu_blocks=self.cache_config.num_gpu_blocks,
282
+ num_cpu_blocks=self.cache_config.num_cpu_blocks,
283
+ sliding_window=self.cache_config.sliding_window,
284
+ enable_caching=self.cache_config.enable_prefix_caching)
285
+
286
+ # Sequence groups in the WAITING state.
287
+ # Contain new prefill or preempted requests.
288
+ self.waiting: Deque[SequenceGroup] = deque()
289
+ # Sequence groups in the RUNNING state.
290
+ # Contain decode requests.
291
+ self.running: Deque[SequenceGroup] = deque()
292
+ # Sequence groups in the SWAPPED state.
293
+ # Contain decode requests that are swapped out.
294
+ self.swapped: Deque[SequenceGroup] = deque()
295
+
296
+ # Time at previous scheduling step
297
+ self.prev_time = 0.0
298
+ # Did we schedule a prompt at previous step?
299
+ self.prev_prompt = False
300
+ # Latency of the last prompt step
301
+ self.last_prompt_latency = 0.0
302
+
303
+ # The following field is test-only. It is used to inject artificial
304
+ # preemption.
305
+ self.enable_artificial_preemption = ENABLE_ARTIFICIAL_PREEMPT
306
+ self.artificial_preempt_cnt = (ARTIFICIAL_PREEMPTION_MAX_CNT
307
+ if self.enable_artificial_preemption
308
+ else 0)
309
+
310
+ @property
311
+ def lora_enabled(self) -> bool:
312
+ return bool(self.lora_config)
313
+
314
+ @property
315
+ def num_decoding_tokens_per_seq(self) -> int:
316
+ """The number of new tokens."""
317
+ return 1
318
+
319
+ def add_seq_group(self, seq_group: SequenceGroup) -> None:
320
+ # Add sequence groups to the waiting queue.
321
+ self.waiting.append(seq_group)
322
+
323
+ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
324
+ """Aborts a sequence group with the given ID.
325
+
326
+ Check if the sequence group with the given ID
327
+ is present in any of the state queue.
328
+ If present, remove the sequence group from the state queue.
329
+ Also, if any of the sequences in the sequence group is not finished,
330
+ free the sequence with status `FINISHED_ABORTED`.
331
+ Otherwise, do nothing.
332
+
333
+ Args:
334
+ request_id: The ID(s) of the sequence group to abort.
335
+ """
336
+ if isinstance(request_id, str):
337
+ request_id = (request_id, )
338
+ request_ids = set(request_id)
339
+ for state_queue in [self.waiting, self.running, self.swapped]:
340
+ aborted_groups: List[SequenceGroup] = []
341
+ for seq_group in state_queue:
342
+ if not request_ids:
343
+ # Using 'break' here may add two extra iterations,
344
+ # but is acceptable to reduce complexity.
345
+ break
346
+ if seq_group.request_id in request_ids:
347
+ # Appending aborted group into pending list.
348
+ aborted_groups.append(seq_group)
349
+ request_ids.remove(seq_group.request_id)
350
+ for aborted_group in aborted_groups:
351
+ # Remove the sequence group from the state queue.
352
+ state_queue.remove(aborted_group)
353
+ for seq in aborted_group.get_seqs():
354
+ if seq.is_finished():
355
+ continue
356
+ seq.status = SequenceStatus.FINISHED_ABORTED
357
+ self.free_seq(seq)
358
+
359
+ def has_unfinished_seqs(self) -> bool:
360
+ return len(self.waiting) != 0 or len(self.running) != 0 or len(
361
+ self.swapped) != 0
362
+
363
+ def get_num_unfinished_seq_groups(self) -> int:
364
+ return len(self.waiting) + len(self.running) + len(self.swapped)
365
+
366
+ def _schedule_running(
367
+ self,
368
+ running_queue: deque,
369
+ budget: SchedulingBudget,
370
+ curr_loras: Optional[Set[int]],
371
+ policy: Policy,
372
+ enable_chunking: bool = False,
373
+ ) -> Tuple[deque, SchedulerRunningOutputs]:
374
+ """Schedule sequence groups that are running.
375
+
376
+ Running queue should include decode and chunked prefill requests.
377
+
378
+ Args:
379
+ running_queue: The queue that contains running requests (i.e.,
380
+ decodes). The given argument is NOT modified in place.
381
+ budget: The scheduling budget. The argument is in-place updated
382
+ when any decodes are preempted.
383
+ curr_loras: Currently batched lora request ids. The argument is
384
+ in-place updated when any decodes are preempted.
385
+ policy: The sorting policy to sort running_queue.
386
+ enable_chunking: If True, seq group can be chunked and only a
387
+ chunked number of tokens are scheduled if
388
+ `budget.num_batched_tokens` has not enough capacity to schedule
389
+ all tokens.
390
+
391
+ Returns:
392
+ A tuple of the remaining running queue (should always be empty) after
393
+ scheduling and SchedulerRunningOutputs.
394
+ """
395
+ # Blocks that need to be swapped or copied before model execution.
396
+ blocks_to_swap_out: Dict[int, int] = {}
397
+ blocks_to_copy: Dict[int, List[int]] = {}
398
+
399
+ decode_seq_groups: List[ScheduledSequenceGroup] = []
400
+ prefill_seq_groups: List[ScheduledSequenceGroup] = []
401
+ preempted: List[SequenceGroup] = []
402
+ swapped_out: List[SequenceGroup] = []
403
+
404
+ # NOTE(woosuk): Preemption happens only when there is no available slot
405
+ # to keep all the sequence groups in the RUNNING state.
406
+ # In this case, the policy is responsible for deciding which sequence
407
+ # groups to preempt.
408
+ now = time.time()
409
+ running_queue = policy.sort_by_priority(now, running_queue)
410
+ while running_queue:
411
+ seq_group = running_queue[0]
412
+ num_running_tokens = self._get_num_new_tokens(
413
+ seq_group, SequenceStatus.RUNNING, enable_chunking, budget)
414
+
415
+ if num_running_tokens == 0:
416
+ break
417
+
418
+ running_queue.popleft()
419
+ while not self._can_append_slots(seq_group):
420
+ budget.subtract_num_batched_tokens(seq_group.request_id,
421
+ num_running_tokens)
422
+ num_running_seqs = seq_group.get_max_num_running_seqs()
423
+ budget.subtract_num_seqs(seq_group.request_id,
424
+ num_running_seqs)
425
+ if curr_loras is not None and seq_group.lora_int_id > 0:
426
+ curr_loras.remove(seq_group.lora_int_id)
427
+
428
+ if running_queue:
429
+ # Preempt the lowest-priority sequence groups.
430
+ victim_seq_group = running_queue.pop()
431
+ preempted_mode = self._preempt(victim_seq_group,
432
+ blocks_to_swap_out)
433
+ if preempted_mode == PreemptionMode.RECOMPUTE:
434
+ preempted.append(victim_seq_group)
435
+ else:
436
+ swapped_out.append(victim_seq_group)
437
+ else:
438
+ # No other sequence groups can be preempted.
439
+ # Preempt the current sequence group.
440
+ preempted_mode = self._preempt(seq_group,
441
+ blocks_to_swap_out)
442
+ if preempted_mode == PreemptionMode.RECOMPUTE:
443
+ preempted.append(seq_group)
444
+ else:
445
+ swapped_out.append(seq_group)
446
+ break
447
+ else:
448
+ self._append_slots(seq_group, blocks_to_copy)
449
+ is_prefill = seq_group.is_prefill()
450
+ if is_prefill:
451
+ prefill_seq_groups.append(
452
+ ScheduledSequenceGroup(
453
+ seq_group=seq_group,
454
+ token_chunk_size=num_running_tokens))
455
+ else:
456
+ decode_seq_groups.append(
457
+ ScheduledSequenceGroup(seq_group=seq_group,
458
+ token_chunk_size=1))
459
+ budget.add_num_batched_tokens(seq_group.request_id,
460
+ num_running_tokens)
461
+ # OPTIMIZATION: Note that get_max_num_running_seqs is
462
+ # expensive. For the default scheduling case where
463
+ # enable_chunking is False, num_seqs are updated before running
464
+ # this method, so we don't have to update it again here.
465
+ if enable_chunking:
466
+ num_running_seqs = seq_group.get_max_num_running_seqs()
467
+ budget.add_num_seqs(seq_group.request_id, num_running_seqs)
468
+ if curr_loras is not None and seq_group.lora_int_id > 0:
469
+ curr_loras.add(seq_group.lora_int_id)
470
+
471
+ return running_queue, SchedulerRunningOutputs(
472
+ decode_seq_groups=decode_seq_groups,
473
+ prefill_seq_groups=prefill_seq_groups,
474
+ preempted=preempted,
475
+ swapped_out=swapped_out,
476
+ blocks_to_swap_out=blocks_to_swap_out,
477
+ blocks_to_copy=blocks_to_copy,
478
+ num_lookahead_slots=self._get_num_lookahead_slots(
479
+ is_prefill=False))
480
+
481
+ def _schedule_swapped(
482
+ self,
483
+ swapped_queue: deque,
484
+ budget: SchedulingBudget,
485
+ curr_loras: Optional[Set[int]],
486
+ policy: Policy,
487
+ enable_chunking: bool = False,
488
+ ) -> Tuple[deque, SchedulerSwappedInOutputs]:
489
+ """Schedule sequence groups that are swapped out.
490
+
491
+ It schedules swapped requests as long as it fits `budget` and
492
+ curr_loras <= max_lora from the scheduling config. The input arguments
493
+ `budget` and `curr_loras` are updated based on scheduled seq_groups.
494
+
495
+ Args:
496
+ swapped_queue: The queue that contains swapped out requests.
497
+ The given argument is NOT modified in place.
498
+ budget: The scheduling budget. The argument is in-place updated
499
+ when any requests are swapped in.
500
+ curr_loras: Currently batched lora request ids. The argument is
501
+ in-place updated when any requests are swapped in.
502
+ policy: The sorting policy to sort swapped_queue.
503
+ enable_chunking: If True, seq group can be chunked and only a
504
+ chunked number of tokens are scheduled if
505
+ `budget.num_batched_tokens` has not enough capacity to schedule
506
+ all tokens.
507
+
508
+ Returns:
509
+ A tuple of remaining swapped_queue after scheduling and
510
+ SchedulerSwappedInOutputs.
511
+ """
512
+ # Blocks that need to be swapped or copied before model execution.
513
+ blocks_to_swap_in: Dict[int, int] = {}
514
+ blocks_to_copy: Dict[int, List[int]] = {}
515
+ decode_seq_groups: List[ScheduledSequenceGroup] = []
516
+ prefill_seq_groups: List[ScheduledSequenceGroup] = []
517
+ now = time.time()
518
+ swapped_queue = policy.sort_by_priority(now, swapped_queue)
519
+ infeasible_seq_groups: List[SequenceGroup] = []
520
+
521
+ leftover_swapped: Deque[SequenceGroup] = deque()
522
+ while swapped_queue:
523
+ seq_group = swapped_queue[0]
524
+
525
+ # If the sequence group cannot be swapped in, stop.
526
+ alloc_status = self.block_manager.can_swap_in(seq_group)
527
+ if alloc_status == AllocStatus.LATER:
528
+ break
529
+ elif alloc_status == AllocStatus.NEVER:
530
+ logger.warning(
531
+ "Failing the request %s because there's not enough kv "
532
+ "cache blocks to run the entire sequence.",
533
+ seq_group.request_id)
534
+ for seq in seq_group.get_seqs():
535
+ seq.status = SequenceStatus.FINISHED_IGNORED
536
+ infeasible_seq_groups.append(seq_group)
537
+ swapped_queue.popleft()
538
+ continue
539
+
540
+ lora_int_id = 0
541
+ if self.lora_enabled:
542
+ lora_int_id = seq_group.lora_int_id
543
+ assert curr_loras is not None
544
+ assert self.lora_config is not None
545
+ if (lora_int_id > 0 and (lora_int_id not in curr_loras)
546
+ and len(curr_loras) >= self.lora_config.max_loras):
547
+ # We don't have a space for another LoRA, so
548
+ # we ignore this request for now.
549
+ leftover_swapped.appendleft(seq_group)
550
+ swapped_queue.popleft()
551
+ continue
552
+
553
+ # The total number of sequences in the RUNNING state should not
554
+ # exceed the maximum number of sequences.
555
+ num_new_seqs = seq_group.get_max_num_running_seqs()
556
+ num_new_tokens = self._get_num_new_tokens(seq_group,
557
+ SequenceStatus.SWAPPED,
558
+ enable_chunking, budget)
559
+
560
+ if (num_new_tokens == 0
561
+ or not budget.can_schedule(num_new_tokens=num_new_tokens,
562
+ num_new_seqs=num_new_seqs)):
563
+ break
564
+
565
+ if lora_int_id > 0 and curr_loras is not None:
566
+ curr_loras.add(lora_int_id)
567
+ swapped_queue.popleft()
568
+ self._swap_in(seq_group, blocks_to_swap_in)
569
+ self._append_slots(seq_group, blocks_to_copy)
570
+ is_prefill = seq_group.is_prefill()
571
+ if is_prefill:
572
+ prefill_seq_groups.append(
573
+ ScheduledSequenceGroup(seq_group,
574
+ token_chunk_size=num_new_tokens))
575
+ else:
576
+ decode_seq_groups.append(
577
+ ScheduledSequenceGroup(seq_group, token_chunk_size=1))
578
+ budget.add_num_batched_tokens(seq_group.request_id, num_new_tokens)
579
+ budget.add_num_seqs(seq_group.request_id, num_new_seqs)
580
+
581
+ swapped_queue.extendleft(leftover_swapped)
582
+
583
+ return swapped_queue, SchedulerSwappedInOutputs(
584
+ decode_seq_groups=decode_seq_groups,
585
+ prefill_seq_groups=prefill_seq_groups,
586
+ blocks_to_swap_in=blocks_to_swap_in,
587
+ blocks_to_copy=blocks_to_copy,
588
+ num_lookahead_slots=self._get_num_lookahead_slots(
589
+ is_prefill=False),
590
+ infeasible_seq_groups=infeasible_seq_groups,
591
+ )
592
+
593
+ def _schedule_prefills(
594
+ self,
595
+ waiting_queue: deque,
596
+ budget: SchedulingBudget,
597
+ curr_loras: Optional[Set[int]],
598
+ enable_chunking: bool = False,
599
+ ) -> Tuple[deque, SchedulerPrefillOutputs]:
600
+ """Schedule sequence groups that are in prefill stage.
601
+
602
+ Note that the current scheduler treats PREEMPTED_FOR_RECOMPUTE
603
+ as a new prefill (one that starts from the beginning and covers the
604
+ most recently generated tokens as well).
605
+
606
+ It schedules waiting requests as long as it fits `budget` and
607
+ curr_loras <= max_lora from the scheduling config. The input arguments
608
+ `budget` and `curr_loras` are updated based on scheduled seq_groups.
609
+
610
+ Args:
611
+ waiting_queue: The queue that contains prefill requests.
612
+ The given argument is NOT modified in place.
613
+ budget: The scheduling budget. The argument is in-place updated
614
+ when any requests are scheduled.
615
+ curr_loras: Currently batched lora request ids. The argument is
616
+ in-place updated when any requests are scheduled.
617
+ enable_chunking: If True, seq group can be chunked and only a
618
+ chunked number of tokens are scheduled if
619
+ `budget.num_batched_tokens` does not have enough capacity to schedule
620
+ all tokens.
621
+
622
+ Returns:
623
+ A tuple of remaining waiting_queue after scheduling and
624
+ SchedulerPrefillOutputs.
625
+ """
626
+ ignored_seq_groups: List[SequenceGroup] = []
627
+ seq_groups: List[SequenceGroup] = []
628
+ # We don't sort waiting queue because we assume it is sorted.
629
+ # Copy the queue so that the input queue is not modified.
630
+ waiting_queue = deque([s for s in waiting_queue])
631
+
632
+ leftover_waiting_sequences: Deque[SequenceGroup] = deque()
633
+ while self._passed_delay(time.time()) and waiting_queue:
634
+ seq_group = waiting_queue[0]
635
+
636
+ waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
637
+ assert len(waiting_seqs) == 1, (
638
+ "Waiting sequence group should have only one prompt "
639
+ "sequence.")
640
+ num_new_tokens = self._get_num_new_tokens(seq_group,
641
+ SequenceStatus.WAITING,
642
+ enable_chunking, budget)
643
+ if not enable_chunking:
644
+ num_prompt_tokens = waiting_seqs[0].get_len()
645
+ assert num_new_tokens == num_prompt_tokens
646
+
647
+ if num_new_tokens > self.prompt_limit:
648
+ logger.warning(
649
+ "Input prompt (%d tokens) is too long"
650
+ " and exceeds limit of %d", num_new_tokens,
651
+ self.prompt_limit)
652
+ for seq in waiting_seqs:
653
+ seq.status = SequenceStatus.FINISHED_IGNORED
654
+ ignored_seq_groups.append(seq_group)
655
+ waiting_queue.popleft()
656
+ continue
657
+
658
+ # If the sequence group cannot be allocated, stop.
659
+ can_allocate = self.block_manager.can_allocate(seq_group)
660
+ if can_allocate == AllocStatus.LATER:
661
+ break
662
+ elif can_allocate == AllocStatus.NEVER:
663
+ logger.warning(
664
+ "Input prompt (%d tokens) is too long"
665
+ " and exceeds the capacity of block_manager",
666
+ num_new_tokens)
667
+ for seq in waiting_seqs:
668
+ seq.status = SequenceStatus.FINISHED_IGNORED
669
+ ignored_seq_groups.append(seq_group)
670
+ waiting_queue.popleft()
671
+ continue
672
+
673
+ lora_int_id = 0
674
+ if self.lora_enabled:
675
+ lora_int_id = seq_group.lora_int_id
676
+ assert curr_loras is not None
677
+ assert self.lora_config is not None
678
+ if (self.lora_enabled and lora_int_id > 0
679
+ and lora_int_id not in curr_loras
680
+ and len(curr_loras) >= self.lora_config.max_loras):
681
+ # We don't have a space for another LoRA, so
682
+ # we ignore this request for now.
683
+ leftover_waiting_sequences.appendleft(seq_group)
684
+ waiting_queue.popleft()
685
+ continue
686
+
687
+ num_new_seqs = seq_group.get_max_num_running_seqs()
688
+ if (num_new_tokens == 0
689
+ or not budget.can_schedule(num_new_tokens=num_new_tokens,
690
+ num_new_seqs=num_new_seqs)):
691
+ break
692
+
693
+ # Can schedule this request.
694
+ if curr_loras is not None and lora_int_id > 0:
695
+ curr_loras.add(lora_int_id)
696
+ waiting_queue.popleft()
697
+ self._allocate_and_set_running(seq_group)
698
+ seq_groups.append(
699
+ ScheduledSequenceGroup(seq_group=seq_group,
700
+ token_chunk_size=num_new_tokens))
701
+ budget.add_num_batched_tokens(seq_group.request_id, num_new_tokens)
702
+ budget.add_num_seqs(seq_group.request_id, num_new_seqs)
703
+
704
+ # Queue requests that couldn't be scheduled.
705
+ waiting_queue.extendleft(leftover_waiting_sequences)
706
+ if len(seq_groups) > 0:
707
+ self.prev_prompt = True
708
+
709
+ return waiting_queue, SchedulerPrefillOutputs(
710
+ seq_groups=seq_groups,
711
+ ignored_seq_groups=ignored_seq_groups,
712
+ num_lookahead_slots=self._get_num_lookahead_slots(is_prefill=True))
713
+
714
+ def _schedule_default(self) -> SchedulerOutputs:
715
+ """Schedule queued requests.
716
+
717
+ The current policy is designed to optimize throughput. First, it
718
+ batches as many prefill requests as possible, then it schedules
719
+ decodes. If there is pressure on GPU memory, decode requests can
720
+ be swapped out or preempted.
721
+ """
722
+ # Include running requests to the budget.
723
+ budget = SchedulingBudget(
724
+ token_budget=self.scheduler_config.max_num_batched_tokens,
725
+ max_num_seqs=self.scheduler_config.max_num_seqs,
726
+ )
727
+ # Make sure we include num running seqs before scheduling prefill,
728
+ # so that we don't schedule beyond max_num_seqs for prefill.
729
+ for seq_group in self.running:
730
+ budget.add_num_seqs(seq_group.request_id,
731
+ seq_group.get_max_num_running_seqs())
732
+ curr_loras = set(
733
+ seq_group.lora_int_id
734
+ for seq_group in self.running) if self.lora_enabled else None
735
+
736
+ remaining_waiting, prefills = (self.waiting,
737
+ SchedulerPrefillOutputs.create_empty())
738
+ remaining_running, running_scheduled = (
739
+ self.running, SchedulerRunningOutputs.create_empty())
740
+ remaining_swapped, swapped_in = (
741
+ self.swapped, SchedulerSwappedInOutputs.create_empty())
742
+
743
+ # If any requests are swapped, prioritize swapped requests.
744
+ if not self.swapped:
745
+ remaining_waiting, prefills = self._schedule_prefills(
746
+ self.waiting, budget, curr_loras, enable_chunking=False)
747
+
748
+ fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs")
749
+ # Don't schedule decodes if prefills are scheduled.
750
+ # NOTE: If `_schedule_prefills` doesn't enable chunking, self.running
751
+ # only contains decode requests, not chunked prefills.
752
+ if len(prefills.seq_groups) == 0:
753
+ remaining_running, running_scheduled = self._schedule_running(
754
+ self.running,
755
+ budget,
756
+ curr_loras,
757
+ fcfs_policy,
758
+ enable_chunking=False)
759
+
760
+ # If any sequence group is preempted, do not swap in any sequence
761
+ # group, because it means there's no slot for new running requests.
762
+ if len(running_scheduled.preempted) + len(
763
+ running_scheduled.swapped_out) == 0:
764
+ remaining_swapped, swapped_in = self._schedule_swapped(
765
+ self.swapped, budget, curr_loras, fcfs_policy)
766
+
767
+ assert (budget.num_batched_tokens <=
768
+ self.scheduler_config.max_num_batched_tokens)
769
+ assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
770
+
771
+ # Update waiting requests.
772
+ self.waiting = remaining_waiting
773
+ self.waiting.extendleft(running_scheduled.preempted)
774
+ # Update new running requests.
775
+ self.running = remaining_running
776
+ self.running.extend([s.seq_group for s in prefills.seq_groups])
777
+ self.running.extend(
778
+ [s.seq_group for s in running_scheduled.decode_seq_groups])
779
+ self.running.extend(
780
+ [s.seq_group for s in swapped_in.decode_seq_groups])
781
+ # Update swapped requests.
782
+ self.swapped = remaining_swapped
783
+ self.swapped.extend(running_scheduled.swapped_out)
784
+
785
+ # There should be no prefill from running queue because this policy
786
+ # doesn't allow chunked prefills.
787
+ assert len(running_scheduled.prefill_seq_groups) == 0
788
+ assert len(swapped_in.prefill_seq_groups) == 0
789
+ return SchedulerOutputs(
790
+ scheduled_seq_groups=(prefills.seq_groups +
791
+ running_scheduled.decode_seq_groups +
792
+ swapped_in.decode_seq_groups),
793
+ num_prefill_groups=len(prefills.seq_groups),
794
+ num_batched_tokens=budget.num_batched_tokens,
795
+ blocks_to_swap_in=swapped_in.blocks_to_swap_in,
796
+ blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
797
+ blocks_to_copy=merge_dicts(running_scheduled.blocks_to_copy,
798
+ swapped_in.blocks_to_copy),
799
+ ignored_seq_groups=prefills.ignored_seq_groups +
800
+ swapped_in.infeasible_seq_groups,
801
+ num_lookahead_slots=running_scheduled.num_lookahead_slots,
802
+ running_queue_size=len(self.running),
803
+ )
804
+
805
+ def _schedule_chunked_prefill(self):
806
+ """Schedule queued requests.
807
+
808
+ Chunked prefill allows prefill requests to be chunked and batched together
809
+ with decode requests. This policy 1. schedules as many decode requests
810
+ as possible, 2. schedules chunked prefill requests that are not yet
811
+ finished, 3. schedules swapped requests, and 4. schedules new prefill
812
+ requests.
813
+
814
+ The policy sustains high GPU utilization because it can put prefill
815
+ and decode requests in the same batch, while it improves inter-token
816
+ latency because decode requests are not blocked by prefill
817
+ requests.
818
+ """
819
+ budget = SchedulingBudget(
820
+ token_budget=self.scheduler_config.max_num_batched_tokens,
821
+ max_num_seqs=self.scheduler_config.max_num_seqs,
822
+ )
823
+ curr_loras: Set[int] = set()
824
+
825
+ remaining_waiting, prefills = (self.waiting,
826
+ SchedulerPrefillOutputs.create_empty())
827
+ remaining_running, running_scheduled = (
828
+ self.running, SchedulerRunningOutputs.create_empty())
829
+ remaining_swapped, swapped_in = (
830
+ self.swapped, SchedulerSwappedInOutputs.create_empty())
831
+
832
+ # Decoding should always be scheduled first, using the fcfs policy.
833
+ fcfs_policy = PolicyFactory.get_policy(policy_name="fcfs")
834
+ remaining_running, running_scheduled = self._schedule_running(
835
+ self.running,
836
+ budget,
837
+ curr_loras,
838
+ fcfs_policy,
839
+ enable_chunking=True)
840
+
841
+ # Schedule swapped out requests.
842
+ # If preemption happens, it means we don't have space for swap-in.
843
+ if len(running_scheduled.preempted) + len(
844
+ running_scheduled.swapped_out) == 0:
845
+ remaining_swapped, swapped_in = self._schedule_swapped(
846
+ self.swapped, budget, curr_loras, fcfs_policy)
847
+
848
+ # Schedule new prefills.
849
+ remaining_waiting, prefills = self._schedule_prefills(
850
+ self.waiting, budget, curr_loras, enable_chunking=True)
851
+
852
+ assert (budget.num_batched_tokens <=
853
+ self.scheduler_config.max_num_batched_tokens)
854
+ assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
855
+
856
+ # Update waiting requests.
857
+ self.waiting = remaining_waiting
858
+ self.waiting.extendleft(running_scheduled.preempted)
859
+ # Update new running requests.
860
+ self.running = remaining_running
861
+ self.running.extend([s.seq_group for s in prefills.seq_groups])
862
+ self.running.extend(
863
+ [s.seq_group for s in running_scheduled.decode_seq_groups])
864
+ self.running.extend(
865
+ [s.seq_group for s in running_scheduled.prefill_seq_groups])
866
+ self.running.extend(
867
+ [s.seq_group for s in swapped_in.decode_seq_groups])
868
+ self.running.extend(
869
+ [s.seq_group for s in swapped_in.prefill_seq_groups])
870
+ # Update swapped requests.
871
+ self.swapped = remaining_swapped
872
+ self.swapped.extend(running_scheduled.swapped_out)
873
+ return SchedulerOutputs(
874
+ scheduled_seq_groups=(prefills.seq_groups +
875
+ running_scheduled.prefill_seq_groups +
876
+ swapped_in.prefill_seq_groups +
877
+ running_scheduled.decode_seq_groups +
878
+ swapped_in.decode_seq_groups),
879
+ num_prefill_groups=(len(prefills.seq_groups) +
880
+ len(swapped_in.prefill_seq_groups) +
881
+ len(running_scheduled.prefill_seq_groups)),
882
+ num_batched_tokens=budget.num_batched_tokens,
883
+ blocks_to_swap_in=swapped_in.blocks_to_swap_in,
884
+ blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
885
+ blocks_to_copy=merge_dicts(running_scheduled.blocks_to_copy,
886
+ swapped_in.blocks_to_copy),
887
+ ignored_seq_groups=prefills.ignored_seq_groups,
888
+ num_lookahead_slots=running_scheduled.num_lookahead_slots,
889
+ running_queue_size=len(self.running),
890
+ )
891
+
892
+ def _schedule(self) -> SchedulerOutputs:
893
+ """Schedule queued requests."""
894
+ if self.scheduler_config.chunked_prefill_enabled:
895
+ return self._schedule_chunked_prefill()
896
+ else:
897
+ return self._schedule_default()
898
+
899
+ def _can_append_slots(self, seq_group: SequenceGroup) -> bool:
900
+ """Determine whether or not we have enough space in the KV cache to
901
+ continue generation of the sequence group.
902
+ """
903
+ # This condition can only be True in tests, to trigger artificial preemption.
904
+ if (self.enable_artificial_preemption
905
+ and random.uniform(0, 1) < ARTIFICIAL_PREEMPTION_PROB
906
+ and self.artificial_preempt_cnt > 0):
907
+ self.artificial_preempt_cnt -= 1
908
+ return False
909
+
910
+ # Appending slots only occurs in decoding.
911
+ is_prefill = False
912
+
913
+ return self.block_manager.can_append_slots(
914
+ seq_group=seq_group,
915
+ num_lookahead_slots=self._get_num_lookahead_slots(is_prefill),
916
+ )
917
+
918
+ def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
919
+ # Schedule sequence groups.
920
+ # This function call changes the internal states of the scheduler
921
+ # such as self.running, self.swapped, and self.waiting.
922
+ scheduler_outputs = self._schedule()
923
+ now = time.time()
924
+
925
+ # Create input data structures.
926
+ seq_group_metadata_list: List[SequenceGroupMetadata] = []
927
+ for i, scheduled_seq_group in enumerate(
928
+ scheduler_outputs.scheduled_seq_groups):
929
+ seq_group = scheduled_seq_group.seq_group
930
+ token_chunk_size = scheduled_seq_group.token_chunk_size
931
+ seq_group.maybe_set_first_scheduled_time(now)
932
+
933
+ # seq_id -> SequenceData
934
+ seq_data: Dict[int, SequenceData] = {}
935
+ # seq_id -> physical block numbers
936
+ block_tables: Dict[int, List[int]] = {}
937
+
938
+ for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
939
+ seq_id = seq.seq_id
940
+ seq_data[seq_id] = seq.data
941
+ block_tables[seq_id] = self.block_manager.get_block_table(seq)
942
+ self.block_manager.access_all_blocks_in_seq(seq, now)
943
+
944
+ common_computed_block_nums = (
945
+ self.block_manager.get_common_computed_block_ids(
946
+ seq_group.get_seqs(status=SequenceStatus.RUNNING)))
947
+
948
+ do_sample = True
949
+ if seq_group.is_prefill():
950
+ seqs = seq_group.get_seqs()
951
+ # Prefill has only 1 sequence.
952
+ assert len(seqs) == 1
953
+ # If not all prompt tokens will have been computed after the next
954
+ # iteration, the prefill is chunked and we don't need sampling.
955
+ # NOTE: We use get_len instead of get_prompt_len because when
956
+ # a sequence is preempted, prefill includes previous generated
957
+ # output tokens.
958
+ if (token_chunk_size + seqs[0].data.get_num_computed_tokens() <
959
+ seqs[0].data.get_len()):
960
+ do_sample = False
961
+
962
+ # This assumes scheduled_seq_groups is ordered with prefills
963
+ # before decodes.
964
+ is_prompt = seq_group.is_prefill()
965
+ seq_group_metadata = SequenceGroupMetadata(
966
+ request_id=seq_group.request_id,
967
+ is_prompt=is_prompt,
968
+ seq_data=seq_data,
969
+ sampling_params=seq_group.sampling_params,
970
+ block_tables=block_tables,
971
+ do_sample=do_sample,
972
+ token_chunk_size=token_chunk_size,
973
+ lora_request=seq_group.lora_request,
974
+ computed_block_nums=common_computed_block_nums,
975
+ state=seq_group.state,
976
+ # `multi_modal_data` will only be present for the 1st comm
977
+ # between engine and worker.
978
+ # The subsequent comms can still use delta, but
979
+ # `multi_modal_data` will be None.
980
+ multi_modal_data=seq_group.multi_modal_data
981
+ if scheduler_outputs.num_prefill_groups > 0 else None,
982
+ )
983
+ seq_group_metadata_list.append(seq_group_metadata)
984
+
985
+ # Now that the batch has been created, we can assume all blocks in the
986
+ # batch will have been computed before the next scheduling invocation.
987
+ # This is because the engine assumes that a failure in model execution
988
+ # will crash the vLLM instance / will not retry.
989
+ for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups:
990
+ self.block_manager.mark_blocks_as_computed(
991
+ scheduled_seq_group.seq_group)
992
+
993
+ return seq_group_metadata_list, scheduler_outputs
994
+
995
+ def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None:
996
+ self.block_manager.fork(parent_seq, child_seq)
997
+
998
+ def free_seq(self, seq: Sequence) -> None:
999
+ """Free a sequence from a block table."""
1000
+ self.block_manager.free(seq)
1001
+
1002
+ def free_finished_seq_groups(self) -> None:
1003
+ self.running = deque(seq_group for seq_group in self.running
1004
+ if not seq_group.is_finished())
1005
+
1006
+ def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None:
1007
+ self.block_manager.allocate(seq_group)
1008
+ for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
1009
+ seq.status = SequenceStatus.RUNNING
1010
+
1011
+ def _append_slots(
1012
+ self,
1013
+ seq_group: SequenceGroup,
1014
+ blocks_to_copy: Dict[int, List[int]],
1015
+ ) -> None:
1016
+ """Appends new slots to the sequences in the given sequence group.
1017
+
1018
+ Args:
1019
+ seq_group (SequenceGroup): The sequence group containing the
1020
+ sequences to append slots to.
1021
+ blocks_to_copy (Dict[int, List[int]]): A dictionary mapping source
1022
+ block indices to lists of destination block indices. This
1023
+ dictionary is updated with the new source and destination block
1024
+ indices for the appended slots.
1025
+ """
1026
+ num_lookahead_slots = self._get_num_lookahead_slots(is_prefill=False)
1027
+
1028
+ for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
1029
+ cows = self.block_manager.append_slots(seq, num_lookahead_slots)
1030
+
1031
+ for src, dests in cows.items():
1032
+ if src not in blocks_to_copy:
1033
+ blocks_to_copy[src] = []
1034
+ blocks_to_copy[src].extend(dests)
1035
+
1036
+ def _preempt(
1037
+ self,
1038
+ seq_group: SequenceGroup,
1039
+ blocks_to_swap_out: Dict[int, int],
1040
+ preemption_mode: Optional[PreemptionMode] = None,
1041
+ ) -> PreemptionMode:
1042
+ # If preemption mode is not specified, we determine the mode as follows:
1043
+ # We use recomputation by default since it incurs lower overhead than
1044
+ # swapping. However, when the sequence group has multiple sequences
1045
+ # (e.g., beam search), recomputation is not currently supported. In
1046
+ # such a case, we use swapping instead.
1047
+ # FIXME(woosuk): This makes our scheduling policy a bit bizarre.
1048
+ # As swapped sequences are prioritized over waiting sequences,
1049
+ # sequence groups with multiple sequences are implicitly prioritized
1050
+ # over sequence groups with a single sequence.
1051
+ # TODO(woosuk): Support recomputation for sequence groups with multiple
1052
+ # sequences. This may require a more sophisticated CUDA kernel.
1053
+ if preemption_mode is None:
1054
+ if seq_group.get_max_num_running_seqs() == 1:
1055
+ preemption_mode = PreemptionMode.RECOMPUTE
1056
+ else:
1057
+ preemption_mode = PreemptionMode.SWAP
1058
+ if preemption_mode == PreemptionMode.RECOMPUTE:
1059
+ self._preempt_by_recompute(seq_group)
1060
+ elif preemption_mode == PreemptionMode.SWAP:
1061
+ self._preempt_by_swap(seq_group, blocks_to_swap_out)
1062
+ else:
1063
+ raise AssertionError("Invalid preemption mode.")
1064
+ return preemption_mode
1065
+
1066
+ def _preempt_by_recompute(
1067
+ self,
1068
+ seq_group: SequenceGroup,
1069
+ ) -> None:
1070
+ seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
1071
+ assert len(seqs) == 1
1072
+ for seq in seqs:
1073
+ seq.status = SequenceStatus.WAITING
1074
+ self.free_seq(seq)
1075
+ seq.reset_state_for_recompute()
1076
+
1077
+ def _preempt_by_swap(
1078
+ self,
1079
+ seq_group: SequenceGroup,
1080
+ blocks_to_swap_out: Dict[int, int],
1081
+ ) -> None:
1082
+ self._swap_out(seq_group, blocks_to_swap_out)
1083
+
1084
+ def _swap_in(
1085
+ self,
1086
+ seq_group: SequenceGroup,
1087
+ blocks_to_swap_in: Dict[int, int],
1088
+ ) -> None:
1089
+ mapping = self.block_manager.swap_in(seq_group)
1090
+ blocks_to_swap_in.update(mapping)
1091
+ for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
1092
+ seq.status = SequenceStatus.RUNNING
1093
+
1094
+ def _swap_out(
1095
+ self,
1096
+ seq_group: SequenceGroup,
1097
+ blocks_to_swap_out: Dict[int, int],
1098
+ ) -> None:
1099
+ if not self.block_manager.can_swap_out(seq_group):
1100
+ # FIXME(woosuk): Abort the sequence group instead of aborting the
1101
+ # entire engine.
1102
+ raise RuntimeError(
1103
+ "Aborted due to the lack of CPU swap space. Please increase "
1104
+ "the swap space to avoid this error.")
1105
+ mapping = self.block_manager.swap_out(seq_group)
1106
+ blocks_to_swap_out.update(mapping)
1107
+ for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
1108
+ seq.status = SequenceStatus.SWAPPED
1109
+
1110
+ def _passed_delay(self, now: float) -> bool:
1111
+ if self.prev_prompt:
1112
+ self.last_prompt_latency = now - self.prev_time
1113
+ self.prev_time, self.prev_prompt = now, False
1114
+ # Delay scheduling prompts to let waiting queue fill up
1115
+ if self.scheduler_config.delay_factor > 0 and self.waiting:
1116
+ earliest_arrival_time = min(
1117
+ [e.metrics.arrival_time for e in self.waiting])
1118
+ passed_delay = (
1119
+ (now - earliest_arrival_time) >
1120
+ (self.scheduler_config.delay_factor * self.last_prompt_latency)
1121
+ or not self.running)
1122
+ else:
1123
+ passed_delay = True
1124
+ return passed_delay
1125
+
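# Illustrative worked example for the delay heuristic above (hypothetical
# numbers): with delay_factor=0.5 and last_prompt_latency=2.0s, a new prompt
# is scheduled only once the earliest waiting request has been queued for
# more than 0.5 * 2.0 = 1.0s, or when nothing is running.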
1126
+ def _get_num_lookahead_slots(self, is_prefill: bool) -> int:
1127
+ """The number of slots to allocate per sequence per step, beyond known
1128
+ token ids. Speculative decoding uses these slots to store KV activations
1129
+ of tokens which may or may not be accepted.
1130
+
1131
+ Speculative decoding does not yet support prefill, so we do not perform
1132
+ lookahead allocation for prefill.
1133
+ """
1134
+ if is_prefill:
1135
+ return 0
1136
+
1137
+ return self.scheduler_config.num_lookahead_slots
1138
+
1139
+ def _get_num_new_tokens(self, seq_group: SequenceGroup,
1140
+ status: SequenceStatus, enable_chunking: bool,
1141
+ budget: SchedulingBudget) -> int:
1142
+ """Get the next new tokens to compute for a given sequence group
1143
+ that's in a given `status`.
1144
+
1145
+ The API could chunk the number of tokens to compute based on `budget`
1146
+ if `enable_chunking` is True. If a sequence group has multiple
1147
+ sequences (e.g., running beam search), it means it is in decoding
1148
+ phase, so chunking doesn't happen.
1149
+
1150
+ Returns 0 if the new token cannot be computed due to token budget.
1151
+ """
1152
+ num_new_tokens = 0
1153
+ seqs = seq_group.get_seqs(status=status)
1154
+ for seq in seqs:
1155
+ num_new_tokens += seq.get_num_new_tokens()
1156
+ assert num_new_tokens > 0
1157
+ # Chunk if a running request cannot fit within the remaining budget.
1158
+ # If the number of seqs > 1, the group is doing beam search in the
1159
+ # decode phase. Do not chunk in that case.
1160
+ if enable_chunking and len(seqs) == 1:
1161
+ num_new_tokens = min(num_new_tokens,
1162
+ budget.remaining_token_budget())
1163
+ return num_new_tokens
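
For orientation, here is a minimal sketch of how the Scheduler defined in this diff is typically driven: requests are queued with add_seq_group(), each engine step calls schedule() to obtain the per-group metadata together with the swap/copy instructions in SchedulerOutputs, and finished groups are reclaimed with free_finished_seq_groups(). The surrounding engine and model-execution code below is assumed for illustration and is not part of this wheel.

# Hypothetical driver loop around the Scheduler above; only the Scheduler
# calls mirror the API in this file, the rest is a placeholder.
from vllm.core.scheduler import Scheduler


def engine_step(scheduler: Scheduler) -> None:
    # One scheduling iteration: pick prefills/decodes within the budget.
    seq_group_metadata_list, scheduler_outputs = scheduler.schedule()
    if scheduler_outputs.is_empty():
        return
    # A real engine would forward blocks_to_swap_in / blocks_to_swap_out /
    # blocks_to_copy to the workers and run the model on the metadata list.
    _ = (scheduler_outputs.blocks_to_swap_in,
         scheduler_outputs.blocks_to_swap_out,
         scheduler_outputs.blocks_to_copy,
         seq_group_metadata_list)
    # Reclaim sequence groups whose sequences have all finished.
    scheduler.free_finished_seq_groups()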