vllm-npu 0.4.2__py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/core/block_manager_v2.py ADDED
@@ -0,0 +1,258 @@
+ """A block manager that manages token blocks."""
+ from typing import Dict, List, Optional
+ from typing import Sequence as GenericSequence
+
+ from vllm.core.block.block_table import BlockTable
+ from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
+ from vllm.core.interfaces import AllocStatus, BlockSpaceManager
+ from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
+ from vllm.utils import Device
+
+ SeqId = int
+
+
+ class BlockSpaceManagerV2(BlockSpaceManager):
+     """BlockSpaceManager which manages the allocation of KV cache.
+
+     It owns responsibility for allocation, swapping, allocating memory for
+     autoregressively-generated tokens, and other advanced features such as
+     prefix caching, forking/copy-on-write, and sliding-window memory
+     allocation.
+
+     The current implementation is partial; in particular prefix caching and
+     sliding-window are not feature complete. This class implements the design
+     described in https://github.com/vllm-project/vllm/pull/3492.
+
+     Lookahead slots
+         The block manager has the notion of a "lookahead slot". These are
+         slots in the KV cache that are allocated for a sequence. Unlike the
+         other allocated slots, the content of these slots is undefined -- the
+         worker may use the memory allocations in any way.
+
+         In practice, a worker could use these lookahead slots to run multiple
+         forward passes for a single scheduler invocation. Each successive
+         forward pass would write KV activations to the corresponding
+         lookahead slot. This allows low inter-token latency use-cases, where
+         the overhead of continuous batching scheduling is amortized over >1
+         generated tokens.
+
+         Speculative decoding uses lookahead slots to store KV activations of
+         proposal tokens.
+
+         See https://github.com/vllm-project/vllm/pull/3250 for more
+         information on lookahead scheduling.
+
+     Args:
+         block_size (int): The size of each memory block.
+         num_gpu_blocks (int): The number of memory blocks allocated on GPU.
+         num_cpu_blocks (int): The number of memory blocks allocated on CPU.
+         watermark (float, optional): The threshold used for memory swapping.
+             Defaults to 0.01.
+         sliding_window (Optional[int], optional): The size of the sliding
+             window. Defaults to None.
+         enable_caching (bool, optional): Flag indicating whether caching is
+             enabled. Defaults to False.
+     """
+
+     def __init__(
+         self,
+         block_size: int,
+         num_gpu_blocks: int,
+         num_cpu_blocks: int,
+         watermark: float = 0.01,
+         sliding_window: Optional[int] = None,
+         enable_caching: bool = False,
+     ) -> None:
+         self.block_size = block_size
+         self.num_total_gpu_blocks = num_gpu_blocks
+         self.num_total_cpu_blocks = num_cpu_blocks
+
+         assert sliding_window is None, "Sliding window not yet supported"
+
+         self.block_sliding_window = None
+
+         self.watermark = watermark
+         assert watermark >= 0.0
+
+         self.enable_caching = enable_caching
+
+         self.watermark_blocks = int(watermark * num_gpu_blocks)
+
+         self.block_allocator = CpuGpuBlockAllocator.create(
+             allocator_type="prefix_caching" if enable_caching else "naive",
+             num_gpu_blocks=num_gpu_blocks,
+             num_cpu_blocks=num_cpu_blocks,
+             block_size=block_size,
+         )
+
+         self.block_tables: Dict[SeqId, BlockTable] = {}
+
+     def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
+         # FIXME(woosuk): Here we assume that all sequences in the group share
+         # the same prompt. This may not be true for preempted sequences.
+         seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]
+
+         num_required_blocks = BlockTable.get_num_required_blocks(
+             seq.get_token_ids(),
+             block_size=self.block_size,
+         )
+
+         assert self.block_sliding_window is None
+         if self.block_sliding_window is not None:
+             num_required_blocks = min(num_required_blocks,
+                                       self.block_sliding_window)
+
+         num_free_gpu_blocks = self.block_allocator.get_num_free_blocks(
+             device=Device.GPU)
+
+         # Use watermark to avoid frequent cache eviction.
+         if (self.num_total_gpu_blocks - num_required_blocks <
+                 self.watermark_blocks):
+             return AllocStatus.NEVER
+         if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
+             return AllocStatus.OK
+         else:
+             return AllocStatus.LATER
+
+     def allocate(self, seq_group: SequenceGroup) -> None:
+         waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
+         assert not (set(seq.seq_id for seq in waiting_seqs)
+                     & self.block_tables.keys()), "block table already exists"
+
+         # NOTE: Here we assume that all sequences in the group have the same
+         # prompt.
+         seq = waiting_seqs[0]
+
+         block_table = BlockTable(
+             block_size=self.block_size,
+             block_allocator=self.block_allocator,
+         )
+         assert self.block_sliding_window is None
+         block_table.allocate(seq.get_token_ids())
+         self.block_tables[seq.seq_id] = block_table
+
+         # Assign the block table for each sequence.
+         for seq in waiting_seqs[1:]:
+             self.block_tables[seq.seq_id] = block_table.fork()
+
+     def can_append_slots(self, seq_group: SequenceGroup,
+                          num_lookahead_slots: int) -> bool:
+         """Determine if there is enough space in the GPU KV cache to continue
+         generation of the specified sequence group.
+
+         We use a worst-case heuristic: assume each touched block will require
+         a new allocation (either via CoW or a new block). We can append slots
+         if the number of touched blocks does not exceed the number of free
+         blocks.
+
+         "Lookahead slots" are slots that are allocated in addition to the
+         slots for known tokens. The contents of the lookahead slots are not
+         defined. This is used by speculative decoding when speculating
+         future tokens.
+         """
+
+         num_touched_blocks = 0
+         for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+             block_table = self.block_tables[seq.seq_id]
+
+             num_touched_blocks += (
+                 block_table.get_num_blocks_touched_by_append_slots(
+                     token_ids=block_table.get_unseen_token_ids(
+                         seq.get_token_ids()),
+                     num_lookahead_slots=num_lookahead_slots,
+                 ))
+
+         num_free_gpu_blocks = self.block_allocator.get_num_free_blocks(
+             Device.GPU)
+         return num_touched_blocks <= num_free_gpu_blocks
+
+     def append_slots(
+         self,
+         seq: Sequence,
+         num_lookahead_slots: int,
+     ) -> Dict[int, List[int]]:
+
+         block_table = self.block_tables[seq.seq_id]
+
+         block_table.append_token_ids(
+             token_ids=block_table.get_unseen_token_ids(seq.get_token_ids()),
+             num_lookahead_slots=num_lookahead_slots,
+         )
+
+         # Return any new copy-on-writes.
+         new_cows = self.block_allocator.clear_copy_on_writes()
+         return new_cows
+
+     def free(self, seq: Sequence) -> None:
+         if seq.seq_id not in self.block_tables:
+             # Already freed or hasn't been scheduled yet.
+             return
+         self.block_tables[seq.seq_id].free()
+         del self.block_tables[seq.seq_id]
+
+     def get_block_table(self, seq: Sequence) -> List[int]:
+         assert seq.seq_id in self.block_tables
+         block_ids = self.block_tables[seq.seq_id].physical_block_ids
+         assert all(b is not None for b in block_ids)
+         return block_ids  # type: ignore
+
+     def access_all_blocks_in_seq(self, seq: Sequence, now: float):
+         # Update the last-accessed time of all the blocks accessed in this
+         # step. The access time is currently only used by prefix caching,
+         # whose evictor may reclaim freed cached blocks; keeping the time
+         # fresh lets cached content be reused for as long as possible.
+         if self.enable_caching:
+             block_table = self.block_tables[seq.seq_id]
+             block_ids = []
+             for block_id in block_table.physical_block_ids:
+                 block_ids.append(block_id)
+             self.block_allocator.mark_blocks_as_accessed(
+                 block_ids,  # type: ignore
+                 now)
+
+     def mark_blocks_as_computed(self, seq_group: SequenceGroup):
+         # Marking blocks as computed is only needed for prefix caching, and
+         # in block_v2 a block's computed status can be determined by checking
+         # whether it has a content hash. So this function is a no-op here.
+         pass
+
+     def get_common_computed_block_ids(
+             self, seqs: List[Sequence]) -> GenericSequence[int]:
+         """Determine which blocks can skip prefill.
+
+         With prefix caching we can skip prefill for previously-cached
+         blocks. Currently, the attention implementation only supports
+         skipping cached blocks if they form a contiguous prefix of the
+         cached blocks.
+
+         This method determines which blocks can be safely skipped for all
+         sequences in the sequence group.
+         """
+         seq_block_ids = [
+             self.block_tables[seq.seq_id].physical_block_ids for seq in seqs
+         ]
+         # NOTE(sang): This assumes seq_block_ids doesn't contain any None.
+         return self.block_allocator.get_common_computed_block_ids(
+             seq_block_ids)  # type: ignore
+
+     def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
+         src_block_table = self.block_tables[parent_seq.seq_id]
+         self.block_tables[child_seq.seq_id] = src_block_table.fork()
+
+     def can_swap_in(self, seq_group: SequenceGroup,
+                     num_lookahead_slots: int) -> AllocStatus:
+         return AllocStatus.LATER
+
+     def swap_in(self, seq_group: SequenceGroup,
+                 num_lookahead_slots: int) -> Dict[int, int]:
+         raise NotImplementedError
+
+     def can_swap_out(self, seq_group: SequenceGroup) -> bool:
+         return False
+
+     def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]:
+         raise NotImplementedError
+
+     def get_num_free_gpu_blocks(self) -> int:
+         return self.block_allocator.get_num_free_blocks(Device.GPU)
+
+     def get_num_free_cpu_blocks(self) -> int:
+         return self.block_allocator.get_num_free_blocks(Device.CPU)
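To make the watermark check in can_allocate concrete, here is a minimal standalone sketch of the same admission logic with hypothetical numbers (AllocStatus mirrors vllm.core.interfaces; everything else is illustrative, not the wheel's API):

# Standalone sketch of BlockSpaceManagerV2.can_allocate's admission logic.
# All capacity numbers below are hypothetical.
import enum

class AllocStatus(enum.Enum):
    OK = enum.auto()
    LATER = enum.auto()
    NEVER = enum.auto()

def can_allocate(num_total_gpu_blocks, num_free_gpu_blocks,
                 num_required_blocks, watermark=0.01):
    watermark_blocks = int(watermark * num_total_gpu_blocks)
    # The request can never fit, even with a fully empty cache.
    if num_total_gpu_blocks - num_required_blocks < watermark_blocks:
        return AllocStatus.NEVER
    # Keep `watermark_blocks` free to avoid frequent cache eviction.
    if num_free_gpu_blocks - num_required_blocks >= watermark_blocks:
        return AllocStatus.OK
    return AllocStatus.LATER

# With 1000 GPU blocks and a 10-block watermark, a 500-block prompt is
# admitted only while at least 510 blocks remain free.
assert can_allocate(1000, 600, 500) is AllocStatus.OK
assert can_allocate(1000, 505, 500) is AllocStatus.LATER
assert can_allocate(1000, 1000, 995) is AllocStatus.NEVER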
vllm/core/evictor_v1.py ADDED
@@ -0,0 +1,105 @@
+ import enum
+ from abc import ABC, abstractmethod, abstractproperty
+ from typing import OrderedDict
+
+ from vllm.block import PhysicalTokenBlock
+
+
+ class EvictionPolicy(enum.Enum):
+     """Enum for eviction policy used by make_evictor to instantiate the
+     correct Evictor subclass.
+     """
+     LRU = enum.auto()
+
+
+ class Evictor(ABC):
+     """The Evictor subclasses should be used by the BlockAllocator class to
+     handle eviction of freed PhysicalTokenBlocks.
+     """
+
+     @abstractmethod
+     def __init__(self):
+         pass
+
+     @abstractmethod
+     def __contains__(self, block_hash: int) -> bool:
+         pass
+
+     @abstractmethod
+     def evict(self) -> PhysicalTokenBlock:
+         """Runs the eviction algorithm and returns the evicted block."""
+         pass
+
+     @abstractmethod
+     def add(self, block: PhysicalTokenBlock):
+         """Adds a block to the evictor, making it a candidate for
+         eviction."""
+         pass
+
+     @abstractmethod
+     def remove(self, block_hash: int) -> PhysicalTokenBlock:
+         """Simply removes the block with the hash value block_hash from the
+         evictor. The caller is responsible for making sure that block_hash
+         is contained in the evictor before calling remove. Should be used to
+         "bring back" blocks that have been freed but not evicted yet.
+         """
+         pass
+
+     @abstractproperty
+     def num_blocks(self) -> int:
+         pass
+
+
+ class LRUEvictor(Evictor):
+     """Evicts in a least-recently-used order using the last_accessed
+     timestamp that's recorded in the PhysicalTokenBlock. If there are
+     multiple blocks with the same last_accessed time, then the one with the
+     largest num_hashed_tokens will be evicted. If two blocks share both the
+     lowest last_accessed time and the highest num_hashed_tokens value, one
+     will be chosen arbitrarily.
+     """
+
+     def __init__(self):
+         self.free_table: OrderedDict[int, PhysicalTokenBlock] = OrderedDict()
+
+     def __contains__(self, block_hash: int) -> bool:
+         return block_hash in self.free_table
+
+     def evict(self) -> PhysicalTokenBlock:
+         if len(self.free_table) == 0:
+             raise ValueError("No usable cache memory left")
+
+         evicted_block = next(iter(self.free_table.values()))
+         # The blocks with the lowest timestamps should be placed
+         # consecutively at the start of the OrderedDict. Loop through these
+         # blocks to find the one with the maximum number of hashed tokens.
+         for _, block in self.free_table.items():
+             if evicted_block.last_accessed < block.last_accessed:
+                 break
+             if evicted_block.num_hashed_tokens < block.num_hashed_tokens:
+                 evicted_block = block
+
+         self.free_table.pop(evicted_block.block_hash)
+
+         evicted_block.computed = False
+         return evicted_block
+
+     def add(self, block: PhysicalTokenBlock):
+         self.free_table[block.block_hash] = block
+
+     def remove(self, block_hash: int) -> PhysicalTokenBlock:
+         if block_hash not in self.free_table:
+             raise ValueError(
+                 "Attempting to remove a block that's not in the evictor")
+         block: PhysicalTokenBlock = self.free_table[block_hash]
+         self.free_table.pop(block_hash)
+         return block
+
+     @property
+     def num_blocks(self) -> int:
+         return len(self.free_table)
+
+
+ def make_evictor(eviction_policy: EvictionPolicy) -> Evictor:
+     if eviction_policy == EvictionPolicy.LRU:
+         return LRUEvictor()
+     else:
+         raise ValueError(f"Unknown cache eviction policy: {eviction_policy}")
vllm/core/evictor_v2.py ADDED
@@ -0,0 +1,127 @@
+ import enum
+ from abc import ABC, abstractmethod, abstractproperty
+ from typing import OrderedDict, Tuple
+
+
+ class EvictionPolicy(enum.Enum):
+     """Enum for eviction policy used by make_evictor to instantiate the
+     correct Evictor subclass.
+     """
+     LRU = enum.auto()
+
+
+ class Evictor(ABC):
+     """The Evictor subclasses should be used by the BlockAllocator class to
+     handle eviction of freed blocks.
+     """
+
+     @abstractmethod
+     def __init__(self):
+         pass
+
+     @abstractmethod
+     def __contains__(self, block_id: int) -> bool:
+         pass
+
+     @abstractmethod
+     def evict(self) -> Tuple[int, int]:
+         """Runs the eviction algorithm and returns the evicted block's
+         physical block id along with its content hash.
+         """
+         pass
+
+     @abstractmethod
+     def add(self, block_id: int, content_hash: int, num_hashed_tokens: int,
+             last_accessed: float):
+         """Adds a block to the evictor, making it a candidate for
+         eviction."""
+         pass
+
+     @abstractmethod
+     def update(self, block_id: int, last_accessed: float):
+         """Updates the corresponding block's access time in the metadata."""
+         pass
+
+     @abstractmethod
+     def remove(self, block_id: int):
+         """Removes a given block id from the cache."""
+         pass
+
+     @abstractproperty
+     def num_blocks(self) -> int:
+         pass
+
+
+ class BlockMetaData:
+     """Data structure storing the key attributes of a cached block, which
+     the evictor uses to decide which block to evict.
+
+     The physical block id is used as the dict key, since several blocks may
+     share the same content hash while physical ids are unique.
+     """
+
+     def __init__(self, content_hash: int, num_hashed_tokens: int,
+                  last_accessed: float):
+         self.content_hash = content_hash
+         self.num_hashed_tokens = num_hashed_tokens
+         self.last_accessed = last_accessed
+
+
+ class LRUEvictor(Evictor):
+     """Evicts in a least-recently-used order using the last_accessed
+     timestamp recorded in the block's metadata. If there are multiple blocks
+     with the same last_accessed time, then the one with the largest
+     num_hashed_tokens will be evicted. If two blocks share both the lowest
+     last_accessed time and the highest num_hashed_tokens value, one will be
+     chosen arbitrarily.
+     """
+
+     def __init__(self):
+         self.free_table: OrderedDict[int, BlockMetaData] = OrderedDict()
+
+     def __contains__(self, block_id: int) -> bool:
+         return block_id in self.free_table
+
+     def evict(self) -> Tuple[int, int]:
+         if len(self.free_table) == 0:
+             raise ValueError("No usable cache memory left")
+
+         evicted_block = next(iter(self.free_table.values()))
+         evicted_block_id = next(iter(self.free_table.keys()))
+         # Scan the free table for the block with the lowest last_accessed
+         # time, breaking ties by the maximum number of hashed tokens.
+         for _id, block in self.free_table.items():
+             if evicted_block.last_accessed > block.last_accessed or (
+                     evicted_block.last_accessed == block.last_accessed and
+                     evicted_block.num_hashed_tokens < block.num_hashed_tokens):
+                 evicted_block = block
+                 evicted_block_id = _id
+
+         self.free_table.pop(evicted_block_id)
+
+         return evicted_block_id, evicted_block.content_hash
+
+     def add(self, block_id: int, content_hash: int, num_hashed_tokens: int,
+             last_accessed: float):
+         self.free_table[block_id] = BlockMetaData(content_hash,
+                                                   num_hashed_tokens,
+                                                   last_accessed)
+
+     def update(self, block_id: int, last_accessed: float):
+         self.free_table[block_id].last_accessed = last_accessed
+
+     def remove(self, block_id: int):
+         if block_id not in self.free_table:
+             raise ValueError(
+                 "Attempting to remove a block that's not in the evictor")
+         self.free_table.pop(block_id)
+
+     @property
+     def num_blocks(self) -> int:
+         return len(self.free_table)
+
+
+ def make_evictor(eviction_policy: EvictionPolicy) -> Evictor:
+     if eviction_policy == EvictionPolicy.LRU:
+         return LRUEvictor()
+     else:
+         raise ValueError(f"Unknown cache eviction policy: {eviction_policy}")
vllm/core/interfaces.py ADDED
@@ -0,0 +1,113 @@
+ import enum
+ from abc import ABC, abstractmethod
+ from typing import Dict, List
+ from typing import Sequence as GenericSequence
+
+ from vllm.sequence import Sequence, SequenceGroup
+
+
+ class AllocStatus(enum.Enum):
+     """Result for BlockSpaceManager.can_allocate
+
+     1. Ok: seq_group can be allocated now.
+     2. Later: seq_group cannot be allocated now, but may be later, since
+        the allocator's total capacity is larger than what seq_group
+        requires.
+     3. Never: seq_group can never be allocated because it is too large to
+        fit in GPU memory.
+     """
+     OK = enum.auto()
+     LATER = enum.auto()
+     NEVER = enum.auto()
+
+
+ class BlockSpaceManager(ABC):
+
+     @staticmethod
+     def get_block_space_manager_class(version: str):
+         version = version.lower()
+
+         if version == "v1":
+             from vllm.core.block_manager_v1 import BlockSpaceManagerV1
+             return BlockSpaceManagerV1
+
+         if version == "v2":
+             from vllm.core.block_manager_v2 import BlockSpaceManagerV2
+             return BlockSpaceManagerV2
+
+         raise ValueError(f"Unknown version {version=}")
+
+     @abstractmethod
+     def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
+         pass
+
+     @abstractmethod
+     def allocate(self, seq_group: SequenceGroup) -> None:
+         pass
+
+     @abstractmethod
+     def can_append_slots(self, seq_group: SequenceGroup,
+                          num_lookahead_slots: int) -> bool:
+         pass
+
+     @abstractmethod
+     def append_slots(
+         self,
+         seq: Sequence,
+         num_lookahead_slots: int,
+     ) -> Dict[int, List[int]]:
+         pass
+
+     @abstractmethod
+     def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
+         pass
+
+     @abstractmethod
+     def can_swap_in(self, seq_group: SequenceGroup,
+                     num_lookahead_slots: int) -> AllocStatus:
+         pass
+
+     @abstractmethod
+     def swap_in(self, seq_group: SequenceGroup,
+                 num_lookahead_slots: int) -> Dict[int, int]:
+         pass
+
+     @abstractmethod
+     def can_swap_out(self, seq_group: SequenceGroup) -> bool:
+         pass
+
+     @abstractmethod
+     def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]:
+         pass
+
+     @abstractmethod
+     def free(self, seq: Sequence) -> None:
+         pass
+
+     @abstractmethod
+     def get_block_table(self, seq: Sequence) -> List[int]:
+         pass
+
+     @abstractmethod
+     def get_num_free_gpu_blocks(self) -> int:
+         pass
+
+     @abstractmethod
+     def get_num_free_cpu_blocks(self) -> int:
+         pass
+
+     @abstractmethod
+     def access_all_blocks_in_seq(
+         self,
+         seq: Sequence,
+         access_time: float,
+     ) -> None:
+         pass
+
+     @abstractmethod
+     def get_common_computed_block_ids(
+             self, seqs: List[Sequence]) -> GenericSequence[int]:
+         pass
+
+     @abstractmethod
+     def mark_blocks_as_computed(self, seq_group: SequenceGroup):
+         pass
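A minimal sketch of how a caller selects and constructs an implementation through this interface (capacity numbers are hypothetical; the constructor arguments mirror BlockSpaceManagerV2 above):

from vllm.core.interfaces import BlockSpaceManager

# Resolve the implementation from a version string, then construct it.
BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class("v2")
block_manager = BlockSpaceManagerImpl(
    block_size=16,        # tokens per KV-cache block
    num_gpu_blocks=1024,  # hypothetical capacity
    num_cpu_blocks=256,
)
# All GPU blocks should be free before any sequence group is allocated.
print(block_manager.get_num_free_gpu_blocks())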
vllm/core/policy.py ADDED
@@ -0,0 +1,45 @@
+ from collections import deque
+ from typing import Deque
+
+ from vllm.sequence import SequenceGroup
+
+
+ class Policy:
+
+     def get_priority(
+         self,
+         now: float,
+         seq_group: SequenceGroup,
+     ) -> float:
+         raise NotImplementedError
+
+     def sort_by_priority(
+         self,
+         now: float,
+         seq_groups: Deque[SequenceGroup],
+     ) -> Deque[SequenceGroup]:
+         return deque(
+             sorted(
+                 seq_groups,
+                 key=lambda seq_group: self.get_priority(now, seq_group),
+                 reverse=True,
+             ))
+
+
+ class FCFS(Policy):
+
+     def get_priority(
+         self,
+         now: float,
+         seq_group: SequenceGroup,
+     ) -> float:
+         return now - seq_group.metrics.arrival_time
+
+
+ class PolicyFactory:
+
+     _POLICY_REGISTRY = {'fcfs': FCFS}
+
+     @classmethod
+     def get_policy(cls, policy_name: str, **kwargs) -> Policy:
+         return cls._POLICY_REGISTRY[policy_name](**kwargs)
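To see the policy in action, a self-contained sketch using a stand-in for SequenceGroup (only the metrics.arrival_time field the policy reads is modeled). FCFS priority is now minus arrival time, so sorting in descending priority puts the earliest arrival first:

from collections import deque
from types import SimpleNamespace

from vllm.core.policy import PolicyFactory

def fake_group(arrival_time):
    # Stand-in exposing only `metrics.arrival_time`.
    return SimpleNamespace(metrics=SimpleNamespace(arrival_time=arrival_time))

policy = PolicyFactory.get_policy("fcfs")
groups = deque([fake_group(3.0), fake_group(1.0), fake_group(2.0)])
ordered = policy.sort_by_priority(now=10.0, seq_groups=groups)

# Priorities are 7.0, 9.0, 8.0; descending order yields arrivals 1.0, 2.0, 3.0.
assert [g.metrics.arrival_time for g in ordered] == [1.0, 2.0, 3.0]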