vllm-npu 0.4.2__py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
--- /dev/null
+++ b/vllm/core/block_manager_v1.py
@@ -0,0 +1,625 @@
+"""A block manager that manages token blocks."""
+import math
+from abc import ABC, abstractmethod
+from itertools import count, takewhile
+from os.path import commonprefix
+from typing import Dict, List, Optional
+from typing import Sequence as GenericSequence
+from typing import Set
+
+from vllm.block import BlockTable, PhysicalTokenBlock
+from vllm.core.evictor_v1 import EvictionPolicy, Evictor, make_evictor
+from vllm.core.interfaces import AllocStatus, BlockSpaceManager
+from vllm.logger import init_logger
+from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
+from vllm.utils import Device
+
+logger = init_logger(__name__)
+
+
+class BlockAllocatorBase(ABC):
+    """Manages free physical token blocks for a device.
+
+    The allocator maintains a list of free blocks and allocates a block when
+    requested. When a block is freed, its reference count is decremented. If
+    the reference count becomes zero, the block is added back to the free list.
+    """
+
+    @abstractmethod
+    def __init__(self,
+                 device: Device,
+                 block_size: int,
+                 num_blocks: int,
+                 eviction_policy: EvictionPolicy = EvictionPolicy.LRU):
+        pass
+
+    @abstractmethod
+    def allocate(self,
+                 block_hash: Optional[int] = None,
+                 num_hashed_tokens: int = 0) -> PhysicalTokenBlock:
+        pass
+
+    @abstractmethod
+    def free(self, block: PhysicalTokenBlock) -> None:
+        pass
+
+    @abstractmethod
+    def get_num_free_blocks(self) -> int:
+        pass
+
+    @abstractmethod
+    def get_num_total_blocks(self) -> int:
+        pass
+
+    @abstractmethod
+    def contains_block(self, block_hash: int) -> bool:
+        pass
+
+    @abstractmethod
+    def update_hash(self, block_hash: int, block: PhysicalTokenBlock):
+        pass
+
+
+class CachedBlockAllocator(BlockAllocatorBase):
+    """Manages free physical token blocks for a device.
+
+    The allocator maintains a list of free blocks and allocates a block when
+    requested. When a block is freed, its reference count is decremented. If
+    the reference count becomes zero, the block is added back to the free list.
+    """
+
+    def __init__(self,
+                 device: Device,
+                 block_size: int,
+                 num_blocks: int,
+                 eviction_policy: EvictionPolicy = EvictionPolicy.LRU) -> None:
+        self.device = device
+        self.block_size = block_size
+        self.num_blocks = num_blocks
+
+        self.current_num_blocks = 0
+        self.cached_blocks: Dict[int, PhysicalTokenBlock] = {}
+
+        self.evictor: Evictor = make_evictor(eviction_policy)
+
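+        # Blocks allocated without an explicit content hash get a unique
+        # placeholder hash from this monotonically increasing counter.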
+        self.default_hash_ctr = count()
+
+    def allocate_block(self, block_hash: int,
+                       num_hashed_tokens: int) -> PhysicalTokenBlock:
+        if self.current_num_blocks == self.num_blocks:
+            block = self.evictor.evict()
+            block.block_hash = block_hash
+            block.num_hashed_tokens = num_hashed_tokens
+            return block
+        block = PhysicalTokenBlock(device=self.device,
+                                   block_number=self.current_num_blocks,
+                                   block_size=self.block_size,
+                                   block_hash=block_hash,
+                                   num_hashed_tokens=num_hashed_tokens)
+        self.current_num_blocks += 1
+        return block
+
+    def allocate(self,
+                 block_hash: Optional[int] = None,
+                 num_hashed_tokens: int = 0) -> PhysicalTokenBlock:
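+        # Three cases: the block may be in the evictor's pool of reclaimable
+        # blocks (revive it), already cached (bump its refcount), or absent
+        # (allocate a fresh block, evicting another one if at capacity).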
+        if block_hash is None:
+            block_hash = next(self.default_hash_ctr)
+        if block_hash in self.evictor:
+            assert block_hash not in self.cached_blocks
+            block = self.evictor.remove(block_hash)
+            assert block.ref_count == 0
+            self.cached_blocks[block_hash] = block
+            block.ref_count += 1
+            assert block.block_hash == block_hash
+            return block
+        if block_hash not in self.cached_blocks:
+            self.cached_blocks[block_hash] = self.allocate_block(
+                block_hash, num_hashed_tokens)
+        block = self.cached_blocks[block_hash]
+        assert block.block_hash == block_hash
+        block.ref_count += 1
+        return block
+
+    def free(self, block: PhysicalTokenBlock) -> None:
+        if block.ref_count == 0:
+            raise ValueError(f"Double free! {block} is already freed.")
+        block.ref_count -= 1
+        if block.ref_count == 0:
+            assert block.block_hash not in self.evictor
+            self.evictor.add(block)
+
+            # Remove the block from the cached_blocks
+            del self.cached_blocks[block.block_hash]
+
+    def get_num_free_blocks(self) -> int:
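+        # Blocks held by the evictor have a refcount of zero, so they count
+        # as free alongside the blocks that were never allocated.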
+        return (self.num_blocks - self.current_num_blocks +
+                self.evictor.num_blocks)
+
+    def get_num_total_blocks(self) -> int:
+        return self.num_blocks
+
+    def contains_block(self, block_hash: int) -> bool:
+        return block_hash in self.cached_blocks or block_hash in self.evictor
+
+    def update_hash(self, block_hash: int, block: PhysicalTokenBlock):
+        # Update the hash of block and the cached_blocks dictionary.
+        assert not self.contains_block(block_hash)
+        old_hash = block.block_hash
+        block.block_hash = block_hash
+        del self.cached_blocks[old_hash]
+        self.cached_blocks[block_hash] = block
+
+
+class UncachedBlockAllocator(BlockAllocatorBase):
+    """Manages free physical token blocks for a device.
+
+    The allocator maintains a list of free blocks and allocates a block when
+    requested. When a block is freed, its reference count is decremented. If
+    the reference count becomes zero, the block is added back to the free list.
+    """
+
+    def __init__(
+        self,
+        device: Device,
+        block_size: int,
+        num_blocks: int,
+    ) -> None:
+        self.device = device
+        self.block_size = block_size
+        self.num_blocks = num_blocks
+
+        # Initialize the free blocks.
+        self.free_blocks: BlockTable = []
+        for i in range(num_blocks):
+            block = PhysicalTokenBlock(device=device,
+                                       block_number=i,
+                                       block_size=block_size,
+                                       block_hash=-1,
+                                       num_hashed_tokens=0)
+            self.free_blocks.append(block)
+
+    def allocate(self,
+                 block_hash: Optional[int] = None,
+                 num_hashed_tokens: int = 0) -> PhysicalTokenBlock:
+        if not self.free_blocks:
+            raise ValueError("Out of memory! No free blocks are available.")
+        block = self.free_blocks.pop()
+        block.ref_count = 1
+        return block
+
+    def free(self, block: PhysicalTokenBlock) -> None:
+        if block.ref_count == 0:
+            raise ValueError(f"Double free! {block} is already freed.")
+        block.ref_count -= 1
+        if block.ref_count == 0:
+            self.free_blocks.append(block)
+
+    def get_num_free_blocks(self) -> int:
+        return len(self.free_blocks)
+
+    def get_num_total_blocks(self) -> int:
+        return self.num_blocks
+
+    def contains_block(self, block_hash: int) -> bool:
+        raise NotImplementedError(
+            "Invalid codepath for uncached block allocator.")
+
+    def update_hash(self, block_hash: int, block: PhysicalTokenBlock):
+        raise NotImplementedError(
+            "Invalid codepath for uncached block allocator.")
+
+
+class BlockSpaceManagerV1(BlockSpaceManager):
+    """Manages the mapping between logical and physical token blocks."""
+
+    def __init__(
+        self,
+        block_size: int,
+        num_gpu_blocks: int,
+        num_cpu_blocks: int,
+        watermark: float = 0.01,
+        sliding_window: Optional[int] = None,
+        enable_caching: bool = False,
+    ) -> None:
+        self.block_size = block_size
+        self.num_total_gpu_blocks = num_gpu_blocks
+        self.num_total_cpu_blocks = num_cpu_blocks
+
+        if enable_caching and sliding_window is not None:
+            raise NotImplementedError(
+                "Sliding window is not allowed with prefix caching enabled!")
+
+        self.block_sliding_window = None
+        if sliding_window is not None:
+            # Round up to nearest block size to regularize sliding window
+            # allocation sizes.
+            self.block_sliding_window = math.ceil(sliding_window / block_size)
+
+        self.watermark = watermark
+        assert watermark >= 0.0
+
+        self.enable_caching = enable_caching
+
+        self.watermark_blocks = int(watermark * num_gpu_blocks)
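+        # With the default watermark of 0.01 and, say, 1000 GPU blocks, 10
+        # blocks are held back as headroom so new prompts stop being admitted
+        # shortly before the cache is completely full.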
+
+        if self.enable_caching:
+            logger.info("Automatic prefix caching is enabled.")
+            self.gpu_allocator: BlockAllocatorBase = CachedBlockAllocator(
+                Device.GPU, block_size, num_gpu_blocks)
+            self.cpu_allocator: BlockAllocatorBase = CachedBlockAllocator(
+                Device.CPU, block_size, num_cpu_blocks)
+        else:
+            self.gpu_allocator = UncachedBlockAllocator(
+                Device.GPU, block_size, num_gpu_blocks)
+            self.cpu_allocator = UncachedBlockAllocator(
+                Device.CPU, block_size, num_cpu_blocks)
+        # Mapping: seq_id -> BlockTable.
+        self.block_tables: Dict[int, BlockTable] = {}
+
+    def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
+        # FIXME(woosuk): Here we assume that all sequences in the group share
+        # the same prompt. This may not be true for preempted sequences.
+        seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]
+        num_required_blocks = len(seq.logical_token_blocks)
+
+        if self.block_sliding_window is not None:
+            num_required_blocks = min(num_required_blocks,
+                                      self.block_sliding_window)
+        num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
+
+        # Use watermark to avoid frequent cache eviction.
+        if (self.num_total_gpu_blocks - num_required_blocks <
+                self.watermark_blocks):
+            return AllocStatus.NEVER
+        if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
+            return AllocStatus.OK
+        else:
+            return AllocStatus.LATER
+
+    def allocate(self, seq_group: SequenceGroup) -> None:
+        # NOTE: Here we assume that all sequences in the group have the same
+        # prompt.
+        seq = seq_group.get_seqs(status=SequenceStatus.WAITING)[0]
+
+        # Allocate new physical token blocks that will store the prompt tokens.
+        num_prompt_blocks = len(seq.logical_token_blocks)
+
+        block_table: BlockTable = []
+        for logical_idx in range(num_prompt_blocks):
+            if (self.block_sliding_window is not None
+                    and logical_idx >= self.block_sliding_window):
+                block = block_table[logical_idx % self.block_sliding_window]
+                # Set the reference counts of the token blocks.
+                block.ref_count = seq_group.num_seqs()
+            elif self.enable_caching:
+                block = self.gpu_allocator.allocate(
+                    seq.hash_of_block(logical_idx),
+                    seq.num_hashed_tokens_of_block(logical_idx))
+            else:
+                block = self.gpu_allocator.allocate()
+                # Set the reference counts of the token blocks.
+                block.ref_count = seq_group.num_seqs()
+            block_table.append(block)
+
+        # Assign the block table for each sequence.
+        for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
+            self.block_tables[seq.seq_id] = block_table.copy()
+
+    def can_append_slots(self,
+                         seq_group: SequenceGroup,
+                         num_lookahead_slots: int = 0) -> bool:
+        assert (num_lookahead_slots == 0
+                ), "lookahead allocation not supported in BlockSpaceManagerV1"
+
+        # Simple heuristic: If there is at least one free block
+        # for each sequence, we can append.
+        num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
+        num_seqs = seq_group.num_seqs(status=SequenceStatus.RUNNING)
+        return num_seqs <= num_free_gpu_blocks
+
+    def _promote_last_block(
+        self,
+        seq: Sequence,
+        last_block: PhysicalTokenBlock,
+    ) -> PhysicalTokenBlock:
+        assert self.enable_caching
+
+        # Compute a new hash for the block so that it can be shared by other
+        # Sequences
+        new_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1)
+
+        # if new_hash is already in the cached table, then free last_block
+        # and return the cached version
+        if self.gpu_allocator.contains_block(new_hash):
+            self.gpu_allocator.free(last_block)
+            return self.gpu_allocator.allocate(new_hash)
+        else:
+            self.gpu_allocator.update_hash(new_hash, last_block)
+            return last_block
+
+    def _is_last_block_full(
+        self,
+        seq: Sequence,
+    ) -> bool:
+        token_ids_len = seq.data.get_len()
+        return token_ids_len > 0 and token_ids_len % seq.block_size == 0
+
+    def _maybe_promote_last_block(
+        self,
+        seq: Sequence,
+        last_block: PhysicalTokenBlock,
+    ) -> PhysicalTokenBlock:
+        if self._is_last_block_full(seq):
+            return self._promote_last_block(seq, last_block)
+        else:
+            return last_block
+
+    def _allocate_last_physical_block(
+        self,
+        seq: Sequence,
+    ) -> PhysicalTokenBlock:
+        # Called before a new block is appended.
+        # This is in charge of allocating a new physical block (to be appended).
+
+        # None if the last block is not full. Otherwise, we set it to the
+        # content hash.
+        if not self.enable_caching:
+            return self.gpu_allocator.allocate()
+        block_hash: Optional[int] = None
+        if (self._is_last_block_full(seq)):
+            block_hash = seq.hash_of_block(len(seq.logical_token_blocks) - 1)
+        num_hashed_tokens = seq.num_hashed_tokens_of_block(
+            len(seq.logical_token_blocks) - 1)
+
+        # num_hashed_tokens is used to compute future hashes
+        # (e.g. in the hashing function, it is used to ask the sequence for
+        # prefix tokens)
+        new_block = self.gpu_allocator.allocate(block_hash, num_hashed_tokens)
+
+        # If the block hash is None, then the block is not full.
+        # If the block is not full, then we expect it to have a refcount of 1.
+        if block_hash is None:
+            assert new_block.ref_count == 1
+        return new_block
+
+    def append_slots(
+        self,
+        seq: Sequence,
+        num_lookahead_slots: int = 0,
+    ) -> Dict[int, List[int]]:
+        """Allocate a physical slot for a new token."""
+        logical_blocks = seq.logical_token_blocks
+        block_table = self.block_tables[seq.seq_id]
+        # If we need to allocate a new physical block
+        if len(block_table) < len(logical_blocks):
+            # Currently this code only supports adding one physical block
+            assert len(block_table) == len(logical_blocks) - 1
+
+            if (self.block_sliding_window
+                    and len(block_table) >= self.block_sliding_window):
+                # reuse a block
+                block_table.append(block_table[len(block_table) %
+                                               self.block_sliding_window])
+            else:
+                # The sequence has a new logical block.
+                # Allocate a new physical block.
+                new_block = self._allocate_last_physical_block(seq)
+                block_table.append(new_block)
+                return {}
+
+        # We want to append the token to the last physical block.
+        last_block = block_table[-1]
+        assert last_block.device == Device.GPU
+        if last_block.ref_count == 1:
+            # Not shared with other sequences. Appendable.
+            if self.enable_caching:
+                # If the last block is now complete, we may reuse an old block
+                # to save memory.
+                maybe_new_block = self._maybe_promote_last_block(
+                    seq, last_block)
+                block_table[-1] = maybe_new_block
+            return {}
+        else:
+            # The last block is shared with other sequences.
+            # Copy on Write: Allocate a new block and copy the tokens.
+            new_block = self._allocate_last_physical_block(seq)
+
+            block_table[-1] = new_block
+            self.gpu_allocator.free(last_block)
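+            # The returned {src: [dst]} mapping tells the caller (the
+            # scheduler) which physical block must be copied to finish the
+            # copy-on-write.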
+            return {last_block.block_number: [new_block.block_number]}
+
+    def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
+        # NOTE: fork does not allocate a new physical block.
+        # Thus, it is always safe from OOM.
+        src_block_table = self.block_tables[parent_seq.seq_id]
+        self.block_tables[child_seq.seq_id] = src_block_table.copy()
+        # When using a sliding window, blocks will be eventually reused.
+        # In this case the block tables will contain repeated blocks.
+        # When forking, we must make sure that each block's `ref_count`
+        # is only incremented by one, so we deduplicate them by wrapping
+        # them in a set.
+        for block in set(src_block_table):
+            block.ref_count += 1
+
+    def _get_physical_blocks(
+            self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]:
+        # NOTE: Here, we assume that the physical blocks are only shared by
+        # the sequences in the same group.
+        blocks: Set[PhysicalTokenBlock] = set()
+        for seq in seq_group.get_seqs():
+            if seq.is_finished():
+                continue
+            blocks.update(self.block_tables[seq.seq_id])
+        return list(blocks)
+
+    def can_swap_in(self,
+                    seq_group: SequenceGroup,
+                    num_lookahead_slots: int = 0) -> AllocStatus:
+        assert (num_lookahead_slots == 0
+                ), "BlockSpaceManagerV1 does not support lookahead allocation"
+        blocks = self._get_physical_blocks(seq_group)
+        num_swapped_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED)
+        num_free_blocks = self.gpu_allocator.get_num_free_blocks()
+        # NOTE: Conservatively, we assume that every sequence will allocate
+        # at least one free block right after the swap-in.
+        # NOTE: This should match the logic in can_append_slot().
+        num_required_blocks = len(blocks) + num_swapped_seqs
+        if self.gpu_allocator.get_num_total_blocks() < num_required_blocks:
+            return AllocStatus.NEVER
+        elif num_free_blocks - num_required_blocks >= self.watermark_blocks:
+            return AllocStatus.OK
+        else:
+            return AllocStatus.LATER
+
+    def swap_in(self,
+                seq_group: SequenceGroup,
+                num_lookahead_slots: int = 0) -> Dict[int, int]:
+        assert (num_lookahead_slots == 0
+                ), "BlockSpaceManagerV1 does not support lookahead allocation"
+
+        # CPU block -> GPU block.
+        mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
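+        # A block shared by several sequences in the group is swapped in only
+        # once; later references reuse the new GPU block and bump its refcount.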
+        for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
+            new_block_table: BlockTable = []
+            block_table = self.block_tables[seq.seq_id]
+
+            for cpu_block in block_table:
+                if cpu_block in mapping:
+                    gpu_block = mapping[cpu_block]
+                    gpu_block.ref_count += 1
+                else:
+                    gpu_block = self.gpu_allocator.allocate(
+                        cpu_block.block_hash, cpu_block.num_hashed_tokens)
+                    mapping[cpu_block] = gpu_block
+                new_block_table.append(gpu_block)
+                # Free the CPU block swapped in to GPU.
+                self.cpu_allocator.free(cpu_block)
+            self.block_tables[seq.seq_id] = new_block_table
+
+        block_number_mapping = {
+            cpu_block.block_number: gpu_block.block_number
+            for cpu_block, gpu_block in mapping.items()
+        }
+        return block_number_mapping
+
+    def can_swap_out(self, seq_group: SequenceGroup) -> bool:
+        blocks = self._get_physical_blocks(seq_group)
+        return len(blocks) <= self.cpu_allocator.get_num_free_blocks()
+
+    def swap_out(self, seq_group: SequenceGroup) -> Dict[int, int]:
+        # GPU block -> CPU block.
+        mapping: Dict[PhysicalTokenBlock, PhysicalTokenBlock] = {}
+        for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+            new_block_table: BlockTable = []
+            block_table = self.block_tables[seq.seq_id]
+
+            for gpu_block in block_table:
+                if gpu_block in mapping:
+                    cpu_block = mapping[gpu_block]
+                    cpu_block.ref_count += 1
+                else:
+                    cpu_block = self.cpu_allocator.allocate(
+                        gpu_block.block_hash, gpu_block.num_hashed_tokens)
+                    mapping[gpu_block] = cpu_block
+                new_block_table.append(cpu_block)
+                # Free the GPU block swapped out to CPU.
+                self.gpu_allocator.free(gpu_block)
+            self.block_tables[seq.seq_id] = new_block_table
+
+        block_number_mapping = {
+            gpu_block.block_number: cpu_block.block_number
+            for gpu_block, cpu_block in mapping.items()
+        }
+        return block_number_mapping
+
+    def _free_block_table(self, block_table: BlockTable) -> None:
+        # when using a sliding window, each seq will only use up
+        # to `self.block_sliding_window` blocks. When freeing
+        # the block table, we must make sure to not free blocks more
+        # than once. If no sliding window is used, there is no block
+        # reuse in the block table, so we must free all blocks.
+        blocks_to_free = (block_table[-self.block_sliding_window:]
+                          if self.block_sliding_window is not None else
+                          block_table)
+        for block in set(blocks_to_free):
+            if block.device == Device.GPU:
+                self.gpu_allocator.free(block)
+            else:
+                self.cpu_allocator.free(block)
+
+    def free(self, seq: Sequence) -> None:
+        if seq.seq_id not in self.block_tables:
+            # Already freed or haven't been scheduled yet.
+            return
+        block_table = self.block_tables[seq.seq_id]
+        self._free_block_table(block_table)
+        del self.block_tables[seq.seq_id]
+
+    def reset(self) -> None:
+        for block_table in self.block_tables.values():
+            self._free_block_table(block_table)
+        self.block_tables.clear()
+
+    def get_block_table(self, seq: Sequence) -> List[int]:
+        block_table = self.block_tables[seq.seq_id]
+        return [block.block_number for block in block_table]
+
+    def get_num_free_gpu_blocks(self) -> int:
+        return self.gpu_allocator.get_num_free_blocks()
+
+    def get_num_free_cpu_blocks(self) -> int:
+        return self.cpu_allocator.get_num_free_blocks()
+
+    def access_all_blocks_in_seq(
+        self,
+        seq: Sequence,
+        access_time: float,
+    ) -> None:
+        if self.enable_caching:
+            # Update the last accessed time of all the blocks accessed
+            # in this step.
+            block_table = self.block_tables[seq.seq_id]
+            for block in block_table:
+                block.last_accessed = access_time
+
+    def compute_full_blocks_in_seq(self, seq: Sequence):
+        if seq.seq_id not in self.block_tables:
+            return
+        max_full_block = seq.get_len() // self.block_size - 1
+        block_table = self.block_tables[seq.seq_id]
+        if max_full_block == -1:
+            return
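+        # Walk backwards and stop at the first block already marked computed:
+        # everything before it was marked in an earlier step.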
+        for i in reversed(range(max_full_block)):
+            if block_table[i].computed:
+                break
+            block_table[i].computed = True
+
+    def get_all_computed_blocks(self, seq: Sequence) -> List[int]:
+        if seq.seq_id not in self.block_tables:
+            return []
+        block_table = self.block_tables[seq.seq_id]
+        # NOTE We exclude the last block to avoid the case where the entire
+        # prompt is cached. This would cause erroneous behavior in model
+        # runner.
+        return [
+            b.block_number
+            for b in takewhile(lambda b: b.computed, block_table[:-1])
+        ]
+
+    def get_common_computed_block_ids(
+            self, seqs: List[Sequence]) -> GenericSequence[int]:
+        """Return the block ids that are common for a given sequence group.
+
+        Used in prefill (can skip prefill of some blocks).
+        """
+        # Can return non-empty result only with prefix caching enabled.
+        if not self.enable_caching:
+            return []
+
+        ids_list = [self.get_all_computed_blocks(seq) for seq in seqs]
+        return commonprefix([ids for ids in ids_list if ids != []])
+
+    def mark_blocks_as_computed(self, seq_group: SequenceGroup):
+        if self.enable_caching:
+            for seq in seq_group.seqs_dict.values():
+                self.compute_full_blocks_in_seq(seq)
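
For orientation, here is a minimal sketch (not part of the wheel) of how a scheduler might drive BlockSpaceManagerV1 through one request's lifetime. It assumes seq_group is a vllm.sequence.SequenceGroup whose sequences move through the WAITING/RUNNING states the way the real vllm scheduler sets them; the sizes are illustrative.

    from vllm.core.block_manager_v1 import BlockSpaceManagerV1
    from vllm.core.interfaces import AllocStatus

    # 16 tokens per block, 1000 GPU blocks, 256 CPU blocks for swap space.
    manager = BlockSpaceManagerV1(block_size=16,
                                  num_gpu_blocks=1000,
                                  num_cpu_blocks=256,
                                  enable_caching=True)

    # `seq_group` is assumed to hold one WAITING sequence (see above).
    if manager.can_allocate(seq_group) == AllocStatus.OK:
        manager.allocate(seq_group)  # map the prompt blocks onto the GPU

    seq = seq_group.get_seqs()[0]
    if manager.can_append_slots(seq_group):
        cow = manager.append_slots(seq)  # one decode step; may trigger CoW
        # `cow` maps a source block number to the block number(s) it must
        # be copied to before the next model step.

    if manager.can_swap_out(seq_group):
        manager.swap_out(seq_group)  # preemption: move blocks GPU -> CPU

    manager.free(seq)  # release the sequence's blocks when it finishes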