vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/core/__init__.py ADDED
File without changes
vllm/core/block/__init__.py ADDED
File without changes
vllm/core/block/block_table.py ADDED
@@ -0,0 +1,295 @@
+ from typing import List, Optional
+
+ from vllm.core.block.interfaces import Block, DeviceAwareBlockAllocator
+ from vllm.utils import Device, cdiv, chunk_list
+
+
+ class BlockTable:
+     """A class to manage blocks for a specific sequence.
+
+     The BlockTable maps a sequence of tokens to a list of blocks, where each
+     block represents a contiguous memory allocation for a portion of the
+     sequence. The blocks are managed by a DeviceAwareBlockAllocator, which is
+     responsible for allocating and freeing memory for the blocks.
+
+     Args:
+         block_size (int): The maximum number of tokens that can be stored in a
+             single block.
+         block_allocator (DeviceAwareBlockAllocator): The block allocator used to
+             manage memory for the blocks.
+         _blocks (Optional[List[Block]], optional): An optional list of existing
+             blocks to initialize the BlockTable with. If not provided, an empty
+             BlockTable is created.
+
+     Attributes:
+         _block_size (int): The maximum number of tokens that can be stored in a
+             single block.
+         _allocator (DeviceAwareBlockAllocator): The block allocator used to
+             manage memory for the blocks.
+         _blocks (Optional[List[Block]]): The list of blocks managed by this
+             BlockTable.
+         _num_full_slots (int): The number of tokens currently stored in the
+             blocks.
+     """
+
+     def __init__(
+         self,
+         block_size: int,
+         block_allocator: DeviceAwareBlockAllocator,
+         _blocks: Optional[List[Block]] = None,
+     ):
+         self._block_size = block_size
+         self._allocator = block_allocator
+         if _blocks is None:
+             _blocks = []
+         self._blocks: List[Block] = _blocks
+
+         # Use helper method instead of directly calculating, as blocks
+         # may not be allocated.
+         self._num_full_slots = len(self._get_all_token_ids())
+
+     @staticmethod
+     def get_num_required_blocks(token_ids: List[int], block_size: int) -> int:
+         """Calculates the minimum number of blocks required to store a given
+         sequence of token IDs.
+
+         This assumes worst-case scenario, where every block requires a new
+         allocation (e.g. ignoring prefix caching).
+
+         Args:
+             token_ids (List[int]): The sequence of token IDs to be stored.
+             block_size (int): The maximum number of tokens that can be stored in
+                 a single block.
+
+         Returns:
+             int: The minimum number of blocks required to store the given
+                 sequence of token IDs.
+         """
+         return cdiv(len(token_ids), block_size)
+
+     def allocate(self,
+                  token_ids: List[int],
+                  device: Device = Device.GPU) -> None:
+         """Allocates memory blocks for storing the given sequence of token IDs.
+
+         This method allocates the required number of blocks to store the given
+         sequence of token IDs.
+
+         Args:
+             token_ids (List[int]): The sequence of token IDs to be stored.
+             device (Device, optional): The device on which the blocks should be
+                 allocated. Defaults to Device.GPU.
+         """
+         assert not self._is_allocated
+         assert token_ids
+         self._blocks = self._allocate_blocks_for_token_ids(prev_block=None,
+                                                            token_ids=token_ids,
+                                                            device=device)
+         self._num_full_slots = len(token_ids)
+
+     def append_token_ids(self,
+                          token_ids: List[int],
+                          num_lookahead_slots: int = 0) -> None:
+         """Appends a sequence of token IDs to the existing blocks in the
+         BlockTable.
+
+         This method appends the given sequence of token IDs to the existing
+         blocks in the BlockTable. If there is not enough space in the existing
+         blocks, new blocks are allocated using the `ensure_num_empty_slots`
+         method to accommodate the additional tokens.
+
+         The token IDs are divided into chunks of size `block_size` (except for
+         the first chunk, which may be smaller), and each chunk is appended to a
+         separate block.
+
+         Args:
+             token_ids (List[int]): The sequence of token IDs to be appended.
+         """
+         assert self._is_allocated
+         assert len(self._blocks) > 0
+
+         self.ensure_num_empty_slots(num_empty_slots=len(token_ids) +
+                                     num_lookahead_slots)
+
+         blocks = self._blocks[self._num_full_slots // self._block_size:]
+         token_blocks = self._chunk_token_blocks_for_append(token_ids)
+
+         for block, token_block in zip(blocks, token_blocks):
+             block.append_token_ids(token_block)
+
+         self._num_full_slots += len(token_ids)
+
+     def ensure_num_empty_slots(self, num_empty_slots: int) -> None:
+         """Ensures that the BlockTable has at least the specified number of
+         empty slots available.
+
+         This method checks if the BlockTable has enough empty slots (i.e.,
+         available space) to accommodate the requested number of tokens. If not,
+         it allocates additional blocks on the GPU to ensure that the required
+         number of empty slots is available.
+
+         Args:
+             num_empty_slots (int): The minimum number of empty slots required.
+         """
+         # Currently the block table only supports
+         # appending tokens to GPU blocks.
+         device = Device.GPU
+         assert self._is_allocated
+
+         if self._num_empty_slots >= num_empty_slots:
+             return
+
+         slots_to_allocate = num_empty_slots - self._num_empty_slots
+         blocks_to_allocate = cdiv(slots_to_allocate, self._block_size)
+
+         for _ in range(blocks_to_allocate):
+             assert len(self._blocks) > 0
+             self._blocks.append(
+                 self._allocator.allocate_mutable(prev_block=self._blocks[-1],
+                                                  device=device))
+
+     def fork(self) -> "BlockTable":
+         """Creates a new BlockTable instance with a copy of the blocks from the
+         current instance.
+
+         This method creates a new BlockTable instance with the same block size,
+         block allocator, and a copy of the blocks from the current instance. The
+         new BlockTable has its own independent set of blocks, but shares the
+         same underlying memory allocation with the original BlockTable.
+
+         Returns:
+             BlockTable: A new BlockTable instance with a copy of the blocks from
+                 the current instance.
+         """
+         assert self._is_allocated
+         assert len(self._blocks) > 0
+         forked_blocks = self._allocator.fork(self._blocks[-1])
+         return BlockTable(
+             block_size=self._block_size,
+             block_allocator=self._allocator,
+             _blocks=forked_blocks,
+         )
+
+     def free(self) -> None:
+         """Frees the memory occupied by the blocks in the BlockTable.
+
+         This method iterates over all the blocks in the `_blocks` list and calls
+         the `free` method of the `_allocator` object to release the memory
+         occupied by each block. After freeing all the blocks, the `_blocks` list
+         is set to `None`.
+         """
+         assert self._is_allocated
+         for block in self._blocks:
+             self._allocator.free(block)
+         self._blocks = []
+
+     @property
+     def physical_block_ids(self) -> List[Optional[int]]:
+         """Returns a list of physical block indices for the blocks in the
+         BlockTable.
+
+         This property returns a list of integers, where each integer represents
+         the physical block index of a corresponding block in the `_blocks` list.
+         The physical block index is a unique identifier for the memory location
+         occupied by the block.
+
+         Returns:
+             List[int]: A list of physical block indices for the blocks in the
+                 BlockTable.
+         """
+         assert self._is_allocated
+         return [block.block_id for block in self._blocks]
+
+     def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]:
+         """Get the number of "unseen" tokens in the sequence.
+
+         Unseen tokens are tokens in the sequence corresponding to this block
+         table, but are not yet appended to this block table.
+
+         Args:
+             sequence_token_ids (List[int]): The list of token ids in the
+                 sequence.
+
+         Returns:
+             List[int]: The postfix of sequence_token_ids that has not yet been
+                 appended to the block table.
+         """
+
+         # Since the block table is append-only, the unseen token ids are the
+         # ones after the appended ones.
+         return sequence_token_ids[self.num_full_slots:]
+
+     def _allocate_blocks_for_token_ids(self, prev_block: Optional[Block],
+                                        token_ids: List[int],
+                                        device: Device) -> List[Block]:
+         blocks = []
+         for block_token_ids in chunk_list(token_ids, self._block_size):
+             if len(block_token_ids) == self._block_size:
+                 # If the block is full, create an immutable block.
+                 prev_block = self._allocator.allocate_immutable(
+                     prev_block, token_ids=block_token_ids, device=device)
+             else:
+                 # Else, partially fill a mutable block with token ids.
+                 prev_block = self._allocator.allocate_mutable(
+                     prev_block=prev_block, device=device)
+                 prev_block.append_token_ids(block_token_ids)
+             blocks.append(prev_block)
+
+         return blocks
+
+     def _get_all_token_ids(self) -> List[int]:
+         # NOTE: This function is O(seq_len); use sparingly.
+         token_ids: List[int] = []
+
+         if not self._is_allocated:
+             return token_ids
+
+         for block in self._blocks:
+             token_ids.extend(block.token_ids)
+
+         return token_ids
+
+     @property
+     def _is_allocated(self) -> bool:
+         return len(self._blocks) > 0
+
+     @property
+     def _num_empty_slots(self) -> int:
+         assert self._is_allocated
+         return len(self._blocks) * self._block_size - self._num_full_slots
+
+     @property
+     def num_full_slots(self) -> int:
+         """Returns the total number of tokens currently stored in the
+         BlockTable.
+
+         Returns:
+             int: The total number of tokens currently stored in the BlockTable.
+         """
+         return self._num_full_slots
+
+     def get_num_blocks_touched_by_append_slots(
+             self, token_ids: List[int], num_lookahead_slots: int) -> int:
+         """Determine how many blocks will be "touched" by appending the token
+         ids.
+
+         This is required for the scheduler to determine whether a sequence can
+         continue generation, or if it must be preempted.
+         """
+
+         all_token_ids = token_ids + [-1] * num_lookahead_slots
+         token_blocks = self._chunk_token_blocks_for_append(all_token_ids)
+         return len(token_blocks)
+
+     def _chunk_token_blocks_for_append(
+             self, token_ids: List[int]) -> List[List[int]]:
+         """Split the token ids into block-sized chunks so they can be easily
+         appended to blocks. The first such "token block" may have less token ids
+         than the block size, since the last allocated block may be partially
+         full.
+         """
+         first_chunk_size = self._block_size - (self._num_full_slots %
+                                                self._block_size)
+         token_blocks = [token_ids[:first_chunk_size]] + chunk_list(
+             token_ids[first_chunk_size:], self._block_size)
+         return token_blocks
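
The block_table.py hunk above only shows the class definition, so here is a small, hedged sketch of how a BlockTable is typically driven, paired with the CpuGpuBlockAllocator that this wheel also ships (vllm/core/block/cpu_gpu_block_allocator.py). It is illustrative only and not part of the diff: the create(...) keyword arguments, the block size, and the token ids are assumptions, and it presumes the installed wheel imports cleanly in the target environment.

# Illustrative sketch only -- assumes the vllm_npu 0.4.2 wheel and its
# dependencies are installed and importable. The allocator construction is
# based on vllm/core/block/cpu_gpu_block_allocator.py from this same wheel
# and may differ slightly; the concrete numbers are made up for the example.
from vllm.core.block.block_table import BlockTable
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator

block_size = 16
allocator = CpuGpuBlockAllocator.create(
    allocator_type="naive",   # pure-Python bookkeeping, no device memory touched
    num_gpu_blocks=8,
    num_cpu_blocks=8,
    block_size=block_size,
)

table = BlockTable(block_size=block_size, block_allocator=allocator)

# 40 tokens with block_size=16 -> cdiv(40, 16) = 3 blocks.
prompt = list(range(40))
assert BlockTable.get_num_required_blocks(prompt, block_size) == 3
table.allocate(prompt)
print(table.physical_block_ids)   # e.g. [0, 1, 2]

# Appending fills the partially used last block before allocating new ones.
table.append_token_ids([100, 101, 102])
print(table.num_full_slots)       # 43

# fork() shares the underlying blocks; copy-on-write happens on later appends.
forked = table.fork()

table.free()
forked.free()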
vllm/core/block/common.py ADDED
@@ -0,0 +1,199 @@
+ from collections import defaultdict
+ from typing import Dict, Iterable, List, Optional, Protocol
+
+ from vllm.core.block.interfaces import Block, BlockAllocator
+
+ BlockId = int
+ RefCount = int
+
+
+ class RefCounterProtocol(Protocol):
+
+     def incr(self, block_id: BlockId) -> RefCount:
+         raise NotImplementedError
+
+     def decr(self, block_id: BlockId) -> RefCount:
+         raise NotImplementedError
+
+     def get(self, block_id: BlockId) -> RefCount:
+         raise NotImplementedError
+
+
+ class RefCounter(RefCounterProtocol):
+     """A class for managing reference counts for a set of block indices.
+
+     The RefCounter class maintains a dictionary that maps block indices to their
+     corresponding reference counts. It provides methods to increment, decrement,
+     and retrieve the reference count for a given block index.
+
+     Args:
+         all_block_indices (Iterable[BlockId]): An iterable of block indices
+             to initialize the reference counter with.
+     """
+
+     def __init__(self, all_block_indices: Iterable[BlockId]):
+         deduped = set(all_block_indices)
+         self._refcounts: Dict[BlockId,
+                               RefCount] = {index: 0
+                                            for index in deduped}
+
+     def incr(self, block_id: BlockId) -> RefCount:
+         assert block_id in self._refcounts
+         pre_incr_refcount = self._refcounts[block_id]
+
+         assert pre_incr_refcount >= 0
+
+         post_incr_refcount = pre_incr_refcount + 1
+         self._refcounts[block_id] = post_incr_refcount
+         return post_incr_refcount
+
+     def decr(self, block_id: BlockId) -> RefCount:
+         assert block_id in self._refcounts
+         refcount = self._refcounts[block_id]
+
+         assert refcount > 0
+         refcount -= 1
+
+         self._refcounts[block_id] = refcount
+
+         return refcount
+
+     def get(self, block_id: BlockId) -> RefCount:
+         assert block_id in self._refcounts
+         return self._refcounts[block_id]
+
+     def as_readonly(self) -> "ReadOnlyRefCounter":
+         return ReadOnlyRefCounter(self)
+
+
+ class ReadOnlyRefCounter(RefCounterProtocol):
+     """A read-only view of the RefCounter class.
+
+     The ReadOnlyRefCounter class provides a read-only interface to access the
+     reference counts maintained by a RefCounter instance. It does not allow
+     modifications to the reference counts.
+
+     Args:
+         refcounter (RefCounter): The RefCounter instance to create a read-only
+             view for.
+     """
+
+     def __init__(self, refcounter: RefCounter):
+         self._refcounter = refcounter
+
+     def incr(self, block_id: BlockId) -> RefCount:
+         raise ValueError("Incr not allowed")
+
+     def decr(self, block_id: BlockId) -> RefCount:
+         raise ValueError("Decr not allowed")
+
+     def get(self, block_id: BlockId) -> RefCount:
+         return self._refcounter.get(block_id)
+
+
+ class CopyOnWriteTracker:
+     """A class for tracking and managing copy-on-write operations for blocks.
+
+     The CopyOnWriteTracker class maintains a mapping of source block indices to
+     their corresponding copy-on-write destination block indices. It works in
+     conjunction with a RefCounter and a BlockAllocator to handle reference
+     counting and block allocation.
+
+     Args:
+         refcounter (RefCounter): The reference counter used to track block
+             reference counts.
+         allocator (BlockAllocator): The block allocator used to allocate and
+             free blocks.
+     """
+
+     def __init__(
+         self,
+         refcounter: RefCounterProtocol,
+         allocator: BlockAllocator,
+     ):
+         self._copy_on_writes: Dict[BlockId, List[BlockId]] = defaultdict(list)
+         self._refcounter = refcounter
+         self._allocator = allocator
+
+     def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
+         """Performs a copy-on-write operation on the given block if it is not
+         appendable.
+
+         This method checks the reference count of the given block. If the
+         reference count is greater than 1, indicating that the block is shared,
+         a copy-on-write operation is performed. The original block is freed,
+         and a new block is allocated with the same content. The new block index
+         is returned.
+
+         Args:
+             block (Block): The block to check for copy-on-write.
+
+         Returns:
+             Optional[BlockId]: The block index of the new block if a copy-on
+                 -write operation was performed, or the original block index if
+                 no copy-on-write was necessary.
+         """
+         block_id = block.block_id
+         if block_id is None:
+             return block_id
+
+         refcount = self._refcounter.get(block_id)
+         assert refcount != 0
+         if refcount > 1:
+             src_block_id = block_id
+
+             # Decrement refcount of the old block.
+             self._allocator.free(block)
+
+             # Allocate a fresh new block.
+             block_id = self._allocator.allocate_mutable(
+                 prev_block=block.prev_block).block_id
+
+             # Track src/dst copy.
+             assert src_block_id is not None
+             assert block_id is not None
+             self._copy_on_writes[src_block_id].append(block_id)
+
+         return block_id
+
+     def clear_cows(self) -> Dict[BlockId, List[BlockId]]:
+         """Clears the copy-on-write tracking information and returns the current
+         state.
+
+         This method returns a dictionary mapping source block indices to lists
+         of destination block indices for the current copy-on-write operations.
+         It then clears the internal tracking information.
+
+         Returns:
+             Dict[BlockId, List[BlockId]]: A dictionary mapping source
+                 block indices to lists of destination block indices for the
+                 current copy-on-write operations.
+         """
+         cows = dict(self._copy_on_writes)
+         self._copy_on_writes.clear()
+         return cows
+
+
+ def get_all_blocks_recursively(last_block: Block) -> List[Block]:
+     """Retrieves all the blocks in a sequence starting from the last block.
+
+     This function recursively traverses the sequence of blocks in reverse order,
+     starting from the given last block, and returns a list of all the blocks in
+     the sequence.
+
+     Args:
+         last_block (Block): The last block in the sequence.
+
+     Returns:
+         List[Block]: A list of all the blocks in the sequence, in the order they
+             appear.
+     """
+
+     def recurse(block: Block, lst: List[Block]) -> None:
+         if block.prev_block is not None:
+             recurse(block.prev_block, lst)
+         lst.append(block)
+
+     all_blocks: List[Block] = []
+     recurse(last_block, all_blocks)
+     return all_blocks
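
To make the reference-counting contract above concrete, here is a minimal sketch that exercises only the RefCounter and ReadOnlyRefCounter APIs shown in this hunk; the block indices are made-up example values, and the import assumes the installed wheel.

# Minimal sketch -- assumes the vllm_npu 0.4.2 wheel is installed; the
# indices below are arbitrary example values.
from vllm.core.block.common import RefCounter

# Track four block ids; every refcount starts at zero.
refcounter = RefCounter(all_block_indices=range(4))

refcounter.incr(0)        # -> 1: block 0 has a single owner
refcounter.incr(0)        # -> 2: block 0 is now shared
print(refcounter.get(0))  # 2

# A shared block (refcount > 1) is exactly what CopyOnWriteTracker's
# cow_block_if_not_appendable() checks for before allocating a fresh block.
refcounter.decr(0)        # -> 1

readonly = refcounter.as_readonly()
print(readonly.get(0))    # 1
# readonly.incr(0) / readonly.decr(0) raise ValueError by design.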