vllm-npu 0.4.2: vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/core/block/prefix_caching_block.py
@@ -0,0 +1,606 @@
"""Token blocks."""
from itertools import takewhile
from os.path import commonprefix
from typing import Dict, FrozenSet, Iterable, List, Optional

from vllm.core.block.common import (CopyOnWriteTracker,
                                    get_all_blocks_recursively)
from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
from vllm.core.evictor_v2 import EvictionPolicy, Evictor, make_evictor

PrefixHash = int

# By default, a block's last-accessed time is initialized to
# _DEFAULT_LAST_ACCESSED_TIME; a block that still holds this value is
# known to have never been accessed.
_DEFAULT_LAST_ACCESSED_TIME = -1


class PrefixCachingBlockAllocator(BlockAllocator):
    """A block allocator that implements prefix caching.

    The PrefixCachingBlockAllocator maintains a cache of blocks based on their
    content hash. It reuses blocks with the same content hash to avoid
    redundant memory allocation. The allocator also supports copy-on-write
    operations.

    Args:
        num_blocks (int): The total number of blocks to manage.
        block_size (int): The size of each block in tokens.
        block_ids (Optional[Iterable[int]], optional): An optional iterable of
            block IDs. If not provided, block IDs will be assigned sequentially
            from 0 to num_blocks - 1.
    """

    def __init__(
        self,
        num_blocks: int,
        block_size: int,
        block_ids: Optional[Iterable[int]] = None,
        eviction_policy: EvictionPolicy = EvictionPolicy.LRU,
    ):
        # A mapping of prefix hash to block index. All blocks that have a
        # prefix hash are in this dict, even if they have a refcount of 0.
        self._cached_blocks: Dict[PrefixHash, BlockId] = {}

        # A mapping of block id to Block, used to track cached blocks.
        self._blocks: Dict[BlockId, Block] = {}

        # An allocator for blocks that do not have prefix hashes.
        self._hashless_allocator = NaiveBlockAllocator(
            create_block=self._create_block,  # type: ignore
            num_blocks=num_blocks,
            block_size=block_size,
            block_ids=block_ids,
        )

        self._block_size = block_size

        # Evictor that decides how computed blocks are reclaimed when memory
        # pressure is high.
        self.evictor: Evictor = make_evictor(eviction_policy)

        # We share the refcounter between allocators. This allows us to promote
        # blocks originally allocated in the hashless allocator to immutable
        # blocks.
        self._refcounter = self._hashless_allocator.refcounter

        self._cow_tracker = CopyOnWriteTracker(
            refcounter=self._refcounter.as_readonly(),
            allocator=self,
        )

    # Implements Block.Factory.
    def _create_block(
        self,
        prev_block: Optional[Block],
        token_ids: List[int],
        block_size: int,
        allocator: BlockAllocator,
        block_id: Optional[int] = None,
        computed: bool = False,
    ) -> Block:
        # Bind block to self.
        allocator = self

        return PrefixCachingBlock(
            prev_block=prev_block,
            token_ids=token_ids,
            block_size=block_size,
            block_id=block_id,
            prefix_caching_allocator=allocator,
            computed=computed,
        )

    def allocate_immutable(self,
                           prev_block: Optional[Block],
                           token_ids: List[int],
                           device: Optional[Device] = None) -> Block:
        """Allocates an immutable block with the given token IDs, reusing
        cached blocks if possible.

        Args:
            prev_block (Optional[Block]): The previous block in the sequence.
            token_ids (List[int]): The token IDs to be stored in the block.

        Returns:
            Block: The allocated immutable block.
        """
        assert device is None
        assert_prefix_caching_block_or_none(prev_block)

        block = self._create_block(
            prev_block=prev_block,
            token_ids=token_ids,
            block_size=self._block_size,
            allocator=self,
        )
        assert block.content_hash is not None

        cached_block_id = self._cached_blocks.get(block.content_hash, None)
        if cached_block_id is not None:
            block.block_id = cached_block_id
            self._incr_refcount_cached_block(block, block.block_id)
            return block

        block = self.allocate_mutable(prev_block)
        block.append_token_ids(token_ids)
        assert block.content_hash is not None

        return block

    def allocate_mutable(self,
                         prev_block: Optional[Block],
                         device: Optional[Device] = None) -> Block:
        """Allocates a mutable block. If there are no free blocks, this will
        evict unused cached blocks.

        Args:
            prev_block (Block): The previous block in the sequence.
                Unlike in the superclass, None is not allowed here.

        Returns:
            Block: The allocated mutable block.
        """
        assert device is None
        assert_prefix_caching_block_or_none(prev_block)

        try:
            block = self._hashless_allocator.allocate_mutable(
                prev_block=prev_block)

            assert block.block_id not in self._blocks
            assert block.block_id is not None
            self._blocks[block.block_id] = block
            return block
        except BlockAllocator.NoFreeBlocksError:
            # We must check the unused cached blocks before raising OOM.
            pass

        # If the evictor has blocks available for eviction, evict a block
        # and return it.
        if self.evictor.num_blocks > 0:
            block_id, content_hash_to_evict = self.evictor.evict()

            # Several blocks may share the same content hash: a block that
            # arrives later through the mutable-to-immutable path keeps its
            # own physical block, which is then added to the evictor. In that
            # case we must not pop the entry from _cached_blocks, because the
            # same content is still referenced by other blocks, so check the
            # refcount before deciding to remove the entry.
            _block_id = self._cached_blocks[content_hash_to_evict]
            refcount = self._refcounter.get(_block_id)
            if refcount == 1:
                self._cached_blocks.pop(content_hash_to_evict)
                assert _block_id == block_id

            self._refcounter.incr(block_id)

            # A block coming from the evictor already contains a computed
            # result.
            block = self._create_block(
                prev_block=prev_block,
                token_ids=[],
                block_size=self._block_size,
                allocator=self,
                block_id=block_id,
                computed=True,
            )
            assert block.content_hash is None

            assert block.block_id not in self._blocks
            assert block.block_id is not None
            self._blocks[block.block_id] = block
            return block

        # No block available in the hashless allocator, nor among unused
        # cached blocks.
        raise BlockAllocator.NoFreeBlocksError()

    def _incr_refcount_cached_block(self, block: Block,
                                    block_id: BlockId) -> None:
        # The block is already computed, so mark it as such.
        block.computed = True

        refcount = self._refcounter.incr(block_id)
        if refcount == 1:
            # Once the block is referenced it must not stay in the evictor;
            # track it in _blocks instead.
            if block_id in self.evictor:
                self.evictor.remove(block_id)
            self._blocks[block_id] = block

    def free(self, block: Block) -> None:
        """Decrement the refcount of the block. If the decremented refcount is
        zero, store the block in the freelist.

        If the block has a content hash (meaning it is immutable), then we will
        keep the block around in case future allocations require it.
        """
        assert (block.block_id
                is not None), "freeing unallocated block is undefined"

        self._free_block_id_for_block(block.block_id, block)

        block.block_id = None

    def _free_block_id_for_block(self, block_id: BlockId,
                                 block: Block) -> None:
        assert isinstance(block, PrefixCachingBlock)

        if block.content_hash is None:
            refcount = self._refcounter.get(block_id)
            # A forked block can hold more than one reference, so we only stop
            # tracking it once the refcount is no larger than one.
            if refcount <= 1:
                assert block.block_id is not None
                del self._blocks[block.block_id]
            return self._hashless_allocator.free(block)

        refcount = self._refcounter.decr(block_id)

        # If no longer used, add the block to the evictor.
        if refcount == 0:
            assert block.content_hash in self._cached_blocks
            assert block.block_id is not None
            del self._blocks[block.block_id]
            self.evictor.add(block.block_id, block.content_hash,
                             block.num_tokens_total, block.last_accessed)

    def fork(self, last_block: Block) -> List[Block]:
        """Creates a new sequence of blocks that shares the same underlying
        memory as the original sequence.

        Args:
            last_block (Block): The last block in the original sequence.

        Returns:
            List[Block]: The new sequence of blocks that shares the same memory
                as the original sequence.
        """
        source_blocks = get_all_blocks_recursively(last_block)

        forked_blocks = []
        prev_block = None
        for block in source_blocks:
            refcount = self._refcounter.incr(block.block_id)
            assert refcount != 1, "can't fork free'd block"

            forked_blocks.append(
                self._create_block(
                    prev_block=prev_block,
                    token_ids=block.token_ids,
                    block_id=block.block_id,
                    block_size=self._block_size,
                    allocator=self,
                ))
            prev_block = forked_blocks[-1]

        return forked_blocks

    def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
        assert device is None
        # The number of free blocks is the number of hashless free blocks
        # plus the number of blocks the evictor could free from its list.
        return self._hashless_allocator.get_num_free_blocks(
        ) + self.evictor.num_blocks

    def get_num_total_blocks(self) -> int:
        return self._hashless_allocator.get_num_total_blocks()

    @property
    def all_block_ids(self) -> FrozenSet[int]:
        return self._hashless_allocator.all_block_ids

    def promote_to_immutable_block(self, block: Block) -> BlockId:
        """Once a mutable block is full, it can be promoted to an immutable
        block. This means that its content can be referenced by future blocks
        having the same prefix.

        Note that if we already have a cached block with the same content, we
        will replace the newly-promoted block's mapping with the existing
        cached block.

        Args:
            block: The mutable block to be promoted.

        Returns:
            BlockId: Either the original block index, or the block index of
                the previously cached block matching the same content.
        """
        assert block.content_hash is not None
        assert block.block_id is not None
        assert self._refcounter.get(block.block_id) > 0

        # If the content hash does not have a corresponding cached block,
        # set this block as the cached block.
        if block.content_hash not in self._cached_blocks:
            self._cached_blocks[block.content_hash] = block.block_id
        else:
            self._free_block_id_for_block(block.block_id, block)
            self._incr_refcount_cached_block(
                block, self._cached_blocks[block.content_hash])

        return self._cached_blocks[block.content_hash]
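
    # For example (illustrative ids): if a just-filled block with id 7 hashes
    # to content already cached under block id 3, id 7 is freed, the block is
    # re-pointed at id 3, and 3 is returned; otherwise id 7 itself is recorded
    # as the cached block and returned.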

    def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
        """Performs a copy-on-write operation on the given block if it is not
        appendable.

        Args:
            block (Block): The block to check for copy-on-write.

        Returns:
            Optional[BlockId]: The block index of the new block if a copy-on
                -write operation was performed, or the original block index if
                no copy-on-write was necessary.
        """
        return self._cow_tracker.cow_block_if_not_appendable(block)

    def clear_copy_on_writes(self) -> Dict[BlockId, List[BlockId]]:
        """Returns the copy-on-write source->destination mapping and clears it.

        Returns:
            Dict[BlockId, List[BlockId]]: A dictionary mapping source
                block indices to lists of destination block indices.
        """
        return self._cow_tracker.clear_cows()

    def mark_blocks_as_accessed(self, block_ids: List[int],
                                now: float) -> None:
        """Mark blocks as accessed, used in prefix caching.

        If a block has already been moved to the evictor, the corresponding
        metadata in the evictor is updated instead.
        """

        for block_id in block_ids:
            if block_id in self._blocks:
                self._blocks[block_id].last_accessed = now
            elif block_id in self.evictor:
                self.evictor.update(block_id, now)
            else:
                raise ValueError(
                    "Cannot mark as accessed a block that does not belong to "
                    "the GPU allocator")

    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
        """Mark blocks as computed, used in prefix caching."""

        for block_id in block_ids:
            if block_id in self._blocks:
                # Only full blocks are valid for prefix caching.
                if self._blocks[block_id].is_full:
                    self._blocks[block_id].computed = True
            elif block_id not in self.evictor:
                raise ValueError(f"Cannot mark {block_id=} as computed: it "
                                 "does not belong to the GPU allocator")

    def block_is_computed(self, block_id: int) -> bool:
        if block_id in self._blocks:
            return self._blocks[block_id].computed
        else:
            return block_id in self.evictor

    def get_common_computed_block_ids(
            self, seq_block_ids: List[List[int]]) -> List[int]:
        """Return the block ids that are common for a given sequence group.

        Only blocks that are immutable and already marked as computed are
        taken into consideration.
        """

        # NOTE We exclude the last block to avoid the case where the entire
        # prompt is cached. This would cause erroneous behavior in the model
        # runner.

        ids_list = [
            list(
                takewhile(lambda block_id: self.block_is_computed(block_id),
                          seq[:-1])) for seq in seq_block_ids
        ]
        # commonprefix returns a list of ints here, although its type
        # annotation says list of strings.
        return commonprefix([
            ids for ids in ids_list  # type: ignore
            if ids != []
        ])
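
    # For example (illustrative ids): with seq_block_ids [[0, 1, 2], [0, 1, 3]]
    # and blocks 0 and 1 computed, each per-sequence list becomes [0, 1] (the
    # last block of every sequence is excluded), so [0, 1] is returned.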


class PrefixCachingBlock(Block):
    """A block implementation that supports prefix caching.

    The PrefixCachingBlock class represents a block of token IDs with prefix
    caching capabilities. It wraps a NaiveBlock internally and provides
    additional functionality for content hashing and for promoting immutable
    blocks with the prefix caching allocator.

    Args:
        prev_block (Optional[PrefixCachingBlock]): The previous block in the
            sequence.
        token_ids (List[int]): The initial token IDs to be stored in the block.
        block_size (int): The maximum number of token IDs that can be stored in
            the block.
        prefix_caching_allocator (BlockAllocator): The prefix
            caching block allocator associated with this block.
        block_id (Optional[int], optional): The physical block index
            of this block. Defaults to None.
    """

    def __init__(
        self,
        prev_block: Optional[Block],
        token_ids: List[int],
        block_size: int,
        prefix_caching_allocator: BlockAllocator,
        block_id: Optional[int] = None,
        computed: bool = False,
    ):
        assert isinstance(prefix_caching_allocator,
                          PrefixCachingBlockAllocator), (
                              "Currently this class is only tested with "
                              "PrefixCachingBlockAllocator.")
        assert_prefix_caching_block_or_none(prev_block)

        self._prev_block = prev_block
        self._cached_content_hash: Optional[int] = None
        self._cached_num_tokens_total: Optional[int] = None
        self._prefix_caching_allocator = prefix_caching_allocator
        self._last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME
        self._computed = computed

        self._block = NaiveBlock(
            prev_block=prev_block,
            token_ids=token_ids,
            block_size=block_size,
            block_id=block_id,
            allocator=prefix_caching_allocator,
            _cow_target=self,
        )

    @property
    def computed(self) -> bool:
        return self._computed

    @computed.setter
    def computed(self, value) -> None:
        self._computed = value

    @property
    def last_accessed(self) -> float:
        return self._last_accessed

    @last_accessed.setter
    def last_accessed(self, last_accessed_ts: float):
        self._last_accessed = last_accessed_ts

    def append_token_ids(self, token_ids: List[int]) -> None:
        """Appends the given token IDs to the block and registers the block as
        immutable if the block becomes full.

        Internally, the naive block handles CoW.

        Args:
            token_ids (List[int]): The token IDs to be appended to the block.
        """
        assert token_ids

        # The naive block handles CoW.
        self._block.append_token_ids(token_ids)

        # If the content hash is present, then the block can be made immutable.
        # Register ourselves with the allocator, potentially replacing the
        # physical block index.
        if self.content_hash is not None:
            self.block_id = (self._prefix_caching_allocator.
                             promote_to_immutable_block(self))

    @property
    def block_id(self) -> Optional[int]:
        return self._block.block_id

    @block_id.setter
    def block_id(self, value) -> None:
        self._block.block_id = value

    @property
    def is_full(self) -> bool:
        return self._block.is_full

    @property
    def num_empty_slots(self) -> int:
        return self._block.num_empty_slots

    @property
    def num_tokens_total(self) -> int:
        """Return the total number of tokens so far.

        We walk the block chain back to the first block and cache the result
        locally to avoid repeated computation.
        """
        if self._cached_num_tokens_total is not None:
            return self._cached_num_tokens_total

        _block: Optional[Block] = self
        self._cached_num_tokens_total = 0

        # TODO: the current implementation is O(N^2) across a sequence;
        # it should eventually be O(1).
        while _block is not None:
            self._cached_num_tokens_total += len(_block.token_ids)
            _block = _block.prev_block

        return self._cached_num_tokens_total
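
    # For example, with a block_size of 4 and a chain of three full blocks,
    # the last block's num_tokens_total is 12.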

    @property
    def block_size(self) -> int:
        return self._block.block_size

    @property
    def token_ids(self) -> List[int]:
        return self._block.token_ids

    @property
    def prev_block(self) -> Optional[Block]:
        return self._prev_block

    @property
    def content_hash(self) -> Optional[int]:
        """Return the content-based hash of the current block, or None if it is
        not yet defined.

        For the content-based hash to be defined, the current block must be
        full.
        """

        # If the hash is already computed, return it.
        if self._cached_content_hash is not None:
            return self._cached_content_hash

        # We cannot compute a hash for the current block because it is not
        # full.
        if not self.is_full:
            return None

        is_first_block = self._prev_block is None
        prev_block_hash = (
            None if is_first_block else
            self._prev_block.content_hash  # type: ignore
        )

        # The previous block exists but does not yet have a hash.
        # Return no hash in this case.
        if prev_block_hash is None and not is_first_block:
            return None

        self._cached_content_hash = PrefixCachingBlock.hash_block_tokens(
            is_first_block,
            prev_block_hash,
            cur_block_token_ids=self.token_ids)
        return self._cached_content_hash

    @staticmethod
    def hash_block_tokens(is_first_block: bool, prev_block_hash: Optional[int],
                          cur_block_token_ids: List[int]) -> int:
        """Computes a hash value corresponding to the contents of a block and
        the contents of the preceding block(s). The hash value is used for
        prefix caching.

        NOTE: Content-based hashing does not yet support LoRA.

        Parameters:
        - is_first_block (bool): A flag indicating if the block is the first in
          the sequence.
        - prev_block_hash (Optional[int]): The hash of the previous block. None
          if this is the first block.
        - cur_block_token_ids (List[int]): A list of token ids in the current
          block. The current block is assumed to be full.

        Returns:
        - int: The computed hash value for the block.
        """
        assert (prev_block_hash is None) == is_first_block
        return hash((is_first_block, prev_block_hash, *cur_block_token_ids))
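
    # For example, a first full block hashes to hash((True, None, *token_ids));
    # if that value is h1, a second full block chained after it hashes to
    # hash((False, h1, *token_ids)), so the hash depends on the entire token
    # prefix rather than on the current block alone.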


def assert_prefix_caching_block_or_none(block: Optional[Block]):
    if block is None:
        return
    assert isinstance(block, PrefixCachingBlock)
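
A minimal sketch of how the allocator above can be exercised, assuming the vllm.core.block modules listed earlier are importable; the num_blocks and block_size values are arbitrary:

    from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator

    allocator = PrefixCachingBlockAllocator(num_blocks=16, block_size=4)

    # The first full block is hashed and registered in the prefix cache.
    first = allocator.allocate_immutable(prev_block=None, token_ids=[1, 2, 3, 4])

    # A second block with identical content reuses the same physical block id
    # instead of consuming a new one.
    second = allocator.allocate_immutable(prev_block=None, token_ids=[1, 2, 3, 4])
    assert first.block_id == second.block_id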