gpu-worker 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +115 -0
- package/api_client.py +288 -0
- package/batch_processor.py +436 -0
- package/bin/gpu-worker.js +275 -0
- package/cli.py +729 -0
- package/config.2gb.yaml +32 -0
- package/config.8gb.yaml +29 -0
- package/config.example.yaml +72 -0
- package/config.py +213 -0
- package/direct_server.py +140 -0
- package/distributed/__init__.py +35 -0
- package/distributed/grpc_server.py +561 -0
- package/distributed/kv_cache.py +555 -0
- package/distributed/model_shard.py +465 -0
- package/distributed/session.py +455 -0
- package/engines/__init__.py +215 -0
- package/engines/base.py +57 -0
- package/engines/image_gen.py +83 -0
- package/engines/llm.py +97 -0
- package/engines/llm_base.py +216 -0
- package/engines/llm_sglang.py +489 -0
- package/engines/llm_vllm.py +539 -0
- package/engines/speculative.py +513 -0
- package/engines/vision.py +139 -0
- package/machine_id.py +200 -0
- package/main.py +521 -0
- package/package.json +64 -0
- package/requirements-sglang.txt +12 -0
- package/requirements-vllm.txt +15 -0
- package/requirements.txt +35 -0
- package/scripts/postinstall.js +60 -0
- package/setup.py +43 -0
package/distributed/kv_cache.py

@@ -0,0 +1,555 @@
"""
Distributed KV-Cache manager.

Implements PagedAttention-style KV-cache management with support for:
- Paged GPU memory management
- Multi-level caching (GPU → CPU → Redis)
- Prefix sharing (in the spirit of RadixAttention)
- Cross-worker KV-cache transfer

References: vLLM PagedAttention, LMCache, Mooncake
"""
import asyncio
import hashlib
import logging
import time
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple, Any
from collections import OrderedDict
from enum import Enum

import torch

logger = logging.getLogger(__name__)


class CacheLocation(Enum):
    """Where a cache block currently lives."""
    GPU = "gpu"
    CPU = "cpu"
    REDIS = "redis"
    REMOTE = "remote"


@dataclass
class CacheBlock:
    """
    A paged KV-cache block.

    Follows the PagedAttention design: the KV-cache is split into fixed-size blocks.
    """
    block_id: str
    block_size: int = 16  # tokens per block

    # Cached data
    keys: Optional[torch.Tensor] = None    # [num_heads, block_size, head_dim]
    values: Optional[torch.Tensor] = None  # [num_heads, block_size, head_dim]

    # Metadata
    layer_idx: int = 0
    num_tokens: int = 0  # tokens actually stored
    ref_count: int = 1   # reference count (copy-on-write)
    prefix_hash: str = ""
    location: CacheLocation = CacheLocation.GPU

    # Timestamps (for LRU eviction)
    last_access: float = field(default_factory=time.time)
    created_at: float = field(default_factory=time.time)

    @property
    def is_full(self) -> bool:
        return self.num_tokens >= self.block_size

    @property
    def is_shared(self) -> bool:
        return self.ref_count > 1

    def add_ref(self) -> None:
        self.ref_count += 1

    def remove_ref(self) -> int:
        self.ref_count = max(0, self.ref_count - 1)
        return self.ref_count

    def touch(self) -> None:
        """Update the access time."""
        self.last_access = time.time()


class PagedKVCache:
    """
    Paged KV-cache manager.

    Manages GPU memory in pages and supports:
    - Dynamic allocation and release
    - Reference counting (copy-on-write)
    - LRU eviction
    """

    def __init__(
        self,
        num_layers: int,
        num_heads: int,
        head_dim: int,
        block_size: int = 16,
        max_blocks: int = 1000,
        device: str = "cuda",
        dtype: torch.dtype = torch.float16,
    ):
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.block_size = block_size
        self.max_blocks = max_blocks
        self.device = device
        self.dtype = dtype

        # Block storage
        self._blocks: Dict[str, CacheBlock] = {}
        self._free_blocks: List[str] = []

        # Pre-allocated memory pool
        self._key_pool: Optional[torch.Tensor] = None
        self._value_pool: Optional[torch.Tensor] = None
        self._block_to_slot: Dict[str, int] = {}

        # LRU queue
        self._lru_queue: OrderedDict[str, float] = OrderedDict()

        # Statistics
        self._stats = {
            "allocations": 0,
            "evictions": 0,
            "hits": 0,
            "misses": 0,
        }

        # Initialize the memory pool
        self._init_memory_pool()

    def _init_memory_pool(self) -> None:
        """Pre-allocate the memory pool."""
        if self.device.startswith("cuda") and torch.cuda.is_available():
            # [max_blocks, num_heads, block_size, head_dim]
            pool_shape = (self.max_blocks, self.num_heads, self.block_size, self.head_dim)
            self._key_pool = torch.zeros(pool_shape, dtype=self.dtype, device=self.device)
            self._value_pool = torch.zeros(pool_shape, dtype=self.dtype, device=self.device)

        # Initialize the free-block list
        self._free_blocks = [f"block_{i}" for i in range(self.max_blocks)]

        logger.info(
            f"Initialized KV-Cache pool: {self.max_blocks} blocks, "
            f"{self._get_pool_memory_gb():.2f} GB"
        )

    def _get_pool_memory_gb(self) -> float:
        """Size of the memory pool in GB."""
        if self._key_pool is not None:
            bytes_per_tensor = self._key_pool.numel() * self._key_pool.element_size()
            return 2 * bytes_per_tensor / (1024 ** 3)  # keys + values
        return 0.0

    def allocate_block(
        self,
        layer_idx: int,
        prefix_hash: str = "",
    ) -> Optional[CacheBlock]:
        """
        Allocate a new block.

        Args:
            layer_idx: layer index
            prefix_hash: prefix hash (used for sharing)

        Returns:
            A CacheBlock, or None if no block is available.
        """
        # Check for a free block
        if not self._free_blocks:
            # Try to evict one
            if not self._evict_lru():
                logger.warning("No free blocks and eviction failed")
                return None

        block_id = self._free_blocks.pop()
        slot_idx = int(block_id.split("_")[1])

        # Create the block
        block = CacheBlock(
            block_id=block_id,
            block_size=self.block_size,
            layer_idx=layer_idx,
            prefix_hash=prefix_hash,
            keys=self._key_pool[slot_idx] if self._key_pool is not None else None,
            values=self._value_pool[slot_idx] if self._value_pool is not None else None,
        )

        self._blocks[block_id] = block
        self._block_to_slot[block_id] = slot_idx
        self._lru_queue[block_id] = time.time()

        self._stats["allocations"] += 1

        return block

    def free_block(self, block_id: str) -> None:
        """Release a block."""
        if block_id not in self._blocks:
            return

        block = self._blocks[block_id]
        block.remove_ref()

        if block.ref_count == 0:
            # Clear the data
            slot_idx = self._block_to_slot.get(block_id)
            if slot_idx is not None and self._key_pool is not None:
                self._key_pool[slot_idx].zero_()
                self._value_pool[slot_idx].zero_()

            del self._blocks[block_id]
            del self._block_to_slot[block_id]
            if block_id in self._lru_queue:
                del self._lru_queue[block_id]

            self._free_blocks.append(block_id)

    def get_block(self, block_id: str) -> Optional[CacheBlock]:
        """Look up a block."""
        block = self._blocks.get(block_id)
        if block:
            block.touch()
            self._lru_queue.move_to_end(block_id)
            self._stats["hits"] += 1
        else:
            self._stats["misses"] += 1
        return block

    def _evict_lru(self) -> bool:
        """Evict the least recently used block."""
        # Find an evictable block (ref_count == 1); iterate over a snapshot
        # because free_block() mutates the LRU queue.
        for block_id in list(self._lru_queue):
            block = self._blocks.get(block_id)
            if block and block.ref_count == 1:
                self.free_block(block_id)
                self._stats["evictions"] += 1
                return True
        return False

    def get_stats(self) -> Dict[str, Any]:
        """Return statistics."""
        return {
            **self._stats,
            "total_blocks": len(self._blocks),
            "free_blocks": len(self._free_blocks),
            "memory_gb": self._get_pool_memory_gb(),
        }


class KVCachePool:
    """
    Multi-layer KV-cache pool.

    Manages the KV-cache for every layer of the model.
    """

    def __init__(
        self,
        num_layers: int,
        num_heads: int,
        head_dim: int,
        block_size: int = 16,
        max_blocks_per_layer: int = 100,
        device: str = "cuda",
    ):
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.head_dim = head_dim

        # One PagedKVCache per layer
        self._layer_caches: List[PagedKVCache] = [
            PagedKVCache(
                num_layers=1,  # each cache manages a single layer
                num_heads=num_heads,
                head_dim=head_dim,
                block_size=block_size,
                max_blocks=max_blocks_per_layer,
                device=device,
            )
            for _ in range(num_layers)
        ]

    def allocate_sequence(
        self,
        seq_len: int,
        prefix_hash: str = "",
    ) -> List[List[CacheBlock]]:
        """
        Allocate KV-cache for a sequence.

        Args:
            seq_len: sequence length
            prefix_hash: prefix hash

        Returns:
            [[layer0_blocks], [layer1_blocks], ...]
        """
        block_size = self._layer_caches[0].block_size
        num_blocks = (seq_len + block_size - 1) // block_size

        all_blocks = []
        for layer_idx, cache in enumerate(self._layer_caches):
            layer_blocks = []
            for _ in range(num_blocks):
                block = cache.allocate_block(layer_idx, prefix_hash)
                if block is None:
                    # Roll back blocks that were already allocated
                    self._free_sequence_blocks(all_blocks)
                    raise RuntimeError(f"Failed to allocate KV-Cache for layer {layer_idx}")
                layer_blocks.append(block)
            all_blocks.append(layer_blocks)

        return all_blocks

    def _free_sequence_blocks(self, blocks: List[List[CacheBlock]]) -> None:
        """Release all blocks of a sequence."""
        for layer_idx, layer_blocks in enumerate(blocks):
            for block in layer_blocks:
                self._layer_caches[layer_idx].free_block(block.block_id)

    def get_total_memory_gb(self) -> float:
        """Total memory usage in GB."""
        return sum(cache._get_pool_memory_gb() for cache in self._layer_caches)


class DistributedKVCacheManager:
    """
    Distributed KV-cache manager.

    Implements a multi-level cache hierarchy:
    L1: GPU HBM (hottest data, <1 ms)
    L2: CPU RAM (warm data, ~5 ms)
    L3: Redis (cold data, ~10 ms)
    L4: remote workers (shared prefixes, ~50 ms)
    """

    def __init__(
        self,
        num_layers: int,
        num_heads: int,
        head_dim: int,
        gpu_cache_blocks: int = 500,
        cpu_cache_gb: float = 16.0,
        redis_client=None,
        block_size: int = 16,
        device: str = "cuda",
    ):
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.head_dim = head_dim
        self.block_size = block_size

        # L1: GPU cache
        self.gpu_cache = KVCachePool(
            num_layers=num_layers,
            num_heads=num_heads,
            head_dim=head_dim,
            block_size=block_size,
            max_blocks_per_layer=gpu_cache_blocks,
            device=device,
        )

        # L2: CPU cache (simple LRU), sized by bytes per (keys, values) pair in float16
        self.cpu_cache: OrderedDict[str, Tuple[torch.Tensor, torch.Tensor]] = OrderedDict()
        self.cpu_cache_max_items = max(1, int(cpu_cache_gb * 1024 * 1024 * 1024 / (
            2 * num_heads * block_size * head_dim * 2  # float16
        )))

        # L3: Redis cache
        self.redis = redis_client

        # Prefix index
        self._prefix_index: Dict[str, str] = {}  # prefix_hash -> block_id

        # Statistics
        self._stats = {
            "l1_hits": 0,
            "l2_hits": 0,
            "l3_hits": 0,
            "l4_hits": 0,
            "misses": 0,
        }

    def compute_prefix_hash(self, token_ids: List[int]) -> str:
        """Compute the prefix hash."""
        # Token ids routinely exceed 255, so hash a stable text encoding
        # rather than bytes(token_ids), which would raise ValueError.
        data = ",".join(map(str, token_ids)).encode()
        return hashlib.sha256(data).hexdigest()[:16]

    async def get_or_compute(
        self,
        prefix_hash: str,
        layer_idx: int,
        compute_fn,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Fetch or compute the KV-cache.

        Args:
            prefix_hash: prefix hash
            layer_idx: layer index
            compute_fn: function that computes the KV pair on a cache miss

        Returns:
            (keys, values)
        """
        cache_key = f"{prefix_hash}:{layer_idx}"

        # L1: GPU cache
        block = None
        block_id = self._prefix_index.get(cache_key)
        if block_id is not None:
            block = self.gpu_cache._layer_caches[layer_idx].get_block(block_id)
        if block and block.keys is not None:
            self._stats["l1_hits"] += 1
            return block.keys, block.values

        # L2: CPU cache
        if cache_key in self.cpu_cache:
            keys, values = self.cpu_cache[cache_key]
            self.cpu_cache.move_to_end(cache_key)
            self._stats["l2_hits"] += 1

            # Promote to L1
            await self._promote_to_gpu(cache_key, keys, values, layer_idx, prefix_hash)
            device = self.gpu_cache._layer_caches[0].device
            return keys.to(device), values.to(device)

        # L3: Redis cache
        if self.redis:
            kv_bytes = await self._get_from_redis(cache_key)
            if kv_bytes:
                keys, values = self._deserialize_kv(kv_bytes)
                self._stats["l3_hits"] += 1

                # Promote to L2 and L1
                self._add_to_cpu_cache(cache_key, keys, values)
                await self._promote_to_gpu(cache_key, keys, values, layer_idx, prefix_hash)
                return keys, values

        # Cache miss: compute
        self._stats["misses"] += 1
        keys, values = await compute_fn()

        # Store at every cache level
        await self._store_kv(cache_key, keys, values, layer_idx, prefix_hash)

        return keys, values

    async def _promote_to_gpu(
        self,
        cache_key: str,
        keys: torch.Tensor,
        values: torch.Tensor,
        layer_idx: int,
        prefix_hash: str,
    ) -> None:
        """Promote to the GPU cache."""
        block = self.gpu_cache._layer_caches[layer_idx].allocate_block(
            layer_idx, prefix_hash
        )
        if block and block.keys is not None:
            block.keys.copy_(keys)
            block.values.copy_(values)
            self._prefix_index[cache_key] = block.block_id

    def _add_to_cpu_cache(
        self,
        cache_key: str,
        keys: torch.Tensor,
        values: torch.Tensor,
    ) -> None:
        """Add to the CPU cache."""
        # LRU eviction
        while len(self.cpu_cache) >= self.cpu_cache_max_items:
            self.cpu_cache.popitem(last=False)

        self.cpu_cache[cache_key] = (keys.cpu(), values.cpu())

    async def _get_from_redis(self, cache_key: str) -> Optional[bytes]:
        """Fetch from Redis."""
        if self.redis is None:
            return None
        try:
            return await self.redis.get(f"kv:{cache_key}")
        except Exception as e:
            logger.warning(f"Redis get error: {e}")
            return None

    async def _store_kv(
        self,
        cache_key: str,
        keys: torch.Tensor,
        values: torch.Tensor,
        layer_idx: int,
        prefix_hash: str,
    ) -> None:
        """Store the KV-cache at every level."""
        # L1: GPU
        await self._promote_to_gpu(cache_key, keys, values, layer_idx, prefix_hash)

        # L2: CPU
        self._add_to_cpu_cache(cache_key, keys, values)

        # L3: Redis (fire-and-forget asynchronous write)
        if self.redis:
            asyncio.create_task(self._write_to_redis(cache_key, keys, values))

    async def _write_to_redis(
        self,
        cache_key: str,
        keys: torch.Tensor,
        values: torch.Tensor,
        ttl: int = 3600,
    ) -> None:
        """Write to Redis."""
        if self.redis is None:
            return
        try:
            kv_bytes = self._serialize_kv(keys, values)
            await self.redis.setex(f"kv:{cache_key}", ttl, kv_bytes)
        except Exception as e:
            logger.warning(f"Redis write error: {e}")

    def _serialize_kv(self, keys: torch.Tensor, values: torch.Tensor) -> bytes:
        """Serialize KV tensors."""
        import io
        import pickle
        buffer = io.BytesIO()
        pickle.dump({
            "keys": keys.cpu().numpy(),
            "values": values.cpu().numpy(),
        }, buffer)
        return buffer.getvalue()

    def _deserialize_kv(self, data: bytes) -> Tuple[torch.Tensor, torch.Tensor]:
        """Deserialize KV tensors."""
        import io
        import pickle
        buffer = io.BytesIO(data)
        kv_dict = pickle.load(buffer)
        return (
            torch.from_numpy(kv_dict["keys"]),
            torch.from_numpy(kv_dict["values"]),
        )

    def get_stats(self) -> Dict[str, Any]:
        """Return statistics."""
        total_requests = sum(self._stats.values())
        return {
            **self._stats,
            "total_requests": total_requests,
            "l1_hit_rate": self._stats["l1_hits"] / max(1, total_requests),
            "l2_hit_rate": self._stats["l2_hits"] / max(1, total_requests),
            "l3_hit_rate": self._stats["l3_hits"] / max(1, total_requests),
            "gpu_memory_gb": self.gpu_cache.get_total_memory_gb(),
            "cpu_cache_items": len(self.cpu_cache),
        }
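
Usage note: the sketch below shows how the manager above might be driven from a decode loop. It is a minimal, hypothetical example and not part of the published package: the import path `distributed.kv_cache` mirrors the file layout listed above but may differ in an installed environment, `compute_fn` stands in for the real per-layer attention computation, and the K/V tensors are assumed to use the [num_heads, block_size, head_dim] shape of the memory pool. The Redis (L3) tier is skipped when no client is passed.

import asyncio
import torch

from distributed.kv_cache import DistributedKVCacheManager  # hypothetical import path

async def main() -> None:
    manager = DistributedKVCacheManager(
        num_layers=2,
        num_heads=8,
        head_dim=64,
        gpu_cache_blocks=32,
        cpu_cache_gb=1.0,
        redis_client=None,  # no L3 tier in this sketch
        block_size=16,
        device="cuda" if torch.cuda.is_available() else "cpu",
    )

    prefix_hash = manager.compute_prefix_hash([101, 2023, 2003, 1037, 4937, 102])

    async def compute_fn():
        # Stand-in for the real attention computation: one block of keys/values
        # shaped [num_heads, block_size, head_dim].
        shape = (8, 16, 64)
        return (torch.zeros(shape, dtype=torch.float16),
                torch.zeros(shape, dtype=torch.float16))

    # The first call misses every level and invokes compute_fn; the second call
    # for the same prefix is served from the GPU tier (or the CPU tier without CUDA).
    await manager.get_or_compute(prefix_hash, layer_idx=0, compute_fn=compute_fn)
    await manager.get_or_compute(prefix_hash, layer_idx=0, compute_fn=compute_fn)
    print(manager.get_stats())

asyncio.run(main())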