gpu-worker 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,555 @@
+ """
+ Distributed KV-Cache manager
+
+ Implements PagedAttention-style KV-Cache management, supporting:
+ - paged GPU memory management
+ - multi-level caching (GPU → CPU → Redis)
+ - prefix sharing (in the spirit of RadixAttention)
+ - cross-worker KV-Cache transfer
+
+ References: vLLM PagedAttention, LMCache, Mooncake
+ """
+ import asyncio
+ import hashlib
+ import logging
+ import time
+ from dataclasses import dataclass, field
+ from typing import Dict, List, Optional, Tuple, Any
+ from collections import OrderedDict
+ from enum import Enum
+
+ import torch
+
+ logger = logging.getLogger(__name__)
+
+
+ class CacheLocation(Enum):
+     """Cache location"""
+     GPU = "gpu"
+     CPU = "cpu"
+     REDIS = "redis"
+     REMOTE = "remote"
+
+
+ @dataclass
+ class CacheBlock:
+     """
+     A fixed-size page of KV-Cache.
+
+     Follows the PagedAttention design: the KV-Cache is split into fixed-size blocks.
+     """
+     block_id: str
+     block_size: int = 16  # tokens per block
+
+     # Cached data
+     keys: Optional[torch.Tensor] = None    # [num_heads, block_size, head_dim]
+     values: Optional[torch.Tensor] = None  # [num_heads, block_size, head_dim]
+
+     # Metadata
+     layer_idx: int = 0
+     num_tokens: int = 0   # number of tokens actually stored
+     ref_count: int = 1    # reference count (Copy-on-Write)
+     prefix_hash: str = ""
+     location: CacheLocation = CacheLocation.GPU
+
+     # Timestamps (used for LRU eviction)
+     last_access: float = field(default_factory=time.time)
+     created_at: float = field(default_factory=time.time)
+
+     @property
+     def is_full(self) -> bool:
+         return self.num_tokens >= self.block_size
+
+     @property
+     def is_shared(self) -> bool:
+         return self.ref_count > 1
+
+     def add_ref(self) -> None:
+         self.ref_count += 1
+
+     def remove_ref(self) -> int:
+         self.ref_count = max(0, self.ref_count - 1)
+         return self.ref_count
+
+     def touch(self) -> None:
+         """Update the last-access timestamp."""
+         self.last_access = time.time()
+
+
+ class PagedKVCache:
+     """
+     Paged KV-Cache manager.
+
+     Manages GPU memory in fixed-size pages and supports:
+     - dynamic allocation and release
+     - reference counting (Copy-on-Write)
+     - an LRU eviction policy
+     """
+
+     def __init__(
+         self,
+         num_layers: int,
+         num_heads: int,
+         head_dim: int,
+         block_size: int = 16,
+         max_blocks: int = 1000,
+         device: str = "cuda",
+         dtype: torch.dtype = torch.float16,
+     ):
+         self.num_layers = num_layers
+         self.num_heads = num_heads
+         self.head_dim = head_dim
+         self.block_size = block_size
+         self.max_blocks = max_blocks
+         self.device = device
+         self.dtype = dtype
+
+         # Block storage
+         self._blocks: Dict[str, CacheBlock] = {}
+         self._free_blocks: List[str] = []
+
+         # Pre-allocated memory pool
+         self._key_pool: Optional[torch.Tensor] = None
+         self._value_pool: Optional[torch.Tensor] = None
+         self._block_to_slot: Dict[str, int] = {}
+
+         # LRU queue
+         self._lru_queue: OrderedDict[str, float] = OrderedDict()
+
+         # Statistics
+         self._stats = {
+             "allocations": 0,
+             "evictions": 0,
+             "hits": 0,
+             "misses": 0,
+         }
+
+         # Initialize the memory pool
+         self._init_memory_pool()
+
+     def _init_memory_pool(self) -> None:
+         """Pre-allocate the key/value memory pool."""
+         if self.device.startswith("cuda") and torch.cuda.is_available():
+             # [max_blocks, num_heads, block_size, head_dim]
+             pool_shape = (self.max_blocks, self.num_heads, self.block_size, self.head_dim)
+             self._key_pool = torch.zeros(pool_shape, dtype=self.dtype, device=self.device)
+             self._value_pool = torch.zeros(pool_shape, dtype=self.dtype, device=self.device)
+
+         # Initialize the free-block list (blocks are still tracked without a GPU
+         # pool; their key/value tensors simply stay None).
+         self._free_blocks = [f"block_{i}" for i in range(self.max_blocks)]
+
+         logger.info(
+             f"Initialized KV-Cache pool: {self.max_blocks} blocks, "
+             f"{self._get_pool_memory_gb():.2f} GB"
+         )
+
+     def _get_pool_memory_gb(self) -> float:
+         """Return the pool size in GB."""
+         if self._key_pool is not None:
+             bytes_per_tensor = self._key_pool.numel() * self._key_pool.element_size()
+             return 2 * bytes_per_tensor / (1024 ** 3)  # keys + values
+         return 0.0
+
+     def allocate_block(
+         self,
+         layer_idx: int,
+         prefix_hash: str = "",
+     ) -> Optional[CacheBlock]:
+         """
+         Allocate a new block.
+
+         Args:
+             layer_idx: layer index
+             prefix_hash: prefix hash (used for sharing)
+
+         Returns:
+             A CacheBlock, or None if no block is available.
+         """
+         # Check for a free block
+         if not self._free_blocks:
+             # Try to evict one
+             if not self._evict_lru():
+                 logger.warning("No free blocks and eviction failed")
+                 return None
+
+         block_id = self._free_blocks.pop()
+         slot_idx = int(block_id.split("_")[1])
+
+         # Create the block
+         block = CacheBlock(
+             block_id=block_id,
+             block_size=self.block_size,
+             layer_idx=layer_idx,
+             prefix_hash=prefix_hash,
+             keys=self._key_pool[slot_idx] if self._key_pool is not None else None,
+             values=self._value_pool[slot_idx] if self._value_pool is not None else None,
+         )
+
+         self._blocks[block_id] = block
+         self._block_to_slot[block_id] = slot_idx
+         self._lru_queue[block_id] = time.time()
+
+         self._stats["allocations"] += 1
+
+         return block
+
+     def free_block(self, block_id: str) -> None:
+         """Release one reference to a block; reclaim it once unreferenced."""
+         if block_id not in self._blocks:
+             return
+
+         block = self._blocks[block_id]
+         block.remove_ref()
+
+         if block.ref_count == 0:
+             # Clear the underlying data
+             slot_idx = self._block_to_slot.get(block_id)
+             if slot_idx is not None and self._key_pool is not None:
+                 self._key_pool[slot_idx].zero_()
+                 self._value_pool[slot_idx].zero_()
+
+             del self._blocks[block_id]
+             del self._block_to_slot[block_id]
+             if block_id in self._lru_queue:
+                 del self._lru_queue[block_id]
+
+             self._free_blocks.append(block_id)
+
+     def get_block(self, block_id: str) -> Optional[CacheBlock]:
+         """Look up a block, updating LRU order and hit/miss counters."""
+         block = self._blocks.get(block_id)
+         if block:
+             block.touch()
+             self._lru_queue.move_to_end(block_id)
+             self._stats["hits"] += 1
+         else:
+             self._stats["misses"] += 1
+         return block
+
+     def _evict_lru(self) -> bool:
+         """Evict the least recently used block."""
+         # Find an evictable block (ref_count == 1)
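+         # Freeing a block mutates _lru_queue, so we return right after the
+         # first eviction instead of continuing to iterate over the changed dict.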
+         for block_id in self._lru_queue:
+             block = self._blocks.get(block_id)
+             if block and block.ref_count == 1:
+                 self.free_block(block_id)
+                 self._stats["evictions"] += 1
+                 return True
+         return False
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Return allocation statistics."""
+         return {
+             **self._stats,
+             "total_blocks": len(self._blocks),
+             "free_blocks": len(self._free_blocks),
+             "memory_gb": self._get_pool_memory_gb(),
+         }
+
+
+ class KVCachePool:
+     """
+     Multi-layer KV-Cache pool.
+
+     Manages the KV-Cache for every layer of the model.
+     """
+
+     def __init__(
+         self,
+         num_layers: int,
+         num_heads: int,
+         head_dim: int,
+         block_size: int = 16,
+         max_blocks_per_layer: int = 100,
+         device: str = "cuda",
+     ):
+         self.num_layers = num_layers
+         self.num_heads = num_heads
+         self.head_dim = head_dim
+
+         # One PagedKVCache per layer
+         self._layer_caches: List[PagedKVCache] = [
+             PagedKVCache(
+                 num_layers=1,  # each cache manages a single layer
+                 num_heads=num_heads,
+                 head_dim=head_dim,
+                 block_size=block_size,
+                 max_blocks=max_blocks_per_layer,
+                 device=device,
+             )
+             for _ in range(num_layers)
+         ]
+
+     def allocate_sequence(
+         self,
+         seq_len: int,
+         prefix_hash: str = "",
+     ) -> List[List[CacheBlock]]:
+         """
+         Allocate KV-Cache blocks for a sequence.
+
+         Args:
+             seq_len: sequence length in tokens
+             prefix_hash: prefix hash
+
+         Returns:
+             [[layer0_blocks], [layer1_blocks], ...]
+         """
+         block_size = self._layer_caches[0].block_size
+         num_blocks = (seq_len + block_size - 1) // block_size
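+         # e.g. seq_len=50 with block_size=16 -> ceil(50 / 16) = 4 blocks per layer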
+
+         all_blocks = []
+         for layer_idx, cache in enumerate(self._layer_caches):
+             layer_blocks = []
+             for _ in range(num_blocks):
+                 block = cache.allocate_block(layer_idx, prefix_hash)
+                 if block is None:
+                     # Roll back everything allocated so far, including this
+                     # layer's partially allocated blocks.
+                     self._free_sequence_blocks(all_blocks + [layer_blocks])
+                     raise RuntimeError(f"Failed to allocate KV-Cache for layer {layer_idx}")
+                 layer_blocks.append(block)
+             all_blocks.append(layer_blocks)
+
+         return all_blocks
+
+     def _free_sequence_blocks(self, blocks: List[List[CacheBlock]]) -> None:
+         """Release every block belonging to a sequence."""
+         for layer_idx, layer_blocks in enumerate(blocks):
+             for block in layer_blocks:
+                 self._layer_caches[layer_idx].free_block(block.block_id)
+
+     def get_total_memory_gb(self) -> float:
+         """Return total memory used across all layer pools."""
+         return sum(cache._get_pool_memory_gb() for cache in self._layer_caches)
+
+
+ class DistributedKVCacheManager:
+     """
+     Distributed KV-Cache manager.
+
+     Implements a multi-level cache hierarchy:
+     L1: GPU HBM (hottest data, <1 ms)
+     L2: CPU RAM (warm data, ~5 ms)
+     L3: Redis (cold data, ~10 ms)
+     L4: remote workers (shared prefixes, ~50 ms)
+     """
+
+     def __init__(
+         self,
+         num_layers: int,
+         num_heads: int,
+         head_dim: int,
+         gpu_cache_blocks: int = 500,
+         cpu_cache_gb: float = 16.0,
+         redis_client=None,
+         block_size: int = 16,
+         device: str = "cuda",
+     ):
+         self.num_layers = num_layers
+         self.num_heads = num_heads
+         self.head_dim = head_dim
+         self.block_size = block_size
+
+         # L1: GPU cache
+         self.gpu_cache = KVCachePool(
+             num_layers=num_layers,
+             num_heads=num_heads,
+             head_dim=head_dim,
+             block_size=block_size,
+             max_blocks_per_layer=gpu_cache_blocks,
+             device=device,
+         )
+
+         # L2: CPU cache (simple LRU)
+         self.cpu_cache: OrderedDict[str, Tuple[torch.Tensor, torch.Tensor]] = OrderedDict()
+         self.cpu_cache_max_items = int(cpu_cache_gb * 1024 * 1024 * 1024 / (
+             2 * num_heads * block_size * head_dim * 2  # keys+values, float16 (2 bytes)
+         ))
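+         # e.g. num_heads=8, block_size=16, head_dim=64 -> 2*8*16*64*2 = 32 KiB
+         # per (keys, values) entry, so a 16 GB budget allows roughly 524k entries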
+
+         # L3: Redis cache
+         self.redis = redis_client
+
+         # Prefix index
+         self._prefix_index: Dict[str, str] = {}  # cache_key -> block_id
+
+         # Statistics
+         self._stats = {
+             "l1_hits": 0,
+             "l2_hits": 0,
+             "l3_hits": 0,
+             "l4_hits": 0,
+             "misses": 0,
+         }
+
+     def compute_prefix_hash(self, token_ids: List[int]) -> str:
+         """Compute a stable hash of a token-id prefix."""
+         # bytes(token_ids) only accepts values < 256, so encode the ids
+         # explicitly to handle arbitrary vocabulary ids.
+         data = ",".join(map(str, token_ids)).encode("utf-8")
+         return hashlib.sha256(data).hexdigest()[:16]
+
+     async def get_or_compute(
+         self,
+         prefix_hash: str,
+         layer_idx: int,
+         compute_fn,
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         """
+         Fetch the KV-Cache for a prefix, or compute it on a miss.
+
+         Args:
+             prefix_hash: prefix hash
+             layer_idx: layer index
+             compute_fn: async function invoked on a cache miss
+
+         Returns:
+             (keys, values)
+         """
+         cache_key = f"{prefix_hash}:{layer_idx}"
+
+         # L1: GPU cache
+         block_id = self._prefix_index.get(cache_key)
+         block = (
+             self.gpu_cache._layer_caches[layer_idx].get_block(block_id)
+             if block_id is not None else None
+         )
+         if block and block.keys is not None:
+             self._stats["l1_hits"] += 1
+             return block.keys, block.values
+
+         # L2: CPU cache
+         if cache_key in self.cpu_cache:
+             keys, values = self.cpu_cache[cache_key]
+             self.cpu_cache.move_to_end(cache_key)
+             self._stats["l2_hits"] += 1
+
+             # Promote to L1
+             await self._promote_to_gpu(cache_key, keys, values, layer_idx, prefix_hash)
+             device = self.gpu_cache._layer_caches[layer_idx].device
+             return keys.to(device), values.to(device)
+
+         # L3: Redis cache
+         if self.redis:
+             kv_bytes = await self._get_from_redis(cache_key)
+             if kv_bytes:
+                 keys, values = self._deserialize_kv(kv_bytes)
+                 self._stats["l3_hits"] += 1
+
+                 # Promote to L2 and L1
+                 self._add_to_cpu_cache(cache_key, keys, values)
+                 await self._promote_to_gpu(cache_key, keys, values, layer_idx, prefix_hash)
+                 return keys, values
+
+         # Cache miss: compute
+         self._stats["misses"] += 1
+         keys, values = await compute_fn()
+
+         # Store into every cache level
+         await self._store_kv(cache_key, keys, values, layer_idx, prefix_hash)
+
+         return keys, values
+
+     async def _promote_to_gpu(
+         self,
+         cache_key: str,
+         keys: torch.Tensor,
+         values: torch.Tensor,
+         layer_idx: int,
+         prefix_hash: str,
+     ) -> None:
+         """Promote a (keys, values) pair into the GPU cache."""
+         block = self.gpu_cache._layer_caches[layer_idx].allocate_block(
+             layer_idx, prefix_hash
+         )
+         if block and block.keys is not None:
+             block.keys.copy_(keys)
+             block.values.copy_(values)
+             self._prefix_index[cache_key] = block.block_id
+
+     def _add_to_cpu_cache(
+         self,
+         cache_key: str,
+         keys: torch.Tensor,
+         values: torch.Tensor,
+     ) -> None:
+         """Add an entry to the CPU cache."""
+         # LRU eviction
+         while len(self.cpu_cache) >= self.cpu_cache_max_items:
+             self.cpu_cache.popitem(last=False)
+
+         self.cpu_cache[cache_key] = (keys.cpu(), values.cpu())
+
+     async def _get_from_redis(self, cache_key: str) -> Optional[bytes]:
+         """Fetch serialized KV data from Redis."""
+         if self.redis is None:
+             return None
+         try:
+             return await self.redis.get(f"kv:{cache_key}")
+         except Exception as e:
+             logger.warning(f"Redis get error: {e}")
+             return None
+
+     async def _store_kv(
+         self,
+         cache_key: str,
+         keys: torch.Tensor,
+         values: torch.Tensor,
+         layer_idx: int,
+         prefix_hash: str,
+     ) -> None:
+         """Store a KV-Cache entry into every cache level."""
+         # L1: GPU
+         await self._promote_to_gpu(cache_key, keys, values, layer_idx, prefix_hash)
+
+         # L2: CPU
+         self._add_to_cpu_cache(cache_key, keys, values)
+
+         # L3: Redis (fire-and-forget write-behind)
+         if self.redis:
+             asyncio.create_task(self._write_to_redis(cache_key, keys, values))
+
+     async def _write_to_redis(
+         self,
+         cache_key: str,
+         keys: torch.Tensor,
+         values: torch.Tensor,
+         ttl: int = 3600,
+     ) -> None:
+         """Write serialized KV data to Redis with a TTL."""
+         if self.redis is None:
+             return
+         try:
+             kv_bytes = self._serialize_kv(keys, values)
+             await self.redis.setex(f"kv:{cache_key}", ttl, kv_bytes)
+         except Exception as e:
+             logger.warning(f"Redis write error: {e}")
+
+     def _serialize_kv(self, keys: torch.Tensor, values: torch.Tensor) -> bytes:
+         """Serialize KV tensors."""
+         import io
+         import pickle
+         buffer = io.BytesIO()
+         pickle.dump({
+             "keys": keys.cpu().numpy(),
+             "values": values.cpu().numpy(),
+         }, buffer)
+         return buffer.getvalue()
+
+     def _deserialize_kv(self, data: bytes) -> Tuple[torch.Tensor, torch.Tensor]:
+         """Deserialize KV tensors."""
+         import io
+         import pickle
+         buffer = io.BytesIO(data)
+         kv_dict = pickle.load(buffer)
+         return (
+             torch.from_numpy(kv_dict["keys"]),
+             torch.from_numpy(kv_dict["values"]),
+         )
+
+     def get_stats(self) -> Dict[str, Any]:
+         """Return hit-rate and memory statistics."""
+         total_requests = sum(self._stats.values())
+         return {
+             **self._stats,
+             "total_requests": total_requests,
+             "l1_hit_rate": self._stats["l1_hits"] / max(1, total_requests),
+             "l2_hit_rate": self._stats["l2_hits"] / max(1, total_requests),
+             "l3_hit_rate": self._stats["l3_hits"] / max(1, total_requests),
+             "gpu_memory_gb": self.gpu_cache.get_total_memory_gb(),
+             "cpu_cache_items": len(self.cpu_cache),
+         }
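+
+
+ # --- Illustrative usage (editor's sketch, not part of the released file) ---
+ # A minimal example of how the manager might be driven; `_example_usage` and its
+ # `compute_fn` are hypothetical, no Redis client is wired in (the L3 tier is
+ # skipped), and tensor shapes follow the pool layout [num_heads, block_size, head_dim].
+ # Run with: asyncio.run(_example_usage())
+ async def _example_usage() -> None:
+     manager = DistributedKVCacheManager(
+         num_layers=2, num_heads=8, head_dim=64, gpu_cache_blocks=32, device="cpu",
+     )
+
+     async def compute_fn():
+         # Stand-in for the real attention K/V computation on a cache miss.
+         return torch.zeros(8, 16, 64), torch.zeros(8, 16, 64)
+
+     prefix = manager.compute_prefix_hash([101, 2023, 2003, 1037])
+     keys, values = await manager.get_or_compute(prefix, layer_idx=0, compute_fn=compute_fn)
+     logger.info("keys %s, stats %s", tuple(keys.shape), manager.get_stats())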