sglang 0.4.10.post1__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. sglang/bench_one_batch.py +113 -17
  2. sglang/compile_deep_gemm.py +8 -1
  3. sglang/global_config.py +5 -1
  4. sglang/srt/configs/model_config.py +35 -0
  5. sglang/srt/conversation.py +9 -117
  6. sglang/srt/disaggregation/base/conn.py +5 -2
  7. sglang/srt/disaggregation/decode.py +6 -1
  8. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -0
  9. sglang/srt/disaggregation/mooncake/conn.py +243 -135
  10. sglang/srt/disaggregation/prefill.py +3 -0
  11. sglang/srt/distributed/device_communicators/pynccl.py +7 -0
  12. sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
  13. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
  14. sglang/srt/distributed/parallel_state.py +22 -9
  15. sglang/srt/entrypoints/context.py +244 -0
  16. sglang/srt/entrypoints/engine.py +8 -5
  17. sglang/srt/entrypoints/harmony_utils.py +370 -0
  18. sglang/srt/entrypoints/http_server.py +106 -15
  19. sglang/srt/entrypoints/openai/protocol.py +227 -1
  20. sglang/srt/entrypoints/openai/serving_chat.py +278 -42
  21. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  22. sglang/srt/entrypoints/openai/tool_server.py +174 -0
  23. sglang/srt/entrypoints/tool.py +87 -0
  24. sglang/srt/eplb/expert_distribution.py +4 -2
  25. sglang/srt/eplb/expert_location.py +5 -1
  26. sglang/srt/function_call/harmony_tool_parser.py +130 -0
  27. sglang/srt/hf_transformers_utils.py +55 -13
  28. sglang/srt/jinja_template_utils.py +8 -1
  29. sglang/srt/layers/attention/aiter_backend.py +5 -8
  30. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  31. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  32. sglang/srt/layers/attention/flashattention_backend.py +7 -11
  33. sglang/srt/layers/attention/triton_backend.py +85 -14
  34. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  35. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  36. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  37. sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
  38. sglang/srt/layers/attention/vision.py +40 -15
  39. sglang/srt/layers/communicator.py +35 -8
  40. sglang/srt/layers/dp_attention.py +12 -0
  41. sglang/srt/layers/linear.py +9 -8
  42. sglang/srt/layers/logits_processor.py +9 -1
  43. sglang/srt/layers/moe/cutlass_moe.py +20 -6
  44. sglang/srt/layers/moe/ep_moe/layer.py +87 -107
  45. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  47. sglang/srt/layers/moe/fused_moe_triton/layer.py +442 -58
  48. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +169 -15
  49. sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
  50. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
  51. sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
  52. sglang/srt/layers/moe/topk.py +12 -3
  53. sglang/srt/layers/moe/utils.py +59 -0
  54. sglang/srt/layers/quantization/__init__.py +22 -0
  55. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
  56. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
  57. sglang/srt/layers/quantization/fp4.py +557 -0
  58. sglang/srt/layers/quantization/fp8.py +8 -7
  59. sglang/srt/layers/quantization/fp8_kernel.py +0 -4
  60. sglang/srt/layers/quantization/fp8_utils.py +29 -0
  61. sglang/srt/layers/quantization/modelopt_quant.py +259 -64
  62. sglang/srt/layers/quantization/mxfp4.py +651 -0
  63. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  64. sglang/srt/layers/quantization/quark/__init__.py +0 -0
  65. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  66. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  67. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  68. sglang/srt/layers/quantization/quark/utils.py +107 -0
  69. sglang/srt/layers/quantization/unquant.py +60 -6
  70. sglang/srt/layers/quantization/w4afp8.py +1 -1
  71. sglang/srt/layers/rotary_embedding.py +225 -1
  72. sglang/srt/layers/utils.py +9 -0
  73. sglang/srt/layers/vocab_parallel_embedding.py +15 -4
  74. sglang/srt/lora/lora_manager.py +70 -14
  75. sglang/srt/lora/lora_registry.py +10 -2
  76. sglang/srt/lora/mem_pool.py +43 -5
  77. sglang/srt/managers/cache_controller.py +61 -32
  78. sglang/srt/managers/data_parallel_controller.py +52 -2
  79. sglang/srt/managers/detokenizer_manager.py +1 -1
  80. sglang/srt/managers/io_struct.py +21 -4
  81. sglang/srt/managers/mm_utils.py +5 -11
  82. sglang/srt/managers/schedule_batch.py +30 -8
  83. sglang/srt/managers/schedule_policy.py +3 -1
  84. sglang/srt/managers/scheduler.py +170 -18
  85. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  86. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  87. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  88. sglang/srt/managers/template_manager.py +59 -22
  89. sglang/srt/managers/tokenizer_manager.py +137 -67
  90. sglang/srt/managers/tp_worker.py +3 -0
  91. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  92. sglang/srt/managers/utils.py +45 -1
  93. sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
  94. sglang/srt/mem_cache/hicache_storage.py +13 -21
  95. sglang/srt/mem_cache/hiradix_cache.py +53 -5
  96. sglang/srt/mem_cache/memory_pool_host.py +1 -1
  97. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  98. sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
  99. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  100. sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
  101. sglang/srt/model_executor/cuda_graph_runner.py +24 -9
  102. sglang/srt/model_executor/forward_batch_info.py +48 -17
  103. sglang/srt/model_executor/model_runner.py +24 -2
  104. sglang/srt/model_loader/weight_utils.py +10 -0
  105. sglang/srt/models/bailing_moe.py +425 -0
  106. sglang/srt/models/deepseek_v2.py +95 -50
  107. sglang/srt/models/ernie4.py +426 -0
  108. sglang/srt/models/ernie4_eagle.py +203 -0
  109. sglang/srt/models/gemma3n_mm.py +39 -0
  110. sglang/srt/models/glm4_moe.py +102 -27
  111. sglang/srt/models/gpt_oss.py +1134 -0
  112. sglang/srt/models/grok.py +3 -3
  113. sglang/srt/models/llama4.py +13 -2
  114. sglang/srt/models/mixtral.py +3 -3
  115. sglang/srt/models/mllama4.py +428 -19
  116. sglang/srt/models/qwen2.py +6 -0
  117. sglang/srt/models/qwen2_moe.py +7 -4
  118. sglang/srt/models/qwen3_moe.py +39 -14
  119. sglang/srt/models/step3_vl.py +10 -1
  120. sglang/srt/models/transformers.py +2 -5
  121. sglang/srt/multimodal/processors/base_processor.py +4 -3
  122. sglang/srt/multimodal/processors/gemma3n.py +0 -7
  123. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  124. sglang/srt/operations_strategy.py +1 -1
  125. sglang/srt/reasoning_parser.py +18 -39
  126. sglang/srt/server_args.py +218 -23
  127. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
  128. sglang/srt/two_batch_overlap.py +163 -9
  129. sglang/srt/utils.py +41 -26
  130. sglang/srt/weight_sync/utils.py +1 -1
  131. sglang/test/runners.py +4 -4
  132. sglang/test/test_utils.py +4 -4
  133. sglang/version.py +1 -1
  134. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +18 -15
  135. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +143 -116
  136. /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
  137. /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
  138. /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
  139. /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
  140. /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
  141. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
  142. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
  143. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/mem_cache/hiradix_cache.py

@@ -2,11 +2,12 @@ import heapq
 import logging
 import threading
 import time
+from queue import Queue
 from typing import List, Optional

 import torch

-from sglang.srt.managers.cache_controller import HiCacheController
+from sglang.srt.managers.cache_controller import HiCacheController, PrefetchOperation
 from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
 from sglang.srt.mem_cache.base_prefix_cache import MatchResult
 from sglang.srt.mem_cache.memory_pool import (
@@ -37,6 +38,7 @@ class HiRadixCache(RadixCache):
         hicache_io_backend: str,
         hicache_mem_layout: str,
         hicache_storage_backend: Optional[str] = None,
+        hicache_storage_prefetch_policy: Optional[str] = "best_effort",
     ):

         if hicache_io_backend == "direct":
@@ -85,6 +87,13 @@ class HiRadixCache(RadixCache):
             prefetch_threshold=self.prefetch_threshold,
         )

+        self.prefetch_stop_policy = hicache_storage_prefetch_policy
+        # todo: customizable storage prefetch timeout
+        self.prefetch_timeout = 3  # seconds
+        logger.info(
+            f"HiCache storage prefetch policy: {hicache_storage_prefetch_policy}"
+        )
+
         # record the nodes with ongoing write through
         self.ongoing_write_through = {}
         # record the node segments with ongoing load back
@@ -385,9 +394,10 @@ class HiRadixCache(RadixCache):
         for _ in range(queue_size.item()):
             req_id = self.cache_controller.prefetch_revoke_queue.get()
             if req_id in self.ongoing_prefetch:
-                last_host_node, _, _, _ = self.ongoing_prefetch[req_id]
+                last_host_node, token_ids, _, _ = self.ongoing_prefetch[req_id]
                 last_host_node.release_host()
                 del self.ongoing_prefetch[req_id]
+                self.cache_controller.prefetch_tokens_occupied -= len(token_ids)
             else:
                 # the revoked operation already got terminated
                 pass
@@ -419,10 +429,41 @@ class HiRadixCache(RadixCache):
             host_node.release_host()
             del self.ongoing_backup[ack_id]

-    def check_prefetch_progress(self, req_id: str):
+    def can_terminate_prefetch(self, operation: PrefetchOperation):
+        can_terminate = True
+
+        if self.prefetch_stop_policy == "best_effort":
+            return can_terminate
+
+        completed = (
+            operation.completed_tokens == len(operation.hash_value) * self.page_size
+        )
+
+        if self.prefetch_stop_policy == "wait_complete":
+            can_terminate = completed
+        elif self.prefetch_stop_policy == "timeout":
+            can_terminate = completed or (
+                time.monotonic() - operation.start_time > self.prefetch_timeout
+            )
+        else:
+            # unknown prefetch stop policy, just return True
+            return True
+
+        if self.tp_world_size > 1:
+            can_terminate = torch.tensor(can_terminate, dtype=torch.int)
+            torch.distributed.all_reduce(
+                can_terminate,
+                op=torch.distributed.ReduceOp.MIN,
+                group=self.tp_group,
+            )
+            can_terminate = bool(can_terminate.item())
+
+        return can_terminate
+
+    def check_prefetch_progress(self, req_id: str) -> bool:
         if req_id not in self.ongoing_prefetch:
             # there is no ongoing prefetch for this request or it has been revoked
-            return
+            return True

         # todo: more policies for prefetch progress such as timeout
         # the current policy is to prefetch with best effort and terminate when queuing is over
@@ -430,13 +471,16 @@
             req_id
         ]

+        if not self.can_terminate_prefetch(operation):
+            return False
+
         completed_tokens, hash_value = self.cache_controller.terminate_prefetch(
             operation
         )
         logger.debug(f"Prefetch {req_id} completed with {completed_tokens} tokens")

         min_completed_tokens = completed_tokens
-        if self.tp_world_size > 1:
+        if self.tp_world_size > 1 and self.prefetch_stop_policy != "wait_complete":
             # synchrnoize TP workers to make the same update to hiradix cache
             completed_tokens_tensor = torch.tensor(
                 min_completed_tokens, dtype=torch.int
@@ -464,6 +508,9 @@
         )
         last_host_node.release_host()
         del self.ongoing_prefetch[req_id]
+        self.cache_controller.prefetch_tokens_occupied -= len(token_ids)
+
+        return True

     def match_prefix(self, key: List[int], **kwargs):
         empty_value = torch.empty((0,), dtype=torch.int64, device=self.device)
@@ -531,6 +578,7 @@
             host_indices,
             operation,
         )
+        self.cache_controller.prefetch_tokens_occupied += len(new_input_tokens)

     def _insert_helper_host(self, node: TreeNode, key: List, host_value, hash_value):
         node.last_access_time = time.monotonic()
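
A note on the new hicache_storage_prefetch_policy plumbing above: the three policies decide when an in-flight storage prefetch may be terminated. The sketch below is a simplified, standalone illustration of that decision table (not sglang code; the Operation dataclass and the timeout argument are stand-ins for PrefetchOperation and the hard-coded 3-second prefetch_timeout):

import time
from dataclasses import dataclass, field

@dataclass
class Operation:
    # Simplified stand-in for sglang's PrefetchOperation.
    hash_value: list                 # one hash per prefetched page
    completed_tokens: int = 0        # tokens already fetched from storage
    start_time: float = field(default_factory=time.monotonic)

def can_terminate(op: Operation, policy: str, page_size: int, timeout_s: float = 3.0) -> bool:
    """Single-rank view of the prefetch-stop decision table."""
    if policy == "best_effort":
        return True                              # stop as soon as the scheduler asks
    completed = op.completed_tokens == len(op.hash_value) * page_size
    if policy == "wait_complete":
        return completed                         # only stop once every page has landed
    if policy == "timeout":
        return completed or (time.monotonic() - op.start_time > timeout_s)
    return True                                  # unknown policy: fall back to best effort

# A half-finished prefetch of 4 pages with page_size=16 (32 of 64 tokens done).
op = Operation(hash_value=[0, 1, 2, 3], completed_tokens=32)
print(can_terminate(op, "best_effort", 16))      # True
print(can_terminate(op, "wait_complete", 16))    # False

With tensor parallelism, the real code additionally all-reduces the boolean with ReduceOp.MIN so that a rank only terminates once every rank agrees.
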
sglang/srt/mem_cache/memory_pool_host.py

@@ -618,7 +618,7 @@ class MLATokenToKVPoolHost(HostKVCache):
         elif self.layout == "page_first":
             transfer_kv_all_layer_mla_lf_pf(
                 src_layers=device_pool.data_ptrs,
-                dst_k=self.kv_buffer,
+                dst=self.kv_buffer,
                 src_indices=device_indices,
                 dst_indices=host_indices,
                 item_size=self.token_stride_size,
sglang/srt/mem_cache/multimodal_cache.py

@@ -1,24 +1,46 @@
+import logging
+from collections import OrderedDict
 from typing import Dict

 import torch

+# Set up logging for cache behavior
+logger = logging.getLogger(__name__)
+

 class MultiModalCache:
-    """MultiModalCache is used to store vlm encoder results"""
+    """MultiModalCache is used to store vlm encoder results with LRU eviction"""

     def __init__(
         self,
         max_size: int,
     ):
         self.max_size = max_size
-        self.mm_cache: Dict[int, torch.Tensor] = {}
+        self.mm_cache: OrderedDict[int, torch.Tensor] = OrderedDict()
         self.current_size = 0

+    def _allocate(self, embedding_size: int) -> bool:
+        """Allocate space by evicting least recently used entries"""
+        evictions = 0
+        while self.current_size + embedding_size > self.max_size and self.mm_cache:
+            _, old_embedding = self.mm_cache.popitem(last=False)
+            evicted_size = self._get_tensor_size(old_embedding)
+            self.current_size -= evicted_size
+            evictions += evicted_size
+
+        if evictions > 0:
+            logger.debug(
+                f"Cache eviction: evicted {evictions} bytes, remaining size: {self.current_size}/{self.max_size} bytes"
+            )
+
+        if self.current_size + embedding_size > self.max_size:
+            return False
+        return True
+
     def put(self, mm_hash: int, embedding: torch.Tensor) -> bool:
-        if mm_hash in self.mm_cache:
-            return True
         data_size = self._get_tensor_size(embedding)
-        if self.current_size + data_size > self.max_size:
+        # Lazy free cache if not enough space
+        if not self._allocate(data_size):
             return False
         self.mm_cache[mm_hash] = embedding
         self.current_size += data_size
@@ -28,14 +50,12 @@ class MultiModalCache:
         return mm_hash in self.mm_cache

     def get(self, mm_hash: int) -> torch.Tensor:
-        return self.mm_cache.get(mm_hash)
-
-    def free(self, mm_hash: int) -> bool:
-        if mm_hash not in self.mm_cache:
-            return False
-        old_embedding = self.mm_cache.pop(mm_hash)
-        self.current_size -= self._get_tensor_size(old_embedding)
-        return True
+        """Get embedding and update LRU order"""
+        if mm_hash in self.mm_cache:
+            # Move to end (most recently used)
+            self.mm_cache.move_to_end(mm_hash)
+            return self.mm_cache[mm_hash]
+        return None

     def clear(self):
         self.mm_cache.clear()
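
The rewritten MultiModalCache above swaps the plain dict plus explicit free() for an OrderedDict-driven LRU bounded by a byte budget. A minimal standalone sketch of the same pattern, using byte strings in place of torch tensors (illustrative only):

from collections import OrderedDict

class LruByteCache:
    """Byte-budgeted LRU cache: inserting evicts least recently used entries."""

    def __init__(self, max_size: int):
        self.max_size = max_size
        self.cache: OrderedDict[int, bytes] = OrderedDict()
        self.current_size = 0

    def put(self, key: int, value: bytes) -> bool:
        size = len(value)
        # Evict from the LRU end until the new entry fits.
        while self.current_size + size > self.max_size and self.cache:
            _, old = self.cache.popitem(last=False)
            self.current_size -= len(old)
        if self.current_size + size > self.max_size:
            return False                  # a single item larger than the whole budget
        self.cache[key] = value
        self.current_size += size
        return True

    def get(self, key: int):
        if key in self.cache:
            self.cache.move_to_end(key)   # mark as most recently used
            return self.cache[key]
        return None

cache = LruByteCache(max_size=8)
cache.put(1, b"aaaa")
cache.put(2, b"bbbb")
cache.get(1)                              # touch key 1, so key 2 becomes the LRU entry
cache.put(3, b"cccc")                     # evicts key 2
print(list(cache.cache))                  # [1, 3]
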
sglang/srt/mem_cache/radix_cache_cpp.py (new file)

@@ -0,0 +1,229 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING, List, Set
+
+import torch
+
+from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, MatchResult
+from sglang.srt.mem_cache.cpp_radix_tree.radix_tree import (
+    IOHandle,
+    RadixTreeCpp,
+    TreeNodeCpp,
+)
+from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+
+
+logger = logging.getLogger(__name__)
+
+
+class RadixCacheCpp(BasePrefixCache):
+    def _merge_tensor(self, l: List[torch.Tensor]) -> torch.Tensor:
+        """
+        Merge a list of tensors into a single tensor.
+        Args:
+            l (List[torch.Tensor]): List of tensors to merge.
+        Returns:
+            torch.Tensor: Merged tensor.
+        """
+        if len(l) == 0:
+            return torch.empty(0, dtype=torch.int64, device=self.device)
+        elif len(l) == 1:
+            return l[0]
+        else:
+            return torch.cat(l)
+
+    def __init__(
+        self,
+        disable: bool,
+        use_hicache: bool,
+        req_to_token_pool: ReqToTokenPool,
+        token_to_kv_pool: BaseTokenToKVPoolAllocator,
+        tp_cache_group: torch.distributed.ProcessGroup,
+        page_size: int,
+        hicache_ratio: float,
+        hicache_size: int,
+        hicache_write_policy: str,
+        enable_kv_cache_events: bool = False,
+        hicache_oracle: bool = False,
+        enable_write_cancel: bool = False,
+    ):
+        self.disable = disable
+        self.enable_write_cancel = enable_write_cancel
+
+        assert (
+            enable_kv_cache_events is False
+        ), "HiRadixCache does not support kv cache events yet"
+        self.kv_cache = token_to_kv_pool.get_kvcache()
+
+        # record the nodes with ongoing write through
+        self.ongoing_write_through: Set[IOHandle] = set()
+        # record the node segments with ongoing load back
+        self.ongoing_load_back: Set[IOHandle] = set()
+        # todo: dynamically adjust the threshold
+        self.write_through_threshold = (
+            1 if hicache_write_policy == "write_through" else 2
+        )
+        self.device = token_to_kv_pool.device
+        self.token_to_kv_pool = token_to_kv_pool
+        self.req_to_token_pool = req_to_token_pool
+        self.page_size = page_size
+
+        self.tp_group = tp_cache_group
+
+        if not use_hicache:
+            self.tree = RadixTreeCpp(
+                disabled=self.disable,
+                page_size=page_size,
+                host_size=None,  # no host cache, this should be removed in the future
+                write_through_threshold=self.write_through_threshold,
+            )
+            self.cache_controller = None
+            return  # early return if hicache is not used
+
+        raise NotImplementedError("Host cache is not supported yet")
+
+    def reset(self):
+        if self.cache_controller is not None:
+            # need to clear the acks before resetting the cache controller
+            raise NotImplementedError("Host cache is not supported yet")
+        self.tree.reset()
+
+    def match_prefix(self, key: List[int], **kwargs) -> MatchResult:
+        device_indices_vec, host_indices_length, node_gpu, node_cpu = (
+            self.tree.match_prefix(key)
+        )
+        return MatchResult(
+            device_indices=self._merge_tensor(device_indices_vec),
+            last_device_node=node_gpu,
+            last_host_node=node_cpu,
+            host_hit_length=host_indices_length,
+        )
+
+    def _insert(self, key: List[int], value: torch.Tensor) -> int:
+        """
+        Insert a key-value pair into the radix tree.
+        Args:
+            key (List[int]): The key to insert, represented as a list of integers.
+            value (torch.Tensor): The value to associate with the key.
+        Returns:
+            int: Number of device indices that were already present in the tree before the insertion.
+        """
+        ongoing_write, length = self.tree.writing_through(key, value)
+        if self.cache_controller is None:
+            assert len(ongoing_write) == 0, "Implementation error"
+            return length
+
+        raise NotImplementedError("Host cache is not supported yet")
+
+    def dec_lock_ref(self, node: TreeNodeCpp):
+        """
+        Decrement the reference count of a node to root of the radix tree.
+        Args:
+            node (TreeNodeCpp): The handle of the node to decrement the reference count for.
+        """
+        self.tree.lock_ref(node, False)  # do not increment
+
+    def inc_lock_ref(self, node: TreeNodeCpp):
+        """
+        Increment the reference count of from a node to root of the radix tree.
+        Args:
+            node (TreeNodeCpp): The handle of the node to increment the reference count for.
+        """
+        self.tree.lock_ref(node, True)
+
+    def evict(self, num_tokens: int):
+        evicted_device_indices = self.tree.evict(num_tokens)
+        for indice in evicted_device_indices:
+            self.token_to_kv_pool.free(indice)
+
+    def evictable_size(self):
+        return self.tree.evictable_size()
+
+    def protected_size(self):
+        return self.tree.protected_size()
+
+    def total_size(self):
+        return self.tree.total_size()
+
+    def cache_finished_req(self, req: Req):
+        """Cache request when it finishes."""
+        assert req.req_pool_idx is not None
+        token_ids = (req.origin_input_ids + req.output_ids)[:-1]
+        overall_len = len(token_ids)  # prefill + decode
+        kv_indices = self.req_to_token_pool.req_to_token[req.req_pool_idx, :overall_len]
+
+        # NOTE: our C++ implementation don't need `token_ids` and `kv_indices` to be page-aligned
+        # it will automatically align them, but length of them should be equal
+        old_prefix_len = len(req.prefix_indices) // self.page_size * self.page_size
+        new_prefix_len = self._insert(token_ids, kv_indices)
+
+        # NOTE: kv_indices[:old_prefix_len] == req.prefix_indices
+        assert old_prefix_len <= new_prefix_len, "Wrong prefix indices"
+
+        # KVCache between old & new is newly generated, but already exists in the pool
+        # we need to free this newly generated kv indices
+        if old_prefix_len < new_prefix_len:
+            self.token_to_kv_pool.free(kv_indices[old_prefix_len:new_prefix_len])
+
+        # need to free the unaligned part, since it cannot be inserted into the radix tree
+        if self.page_size != 1 and (  # unaligned tail only exists when page_size > 1
+            (unaligned_len := overall_len % self.page_size) > 0
+        ):
+            # NOTE: sglang PagedAllocator support unaligned free (which will automatically align it)
+            self.token_to_kv_pool.free(kv_indices[overall_len - unaligned_len :])
+
+        # Remove req slot release the cache lock
+        self.dec_lock_ref(req.last_node)
+        self.req_to_token_pool.free(req.req_pool_idx)
+
+    def cache_unfinished_req(self, req: Req):
+        """Cache request when it is unfinished."""
+        assert req.req_pool_idx is not None
+        token_ids = req.fill_ids
+        prefill_len = len(token_ids)  # prefill only (maybe chunked)
+        kv_indices = self.req_to_token_pool.req_to_token[req.req_pool_idx, :prefill_len]
+
+        # NOTE: our C++ implementation don't need `token_ids` and `kv_indices` to be page-aligned
+        # it will automatically align them, but length of them should be equal
+        old_prefix_len = len(req.prefix_indices) // self.page_size * self.page_size
+        new_prefix_len = self._insert(token_ids, kv_indices)
+
+        # NOTE: kv_indices[:old_prefix_len] == req.prefix_indices
+        assert old_prefix_len <= new_prefix_len, "Wrong prefix indices"
+
+        # TODO(dark): optimize the `insert` and `match` (e.g. merge into 1 function)
+        # The prefix indices need to updated to reuse the kv indices in the pool
+        new_indices_vec, _, new_last_node, _ = self.tree.match_prefix(token_ids)
+        new_indices = self._merge_tensor(new_indices_vec)
+        assert new_prefix_len <= len(new_indices)
+
+        # KVCache between old & new is newly generated, but already exists in the pool
+        # we need to free this newly generated kv indices and reuse the indices in the pool
+        if old_prefix_len < new_prefix_len:
+            self.token_to_kv_pool.free(kv_indices[old_prefix_len:new_prefix_len])
+            reused_indices = new_indices[old_prefix_len:new_prefix_len]
+            self.req_to_token_pool.req_to_token[
+                req.req_pool_idx, old_prefix_len:new_prefix_len
+            ] = reused_indices
+
+        if req.last_node != new_last_node:
+            self.dec_lock_ref(req.last_node)
+            self.inc_lock_ref(new_last_node)
+
+        # NOTE: there might be unaligned tail, so we may need to append it
+        assert len(new_indices) <= prefill_len < len(new_indices) + self.page_size
+        if self.page_size != 1 and len(new_indices) < prefill_len:
+            req.prefix_indices = torch.cat(
+                [new_indices, kv_indices[len(new_indices) :]]
+            )
+        else:
+            req.prefix_indices = new_indices
+        req.last_node = new_last_node
+
+    def pretty_print(self):
+        return self.tree.debug_print()
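
One detail of the new RadixCacheCpp worth noting is its page-alignment arithmetic: the previously cached prefix is rounded down to a page boundary, and the unaligned tail of a finished request is freed rather than inserted into the tree. A small self-contained illustration of that arithmetic (plain Python with a hypothetical helper name, not part of the actual class):

def split_for_insert(prefix_len: int, overall_len: int, page_size: int):
    """Round the cached prefix down to a page boundary and isolate the unaligned tail."""
    old_prefix_len = prefix_len // page_size * page_size   # page-aligned prefix to reuse
    unaligned_len = overall_len % page_size                # tail that cannot enter the tree
    insertable_len = overall_len - unaligned_len
    return old_prefix_len, insertable_len, unaligned_len

# page_size=16, a request with a 37-token cached prefix and 100 tokens overall:
print(split_for_insert(37, 100, 16))   # (32, 96, 4) -> the 4 tail tokens are freed, not cached
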
sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py

@@ -96,6 +96,8 @@ class Hf3fsClient:
         )
         self.iov_r = make_iovec(self.shm_r, self.hf3fs_mount_point)
         self.iov_w = make_iovec(self.shm_w, self.hf3fs_mount_point)
+        self.shm_r.unlink()
+        self.shm_w.unlink()

         self.rlock = threading.RLock()
         self.wlock = threading.RLock()
@@ -176,8 +178,6 @@
         del self.iov_w
         self.shm_r.close()
         self.shm_w.close()
-        self.shm_r.unlink()
-        self.shm_w.unlink()

     def flush(self) -> None:
         os.fsync(self.file)
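
The Hf3fsClient change above moves shm_r.unlink() / shm_w.unlink() from close() to immediately after the iovecs are created: the shared-memory names disappear from /dev/shm right away while the existing mappings stay usable, so the segments are reclaimed even if the process exits without a clean close(). A standalone sketch of this unlink-early pattern with Python's multiprocessing.shared_memory on POSIX systems (illustrative, not the sglang code path):

from multiprocessing import shared_memory

# Create a segment, then unlink its name immediately: the mapping stays usable,
# but the backing object is released as soon as the last handle is closed.
shm = shared_memory.SharedMemory(create=True, size=1024)
shm.unlink()                 # name removed now; nothing leaks if the process crashes later

shm.buf[:5] = b"hello"       # the mapping is still fully readable and writable
print(bytes(shm.buf[:5]))

shm.close()                  # last handle closed, memory actually released

The trade-off is that no other process can attach to the segment by name after the unlink, which is acceptable here because both iovecs are created before the names are removed.
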
sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp (new file)

@@ -0,0 +1,35 @@
+#include <torch/extension.h>
+
+#include <cstring>
+#include <vector>
+
+void read_shm(const torch::Tensor &shm, std::vector<torch::Tensor> dst) {
+  py::gil_scoped_release release;
+  char *src_ptr = static_cast<char *>(shm.data_ptr());
+  size_t current = 0;
+  for (size_t i = 0; i < dst.size(); ++i) {
+    auto &t = dst[i];
+    size_t t_bytes = t.numel() * t.element_size();
+    char *dst_ptr = static_cast<char *>(t.data_ptr());
+    std::memcpy(dst_ptr, src_ptr + current, t_bytes);
+    current += t_bytes;
+  }
+}
+
+void write_shm(const std::vector<torch::Tensor> src, torch::Tensor &shm) {
+  py::gil_scoped_release release;
+  char *dst_ptr = static_cast<char *>(shm.data_ptr());
+  size_t current = 0;
+  for (size_t i = 0; i < src.size(); ++i) {
+    auto &t = src[i];
+    size_t t_bytes = t.numel() * t.element_size();
+    char *src_ptr = static_cast<char *>(t.data_ptr());
+    std::memcpy(dst_ptr + current, src_ptr, t_bytes);
+    current += t_bytes;
+  }
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("read_shm", &read_shm, "Read tensors from shared memory");
+  m.def("write_shm", &write_shm, "Write tensors to shared memory");
+}
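
The new hf3fs_utils.cpp above is a small PyTorch/pybind11 extension that memcpy's a packed shared-memory tensor into (or out of) a list of tensors while releasing the GIL. As a rough sketch of how such a source file could be JIT-compiled and exercised with torch.utils.cpp_extension.load (the actual build wiring inside sglang may differ, and the source path here is assumed):

import torch
from torch.utils.cpp_extension import load

# JIT-compile the extension from source (requires a C++ toolchain).
hf3fs_utils = load(name="hf3fs_utils", sources=["hf3fs_utils.cpp"])

shm = torch.empty(6, dtype=torch.uint8)        # stands in for the shared-memory buffer
parts = [
    torch.tensor([1, 2], dtype=torch.uint8),
    torch.tensor([3, 4, 5, 6], dtype=torch.uint8),
]

hf3fs_utils.write_shm(parts, shm)              # pack the tensors back-to-back into shm
out = [torch.empty_like(p) for p in parts]
hf3fs_utils.read_shm(shm, out)                 # unpack shm into the preallocated tensors
print(out)                                     # [tensor([1, 2], ...), tensor([3, 4, 5, 6], ...)]
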
sglang/srt/model_executor/cuda_graph_runner.py

@@ -29,6 +29,9 @@ from torch.profiler import ProfilerActivity, profile

 from sglang.srt.custom_op import CustomOp
 from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    set_graph_pool_id,
+)
 from sglang.srt.distributed.parallel_state import GroupCoordinator, graph_capture
 from sglang.srt.layers.dp_attention import DPPaddingMode, get_attention_tp_size
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
@@ -372,6 +375,11 @@ class CudaGraphRunner:
             dtype=torch.bool,
             device="cuda",
         )
+        self.next_token_logits_buffer = torch.zeros(
+            (self.max_num_token, self.model_runner.model_config.vocab_size),
+            dtype=torch.float,
+            device="cuda",
+        )

         # Capture
         try:
@@ -517,6 +525,7 @@
         else:
             encoder_lens = None
         mrope_positions = self.mrope_positions[:, :bs]
+        next_token_logits_buffer = self.next_token_logits_buffer[:num_tokens]
         self.num_token_non_padded[...] = num_tokens

         # pipeline parallelism
@@ -567,11 +576,11 @@
         )

         if self.model_runner.server_args.enable_lora:
-            # It is safe to capture CUDA graph using empty LoRA path, as the LoRA kernels will always be launched whenever
-            # `--enable-lora` is set to True (and return immediately if the LoRA path is empty for perf optimization).
-            lora_paths = [None] * bs
+            # It is safe to capture CUDA graph using empty LoRA id, as the LoRA kernels will always be launched whenever
+            # `--enable-lora` is set to True (and return immediately if the LoRA id is empty for perf optimization).
+            lora_ids = [None] * bs
         else:
-            lora_paths = None
+            lora_ids = None

         forward_batch = ForwardBatch(
             forward_mode=self.capture_forward_mode,
@@ -579,6 +588,8 @@
             input_ids=input_ids,
             req_pool_indices=req_pool_indices,
             seq_lens=seq_lens,
+            next_token_logits_buffer=next_token_logits_buffer,
+            orig_seq_lens=seq_lens,
             req_to_token_pool=self.model_runner.req_to_token_pool,
             token_to_kv_pool=self.model_runner.token_to_kv_pool,
             attn_backend=self.model_runner.attn_backend,
@@ -597,11 +608,11 @@
             capture_hidden_mode=self.capture_hidden_mode,
             num_token_non_padded=self.num_token_non_padded,
             global_forward_mode=self.capture_forward_mode,
-            lora_paths=lora_paths,
+            lora_ids=lora_ids,
         )
         self.tbo_plugin.capture_one_batch_size(forward_batch, num_tokens=num_tokens)

-        if lora_paths is not None:
+        if lora_ids is not None:
             self.model_runner.lora_manager.prepare_lora_batch(forward_batch)

         # Attention backend
@@ -643,11 +654,15 @@

         run_once()

-        global global_graph_memory_pool
-        with torch.cuda.graph(graph, pool=global_graph_memory_pool, stream=stream):
+        if get_global_graph_memory_pool() is None:
+            set_global_graph_memory_pool(torch.cuda.graph_pool_handle())
+        # Set graph pool id globally to be able to use symmetric memory
+        set_graph_pool_id(get_global_graph_memory_pool())
+        with torch.cuda.graph(
+            graph, pool=get_global_graph_memory_pool(), stream=stream
+        ):
             out = run_once()

-        global_graph_memory_pool = graph.pool()
         return graph, out

     def recapture_if_needed(self, forward_batch: ForwardBatch):
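
The CudaGraphRunner change above replaces the module-level global_graph_memory_pool with getter/setter accessors and registers the pool with the new pynccl allocator via set_graph_pool_id, so all captured graphs share one memory pool. A minimal standalone sketch of capturing two graphs against a shared pool using stock PyTorch APIs (illustrative; it omits the sglang-specific set_graph_pool_id hook and requires a CUDA device):

import torch

if torch.cuda.is_available():
    pool = torch.cuda.graph_pool_handle()      # one allocation pool shared by both graphs
    static_x = torch.zeros(8, device="cuda")

    # Warm up on a side stream before capture, as recommended for CUDA graphs.
    s = torch.cuda.Stream()
    s.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(s):
        static_x * 2
        static_x + 3
    torch.cuda.current_stream().wait_stream(s)

    g1, g2 = torch.cuda.CUDAGraph(), torch.cuda.CUDAGraph()
    with torch.cuda.graph(g1, pool=pool):      # capture against the shared pool
        y1 = static_x * 2
    with torch.cuda.graph(g2, pool=pool):
        y2 = static_x + 3

    static_x.fill_(5.0)
    g1.replay()                                # replay in capture order when sharing a pool
    g2.replay()
    torch.cuda.synchronize()
    print(y1[:3], y2[:3])                      # outputs reflect the current static_x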