sglang 0.5.0rc2__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff compares the published contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_one_batch.py +0 -6
- sglang/bench_one_batch_server.py +7 -2
- sglang/bench_serving.py +3 -3
- sglang/eval/llama3_eval.py +0 -1
- sglang/srt/configs/model_config.py +24 -9
- sglang/srt/configs/update_config.py +40 -5
- sglang/srt/constrained/xgrammar_backend.py +23 -11
- sglang/srt/conversation.py +2 -15
- sglang/srt/disaggregation/ascend/conn.py +1 -3
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +1 -1
- sglang/srt/disaggregation/launch_lb.py +7 -1
- sglang/srt/disaggregation/mini_lb.py +11 -5
- sglang/srt/disaggregation/mooncake/conn.py +141 -47
- sglang/srt/disaggregation/prefill.py +261 -5
- sglang/srt/disaggregation/utils.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/device_communicators/pynccl.py +68 -18
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
- sglang/srt/distributed/naive_distributed.py +112 -0
- sglang/srt/distributed/parallel_state.py +90 -4
- sglang/srt/entrypoints/context.py +20 -1
- sglang/srt/entrypoints/engine.py +27 -2
- sglang/srt/entrypoints/http_server.py +12 -0
- sglang/srt/entrypoints/openai/protocol.py +2 -2
- sglang/srt/entrypoints/openai/serving_chat.py +22 -6
- sglang/srt/entrypoints/openai/serving_completions.py +9 -1
- sglang/srt/entrypoints/openai/serving_responses.py +2 -2
- sglang/srt/eplb/expert_distribution.py +2 -3
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +24 -0
- sglang/srt/host_shared_memory.py +83 -0
- sglang/srt/layers/attention/ascend_backend.py +132 -22
- sglang/srt/layers/attention/flashattention_backend.py +24 -17
- sglang/srt/layers/attention/flashinfer_backend.py +11 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +226 -76
- sglang/srt/layers/attention/triton_backend.py +85 -46
- sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
- sglang/srt/layers/attention/trtllm_mha_backend.py +390 -30
- sglang/srt/layers/attention/trtllm_mla_backend.py +39 -16
- sglang/srt/layers/attention/utils.py +94 -15
- sglang/srt/layers/attention/vision.py +40 -13
- sglang/srt/layers/attention/vision_utils.py +65 -0
- sglang/srt/layers/communicator.py +51 -3
- sglang/srt/layers/dp_attention.py +23 -4
- sglang/srt/layers/elementwise.py +94 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
- sglang/srt/layers/layernorm.py +8 -1
- sglang/srt/layers/linear.py +24 -0
- sglang/srt/layers/logits_processor.py +5 -1
- sglang/srt/layers/moe/__init__.py +31 -0
- sglang/srt/layers/moe/ep_moe/layer.py +37 -33
- sglang/srt/layers/moe/fused_moe_native.py +14 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
- sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
- sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
- sglang/srt/layers/moe/moe_runner/base.py +13 -0
- sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
- sglang/srt/layers/moe/router.py +15 -9
- sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
- sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
- sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
- sglang/srt/layers/moe/topk.py +167 -83
- sglang/srt/layers/moe/utils.py +159 -18
- sglang/srt/layers/quantization/__init__.py +13 -14
- sglang/srt/layers/quantization/awq.py +7 -7
- sglang/srt/layers/quantization/base_config.py +2 -6
- sglang/srt/layers/quantization/blockwise_int8.py +4 -12
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -28
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
- sglang/srt/layers/quantization/fp8.py +127 -119
- sglang/srt/layers/quantization/fp8_kernel.py +195 -24
- sglang/srt/layers/quantization/fp8_utils.py +34 -9
- sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
- sglang/srt/layers/quantization/gptq.py +5 -4
- sglang/srt/layers/quantization/marlin_utils.py +11 -3
- sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
- sglang/srt/layers/quantization/modelopt_quant.py +165 -68
- sglang/srt/layers/quantization/moe_wna16.py +10 -15
- sglang/srt/layers/quantization/mxfp4.py +206 -37
- sglang/srt/layers/quantization/quark/quark.py +390 -0
- sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
- sglang/srt/layers/quantization/unquant.py +34 -70
- sglang/srt/layers/quantization/utils.py +25 -0
- sglang/srt/layers/quantization/w4afp8.py +7 -8
- sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
- sglang/srt/layers/quantization/w8a8_int8.py +5 -13
- sglang/srt/layers/radix_attention.py +6 -0
- sglang/srt/layers/rotary_embedding.py +1 -0
- sglang/srt/lora/lora_manager.py +21 -22
- sglang/srt/lora/lora_registry.py +3 -3
- sglang/srt/lora/mem_pool.py +26 -24
- sglang/srt/lora/utils.py +10 -12
- sglang/srt/managers/cache_controller.py +76 -18
- sglang/srt/managers/detokenizer_manager.py +10 -2
- sglang/srt/managers/io_struct.py +9 -0
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/schedule_batch.py +4 -9
- sglang/srt/managers/scheduler.py +25 -16
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/template_manager.py +7 -5
- sglang/srt/managers/tokenizer_manager.py +60 -21
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/managers/utils.py +59 -1
- sglang/srt/mem_cache/allocator.py +7 -5
- sglang/srt/mem_cache/allocator_ascend.py +0 -11
- sglang/srt/mem_cache/hicache_storage.py +14 -4
- sglang/srt/mem_cache/memory_pool.py +3 -3
- sglang/srt/mem_cache/memory_pool_host.py +35 -2
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
- sglang/srt/model_executor/cuda_graph_runner.py +25 -12
- sglang/srt/model_executor/forward_batch_info.py +4 -1
- sglang/srt/model_executor/model_runner.py +43 -32
- sglang/srt/model_executor/npu_graph_runner.py +94 -0
- sglang/srt/model_loader/loader.py +24 -6
- sglang/srt/models/dbrx.py +12 -6
- sglang/srt/models/deepseek.py +2 -1
- sglang/srt/models/deepseek_nextn.py +3 -1
- sglang/srt/models/deepseek_v2.py +224 -223
- sglang/srt/models/ernie4.py +2 -2
- sglang/srt/models/glm4_moe.py +25 -63
- sglang/srt/models/glm4v.py +52 -1
- sglang/srt/models/glm4v_moe.py +8 -11
- sglang/srt/models/gpt_oss.py +34 -74
- sglang/srt/models/granitemoe.py +0 -1
- sglang/srt/models/grok.py +376 -48
- sglang/srt/models/interns1.py +12 -47
- sglang/srt/models/internvl.py +6 -51
- sglang/srt/models/llama4.py +0 -2
- sglang/srt/models/minicpm3.py +0 -1
- sglang/srt/models/mixtral.py +0 -2
- sglang/srt/models/nemotron_nas.py +435 -0
- sglang/srt/models/olmoe.py +0 -1
- sglang/srt/models/phi4mm.py +3 -21
- sglang/srt/models/qwen2_5_vl.py +2 -0
- sglang/srt/models/qwen2_moe.py +3 -18
- sglang/srt/models/qwen3.py +2 -2
- sglang/srt/models/qwen3_classification.py +7 -1
- sglang/srt/models/qwen3_moe.py +9 -38
- sglang/srt/models/step3_vl.py +2 -1
- sglang/srt/models/xverse_moe.py +11 -5
- sglang/srt/multimodal/processors/base_processor.py +3 -3
- sglang/srt/multimodal/processors/internvl.py +7 -2
- sglang/srt/multimodal/processors/llava.py +11 -7
- sglang/srt/offloader.py +433 -0
- sglang/srt/operations.py +6 -1
- sglang/srt/reasoning_parser.py +4 -3
- sglang/srt/server_args.py +237 -104
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
- sglang/srt/speculative/eagle_utils.py +36 -13
- sglang/srt/speculative/eagle_worker.py +56 -3
- sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
- sglang/srt/two_batch_overlap.py +16 -11
- sglang/srt/utils.py +68 -70
- sglang/test/runners.py +8 -5
- sglang/test/test_block_fp8.py +5 -6
- sglang/test/test_block_fp8_ep.py +13 -19
- sglang/test/test_cutlass_moe.py +4 -6
- sglang/test/test_cutlass_w4a8_moe.py +4 -3
- sglang/test/test_fp4_moe.py +4 -3
- sglang/test/test_utils.py +7 -0
- sglang/utils.py +0 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/METADATA +7 -7
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/RECORD +179 -161
- sglang/srt/layers/quantization/fp4.py +0 -557
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/srt/managers/tokenizer_manager.py
CHANGED

@@ -78,6 +78,7 @@ from sglang.srt.managers.io_struct import (
     ExpertDistributionReqOutput,
     FlushCacheReqInput,
     FlushCacheReqOutput,
+    FreezeGCReq,
     GenerateReqInput,
     GetInternalStateReq,
     GetInternalStateReqOutput,
@@ -122,7 +123,9 @@ from sglang.srt.metrics.collector import TokenizerMetricsCollector
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
+    configure_gc_warning,
     dataclass_to_string_truncated,
+    freeze_gc,
     get_bool_env_var,
     get_zmq_socket,
     kill_process_tree,
@@ -298,7 +301,7 @@ class TokenizerManager:
         # The registry dynamically updates as adapters are loaded / unloaded during runtime. It
         # serves as the source of truth for available adapters and maps user-friendly LoRA names
         # to internally used unique LoRA IDs.
-        self.lora_registry = LoRARegistry(self.server_args.lora_paths or {})
+        self.lora_registry = LoRARegistry(self.server_args.lora_paths)
         # Lock to serialize LoRA update operations.
         # Please note that, unlike `model_update_lock`, this does not block inference, allowing
         # LoRA updates and inference to overlap.
@@ -352,6 +355,10 @@ class TokenizerManager:
             collect_tokens_histogram=self.server_args.collect_tokens_histogram,
         )
 
+        # Configure GC warning
+        if self.server_args.gc_warning_threshold_secs > 0.0:
+            configure_gc_warning(self.server_args.gc_warning_threshold_secs)
+
         # Communicators
         self.init_weights_update_group_communicator = _Communicator(
             self.send_to_scheduler, server_args.dp_size
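Note: configure_gc_warning lives in sglang.srt.utils; this hunk only wires it to the new gc_warning_threshold_secs server argument. As a rough idea of the mechanism, a GC-pause warning can be built on CPython's gc.callbacks. The sketch below is a hypothetical re-implementation for illustration, not the actual sglang code.

    import gc
    import logging
    import time

    logger = logging.getLogger(__name__)

    def configure_gc_warning_sketch(warn_threshold_secs: float) -> None:
        # Warn when a GC pause exceeds the threshold (illustrative only).
        start_time = 0.0

        def gc_callback(phase: str, info: dict) -> None:
            nonlocal start_time
            if phase == "start":
                start_time = time.monotonic()
            elif phase == "stop":
                elapsed = time.monotonic() - start_time
                if elapsed > warn_threshold_secs:
                    logger.warning(
                        "GC of generation %s took %.3fs (> %.3fs threshold), "
                        "stalling request handling.",
                        info.get("generation"), elapsed, warn_threshold_secs,
                    )

        gc.callbacks.append(gc_callback)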
@@ -446,6 +453,10 @@ class TokenizerManager:
                 ProfileReqOutput,
                 self.profile_communicator.handle_recv,
             ),
+            (
+                FreezeGCReq,
+                lambda x: None,
+            ),  # For handling case when scheduler skips detokenizer and forwards back to the tokenizer manager, we ignore it.
             (
                 GetInternalStateReqOutput,
                 self.get_internal_state_communicator.handle_recv,
@@ -565,14 +576,24 @@ class TokenizerManager:
         self, obj: Union[GenerateReqInput, EmbeddingReqInput], input_ids: List[int]
     ) -> None:
         """Validates that the input token count and the requested token count doesn't exceed the model's context length."""
+        # FIXME: unify the length validation logic with the one in the scheduler.
+        _max_req_len = self.context_len
 
         input_token_num = len(input_ids) if input_ids is not None else 0
-        # Check if input alone exceeds context length
         if input_token_num >= self.context_len:
-            raise ValueError(
-                f"The input ({input_token_num} tokens) is longer than the "
-                f"model's context length ({self.context_len} tokens)."
-            )
+            if self.server_args.allow_auto_truncate:
+                logger.warning(
+                    f"The input ({input_token_num} tokens) is longer than the "
+                    f"model's context length ({self.context_len} tokens). "
+                    "Truncating the input."
+                )
+                del input_ids[_max_req_len:]
+                input_token_num = len(input_ids)
+            else:
+                raise ValueError(
+                    f"The input ({input_token_num} tokens) is longer than the "
+                    f"model's context length ({self.context_len} tokens)."
+                )
 
         if isinstance(obj, EmbeddingReqInput) and self.is_generation:
             raise ValueError(
@@ -584,17 +605,27 @@ class TokenizerManager:
         max_new_tokens = obj.sampling_params.get("max_new_tokens")
         if (
             max_new_tokens is not None
-            and (max_new_tokens + input_token_num) >= self.context_len
+            and (max_new_tokens + input_token_num) >= _max_req_len
         ):
-            total_tokens = max_new_tokens + input_token_num
-            error_msg = (
-                f"Requested token count exceeds the model's maximum context length "
-                f"of {self.context_len} tokens. You requested a total of {total_tokens} "
-                f"tokens: {input_token_num} tokens from the input messages and "
-                f"{max_new_tokens} tokens for the completion. Please reduce the number "
-                f"of tokens in the input messages or the completion to fit within the limit."
-            )
-            raise ValueError(error_msg)
+            if self.server_args.allow_auto_truncate:
+                logger.warning(
+                    f"Requested token count ({input_token_num} input + {max_new_tokens} new) "
+                    f"exceeds the model's context length ({self.context_len} tokens). "
+                    "Truncating max_new_tokens."
+                )
+                obj.sampling_params["max_new_tokens"] = max(
+                    0, _max_req_len - input_token_num
+                )
+            else:
+                total_tokens = max_new_tokens + input_token_num
+                error_msg = (
+                    f"Requested token count exceeds the model's maximum context length "
+                    f"of {self.context_len} tokens. You requested a total of {total_tokens} "
+                    f"tokens: {input_token_num} tokens from the input messages and "
+                    f"{max_new_tokens} tokens for the completion. Please reduce the number "
+                    f"of tokens in the input messages or the completion to fit within the limit."
+                )
+                raise ValueError(error_msg)
 
         if isinstance(obj, GenerateReqInput):
             if (
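Note: with allow_auto_truncate enabled, both length checks now clamp instead of raising. A small worked example of the clamping arithmetic (numbers are illustrative):

    context_len = 8192          # model's maximum context length (_max_req_len)
    input_token_num = 8000      # prompt tokens after tokenization
    max_new_tokens = 512        # requested completion budget

    # 8000 + 512 = 8512 >= 8192, so without truncation the request is rejected.
    # With allow_auto_truncate, max_new_tokens is clamped instead:
    max_new_tokens = max(0, context_len - input_token_num)   # -> 192
    assert input_token_num + max_new_tokens <= context_len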
@@ -782,15 +813,17 @@ class TokenizerManager:
             ):
                 raise ValueError(finish_reason["message"])
 
-            if (
-                finish_reason.get("type") == "abort"
-                and finish_reason.get("status_code")
-                == HTTPStatus.SERVICE_UNAVAILABLE
+            if finish_reason.get("type") == "abort" and finish_reason.get(
+                "status_code"
+            ) in (
+                HTTPStatus.SERVICE_UNAVAILABLE,
+                HTTPStatus.INTERNAL_SERVER_ERROR,
             ):
                 # This is an abort request initiated by scheduler.
                 # Delete the key to prevent resending abort request to the scheduler and
                 # to ensure aborted request state is cleaned up.
-                del self.rid_to_state[state.obj.rid]
+                if state.obj.rid in self.rid_to_state:
+                    del self.rid_to_state[state.obj.rid]
 
             # Mark ongoing LoRA request as finished.
             if self.server_args.enable_lora and state.obj.lora_path:
@@ -1337,6 +1370,12 @@ class TokenizerManager:
         logging.info(f"Config logging: {obj=}")
         self.log_request_metadata = self.get_log_request_metadata()
 
+    async def freeze_gc(self):
+        """Send a freeze_gc message to the scheduler first, then freeze locally."""
+        self.send_to_scheduler.send_pyobj(FreezeGCReq())
+        freeze_gc("Tokenizer Manager")
+        return None
+
     def create_abort_task(self, obj: GenerateReqInput):
         # Abort the request if the client is disconnected.
         async def abort_request():
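Note: freeze_gc here is sglang's wrapper (imported from sglang.srt.utils above). Under the hood the pattern rests on CPython's gc.freeze(), which moves every currently tracked object into a permanent generation that later collections skip, so long-lived server state stops inflating GC pauses. A minimal sketch of that pattern, assuming only the standard library (the real helper adds sglang-specific logging):

    import gc
    import logging

    logger = logging.getLogger(__name__)

    def freeze_gc_sketch(context: str) -> None:
        before = gc.get_freeze_count()
        gc.collect()   # reclaim garbage first so only live objects get frozen
        gc.freeze()    # move survivors to the permanent generation
        frozen = gc.get_freeze_count() - before
        logger.info("%s: froze %d objects; future GC passes skip them.", context, frozen)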
sglang/srt/managers/tp_worker.py
CHANGED
sglang/srt/managers/utils.py
CHANGED
@@ -1,9 +1,16 @@
+from __future__ import annotations
+
 import logging
 import multiprocessing as mp
 from http import HTTPStatus
-from typing import Dict, List, Optional
+from typing import TYPE_CHECKING, Dict, List, Optional
 
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.schedule_batch import FINISH_ABORT, Req
+from sglang.srt.model_executor.forward_batch_info import PPProxyTensors
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.scheduler import GenerationBatchResult
 
 logger = logging.getLogger(__name__)
 
@@ -41,6 +48,57 @@ def validate_input_length(
     return None
 
 
+def get_logprob_dict_from_result(result: GenerationBatchResult) -> dict:
+
+    logits_output = result.logits_output
+    assert logits_output is not None
+
+    return {
+        "extend_input_len_per_req": result.extend_input_len_per_req,
+        "extend_logprob_start_len_per_req": result.extend_logprob_start_len_per_req,
+        "next_token_logprobs": result.logits_output.next_token_logprobs,
+        "next_token_top_logprobs_val": result.logits_output.next_token_top_logprobs_val,
+        "next_token_top_logprobs_idx": result.logits_output.next_token_top_logprobs_idx,
+        "next_token_token_ids_logprobs_val": result.logits_output.next_token_token_ids_logprobs_val,
+        "next_token_token_ids_logprobs_idx": result.logits_output.next_token_token_ids_logprobs_idx,
+        "input_token_logprobs": result.logits_output.input_token_logprobs,
+        "input_top_logprobs_val": result.logits_output.input_top_logprobs_val,
+        "input_top_logprobs_idx": result.logits_output.input_top_logprobs_idx,
+        "input_token_ids_logprobs_val": result.logits_output.input_token_ids_logprobs_val,
+        "input_token_ids_logprobs_idx": result.logits_output.input_token_ids_logprobs_idx,
+    }
+
+
+def get_logprob_from_pp_outputs(
+    next_pp_outputs: PPProxyTensors,
+) -> tuple[LogitsProcessorOutput, list[int], list[int]]:
+    logits_output = LogitsProcessorOutput(
+        # Do not send logits and hidden states because they are large
+        next_token_logits=None,
+        hidden_states=None,
+        next_token_logprobs=next_pp_outputs["next_token_logprobs"],
+        next_token_top_logprobs_val=next_pp_outputs["next_token_top_logprobs_val"],
+        next_token_top_logprobs_idx=next_pp_outputs["next_token_top_logprobs_idx"],
+        next_token_token_ids_logprobs_val=next_pp_outputs[
+            "next_token_token_ids_logprobs_val"
+        ],
+        next_token_token_ids_logprobs_idx=next_pp_outputs[
+            "next_token_token_ids_logprobs_idx"
+        ],
+        input_token_logprobs=next_pp_outputs["input_token_logprobs"],
+        input_top_logprobs_val=next_pp_outputs["input_top_logprobs_val"],
+        input_top_logprobs_idx=next_pp_outputs["input_top_logprobs_idx"],
+        input_token_ids_logprobs_val=next_pp_outputs["input_token_ids_logprobs_val"],
+        input_token_ids_logprobs_idx=next_pp_outputs["input_token_ids_logprobs_idx"],
+    )
+    extend_input_len_per_req = next_pp_outputs["extend_input_len_per_req"]
+    extend_logprob_start_len_per_req = next_pp_outputs[
+        "extend_logprob_start_len_per_req"
+    ]
+
+    return logits_output, extend_input_len_per_req, extend_logprob_start_len_per_req
+
+
 class DPBalanceMeta:
     """
     This class will be use in scheduler and dp controller
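Note: the two new helpers are shaped as inverses for pipeline parallelism: the stage that owns the logits packs the logprob fields of a GenerationBatchResult into a flat dict, and the receiving stage rebuilds a LogitsProcessorOutput from the proxy tensors. A rough usage sketch, assuming PPProxyTensors wraps a dict of tensors the way its indexing above suggests:

    # Producer side (the pipeline stage that computed the logits):
    logprob_dict = get_logprob_dict_from_result(result)
    proxy = PPProxyTensors(logprob_dict)          # shipped across the PP boundary

    # Consumer side:
    logits_output, extend_lens, logprob_start_lens = get_logprob_from_pp_outputs(proxy)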
sglang/srt/mem_cache/allocator.py
CHANGED

@@ -434,15 +434,12 @@ class PagedTokenToKVPoolAllocator(BaseTokenToKVPoolAllocator):
         device: str,
         kvcache: KVCache,
         need_sort: bool,
-        max_num_extend_tokens: int,
     ):
         super().__init__(size, page_size, dtype, device, kvcache, need_sort)
         self.num_pages = size // page_size
-        self.max_num_extend_tokens_next_power_of_2 = next_power_of_2(
-            max_num_extend_tokens
-        )
         self.debug_mode = get_bool_env_var("SGLANG_DEBUG_MEMORY_POOL")
         self.ret_values = torch.empty((), dtype=torch.int64, device=self.device)
+        self.seen_max_num_extend_tokens_next_power_of_2 = 1
         self.clear()
 
     def alloc(self, need_size: int):
@@ -480,6 +477,11 @@ class PagedTokenToKVPoolAllocator(BaseTokenToKVPoolAllocator):
             (last_loc + 1) % self.page_size == prefix_lens % self.page_size
         )
 
+        self.seen_max_num_extend_tokens_next_power_of_2 = max(
+            self.seen_max_num_extend_tokens_next_power_of_2,
+            next_power_of_2(extend_num_tokens),
+        )
+
         bs = len(prefix_lens)
         if self.need_sort and extend_num_tokens // self.page_size + bs + 1 > len(
             self.free_pages
@@ -498,7 +500,7 @@ class PagedTokenToKVPoolAllocator(BaseTokenToKVPoolAllocator):
             self.ret_values,
             next_power_of_2(bs),
             self.page_size,
-            self.max_num_extend_tokens_next_power_of_2,
+            self.seen_max_num_extend_tokens_next_power_of_2,
         )
 
         if self.debug_mode:
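Note: the allocator no longer takes max_num_extend_tokens up front; it grows a running power-of-two bound from the batches it actually sees and passes that bound to the kernel. Because the bound only ever grows, previously specialized kernels stay valid. A minimal sketch of the tracking (next_power_of_2 is a stand-in for the helper sglang imports):

    def next_power_of_2(n: int) -> int:
        # Smallest power of two >= n, for n >= 1.
        return 1 << (n - 1).bit_length()

    seen_bound = 1
    for extend_num_tokens in (7, 130, 64):   # per-batch extend sizes
        seen_bound = max(seen_bound, next_power_of_2(extend_num_tokens))
    # seen_bound grew 8 -> 256 and then stayed at 256.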
sglang/srt/mem_cache/allocator_ascend.py
CHANGED

@@ -66,17 +66,6 @@ def alloc_extend_kernel_ascend(
 
 class AscendPagedTokenToKVPoolAllocator(PagedTokenToKVPoolAllocator):
 
-    def __init__(
-        self,
-        size: int,
-        page_size: int,
-        dtype: torch.dtype,
-        device: str,
-        kvcache: KVCache,
-        need_sort: bool,
-    ):
-        super().__init__(size, page_size, dtype, device, kvcache, need_sort, 1)
-
     def alloc_extend(
         self,
         prefix_lens: torch.Tensor,
sglang/srt/mem_cache/hicache_storage.py
CHANGED

@@ -13,6 +13,11 @@ from sglang.srt.distributed import (
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
 
 
 def get_hash_str(token_ids: List[int], prior_hash: str = None) -> str:
|
@@ -101,11 +106,16 @@ class HiCacheStorage(ABC):
|
|
101
106
|
|
102
107
|
class HiCacheFile(HiCacheStorage):
|
103
108
|
|
104
|
-
def __init__(self, file_path: str = "/tmp/hicache"):
|
109
|
+
def __init__(self, file_path: str = "/tmp/hicache", is_mla: bool = False):
|
105
110
|
self.file_path = os.getenv("SGLANG_HICACHE_FILE_BACKEND_STORAGE_DIR", file_path)
|
106
|
-
|
107
|
-
|
108
|
-
|
111
|
+
if is_dp_attention_enabled():
|
112
|
+
tp_rank = get_attention_tp_rank()
|
113
|
+
tp_size = get_attention_tp_size()
|
114
|
+
else:
|
115
|
+
tp_rank = get_tensor_model_parallel_rank()
|
116
|
+
tp_size = get_tensor_model_parallel_world_size()
|
117
|
+
|
118
|
+
self.tp_suffix = f"_{tp_rank}_{tp_size}" if tp_size > 1 and not is_mla else ""
|
109
119
|
if not os.path.exists(self.file_path) and tp_rank == 0:
|
110
120
|
os.makedirs(self.file_path)
|
111
121
|
logger.info(f"Created HiCacheFile storage directory at {self.file_path}")
|
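Note: the is_mla flag drops the per-rank suffix because an MLA model's compressed KV cache is replicated across tensor-parallel ranks, so one on-disk copy can serve all of them, while MHA caches are sharded per rank and keep the suffix. A toy view of the resulting key shapes (values are hypothetical):

    def cache_key(key: str, tp_rank: int, tp_size: int, is_mla: bool) -> str:
        # Mirrors the tp_suffix rule from the diff.
        tp_suffix = f"_{tp_rank}_{tp_size}" if tp_size > 1 and not is_mla else ""
        return f"{key}{tp_suffix}"

    print(cache_key("abc123", 1, 4, is_mla=False))  # abc123_1_4  (per-rank shard)
    print(cache_key("abc123", 1, 4, is_mla=True))   # abc123      (shared copy)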
sglang/srt/mem_cache/memory_pool.py
CHANGED

@@ -849,7 +849,7 @@ class MLATokenToKVPool(KVCache):
             cache_k_rope = cache_k_rope.view(self.store_dtype)
 
         set_mla_kv_buffer_triton(
-            self.kv_buffer[layer_id], loc, cache_k_nope, cache_k_rope
+            self.kv_buffer[layer_id - self.start_layer], loc, cache_k_nope, cache_k_rope
         )
 
     def get_cpu_copy(self, indices):
|
@@ -951,7 +951,7 @@ class AscendMLAPagedTokenToKVPool(MLATokenToKVPool):
|
|
951
951
|
cache_k = cache_k.to(self.dtype)
|
952
952
|
|
953
953
|
if self.store_dtype != self.dtype:
|
954
|
-
cache_k = cache_k.view(store_dtype)
|
954
|
+
cache_k = cache_k.view(self.store_dtype)
|
955
955
|
|
956
956
|
import torch_npu
|
957
957
|
|
@@ -1070,7 +1070,7 @@ def copy_all_layer_kv_cache(
     num_loop = tl.cdiv(stride, BLOCK_SIZE)
     for i in range(num_loop):
         copy_offset = tl.arange(0, BLOCK_SIZE) + i * BLOCK_SIZE
-        mask = (num_locs_offset < num_locs)[:, None]
+        mask = (num_locs_offset < num_locs)[:, None] & (copy_offset < stride)[None, :]
         value = tl.load(
             data_ptr + src_locs[:, None] * stride + copy_offset[None, :], mask=mask
         )
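Note: the old mask only guarded the row (location) dimension. On the last loop iteration copy_offset runs past stride whenever stride is not a multiple of BLOCK_SIZE, so loads and stores touched out-of-bounds memory; the added (copy_offset < stride) term masks the column dimension as well. A quick count of the offsets that need masking:

    import math

    stride, BLOCK_SIZE = 100, 64
    num_loop = math.ceil(stride / BLOCK_SIZE)        # tl.cdiv -> 2 iterations
    for i in range(num_loop):
        offsets = range(i * BLOCK_SIZE, (i + 1) * BLOCK_SIZE)
        oob = sum(1 for off in offsets if off >= stride)
        print(f"iteration {i}: {oob} offsets past the row end")
    # iteration 0: 0; iteration 1: 28 (offsets 100..127 must be masked)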
sglang/srt/mem_cache/memory_pool_host.py
CHANGED

@@ -7,6 +7,7 @@ from functools import wraps
 import psutil
 import torch
 
+from sglang.srt.distributed import get_tensor_model_parallel_rank
 from sglang.srt.mem_cache.memory_pool import KVCache, MHATokenToKVPool, MLATokenToKVPool
 from sglang.srt.utils import is_npu
 
@@ -307,6 +308,9 @@ class MHATokenToKVPoolHost(HostKVCache):
 
         return self.head_dim * self.head_num * self.layer_num * self.dtype.itemsize * 2
 
+    def get_ksize_per_token(self):
+        return self.get_size_per_token() // 2
+
     def init_kv_buffer(self):
         if self.layout == "layer_first":
             dims = (2, self.layer_num, self.size, self.head_num, self.head_dim)
@@ -484,8 +488,8 @@ class MHATokenToKVPoolHost(HostKVCache):
             ptr_list.append(k_ptr)
             ptr_list.append(v_ptr)
             key_ = keys[index // self.page_size]
-            key_list.append(f"{key_}_k")
-            key_list.append(f"{key_}_v")
+            key_list.append(f"{key_}_{get_tensor_model_parallel_rank()}_k")
+            key_list.append(f"{key_}_{get_tensor_model_parallel_rank()}_v")
             element_size = (
                 self.layer_num
                 * self.dtype.itemsize
@@ -496,6 +500,21 @@ class MHATokenToKVPoolHost(HostKVCache):
         element_size_list = [element_size] * len(key_list)
         return key_list, ptr_list, element_size_list
 
+    def get_buffer_with_hash(self, keys, indices):
+        assert self.layout == "page_first"
+        assert len(keys) == (len(indices) // self.page_size)
+
+        key_list = []
+        buf_list = []
+
+        for key, i in zip(keys, range(0, len(indices), self.page_size)):
+            key_list.append(f"{key}-k")
+            buf_list.append(self.k_buffer[i : i + self.page_size])
+            key_list.append(f"{key}-v")
+            buf_list.append(self.v_buffer[i : i + self.page_size])
+
+        return key_list, buf_list
+
 
 class MLATokenToKVPoolHost(HostKVCache):
     device_pool: MLATokenToKVPool
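Note: get_buffer_with_hash relies on the page_first layout, where a page's tokens sit contiguously in the host buffer, so each page can be handed out as one slice paired with per-page "-k"/"-v" keys. A toy sketch of the pairing, with 1-D lists standing in for the real tensors:

    page_size = 4
    indices = list(range(12))                 # three pages of token slots
    keys = ["h0", "h1", "h2"]                 # one hash per page
    k_buffer = [f"k{i}" for i in range(12)]   # stand-in for the host K buffer

    key_list, buf_list = [], []
    for key, i in zip(keys, range(0, len(indices), page_size)):
        key_list.append(f"{key}-k")
        buf_list.append(k_buffer[i : i + page_size])

    print(key_list[0], buf_list[0])           # h0-k ['k0', 'k1', 'k2', 'k3']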
@@ -538,6 +557,9 @@ class MLATokenToKVPoolHost(HostKVCache):
             * self.layer_num
         )
 
+    def get_ksize_per_token(self):
+        return self.get_size_per_token()
+
     def init_kv_buffer(self):
         if self.layout == "layer_first":
             dims = (
@@ -704,3 +726,14 @@ class MLATokenToKVPoolHost(HostKVCache):
         )
         element_size_list = [element_size] * len(key_list)
         return key_list, ptr_list, element_size_list
+
+    def get_buffer_with_hash(self, keys, indices):
+        assert self.layout == "page_first"
+        assert len(keys) == (len(indices) // self.page_size)
+
+        buf_list = []
+
+        for i in range(0, len(indices), self.page_size):
+            buf_list.append(self.kv_buffer[i : i + self.page_size])
+
+        return keys, buf_list
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py
CHANGED

@@ -7,10 +7,15 @@ import signal
 import threading
 from abc import ABC, abstractmethod
 from functools import wraps
-from typing import List, Optional, Tuple
+from typing import Any, List, Optional, Tuple
 
 import torch
 
+from sglang.srt.distributed import get_tensor_model_parallel_rank
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_rank,
+    is_dp_attention_enabled,
+)
 from sglang.srt.mem_cache.hicache_storage import HiCacheStorage
 from sglang.srt.mem_cache.storage.hf3fs.client_hf3fs import Hf3fsClient
 
|
@@ -167,13 +172,20 @@ class HiCacheHF3FS(HiCacheStorage):
|
|
167
172
|
|
168
173
|
@staticmethod
|
169
174
|
def from_env_config(
|
170
|
-
|
175
|
+
bytes_per_page: int, dtype: torch.dtype, rank: int = None
|
171
176
|
) -> "HiCacheHF3FS":
|
172
177
|
from sglang.srt.mem_cache.storage.hf3fs.mini_3fs_metadata_server import (
|
173
178
|
Hf3fsGlobalMetadataClient,
|
174
179
|
Hf3fsLocalMetadataClient,
|
175
180
|
)
|
176
181
|
|
182
|
+
if rank is None:
|
183
|
+
rank = (
|
184
|
+
get_attention_tp_rank()
|
185
|
+
if is_dp_attention_enabled()
|
186
|
+
else get_tensor_model_parallel_rank()
|
187
|
+
)
|
188
|
+
|
177
189
|
config_path = os.getenv(HiCacheHF3FS.default_env_var)
|
178
190
|
if not config_path:
|
179
191
|
return HiCacheHF3FS(
|
@@ -228,15 +240,23 @@ class HiCacheHF3FS(HiCacheStorage):
         )
 
     def get(
-        self, key: str, target_location: Optional[torch.Tensor] = None
+        self,
+        key: str,
+        target_location: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
     ) -> torch.Tensor | None:
-        return self.batch_get([key], [target_location])[0]
+        return self.batch_get(
+            [key],
+            [target_location] if target_location is not None else None,
+            [target_sizes] if target_sizes is not None else None,
+        )[0]
 
     @synchronized()
     def batch_get(
         self,
         keys: List[str],
-        target_locations: Optional[List[torch.Tensor]] = None,
+        target_locations: Optional[Any] = None,
+        target_sizes: Optional[Any] = None,
     ) -> List[torch.Tensor | None]:
         page_indices = self.metadata_client.get_page_indices(self.rank, keys)
 
@@ -246,9 +266,15 @@ class HiCacheHF3FS(HiCacheStorage):
             batch_indices.append(i)
             file_offsets.append(page_index * self.bytes_per_page)
 
-        file_results = [
-            torch.empty(self.numel, dtype=self.dtype) for _ in range(len(batch_indices))
-        ]
+        if target_locations is not None:
+            for target_location in target_locations:
+                assert target_location.is_contiguous()
+            file_results = target_locations
+        else:
+            file_results = [
+                torch.empty(self.numel, dtype=self.dtype)
+                for _ in range(len(batch_indices))
+            ]
 
         futures = [
             self.executor.submit(
|
254
280
|
self.executor.submit(
|
@@ -273,10 +299,27 @@ class HiCacheHF3FS(HiCacheStorage):
|
|
273
299
|
|
274
300
|
return results
|
275
301
|
|
276
|
-
def set(
|
277
|
-
|
302
|
+
def set(
|
303
|
+
self,
|
304
|
+
key: str,
|
305
|
+
value: Optional[Any] = None,
|
306
|
+
target_location: Optional[Any] = None,
|
307
|
+
target_sizes: Optional[Any] = None,
|
308
|
+
) -> bool:
|
309
|
+
return self.batch_set(
|
310
|
+
[key],
|
311
|
+
[value] if value is not None else None,
|
312
|
+
[target_location] if target_location is not None else None,
|
313
|
+
[target_sizes] if target_sizes is not None else None,
|
314
|
+
)
|
278
315
|
|
279
|
-
def batch_set(
|
316
|
+
def batch_set(
|
317
|
+
self,
|
318
|
+
keys: List[str],
|
319
|
+
values: Optional[Any] = None,
|
320
|
+
target_locations: Optional[Any] = None,
|
321
|
+
target_sizes: Optional[Any] = None,
|
322
|
+
) -> bool:
|
280
323
|
# Todo: Add prefix block's hash key
|
281
324
|
key_with_prefix = [(key, "") for key in keys]
|
282
325
|
indices = self.metadata_client.reserve_and_allocate_page_indices(
|
@@ -292,7 +335,8 @@ class HiCacheHF3FS(HiCacheStorage):
 
             batch_indices.append(i)
             file_offsets.append(page_index * self.bytes_per_page)
-            file_values.append(value)
+            assert value.is_contiguous()
+            file_values.append(value)
 
         futures = [
             self.executor.submit(
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py
CHANGED

@@ -19,14 +19,13 @@ logger = logging.getLogger(__name__)
 
 
 def get_hash_str_mooncake(token_ids: List[int], prior_hash: str = None):
-    local_rank = get_tensor_model_parallel_rank()
     prefix_str = ""
     if prior_hash:
         prefix_str = hashlib.sha256(prior_hash.encode()).hexdigest()
     current_token_ids_bytes = np.array(token_ids).tobytes()
     current_hash_object = hashlib.sha256(current_token_ids_bytes)
     current_hash_hex = current_hash_object.hexdigest()
-    return f"{prefix_str}_{int(current_hash_hex[:16], 16)}_{local_rank}"
+    return f"{prefix_str}_{int(current_hash_hex[:16], 16)}"
 
 
 @dataclass
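Note: get_hash_str_mooncake chains page hashes, so a page's key depends on its whole prefix; removing the local_rank suffix moves rank-awareness out of the hash and into key assembly (see exists() below). A runnable sketch of the chaining:

    import hashlib
    import numpy as np

    def hash_page(token_ids, prior_hash=None):
        prefix_str = ""
        if prior_hash:
            prefix_str = hashlib.sha256(prior_hash.encode()).hexdigest()
        current = hashlib.sha256(np.array(token_ids).tobytes()).hexdigest()
        return f"{prefix_str}_{int(current[:16], 16)}"

    h0 = hash_page([1, 2, 3, 4])
    h1 = hash_page([5, 6, 7, 8], prior_hash=h0)   # key folds in the prefix
    assert h1 != hash_page([5, 6, 7, 8])          # same page, different prefix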
@@ -97,7 +96,7 @@ class MooncakeStoreConfig:
 
 
 class MooncakeStore(HiCacheStorage):
-    def __init__(self):
+    def __init__(self, is_mla: bool = False):
         try:
             from mooncake.store import MooncakeDistributedStore
         except ImportError as e:
@@ -127,6 +126,7 @@ class MooncakeStore(HiCacheStorage):
             logger.info("Connect to Mooncake store successfully.")
             self.warmup()
             logger.info("Mooncake store warmup successfully.")
+            self.is_mla = is_mla
 
         except ValueError as e:
             logger.error("Configuration loading failed: %s", e)
@@ -223,11 +223,15 @@ class MooncakeStore(HiCacheStorage):
 
     def exists(self, keys) -> bool | dict:
         _keys = []
+        local_rank = get_tensor_model_parallel_rank()
         for key in keys:
             if key is None:
                 return None
 
-            _keys.append(f"{key}_k")
+            if self.is_mla:
+                _keys.append(f"{key}_k")
+            else:
+                _keys.append(f"{key}_{local_rank}_k")
         result = {k: v for k, v in zip(keys, self.store.batch_is_exist(_keys))}
         return result
 
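Note: together with the is_mla changes above, Mooncake keys now carry the rank only for MHA-style caches: MLA's latent KV is replicated across tensor-parallel ranks, so one stored copy per page is probed, while MHA shards differ per rank and need distinct keys. A toy view of the probe-key assembly mirrored from exists():

    def probe_key(page_hash: str, local_rank: int, is_mla: bool) -> str:
        return f"{page_hash}_k" if is_mla else f"{page_hash}_{local_rank}_k"

    print(probe_key("9f3a42", local_rank=2, is_mla=True))    # 9f3a42_k
    print(probe_key("9f3a42", local_rank=2, is_mla=False))   # 9f3a42_2_k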