sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (238)
  1. sglang/bench_one_batch_server.py +10 -1
  2. sglang/bench_serving.py +257 -29
  3. sglang/srt/configs/__init__.py +4 -0
  4. sglang/srt/configs/device_config.py +3 -1
  5. sglang/srt/configs/dots_vlm.py +139 -0
  6. sglang/srt/configs/load_config.py +1 -0
  7. sglang/srt/configs/model_config.py +50 -6
  8. sglang/srt/configs/qwen3_next.py +326 -0
  9. sglang/srt/connector/__init__.py +8 -1
  10. sglang/srt/connector/remote_instance.py +82 -0
  11. sglang/srt/constrained/base_grammar_backend.py +48 -12
  12. sglang/srt/constrained/llguidance_backend.py +0 -1
  13. sglang/srt/constrained/outlines_backend.py +0 -1
  14. sglang/srt/constrained/xgrammar_backend.py +28 -9
  15. sglang/srt/custom_op.py +11 -1
  16. sglang/srt/debug_utils/dump_comparator.py +81 -44
  17. sglang/srt/debug_utils/dump_loader.py +97 -0
  18. sglang/srt/debug_utils/dumper.py +11 -3
  19. sglang/srt/debug_utils/text_comparator.py +73 -11
  20. sglang/srt/disaggregation/base/conn.py +1 -1
  21. sglang/srt/disaggregation/common/conn.py +15 -12
  22. sglang/srt/disaggregation/decode.py +21 -10
  23. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
  24. sglang/srt/disaggregation/fake/conn.py +1 -1
  25. sglang/srt/disaggregation/mini_lb.py +6 -445
  26. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  27. sglang/srt/disaggregation/nixl/conn.py +180 -16
  28. sglang/srt/disaggregation/prefill.py +5 -3
  29. sglang/srt/disaggregation/utils.py +5 -50
  30. sglang/srt/distributed/parallel_state.py +24 -3
  31. sglang/srt/entrypoints/engine.py +38 -17
  32. sglang/srt/entrypoints/grpc_request_manager.py +580 -0
  33. sglang/srt/entrypoints/grpc_server.py +680 -0
  34. sglang/srt/entrypoints/http_server.py +85 -54
  35. sglang/srt/entrypoints/openai/protocol.py +4 -1
  36. sglang/srt/entrypoints/openai/serving_base.py +46 -3
  37. sglang/srt/entrypoints/openai/serving_chat.py +36 -16
  38. sglang/srt/entrypoints/openai/serving_completions.py +12 -3
  39. sglang/srt/entrypoints/openai/serving_embedding.py +8 -3
  40. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  41. sglang/srt/entrypoints/openai/serving_responses.py +6 -3
  42. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  43. sglang/srt/eplb/eplb_manager.py +2 -2
  44. sglang/srt/eplb/expert_distribution.py +26 -13
  45. sglang/srt/eplb/expert_location.py +8 -3
  46. sglang/srt/eplb/expert_location_updater.py +1 -1
  47. sglang/srt/function_call/base_format_detector.py +3 -6
  48. sglang/srt/function_call/ebnf_composer.py +11 -9
  49. sglang/srt/function_call/function_call_parser.py +6 -0
  50. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  51. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  52. sglang/srt/grpc/__init__.py +1 -0
  53. sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
  54. sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
  55. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
  56. sglang/srt/hf_transformers_utils.py +4 -0
  57. sglang/srt/layers/activation.py +142 -9
  58. sglang/srt/layers/attention/ascend_backend.py +11 -4
  59. sglang/srt/layers/attention/fla/chunk.py +242 -0
  60. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  61. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  62. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  63. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  64. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  65. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  66. sglang/srt/layers/attention/fla/index.py +37 -0
  67. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  68. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  69. sglang/srt/layers/attention/fla/op.py +66 -0
  70. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  71. sglang/srt/layers/attention/fla/utils.py +331 -0
  72. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  73. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  74. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  75. sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
  76. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  77. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  78. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  79. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  80. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  81. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  82. sglang/srt/layers/attention/triton_backend.py +18 -1
  83. sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
  84. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  85. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  86. sglang/srt/layers/dp_attention.py +30 -1
  87. sglang/srt/layers/layernorm.py +32 -15
  88. sglang/srt/layers/linear.py +34 -3
  89. sglang/srt/layers/logits_processor.py +29 -10
  90. sglang/srt/layers/moe/__init__.py +2 -1
  91. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  92. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  93. sglang/srt/layers/moe/ep_moe/layer.py +182 -62
  94. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
  95. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  96. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  97. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  100. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  101. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  102. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  103. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  104. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  105. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  106. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  107. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
  108. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  109. sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
  110. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  111. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  112. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  113. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  114. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  115. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  116. sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
  117. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  118. sglang/srt/layers/moe/topk.py +30 -9
  119. sglang/srt/layers/moe/utils.py +12 -6
  120. sglang/srt/layers/quantization/awq.py +19 -7
  121. sglang/srt/layers/quantization/base_config.py +11 -6
  122. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  123. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  124. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  125. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  126. sglang/srt/layers/quantization/fp8.py +76 -47
  127. sglang/srt/layers/quantization/fp8_utils.py +50 -31
  128. sglang/srt/layers/quantization/gptq.py +25 -17
  129. sglang/srt/layers/quantization/modelopt_quant.py +147 -47
  130. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  131. sglang/srt/layers/quantization/mxfp4.py +64 -40
  132. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  133. sglang/srt/layers/quantization/unquant.py +135 -47
  134. sglang/srt/layers/quantization/w4afp8.py +30 -17
  135. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  136. sglang/srt/layers/quantization/w8a8_int8.py +76 -38
  137. sglang/srt/layers/sampler.py +162 -18
  138. sglang/srt/lora/backend/base_backend.py +50 -8
  139. sglang/srt/lora/backend/triton_backend.py +90 -2
  140. sglang/srt/lora/layers.py +32 -0
  141. sglang/srt/lora/lora.py +4 -1
  142. sglang/srt/lora/lora_manager.py +35 -112
  143. sglang/srt/lora/mem_pool.py +24 -10
  144. sglang/srt/lora/utils.py +18 -9
  145. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  146. sglang/srt/managers/cache_controller.py +158 -160
  147. sglang/srt/managers/data_parallel_controller.py +105 -35
  148. sglang/srt/managers/detokenizer_manager.py +8 -4
  149. sglang/srt/managers/disagg_service.py +46 -0
  150. sglang/srt/managers/io_struct.py +199 -12
  151. sglang/srt/managers/mm_utils.py +1 -0
  152. sglang/srt/managers/multi_tokenizer_mixin.py +350 -400
  153. sglang/srt/managers/schedule_batch.py +77 -56
  154. sglang/srt/managers/schedule_policy.py +1 -1
  155. sglang/srt/managers/scheduler.py +187 -39
  156. sglang/srt/managers/scheduler_metrics_mixin.py +4 -3
  157. sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
  158. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  159. sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
  160. sglang/srt/managers/tokenizer_manager.py +259 -519
  161. sglang/srt/managers/tp_worker.py +53 -4
  162. sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
  163. sglang/srt/mem_cache/hicache_storage.py +3 -23
  164. sglang/srt/mem_cache/hiradix_cache.py +103 -43
  165. sglang/srt/mem_cache/memory_pool.py +347 -48
  166. sglang/srt/mem_cache/memory_pool_host.py +105 -46
  167. sglang/srt/mem_cache/radix_cache.py +0 -2
  168. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  169. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  170. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +86 -4
  171. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  172. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  173. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +49 -7
  174. sglang/srt/mem_cache/swa_radix_cache.py +0 -2
  175. sglang/srt/metrics/collector.py +493 -76
  176. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  177. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  178. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  179. sglang/srt/model_executor/forward_batch_info.py +59 -2
  180. sglang/srt/model_executor/model_runner.py +356 -29
  181. sglang/srt/model_loader/__init__.py +9 -3
  182. sglang/srt/model_loader/loader.py +128 -4
  183. sglang/srt/model_loader/weight_utils.py +2 -1
  184. sglang/srt/models/apertus.py +686 -0
  185. sglang/srt/models/bailing_moe.py +798 -218
  186. sglang/srt/models/bailing_moe_nextn.py +168 -0
  187. sglang/srt/models/deepseek_v2.py +109 -15
  188. sglang/srt/models/dots_vlm.py +174 -0
  189. sglang/srt/models/dots_vlm_vit.py +337 -0
  190. sglang/srt/models/ernie4.py +1 -1
  191. sglang/srt/models/gemma3n_mm.py +1 -1
  192. sglang/srt/models/glm4_moe.py +1 -1
  193. sglang/srt/models/glm4v.py +4 -2
  194. sglang/srt/models/glm4v_moe.py +3 -0
  195. sglang/srt/models/gpt_oss.py +1 -1
  196. sglang/srt/models/llama4.py +9 -0
  197. sglang/srt/models/llama_eagle3.py +13 -0
  198. sglang/srt/models/longcat_flash.py +2 -2
  199. sglang/srt/models/mllama4.py +25 -0
  200. sglang/srt/models/opt.py +637 -0
  201. sglang/srt/models/qwen2.py +7 -0
  202. sglang/srt/models/qwen2_5_vl.py +27 -3
  203. sglang/srt/models/qwen2_moe.py +56 -12
  204. sglang/srt/models/qwen3_moe.py +1 -1
  205. sglang/srt/models/qwen3_next.py +1042 -0
  206. sglang/srt/models/qwen3_next_mtp.py +112 -0
  207. sglang/srt/models/step3_vl.py +1 -1
  208. sglang/srt/multimodal/processors/dots_vlm.py +99 -0
  209. sglang/srt/multimodal/processors/glm4v.py +9 -9
  210. sglang/srt/multimodal/processors/internvl.py +141 -129
  211. sglang/srt/multimodal/processors/qwen_vl.py +15 -5
  212. sglang/srt/offloader.py +27 -3
  213. sglang/srt/remote_instance_weight_loader_utils.py +69 -0
  214. sglang/srt/sampling/sampling_batch_info.py +18 -15
  215. sglang/srt/server_args.py +276 -35
  216. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  217. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  218. sglang/srt/speculative/eagle_utils.py +0 -2
  219. sglang/srt/speculative/eagle_worker.py +43 -4
  220. sglang/srt/speculative/spec_info.py +5 -0
  221. sglang/srt/speculative/standalone_worker.py +109 -0
  222. sglang/srt/tracing/trace.py +552 -0
  223. sglang/srt/utils.py +34 -3
  224. sglang/srt/weight_sync/utils.py +1 -1
  225. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  226. sglang/test/runners.py +4 -0
  227. sglang/test/test_cutlass_moe.py +24 -6
  228. sglang/test/test_disaggregation_utils.py +66 -0
  229. sglang/test/test_fp4_moe.py +370 -1
  230. sglang/test/test_utils.py +28 -1
  231. sglang/utils.py +11 -0
  232. sglang/version.py +1 -1
  233. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
  234. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +237 -178
  235. sglang/srt/disaggregation/launch_lb.py +0 -118
  236. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
  237. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
  238. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0

sglang/srt/managers/tp_worker.py

@@ -12,10 +12,11 @@
  # limitations under the License.
  # ==============================================================================
  """A tensor parallel worker."""
+ from __future__ import annotations

  import logging
  import threading
- from typing import Optional, Tuple, Union
+ from typing import TYPE_CHECKING, Optional, Tuple, Union

  import torch

@@ -29,8 +30,10 @@ from sglang.srt.hf_transformers_utils import (
  from sglang.srt.layers.logits_processor import LogitsProcessorOutput
  from sglang.srt.managers.io_struct import (
  GetWeightsByNameReqInput,
+ InitWeightsSendGroupForRemoteInstanceReqInput,
  InitWeightsUpdateGroupReqInput,
  LoadLoRAAdapterReqInput,
+ SendWeightsToRemoteInstanceReqInput,
  UnloadLoRAAdapterReqInput,
  UpdateWeightFromDiskReqInput,
  UpdateWeightsFromDistributedReqInput,
@@ -45,6 +48,9 @@ from sglang.srt.patch_torch import monkey_patch_torch_reductions
  from sglang.srt.server_args import ServerArgs
  from sglang.srt.utils import MultiprocessingSerializer, broadcast_pyobj, set_random_seed

+ if TYPE_CHECKING:
+ from sglang.srt.managers.cache_controller import LayerDoneCounter
+
  logger = logging.getLogger(__name__)


@@ -78,7 +84,13 @@ class TpModelWorker:
  if not is_draft_worker
  else server_args.speculative_draft_model_path
  ),
+ model_revision=(
+ server_args.revision
+ if not is_draft_worker
+ else server_args.speculative_draft_model_revision
+ ),
  is_draft_model=is_draft_worker,
+ tp_rank=tp_rank,
  )

  self.model_runner = ModelRunner(
@@ -137,7 +149,7 @@
  assert self.max_running_requests > 0, "max_running_request is zero"
  self.max_queued_requests = server_args.max_queued_requests
  assert (
- self.max_running_requests > 0
+ self.max_queued_requests > 0
  ), "max_queued_requests is zero. We need to be at least 1 to schedule a request."
  self.max_req_len = min(
  self.model_config.context_len - 1,
@@ -162,10 +174,10 @@

  self.hicache_layer_transfer_counter = None

- def register_hicache_layer_transfer_counter(self, counter):
+ def register_hicache_layer_transfer_counter(self, counter: LayerDoneCounter):
  self.hicache_layer_transfer_counter = counter

- def set_hicache_consumer(self, consumer_index):
+ def set_hicache_consumer(self, consumer_index: int):
  if self.hicache_layer_transfer_counter is not None:
  self.hicache_layer_transfer_counter.set_consumer(consumer_index)

@@ -225,6 +237,9 @@
  ) -> Tuple[
  Union[LogitsProcessorOutput, torch.Tensor], Optional[torch.Tensor], bool
  ]:
+ # update the consumer index of hicache to the running batch
+ self.set_hicache_consumer(model_worker_batch.hicache_consumer_index)
+
  forward_batch = ForwardBatch.init_new(model_worker_batch, self.model_runner)

  pp_proxy_tensors = None
@@ -244,6 +259,15 @@

  if skip_sample:
  next_token_ids = None
+ # For prefill-only requests, we still need to compute logprobs even when sampling is skipped
+ if (
+ model_worker_batch.is_prefill_only
+ and model_worker_batch.return_logprob
+ ):
+ # Compute logprobs without full sampling
+ self.model_runner.compute_logprobs_only(
+ logits_output, model_worker_batch
+ )
  else:
  next_token_ids = self.model_runner.sample(
  logits_output, model_worker_batch
@@ -280,6 +304,31 @@
  )
  return success, message

+ def init_weights_send_group_for_remote_instance(
+ self, recv_req: InitWeightsSendGroupForRemoteInstanceReqInput
+ ):
+ success, message = (
+ self.model_runner.init_weights_send_group_for_remote_instance(
+ recv_req.master_address,
+ recv_req.ports,
+ recv_req.group_rank,
+ recv_req.world_size,
+ recv_req.group_name,
+ recv_req.backend,
+ )
+ )
+ return success, message
+
+ def send_weights_to_remote_instance(
+ self, recv_req: SendWeightsToRemoteInstanceReqInput
+ ):
+ success, message = self.model_runner.send_weights_to_remote_instance(
+ recv_req.master_address,
+ recv_req.ports,
+ recv_req.group_name,
+ )
+ return success, message
+
  def update_weights_from_distributed(
  self, recv_req: UpdateWeightsFromDistributedReqInput
  ):
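
The prefill-only branch above calls the new `ModelRunner.compute_logprobs_only`, so scoring-style requests get log-probabilities without drawing next tokens. As a rough, standalone illustration of that idea (not the actual ModelRunner code, whose signature is only visible here through this call site):

```python
import torch

def prompt_logprobs(logits: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor:
    """Gather per-token log-probabilities without any sampling step.

    logits:    [seq_len, vocab_size] prefill logits
    token_ids: [seq_len] tokens whose log-probabilities are requested
    """
    logprobs = torch.log_softmax(logits.float(), dim=-1)
    return logprobs.gather(-1, token_ids.unsqueeze(-1)).squeeze(-1)
```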

sglang/srt/managers/tp_worker_overlap_thread.py

@@ -12,21 +12,24 @@
  # limitations under the License.
  # ==============================================================================
  """A tensor parallel worker."""
+ from __future__ import annotations

  import dataclasses
  import logging
  import signal
  import threading
  from queue import Queue
- from typing import Optional, Tuple
+ from typing import TYPE_CHECKING, List, Optional, Tuple

  import psutil
  import torch

  from sglang.srt.managers.io_struct import (
  GetWeightsByNameReqInput,
+ InitWeightsSendGroupForRemoteInstanceReqInput,
  InitWeightsUpdateGroupReqInput,
  LoadLoRAAdapterReqInput,
+ SendWeightsToRemoteInstanceReqInput,
  UnloadLoRAAdapterReqInput,
  UpdateWeightFromDiskReqInput,
  UpdateWeightsFromDistributedReqInput,
@@ -38,6 +41,9 @@ from sglang.srt.server_args import ServerArgs
  from sglang.srt.utils import DynamicGradMode, get_compiler_backend
  from sglang.utils import get_exception_traceback

+ if TYPE_CHECKING:
+ from sglang.srt.managers.cache_controller import LayerDoneCounter
+
  logger = logging.getLogger(__name__)


@@ -79,7 +85,7 @@ class TpModelWorkerClient:
  )

  # Launch threads
- self.input_queue = Queue()
+ self.input_queue = Queue[Tuple[ModelWorkerBatch, int, torch.Event]]()
  self.output_queue = Queue()
  self.forward_stream = torch.get_device_module(self.device).Stream()
  self.forward_thread = threading.Thread(
@@ -93,13 +99,9 @@

  self.hicache_layer_transfer_counter = None

- def register_hicache_layer_transfer_counter(self, counter):
+ def register_hicache_layer_transfer_counter(self, counter: LayerDoneCounter):
  self.hicache_layer_transfer_counter = counter

- def set_hicache_consumer(self, consumer_index):
- if self.hicache_layer_transfer_counter is not None:
- self.hicache_layer_transfer_counter.set_consumer(consumer_index)
-
  def get_worker_info(self):
  return self.worker.get_worker_info()

@@ -147,7 +149,7 @@
  @DynamicGradMode()
  def forward_thread_func_(self):
  batch_pt = 0
- batch_lists = [None] * 2
+ batch_lists: List = [None] * 2

  while True:
  model_worker_batch, future_token_ids_ct, sync_event = self.input_queue.get()
@@ -169,26 +171,31 @@
  input_ids = model_worker_batch.input_ids
  resolve_future_token_ids(input_ids, self.future_token_ids_map)

- # update the consumer index of hicache to the running batch
- self.set_hicache_consumer(model_worker_batch.hicache_consumer_index)
  # Run forward
  logits_output, next_token_ids, can_run_cuda_graph = (
  self.worker.forward_batch_generation(
- model_worker_batch, model_worker_batch.launch_done
+ model_worker_batch,
+ model_worker_batch.launch_done,
+ # Skip sampling for prefill-only requests
+ skip_sample=model_worker_batch.is_prefill_only,
  )
  )

  # Update the future token ids map
  bs = len(model_worker_batch.seq_lens)
+ if model_worker_batch.is_prefill_only:
+ # For prefill-only requests, create dummy token IDs on CPU
+ next_token_ids = torch.zeros(bs, dtype=torch.long)
  self.future_token_ids_map[
  future_token_ids_ct + 1 : future_token_ids_ct + bs + 1
  ] = next_token_ids

  # Copy results to the CPU
  if model_worker_batch.return_logprob:
- logits_output.next_token_logprobs = (
- logits_output.next_token_logprobs.to("cpu", non_blocking=True)
- )
+ if logits_output.next_token_logprobs is not None:
+ logits_output.next_token_logprobs = (
+ logits_output.next_token_logprobs.to("cpu", non_blocking=True)
+ )
  if logits_output.input_token_logprobs is not None:
  logits_output.input_token_logprobs = (
  logits_output.input_token_logprobs.to("cpu", non_blocking=True)
@@ -197,7 +204,9 @@
  logits_output.hidden_states = logits_output.hidden_states.to(
  "cpu", non_blocking=True
  )
- next_token_ids = next_token_ids.to("cpu", non_blocking=True)
+ # Only copy to CPU if not already on CPU
+ if next_token_ids.device.type != "cpu":
+ next_token_ids = next_token_ids.to("cpu", non_blocking=True)
  copy_done.record()

  self.output_queue.put(
@@ -221,10 +230,10 @@
  logits_output.next_token_logprobs = (
  logits_output.next_token_logprobs.tolist()
  )
- if logits_output.input_token_logprobs is not None:
- logits_output.input_token_logprobs = tuple(
- logits_output.input_token_logprobs.tolist()
- )
+ if logits_output.input_token_logprobs is not None:
+ logits_output.input_token_logprobs = tuple(
+ logits_output.input_token_logprobs.tolist()
+ )
  next_token_ids = next_token_ids.tolist()
  return logits_output, next_token_ids, can_run_cuda_graph

@@ -269,6 +278,20 @@
  success, message = self.worker.init_weights_update_group(recv_req)
  return success, message

+ def init_weights_send_group_for_remote_instance(
+ self, recv_req: InitWeightsSendGroupForRemoteInstanceReqInput
+ ):
+ success, message = self.worker.init_weights_send_group_for_remote_instance(
+ recv_req
+ )
+ return success, message
+
+ def send_weights_to_remote_instance(
+ self, recv_req: SendWeightsToRemoteInstanceReqInput
+ ):
+ success, message = self.worker.send_weights_to_remote_instance(recv_req)
+ return success, message
+
  def update_weights_from_distributed(
  self, recv_req: UpdateWeightsFromDistributedReqInput
  ):
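
Both workers now expose `init_weights_send_group_for_remote_instance` and `send_weights_to_remote_instance`, which pair with the new `remote_instance` connector and `remote_instance_weight_loader_utils.py` to copy weights directly from a running instance to a newly launched one. The `ModelRunner` side is not part of this diff; the snippet below is only a speculative, self-contained sketch of the general pattern such a transfer can follow (temporary process group over TCP, broadcast from the source rank), not sglang's actual implementation:

```python
import torch
import torch.distributed as dist

def broadcast_weights(model: torch.nn.Module, master_address: str, port: int,
                      group_rank: int, world_size: int, backend: str = "nccl") -> None:
    # Form a temporary group spanning the sender (rank 0) and receiver instances,
    # then broadcast every parameter tensor from the sender.
    dist.init_process_group(
        backend=backend,
        init_method=f"tcp://{master_address}:{port}",
        rank=group_rank,
        world_size=world_size,
    )
    for _, param in model.named_parameters():
        dist.broadcast(param.data, src=0)
    dist.destroy_process_group()
```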

sglang/srt/mem_cache/hicache_storage.py

@@ -103,20 +103,6 @@ class HiCacheStorage(ABC):
  """
  pass

- @abstractmethod
- def delete(self, key: str) -> bool:
- """
- Delete the entry associated with the given key.
- """
- pass
-
- @abstractmethod
- def clear(self) -> bool:
- """
- Clear all entries in the storage.
- """
- pass
-
  def batch_exists(self, keys: List[str]) -> int:
  """
  Check if the keys exist in the storage.
@@ -128,6 +114,9 @@ class HiCacheStorage(ABC):
  return i
  return len(keys)

+ def get_stats(self):
+ return None
+

  class HiCacheFile(HiCacheStorage):

@@ -224,15 +213,6 @@ class HiCacheFile(HiCacheStorage):
  tensor_path = os.path.join(self.file_path, f"{key}.bin")
  return os.path.exists(tensor_path)

- def delete(self, key: str) -> None:
- key = self._get_suffixed_key(key)
- tensor_path = os.path.join(self.file_path, f"{key}.bin")
- try:
- os.remove(tensor_path)
- except FileNotFoundError:
- logger.warning(f"Key {key} does not exist. Cannot delete.")
- return
-
  def clear(self) -> bool:
  try:
  for filename in os.listdir(self.file_path):
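
`delete` and `clear` are no longer abstract on `HiCacheStorage`, so third-party backends are not forced to implement them, and `get_stats` gains a default returning `None`. Callers therefore treat these as optional capabilities, which is exactly what the reworked `HiRadixCache.clear_storage_backend` below does via `hasattr`. A tiny illustration of that calling convention (the helper name here is made up, not part of sglang):

```python
def try_clear(storage) -> bool:
    """Clear a HiCacheStorage-like backend only if it actually supports clear()."""
    clear = getattr(storage, "clear", None)
    if not callable(clear):
        return False
    try:
        return bool(clear())
    except Exception:
        return False
```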

sglang/srt/mem_cache/hiradix_cache.py

@@ -20,6 +20,7 @@ from sglang.srt.mem_cache.memory_pool_host import (
  MLATokenToKVPoolHost,
  )
  from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode
+ from sglang.srt.metrics.collector import StorageMetricsCollector

  logger = logging.getLogger(__name__)

@@ -37,6 +38,7 @@ class HiRadixCache(RadixCache):
  hicache_write_policy: str,
  hicache_io_backend: str,
  hicache_mem_layout: str,
+ enable_metrics: bool,
  hicache_storage_backend: Optional[str] = None,
  hicache_storage_prefetch_policy: Optional[str] = "best_effort",
  model_name: Optional[str] = None,
@@ -73,6 +75,8 @@ class HiRadixCache(RadixCache):
  self.tp_group = tp_cache_group
  self.tp_world_size = torch.distributed.get_world_size(group=self.tp_group)
  self.enable_storage = hicache_storage_backend is not None
+ self.enable_storage_metrics = self.enable_storage and enable_metrics
+
  # todo: customizable storage prefetch threshold and timeout
  self.prefetch_threshold = 256
  self.prefetch_timeout = 3 # seconds
@@ -92,6 +96,14 @@ class HiRadixCache(RadixCache):
  model_name=model_name,
  storage_backend_extra_config=storage_backend_extra_config,
  )
+ if self.enable_storage_metrics:
+ # TODO: support pp
+ labels = {
+ "storage_backend": hicache_storage_backend,
+ "tp_rank": self.cache_controller.tp_rank,
+ "dp_rank": self.cache_controller.dp_rank,
+ }
+ self.metrics_collector = StorageMetricsCollector(labels=labels)

  # record the nodes with ongoing write through
  self.ongoing_write_through = {}
@@ -122,11 +134,24 @@ class HiRadixCache(RadixCache):
  height += 1
  return height

- def clear_storage_backend(self):
+ def clear_storage_backend(self) -> bool:
  if self.enable_storage:
- self.cache_controller.storage_backend.clear()
- logger.info("Hierarchical cache storage backend cleared successfully!")
- return True
+ try:
+ # Check if the storage backend has a clear method (for nixl backends)
+ if hasattr(self.cache_controller.storage_backend, "clear"):
+ self.cache_controller.storage_backend.clear()
+ logger.info(
+ "Hierarchical cache storage backend cleared successfully!"
+ )
+ return True
+ else:
+ logger.warning(
+ f"Storage backend {type(self.cache_controller.storage_backend).__name__} does not support clear operation."
+ )
+ return False
+ except Exception as e:
+ logger.error(f"Failed to clear hierarchical cache storage backend: {e}")
+ return False
  else:
  logger.warning("Hierarchical cache storage backend is not enabled.")
  return False
@@ -176,41 +201,57 @@ class HiRadixCache(RadixCache):
  if write_back:
  # blocking till all write back complete
  while len(self.ongoing_write_through) > 0:
- ack_id = self.cache_controller.ack_write_queue.get()
- del self.ongoing_write_through[ack_id]
+ for _, finish_event, ack_list in self.cache_controller.ack_write_queue:
+ finish_event.synchronize()
+ for ack_id in ack_list:
+ del self.ongoing_write_through[ack_id]
+ self.cache_controller.ack_write_queue.clear()
+ assert len(self.ongoing_write_through) == 0
  return
- queue_size = torch.tensor(
- self.cache_controller.ack_write_queue.qsize(), dtype=torch.int
- )
+
+ # NOTE: all ranks has the same ongoing_write_through, can skip sync if empty
+ if len(self.ongoing_write_through) == 0:
+ return
+
+ finish_count = 0
+ for _, finish_event, ack_list in self.cache_controller.ack_write_queue:
+ if not finish_event.query():
+ break
+ finish_count += 1
+ queue_size = torch.tensor(finish_count, dtype=torch.int, device="cpu")
  if self.tp_world_size > 1:
- # synchrnoize TP workers to make the same update to radix cache
+ # synchronize TP workers to make the same update to radix cache
  torch.distributed.all_reduce(
  queue_size,
  op=torch.distributed.ReduceOp.MIN,
  group=self.tp_group,
  )
- for _ in range(queue_size.item()):
- ack_id = self.cache_controller.ack_write_queue.get()
- backuped_node = self.ongoing_write_through[ack_id]
- self.dec_lock_ref(backuped_node)
- del self.ongoing_write_through[ack_id]
- if self.enable_storage:
- self.write_backup_storage(backuped_node)
+
+ finish_count = int(queue_size.item())
+ while finish_count > 0:
+ _, finish_event, ack_list = self.cache_controller.ack_write_queue.pop(0)
+ finish_event.synchronize()
+ for ack_id in ack_list:
+ backuped_node = self.ongoing_write_through.pop(ack_id)
+ self.dec_lock_ref(backuped_node)
+ if self.enable_storage:
+ self.write_backup_storage(backuped_node)
+ finish_count -= 1

  def loading_check(self):
- while not self.cache_controller.ack_load_queue.empty():
- try:
- ack_id = self.cache_controller.ack_load_queue.get_nowait()
- start_node, end_node = self.ongoing_load_back[ack_id]
- self.dec_lock_ref(end_node)
- while end_node != start_node:
- assert end_node.loading
- end_node.loading = False
- end_node = end_node.parent
- # clear the reference
- del self.ongoing_load_back[ack_id]
- except Exception:
+ finish_count = 0
+ for _, finish_event, ack_list in self.cache_controller.ack_load_queue:
+ if not finish_event.query():
+ # the KV cache loading is still ongoing
  break
+ finish_count += 1
+ # no need to sync across TP workers as batch forwarding is synced
+ for ack_id in ack_list:
+ end_node = self.ongoing_load_back.pop(ack_id)
+ self.dec_lock_ref(end_node)
+
+ # ACK until all events are processed
+ del self.cache_controller.ack_load_queue[:finish_count]

  def evictable_size(self):
  return self.evictable_size_
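
The cache controller's ack queues are now plain lists of `(operation, finish_event, ack_ids)` entries keyed by CUDA events, so both `writing_check` and `loading_check` can poll completion without blocking. A minimal standalone sketch of that polling pattern (names are illustrative, not the cache controller's actual attributes):

```python
import torch

# Each entry pairs a batch of ack ids with the CUDA event recorded after its transfer.
AckEntry = tuple[int, torch.cuda.Event, list[int]]

def count_finished(ack_queue: list[AckEntry]) -> int:
    """Return how many leading entries have completed, without blocking."""
    finished = 0
    for _, finish_event, _ in ack_queue:
        if not finish_event.query():  # non-blocking; False while the copy is in flight
            break
        finished += 1
    return finished
```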
@@ -335,12 +376,11 @@ class HiRadixCache(RadixCache):
  # no sufficient GPU memory to load back KV caches
  return None

- self.ongoing_load_back[last_hit_node.id] = (ancester_node, last_hit_node)
+ self.ongoing_load_back[last_hit_node.id] = last_hit_node
  offset = 0
  for node in nodes_to_load:
  node.value = device_indices[offset : offset + len(node.host_value)]
  offset += len(node.host_value)
- node.loading = True
  self.evictable_size_ += len(device_indices)
  self.inc_lock_ref(last_hit_node)

@@ -369,16 +409,22 @@ class HiRadixCache(RadixCache):
  last_node,
  )

- def ready_to_load_host_cache(self):
- producer_index = self.cache_controller.layer_done_counter.next_producer()
- self.load_cache_event.set()
- return producer_index
+ def ready_to_load_host_cache(self) -> int:
+ """
+ Notify the cache controller to start the KV cache loading.
+ Return the consumer index for the schedule batch manager to track.
+ """
+ return self.cache_controller.start_loading()

  def check_hicache_events(self):
  self.writing_check()
  self.loading_check()
  if self.enable_storage:
  self.drain_storage_control_queues()
+ if self.enable_storage_metrics:
+ self.metrics_collector.log_storage_metrics(
+ self.cache_controller.storage_backend.get_stats()
+ )

  def drain_storage_control_queues(self):
  """
@@ -414,10 +460,13 @@ class HiRadixCache(RadixCache):

  # process backup acks
  for _ in range(n_backup):
- ack_id = cc.ack_backup_queue.get()
+ operation = cc.ack_backup_queue.get()
+ ack_id = operation.id
  entry = self.ongoing_backup.pop(ack_id, None)
  if entry is not None:
  entry.release_host()
+ if self.enable_storage_metrics:
+ self.metrics_collector.log_backuped_tokens(operation.completed_tokens)

  # release host memory
  host_indices_list = []
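
The new `StorageMetricsCollector` (part of the much larger `sglang/srt/metrics/collector.py` change) is only visible here through its call sites: it is constructed with a `labels` dict and fed via `log_storage_metrics`, `log_backuped_tokens`, and `log_prefetched_tokens`. A hypothetical, heavily simplified stand-in built on `prometheus_client` (the real class is richer and its metric names are not shown in this diff):

```python
from prometheus_client import Counter

class MiniStorageMetrics:
    """Illustrative stand-in for StorageMetricsCollector; not the real implementation."""

    def __init__(self, labels: dict):
        self.labels = labels
        names = list(labels.keys())
        self.prefetched = Counter(
            "hicache_storage_prefetched_tokens", "Tokens prefetched from storage", names
        )
        self.backuped = Counter(
            "hicache_storage_backuped_tokens", "Tokens backed up to storage", names
        )

    def log_prefetched_tokens(self, n: int) -> None:
        self.prefetched.labels(**self.labels).inc(n)

    def log_backuped_tokens(self, n: int) -> None:
        self.backuped.labels(**self.labels).inc(n)
```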
@@ -450,15 +499,22 @@ class HiRadixCache(RadixCache):
  # unknown prefetch stop policy, just return True
  return True

+ operation_terminated = operation.is_terminated()
  if self.tp_world_size > 1:
- can_terminate = torch.tensor(can_terminate, dtype=torch.int)
+ states = torch.tensor(
+ [1 - int(can_terminate), int(operation_terminated)],
+ dtype=torch.int,
+ )
  torch.distributed.all_reduce(
- can_terminate,
- op=torch.distributed.ReduceOp.MIN,
+ states,
+ op=torch.distributed.ReduceOp.MAX,
  group=self.tp_group,
  )
- can_terminate = bool(can_terminate.item())
-
+ can_terminate = states[0].item() == 0
+ operation_terminated = states[1].item() == 1
+ # the operation should be terminated if it is already terminated on any TP worker
+ # or it meets the termination condition on all TP workers
+ can_terminate = can_terminate or operation_terminated
  return can_terminate

  def check_prefetch_progress(self, req_id: str) -> bool:
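
The termination vote above packs two booleans into one all-reduce: taking MAX over `1 - can_terminate` realizes an AND across ranks (every rank must be ready to stop), while MAX over `operation_terminated` realizes an OR (a single terminated rank is enough). A standalone sketch of that encoding, assuming an already-initialized process group:

```python
import torch
import torch.distributed as dist

def vote_terminate(can_terminate: bool, operation_terminated: bool, group=None) -> bool:
    # states[0]: 1 means "this rank is NOT ready to stop"; MAX-reduce -> AND over ranks.
    # states[1]: 1 means "the operation already terminated here"; MAX-reduce -> OR over ranks.
    states = torch.tensor(
        [1 - int(can_terminate), int(operation_terminated)], dtype=torch.int
    )
    dist.all_reduce(states, op=dist.ReduceOp.MAX, group=group)
    return states[0].item() == 0 or states[1].item() == 1
```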
@@ -485,7 +541,7 @@ class HiRadixCache(RadixCache):
  logger.debug(f"Prefetch {req_id} completed with {completed_tokens} tokens")

  min_completed_tokens = completed_tokens
- if self.tp_world_size > 1 and self.prefetch_stop_policy != "wait_complete":
+ if self.tp_world_size > 1:
  # synchrnoize TP workers to make the same update to hiradix cache
  completed_tokens_tensor = torch.tensor(
  min_completed_tokens, dtype=torch.int
@@ -515,6 +571,11 @@ class HiRadixCache(RadixCache):
  del self.ongoing_prefetch[req_id]
  self.cache_controller.prefetch_tokens_occupied -= len(token_ids)

+ if self.enable_storage_metrics:
+ self.metrics_collector.log_prefetched_tokens(
+ min_completed_tokens - matched_length
+ )
+
  return True

  def match_prefix(self, key: List[int], **kwargs):
@@ -658,7 +719,6 @@ class HiRadixCache(RadixCache):
  new_node.parent = child.parent
  new_node.lock_ref = child.lock_ref
  new_node.key = child.key[:split_len]
- new_node.loading = child.loading
  new_node.hit_count = child.hit_count

  # split value and host value if exists