PyPI - sglang - Versions diffs - 0.4.9.post2__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl - Mend

sglang 0.4.9.post2py3-none-any.whl → 0.4.9.post4py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (200) hide show

sglang/bench_one_batch.py +2 -1
sglang/eval/loogle_eval.py +7 -0
sglang/srt/_custom_ops.py +29 -1
sglang/srt/configs/deepseekvl2.py +11 -2
sglang/srt/configs/internvl.py +3 -0
sglang/srt/configs/janus_pro.py +3 -0
sglang/srt/configs/model_config.py +10 -8
sglang/srt/configs/update_config.py +3 -1
sglang/srt/conversation.py +2 -1
sglang/srt/custom_op.py +5 -2
sglang/srt/disaggregation/common/conn.py +34 -6
sglang/srt/disaggregation/decode.py +9 -1
sglang/srt/disaggregation/mini_lb.py +3 -2
sglang/srt/disaggregation/mooncake/conn.py +93 -76
sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
sglang/srt/disaggregation/nixl/conn.py +17 -13
sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
sglang/srt/distributed/parallel_state.py +103 -15
sglang/srt/entrypoints/engine.py +31 -33
sglang/srt/entrypoints/http_server.py +20 -32
sglang/srt/entrypoints/openai/protocol.py +3 -3
sglang/srt/entrypoints/openai/serving_chat.py +48 -6
sglang/srt/eplb/expert_location_dispatch.py +1 -1
sglang/srt/function_call/base_format_detector.py +74 -12
sglang/srt/function_call/deepseekv3_detector.py +26 -11
sglang/srt/function_call/ebnf_composer.py +95 -63
sglang/srt/function_call/function_call_parser.py +4 -2
sglang/srt/function_call/kimik2_detector.py +41 -16
sglang/srt/function_call/llama32_detector.py +6 -3
sglang/srt/function_call/mistral_detector.py +11 -3
sglang/srt/function_call/pythonic_detector.py +16 -14
sglang/srt/function_call/qwen25_detector.py +12 -3
sglang/srt/function_call/qwen3_coder_detector.py +151 -0
sglang/srt/hf_transformers_utils.py +0 -1
sglang/srt/layers/activation.py +24 -3
sglang/srt/layers/attention/base_attn_backend.py +3 -1
sglang/srt/layers/attention/flashattention_backend.py +3 -3
sglang/srt/layers/attention/flashinfer_backend.py +40 -1
sglang/srt/layers/communicator.py +12 -12
sglang/srt/layers/dp_attention.py +72 -24
sglang/srt/layers/linear.py +13 -102
sglang/srt/layers/logits_processor.py +34 -24
sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
sglang/srt/layers/moe/ep_moe/layer.py +23 -402
sglang/srt/layers/moe/fused_moe_native.py +7 -47
sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +54 -263
sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
sglang/srt/layers/moe/topk.py +190 -23
sglang/srt/layers/quantization/__init__.py +20 -134
sglang/srt/layers/quantization/awq.py +578 -11
sglang/srt/layers/quantization/awq_triton.py +339 -0
sglang/srt/layers/quantization/base_config.py +85 -10
sglang/srt/layers/quantization/blockwise_int8.py +17 -55
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +23 -79
sglang/srt/layers/quantization/fp8.py +273 -62
sglang/srt/layers/quantization/fp8_kernel.py +210 -46
sglang/srt/layers/quantization/fp8_utils.py +2 -2
sglang/srt/layers/quantization/gptq.py +501 -143
sglang/srt/layers/quantization/marlin_utils.py +790 -0
sglang/srt/layers/quantization/modelopt_quant.py +34 -112
sglang/srt/layers/quantization/moe_wna16.py +45 -49
sglang/srt/layers/quantization/petit.py +252 -0
sglang/srt/layers/quantization/petit_utils.py +104 -0
sglang/srt/layers/quantization/qoq.py +7 -6
sglang/srt/layers/quantization/scalar_type.py +352 -0
sglang/srt/layers/quantization/unquant.py +422 -0
sglang/srt/layers/quantization/utils.py +340 -9
sglang/srt/layers/quantization/w4afp8.py +8 -4
sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
sglang/srt/layers/quantization/w8a8_int8.py +51 -115
sglang/srt/layers/radix_attention.py +5 -3
sglang/srt/layers/vocab_parallel_embedding.py +1 -41
sglang/srt/lora/lora.py +0 -4
sglang/srt/lora/lora_manager.py +162 -164
sglang/srt/lora/lora_registry.py +124 -0
sglang/srt/lora/mem_pool.py +83 -35
sglang/srt/lora/utils.py +12 -5
sglang/srt/managers/cache_controller.py +288 -0
sglang/srt/managers/io_struct.py +60 -30
sglang/srt/managers/mm_utils.py +7 -8
sglang/srt/managers/schedule_batch.py +163 -113
sglang/srt/managers/schedule_policy.py +68 -27
sglang/srt/managers/scheduler.py +256 -86
sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
sglang/srt/managers/tokenizer_manager.py +38 -27
sglang/srt/managers/tp_worker.py +16 -4
sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
sglang/srt/mem_cache/allocator.py +74 -23
sglang/srt/mem_cache/base_prefix_cache.py +14 -2
sglang/srt/mem_cache/chunk_cache.py +5 -2
sglang/srt/mem_cache/hicache_storage.py +168 -0
sglang/srt/mem_cache/hiradix_cache.py +194 -5
sglang/srt/mem_cache/memory_pool.py +16 -1
sglang/srt/mem_cache/memory_pool_host.py +44 -2
sglang/srt/mem_cache/radix_cache.py +26 -0
sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
sglang/srt/metrics/collector.py +9 -0
sglang/srt/model_executor/cuda_graph_runner.py +66 -31
sglang/srt/model_executor/forward_batch_info.py +210 -25
sglang/srt/model_executor/model_runner.py +147 -42
sglang/srt/model_loader/loader.py +7 -1
sglang/srt/model_loader/utils.py +4 -4
sglang/srt/models/clip.py +1 -1
sglang/srt/models/deepseek.py +9 -6
sglang/srt/models/deepseek_janus_pro.py +1 -1
sglang/srt/models/deepseek_v2.py +192 -173
sglang/srt/models/deepseek_vl2.py +5 -5
sglang/srt/models/gemma.py +48 -0
sglang/srt/models/gemma2.py +52 -0
sglang/srt/models/gemma3_causal.py +63 -0
sglang/srt/models/gemma3_mm.py +1 -1
sglang/srt/models/gemma3n_mm.py +2 -4
sglang/srt/models/granitemoe.py +385 -0
sglang/srt/models/grok.py +9 -3
sglang/srt/models/hunyuan.py +63 -16
sglang/srt/models/internvl.py +1 -1
sglang/srt/models/kimi_vl.py +1 -1
sglang/srt/models/llama.py +41 -0
sglang/srt/models/llama4.py +11 -11
sglang/srt/models/llava.py +2 -2
sglang/srt/models/llavavid.py +1 -1
sglang/srt/models/minicpm.py +0 -2
sglang/srt/models/minicpmo.py +3 -7
sglang/srt/models/minicpmv.py +1 -1
sglang/srt/models/mistral.py +1 -1
sglang/srt/models/mixtral.py +9 -2
sglang/srt/models/mllama.py +3 -5
sglang/srt/models/mllama4.py +13 -6
sglang/srt/models/olmoe.py +8 -5
sglang/srt/models/persimmon.py +330 -0
sglang/srt/models/phi.py +321 -0
sglang/srt/models/phi4mm.py +44 -4
sglang/srt/models/phi4mm_audio.py +1260 -0
sglang/srt/models/phi4mm_utils.py +1917 -0
sglang/srt/models/phimoe.py +9 -3
sglang/srt/models/qwen.py +37 -0
sglang/srt/models/qwen2.py +41 -0
sglang/srt/models/qwen2_5_vl.py +4 -4
sglang/srt/models/qwen2_audio.py +1 -1
sglang/srt/models/qwen2_moe.py +53 -9
sglang/srt/models/qwen2_vl.py +4 -4
sglang/srt/models/qwen3.py +65 -1
sglang/srt/models/qwen3_moe.py +57 -24
sglang/srt/models/vila.py +1 -1
sglang/srt/multimodal/processors/base_processor.py +91 -97
sglang/srt/multimodal/processors/clip.py +21 -19
sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
sglang/srt/multimodal/processors/gemma3.py +13 -17
sglang/srt/multimodal/processors/gemma3n.py +19 -23
sglang/srt/multimodal/processors/internvl.py +9 -10
sglang/srt/multimodal/processors/janus_pro.py +12 -27
sglang/srt/multimodal/processors/kimi_vl.py +12 -14
sglang/srt/multimodal/processors/llava.py +4 -2
sglang/srt/multimodal/processors/minicpm.py +35 -44
sglang/srt/multimodal/processors/mlama.py +21 -18
sglang/srt/multimodal/processors/mllama4.py +4 -5
sglang/srt/multimodal/processors/phi4mm.py +63 -39
sglang/srt/multimodal/processors/pixtral.py +14 -35
sglang/srt/multimodal/processors/qwen_audio.py +65 -0
sglang/srt/multimodal/processors/qwen_vl.py +16 -21
sglang/srt/multimodal/processors/vila.py +14 -14
sglang/srt/reasoning_parser.py +46 -4
sglang/srt/sampling/sampling_batch_info.py +6 -5
sglang/srt/sampling/sampling_params.py +8 -1
sglang/srt/server_args.py +454 -270
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +46 -37
sglang/srt/speculative/eagle_utils.py +51 -23
sglang/srt/speculative/eagle_worker.py +59 -44
sglang/srt/two_batch_overlap.py +10 -5
sglang/srt/utils.py +44 -69
sglang/test/runners.py +14 -3
sglang/test/test_activation.py +50 -1
sglang/test/test_block_fp8.py +8 -3
sglang/test/test_block_fp8_ep.py +1 -1
sglang/test/test_custom_ops.py +12 -7
sglang/test/test_cutlass_w4a8_moe.py +1 -3
sglang/test/test_fp4_moe.py +1 -3
sglang/test/test_marlin_moe.py +286 -0
sglang/test/test_marlin_utils.py +171 -0
sglang/test/test_utils.py +35 -0
sglang/version.py +1 -1
{sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/METADATA +10 -10
{sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/RECORD +198 -175
sglang/srt/layers/quantization/quant_utils.py +0 -166
sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
{sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/WHEEL +0 -0
{sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/licenses/LICENSE +0 -0
{sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/top_level.txt +0 -0

sglang/bench_one_batch.py CHANGED Viewed

@@ -271,12 +271,13 @@ def _maybe_prepare_mlp_sync_batch(batch: ScheduleBatch, model_runner):
             batch,
             dp_size=model_runner.server_args.dp_size,
             attn_tp_size=1,
-            tp_cpu_group=model_runner.tp_group.cpu_group,
+            tp_group=model_runner.tp_group,
             get_idle_batch=None,
             disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
             spec_algorithm=SpeculativeAlgorithm.NONE,
             speculative_num_draft_tokens=None,
             require_mlp_tp_gather=require_mlp_tp_gather(model_runner.server_args),
+            disable_overlap_schedule=model_runner.server_args.disable_overlap_schedule,
         )

sglang/eval/loogle_eval.py CHANGED Viewed

@@ -73,6 +73,8 @@ async def benchmark(args):
     tasks: List[asyncio.Task] = []
     for idx, ex in enumerate(dataset):
+        if idx >= args.num_prompts:
+            break
         tasks.append(
             asyncio.create_task(
                 fetch_response(
@@ -103,6 +105,8 @@ def analyse(args):
     hyps: List[str] = []
     refs: List[str] = []
     for idx, ex in enumerate(tqdm(dataset, desc="Loading responses")):
+        if idx >= args.num_prompts:
+            break
         pkl_file = output_dir / f"response_{idx}.pkl"
         if not pkl_file.exists():
             raise FileNotFoundError(pkl_file)
@@ -150,6 +154,9 @@ if __name__ == "__main__":
     parser.add_argument(
         "--output-dir", default="tmp-output-dir", help="Directory for cached responses"
     )
+    parser.add_argument(
+        "--num-prompts", type=int, default=10000, help="Number of prompts to run"
+    )
     args = parser.parse_args()
     asyncio.run(benchmark(args))

sglang/srt/_custom_ops.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/_custom_ops.py
 import logging
-from typing import List, Tuple
+from typing import List, Optional, Tuple
 import torch
@@ -114,6 +114,34 @@ else:
     def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
         return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
+    # ROCM custom quick allreduce
+    def init_custom_qr(
+        rank: int, world_size: int, qr_max_size: Optional[int] = None
+    ) -> int:
+        return sgl_kernel.allreduce.init_custom_qr(world_size, rank, qr_max_size)
+    def qr_get_handle(fa: int) -> torch.Tensor:
+        return sgl_kernel.allreduce.qr_get_handle(fa)
+    def qr_open_handles(fa: int, handles: list[torch.Tensor]) -> None:
+        sgl_kernel.allreduce.qr_open_handles(fa, handles)
+    def qr_all_reduce(
+        fa: int,
+        inp: torch.Tensor,
+        out: torch.Tensor,
+        quant_level: int,
+        cast_bf2half: bool,
+    ) -> None:
+        sgl_kernel.allreduce.qr_all_reduce(fa, inp, out, quant_level, cast_bf2half)
+    def qr_destroy(fa: int) -> None:
+        sgl_kernel.allreduce.qr_destroy(fa)
+    def qr_max_size() -> int:
+        return sgl_kernel.allreduce.qr_max_size()
 def mscclpp_generate_unique_id() -> bytes:
     return sgl_kernel.allreduce.mscclpp_generate_unique_id()

sglang/srt/configs/deepseekvl2.py CHANGED Viewed

@@ -42,6 +42,9 @@ def select_best_resolution(image_size, candidate_resolutions):
 class DictOutput(object):
+    def items(self):
+        return self.__dict__.items()
     def keys(self):
         return self.__dict__.keys()
@@ -59,7 +62,9 @@ class DictOutput(object):
 class VLChatProcessorOutput(DictOutput):
     input_ids: torch.LongTensor
     target_ids: torch.LongTensor
-    images: torch.Tensor
+    pixel_values: (
+        torch.Tensor
+    )  # rename from "images" to "pixel_values" for compatibility
     images_seq_mask: torch.BoolTensor
     images_spatial_crop: torch.LongTensor
@@ -312,10 +317,14 @@ class DeepseekVLV2Processor(ProcessorMixin):
             images = torch.stack(images_list, dim=0)
             images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
+        images_spatial_crop = torch.stack(
+            [images_spatial_crop], dim=0
+        )  # stack the tensor to make it a batch of 1
         prepare = VLChatProcessorOutput(
             input_ids=input_ids,
             target_ids=target_ids,
-            images=images,
+            pixel_values=images,
             images_seq_mask=images_seq_mask,
             images_spatial_crop=images_spatial_crop,
         )

sglang/srt/configs/internvl.py CHANGED Viewed

@@ -9,6 +9,7 @@ from transformers import (
     LlamaConfig,
     PretrainedConfig,
     PreTrainedTokenizer,
+    Qwen2Config,
 )
 from sglang.utils import logger
@@ -311,6 +312,8 @@ class InternVLChatConfig(PretrainedConfig):
             self.llm_config = LlamaConfig(**llm_config)
         elif llm_config.get("architectures")[0] == "InternLM2ForCausalLM":
             self.llm_config = InternLM2Config(**llm_config)
+        elif llm_config.get("architectures")[0] == "Qwen2ForCausalLM":
+            self.llm_config = Qwen2Config(**llm_config)
         else:
             raise ValueError(
                 "Unsupported architecture: {}".format(

sglang/srt/configs/janus_pro.py CHANGED Viewed

@@ -284,6 +284,9 @@ class VLMImageProcessor(BaseImageProcessor):
 class DictOutput(object):
+    def items(self):
+        return self.__dict__.items()
     def keys(self):
         return self.__dict__.keys()

sglang/srt/configs/model_config.py CHANGED Viewed

@@ -53,7 +53,7 @@ class ModelConfig:
         trust_remote_code: bool = True,
         revision: Optional[str] = None,
         context_length: Optional[int] = None,
-        model_override_args: Optional[str] = None,
+        model_override_args: str = "{}",
         is_embedding: Optional[bool] = None,
         enable_multimodal: Optional[bool] = None,
         dtype: str = "auto",
@@ -61,13 +61,13 @@ class ModelConfig:
         override_config_file: Optional[str] = None,
         is_draft_model: bool = False,
         hybrid_kvcache_ratio: Optional[float] = None,
-        impl: Union[str, ModelImpl] = ModelImpl.AUTO,
+        model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
     ) -> None:
         self.model_path = model_path
         self.revision = revision
         self.quantization = quantization
-        self.impl = impl
+        self.model_impl = model_impl
         # Parse args
         self.maybe_pull_model_tokenizer_from_remote()
@@ -286,7 +286,7 @@ class ModelConfig:
             dtype=server_args.dtype,
             quantization=server_args.quantization,
             hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
-            impl=server_args.impl,
+            model_impl=server_args.model_impl,
             **kwargs,
         )
@@ -391,6 +391,7 @@ class ModelConfig:
             "compressed-tensors",
             "fbgemm_fp8",
             "w8a8_fp8",
+            "petit_nvfp4",
         ]
         optimized_quantization_methods = [
             "fp8",
@@ -408,9 +409,11 @@ class ModelConfig:
             "moe_wna16",
             "qoq",
             "w4afp8",
+            "petit_nvfp4",
         ]
         compatible_quantization_methods = {
             "modelopt_fp4": ["modelopt"],
+            "petit_nvfp4": ["modelopt"],
             "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
             "w8a8_fp8": ["compressed-tensors", "compressed_tensors"],
         }
@@ -472,7 +475,7 @@ class ModelConfig:
     def get_hf_eos_token_id(self) -> Optional[Set[int]]:
         eos_ids = getattr(self.hf_config, "eos_token_id", None)
-        if eos_ids:
+        if eos_ids is not None:
             # it can be either int or list of int
             eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
         if eos_ids is None:
@@ -711,7 +714,6 @@ def get_hybrid_layer_ids(model_architectures: List[str], num_hidden_layers: int)
             i for i in range(num_hidden_layers) if (i + 1) % 4 == 0
         ]
     else:
-        raise ValueError(
-            "get_hybrid_layer_ids is only implemented for Llama4ForConditionalGeneration"
-        )
+        swa_attention_layer_ids = None
+        full_attention_layer_ids = None
     return swa_attention_layer_ids, full_attention_layer_ids

sglang/srt/configs/update_config.py CHANGED Viewed

@@ -115,5 +115,7 @@ def adjust_config_with_unaligned_cpu_tp(
     model_config = update_intermediate_size(
         model_config, "intermediate_size", intermediate_padding_size
     )
+    model_config = update_intermediate_size(
+        model_config, "intermediate_size_mlp", intermediate_padding_size
+    )
     return model_config

sglang/srt/conversation.py CHANGED Viewed

@@ -729,6 +729,7 @@ register_conv_template(
         sep="<|end|>",
         stop_str="<|end|>",
         image_token="<|endoftext10|>",
+        audio_token="<|endoftext11|>",
     )
 )
@@ -983,7 +984,7 @@ register_conv_template(
 @register_conv_template_matching_function
 def match_internvl(model_path: str):
-    if re.search(r"internvl2_5", model_path, re.IGNORECASE):
+    if re.search(r"internvl", model_path, re.IGNORECASE):
         return "internvl-2-5"

sglang/srt/custom_op.py CHANGED Viewed

@@ -29,15 +29,18 @@ class CustomOp(nn.Module):
         self._original_forward_method = self._forward_method
         # NOTE: Temporarily workaround MoE
+        # The performance of torch.compile on this layer is not always good when bs > 1,
+        # so we decide to only use torch.compile when bs=1
         if "FusedMoE" in self.__class__.__name__:
             if num_tokens == 1:
                 from sglang.srt.layers.moe.fused_moe_native import (
                     fused_moe_forward_native,
                 )
-                # The performance of torch.compile on this layer is not always good when bs > 1,
-                # so we decide to only use torch.compile when bs =1
                 self._forward_method = fused_moe_forward_native
+        elif "TopK" in self.__class__.__name__:
+            if num_tokens == 1:
+                self._forward_method = self.forward_native
         else:
             self._forward_method = self.forward_native
         self.is_torch_compile = True

sglang/srt/disaggregation/common/conn.py CHANGED Viewed

@@ -23,7 +23,14 @@ from sglang.srt.disaggregation.base.conn import (
 )
 from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import get_free_port, get_ip, get_local_ip_by_remote
+from sglang.srt.utils import (
+    format_tcp_address,
+    get_free_port,
+    get_ip,
+    get_local_ip_by_remote,
+    is_valid_ipv6_address,
+    maybe_wrap_ipv6_address,
+)
 logger = logging.getLogger(__name__)
@@ -65,11 +72,18 @@ class CommonKVManager(BaseKVManager):
     def _register_to_bootstrap(self):
         """Register KVSender to bootstrap server via HTTP POST."""
         if self.dist_init_addr:
-            ip_address = socket.gethostbyname(self.dist_init_addr.split(":")[0])
+            if self.dist_init_addr.startswith("["):  # [ipv6]:port or [ipv6]
+                if self.dist_init_addr.endswith("]"):
+                    host = self.dist_init_addr
+                else:
+                    host, _ = self.dist_init_addr.rsplit(":", 1)
+            else:
+                host = socket.gethostbyname(self.dist_init_addr.rsplit(":", 1)[0])
         else:
-            ip_address = get_ip()
+            host = get_ip()
+            host = maybe_wrap_ipv6_address(host)
-        bootstrap_server_url = f"{ip_address}:{self.bootstrap_port}"
+        bootstrap_server_url = f"{host}:{self.bootstrap_port}"
         url = f"http://{bootstrap_server_url}/route"
         payload = {
             "role": "Prefill",
@@ -92,8 +106,10 @@ class CommonKVManager(BaseKVManager):
             logger.error(f"Prefill Failed to register to bootstrap server: {e}")
     @cache
-    def _connect(self, endpoint: str):
+    def _connect(self, endpoint: str, is_ipv6: bool = False):
         socket = zmq.Context().socket(zmq.PUSH)
+        if is_ipv6:
+            socket.setsockopt(zmq.IPV6, 1)
         socket.connect(endpoint)
         return socket
@@ -263,15 +279,27 @@ class CommonKVReceiver(BaseKVReceiver):
             return None
     @classmethod
-    def _connect(cls, endpoint: str):
+    def _connect(cls, endpoint: str, is_ipv6: bool = False):
         with cls._global_lock:
             if endpoint not in cls._socket_cache:
                 sock = cls._ctx.socket(zmq.PUSH)
+                if is_ipv6:
+                    sock.setsockopt(zmq.IPV6, 1)
                 sock.connect(endpoint)
                 cls._socket_cache[endpoint] = sock
                 cls._socket_locks[endpoint] = threading.Lock()
             return cls._socket_cache[endpoint], cls._socket_locks[endpoint]
+    @classmethod
+    def _connect_to_bootstrap_server(cls, bootstrap_info: dict):
+        ip_address = bootstrap_info["rank_ip"]
+        port = bootstrap_info["rank_port"]
+        is_ipv6_address = is_valid_ipv6_address(ip_address)
+        sock, lock = cls._connect(
+            format_tcp_address(ip_address, port), is_ipv6=is_ipv6_address
+        )
+        return sock, lock
     def _register_kv_args(self):
         pass

sglang/srt/disaggregation/decode.py CHANGED Viewed

@@ -439,7 +439,15 @@ class DecodePreallocQueue:
             else 0
         )
-        allocatable_tokens = self.token_to_kv_pool_allocator.available_size() - max(
+        if self.scheduler.model_config.is_hybrid:
+            available_size = min(
+                self.token_to_kv_pool_allocator.full_available_size(),
+                self.token_to_kv_pool_allocator.swa_available_size(),
+            )
+        else:
+            available_size = self.token_to_kv_pool_allocator.available_size()
+        allocatable_tokens = available_size - max(
             # preserve some space for future decode
             self.num_reserved_decode_tokens
             * (

sglang/srt/disaggregation/mini_lb.py CHANGED Viewed

@@ -17,6 +17,7 @@ from fastapi import FastAPI, HTTPException
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 from sglang.srt.disaggregation.utils import PDRegistryRequest
+from sglang.srt.utils import maybe_wrap_ipv6_address
 AIOHTTP_STREAM_READ_CHUNK_SIZE = (
     1024 * 64
@@ -271,7 +272,7 @@ async def handle_generate_request(request_data: dict):
     # Parse and transform prefill_server for bootstrap data
     parsed_url = urllib.parse.urlparse(prefill_server)
-    hostname = parsed_url.hostname
+    hostname = maybe_wrap_ipv6_address(parsed_url.hostname)
     modified_request = request_data.copy()
     batch_size = _get_request_batch_size(modified_request)
@@ -309,7 +310,7 @@ async def _forward_to_backend(request_data: dict, endpoint_name: str):
     # Parse and transform prefill_server for bootstrap data
     parsed_url = urllib.parse.urlparse(prefill_server)
-    hostname = parsed_url.hostname
+    hostname = maybe_wrap_ipv6_address(parsed_url.hostname)
     modified_request = request_data.copy()
     modified_request.update(
         {

sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl

sglang 0.4.9.post2py3-none-any.whl → 0.4.9.post4py3-none-any.whl