PyPI - sglang - Versions diffs - 0.4.9.post1__py3-none-any.whl → 0.4.9.post3__py3-none-any.whl - Mend

sglang 0.4.9.post1py3-none-any.whl → 0.4.9.post3py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (180) hide show

sglang/bench_one_batch.py +2 -1
sglang/eval/loogle_eval.py +7 -0
sglang/srt/configs/deepseekvl2.py +11 -2
sglang/srt/configs/internvl.py +3 -0
sglang/srt/configs/janus_pro.py +3 -0
sglang/srt/configs/model_config.py +33 -8
sglang/srt/configs/update_config.py +3 -1
sglang/srt/conversation.py +22 -2
sglang/srt/custom_op.py +5 -2
sglang/srt/disaggregation/ascend/__init__.py +6 -0
sglang/srt/disaggregation/ascend/conn.py +44 -0
sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
sglang/srt/disaggregation/decode.py +9 -1
sglang/srt/disaggregation/mooncake/conn.py +59 -70
sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
sglang/srt/disaggregation/utils.py +25 -3
sglang/srt/distributed/parallel_state.py +33 -0
sglang/srt/entrypoints/engine.py +30 -26
sglang/srt/entrypoints/http_server.py +1 -0
sglang/srt/entrypoints/openai/protocol.py +11 -0
sglang/srt/entrypoints/openai/serving_chat.py +28 -2
sglang/srt/eplb/expert_location_dispatch.py +1 -1
sglang/srt/function_call/function_call_parser.py +4 -0
sglang/srt/function_call/kimik2_detector.py +220 -0
sglang/srt/function_call/qwen3_detector.py +150 -0
sglang/srt/hf_transformers_utils.py +17 -0
sglang/srt/jinja_template_utils.py +8 -0
sglang/srt/layers/activation.py +13 -0
sglang/srt/layers/attention/flashattention_backend.py +3 -3
sglang/srt/layers/attention/flashinfer_backend.py +40 -1
sglang/srt/layers/communicator.py +17 -4
sglang/srt/layers/linear.py +24 -103
sglang/srt/layers/moe/ep_moe/kernels.py +6 -3
sglang/srt/layers/moe/ep_moe/layer.py +24 -402
sglang/srt/layers/moe/fused_moe_native.py +7 -47
sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +35 -45
sglang/srt/layers/moe/fused_moe_triton/layer.py +21 -398
sglang/srt/layers/moe/topk.py +195 -14
sglang/srt/layers/parameter.py +19 -3
sglang/srt/layers/quantization/__init__.py +20 -134
sglang/srt/layers/quantization/awq.py +578 -11
sglang/srt/layers/quantization/awq_triton.py +339 -0
sglang/srt/layers/quantization/base_config.py +85 -10
sglang/srt/layers/quantization/blockwise_int8.py +17 -55
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +24 -73
sglang/srt/layers/quantization/fp8.py +273 -62
sglang/srt/layers/quantization/fp8_kernel.py +212 -48
sglang/srt/layers/quantization/fp8_utils.py +2 -2
sglang/srt/layers/quantization/gptq.py +501 -143
sglang/srt/layers/quantization/marlin_utils.py +790 -0
sglang/srt/layers/quantization/modelopt_quant.py +26 -108
sglang/srt/layers/quantization/moe_wna16.py +46 -51
sglang/srt/layers/quantization/petit.py +252 -0
sglang/srt/layers/quantization/petit_utils.py +104 -0
sglang/srt/layers/quantization/qoq.py +7 -6
sglang/srt/layers/quantization/scalar_type.py +352 -0
sglang/srt/layers/quantization/unquant.py +422 -0
sglang/srt/layers/quantization/utils.py +343 -3
sglang/srt/layers/quantization/w4afp8.py +8 -4
sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
sglang/srt/layers/quantization/w8a8_int8.py +718 -58
sglang/srt/layers/vocab_parallel_embedding.py +1 -41
sglang/srt/lora/lora.py +0 -4
sglang/srt/lora/lora_manager.py +87 -53
sglang/srt/lora/mem_pool.py +81 -33
sglang/srt/lora/utils.py +12 -5
sglang/srt/managers/cache_controller.py +241 -0
sglang/srt/managers/io_struct.py +65 -28
sglang/srt/managers/mm_utils.py +61 -101
sglang/srt/managers/schedule_batch.py +162 -111
sglang/srt/managers/schedule_policy.py +68 -27
sglang/srt/managers/scheduler.py +264 -62
sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
sglang/srt/managers/tokenizer_manager.py +27 -3
sglang/srt/managers/tp_worker.py +14 -0
sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
sglang/srt/mem_cache/allocator.py +7 -16
sglang/srt/mem_cache/base_prefix_cache.py +14 -2
sglang/srt/mem_cache/chunk_cache.py +5 -2
sglang/srt/mem_cache/hicache_storage.py +152 -0
sglang/srt/mem_cache/hiradix_cache.py +179 -4
sglang/srt/mem_cache/memory_pool.py +81 -41
sglang/srt/mem_cache/memory_pool_host.py +41 -2
sglang/srt/mem_cache/radix_cache.py +26 -0
sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
sglang/srt/metrics/collector.py +9 -0
sglang/srt/model_executor/cuda_graph_runner.py +5 -6
sglang/srt/model_executor/forward_batch_info.py +27 -2
sglang/srt/model_executor/model_runner.py +109 -22
sglang/srt/model_loader/loader.py +30 -13
sglang/srt/model_loader/utils.py +4 -4
sglang/srt/models/clip.py +1 -1
sglang/srt/models/deepseek.py +9 -6
sglang/srt/models/deepseek_janus_pro.py +2 -2
sglang/srt/models/deepseek_v2.py +252 -187
sglang/srt/models/deepseek_vl2.py +6 -6
sglang/srt/models/gemma.py +48 -0
sglang/srt/models/gemma2.py +52 -0
sglang/srt/models/gemma3_causal.py +63 -0
sglang/srt/models/gemma3_mm.py +2 -2
sglang/srt/models/gemma3n_mm.py +8 -7
sglang/srt/models/granitemoe.py +385 -0
sglang/srt/models/grok.py +9 -3
sglang/srt/models/hunyuan.py +63 -16
sglang/srt/models/internvl.py +9 -3
sglang/srt/models/kimi_vl.py +9 -3
sglang/srt/models/llama.py +43 -0
sglang/srt/models/llama4.py +11 -11
sglang/srt/models/llava.py +5 -3
sglang/srt/models/llavavid.py +2 -2
sglang/srt/models/minicpm.py +0 -2
sglang/srt/models/minicpmo.py +4 -9
sglang/srt/models/minicpmv.py +2 -2
sglang/srt/models/mistral.py +1 -1
sglang/srt/models/mixtral.py +9 -2
sglang/srt/models/mixtral_quant.py +4 -0
sglang/srt/models/mllama.py +3 -5
sglang/srt/models/mllama4.py +16 -7
sglang/srt/models/olmoe.py +8 -5
sglang/srt/models/persimmon.py +330 -0
sglang/srt/models/phi.py +321 -0
sglang/srt/models/phi4mm.py +52 -6
sglang/srt/models/phi4mm_audio.py +1260 -0
sglang/srt/models/phi4mm_utils.py +1917 -0
sglang/srt/models/phimoe.py +559 -0
sglang/srt/models/qwen.py +37 -0
sglang/srt/models/qwen2.py +43 -0
sglang/srt/models/qwen2_5_vl.py +10 -7
sglang/srt/models/qwen2_audio.py +1 -1
sglang/srt/models/qwen2_moe.py +53 -5
sglang/srt/models/qwen2_vl.py +13 -2
sglang/srt/models/qwen3.py +65 -1
sglang/srt/models/qwen3_moe.py +56 -18
sglang/srt/models/vila.py +9 -3
sglang/srt/multimodal/processors/base_processor.py +273 -219
sglang/srt/multimodal/processors/clip.py +21 -19
sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
sglang/srt/multimodal/processors/gemma3.py +13 -15
sglang/srt/multimodal/processors/gemma3n.py +19 -23
sglang/srt/multimodal/processors/internvl.py +10 -11
sglang/srt/multimodal/processors/janus_pro.py +12 -27
sglang/srt/multimodal/processors/kimi_vl.py +12 -14
sglang/srt/multimodal/processors/llava.py +4 -2
sglang/srt/multimodal/processors/minicpm.py +37 -45
sglang/srt/multimodal/processors/mlama.py +21 -18
sglang/srt/multimodal/processors/mllama4.py +5 -6
sglang/srt/multimodal/processors/phi4mm.py +63 -39
sglang/srt/multimodal/processors/pixtral.py +14 -35
sglang/srt/multimodal/processors/qwen_audio.py +65 -0
sglang/srt/multimodal/processors/qwen_vl.py +211 -93
sglang/srt/multimodal/processors/vila.py +14 -14
sglang/srt/sampling/sampling_params.py +8 -1
sglang/srt/server_args.py +404 -234
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +9 -1
sglang/srt/two_batch_overlap.py +1 -0
sglang/srt/utils.py +181 -32
sglang/test/runners.py +14 -3
sglang/test/test_block_fp8.py +8 -3
sglang/test/test_block_fp8_ep.py +1 -1
sglang/test/test_custom_ops.py +12 -7
sglang/test/test_cutlass_w4a8_moe.py +1 -3
sglang/test/test_fp4_moe.py +1 -3
sglang/test/test_marlin_moe.py +286 -0
sglang/test/test_marlin_utils.py +171 -0
sglang/test/test_utils.py +35 -0
sglang/version.py +1 -1
{sglang-0.4.9.post1.dist-info → sglang-0.4.9.post3.dist-info}/METADATA +10 -9
{sglang-0.4.9.post1.dist-info → sglang-0.4.9.post3.dist-info}/RECORD +178 -153
sglang/srt/layers/quantization/quant_utils.py +0 -166
sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
{sglang-0.4.9.post1.dist-info → sglang-0.4.9.post3.dist-info}/WHEEL +0 -0
{sglang-0.4.9.post1.dist-info → sglang-0.4.9.post3.dist-info}/licenses/LICENSE +0 -0
{sglang-0.4.9.post1.dist-info → sglang-0.4.9.post3.dist-info}/top_level.txt +0 -0

sglang/bench_one_batch.py CHANGED Viewed

@@ -271,12 +271,13 @@ def _maybe_prepare_mlp_sync_batch(batch: ScheduleBatch, model_runner):
             batch,
             dp_size=model_runner.server_args.dp_size,
             attn_tp_size=1,
-            tp_cpu_group=model_runner.tp_group.cpu_group,
+            tp_group=model_runner.tp_group,
             get_idle_batch=None,
             disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
             spec_algorithm=SpeculativeAlgorithm.NONE,
             speculative_num_draft_tokens=None,
             require_mlp_tp_gather=require_mlp_tp_gather(model_runner.server_args),
+            disable_overlap_schedule=model_runner.server_args.disable_overlap_schedule,
         )

sglang/eval/loogle_eval.py CHANGED Viewed

@@ -73,6 +73,8 @@ async def benchmark(args):
     tasks: List[asyncio.Task] = []
     for idx, ex in enumerate(dataset):
+        if idx >= args.num_prompts:
+            break
         tasks.append(
             asyncio.create_task(
                 fetch_response(
@@ -103,6 +105,8 @@ def analyse(args):
     hyps: List[str] = []
     refs: List[str] = []
     for idx, ex in enumerate(tqdm(dataset, desc="Loading responses")):
+        if idx >= args.num_prompts:
+            break
         pkl_file = output_dir / f"response_{idx}.pkl"
         if not pkl_file.exists():
             raise FileNotFoundError(pkl_file)
@@ -150,6 +154,9 @@ if __name__ == "__main__":
     parser.add_argument(
         "--output-dir", default="tmp-output-dir", help="Directory for cached responses"
     )
+    parser.add_argument(
+        "--num-prompts", type=int, default=10000, help="Number of prompts to run"
+    )
     args = parser.parse_args()
     asyncio.run(benchmark(args))

sglang/srt/configs/deepseekvl2.py CHANGED Viewed

@@ -42,6 +42,9 @@ def select_best_resolution(image_size, candidate_resolutions):
 class DictOutput(object):
+    def items(self):
+        return self.__dict__.items()
     def keys(self):
         return self.__dict__.keys()
@@ -59,7 +62,9 @@ class DictOutput(object):
 class VLChatProcessorOutput(DictOutput):
     input_ids: torch.LongTensor
     target_ids: torch.LongTensor
-    images: torch.Tensor
+    pixel_values: (
+        torch.Tensor
+    )  # rename from "images" to "pixel_values" for compatibility
     images_seq_mask: torch.BoolTensor
     images_spatial_crop: torch.LongTensor
@@ -312,10 +317,14 @@ class DeepseekVLV2Processor(ProcessorMixin):
             images = torch.stack(images_list, dim=0)
             images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
+        images_spatial_crop = torch.stack(
+            [images_spatial_crop], dim=0
+        )  # stack the tensor to make it a batch of 1
         prepare = VLChatProcessorOutput(
             input_ids=input_ids,
             target_ids=target_ids,
-            images=images,
+            pixel_values=images,
             images_seq_mask=images_seq_mask,
             images_spatial_crop=images_spatial_crop,
         )

sglang/srt/configs/internvl.py CHANGED Viewed

@@ -9,6 +9,7 @@ from transformers import (
     LlamaConfig,
     PretrainedConfig,
     PreTrainedTokenizer,
+    Qwen2Config,
 )
 from sglang.utils import logger
@@ -311,6 +312,8 @@ class InternVLChatConfig(PretrainedConfig):
             self.llm_config = LlamaConfig(**llm_config)
         elif llm_config.get("architectures")[0] == "InternLM2ForCausalLM":
             self.llm_config = InternLM2Config(**llm_config)
+        elif llm_config.get("architectures")[0] == "Qwen2ForCausalLM":
+            self.llm_config = Qwen2Config(**llm_config)
         else:
             raise ValueError(
                 "Unsupported architecture: {}".format(

sglang/srt/configs/janus_pro.py CHANGED Viewed

@@ -284,6 +284,9 @@ class VLMImageProcessor(BaseImageProcessor):
 class DictOutput(object):
+    def items(self):
+        return self.__dict__.items()
     def keys(self):
         return self.__dict__.keys()

sglang/srt/configs/model_config.py CHANGED Viewed

@@ -25,6 +25,7 @@ from transformers import PretrainedConfig
 from sglang.srt.hf_transformers_utils import (
     get_config,
     get_context_length,
+    get_generation_config,
     get_hf_text_config,
 )
 from sglang.srt.layers.quantization import QUANTIZATION_METHODS
@@ -52,7 +53,7 @@ class ModelConfig:
         trust_remote_code: bool = True,
         revision: Optional[str] = None,
         context_length: Optional[int] = None,
-        model_override_args: Optional[str] = None,
+        model_override_args: str = "{}",
         is_embedding: Optional[bool] = None,
         enable_multimodal: Optional[bool] = None,
         dtype: str = "auto",
@@ -60,13 +61,13 @@ class ModelConfig:
         override_config_file: Optional[str] = None,
         is_draft_model: bool = False,
         hybrid_kvcache_ratio: Optional[float] = None,
-        impl: Union[str, ModelImpl] = ModelImpl.AUTO,
+        model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
     ) -> None:
         self.model_path = model_path
         self.revision = revision
         self.quantization = quantization
-        self.impl = impl
+        self.model_impl = model_impl
         # Parse args
         self.maybe_pull_model_tokenizer_from_remote()
@@ -83,6 +84,13 @@ class ModelConfig:
             **kwargs,
         )
+        self.hf_generation_config = get_generation_config(
+            self.model_path,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            **kwargs,
+        )
         self.hf_text_config = get_hf_text_config(self.hf_config)
         self.attention_chunk_size = getattr(
             self.hf_text_config, "attention_chunk_size", None
@@ -278,7 +286,7 @@ class ModelConfig:
             dtype=server_args.dtype,
             quantization=server_args.quantization,
             hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
-            impl=server_args.impl,
+            model_impl=server_args.model_impl,
             **kwargs,
         )
@@ -383,6 +391,7 @@ class ModelConfig:
             "compressed-tensors",
             "fbgemm_fp8",
             "w8a8_fp8",
+            "petit_nvfp4",
         ]
         optimized_quantization_methods = [
             "fp8",
@@ -400,9 +409,11 @@ class ModelConfig:
             "moe_wna16",
             "qoq",
             "w4afp8",
+            "petit_nvfp4",
         ]
         compatible_quantization_methods = {
             "modelopt_fp4": ["modelopt"],
+            "petit_nvfp4": ["modelopt"],
             "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
             "w8a8_fp8": ["compressed-tensors", "compressed_tensors"],
         }
@@ -413,7 +424,9 @@ class ModelConfig:
         quant_cfg = self._parse_quant_hf_config()
         if quant_cfg is not None:
-            quant_method = quant_cfg.get("quant_method", "").lower()
+            quant_method = quant_cfg.get(
+                "quant_method", "" if not self.quantization else self.quantization
+            ).lower()
             # Detect which checkpoint is it
             for _, method in QUANTIZATION_METHODS.items():
@@ -465,6 +478,19 @@ class ModelConfig:
         if eos_ids:
             # it can be either int or list of int
             eos_ids = {eos_ids} if isinstance(eos_ids, int) else set(eos_ids)
+        if eos_ids is None:
+            eos_ids = set()
+        if self.hf_generation_config:
+            generation_eos_ids = getattr(
+                self.hf_generation_config, "eos_token_id", None
+            )
+            if generation_eos_ids:
+                generation_eos_ids = (
+                    {generation_eos_ids}
+                    if isinstance(generation_eos_ids, int)
+                    else set(generation_eos_ids)
+                )
+                eos_ids = eos_ids | generation_eos_ids
         return eos_ids
     def maybe_pull_model_tokenizer_from_remote(self) -> None:
@@ -688,7 +714,6 @@ def get_hybrid_layer_ids(model_architectures: List[str], num_hidden_layers: int)
             i for i in range(num_hidden_layers) if (i + 1) % 4 == 0
         ]
     else:
-        raise ValueError(
-            "get_hybrid_layer_ids is only implemented for Llama4ForConditionalGeneration"
-        )
+        swa_attention_layer_ids = None
+        full_attention_layer_ids = None
     return swa_attention_layer_ids, full_attention_layer_ids

sglang/srt/configs/update_config.py CHANGED Viewed

@@ -115,5 +115,7 @@ def adjust_config_with_unaligned_cpu_tp(
     model_config = update_intermediate_size(
         model_config, "intermediate_size", intermediate_padding_size
     )
+    model_config = update_intermediate_size(
+        model_config, "intermediate_size_mlp", intermediate_padding_size
+    )
     return model_config

sglang/srt/conversation.py CHANGED Viewed

@@ -88,9 +88,11 @@ class Conversation:
     stop_str: Union[str, List[str]] = None
     # The string that represents an image token in the prompt
     image_token: str = "<image>"
+    video_token: str = "<video>"
     audio_token: str = "<audio>"
     image_data: Optional[List[str]] = None
+    video_data: Optional[List[str]] = None
     modalities: Optional[List[str]] = None
     stop_token_ids: Optional[int] = None
@@ -380,11 +382,15 @@ class Conversation:
         self.messages.append([role, message])
     def append_image(self, image: str):
-        """Append a new message."""
+        """Append a new image."""
         self.image_data.append(image)
+    def append_video(self, video: str):
+        """Append a new video."""
+        self.video_data.append(video)
     def append_audio(self, audio: str):
-        """Append a new message."""
+        """Append a new audio."""
         self.audio_data.append(audio)
     def update_last_message(self, message: str):
@@ -433,6 +439,7 @@ class Conversation:
             sep2=self.sep2,
             stop_str=self.stop_str,
             image_token=self.image_token,
+            video_token=self.video_token,
             audio_token=self.audio_token,
         )
@@ -495,8 +502,12 @@ def generate_embedding_convs(
             sep2=conv_template.sep2,
             stop_str=conv_template.stop_str,
             image_data=[],
+            video_data=[],
+            audio_data=[],
             modalities=[],
             image_token=conv_template.image_token,
+            video_token=conv_template.video_token,
+            audio_token=conv_template.audio_token,
         )
         real_content = ""
@@ -557,10 +568,12 @@ def generate_chat_conv(
         sep2=conv.sep2,
         stop_str=conv.stop_str,
         image_data=[],
+        video_data=[],
         audio_data=[],
         modalities=[],
         image_token=conv.image_token,
         audio_token=conv.audio_token,
+        video_token=conv.video_token,
     )
     if isinstance(request.messages, str):
@@ -602,6 +615,7 @@ def generate_chat_conv(
                     image_token = ""
                 audio_token = conv.audio_token
+                video_token = conv.video_token
                 for content in message.content:
                     if content.type == "text":
                         if num_image_url > 16:
@@ -614,6 +628,9 @@ def generate_chat_conv(
                         else:
                             real_content += image_token
                         conv.append_image(content.image_url.url)
+                    elif content.type == "video_url":
+                        real_content += video_token
+                        conv.append_video(content.video_url.url)
                     elif content.type == "audio_url":
                         real_content += audio_token
                         conv.append_audio(content.audio_url.url)
@@ -712,6 +729,7 @@ register_conv_template(
         sep="<|end|>",
         stop_str="<|end|>",
         image_token="<|endoftext10|>",
+        audio_token="<|endoftext11|>",
     )
 )
@@ -810,6 +828,7 @@ register_conv_template(
         sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
         stop_str=["<|im_end|>"],
         image_token="<|vision_start|><|image_pad|><|vision_end|>",
+        video_token="<|vision_start|><|video_pad|><|vision_end|>",
     )
 )
@@ -870,6 +889,7 @@ register_conv_template(
         sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
         stop_str=("<|im_end|>", "<|endoftext|>"),
         image_token="(<image>./</image>)",
+        video_token="(<video>./</video>)",
     )
 )

sglang/srt/custom_op.py CHANGED Viewed

@@ -29,15 +29,18 @@ class CustomOp(nn.Module):
         self._original_forward_method = self._forward_method
         # NOTE: Temporarily workaround MoE
+        # The performance of torch.compile on this layer is not always good when bs > 1,
+        # so we decide to only use torch.compile when bs=1
         if "FusedMoE" in self.__class__.__name__:
             if num_tokens == 1:
                 from sglang.srt.layers.moe.fused_moe_native import (
                     fused_moe_forward_native,
                 )
-                # The performance of torch.compile on this layer is not always good when bs > 1,
-                # so we decide to only use torch.compile when bs =1
                 self._forward_method = fused_moe_forward_native
+        elif "TopK" in self.__class__.__name__:
+            if num_tokens == 1:
+                self._forward_method = self.forward_native
         else:
             self._forward_method = self.forward_native
         self.is_torch_compile = True

sglang/srt/disaggregation/ascend/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+from sglang.srt.disaggregation.ascend.conn import (
+    AscendKVBootstrapServer,
+    AscendKVManager,
+    AscendKVReceiver,
+    AscendKVSender,
+)

sglang/srt/disaggregation/ascend/conn.py ADDED Viewed

@@ -0,0 +1,44 @@
+import logging
+from sglang.srt.disaggregation.ascend.transfer_engine import AscendTransferEngine
+from sglang.srt.disaggregation.mooncake.conn import (
+    MooncakeKVBootstrapServer,
+    MooncakeKVManager,
+    MooncakeKVReceiver,
+    MooncakeKVSender,
+)
+from sglang.srt.utils import get_local_ip_by_remote
+logger = logging.getLogger(__name__)
+class AscendKVManager(MooncakeKVManager):
+    def init_engine(self):
+        # TransferEngine initialized on ascend.
+        local_ip = get_local_ip_by_remote()
+        self.engine = AscendTransferEngine(
+            hostname=local_ip,
+            npu_id=self.kv_args.gpu_id,
+            disaggregation_mode=self.disaggregation_mode,
+        )
+    def register_buffer_to_engine(self):
+        self.engine.register(
+            self.kv_args.kv_data_ptrs[0], sum(self.kv_args.kv_data_lens)
+        )
+        # The Ascend backend optimize batch registration for small memory blocks.
+        self.engine.batch_register(
+            self.kv_args.aux_data_ptrs, self.kv_args.aux_data_lens
+        )
+class AscendKVSender(MooncakeKVSender):
+    pass
+class AscendKVReceiver(MooncakeKVReceiver):
+    pass
+class AscendKVBootstrapServer(MooncakeKVBootstrapServer):
+    pass

sglang/srt/disaggregation/ascend/transfer_engine.py ADDED Viewed

@@ -0,0 +1,58 @@
+import logging
+import os
+from typing import List, Optional
+from sglang.srt.disaggregation.mooncake.transfer_engine import MooncakeTransferEngine
+from sglang.srt.disaggregation.utils import DisaggregationMode
+logger = logging.getLogger(__name__)
+class AscendTransferEngine(MooncakeTransferEngine):
+    def __init__(
+        self, hostname: str, npu_id: int, disaggregation_mode: DisaggregationMode
+    ):
+        try:
+            from mf_adapter import TransferEngine
+        except ImportError as e:
+            raise ImportError(
+                "Please install mf_adapter, for details, see docs/backend/pd_disaggregation.md"
+            ) from e
+        self.engine = TransferEngine()
+        self.hostname = hostname
+        self.npu_id = npu_id
+        # Centralized storage address of the AscendTransferEngine
+        self.store_url = os.getenv("ASCEND_MF_STORE_URL")
+        if disaggregation_mode == DisaggregationMode.PREFILL:
+            self.role = "Prefill"
+        elif disaggregation_mode == DisaggregationMode.DECODE:
+            self.role = "Decode"
+        else:
+            logger.error(f"Unsupported DisaggregationMode: {disaggregation_mode}")
+            raise ValueError(f"Unsupported DisaggregationMode: {disaggregation_mode}")
+        self.session_id = f"{self.hostname}:{self.engine.get_rpc_port()}"
+        self.initialize()
+    def initialize(self) -> None:
+        """Initialize the ascend transfer instance."""
+        ret_value = self.engine.initialize(
+            self.store_url,
+            self.session_id,
+            self.role,
+            self.npu_id,
+        )
+        if ret_value != 0:
+            logger.error("Ascend Transfer Engine initialization failed.")
+            raise RuntimeError("Ascend Transfer Engine initialization failed.")
+    def batch_register(self, ptrs: List[int], lengths: List[int]):
+        try:
+            ret_value = self.engine.batch_register_memory(ptrs, lengths)
+        except Exception:
+            # Mark register as failed
+            ret_value = -1
+        if ret_value != 0:
+            logger.debug(f"Ascend memory registration for ptr {ptrs} failed.")

sglang/srt/disaggregation/decode.py CHANGED Viewed

@@ -439,7 +439,15 @@ class DecodePreallocQueue:
             else 0
         )
-        allocatable_tokens = self.token_to_kv_pool_allocator.available_size() - max(
+        if self.scheduler.model_config.is_hybrid:
+            available_size = min(
+                self.token_to_kv_pool_allocator.full_available_size(),
+                self.token_to_kv_pool_allocator.swa_available_size(),
+            )
+        else:
+            available_size = self.token_to_kv_pool_allocator.available_size()
+        allocatable_tokens = available_size - max(
             # preserve some space for future decode
             self.num_reserved_decode_tokens
             * (

sglang 0.4.9.post1__py3-none-any.whl → 0.4.9.post3__py3-none-any.whl

sglang 0.4.9.post1py3-none-any.whl → 0.4.9.post3py3-none-any.whl