sglang-0.4.9.post2.tar.gz → sglang-0.4.9.post3.tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
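For readers who want to verify the listing independently, here is a minimal sketch of how a per-file "+added -removed" summary like the one below can be reproduced locally. It assumes only the standard PyPI JSON API and the Python standard library; the helper names (`fetch_sdist`, `read_files`) are illustrative, not part of sglang.

```python
# Hypothetical sketch: rebuild a per-file "+added -removed" summary by
# downloading both sdists from PyPI and diffing their text files locally.
# Assumes network access and the usual "<name>-<version>/" sdist layout.
import difflib
import io
import json
import tarfile
import urllib.request


def fetch_sdist(version: str) -> tarfile.TarFile:
    """Download the .tar.gz sdist for sglang==<version> via the PyPI JSON API."""
    meta = json.load(
        urllib.request.urlopen(f"https://pypi.org/pypi/sglang/{version}/json")
    )
    url = next(f["url"] for f in meta["urls"] if f["packagetype"] == "sdist")
    data = urllib.request.urlopen(url).read()
    return tarfile.open(fileobj=io.BytesIO(data), mode="r:gz")


def read_files(tf: tarfile.TarFile) -> dict[str, list[str]]:
    """Map archive paths (minus the top-level directory) to their text lines."""
    out: dict[str, list[str]] = {}
    for member in tf.getmembers():
        if member.isfile() and "/" in member.name:
            path = member.name.split("/", 1)[1]  # drop "sglang-<version>/"
            raw = tf.extractfile(member).read()
            try:
                out[path] = raw.decode("utf-8").splitlines()
            except UnicodeDecodeError:
                pass  # skip binary files
    return out


old = read_files(fetch_sdist("0.4.9.post2"))
new = read_files(fetch_sdist("0.4.9.post3"))
for path in sorted(old.keys() | new.keys()):
    added = removed = 0
    for line in difflib.unified_diff(old.get(path, []), new.get(path, []), lineterm=""):
        added += line.startswith("+") and not line.startswith("+++")
        removed += line.startswith("-") and not line.startswith("---")
    if added or removed:
        print(f"{path} +{added} -{removed}")
```

Files present in only one archive show up with all lines counted on one side (e.g. a new module as "+N -0", a deleted one as "+0 -N"), matching the entries in the listing.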
- {sglang-0.4.9.post2/sglang.egg-info → sglang-0.4.9.post3}/PKG-INFO +8 -8
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/README.md +4 -4
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/pyproject.toml +4 -4
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/bench_one_batch.py +2 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/eval/loogle_eval.py +7 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/configs/deepseekvl2.py +11 -2
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/configs/internvl.py +3 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/configs/janus_pro.py +3 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/configs/model_config.py +9 -7
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/configs/update_config.py +3 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/conversation.py +1 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/custom_op.py +5 -2
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/decode.py +9 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/mooncake/conn.py +44 -56
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/distributed/parallel_state.py +33 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/entrypoints/engine.py +30 -26
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/entrypoints/openai/serving_chat.py +21 -2
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/eplb/expert_location_dispatch.py +1 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/function_call/function_call_parser.py +2 -0
- sglang-0.4.9.post3/sglang/srt/function_call/qwen3_detector.py +150 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/hf_transformers_utils.py +0 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/activation.py +13 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/flashattention_backend.py +3 -3
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/flashinfer_backend.py +40 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/linear.py +13 -102
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/ep_moe/layer.py +23 -402
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_native.py +7 -47
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
- sglang-0.4.9.post3/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.9.post3/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.9.post3/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.9.post3/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang-0.4.9.post3/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +35 -45
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/topk.py +187 -12
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/__init__.py +20 -134
- sglang-0.4.9.post3/sglang/srt/layers/quantization/awq.py +771 -0
- sglang-0.4.9.post3/sglang/srt/layers/quantization/awq_triton.py +339 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/base_config.py +85 -10
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/blockwise_int8.py +17 -55
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +24 -73
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/fp8.py +273 -62
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/fp8_kernel.py +210 -46
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/fp8_utils.py +2 -2
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/gptq.py +501 -143
- sglang-0.4.9.post3/sglang/srt/layers/quantization/marlin_utils.py +790 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/modelopt_quant.py +26 -108
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/moe_wna16.py +45 -49
- sglang-0.4.9.post3/sglang/srt/layers/quantization/petit.py +252 -0
- sglang-0.4.9.post3/sglang/srt/layers/quantization/petit_utils.py +104 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/qoq.py +7 -6
- sglang-0.4.9.post3/sglang/srt/layers/quantization/scalar_type.py +352 -0
- sglang-0.4.9.post3/sglang/srt/layers/quantization/unquant.py +422 -0
- sglang-0.4.9.post3/sglang/srt/layers/quantization/utils.py +485 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/w4afp8.py +8 -4
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/w8a8_int8.py +51 -115
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/vocab_parallel_embedding.py +1 -41
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/lora/lora.py +0 -4
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/lora/lora_manager.py +87 -53
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/lora/mem_pool.py +81 -33
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/lora/utils.py +12 -5
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/cache_controller.py +241 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/io_struct.py +41 -29
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/mm_utils.py +7 -8
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/schedule_batch.py +150 -110
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/schedule_policy.py +68 -27
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/scheduler.py +243 -61
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/tokenizer_manager.py +11 -3
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/tp_worker.py +14 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/mem_cache/allocator.py +7 -16
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/mem_cache/base_prefix_cache.py +14 -2
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/mem_cache/chunk_cache.py +5 -2
- sglang-0.4.9.post3/sglang/srt/mem_cache/hicache_storage.py +152 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/mem_cache/hiradix_cache.py +179 -4
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/mem_cache/memory_pool.py +16 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/mem_cache/memory_pool_host.py +41 -2
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/mem_cache/radix_cache.py +26 -0
- sglang-0.4.9.post3/sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/metrics/collector.py +9 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/model_executor/cuda_graph_runner.py +5 -6
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/model_executor/forward_batch_info.py +14 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/model_executor/model_runner.py +109 -22
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/model_loader/loader.py +7 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/model_loader/utils.py +4 -4
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/clip.py +1 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/deepseek.py +9 -6
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/deepseek_janus_pro.py +1 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/deepseek_v2.py +191 -171
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/deepseek_vl2.py +5 -5
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/gemma.py +48 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/gemma2.py +52 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/gemma3_causal.py +63 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/gemma3_mm.py +1 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/gemma3n_mm.py +2 -4
- sglang-0.4.9.post3/sglang/srt/models/granitemoe.py +385 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/grok.py +9 -3
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/hunyuan.py +63 -16
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/internvl.py +1 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/kimi_vl.py +1 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/llama.py +41 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/llama4.py +11 -11
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/llava.py +2 -2
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/llavavid.py +1 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/minicpm.py +0 -2
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/minicpmo.py +3 -7
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/minicpmv.py +1 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/mistral.py +1 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/mixtral.py +9 -2
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/mllama.py +3 -5
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/mllama4.py +3 -3
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/olmoe.py +8 -5
- sglang-0.4.9.post3/sglang/srt/models/persimmon.py +330 -0
- sglang-0.4.9.post3/sglang/srt/models/phi.py +321 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/phi4mm.py +44 -4
- sglang-0.4.9.post3/sglang/srt/models/phi4mm_audio.py +1260 -0
- sglang-0.4.9.post3/sglang/srt/models/phi4mm_utils.py +1917 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/phimoe.py +9 -3
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/qwen.py +37 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/qwen2.py +41 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/qwen2_5_vl.py +4 -4
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/qwen2_audio.py +1 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/qwen2_moe.py +53 -5
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/qwen2_vl.py +4 -4
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/qwen3.py +65 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/qwen3_moe.py +56 -18
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/vila.py +1 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/multimodal/processors/base_processor.py +91 -97
- sglang-0.4.9.post3/sglang/srt/multimodal/processors/clip.py +35 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/multimodal/processors/gemma3.py +13 -17
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/multimodal/processors/gemma3n.py +19 -23
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/multimodal/processors/internvl.py +9 -10
- sglang-0.4.9.post3/sglang/srt/multimodal/processors/janus_pro.py +45 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/multimodal/processors/kimi_vl.py +12 -14
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/multimodal/processors/llava.py +4 -2
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/multimodal/processors/minicpm.py +35 -44
- sglang-0.4.9.post3/sglang/srt/multimodal/processors/mlama.py +37 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/multimodal/processors/mllama4.py +4 -5
- sglang-0.4.9.post3/sglang/srt/multimodal/processors/phi4mm.py +101 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/multimodal/processors/pixtral.py +14 -35
- sglang-0.4.9.post3/sglang/srt/multimodal/processors/qwen_audio.py +65 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/multimodal/processors/qwen_vl.py +16 -21
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/multimodal/processors/vila.py +14 -14
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/sampling/sampling_params.py +8 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/server_args.py +393 -230
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +9 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/two_batch_overlap.py +1 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/utils.py +27 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/runners.py +14 -3
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/test_block_fp8.py +8 -3
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/test_block_fp8_ep.py +1 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/test_custom_ops.py +12 -7
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/test_cutlass_w4a8_moe.py +1 -3
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/test_fp4_moe.py +1 -3
- sglang-0.4.9.post3/sglang/test/test_marlin_moe.py +286 -0
- sglang-0.4.9.post3/sglang/test/test_marlin_utils.py +171 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/test_utils.py +35 -0
- sglang-0.4.9.post3/sglang/version.py +1 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3/sglang.egg-info}/PKG-INFO +8 -8
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang.egg-info/SOURCES.txt +22 -2
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang.egg-info/requires.txt +3 -3
- sglang-0.4.9.post2/sglang/srt/layers/quantization/awq.py +0 -204
- sglang-0.4.9.post2/sglang/srt/layers/quantization/quant_utils.py +0 -166
- sglang-0.4.9.post2/sglang/srt/layers/quantization/utils.py +0 -145
- sglang-0.4.9.post2/sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
- sglang-0.4.9.post2/sglang/srt/multimodal/processors/clip.py +0 -33
- sglang-0.4.9.post2/sglang/srt/multimodal/processors/janus_pro.py +0 -60
- sglang-0.4.9.post2/sglang/srt/multimodal/processors/mlama.py +0 -34
- sglang-0.4.9.post2/sglang/srt/multimodal/processors/phi4mm.py +0 -77
- sglang-0.4.9.post2/sglang/version.py +0 -1
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/LICENSE +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/setup.cfg +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/api.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/bench_offline_throughput.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/bench_one_batch_server.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/bench_serving.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/check_env.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/compile_deep_gemm.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/eval/llama3_eval.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/global_config.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/lang/chat_template.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/lang/choices.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/lang/compiler.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/lang/interpreter.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/lang/ir.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/lang/tracer.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/launch_server.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/profiler.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/_custom_ops.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/aio_rwlock.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/code_completion_parser.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/configs/chatglm.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/configs/dbrx.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/configs/device_config.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/configs/kimi_vl.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/configs/kimi_vl_moonvit.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/configs/load_config.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/configs/utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/connector/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/connector/base_connector.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/connector/redis.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/connector/s3.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/connector/serde/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/connector/serde/safe_serde.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/connector/serde/serde.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/connector/utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/constants.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/constrained/llguidance_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/constrained/outlines_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/constrained/reasoner_grammar_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/constrained/triton_ops/bitmask_ops.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/constrained/xgrammar_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/debug_utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/ascend/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/ascend/conn.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/ascend/transfer_engine.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/base/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/base/conn.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/common/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/common/conn.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/common/utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/decode_schedule_batch_mixin.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/fake/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/fake/conn.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/kv_events.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/launch_lb.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/mini_lb.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/mooncake/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/mooncake/transfer_engine.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/nixl/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/nixl/conn.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/prefill.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/distributed/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/distributed/communication_op.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/distributed/device_communicators/npu_communicator.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/distributed/device_communicators/pymscclpp.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/distributed/utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/entrypoints/EngineBase.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/entrypoints/http_server.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/entrypoints/http_server_engine.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/entrypoints/openai/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/entrypoints/openai/protocol.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/entrypoints/openai/serving_base.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/entrypoints/openai/serving_completions.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/entrypoints/openai/serving_embedding.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/entrypoints/openai/serving_rerank.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/entrypoints/openai/serving_score.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/entrypoints/openai/usage_processor.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/entrypoints/openai/utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/eplb/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/eplb/eplb_algorithms/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/eplb/eplb_algorithms/deepseek_vec.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/eplb/eplb_manager.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/eplb/eplb_simulator/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/eplb/eplb_simulator/reader.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/eplb/expert_distribution.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/eplb/expert_location.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/eplb/expert_location_updater.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/function_call/base_format_detector.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/function_call/core_types.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/function_call/deepseekv3_detector.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/function_call/ebnf_composer.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/function_call/kimik2_detector.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/function_call/llama32_detector.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/function_call/mistral_detector.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/function_call/pythonic_detector.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/function_call/qwen25_detector.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/function_call/utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/jinja_template_utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/amx_utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/aiter_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/ascend_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/cutlass_mla_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/flashmla_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/intel_amx_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/merge_state.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/tbo_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/torch_native_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/triton_ops/merge_state.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/attention/vision.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/communicator.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/dp_attention.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/elementwise.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/flashinfer_comm_fusion.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/cutlass_moe.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/cutlass_moe_params.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/cutlass_w4a8_moe.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/ep_moe/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/moe/router.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/multimodal.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/parameter.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/int8_utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/quantization/kv_cache.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/rotary_embedding.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/layers/utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/lora/backend/base_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/lora/backend/flashinfer_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/lora/backend/triton_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/lora/layers.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/lora/triton_ops/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/configure_logging.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/multimodal_processor.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/session_controller.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/template_manager.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/managers/utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/mem_cache/multimodal_cache.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/model_loader/weight_utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/bert.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/deepseek_nextn.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/gemma3n_audio.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/gemma3n_causal.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/glm4.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/granite.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/idefics2.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/kimi_vl_moonvit.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/llama_eagle.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/llama_eagle3.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/mimo.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/mimo_mtp.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/olmo2.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/phi3_small.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/pixtral.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/qwen2_classification.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/qwen2_eagle.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/qwen2_rm.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/registry.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/roberta.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/siglip.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/transformers.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/multimodal/mm_utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/operations.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/operations_strategy.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/patch_torch.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/reasoning_parser.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/sampling/custom_logit_processor.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/speculative/build_eagle_tree.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/speculative/eagle_utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/speculative/eagle_worker.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/speculative/spec_info.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/torch_memory_saver_adapter.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/warmup.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/attention/__init__.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/attention/test_flashattn_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/attention/test_flashattn_mla_backend.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/attention/test_prefix_chunk_info.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/run_eval.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/send_one.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/test_activation.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/test_cutlass_moe.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/test_deepep_utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/test_dynamic_grad_mode.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/test/test_programs.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/utils.py +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.9.post2
+Version: 0.4.9.post3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                        Version 2.0, January 2004
@@ -246,14 +246,14 @@ Requires-Dist: sentencepiece; extra == "runtime-common"
 Requires-Dist: soundfile==0.13.1; extra == "runtime-common"
 Requires-Dist: scipy; extra == "runtime-common"
 Requires-Dist: torchao==0.9.0; extra == "runtime-common"
-Requires-Dist: transformers==4.53.
+Requires-Dist: transformers==4.53.2; extra == "runtime-common"
 Requires-Dist: timm==1.0.16; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.21; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.2.
+Requires-Dist: sgl-kernel==0.2.6.post1; extra == "srt"
 Requires-Dist: torch==2.7.1; extra == "srt"
 Requires-Dist: torchaudio==2.7.1; extra == "srt"
 Requires-Dist: torchvision==0.22.1; extra == "srt"
@@ -272,7 +272,7 @@ Requires-Dist: flashinfer_python==0.2.7.post1; extra == "blackwell"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
-Requires-Dist:
+Requires-Dist: petit_kernel; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: srt-hpu
@@ -381,14 +381,14 @@ Dynamic: license-file
 - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
-- [
-- [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
+- [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).

 <details>
 <summary>More</summary>

 - [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
+- [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -415,10 +415,10 @@ The core features include:
 - [Contribution Guide](https://docs.sglang.ai/references/contribution_guide.html)

 ## Benchmark and Performance
-Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/).
+Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/), [Large-scale expert parallelism](https://lmsys.org/blog/2025-05-05-large-scale-ep/).

 ## Roadmap
-[Development Roadmap (2025
+[Development Roadmap (2025 H2)](https://github.com/sgl-project/sglang/issues/7736)

 ## Adoption and Sponsorship
 SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over 1,000,000 GPUs worldwide.
@@ -25,14 +25,14 @@
 - [2025/05] 🔥 Deploying DeepSeek with PD Disaggregation and Large-scale Expert Parallelism on 96 H100 GPUs ([blog](https://lmsys.org/blog/2025-05-05-large-scale-ep/)).
 - [2025/03] Supercharge DeepSeek-R1 Inference on AMD Instinct MI300X ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1-Part2/README.html))
 - [2025/03] SGLang Joins PyTorch Ecosystem: Efficient LLM Serving Engine ([PyTorch blog](https://pytorch.org/blog/sglang-joins-pytorch/))
-- [
-- [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
+- [2024/12] v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
 - [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).

 <details>
 <summary>More</summary>

 - [2025/02] Unlock DeepSeek-R1 Inference Performance on AMD Instinct™ MI300X GPU ([AMD blog](https://rocm.blogs.amd.com/artificial-intelligence/DeepSeekR1_Perf/README.html))
+- [2025/01] SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeepSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html), [10+ other companies](https://x.com/lmsysorg/status/1887262321636221412))
 - [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -59,10 +59,10 @@ The core features include:
 - [Contribution Guide](https://docs.sglang.ai/references/contribution_guide.html)

 ## Benchmark and Performance
-Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/).
+Learn more in the release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/), [v0.4 blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/), [Large-scale expert parallelism](https://lmsys.org/blog/2025-05-05-large-scale-ep/).

 ## Roadmap
-[Development Roadmap (2025
+[Development Roadmap (2025 H2)](https://github.com/sgl-project/sglang/issues/7736)

 ## Adoption and Sponsorship
 SGLang has been deployed at large scale, generating trillions of tokens in production each day. It is trusted and adopted by a wide range of leading enterprises and institutions, including xAI, AMD, NVIDIA, Intel, LinkedIn, Cursor, Oracle Cloud, Google Cloud, Microsoft Azure, AWS, Atlas Cloud, Voltage Park, Nebius, DataCrunch, Novita, InnoMatrix, MIT, UCLA, the University of Washington, Stanford, UC Berkeley, Tsinghua University, Jam & Tea Studios, Baseten, and other major technology organizations across North America and Asia. As an open-source LLM inference engine, SGLang has become the de facto industry standard, with deployments running on over 1,000,000 GPUs worldwide.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.4.9.post2"
+version = "0.4.9.post3"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -45,7 +45,7 @@ runtime_common = [
     "soundfile==0.13.1",
     "scipy",
     "torchao==0.9.0",
-    "transformers==4.53.
+    "transformers==4.53.2",
     "timm==1.0.16",
     "uvicorn",
     "uvloop",
@@ -54,7 +54,7 @@ runtime_common = [

 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.2.
+    "sgl-kernel==0.2.6.post1",
     "torch==2.7.1",
     "torchaudio==2.7.1",
     "torchvision==0.22.1",
@@ -79,7 +79,7 @@ blackwell = [
 srt_hip = [
     "sglang[runtime_common]",
     "torch",
-    "
+    "petit_kernel",
 ]

 # xpu is not enabled in public vllm and torch whl,
@@ -271,12 +271,13 @@ def _maybe_prepare_mlp_sync_batch(batch: ScheduleBatch, model_runner):
         batch,
         dp_size=model_runner.server_args.dp_size,
         attn_tp_size=1,
-
+        tp_group=model_runner.tp_group,
         get_idle_batch=None,
         disable_cuda_graph=model_runner.server_args.disable_cuda_graph,
         spec_algorithm=SpeculativeAlgorithm.NONE,
         speculative_num_draft_tokens=None,
         require_mlp_tp_gather=require_mlp_tp_gather(model_runner.server_args),
+        disable_overlap_schedule=model_runner.server_args.disable_overlap_schedule,
     )


@@ -73,6 +73,8 @@ async def benchmark(args):

     tasks: List[asyncio.Task] = []
     for idx, ex in enumerate(dataset):
+        if idx >= args.num_prompts:
+            break
         tasks.append(
             asyncio.create_task(
                 fetch_response(
@@ -103,6 +105,8 @@ def analyse(args):
     hyps: List[str] = []
     refs: List[str] = []
     for idx, ex in enumerate(tqdm(dataset, desc="Loading responses")):
+        if idx >= args.num_prompts:
+            break
         pkl_file = output_dir / f"response_{idx}.pkl"
         if not pkl_file.exists():
             raise FileNotFoundError(pkl_file)
@@ -150,6 +154,9 @@ if __name__ == "__main__":
     parser.add_argument(
         "--output-dir", default="tmp-output-dir", help="Directory for cached responses"
     )
+    parser.add_argument(
+        "--num-prompts", type=int, default=10000, help="Number of prompts to run"
+    )
     args = parser.parse_args()

     asyncio.run(benchmark(args))
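Note on the loogle_eval change above: the new `--num-prompts` cap is applied with the same `idx >= args.num_prompts` guard in both `benchmark` and `analyse`, because the analysis phase looks up `response_{idx}.pkl` files keyed by the same enumeration index. A minimal sketch of that shared pattern (`run_capped` and the `handle` callback are illustrative names, not part of the benchmark):

import argparse

def run_capped(dataset, handle, num_prompts):
    # Both passes must truncate at the same index so idx-keyed artifacts
    # (e.g. response_{idx}.pkl) stay aligned between the two phases.
    for idx, ex in enumerate(dataset):
        if idx >= num_prompts:
            break
        handle(idx, ex)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--num-prompts", type=int, default=10000, help="Number of prompts to run"
    )
    args = parser.parse_args()
    run_capped(range(25), lambda idx, ex: print(idx, ex), args.num_prompts)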
@@ -42,6 +42,9 @@ def select_best_resolution(image_size, candidate_resolutions):


 class DictOutput(object):
+    def items(self):
+        return self.__dict__.items()
+
     def keys(self):
         return self.__dict__.keys()

@@ -59,7 +62,9 @@ class DictOutput(object):
 class VLChatProcessorOutput(DictOutput):
     input_ids: torch.LongTensor
     target_ids: torch.LongTensor
-
+    pixel_values: (
+        torch.Tensor
+    )  # rename from "images" to "pixel_values" for compatibility
     images_seq_mask: torch.BoolTensor
     images_spatial_crop: torch.LongTensor

@@ -312,10 +317,14 @@ class DeepseekVLV2Processor(ProcessorMixin):
         images = torch.stack(images_list, dim=0)
         images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)

+        images_spatial_crop = torch.stack(
+            [images_spatial_crop], dim=0
+        )  # stack the tensor to make it a batch of 1
+
         prepare = VLChatProcessorOutput(
             input_ids=input_ids,
             target_ids=target_ids,
-
+            pixel_values=images,
             images_seq_mask=images_seq_mask,
             images_spatial_crop=images_spatial_crop,
         )
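For context on the `DictOutput` change above: `items()` delegates to the instance `__dict__`, just like the existing `keys()`, so processor outputs can be consumed anywhere a mapping-style iteration is expected. A minimal sketch of the behavior (the `Demo` subclass is a hypothetical stand-in for `VLChatProcessorOutput`):

class DictOutput(object):
    # items()/keys() expose __dict__, letting callers do dict(out.items())
    # or iterate the fields without knowing the concrete subclass.
    def items(self):
        return self.__dict__.items()

    def keys(self):
        return self.__dict__.keys()

class Demo(DictOutput):  # hypothetical stand-in for VLChatProcessorOutput
    def __init__(self, **fields):
        self.__dict__.update(fields)

out = Demo(input_ids=[1, 2, 3], pixel_values=None)
print(dict(out.items()))  # {'input_ids': [1, 2, 3], 'pixel_values': None}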
@@ -9,6 +9,7 @@ from transformers import (
     LlamaConfig,
     PretrainedConfig,
     PreTrainedTokenizer,
+    Qwen2Config,
 )

 from sglang.utils import logger
@@ -311,6 +312,8 @@ class InternVLChatConfig(PretrainedConfig):
             self.llm_config = LlamaConfig(**llm_config)
         elif llm_config.get("architectures")[0] == "InternLM2ForCausalLM":
             self.llm_config = InternLM2Config(**llm_config)
+        elif llm_config.get("architectures")[0] == "Qwen2ForCausalLM":
+            self.llm_config = Qwen2Config(**llm_config)
         else:
             raise ValueError(
                 "Unsupported architecture: {}".format(
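The new `elif` simply extends InternVL's architecture-to-config dispatch to Qwen2 backbones. The same selection, sketched as a lookup table (an illustrative restructuring, not the file's actual code; `InternLM2Config` is defined inside sglang and omitted here):

from transformers import LlamaConfig, Qwen2Config

_LLM_CONFIG_CLASSES = {
    "LlamaForCausalLM": LlamaConfig,
    "Qwen2ForCausalLM": Qwen2Config,  # the newly supported backbone
}

def build_llm_config(llm_config: dict):
    arch = llm_config.get("architectures", [None])[0]
    if arch not in _LLM_CONFIG_CLASSES:
        raise ValueError("Unsupported architecture: {}".format(arch))
    return _LLM_CONFIG_CLASSES[arch](**llm_config)

print(type(build_llm_config({"architectures": ["Qwen2ForCausalLM"]})))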
@@ -53,7 +53,7 @@ class ModelConfig:
         trust_remote_code: bool = True,
         revision: Optional[str] = None,
         context_length: Optional[int] = None,
-        model_override_args:
+        model_override_args: str = "{}",
         is_embedding: Optional[bool] = None,
         enable_multimodal: Optional[bool] = None,
         dtype: str = "auto",
@@ -61,13 +61,13 @@
         override_config_file: Optional[str] = None,
         is_draft_model: bool = False,
         hybrid_kvcache_ratio: Optional[float] = None,
-
+        model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
     ) -> None:

         self.model_path = model_path
         self.revision = revision
         self.quantization = quantization
-        self.
+        self.model_impl = model_impl

         # Parse args
         self.maybe_pull_model_tokenizer_from_remote()
@@ -286,7 +286,7 @@
             dtype=server_args.dtype,
             quantization=server_args.quantization,
             hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
-
+            model_impl=server_args.model_impl,
             **kwargs,
         )

@@ -391,6 +391,7 @@
             "compressed-tensors",
             "fbgemm_fp8",
             "w8a8_fp8",
+            "petit_nvfp4",
         ]
         optimized_quantization_methods = [
             "fp8",
@@ -408,9 +409,11 @@
             "moe_wna16",
             "qoq",
             "w4afp8",
+            "petit_nvfp4",
         ]
         compatible_quantization_methods = {
             "modelopt_fp4": ["modelopt"],
+            "petit_nvfp4": ["modelopt"],
             "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
             "w8a8_fp8": ["compressed-tensors", "compressed_tensors"],
         }
@@ -711,7 +714,6 @@ def get_hybrid_layer_ids(model_architectures: List[str], num_hidden_layers: int)
             i for i in range(num_hidden_layers) if (i + 1) % 4 == 0
         ]
     else:
-
-
-        )
+        swa_attention_layer_ids = None
+        full_attention_layer_ids = None
     return swa_attention_layer_ids, full_attention_layer_ids
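Reading the two `petit_nvfp4` additions together: the method becomes a recognized quantization setting, and the `compatible_quantization_methods` table whitelists it for checkpoints quantized with `modelopt`. A sketch of the compatibility check such a table implies (the `check_override` helper is illustrative, not the class's actual method):

compatible_quantization_methods = {
    "modelopt_fp4": ["modelopt"],
    "petit_nvfp4": ["modelopt"],  # new: petit_nvfp4 may serve modelopt ckpts
    "w8a8_int8": ["compressed-tensors", "compressed_tensors"],
    "w8a8_fp8": ["compressed-tensors", "compressed_tensors"],
}

def check_override(requested: str, checkpoint_method: str) -> None:
    # By default a method is only compatible with itself; the table adds
    # explicitly allowed cross-method pairings.
    allowed = compatible_quantization_methods.get(requested, [])
    if checkpoint_method != requested and checkpoint_method not in allowed:
        raise ValueError(
            f"{requested!r} cannot override a {checkpoint_method!r} checkpoint"
        )

check_override("petit_nvfp4", "modelopt")  # passes after this release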
@@ -115,5 +115,7 @@ def adjust_config_with_unaligned_cpu_tp(
    model_config = update_intermediate_size(
        model_config, "intermediate_size", intermediate_padding_size
    )
-
+    model_config = update_intermediate_size(
+        model_config, "intermediate_size_mlp", intermediate_padding_size
+    )
    return model_config
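The added call repeats the existing padding step for `intermediate_size_mlp`, a field some model configs carry alongside `intermediate_size`, so that dimension is also divisible across unaligned CPU tensor-parallel ranks. The padding arithmetic, sketched with illustrative numbers:

def pad_to_multiple(size: int, alignment: int) -> int:
    # Round up so every tensor-parallel rank receives an equal, aligned shard.
    return -(-size // alignment) * alignment

# Illustrative: pad an 11008-wide MLP for 6 ranks at 128-element granularity.
assert pad_to_multiple(11008, 6 * 128) == 11520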
@@ -29,15 +29,18 @@ class CustomOp(nn.Module):

         self._original_forward_method = self._forward_method
         # NOTE: Temporarily workaround MoE
+        # The performance of torch.compile on this layer is not always good when bs > 1,
+        # so we decide to only use torch.compile when bs=1
         if "FusedMoE" in self.__class__.__name__:
             if num_tokens == 1:
                 from sglang.srt.layers.moe.fused_moe_native import (
                     fused_moe_forward_native,
                 )

-                # The performance of torch.compile on this layer is not always good when bs > 1,
-                # so we decide to only use torch.compile when bs =1
                 self._forward_method = fused_moe_forward_native
+        elif "TopK" in self.__class__.__name__:
+            if num_tokens == 1:
+                self._forward_method = self.forward_native
         else:
             self._forward_method = self.forward_native
         self.is_torch_compile = True
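The new `elif "TopK"` branch applies the policy the comment describes to TopK ops as well: use the compiled native path only when the batch holds a single token. A self-contained sketch of that dispatch (the `TinyOp` module is a stand-in, not a `CustomOp` subclass from the tree):

import torch
import torch.nn.functional as F

class TinyOp(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # Compile once up front; reuse the compiled callable for bs=1 only.
        self._compiled_native = torch.compile(self.forward_native)

    def forward_native(self, x):
        gate, up = x.chunk(2, dim=-1)
        return F.silu(gate) * up

    def forward(self, x):
        # torch.compile tends to pay off for single-token decode (bs=1);
        # larger batches take the eager path, mirroring the note above.
        if x.shape[0] == 1:
            return self._compiled_native(x)
        return self.forward_native(x)

op = TinyOp()
print(op(torch.randn(1, 8)).shape)  # bs=1 -> compiled path
print(op(torch.randn(4, 8)).shape)  # bs>1 -> eager path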
@@ -439,7 +439,15 @@ class DecodePreallocQueue:
             else 0
         )

-
+        if self.scheduler.model_config.is_hybrid:
+            available_size = min(
+                self.token_to_kv_pool_allocator.full_available_size(),
+                self.token_to_kv_pool_allocator.swa_available_size(),
+            )
+        else:
+            available_size = self.token_to_kv_pool_allocator.available_size()
+
+        allocatable_tokens = available_size - max(
             # preserve some space for future decode
             self.num_reserved_decode_tokens
             * (
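The hybrid branch above exists because a hybrid-attention model keeps two KV pools, one for full-attention layers and one for sliding-window layers, and a request is only admissible if both pools have room, hence the `min`. A toy illustration (the allocator class is a stand-in for the real one):

class HybridAllocatorStub:
    def __init__(self, full_free: int, swa_free: int):
        self._full, self._swa = full_free, swa_free

    def full_available_size(self) -> int:
        return self._full

    def swa_available_size(self) -> int:
        return self._swa

def available_size(alloc, is_hybrid: bool) -> int:
    if is_hybrid:
        # Both pools must hold the tokens, so the tighter one is binding.
        return min(alloc.full_available_size(), alloc.swa_available_size())
    return alloc.full_available_size()

print(available_size(HybridAllocatorStub(4096, 1024), is_hybrid=True))  # 1024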
{sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/disaggregation/mooncake/conn.py

@@ -321,67 +321,60 @@ class MooncakeKVManager(BaseKVManager):
         This may introduce performance overhead (increased TTFT) for long sequences.
         """
         # Extract configuration
-        local_tp_rank = self.kv_args.engine_rank
         local_tp_size = self.tp_size // self.dp_size
+        local_tp_rank_in_group = self.kv_args.engine_rank % local_tp_size
+        src_kv_item_len = self.kv_args.kv_item_lens[0]
+        dst_tp_rank_in_group = dst_tp_rank % dst_tp_size
         num_kv_heads = self.kv_args.kv_head_num
         num_layers = len(self.kv_args.kv_data_ptrs)
         page_size = self.kv_args.page_size

         # Calculate head distribution
-
-
-
-
-
-
-        decode_rank_item_lens = [dst_kv_item_len for _ in range(num_layers)]
+        src_heads_per_rank = num_kv_heads
+        dst_heads_per_rank = num_kv_heads * local_tp_size // dst_tp_size
+        bytes_per_head_slice_to_send = (
+            dst_kv_item_len // page_size // dst_heads_per_rank
+        )

         # Determine slicing parameters based on TP configuration
         if local_tp_size > dst_tp_size:
-
-
-
+            # Send KVCache from multiple prefill instances to 1 decode instance
+            src_head_start_offset = 0
+            num_heads_to_send = src_heads_per_rank
+            dst_head_start_offset = local_tp_rank_in_group * src_heads_per_rank
         else:
-
-
-
+            # Send KVCache from 1 prefill instance to multiple decode instances
+            src_head_start_offset = dst_tp_rank_in_group * dst_heads_per_rank
+            num_heads_to_send = dst_heads_per_rank
+            dst_head_start_offset = 0

-
+        layers_params = []
         for layer_id in range(num_layers):
-
-
-
-
-
-
-            logger.error(
-                f"Invalid item_len_of_prefill_rank_page or num_kv_heads for layer {layer_id}"
-            )
-            return -1
-
-            # Calculate precise byte offset and length for the sub-slice within the prefill page data
-            src_slice_offset = src_head_offset * bytes_per_head
-            dst_slice_offset = dst_head_offset * bytes_per_head
-            slice_lens_per_page = num_heads_to_send * bytes_per_head
+            # Calculate precise byte offset and length for the sub-slice within the token
+            src_head_slice_offset = src_head_start_offset * bytes_per_head_slice_to_send
+            dst_head_slice_offset = dst_head_start_offset * bytes_per_head_slice_to_send
+            heads_bytes_per_token_to_send = (
+                num_heads_to_send * bytes_per_head_slice_to_send
+            )

-            # Sanity check: The data sub-slice to be sent should fit into the
-            # This means
-            if
+            # Sanity check: The data sub-slice to be sent should fit into the dst buffer.
+            # This means heads_bytes_per_token_to_send <= (dst_kv_item_len // page_size)
+            if heads_bytes_per_token_to_send > (dst_kv_item_len // page_size):
                 logger.error(
                     f"[{mooncake_session_id}] Layer {layer_id}: "
-                    f"slice size ({
-                    f"target
+                    f"slice size ({heads_bytes_per_token_to_send}) exceeds "
+                    f"target token slot size ({dst_kv_item_len // page_size})"
                 )
                 return -1
-
+            layers_params.append(
                 (
                     self.kv_args.kv_data_ptrs[layer_id],
                     dst_kv_ptrs[layer_id],
-
-
-
-
-
+                    src_kv_item_len,
+                    dst_kv_item_len,
+                    src_head_slice_offset,
+                    dst_head_slice_offset,
+                    heads_bytes_per_token_to_send,
                 )
             )

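The rewritten slicing logic splits each token's KV bytes by head range so prefill and decode can run different TP sizes. A worked example of the arithmetic with assumed numbers (8 KV heads in total, 256 bytes per head per token, `page_size=1`; none of these values come from the diff):

```python
# Prefill TP=4 -> decode TP=2: two prefill ranks feed each decode rank,
# and each prefill rank writes its heads into a distinct byte range of
# the decode rank's per-token slot.
local_tp_size, dst_tp_size = 4, 2
page_size = 1
num_kv_heads = 2                      # KV heads held by one prefill rank
src_heads_per_rank = num_kv_heads
dst_heads_per_rank = num_kv_heads * local_tp_size // dst_tp_size   # 4
dst_kv_item_len = dst_heads_per_rank * 256 * page_size             # 1024 bytes/token
bytes_per_head_slice_to_send = dst_kv_item_len // page_size // dst_heads_per_rank  # 256

local_tp_rank_in_group = 1            # second prefill rank in its group

# local_tp_size > dst_tp_size: many prefill ranks -> one decode instance
src_head_start_offset = 0
num_heads_to_send = src_heads_per_rank
dst_head_start_offset = local_tp_rank_in_group * src_heads_per_rank  # 2

src_head_slice_offset = src_head_start_offset * bytes_per_head_slice_to_send    # 0
dst_head_slice_offset = dst_head_start_offset * bytes_per_head_slice_to_send    # 512
heads_bytes_per_token_to_send = num_heads_to_send * bytes_per_head_slice_to_send  # 512

# Sanity check from the diff: the slice must fit in the target token slot.
assert heads_bytes_per_token_to_send <= dst_kv_item_len // page_size
```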
@@ -391,9 +384,9 @@ class MooncakeKVManager(BaseKVManager):
                 dst_ptr,
                 src_item_len,
                 dst_item_len,
-
-
-
+                src_head_slice_offset,
+                dst_head_slice_offset,
+                heads_bytes_per_token_to_send,
             ) = layer_params
             src_addr_list = []
             dst_addr_list = []
@@ -424,17 +417,12 @@ class MooncakeKVManager(BaseKVManager):
                 )

                 # Calculate final src and dst addresses by applying head-slice offsets
-                src_slice_addr = src_token_slot_start_addr +
-                dst_slice_addr = dst_token_slot_start_addr +
+                src_slice_addr = src_token_slot_start_addr + src_head_slice_offset
+                dst_slice_addr = dst_token_slot_start_addr + dst_head_slice_offset

                 src_addr_list.append(src_slice_addr)
                 dst_addr_list.append(dst_slice_addr)
-                length_list.append(
-
-                logger.debug(
-                    f"SYNC: sid={mooncake_session_id}, "
-                    f"src={src_slice_addr}, dst={dst_slice_addr}, len={slice_lens_per_page}"
-                )
+                length_list.append(heads_bytes_per_token_to_send)

             return self.engine.batch_transfer_sync(
                 mooncake_session_id, src_addr_list, dst_addr_list, length_list
@@ -445,7 +433,7 @@ class MooncakeKVManager(BaseKVManager):
                     process_layer_tp_aware,
                     layer_params,
                 )
-                for layer_params in
+                for layer_params in layers_params
             ]

             for future in concurrent.futures.as_completed(futures):
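Each layer's transfer is submitted as an independent task over the `layers_params` list built earlier. A self-contained sketch of the same fan-out/fail-fast pattern (the worker below is a stand-in for `process_layer_tp_aware`):

```python
import concurrent.futures

def transfer_layer(layer_params: tuple) -> int:
    # Stand-in: the real worker issues batch_transfer_sync for one
    # layer and returns a negative status on failure.
    return 0

layers_params = [(f"layer-{i}",) for i in range(4)]
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = [
        executor.submit(transfer_layer, layer_params)
        for layer_params in layers_params
    ]
    for future in concurrent.futures.as_completed(futures):
        if future.result() < 0:
            raise RuntimeError("per-layer KV transfer failed")
```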
@@ -533,12 +521,12 @@ class MooncakeKVManager(BaseKVManager):
                 if len(chunked_dst_kv_indice) < len(
                     kv_chunk.prefill_kv_indices
                 ):
-                    kv_chunk.prefill_kv_indices = kv_chunk.prefill_kv_indices[
-                        : len(chunked_dst_kv_indice)
-                    ]
                     logger.warning(
                         f"len(chunked_dst_kv_indice) = {len(chunked_dst_kv_indice)}, len(kv_chunk.prefill_kv_indices) = {len(kv_chunk.prefill_kv_indices)}"
                     )
+                    kv_chunk.prefill_kv_indices = kv_chunk.prefill_kv_indices[
+                        : len(chunked_dst_kv_indice)
+                    ]

             target_rank_registration_info: KVArgsRegisterInfo = (
                 self.decode_kv_args_table[req.mooncake_session_id]
{sglang-0.4.9.post2 → sglang-0.4.9.post3}/sglang/srt/distributed/parallel_state.py

@@ -1065,8 +1065,23 @@ def init_model_parallel_group(

 _TP: Optional[GroupCoordinator] = None

+# duplicate GroupCoordinator for prefill in PD-Multiplexing
+_PDMUX_PREFILL_TP_GROUP: Optional[GroupCoordinator] = None
+
+_ENABLE_PDMUX_P_TP: bool = False
+
+
+def set_pdmux_status(enable_prefill_multiplexing: bool):
+    global _ENABLE_PDMUX_P_TP
+    _ENABLE_PDMUX_P_TP = enable_prefill_multiplexing
+

 def get_tp_group() -> GroupCoordinator:
+    if _ENABLE_PDMUX_P_TP:
+        assert (
+            _PDMUX_PREFILL_TP_GROUP is not None
+        ), "tensor model parallel group for PD-Multiplexing Prefill is not initialized"
+        return _PDMUX_PREFILL_TP_GROUP
     assert _TP is not None, "tensor model parallel group is not initialized"
     return _TP

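With the module-level toggle above, callers flip `get_tp_group()` between the regular TP group and the duplicated prefill group. A hedged sketch of the intended call pattern (the scheduler logic that decides when to flip is assumed, not shown in this diff; both groups must already be initialized):

```python
from sglang.srt.distributed.parallel_state import get_tp_group, set_pdmux_status

set_pdmux_status(True)        # prefill phase of PD-Multiplexing
prefill_tp = get_tp_group()   # -> _PDMUX_PREFILL_TP_GROUP

set_pdmux_status(False)       # back to the normal path
decode_tp = get_tp_group()    # -> _TP
```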
@@ -1182,6 +1197,7 @@ def initialize_model_parallel(
     tensor_model_parallel_size: int = 1,
     pipeline_model_parallel_size: int = 1,
     backend: Optional[str] = None,
+    duplicate_tp_group: bool = False,
 ) -> None:
     """
     Initialize model parallel groups.
@@ -1239,6 +1255,23 @@
             group_name="tp",
         )

+    if duplicate_tp_group:
+        global _PDMUX_PREFILL_TP_GROUP
+        assert (
+            _PDMUX_PREFILL_TP_GROUP is None
+        ), "tensor model parallel group for PD-Multiplexing Prefill is already initialized"
+        _PDMUX_PREFILL_TP_GROUP = init_model_parallel_group(
+            group_ranks,
+            get_world_group().local_rank,
+            backend,
+            use_message_queue_broadcaster=get_bool_env_var(
+                "SGLANG_USE_MESSAGE_QUEUE_BROADCASTER", "true"
+            ),
+            group_name="pdmux_prefill_tp",
+        )
+        _TP.pynccl_comm.disabled = False
+        _PDMUX_PREFILL_TP_GROUP.pynccl_comm.disabled = False
+
     # Build the pipeline model-parallel groups.
     num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
     global _PP