sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +6 -1
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +8 -7
  6. sglang/srt/disaggregation/decode.py +8 -4
  7. sglang/srt/disaggregation/mooncake/conn.py +43 -25
  8. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  9. sglang/srt/distributed/parallel_state.py +4 -2
  10. sglang/srt/entrypoints/context.py +3 -20
  11. sglang/srt/entrypoints/engine.py +13 -8
  12. sglang/srt/entrypoints/harmony_utils.py +2 -0
  13. sglang/srt/entrypoints/http_server.py +68 -5
  14. sglang/srt/entrypoints/openai/protocol.py +2 -9
  15. sglang/srt/entrypoints/openai/serving_chat.py +60 -265
  16. sglang/srt/entrypoints/openai/serving_completions.py +1 -0
  17. sglang/srt/entrypoints/openai/tool_server.py +4 -3
  18. sglang/srt/function_call/ebnf_composer.py +1 -0
  19. sglang/srt/function_call/function_call_parser.py +2 -0
  20. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  21. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  22. sglang/srt/function_call/kimik2_detector.py +3 -3
  23. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  24. sglang/srt/jinja_template_utils.py +6 -0
  25. sglang/srt/layers/attention/aiter_backend.py +370 -107
  26. sglang/srt/layers/attention/ascend_backend.py +3 -0
  27. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  28. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  29. sglang/srt/layers/attention/flashinfer_backend.py +55 -13
  30. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -0
  31. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  32. sglang/srt/layers/attention/triton_backend.py +24 -27
  33. sglang/srt/layers/attention/trtllm_mha_backend.py +8 -6
  34. sglang/srt/layers/attention/trtllm_mla_backend.py +129 -25
  35. sglang/srt/layers/attention/vision.py +9 -1
  36. sglang/srt/layers/attention/wave_backend.py +627 -0
  37. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  38. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  39. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  40. sglang/srt/layers/communicator.py +11 -13
  41. sglang/srt/layers/dp_attention.py +118 -27
  42. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  43. sglang/srt/layers/linear.py +1 -0
  44. sglang/srt/layers/logits_processor.py +12 -18
  45. sglang/srt/layers/moe/cutlass_moe.py +11 -16
  46. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  47. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  48. sglang/srt/layers/moe/ep_moe/layer.py +60 -2
  49. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
  63. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  64. sglang/srt/layers/moe/topk.py +4 -1
  65. sglang/srt/layers/multimodal.py +156 -40
  66. sglang/srt/layers/quantization/__init__.py +10 -35
  67. sglang/srt/layers/quantization/awq.py +15 -16
  68. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -1
  69. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  70. sglang/srt/layers/quantization/fp8_utils.py +22 -10
  71. sglang/srt/layers/quantization/gptq.py +12 -17
  72. sglang/srt/layers/quantization/marlin_utils.py +15 -5
  73. sglang/srt/layers/quantization/modelopt_quant.py +58 -41
  74. sglang/srt/layers/quantization/mxfp4.py +20 -3
  75. sglang/srt/layers/quantization/utils.py +52 -2
  76. sglang/srt/layers/quantization/w4afp8.py +20 -11
  77. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  78. sglang/srt/layers/rotary_embedding.py +281 -2
  79. sglang/srt/layers/sampler.py +5 -2
  80. sglang/srt/lora/backend/base_backend.py +3 -23
  81. sglang/srt/lora/layers.py +66 -116
  82. sglang/srt/lora/lora.py +17 -62
  83. sglang/srt/lora/lora_manager.py +12 -48
  84. sglang/srt/lora/lora_registry.py +20 -9
  85. sglang/srt/lora/mem_pool.py +20 -63
  86. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  87. sglang/srt/lora/utils.py +25 -58
  88. sglang/srt/managers/cache_controller.py +24 -29
  89. sglang/srt/managers/detokenizer_manager.py +1 -1
  90. sglang/srt/managers/io_struct.py +20 -6
  91. sglang/srt/managers/mm_utils.py +1 -2
  92. sglang/srt/managers/multimodal_processor.py +1 -1
  93. sglang/srt/managers/schedule_batch.py +43 -49
  94. sglang/srt/managers/schedule_policy.py +6 -6
  95. sglang/srt/managers/scheduler.py +18 -11
  96. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  97. sglang/srt/managers/tokenizer_manager.py +53 -44
  98. sglang/srt/mem_cache/allocator.py +39 -214
  99. sglang/srt/mem_cache/allocator_ascend.py +158 -0
  100. sglang/srt/mem_cache/chunk_cache.py +1 -1
  101. sglang/srt/mem_cache/hicache_storage.py +1 -1
  102. sglang/srt/mem_cache/hiradix_cache.py +34 -24
  103. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  104. sglang/srt/mem_cache/memory_pool_host.py +33 -35
  105. sglang/srt/mem_cache/radix_cache.py +2 -5
  106. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  107. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  108. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  109. sglang/srt/model_executor/cuda_graph_runner.py +29 -23
  110. sglang/srt/model_executor/forward_batch_info.py +33 -14
  111. sglang/srt/model_executor/model_runner.py +179 -81
  112. sglang/srt/model_loader/loader.py +18 -6
  113. sglang/srt/models/deepseek_nextn.py +2 -1
  114. sglang/srt/models/deepseek_v2.py +79 -38
  115. sglang/srt/models/gemma2.py +0 -34
  116. sglang/srt/models/gemma3n_mm.py +8 -9
  117. sglang/srt/models/glm4.py +6 -0
  118. sglang/srt/models/glm4_moe.py +11 -11
  119. sglang/srt/models/glm4_moe_nextn.py +2 -1
  120. sglang/srt/models/glm4v.py +589 -0
  121. sglang/srt/models/glm4v_moe.py +400 -0
  122. sglang/srt/models/gpt_oss.py +142 -20
  123. sglang/srt/models/granite.py +0 -25
  124. sglang/srt/models/llama.py +10 -27
  125. sglang/srt/models/llama4.py +19 -6
  126. sglang/srt/models/qwen2.py +2 -2
  127. sglang/srt/models/qwen2_5_vl.py +7 -3
  128. sglang/srt/models/qwen2_audio.py +10 -9
  129. sglang/srt/models/qwen2_moe.py +20 -5
  130. sglang/srt/models/qwen3.py +0 -24
  131. sglang/srt/models/qwen3_classification.py +78 -0
  132. sglang/srt/models/qwen3_moe.py +18 -5
  133. sglang/srt/models/registry.py +1 -1
  134. sglang/srt/models/step3_vl.py +6 -2
  135. sglang/srt/models/torch_native_llama.py +0 -24
  136. sglang/srt/multimodal/processors/base_processor.py +23 -13
  137. sglang/srt/multimodal/processors/glm4v.py +132 -0
  138. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  139. sglang/srt/operations.py +17 -2
  140. sglang/srt/reasoning_parser.py +316 -0
  141. sglang/srt/sampling/sampling_batch_info.py +7 -4
  142. sglang/srt/server_args.py +142 -140
  143. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -21
  144. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
  145. sglang/srt/speculative/eagle_worker.py +16 -0
  146. sglang/srt/two_batch_overlap.py +16 -12
  147. sglang/srt/utils.py +3 -3
  148. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  149. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  150. sglang/test/doc_patch.py +59 -0
  151. sglang/test/few_shot_gsm8k.py +1 -1
  152. sglang/test/few_shot_gsm8k_engine.py +1 -1
  153. sglang/test/run_eval.py +4 -1
  154. sglang/test/simple_eval_common.py +6 -0
  155. sglang/test/simple_eval_gpqa.py +2 -0
  156. sglang/test/test_fp4_moe.py +118 -36
  157. sglang/test/test_marlin_moe.py +1 -1
  158. sglang/test/test_marlin_utils.py +1 -1
  159. sglang/utils.py +1 -1
  160. sglang/version.py +1 -1
  161. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/METADATA +27 -31
  162. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/RECORD +166 -142
  163. sglang/lang/backend/__init__.py +0 -0
  164. sglang/srt/function_call/harmony_tool_parser.py +0 -130
  165. sglang/srt/layers/quantization/scalar_type.py +0 -352
  166. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  167. /sglang/{api.py → lang/api.py} +0 -0
  168. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/WHEEL +0 -0
  169. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/licenses/LICENSE +0 -0
  170. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/rotary_embedding.py CHANGED
@@ -1,6 +1,7 @@
 # Adapted from https://raw.githubusercontent.com/vllm-project/vllm/refs/tags/v0.6.6.post1/vllm/model_executor/layers/rotary_embedding.py
 
 """Rotary Positional Embeddings."""
+import itertools
 import math
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -221,6 +222,7 @@ class RotaryEmbedding(CustomOp):
         query: torch.Tensor,
         key: torch.Tensor,
         offsets: Optional[torch.Tensor] = None,
+        fused_set_kv_buffer_arg=None,  # Optional[FusedSetKVBufferArg]
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         if _is_cuda and (self.head_size in [64, 128, 256, 512]):
             apply_rope_with_cos_sin_cache_inplace(
@@ -230,8 +232,17 @@ class RotaryEmbedding(CustomOp):
                 head_size=self.head_size,
                 cos_sin_cache=self.cos_sin_cache,
                 is_neox=self.is_neox_style,
+                # Compatible with old sgl-kernel
+                **(
+                    dict(fused_set_kv_buffer_arg=fused_set_kv_buffer_arg)
+                    if fused_set_kv_buffer_arg is not None
+                    else {}
+                ),
             )
         else:
+            assert (
+                fused_set_kv_buffer_arg is None
+            ), "save kv cache is not supported for vllm_rotary_embedding."
             self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype)
             self.vllm_rotary_embedding(
                 positions,
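The `**( ... )` expansion above forwards `fused_set_kv_buffer_arg` only when it is set, so the call keeps working against older sgl-kernel builds whose `apply_rope_with_cos_sin_cache_inplace` does not accept that keyword. A minimal sketch of the same conditional-kwargs pattern, using hypothetical `legacy_kernel`/`new_kernel` stand-ins rather than the real sgl-kernel API:

# Illustrative sketch only; `legacy_kernel` / `new_kernel` are made-up stand-ins.
def legacy_kernel(q, k):  # older build: no fused_set_kv_buffer_arg parameter
    return q, k

def new_kernel(q, k, fused_set_kv_buffer_arg=None):  # newer build accepts it
    return q, k

def call_rope(kernel, q, k, fused_set_kv_buffer_arg=None):
    # Only pass the kwarg when it is actually needed, so the legacy
    # signature never sees an unexpected keyword argument.
    return kernel(
        q,
        k,
        **(
            dict(fused_set_kv_buffer_arg=fused_set_kv_buffer_arg)
            if fused_set_kv_buffer_arg is not None
            else {}
        ),
    )

call_rope(legacy_kernel, 1, 2)                                  # works on the old signature
call_rope(new_kernel, 1, 2, fused_set_kv_buffer_arg=object())   # uses the new path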
@@ -679,7 +690,7 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         )
 
         # Re-dispatch
-        if _is_hip or _is_npu:
+        if _is_hip:
             self._forward_method = self.forward_native
 
     def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
@@ -764,6 +775,46 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
             key = key_rot
         return query.to(dtype), key.to(dtype)
 
+    def forward_npu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # NOTE: now npu_mrope can only support `numQHeads*headSize <= 4096` pattern,
+        # and generalization to more scenarios will be supported in the future.
+        if query.shape[1] * query.shape[2] > 4096:
+            return self.forward_native(positions, query, key, offsets)
+        num_tokens = query.shape[0]
+        rotary_mode = "half" if self.is_neox_style else "interleave"
+        self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(positions.device)
+        query_rot = query[..., : self.rotary_dim]
+        key_rot = key[..., : self.rotary_dim]
+        if self.rotary_dim < self.head_size:
+            query_pass = query[..., self.rotary_dim :]
+            key_pass = key[..., self.rotary_dim :]
+
+        query_rot, key_rot = torch_npu.npu_mrope(
+            torch.add(positions, offsets) if offsets is not None else positions,
+            query_rot.reshape(num_tokens, -1),
+            key_rot.reshape(num_tokens, -1),
+            self.cos_sin_cache,
+            self.rotary_dim,
+            mrope_section=[0, 0, 0],
+            rotary_mode=rotary_mode,
+        )
+        query_rot = query_rot.reshape(num_tokens, -1, self.rotary_dim)
+        key_rot = key_rot.reshape(num_tokens, -1, self.rotary_dim)
+
+        if self.rotary_dim < self.head_size:
+            query = torch.cat((query_rot, query_pass), dim=-1)
+            key = torch.cat((key_rot, key_pass), dim=-1)
+        else:
+            query = query_rot
+            key = key_rot
+        return query, key
+
     def forward_cpu(
         self,
         positions: torch.Tensor,
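In the new `forward_npu` above, `query` is laid out as (num_tokens, num_q_heads, head_size), so `query.shape[1] * query.shape[2]` is the `numQHeads*headSize` product mentioned in the NOTE; anything above 4096 falls back to `forward_native`. A small sketch of that dispatch check with illustrative shapes only:

# Illustrative only: the npu_mrope fast path is taken when
# num_q_heads * head_size <= 4096; otherwise forward_native is used.
import torch

def picks_npu_mrope(query: torch.Tensor) -> bool:
    # query is (num_tokens, num_q_heads, head_size)
    return query.shape[1] * query.shape[2] <= 4096

print(picks_npu_mrope(torch.empty(8, 32, 128)))  # 32 * 128 = 4096 -> True
print(picks_npu_mrope(torch.empty(8, 64, 128)))  # 64 * 128 = 8192 -> False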
@@ -946,7 +997,37 @@ class MRotaryEmbedding(RotaryEmbedding):
 
         self.mrope_section = mrope_section
         if self.mrope_section:
-            assert sum(self.mrope_section) == rotary_dim // 2
+            expected_sum = rotary_dim // 2
+            actual_sum = sum(self.mrope_section)
+            if actual_sum != expected_sum:
+                print(
+                    f"MRoPE section sum mismatch: expected {expected_sum}, got {actual_sum}. "
+                    f"Adjusting mrope_section to match rotary_dim // 2 = {expected_sum}"
+                )
+                # Auto-correct by scaling the mrope_section proportionally
+                if actual_sum > 0:
+                    scale_factor = expected_sum / actual_sum
+                    self.mrope_section = [
+                        max(1, int(section * scale_factor))
+                        for section in self.mrope_section
+                    ]
+                    # Ensure the sum exactly matches by adjusting the last element
+                    current_sum = sum(self.mrope_section)
+                    if current_sum != expected_sum:
+                        self.mrope_section[-1] += expected_sum - current_sum
+                else:
+                    # If all sections are 0, create a default distribution
+                    self.mrope_section = [
+                        expected_sum // len(self.mrope_section)
+                    ] * len(self.mrope_section)
+                    # Handle remainder
+                    remainder = expected_sum % len(self.mrope_section)
+                    for i in range(remainder):
+                        self.mrope_section[i] += 1
+
+                print(
+                    f"Corrected mrope_section: {self.mrope_section} (sum={sum(self.mrope_section)})"
+                )
 
     def forward(
         self,
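The hard assert is replaced above by proportional rescaling of `mrope_section`. A standalone sketch of the same correction logic (an illustrative re-implementation, not the `MRotaryEmbedding` class itself) makes the behavior easy to trace by hand:

# Standalone sketch of the mrope_section auto-correction introduced above.
def correct_mrope_section(mrope_section, rotary_dim):
    expected_sum = rotary_dim // 2
    actual_sum = sum(mrope_section)
    if actual_sum == expected_sum:
        return list(mrope_section)
    if actual_sum > 0:
        scale = expected_sum / actual_sum
        section = [max(1, int(s * scale)) for s in mrope_section]
        section[-1] += expected_sum - sum(section)  # force an exact match
    else:
        n = len(mrope_section)
        section = [expected_sum // n] * n
        for i in range(expected_sum % n):  # distribute the remainder
            section[i] += 1
    return section

# rotary_dim = 128 -> the three sections must sum to 64
print(correct_mrope_section([22, 22, 22], 128))  # [21, 21, 22], sum 64
print(correct_mrope_section([16, 24, 24], 128))  # already consistent: unchanged
print(correct_mrope_section([0, 0, 0], 128))     # degenerate input -> [22, 21, 21]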
@@ -1153,6 +1234,204 @@ class MRotaryEmbedding(RotaryEmbedding):
             mrope_position_deltas = max_position_ids + 1 - s
         return position_ids, mrope_position_deltas
 
+    # Adapted from https://github.com/vllm-project/vllm/blob/3779eb8c81449b924a23457fc77e45a0e6171178/vllm/model_executor/layers/rotary_embedding.py#L1120
+    @staticmethod
+    def get_rope_index_glm4v(
+        input_ids: torch.Tensor,
+        hf_config: Any,
+        image_grid_thw: Union[list[list[int]], torch.Tensor],
+        video_grid_thw: Union[list[list[int]], torch.Tensor],
+        attention_mask: torch.Tensor,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Get mrope input positions and delta value for GLM4V."""
+        image_token_id = hf_config.image_token_id
+        video_start_token_id = hf_config.video_start_token_id
+        video_end_token_id = hf_config.video_end_token_id
+        spatial_merge_size = hf_config.vision_config.spatial_merge_size
+
+        mrope_position_deltas = []
+        if input_ids is not None and (
+            image_grid_thw is not None or video_grid_thw is not None
+        ):
+            total_input_ids = input_ids
+            if attention_mask is None:
+                attention_mask = torch.ones_like(total_input_ids)
+            position_ids = torch.ones(
+                3,
+                input_ids.shape[0],
+                input_ids.shape[1],
+                dtype=input_ids.dtype,
+                device=input_ids.device,
+            )
+            image_index, video_index = 0, 0
+            video_group_index = 0
+            attention_mask = attention_mask.to(total_input_ids.device)
+            for i, input_ids in enumerate(total_input_ids):
+                input_ids = input_ids[attention_mask[i] == 1]
+                input_tokens = input_ids.tolist()
+
+                input_token_type = []
+                video_check_flg = False
+                for token in input_tokens:
+                    if token == video_start_token_id:
+                        video_check_flg = True
+                    elif token == video_end_token_id:
+                        video_check_flg = False
+
+                    if token == image_token_id and not video_check_flg:
+                        input_token_type.append("image")
+                    elif token == image_token_id and video_check_flg:
+                        input_token_type.append("video")
+                    else:
+                        input_token_type.append("text")
+
+                input_type_group = []
+                for key, group in itertools.groupby(
+                    enumerate(input_token_type), lambda x: x[1]
+                ):
+                    group = list(group)
+                    start_index = group[0][0]
+                    end_index = group[-1][0] + 1
+                    input_type_group.append((key, start_index, end_index))
+
+                llm_pos_ids_list = []
+                video_frame_num = 1
+                for modality_type, start_idx, end_idx in input_type_group:
+                    st_idx = (
+                        llm_pos_ids_list[-1].max() + 1
+                        if len(llm_pos_ids_list) > 0
+                        else 0
+                    )
+
+                    if modality_type == "image":
+                        t, h, w = (
+                            image_grid_thw[image_index][0],
+                            image_grid_thw[image_index][1],
+                            image_grid_thw[image_index][2],
+                        )
+                        llm_grid_t, llm_grid_h, llm_grid_w = (
+                            t.item(),
+                            h.item() // spatial_merge_size,
+                            w.item() // spatial_merge_size,
+                        )
+
+                        t_index = (
+                            torch.arange(llm_grid_t)
+                            .view(-1, 1)
+                            .expand(-1, llm_grid_h * llm_grid_w)
+                            .flatten()
+                        )
+                        h_index = (
+                            torch.arange(llm_grid_h)
+                            .view(1, -1, 1)
+                            .expand(llm_grid_t, -1, llm_grid_w)
+                            .flatten()
+                        )
+                        w_index = (
+                            torch.arange(llm_grid_w)
+                            .view(1, 1, -1)
+                            .expand(llm_grid_t, llm_grid_h, -1)
+                            .flatten()
+                        )
+                        llm_pos_ids_list.append(
+                            torch.stack([t_index, h_index, w_index]) + st_idx
+                        )
+
+                        image_index += 1
+                        video_frame_num = 1
+
+                    elif modality_type == "video":
+                        t, h, w = (
+                            video_frame_num,
+                            video_grid_thw[video_index][1],
+                            video_grid_thw[video_index][2],
+                        )
+
+                        llm_grid_t, llm_grid_h, llm_grid_w = (
+                            t,
+                            h.item() // spatial_merge_size,
+                            w.item() // spatial_merge_size,
+                        )
+
+                        for t_idx in range(llm_grid_t):
+                            t_index = (
+                                torch.tensor(t_idx)
+                                .view(-1, 1)
+                                .expand(-1, llm_grid_h * llm_grid_w)
+                                .flatten()
+                            )
+
+                            h_index = (
+                                torch.arange(llm_grid_h)
+                                .view(1, -1, 1)
+                                .expand(1, -1, llm_grid_w)
+                                .flatten()
+                            )
+                            w_index = (
+                                torch.arange(llm_grid_w)
+                                .view(1, 1, -1)
+                                .expand(1, llm_grid_h, -1)
+                                .flatten()
+                            )
+                            llm_pos_ids_list.append(
+                                torch.stack([t_index, h_index, w_index]) + st_idx
+                            )
+
+                        video_group_index += 1
+
+                        if video_group_index >= video_grid_thw[video_index][0]:
+                            video_index += 1
+                            video_group_index = 0
+
+                        video_frame_num += 1
+
+                    else:
+                        text_len = end_idx - start_idx
+                        llm_pos_ids_list.append(
+                            torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx
+                        )
+
+                        video_frame_num = 1
+
+                llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
+                position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(
+                    position_ids.device
+                )
+                mrope_position_deltas.append(
+                    llm_positions.max() + 1 - len(total_input_ids[i])
+                )
+            mrope_position_deltas = torch.tensor(
+                mrope_position_deltas, device=input_ids.device
+            ).unsqueeze(1)
+            return position_ids, mrope_position_deltas
+        else:
+            if attention_mask is not None:
+                position_ids = attention_mask.long().cumsum(-1) - 1
+                position_ids.masked_fill_(attention_mask == 0, 1)
+                position_ids = (
+                    position_ids.unsqueeze(0)
+                    .expand(3, -1, -1)
+                    .to(attention_mask.device)
+                )
+                max_position_ids = position_ids.max(0, keepdim=False)[0].max(
+                    -1, keepdim=True
+                )[0]
+                mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1]
+            else:
+                position_ids = (
+                    torch.arange(input_ids.shape[1], device=input_ids.device)
+                    .view(1, 1, -1)
+                    .expand(3, input_ids.shape[0], -1)
+                )
+                mrope_position_deltas = torch.zeros(
+                    [input_ids.shape[0], 1],
+                    device=input_ids.device,
+                    dtype=input_ids.dtype,
+                )
+
+        return position_ids, mrope_position_deltas
+
     @staticmethod
     def get_next_input_positions(
         mrope_position_delta: int,
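In the image branch of `get_rope_index_glm4v` above, the t/h/w index triples enumerate the merged patch grid of each image. A small illustrative trace for a single image with `grid_thw = (1, 4, 4)` and `spatial_merge_size = 2` (values chosen for the example, not taken from the diff):

# Illustrative trace of the image-branch position construction above:
# grid_thw (1, 4, 4) with spatial_merge_size 2 collapses to a 1 x 2 x 2 llm grid.
import torch

llm_grid_t, llm_grid_h, llm_grid_w = 1, 4 // 2, 4 // 2
t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten()
h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
print(torch.stack([t_index, h_index, w_index]))
# tensor([[0, 0, 0, 0],
#         [0, 0, 1, 1],
#         [0, 1, 0, 1]])
# A preceding text span of length 3 would shift all four columns by st_idx = 3.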
sglang/srt/layers/sampler.py CHANGED
@@ -6,7 +6,10 @@ import torch.distributed as dist
 from torch import nn
 
 from sglang.srt.distributed import get_tp_group
-from sglang.srt.layers.dp_attention import get_attention_tp_group
+from sglang.srt.layers.dp_attention import (
+    get_attention_tp_group,
+    is_dp_attention_enabled,
+)
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
@@ -32,7 +35,7 @@ class Sampler(nn.Module):
         self.use_nan_detection = global_server_args_dict["enable_nan_detection"]
         self.tp_sync_group = get_tp_group().device_group
 
-        if global_server_args_dict["enable_dp_attention"]:
+        if is_dp_attention_enabled():
             self.tp_sync_group = get_attention_tp_group().device_group
 
     def forward(
sglang/srt/lora/backend/base_backend.py CHANGED
@@ -5,22 +5,6 @@ import torch
 from sglang.srt.lora.utils import LoRABatchInfo
 
 
-def get_fuse_output_add_from_name(name: str) -> bool:
-    mapping = {
-        "triton": True,
-        "flashinfer": False,
-    }
-    return mapping.get(name, False)
-
-
-def get_fuse_stacked_lora_b_from_name(name: str) -> bool:
-    mapping = {
-        "triton": True,
-        "flashinfer": False,
-    }
-    return mapping.get(name, False)
-
-
 class BaseLoRABackend:
     """Base class for different Lora backends.
     Each backend has its own implementation of Lora kernels.
@@ -28,15 +12,11 @@ class BaseLoRABackend:
     Args:
         name: name of backend
         batch_info: information of current batch for use
-        fuse_output_add: if set to True, the output buffer for storing result will be passed in when doing lora_b forward,
-            and the operation of adding will be fused into kernel
     """
 
     def __init__(self, name: str, batch_info: LoRABatchInfo = None):
         self.name = name
         self.batch_info = batch_info
-        self.fuse_output_add = get_fuse_output_add_from_name(name)
-        self.fuse_stacked_lora_b = get_fuse_stacked_lora_b_from_name(name)
 
     def run_lora_a_sgemm(
         self, x: torch.Tensor, weights: torch.Tensor, *args, **kwargs
@@ -126,8 +106,8 @@ def get_backend_from_name(name: str) -> BaseLoRABackend:
 
         return TritonLoRABackend
     elif name == "flashinfer":
-        from sglang.srt.lora.backend.flashinfer_backend import FlashInferLoRABackend
-
-        return FlashInferLoRABackend
+        raise ValueError(
+            "FlashInfer LoRA backend has been deprecated, please use `triton` instead."
+        )
     else:
         raise ValueError(f"Invalid backend: {name}")
sglang/srt/lora/layers.py CHANGED
@@ -1,5 +1,3 @@
-from typing import List, Tuple
-
 import torch
 from torch import nn
 
@@ -79,18 +77,13 @@ class ColumnParallelLinearWithLoRA(BaseLayerWithLoRA):
         self.B_buffer = B_buffer
 
     def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
-        backend_kwargs = {"base_output": base_output}
         lora_a_output = self.lora_backend.run_lora_a_sgemm(x, self.A_buffer)
         lora_output = self.lora_backend.run_lora_b_sgemm(
-            lora_a_output,
-            self.B_buffer[0],
-            **backend_kwargs,
-        )
-        return (
-            lora_output
-            if self.lora_backend.fuse_output_add
-            else base_output + lora_output
+            x=lora_a_output,
+            weights=self.B_buffer,
+            base_output=base_output,
         )
+        return lora_output
 
     def forward(self, input_: torch.Tensor):
         # duplicate the logic in ColumnParallelLinear
@@ -135,37 +128,16 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
     ):
         self.set_lora = True
         self.A_buffer_gate_up = A_buffer
-        if self.lora_backend.fuse_stacked_lora_b:
-            # B_buffer_gate_up: (num_lora, 2 * output_dim, r)
-            if getattr(self, "B_buffer_gate_up", None) is None:
-                self.B_buffer_gate_up = torch.empty(
-                    (
-                        B_buffer[0].shape[0],
-                        2 * B_buffer[0].shape[1],
-                        B_buffer[0].shape[2],
-                    ),
-                    dtype=B_buffer[0].dtype,
-                    device=B_buffer[0].device,
-                )
-            self.B_buffer_gate_up[:, : B_buffer[0].shape[1], :].copy_(B_buffer[0])
-            self.B_buffer_gate_up[:, B_buffer[0].shape[1] :, :].copy_(B_buffer[1])
-        else:
-            self.B_buffer_gate_up = (B_buffer[0], B_buffer[1])
+        self.B_buffer_gate_up = B_buffer
 
     def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
-        backend_kwargs = {"base_output": base_output}
-
         lora_output = self.lora_backend.run_gate_up_lora(
-            x,
-            self.A_buffer_gate_up,
-            self.B_buffer_gate_up,
-            **backend_kwargs,
-        )
-        return (
-            lora_output
-            if self.lora_backend.fuse_output_add
-            else base_output + lora_output
+            x=x,
+            gate_up_lora_a=self.A_buffer_gate_up,
+            gate_up_lora_b=self.B_buffer_gate_up,
+            base_output=base_output,
         )
+        return lora_output
 
     def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
         return A
@@ -173,9 +145,16 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
     def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int):
         # Since the outputs for both gate and up are identical, we use a random one.
         shard_size = self.base_layer.output_partition_sizes[0]
+        gate_size = self.base_layer.output_sizes[0]
         start_idx = tp_rank * shard_size
         end_idx = (tp_rank + 1) * shard_size
-        return B[:, start_idx:end_idx, :]
+        return torch.concat(
+            (
+                B[start_idx:end_idx, :],
+                B[gate_size + start_idx : gate_size + end_idx],
+            ),
+            dim=0,
+        )
 
 
 class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
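The new `slice_lora_b_weights` above assumes the gate and up lora_b weights are stacked along the output dimension, i.e. `B` has shape (gate_size + up_size, r), and each tensor-parallel rank takes its gate shard followed by its up shard. A small numeric sketch with made-up sizes:

# Illustrative trace of the stacked gate_up lora_b slicing above:
# gate_size = up_size = 4, rank r = 2, tensor parallel size 2.
import torch

gate_size, r, tp_size = 4, 2, 2
B = torch.arange((2 * gate_size) * r).reshape(2 * gate_size, r)  # rows 0-3 gate, 4-7 up
shard_size = gate_size // tp_size

def slice_gate_up(B, tp_rank):
    start, end = tp_rank * shard_size, (tp_rank + 1) * shard_size
    return torch.concat(
        (B[start:end, :], B[gate_size + start : gate_size + end]), dim=0
    )

print(slice_gate_up(B, 0))  # gate rows 0-1 followed by up rows 4-5
print(slice_gate_up(B, 1))  # gate rows 2-3 followed by up rows 6-7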
@@ -185,86 +164,46 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
         lora_backend: BaseLoRABackend,
     ) -> None:
         super().__init__(base_layer, lora_backend)
+        q_proj_shard_size = self.base_layer.q_proj_shard_size
+        kv_proj_shard_size = self.base_layer.kv_proj_shard_size
+        self.output_offset = torch.tensor(
+            [
+                0,
+                q_proj_shard_size,
+                q_proj_shard_size + kv_proj_shard_size,
+                q_proj_shard_size + 2 * kv_proj_shard_size,
+            ],
+            dtype=torch.int32,
+            device=next(self.base_layer.parameters()).device,
+        )
+
+        # For computing number of launched blocks
+        self.max_qkv_out_dim = max(q_proj_shard_size, kv_proj_shard_size)
 
     def set_lora_info(
         self,
         A_buffer_qkv: torch.Tensor,
-        B_buffer_q: torch.Tensor,
-        B_buffer_kv: torch.Tensor,
+        B_buffer_qkv: torch.Tensor,
     ):
         self.set_lora = True
         self.A_buffer_qkv = A_buffer_qkv
-
-        if self.lora_backend.fuse_stacked_lora_b:
-            assert (
-                B_buffer_q.shape[-1] == B_buffer_kv.shape[-1]
-            ), "The lora rank of q and kv should be the same when enabling fusion of qkv lora_b"
-            output_dim_q, output_dim_kv = B_buffer_q.shape[-2], B_buffer_kv.shape[-2]
-
-            # B_buffer_qkv: (num_lora, output_dim_q + 2 * output_dim_kv, r)
-            if getattr(self, "B_buffer_qkv", None) is None:
-                self.B_buffer_qkv = torch.empty(
-                    (
-                        B_buffer_q[0].shape[0],
-                        output_dim_q + 2 * output_dim_kv,
-                        B_buffer_q[0].shape[2],
-                    ),
-                    dtype=B_buffer_q[0].dtype,
-                    device=B_buffer_q[0].device,
-                )
-            self.B_buffer_qkv[:, :output_dim_q, :].copy_(B_buffer_q[0])
-            self.B_buffer_qkv[:, output_dim_q : output_dim_q + output_dim_kv, :].copy_(
-                B_buffer_kv[0]
-            )
-            self.B_buffer_qkv[:, output_dim_q + output_dim_kv :, :].copy_(
-                B_buffer_kv[1]
-            )
-
-            # Offsets of q/k/v in output dimension
-            if getattr(self, "output_offset", None) is None:
-                self.output_offset = torch.tensor(
-                    [
-                        0,
-                        output_dim_q,
-                        output_dim_q + output_dim_kv,
-                        output_dim_q + 2 * output_dim_kv,
-                    ],
-                    dtype=torch.int32,
-                    device=B_buffer_q.device,
-                )
-            # For computing number of launched blocks
-            self.max_qkv_out_dim = max(output_dim_q, output_dim_kv)
-        else:
-            self.B_buffer_qkv = (
-                B_buffer_q,
-                B_buffer_kv,
-            )
+        self.B_buffer_qkv = B_buffer_qkv
 
     def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
-        backend_kwargs = {"base_output": base_output}
-        if self.lora_backend.fuse_stacked_lora_b:
-            backend_kwargs["output_offset"] = self.output_offset
-            backend_kwargs["max_qkv_out_dim"] = self.max_qkv_out_dim
-
         lora_output = self.lora_backend.run_qkv_lora(
-            x,
-            self.A_buffer_qkv,
-            self.B_buffer_qkv,
-            **backend_kwargs,
-        )
-        return (
-            lora_output
-            if self.lora_backend.fuse_output_add
-            else base_output + lora_output
+            x=x,
+            qkv_lora_a=self.A_buffer_qkv,
+            qkv_lora_b=self.B_buffer_qkv,
+            base_output=base_output,
+            output_offset=self.output_offset,
+            max_qkv_out_dim=self.max_qkv_out_dim,
         )
+        return lora_output
 
     def slice_lora_a_weights(self, A: torch.Tensor, tp_rank: int):
         return A
 
-    def slice_lora_b_weights(
-        self, B: List[torch.Tensor], tp_rank: int
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        B_q, B_kv = B
+    def slice_lora_b_weights(self, B: torch.Tensor, tp_rank: int) -> torch.Tensor:
         base_layer = self.base_layer
         q_proj_shard_size = base_layer.q_proj_shard_size
         kv_proj_shard_size = base_layer.kv_proj_shard_size
@@ -277,7 +216,19 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
         kv_start_idx = kv_proj_shard_size * kv_shard_id
         kv_end_idx = kv_start_idx + kv_proj_shard_size
 
-        return B_q[q_start_idx:q_end_idx, :], B_kv[:, kv_start_idx:kv_end_idx, :]
+        q_size, k_size, _ = base_layer.output_sizes
+        B_q_shard = B[q_start_idx:q_end_idx, :]
+        B_k_shard = B[q_size + kv_start_idx : q_size + kv_end_idx, :]
+        B_v_shard = B[q_size + k_size + kv_start_idx : q_size + k_size + kv_end_idx, :]
+
+        return torch.concat(
+            (
+                B_q_shard,
+                B_k_shard,
+                B_v_shard,
+            ),
+            dim=0,
+        )
 
 
 class RowParallelLinearWithLoRA(BaseLayerWithLoRA):
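The QKV variant uses the same stacked layout: lora_b rows are ordered q, then k, then v, and the precomputed `output_offset` marks where each projection starts in the fused output while `max_qkv_out_dim` sizes the kernel launch grid. An illustrative computation with made-up per-rank shard sizes:

# Illustrative output_offset for the stacked QKV lora_b layout above
# (hypothetical per-rank sizes: q = 1024, k = v = 256).
import torch

q_proj_shard_size, kv_proj_shard_size = 1024, 256
output_offset = torch.tensor(
    [
        0,
        q_proj_shard_size,
        q_proj_shard_size + kv_proj_shard_size,
        q_proj_shard_size + 2 * kv_proj_shard_size,
    ],
    dtype=torch.int32,
)
print(output_offset)  # tensor([   0, 1024, 1280, 1536], dtype=torch.int32)
max_qkv_out_dim = max(q_proj_shard_size, kv_proj_shard_size)  # 1024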
@@ -294,20 +245,15 @@ class RowParallelLinearWithLoRA(BaseLayerWithLoRA):
         self.B_buffer = B_buffer
 
     def apply_lora(self, base_output: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
-        backend_kwargs = {"base_output": base_output}
         lora_a_output = self.lora_backend.run_lora_a_sgemm(x, self.A_buffer)
         lora_output = self.lora_backend.run_lora_b_sgemm(
-            lora_a_output,
-            self.B_buffer[0],
-            **backend_kwargs,
-        )
-        return (
-            lora_output
-            if self.lora_backend.fuse_output_add
-            else base_output + lora_output
+            x=lora_a_output,
+            weights=self.B_buffer,
+            base_output=base_output,
         )
+        return lora_output
 
-    def forward(self, input_: torch.Tensor):
+    def forward(self, input_: torch.Tensor, skip_all_reduce=False):
         # duplicate the logic in RowParallelLinear
         if self.base_layer.input_is_parallel:
             input_parallel = input_
@@ -324,7 +270,11 @@
         if self.set_lora:
             output_parallel = self.apply_lora(output_parallel, input_parallel)
 
-        if self.base_layer.reduce_results and self.base_layer.tp_size > 1:
+        if (
+            self.base_layer.reduce_results
+            and self.base_layer.tp_size > 1
+            and not skip_all_reduce
+        ):
             output_ = tensor_model_parallel_all_reduce(output_parallel)
         else:
             output_ = output_parallel