sglang 0.4.6__py3-none-any.whl → 0.4.6.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. sglang/bench_one_batch.py +2 -0
  2. sglang/check_env.py +3 -3
  3. sglang/srt/configs/__init__.py +4 -0
  4. sglang/srt/configs/kimi_vl.py +38 -0
  5. sglang/srt/configs/kimi_vl_moonvit.py +32 -0
  6. sglang/srt/configs/model_config.py +15 -0
  7. sglang/srt/conversation.py +122 -1
  8. sglang/srt/disaggregation/decode.py +8 -2
  9. sglang/srt/disaggregation/fake/__init__.py +1 -0
  10. sglang/srt/disaggregation/fake/conn.py +88 -0
  11. sglang/srt/disaggregation/prefill.py +12 -3
  12. sglang/srt/disaggregation/utils.py +16 -2
  13. sglang/srt/entrypoints/engine.py +52 -21
  14. sglang/srt/entrypoints/http_server.py +27 -2
  15. sglang/srt/function_call_parser.py +97 -0
  16. sglang/srt/hf_transformers_utils.py +2 -0
  17. sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
  18. sglang/srt/layers/attention/flashinfer_backend.py +107 -82
  19. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -16
  20. sglang/srt/layers/attention/flashmla_backend.py +3 -0
  21. sglang/srt/layers/attention/utils.py +1 -1
  22. sglang/srt/layers/dp_attention.py +5 -2
  23. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +1 -3
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  38. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
  39. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
  40. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +10 -8
  41. sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
  42. sglang/srt/layers/quantization/__init__.py +2 -2
  43. sglang/srt/layers/quantization/deep_gemm.py +1 -1
  44. sglang/srt/layers/quantization/fp8.py +20 -22
  45. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  46. sglang/srt/layers/utils.py +35 -0
  47. sglang/srt/lora/layers.py +35 -9
  48. sglang/srt/lora/lora_manager.py +84 -35
  49. sglang/srt/managers/data_parallel_controller.py +52 -34
  50. sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
  51. sglang/srt/managers/schedule_batch.py +34 -15
  52. sglang/srt/managers/scheduler.py +273 -67
  53. sglang/srt/managers/scheduler_output_processor_mixin.py +26 -10
  54. sglang/srt/managers/tp_worker.py +52 -17
  55. sglang/srt/managers/tp_worker_overlap_thread.py +18 -7
  56. sglang/srt/mem_cache/memory_pool.py +70 -36
  57. sglang/srt/model_executor/cuda_graph_runner.py +82 -19
  58. sglang/srt/model_executor/forward_batch_info.py +31 -1
  59. sglang/srt/model_executor/model_runner.py +123 -58
  60. sglang/srt/models/deepseek_nextn.py +1 -257
  61. sglang/srt/models/deepseek_v2.py +78 -18
  62. sglang/srt/models/kimi_vl.py +308 -0
  63. sglang/srt/models/kimi_vl_moonvit.py +639 -0
  64. sglang/srt/models/llama.py +92 -30
  65. sglang/srt/models/llama4.py +2 -1
  66. sglang/srt/models/llama_eagle.py +4 -1
  67. sglang/srt/models/llama_eagle3.py +4 -1
  68. sglang/srt/models/qwen2_moe.py +8 -3
  69. sglang/srt/models/qwen2_vl.py +0 -12
  70. sglang/srt/models/qwen3_moe.py +8 -3
  71. sglang/srt/openai_api/adapter.py +49 -8
  72. sglang/srt/openai_api/protocol.py +13 -1
  73. sglang/srt/reasoning_parser.py +25 -1
  74. sglang/srt/server_args.py +83 -24
  75. sglang/srt/speculative/eagle_worker.py +3 -2
  76. sglang/srt/utils.py +91 -9
  77. sglang/test/runners.py +4 -0
  78. sglang/test/send_one.py +84 -28
  79. sglang/test/test_utils.py +67 -0
  80. sglang/version.py +1 -1
  81. {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/METADATA +5 -4
  82. {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/RECORD +85 -60
  83. {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/WHEEL +1 -1
  84. {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/licenses/LICENSE +0 -0
  85. {sglang-0.4.6.dist-info → sglang-0.4.6.post2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/fp8.py CHANGED
@@ -72,8 +72,8 @@ _is_hip = is_hip()
 _is_cuda = is_cuda()

 if _is_hip:
-    from aiter import ActivationType
-    from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages, ck_moe_2stages_win4
+    from aiter import ActivationType, QuantType
+    from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages
     from aiter.ops.shuffle import shuffle_weight

 if not _is_cuda:
@@ -484,7 +484,7 @@ class Fp8MoEMethod:
         if self.quant_config.is_checkpoint_fp8_serialized:
             params_dtype = (
                 torch.uint32
-                if get_bool_env_var("USE_INT4_WEIGHT")
+                if get_bool_env_var("SGLANG_INT4_WEIGHT")
                 else torch.float8_e4m3fn
             )
         tp_size = get_tensor_model_parallel_world_size()
@@ -511,7 +511,7 @@ class Fp8MoEMethod:
         )

         # WEIGHTS
-        if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"):
+        if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
             # INT4 MoE weight - INT32 packed
             w13_weight = torch.nn.Parameter(
                 torch.empty(
@@ -585,7 +585,7 @@ class Fp8MoEMethod:

         if (
             _is_hip
-        ):  # and get_bool_env_var("CK_MOE"): TODO: add check back after triton kernel
+        ):  # and get_bool_env_var("SGLANG_AITER_MOE"): TODO: add check back after triton kernel
             # ROCm - using column scaling, duplicate scaling numbers in case per tensor scaling
             w13_weight_scale1 = torch.nn.Parameter(
                 torch.ones(num_experts, 2 * intermediate_size, dtype=torch.float32),
@@ -612,7 +612,7 @@ class Fp8MoEMethod:
         set_weight_attrs(w13_weight_scale, extra_weight_attrs)
         set_weight_attrs(w2_weight_scale, extra_weight_attrs)

-        if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"):
+        if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
             extra_weight_attrs.update(
                 {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
             )
@@ -644,7 +644,7 @@ class Fp8MoEMethod:
         layer.w2_input_scale = None

     def process_weights_after_loading(self, layer: Module) -> None:
-        if _is_hip and get_bool_env_var("USE_INT4_WEIGHT"):
+        if _is_hip and get_bool_env_var("SGLANG_INT4_WEIGHT"):
             self.process_weights_hip_int4(layer)
             return

@@ -675,7 +675,7 @@ class Fp8MoEMethod:
             )
             layer.w2_input_scale = None

-            if get_bool_env_var("CK_MOE"):
+            if get_bool_env_var("SGLANG_AITER_MOE"):
                 # Pre-shuffle weights
                 layer.w13_weight.data = shuffle_weight(
                     layer.w13_weight.contiguous(), (16, 16)
@@ -798,17 +798,15 @@ class Fp8MoEMethod:
         return

     def process_weights_hip_int4(self, layer: Module):
-        # TODO: and get_bool_env_var("CK_MOE"): add after triton kernel added
+        # TODO: and get_bool_env_var("SGLANG_AITER_MOE"): add after triton kernel added
         # INT4-FP8 (INT4 MoE Weight, FP8 Compute)
         # Weight Permutation
         layer.w13_weight = torch.nn.Parameter(
-            # permute_weight(layer.w13_weight.data),
             shuffle_weight(layer.w13_weight.data, (16, 16)),
             requires_grad=False,
         )
         torch.cuda.empty_cache()
         layer.w2_weight = torch.nn.Parameter(
-            # permute_weight(layer.w2_weight.data),
             shuffle_weight(layer.w2_weight.data, (16, 16)),
             requires_grad=False,
         )
@@ -847,23 +845,21 @@ class Fp8MoEMethod:
                 padding_size,  # Avoid circular import
             )

-            if get_bool_env_var("CK_MOE"):
+            if get_bool_env_var("SGLANG_AITER_MOE"):
                 layer.w13_weight = torch.nn.Parameter(
-                    # permute_weight(layer.w13_weight.data),
                     shuffle_weight(layer.w13_weight.data, (16, 16)),
                     requires_grad=False,
                 )
                 torch.cuda.empty_cache()
                 layer.w2_weight = torch.nn.Parameter(
-                    # permute_weight(layer.w2_weight.data),
                     shuffle_weight(layer.w2_weight.data, (16, 16)),
                     requires_grad=False,
                 )
                 torch.cuda.empty_cache()
-                # ROCm (CK_MOE): using column-wise scaling
+                # ROCm (SGLANG_AITER_MOE): using column-wise scaling
                 layer.w13_weight_scale1 *= layer.w13_weight_scale.unsqueeze(-1)
                 layer.w2_weight_scale1 *= layer.w2_weight_scale.unsqueeze(-1)
-            elif get_bool_env_var("MOE_PADDING"):
+            elif get_bool_env_var("SGLANG_MOE_PADDING"):
                 # If ROCm, apply weight padding (min. Mem channel contention) only if set
                 layer.w13_weight = torch.nn.Parameter(
                     F.pad(layer.w13_weight.data, (0, padding_size), "constant", 0),
@@ -912,15 +908,16 @@ class Fp8MoEMethod:
         )

         if _is_hip:
-            if get_bool_env_var("USE_INT4_WEIGHT"):
-                # TODO: add triton kernel and add check get_bool_env_var("CK_MOE")
+            if get_bool_env_var("SGLANG_INT4_WEIGHT"):
+                # TODO: add triton kernel and add check get_bool_env_var("SGLANG_AITER_MOE")
                 assert not no_combine, f"{no_combine=} is not supported."
-                return ck_moe_2stages_win4(
+                return ck_moe_2stages(
                     x,
                     layer.w13_weight,
                     layer.w2_weight,
                     topk_weights,
                     topk_ids,
+                    QuantType.per_Token,
                     layer.w13_weight_scale1,
                     layer.w2_weight_scale1,
                     activation=(
@@ -930,13 +927,13 @@ class Fp8MoEMethod:
                     ),
                 )

-            if get_bool_env_var("CK_MOE"):
+            if get_bool_env_var("SGLANG_AITER_MOE"):
                 assert not no_combine, f"{no_combine=} is not supported."
                 if self.block_quant:
-                    # TODO(CK_MOE): FP8 block_quant only supports 'silu' for the time-being.
+                    # TODO(SGLANG_AITER_MOE): FP8 block_quant only supports 'silu' for the time-being.
                     assert (
                         activation == "silu"
-                    ), f"CK_MOE: FP8 bloack_quant {activation=} will be supported later, unset CK_MOE"
+                    ), f"SGLANG_AITER_MOE: FP8 bloack_quant {activation=} will be supported later, unset SGLANG_AITER_MOE"
                     return asm_moe(
                         x,
                         layer.w13_weight,
@@ -955,6 +952,7 @@ class Fp8MoEMethod:
                         layer.w2_weight,
                         topk_weights,
                         topk_ids,
+                        QuantType.per_Token,
                         layer.w13_weight_scale1,
                         layer.w2_weight_scale1,
                         activation=(
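
Across the fp8.py hunks the substantive change is twofold: the ROCm/AITER feature flags are renamed to SGLANG_-prefixed environment variables (CK_MOE → SGLANG_AITER_MOE, USE_INT4_WEIGHT → SGLANG_INT4_WEIGHT, MOE_PADDING → SGLANG_MOE_PADDING), and the aiter MoE kernels are now invoked through ck_moe_2stages/asm_moe with an explicit QuantType.per_Token argument instead of the ck_moe_2stages_win4 entry point. A minimal sketch of how such a boolean env-var gate behaves; env_flag below is an illustrative stand-in, not sglang's actual get_bool_env_var:

import os

# Hypothetical helper mirroring the usual semantics of a boolean env-var gate.
def env_flag(name: str, default: str = "false") -> bool:
    return os.getenv(name, default).strip().lower() in ("1", "true", "yes")

# After this release the ROCm MoE code paths read the SGLANG_-prefixed names:
use_aiter_moe = env_flag("SGLANG_AITER_MOE")      # previously CK_MOE
use_int4_weight = env_flag("SGLANG_INT4_WEIGHT")  # previously USE_INT4_WEIGHT
use_moe_padding = env_flag("SGLANG_MOE_PADDING")  # previously MOE_PADDING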
sglang/srt/layers/quantization/fp8_utils.py CHANGED
@@ -31,7 +31,7 @@ from sglang.srt.utils import (
 _is_hip = is_hip()
 _is_cuda = is_cuda()

-if _is_hip and get_bool_env_var("CK_MOE"):
+if _is_hip and get_bool_env_var("SGLANG_AITER_MOE"):
    from aiter import gemm_a8w8_blockscale

 if _is_cuda:
@@ -132,7 +132,7 @@ def apply_w8a8_block_fp8_linear(
         output = fp8_blockwise_scaled_mm(
             q_input, weight.T, x_scale, weight_scale.T, out_dtype=input.dtype
         )
-    elif _is_hip and get_bool_env_var("CK_MOE"):
+    elif _is_hip and get_bool_env_var("SGLANG_AITER_MOE"):
        q_input, x_scale = per_token_group_quant_fp8(
            input_2d, block_size[1], column_major_scales=False
        )
sglang/srt/layers/utils.py ADDED
@@ -0,0 +1,35 @@
+import logging
+import re
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+def get_layer_id(weight_name):
+    # example weight name: model.layers.10.self_attn.qkv_proj.weight
+    match = re.search(r"layers\.(\d+)\.", weight_name)
+    if match:
+        return int(match.group(1))
+    return None
+
+
+class PPMissingLayer(torch.nn.Identity):
+    # Adapted from
+    # https://github.com/vllm-project/vllm/blob/18ed3132d2bfe1df9a74729457b69243955221e8/vllm/model_executor/models/utils.py#L468C1-L486C1
+    """
+    A placeholder layer for missing layers in a pipeline parallel model.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self.return_tuple = kwargs.get("return_tuple", False)
+
+    def forward(self, *args, **kwargs):
+        """
+        Return the first arg from args or the first value from kwargs.
+
+        Wraps the input in a tuple if `self.return_tuple` is True.
+        """
+        input = args[0] if args else next(iter(kwargs.values()))
+        return (input,) if self.return_tuple else input
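
A short usage sketch of the two helpers added above, assuming they import from sglang.srt.layers.utils as the file list indicates:

import torch

from sglang.srt.layers.utils import PPMissingLayer, get_layer_id

# get_layer_id() extracts the layer index from a weight name, or returns None.
assert get_layer_id("model.layers.10.self_attn.qkv_proj.weight") == 10
assert get_layer_id("lm_head.weight") is None  # no "layers.N." segment

# PPMissingLayer stands in for layers a pipeline-parallel rank does not own:
# it passes its input through, optionally wrapped in a tuple.
hidden = torch.zeros(1, 4)
out = PPMissingLayer(return_tuple=True)(hidden)
assert isinstance(out, tuple) and out[0] is hidden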
sglang/srt/lora/layers.py CHANGED
@@ -136,11 +136,19 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
         self.set_lora = True
         self.A_buffer_gate_up = A_buffer
         if self.lora_backend.fuse_stacked_lora_b:
-            # TODO: avoid using contiguous() in GPU.
             # B_buffer_gate_up: (num_lora, 2 * output_dim, r)
-            self.B_buffer_gate_up = torch.cat(
-                (B_buffer[0], B_buffer[1]), dim=-2
-            ).contiguous()
+            if not hasattr(self, "B_buffer_gate_up") or self.B_buffer_gate_up is None:
+                self.B_buffer_gate_up = torch.empty(
+                    (
+                        B_buffer[0].shape[0],
+                        2 * B_buffer[0].shape[1],
+                        B_buffer[0].shape[2],
+                    ),
+                    dtype=B_buffer[0].dtype,
+                    device=B_buffer[0].device,
+                )
+            self.B_buffer_gate_up[:, : B_buffer[0].shape[1], :].copy_(B_buffer[0])
+            self.B_buffer_gate_up[:, B_buffer[0].shape[1] :, :].copy_(B_buffer[1])
         else:
             self.B_buffer_gate_up = (B_buffer[0], B_buffer[1])

@@ -171,7 +179,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):


 class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
-    def init__(
+    def __init__(
         self,
         base_layer: QKVParallelLinear,
         lora_backend: BaseLoRABackend,
@@ -194,12 +202,30 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
             output_dim_q, output_dim_kv = B_buffer_q.shape[-2], B_buffer_kv.shape[-2]

             # B_buffer_qkv: (num_lora, output_dim_q + 2 * output_dim_kv, r)
-            self.B_buffer_qkv = torch.cat(
-                (B_buffer_q[0], B_buffer_kv[0], B_buffer_kv[1]), dim=-2
-            ).contiguous()
+            if not hasattr(self, "B_buffer_qkv") or self.B_buffer_qkv is None:
+                self.B_buffer_qkv = torch.empty(
+                    (
+                        B_buffer_q[0].shape[0],
+                        output_dim_q + 2 * output_dim_kv,
+                        B_buffer_q[0].shape[2],
+                    ),
+                    dtype=B_buffer_q[0].dtype,
+                    device=B_buffer_q[0].device,
+                )
+            self.B_buffer_qkv[:, :output_dim_q, :].copy_(B_buffer_q[0])
+            self.B_buffer_qkv[:, output_dim_q : output_dim_q + output_dim_kv, :].copy_(
+                B_buffer_kv[0]
+            )
+            self.B_buffer_qkv[:, output_dim_q + output_dim_kv :, :].copy_(
+                B_buffer_kv[1]
+            )

             # Offsets of q/k/v in output dimension
-            self.output_offset = torch.tensor(
+            if not hasattr(self, "output_offset") or self.output_offset is None:
+                self.output_offset = torch.empty(
+                    4, dtype=torch.int32, device=B_buffer_q.device
+                )
+            self.output_offset[:4] = torch.tensor(
                 [
                     0,
                     output_dim_q,
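
Both LoRA layer changes replace torch.cat(...).contiguous(), which allocates a new tensor on every set_lora_info call, with a buffer that is allocated once and then refreshed in place via copy_(), so the fused weight keeps a stable storage address across calls (a prerequisite for CUDA graph capture and replay). A generic sketch of the pattern; FusedBufferHolder is illustrative, not the sglang class:

import torch

class FusedBufferHolder:
    def __init__(self):
        self.fused = None

    def update(self, part_a: torch.Tensor, part_b: torch.Tensor) -> torch.Tensor:
        # Allocate the fused buffer once, then only copy into it afterwards.
        if self.fused is None:
            self.fused = torch.empty(
                (part_a.shape[0], part_a.shape[1] + part_b.shape[1], part_a.shape[2]),
                dtype=part_a.dtype,
                device=part_a.device,
            )
        self.fused[:, : part_a.shape[1], :].copy_(part_a)
        self.fused[:, part_a.shape[1] :, :].copy_(part_b)
        return self.fused

holder = FusedBufferHolder()
a, b = torch.randn(2, 3, 4), torch.randn(2, 5, 4)
first = holder.update(a, b)
second = holder.update(a + 1, b + 1)
assert first.data_ptr() == second.data_ptr()  # same storage, refreshed contents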
sglang/srt/lora/lora_manager.py CHANGED
@@ -72,6 +72,23 @@ class LoRAManager:
         self.init_loras()
         self.init_lora_memory_pool()

+    def init_cuda_graph_batch_info(self, max_bs_in_cuda_graph: int):
+        self.max_bs_in_cuda_graph = max_bs_in_cuda_graph
+        with torch.device("cuda"):
+            self.cuda_graph_batch_info = LoRABatchInfo(
+                bs=self.max_bs_in_cuda_graph,
+                seg_lens=torch.zeros(self.max_bs_in_cuda_graph, dtype=torch.int32),
+                seg_indptr=torch.zeros(
+                    self.max_bs_in_cuda_graph + 1, dtype=torch.int32
+                ),
+                max_len=0,
+                weight_indices=torch.zeros(
+                    self.max_bs_in_cuda_graph, dtype=torch.int32
+                ),
+                lora_ranks=torch.zeros(self.max_loras_per_batch, dtype=torch.int32),
+                scalings=torch.zeros(self.max_loras_per_batch, dtype=torch.float),
+            )
+
     def init_loras(self):
         # Config of each LoRA adapter
         self.configs: Dict[str, LoRAConfig] = {}
@@ -136,43 +153,75 @@ class LoRAManager:
         assert len(cur_uids) <= self.max_loras_per_batch
         self.memory_pool.prepare_lora_batch(cur_uids, self.loras)

-        # FIXME: Handle lora uid with None more safely
-        if cur_uids == set([None]):
-            return
-
-        # set up batch info shared by all lora moruldes
+        # set up batch info shared by all lora modules
         bs = forward_batch.batch_size
-        seg_lens = (
-            forward_batch.extend_seq_lens
-            if forward_batch.forward_mode.is_extend()
-            else torch.ones(bs, device=self.device)
-        )
-        seg_indptr = torch.zeros((bs + 1,), dtype=torch.int32, device=self.device)
-        seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)
-        max_len = int(torch.max(seg_lens))
-        weight_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)

-        lora_ranks = torch.empty(
-            (self.max_loras_per_batch,), dtype=torch.int64, device="cuda"
-        )
-        scalings = torch.empty(
-            (self.max_loras_per_batch,), dtype=torch.float, device="cuda"
-        )
-        for i, lora_path in enumerate(forward_batch.lora_paths):
-            weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
-            lora = self.loras[lora_path]
-            lora_ranks[weight_indices[i]] = lora.config.hf_config["r"]
-            scalings[weight_indices[i]] = lora.scaling
-
-        batch_info = LoRABatchInfo(
-            bs=bs,
-            seg_lens=seg_lens,
-            seg_indptr=seg_indptr,
-            max_len=max_len,
-            weight_indices=weight_indices,
-            lora_ranks=lora_ranks,
-            scalings=scalings,
-        )
+        if hasattr(self, "max_bs_in_cuda_graph") and bs <= self.max_bs_in_cuda_graph:
+            # Do in-place updates when CUDA graph is enabled. Note that
+            # if CUDA graph is enabled, the batch whose bs <= max_bs_in_cuda_graph
+            # will also use these preallocated buffers, no matter whether
+            # the batch can use CUDA graph or not.
+            self.cuda_graph_batch_info.bs = bs
+            if forward_batch.forward_mode.is_extend():
+                self.cuda_graph_batch_info.seg_lens[:bs].copy_(
+                    forward_batch.extend_seq_lens
+                )
+            else:
+                self.cuda_graph_batch_info.seg_lens[:bs].fill_(1)
+            torch.cumsum(
+                self.cuda_graph_batch_info.seg_lens[:bs],
+                dim=0,
+                out=self.cuda_graph_batch_info.seg_indptr[1 : bs + 1],
+            )
+            self.cuda_graph_batch_info.max_len = int(
+                torch.max(self.cuda_graph_batch_info.seg_lens[:bs])
+            )
+
+            for i, lora_path in enumerate(forward_batch.lora_paths):
+                self.cuda_graph_batch_info.weight_indices[i] = (
+                    self.memory_pool.get_buffer_id(lora_path)
+                )
+                if lora_path is not None:
+                    lora = self.loras[lora_path]
+                    self.cuda_graph_batch_info.lora_ranks[
+                        self.cuda_graph_batch_info.weight_indices[i]
+                    ] = lora.config.hf_config["r"]
+                    self.cuda_graph_batch_info.scalings[
+                        self.cuda_graph_batch_info.weight_indices[i]
+                    ] = lora.scaling
+            batch_info = self.cuda_graph_batch_info
+        else:
+            seg_lens = (
+                forward_batch.extend_seq_lens
+                if forward_batch.forward_mode.is_extend()
+                else torch.ones(bs, device=self.device)
+            )
+            seg_indptr = torch.zeros((bs + 1,), dtype=torch.int32, device=self.device)
+            seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)
+            max_len = int(torch.max(seg_lens))
+            weight_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)
+
+            lora_ranks = torch.empty(
+                (self.max_loras_per_batch,), dtype=torch.int64, device="cuda"
+            )
+            scalings = torch.empty(
+                (self.max_loras_per_batch,), dtype=torch.float, device="cuda"
+            )
+            for i, lora_path in enumerate(forward_batch.lora_paths):
+                weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
+                if lora_path is not None:
+                    lora = self.loras[lora_path]
+                    lora_ranks[weight_indices[i]] = lora.config.hf_config["r"]
+                    scalings[weight_indices[i]] = lora.scaling
+            batch_info = LoRABatchInfo(
+                bs=bs,
+                seg_lens=seg_lens,
+                seg_indptr=seg_indptr,
+                max_len=max_len,
+                weight_indices=weight_indices,
+                lora_ranks=lora_ranks,
+                scalings=scalings,
+            )
         self.lora_backend.set_batch_info(batch_info)

         # call set_lora_info for each lora modules
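
For reference, a small worked example (illustrative) of the seg_lens to seg_indptr bookkeeping that both branches above perform:

import torch

# An extend batch with per-request sequence lengths [3, 1, 4].
seg_lens = torch.tensor([3, 1, 4], dtype=torch.int32)
seg_indptr = torch.zeros(len(seg_lens) + 1, dtype=torch.int32)
seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)

print(seg_indptr.tolist())       # [0, 3, 4, 8] -> start offset of each request's tokens
print(int(torch.max(seg_lens)))  # 4 -> max_len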
sglang/srt/managers/data_parallel_controller.py CHANGED
@@ -181,44 +181,62 @@ class DataParallelController:
             enable=server_args.enable_memory_saver
         )

-        # Launch tensor parallel scheduler processes
         scheduler_pipe_readers = []
-        tp_size_per_node = server_args.tp_size // server_args.nnodes
+
+        nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
+        tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
         tp_rank_range = range(
-            tp_size_per_node * server_args.node_rank,
-            tp_size_per_node * (server_args.node_rank + 1),
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
+        )
+
+        pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
+        pp_rank_range = range(
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group),
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1),
         )
-        for tp_rank in tp_rank_range:
-            rank_port_args = port_args
-
-            if server_args.enable_dp_attention:
-                # dp attention has different sharding logic
-                _, _, dp_rank = compute_dp_attention_world_info(
-                    server_args.enable_dp_attention,
-                    tp_rank,
-                    server_args.tp_size,
-                    server_args.dp_size,
+
+        for pp_rank in pp_rank_range:
+            for tp_rank in tp_rank_range:
+                rank_port_args = port_args
+
+                if server_args.enable_dp_attention:
+                    # dp attention has different sharding logic
+                    _, _, dp_rank = compute_dp_attention_world_info(
+                        server_args.enable_dp_attention,
+                        tp_rank,
+                        server_args.tp_size,
+                        server_args.dp_size,
+                    )
+                    # compute zmq ports for this dp rank
+                    rank_port_args = PortArgs.init_new(server_args, dp_rank)
+                    # Data parallelism resues the tensor parallelism group,
+                    # so all dp ranks should use the same nccl port.
+                    rank_port_args.nccl_port = port_args.nccl_port
+
+                reader, writer = mp.Pipe(duplex=False)
+                gpu_id = (
+                    server_args.base_gpu_id
+                    + base_gpu_id
+                    + ((pp_rank % pp_size_per_node) * tp_size_per_node)
+                    + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
                 )
-                # compute zmq ports for this dp rank
-                rank_port_args = PortArgs.init_new(server_args, dp_rank)
-                # Data parallelism resues the tensor parallelism group,
-                # so all dp ranks should use the same nccl port.
-                rank_port_args.nccl_port = port_args.nccl_port
-
-            reader, writer = mp.Pipe(duplex=False)
-            gpu_id = (
-                server_args.base_gpu_id
-                + base_gpu_id
-                + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
-            )
-            proc = mp.Process(
-                target=run_scheduler_process,
-                args=(server_args, rank_port_args, gpu_id, tp_rank, dp_rank, writer),
-            )
-            with memory_saver_adapter.configure_subprocess():
-                proc.start()
-            self.scheduler_procs.append(proc)
-            scheduler_pipe_readers.append(reader)
+                proc = mp.Process(
+                    target=run_scheduler_process,
+                    args=(
+                        server_args,
+                        rank_port_args,
+                        gpu_id,
+                        tp_rank,
+                        pp_rank,
+                        dp_rank,
+                        writer,
+                    ),
+                )
+                with memory_saver_adapter.configure_subprocess():
+                    proc.start()
+                self.scheduler_procs.append(proc)
+                scheduler_pipe_readers.append(reader)

         # Wait for model to finish loading
         scheduler_info = []
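
A worked example (illustrative) of the rank layout the rewritten loop produces, using the formulas above with tp_size=4, pp_size=2, nnodes=2, gpu_id_step=1, and both base GPU offsets set to 0: each node hosts one full TP group for one PP stage, and each node uses its local GPUs 0 through 3.

# Assumed toy configuration; this only mirrors the arithmetic shown in the diff above.
tp_size, pp_size, nnodes, gpu_id_step = 4, 2, 2, 1
for node_rank in range(nnodes):
    nnodes_per_tp_group = max(nnodes // pp_size, 1)        # 1
    tp_size_per_node = tp_size // nnodes_per_tp_group      # 4
    tp_rank_range = range(
        tp_size_per_node * (node_rank % nnodes_per_tp_group),
        tp_size_per_node * (node_rank % nnodes_per_tp_group + 1),
    )                                                      # 0..3 on both nodes
    pp_size_per_node = max(pp_size // nnodes, 1)           # 1
    pp_rank_range = range(
        pp_size_per_node * (node_rank // nnodes_per_tp_group),
        pp_size_per_node * (node_rank // nnodes_per_tp_group + 1),
    )                                                      # [0] on node 0, [1] on node 1
    for pp_rank in pp_rank_range:
        for tp_rank in tp_rank_range:
            gpu_id = ((pp_rank % pp_size_per_node) * tp_size_per_node) + (
                tp_rank % tp_size_per_node
            ) * gpu_id_step                                # local GPUs 0..3 on each node
            print(f"node={node_rank} pp_rank={pp_rank} tp_rank={tp_rank} gpu_id={gpu_id}")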
sglang/srt/managers/multimodal_processors/kimi_vl.py ADDED
@@ -0,0 +1,73 @@
+import asyncio
+import math
+from typing import List, Union
+
+import torch
+from PIL import Image
+
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    MultimodalSpecialTokens,
+)
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.kimi_vl import KimiVLForConditionalGeneration
+
+
+# Compatible with KimiVLForConditionalGeneration
+class KimiVLImageProcessor(SGLangBaseProcessor):
+    models = [KimiVLForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+        self.IMAGE_TOKEN = "<|media_pad|>"
+        self.im_token_id = _processor.tokenizer.convert_tokens_to_ids(self.IMAGE_TOKEN)
+
+        self.im_start = "<|media_start|>"
+        self.im_start_id = _processor.tokenizer.convert_tokens_to_ids(self.im_start)
+
+        self.im_end = "<|media_end|>"
+        self.im_end_id = _processor.tokenizer.convert_tokens_to_ids(self.im_end)
+
+        self.im_content = "<|media_content|>"
+        self.im_content_id = _processor.tokenizer.convert_tokens_to_ids(self.im_content)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        max_req_input_len,
+        *args,
+        **kwargs,
+    ):
+        if not image_data:
+            return None
+        if isinstance(image_data, str):
+            image_data = [image_data]
+
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=MultimodalSpecialTokens(image_token=self.IMAGE_TOKEN),
+            max_req_input_len=max_req_input_len,
+        )
+        ret = self.process_mm_data(
+            input_text=base_output.input_text,
+            images=base_output.images,
+        )
+        return {
+            "input_ids": ret["input_ids"].flatten().tolist(),
+            "mm_items": [
+                MultimodalDataItem(
+                    pixel_values=ret["pixel_values"],
+                    image_grid_thws=ret["image_grid_hws"],
+                    modality=Modality.IMAGE,
+                )
+            ],
+            "im_token_id": self.im_token_id,
+            "im_start_id": self.im_start_id,
+            "im_end_id": self.im_end_id,
+            "im_content_id": self.im_content_id,
+        }