sglang 0.4.6.post1__py3-none-any.whl → 0.4.6.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +2 -0
- sglang/check_env.py +3 -3
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/kimi_vl.py +38 -0
- sglang/srt/configs/kimi_vl_moonvit.py +32 -0
- sglang/srt/configs/model_config.py +15 -0
- sglang/srt/conversation.py +122 -1
- sglang/srt/entrypoints/engine.py +44 -22
- sglang/srt/function_call_parser.py +97 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +107 -82
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -16
- sglang/srt/layers/attention/flashmla_backend.py +3 -0
- sglang/srt/layers/dp_attention.py +5 -2
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +1 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +8 -6
- sglang/srt/layers/quantization/__init__.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +1 -1
- sglang/srt/layers/utils.py +35 -0
- sglang/srt/lora/layers.py +35 -9
- sglang/srt/lora/lora_manager.py +84 -35
- sglang/srt/managers/data_parallel_controller.py +52 -34
- sglang/srt/managers/multimodal_processors/kimi_vl.py +73 -0
- sglang/srt/managers/schedule_batch.py +25 -15
- sglang/srt/managers/scheduler.py +263 -59
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -1
- sglang/srt/managers/tp_worker.py +51 -16
- sglang/srt/managers/tp_worker_overlap_thread.py +9 -3
- sglang/srt/mem_cache/memory_pool.py +70 -36
- sglang/srt/model_executor/cuda_graph_runner.py +82 -19
- sglang/srt/model_executor/forward_batch_info.py +31 -1
- sglang/srt/model_executor/model_runner.py +115 -57
- sglang/srt/models/deepseek_nextn.py +1 -257
- sglang/srt/models/deepseek_v2.py +78 -18
- sglang/srt/models/kimi_vl.py +308 -0
- sglang/srt/models/kimi_vl_moonvit.py +639 -0
- sglang/srt/models/llama.py +92 -30
- sglang/srt/models/llama4.py +2 -1
- sglang/srt/models/llama_eagle.py +4 -1
- sglang/srt/models/llama_eagle3.py +4 -1
- sglang/srt/models/qwen2_moe.py +8 -3
- sglang/srt/models/qwen2_vl.py +0 -12
- sglang/srt/models/qwen3_moe.py +8 -3
- sglang/srt/openai_api/adapter.py +34 -22
- sglang/srt/openai_api/protocol.py +11 -1
- sglang/srt/server_args.py +67 -22
- sglang/srt/speculative/eagle_worker.py +3 -2
- sglang/srt/utils.py +88 -9
- sglang/test/runners.py +4 -0
- sglang/test/test_utils.py +29 -0
- sglang/version.py +1 -1
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/METADATA +5 -4
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/RECORD +61 -51
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/WHEEL +1 -1
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post1.dist-info → sglang-0.4.6.post2.dist-info}/top_level.txt +0 -0
sglang/srt/lora/lora_manager.py
CHANGED
@@ -72,6 +72,23 @@ class LoRAManager:
         self.init_loras()
         self.init_lora_memory_pool()
 
+    def init_cuda_graph_batch_info(self, max_bs_in_cuda_graph: int):
+        self.max_bs_in_cuda_graph = max_bs_in_cuda_graph
+        with torch.device("cuda"):
+            self.cuda_graph_batch_info = LoRABatchInfo(
+                bs=self.max_bs_in_cuda_graph,
+                seg_lens=torch.zeros(self.max_bs_in_cuda_graph, dtype=torch.int32),
+                seg_indptr=torch.zeros(
+                    self.max_bs_in_cuda_graph + 1, dtype=torch.int32
+                ),
+                max_len=0,
+                weight_indices=torch.zeros(
+                    self.max_bs_in_cuda_graph, dtype=torch.int32
+                ),
+                lora_ranks=torch.zeros(self.max_loras_per_batch, dtype=torch.int32),
+                scalings=torch.zeros(self.max_loras_per_batch, dtype=torch.float),
+            )
+
     def init_loras(self):
         # Config of each LoRA adapter
         self.configs: Dict[str, LoRAConfig] = {}
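The new `init_cuda_graph_batch_info` hook allocates every piece of LoRA batch metadata once, at its maximum size, so a captured CUDA graph can keep pointing at the same storage. A minimal sketch of that preallocation idea, using a stand-in dataclass rather than sglang's actual `LoRABatchInfo` (CPU tensors so it runs anywhere):

```python
from dataclasses import dataclass

import torch


@dataclass
class BatchInfoSketch:            # stand-in for sglang's LoRABatchInfo
    bs: int
    seg_lens: torch.Tensor        # per-request token counts, padded to max_bs
    seg_indptr: torch.Tensor      # prefix sums of seg_lens, length max_bs + 1
    max_len: int
    weight_indices: torch.Tensor  # which LoRA buffer slot each request uses


def init_fixed_buffers(max_bs: int) -> BatchInfoSketch:
    # Allocate once at the maximum batch size; a captured CUDA graph can then
    # keep reading these tensors while later batches overwrite only a prefix.
    return BatchInfoSketch(
        bs=max_bs,
        seg_lens=torch.zeros(max_bs, dtype=torch.int32),
        seg_indptr=torch.zeros(max_bs + 1, dtype=torch.int32),
        max_len=0,
        weight_indices=torch.zeros(max_bs, dtype=torch.int32),
    )


info = init_fixed_buffers(max_bs=8)
print(info.seg_lens.shape, info.seg_indptr.shape)  # torch.Size([8]) torch.Size([9])
```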
@@ -136,43 +153,75 @@ class LoRAManager:
         assert len(cur_uids) <= self.max_loras_per_batch
         self.memory_pool.prepare_lora_batch(cur_uids, self.loras)
 
-        #
-        if cur_uids == set([None]):
-            return
-
-        # set up batch info shared by all lora moruldes
+        # set up batch info shared by all lora modules
         bs = forward_batch.batch_size
-        seg_lens = (
-            forward_batch.extend_seq_lens
-            if forward_batch.forward_mode.is_extend()
-            else torch.ones(bs, device=self.device)
-        )
-        seg_indptr = torch.zeros((bs + 1,), dtype=torch.int32, device=self.device)
-        seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)
-        max_len = int(torch.max(seg_lens))
-        weight_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)
 
+        if hasattr(self, "max_bs_in_cuda_graph") and bs <= self.max_bs_in_cuda_graph:
+            # Do in-place updates when CUDA graph is enabled. Note that
+            # if CUDA graph is enabled, the batch whose bs <= max_bs_in_cuda_graph
+            # will also use these preallocated buffers, no matter whether
+            # the batch can use CUDA graph or not.
+            self.cuda_graph_batch_info.bs = bs
+            if forward_batch.forward_mode.is_extend():
+                self.cuda_graph_batch_info.seg_lens[:bs].copy_(
+                    forward_batch.extend_seq_lens
+                )
+            else:
+                self.cuda_graph_batch_info.seg_lens[:bs].fill_(1)
+            torch.cumsum(
+                self.cuda_graph_batch_info.seg_lens[:bs],
+                dim=0,
+                out=self.cuda_graph_batch_info.seg_indptr[1 : bs + 1],
+            )
+            self.cuda_graph_batch_info.max_len = int(
+                torch.max(self.cuda_graph_batch_info.seg_lens[:bs])
+            )
+
+            for i, lora_path in enumerate(forward_batch.lora_paths):
+                self.cuda_graph_batch_info.weight_indices[i] = (
+                    self.memory_pool.get_buffer_id(lora_path)
+                )
+                if lora_path is not None:
+                    lora = self.loras[lora_path]
+                    self.cuda_graph_batch_info.lora_ranks[
+                        self.cuda_graph_batch_info.weight_indices[i]
+                    ] = lora.config.hf_config["r"]
+                    self.cuda_graph_batch_info.scalings[
+                        self.cuda_graph_batch_info.weight_indices[i]
+                    ] = lora.scaling
+            batch_info = self.cuda_graph_batch_info
+        else:
+            seg_lens = (
+                forward_batch.extend_seq_lens
+                if forward_batch.forward_mode.is_extend()
+                else torch.ones(bs, device=self.device)
+            )
+            seg_indptr = torch.zeros((bs + 1,), dtype=torch.int32, device=self.device)
+            seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)
+            max_len = int(torch.max(seg_lens))
+            weight_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)
+
+            lora_ranks = torch.empty(
+                (self.max_loras_per_batch,), dtype=torch.int64, device="cuda"
+            )
+            scalings = torch.empty(
+                (self.max_loras_per_batch,), dtype=torch.float, device="cuda"
+            )
+            for i, lora_path in enumerate(forward_batch.lora_paths):
+                weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
+                if lora_path is not None:
+                    lora = self.loras[lora_path]
+                    lora_ranks[weight_indices[i]] = lora.config.hf_config["r"]
+                    scalings[weight_indices[i]] = lora.scaling
+            batch_info = LoRABatchInfo(
+                bs=bs,
+                seg_lens=seg_lens,
+                seg_indptr=seg_indptr,
+                max_len=max_len,
+                weight_indices=weight_indices,
+                lora_ranks=lora_ranks,
+                scalings=scalings,
+            )
         self.lora_backend.set_batch_info(batch_info)
 
         # call set_lora_info for each lora modules
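The in-place branch above exists because a replayed CUDA graph reads from fixed tensor addresses: batch metadata has to be written into the preallocated buffers rather than into freshly allocated tensors. A rough CPU-only illustration of that refresh pattern (not sglang's API, just the tensor idiom):

```python
import torch

max_bs = 8
seg_lens = torch.zeros(max_bs, dtype=torch.int32)
seg_indptr = torch.zeros(max_bs + 1, dtype=torch.int32)


def refresh(extend_seq_lens: torch.Tensor) -> int:
    """Overwrite a prefix of the preallocated buffers without reallocating."""
    bs = extend_seq_lens.numel()
    assert bs <= max_bs
    seg_lens[:bs].copy_(extend_seq_lens)                          # in-place copy
    torch.cumsum(seg_lens[:bs], dim=0, out=seg_indptr[1 : bs + 1])  # prefix sums in place
    return int(seg_lens[:bs].max())


max_len = refresh(torch.tensor([3, 1, 4], dtype=torch.int32))
print(seg_indptr[:4].tolist(), max_len)  # [0, 3, 4, 8] 4
```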
sglang/srt/managers/data_parallel_controller.py
CHANGED
@@ -181,44 +181,62 @@ class DataParallelController:
             enable=server_args.enable_memory_saver
         )
 
-        # Launch tensor parallel scheduler processes
         scheduler_pipe_readers = []
+
+        nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
+        tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
         tp_rank_range = range(
-            tp_size_per_node * server_args.node_rank,
-            tp_size_per_node * (server_args.node_rank + 1),
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
+        )
+
+        pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
+        pp_rank_range = range(
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group),
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1),
         )
+
+        for pp_rank in pp_rank_range:
+            for tp_rank in tp_rank_range:
+                rank_port_args = port_args
+
+                if server_args.enable_dp_attention:
+                    # dp attention has different sharding logic
+                    _, _, dp_rank = compute_dp_attention_world_info(
+                        server_args.enable_dp_attention,
+                        tp_rank,
+                        server_args.tp_size,
+                        server_args.dp_size,
+                    )
+                    # compute zmq ports for this dp rank
+                    rank_port_args = PortArgs.init_new(server_args, dp_rank)
+                    # Data parallelism resues the tensor parallelism group,
+                    # so all dp ranks should use the same nccl port.
+                    rank_port_args.nccl_port = port_args.nccl_port
+
+                reader, writer = mp.Pipe(duplex=False)
+                gpu_id = (
+                    server_args.base_gpu_id
+                    + base_gpu_id
+                    + ((pp_rank % pp_size_per_node) * tp_size_per_node)
+                    + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
                 )
-            with memory_saver_adapter.configure_subprocess():
-                proc.start()
-            self.scheduler_procs.append(proc)
-            scheduler_pipe_readers.append(reader)
+                proc = mp.Process(
+                    target=run_scheduler_process,
+                    args=(
+                        server_args,
+                        rank_port_args,
+                        gpu_id,
+                        tp_rank,
+                        pp_rank,
+                        dp_rank,
+                        writer,
+                    ),
+                )
+                with memory_saver_adapter.configure_subprocess():
+                    proc.start()
+                self.scheduler_procs.append(proc)
+                scheduler_pipe_readers.append(reader)
 
         # Wait for model to finish loading
         scheduler_info = []
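The reworked launch loop assigns each `(pp_rank, tp_rank)` pair to a local GPU. The index arithmetic can be checked in isolation; the sketch below mirrors it in plain Python with made-up values standing in for `ServerArgs` fields (2 nodes, `pp_size=2`, `tp_size=8`):

```python
# Hypothetical values standing in for ServerArgs fields.
nnodes, node_rank = 2, 1
tp_size, pp_size = 8, 2
base_gpu_id, gpu_id_step = 0, 1

nnodes_per_tp_group = max(nnodes // pp_size, 1)   # nodes sharing one TP group
tp_size_per_node = tp_size // nnodes_per_tp_group
tp_rank_range = range(
    tp_size_per_node * (node_rank % nnodes_per_tp_group),
    tp_size_per_node * (node_rank % nnodes_per_tp_group + 1),
)

pp_size_per_node = max(pp_size // nnodes, 1)
pp_rank_range = range(
    pp_size_per_node * (node_rank // nnodes_per_tp_group),
    pp_size_per_node * (node_rank // nnodes_per_tp_group + 1),
)

for pp_rank in pp_rank_range:
    for tp_rank in tp_rank_range:
        # Same index computation as the diff, minus the extra DP offset.
        gpu_id = (
            base_gpu_id
            + (pp_rank % pp_size_per_node) * tp_size_per_node
            + (tp_rank % tp_size_per_node) * gpu_id_step
        )
        print(pp_rank, tp_rank, "->", gpu_id)
```

With these values, node 1 hosts pipeline stage 1, and its eight TP ranks land on GPUs 0 through 7 of that node.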
sglang/srt/managers/multimodal_processors/kimi_vl.py
ADDED
@@ -0,0 +1,73 @@
+import asyncio
+import math
+from typing import List, Union
+
+import torch
+from PIL import Image
+
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor as SGLangBaseProcessor,
+)
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    MultimodalSpecialTokens,
+)
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.kimi_vl import KimiVLForConditionalGeneration
+
+
+# Compatible with KimiVLForConditionalGeneration
+class KimiVLImageProcessor(SGLangBaseProcessor):
+    models = [KimiVLForConditionalGeneration]
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+        self.IMAGE_TOKEN = "<|media_pad|>"
+        self.im_token_id = _processor.tokenizer.convert_tokens_to_ids(self.IMAGE_TOKEN)
+
+        self.im_start = "<|media_start|>"
+        self.im_start_id = _processor.tokenizer.convert_tokens_to_ids(self.im_start)
+
+        self.im_end = "<|media_end|>"
+        self.im_end_id = _processor.tokenizer.convert_tokens_to_ids(self.im_end)
+
+        self.im_content = "<|media_content|>"
+        self.im_content_id = _processor.tokenizer.convert_tokens_to_ids(self.im_content)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        max_req_input_len,
+        *args,
+        **kwargs,
+    ):
+        if not image_data:
+            return None
+        if isinstance(image_data, str):
+            image_data = [image_data]
+
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=MultimodalSpecialTokens(image_token=self.IMAGE_TOKEN),
+            max_req_input_len=max_req_input_len,
+        )
+        ret = self.process_mm_data(
+            input_text=base_output.input_text,
+            images=base_output.images,
+        )
+        return {
+            "input_ids": ret["input_ids"].flatten().tolist(),
+            "mm_items": [
+                MultimodalDataItem(
+                    pixel_values=ret["pixel_values"],
+                    image_grid_thws=ret["image_grid_hws"],
+                    modality=Modality.IMAGE,
+                )
+            ],
+            "im_token_id": self.im_token_id,
+            "im_start_id": self.im_start_id,
+            "im_end_id": self.im_end_id,
+            "im_content_id": self.im_content_id,
+        }
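The `models = [KimiVLForConditionalGeneration]` attribute is how the processor advertises which model classes it handles, so it can be picked up automatically. A hypothetical, self-contained sketch of that style of dispatch (illustrative classes only, not sglang's real registry code):

```python
# Hypothetical model and processor classes illustrating dispatch by `models`.
class KimiVLModel: ...
class LlavaModel: ...


class KimiVLProcessor:
    models = [KimiVLModel]


class LlavaProcessor:
    models = [LlavaModel]


PROCESSORS = [KimiVLProcessor, LlavaProcessor]


def find_processor(model_cls):
    """Return the first processor class that lists model_cls in its `models`."""
    for proc_cls in PROCESSORS:
        if model_cls in proc_cls.models:
            return proc_cls
    raise ValueError(f"no multimodal processor registered for {model_cls.__name__}")


print(find_processor(KimiVLModel).__name__)  # KimiVLProcessor
```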
sglang/srt/managers/schedule_batch.py
CHANGED
@@ -66,23 +66,24 @@ INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
 # Put some global args for easy access
 global_server_args_dict = {
     "attention_backend": ServerArgs.attention_backend,
-    "
-    "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
-    "torchao_config": ServerArgs.torchao_config,
-    "enable_nan_detection": ServerArgs.enable_nan_detection,
-    "enable_dp_attention": ServerArgs.enable_dp_attention,
-    "enable_ep_moe": ServerArgs.enable_ep_moe,
-    "enable_deepep_moe": ServerArgs.enable_deepep_moe,
+    "chunked_prefill_size": ServerArgs.chunked_prefill_size,
     "deepep_mode": ServerArgs.deepep_mode,
     "device": ServerArgs.device,
-    "
-    "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
+    "disable_chunked_prefix_cache": ServerArgs.disable_chunked_prefix_cache,
     "disable_radix_cache": ServerArgs.disable_radix_cache,
+    "enable_deepep_moe": ServerArgs.enable_deepep_moe,
+    "enable_dp_attention": ServerArgs.enable_dp_attention,
+    "enable_ep_moe": ServerArgs.enable_ep_moe,
+    "enable_nan_detection": ServerArgs.enable_nan_detection,
     "flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged,
+    "max_micro_batch_size": ServerArgs.max_micro_batch_size,
     "moe_dense_tp_size": ServerArgs.moe_dense_tp_size,
-    "chunked_prefill_size": ServerArgs.chunked_prefill_size,
     "n_share_experts_fusion": ServerArgs.n_share_experts_fusion,
-    "
+    "sampling_backend": ServerArgs.sampling_backend,
+    "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
+    "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
+    "torchao_config": ServerArgs.torchao_config,
+    "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
 }
 
 logger = logging.getLogger(__name__)
@@ -728,6 +729,9 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     # Events
     launch_done: Optional[threading.Event] = None
 
+    # For chunked prefill in PP
+    chunked_req: Optional[Req] = None
+
     # Sampling info
     sampling_info: SamplingBatchInfo = None
     next_batch_sampling_info: SamplingBatchInfo = None
@@ -761,7 +765,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     # For extend and mixed chunekd prefill
     prefix_lens: List[int] = None
     extend_lens: List[int] = None
-    extend_num_tokens: int = None
+    extend_num_tokens: Optional[int] = None
     decoding_reqs: List[Req] = None
     extend_logprob_start_lens: List[int] = None
     # It comes empty list if logprob is not required.
@@ -803,6 +807,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
         enable_overlap: bool,
         spec_algorithm: SpeculativeAlgorithm,
         enable_custom_logit_processor: bool,
+        chunked_req: Optional[Req] = None,
     ):
         return_logprob = any(req.return_logprob for req in reqs)
 
@@ -820,6 +825,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
             spec_algorithm=spec_algorithm,
             enable_custom_logit_processor=enable_custom_logit_processor,
             return_hidden_states=any(req.return_hidden_states for req in reqs),
+            chunked_req=chunked_req,
         )
 
     def batch_size(self):
@@ -1236,7 +1242,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
 
     def retract_decode(self, server_args: ServerArgs):
         """Retract the decoding requests when there is not enough memory."""
-        sorted_indices =
+        sorted_indices = list(range(len(self.reqs)))
 
         # TODO(lsyin): improve retraction policy for radix cache
         # For spec decoding, filter_batch API can only filter
@@ -1413,15 +1419,19 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
 
     def filter_batch(
         self,
-        chunked_req_to_exclude: Optional[Req] = None,
+        chunked_req_to_exclude: Optional[Union[Req, List[Req]]] = None,
         keep_indices: Optional[List[int]] = None,
     ):
         if keep_indices is None:
+            if isinstance(chunked_req_to_exclude, Req):
+                chunked_req_to_exclude = [chunked_req_to_exclude]
+            elif chunked_req_to_exclude is None:
+                chunked_req_to_exclude = []
             keep_indices = [
                 i
                 for i in range(len(self.reqs))
                 if not self.reqs[i].finished()
-                and self.reqs[i]
+                and not self.reqs[i] in chunked_req_to_exclude
            ]
 
         if keep_indices is None or len(keep_indices) == 0: