PyPI - sglang - Versions diffs - 0.4.5.post3__py3-none-any.whl → 0.4.6.post1__py3-none-any.whl - Mend

sglang 0.4.5.post3__py3-none-any.whl → 0.4.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (97)

sglang/bench_one_batch.py +19 -3
sglang/bench_serving.py +8 -9
sglang/compile_deep_gemm.py +45 -4
sglang/srt/code_completion_parser.py +1 -1
sglang/srt/configs/deepseekvl2.py +1 -1
sglang/srt/configs/model_config.py +9 -3
sglang/srt/constrained/llguidance_backend.py +78 -61
sglang/srt/conversation.py +34 -1
sglang/srt/disaggregation/decode.py +67 -13
sglang/srt/disaggregation/fake/__init__.py +1 -0
sglang/srt/disaggregation/fake/conn.py +88 -0
sglang/srt/disaggregation/mini_lb.py +45 -8
sglang/srt/disaggregation/mooncake/conn.py +198 -31
sglang/srt/disaggregation/prefill.py +36 -12
sglang/srt/disaggregation/utils.py +16 -2
sglang/srt/entrypoints/engine.py +9 -0
sglang/srt/entrypoints/http_server.py +35 -4
sglang/srt/function_call_parser.py +77 -5
sglang/srt/layers/attention/base_attn_backend.py +3 -0
sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
sglang/srt/layers/attention/flashattention_backend.py +28 -10
sglang/srt/layers/attention/flashmla_backend.py +8 -11
sglang/srt/layers/attention/utils.py +1 -1
sglang/srt/layers/attention/vision.py +2 -0
sglang/srt/layers/layernorm.py +38 -16
sglang/srt/layers/logits_processor.py +2 -2
sglang/srt/layers/moe/fused_moe_native.py +2 -4
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +41 -41
sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +20 -17
sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
sglang/srt/layers/pooler.py +6 -0
sglang/srt/layers/quantization/awq.py +5 -1
sglang/srt/layers/quantization/deep_gemm.py +17 -10
sglang/srt/layers/quantization/fp8.py +20 -22
sglang/srt/layers/quantization/fp8_utils.py +2 -2
sglang/srt/layers/quantization/int8_kernel.py +32 -1
sglang/srt/layers/radix_attention.py +13 -3
sglang/srt/layers/rotary_embedding.py +170 -126
sglang/srt/managers/data_parallel_controller.py +10 -3
sglang/srt/managers/io_struct.py +7 -0
sglang/srt/managers/mm_utils.py +85 -28
sglang/srt/managers/multimodal_processors/base_processor.py +14 -1
sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +9 -2
sglang/srt/managers/multimodal_processors/gemma3.py +2 -5
sglang/srt/managers/multimodal_processors/janus_pro.py +2 -2
sglang/srt/managers/multimodal_processors/minicpm.py +4 -3
sglang/srt/managers/multimodal_processors/qwen_vl.py +38 -13
sglang/srt/managers/schedule_batch.py +38 -12
sglang/srt/managers/scheduler.py +41 -28
sglang/srt/managers/scheduler_output_processor_mixin.py +25 -9
sglang/srt/managers/tokenizer_manager.py +5 -1
sglang/srt/managers/tp_worker.py +3 -3
sglang/srt/managers/tp_worker_overlap_thread.py +9 -4
sglang/srt/mem_cache/memory_pool.py +87 -0
sglang/srt/model_executor/cuda_graph_runner.py +4 -3
sglang/srt/model_executor/forward_batch_info.py +51 -95
sglang/srt/model_executor/model_runner.py +19 -25
sglang/srt/models/deepseek.py +12 -2
sglang/srt/models/deepseek_nextn.py +101 -6
sglang/srt/models/deepseek_v2.py +144 -70
sglang/srt/models/deepseek_vl2.py +9 -4
sglang/srt/models/gemma3_causal.py +1 -1
sglang/srt/models/llama4.py +0 -1
sglang/srt/models/minicpmo.py +5 -1
sglang/srt/models/mllama4.py +2 -2
sglang/srt/models/qwen2_5_vl.py +3 -6
sglang/srt/models/qwen2_vl.py +3 -7
sglang/srt/models/roberta.py +178 -0
sglang/srt/openai_api/adapter.py +50 -11
sglang/srt/openai_api/protocol.py +2 -0
sglang/srt/reasoning_parser.py +25 -1
sglang/srt/server_args.py +31 -24
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
sglang/srt/torch_memory_saver_adapter.py +10 -1
sglang/srt/utils.py +5 -1
sglang/test/runners.py +6 -13
sglang/test/send_one.py +84 -28
sglang/test/test_utils.py +74 -18
sglang/version.py +1 -1
{sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/METADATA +5 -6
{sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/RECORD +97 -80
{sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/WHEEL +1 -1
{sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/licenses/LICENSE +0 -0
{sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/top_level.txt +0 -0

sglang/srt/disaggregation/decode.py CHANGED Viewed

@@ -32,6 +32,7 @@ from torch.distributed import ProcessGroup
 from sglang.srt.disaggregation.base import BaseKVManager, BaseKVReceiver, KVArgs, KVPoll
 from sglang.srt.disaggregation.utils import (
     DisaggregationMode,
+    FakeBootstrapHost,
     KVClassType,
     ReqToMetadataIdxAllocator,
     TransferBackend,
@@ -133,11 +134,16 @@ class DecodePreallocQueue:
     def add(self, req: Req) -> None:
         """Add a request to the pending queue."""
-        kv_receiver_class = get_kv_class(self.transfer_backend, KVClassType.RECEIVER)
+        if req.bootstrap_host == FakeBootstrapHost:
+            # Fake transfer for warmup reqs
+            kv_receiver_class = get_kv_class(TransferBackend.FAKE, KVClassType.RECEIVER)
+        else:
+            kv_receiver_class = get_kv_class(
+                self.transfer_backend, KVClassType.RECEIVER
+            )
         kv_receiver = kv_receiver_class(
             mgr=self.kv_manager,
-            bootstrap_addr=f"{req.bootstrap_host}:{self.bootstrap_port}",
+            bootstrap_addr=f"{req.bootstrap_host}:{req.bootstrap_port}",
             bootstrap_room=req.bootstrap_room,
         )
         self.queue.append(DecodeRequest(req=req, kv_receiver=kv_receiver))
@@ -307,7 +313,7 @@ class DecodeTransferQueue:
     def extend(self, req_conns) -> None:
         self.queue.extend(req_conns)
-    def pop_transferred(self) -> List[Req]:
+    def pop_transferred(self) -> List[DecodeRequest]:
         if not self.queue:
             return []
@@ -330,7 +336,7 @@ class DecodeTransferQueue:
                 assert len(decode_req.req.output_ids) == 0
                 assert decode_req.req.transferred_output_id is None
                 decode_req.req.transferred_output_id = output_id
-                transferred_reqs.append(decode_req.req)
+                transferred_reqs.append(decode_req)
                 indices_to_remove.add(i)
             elif poll in [
                 KVPoll.Bootstrapping,
@@ -444,8 +450,17 @@ class ScheduleBatchDisaggregationDecodeMixin:
 class SchedulerDisaggregationDecodeMixin:
+    def _prepare_idle_batch_and_run(self, batch, delay_process=False):
+        batch, _ = self.prepare_dp_attn_batch(batch)
+        result = None
+        if batch:
+            result = self.run_batch(batch)
+            if not delay_process:
+                self.process_batch_result(batch, result)
+        return batch, result
     @torch.no_grad()
-    def event_loop_normal_disagg_decode(self):
+    def event_loop_normal_disagg_decode(self: Scheduler):
         """A normal scheduler loop for decode worker in disaggregation mode."""
         while True:
@@ -456,14 +471,25 @@ class SchedulerDisaggregationDecodeMixin:
             batch = self.get_next_disagg_decode_batch_to_run()
             self.cur_batch = batch
+            prepare_dp_attn_flag = (
+                self.server_args.enable_dp_attention
+                or self.server_args.enable_sp_layernorm
+            )
             if batch:
                 # Generate fake extend output.
                 if batch.forward_mode.is_extend():
                     # Note: Logprobs should be handled on the prefill engine.
                     self.stream_output(batch.reqs, False)
+                    if prepare_dp_attn_flag:
+                        self._prepare_idle_batch_and_run(None)
                 else:
+                    if prepare_dp_attn_flag:
+                        self.prepare_dp_attn_batch(batch)
                     result = self.run_batch(batch)
                     self.process_batch_result(batch, result)
+            elif prepare_dp_attn_flag:
+                batch, _ = self._prepare_idle_batch_and_run(None)
             if batch is None and (
                 len(self.disagg_decode_transfer_queue.queue)
@@ -477,10 +503,10 @@ class SchedulerDisaggregationDecodeMixin:
             self.last_batch = batch
     @torch.no_grad()
-    def event_loop_overlap_disagg_decode(self):
+    def event_loop_overlap_disagg_decode(self: Scheduler):
         result_queue = deque()
         self.last_batch: Optional[ScheduleBatch] = None
-        self.last_batch_is_extend = False  # last batch is modifed in-place, so we need another variable to track if it's extend
+        self.last_batch_in_queue = False  # last batch is modifed in-place, so we need another variable to track if it's extend
         while True:
             recv_reqs = self.recv_requests()
@@ -489,20 +515,41 @@ class SchedulerDisaggregationDecodeMixin:
             self.process_decode_queue()
             batch = self.get_next_disagg_decode_batch_to_run()
             self.cur_batch = batch
-            last_batch_is_extend = False
+            last_batch_in_queue = False
+            prepare_dp_attn_flag = (
+                self.server_args.enable_dp_attention
+                or self.server_args.enable_sp_layernorm
+            )
             if batch:
                 # Generate fake extend output.
                 if batch.forward_mode.is_extend():
                     # Note: Logprobs should be handled on the prefill engine.
                     self.stream_output(batch.reqs, False)
-                    last_batch_is_extend = True
+                    if prepare_dp_attn_flag:
+                        batch_, result = self._prepare_idle_batch_and_run(
+                            None, delay_process=True
+                        )
+                        if batch_:
+                            result_queue.append((batch_.copy(), result))
+                            last_batch_in_queue = True
                 else:
+                    if prepare_dp_attn_flag:
+                        self.prepare_dp_attn_batch(batch)
                     result = self.run_batch(batch)
                     result_queue.append((batch.copy(), result))
+                    last_batch_in_queue = True
+            elif prepare_dp_attn_flag:
+                batch, result = self._prepare_idle_batch_and_run(
+                    None, delay_process=True
+                )
+                if batch:
+                    result_queue.append((batch.copy(), result))
+                    last_batch_in_queue = True
             # Process the results of the previous batch but skip if the last batch is extend
-            if self.last_batch and not self.last_batch_is_extend:
+            if self.last_batch and self.last_batch_in_queue:
                 tmp_batch, tmp_result = result_queue.popleft()
                 self.process_batch_result(tmp_batch, tmp_result)
@@ -516,7 +563,7 @@ class SchedulerDisaggregationDecodeMixin:
                 self.new_token_ratio = self.init_new_token_ratio
             self.last_batch = batch
-            self.last_batch_is_extend = last_batch_is_extend
+            self.last_batch_in_queue = last_batch_in_queue
     def get_next_disagg_decode_batch_to_run(
         self: Scheduler,
@@ -600,8 +647,15 @@ class SchedulerDisaggregationDecodeMixin:
     def process_decode_queue(self: Scheduler):
         req_conns = self.disagg_decode_prealloc_queue.pop_preallocated()
+        def _num_pre_alloc(req):
+            return len(req.req.origin_input_ids) + max(len(req.req.output_ids) - 1, 0)
+        self.num_tokens_pre_allocated += sum(_num_pre_alloc(req) for req in req_conns)
         self.disagg_decode_transfer_queue.extend(req_conns)
         alloc_reqs = (
             self.disagg_decode_transfer_queue.pop_transferred()
         )  # the requests which kv has arrived
-        self.waiting_queue.extend(alloc_reqs)
+        self.num_tokens_pre_allocated -= sum(_num_pre_alloc(req) for req in alloc_reqs)
+        self.waiting_queue.extend([req.req for req in alloc_reqs])

sglang/srt/disaggregation/fake/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+from .conn import FakeKVReceiver, FakeKVSender

sglang/srt/disaggregation/fake/conn.py ADDED Viewed

@@ -0,0 +1,88 @@
+import logging
+from typing import Dict, List, Optional, Tuple, Union
+import numpy as np
+import numpy.typing as npt
+from sglang.srt.disaggregation.base.conn import (
+    BaseKVManager,
+    BaseKVReceiver,
+    BaseKVSender,
+    KVArgs,
+    KVPoll,
+)
+logger = logging.getLogger(__name__)
+# For warmup reqs, we don't kv transfer, we use the fake sender and receiver
+class FakeKVSender(BaseKVSender):
+    def __init__(self, mgr: BaseKVManager, bootstrap_addr: str, bootstrap_room: int):
+        self.has_sent = False
+    def poll(self) -> KVPoll:
+        if self.has_sent is False:
+            # Assume handshake completed instantly
+            return KVPoll.WaitingForInput
+        else:
+            # Assume transfer completed instantly
+            logger.info("FakeKVSender poll success")
+            return KVPoll.Success
+    def init(
+        self,
+        kv_indices: list[int],
+        aux_index: Optional[int] = None,
+        dest_ranks: Optional[list[int]] = None,
+    ):
+        logger.info(
+            f"FakeKVSender init with kv_indices: {kv_indices}, aux_index: {aux_index}, dest_ranks: {dest_ranks}"
+        )
+        pass
+    def send(
+        self,
+        kv_indices: npt.NDArray[np.int64],
+        index_slice: slice,
+        is_last: bool,
+    ):
+        logger.info(
+            f"FakeKVSender send with kv_indices: {kv_indices}, index_slice: {index_slice}, is_last: {is_last}"
+        )
+        if is_last:
+            self.has_sent = True
+            logger.info(f"FakeKVSender send success")
+        else:
+            self.has_sent = False
+            logger.info(f"FakeKVSender send fake transfering")
+    def failure_exception(self):
+        raise Exception("Fake KVSender Exception")
+class FakeKVReceiver(BaseKVReceiver):
+    def __init__(
+        self,
+        mgr: BaseKVManager,
+        bootstrap_addr: str,
+        bootstrap_room: Optional[int] = None,
+    ):
+        self.has_init = False
+    def poll(self) -> KVPoll:
+        if self.has_init is False:
+            # Assume handshake completed instantly
+            return KVPoll.WaitingForInput
+        else:
+            # Assume transfer completed instantly
+            logger.info("FakeKVReceiver poll success")
+            return KVPoll.Success
+    def init(self, kv_indices: list[int], aux_index: Optional[int] = None):
+        self.has_init = True
+        logger.info(
+            f"FakeKVReceiver init with kv_indices: {kv_indices}, aux_index: {aux_index}"
+        )
+    def failure_exception(self):
+        raise Exception("Fake KVReceiver Exception")

sglang/srt/disaggregation/mini_lb.py CHANGED Viewed

@@ -6,6 +6,7 @@ import asyncio
 import random
 import urllib
 from itertools import chain
+from typing import List
 import aiohttp
 import orjson
@@ -14,13 +15,22 @@ from fastapi import FastAPI, HTTPException
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse
+class PrefillConfig:
+    def __init__(self, url: str, bootstrap_port: int):
+        self.url = url
+        self.bootstrap_port = bootstrap_port
 class MiniLoadBalancer:
-    def __init__(self, prefill_servers, decode_servers):
-        self.prefill_servers = prefill_servers
+    def __init__(self, prefill_configs: List[PrefillConfig], decode_servers: List[str]):
+        self.prefill_configs = prefill_configs
+        self.prefill_servers = [p.url for p in prefill_configs]
         self.decode_servers = decode_servers
     def select_pair(self):
-        return random.choice(self.prefill_servers), random.choice(self.decode_servers)
+        prefill_config = random.choice(self.prefill_configs)
+        decode_server = random.choice(self.decode_servers)
+        return prefill_config.url, prefill_config.bootstrap_port, decode_server
     async def generate(
         self, modified_request, prefill_server, decode_server, endpoint
@@ -160,7 +170,7 @@ async def get_model_info():
 @app.post("/generate")
 async def handle_generate_request(request_data: dict):
-    prefill_server, decode_server = load_balancer.select_pair()
+    prefill_server, bootstrap_port, decode_server = load_balancer.select_pair()
     # Parse and transform prefill_server for bootstrap data
     parsed_url = urllib.parse.urlparse(prefill_server)
@@ -172,6 +182,7 @@ async def handle_generate_request(request_data: dict):
         modified_request.update(
             {
                 "bootstrap_host": [hostname] * batch_size,
+                "bootstrap_port": [bootstrap_port] * batch_size,
                 "bootstrap_room": [
                     _generate_bootstrap_room() for _ in range(batch_size)
                 ],
@@ -181,6 +192,7 @@ async def handle_generate_request(request_data: dict):
         modified_request.update(
             {
                 "bootstrap_host": hostname,
+                "bootstrap_port": bootstrap_port,
                 "bootstrap_room": _generate_bootstrap_room(),
             }
         )
@@ -197,7 +209,7 @@ async def handle_generate_request(request_data: dict):
 @app.post("/v1/chat/completions")
 async def handle_completion_request(request_data: dict):
-    prefill_server, decode_server = load_balancer.select_pair()
+    prefill_server, bootstrap_port, decode_server = load_balancer.select_pair()
     # Parse and transform prefill_server for bootstrap data
     parsed_url = urllib.parse.urlparse(prefill_server)
@@ -206,6 +218,7 @@ async def handle_completion_request(request_data: dict):
     modified_request.update(
         {
             "bootstrap_host": hostname,
+            "bootstrap_port": bootstrap_port,
             "bootstrap_room": random.randint(0, 2**63 - 1),
         }
     )
@@ -255,9 +268,9 @@ async def get_models():
             raise HTTPException(status_code=500, detail=str(e))
-def run(prefill_addrs, decode_addrs, host, port):
+def run(prefill_configs, decode_addrs, host, port):
     global load_balancer
-    load_balancer = MiniLoadBalancer(prefill_addrs, decode_addrs)
+    load_balancer = MiniLoadBalancer(prefill_configs, decode_addrs)
     uvicorn.run(app, host=host, port=port)
@@ -268,6 +281,11 @@ if __name__ == "__main__":
     parser.add_argument(
         "--prefill", required=True, help="Comma-separated URLs for prefill servers"
     )
+    parser.add_argument(
+        "--prefill-bootstrap-ports",
+        help="Comma-separated bootstrap ports for prefill servers",
+        default="8998",
+    )
     parser.add_argument(
         "--decode", required=True, help="Comma-separated URLs for decode servers"
     )
@@ -278,4 +296,23 @@ if __name__ == "__main__":
         "--port", type=int, default=8000, help="Port to bind the server (default: 8000)"
     )
     args = parser.parse_args()
-    run(args.prefill.split(","), args.decode.split(","), args.host, args.port)
+    prefill_urls = args.prefill.split(",")
+    bootstrap_ports = [int(p) for p in args.prefill_bootstrap_ports.split(",")]
+    if len(bootstrap_ports) == 1:
+        bootstrap_ports = bootstrap_ports * len(prefill_urls)
+    else:
+        if len(bootstrap_ports) != len(prefill_urls):
+            raise ValueError(
+                "Number of prefill URLs must match number of bootstrap ports"
+            )
+            exit(1)
+    prefill_configs = []
+    for url, port in zip(prefill_urls, bootstrap_ports):
+        prefill_configs.append(PrefillConfig(url, port))
+    decode_addrs = args.decode.split(",")
+    run(prefill_configs, decode_addrs, args.host, args.port)

sglang 0.4.5.post3__py3-none-any.whl → 0.4.6.post1__py3-none-any.whl

sglang 0.4.5.post3__py3-none-any.whl → 0.4.6.post1__py3-none-any.whl