sglang 0.4.5.post1.tar.gz → 0.4.5.post2.tar.gz
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- {sglang-0.4.5.post1/sglang.egg-info → sglang-0.4.5.post2}/PKG-INFO +3 -3
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/README.md +1 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/pyproject.toml +2 -2
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/__init__.py +2 -4
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/bench_one_batch.py +2 -2
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/bench_serving.py +0 -4
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/lang/backend/anthropic.py +0 -4
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/lang/backend/base_backend.py +1 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/lang/backend/openai.py +1 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/lang/backend/vertexai.py +0 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/lang/compiler.py +1 -7
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/lang/tracer.py +3 -7
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/_custom_ops.py +0 -2
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/constrained/outlines_jump_forward.py +14 -1
- sglang-0.4.5.post2/sglang/srt/constrained/triton_ops/bitmask_ops.py +141 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/constrained/xgrammar_backend.py +26 -4
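The constrained-decoding entries above (the new Triton bitmask kernels in `bitmask_ops.py` plus the `xgrammar_backend.py` changes) are exercised by structured output. A minimal sketch, assuming the `json_schema` sampling-param key and the `grammar_backend` engine argument behave as in other 0.4.x releases; the model path is a placeholder:

```python
import json

import sglang as sgl

# JSON schema the decoder must satisfy; enforcement goes through the
# xgrammar backend and its token-bitmask kernels.
schema = json.dumps(
    {
        "type": "object",
        "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
        "required": ["name", "age"],
    }
)

llm = sgl.Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    grammar_backend="xgrammar",
)
out = llm.generate(
    "Return a JSON object describing a person.",
    sampling_params={"json_schema": schema, "max_new_tokens": 128},
)
print(out["text"])
llm.shutdown()
```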
- sglang-0.4.5.post2/sglang/srt/custom_op.py +44 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/disaggregation/decode.py +62 -6
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/disaggregation/mini_lb.py +5 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/disaggregation/mooncake/conn.py +32 -62
- sglang-0.4.5.post2/sglang/srt/disaggregation/mooncake/transfer_engine.py +77 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/disaggregation/prefill.py +40 -4
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/disaggregation/utils.py +15 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/entrypoints/verl_engine.py +7 -5
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/activation.py +6 -8
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/attention/flashattention_backend.py +114 -71
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/attention/torch_native_backend.py +6 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/attention/triton_backend.py +6 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +13 -2
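The attention-backend entries above are dominated by `flashattention_backend.py` (+114 −71). Backend selection is a server argument; a sketch, assuming the `attention_backend` keyword mirrors the `--attention-backend` flag and that `"fa3"` requires a GPU with FlashAttention 3 support (otherwise try `"flashinfer"`, `"triton"`, or `"torch_native"`):

```python
import sglang as sgl

# Pick the attention backend explicitly; "fa3" routes through
# flashattention_backend.py, the file with the largest change above.
llm = sgl.Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    attention_backend="fa3",
)
print(llm.generate("Hello", sampling_params={"max_new_tokens": 16})["text"])
llm.shutdown()
```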
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/linear.py +17 -3
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/ep_moe/layer.py +15 -29
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_native.py +4 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +14 -19
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/topk.py +27 -30
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/parameter.py +0 -2
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/__init__.py +1 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/blockwise_int8.py +2 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +8 -2
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +16 -44
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +4 -7
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/fp8.py +115 -132
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/fp8_kernel.py +213 -57
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/fp8_utils.py +187 -262
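The largest refactor in this release sits in the FP8 stack (`fp8.py`, `fp8_kernel.py`, `fp8_utils.py`). That path is reached via on-the-fly FP8 quantization; a sketch using the standard `quantization="fp8"` choice and a placeholder model:

```python
import sglang as sgl

# On-the-fly FP8 weight quantization exercises the refactored
# fp8.py / fp8_kernel.py / fp8_utils.py code paths.
llm = sgl.Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    quantization="fp8",
)
print(
    llm.generate(
        "The capital of France is",
        sampling_params={"max_new_tokens": 8},
    )["text"]
)
llm.shutdown()
```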
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/moe_wna16.py +2 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/utils.py +5 -11
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/w8a8_fp8.py +2 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/w8a8_int8.py +7 -7
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/radix_attention.py +15 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/rotary_embedding.py +3 -2
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/sampler.py +5 -10
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/lora/backend/base_backend.py +18 -2
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/lora/backend/flashinfer_backend.py +1 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/lora/backend/triton_backend.py +1 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/lora/layers.py +1 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/lora/lora.py +1 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/lora/lora_manager.py +1 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/detokenizer_manager.py +0 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/io_struct.py +1 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/mm_utils.py +4 -3
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processor.py +0 -2
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/base_processor.py +3 -2
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/schedule_batch.py +2 -4
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/scheduler.py +12 -71
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/tokenizer_manager.py +1 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/mem_cache/hiradix_cache.py +5 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/mem_cache/memory_pool.py +7 -2
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/model_executor/model_runner.py +20 -27
- sglang-0.4.5.post2/sglang/srt/models/bert.py +398 -0
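`bert.py` is a new model implementation. BERT-family checkpoints are served in sglang as embedding models; a sketch, assuming the `is_embedding`/`encode` API matches other 0.4.x embedding models and using a BERT-based checkpoint as a placeholder:

```python
import sglang as sgl

# bge-base-en-v1.5 is a BERT-architecture encoder, so it should route
# through the new bert.py implementation (placeholder choice).
llm = sgl.Engine(model_path="BAAI/bge-base-en-v1.5", is_embedding=True)
result = llm.encode("What is the capital of France?")
print(len(result["embedding"]))  # dimensionality of the returned vector
llm.shutdown()
```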
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/deepseek.py +1 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/deepseek_nextn.py +74 -70
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/deepseek_v2.py +289 -348
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/llama.py +5 -5
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/minicpm3.py +29 -201
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/qwen2.py +4 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/qwen2_moe.py +14 -13
- sglang-0.4.5.post2/sglang/srt/models/qwen3.py +335 -0
- sglang-0.4.5.post2/sglang/srt/models/qwen3_moe.py +423 -0
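`qwen3.py` and `qwen3_moe.py` add Qwen3 dense and MoE support, the headline model additions of this release. A sketch; the checkpoint name is illustrative, and any model whose `config.json` lists the matching `architectures` entry should route to these classes:

```python
import sglang as sgl

# Serving a Qwen3 checkpoint offline; model routing is driven by the
# architectures field in the checkpoint's config.json.
llm = sgl.Engine(model_path="Qwen/Qwen3-8B")  # illustrative checkpoint
print(
    llm.generate(
        "Briefly explain KV caching.",
        sampling_params={"max_new_tokens": 64},
    )["text"]
)
llm.shutdown()
```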
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/reasoning_parser.py +0 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/sampling/sampling_batch_info.py +2 -3
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/server_args.py +34 -32
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/speculative/eagle_worker.py +4 -7
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/utils.py +16 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/runners.py +5 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/test_block_fp8.py +167 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/test_custom_ops.py +1 -1
- sglang-0.4.5.post2/sglang/version.py +1 -0
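`sglang/version.py` is regenerated with the new version string. After upgrading (`pip install -U "sglang==0.4.5.post2"`), a quick check that your environment loads the new module:

```python
import sglang

# version.py is a single line: __version__ = "0.4.5.post2"
assert sglang.__version__ == "0.4.5.post2", sglang.__version__
print(sglang.__version__)
```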
- {sglang-0.4.5.post1 → sglang-0.4.5.post2/sglang.egg-info}/PKG-INFO +3 -3
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang.egg-info/SOURCES.txt +4 -3
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang.egg-info/requires.txt +1 -1
- sglang-0.4.5.post1/sglang/srt/custom_op.py +0 -106
- sglang-0.4.5.post1/sglang/srt/disaggregation/mooncake/transfer_engine.py +0 -108
- sglang-0.4.5.post1/sglang/srt/lora/backend/__init__.py +0 -25
- sglang-0.4.5.post1/sglang/srt/server.py +0 -18
- sglang-0.4.5.post1/sglang/test/attention/__init__.py +0 -0
- sglang-0.4.5.post1/sglang/version.py +0 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/LICENSE +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/setup.cfg +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/api.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/bench_offline_throughput.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/bench_one_batch_server.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/check_env.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/global_config.py +0 -0
- {sglang-0.4.5.post1/sglang/lang → sglang-0.4.5.post2/sglang/lang/backend}/__init__.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/lang/chat_template.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/lang/choices.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/lang/interpreter.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/lang/ir.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/launch_server.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/llama3_eval.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/aio_rwlock.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/code_completion_parser.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/configs/chatglm.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/configs/dbrx.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/configs/deepseekvl2.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/configs/device_config.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/configs/janus_pro.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/configs/load_config.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/configs/model_config.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/configs/utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/connector/__init__.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/connector/base_connector.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/connector/redis.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/connector/s3.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/connector/serde/__init__.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/connector/serde/safe_serde.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/connector/serde/serde.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/connector/utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/constrained/llguidance_backend.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/constrained/outlines_backend.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/constrained/reasoner_grammar_backend.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/conversation.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/disaggregation/base/__init__.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/disaggregation/base/conn.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/disaggregation/mooncake/__init__.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/distributed/__init__.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/distributed/communication_op.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/distributed/device_communicators/cuda_wrapper.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/distributed/device_communicators/hpu_communicator.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/distributed/device_communicators/pynccl.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/distributed/device_communicators/pynccl_wrapper.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/distributed/device_communicators/shm_broadcast.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/distributed/device_communicators/xpu_communicator.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/distributed/parallel_state.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/distributed/utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/entrypoints/EngineBase.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/entrypoints/engine.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/entrypoints/http_server.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/entrypoints/http_server_engine.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/function_call_parser.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/attention/base_attn_backend.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/attention/flashinfer_mla_backend.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/attention/flashmla_backend.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/attention/utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/attention/vision.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/dp_attention.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/elementwise.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/layernorm.py +1 -1
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.4.5.post1/sglang/lang/backend → sglang-0.4.5.post2/sglang/srt/layers/moe/ep_moe}/__init__.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/ep_moe/kernels.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/ep_moe/token_dispatcher.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/__init__.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/moe/router.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/awq.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.4.5.post1/sglang/srt/layers/moe/ep_moe → sglang-0.4.5.post2/sglang/srt/layers/quantization/compressed_tensors}/__init__.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/compressed_tensors/utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/gptq.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/int8_kernel.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/int8_utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/kv_cache.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/quantization/modelopt_quant.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/lora/mem_pool.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/lora/triton_ops/__init__.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/lora/triton_ops/gate_up_lora_b.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/lora/triton_ops/qkv_lora_b.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/lora/triton_ops/sgemm_lora_a.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/lora/triton_ops/sgemm_lora_b.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/lora/utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/cache_controller.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/configure_logging.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/expert_distribution.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/clip.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/gemma3.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/janus_pro.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/llava.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/minicpm.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/mlama.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/mllama4.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/multimodal_processors/qwen_vl.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/schedule_policy.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/scheduler_output_processor_mixin.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/session_controller.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/managers/utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/mem_cache/paged_allocator.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/metrics/collector.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/model_executor/cuda_graph_runner.py +2 -2
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/model_executor/forward_batch_info.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/model_loader/__init__.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/model_loader/loader.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/model_loader/weight_utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/clip.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/deepseek_janus_pro.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/deepseek_vl2.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/gemma3_causal.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/gemma3_mm.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/granite.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/grok.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/llama4.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/llama_eagle.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/llama_eagle3.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/llava.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/minicpmo.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/minicpmv.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/mllama4.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/olmo2.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/phi3_small.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/qwen2_5_vl.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/qwen2_classification.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/qwen2_eagle.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/qwen2_rm.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/qwen2_vl.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/registry.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/openai_api/adapter.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/patch_torch.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/platforms/interface.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/sampling/custom_logit_processor.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/sampling/penaltylib/min_new_tokens.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/sampling/penaltylib/presence_penalty.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/speculative/build_eagle_tree.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/speculative/eagle_utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/speculative/spec_info.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/torch_memory_saver_adapter.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/srt/warmup.py +0 -0
- {sglang-0.4.5.post1/sglang/srt/layers/quantization/compressed_tensors → sglang-0.4.5.post2/sglang/test}/__init__.py +0 -0
- {sglang-0.4.5.post1/sglang/test → sglang-0.4.5.post2/sglang/test/attention}/__init__.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/attention/test_flashattn_backend.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/attention/test_flashattn_mla_backend.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/attention/test_prefix_chunk_info.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/run_eval.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/send_one.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/test_activation.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/test_block_fp8_ep.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/test_dynamic_grad_mode.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/test_programs.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/test/test_utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang/utils.py +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.4.5.post1 → sglang-0.4.5.post2}/sglang.egg-info/top_level.txt +0 -0

--- a/sglang.egg-info/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.5.post1
+Version: 0.4.5.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -246,7 +246,7 @@ Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
-Requires-Dist: sgl-kernel==0.0.9.post1; extra == "srt"
+Requires-Dist: sgl-kernel==0.0.9.post2; extra == "srt"
 Requires-Dist: flashinfer_python==0.2.3; extra == "srt"
 Requires-Dist: torch==2.5.1; extra == "srt"
 Requires-Dist: torchvision==0.20.1; extra == "srt"
@@ -381,7 +381,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs,
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

--- a/README.md
+++ b/README.md
@@ -43,7 +43,7 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs,
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, zero-overhead CPU scheduler, continuous batching, token attention (paged attention), speculative decoding, tensor parallelism, chunked prefill, structured outputs, quantization (FP8/INT4/AWQ/GPTQ), and multi-lora batching.
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.4.5.post1"
+version = "0.4.5.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -47,7 +47,7 @@ runtime_common = [
 
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.0.9.post1",
+    "sgl-kernel==0.0.9.post2",
     "flashinfer_python==0.2.3",
     "torch==2.5.1",
     "torchvision==0.20.1",

--- a/sglang/__init__.py
+++ b/sglang/__init__.py
@@ -24,6 +24,7 @@ from sglang.api import (
     user_end,
     video,
 )
+from sglang.global_config import global_config
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.lang.choices import (
     greedy_token_selection,
@@ -31,6 +32,7 @@ from sglang.lang.choices import (
     unconditional_likelihood_normalized,
 )
 from sglang.utils import LazyImport
+from sglang.version import __version__
 
 ServerArgs = LazyImport("sglang.srt.server_args", "ServerArgs")
 Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
@@ -38,10 +40,6 @@ LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
 OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
 VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
 
-# Other configs
-from sglang.global_config import global_config
-from sglang.version import __version__
-
 __all__ = [
     "Engine",
     "Runtime",

--- a/sglang/bench_one_batch.py
+++ b/sglang/bench_one_batch.py
@@ -207,7 +207,7 @@ def prepare_extend_inputs_for_correctness_test(
 
 
 def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
-    input_ids = np.
+    input_ids = np.random.randint(0, 10000, (batch_size, input_len), dtype=np.int32)
     sampling_params = SamplingParams(
         temperature=0,
         max_new_tokens=BenchArgs.output_len,
@@ -396,7 +396,7 @@ def latency_test_run_once(
         decode_latencies.append(latency)
         if i < 5:
             rank_print(
-                f"Decode.
+                f"Decode. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
             )
 
     if profile:

--- a/sglang/bench_serving.py
+++ b/sglang/bench_serving.py
@@ -707,10 +707,6 @@ def sample_random_requests(
 
     # Download sharegpt if necessary
     if not os.path.isfile(dataset_path):
-        print(
-            "If you do not want to randomly sample from a dataset,"
-            " please use --dataset-name random-ids."
-        )
         dataset_path = download_and_cache_file(SHAREGPT_URL)
 
     # Load the dataset.

--- a/sglang/lang/compiler.py
+++ b/sglang/lang/compiler.py
@@ -5,13 +5,7 @@ from typing import List, Union
 
 from sglang.global_config import global_config
 from sglang.lang.interpreter import ProgramState, StreamExecutor, cache_program
-from sglang.lang.ir import (
-    SglArgument,
-    SglConstantText,
-    SglExpr,
-    SglSamplingParams,
-    SglVariable,
-)
+from sglang.lang.ir import SglArgument, SglExpr, SglSamplingParams, SglVariable
 
 
 def compile_func(function, backend):

--- a/sglang/lang/tracer.py
+++ b/sglang/lang/tracer.py
@@ -1,20 +1,16 @@
 """Tracing a program."""
 
 import uuid
-from typing import Any,
+from typing import Any, Dict, List, Optional
 
-from sglang.global_config import global_config
 from sglang.lang.backend.base_backend import BaseBackend
 from sglang.lang.interpreter import ProgramState, ProgramStateGroup
 from sglang.lang.ir import (
     SglArgument,
-    SglCommitLazy,
-    SglConcateAndAppend,
     SglConstantText,
     SglExpr,
     SglExprList,
     SglFork,
-    SglFunction,
     SglGen,
     SglGetForkItem,
     SglRoleBegin,
@@ -230,8 +226,8 @@ class TracerProgramState(ProgramState):
         self.cur_role = None
 
     def _execute_var_scope_end(self, expr: SglVarScopeEnd):
-        new_node = SglVariable(name, source=self.last_node)
-        self.variables[name] = new_node
+        new_node = SglVariable(expr.name, source=self.last_node)
+        self.variables[expr.name] = new_node
 
     def get_var(self, name):
         ret = self.arguments.get(name, None)

--- a/sglang/srt/constrained/outlines_jump_forward.py
+++ b/sglang/srt/constrained/outlines_jump_forward.py
@@ -19,10 +19,13 @@ Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/
 import dataclasses
 import logging
 from collections import defaultdict
+from typing import Optional
 
 import interegular
 from interegular import InvalidSyntax
-from outlines.caching import cache
+from outlines.caching import cache
+
+from sglang.srt.utils import get_bool_env_var
 
 try:
     # outlines >= 0.1.0
@@ -34,6 +37,9 @@ except ImportError:
 
 IP_REGEX = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)"
 
+# Env var was set in sglang.srt.server_args.ServerArgs.__post__init__
+DISABLE_DISK_CACHE = get_bool_env_var("SGLANG_DISABLE_OUTLINES_DISK_CACHE", "true")
+
 logger = logging.getLogger(__name__)
 
 
@@ -45,6 +51,13 @@ class JumpEdge:
     byte_next_state: int = None
 
 
+def disk_cache(expire: Optional[float] = None, typed=False, ignore=()):
+    if not DISABLE_DISK_CACHE:
+        return cache(expire, typed, ignore)
+    else:
+        return lambda fn: fn
+
+
 @disk_cache()
 def init_state_to_jump_forward(regex_string):
     try:
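For readers skimming the hunk above: a decorator factory has to hand back a function-to-function mapping in both branches, which is why the disabled path returns an identity lambda rather than None. A minimal self-contained sketch of the same pattern, with functools.lru_cache standing in for outlines' disk cache (an illustrative assumption, not the package's code):

    from functools import lru_cache

    DISABLE_DISK_CACHE = True  # mirrors the SGLANG_DISABLE_OUTLINES_DISK_CACHE flag

    def disk_cache(expire=None, typed=False, ignore=()):
        # Enabled: return a real caching decorator; disabled: identity decorator.
        if not DISABLE_DISK_CACHE:
            return lru_cache(maxsize=None)  # stand-in for outlines.caching.cache
        return lambda fn: fn  # returning None here would clobber the decorated name

    @disk_cache()
    def compile_fsm(regex_string):
        return len(regex_string)  # placeholder for the real FSM construction

    assert compile_fsm("a|b") == 3  # still callable with caching disabled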

--- /dev/null
+++ b/sglang/srt/constrained/triton_ops/bitmask_ops.py
@@ -0,0 +1,141 @@
+# Adapt from
+# https://github.com/mlc-ai/xgrammar/blob/v0.1.17/python/xgrammar/kernels/apply_token_bitmask_inplace_triton.py
+
+from typing import List, Optional, Union
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.utils import get_device_core_count
+
+
+@triton.jit
+def apply_token_bitmask_inplace_kernel(
+    logits_ptr,
+    bitmask_ptr,
+    indices_ptr,
+    num_rows,
+    vocab_size,
+    logits_strides,
+    bitmask_strides,
+    NUM_SMS: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """Apply a bitmask to logits in-place using Triton. The bitmask is a 01 bitwise compressed tensor,
+    where 0 means the token is masked and 1 means the token is not masked. After applying the bitmask,
+    the masked logits will be set to -inf.
+
+    Parameters
+    ----------
+    logits_ptr : tl.tensor
+        Pointer to the logits tensor to apply the bitmask to.
+
+    bitmask_ptr : tl.tensor
+        Pointer to the bitmask tensor to apply.
+
+    indices_ptr : Optional[tl.tensor]
+        Optional pointer to indices tensor specifying which rows to apply the mask to.
+
+    num_rows : int
+        Number of rows to process. If indices_ptr is provided, this is the number of unique indices.
+
+    vocab_size : int
+        Size of the vocabulary dimension. If the logits does not have a vocab padding, this is the
+        same as the logits's second dimension. Otherwise, this is the actual size of the vocabulary.
+
+    logits_strides : int
+        Stride between rows in the logits tensor.
+
+    bitmask_strides : int
+        Stride between rows in the bitmask tensor.
+
+    NUM_SMS : int
+        Number of streaming multiprocessors to use.
+
+    BLOCK_SIZE : int
+        Size of processing blocks.
+    """
+
+    pid = tl.program_id(0)
+    num_blocks = tl.cdiv(vocab_size, BLOCK_SIZE)
+    for work_id in tl.range(pid, num_rows * num_blocks, NUM_SMS):
+        row_id = work_id // num_blocks
+        block_offset = (work_id % num_blocks) * BLOCK_SIZE
+        batch_id = row_id if indices_ptr is None else tl.load(indices_ptr + row_id)
+        offsets = block_offset + tl.arange(0, BLOCK_SIZE)
+        bitmask_offsets = block_offset // 32 + tl.arange(0, BLOCK_SIZE // 32)
+        vocab_mask = offsets < vocab_size
+        packed_bitmask_mask = bitmask_offsets < bitmask_strides
+        packed_bitmask = tl.load(
+            bitmask_ptr + batch_id * bitmask_strides + bitmask_offsets,
+            packed_bitmask_mask,
+        )
+        bitmask = ((packed_bitmask[:, None] >> (tl.arange(0, 32)[None, :])) & 1) == 0
+        bitmask = bitmask.reshape(BLOCK_SIZE)
+
+        tl.store(
+            logits_ptr + batch_id * logits_strides + offsets,
+            -float("inf"),
+            vocab_mask & bitmask,
+        )
+
+
+def apply_token_bitmask_inplace_triton(
+    logits: torch.Tensor,
+    bitmask: torch.Tensor,
+    indices: Optional[Union[List[int], torch.Tensor]] = None,
+):
+    NUM_SMS = get_device_core_count()
+    BLOCK_SIZE = 4096
+    BITS_PER_BLOCK = 32
+
+    # Check input dtype
+    assert bitmask.dtype == torch.int32, "bitmask must be of type int32"
+
+    # Check input tensor shapes.
+    logits_shape = logits.shape
+    bitmask_shape = bitmask.shape
+    if logits.ndim == 1:
+        logits_shape = (1, logits_shape[0])
+    if bitmask.ndim == 1:
+        bitmask_shape = (1, bitmask_shape[0])
+
+    required_bitmask_width = (logits_shape[1] + BITS_PER_BLOCK - 1) // BITS_PER_BLOCK
+    assert required_bitmask_width >= bitmask_shape[1], (
+        f"Bitmask width too large: allow at most {required_bitmask_width} int32s for "
+        f"logits' width {logits_shape[1]}, but got {bitmask_shape[1]}"
+    )
+
+    vocab_size = min(logits_shape[1], bitmask_shape[1] * BITS_PER_BLOCK)
+
+    num_rows = None
+    if isinstance(indices, list) or isinstance(indices, torch.Tensor):
+        indices = torch.tensor(indices, dtype=torch.int32, device=logits.device)
+        num_rows = indices.shape[0]
+    else:
+        assert (
+            logits_shape[0] == bitmask_shape[0]
+        ), f"batch size mismatch: logits {logits_shape[0]} vs bitmask {bitmask_shape[0]}"
+        num_rows = logits_shape[0]
+
+    if NUM_SMS > 0:
+        grid = (NUM_SMS,)
+    else:
+        num_blocks = triton.cdiv(vocab_size, BLOCK_SIZE)
+        grid = (num_rows * num_blocks,)
+        NUM_SMS = triton.next_power_of_2(grid[0])
+
+    apply_token_bitmask_inplace_kernel[grid](
+        logits,
+        bitmask,
+        indices,
+        num_rows,
+        vocab_size,
+        logits_shape[1],
+        bitmask_shape[1],
+        NUM_SMS,
+        BLOCK_SIZE,
+        num_warps=BLOCK_SIZE // 32 // (16 // logits.element_size()),
+        num_stages=3,
+    )
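For context on the new kernel: the bitmask is bit-packed, so bit i of int32 word i // 32 is 1 when token i is allowed, and masked logits are overwritten with -inf. A pure-PyTorch reference for the same semantics, runnable on CPU (an illustrative sketch only; these names are not part of the package):

    import torch

    def apply_token_bitmask_reference(logits: torch.Tensor, bitmask: torch.Tensor) -> None:
        batch, vocab = logits.shape
        # Gather the int32 word that holds each token's bit, then test that bit.
        words = bitmask[:, torch.arange(vocab) // 32]
        bits = (words >> (torch.arange(vocab) % 32)) & 1
        logits[bits == 0] = float("-inf")  # 0 bit = token masked

    # Allow only tokens 0 and 5 in a vocab of 8: bits 0 and 5 set -> 0b100001 = 33.
    logits = torch.zeros(1, 8)
    bitmask = torch.tensor([[33]], dtype=torch.int32)
    apply_token_bitmask_reference(logits, bitmask)
    assert logits[0, 5] == 0 and logits[0, 1] == float("-inf")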

--- a/sglang/srt/constrained/xgrammar_backend.py
+++ b/sglang/srt/constrained/xgrammar_backend.py
@@ -25,13 +25,16 @@ from xgrammar import (
     StructuralTagItem,
     TokenizerInfo,
     allocate_token_bitmask,
-    apply_token_bitmask_inplace,
 )
 
 from sglang.srt.constrained.base_grammar_backend import (
     BaseGrammarBackend,
     BaseGrammarObject,
 )
+from sglang.srt.constrained.triton_ops.bitmask_ops import (
+    apply_token_bitmask_inplace_triton,
+)
+from sglang.srt.utils import get_bool_env_var
 
 logger = logging.getLogger(__name__)
 
@@ -55,6 +58,18 @@ class XGrammarGrammar(BaseGrammarObject):
         self.override_stop_tokens = override_stop_tokens
         self.finished = False
 
+        # Fix (from vLLM team): postpone the import of apply_token_bitmask_inplace_kernels to the
+        # class init site to avoid re-initializing CUDA in forked subprocess.
+        from xgrammar.kernels import apply_token_bitmask_inplace_kernels
+
+        self.use_token_bitmask_triton = get_bool_env_var(
+            "SGLANG_TOKEN_BITMASK_TRITON", "false"
+        )
+        self.apply_vocab_mask_cuda = apply_token_bitmask_inplace_kernels.get(
+            "cuda", None
+        )
+        self.apply_vocab_mask_cpu = apply_token_bitmask_inplace_kernels.get("cpu", None)
+
     def accept_token(self, token: int):
         assert self.matcher.accept_token(token)
 
@@ -97,9 +112,16 @@ class XGrammarGrammar(BaseGrammarObject):
     def move_vocab_mask(vocab_mask: torch.Tensor, device) -> torch.Tensor:
         return vocab_mask.to(device, non_blocking=True)
 
-    @staticmethod
-    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
-        apply_token_bitmask_inplace(logits, vocab_mask)
+    def apply_vocab_mask(self, logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
+        if (
+            not self.use_token_bitmask_triton
+            and logits.device.type == "cuda"
+            and self.apply_vocab_mask_cuda
+        ):
+            return self.apply_vocab_mask_cuda(logits, vocab_mask)
+        if logits.device.type == "cpu" and self.apply_vocab_mask_cpu:
+            return self.apply_vocab_mask_cpu(logits, vocab_mask)
+        apply_token_bitmask_inplace_triton(logits, vocab_mask)
 
     def copy(self):
         matcher = GrammarMatcher(

--- /dev/null
+++ b/sglang/srt/custom_op.py
@@ -0,0 +1,44 @@
+from typing import Optional
+
+import torch
+from torch import nn
+
+from sglang.srt.utils import is_cuda, is_hip
+
+_is_cuda = is_cuda()
+_is_hip = is_hip()
+
+
+class CustomOp(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self._forward_method = self.dispatch_forward()
+
+    def forward(self, *args, **kwargs):
+        return self._forward_method(*args, **kwargs)
+
+    def forward_native(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def forward_cuda(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def forward_hip(self, *args, **kwargs):
+        return self.forward_cuda(*args, **kwargs)
+
+    def forward_xpu(self, *args, **kwargs):
+        return self.forward_native(*args, **kwargs)
+
+    def forward_hpu(self, *args, **kwargs):
+        return self.forward_native(*args, **kwargs)
+
+    def forward_cpu(self, *args, **kwargs):
+        return self.forward_native(*args, **kwargs)
+
+    def dispatch_forward(self):
+        if _is_cuda:
+            return self.forward_cuda
+        elif _is_hip:
+            return self.forward_hip
+        else:
+            return self.forward_native
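A usage sketch for the new base class (not from the package): a subclass overrides only the backends it supports, and the forward method is bound once at construction rather than branching on every call:

    import torch
    from torch import nn

    _is_cuda = torch.cuda.is_available()  # stand-in for sglang.srt.utils.is_cuda()

    class SiluAndMulSketch(nn.Module):
        """Hypothetical CustomOp-style subclass illustrating the dispatch."""

        def __init__(self):
            super().__init__()
            self._forward_method = self.forward_cuda if _is_cuda else self.forward_native

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            return self._forward_method(x)

        def forward_native(self, x: torch.Tensor) -> torch.Tensor:
            gate, up = x.chunk(2, dim=-1)
            return torch.nn.functional.silu(gate) * up

        def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
            # A real op would call a fused kernel here; the sketch falls back.
            return self.forward_native(x)

    out = SiluAndMulSketch()(torch.randn(2, 8))  # -> shape (2, 4)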

--- a/sglang/srt/disaggregation/decode.py
+++ b/sglang/srt/disaggregation/decode.py
@@ -35,6 +35,7 @@ from sglang.srt.disaggregation.utils import (
     ReqToMetadataIdxAllocator,
     TransferBackend,
     get_kv_class,
+    kv_to_page_indices,
     poll_and_all_reduce,
 )
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
@@ -121,7 +122,7 @@ class DecodePreallocQueue:
         kv_args.aux_item_lens = [
             metadata_buffer[0].nbytes for metadata_buffer in self.metadata_buffers
         ]
-        kv_args.ib_device =
+        kv_args.ib_device = self.scheduler.server_args.disaggregation_ib_device
         kv_args.gpu_id = self.scheduler.gpu_id
         kv_manager_class = get_kv_class(self.transfer_backend, KVClassType.MANAGER)
         kv_manager = kv_manager_class(
@@ -205,7 +206,10 @@ class DecodePreallocQueue:
                 self.req_to_metadata_buffer_idx_allocator.alloc()
             )
             assert decode_req.metadata_buffer_index is not None
-            decode_req.kv_receiver.init(kv_indices, decode_req.metadata_buffer_index)
+            page_indices = kv_to_page_indices(
+                kv_indices, self.token_to_kv_pool_allocator.page_size
+            )
+            decode_req.kv_receiver.init(page_indices, decode_req.metadata_buffer_index)
             preallocated_reqs.append(decode_req)
             indices_to_remove.add(i)
 
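Assuming kv_to_page_indices collapses per-token KV slot indices into the page indices that contain them, this is roughly the arithmetic involved (a sketch under that assumption, not the helper's actual implementation):

    import numpy as np

    def kv_to_page_indices_sketch(kv_indices: np.ndarray, page_size: int) -> np.ndarray:
        if page_size == 1:
            return kv_indices  # every slot is its own page
        # Slots 32..47 all live on page 2 when page_size is 16.
        return np.unique(kv_indices // page_size)

    print(kv_to_page_indices_sketch(np.array([32, 33, 47, 48]), 16))  # -> [2 3]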
@@ -245,10 +249,30 @@ class DecodePreallocQueue:
         assert req_pool_indices is not None
 
         req.req_pool_idx = req_pool_indices[0]
-
-
-
-
+        if self.token_to_kv_pool_allocator.page_size == 1:
+            kv_loc = self.token_to_kv_pool_allocator.alloc(
+                len(req.origin_input_ids) + max(len(req.output_ids) - 1, 0)
+            )
+        else:
+            num_tokens = len(req.origin_input_ids) + max(len(req.output_ids) - 1, 0)
+            kv_loc = self.token_to_kv_pool_allocator.alloc_extend(
+                prefix_lens=torch.tensor(
+                    [0],
+                    dtype=torch.int64,
+                    device=self.token_to_kv_pool_allocator.device,
+                ),
+                seq_lens=torch.tensor(
+                    [num_tokens],
+                    dtype=torch.int64,
+                    device=self.token_to_kv_pool_allocator.device,
+                ),
+                last_loc=torch.tensor(
+                    [-1],
+                    dtype=torch.int64,
+                    device=self.token_to_kv_pool_allocator.device,
+                ),
+                extend_num_tokens=num_tokens,
+            )
         assert kv_loc is not None
 
         self.req_to_token_pool.write((req.req_pool_idx, slice(0, len(kv_loc))), kv_loc)
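The branch exists because a paged allocator reserves whole pages rather than individual slots: with no cached prefix (prefix_lens=[0]) and no partially filled last page (last_loc=[-1]), a request needs ceil(num_tokens / page_size) fresh pages. A quick check of that accounting with hypothetical numbers:

    page_size = 16
    num_tokens = 35  # len(origin_input_ids) + max(len(output_ids) - 1, 0)

    pages_needed = -(-num_tokens // page_size)  # ceiling division
    assert pages_needed == 3  # 48 slots reserved to cover 35 tokens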
@@ -419,6 +443,38 @@ class ScheduleBatchDisaggregationDecodeMixin:
 
 class SchedulerDisaggregationDecodeMixin:
 
+    @torch.no_grad()
+    def event_loop_normal_disagg_decode(self):
+        """A normal scheduler loop for decode worker in disaggregation mode."""
+
+        while True:
+            recv_reqs = self.recv_requests()
+            self.process_input_requests(recv_reqs)
+            # polling and allocating kv cache
+            self.process_decode_queue()
+            batch = self.get_next_disagg_decode_batch_to_run()
+            self.cur_batch = batch
+
+            if batch:
+                # Generate fake extend output.
+                if batch.forward_mode.is_extend():
+                    # Note: Logprobs should be handled on the prefill engine.
+                    self.stream_output(batch.reqs, False)
+                else:
+                    result = self.run_batch(batch)
+                    self.process_batch_result(batch, result)
+
+            if batch is None and (
+                len(self.disagg_decode_transfer_queue.queue)
+                + len(self.disagg_decode_prealloc_queue.queue)
+                == 0
+            ):
+                # When the server is idle, do self-check and re-init some states
+                self.check_memory()
+                self.new_token_ratio = self.init_new_token_ratio
+
+            self.last_batch = batch
+
     def get_next_disagg_decode_batch_to_run(
         self: Scheduler,
     ) -> Optional[Tuple[ScheduleBatch, bool]]:

--- a/sglang/srt/disaggregation/mini_lb.py
+++ b/sglang/srt/disaggregation/mini_lb.py
@@ -26,7 +26,11 @@ class MiniLoadBalancer:
         self, modified_request, prefill_server, decode_server
     ) -> ORJSONResponse:
 
-        async with aiohttp.ClientSession(
+        async with aiohttp.ClientSession(
+            timeout=aiohttp.ClientTimeout(
+                total=3600
+            )  # Add timeout for request reliability
+        ) as session:
             tasks = [
                 session.post(f"{prefill_server}/generate", json=modified_request),
                 session.post(f"{decode_server}/generate", json=modified_request),