sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/schedule_batch.py

@@ -37,6 +37,7 @@ import hashlib
 import logging
 import threading
 from enum import Enum, auto
+from http import HTTPStatus
 from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
 
 import numpy as np
@@ -48,7 +49,11 @@ from sglang.global_config import global_config
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.constrained.base_grammar_backend import BaseGrammarObject
 from sglang.srt.disaggregation.base import BaseKVSender
-from sglang.srt.disaggregation.
+from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
+    ScheduleBatchDisaggregationDecodeMixin,
+)
+from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank
+from sglang.srt.layers.multimodal import gpu_tensor_hash
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPoolAllocator
@@ -57,7 +62,7 @@ from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, Forw
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import flatten_nested_list,
+from sglang.srt.utils import flatten_nested_list, support_triton
 
 if TYPE_CHECKING:
     from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
@@ -65,29 +70,38 @@ if TYPE_CHECKING:
 
 INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
 
+GLOBAL_SERVER_ARGS_KEYS = [
+    "attention_backend",
+    "debug_tensor_dump_inject",
+    "debug_tensor_dump_output_folder",
+    "chunked_prefill_size",
+    "deepep_mode",
+    "device",
+    "disable_chunked_prefix_cache",
+    "disable_radix_cache",
+    "enable_deepep_moe",
+    "enable_dp_attention",
+    "enable_two_batch_overlap",
+    "enable_dp_lm_head",
+    "enable_ep_moe",
+    "deepep_config",
+    "enable_nan_detection",
+    "flashinfer_mla_disable_ragged",
+    "max_micro_batch_size",
+    "moe_dense_tp_size",
+    "ep_dispatch_algorithm",
+    "disable_shared_experts_fusion",
+    "sampling_backend",
+    "speculative_accept_threshold_acc",
+    "speculative_accept_threshold_single",
+    "torchao_config",
+    "triton_attention_reduce_in_fp32",
+    "ep_num_redundant_experts",
+    "mm_attention_backend",
+]
+
 # Put some global args for easy access
-global_server_args_dict = {
-    "attention_backend": ServerArgs.attention_backend,
-    "chunked_prefill_size": ServerArgs.chunked_prefill_size,
-    "deepep_mode": ServerArgs.deepep_mode,
-    "device": ServerArgs.device,
-    "disable_chunked_prefix_cache": ServerArgs.disable_chunked_prefix_cache,
-    "disable_radix_cache": ServerArgs.disable_radix_cache,
-    "enable_deepep_moe": ServerArgs.enable_deepep_moe,
-    "enable_dp_attention": ServerArgs.enable_dp_attention,
-    "enable_dp_lm_head": ServerArgs.enable_dp_lm_head,
-    "enable_ep_moe": ServerArgs.enable_ep_moe,
-    "enable_nan_detection": ServerArgs.enable_nan_detection,
-    "flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged,
-    "max_micro_batch_size": ServerArgs.max_micro_batch_size,
-    "moe_dense_tp_size": ServerArgs.moe_dense_tp_size,
-    "n_share_experts_fusion": ServerArgs.n_share_experts_fusion,
-    "sampling_backend": ServerArgs.sampling_backend,
-    "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
-    "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
-    "torchao_config": ServerArgs.torchao_config,
-    "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
-}
+global_server_args_dict = {k: getattr(ServerArgs, k) for k in GLOBAL_SERVER_ARGS_KEYS}
 
 logger = logging.getLogger(__name__)
 
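The hunk above replaces the hand-maintained global_server_args_dict with a key list plus a comprehension over ServerArgs class attributes, so a new server flag only needs to be appended to GLOBAL_SERVER_ARGS_KEYS. A minimal sketch of the same pattern, using a stand-in dataclass rather than the real ServerArgs:

    from dataclasses import dataclass

    @dataclass
    class ServerArgsSketch:  # stand-in for sglang.srt.server_args.ServerArgs
        attention_backend: str = "flashinfer"
        device: str = "cuda"
        enable_ep_moe: bool = False

    KEYS = ["attention_backend", "device", "enable_ep_moe"]

    # getattr on the class (not an instance) picks up the dataclass defaults,
    # mirroring {k: getattr(ServerArgs, k) for k in GLOBAL_SERVER_ARGS_KEYS}.
    defaults = {k: getattr(ServerArgsSketch, k) for k in KEYS}
    assert defaults["device"] == "cuda"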
@@ -177,10 +191,10 @@ class MultimodalDataItem:
     image_offsets: Optional[list] = None
 
     # the real data, pixel_values or audio_features
-    # data: Union[List[torch.Tensor], List[np.
-    pixel_values: Union[torch.Tensor, np.
-
-    video_grid_thws: Union[torch.Tensor, np.
+    # data: Union[List[torch.Tensor], List[np.ndarray]]
+    pixel_values: Union[torch.Tensor, np.ndarray] = None
+    image_grid_thw: Union[torch.Tensor, np.ndarray] = None
+    video_grid_thws: Union[torch.Tensor, np.ndarray] = None
 
     image_emb_mask: Optional[torch.Tensor] = None
     image_spatial_crop: Optional[torch.Tensor] = None
@@ -189,8 +203,14 @@ class MultimodalDataItem:
     # [num_images, (n, w, h)]
     tgt_size: Tuple[int, int] = None
 
-
+    # kimi-vl related
+    image_grid_hws: Optional[List[torch.Tensor]] = None
+
+    audio_features: Union[torch.Tensor, np.ndarray] = None
     audio_feature_lens: Optional[List[torch.Tensor]] = None
+    audio_offsets: Optional[List[Tuple[int, int]]] = None
+
+    precomputed_features: Optional[Union[torch.Tensor, np.ndarray]] = None
 
     @staticmethod
     def is_empty_list(l):
@@ -219,7 +239,8 @@ class MultimodalDataItem:
                     for x in tensor_list
                 ]
                 tensor = torch.concat(tensor_list)
-
+            if tensor.is_cuda:
+                return gpu_tensor_hash(tensor)
             tensor = tensor.detach().contiguous()
 
             if tensor.dtype == torch.bfloat16:
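The added branch routes feature hashing through gpu_tensor_hash (from the new sglang/srt/layers/multimodal.py) whenever the concatenated tensor is already on the GPU, avoiding a device-to-host copy. A rough sketch of that dispatch; the CPU branch below is an illustrative stand-in, not the upstream implementation:

    import hashlib

    import torch

    def tensor_hash_sketch(tensor: torch.Tensor) -> int:
        if tensor.is_cuda:
            # Hash on the device, as the 0.4.7 code does via gpu_tensor_hash().
            from sglang.srt.layers.multimodal import gpu_tensor_hash
            return gpu_tensor_hash(tensor)
        # Illustrative CPU fallback: hash the raw bytes of a contiguous copy.
        data = tensor.detach().contiguous()
        if data.dtype == torch.bfloat16:
            data = data.float()  # bfloat16 has no NumPy equivalent
        return int.from_bytes(hashlib.sha256(data.numpy().tobytes()).digest()[:8], "big")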
@@ -249,7 +270,9 @@ class MultimodalDataItem:
                 return tensor_hash([f])
             return data_hash(f)
 
-        if self.
+        if self.precomputed_features is not None:
+            self.hash = hash_feature(self.precomputed_features)
+        elif self.is_audio():
             self.hash = hash_feature(self.audio_features)
         else:
             self.hash = hash_feature(self.pixel_values)
@@ -258,19 +281,24 @@ class MultimodalDataItem:
         self.pad_value = self.hash % (1 << 30)
 
     def is_audio(self):
-        return (
-            self.
-
+        return (self.modality == Modality.AUDIO) and (
+            self.precomputed_features is not None
+            or not MultimodalDataItem.is_empty_list(self.audio_features)
+        )
 
     def is_image(self):
         return (
             self.modality == Modality.IMAGE or self.modality == Modality.MULTI_IMAGES
-        ) and
+        ) and (
+            self.precomputed_features is not None
+            or not MultimodalDataItem.is_empty_list(self.pixel_values)
+        )
 
     def is_video(self):
-        return (
-            self.
-
+        return (self.modality == Modality.VIDEO) and (
+            self.precomputed_features is not None
+            or not MultimodalDataItem.is_empty_list(self.pixel_values)
+        )
 
     def is_valid(self) -> bool:
         return self.is_image() or self.is_video() or self.is_audio()
@@ -279,6 +307,16 @@ class MultimodalDataItem:
         ...
         # TODO
 
+    @staticmethod
+    def from_dict(obj: dict):
+        kwargs = dict(obj)
+        modality = kwargs.pop("modality")
+        if isinstance(modality, str):
+            modality = Modality[modality]
+        ret = MultimodalDataItem(modality=modality, **kwargs)
+        ret.validate()
+        return ret
+
 
 @dataclasses.dataclass
 class MultimodalInputs:
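A usage sketch for the new MultimodalDataItem.from_dict helper. It assumes Modality is exported from the same module (as the hunk's references suggest); the pixel tensor is an invented example:

    import torch

    from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem

    # Modality can arrive as a string (e.g. after IPC serialization) and is
    # mapped back onto the enum before the dataclass is constructed.
    item = MultimodalDataItem.from_dict(
        {
            "modality": "IMAGE",                      # becomes Modality.IMAGE
            "pixel_values": torch.rand(3, 224, 224),  # example data only
        }
    )
    assert item.is_image() and item.is_valid()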
@@ -304,8 +342,9 @@ class MultimodalInputs:
     video_token_id: Optional[int] = None
 
     # audio
-
-
+    audio_token_id: Optional[int] = None
+    audio_start_id: Optional[int] = None
+    audio_end_id: Optional[int] = None
 
     @staticmethod
     def from_dict(obj: dict):
@@ -329,6 +368,7 @@ class MultimodalInputs:
             "slice_end_id",
             "audio_start_id",
             "audio_end_id",
+            "audio_token_id",
         ]
         for arg in optional_args:
             if arg in obj:
@@ -411,6 +451,7 @@ class Req:
         bootstrap_host: Optional[str] = None,
         bootstrap_port: Optional[int] = None,
         bootstrap_room: Optional[int] = None,
+        data_parallel_rank: Optional[int] = None,
     ):
         # Input and output info
         self.rid = rid
@@ -565,6 +606,9 @@ class Req:
         self.bootstrap_room: Optional[int] = bootstrap_room
         self.disagg_kv_sender: Optional[BaseKVSender] = None
 
+        # For data parallel rank routing
+        self.data_parallel_rank: Optional[int] = data_parallel_rank
+
         # the start index of the sent kv cache
         # We want to send it chunk by chunk for chunked prefill.
         # After every chunk forward, we do the following:
@@ -578,9 +622,6 @@ class Req:
         self.tmp_end_idx: int = -1
         self.metadata_buffer_index: int = -1
 
-        # The first output_id transferred from prefill instance.
-        self.transferred_output_id: Optional[int] = None
-
     @property
     def seqlen(self):
         return len(self.origin_input_ids) + len(self.output_ids)
@@ -744,6 +785,16 @@ class Req:
         logger.info(f"{prefix}: {self.time_stats}")
         self.has_log_time_stats = True
 
+    def set_finish_with_abort(self, error_msg: str):
+        if get_tensor_model_parallel_rank() == 0:
+            logger.error(f"{error_msg}, {self.rid=}")
+        self.multimodal_inputs = None
+        self.grammar = None
+        self.origin_input_ids = [0]  # set it to one token to skip the long prefill
+        self.finished_reason = FINISH_ABORT(
+            error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
+        )
+
     def __repr__(self):
         return (
             f"Req(rid={self.rid}, "
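A hypothetical caller-side sketch of the new abort helper; the wrapper name and message are invented, and the real call sites live in the scheduler:

    def reject_request(req, reason: str) -> None:
        # Marks the request as finished with a BadRequestError (HTTP 400),
        # drops its multimodal inputs and grammar, and shrinks the prompt to a
        # single token so the now-pointless prefill is effectively skipped.
        req.set_finish_with_abort(f"Invalid request: {reason}")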
@@ -805,6 +856,8 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     global_num_tokens: Optional[List[int]] = None
     global_num_tokens_for_logprob: Optional[List[int]] = None
     can_run_dp_cuda_graph: bool = False
+    tbo_split_seq_index: Optional[int] = None
+    global_forward_mode: Optional[ForwardMode] = None
 
     # For processing logprobs
     return_logprob: bool = False
@@ -1069,7 +1122,9 @@
         else:
            self.encoder_out_cache_loc = torch.cat(encoder_out_cache_loc)
 
-        assert
+        assert (
+            len(self.out_cache_loc) == self.extend_num_tokens
+        ), f"Expected {len(self.out_cache_loc)}, got {self.extend_num_tokens}"
 
     def prepare_for_extend(self):
         self.forward_mode = ForwardMode.EXTEND
@@ -1226,7 +1281,7 @@
         self.extend_input_logprob_token_ids = extend_input_logprob_token_ids
 
         # Write to req_to_token_pool
-        if global_server_args_dict
+        if support_triton(global_server_args_dict.get("attention_backend")):
            # TODO: some tensors can be reused for ForwardBatchInfo (e.g., extend_lens, cumsum_start)
 
            write_req_to_token_pool_triton[(bs,)](
@@ -1290,7 +1345,9 @@
         page_size = self.token_to_kv_pool_allocator.page_size
         if page_size == 1:
             return len(self.reqs)
-
+        # In the decoding phase, the length of a request's KV cache should be
+        # the total length of the request minus 1
+        return sum(1 for req in self.reqs if (req.seqlen - 1) % page_size == 0)
 
     def check_decode_mem(self, buf_multiplier=1):
         tokens_required = (
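The added return value estimates how many requests will need a fresh KV-cache page on the next decode step: during decoding a request's cache holds seqlen - 1 tokens, so a new page is required exactly when that length is a multiple of page_size. A worked example with invented numbers:

    # page_size = 4, three requests with seqlen 5, 6 and 9
    page_size = 4
    seqlens = [5, 6, 9]

    # (seqlen - 1) % page_size == 0 means the existing pages are exactly full,
    # so the token written by the next decode step starts a new page.
    new_pages = sum(1 for s in seqlens if (s - 1) % page_size == 0)
    assert new_pages == 2  # seqlen 5 (cache holds 4 tokens) and seqlen 9 (holds 8)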
@@ -1579,7 +1636,9 @@
         if self.spec_info:
             self.spec_info.merge_batch(other.spec_info)
 
-    def get_model_worker_batch(
+    def get_model_worker_batch(
+        self, seq_lens_cpu_cache: Optional[torch.Tensor] = None
+    ) -> ModelWorkerBatch:
         if self.forward_mode.is_decode_or_idle():
             extend_seq_lens = extend_prefix_lens = extend_logprob_start_lens = None
         else:
@@ -1589,15 +1648,20 @@
 
         # Create seq_lens_cpu when needed
         if (
-
+            global_server_args_dict["attention_backend"] == "fa3"
+            or (
                 global_server_args_dict["use_mla_backend"]
                 and global_server_args_dict["attention_backend"] == "flashinfer"
             )
             or global_server_args_dict["attention_backend"] == "flashmla"
-            or global_server_args_dict["attention_backend"] == "fa3"
             or global_server_args_dict["attention_backend"] == "cutlass_mla"
+            or global_server_args_dict["enable_two_batch_overlap"]
         ):
-            seq_lens_cpu =
+            seq_lens_cpu = (
+                seq_lens_cpu_cache
+                if seq_lens_cpu_cache is not None
+                else self.seq_lens.cpu()
+            )
         else:
             seq_lens_cpu = None
 
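get_model_worker_batch now accepts an optional seq_lens_cpu_cache, so a caller that already holds a host copy of seq_lens can pass it in instead of paying for another device-to-host transfer. A small sketch of the reuse pattern (function and variable names are illustrative):

    from typing import Optional

    import torch

    def pick_seq_lens_cpu(
        seq_lens: torch.Tensor, seq_lens_cpu_cache: Optional[torch.Tensor]
    ) -> torch.Tensor:
        # Same idea as the hunk above: reuse the cached host copy when present,
        # otherwise trigger exactly one .cpu() transfer.
        return seq_lens_cpu_cache if seq_lens_cpu_cache is not None else seq_lens.cpu()

    seq_lens = torch.tensor([12, 7, 33])         # normally a CUDA tensor
    first = pick_seq_lens_cpu(seq_lens, None)    # performs the copy
    second = pick_seq_lens_cpu(seq_lens, first)  # reuses it
    assert second is first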
@@ -1616,6 +1680,7 @@
             req_pool_indices=self.req_pool_indices,
             seq_lens=self.seq_lens,
             out_cache_loc=self.out_cache_loc,
+            seq_lens_cpu=seq_lens_cpu,
             seq_lens_sum=self.seq_lens_sum,
             return_logprob=self.return_logprob,
             top_logprobs_nums=self.top_logprobs_nums,
@@ -1623,7 +1688,8 @@
             global_num_tokens=self.global_num_tokens,
             global_num_tokens_for_logprob=self.global_num_tokens_for_logprob,
             can_run_dp_cuda_graph=self.can_run_dp_cuda_graph,
-
+            tbo_split_seq_index=self.tbo_split_seq_index,
+            global_forward_mode=self.global_forward_mode,
             extend_num_tokens=self.extend_num_tokens,
             extend_seq_lens=extend_seq_lens,
             extend_prefix_lens=extend_prefix_lens,
@@ -1685,11 +1751,11 @@ class ModelWorkerBatch:
     req_pool_indices: torch.Tensor
     # The sequence length
     seq_lens: torch.Tensor
-    seq_lens_cpu: Optional[torch.Tensor]
     # The indices of output tokens in the token_to_kv_pool_allocator
     out_cache_loc: torch.Tensor
 
-    # The
+    # The sequence length tensor on CPU
+    seq_lens_cpu: Optional[torch.Tensor]
     seq_lens_sum: int
 
     # For logprob
@@ -1701,6 +1767,8 @@ class ModelWorkerBatch:
     global_num_tokens: Optional[List[int]]
     global_num_tokens_for_logprob: Optional[List[int]]
     can_run_dp_cuda_graph: bool
+    tbo_split_seq_index: Optional[int]
+    global_forward_mode: Optional[ForwardMode]
 
     # For extend
     extend_num_tokens: Optional[int]
@@ -1774,10 +1842,72 @@ def write_req_to_token_pool_triton(
     )
 
 
-
-
+def get_last_loc(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
+    if global_server_args_dict["attention_backend"] != "torch_native":
+        impl = get_last_loc_triton
+    else:
+        impl = get_last_loc_torch
+
+    return impl(req_to_token, req_pool_indices_tensor, prefix_lens_tensor)
+
+
+def get_last_loc_torch(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
     return torch.where(
         prefix_lens_tensor > 0,
         req_to_token[req_pool_indices_tensor, prefix_lens_tensor - 1],
         torch.full_like(prefix_lens_tensor, -1),
     )
+
+
+@triton.jit
+def get_last_loc_kernel(
+    req_to_token,
+    req_pool_indices_tensor,
+    prefix_lens_tensor,
+    result,
+    num_tokens,
+    req_to_token_stride,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offset = tl.arange(0, BLOCK_SIZE) + pid * BLOCK_SIZE
+    mask = offset < num_tokens
+
+    prefix_lens = tl.load(prefix_lens_tensor + offset, mask=mask, other=0)
+    req_pool_indices = tl.load(req_pool_indices_tensor + offset, mask=mask, other=0)
+
+    token_mask = prefix_lens > 0
+    token_index = req_pool_indices * req_to_token_stride + (prefix_lens - 1)
+    tokens = tl.load(req_to_token + token_index, mask=token_mask, other=-1)
+
+    tl.store(result + offset, tokens, mask=mask)
+
+
+def get_last_loc_triton(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
+    BLOCK_SIZE = 256
+    num_tokens = prefix_lens_tensor.shape[0]
+    result = torch.empty_like(prefix_lens_tensor)
+    grid = (triton.cdiv(num_tokens, BLOCK_SIZE),)
+
+    get_last_loc_kernel[grid](
+        req_to_token,
+        req_pool_indices_tensor,
+        prefix_lens_tensor,
+        result,
+        num_tokens,
+        req_to_token.stride(0),
+        BLOCK_SIZE,
+    )
+    return result
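get_last_loc returns, for every request in the batch, the KV-pool slot of its last prefix token (or -1 when there is no prefix), using the Triton kernel unless the torch_native attention backend is selected. A CPU-only sketch of the semantics via the torch fallback expression; the tensor values are invented:

    import torch

    # req_to_token[pool_index, position] -> slot in the token-to-KV allocator
    req_to_token = torch.tensor([[10, 11, 12, 13],
                                 [20, 21, 22, 23]])
    req_pool_indices = torch.tensor([0, 1, 1])
    prefix_lens = torch.tensor([3, 0, 2])

    # Same expression as get_last_loc_torch in the hunk above.
    last_loc = torch.where(
        prefix_lens > 0,
        req_to_token[req_pool_indices, prefix_lens - 1],
        torch.full_like(prefix_lens, -1),
    )
    assert last_loc.tolist() == [12, -1, 21]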
sglang/srt/managers/schedule_policy.py

@@ -22,11 +22,7 @@ from typing import Dict, List, Optional, Set, Union
 
 import torch
 
-from sglang.srt.managers.schedule_batch import
-    Req,
-    ScheduleBatch,
-    global_server_args_dict,
-)
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.memory_pool import TokenToKVPoolAllocator
 from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode
@@ -468,6 +464,9 @@ class PrefillAdder:
            return AddReqResult.OTHER
 
        with self._lock_node(req.last_node):
+           if total_tokens > self.rem_total_tokens:
+               return AddReqResult.NO_TOKEN
+
            if (
                enable_hierarchical_cache
                and req.last_node_global is not None