sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/multimodal_processors/phi4mm.py

```diff
@@ -0,0 +1,87 @@
+import logging
+from typing import List, Union
+
+from sglang.srt.managers.multimodal_processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
+from sglang.srt.models.phi4mm import Phi4MMForCausalLM
+
+logger = logging.getLogger(__name__)
+
+_IMAGE_SPECIAL_TOKEN = "<|endoftext10|>"
+_IMAGE_SPECIAL_TOKEN_ID = 200010
+
+
+class Phi4MMImageProcessor(BaseMultimodalProcessor):
+    models = [Phi4MMForCausalLM]
+
+    def __init__(self, hf_config, server_args, _processor):
+        super().__init__(hf_config, server_args, _processor)
+        self.multimodal_tokens = MultimodalSpecialTokens(
+            image_token=_IMAGE_SPECIAL_TOKEN,
+        )
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        max_req_input_len,
+        **kwargs,
+    ):
+        audio_data = request_obj.audio_data
+
+        if not image_data and not audio_data:
+            return None
+
+        if not isinstance(image_data, list):
+            image_data = [image_data]
+
+        if not isinstance(audio_data, list):
+            audio_data = [audio_data]
+
+        if audio_data:
+            logger.warning(
+                "Currently SGLang does not support audio data for Phi4MM. We are working on it. You can file an issue to help us prioritize."
+            )
+            audio_data = []
+
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            max_req_input_len=max_req_input_len,
+            audio_data=audio_data,
+            image_data=image_data,
+            multimodal_tokens=self.multimodal_tokens,
+        )
+        if base_output is None:
+            return None
+
+        res = self.process_mm_data(
+            input_text=base_output.input_text,
+            images=base_output.images,
+            audios=base_output.audios,
+        )
+
+        input_ids = res["input_ids"].flatten()
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids,
+            mm_token_id=_IMAGE_SPECIAL_TOKEN_ID,
+        )
+
+        items = [
+            MultimodalDataItem(
+                pixel_values=res["input_image_embeds"],
+                image_sizes=res["image_sizes"],
+                image_emb_mask=res["image_attention_mask"],
+                image_offsets=image_offsets,
+                modality=Modality.IMAGE,
+            )
+        ]
+
+        return {
+            "mm_items": items,
+            "input_ids": input_ids.tolist(),
+            "im_token_id": _IMAGE_SPECIAL_TOKEN_ID,
+        }
```
sglang/srt/managers/multimodal_processors/qwen_vl.py

```diff
@@ -32,8 +32,8 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         )
         self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
         self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
-        self.
-        self.
+        self.IM_TOKEN_ID = hf_config.image_token_id
+        self.VIDEO_TOKEN_ID = hf_config.video_token_id
         self.vision_start_token_id = hf_config.vision_start_token_id
         self.vision_end_token_id = hf_config.vision_end_token_id
         self.NUM_TOKEN_PER_FRAME = 770
@@ -125,87 +125,45 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         async def resize_image_async(image):
             return resize_image(image)
 
-
-        if base_output.images and
+        # Qwen-specific: resize images if they are raw Image objects
+        if base_output.images and isinstance(base_output.images[0], Image.Image):
             resize_tasks = [resize_image_async(image) for image in base_output.images]
             base_output.images = await asyncio.gather(*resize_tasks)
 
-        ret = self.process_mm_data(
-            input_text=base_output.input_text,
-            images=None if images_are_preprocessed else base_output.images,
-        )
-        input_ids = ret["input_ids"].flatten().tolist()
-        image_offsets = self.get_mm_items_offset(
-            input_ids=ret["input_ids"].flatten(), mm_token_id=self.image_token_id
-        )
-        image_grid_thw = None
         video_grid_thw = None  # TODO
-
-
-
-
-
-
-
-
-
-            )
-            all_pixel_values = [
-                item.pixel_values
-                for item in base_output.images
-                if item.pixel_values is not None
-            ]
-            all_precomputed_features = [
-                item.precomputed_features
-                for item in base_output.images
-                if item.precomputed_features is not None
-            ]
-            pixel_values = (
-                torch.concat(all_pixel_values) if all_pixel_values else None
-            )
-            precomputed_features = (
-                torch.concat(all_precomputed_features)
-                if all_precomputed_features
-                else None
-            )
-        else:
-            image_grid_thw = ret["image_grid_thw"]
-            pixel_values = ret["pixel_values"]
-            precomputed_features = None
-        items += [
-            MultimodalDataItem(
-                pixel_values=pixel_values,
-                image_grid_thws=image_grid_thw,
-                video_grid_thws=video_grid_thw,
-                precomputed_features=precomputed_features,
-                image_offsets=image_offsets,
-                modality=Modality.IMAGE,
-            )
-        ]
+
+        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
+
+        if combined_mm_item is None:
+            # Note(Xinyuan): This is the case where image loading fails.
+            return None
+
+        video_grid_thw = None  # TODO
+        second_per_grid_ts = getattr(combined_mm_item, "second_per_grid_ts", None)
 
         mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
             spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
-            image_token_id=self.
-            video_token_id=self.
+            image_token_id=self.IM_TOKEN_ID,
+            video_token_id=self.VIDEO_TOKEN_ID,
             vision_start_token_id=self.vision_start_token_id,
             model_type=self.hf_config.model_type,
             tokens_per_second=getattr(
                 self.hf_config.vision_config, "tokens_per_second", None
             ),
-            input_ids=
-            image_grid_thw=image_grid_thw,
+            input_ids=input_ids.unsqueeze(0),
+            image_grid_thw=combined_mm_item.image_grid_thw,
             video_grid_thw=video_grid_thw,
-            second_per_grid_ts=
+            second_per_grid_ts=second_per_grid_ts,
         )
         mrope_positions = mrope_positions.squeeze(1)
 
         return {
-            "input_ids": input_ids,
-            "mm_items":
+            "input_ids": input_ids.tolist(),
+            "mm_items": [combined_mm_item],
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
-            "im_token_id": self.
-            "video_token_id": self.
+            "im_token_id": self.IM_TOKEN_ID,
+            "video_token_id": self.VIDEO_TOKEN_ID,
             "mrope_positions": mrope_positions,
             "mrope_position_delta": mrope_position_delta,
         }
```
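
The old Qwen-specific code removed above did the combining by hand: it collected `pixel_values` and `precomputed_features` across the per-image entries, concatenated whichever were present, and wrapped the result in a single `MultimodalDataItem`. In 0.4.7 that work moves into the shared `process_and_combine_mm_data` helper added to `base_processor.py` (+188 lines, not shown in this extract). A hedged sketch of just the concatenation step, using stand-in types rather than sglang's classes:

```python
from dataclasses import dataclass
from typing import Optional

import torch


@dataclass
class _Item:
    # Stand-in for the per-image entries the removed code iterated over.
    pixel_values: Optional[torch.Tensor] = None
    precomputed_features: Optional[torch.Tensor] = None


def combine_items(items):
    """Illustration of the combining step the removed Qwen-specific code did:
    concatenate whatever per-image tensors are present into one batch each.
    The real process_and_combine_mm_data() in base_processor.py is not shown
    in this extract and is assumed to generalize this across models."""
    pixels = [i.pixel_values for i in items if i.pixel_values is not None]
    feats = [i.precomputed_features for i in items if i.precomputed_features is not None]
    return (
        torch.concat(pixels) if pixels else None,
        torch.concat(feats) if feats else None,
    )


# Two raw images plus one entry carrying precomputed features.
items = [
    _Item(pixel_values=torch.rand(4, 3)),
    _Item(pixel_values=torch.rand(2, 3)),
    _Item(precomputed_features=torch.rand(1, 8)),
]
pixel_values, precomputed_features = combine_items(items)
print(pixel_values.shape, precomputed_features.shape)  # torch.Size([6, 3]) torch.Size([1, 8])
```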
sglang/srt/managers/schedule_batch.py

```diff
@@ -37,6 +37,7 @@ import hashlib
 import logging
 import threading
 from enum import Enum, auto
+from http import HTTPStatus
 from typing import TYPE_CHECKING, List, Optional, Set, Tuple, Union
 
 import numpy as np
@@ -51,6 +52,7 @@ from sglang.srt.disaggregation.base import BaseKVSender
 from sglang.srt.disaggregation.decode_schedule_batch_mixin import (
     ScheduleBatchDisaggregationDecodeMixin,
 )
+from sglang.srt.distributed.parallel_state import get_tensor_model_parallel_rank
 from sglang.srt.layers.multimodal import gpu_tensor_hash
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.chunk_cache import ChunkCache
@@ -60,7 +62,7 @@ from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, Forw
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import flatten_nested_list,
+from sglang.srt.utils import flatten_nested_list, support_triton
 
 if TYPE_CHECKING:
     from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
@@ -68,32 +70,38 @@ if TYPE_CHECKING:
 
 INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
 
+GLOBAL_SERVER_ARGS_KEYS = [
+    "attention_backend",
+    "debug_tensor_dump_inject",
+    "debug_tensor_dump_output_folder",
+    "chunked_prefill_size",
+    "deepep_mode",
+    "device",
+    "disable_chunked_prefix_cache",
+    "disable_radix_cache",
+    "enable_deepep_moe",
+    "enable_dp_attention",
+    "enable_two_batch_overlap",
+    "enable_dp_lm_head",
+    "enable_ep_moe",
+    "deepep_config",
+    "enable_nan_detection",
+    "flashinfer_mla_disable_ragged",
+    "max_micro_batch_size",
+    "moe_dense_tp_size",
+    "ep_dispatch_algorithm",
+    "disable_shared_experts_fusion",
+    "sampling_backend",
+    "speculative_accept_threshold_acc",
+    "speculative_accept_threshold_single",
+    "torchao_config",
+    "triton_attention_reduce_in_fp32",
+    "ep_num_redundant_experts",
+    "mm_attention_backend",
+]
+
 # Put some global args for easy access
-global_server_args_dict = {
-    "attention_backend": ServerArgs.attention_backend,
-    "chunked_prefill_size": ServerArgs.chunked_prefill_size,
-    "deepep_mode": ServerArgs.deepep_mode,
-    "device": ServerArgs.device,
-    "disable_chunked_prefix_cache": ServerArgs.disable_chunked_prefix_cache,
-    "disable_radix_cache": ServerArgs.disable_radix_cache,
-    "enable_deepep_moe": ServerArgs.enable_deepep_moe,
-    "enable_dp_attention": ServerArgs.enable_dp_attention,
-    "enable_dp_lm_head": ServerArgs.enable_dp_lm_head,
-    "enable_ep_moe": ServerArgs.enable_ep_moe,
-    "deepep_config": ServerArgs.deepep_config,
-    "enable_nan_detection": ServerArgs.enable_nan_detection,
-    "flashinfer_mla_disable_ragged": ServerArgs.flashinfer_mla_disable_ragged,
-    "max_micro_batch_size": ServerArgs.max_micro_batch_size,
-    "moe_dense_tp_size": ServerArgs.moe_dense_tp_size,
-    "ep_dispatch_algorithm": ServerArgs.ep_dispatch_algorithm,
-    "n_share_experts_fusion": ServerArgs.n_share_experts_fusion,
-    "sampling_backend": ServerArgs.sampling_backend,
-    "speculative_accept_threshold_acc": ServerArgs.speculative_accept_threshold_acc,
-    "speculative_accept_threshold_single": ServerArgs.speculative_accept_threshold_single,
-    "torchao_config": ServerArgs.torchao_config,
-    "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
-    "ep_num_redundant_experts": ServerArgs.ep_num_redundant_experts,
-}
+global_server_args_dict = {k: getattr(ServerArgs, k) for k in GLOBAL_SERVER_ARGS_KEYS}
 
 logger = logging.getLogger(__name__)
 
```
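
The hand-maintained dict of server-arg defaults becomes a list of key names plus one dict comprehension that reads the class-level defaults off `ServerArgs` with `getattr`, so exposing another globally visible flag is a one-line change. The same pattern in isolation, with a hypothetical dataclass standing in for sglang's `ServerArgs`:

```python
from dataclasses import dataclass


@dataclass
class DemoServerArgs:
    # Hypothetical stand-in for sglang's ServerArgs; only the pattern matters.
    attention_backend: str = "flashinfer"
    chunked_prefill_size: int = 8192
    enable_two_batch_overlap: bool = False


DEMO_KEYS = ["attention_backend", "chunked_prefill_size", "enable_two_batch_overlap"]

# Same trick as the diff: pull class-level defaults by name instead of
# repeating "key: DemoServerArgs.key" for every entry.
demo_args_dict = {k: getattr(DemoServerArgs, k) for k in DEMO_KEYS}
print(demo_args_dict)
# {'attention_backend': 'flashinfer', 'chunked_prefill_size': 8192, 'enable_two_batch_overlap': False}
```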
```diff
@@ -185,7 +193,7 @@ class MultimodalDataItem:
     # the real data, pixel_values or audio_features
     # data: Union[List[torch.Tensor], List[np.ndarray]]
     pixel_values: Union[torch.Tensor, np.ndarray] = None
-
+    image_grid_thw: Union[torch.Tensor, np.ndarray] = None
     video_grid_thws: Union[torch.Tensor, np.ndarray] = None
 
     image_emb_mask: Optional[torch.Tensor] = None
@@ -195,6 +203,9 @@ class MultimodalDataItem:
     # [num_images, (n, w, h)]
     tgt_size: Tuple[int, int] = None
 
+    # kimi-vl related
+    image_grid_hws: Optional[List[torch.Tensor]] = None
+
     audio_features: Union[torch.Tensor, np.ndarray] = None
     audio_feature_lens: Optional[List[torch.Tensor]] = None
     audio_offsets: Optional[List[Tuple[int, int]]] = None
@@ -440,6 +451,7 @@ class Req:
         bootstrap_host: Optional[str] = None,
         bootstrap_port: Optional[int] = None,
         bootstrap_room: Optional[int] = None,
+        data_parallel_rank: Optional[int] = None,
     ):
         # Input and output info
         self.rid = rid
@@ -594,6 +606,9 @@ class Req:
         self.bootstrap_room: Optional[int] = bootstrap_room
         self.disagg_kv_sender: Optional[BaseKVSender] = None
 
+        # For data parallel rank routing
+        self.data_parallel_rank: Optional[int] = data_parallel_rank
+
         # the start index of the sent kv cache
         # We want to send it chunk by chunk for chunked prefill.
         # After every chunk forward, we do the following:
@@ -770,6 +785,16 @@ class Req:
         logger.info(f"{prefix}: {self.time_stats}")
         self.has_log_time_stats = True
 
+    def set_finish_with_abort(self, error_msg: str):
+        if get_tensor_model_parallel_rank() == 0:
+            logger.error(f"{error_msg}, {self.rid=}")
+        self.multimodal_inputs = None
+        self.grammar = None
+        self.origin_input_ids = [0]  # set it to one token to skip the long prefill
+        self.finished_reason = FINISH_ABORT(
+            error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
+        )
+
     def __repr__(self):
         return (
             f"Req(rid={self.rid}, "
```
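
`set_finish_with_abort` gives `Req` a single place to fail a bad request: it logs once (only on tensor-parallel rank 0), drops heavyweight state (multimodal inputs, grammar), shrinks the prompt to one token so no long prefill is wasted on a request that is already doomed, and records a BAD_REQUEST finish reason. A toy illustration of the same idea with stand-in types (`FINISH_ABORT` and `Req` are sglang classes; nothing below is their real implementation):

```python
from dataclasses import dataclass
from http import HTTPStatus
from typing import List, Optional


@dataclass
class AbortReason:
    # Stand-in for sglang's FINISH_ABORT finish-reason object.
    message: str
    status_code: HTTPStatus
    err_type: str


@dataclass
class ToyReq:
    rid: str
    origin_input_ids: List[int]
    grammar: Optional[object] = None
    multimodal_inputs: Optional[object] = None
    finished_reason: Optional[AbortReason] = None

    def set_finish_with_abort(self, error_msg: str) -> None:
        # Free heavyweight state and keep one token so prefill is effectively a no-op.
        self.multimodal_inputs = None
        self.grammar = None
        self.origin_input_ids = [0]
        self.finished_reason = AbortReason(error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError")


req = ToyReq(rid="r1", origin_input_ids=list(range(4096)))
req.set_finish_with_abort("image decoding failed")
print(len(req.origin_input_ids), req.finished_reason.status_code)  # 1 HTTPStatus.BAD_REQUEST
```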
sglang/srt/managers/schedule_batch.py (continued)

```diff
@@ -831,6 +856,8 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
     global_num_tokens: Optional[List[int]] = None
     global_num_tokens_for_logprob: Optional[List[int]] = None
     can_run_dp_cuda_graph: bool = False
+    tbo_split_seq_index: Optional[int] = None
+    global_forward_mode: Optional[ForwardMode] = None
 
     # For processing logprobs
     return_logprob: bool = False
@@ -1254,7 +1281,7 @@
         self.extend_input_logprob_token_ids = extend_input_logprob_token_ids
 
         # Write to req_to_token_pool
-        if global_server_args_dict
+        if support_triton(global_server_args_dict.get("attention_backend")):
             # TODO: some tensors can be reused for ForwardBatchInfo (e.g., extend_lens, cumsum_start)
 
             write_req_to_token_pool_triton[(bs,)](
@@ -1318,7 +1345,9 @@
         page_size = self.token_to_kv_pool_allocator.page_size
         if page_size == 1:
             return len(self.reqs)
-
+        # In the decoding phase, the length of a request's KV cache should be
+        # the total length of the request minus 1
+        return sum(1 for req in self.reqs if (req.seqlen - 1) % page_size == 0)
 
     def check_decode_mem(self, buf_multiplier=1):
         tokens_required = (
```
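
The new return statement counts the requests that will need a fresh KV-cache page on the next decode step: during decode the cache already holds `seqlen - 1` tokens, so a request has just filled its current page exactly when that count is a multiple of `page_size`. A quick worked check of the arithmetic (`page_size = 4` is only an example value, not an sglang default):

```python
# Hedged illustration of the page-count arithmetic above, not sglang code.
page_size = 4


def needs_new_page(seqlen: int) -> bool:
    # The KV cache holds seqlen - 1 tokens during decode; a new page is needed
    # exactly when that count is a multiple of the page size.
    return (seqlen - 1) % page_size == 0


for seqlen in range(1, 11):
    print(seqlen, needs_new_page(seqlen))
# seqlen=1 -> True (cache empty, first page), seqlen=5 -> True (4 tokens fill page 1),
# seqlen=9 -> True (8 tokens fill page 2); everything else -> False.
```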
sglang/srt/managers/schedule_batch.py (continued)

```diff
@@ -1607,7 +1636,9 @@
         if self.spec_info:
             self.spec_info.merge_batch(other.spec_info)
 
-    def get_model_worker_batch(
+    def get_model_worker_batch(
+        self, seq_lens_cpu_cache: Optional[torch.Tensor] = None
+    ) -> ModelWorkerBatch:
         if self.forward_mode.is_decode_or_idle():
             extend_seq_lens = extend_prefix_lens = extend_logprob_start_lens = None
         else:
@@ -1617,15 +1648,20 @@
 
         # Create seq_lens_cpu when needed
         if (
-
+            global_server_args_dict["attention_backend"] == "fa3"
+            or (
                 global_server_args_dict["use_mla_backend"]
                 and global_server_args_dict["attention_backend"] == "flashinfer"
             )
             or global_server_args_dict["attention_backend"] == "flashmla"
-            or global_server_args_dict["attention_backend"] == "fa3"
             or global_server_args_dict["attention_backend"] == "cutlass_mla"
+            or global_server_args_dict["enable_two_batch_overlap"]
         ):
-            seq_lens_cpu =
+            seq_lens_cpu = (
+                seq_lens_cpu_cache
+                if seq_lens_cpu_cache is not None
+                else self.seq_lens.cpu()
+            )
         else:
             seq_lens_cpu = None
 
@@ -1644,6 +1680,7 @@
             req_pool_indices=self.req_pool_indices,
             seq_lens=self.seq_lens,
             out_cache_loc=self.out_cache_loc,
+            seq_lens_cpu=seq_lens_cpu,
             seq_lens_sum=self.seq_lens_sum,
             return_logprob=self.return_logprob,
             top_logprobs_nums=self.top_logprobs_nums,
@@ -1651,7 +1688,8 @@
             global_num_tokens=self.global_num_tokens,
             global_num_tokens_for_logprob=self.global_num_tokens_for_logprob,
             can_run_dp_cuda_graph=self.can_run_dp_cuda_graph,
-
+            tbo_split_seq_index=self.tbo_split_seq_index,
+            global_forward_mode=self.global_forward_mode,
             extend_num_tokens=self.extend_num_tokens,
             extend_seq_lens=extend_seq_lens,
             extend_prefix_lens=extend_prefix_lens,
@@ -1713,11 +1751,11 @@ class ModelWorkerBatch:
     req_pool_indices: torch.Tensor
     # The sequence length
     seq_lens: torch.Tensor
-    seq_lens_cpu: Optional[torch.Tensor]
     # The indices of output tokens in the token_to_kv_pool_allocator
     out_cache_loc: torch.Tensor
 
-    # The
+    # The sequence length tensor on CPU
+    seq_lens_cpu: Optional[torch.Tensor]
     seq_lens_sum: int
 
     # For logprob
@@ -1729,6 +1767,8 @@ class ModelWorkerBatch:
     global_num_tokens: Optional[List[int]]
     global_num_tokens_for_logprob: Optional[List[int]]
     can_run_dp_cuda_graph: bool
+    tbo_split_seq_index: Optional[int]
+    global_forward_mode: Optional[ForwardMode]
 
     # For extend
     extend_num_tokens: Optional[int]
```
```diff
@@ -1802,10 +1842,72 @@ def write_req_to_token_pool_triton(
     )
 
 
-
-
+def get_last_loc(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
+    if global_server_args_dict["attention_backend"] != "torch_native":
+        impl = get_last_loc_triton
+    else:
+        impl = get_last_loc_torch
+
+    return impl(req_to_token, req_pool_indices_tensor, prefix_lens_tensor)
+
+
+def get_last_loc_torch(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
     return torch.where(
         prefix_lens_tensor > 0,
         req_to_token[req_pool_indices_tensor, prefix_lens_tensor - 1],
         torch.full_like(prefix_lens_tensor, -1),
     )
+
+
+@triton.jit
+def get_last_loc_kernel(
+    req_to_token,
+    req_pool_indices_tensor,
+    prefix_lens_tensor,
+    result,
+    num_tokens,
+    req_to_token_stride,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    offset = tl.arange(0, BLOCK_SIZE) + pid * BLOCK_SIZE
+    mask = offset < num_tokens
+
+    prefix_lens = tl.load(prefix_lens_tensor + offset, mask=mask, other=0)
+    req_pool_indices = tl.load(req_pool_indices_tensor + offset, mask=mask, other=0)
+
+    token_mask = prefix_lens > 0
+    token_index = req_pool_indices * req_to_token_stride + (prefix_lens - 1)
+    tokens = tl.load(req_to_token + token_index, mask=token_mask, other=-1)
+
+    tl.store(result + offset, tokens, mask=mask)
+
+
+def get_last_loc_triton(
+    req_to_token: torch.Tensor,
+    req_pool_indices_tensor: torch.Tensor,
+    prefix_lens_tensor: torch.Tensor,
+) -> torch.Tensor:
+    BLOCK_SIZE = 256
+    num_tokens = prefix_lens_tensor.shape[0]
+    result = torch.empty_like(prefix_lens_tensor)
+    grid = (triton.cdiv(num_tokens, BLOCK_SIZE),)
+
+    get_last_loc_kernel[grid](
+        req_to_token,
+        req_pool_indices_tensor,
+        prefix_lens_tensor,
+        result,
+        num_tokens,
+        req_to_token.stride(0),
+        BLOCK_SIZE,
+    )
+    return result
```