sglang 0.4.4.post3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. sglang/bench_serving.py +49 -7
  2. sglang/lang/chat_template.py +24 -0
  3. sglang/srt/_custom_ops.py +59 -92
  4. sglang/srt/configs/model_config.py +5 -0
  5. sglang/srt/constrained/base_grammar_backend.py +5 -1
  6. sglang/srt/conversation.py +29 -4
  7. sglang/srt/custom_op.py +5 -0
  8. sglang/srt/distributed/device_communicators/custom_all_reduce.py +27 -79
  9. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  10. sglang/srt/entrypoints/engine.py +0 -5
  11. sglang/srt/layers/attention/flashattention_backend.py +678 -83
  12. sglang/srt/layers/attention/flashinfer_backend.py +5 -7
  13. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
  14. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  15. sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
  16. sglang/srt/layers/moe/ep_moe/layer.py +79 -80
  17. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
  18. sglang/srt/layers/moe/fused_moe_native.py +5 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +416 -50
  30. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  31. sglang/srt/layers/moe/topk.py +49 -3
  32. sglang/srt/layers/quantization/__init__.py +5 -1
  33. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  34. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
  35. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
  36. sglang/srt/layers/quantization/fp8.py +3 -1
  37. sglang/srt/layers/quantization/fp8_utils.py +1 -4
  38. sglang/srt/layers/quantization/moe_wna16.py +503 -0
  39. sglang/srt/layers/quantization/utils.py +1 -1
  40. sglang/srt/layers/quantization/w8a8_int8.py +2 -0
  41. sglang/srt/layers/radix_attention.py +2 -0
  42. sglang/srt/layers/rotary_embedding.py +63 -12
  43. sglang/srt/managers/cache_controller.py +34 -11
  44. sglang/srt/managers/mm_utils.py +202 -156
  45. sglang/srt/managers/multimodal_processor.py +0 -2
  46. sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
  47. sglang/srt/managers/multimodal_processors/clip.py +7 -26
  48. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
  49. sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
  50. sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
  51. sglang/srt/managers/multimodal_processors/llava.py +34 -14
  52. sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
  53. sglang/srt/managers/multimodal_processors/mlama.py +10 -23
  54. sglang/srt/managers/multimodal_processors/mllama4.py +161 -0
  55. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
  56. sglang/srt/managers/schedule_batch.py +185 -128
  57. sglang/srt/managers/scheduler.py +4 -4
  58. sglang/srt/managers/tokenizer_manager.py +1 -1
  59. sglang/srt/managers/utils.py +1 -6
  60. sglang/srt/mem_cache/hiradix_cache.py +62 -52
  61. sglang/srt/mem_cache/memory_pool.py +72 -6
  62. sglang/srt/mem_cache/paged_allocator.py +39 -0
  63. sglang/srt/metrics/collector.py +23 -53
  64. sglang/srt/model_executor/cuda_graph_runner.py +8 -6
  65. sglang/srt/model_executor/forward_batch_info.py +10 -10
  66. sglang/srt/model_executor/model_runner.py +60 -57
  67. sglang/srt/model_loader/loader.py +8 -0
  68. sglang/srt/models/clip.py +12 -7
  69. sglang/srt/models/deepseek_janus_pro.py +10 -15
  70. sglang/srt/models/deepseek_v2.py +212 -121
  71. sglang/srt/models/deepseek_vl2.py +105 -104
  72. sglang/srt/models/gemma3_mm.py +14 -80
  73. sglang/srt/models/llama.py +16 -5
  74. sglang/srt/models/llama4.py +420 -0
  75. sglang/srt/models/llava.py +31 -19
  76. sglang/srt/models/llavavid.py +16 -7
  77. sglang/srt/models/minicpmo.py +63 -147
  78. sglang/srt/models/minicpmv.py +17 -27
  79. sglang/srt/models/mllama.py +29 -14
  80. sglang/srt/models/mllama4.py +154 -0
  81. sglang/srt/models/qwen2.py +9 -6
  82. sglang/srt/models/qwen2_5_vl.py +21 -31
  83. sglang/srt/models/qwen2_vl.py +20 -21
  84. sglang/srt/openai_api/adapter.py +18 -6
  85. sglang/srt/platforms/interface.py +371 -0
  86. sglang/srt/server_args.py +99 -14
  87. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
  88. sglang/srt/speculative/eagle_utils.py +140 -28
  89. sglang/srt/speculative/eagle_worker.py +93 -24
  90. sglang/srt/utils.py +104 -51
  91. sglang/test/test_custom_ops.py +55 -0
  92. sglang/test/test_utils.py +13 -26
  93. sglang/utils.py +2 -2
  94. sglang/version.py +1 -1
  95. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/METADATA +4 -3
  96. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/RECORD +99 -84
  97. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/WHEEL +0 -0
  98. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/top_level.txt +0 -0
sglang/srt/managers/cache_controller.py

@@ -149,6 +149,7 @@ class HiCacheController:
         self,
         token_to_kv_pool_allocator: TokenToKVPoolAllocator,
         mem_pool_host: HostKVCache,
+        page_size: int,
         load_cache_event: threading.Event = None,
         write_policy: str = "write_through_selective",
     ):
@@ -156,6 +157,7 @@ class HiCacheController:
         self.mem_pool_device = token_to_kv_pool_allocator.get_kvcache()
         self.mem_pool_host = mem_pool_host
         self.write_policy = write_policy
+        self.page_size = page_size

         self.load_cache_event = load_cache_event
         self.layer_done_counter = LayerDoneCounter(self.mem_pool_device.layer_num)
@@ -184,7 +186,12 @@ class HiCacheController:
         self.load_stream = torch.cuda.Stream()

         self.write_thread = threading.Thread(
-            target=self.write_thread_func_buffer, daemon=True
+            target=(
+                self.write_thread_func_buffer
+                if self.page_size == 1
+                else self.write_thread_func_direct
+            ),
+            daemon=True,
         )
         self.load_thread = threading.Thread(
             target=self.load_thread_func_layer_by_layer, daemon=True
@@ -205,7 +212,12 @@ class HiCacheController:
         self.ack_load_queue.queue.clear()

         self.write_thread = threading.Thread(
-            target=self.write_thread_func_buffer, daemon=True
+            target=(
+                self.write_thread_func_buffer
+                if self.page_size == 1
+                else self.write_thread_func_direct
+            ),
+            daemon=True,
         )
         self.load_thread = threading.Thread(
             target=self.load_thread_func_layer_by_layer, daemon=True
@@ -260,10 +272,12 @@ class HiCacheController:
         while not self.stop_event.is_set():
             try:
                 operation = self.write_queue.get(block=True, timeout=1)
-                operation.data = self.mem_pool_device.get_flat_data(
-                    operation.device_indices
+                self.mem_pool_host.write_page_all_layers(
+                    operation.host_indices,
+                    operation.device_indices,
+                    self.mem_pool_device,
                 )
-                self.mem_pool_host.transfer(operation.host_indices, operation.data)
+                self.write_stream.synchronize()
                 self.mem_pool_host.complete_io(operation.host_indices)
                 for node_id in operation.node_ids:
                     if node_id != 0:
@@ -320,12 +334,21 @@ class HiCacheController:

         self.layer_done_counter.reset()
         for i in range(self.mem_pool_host.layer_num):
-            flat_data = self.mem_pool_host.get_flat_data_by_layer(
-                batch_operation.host_indices, i
-            )
-            self.mem_pool_device.transfer_per_layer(
-                batch_operation.device_indices, flat_data, i
-            )
+            if self.page_size == 1:
+                flat_data = self.mem_pool_host.get_flat_data_by_layer(
+                    batch_operation.host_indices, i
+                )
+                self.mem_pool_device.transfer_per_layer(
+                    batch_operation.device_indices, flat_data, i
+                )
+            else:
+                self.mem_pool_host.load_page_per_layer(
+                    batch_operation.host_indices,
+                    batch_operation.device_indices,
+                    self.mem_pool_device,
+                    i,
+                )
+                self.load_stream.synchronize()
             self.layer_done_counter.increment()

         self.mem_pool_host.complete_io(batch_operation.host_indices)
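
The hunks above route KV-cache host writes and loads through either the original buffered/per-layer path (page_size == 1) or a new direct per-page path. A minimal, self-contained sketch of the thread-target dispatch pattern, with illustrative names that are not part of sglang:

import threading

class Controller:
    def __init__(self, page_size: int):
        self.page_size = page_size
        # page_size == 1 keeps the buffered writer; larger pages take the direct path.
        self.write_thread = threading.Thread(
            target=(self.write_buffered if page_size == 1 else self.write_direct),
            daemon=True,
        )

    def write_buffered(self):
        pass

    def write_direct(self):
        pass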
sglang/srt/managers/mm_utils.py

@@ -1,5 +1,5 @@
 """
-Multimodality utils
+Multi-modality utils
 """

 from abc import abstractmethod
@@ -9,11 +9,13 @@ import torch
 from torch import nn

 from sglang.srt.managers.schedule_batch import (
+    MultimodalDataItem,
     MultimodalInputs,
     global_server_args_dict,
     logger,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.utils import print_warning_once
 from sglang.utils import logger


@@ -26,7 +28,7 @@ class MultiModalityDataPaddingPattern:

     @abstractmethod
     def pad_input_tokens(
-        self, input_ids: List[int], image_inputs: MultimodalInputs
+        self, input_ids: List[int], mm_inputs: MultimodalInputs
     ) -> List[int]:
         """
         Pad the input ids sequence containing data tokens, and replace them with pad_values
@@ -49,13 +51,13 @@ class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern)
         """
         This function will replace the data-tokens inbetween with pad_values accordingly
         """
-        pad_values = mm_inputs.pad_values
+        pad_values = [item.pad_value for item in mm_inputs.mm_items]
         data_token_pairs = self.data_token_id_pairs
-        mm_inputs.image_offsets = []
+        mm_inputs.data_offsets = []
         if data_token_pairs is None:
             data_token_pairs = [mm_inputs.im_start_id, mm_inputs.im_end_id]
         if data_token_pairs is None:
-            logger.warning(
+            print_warning_once(
                 "No data_token_pairs provided, RadixAttention might be influenced."
             )
             return input_ids
@@ -77,10 +79,10 @@ class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern)

             if input_ids[start_idx] in start_token_ids:
                 data_idx += 1
-                mm_inputs.image_offsets += [start_idx]
+                mm_inputs.data_offsets += [start_idx]

-            if data_idx >= len(mm_inputs.pad_values):
-                data_idx = len(mm_inputs.pad_values) - 1
+            if data_idx >= len(pad_values):
+                data_idx = len(pad_values) - 1

             num_tokens = end_idx - start_idx - 1
             pad_value = pad_values[data_idx]
@@ -94,68 +96,19 @@ class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern)
         return padded_ids


-class MultModalityDataPaddingPatternSingleToken(MultiModalityDataPaddingPattern):
-    """In this pattern, data is represented with a special token_id ( image_inputs.im_token_id ),
-    which needs first to be expanded to multiple tokens, then replaced with their padding values
-
-    This strategy should be used when a single data token represents content that should
-    be expanded to multiple tokens during processing.
-    """
-
-    def __init__(
-        self, num_data_token_calc_func: Callable[[Tuple[int, int, int]], int]
-    ) -> None:
-        self.num_data_token_calc_func = num_data_token_calc_func
-
-    def pad_input_tokens(
-        self, input_ids: List[int], mm_inputs: MultimodalInputs
-    ) -> List[int]:
-        """
-        This function will follow the procedure of:
-        1. the data token will be expanded, of which the final number will be calculated by `num_data_token_calc_func`
-        2. the padded data tokens will be replaced with their pad_values
-        """
-        image_grid_thws = mm_inputs.image_grid_thws
-        pad_values = mm_inputs.pad_values
-
-        image_indices = [
-            idx for idx, token in enumerate(input_ids) if token == mm_inputs.im_token_id
-        ]
-
-        mm_inputs.image_offsets = []
-
-        input_ids_with_image = []
-        for image_cnt, _ in enumerate(image_grid_thws):
-            # print(f"image_cnt {image_cnt}")
-            num_image_tokens = self.num_data_token_calc_func(image_grid_thws[image_cnt])
-            if image_cnt == 0:
-                non_image_tokens = input_ids[: image_indices[image_cnt]]
-            else:
-                non_image_tokens = input_ids[
-                    image_indices[image_cnt - 1] + 1 : image_indices[image_cnt]
-                ]
-            input_ids_with_image.extend(non_image_tokens)
-            mm_inputs.image_offsets.append(len(input_ids_with_image))
-            pad_ids = pad_values * (
-                (num_image_tokens + len(pad_values)) // len(pad_values)
-            )
-            input_ids_with_image.extend(pad_ids[:num_image_tokens])
-        input_ids_with_image.extend(input_ids[image_indices[-1] + 1 :])
-
-        return input_ids_with_image
-
-
 class MultiModalityDataPaddingPatternImageTokens(MultiModalityDataPaddingPattern):
-    """In this pattern, data tokens should be represented as image tokens (e.g. <image><image>....<image>)"""
+    """In this pattern, data tokens should be represented as repetitions of a single token
+    e.g. <image><image>....<image>, or <audio><audio>...<audio>
+    """

     def __init__(self, image_token_id: torch.Tensor) -> None:
         self.image_token_id = image_token_id

-    def pad_input_tokens(self, input_ids: List[int], image_inputs) -> List[int]:
+    def pad_input_tokens(self, input_ids: List[int], mm_inputs) -> List[int]:
         """
         This function will replace the data-tokens in between with pad_values accordingly
         """
-        pad_values = image_inputs.pad_values
+        pad_values = [item.pad_value for item in mm_inputs.mm_items]
         assert len(pad_values) != 0

         input_ids_tensor = torch.tensor(input_ids)
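
The padding patterns above now derive pad values from the per-item mm_items entries (item.pad_value) instead of a flat pad_values list. As a toy illustration of what the ImageTokens pattern does conceptually, repeated occurrences of a single placeholder token id are overwritten with a data item's pad value (the ids and values below are made up, not from this diff):

import torch

image_token_id = 32000
pad_value = 17
input_ids = torch.tensor([5, 32000, 32000, 32000, 9])
input_ids[input_ids == image_token_id] = pad_value
print(input_ids.tolist())  # [5, 17, 17, 17, 9]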
@@ -170,138 +123,227 @@ class MultiModalityDataPaddingPatternImageTokens(MultiModalityDataPaddingPattern
         return input_ids_tensor.tolist()


+def get_embedding_and_mask(
+    data_embedding_func: Callable[[List[MultimodalDataItem]], torch.Tensor],
+    embedding_items: List[MultimodalDataItem],
+    placeholder_tensor: torch.Tensor,
+    input_ids: torch.Tensor,
+):
+    """
+    Get the multimodal embedding and its mask from input_ids
+
+    """
+    # 1. Get the embedding
+    embedding = data_embedding_func(embedding_items)
+
+    # 2. Check the embedding
+    if embedding.dim() == 2:
+        num_mm_tokens_in_embedding = embedding.shape[0]
+    else:
+        num_mm_tokens_in_embedding = embedding.shape[0] * embedding.shape[1]
+
+    # the mask of multimodal tokens from input_ids
+    special_multimodal_mask = torch.isin(
+        input_ids,
+        placeholder_tensor,
+    ).unsqueeze(-1)
+
+    num_mm_tokens_in_input_ids = special_multimodal_mask.sum()
+    if num_mm_tokens_in_input_ids != num_mm_tokens_in_embedding:
+        logger.warning(
+            f"Number of tokens in multimodal embedding does not match those in the input text."
+            f"Got {num_mm_tokens_in_input_ids} tokens in the text but {num_mm_tokens_in_embedding} "
+            "tokens from multimodal embeddings."
+        )
+        if num_mm_tokens_in_input_ids < num_mm_tokens_in_embedding:
+            # TODO: chunked prefill will split special tokens from input_ids into several passes, failing the embedding
+            # a fix may be cache the unfinished multimodal embedding for future reuse, determine the tokens to embed with
+            # extend_start_loc and extend_seq_lens
+            chunked_prefill_size = global_server_args_dict["chunked_prefill_size"]
+            if chunked_prefill_size != -1:
+                logger.warning(
+                    "You may want to avoid this issue by raising `chunked_prefill_size`, or disabling chunked prefill"
+                )
+            # extract from the end: this is a compromise
+            if embedding.dim() == 2:
+                embedding = embedding[-num_mm_tokens_in_input_ids:, :]
+            else:
+                num_multimodal = num_mm_tokens_in_input_ids // embedding.shape[0]
+                embedding = embedding[-num_multimodal:, :]
+        else:
+            raise RuntimeError(
+                "Insufficient multimodal embedding length. This is an internal error"
+            )
+
+    return embedding, special_multimodal_mask
+
+
 def embed_mm_inputs(
-    mm_input: MultimodalInputs,
+    mm_inputs: MultimodalInputs,
     input_ids: torch.Tensor,
     input_embedding: nn.Embedding,
-    mm_data_embedding_func: Callable[[MultimodalInputs], torch.Tensor],
+    image_data_embedding_func: Callable[
+        [List[MultimodalDataItem]], torch.Tensor
+    ] = None,
+    audio_data_embedding_func: Callable[
+        [List[MultimodalDataItem]], torch.Tensor
+    ] = None,
     placeholder_token_ids: List[int] = None,
 ) -> Optional[torch.Tensor]:
     """
-    Calculate the image embeddings if necessary, then scatter the result with
-    the help of a boolean mask denoting the embed locations
+    Calculate the multimodal embeddings if necessary, then scatter the result with the help of a boolean mask denoting the embed locations

-    Returns:
-        final embedding: Optional[torch.Tensor]
+    Args:
+        placeholder_token_ids: denoting the token of multimodal data in input_ids.
+            If none, the pad_values of multimodal items are used
+
+    Returns:
+        final embedding: Optional[torch.Tensor]
     """
-    if mm_input is None:
+
+    if mm_inputs is None:
         return None

-    placeholder_token_ids = placeholder_token_ids or mm_input.pad_values
+    # 1. Calculate the multimodal data which exists in input_ids, with the help of pad_values
+    # we assume that multimodal data are represented with its pad_values in input_ids
+    placeholder_token_ids = placeholder_token_ids or [
+        item.pad_value for item in mm_inputs.mm_items
+    ]

-    # boolean masking the special tokens
-    special_image_mask = torch.isin(
-        input_ids,
-        torch.tensor(placeholder_token_ids, device=input_ids.device),
-    ).unsqueeze(-1)
+    placeholder_tensor = torch.tensor(placeholder_token_ids, device=input_ids.device)

-    num_image_tokens_in_input_ids = special_image_mask.sum()
-    # print(f"{num_image_tokens_in_input_ids}")
-    # print(f"{input_ids}")
+    placeholder_masks = torch.isin(input_ids, placeholder_tensor)

-    # return
-    if num_image_tokens_in_input_ids == 0:
-        # unexpected
+    appearing_pad_values = torch.unique(
+        input_ids[placeholder_masks], return_counts=False
+    )
+
+    if appearing_pad_values.numel() == 0:
+        # all been prefixed
         inputs_embeds = input_embedding(input_ids)
     else:
-        # print(f"Getting image feature")
-        image_embedding = mm_data_embedding_func(mm_input)
-
-        # print(f"image_embedding: {image_embedding.shape}")
+        appearing_items = [
+            item
+            for item in mm_inputs.mm_items
+            if item.pad_value is not None and item.pad_value in appearing_pad_values
+        ]

-        if image_embedding.dim() == 2:
-            num_image_tokens_in_embedding = image_embedding.shape[0]
-        else:
-            num_image_tokens_in_embedding = (
-                image_embedding.shape[0] * image_embedding.shape[1]
+        using_all_items = False
+        if len(appearing_items) == 0:
+            # This happens mostly when arg placeholder_token_ids is passed
+            logger.warning_once(
+                "No multimodal data item's pad value exist in placeholder ids. Using all items"
             )
-        if num_image_tokens_in_input_ids != num_image_tokens_in_embedding:
-            num_image = num_image_tokens_in_input_ids // image_embedding.shape[1]
-            image_embedding = image_embedding[:num_image, :]
-            logger.warning(
-                f"Number of images does not match number of special image tokens in the input text. "
-                f"Got {num_image_tokens_in_input_ids} image tokens in the text but {num_image_tokens_in_embedding} "
-                "tokens from image embeddings."
+            using_all_items = True
+            appearing_items = mm_inputs.mm_items
+
+        embeddings, masks = [], []
+
+        # 2. Get multimodal embedding separately
+        # TODO: make this more generic
+        # Try get image embedding if any
+        if (
+            any(True for item in appearing_items if item.is_image())
+            and image_data_embedding_func
+        ):
+            items = [item for item in appearing_items if item.is_image()]
+            embedding, mask = get_embedding_and_mask(
+                data_embedding_func=image_data_embedding_func,
+                embedding_items=items,
+                placeholder_tensor=(
+                    placeholder_tensor
+                    if using_all_items
+                    else torch.tensor(
+                        [item.pad_value for item in items],
+                        device=input_ids.device,
+                    )
+                ),
+                input_ids=input_ids,
             )
+            embeddings += [embedding]
+            masks += [mask]

-        # TODO: chunked prefill will split special tokens from input_ids into several passes, failing the embedding
-        # a fix may be cache the unfinished image embedding for future reuse, determine the tokens to embed with
-        # extend_start_loc and extend_seq_lens
-        if num_image_tokens_in_input_ids > num_image_tokens_in_embedding:
-            chunked_prefill_size = global_server_args_dict["chunked_prefill_size"]
-            if chunked_prefill_size != -1:
-                logger.warning(
-                    "You may want to avoid this issue by raising `chunked_prefill_size`, or disabling chunked_prefill"
+        # Try get audio embedding if any
+        if (
+            any(True for item in appearing_items if item.is_audio())
+            and audio_data_embedding_func
+        ):
+            items = [item for item in appearing_items if item.is_audio()]
+            embedding, mask = get_embedding_and_mask(
+                data_embedding_func=audio_data_embedding_func,
+                embedding_items=items,
+                placeholder_tensor=(
+                    placeholder_tensor
+                    if using_all_items
+                    else torch.tensor(
+                        [item.pad_value for item in items],
+                        device=input_ids.device,
                     )
+                ),
+                input_ids=input_ids,
+            )
+            embeddings += [embedding]
+            masks += [mask]

+        # 3. Get input embeddings
         vocab_size = input_embedding.num_embeddings
-        # Important: clamp after getting original image regions
-        # Clamp input ids. This is because the input_ids for the image tokens are
-        # filled with the hash values of the image for the prefix matching in the radix attention.
+        # Important: clamp after getting original multimodal regions
+        # Clamp input ids. This is because the input_ids for the multimodal tokens are
+        # filled with the hash values of the multimodal for the prefix matching in the radix attention.
         # There values are useless because their embeddings will be replaced by vision embeddings anyway.
         input_ids.clamp_(min=0, max=vocab_size - 1)
         inputs_embeds = input_embedding(input_ids)

-        special_image_mask = special_image_mask.expand_as(inputs_embeds).to(
-            inputs_embeds.device
-        )
-
-        inputs_embeds = inputs_embeds.masked_scatter(
-            special_image_mask,
-            image_embedding.to(inputs_embeds.device, inputs_embeds.dtype),
-        )
-    return inputs_embeds
-
-
-def embed_image_embedding(
-    inputs_embeds: torch.Tensor,
-    image_embedding: torch.Tensor,
-    image_bounds: torch.Tensor,
-) -> torch.Tensor:
-    """
-    scatter image_embedding into inputs_embeds according to image_bounds
-    """
-    if len(image_bounds) > 0:
-        image_indices = torch.stack(
-            [
-                torch.arange(start, end, dtype=torch.long)
-                for start, end in image_bounds.tolist()
-            ]
-        ).to(inputs_embeds.device)
-
-        inputs_embeds.scatter_(
-            0,
-            image_indices.view(-1, 1).repeat(1, inputs_embeds.shape[-1]),
-            image_embedding.view(-1, image_embedding.shape[-1]),
-        )
+        # 4. scatter embeddings into input embedding
+        for embedding, mask in zip(embeddings, masks):
+            mask = mask.expand_as(inputs_embeds).to(inputs_embeds.device)
+            inputs_embeds = inputs_embeds.masked_scatter(
+                mask,
+                embedding.to(inputs_embeds.device, inputs_embeds.dtype),
+            )
     return inputs_embeds


 def general_mm_embed_routine(
     input_ids: torch.Tensor,
     forward_batch: ForwardBatch,
-    embed_tokens: nn.Embedding,
-    mm_data_embedding_func: Callable[[MultimodalInputs], torch.Tensor],
+    language_model: nn.Module,
+    image_data_embedding_func: Callable[
+        [List[MultimodalDataItem]], torch.Tensor
+    ] = None,
+    audio_data_embedding_func: Callable[
+        [List[MultimodalDataItem]], torch.Tensor
+    ] = None,
     placeholder_token_ids: List[int] = None,
-):
+    **kwargs,
+) -> torch.Tensor:
     """
-    a general wrapper function to get final input embeds from multimodal models
-    with a language model as causal model
+    A general wrapper function to get final input embeds from multimodal models with a language model as causal model

     Args:
         placeholder_token_ids (List[int]): the ids of mm data placeholder tokens
+        image_data_embedding_func : the function returning the image embedding
+        audio_data_embedding_func : the function returning the image embedding
+
+    Returns:
+        inputs_embedding
+        forwarded hidden states

     """
+
+    assert hasattr(language_model, "get_input_embeddings")
+    embed_tokens = language_model.get_input_embeddings()
     if (
         not forward_batch.forward_mode.is_decode()
         and forward_batch.contains_mm_inputs()
     ):
-        image = forward_batch.merge_mm_inputs()
+        mm_input = forward_batch.merge_mm_inputs()
         inputs_embeds = embed_mm_inputs(
-            mm_input=image,
+            mm_inputs=mm_input,
             input_ids=input_ids,
             input_embedding=embed_tokens,
-            mm_data_embedding_func=mm_data_embedding_func,
+            image_data_embedding_func=image_data_embedding_func,
+            audio_data_embedding_func=audio_data_embedding_func,
             placeholder_token_ids=placeholder_token_ids,
         )
         # once used, mm_inputs is useless
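
The rewritten embed_mm_inputs builds a boolean placeholder mask with torch.isin and injects each modality's embedding with masked_scatter. A standalone sketch of that pattern with toy sizes and made-up values (not the sglang API):

import torch
from torch import nn

hidden = 8
pad_value = 7                                  # stands in for one mm item's pad_value
input_ids = torch.tensor([1, 2, 7, 7, 7, 3])   # three placeholder tokens
mm_embedding = torch.randn(3, hidden)          # one embedding row per placeholder token

embed_tokens = nn.Embedding(100, hidden)
inputs_embeds = embed_tokens(input_ids.clamp(min=0, max=99))

mask = torch.isin(input_ids, torch.tensor([pad_value])).unsqueeze(-1)
inputs_embeds = inputs_embeds.masked_scatter(
    mask.expand_as(inputs_embeds),
    mm_embedding.to(inputs_embeds.dtype),
)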
@@ -310,7 +352,13 @@ def general_mm_embed_routine(
     else:
         inputs_embeds = embed_tokens(input_ids)

-    return inputs_embeds
+    hidden_states = language_model(
+        input_ids=None,
+        forward_batch=forward_batch,
+        input_embeds=inputs_embeds,
+        **kwargs,
+    )
+    return hidden_states


 def get_multimodal_data_bounds(
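
With this change, general_mm_embed_routine takes the language model itself instead of an embedding module and returns the forwarded hidden states. A hypothetical caller could look like the sketch below; the class and attribute names are illustrative, not from this diff:

from torch import nn
from sglang.srt.managers.mm_utils import general_mm_embed_routine

class MyVLM(nn.Module):  # hypothetical multimodal wrapper model
    def forward(self, input_ids, positions, forward_batch):
        # language_model must expose get_input_embeddings(); extra kwargs such as
        # positions are forwarded to language_model(...) by the routine.
        return general_mm_embed_routine(
            input_ids=input_ids,
            forward_batch=forward_batch,
            language_model=self.language_model,
            image_data_embedding_func=self.get_image_feature,
            positions=positions,
        )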
@@ -322,15 +370,13 @@
     Returns:
         [bounds_count, 2]
     """
-    # All the images in the batch should share the same special image
-    # bound token ids.
+    # All the multimodal data in the batch should share the same special bound token ids.
     start_tokens = [s for s, _e in token_pairs]
     end_tokens = [e for _s, e in token_pairs]

     assert all(isinstance(t, int) for t in start_tokens)
     assert all(isinstance(t, int) for t in end_tokens)

-    # print(input_ids)
     start_cond = torch.isin(
         input_ids, torch.tensor(start_tokens, device=input_ids.device)
     )
@@ -339,7 +385,7 @@
     (data_start_tokens,) = torch.where(start_cond)
     (data_end_tokens,) = torch.where(end_cond)

-    # the im_start_id sometimes can be cached as prefix, but it is needed for the embedding of the images
+    # the im_start_id sometimes can be cached as prefix, but it is needed for the embedding of the multimodal data
     if len(data_start_tokens) != len(data_end_tokens):
         if (
             len(data_start_tokens) + 1 == len(data_end_tokens)
@@ -352,14 +398,14 @@
                 data_start_tokens,
             ]
         )
-    valid_image_nums = min(len(data_start_tokens), len(data_end_tokens))
+    valid_mm_data_nums = min(len(data_start_tokens), len(data_end_tokens))

-    if valid_image_nums == 0:
+    if valid_mm_data_nums == 0:
         return torch.zeros((0, 2), device=input_ids.device)

     # Filter out pairs where start_token >= end_token
     valid_pairs = []
-    for i in range(valid_image_nums):
+    for i in range(valid_mm_data_nums):
         start_token = data_start_tokens[i]
         end_token = data_end_tokens[i]
         if start_token < end_token:
sglang/srt/managers/multimodal_processor.py

@@ -64,5 +64,3 @@ def get_mm_processor(
         f"No processor registered for architecture: {hf_config.architectures}.\n"
         f"Registered architectures: {[model_cls.__name__ for model_cls in PROCESSOR_MAPPING.keys()]}"
     )
-
-    self.image_proce