PyPI - sglang - Versions diffs - 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl - Mend

sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (358) hide show

sglang/srt/managers/mm_utils.py CHANGED Viewed

@@ -2,6 +2,7 @@
 Multi-modality utils
 """
+import dataclasses
 import logging
 from abc import abstractmethod
 from typing import Callable, List, Optional, Tuple
@@ -15,10 +16,15 @@ from sglang.srt.managers.schedule_batch import (
     MultimodalInputs,
     global_server_args_dict,
 )
+from sglang.srt.mem_cache.multimodal_cache import MultiModalCache
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.utils import flatten_nested_list, print_warning_once
+from sglang.utils import logger
-logger = logging.getLogger(__name__)
+# NOTE: Using the shared logger from sglang.utils instead of creating a module-specific logger
+# to ensure consistent logging behavior across the codebase. This prevents issues with log
+# propagation that can cause some log messages (like 'server is fired up') to not appear
+# in the console when multimodal support is enabled.
 class MultiModalityDataPaddingPattern:
@@ -41,11 +47,26 @@ class MultiModalityDataPaddingPattern:
 class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern):
     """In this pattern, data tokens should be enclosed by special token pairs (e.g. <image>...</image>, data_token_pairs)
+    The padded value in a region enclosed by a token pair with be the same one, as the MultimodalDataItem's pad value
     This strategy should be applied when data content is marked by start/end token pairs in the input sequence.
     """
-    def __init__(self, data_token_pairs: Optional[List[Tuple[int, int]]]) -> None:
+    def __init__(
+        self,
+        data_token_pairs: Optional[List[Tuple[int, int]]],
+        data_start_token_ids: Optional[List[int]] = None,
+    ) -> None:
+        """
+        Args:
+            data_start_token_ids marks the start of a single multimodal data
+            See Minicpmo's slice_start_id for example
+        """
         self.data_token_id_pairs = data_token_pairs
+        self.data_start_token_ids = data_start_token_ids or [
+            s for s, _e in data_token_pairs
+        ]
     def pad_input_tokens(
         self, input_ids: List[int], mm_inputs: MultimodalInputs
@@ -79,7 +100,7 @@ class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern)
         for start_idx, end_idx in zip(start_indices, end_indices):
             padded_ids.extend(input_ids[last_idx : start_idx + 1])
-            if input_ids[start_idx] in start_token_ids:
+            if input_ids[start_idx] in self.data_start_token_ids:
                 data_idx += 1
                 mm_inputs.data_offsets += [start_idx]
@@ -170,46 +191,156 @@ class MultiModalityDataPaddingPatternMultimodalTokens(MultiModalityDataPaddingPa
                 output_ids_tensor[start_idx:end_idx] = pad_value
             else:
                 logger.warning(f"Skipping region {i} due to None pad_value.")
         return output_ids_tensor.tolist()
-def get_embedding_and_mask(
-    data_embedding_func: Callable[[List[MultimodalDataItem]], torch.Tensor],
-    embedding_items: List[MultimodalDataItem],
-    placeholder_tensor: torch.Tensor,
-    input_ids: torch.Tensor,
-):
+embedding_cache = None
+def init_embedding_cache(max_size: int):
+    global embedding_cache
+    embedding_cache = MultiModalCache(max_size)
+def get_embedding_hash(embedding_items: List[MultimodalDataItem]) -> int:
+    hash_list = [item.hash for item in embedding_items]
+    return hash(tuple(hash_list))
+def get_embedding_chunk(
+    embedding: torch.Tensor,
+    extend_prefix_len: int,
+    extend_seq_len: int,
+    items_offset: List[Tuple[int, int]],
+) -> Tuple[torch.Tensor, int, int]:
     """
-    Get the multimodal embedding and its mask from input_ids
+    Extract a chunk of embeddings based on the specified prefix length, sequence length, and offset ranges.
+    Args:
+        embedding: The full embedding tensor to extract a chunk from
+        extend_prefix_len: The starting position (prefix length) for extraction
+        extend_seq_len: The number of tokens to extract
+        items_offset: List of [start, end] offset ranges for multimodal items in the input sequence
+    Returns:
+        A tuple containing:
+        - The extracted embedding chunk as a tensor
+        - The start index used for extraction
+        - The end index used for extraction
+    Note:
+        If there's no overlap between the requested range and the offset ranges,
+        an empty tensor is returned with zeros for start and end indices.
+    """
+    start_index, end_index = 0, 0
+    extend_start_index = extend_prefix_len
+    extend_end_index = extend_prefix_len + extend_seq_len - 1
+    for start, end in items_offset:
+        if extend_start_index >= start and extend_start_index <= end:
+            start_index += extend_start_index - start
+        elif extend_start_index > end:
+            start_index += end - start + 1
+        if extend_end_index >= start and extend_end_index <= end:
+            end_index += extend_end_index - start + 1
+        elif extend_end_index > end:
+            end_index += end - start + 1
+    # some models embedding is 3-dim, reshape it to 2-dim
+    embedding = embedding.reshape(-1, embedding.shape[-1])
+    embedding_chunk = embedding[start_index:end_index]
+    return embedding_chunk, start_index, end_index
+def _get_precomputed_embedding(
+    items: List[MultimodalDataItem],
+) -> Optional[torch.Tensor]:
+    """
+    If all items have precomputed_features, return their concatenation.
+    If some but not all have precomputed_features, raise NotImplementedError.
+    If none have precomputed_features, return None.
     """
-    # 1. Get the embedding
-    embedding = data_embedding_func(embedding_items)
+    precomputed_features = [item.precomputed_features for item in items]
+    if any(feature is not None for feature in precomputed_features):
+        if not all(feature is not None for feature in precomputed_features):
+            raise NotImplementedError(
+                "MM inputs where only some items are precomputed."
+            )
+        result = torch.concat(precomputed_features)
+        # some models embedding is 3-dim, reshape it to 2-dim (similar to get_embedding_chunk)
+        result = result.reshape(-1, result.shape[-1])
+        return result
+    return None
-    # 2. Check the embedding
-    if embedding.dim() == 2:
-        num_mm_tokens_in_embedding = embedding.shape[0]
-    else:
-        num_mm_tokens_in_embedding = embedding.shape[0] * embedding.shape[1]
-    # the mask of multimodal tokens from input_ids
-    special_multimodal_mask = torch.isin(
-        input_ids,
-        placeholder_tensor,
-    ).unsqueeze(-1)
+def _get_chunked_prefill_embedding(
+    data_embedding_func: Callable[[List[MultimodalDataItem]], torch.Tensor],
+    embedding_items: List[MultimodalDataItem],
+    items_size: List[int],
+    prefix_length: List[int],
+    extend_length: List[int],
+    items_offset_list: List[List[Tuple[int, int]]],
+) -> Optional[torch.Tensor]:
+    # Calculate embedding for each request, try to get it from cache to avoid repeated calculation
+    embedding_list = []
+    for i in range(len(items_size) - 1):
+        if items_size[i] == items_size[i + 1]:
+            continue
+        embedding_items_per_req = embedding_items[items_size[i] : items_size[i + 1]]
+        items_offset = items_offset_list[i]
+        embedding_items_hash = get_embedding_hash(embedding_items_per_req)
+        # if all items has been prefixed, we do not need to calculate embedding
+        if all([offset_end < prefix_length[i] for _, offset_end in items_offset]):
+            continue
+        embedding_per_req = embedding_cache.get(embedding_items_hash)
+        if embedding_per_req is None:
+            embedding_per_req = data_embedding_func(embedding_items_per_req)
+            if not embedding_cache.put(embedding_items_hash, embedding_per_req):
+                print_warning_once(
+                    "Multimodal embedding cache is full. Consider increasing the "
+                    "`SGLANG_VLM_CACHE_SIZE_MB` environment variable."
+                )
+        embedding_per_req_chunk, _, end_index = get_embedding_chunk(
+            embedding=embedding_per_req,
+            extend_prefix_len=prefix_length[i],
+            extend_seq_len=extend_length[i],
+            items_offset=items_offset,
+        )
+        # remove this item from cache if chunk reaches to the end
+        embedding_per_req_length = (
+            embedding_per_req.shape[0]
+            if embedding_per_req.dim() == 2
+            else embedding_per_req.shape[0] * embedding_per_req.shape[1]
+        )
+        if end_index == embedding_per_req_length:
+            embedding_cache.free(embedding_items_hash)
+        embedding_list.append(embedding_per_req_chunk)
+    if len(embedding_list) == 0:
+        return None
+    return torch.concat(embedding_list, dim=0)
+def _get_multimodal_mask(
+    input_ids: torch.Tensor, placeholder_tensor: torch.Tensor
+) -> torch.Tensor:
+    return torch.isin(input_ids, placeholder_tensor).unsqueeze(-1)
-    num_mm_tokens_in_input_ids = special_multimodal_mask.sum().item()
+def _adjust_embedding_length(
+    embedding: torch.Tensor,
+    mask: torch.Tensor,
+    logger,
+) -> torch.Tensor:
+    num_mm_tokens_in_embedding = embedding.shape[0]
+    num_mm_tokens_in_input_ids = mask.sum().item()
     if num_mm_tokens_in_input_ids != num_mm_tokens_in_embedding:
         logger.warning(
-            f"Number of tokens in multimodal embedding does not match those in the input text."
+            f"Number of tokens in multimodal embedding does not match those in the input text. "
             f"Got {num_mm_tokens_in_input_ids} tokens in the text but {num_mm_tokens_in_embedding} "
-            "tokens from multimodal embeddings."
+            f"tokens from multimodal embeddings."
         )
         if num_mm_tokens_in_input_ids < num_mm_tokens_in_embedding:
-            # TODO: chunked prefill will split special tokens from input_ids into several passes, failing the embedding
-            # a fix may be cache the unfinished multimodal embedding for future reuse, determine the tokens to embed with
-            # extend_start_loc and extend_seq_lens
             chunked_prefill_size = global_server_args_dict["chunked_prefill_size"]
             if chunked_prefill_size != -1:
                 logger.warning(
@@ -225,12 +356,61 @@ def get_embedding_and_mask(
             raise RuntimeError(
                 f"Insufficient multimodal embedding length: {num_mm_tokens_in_input_ids=} vs {num_mm_tokens_in_embedding=}. This is an internal error"
             )
+    return embedding
+def get_embedding_and_mask(
+    data_embedding_func: Callable[[List[MultimodalDataItem]], torch.Tensor],
+    embedding_items: List[MultimodalDataItem],
+    placeholder_tensor: torch.Tensor,
+    input_ids: torch.Tensor,
+    items_size: List[int],
+    prefix_length: List[int],
+    extend_length: List[int],
+    items_offset_list: List[List[Tuple[int, int]]],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Generate multimodal embeddings and create a mask for identifying their positions in the input sequence.
+    Args:
+        data_embedding_func: Function that generates embeddings for multimodal items
+        embedding_items: List of multimodal items to embed
+        placeholder_tensor: Tensor containing token IDs that serve as placeholders for multimodal content
+        input_ids: The input token IDs tensor
+        items_size: Cumulative sizes of multimodal items per request
+        prefix_length: Prefix lengths for each request
+        extend_length: Sequence lengths for each request
+        items_offset_list: List of offset ranges for multimodal items in each request
+    Returns:
+        A tuple containing:
+        - The generated embeddings tensor
+        - A boolean mask tensor indicating where these embeddings should be placed
+    """
+    # 1. Get embedding
+    embedding = _get_precomputed_embedding(embedding_items)
+    if embedding is None:
+        embedding = _get_chunked_prefill_embedding(
+            data_embedding_func,
+            embedding_items,
+            items_size,
+            prefix_length,
+            extend_length,
+            items_offset_list,
+        )
+        if embedding is None:
+            return None, None
+    # 2. Get mask
+    special_multimodal_mask = _get_multimodal_mask(input_ids, placeholder_tensor)
+    # 3. Adjust embedding length if needed
+    embedding = _adjust_embedding_length(embedding, special_multimodal_mask, logger)
     return embedding, special_multimodal_mask
 def embed_mm_inputs(
-    mm_inputs: MultimodalInputs,
+    mm_inputs_list: List[MultimodalInputs],
+    extend_prefix_lens: List[int],
+    extend_seq_lens: List[int],
     input_ids: torch.Tensor,
     input_embedding: nn.Embedding,
     image_data_embedding_func: Callable[
@@ -242,125 +422,133 @@ def embed_mm_inputs(
     placeholder_tokens: dict[Modality, List[int]] = None,
 ) -> Optional[torch.Tensor]:
     """
-    Calculate the multimodal embeddings if necessary, then scatter the result with the help of a boolean mask denoting the embed locations
-        Args:
-            placeholder_tokens: denoting the token of multimodal data in input_ids.
-                If none, the pad_values of multimodal items are used
+    Embed multimodal inputs and integrate them with text token embeddings.
+    Args:
+        mm_inputs_list: List of multimodal inputs to process
+        extend_prefix_lens: Prefix lengths for each request
+        extend_seq_lens: Sequence lengths for each request
+        input_ids: Input token IDs tensor
+        input_embedding: Embedding layer for text tokens
+        image_data_embedding_func: Function to embed image data
+        audio_data_embedding_func: Function to embed audio data
+        placeholder_tokens: Token IDs for multimodal placeholders (uses pad_values if None)
-        Returns:
-            final embedding: Optional[torch.Tensor]
+    Returns:
+        Combined embedding tensor with multimodal content integrated
     """
-    if mm_inputs is None:
+    if mm_inputs_list is None:
         return None
     # 1. Calculate the multimodal data which exists in input_ids, with the help of pad_values
     # we assume that multimodal data are represented with its pad_values in input_ids
-    # See `pad_input_ids` for more detail
-    # if placeholder_tokens is specified
-    if placeholder_tokens is not None:
-        placeholder_token_ids = flatten_nested_list(
-            [placeholder_token for placeholder_token in placeholder_tokens.values()]
-        )
-    else:
-        placeholder_token_ids = [item.pad_value for item in mm_inputs.mm_items]
-    assert isinstance(placeholder_token_ids[0], int)
-    placeholder_tensor = torch.tensor(placeholder_token_ids, device=input_ids.device)
-    placeholder_masks = torch.isin(input_ids, placeholder_tensor)
-    appearing_pad_values = torch.unique(
-        input_ids[placeholder_masks], return_counts=False
-    )
+    item_flatten_list = []
+    for mm_inputs in mm_inputs_list:
+        item_flatten_list += [item for item in mm_inputs.mm_items if item is not None]
-    if appearing_pad_values.numel() == 0:
-        # all been prefixed
-        inputs_embeds = input_embedding(input_ids)
-    else:
-        appearing_items = [
-            item
-            for item in mm_inputs.mm_items
-            if item.pad_value is not None and item.pad_value in appearing_pad_values
-        ]
+    embeddings, masks = [], []
-        using_all_items = False
-        if len(appearing_items) == 0:
-            # This happens mostly when arg placeholder_token_ids is passed
-            logger.warning(
-                "No multimodal data item's pad value exist in placeholder ids. Using all items"
+    # 2. Get multimodal embedding separately
+    # TODO: make this more generic
+    # Try get image embedding if any
+    if (
+        any(True for item in item_flatten_list if item.is_image())
+        and image_data_embedding_func
+    ):
+        items = [item for item in item_flatten_list if item.is_image()]
+        placeholder_tensor = torch.tensor(
+            [item.pad_value for item in items],
+            device=input_ids.device,
+        )
+        # calculate per request items length offset
+        items_size = torch.zeros(len(mm_inputs_list) + 1, dtype=int)
+        items_offsets = []
+        for i, mm_inputs in enumerate(mm_inputs_list):
+            image_items = [item for item in mm_inputs.mm_items if item.is_image()]
+            items_size[i + 1] = len(image_items)
+            items_offsets.append(
+                flatten_nested_list(
+                    [
+                        item.image_offsets
+                        for item in mm_inputs.mm_items
+                        if item.is_image()
+                    ]
+                )
             )
-            using_all_items = True
-            appearing_items = mm_inputs.mm_items
+        items_size = torch.cumsum(items_size, dim=0).tolist()
-        embeddings, masks = [], []
+        embedding, mask = get_embedding_and_mask(
+            data_embedding_func=image_data_embedding_func,
+            embedding_items=items,
+            placeholder_tensor=placeholder_tensor,
+            input_ids=input_ids,
+            items_size=items_size,
+            prefix_length=extend_prefix_lens,
+            extend_length=extend_seq_lens,
+            items_offset_list=items_offsets,
+        )
+        embeddings += [embedding]
+        masks += [mask]
-        # 2. Get multimodal embedding separately
-        # TODO: make this more generic
-        # Try get image embedding if any
-        if (
-            any(True for item in appearing_items if item.is_image())
-            and image_data_embedding_func
-        ):
-            items = [item for item in appearing_items if item.is_image()]
-            embedding, mask = get_embedding_and_mask(
-                data_embedding_func=image_data_embedding_func,
-                embedding_items=items,
-                placeholder_tensor=(
-                    # use the specified modality token to identify the location to embed
-                    placeholder_tokens[Modality.IMAGE]
-                    if using_all_items
-                    else torch.tensor(
-                        [item.pad_value for item in items],
-                        device=input_ids.device,
-                    )
-                ),
-                input_ids=input_ids,
+    # Try get audio embedding if any
+    if (
+        any(True for item in item_flatten_list if item.is_audio())
+        and audio_data_embedding_func
+    ):
+        items = [item for item in item_flatten_list if item.is_audio()]
+        placeholder_tensor = torch.tensor(
+            [item.pad_value for item in items],
+            device=input_ids.device,
+        )
+        items_offsets = []
+        # calculate per request items length offset
+        items_size = torch.zeros(len(mm_inputs_list) + 1, dtype=int)
+        for i, mm_inputs in enumerate(mm_inputs_list):
+            audio_items = [item for item in mm_inputs.mm_items if item.is_audio()]
+            items_size[i + 1] = len(audio_items)
+            items_offsets.append(
+                flatten_nested_list(
+                    [
+                        item.audio_offsets
+                        for item in mm_inputs.mm_items
+                        if item.is_audio()
+                    ]
+                )
             )
-            embeddings += [embedding]
-            masks += [mask]
+        items_size = torch.cumsum(items_size, dim=0)
-        # Try get audio embedding if any
-        if (
-            any(True for item in appearing_items if item.is_audio())
-            and audio_data_embedding_func
-        ):
-            items = [item for item in appearing_items if item.is_audio()]
-            embedding, mask = get_embedding_and_mask(
-                data_embedding_func=audio_data_embedding_func,
-                embedding_items=items,
-                placeholder_tensor=(
-                    placeholder_tokens[Modality.AUDIO]
-                    if using_all_items
-                    else torch.tensor(
-                        [item.pad_value for item in items],
-                        device=input_ids.device,
-                    )
-                ),
-                input_ids=input_ids,
-            )
-            embeddings += [embedding]
-            masks += [mask]
-        # 3. Get input embeddings
-        vocab_size = input_embedding.num_embeddings
-        # Important: clamp after getting original multimodal regions
-        # Clamp input ids. This is because the input_ids for the multimodal tokens are
-        # filled with the hash values of the multimodal for the prefix matching in the radix attention.
-        # There values are useless because their embeddings will be replaced by vision embeddings anyway.
-        input_ids.clamp_(min=0, max=vocab_size - 1)
-        inputs_embeds = input_embedding(input_ids)
-        # 4. Scatter embeddings into input embedding
-        for embedding, mask in zip(embeddings, masks):
-            mask = mask.expand_as(inputs_embeds).to(inputs_embeds.device)
-            inputs_embeds = inputs_embeds.masked_scatter(
-                mask,
-                embedding.to(inputs_embeds.device, inputs_embeds.dtype),
-            )
+        embedding, mask = get_embedding_and_mask(
+            data_embedding_func=audio_data_embedding_func,
+            embedding_items=items,
+            placeholder_tensor=placeholder_tensor,
+            input_ids=input_ids,
+            items_size=items_size,
+            prefix_length=extend_prefix_lens,
+            extend_length=extend_seq_lens,
+            items_offset_list=items_offsets,
+        )
+        embeddings += [embedding]
+        masks += [mask]
+    # 3. Get input embeddings
+    vocab_size = input_embedding.num_embeddings
+    # Important: clamp after getting original multimodal regions
+    # Clamp input ids. This is because the input_ids for the multimodal tokens are
+    # filled with the hash values of the multimodal for the prefix matching in the radix attention.
+    # There values are useless because their embeddings will be replaced by vision embeddings anyway.
+    input_ids.clamp_(min=0, max=vocab_size - 1)
+    inputs_embeds = input_embedding(input_ids)
+    # 4. scatter embeddings into input embedding
+    for embedding, mask in zip(embeddings, masks):
+        if embedding is None or mask is None:
+            continue
+        mask = mask.expand_as(inputs_embeds).to(inputs_embeds.device)
+        inputs_embeds = inputs_embeds.masked_scatter(
+            mask,
+            embedding.to(inputs_embeds.device, inputs_embeds.dtype),
+        )
     return inputs_embeds
@@ -368,37 +556,53 @@ def general_mm_embed_routine(
     input_ids: torch.Tensor,
     forward_batch: ForwardBatch,
     language_model: nn.Module,
-    image_data_embedding_func: Callable[
-        [List[MultimodalDataItem]], torch.Tensor
+    image_data_embedding_func: Optional[
+        Callable[[List[MultimodalDataItem]], torch.Tensor]
     ] = None,
-    audio_data_embedding_func: Callable[
-        [List[MultimodalDataItem]], torch.Tensor
+    audio_data_embedding_func: Optional[
+        Callable[[List[MultimodalDataItem]], torch.Tensor]
     ] = None,
-    placeholder_tokens: dict[Modality, List[int]] = None,
+    placeholder_tokens: Optional[dict[Modality, List[int]]] = None,
     **kwargs,
 ) -> torch.Tensor:
     """
-    A general wrapper function to get final input embeds from multimodal models with a language model as causal model
-        Args:
-            placeholder_token_ids (List[int]): the ids of mm data placeholder tokens
-            image_data_embedding_func : the function returning the image embedding
-            audio_data_embedding_func : the function returning the image embedding
+    Process multimodal inputs and forward through language model.
-        Returns:
-            forwarded hidden states
+    Args:
+        input_ids: Input token IDs tensor
+        forward_batch: Batch information for model forward pass
+        language_model: Base language model to use
+        image_data_embedding_func: Function to embed image data
+        audio_data_embedding_func: Function to embed audio data
+        placeholder_tokens: Token IDs for multimodal placeholders
+        **kwargs: Additional arguments passed to language model
+    Returns:
+        Hidden states from language model forward pass
     """
     assert hasattr(language_model, "get_input_embeddings")
     embed_tokens = language_model.get_input_embeddings()
     if (
         not forward_batch.forward_mode.is_decode()
         and forward_batch.contains_mm_inputs()
     ):
-        mm_input = forward_batch.merge_mm_inputs()
+        mm_inputs_list = [
+            mm_input for mm_input in forward_batch.mm_inputs if mm_input is not None
+        ]
+        extend_prefix_lens = [
+            prefix_len
+            for i, prefix_len in enumerate(forward_batch.extend_prefix_lens_cpu)
+            if forward_batch.mm_inputs[i] is not None
+        ]
+        extend_seq_lens = [
+            seq_len
+            for i, seq_len in enumerate(forward_batch.extend_seq_lens_cpu)
+            if forward_batch.mm_inputs[i] is not None
+        ]
         inputs_embeds = embed_mm_inputs(
-            mm_inputs=mm_input,
+            mm_inputs_list=mm_inputs_list,
+            extend_prefix_lens=extend_prefix_lens,
+            extend_seq_lens=extend_seq_lens,
             input_ids=input_ids,
             input_embedding=embed_tokens,
             image_data_embedding_func=image_data_embedding_func,

sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl