sglang 0.4.4.post3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. sglang/bench_serving.py +49 -7
  2. sglang/lang/chat_template.py +24 -0
  3. sglang/srt/_custom_ops.py +59 -92
  4. sglang/srt/configs/model_config.py +5 -0
  5. sglang/srt/constrained/base_grammar_backend.py +5 -1
  6. sglang/srt/conversation.py +29 -4
  7. sglang/srt/custom_op.py +5 -0
  8. sglang/srt/distributed/device_communicators/custom_all_reduce.py +27 -79
  9. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  10. sglang/srt/entrypoints/engine.py +0 -5
  11. sglang/srt/layers/attention/flashattention_backend.py +678 -83
  12. sglang/srt/layers/attention/flashinfer_backend.py +5 -7
  13. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -3
  14. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  15. sglang/srt/layers/moe/ep_moe/kernels.py +142 -0
  16. sglang/srt/layers/moe/ep_moe/layer.py +79 -80
  17. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +382 -199
  18. sglang/srt/layers/moe/fused_moe_native.py +5 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1024,device_name=NVIDIA_H200.json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +416 -50
  30. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -0
  31. sglang/srt/layers/moe/topk.py +49 -3
  32. sglang/srt/layers/quantization/__init__.py +5 -1
  33. sglang/srt/layers/quantization/blockwise_int8.py +2 -0
  34. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -1
  35. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +34 -10
  36. sglang/srt/layers/quantization/fp8.py +3 -1
  37. sglang/srt/layers/quantization/fp8_utils.py +1 -4
  38. sglang/srt/layers/quantization/moe_wna16.py +503 -0
  39. sglang/srt/layers/quantization/utils.py +1 -1
  40. sglang/srt/layers/quantization/w8a8_int8.py +2 -0
  41. sglang/srt/layers/radix_attention.py +2 -0
  42. sglang/srt/layers/rotary_embedding.py +63 -12
  43. sglang/srt/managers/cache_controller.py +34 -11
  44. sglang/srt/managers/mm_utils.py +202 -156
  45. sglang/srt/managers/multimodal_processor.py +0 -2
  46. sglang/srt/managers/multimodal_processors/base_processor.py +45 -77
  47. sglang/srt/managers/multimodal_processors/clip.py +7 -26
  48. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +17 -58
  49. sglang/srt/managers/multimodal_processors/gemma3.py +12 -27
  50. sglang/srt/managers/multimodal_processors/janus_pro.py +21 -47
  51. sglang/srt/managers/multimodal_processors/llava.py +34 -14
  52. sglang/srt/managers/multimodal_processors/minicpm.py +35 -38
  53. sglang/srt/managers/multimodal_processors/mlama.py +10 -23
  54. sglang/srt/managers/multimodal_processors/mllama4.py +161 -0
  55. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -45
  56. sglang/srt/managers/schedule_batch.py +185 -128
  57. sglang/srt/managers/scheduler.py +4 -4
  58. sglang/srt/managers/tokenizer_manager.py +1 -1
  59. sglang/srt/managers/utils.py +1 -6
  60. sglang/srt/mem_cache/hiradix_cache.py +62 -52
  61. sglang/srt/mem_cache/memory_pool.py +72 -6
  62. sglang/srt/mem_cache/paged_allocator.py +39 -0
  63. sglang/srt/metrics/collector.py +23 -53
  64. sglang/srt/model_executor/cuda_graph_runner.py +8 -6
  65. sglang/srt/model_executor/forward_batch_info.py +10 -10
  66. sglang/srt/model_executor/model_runner.py +60 -57
  67. sglang/srt/model_loader/loader.py +8 -0
  68. sglang/srt/models/clip.py +12 -7
  69. sglang/srt/models/deepseek_janus_pro.py +10 -15
  70. sglang/srt/models/deepseek_v2.py +212 -121
  71. sglang/srt/models/deepseek_vl2.py +105 -104
  72. sglang/srt/models/gemma3_mm.py +14 -80
  73. sglang/srt/models/llama.py +16 -5
  74. sglang/srt/models/llama4.py +420 -0
  75. sglang/srt/models/llava.py +31 -19
  76. sglang/srt/models/llavavid.py +16 -7
  77. sglang/srt/models/minicpmo.py +63 -147
  78. sglang/srt/models/minicpmv.py +17 -27
  79. sglang/srt/models/mllama.py +29 -14
  80. sglang/srt/models/mllama4.py +154 -0
  81. sglang/srt/models/qwen2.py +9 -6
  82. sglang/srt/models/qwen2_5_vl.py +21 -31
  83. sglang/srt/models/qwen2_vl.py +20 -21
  84. sglang/srt/openai_api/adapter.py +18 -6
  85. sglang/srt/platforms/interface.py +371 -0
  86. sglang/srt/server_args.py +99 -14
  87. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -5
  88. sglang/srt/speculative/eagle_utils.py +140 -28
  89. sglang/srt/speculative/eagle_worker.py +93 -24
  90. sglang/srt/utils.py +104 -51
  91. sglang/test/test_custom_ops.py +55 -0
  92. sglang/test/test_utils.py +13 -26
  93. sglang/utils.py +2 -2
  94. sglang/version.py +1 -1
  95. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/METADATA +4 -3
  96. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/RECORD +99 -84
  97. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/WHEEL +0 -0
  98. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.4.post3.dist-info → sglang-0.4.5.dist-info}/top_level.txt +0 -0
sglang/srt/models/minicpmo.py
@@ -40,16 +40,19 @@ from transformers.models.whisper.modeling_whisper import (
 from sglang.srt.layers.quantization import QuantizationConfig
 from sglang.srt.managers.mm_utils import (
     MultiModalityDataPaddingPatternTokenPairs,
-    embed_mm_inputs,
-    get_multimodal_data_bounds,
+    general_mm_embed_routine,
 )
-from sglang.srt.managers.schedule_batch import MultimodalInputs
-from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.managers.schedule_batch import (
+    MultimodalDataItem,
+    MultimodalInputs,
+    flatten_nested_list,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.utils import set_default_torch_dtype
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.minicpmv import (
     Idefics2VisionTransformer,
-    MiniCPMVBaseModel,
+    MiniCPMBaseModel,
     Resampler2_5,
 )
 from sglang.srt.models.qwen2 import Qwen2ForCausalLM
@@ -1409,7 +1412,7 @@ class MultiModalProjector(nn.Module):
         return hidden_states


-class MiniCPMO(MiniCPMVBaseModel):
+class MiniCPMO(MiniCPMBaseModel):
     def __init__(
         self,
         config: PretrainedConfig,
@@ -1537,7 +1540,7 @@ class MiniCPMO(MiniCPMVBaseModel):

         return input_lengths_after_cnn, input_lengths_after_pooling

-    def get_audio_embedding_streaming(self, multimodal_input: MultimodalInputs):
+    def get_audio_embedding_streaming(self, items: List[MultimodalDataItem]):
         r"""
         Extract audio embeddings in a streaming manner using cached key-value pairs.

@@ -1545,26 +1548,15 @@ class MiniCPMO(MiniCPMVBaseModel):
         for faster inference on subsequent audio frames. It only supports batch_size=1 and is intended
         for streaming scenarios.

-        Args:
-            multimodal_input (dict):
-                - **"audio_features"** (`torch.FloatTensor`): Input mel-spectrograms of shape `(batch_size, 80, frames)`.
-                - **"audio_feature_lens"** (List[List[int]]): Lengths of each audio segment for each item in the batch.
-
         Returns:
             List[List[torch.Tensor]]: audio embeddings
         """
-        # print("audio embedding")
-
-        wavforms = (
-            []
-            if multimodal_input.audio_features is None
-            else multimodal_input.audio_features
+        wavforms = flatten_nested_list(
+            [item.audio_features for item in items if item.audio_features]
         )
         # list, [[x1, x2], [y1], [z1]]
-        audio_feature_lens_raw = (
-            []
-            if multimodal_input.audio_feature_lens is None
-            else multimodal_input.audio_feature_lens
+        audio_feature_lens_raw = flatten_nested_list(
+            [item.audio_feature_lens for item in items if item.audio_feature_lens]
         )

         # exist audio
@@ -1650,7 +1642,7 @@ class MiniCPMO(MiniCPMVBaseModel):
             ret[i, start:ending] = True
         return ret

-    def get_audio_embedding(self, multimodal_input: MultimodalInputs, chunk_length=-1):
+    def get_audio_embedding(self, items: List[MultimodalDataItem], chunk_length=-1):
         r"""
         Extract full audio embeddings with optional chunk-based attention.

@@ -1659,31 +1651,25 @@ class MiniCPMO(MiniCPMVBaseModel):
         not use key-value caching and is suitable for non-streaming inference.

         Args:
-            multimodal_input (dict):
-                - **"audio_features"** (`torch.FloatTensor`): Input mel-spectrograms of shape `(batch_size, 80, frames)`.
-                - **"audio_feature_lens"** (List[List[int]]): Lengths of each audio segment for each item in the batch.
             chunk_length (int, optional): Determines whether to use full attention (-1) or chunk-based
                 attention (>0) during embedding computation.

         Returns:
             List[List[torch.Tensor]]: audio embeddings
         """
-        # print("audio embedding")
         # (bs, 80, frames) or [], multi audios need filled in advance
-        wavforms = (
-            []
-            if multimodal_input.audio_features is None
-            else multimodal_input.audio_features
+        wavforms = flatten_nested_list(
+            [item.audio_features for item in items if item.audio_features]
         )
         # list, [[x1, x2], [y1], [z1]]
-        audio_feature_lens_raw = (
-            []
-            if multimodal_input.audio_feature_lens is None
-            else multimodal_input.audio_feature_lens
+        audio_feature_lens_raw = flatten_nested_list(
+            [item.audio_feature_lens for item in items if item.audio_feature_lens]
         )

         final_audio_embeds = []

+        assert isinstance(wavforms, list)
+        assert isinstance(wavforms[0], torch.Tensor)
         # exist audio
         for wavform in wavforms:
             if len(wavform) > 0:
@@ -1757,86 +1743,46 @@ class MiniCPMO(MiniCPMVBaseModel):
             final_audio_embeds.append(target_audio_embeds)
         return final_audio_embeds

+    def get_audio_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        embedding = self.get_omni_embedding(
+            items=items,
+            chunk_length=self.config.audio_chunk_length,
+            stream_input=False,
+        )
+        return embedding
+
     def get_omni_embedding(
         self,
-        input_ids,
-        multimodal_input: MultimodalInputs,
-        input_embeds: torch.Tensor,
-        forward_mode: ForwardMode,
+        items: List[MultimodalDataItem],
         chunk_length=-1,
         stream_input=False,
     ):
         """
         Args:
-            multimodal_input:
-            input_embeds:
             chunk_length: whisper use full attention or chunk attention
             stream_input: use streaming audio embedding
         Returns:
             final embeddings with audio feature
         """
-        input_embeds = input_embeds.unsqueeze(0)
-        if not forward_mode.is_decode() and multimodal_input.contains_audio_inputs():
-            audio_bounds = get_multimodal_data_bounds(
-                input_ids=input_ids,
-                pad_values=multimodal_input.pad_values,
-                token_pairs=[
-                    (multimodal_input.audio_start_id, multimodal_input.audio_end_id)
-                ],
-            )
-            if audio_bounds.numel() == 0:
-                input_embeds = input_embeds.squeeze(0)
-                # TODO
-                logger.warn("Unimplemented logic. Please try disabling chunked prefill")
-                return input_embeds
-            audio_bounds = audio_bounds.unsqueeze(0)
-            bs = len(input_embeds)
-
-            if stream_input:
-                audio_embeddings = self.get_audio_embedding_streaming(multimodal_input)
-            else:
-                audio_embeddings = self.get_audio_embedding(
-                    multimodal_input, chunk_length
-                )
-            # batch size
-            assert len(audio_embeddings) == len(input_embeds)
-            if len(audio_embeddings) > 0:
-                if self.config.chunk_input:
-                    for i in range(bs):
-                        audio_embs = torch.cat(audio_embeddings[i], dim=0).to(
-                            device=input_embeds.device, dtype=input_embeds.dtype
-                        )
-                        audio_start_pos = 0
-                        for bound in audio_bounds[i]:
-                            audio_len = bound[1] - bound[0] + 1
-                            input_embeds[0, bound[0] : bound[1] + 1] = audio_embs[
-                                audio_start_pos : audio_start_pos + audio_len, :
-                            ]
-                            audio_start_pos += audio_len
-                else:
-                    for i in range(bs):
-                        audio_embs = audio_embeddings[i]
-                        bounds = audio_bounds[i]
-                        for embs, bound in zip(audio_embs, bounds):
-                            audio_indices = torch.arange(
-                                bound[0], bound[1], dtype=torch.long
-                            ).to(input_embeds.device)
-
-                            if embs.shape[0] != len(audio_indices):
-                                raise ValueError(
-                                    f"Shape mismatch: Trying to assign embeddings of shape {embs.shape} "
-                                    f"to input indices of length {len(audio_indices)}"
-                                )
-                            input_embeds[i, audio_indices] = embs.to(input_embeds.dtype)
-        input_embeds = input_embeds.squeeze(0)
-        return input_embeds
-
-    def get_image_features(
-        self,
-        image_inputs: MultimodalInputs,
-    ) -> torch.Tensor:
-        pixel_values = image_inputs.pixel_values
-        tgt_sizes = image_inputs.tgt_sizes
+
+        if stream_input:
+            audio_embeddings = self.get_audio_embedding_streaming(items)
+        else:
+            audio_embeddings = self.get_audio_embedding(items, chunk_length)
+        bs = len(audio_embeddings)
+        # batch size
+        audio_embs = torch.cat(flatten_nested_list(audio_embeddings), dim=0)
+
+        return audio_embs
+
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
+        # list of tensors
+        pixel_values = flatten_nested_list([item.pixel_values for item in items])
+        tgt_sizes = torch.stack(
+            flatten_nested_list([item.tgt_size for item in items]), dim=0
+        )
+        assert len(pixel_values) == tgt_sizes.shape[0]
+
         device = self.vpm.embeddings.position_embedding.weight.device
         dtype = self.vpm.embeddings.position_embedding.weight.dtype
         all_pixel_values_lst = [
@@ -1845,10 +1791,10 @@ class MiniCPMO(MiniCPMVBaseModel):

         max_patches = (tgt_sizes[:, 0] * tgt_sizes[:, 1]).max().item()
         assert isinstance(max_patches, int)
-
         all_pixel_values = torch.nn.utils.rnn.pad_sequence(
             all_pixel_values_lst, batch_first=True, padding_value=0.0
         )
+
         B, L, _ = all_pixel_values.shape
         all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
         patch_attn_mask = torch.zeros(
@@ -1875,53 +1821,23 @@ class MiniCPMO(MiniCPMVBaseModel):
         forward_batch: ForwardBatch,
         **kwargs: Any,
     ) -> torch.Tensor:
-        inputs_embeds = None
-        # TODO(mick): optimize the logic here: clamp, merge and embedding should happens at most once
-        if (
-            not forward_batch.forward_mode.is_decode()
-            and forward_batch.contains_image_inputs()
-        ):
-            mm_inputs = forward_batch.merge_mm_inputs()
-            inputs_embeds = embed_mm_inputs(
-                mm_input=mm_inputs,
-                input_ids=input_ids,
-                input_embedding=self.get_input_embeddings(),
-                mm_data_embedding_func=self.get_image_features,
-                placeholder_token_ids=[mm_inputs.im_token_id] + mm_inputs.pad_values,
-            )

-        input_ids = input_ids.clamp(
-            min=0, max=self.get_input_embeddings().num_embeddings - 1
+        mm_input = forward_batch.merge_mm_inputs()
+        placeholder_token_ids = (
+            ([mm_input.im_token_id] + [item.pad_value for item in mm_input.mm_items])
+            if forward_batch.contains_mm_inputs()
+            else []
         )
-        if inputs_embeds is None:
-            inputs_embeds = self.llm.get_input_embeddings(input_ids)
-        if (
-            not forward_batch.forward_mode.is_decode()
-            and self.config.init_audio
-            and forward_batch.contains_audio_inputs()
-        ):
-            mm_input = forward_batch.merge_mm_inputs()
-            inputs_embeds = self.get_omni_embedding(
-                input_ids=input_ids,
-                multimodal_input=mm_input,
-                input_embeds=inputs_embeds,
-                forward_mode=forward_batch.forward_mode,
-                chunk_length=self.config.audio_chunk_length,
-                stream_input=False,
-            )
-
-        forward_batch.mm_inputs = None
-
-        hidden_states = self.llm.model(
-            input_ids=None,
-            positions=positions,
+        hidden_states = general_mm_embed_routine(
+            input_ids=input_ids,
             forward_batch=forward_batch,
-            input_embeds=inputs_embeds,
-        )
-
-        return self.logits_processor(
-            input_ids, hidden_states, self.llm.lm_head, forward_batch
+            language_model=self.llm,
+            image_data_embedding_func=self.get_image_feature,
+            audio_data_embedding_func=self.get_audio_feature,
+            placeholder_token_ids=placeholder_token_ids,
+            positions=positions,
         )
+        return hidden_states

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
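
The minicpmo.py hunks above repeatedly pass per-item feature lists through flatten_nested_list before concatenation. As a rough, hedged sketch of the flattening behavior those call sites rely on (illustration only; the helper actually shipped in the wheel may differ in detail):

from typing import Any, List


def flatten_nested_list(nested: List[Any]) -> List[Any]:
    # Recursively collapse arbitrarily nested lists into one flat list,
    # which is what the call sites above assume before torch.cat / torch.stack.
    flat: List[Any] = []
    for elem in nested:
        if isinstance(elem, list):
            flat.extend(flatten_nested_list(elem))
        else:
            flat.append(elem)
    return flat


# Per-item audio features arrive as nested lists, e.g. [[a, b], [c]] -> [a, b, c].
print(flatten_nested_list([[1, 2], [3], [[4, 5]]]))  # [1, 2, 3, 4, 5]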
sglang/srt/models/minicpmv.py
@@ -54,12 +54,12 @@ from sglang.srt.managers.mm_utils import (
     MultiModalityDataPaddingPatternTokenPairs,
     general_mm_embed_routine,
 )
-from sglang.srt.managers.schedule_batch import MultimodalInputs
+from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.utils import set_default_torch_dtype
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
-from sglang.srt.utils import add_prefix
+from sglang.srt.utils import add_prefix, flatten_nested_list

 RawImageType = Union[Image.Image, torch.Tensor]

@@ -661,7 +661,7 @@ def get_version_by_config(config: PretrainedConfig) -> Tuple[int, ...]:
     return tuple(int(x) for x in version_str.split("."))


-class MiniCPMVBaseModel(nn.Module):
+class MiniCPMBaseModel(nn.Module):
     """
     The abstract class of MiniCPMV can only be inherited, but cannot be
     instantiated.
@@ -853,7 +853,7 @@ class MiniCPMVBaseModel(nn.Module):
         return vlm_embedding, vision_hidden_states

     def get_input_embeddings(self) -> nn.Embedding:
-        return self.llm.get_input_embedding()
+        return self.llm.get_input_embeddings()

     def forward(
         self,
@@ -862,23 +862,14 @@ class MiniCPMVBaseModel(nn.Module):
         forward_batch: ForwardBatch,
         **kwargs: Any,
     ) -> torch.Tensor:
-        inputs_embeds = general_mm_embed_routine(
+        hidden_states = general_mm_embed_routine(
             input_ids=input_ids,
             forward_batch=forward_batch,
-            embed_tokens=self.get_input_embeddings(),
-            mm_data_embedding_func=self.get_image_features,
-        )
-
-        hidden_states = self.llm.model(
-            input_ids=None,
+            image_data_embedding_func=self.get_image_feature,
+            language_model=self.llm,
             positions=positions,
-            forward_batch=forward_batch,
-            input_embeds=inputs_embeds,
-        )
-
-        return self.logits_processor(
-            input_ids, hidden_states, self.llm.lm_head, forward_batch
         )
+        return hidden_states

     def init_llm(
         self,
@@ -913,11 +904,11 @@ class MiniCPMVBaseModel(nn.Module):
     ) -> torch.Tensor:
         raise NotImplementedError

-    def get_image_features(self, image_inputs: MultimodalInputs) -> torch.Tensor:
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
         raise NotImplementedError


-class MiniCPMV2_6(MiniCPMVBaseModel):
+class MiniCPMV2_6(MiniCPMBaseModel):
     packed_modules_mapping = {
         "qkv_proj": [
             "q_proj",
@@ -1023,14 +1014,13 @@ class MiniCPMV2_6(MiniCPMVBaseModel):
         )
         return vision_embedding

-    def get_image_features(
-        self,
-        image_inputs: MultimodalInputs,
-    ) -> torch.Tensor:
+    def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
         # list of tensors
-        pixel_values = image_inputs.pixel_values
-
-        tgt_sizes = image_inputs.tgt_sizes
+        pixel_values = flatten_nested_list([item.pixel_values for item in items])
+        tgt_sizes = torch.stack(
+            flatten_nested_list([item.tgt_size for item in items]), dim=0
+        )
+        assert len(pixel_values) == tgt_sizes.shape[0]

         device = self.vpm.embeddings.position_embedding.weight.device
         dtype = self.vpm.embeddings.position_embedding.weight.dtype
@@ -1040,10 +1030,10 @@ class MiniCPMV2_6(MiniCPMVBaseModel):

         max_patches = (tgt_sizes[:, 0] * tgt_sizes[:, 1]).max().item()
         assert isinstance(max_patches, int)
-
         all_pixel_values = torch.nn.utils.rnn.pad_sequence(
             all_pixel_values_lst, batch_first=True, padding_value=0.0
         )
+
         B, L, _ = all_pixel_values.shape
         all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
         patch_attn_mask = torch.zeros(
sglang/srt/models/mllama.py
@@ -796,14 +796,16 @@ class MllamaForConditionalGeneration(nn.Module):
         self.logits_processor = LogitsProcessor(config.text_config)
         self.capture_mode = False

-    def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
-        pixel_values = image_inputs.pixel_values
-        pad_values = image_inputs.pad_values
+    def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
+        pixel_values = torch.cat(
+            [item.pixel_values for item in mm_inputs.mm_items], dim=0
+        )
+        pad_values = [item.pad_value for item in mm_inputs.mm_items]

         num_concurrent_media, num_tiles = pixel_values.shape[1:3]
         num_patches = self.vision_model.num_patches
         image_len = num_concurrent_media * num_tiles * num_patches
-        image_inputs.num_image_tokens = image_len
+        mm_inputs.num_image_tokens = image_len

         pad_ids = pad_values * ((image_len + len(pad_values)) // len(pad_values))

@@ -815,10 +817,16 @@ class MllamaForConditionalGeneration(nn.Module):

         # pixel_values: shape (bs, num_image, num_tiles, 3, image_res, image_res)
         max_num_images = max_num_tiles = bs = 0
-        for i, im in enumerate(forward_batch.mm_inputs):
-            if not forward_batch.encoder_cached[i] and im is not None:
-                max_num_images = max(max_num_images, im.pixel_values.shape[1])
-                max_num_tiles = max(max_num_tiles, im.pixel_values.shape[2])
+        for i, mm_input in enumerate(forward_batch.mm_inputs):
+
+            if not forward_batch.encoder_cached[i] and mm_input is not None:
+                pixel_values = torch.cat(
+                    [item.pixel_values for item in mm_input.mm_items], dim=0
+                )
+                # max_num_images = max(max_num_images, sum(1 if item.is_image() else 0 for item in mm_input.items))
+                max_num_images = max(max_num_images, pixel_values.shape[1])
+
+                max_num_tiles = max(max_num_tiles, pixel_values.shape[2])
                 bs += 1

         if max_num_images * max_num_tiles * bs == 0:
@@ -842,17 +850,24 @@ class MllamaForConditionalGeneration(nn.Module):
         )
         i = 0
         encoder_lens_need = []
-        for k, im in enumerate(forward_batch.mm_inputs):
-            if forward_batch.encoder_cached[k] or im is None:
+
+        for k, mm_input in enumerate(forward_batch.mm_inputs):
+            if forward_batch.encoder_cached[k] or mm_input is None:
                 continue

             encoder_lens_need.append(forward_batch.encoder_lens[k])
-            for j in range(im.pixel_values.shape[1]):
-                img = im.pixel_values[0, j]
+            pixel_values = torch.cat(
+                [item.pixel_values for item in mm_input.mm_items], dim=0
+            )
+            for j in range(pixel_values.shape[1]):
+                img = pixel_values[0, j]
                 num_tiles = img.shape[0]
                 batched_images[i, j, :num_tiles] = img
-                batched_ar_ids[i, j] = im.aspect_ratio_ids[0, j]
-                batched_ar_mask[i, j, :num_tiles] = im.aspect_ratio_mask[0, j]
+                batched_ar_ids[i, j] = mm_input.mm_items[0].aspect_ratio_id[0, j]
+
+                batched_ar_mask[i, j, :num_tiles] = mm_input.mm_items[
+                    0
+                ].aspect_ratio_mask[0, j]
             i += 1

         return batched_images, batched_ar_ids, batched_ar_mask, encoder_lens_need
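
The pad_input_ids hunk above keeps the pre-existing padding arithmetic for the image region. A small self-contained check of that expression, with made-up values purely for illustration:

# Illustration of the pad_ids expression used in pad_input_ids above:
# repeating the per-item pad values until at least image_len ids are available.
pad_values = [101, 102, 103]  # hypothetical per-item pad values
image_len = 7                 # hypothetical number of image tokens

pad_ids = pad_values * ((image_len + len(pad_values)) // len(pad_values))

assert len(pad_ids) >= image_len
print(pad_ids[:image_len])  # [101, 102, 103, 101, 102, 103, 101]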
sglang/srt/models/mllama4.py (new file)
@@ -0,0 +1,154 @@
+# TODO: add Aapted from vllm/mllama4.py
+from collections.abc import Iterable
+from typing import Optional, Set, Tuple
+
+import torch
+from torch import nn
+from transformers import Llama4Config
+
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization import QuantizationConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.utils import add_prefix
+
+
+class Llama4ForConditionalGeneration(nn.Module):
+    packed_modules_mapping = {
+        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    }
+
+    def __init__(
+        self,
+        config: Llama4Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+        self.config = config
+        self.quant_config = quant_config
+
+        # Initialize the language model
+        from sglang.srt.models.llama4 import Llama4ForCausalLM
+
+        self.language_model = Llama4ForCausalLM(
+            config.text_config,
+            quant_config=quant_config,
+            prefix=add_prefix("language_model", prefix),
+        )
+
+        self.logits_processor = LogitsProcessor(config.text_config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        **kwargs: object,
+    ) -> torch.Tensor:
+
+        return self.language_model(input_ids, positions, forward_batch)
+
+    def permute_qk_weight_for_rotary(
+        self,
+        name: str,
+        loaded_weight: torch.Tensor,
+    ) -> Tuple[str, torch.Tensor]:
+
+        def permute(w: torch.Tensor, n_heads: int):
+            attn_in = self.language_model.config.head_dim * n_heads
+            attn_out = self.language_model.config.hidden_size
+
+            return (
+                w.view(n_heads, attn_in // n_heads // 2, 2, attn_out)
+                .transpose(1, 2)
+                .reshape(attn_in, attn_out)
+            )
+
+        modules = name.split(".")
+
+        # rotary embeds should be sliced
+        if ("wk" in modules or "k_proj" in modules) and modules[-1] == "weight":
+            loaded_weight = permute(
+                loaded_weight, self.language_model.config.num_key_value_heads
+            )
+        elif ("wq" in modules or "q_proj" in modules) and modules[-1] == "weight":
+            loaded_weight = permute(
+                loaded_weight, self.language_model.config.num_attention_heads
+            )
+
+        return name, loaded_weight
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
+
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            (".self_attn.qkv_proj", ".self_attn.q_proj", "q"),
+            (".self_attn.qkv_proj", ".self_attn.k_proj", "k"),
+            (".self_attn.qkv_proj", ".self_attn.v_proj", "v"),
+            (".shared_expert.gate_up_proj", ".shared_expert.gate_proj", 0),
+            (".shared_expert.gate_up_proj", ".shared_expert.up_proj", 1),
+            (".feed_forward.gate_up_proj", ".feed_forward.gate_proj", 0),
+            (".feed_forward.gate_up_proj", ".feed_forward.up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+
+        num_experts = self.config.text_config.num_local_experts
+
+        for name, loaded_weight in weights:
+
+            if name.startswith("vision_model") or name.startswith(
+                "multi_modal_projector"
+            ):
+                continue
+
+            name, loaded_weight = self.permute_qk_weight_for_rotary(name, loaded_weight)
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                if ".experts" in name:
+                    if ".gate_up_proj" in name:
+                        name_list = [
+                            name.replace(".experts.gate_up_proj", ".experts.w13_weight")
+                        ] * 2
+                        loaded_weight_list = loaded_weight.chunk(2, dim=-1)
+                        shard_id_list = ["w1", "w3"]
+                    else:
+                        name_list = [
+                            name.replace(".experts.down_proj", ".experts.w2_weight")
+                        ]
+                        shard_id_list = ["w2"]
+                        loaded_weight_list = [loaded_weight]
+                    for name, loaded_weight, shard_id in zip(
+                        name_list, loaded_weight_list, shard_id_list
+                    ):
+                        param = params_dict[name]
+                        weight_loader = param.weight_loader
+                        for expert_id in range(num_experts):
+                            weight_loader(
+                                param,
+                                loaded_weight[expert_id].T,
+                                name,
+                                shard_id=shard_id,
+                                expert_id=expert_id,
+                            )
+                else:
+                    # Skip loading extra bias for GPTQ models.
+                    if name.endswith(".bias") and name not in params_dict:
+                        continue
+                    param = params_dict[name]
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
+                    weight_loader(param, loaded_weight)
+
+
+EntryClass = Llama4ForConditionalGeneration
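
The permute helper inside permute_qk_weight_for_rotary reorders each head's interleaved rotary pairs into two contiguous half-blocks of rows. A self-contained sketch of that reshape on toy dimensions (all sizes below are arbitrary and chosen only for illustration):

import torch

n_heads, head_dim, hidden_size = 2, 4, 8
attn_in = n_heads * head_dim  # rows of the q/k projection weight

w = torch.arange(attn_in * hidden_size, dtype=torch.float32).reshape(attn_in, hidden_size)


def permute(w: torch.Tensor, n_heads: int) -> torch.Tensor:
    # Same reshape pattern as in the new file: split each head's rows into
    # (pairs, 2), swap those two axes, then flatten back to (attn_in, hidden).
    return (
        w.view(n_heads, attn_in // n_heads // 2, 2, hidden_size)
        .transpose(1, 2)
        .reshape(attn_in, hidden_size)
    )


print(permute(w, n_heads).shape)  # torch.Size([8, 8]); rows 0..3 come out as 0, 2, 1, 3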
sglang/srt/models/qwen2.py
@@ -261,11 +261,14 @@ class Qwen2Model(nn.Module):
         )
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+    def get_input_embedding(self, input_ids: torch.Tensor) -> torch.Tensor:
         if hasattr(self.config, "scale_emb"):
-            return self.embed_tokens(input_ids) * self.config.scale_emb
+            return self.get_input_embeddings()(input_ids) * self.config.scale_emb
         else:
-            return self.embed_tokens(input_ids)
+            return self.get_input_embeddings()(input_ids)
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.embed_tokens

     def forward(
         self,
@@ -358,10 +361,10 @@ class Qwen2ForCausalLM(nn.Module):
         self.logits_processor = LogitsProcessor(config)
         self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)

-    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
-        return self.model.get_input_embeddings(input_ids)
+    def get_input_embedding(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embedding(input_ids)

-    def get_input_embedding(self) -> nn.Embedding:
+    def get_input_embeddings(self) -> nn.Embedding:
         return self.model.embed_tokens

     @torch.no_grad()
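
The qwen2.py hunks swap the two accessor names: get_input_embeddings() now returns the embedding module (the convention the multimodal code above expects), while get_input_embedding(input_ids) applies it, including the optional scale_emb factor. A minimal toy sketch of that pair, using a stand-in class rather than the real Qwen2Model:

import torch
from torch import nn


class TinyTextModel(nn.Module):
    # Toy stand-in, only to illustrate the renamed accessor pair.
    def __init__(self, vocab_size: int = 16, hidden_size: int = 4, scale_emb: float = 2.0):
        super().__init__()
        self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
        self.scale_emb = scale_emb

    def get_input_embeddings(self) -> nn.Embedding:
        # returns the module, mirroring the new Qwen2Model.get_input_embeddings
        return self.embed_tokens

    def get_input_embedding(self, input_ids: torch.Tensor) -> torch.Tensor:
        # applies the module with scaling, mirroring Qwen2Model.get_input_embedding
        return self.get_input_embeddings()(input_ids) * self.scale_emb


model = TinyTextModel()
print(model.get_input_embedding(torch.tensor([1, 2, 3])).shape)  # torch.Size([3, 4])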