sglang 0.4.9.post1__py3-none-any.whl → 0.4.9.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. sglang/srt/configs/model_config.py +24 -1
  2. sglang/srt/conversation.py +21 -2
  3. sglang/srt/disaggregation/ascend/__init__.py +6 -0
  4. sglang/srt/disaggregation/ascend/conn.py +44 -0
  5. sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
  6. sglang/srt/disaggregation/mooncake/conn.py +15 -14
  7. sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
  8. sglang/srt/disaggregation/utils.py +25 -3
  9. sglang/srt/entrypoints/engine.py +1 -1
  10. sglang/srt/entrypoints/http_server.py +1 -0
  11. sglang/srt/entrypoints/openai/protocol.py +11 -0
  12. sglang/srt/entrypoints/openai/serving_chat.py +7 -0
  13. sglang/srt/function_call/function_call_parser.py +2 -0
  14. sglang/srt/function_call/kimik2_detector.py +220 -0
  15. sglang/srt/hf_transformers_utils.py +18 -0
  16. sglang/srt/jinja_template_utils.py +8 -0
  17. sglang/srt/layers/communicator.py +17 -4
  18. sglang/srt/layers/linear.py +12 -2
  19. sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
  20. sglang/srt/layers/moe/ep_moe/layer.py +2 -1
  21. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -2
  22. sglang/srt/layers/moe/topk.py +8 -2
  23. sglang/srt/layers/parameter.py +19 -3
  24. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  25. sglang/srt/layers/quantization/moe_wna16.py +1 -2
  26. sglang/srt/layers/quantization/w8a8_int8.py +738 -14
  27. sglang/srt/managers/io_struct.py +27 -2
  28. sglang/srt/managers/mm_utils.py +55 -94
  29. sglang/srt/managers/schedule_batch.py +16 -5
  30. sglang/srt/managers/scheduler.py +21 -1
  31. sglang/srt/managers/tokenizer_manager.py +16 -0
  32. sglang/srt/mem_cache/memory_pool.py +65 -40
  33. sglang/srt/model_executor/forward_batch_info.py +13 -1
  34. sglang/srt/model_loader/loader.py +23 -12
  35. sglang/srt/models/deepseek_janus_pro.py +1 -1
  36. sglang/srt/models/deepseek_v2.py +62 -17
  37. sglang/srt/models/deepseek_vl2.py +1 -1
  38. sglang/srt/models/gemma3_mm.py +1 -1
  39. sglang/srt/models/gemma3n_mm.py +6 -3
  40. sglang/srt/models/internvl.py +8 -2
  41. sglang/srt/models/kimi_vl.py +8 -2
  42. sglang/srt/models/llama.py +2 -0
  43. sglang/srt/models/llava.py +3 -1
  44. sglang/srt/models/llavavid.py +1 -1
  45. sglang/srt/models/minicpmo.py +1 -2
  46. sglang/srt/models/minicpmv.py +1 -1
  47. sglang/srt/models/mixtral_quant.py +4 -0
  48. sglang/srt/models/mllama4.py +13 -4
  49. sglang/srt/models/phi4mm.py +8 -2
  50. sglang/srt/models/phimoe.py +553 -0
  51. sglang/srt/models/qwen2.py +2 -0
  52. sglang/srt/models/qwen2_5_vl.py +10 -7
  53. sglang/srt/models/qwen2_vl.py +12 -1
  54. sglang/srt/models/vila.py +8 -2
  55. sglang/srt/multimodal/processors/base_processor.py +197 -137
  56. sglang/srt/multimodal/processors/deepseek_vl_v2.py +1 -1
  57. sglang/srt/multimodal/processors/gemma3.py +4 -2
  58. sglang/srt/multimodal/processors/gemma3n.py +1 -1
  59. sglang/srt/multimodal/processors/internvl.py +1 -1
  60. sglang/srt/multimodal/processors/janus_pro.py +1 -1
  61. sglang/srt/multimodal/processors/kimi_vl.py +1 -1
  62. sglang/srt/multimodal/processors/minicpm.py +4 -3
  63. sglang/srt/multimodal/processors/mllama4.py +1 -1
  64. sglang/srt/multimodal/processors/phi4mm.py +1 -1
  65. sglang/srt/multimodal/processors/pixtral.py +1 -1
  66. sglang/srt/multimodal/processors/qwen_vl.py +203 -80
  67. sglang/srt/multimodal/processors/vila.py +1 -1
  68. sglang/srt/server_args.py +11 -4
  69. sglang/srt/utils.py +154 -31
  70. sglang/version.py +1 -1
  71. {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/METADATA +4 -3
  72. {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/RECORD +75 -70
  73. {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/WHEEL +0 -0
  74. {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/licenses/LICENSE +0 -0
  75. {sglang-0.4.9.post1.dist-info → sglang-0.4.9.post2.dist-info}/top_level.txt +0 -0
sglang/srt/model_loader/loader.py
@@ -64,10 +64,13 @@ from sglang.srt.model_loader.weight_utils import (
 from sglang.srt.utils import (
     get_bool_env_var,
     get_device_capability,
+    is_npu,
     is_pin_memory_available,
     set_weight_attrs,
 )
 
+_is_npu = is_npu()
+
 
 @contextmanager
 def device_loading_context(module: torch.nn.Module, target_device: torch.device):
@@ -127,18 +130,19 @@ def _get_quantization_config(
     # (yizhang2077) workaround for nvidia/Llama-4-Maverick-17B-128E-Eagle3
     if quant_config is None:
         return None
-    major, minor = get_device_capability()
-
-    if major is not None and minor is not None:
-        assert 0 <= minor < 10
-        capability = major * 10 + minor
-        if capability < quant_config.get_min_capability():
-            raise ValueError(
-                f"The quantization method {model_config.quantization} "
-                "is not supported for the current GPU. "
-                f"Minimum capability: {quant_config.get_min_capability()}. "
-                f"Current capability: {capability}."
-            )
+    if not _is_npu:
+        major, minor = get_device_capability()
+
+        if major is not None and minor is not None:
+            assert 0 <= minor < 10
+            capability = major * 10 + minor
+            if capability < quant_config.get_min_capability():
+                raise ValueError(
+                    f"The quantization method {model_config.quantization} "
+                    "is not supported for the current GPU. "
+                    f"Minimum capability: {quant_config.get_min_capability()}. "
+                    f"Current capability: {capability}."
+                )
     supported_dtypes = quant_config.get_supported_act_dtypes()
     if model_config.dtype not in supported_dtypes:
         raise ValueError(
@@ -157,6 +161,13 @@ def _initialize_model(
     """Initialize a model with the given configurations."""
     model_class, _ = get_model_architecture(model_config)
    packed_modules_mapping = getattr(model_class, "packed_modules_mapping", {})
+    if _is_npu:
+        packed_modules_mapping["fused_qkv_a_proj_with_mqa"] = [
+            "q_a_proj",
+            "kv_a_proj_with_mqa",
+        ]
+        packed_modules_mapping["qkv_proj"] = ["q_proj", "k_proj", "v_proj"]
+        packed_modules_mapping["gate_up_proj"] = ["gate_proj", "up_proj"]
     quant_config = _get_quantization_config(
         model_config, load_config, packed_modules_mapping
     )
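For readers unfamiliar with the loader, a packed-modules mapping of this shape tells the weight loader which per-projection checkpoint tensors feed a single fused parameter. The helper below is a hypothetical illustration of that lookup; only the mapping literals come from the hunk above, the function and its name are not sglang code.

from typing import Optional, Tuple

# Mapping literals mirror the NPU branch added above; the lookup helper is illustrative.
packed_modules_mapping = {
    "fused_qkv_a_proj_with_mqa": ["q_a_proj", "kv_a_proj_with_mqa"],
    "qkv_proj": ["q_proj", "k_proj", "v_proj"],
    "gate_up_proj": ["gate_proj", "up_proj"],
}


def fused_param_for(weight_name: str) -> Optional[Tuple[str, int]]:
    """Map a checkpoint weight name to (fused parameter name, shard index)."""
    for fused, shards in packed_modules_mapping.items():
        for idx, shard in enumerate(shards):
            if f".{shard}." in weight_name:
                return weight_name.replace(shard, fused), idx
    return None


# e.g. "model.layers.0.self_attn.q_proj.weight" -> ("model.layers.0.self_attn.qkv_proj.weight", 0)
print(fused_param_for("model.layers.0.self_attn.q_proj.weight"))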
sglang/srt/models/deepseek_janus_pro.py
@@ -1989,7 +1989,7 @@ class MultiModalityCausalLM(MultiModalityPreTrainedModel):
         hidden_states = general_mm_embed_routine(
             input_ids=input_ids,
             forward_batch=forward_batch,
-            image_data_embedding_func=self.get_image_feature,
+            multimodal_model=self,
             language_model=self.language_model,
             positions=positions,
         )
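Throughout this release, callers of general_mm_embed_routine stop passing a bare image_data_embedding_func and instead hand over either the whole model (multimodal_model=self, as above) or an explicit per-modality mapping (shown in later hunks). A minimal hedged sketch of the first form follows; the wrapper class is hypothetical, and only the routine and its keyword arguments appear in this diff.

from torch import nn

from sglang.srt.managers.mm_utils import general_mm_embed_routine


class MyMultimodalLM(nn.Module):
    # Hypothetical model wrapper; assumes self.language_model and the usual
    # get_image_feature method exist on the model, as they do for the models above.
    def forward(self, input_ids, positions, forward_batch):
        return general_mm_embed_routine(
            input_ids=input_ids,
            forward_batch=forward_batch,
            multimodal_model=self,  # 0.4.9.post2 form: pass the model, not a bare function
            language_model=self.language_model,
            positions=positions,
        )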
sglang/srt/models/deepseek_v2.py
@@ -77,6 +77,7 @@ from sglang.srt.layers.quantization.int8_utils import (
 )
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope, get_rope_wrapper
+from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
@@ -100,6 +101,7 @@ from sglang.srt.utils import (
     get_int_env_var,
     is_cpu,
     is_cuda,
+    is_flashinfer_available,
     is_hip,
     is_non_idle_and_non_empty,
     log_info_on_rank0,
@@ -132,6 +134,9 @@ if _is_hip:
         decode_attention_fwd_grouped_rope,
     )
 
+_is_flashinfer_available = is_flashinfer_available()
+_is_sm100_supported = is_cuda() and is_sm100_supported()
+
 
 logger = logging.getLogger(__name__)
 
@@ -195,13 +200,13 @@ class DeepseekV2MLP(nn.Module):
         )
         self.act_fn = SiluAndMul()
 
-    def forward(self, x, forward_batch=None):
+    def forward(self, x, forward_batch=None, can_fuse_mlp_allreduce=False):
         if (self.tp_size == 1) and x.shape[0] == 0:
             return x
 
         gate_up, _ = self.gate_up_proj(x)
         x = self.act_fn(gate_up)
-        x, _ = self.down_proj(x)
+        x, _ = self.down_proj(x, can_fuse_mlp_allreduce=can_fuse_mlp_allreduce)
         return x
 
 
@@ -409,7 +414,10 @@ class DeepseekV2MoE(nn.Module):
         ]
 
     def forward(
-        self, hidden_states: torch.Tensor, forward_batch: Optional[ForwardBatch] = None
+        self,
+        hidden_states: torch.Tensor,
+        forward_batch: Optional[ForwardBatch] = None,
+        can_fuse_mlp_allreduce: bool = False,
     ) -> torch.Tensor:
         if not self._enable_deepep_moe:
             DUAL_STREAM_TOKEN_THRESHOLD = 1024
@@ -418,13 +426,17 @@ class DeepseekV2MoE(nn.Module):
                 and self.num_fused_shared_experts == 0
                 and hidden_states.shape[0] <= DUAL_STREAM_TOKEN_THRESHOLD
             ):
-                return self.forward_normal_dual_stream(hidden_states)
+                return self.forward_normal_dual_stream(
+                    hidden_states, can_fuse_mlp_allreduce
+                )
             else:
-                return self.forward_normal(hidden_states)
+                return self.forward_normal(hidden_states, can_fuse_mlp_allreduce)
         else:
             return self.forward_deepep(hidden_states, forward_batch)
 
-    def forward_normal_dual_stream(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    def forward_normal_dual_stream(
+        self, hidden_states: torch.Tensor, can_fuse_mlp_allreduce: bool = False
+    ) -> torch.Tensor:
         # router_logits: (num_tokens, n_experts)
         router_logits = self.gate(hidden_states)
 
@@ -440,11 +452,13 @@ class DeepseekV2MoE(nn.Module):
         final_hidden_states *= self.routed_scaling_factor
         current_stream.wait_stream(self.alt_stream)
         final_hidden_states = final_hidden_states + shared_output
-        if self.tp_size > 1:
+        if self.tp_size > 1 and not can_fuse_mlp_allreduce:
             final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
         return final_hidden_states
 
-    def forward_normal(self, hidden_states: torch.Tensor) -> torch.Tensor:
+    def forward_normal(
+        self, hidden_states: torch.Tensor, can_fuse_mlp_allreduce: bool = False
+    ) -> torch.Tensor:
         if hasattr(self, "shared_experts") and use_intel_amx_backend(
             self.shared_experts.gate_up_proj
         ):
@@ -461,7 +475,7 @@ class DeepseekV2MoE(nn.Module):
         final_hidden_states *= self.routed_scaling_factor
         if shared_output is not None:
             final_hidden_states = final_hidden_states + shared_output
-        if self.tp_size > 1:
+        if self.tp_size > 1 and not can_fuse_mlp_allreduce:
             final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
         return final_hidden_states
 
@@ -514,7 +528,7 @@ class DeepseekV2MoE(nn.Module):
                 None,  # a2_scale
                 True,  # is_vnni
             )
-        if self.tp_size > 1:
+        if self.tp_size > 1 and not self.can_fuse_mlp_allreduce:
            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
         return final_hidden_states
 
@@ -1818,6 +1832,29 @@ class DeepseekV2DecoderLayer(nn.Module):
             and layer_id % self.config.moe_layer_freq == 0
         )
 
+    def _should_fuse_mlp_allreduce_with_next_layer(self, forward_batch) -> bool:
+        """Check if MLP allreduce can be fused with next layer's add_rmsnorm"""
+
+        if (
+            self.layer_id == self.config.num_hidden_layers - 1
+            or get_tensor_model_parallel_world_size() <= 1
+        ):
+            return False
+
+        if not global_server_args_dict.get("enable_flashinfer_allreduce_fusion", False):
+            return False
+
+        if not _is_sm100_supported or not _is_flashinfer_available:
+            return False
+
+        if hasattr(forward_batch, "input_ids") and (
+            forward_batch.input_ids.shape[0] == 0
+            or forward_batch.input_ids.shape[0] > 128
+        ):
+            return False
+
+        return True
+
     def forward(
         self,
         positions: torch.Tensor,
@@ -1842,12 +1879,22 @@ class DeepseekV2DecoderLayer(nn.Module):
             hidden_states, residual, forward_batch
         )
 
-        hidden_states = self.mlp(hidden_states, forward_batch)
-
-        hidden_states, residual = self.layer_communicator.postprocess_layer(
-            hidden_states, residual, forward_batch
+        can_fuse_mlp_allreduce = (
+            self._should_fuse_mlp_allreduce_with_next_layer(forward_batch)
+            and not (self.enable_dp_attention and self.speculative_algorithm.is_eagle())
+            and not self.is_nextn
         )
 
+        hidden_states = self.mlp(hidden_states, forward_batch, can_fuse_mlp_allreduce)
+
+        if can_fuse_mlp_allreduce:
+            hidden_states._sglang_needs_allreduce_fusion = True
+
+        if not can_fuse_mlp_allreduce:
+            hidden_states, residual = self.layer_communicator.postprocess_layer(
+                hidden_states, residual, forward_batch
+            )
+
         return hidden_states, residual
 
     def op_comm_prepare_attn(
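Condensed, the core of the gating added in _should_fuse_mlp_allreduce_with_next_layer reduces to the standalone predicate below. This is a hedged paraphrase for readability, not the sglang implementation; all parameter names are illustrative.

def can_defer_mlp_allreduce(
    layer_id: int,
    num_layers: int,
    tp_size: int,
    fusion_flag_enabled: bool,  # global_server_args_dict["enable_flashinfer_allreduce_fusion"]
    is_sm100: bool,
    has_flashinfer: bool,
    num_tokens: int,
) -> bool:
    """Skip the per-layer all-reduce only when the next layer's add_rmsnorm can absorb it."""
    if layer_id == num_layers - 1 or tp_size <= 1:
        return False  # last layer, or no tensor parallelism: nothing to fuse
    if not fusion_flag_enabled:
        return False
    if not (is_sm100 and has_flashinfer):
        return False  # the fused path needs an SM100-class GPU and flashinfer
    return 0 < num_tokens <= 128  # small batches only, per the input_ids check above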
@@ -2146,7 +2193,6 @@ class DeepseekV2ForCausalLM(nn.Module):
         # This may affect the accuracy of fp8 model.
         # Fix deepseek v3 blockwise bmm by using deep_gemm
         use_deep_gemm_bmm = False
-        model_dtype = torch.get_default_dtype()
 
         if w.dtype in (
             torch.float8_e4m3fn,
@@ -2172,7 +2218,6 @@ class DeepseekV2ForCausalLM(nn.Module):
                _is_cuda
                and weight_block_size[0] == 128
                and weight_block_size[1] == 128
-               and model_dtype == torch.bfloat16
            ):
                if (
                    deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
@@ -2186,7 +2231,7 @@ class DeepseekV2ForCausalLM(nn.Module):
                        weight,
                        weight_scale,
                        weight_block_size,
-                        model_dtype,
+                        torch.bfloat16,
                    )
                else:
                    w, scale = block_quant_to_tensor_quant(
sglang/srt/models/deepseek_vl2.py
@@ -227,7 +227,7 @@ class DeepseekVL2ForCausalLM(nn.Module):
             input_ids=input_ids,
             positions=positions,
             forward_batch=forward_batch,
-            image_data_embedding_func=self.get_image_feature,
+            multimodal_model=self,
             language_model=self.language_model,
         )
 
sglang/srt/models/gemma3_mm.py
@@ -374,7 +374,7 @@ class Gemma3ForConditionalGeneration(PreTrainedModel):
             input_ids=llm_input_ids,
             forward_batch=forward_batch,
             language_model=self.language_model,
-            image_data_embedding_func=self.get_image_feature,
+            multimodal_model=self,
             positions=positions,
         )
 
sglang/srt/models/gemma3n_mm.py
@@ -1,7 +1,7 @@
 import logging
 import re
 from functools import lru_cache
-from typing import Dict, Iterable, List, Optional, Set, Tuple, TypedDict, Union
+from typing import Iterable, List, Optional, Set, Tuple, TypedDict, Union
 
 import torch
 from torch import nn
@@ -25,6 +25,7 @@ from sglang.srt.managers.mm_utils import (
     general_mm_embed_routine,
 )
 from sglang.srt.managers.schedule_batch import (
+    Modality,
     MultimodalDataItem,
     MultimodalInputs,
     flatten_nested_list,
@@ -434,8 +435,10 @@ class Gemma3nForConditionalGeneration(PreTrainedModel):
             input_ids=input_ids,
             forward_batch=forward_batch,
             language_model=self.language_model,
-            image_data_embedding_func=self.get_image_feature,
-            audio_data_embedding_func=self.get_audio_feature,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+                Modality.AUDIO: self.get_audio_feature,
+            },
             positions=positions,
             per_layer_inputs=per_layer_inputs,
         )
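The per-modality form used here replaces the separate image_data_embedding_func / audio_data_embedding_func keywords with one dict keyed by Modality. A minimal hedged sketch of the same calling pattern follows; the wrapper class and its attributes are hypothetical, while the routine, the Modality enum, and the data_embedding_funcs keyword come from this diff.

from torch import nn

from sglang.srt.managers.mm_utils import general_mm_embed_routine
from sglang.srt.managers.schedule_batch import Modality


class MyAudioVisualLM(nn.Module):
    # Hypothetical wrapper; assumes get_image_feature/get_audio_feature and
    # self.language_model are defined on the model, as in the hunks above.
    def forward(self, input_ids, positions, forward_batch):
        return general_mm_embed_routine(
            input_ids=input_ids,
            forward_batch=forward_batch,
            language_model=self.language_model,
            data_embedding_funcs={
                Modality.IMAGE: self.get_image_feature,
                Modality.AUDIO: self.get_audio_feature,
            },
            positions=positions,
        )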
sglang/srt/models/internvl.py
@@ -29,7 +29,11 @@ from sglang.srt.managers.mm_utils import (
     MultiModalityDataPaddingPatternTokenPairs,
     general_mm_embed_routine,
 )
-from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.deepseek_janus_pro import DropPath
@@ -523,7 +527,9 @@ class InternVLChatModel(nn.Module):
             input_ids=input_ids,
             forward_batch=forward_batch,
             language_model=self.language_model,
-            image_data_embedding_func=self.get_image_feature,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
             positions=positions,
         )
 
sglang/srt/models/kimi_vl.py
@@ -67,7 +67,11 @@ from sglang.srt.managers.mm_utils import (
     MultiModalityDataPaddingPatternMultimodalTokens,
     general_mm_embed_routine,
 )
-from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import (
     default_weight_loader,
@@ -168,7 +172,9 @@ class KimiVLForConditionalGeneration(nn.Module):
             input_ids=input_ids,
             forward_batch=forward_batch,
             language_model=self.language_model,
-            image_data_embedding_func=self.get_image_feature,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
             positions=positions,
         )
 
sglang/srt/models/llama.py
@@ -575,6 +575,8 @@ class LlamaForCausalLM(nn.Module):
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
+                if name not in params_dict:
+                    continue
                 param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
sglang/srt/models/llava.py
@@ -787,7 +787,9 @@ class LlavaForConditionalGeneration(LlavaBaseForCausalLM):
             forward_batch=forward_batch,
             get_embedding=get_embedding,
             language_model=self.language_model,
-            image_data_embedding_func=self.get_image_feature,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
             placeholder_tokens=None,  # using mm_item.pad_value
             positions=positions,
         )
sglang/srt/models/llavavid.py
@@ -142,7 +142,7 @@ class LlavaVidForCausalLM(nn.Module):
         )
         image_offsets = [
             flatten_nested_list(
-                [item.image_offsets for item in image_inputs[i].mm_items]
+                [item.offsets for item in image_inputs[i].mm_items]
             )
             for i in range(bs)
             if need_vision[i]
sglang/srt/models/minicpmo.py
@@ -1827,8 +1827,7 @@ class MiniCPMO(MiniCPMBaseModel):
             input_ids=input_ids,
             forward_batch=forward_batch,
             language_model=self.llm,
-            image_data_embedding_func=self.get_image_feature,
-            audio_data_embedding_func=self.get_audio_feature,
+            multimodal_model=self,
             positions=positions,
         )
         return hidden_states
sglang/srt/models/minicpmv.py
@@ -573,7 +573,7 @@ class MiniCPMBaseModel(nn.Module):
         hidden_states = general_mm_embed_routine(
             input_ids=input_ids,
             forward_batch=forward_batch,
-            image_data_embedding_func=self.get_image_feature,
+            multimodal_model=self,
             language_model=self.llm,
             positions=positions,
         )
sglang/srt/models/mixtral_quant.py
@@ -407,6 +407,8 @@ class QuantMixtralForCausalLM(nn.Module):
                 # Skip loading extra bias for GPTQ models.
                 if name.endswith(".bias") and name not in params_dict:
                     continue
+                if name not in params_dict:
+                    continue
                 param = params_dict[name]
                 weight_loader = param.weight_loader
                 weight_loader(param, loaded_weight, shard_id)
@@ -418,6 +420,8 @@ class QuantMixtralForCausalLM(nn.Module):
                 # Skip experts that are not assigned to this worker.
                 if "block_sparse_moe.experts." in name and name not in params_dict:
                     continue
+                if name not in params_dict:
+                    continue
                 param = params_dict[name]
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
sglang/srt/models/mllama4.py
@@ -6,8 +6,11 @@ from typing import List, Optional, Set, Tuple
 
 import torch
 from torch import nn
-from transformers import Llama4Config, Llama4VisionModel
-from transformers.models.llama4.modeling_llama4 import Llama4MultiModalProjector
+from transformers import Llama4Config
+from transformers.models.llama4.modeling_llama4 import (
+    Llama4MultiModalProjector,
+    Llama4VisionModel,
+)
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
@@ -16,7 +19,11 @@ from sglang.srt.managers.mm_utils import (
     MultiModalityDataPaddingPatternMultimodalTokens,
     general_mm_embed_routine,
 )
-from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.utils import add_prefix, is_cpu
@@ -166,7 +173,9 @@ class Llama4ForConditionalGeneration(nn.Module):
             input_ids=input_ids,
             forward_batch=forward_batch,
             language_model=self.language_model,
-            image_data_embedding_func=image_embedding_func,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
             positions=positions,
         )
 
sglang/srt/models/phi4mm.py
@@ -31,7 +31,11 @@ from sglang.srt.managers.mm_utils import (
     MultiModalityDataPaddingPatternMultimodalTokens,
     general_mm_embed_routine,
 )
-from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs
+from sglang.srt.managers.schedule_batch import (
+    Modality,
+    MultimodalDataItem,
+    MultimodalInputs,
+)
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
 from sglang.srt.models.idefics2 import Idefics2VisionTransformer
@@ -439,7 +443,9 @@ class Phi4MMForCausalLM(nn.Module):
             input_ids=input_ids,
             forward_batch=forward_batch,
             language_model=self.language_model,
-            image_data_embedding_func=self.get_image_feature,
+            data_embedding_funcs={
+                Modality.IMAGE: self.get_image_feature,
+            },
             positions=positions,
         )