sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. sglang/bench_one_batch.py +2 -1
  2. sglang/eval/loogle_eval.py +7 -0
  3. sglang/srt/_custom_ops.py +29 -1
  4. sglang/srt/configs/deepseekvl2.py +11 -2
  5. sglang/srt/configs/internvl.py +3 -0
  6. sglang/srt/configs/janus_pro.py +3 -0
  7. sglang/srt/configs/model_config.py +10 -8
  8. sglang/srt/configs/update_config.py +3 -1
  9. sglang/srt/conversation.py +2 -1
  10. sglang/srt/custom_op.py +5 -2
  11. sglang/srt/disaggregation/common/conn.py +34 -6
  12. sglang/srt/disaggregation/decode.py +9 -1
  13. sglang/srt/disaggregation/mini_lb.py +3 -2
  14. sglang/srt/disaggregation/mooncake/conn.py +93 -76
  15. sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
  16. sglang/srt/disaggregation/nixl/conn.py +17 -13
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
  18. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
  19. sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
  20. sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
  21. sglang/srt/distributed/parallel_state.py +103 -15
  22. sglang/srt/entrypoints/engine.py +31 -33
  23. sglang/srt/entrypoints/http_server.py +20 -32
  24. sglang/srt/entrypoints/openai/protocol.py +3 -3
  25. sglang/srt/entrypoints/openai/serving_chat.py +48 -6
  26. sglang/srt/eplb/expert_location_dispatch.py +1 -1
  27. sglang/srt/function_call/base_format_detector.py +74 -12
  28. sglang/srt/function_call/deepseekv3_detector.py +26 -11
  29. sglang/srt/function_call/ebnf_composer.py +95 -63
  30. sglang/srt/function_call/function_call_parser.py +4 -2
  31. sglang/srt/function_call/kimik2_detector.py +41 -16
  32. sglang/srt/function_call/llama32_detector.py +6 -3
  33. sglang/srt/function_call/mistral_detector.py +11 -3
  34. sglang/srt/function_call/pythonic_detector.py +16 -14
  35. sglang/srt/function_call/qwen25_detector.py +12 -3
  36. sglang/srt/function_call/qwen3_coder_detector.py +151 -0
  37. sglang/srt/hf_transformers_utils.py +0 -1
  38. sglang/srt/layers/activation.py +24 -3
  39. sglang/srt/layers/attention/base_attn_backend.py +3 -1
  40. sglang/srt/layers/attention/flashattention_backend.py +3 -3
  41. sglang/srt/layers/attention/flashinfer_backend.py +40 -1
  42. sglang/srt/layers/communicator.py +12 -12
  43. sglang/srt/layers/dp_attention.py +72 -24
  44. sglang/srt/layers/linear.py +13 -102
  45. sglang/srt/layers/logits_processor.py +34 -24
  46. sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
  47. sglang/srt/layers/moe/ep_moe/layer.py +23 -402
  48. sglang/srt/layers/moe/fused_moe_native.py +7 -47
  49. sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
  50. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +54 -263
  57. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
  58. sglang/srt/layers/moe/topk.py +190 -23
  59. sglang/srt/layers/quantization/__init__.py +20 -134
  60. sglang/srt/layers/quantization/awq.py +578 -11
  61. sglang/srt/layers/quantization/awq_triton.py +339 -0
  62. sglang/srt/layers/quantization/base_config.py +85 -10
  63. sglang/srt/layers/quantization/blockwise_int8.py +17 -55
  64. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
  65. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +23 -79
  66. sglang/srt/layers/quantization/fp8.py +273 -62
  67. sglang/srt/layers/quantization/fp8_kernel.py +210 -46
  68. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  69. sglang/srt/layers/quantization/gptq.py +501 -143
  70. sglang/srt/layers/quantization/marlin_utils.py +790 -0
  71. sglang/srt/layers/quantization/modelopt_quant.py +34 -112
  72. sglang/srt/layers/quantization/moe_wna16.py +45 -49
  73. sglang/srt/layers/quantization/petit.py +252 -0
  74. sglang/srt/layers/quantization/petit_utils.py +104 -0
  75. sglang/srt/layers/quantization/qoq.py +7 -6
  76. sglang/srt/layers/quantization/scalar_type.py +352 -0
  77. sglang/srt/layers/quantization/unquant.py +422 -0
  78. sglang/srt/layers/quantization/utils.py +340 -9
  79. sglang/srt/layers/quantization/w4afp8.py +8 -4
  80. sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
  81. sglang/srt/layers/quantization/w8a8_int8.py +51 -115
  82. sglang/srt/layers/radix_attention.py +5 -3
  83. sglang/srt/layers/vocab_parallel_embedding.py +1 -41
  84. sglang/srt/lora/lora.py +0 -4
  85. sglang/srt/lora/lora_manager.py +162 -164
  86. sglang/srt/lora/lora_registry.py +124 -0
  87. sglang/srt/lora/mem_pool.py +83 -35
  88. sglang/srt/lora/utils.py +12 -5
  89. sglang/srt/managers/cache_controller.py +288 -0
  90. sglang/srt/managers/io_struct.py +60 -30
  91. sglang/srt/managers/mm_utils.py +7 -8
  92. sglang/srt/managers/schedule_batch.py +163 -113
  93. sglang/srt/managers/schedule_policy.py +68 -27
  94. sglang/srt/managers/scheduler.py +256 -86
  95. sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
  96. sglang/srt/managers/tokenizer_manager.py +38 -27
  97. sglang/srt/managers/tp_worker.py +16 -4
  98. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  99. sglang/srt/mem_cache/allocator.py +74 -23
  100. sglang/srt/mem_cache/base_prefix_cache.py +14 -2
  101. sglang/srt/mem_cache/chunk_cache.py +5 -2
  102. sglang/srt/mem_cache/hicache_storage.py +168 -0
  103. sglang/srt/mem_cache/hiradix_cache.py +194 -5
  104. sglang/srt/mem_cache/memory_pool.py +16 -1
  105. sglang/srt/mem_cache/memory_pool_host.py +44 -2
  106. sglang/srt/mem_cache/radix_cache.py +26 -0
  107. sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
  108. sglang/srt/metrics/collector.py +9 -0
  109. sglang/srt/model_executor/cuda_graph_runner.py +66 -31
  110. sglang/srt/model_executor/forward_batch_info.py +210 -25
  111. sglang/srt/model_executor/model_runner.py +147 -42
  112. sglang/srt/model_loader/loader.py +7 -1
  113. sglang/srt/model_loader/utils.py +4 -4
  114. sglang/srt/models/clip.py +1 -1
  115. sglang/srt/models/deepseek.py +9 -6
  116. sglang/srt/models/deepseek_janus_pro.py +1 -1
  117. sglang/srt/models/deepseek_v2.py +192 -173
  118. sglang/srt/models/deepseek_vl2.py +5 -5
  119. sglang/srt/models/gemma.py +48 -0
  120. sglang/srt/models/gemma2.py +52 -0
  121. sglang/srt/models/gemma3_causal.py +63 -0
  122. sglang/srt/models/gemma3_mm.py +1 -1
  123. sglang/srt/models/gemma3n_mm.py +2 -4
  124. sglang/srt/models/granitemoe.py +385 -0
  125. sglang/srt/models/grok.py +9 -3
  126. sglang/srt/models/hunyuan.py +63 -16
  127. sglang/srt/models/internvl.py +1 -1
  128. sglang/srt/models/kimi_vl.py +1 -1
  129. sglang/srt/models/llama.py +41 -0
  130. sglang/srt/models/llama4.py +11 -11
  131. sglang/srt/models/llava.py +2 -2
  132. sglang/srt/models/llavavid.py +1 -1
  133. sglang/srt/models/minicpm.py +0 -2
  134. sglang/srt/models/minicpmo.py +3 -7
  135. sglang/srt/models/minicpmv.py +1 -1
  136. sglang/srt/models/mistral.py +1 -1
  137. sglang/srt/models/mixtral.py +9 -2
  138. sglang/srt/models/mllama.py +3 -5
  139. sglang/srt/models/mllama4.py +13 -6
  140. sglang/srt/models/olmoe.py +8 -5
  141. sglang/srt/models/persimmon.py +330 -0
  142. sglang/srt/models/phi.py +321 -0
  143. sglang/srt/models/phi4mm.py +44 -4
  144. sglang/srt/models/phi4mm_audio.py +1260 -0
  145. sglang/srt/models/phi4mm_utils.py +1917 -0
  146. sglang/srt/models/phimoe.py +9 -3
  147. sglang/srt/models/qwen.py +37 -0
  148. sglang/srt/models/qwen2.py +41 -0
  149. sglang/srt/models/qwen2_5_vl.py +4 -4
  150. sglang/srt/models/qwen2_audio.py +1 -1
  151. sglang/srt/models/qwen2_moe.py +53 -9
  152. sglang/srt/models/qwen2_vl.py +4 -4
  153. sglang/srt/models/qwen3.py +65 -1
  154. sglang/srt/models/qwen3_moe.py +57 -24
  155. sglang/srt/models/vila.py +1 -1
  156. sglang/srt/multimodal/processors/base_processor.py +91 -97
  157. sglang/srt/multimodal/processors/clip.py +21 -19
  158. sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
  159. sglang/srt/multimodal/processors/gemma3.py +13 -17
  160. sglang/srt/multimodal/processors/gemma3n.py +19 -23
  161. sglang/srt/multimodal/processors/internvl.py +9 -10
  162. sglang/srt/multimodal/processors/janus_pro.py +12 -27
  163. sglang/srt/multimodal/processors/kimi_vl.py +12 -14
  164. sglang/srt/multimodal/processors/llava.py +4 -2
  165. sglang/srt/multimodal/processors/minicpm.py +35 -44
  166. sglang/srt/multimodal/processors/mlama.py +21 -18
  167. sglang/srt/multimodal/processors/mllama4.py +4 -5
  168. sglang/srt/multimodal/processors/phi4mm.py +63 -39
  169. sglang/srt/multimodal/processors/pixtral.py +14 -35
  170. sglang/srt/multimodal/processors/qwen_audio.py +65 -0
  171. sglang/srt/multimodal/processors/qwen_vl.py +16 -21
  172. sglang/srt/multimodal/processors/vila.py +14 -14
  173. sglang/srt/reasoning_parser.py +46 -4
  174. sglang/srt/sampling/sampling_batch_info.py +6 -5
  175. sglang/srt/sampling/sampling_params.py +8 -1
  176. sglang/srt/server_args.py +454 -270
  177. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
  178. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +46 -37
  179. sglang/srt/speculative/eagle_utils.py +51 -23
  180. sglang/srt/speculative/eagle_worker.py +59 -44
  181. sglang/srt/two_batch_overlap.py +10 -5
  182. sglang/srt/utils.py +44 -69
  183. sglang/test/runners.py +14 -3
  184. sglang/test/test_activation.py +50 -1
  185. sglang/test/test_block_fp8.py +8 -3
  186. sglang/test/test_block_fp8_ep.py +1 -1
  187. sglang/test/test_custom_ops.py +12 -7
  188. sglang/test/test_cutlass_w4a8_moe.py +1 -3
  189. sglang/test/test_fp4_moe.py +1 -3
  190. sglang/test/test_marlin_moe.py +286 -0
  191. sglang/test/test_marlin_utils.py +171 -0
  192. sglang/test/test_utils.py +35 -0
  193. sglang/version.py +1 -1
  194. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/METADATA +10 -10
  195. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/RECORD +198 -175
  196. sglang/srt/layers/quantization/quant_utils.py +0 -166
  197. sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
  198. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/WHEEL +0 -0
  199. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/licenses/LICENSE +0 -0
  200. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/top_level.txt +0 -0
sglang/srt/models/hunyuan.py
@@ -40,6 +40,7 @@ from sglang.srt.layers.linear import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope
@@ -152,13 +153,16 @@ class HunYuanSparseMoeBlock(nn.Module):
             else config.moe_intermediate_size[layer_id]
         )

+        self.topk = TopK(
+            top_k=top_k,
+            renormalize=True if top_k > 1 else False,
+        )
+
         self.experts = FusedMoE(
             num_experts=config.num_experts,
-            top_k=top_k,
             hidden_size=config.hidden_size,
             intermediate_size=intermediate_size,
             reduce_results=False,
-            renormalize=True if top_k > 1 else False,
             quant_config=quant_config,
         )

@@ -195,9 +199,8 @@ class HunYuanSparseMoeBlock(nn.Module):

         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(
-            hidden_states=hidden_states, router_logits=router_logits
-        )
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
         if shared_output is not None:
             final_hidden_states = final_hidden_states + shared_output
         if self.tp_size > 1:
@@ -206,6 +209,42 @@ class HunYuanSparseMoeBlock(nn.Module):
         return final_hidden_states.view(orig_shape)


+def get_head_dim(config):
+    if hasattr(config, "head_dim"):
+        return int(config.head_dim)
+    if hasattr(config, "attention_head_dim"):
+        return int(config.attention_head_dim)
+
+    # since some hunyuan model don't follow the self.hidden_size // self.total_num_heads rule
+    # wrong setting may cause runtime error, just throw error if this field is missing.
+    raise ValueError("Missing head dim config, try set head_dim in config.json")
+
+
+def check_head_dim(config):
+    # Some models may lack `head_dim` and use `attention_head_dim` instead.
+    # This attribute is also used by flashinfer_backend.py, so we check for
+    # consistency and raise an error if it's not met to avoid silent failures.
+    # Although we could adapt the HunYuan model to use `attention_head_dim`,
+    # flashinfer expects `head_dim`, so we enforce its presence for correctness.
+    calc_head_dim = config.hidden_size // config.num_attention_heads
+
+    if hasattr(config, "attention_head_dim"):
+        if calc_head_dim != config.attention_head_dim and not hasattr(
+            config, "head_dim"
+        ):
+            # in this case, flash infer(and other components may calculate wrong value.)
+            raise ValueError(
+                f"HunYuan model config error: calculated head_dim {calc_head_dim} != attention_head_dim {config.attention_head_dim}"
+                + f"\nPlease Add head_dim:{config.attention_head_dim} in config.json to make sure correctly inference."
+            )
+
+        if hasattr(config, "head_dim") and config.attention_head_dim != config.head_dim:
+            raise ValueError(
+                f"HunYuan model config error: head_dim({config.head_dim}) != attention_head_dim({config.attention_head_dim})"
+                + f"\nPlease change head_dim:{config.attention_head_dim} in config.json to make sure correctly inference."
+            )
+
+
 class HunYuanAttention(nn.Module):

     def __init__(
@@ -240,9 +279,11 @@ class HunYuanAttention(nn.Module):
         assert tp_size % self.total_num_kv_heads == 0
         self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
         # MistralConfig has an optional head_dim introduced by Mistral-Nemo
-        self.head_dim = getattr(
-            config, "head_dim", self.hidden_size // self.total_num_heads
-        )
+        # Prioritize `head_dim` but fall back to `attention_head_dim` for Hunyuan models.
+        self.head_dim = get_head_dim(config)
+
+        check_head_dim(config)
+
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
@@ -493,7 +534,6 @@ class HunYuanModel(nn.Module):
         hidden_states = self.get_input_embeddings(input_ids)
         residual = None

-        cla_factor = _get_cla_factor(self.config)
         prev_kv_states = None
         for i in range(len(self.layers)):
             layer = self.layers[i]
@@ -560,6 +600,11 @@ class HunYuanMoEV1ForCausalLM(nn.Module):
         if config.tie_word_embeddings:
             self.lm_head.weight = self.model.embed_tokens.weight

+        self.hidden_size = config.hidden_size
+        self.head_dim = get_head_dim(config)
+
+        check_head_dim(config)
+
         logit_scale = getattr(config, "logit_scale", 1.0)
         self.logits_processor = LogitsProcessor(config, logit_scale=logit_scale)
         self.sampler = Sampler()
@@ -582,16 +627,14 @@ class HunYuanMoEV1ForCausalLM(nn.Module):
             self.config, "num_key_value_heads", self.config.num_attention_heads
         )
         num_key_value_groups = num_attention_heads // num_kv_heads
-        hidden_size = self.config.hidden_size
-        attention_head_dim = self.config.hidden_size // num_attention_heads

         qkv = qkv.reshape(
-            num_kv_heads, num_key_value_groups + 2, attention_head_dim, hidden_size
+            num_kv_heads, num_key_value_groups + 2, self.head_dim, self.hidden_size
        )
         q, k, v = torch.split(qkv, (num_key_value_groups, 1, 1), dim=1)
-        q = q.reshape(-1, hidden_size)
-        k = k.reshape(-1, hidden_size)
-        v = v.reshape(-1, hidden_size)
+        q = q.reshape(-1, self.hidden_size)
+        k = k.reshape(-1, self.hidden_size)
+        v = v.reshape(-1, self.hidden_size)
         return torch.concat((q, k, v))
         # return qkv.reshape((num_kv_heads, num_key_value_groups+2 , attention_head_dim, hidden_size)).permute((1,0,2,3)).reshape((-1, hidden_size)),

@@ -768,4 +811,8 @@ class HunYuanMoEV1ForCausalLM(nn.Module):
         )


-EntryClass = HunYuanMoEV1ForCausalLM
+class HunYuanDenseV1ForCausalLM(HunYuanMoEV1ForCausalLM):
+    pass
+
+
+EntryClass = [HunYuanMoEV1ForCausalLM, HunYuanDenseV1ForCausalLM]
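The recurring change in these model diffs is that top-k routing moves out of FusedMoE into a standalone TopK module: the block computes router logits, turns them into a topk_output with TopK, and hands that to the experts. The same refactor appears again in the llama4.py, mixtral.py, and olmoe.py hunks below. A minimal sketch of the new wiring, assuming only the TopK/FusedMoE APIs shown in the hunks above; the gate layer and class name are illustrative:

    # Hedged sketch of the new MoE wiring; everything except TopK/FusedMoE and the
    # (hidden_states, topk_output) call convention is illustrative.
    import torch
    from torch import nn

    from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
    from sglang.srt.layers.moe.topk import TopK


    class SparseMoeBlockSketch(nn.Module):
        def __init__(self, num_experts, top_k, hidden_size, intermediate_size, quant_config=None):
            super().__init__()
            self.gate = nn.Linear(hidden_size, num_experts, bias=False)  # stand-in router
            # Routing options (top_k, renormalize, custom routing) now live on TopK ...
            self.topk = TopK(top_k=top_k, renormalize=top_k > 1)
            # ... and are no longer constructor arguments of FusedMoE.
            self.experts = FusedMoE(
                num_experts=num_experts,
                hidden_size=hidden_size,
                intermediate_size=intermediate_size,
                reduce_results=False,
                quant_config=quant_config,
            )

        def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
            router_logits = self.gate(hidden_states)
            topk_output = self.topk(hidden_states, router_logits)
            return self.experts(hidden_states, topk_output)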
sglang/srt/models/internvl.py
@@ -510,7 +510,7 @@ class InternVLChatModel(nn.Module):
         Returns:
             image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
         """
-        pixel_values = torch.cat([item.pixel_values for item in items])
+        pixel_values = torch.cat([item.feature for item in items])
         image_features = self.extract_feature(pixel_values)
         return image_features

sglang/srt/models/kimi_vl.py
@@ -144,7 +144,7 @@ class KimiVLForConditionalGeneration(nn.Module):

     def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
         pixel_values = (
-            torch.cat([item.pixel_values for item in items], dim=0)
+            torch.cat([item.feature for item in items], dim=0)
             .type(self.vision_tower.dtype)
             .to(self.vision_tower.device)
         )
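These two hunks are the first instances of a rename that repeats in the llava, llavavid, minicpmo, minicpmv, mistral, mllama, and mllama4 hunks below: multimodal models now read the tensor payload from MultimodalDataItem.feature instead of modality-specific attributes such as item.pixel_values or item.audio_features. A minimal consumer sketch, where vision_tower stands in for each model's own encoder:

    # Sketch only: `vision_tower` is a placeholder; the `item.feature` accessor is the
    # part taken from this diff.
    from typing import List

    import torch

    from sglang.srt.managers.schedule_batch import MultimodalDataItem


    def get_image_feature(vision_tower, items: List[MultimodalDataItem]) -> torch.Tensor:
        # Every modality now exposes its raw payload through `item.feature`.
        pixel_values = torch.cat([item.feature for item in items], dim=0)
        return vision_tower(pixel_values)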
sglang/srt/models/llama.py
@@ -480,6 +480,47 @@ class LlamaForCausalLM(nn.Module):
         else:
             return hidden_states

+    @torch.no_grad()
+    def forward_split_prefill(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        split_interval: Tuple[int, int],  # [start, end) 0-based
+        input_embeds: torch.Tensor = None,
+    ) -> Optional[LogitsProcessorOutput]:
+        start, end = split_interval
+        # embed
+        if start == 0:
+            if input_embeds is None:
+                forward_batch.hidden_states = self.model.embed_tokens(input_ids)
+            else:
+                forward_batch.hidden_states = input_embeds
+        # decoder layer
+        for i in range(start, end):
+            layer = self.model.layers[i]
+            forward_batch.hidden_states, forward_batch.residual = layer(
+                positions,
+                forward_batch.hidden_states,
+                forward_batch,
+                forward_batch.residual,
+            )
+
+        if end == self.model.config.num_hidden_layers:
+            # norm
+            hidden_states, _ = self.model.norm(
+                forward_batch.hidden_states, forward_batch.residual
+            )
+            forward_batch.hidden_states = hidden_states
+            # logits process
+            result = self.logits_processor(
+                input_ids, forward_batch.hidden_states, self.lm_head, forward_batch
+            )
+        else:
+            result = None
+
+        return result
+
     @property
     def start_layer(self):
         return self.model.start_layer
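forward_split_prefill runs prefill over a half-open layer interval, keeping the running activations on forward_batch.hidden_states / forward_batch.residual between calls; only the call that reaches the last layer applies the final norm and logits processor. A hedged driver sketch (the chunk size and loop are illustrative, not part of the diff):

    # Illustrative driver; only forward_split_prefill itself comes from the diff above.
    def run_split_prefill(model, input_ids, positions, forward_batch, split_size=8):
        num_layers = model.model.config.num_hidden_layers
        result = None
        for start in range(0, num_layers, split_size):
            end = min(start + split_size, num_layers)
            # Intermediate calls return None; the chunk that reaches the last layer
            # returns the LogitsProcessorOutput.
            result = model.forward_split_prefill(
                input_ids, positions, forward_batch, (start, end)
            )
        return result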
sglang/srt/models/llama4.py
@@ -40,6 +40,7 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope
@@ -103,14 +104,17 @@ class Llama4MoE(nn.Module):
             prefix=add_prefix("router", prefix),
         )

+        self.topk = TopK(
+            top_k=self.top_k,
+            renormalize=False,
+            custom_routing_function=Llama4MoE.custom_routing_function,
+        )
+
         self.experts = FusedMoE(
             num_experts=config.num_local_experts,
-            top_k=config.num_experts_per_tok,
             hidden_size=config.hidden_size,
-            custom_routing_function=Llama4MoE.custom_routing_function,
             intermediate_size=intermediate_size_moe,
             reduce_results=False,
-            renormalize=False,
             quant_config=quant_config,
             apply_router_weight_on_input=True,
             prefix=add_prefix("experts", prefix),
@@ -147,10 +151,8 @@ class Llama4MoE(nn.Module):
         # router_scores: [num_tokens, num_experts]
         router_logits, _ = self.router(hidden_states)
         shared_out = self.shared_expert(hidden_states)
-        routed_out = self.experts(
-            hidden_states=hidden_states,
-            router_logits=router_logits,
-        )
+        topk_output = self.topk(hidden_states, router_logits)
+        routed_out = self.experts(hidden_states, topk_output)
         return shared_out, routed_out

     def _forward_core_shared_routed_overlap(self, hidden_states):
@@ -163,10 +165,8 @@ class Llama4MoE(nn.Module):
         with self.device_module.stream(alt_stream):
             # router_scores: [num_tokens, num_experts]
             router_logits, _ = self.router(hidden_states)
-            routed_out = self.experts(
-                hidden_states=hidden_states,
-                router_logits=router_logits,
-            )
+            topk_output = self.topk(hidden_states, router_logits)
+            routed_out = self.experts(hidden_states, topk_output)
         self.device_module.current_stream().wait_stream(alt_stream)

         return shared_out, routed_out
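In _forward_core_shared_routed_overlap, the routed path (router, TopK, experts) runs on an alternate stream while the shared expert runs on the default stream, and wait_stream re-synchronizes before the two outputs are returned. A rough sketch of that pattern with plain torch.cuda streams; the free-function shape, the module arguments, and the initial wait_stream are placeholders, and only the routed-path body mirrors the hunk above:

    # Rough sketch with plain CUDA streams; the diff uses self.device_module and module
    # attributes instead of these placeholders.
    import torch


    def overlapped_moe(hidden_states, shared_expert, router, topk, experts, alt_stream):
        # Assumed setup: let the alternate stream wait for work already queued on the
        # default stream before launching the routed path.
        alt_stream.wait_stream(torch.cuda.current_stream())
        shared_out = shared_expert(hidden_states)  # default stream
        with torch.cuda.stream(alt_stream):  # routed path on the alternate stream
            router_logits, _ = router(hidden_states)
            topk_output = topk(hidden_states, router_logits)
            routed_out = experts(hidden_states, topk_output)
        torch.cuda.current_stream().wait_stream(alt_stream)
        return shared_out, routed_out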
sglang/srt/models/llava.py
@@ -186,7 +186,7 @@ class LlavaBaseForCausalLM(nn.Module):
             bs = forward_batch.batch_size
             pixel_values = flatten_nested_list(
                 [
-                    [item.pixel_values for item in image_inputs[i].mm_items]
+                    [item.feature for item in image_inputs[i].mm_items]
                     for i in range(bs)
                     if need_vision[i]
                 ]
@@ -753,7 +753,7 @@ class LlavaForConditionalGeneration(LlavaBaseForCausalLM):
         features = []
         for item in items:
             # in each item, we assume pixel_values is always batched
-            pixel_values, image_sizes = item.pixel_values, item.image_sizes
+            pixel_values, image_sizes = item.feature, item.image_sizes
             image_outputs = self.vision_tower(
                 pixel_values, image_sizes, output_hidden_states=True
             )
sglang/srt/models/llavavid.py
@@ -135,7 +135,7 @@ class LlavaVidForCausalLM(nn.Module):
         if need_vision.any():
             pixel_values = flatten_nested_list(
                 [
-                    [item.pixel_values for item in image_inputs[i].mm_items]
+                    [item.feature for item in image_inputs[i].mm_items]
                     for i in range(bs)
                     if need_vision[i]
                 ]
sglang/srt/models/minicpm.py
@@ -138,8 +138,6 @@ class MiniCPMAttention(nn.Module):
             base=rope_theta,
             rope_scaling=rope_scaling,
         )
-        # set rope as fp32 instead of bf16
-        self.rotary_emb.cos_sin_cache = self.rotary_emb._compute_cos_sin_cache()
         self.attn = RadixAttention(
             self.num_heads,
             self.head_dim,
sglang/srt/models/minicpmo.py
@@ -1552,9 +1552,7 @@ class MiniCPMO(MiniCPMBaseModel):
         Returns:
             List[List[torch.Tensor]]: audio embeddings
         """
-        wavforms = flatten_nested_list(
-            [item.audio_features for item in items if item.audio_features]
-        )
+        wavforms = flatten_nested_list([item.feature for item in items if item.feature])
         # list, [[x1, x2], [y1], [z1]]
         audio_feature_lens_raw = flatten_nested_list(
             [item.audio_feature_lens for item in items if item.audio_feature_lens]
@@ -1659,9 +1657,7 @@ class MiniCPMO(MiniCPMBaseModel):
             List[List[torch.Tensor]]: audio embeddings
         """
         # (bs, 80, frames) or [], multi audios need filled in advance
-        wavforms = flatten_nested_list(
-            [item.audio_features for item in items if item.audio_features]
-        )
+        wavforms = flatten_nested_list([item.feature for item in items if item.feature])
         # list, [[x1, x2], [y1], [z1]]
         audio_feature_lens_raw = flatten_nested_list(
             [item.audio_feature_lens for item in items if item.audio_feature_lens]
@@ -1778,7 +1774,7 @@ class MiniCPMO(MiniCPMBaseModel):

     def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
         # list of tensors
-        pixel_values = flatten_nested_list([item.pixel_values for item in items])
+        pixel_values = flatten_nested_list([item.feature for item in items])
         tgt_sizes = torch.stack(
             flatten_nested_list([item.tgt_size for item in items]), dim=0
         )
sglang/srt/models/minicpmv.py
@@ -724,7 +724,7 @@ class MiniCPMV2_6(MiniCPMBaseModel):

     def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
         # list of tensors
-        pixel_values = flatten_nested_list([item.pixel_values for item in items])
+        pixel_values = flatten_nested_list([item.feature for item in items])
         tgt_sizes = torch.stack(
             flatten_nested_list([item.tgt_size for item in items]), dim=0
         )
sglang/srt/models/mistral.py
@@ -56,7 +56,7 @@ class Mistral3ForConditionalGeneration:
         features = []
         for item in items:
             # in each item, we assume pixel_values is always batched
-            pixel_values, image_sizes = item.pixel_values, item.image_sizes
+            pixel_values, image_sizes = item.feature, item.image_sizes
             image_outputs = self.vision_tower(
                 pixel_values, image_sizes, output_hidden_states=True
             )
sglang/srt/models/mixtral.py
@@ -37,6 +37,7 @@ from sglang.srt.layers.linear import (
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.ep_moe.layer import EPMoE
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope
@@ -86,6 +87,12 @@ class MixtralMoE(nn.Module):
             quant_config=None,
             prefix=add_prefix("gate", prefix),
         )
+
+        self.topk = TopK(
+            top_k=top_k,
+            renormalize=True,
+        )
+
         MoEImpl = EPMoE if global_server_args_dict["enable_ep_moe"] else FusedMoE
         self.experts = MoEImpl(
             num_experts=num_experts,
@@ -93,7 +100,6 @@ class MixtralMoE(nn.Module):
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
             params_dtype=params_dtype,
-            renormalize=True,
             quant_config=quant_config,
             tp_size=tp_size,
             prefix=add_prefix("experts", prefix),
@@ -105,7 +111,8 @@ class MixtralMoE(nn.Module):
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(hidden_states, router_logits)
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
         if self.tp_size > 1:
             final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
         return final_hidden_states.view(orig_shape)
sglang/srt/models/mllama.py
@@ -838,9 +838,7 @@ class MllamaForConditionalGeneration(nn.Module):
         self.logits_processor = LogitsProcessor(config.text_config)

     def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
-        pixel_values = torch.cat(
-            [item.pixel_values for item in mm_inputs.mm_items], dim=0
-        )
+        pixel_values = torch.cat([item.feature for item in mm_inputs.mm_items], dim=0)
         pad_values = [item.pad_value for item in mm_inputs.mm_items]

         num_concurrent_media, num_tiles = pixel_values.shape[1:3]
@@ -862,7 +860,7 @@ class MllamaForConditionalGeneration(nn.Module):

             if not forward_batch.encoder_cached[i] and mm_input is not None:
                 pixel_values = torch.cat(
-                    [item.pixel_values for item in mm_input.mm_items], dim=0
+                    [item.feature for item in mm_input.mm_items], dim=0
                 )
                 max_num_images = max(max_num_images, pixel_values.shape[1])

@@ -897,7 +895,7 @@ class MllamaForConditionalGeneration(nn.Module):

                 encoder_lens_need.append(forward_batch.encoder_lens[k])
                 pixel_values = torch.cat(
-                    [item.pixel_values for item in mm_input.mm_items], dim=0
+                    [item.feature for item in mm_input.mm_items], dim=0
                 )
                 for j in range(pixel_values.shape[1]):
                     img = pixel_values[0, j]
sglang/srt/models/mllama4.py
@@ -23,6 +23,7 @@ from sglang.srt.managers.schedule_batch import (
     Modality,
     MultimodalDataItem,
     MultimodalInputs,
+    global_server_args_dict,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
@@ -55,13 +56,17 @@ class Llama4ForConditionalGeneration(nn.Module):
         self.quant_config = quant_config

         # Check if this is a text-only model (modelopt fp8 llama4 has no vision components)
-        self.has_vision = self._has_vision_weights(config)
-        if not self.has_vision:
+        self.has_vision_weights = self._has_vision_weights(config)
+        if not self.has_vision_weights:
             logger.warning(
                 "No vision weights found in checkpoint. Model will run in text-only mode. "
                 "Multimodal capabilities (image processing) will be unavailable."
             )

+        self.has_vision = (
+            self.has_vision_weights and global_server_args_dict["enable_multimodal"]
+        )
+
         if self.has_vision:
             self.vision_model = Llama4VisionModel(config.vision_config)
             self.multi_modal_projector = Llama4MultiModalProjector(config)
@@ -81,6 +86,7 @@ class Llama4ForConditionalGeneration(nn.Module):
         self.logits_processor = LogitsProcessor(
             config.text_config if hasattr(config, "text_config") else config
         )
+        self.padding_pattern = MultiModalityDataPaddingPatternMultimodalTokens()

     def _has_vision_weights(self, config) -> bool:
         """Check if the model has vision components by examining the checkpoint."""
@@ -135,8 +141,7 @@ class Llama4ForConditionalGeneration(nn.Module):
         return False

     def pad_input_ids(self, input_ids: List[int], mm_inputs: MultimodalInputs):
-        pattern = MultiModalityDataPaddingPatternMultimodalTokens()
-        return pattern.pad_input_tokens(input_ids, mm_inputs)
+        return self.padding_pattern.pad_input_tokens(input_ids, mm_inputs)

     def get_image_feature(
         self,
@@ -147,7 +152,7 @@ class Llama4ForConditionalGeneration(nn.Module):
             raise ValueError("Vision model not available for text-only checkpoint")

         pixel_values = (
-            torch.concat([item.pixel_values for item in items])
+            torch.concat([item.feature for item in items])
             .to(next(self.vision_model.parameters()).device)
             .type(next(self.vision_model.parameters()).dtype)
         )
@@ -269,7 +274,9 @@ class Llama4ForConditionalGeneration(nn.Module):

     def _should_skip_weight(self, name: str) -> bool:
         """Check if we should skip loading this weight."""
-        return "vision" in name and not self.has_vision
+        return not self.has_vision and (
+            "vision" in name or "multi_modal_projector" in name
+        )

     def _transform_weight_name(self, name: str) -> str:
         """Transform weight name by adding language_model prefix if needed."""
sglang/srt/models/olmoe.py
@@ -32,6 +32,7 @@ from sglang.srt.layers.linear import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+from sglang.srt.layers.moe.topk import TopK
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.layers.rotary_embedding import get_rope
@@ -76,13 +77,16 @@ class OlmoeMoE(nn.Module):
             prefix=add_prefix("gate", prefix),
         )

+        self.topk = TopK(
+            top_k=top_k,
+            renormalize=False,
+        )
+
         self.experts = FusedMoE(
             num_experts=num_experts,
-            top_k=top_k,
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
             reduce_results=True,
-            renormalize=False,
             quant_config=quant_config,
             tp_size=tp_size,
             prefix=add_prefix("experts", prefix),
@@ -94,9 +98,8 @@ class OlmoeMoE(nn.Module):
         hidden_states = hidden_states.view(-1, self.hidden_size)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(
-            hidden_states=hidden_states, router_logits=router_logits
-        )
+        topk_output = self.topk(hidden_states, router_logits)
+        final_hidden_states = self.experts(hidden_states, topk_output)
         return final_hidden_states.view(orig_shape)
