sglang 0.4.4.post1__py3-none-any.whl → 0.4.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +6 -0
  3. sglang/bench_one_batch.py +1 -1
  4. sglang/bench_one_batch_server.py +1 -1
  5. sglang/bench_serving.py +3 -1
  6. sglang/check_env.py +3 -4
  7. sglang/lang/backend/openai.py +18 -5
  8. sglang/lang/chat_template.py +28 -7
  9. sglang/lang/interpreter.py +7 -3
  10. sglang/lang/ir.py +10 -0
  11. sglang/srt/_custom_ops.py +1 -1
  12. sglang/srt/code_completion_parser.py +174 -0
  13. sglang/srt/configs/__init__.py +2 -6
  14. sglang/srt/configs/deepseekvl2.py +667 -0
  15. sglang/srt/configs/janus_pro.py +3 -4
  16. sglang/srt/configs/load_config.py +1 -0
  17. sglang/srt/configs/model_config.py +63 -11
  18. sglang/srt/configs/utils.py +25 -0
  19. sglang/srt/connector/__init__.py +51 -0
  20. sglang/srt/connector/base_connector.py +112 -0
  21. sglang/srt/connector/redis.py +85 -0
  22. sglang/srt/connector/s3.py +122 -0
  23. sglang/srt/connector/serde/__init__.py +31 -0
  24. sglang/srt/connector/serde/safe_serde.py +29 -0
  25. sglang/srt/connector/serde/serde.py +43 -0
  26. sglang/srt/connector/utils.py +35 -0
  27. sglang/srt/conversation.py +88 -0
  28. sglang/srt/disaggregation/conn.py +81 -0
  29. sglang/srt/disaggregation/decode.py +495 -0
  30. sglang/srt/disaggregation/mini_lb.py +285 -0
  31. sglang/srt/disaggregation/prefill.py +249 -0
  32. sglang/srt/disaggregation/utils.py +44 -0
  33. sglang/srt/distributed/parallel_state.py +10 -3
  34. sglang/srt/entrypoints/engine.py +55 -5
  35. sglang/srt/entrypoints/http_server.py +71 -12
  36. sglang/srt/function_call_parser.py +133 -54
  37. sglang/srt/hf_transformers_utils.py +28 -3
  38. sglang/srt/layers/activation.py +4 -2
  39. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  40. sglang/srt/layers/attention/flashattention_backend.py +295 -0
  41. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  42. sglang/srt/layers/attention/flashmla_backend.py +284 -0
  43. sglang/srt/layers/attention/triton_backend.py +171 -38
  44. sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
  45. sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
  46. sglang/srt/layers/attention/utils.py +53 -0
  47. sglang/srt/layers/attention/vision.py +9 -28
  48. sglang/srt/layers/dp_attention.py +32 -21
  49. sglang/srt/layers/layernorm.py +24 -2
  50. sglang/srt/layers/linear.py +17 -5
  51. sglang/srt/layers/logits_processor.py +25 -7
  52. sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
  53. sglang/srt/layers/moe/ep_moe/layer.py +273 -1
  54. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
  55. sglang/srt/layers/moe/fused_moe_native.py +2 -1
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
  61. sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
  62. sglang/srt/layers/moe/topk.py +31 -18
  63. sglang/srt/layers/parameter.py +1 -1
  64. sglang/srt/layers/quantization/__init__.py +184 -126
  65. sglang/srt/layers/quantization/base_config.py +5 -0
  66. sglang/srt/layers/quantization/blockwise_int8.py +1 -1
  67. sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
  68. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
  69. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
  70. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
  71. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
  72. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
  73. sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
  74. sglang/srt/layers/quantization/fp8.py +76 -34
  75. sglang/srt/layers/quantization/fp8_kernel.py +24 -8
  76. sglang/srt/layers/quantization/fp8_utils.py +284 -28
  77. sglang/srt/layers/quantization/gptq.py +36 -9
  78. sglang/srt/layers/quantization/kv_cache.py +98 -0
  79. sglang/srt/layers/quantization/modelopt_quant.py +9 -7
  80. sglang/srt/layers/quantization/utils.py +153 -0
  81. sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
  82. sglang/srt/layers/rotary_embedding.py +66 -87
  83. sglang/srt/layers/sampler.py +1 -1
  84. sglang/srt/lora/layers.py +68 -0
  85. sglang/srt/lora/lora.py +2 -22
  86. sglang/srt/lora/lora_manager.py +47 -23
  87. sglang/srt/lora/mem_pool.py +110 -51
  88. sglang/srt/lora/utils.py +12 -1
  89. sglang/srt/managers/cache_controller.py +2 -5
  90. sglang/srt/managers/data_parallel_controller.py +30 -8
  91. sglang/srt/managers/expert_distribution.py +81 -0
  92. sglang/srt/managers/io_struct.py +39 -3
  93. sglang/srt/managers/mm_utils.py +373 -0
  94. sglang/srt/managers/multimodal_processor.py +68 -0
  95. sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
  96. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
  97. sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
  98. sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
  99. sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
  100. sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
  101. sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
  102. sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
  103. sglang/srt/managers/schedule_batch.py +133 -30
  104. sglang/srt/managers/scheduler.py +273 -20
  105. sglang/srt/managers/session_controller.py +1 -1
  106. sglang/srt/managers/tokenizer_manager.py +59 -23
  107. sglang/srt/managers/tp_worker.py +1 -1
  108. sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
  109. sglang/srt/managers/utils.py +6 -1
  110. sglang/srt/mem_cache/hiradix_cache.py +18 -7
  111. sglang/srt/mem_cache/memory_pool.py +255 -98
  112. sglang/srt/mem_cache/paged_allocator.py +2 -2
  113. sglang/srt/mem_cache/radix_cache.py +4 -4
  114. sglang/srt/model_executor/cuda_graph_runner.py +27 -13
  115. sglang/srt/model_executor/forward_batch_info.py +68 -11
  116. sglang/srt/model_executor/model_runner.py +70 -6
  117. sglang/srt/model_loader/loader.py +160 -2
  118. sglang/srt/model_loader/weight_utils.py +45 -0
  119. sglang/srt/models/deepseek_janus_pro.py +29 -86
  120. sglang/srt/models/deepseek_nextn.py +22 -10
  121. sglang/srt/models/deepseek_v2.py +208 -77
  122. sglang/srt/models/deepseek_vl2.py +358 -0
  123. sglang/srt/models/gemma3_causal.py +684 -0
  124. sglang/srt/models/gemma3_mm.py +462 -0
  125. sglang/srt/models/llama.py +47 -7
  126. sglang/srt/models/llama_eagle.py +1 -0
  127. sglang/srt/models/llama_eagle3.py +196 -0
  128. sglang/srt/models/llava.py +3 -3
  129. sglang/srt/models/llavavid.py +3 -3
  130. sglang/srt/models/minicpmo.py +1995 -0
  131. sglang/srt/models/minicpmv.py +62 -137
  132. sglang/srt/models/mllama.py +4 -4
  133. sglang/srt/models/phi3_small.py +1 -1
  134. sglang/srt/models/qwen2.py +3 -0
  135. sglang/srt/models/qwen2_5_vl.py +68 -146
  136. sglang/srt/models/qwen2_classification.py +75 -0
  137. sglang/srt/models/qwen2_moe.py +9 -1
  138. sglang/srt/models/qwen2_vl.py +25 -63
  139. sglang/srt/openai_api/adapter.py +124 -28
  140. sglang/srt/openai_api/protocol.py +23 -2
  141. sglang/srt/sampling/sampling_batch_info.py +1 -1
  142. sglang/srt/sampling/sampling_params.py +6 -6
  143. sglang/srt/server_args.py +99 -9
  144. sglang/srt/speculative/build_eagle_tree.py +7 -347
  145. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
  146. sglang/srt/speculative/eagle_utils.py +208 -252
  147. sglang/srt/speculative/eagle_worker.py +139 -53
  148. sglang/srt/speculative/spec_info.py +6 -1
  149. sglang/srt/torch_memory_saver_adapter.py +22 -0
  150. sglang/srt/utils.py +182 -21
  151. sglang/test/__init__.py +0 -0
  152. sglang/test/attention/__init__.py +0 -0
  153. sglang/test/attention/test_flashattn_backend.py +312 -0
  154. sglang/test/runners.py +2 -0
  155. sglang/test/test_activation.py +2 -1
  156. sglang/test/test_block_fp8.py +5 -4
  157. sglang/test/test_block_fp8_ep.py +2 -1
  158. sglang/test/test_dynamic_grad_mode.py +58 -0
  159. sglang/test/test_layernorm.py +3 -2
  160. sglang/test/test_utils.py +55 -4
  161. sglang/utils.py +31 -0
  162. sglang/version.py +1 -1
  163. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/METADATA +12 -8
  164. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/RECORD +167 -123
  165. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/WHEEL +1 -1
  166. sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
  167. sglang/srt/managers/image_processor.py +0 -55
  168. sglang/srt/managers/image_processors/base_image_processor.py +0 -219
  169. sglang/srt/managers/image_processors/minicpmv.py +0 -86
  170. sglang/srt/managers/multi_modality_padding.py +0 -134
  171. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info/licenses}/LICENSE +0 -0
  172. {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/models/gemma3_mm.py (new file)
@@ -0,0 +1,462 @@
+# Copyright 2025 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# Adapted from:
+# https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/gemma3_mm.py
+
+import logging
+from functools import lru_cache
+from typing import Dict, Iterable, List, Optional, Set, Tuple, TypedDict
+
+import torch
+from torch import nn
+from transformers import (
+    AutoModel,
+    BatchFeature,
+    Gemma3Config,
+    Gemma3Processor,
+    PreTrainedModel,
+)
+from transformers.models.gemma3.processing_gemma3 import Gemma3ProcessorKwargs
+
+from sglang.srt.hf_transformers_utils import get_processor
+from sglang.srt.layers.layernorm import Gemma3RMSNorm
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.managers.mm_utils import (
+    MultiModalityDataPaddingPatternTokenPairs,
+    general_mm_embed_routine,
+)
+from sglang.srt.managers.schedule_batch import MultimodalInputs
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import (
+    default_weight_loader,
+    maybe_remap_kv_scale_name,
+)
+from sglang.srt.models.gemma3_causal import Gemma3ForCausalLM
+from sglang.srt.utils import add_prefix
+
+logger = logging.getLogger(__name__)
+
+cached_get_processor = lru_cache(get_processor)
+
+
+class Gemma3ImagePixelInputs(TypedDict):
+    pixel_values: torch.Tensor
+    """Shape: `(batch_size * num_images, num_channels, height, width)`"""
+
+
+class Gemma3MultiModalProjector(nn.Module):
+    """Projector for Gemma3 multimodal."""
+
+    def __init__(self, config: Gemma3Config):
+        super().__init__()
+
+        self.mm_input_projection_weight = nn.Parameter(
+            torch.zeros(
+                config.vision_config.hidden_size, config.text_config.hidden_size
+            )
+        )
+
+        self.mm_soft_emb_norm = Gemma3RMSNorm(
+            config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps
+        )
+
+        self.patches_per_image = int(
+            config.vision_config.image_size // config.vision_config.patch_size
+        )
+        self.tokens_per_side = int(config.mm_tokens_per_image**0.5)
+        self.kernel_size = self.patches_per_image // self.tokens_per_side
+        self.avg_pool = nn.AvgPool2d(
+            kernel_size=self.kernel_size, stride=self.kernel_size
+        )
+
+    def forward(self, vision_outputs: torch.Tensor) -> torch.Tensor:
+        batch_size, seq_length, hidden_size = vision_outputs.shape
+
+        # Reshape for pooling
+        reshaped_vision_outputs = vision_outputs.transpose(1, 2)
+        reshaped_vision_outputs = reshaped_vision_outputs.reshape(
+            batch_size, hidden_size, self.patches_per_image, self.patches_per_image
+        )
+        reshaped_vision_outputs = reshaped_vision_outputs.contiguous()
+
+        # Apply pooling
+        pooled_vision_outputs = self.avg_pool(reshaped_vision_outputs)
+        pooled_vision_outputs = pooled_vision_outputs.flatten(2)
+        pooled_vision_outputs = pooled_vision_outputs.transpose(1, 2)
+
+        # Apply normalization
+        normed_vision_outputs = self.mm_soft_emb_norm(pooled_vision_outputs)
+
+        # Project to text embedding space
+        projected_vision_outputs = torch.matmul(
+            normed_vision_outputs, self.mm_input_projection_weight
+        )
+
+        return projected_vision_outputs.type_as(vision_outputs)
+
+
+class Gemma3ForConditionalGeneration(PreTrainedModel):
+    config_class = Gemma3Config
+    """Gemma3 multimodal model for conditional generation."""
+
+    # BitandBytes specific attributes
+    default_bitsandbytes_target_modules = [
+        ".gate_proj.",
+        ".down_proj.",
+        ".up_proj.",
+        ".q_proj.",
+        ".k_proj.",
+        ".v_proj.",
+        ".o_proj.",
+    ]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
+
+    packed_modules_mapping = {
+        "qkv_proj": [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+        ],
+        "gate_up_proj": [
+            "gate_proj",
+            "up_proj",
+        ],
+    }
+
+    # LoRA specific attributes
+    supported_lora_modules = [
+        "qkv_proj",
+        "o_proj",
+        "gate_up_proj",
+        "down_proj",
+    ]
+    # Gemma does not apply LoRA to the embedding layer.
+    embedding_modules = {}
+    embedding_padding_modules = []
+    supports_lora = True
+
+    def __init__(
+        self,
+        config: Gemma3Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config=config)
+        self.config = config
+        self.quant_config = quant_config
+        # Vision components
+        # TODO: replace with vision attention
+        # self.vision_tower = SiglipVisionModel(
+        #     config.vision_config,
+        #     quant_config,
+        #     prefix=add_prefix("vision_tower", prefix),
+        # )
+        self.vision_tower = AutoModel.from_config(config=config.vision_config)
+        self.multi_modal_projector = Gemma3MultiModalProjector(config)
+        self.vocab_size = config.text_config.vocab_size
+
+        # Text model
+        self.language_model = Gemma3ForCausalLM(
+            config.text_config, quant_config, prefix=add_prefix("model", prefix)
+        )
+        if self.language_model.logits_processor.logit_scale:
+            logit_scale = getattr(config, "logit_scale", 1.0)
+            self.language_model.logits_processor.logit_scale *= logit_scale
+        self.post_init()
+
+    def pad_input_ids(
+        self, input_ids: List[int], image_inputs: MultimodalInputs
+    ) -> List[int]:
+        """Pad input IDs with image tokens."""
+        # Get special token IDs
+        im_start_id: int = image_inputs.im_start_id
+        im_end_id: int = image_inputs.im_end_id
+
+        media_token_pairs = [(im_start_id, im_end_id)]
+        pattern = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+        ids = pattern.pad_input_tokens(input_ids, image_inputs)
+        return ids
+
+    def prepare_attn_masks(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        mask_dtype: torch.dtype,
+        **kwargs,
+    ) -> Dict:
+        """Prepare attention masks for multimodal inputs."""
+        kwargs["has_images"] = True
+
+        # Distinguish sequences by position id 0
+        start_indices = (positions == 0).cpu().nonzero()
+        num_seqs = len(start_indices)
+        seq_lens = []
+
+        for i in range(num_seqs):
+            start_idx = start_indices[i].item()
+            if i < num_seqs - 1:
+                end_idx = start_indices[i + 1].item()
+            else:
+                end_idx = len(input_ids)
+            seq_lens.append(end_idx - start_idx)
+
+        kwargs["seq_lens"] = seq_lens
+
+        # Create attention masks
+        global_attn_masks = []
+        local_attn_masks = []
+        sliding_window = self.config.text_config.interleaved_sliding_window
+
+        start_idx = 0
+        for seq_len in seq_lens:
+            end_idx = start_idx + seq_len
+            input_token_ids = input_ids[start_idx:end_idx]
+            start_idx = end_idx
+
+            # Create global causal mask
+            global_attn_mask = torch.empty(
+                1,
+                1,
+                seq_len,
+                seq_len,
+                dtype=mask_dtype,
+                device=input_ids.device,
+            )
+            global_attn_mask.fill_(float("-inf"))
+            global_attn_mask = global_attn_mask.triu(diagonal=1)
+
+            # Consider bidirectional attention between image tokens
+            img_mask = torch.zeros_like(global_attn_mask)
+            img_pos = input_token_ids == self.config.image_token_index
+            img_mask[:, :, :, img_pos] += 1
+            img_mask[:, :, img_pos, :] += 1
+            global_attn_mask = torch.where(img_mask == 2, 0, global_attn_mask)
+            global_attn_masks.append(global_attn_mask)

+            # Create local causal mask with sliding window
+            local_attn_mask = torch.ones_like(global_attn_mask)
+            local_attn_mask = torch.tril(local_attn_mask, diagonal=-sliding_window)
+            local_attn_mask = torch.where(
+                local_attn_mask == 0, global_attn_mask, float("-inf")
+            )
+            local_attn_masks.append(local_attn_mask)
+
+        kwargs["global_attn_masks"] = global_attn_masks
+        kwargs["local_attn_masks"] = local_attn_masks
+        return kwargs
+
+    def get_input_embeddings(self) -> nn.Embedding:
+        return self.language_model.get_input_embeddings()
+
+    def get_image_feature(self, image_input: MultimodalInputs):
+        """
+        Projects the last hidden state from the vision model into language model space.
+
+        Args:
+            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
+                The tensors corresponding to the input images.
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
+        """
+        pixel_values = image_input.pixel_values
+        pixel_values = pixel_values.to("cuda")
+        pixel_values = pixel_values.to(dtype=self.language_model.dtype())
+
+        vision_outputs = self.vision_tower(pixel_values=pixel_values).last_hidden_state
+        image_features = self.multi_modal_projector(vision_outputs)
+        return image_features
+
+    def embed_mm_inputs(
+        self,
+        input_ids: torch.Tensor,
+        forward_batch: ForwardBatch,
+        image_input: MultimodalInputs,
+    ) -> torch.Tensor:
+        if input_ids is None:
+            raise ValueError("Unimplemented")
+        # boolean-masking image tokens
+        special_image_mask = torch.isin(
+            input_ids,
+            torch.tensor(image_input.pad_values, device=input_ids.device),
+        ).unsqueeze(-1)
+        num_image_tokens_in_input_ids = special_image_mask.sum()
+
+        inputs_embeds = None
+        if num_image_tokens_in_input_ids == 0:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+            return inputs_embeds
+        else:
+            # print(f"image tokens from input_ids: {inputs_embeds[special_image_mask].numel()}")
+            image_features = self.get_image_feature(image_input.pixel_values)
+
+            # print(f"image tokens from image embeddings: {image_features.numel()}")
+            num_image_tokens_in_embedding = (
+                image_features.shape[0] * image_features.shape[1]
+            )
+
+            if num_image_tokens_in_input_ids != num_image_tokens_in_embedding:
+                num_image = num_image_tokens_in_input_ids // image_features.shape[1]
+                image_features = image_features[:num_image, :]
+                logger.warning(
+                    f"Number of images does not match number of special image tokens in the input text. "
+                    f"Got {num_image_tokens_in_input_ids} image tokens in the text but {num_image_tokens_in_embedding} "
+                    "tokens from image embeddings."
+                )
+
+            # Important: clamp after extracting original image boundaries
+            input_ids.clamp_(min=0, max=self.vocab_size - 1)
+
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(
+                inputs_embeds.device
+            )
+
+            image_features = image_features.to(
+                inputs_embeds.device, inputs_embeds.dtype
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(
+                special_image_mask, image_features
+            )
+
+            return inputs_embeds
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        **kwargs: object,
+    ) -> LogitsProcessor:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
+
+        logits_to_keep (`int` or `torch.Tensor`, *optional*):
+            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+            This is useful when using packed tensor format (single dimension for batch and sequence length).
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
+
+        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/Gemma3-test-224px-hf")
+        >>> processor = AutoProcessor.from_pretrained("google/Gemma3-test-224px-hf")
+
+        >>> prompt = "answer en Where is the cow standing?"
+        >>> url = "https://huggingface.co/gv-hf/Gemma3-test-224px-hf/resolve/main/cow_beach_1.png"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_length=30)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "answer en Where is the cow standing?\nbeach"
+        ```"""
+
+        # Important: position_ids in Gemma3 are 1-indexed
+        # This really does cost me sometime
+        positions += 1
+
+        # Replace image id with PAD if the image token if OOV, to avoid index-errors
+        if input_ids is not None and self.config.image_token_index >= self.vocab_size:
+            special_image_mask = input_ids == self.config.image_token_index
+            llm_input_ids = input_ids.clone()
+            llm_input_ids[special_image_mask] = 0
+        else:
+            llm_input_ids = input_ids
+
+        inputs_embeds = general_mm_embed_routine(
+            input_ids=llm_input_ids,
+            forward_batch=forward_batch,
+            embed_tokens=self.get_input_embeddings(),
+            mm_data_embedding_func=self.get_image_feature,
+        )
+
+        outputs = self.language_model(
+            input_ids=None,
+            positions=positions,
+            forward_batch=forward_batch,
+            input_embeds=inputs_embeds,
+            **kwargs,
+        )
+
+        return outputs
+
+    def tie_weights(self):
+        return self.language_model.tie_weights()
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        """Load weights for the model."""
+        params_dict = dict(self.named_parameters())
+        loaded_params: Set[str] = set()
+
+        for name, loaded_weight in weights:
+            if "language_model" in name:
+                # Gemma3ForCausalLM.load_weights(self, [(name.replace("language_model.", ""), loaded_weight)])
+                causal_loaded_params = Gemma3ForCausalLM.load_weights(
+                    self, [(name, loaded_weight)]
+                )
+                loaded_params.update(causal_loaded_params)
+                continue
+            else:
+                # Skip lm_head.weight as it's tied with embed_tokens
+                if "lm_head.weight" in name:
+                    continue
+
+                # Skip loading extra bias for GPTQ models
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+
+                # Remapping the name of FP8 kv-scale
+                name = maybe_remap_kv_scale_name(name, params_dict)
+                if name is None:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+                loaded_params.add(name)
+        unloaded_params = params_dict.keys() - loaded_params
+        if unloaded_params:
+            pass
+            # raise RuntimeError(
+            #     f"Some weights are not initialized from checkpoints: {unloaded_params}")
+        return loaded_params
+
+
+EntryClass = Gemma3ForConditionalGeneration
+
+AutoModel.register(Gemma3Config, Gemma3ForConditionalGeneration, exist_ok=True)
sglang/srt/models/llama.py
@@ -17,7 +17,7 @@
 """Inference-only LLaMA model compatible with HuggingFace weights."""
 
 import logging
-from typing import Any, Dict, Iterable, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 import torch
 from torch import nn
@@ -129,6 +129,8 @@ class LlamaAttention(nn.Module):
         self.head_dim = getattr(
             config, "head_dim", self.hidden_size // self.total_num_heads
         )
+        partial_rotary_factor = getattr(config, "partial_rotary_factor", 1)
+        self.rotary_dim = int(partial_rotary_factor * self.head_dim)
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
@@ -154,7 +156,7 @@ class LlamaAttention(nn.Module):
 
         self.rotary_emb = get_rope(
             self.head_dim,
-            rotary_dim=self.head_dim,
+            rotary_dim=self.rotary_dim,
             max_position=max_position_embeddings,
             base=rope_theta,
             rope_scaling=rope_scaling,
@@ -285,6 +287,7 @@ class LlamaModel(nn.Module):
         )
 
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.layers_to_capture = []
 
     def forward(
         self,
@@ -292,13 +295,16 @@
         positions: torch.Tensor,
         forward_batch: ForwardBatch,
         input_embeds: torch.Tensor = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor]]]:
         if input_embeds is None:
            hidden_states = self.embed_tokens(input_ids)
         else:
            hidden_states = input_embeds
         residual = None
+        aux_hidden_states = []
         for i in range(len(self.layers)):
+            if i in self.layers_to_capture:
+                aux_hidden_states.append(hidden_states + residual)
            layer = self.layers[i]
            hidden_states, residual = layer(
                positions,
@@ -307,7 +313,11 @@
                residual,
            )
         hidden_states, _ = self.norm(hidden_states, residual)
-        return hidden_states
+
+        if len(aux_hidden_states) == 0:
+            return hidden_states
+
+        return hidden_states, aux_hidden_states
 
     # If this function is called, it should always initialize KV cache scale
     # factors (or else raise an exception). Thus, handled exceptions should
@@ -335,7 +345,6 @@ class LlamaModel(nn.Module):
 
 
 class LlamaForCausalLM(nn.Module):
-
     # BitandBytes specific attributes
     default_bitsandbytes_target_modules = [
         ".gate_proj.",
@@ -391,6 +400,8 @@ class LlamaForCausalLM(nn.Module):
            (".gate_up_proj", ".up_proj", 1),
         ]
 
+        self.capture_aux_hidden_states = False
+
     @torch.no_grad()
     def forward(
         self,
@@ -400,10 +411,19 @@
         input_embeds: torch.Tensor = None,
         get_embedding: bool = False,
     ) -> LogitsProcessorOutput:
-        hidden_states = self.model(input_ids, positions, forward_batch, input_embeds)
+        aux_hidden_states = None
+        if self.capture_aux_hidden_states:
+            hidden_states, aux_hidden_states = self.model(
+                input_ids, positions, forward_batch, input_embeds
+            )
+        else:
+            hidden_states = self.model(
+                input_ids, positions, forward_batch, input_embeds
+            )
+
         if not get_embedding:
            return self.logits_processor(
-                input_ids, hidden_states, self.lm_head, forward_batch
+                input_ids, hidden_states, self.lm_head, forward_batch, aux_hidden_states
            )
         else:
            return self.pooler(hidden_states, forward_batch)
@@ -586,9 +606,29 @@ class LlamaForCausalLM(nn.Module):
         torch.cuda.empty_cache()
         torch.cuda.synchronize()
 
+    def get_embed(self):
+        return self.model.embed_tokens.weight
+
+    def set_embed(self, embed):
+        # NOTE: If draft hidden size != target hidden size, the embed weight cannot be shared for EAGLE3
+        if (
+            hasattr(self.config, "target_hidden_size")
+            and self.config.target_hidden_size != self.config.hidden_size
+        ):
+            return
+        del self.model.embed_tokens.weight
+        self.model.embed_tokens.weight = embed
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
     def load_kv_cache_scales(self, quantization_param_path: str) -> None:
         self.model.load_kv_cache_scales(quantization_param_path)
 
+    def set_eagle3_layers_to_capture(self):
+        self.capture_aux_hidden_states = True
+        num_layers = self.config.num_hidden_layers
+        self.model.layers_to_capture = [2, num_layers // 2, num_layers - 3]
+
 
 class Phi3ForCausalLM(LlamaForCausalLM):
     pass
sglang/srt/models/llama_eagle.py
@@ -134,6 +134,7 @@ class LlamaForCausalLMEagle(LlamaForCausalLM):
         )
 
         self.logits_processor = LogitsProcessor(config)
+        self.capture_aux_hidden_states = False
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         for name, loaded_weight in weights: