sglang 0.3.5.post2__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -2
- sglang/api.py +2 -2
- sglang/bench_latency.py +1 -553
- sglang/bench_offline_throughput.py +48 -20
- sglang/bench_one_batch.py +472 -0
- sglang/{bench_server_latency.py → bench_one_batch_server.py} +3 -3
- sglang/bench_serving.py +125 -6
- sglang/check_env.py +3 -6
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +2 -2
- sglang/srt/configs/model_config.py +13 -14
- sglang/srt/constrained/__init__.py +13 -14
- sglang/srt/constrained/base_grammar_backend.py +13 -15
- sglang/srt/constrained/outlines_backend.py +28 -17
- sglang/srt/constrained/outlines_jump_forward.py +13 -15
- sglang/srt/constrained/xgrammar_backend.py +47 -58
- sglang/srt/conversation.py +13 -15
- sglang/srt/hf_transformers_utils.py +13 -15
- sglang/srt/layers/activation.py +16 -13
- sglang/srt/layers/attention/flashinfer_backend.py +106 -54
- sglang/srt/layers/attention/triton_backend.py +9 -7
- sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
- sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
- sglang/srt/layers/custom_op_util.py +25 -0
- sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
- sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +11 -4
- sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
- sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
- sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
- sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
- sglang/srt/layers/fused_moe_triton/layer.py +633 -0
- sglang/srt/layers/layernorm.py +17 -15
- sglang/srt/layers/logits_processor.py +23 -25
- sglang/srt/layers/quantization/__init__.py +77 -17
- sglang/srt/layers/radix_attention.py +13 -15
- sglang/srt/layers/rotary_embedding.py +13 -13
- sglang/srt/layers/sampler.py +4 -8
- sglang/srt/layers/torchao_utils.py +2 -0
- sglang/srt/lora/lora.py +13 -14
- sglang/srt/lora/lora_config.py +13 -14
- sglang/srt/lora/lora_manager.py +22 -24
- sglang/srt/managers/data_parallel_controller.py +98 -27
- sglang/srt/managers/detokenizer_manager.py +13 -15
- sglang/srt/managers/io_struct.py +63 -21
- sglang/srt/managers/schedule_batch.py +154 -59
- sglang/srt/managers/schedule_policy.py +18 -16
- sglang/srt/managers/scheduler.py +278 -109
- sglang/srt/managers/session_controller.py +61 -0
- sglang/srt/managers/tokenizer_manager.py +63 -18
- sglang/srt/managers/tp_worker.py +25 -16
- sglang/srt/managers/tp_worker_overlap_thread.py +62 -67
- sglang/srt/metrics/collector.py +13 -15
- sglang/srt/metrics/func_timer.py +13 -15
- sglang/srt/mm_utils.py +13 -14
- sglang/srt/model_executor/cuda_graph_runner.py +63 -25
- sglang/srt/model_executor/forward_batch_info.py +128 -32
- sglang/srt/model_executor/model_runner.py +132 -64
- sglang/srt/model_parallel.py +98 -0
- sglang/srt/models/chatglm.py +15 -16
- sglang/srt/models/commandr.py +15 -16
- sglang/srt/models/dbrx.py +15 -16
- sglang/srt/models/deepseek.py +15 -15
- sglang/srt/models/deepseek_v2.py +162 -59
- sglang/srt/models/exaone.py +14 -15
- sglang/srt/models/gemma.py +14 -14
- sglang/srt/models/gemma2.py +31 -25
- sglang/srt/models/gemma2_reward.py +13 -14
- sglang/srt/models/gpt_bigcode.py +14 -14
- sglang/srt/models/grok.py +15 -15
- sglang/srt/models/internlm2.py +13 -15
- sglang/srt/models/internlm2_reward.py +13 -14
- sglang/srt/models/llama.py +21 -21
- sglang/srt/models/llama_classification.py +13 -14
- sglang/srt/models/llama_reward.py +13 -14
- sglang/srt/models/llava.py +14 -16
- sglang/srt/models/llavavid.py +14 -16
- sglang/srt/models/minicpm.py +13 -15
- sglang/srt/models/minicpm3.py +13 -15
- sglang/srt/models/mistral.py +13 -15
- sglang/srt/models/mixtral.py +15 -15
- sglang/srt/models/mixtral_quant.py +14 -14
- sglang/srt/models/olmo.py +22 -20
- sglang/srt/models/olmoe.py +23 -20
- sglang/srt/models/phi3_small.py +447 -0
- sglang/srt/models/qwen.py +14 -14
- sglang/srt/models/qwen2.py +22 -19
- sglang/srt/models/qwen2_moe.py +17 -18
- sglang/srt/models/qwen2_vl.py +13 -6
- sglang/srt/models/stablelm.py +18 -16
- sglang/srt/models/torch_native_llama.py +107 -93
- sglang/srt/models/xverse.py +13 -14
- sglang/srt/models/xverse_moe.py +15 -16
- sglang/srt/models/yivl.py +13 -15
- sglang/srt/openai_api/adapter.py +19 -17
- sglang/srt/openai_api/protocol.py +14 -16
- sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
- sglang/srt/sampling/sampling_batch_info.py +61 -57
- sglang/srt/sampling/sampling_params.py +14 -16
- sglang/srt/server.py +86 -35
- sglang/srt/server_args.py +96 -80
- sglang/srt/utils.py +266 -68
- sglang/test/few_shot_gsm8k.py +8 -4
- sglang/test/runners.py +38 -20
- sglang/test/srt/sampling/penaltylib/utils.py +23 -21
- sglang/test/test_utils.py +31 -20
- sglang/version.py +1 -1
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +66 -57
- sglang-0.3.6.post1.dist-info/RECORD +164 -0
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +1 -1
- sglang/srt/layers/fused_moe/__init__.py +0 -1
- sglang-0.3.5.post2.dist-info/RECORD +0 -156
- {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0
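One structural change worth calling out in the list above: the old sglang/srt/layers/fused_moe package is split into fused_moe_grok (kept for Grok) and a new in-tree fused_moe_triton package, and model code switches its FusedMoE import from vLLM to the new module, as the qwen2_moe.py hunk further down confirms. A minimal sketch of the migration (only the two import paths are taken from this diff; everything else in a caller stays unchanged):

# 0.3.5.post2 -- FusedMoE came from vLLM:
# from vllm.model_executor.layers.fused_moe import FusedMoE

# 0.3.6.post1 -- FusedMoE now lives in sglang itself:
from sglang.srt.layers.fused_moe_triton import FusedMoE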
sglang/srt/models/phi3_small.py
ADDED
@@ -0,0 +1,447 @@
+import math
+from typing import Iterable, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers import Phi3Config
+from transformers.configuration_utils import PretrainedConfig
+from vllm.distributed import get_tensor_model_parallel_world_size
+from vllm.model_executor.layers.rotary_embedding import get_rope
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.utils import make_layers
+
+from sglang.srt.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    RowParallelLinear,
+)
+from sglang.srt.layers.logits_processor import LogitsProcessor, LogitsProcessorOutput
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.torchao_utils import apply_torchao_config_
+from sglang.srt.layers.vocab_parallel_embedding import (
+    DEFAULT_VOCAB_PADDING_SIZE,
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.managers.schedule_batch import global_server_args_dict
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+
+
+@torch.jit.script
+def quick_gelu(x):
+    return x * torch.sigmoid(1.702 * x)
+
+
+@torch.jit.script
+def gegelu(input, limit: Optional[float] = None):
+    a_gelu, a_linear = input[..., ::2], input[..., 1::2]
+    if limit is not None:
+        a_gelu = torch.where(
+            torch.isinf(a_gelu), a_gelu, a_gelu.clamp(min=None, max=limit)
+        )
+        a_linear = torch.where(
+            torch.isinf(a_linear),
+            a_linear,
+            a_linear.clamp(min=-limit, max=limit),
+        )
+    out_gelu = quick_gelu(a_gelu)
+    return out_gelu * (a_linear + 1)
+
+
+class Phi3SmallMLP(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.config = config
+        assert (
+            self.config.hidden_act == "gegelu"
+        ), "Only `gegelu` is supported for the 4.7 series of models .."
+        self.hidden_size = config.hidden_size
+        self.gegelu_limit = config.gegelu_limit
+        self.intermediate_size = config.intermediate_size
+
+        self.up_proj = MergedColumnParallelLinear(
+            self.hidden_size,
+            2 * [self.intermediate_size],
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.up_proj",
+        )
+        self.down_proj = RowParallelLinear(
+            self.intermediate_size,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+        )
+
+    def forward(self, x):
+        gate_up, _ = self.up_proj(x)
+        x = gegelu(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class Phi3SmallSelfAttention(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__()
+        self.layer_id = layer_id
+        self.config = config
+        self.sparse_block_size = config.blocksparse_block_size
+        self.homo_heads = config.blocksparse_homo_head_pattern
+        self.local_blocks = config.blocksparse_num_local_blocks
+        self.vert_stride = config.blocksparse_vert_stride
+
+        assert (
+            config.blocksparse_block_size == config.blocksparse_triton_kernel_block_size
+        )
+
+        self.hidden_size = config.hidden_size
+        # Number of Query Heads
+        self.num_heads = config.num_attention_heads
+
+        self.head_dim = self.hidden_size // self.num_heads
+        self.tp_size = get_tensor_model_parallel_world_size()
+        # Number of total Key Value Heads before tensor parallel
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_q_per_kv = self.num_heads // self.num_key_value_heads
+        if self.tp_size > 1:
+            assert self.num_key_value_heads % self.tp_size == 0
+        self.num_kv_heads_per_partion = max(1, self.num_key_value_heads // self.tp_size)
+        self.num_heads_per_partition = self.num_heads // self.tp_size
+
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_embedding_base = config.rope_embedding_base
+        self.rope_position_scale = config.rope_position_scale
+        self.is_causal = True
+
+        norm_factor = None
+        if config.mup_use_scaling:
+            norm_factor = self.head_dim / config.mup_attn_multiplier
+        else:
+            norm_factor = math.sqrt(self.head_dim)
+        self.scale = 1 / norm_factor
+
+        self.query_key_value = QKVParallelLinear(
+            self.hidden_size,
+            self.head_dim,
+            self.num_heads,
+            self.num_key_value_heads,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.qkv_proj",
+        )
+
+        self.dense = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            quant_config=quant_config,
+            prefix=f"{prefix}.o_proj",
+        )
+
+        if getattr(self.config, "rope_scaling", None) is not None:
+            rope_scaling = self.config.rope_scaling
+            for key in rope_scaling:
+                if isinstance(rope_scaling[key], list):
+                    rope_scaling[key] = tuple(rope_scaling[key])
+
+            if "factor" not in rope_scaling:
+                rope_scaling["factor"] = self.rope_position_scale
+        else:
+            rope_scaling = {
+                "rope_type": "linear",
+                "factor": self.rope_position_scale,
+            }
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=self.max_position_embeddings,
+            base=self.rope_embedding_base,
+            rope_scaling=rope_scaling,
+        )
+
+        # blocksparse params
+        self.blocksparse_block_size = config.blocksparse_block_size
+        self.blocksparse_num_local_blocks = config.blocksparse_num_local_blocks
+        self.blocksparse_vert_stride = config.blocksparse_vert_stride
+
+        use_dense_attn = (
+            getattr(self.config, "dense_attention_every_n_layers", None)
+            and (self.layer_id + 1) % self.config.dense_attention_every_n_layers == 0
+        )
+
+        bs_params = None
+        if not use_dense_attn:
+            bs_params = {
+                "max_seqlen": self.max_position_embeddings,
+                "num_heads": self.num_heads_per_partition,
+                "num_kv_heads": self.num_kv_heads_per_partion,
+                "block_size": self.sparse_block_size,
+                "local_blocks": self.local_blocks,
+                "vert_stride": self.vert_stride,
+                "homo_head": self.homo_heads,
+            }
+
+        self.attn = RadixAttention(
+            self.num_heads_per_partition,
+            self.head_dim,
+            self.scale,
+            num_kv_heads=self.num_kv_heads_per_partion,
+            layer_id=layer_id,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        qkv, _ = self.query_key_value(hidden_states)
+
+        qkv = qkv.view(qkv.shape[:-1] + (-1, (self.num_q_per_kv + 2), self.head_dim))
+        q, k, v = qkv.split([self.num_q_per_kv, 1, 1], dim=-2)
+
+        # NOTE: this is required by RotaryEmbed, which indeed does not have to
+        # TODO: allow 3D QK for rotary forward
+        q = q.reshape(-1, self.head_dim * self.num_heads_per_partition)
+        k = k.reshape(-1, self.head_dim * self.num_kv_heads_per_partion)
+        v = v.reshape(-1, self.head_dim * self.num_kv_heads_per_partion)
+
+        q, k = self.rotary_emb(positions, q, k)
+        attn_output = self.attn(q, k, v, forward_batch=forward_batch)
+        output, _ = self.dense(attn_output)
+
+        return output
+
+
+class Phi3SmallDecoderLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        layer_id: int,
+        cache_config=None,
+        quant_config: Optional[QuantizationConfig] = None,
+    ):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Phi3SmallSelfAttention(
+            config, layer_id, quant_config=quant_config
+        )
+        self.mlp = Phi3SmallMLP(config, quant_config)
+
+        self.input_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_epsilon
+        )
+        self.post_attention_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_epsilon
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class Phi3SmallModel(nn.Module):
+
+    def __init__(
+        self,
+        config: Phi3Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+        cache_config = None
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        self.mup_embedding_multiplier = config.mup_embedding_multiplier
+        self.start_layer, self.end_layer, self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda prefix: Phi3SmallDecoderLayer(
+                config, int(prefix.split(".")[-1]), cache_config, quant_config
+            ),
+            prefix=f"{prefix}.layers",
+        )
+
+        self.final_layernorm = nn.LayerNorm(
+            config.hidden_size, eps=config.layer_norm_epsilon
+        )
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.embed_tokens(input_ids)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: Optional[torch.LongTensor],
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor],
+    ) -> Union[torch.Tensor]:
+
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.get_input_embeddings(input_ids)
+            if (
+                self.mup_embedding_multiplier is not None
+                and self.mup_embedding_multiplier > 0.0
+            ):
+                hidden_states = hidden_states * self.mup_embedding_multiplier
+
+        for i in range(self.start_layer, self.end_layer):
+            layer = self.layers[i]
+            hidden_states = layer(positions, hidden_states, forward_batch=forward_batch)
+
+        hidden_states = self.final_layernorm(hidden_states)
+        return hidden_states
+
+
+class Phi3SmallForCausalLM(nn.Module):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(
+        self,
+        config: Phi3Config,
+        quant_config: Optional[QuantizationConfig] = None,
+        cache_config=None,
+    ):
+
+        super().__init__()
+
+        self.config = config
+        self.quant_config = quant_config
+        self.model = Phi3SmallModel(
+            config=config,
+            quant_config=quant_config,
+            prefix="model",
+        )
+        self.torchao_config = global_server_args_dict["torchao_config"]
+        self.vocab_size = config.vocab_size
+        self.mup_width_multiplier = config.mup_width_multiplier
+        self.lm_head = ParallelLMHead(
+            self.vocab_size,
+            config.hidden_size,
+            org_num_embeddings=config.vocab_size,
+            padding_size=DEFAULT_VOCAB_PADDING_SIZE,
+            quant_config=quant_config,
+        )
+        if self.config.tie_word_embeddings:
+            self.lm_head.weight = self.model.embed_tokens.weight
+        self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
+
+        # tokens in tiktoken but not used
+        if hasattr(config, "dummy_token_indices"):
+            device = self.lm_head.weight.device
+            self.register_buffer(
+                "dummy_token_indices",
+                torch.LongTensor(config.dummy_token_indices).to(device),
+                persistent=False,
+            )
+        else:
+            self.dummy_token_indices = None
+
+    def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
+        return self.model.get_input_embeddings(input_ids)
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, value):
+        self.lm_head = value
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata)
+        if self.dummy_token_indices is not None and logits is not None:
+            logits.index_fill_(-1, self.dummy_token_indices, -torch.inf)
+        return logits
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: Optional[torch.LongTensor],
+        forward_batch: ForwardBatch,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        get_embedding: bool = False,
+    ) -> LogitsProcessorOutput:
+        hidden_states = self.model(
+            input_ids=input_ids,
+            positions=positions,
+            forward_batch=forward_batch,
+            inputs_embeds=inputs_embeds,
+        )
+
+        if not get_embedding:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head.weight, forward_batch
+            )
+
+        else:
+            return self.pooler(hidden_states, forward_batch)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            if name.endswith(".bias") and name not in params_dict:
+                continue
+
+            param = params_dict[name]
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+        apply_torchao_config_(self, params_dict, set(["proj.weight"]))
+
+
+EntryClass = Phi3SmallForCausalLM
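For reference, the gegelu activation defined in the new file above consumes the merged up_proj output with gelu and linear channels interleaved along the last dimension, so the output has half as many channels as the input. A small standalone check of that behavior, written in plain PyTorch with no sglang dependencies (a simplified copy of the code above, not the shipped kernel):

import torch

def quick_gelu(x):
    return x * torch.sigmoid(1.702 * x)

def gegelu(x, limit=None):
    # Even channels go through the gelu branch, odd channels stay linear,
    # mirroring the interleaved layout produced by MergedColumnParallelLinear.
    a_gelu, a_linear = x[..., ::2], x[..., 1::2]
    if limit is not None:
        a_gelu = torch.where(torch.isinf(a_gelu), a_gelu, a_gelu.clamp(max=limit))
        a_linear = torch.where(
            torch.isinf(a_linear), a_linear, a_linear.clamp(min=-limit, max=limit)
        )
    return quick_gelu(a_gelu) * (a_linear + 1)

x = torch.randn(2, 8)          # last dim = 2 * intermediate_size
y = gegelu(x, limit=10.0)
assert y.shape == (2, 4)       # output has half the channels of the input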
sglang/srt/models/qwen.py
CHANGED
@@ -1,20 +1,20 @@
-[original lines 1-13 not rendered in this diff view]
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/c7f2cf2b7f67bce5842fedfdba508440fe257375/vllm/model_executor/models/qwen.py#L1
+
 from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
sglang/srt/models/qwen2.py
CHANGED
@@ -1,21 +1,21 @@
-[original lines 1-13 not rendered in this diff view]
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 # Adapted from llama2.py
 # Modify details for the adaptation of Qwen2 model.
 """Inference-only Qwen2 model compatible with HuggingFace weights."""
+
 from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
@@ -40,6 +40,7 @@ from sglang.srt.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.utils import make_layers
 
 Qwen2Config = None
 
@@ -230,11 +231,13 @@ class Qwen2Model(nn.Module):
             config.vocab_size,
             config.hidden_size,
         )
-        self.layers =
-[original lines 234-237 not rendered in this diff view]
+        self.layers = make_layers(
+            config.num_hidden_layers,
+            lambda idx, prefix: Qwen2DecoderLayer(
+                layer_id=idx,
+                config=config,
+                quant_config=quant_config,
+            ),
         )
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
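The qwen2.py hunk above replaces the hand-built layer list with make_layers from sglang.srt.utils, which takes the layer count and a per-layer factory receiving the layer index and a name prefix. The helper's actual implementation is not part of this diff; the following is only a rough sketch of what such a factory-driven builder typically does (the name make_layers_sketch and its body are assumptions for illustration):

from typing import Callable
import torch.nn as nn

def make_layers_sketch(
    num_layers: int, layer_fn: Callable[[int, str], nn.Module]
) -> nn.ModuleList:
    # Build one decoder layer per index, passing a dotted prefix such as
    # "model.layers.3" so each layer can name its own sub-modules.
    return nn.ModuleList(
        [layer_fn(idx, f"model.layers.{idx}") for idx in range(num_layers)]
    )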
sglang/srt/models/qwen2_moe.py
CHANGED
@@ -1,23 +1,22 @@
-[original lines 1-15 not rendered in this diff view]
-# coding=utf-8
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
 # Adapted from
 # https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/qwen2_moe.py
 """Inference-only Qwen2MoE model compatible with HuggingFace weights."""
-[original line 20 not rendered in this diff view]
+
+from typing import Any, Dict, Iterable, Optional, Tuple
 
 import torch
 import torch.nn.functional as F
@@ -27,11 +26,11 @@ from vllm.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
-from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.activation import SiluAndMul
+from sglang.srt.layers.fused_moe_triton import FusedMoE
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
     MergedColumnParallelLinear,
sglang/srt/models/qwen2_vl.py
CHANGED
@@ -44,6 +44,7 @@ from sglang.srt.layers.attention.triton_ops.prefill_attention import (
 )
 from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear
 from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead
 from sglang.srt.managers.schedule_batch import ImageInputs
@@ -559,6 +560,7 @@ class Qwen2VLForConditionalGeneration(nn.Module):
         )
 
         self.logits_processor = LogitsProcessor(config)
+        self.pooler = Pooler(pooling_type=PoolingType.LAST, normalize=True)
 
     def _process_image_input(self, image_input: Qwen2VLImageInputs) -> torch.Tensor:
         pixel_values = image_input["pixel_values"].type(self.visual.dtype)
@@ -577,6 +579,7 @@ class Qwen2VLForConditionalGeneration(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         forward_batch: ForwardBatch,
+        get_embedding: bool = False,
     ):
         """Run forward pass for Qwen2-VL.
 
@@ -599,8 +602,8 @@
         image_inputs = [
             img for img in forward_batch.image_inputs if img is not None
         ]
-[original lines 602-603 not rendered in this diff view]
+        if getattr(self.config, "rope_scaling", {}).get("type", None) == "mrope":
+            positions = forward_batch.mrope_positions
         if (
             forward_batch.forward_mode.is_decode()
             or image_inputs is None
@@ -616,7 +619,7 @@
 
             inputs_embeds = self.model.embed_tokens(input_ids)
             extend_start_loc_cpu = forward_batch.extend_start_loc.cpu().numpy()
-            prefix_lens_cpu = forward_batch.
+            prefix_lens_cpu = forward_batch.extend_prefix_lens_cpu
             for i, image in enumerate(forward_batch.image_inputs):
                 if image is None:
                     continue
@@ -655,9 +658,13 @@
             forward_batch=forward_batch,
             input_embeds=inputs_embeds,
         )
-[original lines 658-660 not rendered in this diff view]
+
+        if not get_embedding:
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head.weight, forward_batch
+            )
+        else:
+            return self.pooler(hidden_states, forward_batch)
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [
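Across the new and updated models in this release (phi3_small.py above, qwen2_vl.py here), forward gains a get_embedding flag that routes hidden states through Pooler(pooling_type=PoolingType.LAST, normalize=True) instead of the logits processor. The real Pooler takes a ForwardBatch and handles the packed batch layout; the snippet below is only an assumption-level illustration of what last-token pooling with normalization amounts to, not the sglang implementation:

import torch
import torch.nn.functional as F

def last_token_pool(hidden_states: torch.Tensor, seq_lens: torch.Tensor) -> torch.Tensor:
    """Pick the hidden state of each sequence's last token and L2-normalize it.

    hidden_states: (total_tokens, hidden_size) for a packed batch
    seq_lens:      (batch,) length of each sequence in the packed batch
    """
    last_indices = torch.cumsum(seq_lens, dim=0) - 1   # index of each last token
    pooled = hidden_states[last_indices]               # (batch, hidden_size)
    return F.normalize(pooled, p=2, dim=-1)            # normalize=True behavior

hidden = torch.randn(7, 16)        # e.g. two sequences of lengths 3 and 4, packed
lens = torch.tensor([3, 4])
emb = last_token_pool(hidden, lens)
print(emb.shape)                   # torch.Size([2, 16])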