sglang 0.3.4__py3-none-any.whl → 0.3.4.post2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the supported public registries. It is provided for informational purposes only.
- sglang/bench_latency.py +2 -1
- sglang/lang/chat_template.py +17 -0
- sglang/launch_server_llavavid.py +1 -1
- sglang/srt/configs/__init__.py +3 -0
- sglang/srt/configs/model_config.py +27 -2
- sglang/srt/configs/qwen2vl.py +133 -0
- sglang/srt/constrained/fsm_cache.py +10 -3
- sglang/srt/conversation.py +27 -0
- sglang/srt/hf_transformers_utils.py +16 -1
- sglang/srt/layers/attention/__init__.py +16 -5
- sglang/srt/layers/attention/double_sparsity_backend.py +22 -6
- sglang/srt/layers/attention/flashinfer_backend.py +174 -54
- sglang/srt/layers/attention/triton_backend.py +22 -6
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +26 -4
- sglang/srt/layers/linear.py +89 -63
- sglang/srt/layers/logits_processor.py +5 -5
- sglang/srt/layers/rotary_embedding.py +112 -0
- sglang/srt/layers/sampler.py +51 -39
- sglang/srt/lora/lora.py +3 -1
- sglang/srt/managers/data_parallel_controller.py +1 -1
- sglang/srt/managers/detokenizer_manager.py +4 -0
- sglang/srt/managers/image_processor.py +186 -13
- sglang/srt/managers/io_struct.py +10 -0
- sglang/srt/managers/schedule_batch.py +238 -68
- sglang/srt/managers/scheduler.py +69 -50
- sglang/srt/managers/tokenizer_manager.py +24 -4
- sglang/srt/managers/tp_worker.py +26 -111
- sglang/srt/managers/tp_worker_overlap_thread.py +209 -0
- sglang/srt/mem_cache/memory_pool.py +56 -10
- sglang/srt/mem_cache/radix_cache.py +4 -3
- sglang/srt/model_executor/cuda_graph_runner.py +87 -28
- sglang/srt/model_executor/forward_batch_info.py +83 -3
- sglang/srt/model_executor/model_runner.py +32 -11
- sglang/srt/models/chatglm.py +3 -3
- sglang/srt/models/deepseek_v2.py +2 -2
- sglang/srt/models/mllama.py +1004 -0
- sglang/srt/models/qwen2_vl.py +724 -0
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
- sglang/srt/sampling/sampling_batch_info.py +13 -3
- sglang/srt/sampling/sampling_params.py +5 -7
- sglang/srt/server.py +12 -0
- sglang/srt/server_args.py +10 -0
- sglang/srt/utils.py +22 -0
- sglang/test/run_eval.py +2 -0
- sglang/test/runners.py +20 -1
- sglang/test/srt/sampling/penaltylib/utils.py +1 -0
- sglang/test/test_utils.py +100 -3
- sglang/version.py +1 -1
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/METADATA +17 -18
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/RECORD +53 -48
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/LICENSE +0 -0
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/WHEEL +0 -0
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/top_level.txt +0 -0
sglang/bench_latency.py
CHANGED
@@ -227,8 +227,9 @@ def extend(reqs, model_runner):
         req_to_token_pool=model_runner.req_to_token_pool,
         token_to_kv_pool=model_runner.token_to_kv_pool,
         tree_cache=None,
+        model_config=model_runner.model_config,
     )
-    batch.prepare_for_extend(
+    batch.prepare_for_extend()
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
sglang/lang/chat_template.py
CHANGED
@@ -133,6 +133,22 @@ register_chat_template(
     )
 )
 
+# Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
+register_chat_template(
+    ChatTemplate(
+        name="qwen2-vl",
+        default_system_prompt="You are a helpful assistant.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=("<|im_end|>"),
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
+
 
 register_chat_template(
     ChatTemplate(
@@ -213,6 +229,7 @@ register_chat_template(
             ),
         },
         stop_str=("<|eot_id|>",),
+        image_token="<|image|>",
     )
 )
 
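For context only (not part of the diff): a minimal sketch of how a prompt could be assembled from the newly registered "qwen2-vl" template, using just the fields shown above; it assumes the existing get_chat_template lookup helper in sglang.lang.chat_template.

from sglang.lang.chat_template import get_chat_template

tmpl = get_chat_template("qwen2-vl")
sys_pre, sys_suf = tmpl.role_prefix_and_suffix["system"]
usr_pre, usr_suf = tmpl.role_prefix_and_suffix["user"]
ast_pre, _ = tmpl.role_prefix_and_suffix["assistant"]

# Compose: system prompt, then a user turn carrying one image placeholder,
# then the assistant prefix that the model continues from.
prompt = (
    sys_pre + tmpl.default_system_prompt + sys_suf
    + usr_pre + tmpl.image_token + "Describe this image." + usr_suf
    + ast_pre
)
print(prompt)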
sglang/launch_server_llavavid.py
CHANGED
@@ -14,7 +14,7 @@ if __name__ == "__main__":
     model_override_args["num_frames"] = 16
     model_override_args["model_type"] = "llavavid"
     if model_override_args["num_frames"] == 32:
-        model_override_args["rope_scaling"] = {"factor": 2.0, "
+        model_override_args["rope_scaling"] = {"factor": 2.0, "rope_type": "linear"}
         model_override_args["max_sequence_length"] = 4096 * 2
         model_override_args["tokenizer_model_max_length"] = 4096 * 2
         model_override_args["model_max_length"] = 4096 * 2
sglang/srt/configs/__init__.py
CHANGED
sglang/srt/configs/model_config.py
CHANGED
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+import logging
+import os
 from enum import IntEnum, auto
 from typing import Optional
 
@@ -20,6 +22,8 @@ from transformers import PretrainedConfig
 
 from sglang.srt.hf_transformers_utils import get_config, get_context_length
 
+logger = logging.getLogger(__name__)
+
 
 class AttentionArch(IntEnum):
     MLA = auto()
@@ -46,10 +50,29 @@ class ModelConfig:
             model_override_args=model_override_args,
         )
         self.hf_text_config = get_hf_text_config(self.hf_config)
+        derived_context_len = get_context_length(self.hf_text_config)
+        allow_long_context = os.environ.get(
+            "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
+        )
+
         if context_length is not None:
-
+            if context_length > derived_context_len:
+                if allow_long_context:
+                    logger.warning(
+                        f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
+                        f"This may lead to incorrect model outputs or CUDA errors."
+                    )
+                    self.context_len = context_length
+                else:
+                    raise ValueError(
+                        f"User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
+                        f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config. "
+                        f"To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
+                    )
+            else:
+                self.context_len = context_length
         else:
-            self.context_len =
+            self.context_len = derived_context_len
 
         # Unify the config keys for hf_text_config
         self.head_dim = getattr(
@@ -89,6 +112,8 @@ class ModelConfig:
         self.num_hidden_layers = self.hf_text_config.num_hidden_layers
         self.vocab_size = self.hf_text_config.vocab_size
 
+        self.is_encoder_decoder = self.hf_config.model_type in ["mllama"]
+
     # adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py#L289
     def get_total_num_kv_heads(self) -> int:
         """Returns the total number of KV heads."""
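For context only (not part of the diff): with the change above, a user-specified context length larger than the one derived from the HF config is rejected unless an escape-hatch env var is set. A minimal sketch, assuming ModelConfig accepts the model path as its first argument; the model name and length below are hypothetical.

import os

# Without this env var, the oversized request below raises ValueError;
# with it, ModelConfig only logs a warning and keeps the requested length.
os.environ["SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"] = "1"

from sglang.srt.configs.model_config import ModelConfig

cfg = ModelConfig("Qwen/Qwen2-7B-Instruct", context_length=65536)  # hypothetical values
print(cfg.context_len)  # 65536, with a warning in the log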
sglang/srt/configs/qwen2vl.py
ADDED
@@ -0,0 +1,133 @@
+# coding=utf-8
+# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Qwen2VL model configuration"""
+
+import os
+from typing import Union
+
+from transformers import PretrainedConfig
+
+
+class Qwen2VLVisionConfig(PretrainedConfig):
+    model_type = "qwen2_vl"
+
+    def __init__(
+        self,
+        depth=32,
+        embed_dim=1280,
+        hidden_size=3584,
+        hidden_act="quick_gelu",
+        mlp_ratio=4,
+        num_heads=16,
+        in_channels=3,
+        patch_size=14,
+        spatial_merge_size=2,
+        temporal_patch_size=2,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.depth = depth
+        self.embed_dim = embed_dim
+        self.hidden_size = hidden_size
+        self.hidden_act = hidden_act
+        self.mlp_ratio = mlp_ratio
+        self.num_heads = num_heads
+        self.in_channels = in_channels
+        self.patch_size = patch_size
+        self.spatial_merge_size = spatial_merge_size
+        self.temporal_patch_size = temporal_patch_size
+
+    @classmethod
+    def from_pretrained(
+        cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
+    ) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(
+            pretrained_model_name_or_path, **kwargs
+        )
+
+        if config_dict.get("model_type") == "qwen2_vl":
+            config_dict = config_dict["vision_config"]
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class Qwen2VLConfig(PretrainedConfig):
+    model_type = "qwen2_vl"
+
+    def __init__(
+        self,
+        vocab_size=152064,
+        hidden_size=8192,
+        intermediate_size=29568,
+        num_hidden_layers=80,
+        num_attention_heads=64,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=32768,
+        initializer_range=0.02,
+        rms_norm_eps=1e-05,
+        use_cache=True,
+        tie_word_embeddings=False,
+        rope_theta=1000000.0,
+        use_sliding_window=False,
+        sliding_window=4096,
+        max_window_layers=80,
+        attention_dropout=0.0,
+        vision_config=None,
+        rope_scaling=None,
+        **kwargs,
+    ):
+        if isinstance(vision_config, dict):
+            self.vision_config = Qwen2VLVisionConfig(**vision_config)
+        elif vision_config is None:
+            self.vision_config = Qwen2VLVisionConfig()
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.use_sliding_window = use_sliding_window
+        self.sliding_window = sliding_window
+        self.max_window_layers = max_window_layers
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+        self.rope_scaling = rope_scaling
+
+        # NOTE: the following section from original transformers config
+        # for Qwen2-VL is commented out to address rope config loading issue
+        #
+        # if self.rope_scaling is not None and "type" in self.rope_scaling:
+        #     if self.rope_scaling["type"] == "mrope":
+        #         self.rope_scaling["type"] = "default"
+        #     self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        # rope_config_validation(self)
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
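For context only (not part of the diff): the two config classes above can be constructed directly, and a dict passed as vision_config is promoted to Qwen2VLVisionConfig. The field values below are hypothetical; the import relies on the package re-exporting Qwen2VLConfig from sglang.srt.configs, as the hf_transformers_utils.py change below does.

from sglang.srt.configs import Qwen2VLConfig

cfg = Qwen2VLConfig(
    hidden_size=3584,
    num_hidden_layers=28,
    num_attention_heads=28,
    num_key_value_heads=4,
    vision_config={"depth": 32, "embed_dim": 1280, "num_heads": 16},
)
print(type(cfg.vision_config).__name__)  # Qwen2VLVisionConfig
print(cfg.rope_theta)                    # 1000000.0, the default shown above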
sglang/srt/constrained/fsm_cache.py
CHANGED
@@ -73,9 +73,16 @@ class FSMCache(BaseToolCache):
     def init_value(self, key):
         key_type, key_string = key
         if key_type == "json":
-
-
-
+            try:
+                regex = build_regex_from_schema(
+                    key_string,
+                    whitespace_pattern=self.constrained_json_whitespace_pattern,
+                )
+            except NotImplementedError as e:
+                logger.warning(
+                    f"skip invalid json schema: json_schema={key_string}, {e=}"
+                )
+                return None, key_string
         elif key_type == "regex":
             regex = key_string
         else:
sglang/srt/conversation.py
CHANGED
@@ -509,6 +509,19 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="llama_3_vision",
+        system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
+        system_template="<|start_header_id|>system<|end_header_id|>\n\n{system_message}<|eot_id|>",
+        roles=("user", "assistant"),
+        sep_style=SeparatorStyle.LLAMA3,
+        sep="",
+        stop_str=["<|end_of_text|>", "<|eot_id|>"],
+        image_token="<|image|>",
+    )
+)
+
 register_conv_template(
     Conversation(
         name="llava_llama_3",
@@ -530,3 +543,17 @@ register_conv_template(
         stop_str=["<|im_end|>", "<|action_end|>"],
     )
 )
+
+# Reference: https://huggingface.co/docs/transformers/main/model_doc/qwen2_vl#usage-example
+register_conv_template(
+    Conversation(
+        name="qwen2-vl",
+        system_message="You are a helpful assistant.",
+        system_template="<|im_start|>system\n{system_message}",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=["<|im_end|>"],
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+    )
+)
sglang/srt/hf_transformers_utils.py
CHANGED
@@ -33,12 +33,13 @@ from transformers import (
 try:
     from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig
 
-    from sglang.srt.configs import ExaoneConfig
+    from sglang.srt.configs import ExaoneConfig, Qwen2VLConfig
 
     _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
         ChatGLMConfig.model_type: ChatGLMConfig,
         DbrxConfig.model_type: DbrxConfig,
         ExaoneConfig.model_type: ExaoneConfig,
+        Qwen2VLConfig.model_type: Qwen2VLConfig,
     }
 except ImportError:
     # We want this file to run without vllm dependency
@@ -162,6 +163,8 @@ def get_tokenizer(
             "Using a slow tokenizer. This might cause a significant "
             "slowdown. Consider using a fast tokenizer instead."
         )
+
+    attach_additional_stop_token_ids(tokenizer)
     return tokenizer
 
 
@@ -180,4 +183,16 @@ def get_processor(
         tokenizer_revision=tokenizer_revision,
         **kwargs,
     )
+
+    attach_additional_stop_token_ids(processor.tokenizer)
     return processor
+
+
+def attach_additional_stop_token_ids(tokenizer):
+    # Special handling for stop token <|eom_id|> generated by llama 3 tool use.
+    if "<|eom_id|>" in tokenizer.get_added_vocab():
+        tokenizer.additional_stop_token_ids = set(
+            [tokenizer.get_added_vocab()["<|eom_id|>"]]
+        )
+    else:
+        tokenizer.additional_stop_token_ids = None
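For context only (not part of the diff): the new helper records the llama 3 tool-use stop token id when the tokenizer defines it. A minimal sketch; the model name is hypothetical, and any tokenizer whose added vocab contains <|eom_id|> behaves the same way.

from transformers import AutoTokenizer

from sglang.srt.hf_transformers_utils import attach_additional_stop_token_ids

tok = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")  # hypothetical choice
attach_additional_stop_token_ids(tok)
# A set containing the id of <|eom_id|> if that token is in the added vocab, otherwise None.
print(tok.additional_stop_token_ids)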
sglang/srt/layers/attention/__init__.py
CHANGED
@@ -1,8 +1,10 @@
 from abc import ABC, abstractmethod
+from typing import Optional
 
 import torch
 from torch import nn
 
+from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 
@@ -19,13 +21,22 @@ class AttentionBackend(ABC):
         raise NotImplementedError()
 
     def init_forward_metadata_capture_cuda_graph(
-        self,
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens: Optional[torch.Tensor] = None,
     ):
         """Init the metadata for a forward pass for capturing a cuda graph."""
         raise NotImplementedError()
 
     def init_forward_metadata_replay_cuda_graph(
-        self,
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens: Optional[torch.Tensor] = None,
     ):
         """Init the metadata for a forward pass for replying a cuda graph."""
         raise NotImplementedError()
@@ -39,7 +50,7 @@ class AttentionBackend(ABC):
         q: torch.Tensor,
         k: torch.Tensor,
         v: torch.Tensor,
-        layer:
+        layer: RadixAttention,
         forward_batch: ForwardBatch,
     ):
         """Run forward on an attention layer."""
@@ -53,7 +64,7 @@ class AttentionBackend(ABC):
         q: torch.Tensor,
         k: torch.Tensor,
         v: torch.Tensor,
-        layer:
+        layer: RadixAttention,
         forward_batch: ForwardBatch,
     ):
         """Run a forward for decode."""
@@ -64,7 +75,7 @@ class AttentionBackend(ABC):
         q: torch.Tensor,
         k: torch.Tensor,
         v: torch.Tensor,
-        layer:
+        layer: RadixAttention,
         forward_batch: ForwardBatch,
     ):
         """Run a forward for extend."""
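For context only (not part of the diff): a skeleton backend showing the updated interface above, i.e. the extra CUDA-graph capture/replay arguments and the RadixAttention-typed layer parameter. The class is hypothetical and intentionally does nothing useful.

from typing import Optional

import torch

from sglang.srt.layers.attention import AttentionBackend
from sglang.srt.layers.radix_attention import RadixAttention
from sglang.srt.model_executor.forward_batch_info import ForwardBatch


class NoopAttnBackend(AttentionBackend):
    def init_forward_metadata_capture_cuda_graph(
        self,
        bs: int,
        req_pool_indices: torch.Tensor,
        seq_lens: torch.Tensor,
        encoder_lens: Optional[torch.Tensor] = None,
    ):
        # Nothing to precompute for this toy backend.
        pass

    def init_forward_metadata_replay_cuda_graph(
        self,
        bs: int,
        req_pool_indices: torch.Tensor,
        seq_lens: torch.Tensor,
        seq_lens_sum: int,
        encoder_lens: Optional[torch.Tensor] = None,
    ):
        pass

    def forward_extend(
        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
    ):
        raise NotImplementedError("illustrative skeleton only")

    def forward_decode(
        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
    ):
        raise NotImplementedError("illustrative skeleton only")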
sglang/srt/layers/attention/double_sparsity_backend.py
CHANGED
@@ -10,6 +10,7 @@ from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 if TYPE_CHECKING:
+    from sglang.srt.layers.radix_attention import RadixAttention
     from sglang.srt.model_executor.model_runner import ModelRunner
 
 
@@ -134,8 +135,13 @@ class DoubleSparseAttnBackend(AttentionBackend):
         )
 
     def init_forward_metadata_capture_cuda_graph(
-        self,
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        encoder_lens=None,
    ):
+        # NOTE: encoder_lens expected to be zeros or None
         self.forward_metadata = (
             self.cuda_graph_start_loc,
             self.cuda_graph_attn_logits,
@@ -144,15 +150,23 @@ class DoubleSparseAttnBackend(AttentionBackend):
         )
 
     def init_forward_metadata_replay_cuda_graph(
-        self,
+        self,
+        bs: int,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        encoder_lens=None,
     ):
+        # NOTE: encoder_lens expected to be zeros or None
         self.cuda_graph_start_loc.zero_()
         self.cuda_graph_start_loc[1:bs] = torch.cumsum(seq_lens[: bs - 1], dim=0)
 
     def get_cuda_graph_seq_len_fill_value(self):
         return 1
 
-    def forward_extend(
+    def forward_extend(
+        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+    ):
         # TODO: reuse the buffer across layers
         if layer.qk_head_dim != layer.v_head_dim:
             o = q.new_empty((q.shape[0], layer.tp_q_head_num * layer.v_head_dim))
@@ -168,7 +182,7 @@ class DoubleSparseAttnBackend(AttentionBackend):
         )
 
         forward_batch.token_to_kv_pool.set_kv_buffer(
-            layer
+            layer, forward_batch.out_cache_loc, k, v, k_label
         )
 
         (
@@ -197,7 +211,9 @@ class DoubleSparseAttnBackend(AttentionBackend):
         )
         return o
 
-    def forward_decode(
+    def forward_decode(
+        self, q, k, v, layer: RadixAttention, forward_batch: ForwardBatch
+    ):
         # During torch.compile, there is a bug in rotary_emb that causes the
         # output value to have a 3D tensor shape. This reshapes the output correctly.
         q = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
@@ -227,7 +243,7 @@ class DoubleSparseAttnBackend(AttentionBackend):
         )
 
         forward_batch.token_to_kv_pool.set_kv_buffer(
-            layer
+            layer, forward_batch.out_cache_loc, k, v, k_label
         )
 
         # NOTE(Andy) shouldn't be used when max_len_in_batch < heavy_token_num
|