sglang 0.3.4__py3-none-any.whl → 0.3.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. sglang/bench_latency.py +2 -1
  2. sglang/lang/chat_template.py +17 -0
  3. sglang/launch_server_llavavid.py +1 -1
  4. sglang/srt/configs/__init__.py +3 -0
  5. sglang/srt/configs/model_config.py +27 -2
  6. sglang/srt/configs/qwen2vl.py +133 -0
  7. sglang/srt/constrained/fsm_cache.py +10 -3
  8. sglang/srt/conversation.py +27 -0
  9. sglang/srt/hf_transformers_utils.py +16 -1
  10. sglang/srt/layers/attention/__init__.py +16 -5
  11. sglang/srt/layers/attention/double_sparsity_backend.py +22 -6
  12. sglang/srt/layers/attention/flashinfer_backend.py +174 -54
  13. sglang/srt/layers/attention/triton_backend.py +22 -6
  14. sglang/srt/layers/attention/triton_ops/prefill_attention.py +26 -4
  15. sglang/srt/layers/linear.py +89 -63
  16. sglang/srt/layers/logits_processor.py +5 -5
  17. sglang/srt/layers/rotary_embedding.py +112 -0
  18. sglang/srt/layers/sampler.py +51 -39
  19. sglang/srt/lora/lora.py +3 -1
  20. sglang/srt/managers/data_parallel_controller.py +1 -1
  21. sglang/srt/managers/detokenizer_manager.py +4 -0
  22. sglang/srt/managers/image_processor.py +186 -13
  23. sglang/srt/managers/io_struct.py +10 -0
  24. sglang/srt/managers/schedule_batch.py +238 -68
  25. sglang/srt/managers/scheduler.py +69 -50
  26. sglang/srt/managers/tokenizer_manager.py +24 -4
  27. sglang/srt/managers/tp_worker.py +26 -111
  28. sglang/srt/managers/tp_worker_overlap_thread.py +209 -0
  29. sglang/srt/mem_cache/memory_pool.py +56 -10
  30. sglang/srt/mem_cache/radix_cache.py +4 -3
  31. sglang/srt/model_executor/cuda_graph_runner.py +87 -28
  32. sglang/srt/model_executor/forward_batch_info.py +83 -3
  33. sglang/srt/model_executor/model_runner.py +32 -11
  34. sglang/srt/models/chatglm.py +3 -3
  35. sglang/srt/models/deepseek_v2.py +2 -2
  36. sglang/srt/models/mllama.py +1004 -0
  37. sglang/srt/models/qwen2_vl.py +724 -0
  38. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
  39. sglang/srt/sampling/sampling_batch_info.py +13 -3
  40. sglang/srt/sampling/sampling_params.py +5 -7
  41. sglang/srt/server.py +12 -0
  42. sglang/srt/server_args.py +10 -0
  43. sglang/srt/utils.py +22 -0
  44. sglang/test/run_eval.py +2 -0
  45. sglang/test/runners.py +20 -1
  46. sglang/test/srt/sampling/penaltylib/utils.py +1 -0
  47. sglang/test/test_utils.py +100 -3
  48. sglang/version.py +1 -1
  49. {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/METADATA +17 -18
  50. {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/RECORD +53 -48
  51. {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/LICENSE +0 -0
  52. {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/WHEEL +0 -0
  53. {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/forward_batch_info.py
@@ -25,6 +25,8 @@ ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
 - ScheduleBatch is managed by `scheduler.py::Scheduler`.
   It contains high-level scheduling data. Most of the data is on the CPU.
 - ModelWorkerBatch is managed by `tp_worker.py::TpModelWorker`.
+  It is a subset of `ScheduleBatch` that only contains data related to the model forward on GPU.
+  It will be transformed from CPU scheduler to GPU model runner.
 - ForwardBatch is managed by `model_runner.py::ModelRunner`.
   It contains low-level tensor data. Most of the data consists of GPU tensors.
 """
@@ -33,9 +35,10 @@ from dataclasses import dataclass
 from enum import IntEnum, auto
 from typing import TYPE_CHECKING, List, Optional

-import numpy as np
 import torch

+from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
+
 if TYPE_CHECKING:
     from sglang.srt.layers.attention import AttentionBackend
     from sglang.srt.managers.schedule_batch import ImageInputs, ModelWorkerBatch
@@ -84,6 +87,9 @@ class ForwardBatch:
     # The indices of output tokens in the token_to_kv_pool
     out_cache_loc: torch.Tensor

+    # The sum of all sequence lengths
+    seq_lens_sum: int
+
     # For logprob
     return_logprob: bool = False
     top_logprobs_nums: Optional[List[int]] = None
@@ -92,6 +98,7 @@ class ForwardBatch:
     positions: torch.Tensor = None

     # For extend
+    extend_num_tokens: Optional[int] = None
     extend_seq_lens: Optional[torch.Tensor] = None
     extend_prefix_lens: Optional[torch.Tensor] = None
     extend_start_loc: Optional[torch.Tensor] = None
@@ -101,6 +108,12 @@ class ForwardBatch:
     # For multimodal
     image_inputs: Optional[List[ImageInputs]] = None

+    # Encoder-decoder
+    encoder_cached: Optional[List[bool]] = None
+    encoder_lens: Optional[torch.Tensor] = None
+    encoder_lens_cpu: Optional[List[int]] = None
+    encoder_out_cache_loc: Optional[torch.Tensor] = None
+
     # For LoRA
     lora_paths: Optional[List[str]] = None

@@ -112,14 +125,71 @@ class ForwardBatch:
     token_to_kv_pool: BaseTokenToKVPool = None
     attn_backend: AttentionBackend = None

+    # For Qwen2-VL
+    mrope_positions: torch.Tensor = None
+
+    def compute_mrope_positions(
+        self, model_runner: ModelRunner, batch: ModelWorkerBatch
+    ):
+        device = model_runner.device
+        hf_config = model_runner.model_config.hf_config
+        mrope_positions_list = [None] * self.seq_lens.shape[0]
+        if self.forward_mode.is_decode():
+            for i, _ in enumerate(mrope_positions_list):
+                mrope_positions_list[i] = MRotaryEmbedding.get_next_input_positions(
+                    batch.mrope_positions_delta[i][0],
+                    int(self.seq_lens[i]) - 1,
+                    int(self.seq_lens[i]),
+                )
+        elif self.forward_mode.is_extend():
+            extend_start_loc_cpu = self.extend_start_loc.cpu().numpy()
+            for i, image_inputs in enumerate(batch.image_inputs):
+                extend_start_loc, extend_seq_len, extend_prefix_len = (
+                    extend_start_loc_cpu[i],
+                    batch.extend_seq_lens[i],
+                    batch.extend_prefix_lens[i],
+                )
+                if image_inputs is None:
+                    # text only
+                    mrope_positions = [
+                        [
+                            pos
+                            for pos in range(
+                                extend_prefix_len, extend_prefix_len + extend_seq_len
+                            )
+                        ]
+                    ] * 3
+                    mrope_position_delta = 0
+                else:
+                    # TODO: current qwen2-vl do not support radix cache since mrope position calculation
+                    mrope_positions, mrope_position_delta = (
+                        MRotaryEmbedding.get_input_positions(
+                            input_tokens=self.input_ids[
+                                extend_start_loc : extend_start_loc + extend_seq_len
+                            ],
+                            image_grid_thw=image_inputs.image_grid_thws,
+                            vision_start_token_id=hf_config.vision_start_token_id,
+                            spatial_merge_size=hf_config.vision_config.spatial_merge_size,
+                            context_len=0,
+                        )
+                    )
+                mrope_positions_list[i] = mrope_positions
+                batch.mrope_positions_delta[i].append(mrope_position_delta)
+
+        self.mrope_positions = torch.concat(
+            [torch.tensor(pos, device=device) for pos in mrope_positions_list],
+            axis=1,
+        )
+        self.mrope_positions = self.mrope_positions.to(torch.int64)
+
     @classmethod
     def init_new(
         cls,
         batch: ModelWorkerBatch,
         model_runner: ModelRunner,
     ):
-        device = model_runner.device

+        device = model_runner.device
         ret = cls(
             forward_mode=batch.forward_mode,
             batch_size=len(batch.seq_lens),
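For readers unfamiliar with M-RoPE: the decode branch above only needs the per-request position delta computed during prefill. The sketch below illustrates what `MRotaryEmbedding.get_next_input_positions` returns, assuming the vLLM-style signature used in the hunk (the concrete numbers are a made-up example, not taken from this diff):

# Minimal sketch, assuming get_next_input_positions(delta, context_len, seq_len)
# returns, for each of the three M-RoPE axes (temporal, height, width), the
# positions range(context_len + delta, seq_len + delta).
def next_mrope_positions(delta: int, context_len: int, seq_len: int):
    return [list(range(context_len + delta, seq_len + delta)) for _ in range(3)]

# A prompt whose vision tokens were compressed into a smaller 3D position range
# ends up with a negative delta, so decode positions continue from where the
# prompt's multimodal positions stopped rather than from the raw token count.
print(next_mrope_positions(delta=-4, context_len=9, seq_len=10))  # [[5], [5], [5]]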
@@ -127,6 +197,12 @@ class ForwardBatch:
             req_pool_indices=batch.req_pool_indices,
             seq_lens=batch.seq_lens,
             out_cache_loc=batch.out_cache_loc,
+            image_inputs=batch.image_inputs,
+            encoder_cached=batch.encoder_cached,
+            encoder_lens=batch.encoder_lens,
+            encoder_lens_cpu=batch.encoder_lens_cpu,
+            encoder_out_cache_loc=batch.encoder_out_cache_loc,
+            seq_lens_sum=batch.seq_lens_sum,
             return_logprob=batch.return_logprob,
             top_logprobs_nums=batch.top_logprobs_nums,
             lora_paths=batch.lora_paths,
@@ -144,10 +220,11 @@ class ForwardBatch:
                 ],
                 axis=0,
             )
-            ret.image_inputs = batch.image_inputs
+            ret.extend_num_tokens = batch.extend_num_tokens
             ret.extend_seq_lens = torch.tensor(
                 batch.extend_seq_lens, dtype=torch.int32
             ).to(device, non_blocking=True)
+
             ret.extend_prefix_lens = torch.tensor(
                 batch.extend_prefix_lens, dtype=torch.int32
             ).to(device, non_blocking=True)
@@ -156,6 +233,9 @@ class ForwardBatch:
             ret.extend_seq_lens_cpu = batch.extend_seq_lens
             ret.extend_logprob_start_lens_cpu = batch.extend_logprob_start_lens

+        if model_runner.model_is_mrope:
+            ret.compute_mrope_positions(model_runner, batch)
+
         # Init attention information
         ret.req_to_token_pool = model_runner.req_to_token_pool
         ret.token_to_kv_pool = model_runner.token_to_kv_pool
sglang/srt/model_executor/model_runner.py
@@ -59,8 +59,11 @@ from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
     enable_show_time_cost,
     get_available_gpu_memory,
+    is_attention_free_model,
+    is_embedding_model,
     is_generation_model,
     is_multimodal_model,
+    model_has_inner_state,
     monkey_patch_vllm_dummy_weight_loader,
     monkey_patch_vllm_p2p_access_check,
 )
@@ -117,11 +120,16 @@ class ModelRunner:
         )

         if self.is_multimodal_model:
-            logger.info(
+            logger.warning(
                 "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
             )
             server_args.chunked_prefill_size = None
             server_args.mem_fraction_static *= 0.95
+            # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
+            if self.model_config.hf_config.architectures == [
+                "Qwen2VLForConditionalGeneration"
+            ]:
+                server_args.disable_radix_cache = True

         # Global vars
         if server_args.show_time_cost:
@@ -262,7 +270,6 @@ class ModelRunner:
             if hasattr(self.model, "get_attention_sliding_window_size")
             else None
         )
-        self.has_cross_attention = getattr(self.model, "has_cross_attention", False)
         self.is_generation = is_generation_model(
             self.model_config.hf_config.architectures, self.server_args.is_embedding
         )
@@ -316,11 +323,13 @@ class ModelRunner:

         def get_weight_iter(config):
             iter = loader._get_weights_iterator(
-                config.model,
-                config.revision,
-                fall_back_to_pt=getattr(
-                    self.model, "fall_back_to_pt_during_load", True
-                ),
+                DefaultModelLoader.Source(
+                    config.model,
+                    revision=config.revision,
+                    fall_back_to_pt=getattr(
+                        self.model, "fall_back_to_pt_during_load", True
+                    ),
+                )
             )
             return iter

@@ -444,6 +453,7 @@ class ModelRunner:
             size=max_num_reqs + 1,
             max_context_len=self.model_config.context_len + 4,
             device=self.device,
+            use_records=False,
         )
         if (
             self.model_config.attention_arch == AttentionArch.MLA
@@ -499,7 +509,7 @@ class ModelRunner:
                 "Window attention is not supported in the triton attention backend. "
                 "Please use `--attention-backend flashinfer`."
             )
-            assert not self.has_cross_attention, (
+            assert not self.model_config.is_encoder_decoder, (
                 "Cross attention is not supported in the triton attention backend. "
                 "Please use `--attention-backend flashinfer`."
             )
@@ -547,9 +557,7 @@ class ModelRunner:
        self.cuda_graph_runner = CudaGraphRunner(self)

    def forward_decode(self, forward_batch: ForwardBatch):
-        if self.cuda_graph_runner and self.cuda_graph_runner.can_run(
-            forward_batch.batch_size
-        ):
+        if self.cuda_graph_runner and self.cuda_graph_runner.can_run(forward_batch):
            return self.cuda_graph_runner.replay(forward_batch)

        forward_batch.positions = (forward_batch.seq_lens - 1).to(torch.int64)
@@ -617,6 +625,15 @@ class ModelRunner:

        return logits

+    @property
+    def model_is_mrope(self) -> bool:
+        """Detect if the model has "mrope" rope_scaling type.
+        mrope requires keep "rope_deltas" between prompt and decoding phases."""
+        rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {})
+        if rope_scaling is None:
+            return False
+        return rope_scaling.get("type", None) == "mrope"
+

 @lru_cache()
 def import_model_classes():
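For context, the rope_scaling block this new property inspects comes from the model's Hugging Face config. The fragment below is a hypothetical illustration of what an M-RoPE (Qwen2-VL-style) entry looks like, with the check pulled out as a standalone helper; the values are illustrative, not copied from any specific checkpoint:

# Hypothetical config fragment, for illustration only.
rope_scaling = {
    "type": "mrope",               # what model_is_mrope checks for
    "mrope_section": [16, 24, 24], # split of rotary dims across the three axes
}

def model_is_mrope(rope_scaling) -> bool:
    # Same check as the property above, as a free function.
    if rope_scaling is None:
        return False
    return rope_scaling.get("type", None) == "mrope"

assert model_is_mrope(rope_scaling)   # True for an mrope config
assert not model_is_mrope(None)       # models without rope_scaling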
@@ -662,3 +679,7 @@ def load_model_cls_srt(model_arch: str) -> Optional[Type[nn.Module]]:

 # Monkey patch model loader
 setattr(ModelRegistry, "_try_load_model_cls", load_model_cls_srt)
+setattr(ModelRegistry, "is_multimodal_model", is_multimodal_model)
+setattr(ModelRegistry, "is_attention_free_model", is_attention_free_model)
+setattr(ModelRegistry, "model_has_inner_state", model_has_inner_state)
+setattr(ModelRegistry, "is_embedding_model", is_embedding_model)
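These setattr calls patch vLLM's ModelRegistry with sglang's own architecture predicates, which are imported from sglang/srt/utils.py in the first model_runner.py hunk above. A rough sketch of what such a predicate looks like; the architecture list here is an illustrative, intentionally incomplete example, not the real list from utils.py:

# Rough sketch of an architecture predicate like the ones patched in above.
def is_multimodal_model(model_architectures) -> bool:
    multimodal_archs = {
        "LlavaLlamaForCausalLM",
        "Qwen2VLForConditionalGeneration",   # added by this release
        "MllamaForConditionalGeneration",    # added by this release
    }
    return any(arch in multimodal_archs for arch in model_architectures)

print(is_multimodal_model(["Qwen2VLForConditionalGeneration"]))  # True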
sglang/srt/models/chatglm.py
@@ -303,7 +303,7 @@ class GLMTransformer(nn.Module):
         return hidden_states


-class ChatGLMModel(nn.Module):
+class ChatGLMM(nn.Module):
     def __init__(
         self,
         config,
@@ -366,7 +366,7 @@ class ChatGLMForCausalLM(nn.Module):
         self.config: ChatGLMConfig = config
         self.quant_config = quant_config
         self.max_position_embeddings = getattr(config, "max_sequence_length", 8192)
-        self.transformer = ChatGLMModel(config, cache_config, quant_config)
+        self.transformer = ChatGLMM(config, cache_config, quant_config)
         self.lm_head = self.transformer.output_layer
         self.logits_processor = LogitsProcessor(config)

@@ -401,4 +401,4 @@ class ChatGLMModel(ChatGLMForCausalLM):
     pass


-EntryClass = [ChatGLMForCausalLM, ChatGLMModel]
+EntryClass = [ChatGLMModel]
sglang/srt/models/deepseek_v2.py
@@ -250,7 +250,7 @@ class DeepseekV2Attention(nn.Module):
             bias=False,
             quant_config=quant_config,
         )
-        rope_scaling["type"] = "deepseek_yarn"
+        rope_scaling["rope_type"] = "deepseek_yarn"
         self.rotary_emb = get_rope(
             qk_rope_head_dim,
             rotary_dim=qk_rope_head_dim,
@@ -398,7 +398,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             bias=False,
             quant_config=quant_config,
         )
-        rope_scaling["type"] = "deepseek_yarn"
+        rope_scaling["rope_type"] = "deepseek_yarn"
         self.rotary_emb = get_rope(
             qk_rope_head_dim,
             rotary_dim=qk_rope_head_dim,
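Both hunks rename the key under which the custom YaRN variant is registered in the rope_scaling dict, apparently to track newer vLLM rope utilities that read rope_scaling["rope_type"] rather than the older rope_scaling["type"]. The snippet below is a hypothetical before/after illustration only; the other field values are made up:

# Hypothetical rope_scaling dict, for illustration; values are made up.
rope_scaling = {
    "factor": 40,
    "original_max_position_embeddings": 4096,
}

# Before 0.3.4.post2:
#   rope_scaling["type"] = "deepseek_yarn"
# After 0.3.4.post2:
rope_scaling["rope_type"] = "deepseek_yarn"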