sglang 0.4.1.post4__py3-none-any.whl → 0.4.1.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +18 -1
- sglang/lang/interpreter.py +71 -1
- sglang/lang/ir.py +2 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/chatglm.py +78 -0
- sglang/srt/configs/dbrx.py +279 -0
- sglang/srt/configs/model_config.py +1 -1
- sglang/srt/hf_transformers_utils.py +9 -14
- sglang/srt/layers/attention/__init__.py +8 -1
- sglang/srt/layers/attention/flashinfer_backend.py +4 -2
- sglang/srt/layers/linear.py +159 -55
- sglang/srt/layers/logits_processor.py +6 -6
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +16 -5
- sglang/srt/layers/moe/fused_moe_triton/layer.py +2 -3
- sglang/srt/layers/parameter.py +431 -0
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/fp8.py +1 -1
- sglang/srt/layers/quantization/modelopt_quant.py +174 -0
- sglang/srt/layers/vocab_parallel_embedding.py +1 -1
- sglang/srt/managers/cache_controller.py +307 -0
- sglang/srt/managers/data_parallel_controller.py +2 -0
- sglang/srt/managers/schedule_batch.py +7 -1
- sglang/srt/managers/scheduler.py +10 -6
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +6 -2
- sglang/srt/mem_cache/memory_pool.py +206 -1
- sglang/srt/metrics/collector.py +22 -30
- sglang/srt/model_executor/cuda_graph_runner.py +14 -7
- sglang/srt/model_executor/forward_batch_info.py +20 -15
- sglang/srt/model_executor/model_runner.py +10 -4
- sglang/srt/models/chatglm.py +1 -1
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/grok.py +25 -16
- sglang/srt/models/llama.py +9 -2
- sglang/srt/sampling/sampling_batch_info.py +1 -0
- sglang/srt/server.py +11 -8
- sglang/srt/server_args.py +12 -1
- sglang/srt/speculative/eagle_utils.py +93 -85
- sglang/srt/speculative/eagle_worker.py +47 -33
- sglang/srt/utils.py +32 -5
- sglang/test/test_programs.py +23 -1
- sglang/test/test_utils.py +36 -7
- sglang/version.py +1 -1
- {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post5.dist-info}/METADATA +6 -7
- {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post5.dist-info}/RECORD +48 -43
- {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post5.dist-info}/WHEEL +1 -1
- {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post5.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post4.dist-info → sglang-0.4.1.post5.dist-info}/top_level.txt +0 -0
sglang/srt/metrics/collector.py
CHANGED
@@ -114,26 +114,20 @@ class TokenizerMetricsCollector:
             documentation="Histogram of time to first token in seconds.",
             labelnames=labels.keys(),
             buckets=[
-                0.001,
-                0.005,
-                0.01,
-                0.02,
-                0.04,
-                0.06,
-                0.08,
                 0.1,
                 0.25,
                 0.5,
                 0.75,
-                1
-                2
-                5
-
-
-
-
-
-
+                1,
+                2,
+                5,
+                10,
+                20,
+                40,
+                60,
+                80,
+                120,
+                160,
             ],
         )
@@ -168,21 +162,19 @@ class TokenizerMetricsCollector:
             documentation="Histogram of End-to-end request latency in seconds",
             labelnames=labels.keys(),
             buckets=[
-                0.
+                0.1,
+                0.25,
                 0.5,
-
-
-
-
-
-
-
-
-
-
-                40.0,
-                50.0,
-                60.0,
+                1,
+                2,
+                5,
+                10,
+                20,
+                40,
+                60,
+                80,
+                120,
+                160,
             ],
         )
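Both histograms now share the same coarse bucket ladder from 0.1 s up to 160 s. A minimal sketch (assuming prometheus_client; the metric name and label set below are illustrative, not the exact sglang metric) of how such a histogram is declared and observed:

from prometheus_client import Histogram

ttft_seconds = Histogram(
    "sglang_time_to_first_token_seconds",  # illustrative name only
    "Histogram of time to first token in seconds.",
    labelnames=["model_name"],
    buckets=[0.1, 0.25, 0.5, 0.75, 1, 2, 5, 10, 20, 40, 60, 80, 120, 160],
)

ttft_seconds.labels(model_name="demo").observe(0.42)  # lands in the 0.5 bucket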
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
@@ -124,10 +124,12 @@ class CudaGraphRunner:
         self.tp_size = self.model_runner.tp_size
 
         # Batch sizes to capture
-
-
-
-
+        self.capture_bs = self.model_runner.server_args.cuda_graph_bs
+        if self.capture_bs is None:
+            if model_runner.server_args.disable_cuda_graph_padding:
+                self.capture_bs = list(range(1, 33)) + [64, 128]
+            else:
+                self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
 
         if max(self.capture_bs) > model_runner.req_to_token_pool.size:
             # In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests
@@ -322,6 +324,8 @@ class CudaGraphRunner:
         global_num_tokens = None
         gathered_buffer = None
 
+        spec_info = self.get_spec_info(num_tokens, positions)
+
         forward_batch = ForwardBatch(
             forward_mode=self.capture_forward_mode,
             batch_size=bs,
@@ -338,10 +342,13 @@ class CudaGraphRunner:
             top_logprobs_nums=[0] * bs,
             positions=positions,
             global_num_tokens=global_num_tokens,
-            mrope_positions=mrope_positions,
             gathered_buffer=gathered_buffer,
+            mrope_positions=mrope_positions,
             spec_algorithm=self.model_runner.spec_algorithm,
-            spec_info=
+            spec_info=spec_info,
+            capture_hidden_mode=(
+                spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
+            ),
         )
 
         # Attention backend
@@ -446,10 +453,10 @@ class CudaGraphRunner:
 
         if self.model_runner.is_draft_worker:
             spec_info = EAGLEDraftInput()
+            spec_info.load_server_args(self.model_runner.server_args)
             spec_info.hidden_states = self.hidden_states[:num_tokens]
             spec_info.positions = positions
             spec_info.capture_hidden_mode = CaptureHiddenMode.FULL
-            spec_info.init(self.model_runner.server_args)
         else:
             spec_info = EagleVerifyInput(
                 None,
sglang/srt/model_executor/forward_batch_info.py
CHANGED
@@ -106,6 +106,24 @@ class ForwardMode(IntEnum):
     def is_dummy_first(self):
         return self == ForwardMode.DUMMY_FIRST
 
+    def is_decode_or_idle(self):
+        return self == ForwardMode.DECODE or self == ForwardMode.IDLE
+
+
+class CaptureHiddenMode(IntEnum):
+    NULL = auto()
+    FULL = auto()
+    LAST = auto()
+
+    def need_capture(self):
+        return self != CaptureHiddenMode.NULL
+
+    def is_full(self):
+        return self == CaptureHiddenMode.FULL
+
+    def is_last(self):
+        return self == CaptureHiddenMode.LAST
+
 
 @dataclass
 class ForwardBatch:
@@ -174,6 +192,7 @@ class ForwardBatch:
     # Speculative decoding
     spec_info: SpecInfo = None
     spec_algorithm: SpeculativeAlgorithm = None
+    capture_hidden_mode: CaptureHiddenMode = None
 
     # For Qwen2-VL
     mrope_positions: torch.Tensor = None
@@ -265,6 +284,7 @@ class ForwardBatch:
             sampling_info=batch.sampling_info,
             spec_algorithm=batch.spec_algorithm,
             spec_info=batch.spec_info,
+            capture_hidden_mode=batch.capture_hidden_mode,
             input_embeds=batch.input_embeds,
         )
 
@@ -400,18 +420,3 @@ def compute_position_torch(
 @maybe_torch_compile(dynamic=True)
 def clamp_position(seq_lens):
     return torch.clamp((seq_lens - 1), min=0).to(torch.int64)
-
-
-class CaptureHiddenMode(IntEnum):
-    NULL = auto()
-    FULL = auto()
-    LAST = auto()
-
-    def need_capture(self):
-        return self != CaptureHiddenMode.NULL
-
-    def is_full(self):
-        return self == CaptureHiddenMode.FULL
-
-    def is_last(self):
-        return self == CaptureHiddenMode.LAST
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -89,6 +89,7 @@ class ModelRunner:
         self.is_draft_worker = is_draft_worker
         self.is_generation = model_config.is_generation
         self.is_multimodal = model_config.is_multimodal
+        self.should_log = tp_rank == 0
         self.spec_algorithm = SpeculativeAlgorithm.from_string(
             server_args.speculative_algorithm
         )
@@ -117,15 +118,21 @@ class ModelRunner:
 
         if self.is_multimodal:
             self.mem_fraction_static *= 0.95
+            logger.info(
+                f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
+                f"because this is a multimodal model."
+            )
+
         if self.model_config.hf_config.architectures == [
             "MllamaForConditionalGeneration"
         ]:
             logger.info("Automatically turn off --chunked-prefill-size for mllama.")
             server_args.chunked_prefill_size = -1
-
+
         if self.model_config.hf_config.architectures == [
             "Qwen2VLForConditionalGeneration"
         ]:
+            # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
             logger.info(
                 "Automatically turn off --chunked-prefill-size and disable radix cache for qwen2-vl."
             )
@@ -198,7 +205,7 @@ class ModelRunner:
         if self.device == "cuda":
             backend = "nccl"
         elif self.device == "xpu":
-            # TODO(liangan1):Just use gloo to bypass the initilization fail
+            # TODO(liangan1): Just use gloo to bypass the initilization fail
             # Need to use xccl for xpu backend in the future
             backend = "gloo"
         elif self.device == "hpu":
@@ -627,7 +634,6 @@ class ModelRunner:
         )
 
     def init_double_sparsity_channel_config(self, selected_channel):
-
         selected_channel = "." + selected_channel + "_proj"
         self.sorted_channels = []
         # load channel config
@@ -718,7 +724,7 @@ class ModelRunner:
         elif forward_batch.forward_mode.is_idle():
             return self.forward_idle(forward_batch)
         else:
-            raise ValueError(f"
+            raise ValueError(f"Invalid forward mode: {forward_batch.forward_mode}")
 
     def sample(
         self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
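The new should_log flag gates per-rank logging so that only tensor-parallel rank 0 prints, and multi-GPU runs do not repeat every message once per rank. A toy sketch of the pattern (illustrative class, not the sglang ModelRunner):

import logging

logger = logging.getLogger("model_runner")

class RankedLogger:
    def __init__(self, tp_rank: int):
        self.should_log = tp_rank == 0  # only rank 0 emits info logs

    def info(self, msg: str):
        if self.should_log:
            logger.info(msg)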
sglang/srt/models/chatglm.py
CHANGED
@@ -23,8 +23,8 @@ from torch import nn
 from torch.nn import LayerNorm
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.transformers_utils.configs import ChatGLMConfig
 
+from sglang.srt.configs import ChatGLMConfig
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.layernorm import RMSNorm
 from sglang.srt.layers.linear import (
sglang/srt/models/dbrx.py
CHANGED
@@ -25,8 +25,8 @@ from vllm.distributed import (
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.rotary_embedding import get_rope
-from vllm.transformers_utils.configs.dbrx import DbrxConfig
 
+from sglang.srt.configs import DbrxConfig
 from sglang.srt.layers.linear import (
     QKVParallelLinear,
     ReplicatedLinear,
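Both model files now take their configs from the new sglang.srt.configs package (added in this release, see sglang/srt/configs/chatglm.py and dbrx.py in the file list) instead of vllm.transformers_utils.configs. For downstream code the change is just the import path:

# new location of the vendored configs as of 0.4.1.post5
from sglang.srt.configs import ChatGLMConfig, DbrxConfig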
sglang/srt/models/grok.py
CHANGED
@@ -57,6 +57,7 @@ class Grok1MLP(nn.Module):
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
         reduce_results=True,
+        use_presharded_weights: bool = False,
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
@@ -65,6 +66,7 @@ class Grok1MLP(nn.Module):
             bias=False,
             quant_config=quant_config,
             prefix=f"{prefix}.gate_up_proj",
+            use_presharded_weights=use_presharded_weights,
         )
         self.down_proj = RowParallelLinear(
             intermediate_size,
@@ -73,6 +75,7 @@ class Grok1MLP(nn.Module):
             quant_config=quant_config,
             prefix=f"{prefix}.down_proj",
             reduce_results=reduce_results,
+            use_presharded_weights=use_presharded_weights,
         )
         self.act_fn = GeluAndMul(approximate="tanh")
 
@@ -103,6 +106,7 @@ class Grok1MoE(nn.Module):
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
         reduce_results=True,
+        use_presharded_weights: bool = False,
     ):
         super().__init__()
         self.hidden_size = hidden_size
@@ -129,6 +133,7 @@ class Grok1MoE(nn.Module):
             renormalize=False,
             quant_config=quant_config,
             tp_size=tp_size,
+            use_presharded_weights=use_presharded_weights,
         )
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
@@ -156,6 +161,7 @@ class Grok1Attention(nn.Module):
         max_position: int = 4096 * 32,
         rope_theta: float = 10000,
         quant_config: Optional[QuantizationConfig] = None,
+        reduce_results: bool = True,
     ) -> None:
         super().__init__()
         self.config = config
@@ -194,6 +200,7 @@ class Grok1Attention(nn.Module):
             hidden_size,
             bias=False,
             quant_config=quant_config,
+            reduce_results=reduce_results,
         )
         self.rotary_emb = get_rope(
             self.head_dim,
@@ -234,10 +241,12 @@ class Grok1DecoderLayer(nn.Module):
         config: PretrainedConfig,
         layer_id: int = 0,
         quant_config: Optional[QuantizationConfig] = None,
+        use_presharded_weights: bool = False,
     ) -> None:
         super().__init__()
         self.num_experts = config.num_local_experts
         self.hidden_size = config.hidden_size
+        self.layer_id = layer_id
 
         rope_theta = getattr(config, "rope_theta", 10000)
         self.self_attn = Grok1Attention(
@@ -262,6 +271,7 @@ class Grok1DecoderLayer(nn.Module):
             ),
             quant_config=quant_config,
             reduce_results=True,
+            use_presharded_weights=use_presharded_weights,
         )
         self.pre_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attn_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -299,6 +309,7 @@ class Grok1Model(nn.Module):
         self,
         config: PretrainedConfig,
         quant_config: Optional[QuantizationConfig] = None,
+        use_presharded_weights: bool = False,
     ) -> None:
         super().__init__()
         self.config = config
@@ -311,7 +322,12 @@ class Grok1Model(nn.Module):
         )
         self.layers = nn.ModuleList(
             [
-                Grok1DecoderLayer(
+                Grok1DecoderLayer(
+                    config,
+                    i,
+                    quant_config=quant_config,
+                    use_presharded_weights=use_presharded_weights,
+                )
                 for i in range(config.num_hidden_layers)
             ]
         )
@@ -347,11 +363,7 @@ class Grok1ForCausalLM(nn.Module):
         super().__init__()
         self.config = config
         self.quant_config = quant_config
-        self.model = Grok1Model(config, quant_config=quant_config)
-        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
-        self.logits_processor = LogitsProcessor(config)
 
-        # Monkey patch _prepare_weights to load pre-sharded weights
         if (
             self.config.num_local_experts > 0
             and get_tensor_model_parallel_world_size() > 1
@@ -361,6 +373,14 @@ class Grok1ForCausalLM(nn.Module):
         else:
             self.use_presharded_weights = False
 
+        self.model = Grok1Model(
+            config,
+            quant_config=quant_config,
+            use_presharded_weights=self.use_presharded_weights,
+        )
+        self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
+        self.logits_processor = LogitsProcessor(config)
+
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -376,10 +396,7 @@ class Grok1ForCausalLM(nn.Module):
     def load_weights(
         self,
         weights: Iterable[Tuple[str, torch.Tensor]],
-        use_presharded_weights: bool | None = None,
     ):
-        if use_presharded_weights is None:
-            use_presharded_weights = self.use_presharded_weights
         num_experts = self.config.num_local_experts
 
         stacked_params_mapping = [
@@ -435,20 +452,12 @@ class Grok1ForCausalLM(nn.Module):
                     continue
                 name = name.replace(weight_name, param_name)
 
-                if use_presharded_weights:
-                    extra_kwargs = {
-                        "use_presharded_weights": use_presharded_weights
-                    }
-                else:
-                    extra_kwargs = {}
-
                 load_weight_wrapper(
                     name,
                     loaded_weight,
                     name,
                     shard_id=shard_id,
                     expert_id=expert_id,
-                    **extra_kwargs,
                 )
                 break
             else:
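The grok.py changes thread a use_presharded_weights flag from Grok1ForCausalLM down through the model, decoder layers, MoE, and MLP into the linear layers, instead of passing it per call to load_weights. A toy sketch (illustrative class, not the sglang layers) of why the flag lives on the layer: when the checkpoint is already sharded per tensor-parallel rank, the loader copies it verbatim instead of slicing:

import torch
from torch import nn

class ShardedLinearToy(nn.Module):
    def __init__(self, in_features, out_features, tp_size, use_presharded_weights=False):
        super().__init__()
        self.use_presharded_weights = use_presharded_weights
        self.weight = nn.Parameter(torch.empty(out_features // tp_size, in_features))

    def load_weight(self, loaded_weight, tp_rank):
        if self.use_presharded_weights:
            shard = loaded_weight  # checkpoint already holds only this rank's rows
        else:
            rows = self.weight.shape[0]
            shard = loaded_weight[tp_rank * rows : (tp_rank + 1) * rows]
        self.weight.data.copy_(shard)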
sglang/srt/models/llama.py
CHANGED
@@ -100,6 +100,7 @@ class LlamaAttention(nn.Module):
         max_position_embeddings: int = 8192,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        bias: bool = False,
     ) -> None:
         super().__init__()
         self.hidden_size = hidden_size
@@ -132,14 +133,14 @@ class LlamaAttention(nn.Module):
             self.head_dim,
             self.total_num_heads,
             self.total_num_kv_heads,
-            bias=
+            bias=bias,
             quant_config=quant_config,
             prefix=f"{prefix}.qkv_proj",
         )
         self.o_proj = RowParallelLinear(
             self.total_num_heads * self.head_dim,
             hidden_size,
-            bias=
+            bias=bias,
             quant_config=quant_config,
             prefix=f"{prefix}.o_proj",
         )
@@ -194,6 +195,11 @@ class LlamaDecoderLayer(nn.Module):
         )
         rope_is_neox_style = getattr(config, "rope_is_neox_style", True)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        # Support llamafy/Qwen-Qwen2.5-7B-Instruct-llamafied with attention_bias
+        # Support internlm/internlm-7b with bias
+        attention_bias = getattr(config, "attention_bias", False) or getattr(
+            config, "bias", False
+        )
         self.self_attn = LlamaAttention(
             config=config,
             hidden_size=self.hidden_size,
@@ -206,6 +212,7 @@ class LlamaDecoderLayer(nn.Module):
             max_position_embeddings=max_position_embeddings,
             quant_config=quant_config,
             prefix=f"{prefix}.self_attn",
+            bias=attention_bias,
         )
         self.mlp = LlamaMLP(
             hidden_size=self.hidden_size,
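The decoder layer now enables the QKV and output-projection bias when the HF config exposes either attention_bias (e.g. the llamafied Qwen2.5 checkpoint named in the comment) or bias (e.g. internlm-7b). A tiny self-contained check of that fallback expression, using a dummy config object:

class DummyConfig:
    bias = True  # internlm-style field; no attention_bias attribute

config = DummyConfig()
attention_bias = getattr(config, "attention_bias", False) or getattr(config, "bias", False)
assert attention_bias is True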
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -232,6 +232,7 @@ class SamplingBatchInfo:
         self.logit_bias = SamplingBatchInfo.merge_bias_tensor(
             self.logit_bias, other.logit_bias, len(self), len(other), self.device
         )
+        self.need_min_p_sampling = self.need_min_p_sampling or other.need_min_p_sampling
 
     def apply_logits_bias(self, logits: torch.Tensor):
         # Apply logit_bias
sglang/srt/server.py
CHANGED
@@ -127,14 +127,12 @@ async def health() -> Response:
 async def health_generate(request: Request) -> Response:
     """Check the health of the inference server by generating one token."""
 
+    sampling_params = {"max_new_tokens": 1, "temperature": 0.7}
+
     if tokenizer_manager.is_generation:
-        gri = GenerateReqInput(
-            input_ids=[0], sampling_params={"max_new_tokens": 1, "temperature": 0.7}
-        )
+        gri = GenerateReqInput(input_ids=[0], sampling_params=sampling_params)
     else:
-        gri = EmbeddingReqInput(
-            input_ids=[0], sampling_params={"max_new_tokens": 1, "temperature": 0.7}
-        )
+        gri = EmbeddingReqInput(input_ids=[0], sampling_params=sampling_params)
 
     try:
         async for _ in tokenizer_manager.generate_request(gri, request):
@@ -546,7 +544,12 @@ def launch_server(
 
     # Send a warmup request
     t = threading.Thread(
-        target=_wait_and_warmup,
+        target=_wait_and_warmup,
+        args=(
+            server_args,
+            pipe_finish_writer,
+            tokenizer_manager.image_token_id,
+        ),
     )
     t.start()
 
@@ -616,7 +619,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     mp.set_start_method("spawn", force=True)
 
 
-def _wait_and_warmup(server_args, pipe_finish_writer):
+def _wait_and_warmup(server_args, pipe_finish_writer, image_token_text):
     headers = {}
     url = server_args.url()
     if server_args.api_key:
sglang/srt/server_args.py
CHANGED
@@ -148,6 +148,7 @@ class ServerArgs:
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
     cuda_graph_max_bs: Optional[int] = None
+    cuda_graph_bs: Optional[List[int]] = None
     torchao_config: str = ""
     enable_nan_detection: bool = False
     enable_p2p_check: bool = False
@@ -361,6 +362,7 @@ class ServerArgs:
             "awq_marlin",
             "bitsandbytes",
             "gguf",
+            "modelopt",
         ],
         help="The quantization method.",
     )
@@ -802,6 +804,12 @@ class ServerArgs:
             default=ServerArgs.cuda_graph_max_bs,
             help="Set the maximum batch size for cuda graph.",
         )
+        parser.add_argument(
+            "--cuda-graph-bs",
+            type=int,
+            nargs="+",
+            help="Set the list of batch sizes for cuda graph.",
+        )
         parser.add_argument(
             "--torchao-config",
             type=str,
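The new --cuda-graph-bs flag takes a space-separated list of batch sizes, which becomes the explicit capture list used by CudaGraphRunner above. A minimal argparse sketch of how nargs="+" parses such a flag:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--cuda-graph-bs", type=int, nargs="+",
                    help="Set the list of batch sizes for cuda graph.")

args = parser.parse_args(["--cuda-graph-bs", "1", "2", "4", "8", "16"])
assert args.cuda_graph_bs == [1, 2, 4, 8, 16]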
@@ -920,7 +928,10 @@ class PortArgs:
         while True:
             if is_port_available(port):
                 break
-            port
+            if port < 60000:
+                port += 42
+            else:
+                port -= 43
 
         return PortArgs(
             tokenizer_ipc_name=tempfile.NamedTemporaryFile(delete=False).name,
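PortArgs now probes for a free port by stepping up in increments of 42 until it nears 60000 and then walking back down by 43. A self-contained sketch of the same probing loop, with a simple bind-based check standing in for sglang's is_port_available helper:

import socket

def is_port_available(port: int) -> bool:
    # simplistic stand-in: can we bind the port on localhost right now?
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        try:
            s.bind(("127.0.0.1", port))
            return True
        except OSError:
            return False

port = 30000  # illustrative starting port
while not is_port_available(port):
    port = port + 42 if port < 60000 else port - 43
print(port)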