sglang 0.4.1.post6__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. sglang/__init__.py +21 -23
  2. sglang/api.py +2 -7
  3. sglang/bench_offline_throughput.py +41 -27
  4. sglang/bench_one_batch.py +60 -4
  5. sglang/bench_one_batch_server.py +1 -1
  6. sglang/bench_serving.py +83 -71
  7. sglang/lang/backend/runtime_endpoint.py +183 -4
  8. sglang/lang/chat_template.py +46 -4
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/_custom_ops.py +80 -42
  11. sglang/srt/configs/device_config.py +1 -1
  12. sglang/srt/configs/load_config.py +1 -0
  13. sglang/srt/configs/model_config.py +1 -0
  14. sglang/srt/constrained/base_grammar_backend.py +21 -0
  15. sglang/srt/constrained/xgrammar_backend.py +8 -4
  16. sglang/srt/conversation.py +14 -1
  17. sglang/srt/distributed/__init__.py +3 -3
  18. sglang/srt/distributed/communication_op.py +2 -1
  19. sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
  20. sglang/srt/distributed/device_communicators/custom_all_reduce.py +112 -42
  21. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
  22. sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
  23. sglang/srt/distributed/device_communicators/pynccl.py +80 -1
  24. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
  25. sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
  26. sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
  27. sglang/srt/distributed/parallel_state.py +1 -1
  28. sglang/srt/distributed/utils.py +2 -1
  29. sglang/srt/entrypoints/engine.py +452 -0
  30. sglang/srt/entrypoints/http_server.py +603 -0
  31. sglang/srt/function_call_parser.py +494 -0
  32. sglang/srt/layers/activation.py +8 -8
  33. sglang/srt/layers/attention/flashinfer_backend.py +10 -9
  34. sglang/srt/layers/attention/triton_backend.py +4 -6
  35. sglang/srt/layers/attention/vision.py +204 -0
  36. sglang/srt/layers/dp_attention.py +71 -0
  37. sglang/srt/layers/layernorm.py +5 -5
  38. sglang/srt/layers/linear.py +65 -14
  39. sglang/srt/layers/logits_processor.py +49 -64
  40. sglang/srt/layers/moe/ep_moe/layer.py +24 -16
  41. sglang/srt/layers/moe/fused_moe_native.py +84 -1
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  43. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +27 -7
  44. sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -5
  45. sglang/srt/layers/parameter.py +18 -8
  46. sglang/srt/layers/quantization/__init__.py +20 -23
  47. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  48. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  49. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  50. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  51. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  52. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  53. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  54. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  55. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  56. sglang/srt/layers/quantization/fp8.py +10 -4
  57. sglang/srt/layers/quantization/modelopt_quant.py +1 -2
  58. sglang/srt/layers/quantization/w8a8_int8.py +1 -1
  59. sglang/srt/layers/radix_attention.py +2 -2
  60. sglang/srt/layers/rotary_embedding.py +1184 -31
  61. sglang/srt/layers/sampler.py +64 -6
  62. sglang/srt/layers/torchao_utils.py +12 -6
  63. sglang/srt/layers/vocab_parallel_embedding.py +2 -2
  64. sglang/srt/lora/lora.py +1 -9
  65. sglang/srt/managers/configure_logging.py +3 -0
  66. sglang/srt/managers/data_parallel_controller.py +79 -72
  67. sglang/srt/managers/detokenizer_manager.py +24 -6
  68. sglang/srt/managers/image_processor.py +158 -2
  69. sglang/srt/managers/io_struct.py +57 -3
  70. sglang/srt/managers/schedule_batch.py +78 -45
  71. sglang/srt/managers/schedule_policy.py +26 -12
  72. sglang/srt/managers/scheduler.py +326 -201
  73. sglang/srt/managers/session_controller.py +1 -0
  74. sglang/srt/managers/tokenizer_manager.py +210 -121
  75. sglang/srt/managers/tp_worker.py +6 -4
  76. sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
  77. sglang/srt/managers/utils.py +44 -0
  78. sglang/srt/mem_cache/memory_pool.py +10 -32
  79. sglang/srt/metrics/collector.py +15 -6
  80. sglang/srt/model_executor/cuda_graph_runner.py +26 -30
  81. sglang/srt/model_executor/forward_batch_info.py +5 -7
  82. sglang/srt/model_executor/model_runner.py +44 -19
  83. sglang/srt/model_loader/loader.py +83 -6
  84. sglang/srt/model_loader/weight_utils.py +145 -6
  85. sglang/srt/models/baichuan.py +6 -6
  86. sglang/srt/models/chatglm.py +2 -2
  87. sglang/srt/models/commandr.py +17 -5
  88. sglang/srt/models/dbrx.py +13 -5
  89. sglang/srt/models/deepseek.py +3 -3
  90. sglang/srt/models/deepseek_v2.py +11 -11
  91. sglang/srt/models/exaone.py +2 -2
  92. sglang/srt/models/gemma.py +2 -2
  93. sglang/srt/models/gemma2.py +15 -25
  94. sglang/srt/models/gpt2.py +3 -5
  95. sglang/srt/models/gpt_bigcode.py +1 -1
  96. sglang/srt/models/granite.py +2 -2
  97. sglang/srt/models/grok.py +4 -3
  98. sglang/srt/models/internlm2.py +2 -2
  99. sglang/srt/models/llama.py +7 -5
  100. sglang/srt/models/minicpm.py +2 -2
  101. sglang/srt/models/minicpm3.py +9 -9
  102. sglang/srt/models/minicpmv.py +1238 -0
  103. sglang/srt/models/mixtral.py +3 -3
  104. sglang/srt/models/mixtral_quant.py +3 -3
  105. sglang/srt/models/mllama.py +2 -2
  106. sglang/srt/models/olmo.py +3 -3
  107. sglang/srt/models/olmo2.py +4 -4
  108. sglang/srt/models/olmoe.py +7 -13
  109. sglang/srt/models/phi3_small.py +2 -2
  110. sglang/srt/models/qwen.py +2 -2
  111. sglang/srt/models/qwen2.py +41 -4
  112. sglang/srt/models/qwen2_moe.py +3 -3
  113. sglang/srt/models/qwen2_vl.py +22 -122
  114. sglang/srt/models/stablelm.py +2 -2
  115. sglang/srt/models/torch_native_llama.py +20 -7
  116. sglang/srt/models/xverse.py +6 -6
  117. sglang/srt/models/xverse_moe.py +6 -6
  118. sglang/srt/openai_api/adapter.py +139 -37
  119. sglang/srt/openai_api/protocol.py +7 -4
  120. sglang/srt/sampling/custom_logit_processor.py +38 -0
  121. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
  122. sglang/srt/sampling/sampling_batch_info.py +143 -18
  123. sglang/srt/sampling/sampling_params.py +3 -1
  124. sglang/srt/server.py +4 -1090
  125. sglang/srt/server_args.py +77 -15
  126. sglang/srt/speculative/eagle_utils.py +37 -15
  127. sglang/srt/speculative/eagle_worker.py +11 -13
  128. sglang/srt/utils.py +164 -129
  129. sglang/test/runners.py +8 -13
  130. sglang/test/test_programs.py +2 -1
  131. sglang/test/test_utils.py +83 -22
  132. sglang/utils.py +12 -2
  133. sglang/version.py +1 -1
  134. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/METADATA +21 -10
  135. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/RECORD +138 -123
  136. sglang/launch_server_llavavid.py +0 -25
  137. sglang/srt/constrained/__init__.py +0 -16
  138. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  139. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
  140. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
  141. {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
@@ -27,7 +27,7 @@ import logging
  import threading
  from enum import IntEnum
  from functools import wraps
- from typing import List, Tuple, Union
+ from typing import List, Optional, Tuple, Union
 
  import numpy as np
  import psutil
@@ -49,7 +49,6 @@ class ReqToTokenPool:
  size: int,
  max_context_len: int,
  device: str,
- use_records: bool,
  enable_memory_saver: bool,
  ):
  memory_saver_adapter = TorchMemorySaverAdapter.create(
@@ -64,17 +63,9 @@ class ReqToTokenPool:
  (size, max_context_len), dtype=torch.int32, device=device
  )
  self.free_slots = list(range(size))
- self.write_records = []
- self.use_records = use_records
-
- if self.use_records:
- self.write = self.write_with_records
- else:
- self.write = self.write_without_records
 
  def write(self, indices, values):
- # Keep the signature for type checking. It will be assigned during runtime.
- raise NotImplementedError()
+ self.req_to_token[indices] = values
 
  def available_size(self):
  return len(self.free_slots)
@@ -96,23 +87,6 @@ class ReqToTokenPool:
 
  def clear(self):
  self.free_slots = list(range(self.size))
- self.write_records = []
-
- def write_without_records(self, indices, values):
- self.req_to_token[indices] = values
-
- def write_with_records(self, indices, values):
- self.req_to_token[indices] = values
- self.write_records.append((indices, values))
-
- def get_write_records(self):
- ret = self.write_records
- self.write_records = []
- return ret
-
- def apply_write_records(self, write_records: List[Tuple]):
- for indices, values in write_records:
- self.req_to_token[indices] = values
 
 
  class BaseTokenToKVPool:
@@ -296,13 +270,17 @@ class MHATokenToKVPool(BaseTokenToKVPool):
  loc: torch.Tensor,
  cache_k: torch.Tensor,
  cache_v: torch.Tensor,
- k_scale: float = 1.0,
- v_scale: float = 1.0,
+ k_scale: Optional[float] = None,
+ v_scale: Optional[float] = None,
  ):
  layer_id = layer.layer_id
  if cache_k.dtype != self.dtype:
- cache_k = (cache_k / k_scale).to(self.dtype)
- cache_v = (cache_v / v_scale).to(self.dtype)
+ if k_scale is not None:
+ cache_k.div_(k_scale)
+ if v_scale is not None:
+ cache_v.div_(v_scale)
+ cache_k = cache_k.to(self.dtype)
+ cache_v = cache_v.to(self.dtype)
  if self.store_dtype != self.dtype:
  self.k_buffer[layer_id][loc] = cache_k.view(self.store_dtype)
  self.v_buffer[layer_id][loc] = cache_v.view(self.store_dtype)
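The hunks above appear to come from sglang/srt/mem_cache/memory_pool.py (entry 78 in the file list): the write-record machinery of ReqToTokenPool is removed, write() becomes a plain assignment, and the KV-cache setter now treats k_scale/v_scale as optional, dividing in place only when a scale is given before a single dtype cast. Below is a minimal standalone sketch of that optional-scale path; the helper name and the bfloat16 target are illustrative only (the real pool casts to its configured KV-cache storage dtype, e.g. an fp8 format).

```python
from typing import Optional, Tuple

import torch


def cast_kv_with_optional_scale(
    cache_k: torch.Tensor,
    cache_v: torch.Tensor,
    target_dtype: torch.dtype,
    k_scale: Optional[float] = None,
    v_scale: Optional[float] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Hypothetical helper mirroring the new logic: divide in place only
    when a scale is provided, then cast once to the storage dtype."""
    if cache_k.dtype != target_dtype:
        if k_scale is not None:
            cache_k.div_(k_scale)  # in-place, no extra temporary tensor
        if v_scale is not None:
            cache_v.div_(v_scale)
        cache_k = cache_k.to(target_dtype)
        cache_v = cache_v.to(target_dtype)
    return cache_k, cache_v


k = torch.randn(4, 8, dtype=torch.float16)
v = torch.randn(4, 8, dtype=torch.float16)
# The old code divided by a default scale of 1.0 unconditionally; with
# None meaning "no scaling", unscaled caches now skip the division.
k_cast, v_cast = cast_kv_with_optional_scale(k, v, torch.bfloat16, k_scale=2.0)
print(k_cast.dtype, v_cast.dtype)  # torch.bfloat16 torch.bfloat16
```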
@@ -25,6 +25,7 @@ class SchedulerStats:
  gen_throughput: float = 0.0
  num_queue_reqs: int = 0
  cache_hit_rate: float = 0.0
+ spec_accept_length: float = 0.0
 
 
  class SchedulerMetricsCollector:
@@ -37,42 +38,49 @@ class SchedulerMetricsCollector:
 
  self.num_running_reqs = Gauge(
  name="sglang:num_running_reqs",
- documentation="The number of running requests",
+ documentation="The number of running requests.",
  labelnames=labels.keys(),
  multiprocess_mode="sum",
  )
 
  self.num_used_tokens = Gauge(
  name="sglang:num_used_tokens",
- documentation="The number of used tokens",
+ documentation="The number of used tokens.",
  labelnames=labels.keys(),
  multiprocess_mode="sum",
  )
 
  self.token_usage = Gauge(
  name="sglang:token_usage",
- documentation="The token usage",
+ documentation="The token usage.",
  labelnames=labels.keys(),
  multiprocess_mode="mostrecent",
  )
 
  self.gen_throughput = Gauge(
  name="sglang:gen_throughput",
- documentation="The generate throughput (token/s)",
+ documentation="The generation throughput (token/s).",
  labelnames=labels.keys(),
  multiprocess_mode="sum",
  )
 
  self.num_queue_reqs = Gauge(
  name="sglang:num_queue_reqs",
- documentation="The number of requests in the waiting queue",
+ documentation="The number of requests in the waiting queue.",
  labelnames=labels.keys(),
  multiprocess_mode="sum",
  )
 
  self.cache_hit_rate = Gauge(
  name="sglang:cache_hit_rate",
- documentation="The cache hit rate",
+ documentation="The prefix cache hit rate.",
+ labelnames=labels.keys(),
+ multiprocess_mode="mostrecent",
+ )
+
+ self.spec_accept_length = Gauge(
+ name="sglang:spec_accept_length",
+ documentation="The average acceptance length of speculative decoding.",
  labelnames=labels.keys(),
  multiprocess_mode="mostrecent",
  )
@@ -88,6 +96,7 @@ class SchedulerMetricsCollector:
  self._log_gauge(self.gen_throughput, stats.gen_throughput)
  self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
  self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
+ self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
 
 
  class TokenizerMetricsCollector:
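These hunks appear to be from sglang/srt/metrics/collector.py (entry 79): SchedulerStats gains a spec_accept_length field, the gauge documentation strings are normalized, and a new sglang:spec_accept_length gauge is exported. A small sketch of defining and updating such a gauge with prometheus_client follows; the label set is a placeholder, the multiprocess_mode value only takes effect when the multiprocess collector is used, and a recent prometheus_client release (one that accepts "mostrecent") is assumed.

```python
from prometheus_client import Gauge

labels = {"model_name": "example-model"}  # placeholder label set

spec_accept_length = Gauge(
    name="sglang:spec_accept_length",
    documentation="The average acceptance length of speculative decoding.",
    labelnames=labels.keys(),
    multiprocess_mode="mostrecent",
)


def log_gauge(gauge: Gauge, value: float) -> None:
    # Mirrors the collector's _log_gauge pattern: set the labeled child.
    gauge.labels(**labels).set(value)


# e.g. an average of 3.2 accepted draft tokens per speculation step
log_gauge(spec_accept_length, 3.2)
```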
@@ -21,10 +21,10 @@ from typing import TYPE_CHECKING, Callable
 
  import torch
  import tqdm
- from vllm.distributed import get_tensor_model_parallel_rank
- from vllm.distributed.parallel_state import graph_capture
  from vllm.model_executor.custom_op import CustomOp
 
+ from sglang.srt.distributed import get_tensor_model_parallel_rank
+ from sglang.srt.distributed.parallel_state import GroupCoordinator, graph_capture
  from sglang.srt.layers.logits_processor import LogitsProcessorOutput
  from sglang.srt.layers.moe.fused_moe_native import fused_moe_forward_native
  from sglang.srt.layers.torchao_utils import save_gemlite_cache
@@ -33,13 +33,12 @@ from sglang.srt.model_executor.forward_batch_info import (
  ForwardBatch,
  ForwardMode,
  )
- from sglang.srt.utils import monkey_patch_vllm_all_gather
 
  if TYPE_CHECKING:
  from sglang.srt.model_executor.model_runner import ModelRunner
 
 
- def _to_torch(model: torch.nn.Module, reverse: bool, batch_size: int):
+ def _to_torch(model: torch.nn.Module, reverse: bool, num_tokens: int):
  for sub in model._modules.values():
  if isinstance(sub, CustomOp):
  if reverse:
@@ -48,7 +47,7 @@ def _to_torch(model: torch.nn.Module, reverse: bool, batch_size: int):
  else:
  # NOTE: Temporarily workaround MoE
  if "FusedMoE" in sub.__class__.__name__:
- if batch_size == 1:
+ if num_tokens == 1:
  # The performance of torch.compile on this layer is not always good when bs > 1,
  # so we decide to only use torch.compile when bs =1
  sub._forward_method = fused_moe_forward_native
@@ -56,23 +55,22 @@ def _to_torch(model: torch.nn.Module, reverse: bool, batch_size: int):
  sub._forward_method = sub.forward_native
  setattr(sub, "is_torch_compile", True)
  if isinstance(sub, torch.nn.Module):
- _to_torch(sub, reverse, batch_size)
+ _to_torch(sub, reverse, num_tokens)
 
 
  @contextmanager
  def patch_model(
  model: torch.nn.Module,
  enable_compile: bool,
- batch_size: int,
- tp_group: "GroupCoordinator",
+ num_tokens: int,
+ tp_group: GroupCoordinator,
  ):
  """Patch the model to make it compatible with with torch.compile"""
  backup_ca_comm = None
 
  try:
  if enable_compile:
- _to_torch(model, reverse=False, batch_size=batch_size)
- monkey_patch_vllm_all_gather()
+ _to_torch(model, reverse=False, num_tokens=num_tokens)
  backup_ca_comm = tp_group.ca_comm
  # Use custom-allreduce here.
  # We found the custom allreduce is much faster than the built-in allreduce in torch,
@@ -87,8 +85,7 @@ def patch_model(
  yield model.forward
  finally:
  if enable_compile:
- _to_torch(model, reverse=True, batch_size=batch_size)
- monkey_patch_vllm_all_gather(reverse=True)
+ _to_torch(model, reverse=True, num_tokens=num_tokens)
  tp_group.ca_comm = backup_ca_comm
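The hunks above appear to come from sglang/srt/model_executor/cuda_graph_runner.py (entry 80): _to_torch and patch_model are now keyed on num_tokens rather than batch_size, GroupCoordinator is imported from sglang.srt.distributed, and the vLLM all-gather monkey patch is dropped. Here is a toy sketch of the underlying pattern only, swapping a custom op's dispatch to its torch-native path inside a context manager so torch.compile can trace it, then restoring the original dispatch on exit. All class and function names are stand-ins, and a PyTorch build where torch.compile is usable is assumed.

```python
from contextlib import contextmanager

import torch
import torch.nn.functional as F


class ToyCustomOp(torch.nn.Module):
    """Stand-in for a CustomOp with a fused CUDA path and a torch-native path."""

    def forward_cuda(self, x):
        return F.silu(x)  # pretend this is a hand-written fused kernel

    def forward_native(self, x):
        return x * torch.sigmoid(x)  # same math, traceable pure-torch form

    def forward(self, x):
        return self._forward_method(x)

    _forward_method = forward_cuda  # default dispatch


def to_torch_native(module: torch.nn.Module, reverse: bool) -> None:
    # Recursively repoint _forward_method, mirroring the _to_torch pattern above.
    for sub in module.modules():
        if isinstance(sub, ToyCustomOp):
            sub._forward_method = sub.forward_cuda if reverse else sub.forward_native


@contextmanager
def patch_for_compile(model: torch.nn.Module, enable_compile: bool):
    try:
        if enable_compile:
            to_torch_native(model, reverse=False)
        yield torch.compile(model, dynamic=False) if enable_compile else model
    finally:
        if enable_compile:
            to_torch_native(model, reverse=True)  # restore the original dispatch


model = torch.nn.Sequential(torch.nn.Linear(8, 8), ToyCustomOp())
with patch_for_compile(model, enable_compile=True) as forward:
    y = forward(torch.randn(4, 8))
```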
@@ -122,6 +119,7 @@ class CudaGraphRunner:
  self.is_encoder_decoder = self.model_runner.model_config.is_encoder_decoder
  self.enable_dp_attention = self.model_runner.server_args.enable_dp_attention
  self.tp_size = self.model_runner.tp_size
+ self.dp_size = self.model_runner.server_args.dp_size
 
  # Batch sizes to capture
  self.capture_bs = self.model_runner.server_args.cuda_graph_bs
@@ -151,9 +149,18 @@ class CudaGraphRunner:
  and bs <= model_runner.server_args.cuda_graph_max_bs
  ]
 
+ self.compile_bs = (
+ [
+ bs
+ for bs in self.capture_bs
+ if bs <= self.model_runner.server_args.torch_compile_max_bs
+ ]
+ if self.use_torch_compile
+ else []
+ )
+
  self.capture_forward_mode = ForwardMode.DECODE
  self.num_tokens_per_bs = 1
-
  if model_runner.spec_algorithm.is_eagle():
  if self.model_runner.is_draft_worker:
  self.num_tokens_per_bs = (
@@ -165,16 +172,6 @@ class CudaGraphRunner:
  self.model_runner.server_args.speculative_num_draft_tokens
  )
 
- self.compile_bs = (
- [
- bs
- for bs in self.capture_bs
- if bs <= self.model_runner.server_args.torch_compile_max_bs
- ]
- if self.use_torch_compile
- else []
- )
-
  # Attention backend
  self.max_bs = max(self.capture_bs)
  self.max_num_token = self.max_bs * self.num_tokens_per_bs
@@ -182,7 +179,6 @@ class CudaGraphRunner:
  self.seq_len_fill_value = (
  self.model_runner.attn_backend.get_cuda_graph_seq_len_fill_value()
  )
-
  # FIXME(lsyin): leave it here for now, I don't know whether it is necessary
  self.encoder_len_fill_value = 0
 
@@ -191,14 +187,14 @@ class CudaGraphRunner:
 
  # Common inputs
  with torch.device("cuda"):
- self.input_ids = torch.zeros((self.max_num_token,), dtype=torch.int32)
+ self.input_ids = torch.zeros((self.max_num_token,), dtype=torch.int64)
  self.req_pool_indices = torch.zeros((self.max_bs,), dtype=torch.int32)
  self.seq_lens = torch.full(
  (self.max_bs,), self.seq_len_fill_value, dtype=torch.int32
  )
- self.out_cache_loc = torch.zeros((self.max_num_token,), dtype=torch.int32)
+ self.out_cache_loc = torch.zeros((self.max_num_token,), dtype=torch.int64)
  self.positions = torch.zeros((self.max_num_token,), dtype=torch.int64)
- self.mrope_positions = torch.zeros((3, self.max_bs), dtype=torch.int32)
+ self.mrope_positions = torch.zeros((3, self.max_bs), dtype=torch.int64)
 
  # Speculative_inference
  if model_runner.spec_algorithm.is_eagle():
@@ -218,7 +214,7 @@ class CudaGraphRunner:
  if self.enable_dp_attention:
  self.gathered_buffer = torch.zeros(
  (
- self.max_bs * self.tp_size,
+ self.max_bs * self.dp_size,
  self.model_runner.model_config.hidden_size,
  ),
  dtype=self.model_runner.dtype,
@@ -287,8 +283,8 @@ class CudaGraphRunner:
  with patch_model(
  self.model_runner.model,
  bs in self.compile_bs,
- bs,
- self.model_runner.tp_group,
+ num_tokens=bs * self.num_tokens_per_bs,
+ tp_group=self.model_runner.tp_group,
  ) as forward:
  (
  graph,
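Continuing in cuda_graph_runner.py, compile_bs is now computed before the EAGLE branch, the input_ids/out_cache_loc/mrope_positions capture buffers switch to int64, the DP-attention gather buffer is sized by dp_size instead of tp_size, and patch_model receives num_tokens=bs * num_tokens_per_bs. A small sketch of that batch-size bookkeeping with made-up server-argument values (the real lists come from the server args and the speculative-decoding config):

```python
from dataclasses import dataclass


@dataclass
class FakeServerArgs:
    # Hypothetical values, for illustration only.
    cuda_graph_max_bs: int = 160
    torch_compile_max_bs: int = 32
    speculative_num_draft_tokens: int = 4


args = FakeServerArgs()
use_torch_compile = True

# Candidate graph-capture batch sizes, clipped to cuda_graph_max_bs.
capture_bs = [
    bs for bs in (1, 2, 4, 8, 16, 32, 64, 128, 160, 256)
    if bs <= args.cuda_graph_max_bs
]

# Only the smaller sizes are additionally compiled with torch.compile.
compile_bs = (
    [bs for bs in capture_bs if bs <= args.torch_compile_max_bs]
    if use_torch_compile
    else []
)

# With EAGLE speculative decoding each batch slot carries several draft
# tokens, so the captured graph input length is bs * num_tokens_per_bs.
num_tokens_per_bs = args.speculative_num_draft_tokens
num_tokens = {bs: bs * num_tokens_per_bs for bs in capture_bs}

print(capture_bs)     # [1, 2, 4, 8, 16, 32, 64, 128, 160]
print(compile_bs)     # [1, 2, 4, 8, 16, 32]
print(num_tokens[8])  # 32 tokens captured for a batch of 8
```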
@@ -38,7 +38,7 @@ import triton
  import triton.language as tl
 
  from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
- from sglang.srt.utils import maybe_torch_compile
+ from sglang.srt.utils import get_compiler_backend
 
  if TYPE_CHECKING:
  from sglang.srt.layers.attention import AttentionBackend
@@ -282,6 +282,9 @@ class ForwardBatch:
  can_run_dp_cuda_graph=batch.can_run_dp_cuda_graph,
  lora_paths=batch.lora_paths,
  sampling_info=batch.sampling_info,
+ req_to_token_pool=model_runner.req_to_token_pool,
+ token_to_kv_pool=model_runner.token_to_kv_pool,
+ attn_backend=model_runner.attn_backend,
  spec_algorithm=batch.spec_algorithm,
  spec_info=batch.spec_info,
  capture_hidden_mode=batch.capture_hidden_mode,
@@ -336,11 +339,6 @@ class ForwardBatch:
  if model_runner.model_is_mrope:
  ret.compute_mrope_positions(model_runner, batch)
 
- # Init attention information
- ret.req_to_token_pool = model_runner.req_to_token_pool
- ret.token_to_kv_pool = model_runner.token_to_kv_pool
- ret.attn_backend = model_runner.attn_backend
-
  # Init lora information
  if model_runner.server_args.lora_paths is not None:
  model_runner.lora_manager.prepare_lora_batch(ret)
@@ -417,6 +415,6 @@ def compute_position_torch(
  return positions.to(torch.int64), extend_start_loc
 
 
- @maybe_torch_compile(dynamic=True)
+ @torch.compile(dynamic=True, backend=get_compiler_backend())
  def clamp_position(seq_lens):
  return torch.clamp((seq_lens - 1), min=0).to(torch.int64)
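These hunks appear to be from sglang/srt/model_executor/forward_batch_info.py (entry 81): the KV pools and attention backend are now passed to the ForwardBatch constructor instead of being attached afterwards, and the maybe_torch_compile wrapper is replaced by a direct torch.compile call with a backend chosen by get_compiler_backend. A self-contained sketch of that decorator pattern; the backend selector below is a stand-in (the real helper lives in sglang.srt.utils and presumably picks a backend per device, "inductor" being torch.compile's default):

```python
import torch


def get_compiler_backend() -> str:
    # Stand-in for the real helper: assume the default inductor backend;
    # the actual function may return something else on non-CUDA devices.
    return "inductor"


@torch.compile(dynamic=True, backend=get_compiler_backend())
def clamp_position(seq_lens: torch.Tensor) -> torch.Tensor:
    # Index of the last token of each sequence, clamped to 0 for empty ones.
    return torch.clamp((seq_lens - 1), min=0).to(torch.int64)


print(clamp_position(torch.tensor([0, 1, 5])))  # tensor([0, 0, 4])
```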
@@ -21,20 +21,26 @@ from typing import List, Optional, Tuple
 
  import torch
  import torch.distributed as dist
- from vllm.distributed import (
+
+ from sglang.srt.configs.device_config import DeviceConfig
+ from sglang.srt.configs.load_config import LoadConfig
+ from sglang.srt.configs.model_config import AttentionArch, ModelConfig
+ from sglang.srt.distributed import (
  get_tp_group,
  init_distributed_environment,
  initialize_model_parallel,
  set_custom_all_reduce,
  )
-
- from sglang.srt.configs.device_config import DeviceConfig
- from sglang.srt.configs.load_config import LoadConfig
- from sglang.srt.configs.model_config import AttentionArch, ModelConfig
+ from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state
  from sglang.srt.layers.attention.double_sparsity_backend import DoubleSparseAttnBackend
  from sglang.srt.layers.attention.flashinfer_backend import FlashInferAttnBackend
  from sglang.srt.layers.attention.torch_native_backend import TorchNativeAttnBackend
  from sglang.srt.layers.attention.triton_backend import TritonAttnBackend
+ from sglang.srt.layers.dp_attention import (
+ get_attention_tp_group,
+ get_attention_tp_size,
+ initialize_dp_attention,
+ )
  from sglang.srt.layers.logits_processor import LogitsProcessorOutput
  from sglang.srt.layers.sampler import Sampler
  from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
@@ -57,8 +63,8 @@ from sglang.srt.utils import (
  init_custom_process_group,
  is_cuda,
  is_hip,
+ monkey_patch_p2p_access_check,
  monkey_patch_vllm_gguf_config,
- monkey_patch_vllm_p2p_access_check,
  set_cpu_offload_max_bytes,
  )
 
@@ -101,8 +107,10 @@ class ModelRunner:
  self.model_config.attention_arch == AttentionArch.MLA
  and not self.server_args.disable_mla
  ):
- logger.info("MLA optimization is turned on. Use triton backend.")
- self.server_args.attention_backend = "triton"
+ # TODO: add MLA optimization on CPU
+ if self.server_args.device != "cpu":
+ logger.info("MLA optimization is turned on. Use triton backend.")
+ self.server_args.attention_backend = "triton"
 
  if self.server_args.enable_double_sparsity:
  logger.info(
@@ -159,6 +167,7 @@ class ModelRunner:
  "enable_nan_detection": server_args.enable_nan_detection,
  "enable_dp_attention": server_args.enable_dp_attention,
  "enable_ep_moe": server_args.enable_ep_moe,
+ "device": server_args.device,
  }
  )
 
@@ -176,9 +185,12 @@ class ModelRunner:
  self.load_model()
 
  # Apply torchao quantization
- apply_torchao_config_to_model(
- self.model, global_server_args_dict["torchao_config"]
- )
+ torchao_applied = getattr(self.model, "torchao_applied", False)
+ # In layered loading, torchao may have been applied
+ if not torchao_applied:
+ apply_torchao_config_to_model(
+ self.model, global_server_args_dict["torchao_config"]
+ )
 
  # Apply torch TP if the model supports it
  supports_torch_tp = getattr(self.model, "supports_torch_tp", False)
@@ -206,7 +218,7 @@ class ModelRunner:
 
  def init_torch_distributed(self):
  logger.info("Init torch distributed begin.")
- # Init torch distributed
+
  torch.get_device_module(self.device).set_device(self.gpu_id)
  if self.device == "cuda":
  backend = "nccl"
@@ -216,9 +228,12 @@ class ModelRunner:
  backend = "gloo"
  elif self.device == "hpu":
  backend = "hccl"
+ elif self.device == "cpu":
+ backend = "gloo"
 
  if not self.server_args.enable_p2p_check:
- monkey_patch_vllm_p2p_access_check(self.gpu_id)
+ monkey_patch_p2p_access_check()
+
  if self.server_args.dist_init_addr:
  dist_init_method = f"tcp://{self.server_args.dist_init_addr}"
  else:
@@ -226,7 +241,7 @@ class ModelRunner:
  set_custom_all_reduce(not self.server_args.disable_custom_all_reduce)
 
  if not self.is_draft_worker:
- # Only initilzie the distributed environment on the target model worker.
+ # Only initialize the distributed environment on the target model worker.
  init_distributed_environment(
  backend=backend,
  world_size=self.tp_size,
@@ -235,11 +250,18 @@ class ModelRunner:
  distributed_init_method=dist_init_method,
  )
  initialize_model_parallel(tensor_model_parallel_size=self.tp_size)
+ initialize_dp_attention(
+ enable_dp_attention=self.server_args.enable_dp_attention,
+ tp_rank=self.tp_rank,
+ tp_size=self.tp_size,
+ dp_size=self.server_args.dp_size,
+ )
 
  min_per_gpu_memory = get_available_gpu_memory(
  self.device, self.gpu_id, distributed=self.tp_size > 1
  )
  self.tp_group = get_tp_group()
+ self.attention_tp_group = get_attention_tp_group()
 
  # Check memory for tensor parallelism
  if self.tp_size > 1:
@@ -257,7 +279,8 @@ class ModelRunner:
  )
 
  # This can reduce thread conflicts and speed up weight loading.
- torch.set_num_threads(1)
+ if self.device != "cpu":
+ torch.set_num_threads(1)
  if self.device == "cuda":
  if torch.cuda.get_device_capability()[0] < 8:
  logger.info(
@@ -277,12 +300,15 @@ class ModelRunner:
  monkey_patch_vllm_gguf_config()
 
  # Load the model
+ # Remove monkey_patch when linear.py quant remove dependencies with vllm
+ monkey_patch_vllm_parallel_state()
  with self.memory_saver_adapter.region():
  self.model = get_model(
  model_config=self.model_config,
  load_config=self.load_config,
  device_config=DeviceConfig(self.device),
  )
+ monkey_patch_vllm_parallel_state(reverse=True)
 
  if self.server_args.kv_cache_dtype == "fp8_e4m3":
  if self.server_args.quantization_param_path is not None:
@@ -521,7 +547,7 @@ class ModelRunner:
  )
  else:
  cell_size = (
- self.model_config.get_num_kv_heads(self.tp_size)
+ self.model_config.get_num_kv_heads(get_attention_tp_size())
  * self.model_config.head_dim
  * self.model_config.num_hidden_layers
  * 2
@@ -595,7 +621,6 @@ class ModelRunner:
  size=max_num_reqs + 1,
  max_context_len=self.model_config.context_len + 4,
  device=self.device,
- use_records=False,
  enable_memory_saver=self.server_args.enable_memory_saver,
  )
  if (
@@ -615,7 +640,7 @@ class ModelRunner:
  self.token_to_kv_pool = DoubleSparseTokenToKVPool(
  self.max_total_num_tokens,
  dtype=self.kv_cache_dtype,
- head_num=self.model_config.get_num_kv_heads(self.tp_size),
+ head_num=self.model_config.get_num_kv_heads(get_attention_tp_size()),
  head_dim=self.model_config.head_dim,
  layer_num=self.model_config.num_hidden_layers,
  device=self.device,
@@ -626,7 +651,7 @@ class ModelRunner:
  self.token_to_kv_pool = MHATokenToKVPool(
  self.max_total_num_tokens,
  dtype=self.kv_cache_dtype,
- head_num=self.model_config.get_num_kv_heads(self.tp_size),
+ head_num=self.model_config.get_num_kv_heads(get_attention_tp_size()),
  head_dim=self.model_config.head_dim,
  layer_num=self.model_config.num_hidden_layers,
  device=self.device,
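The hunks above appear to come from sglang/srt/model_executor/model_runner.py (entry 82): imports move from vllm.distributed to sglang.srt.distributed, CPU is accepted as a device (gloo backend, MLA left off, threads not pinned), DP attention gets its own process-group initialization, and KV-cache sizing switches from tp_size to get_attention_tp_size(). A back-of-envelope sketch of that cell-size arithmetic with illustrative numbers; the assumption that the attention TP size is tp_size // dp_size under DP attention is mine, not stated in this diff.

```python
def kv_cell_size_bytes(
    total_kv_heads: int,
    head_dim: int,
    num_layers: int,
    attention_tp_size: int,
    dtype_bytes: int = 2,  # e.g. fp16/bf16 KV cache
) -> int:
    # Per-token KV-cache footprint on one rank: K and V for every layer,
    # with KV heads sharded across the attention TP group.
    heads_per_rank = max(1, total_kv_heads // attention_tp_size)
    return heads_per_rank * head_dim * num_layers * 2 * dtype_bytes


tp_size, dp_size = 8, 2
attention_tp_size = tp_size // dp_size  # assumed relationship under DP attention

# Illustrative Llama-like shape: 8 KV heads, head_dim 128, 32 layers.
print(kv_cell_size_bytes(8, 128, 32, attention_tp_size))  # 32768 bytes/token
print(kv_cell_size_bytes(8, 128, 32, tp_size))            # 16384 bytes/token
```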
@@ -21,14 +21,14 @@ from huggingface_hub import HfApi, hf_hub_download
  from torch import nn
  from transformers import AutoModelForCausalLM, PretrainedConfig
  from transformers.utils import SAFE_WEIGHTS_INDEX_NAME
- from vllm.distributed import (
- get_tensor_model_parallel_rank,
- get_tensor_model_parallel_world_size,
- )
 
  from sglang.srt.configs.device_config import DeviceConfig
  from sglang.srt.configs.load_config import LoadConfig, LoadFormat
  from sglang.srt.configs.model_config import ModelConfig
+ from sglang.srt.distributed import (
+ get_tensor_model_parallel_rank,
+ get_tensor_model_parallel_world_size,
+ )
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.model_loader.utils import (
  get_model_architecture,
@@ -374,6 +374,78 @@ class DefaultModelLoader(BaseModelLoader):
  return model.eval()
 
 
+ class LayeredModelLoader(DefaultModelLoader):
+ """Model loader that loads weights layer by layer so that one can quantize a
+ layer before loading another to make the peak memory envelope smaller."""
+
+ def __init__(self, load_config: LoadConfig):
+ # Back to the default load format
+ load_config.load_format = LoadFormat.AUTO
+ super().__init__(load_config)
+
+ def load_model(
+ self,
+ *,
+ model_config: ModelConfig,
+ device_config: DeviceConfig,
+ ) -> nn.Module:
+ from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
+ from sglang.srt.managers.schedule_batch import global_server_args_dict
+
+ torchao_config = global_server_args_dict.get("torchao_config")
+ target_device = torch.device(device_config.device)
+
+ with set_default_torch_dtype(model_config.dtype):
+ # Create model on meta device
+ with torch.device("meta"):
+ model = _initialize_model(
+ model_config,
+ self.load_config,
+ )
+
+ # Check model's layered load support
+ if not hasattr(model, "load_weights_to_module"):
+ raise ValueError(
+ "LayeredModelLoader requires the model to have a "
+ "`load_weights_to_module` method. "
+ f"{model_config.model_path} does not support it."
+ )
+
+ # Get all weights from disk
+ weights = self._get_all_weights(model_config, model)
+
+ # Helper function to recursively fill the weights of a module
+ def fill_module(module, fqn: List[str], weights):
+ """
+ fqn: list of strings representing the fully qualified name of `module`.
+ """
+ # Layer by layer
+ for name, submod in module.named_children():
+ fill_module(submod, fqn + [name], weights)
+
+ # First materialize on target device
+ module.to_empty(device=target_device, recurse=False)
+ fqn_path = ".".join(fqn)
+ # Fill weights
+ model.load_weights_to_module(
+ fqn_path,
+ weights,
+ )
+ # Quantize weights if applicable
+ if torchao_config and "proj" in fqn_path:
+ # Note: `None` here is needed to indicate no filter, see
+ # `apply_torchao_config_to_model` for details.
+ apply_torchao_config_to_model(module, torchao_config, None)
+
+ # Start calling on root module
+ fill_module(model, [], weights)
+
+ if torchao_config:
+ model.torchao_applied = True
+
+ return model.eval()
+
+
  class DummyModelLoader(BaseModelLoader):
  """Model loader that will set model weights to random values."""
 
@@ -496,7 +568,8 @@ class ShardedStateLoader(BaseModelLoader):
  device_config: DeviceConfig,
  ) -> nn.Module:
  from safetensors.torch import safe_open
- from vllm.distributed import get_tensor_model_parallel_rank
+
+ from sglang.srt.distributed import get_tensor_model_parallel_rank
 
  local_model_path = self._prepare_weights(
  model_config.model_path, model_config.revision
@@ -556,7 +629,8 @@ class ShardedStateLoader(BaseModelLoader):
  max_size: Optional[int] = None,
  ) -> None:
  from safetensors.torch import save_file
- from vllm.distributed import get_tensor_model_parallel_rank
+
+ from sglang.srt.distributed import get_tensor_model_parallel_rank
 
  if pattern is None:
  pattern = ShardedStateLoader.DEFAULT_PATTERN
@@ -1147,4 +1221,7 @@ def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
  if load_config.load_format == LoadFormat.GGUF:
  return GGUFModelLoader(load_config)
 
+ if load_config.load_format == LoadFormat.LAYERED:
+ return LayeredModelLoader(load_config)
+
  return DefaultModelLoader(load_config)
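Finally, these hunks appear to be from sglang/srt/model_loader/loader.py (entry 83): a new LayeredModelLoader materializes and fills the model module by module in post-order, optionally quantizing projection submodules as it goes, and get_model_loader now dispatches LoadFormat.LAYERED to it. Below is a minimal standalone sketch of the same "materialize, fill, then quantize" traversal over a meta-device model; the weight-copy loop and the quantize placeholder stand in for the model's load_weights_to_module and torchao, which are not reproduced here, and a PyTorch version where Module.to_empty(recurse=False) is available is assumed.

```python
from typing import Callable, Dict, List

import torch
from torch import nn


def fill_module(
    module: nn.Module,
    fqn: List[str],
    weights: Dict[str, torch.Tensor],
    target_device: torch.device,
    quantize: Callable[[nn.Module], None],
) -> None:
    # Children first (post-order), so leaves are materialized and can be
    # quantized before their parents, keeping the peak memory envelope small.
    for name, child in module.named_children():
        fill_module(child, fqn + [name], weights, target_device, quantize)

    # Materialize only this module's own parameters off the meta device...
    module.to_empty(device=target_device, recurse=False)
    prefix = ".".join(fqn)
    # ...copy in the matching checkpoint tensors...
    for pname, param in module.named_parameters(recurse=False):
        key = f"{prefix}.{pname}" if prefix else pname
        if key in weights:
            param.data.copy_(weights[key])
    # ...then hand the filled module to the (placeholder) quantizer.
    if "proj" in prefix:
        quantize(module)


with torch.device("meta"):
    model = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 4))

# Pretend checkpoint: CPU tensors keyed by fully qualified parameter name.
weights = {name: torch.randn(p.shape) for name, p in model.named_parameters()}
fill_module(model, [], weights, torch.device("cpu"), quantize=lambda m: None)
print(next(model.parameters()).device)  # cpu -- no longer on the meta device
```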