sglang 0.3.5.post1__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +1 -553
- sglang/bench_offline_throughput.py +337 -0
- sglang/bench_one_batch.py +474 -0
- sglang/{bench_server_latency.py → bench_one_batch_server.py} +3 -3
- sglang/bench_serving.py +115 -31
- sglang/check_env.py +3 -6
- sglang/srt/constrained/base_grammar_backend.py +4 -3
- sglang/srt/constrained/outlines_backend.py +39 -26
- sglang/srt/constrained/xgrammar_backend.py +58 -14
- sglang/srt/layers/activation.py +3 -0
- sglang/srt/layers/attention/flashinfer_backend.py +93 -48
- sglang/srt/layers/attention/triton_backend.py +9 -7
- sglang/srt/layers/custom_op_util.py +26 -0
- sglang/srt/layers/fused_moe/fused_moe.py +11 -4
- sglang/srt/layers/fused_moe/patch.py +4 -2
- sglang/srt/layers/layernorm.py +4 -0
- sglang/srt/layers/logits_processor.py +10 -10
- sglang/srt/layers/sampler.py +4 -8
- sglang/srt/layers/torchao_utils.py +2 -0
- sglang/srt/managers/data_parallel_controller.py +74 -9
- sglang/srt/managers/detokenizer_manager.py +1 -14
- sglang/srt/managers/io_struct.py +27 -0
- sglang/srt/managers/schedule_batch.py +104 -38
- sglang/srt/managers/schedule_policy.py +5 -1
- sglang/srt/managers/scheduler.py +210 -56
- sglang/srt/managers/session_controller.py +62 -0
- sglang/srt/managers/tokenizer_manager.py +38 -0
- sglang/srt/managers/tp_worker.py +12 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +49 -52
- sglang/srt/model_executor/cuda_graph_runner.py +43 -6
- sglang/srt/model_executor/forward_batch_info.py +109 -15
- sglang/srt/model_executor/model_runner.py +102 -43
- sglang/srt/model_parallel.py +98 -0
- sglang/srt/models/deepseek_v2.py +147 -44
- sglang/srt/models/gemma2.py +9 -8
- sglang/srt/models/llava.py +1 -1
- sglang/srt/models/llavavid.py +1 -1
- sglang/srt/models/olmo.py +3 -3
- sglang/srt/models/phi3_small.py +447 -0
- sglang/srt/models/qwen2_vl.py +13 -6
- sglang/srt/models/torch_native_llama.py +94 -78
- sglang/srt/openai_api/adapter.py +11 -4
- sglang/srt/openai_api/protocol.py +30 -27
- sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
- sglang/srt/sampling/sampling_batch_info.py +58 -57
- sglang/srt/sampling/sampling_params.py +3 -3
- sglang/srt/server.py +29 -2
- sglang/srt/server_args.py +97 -60
- sglang/srt/utils.py +103 -51
- sglang/test/runners.py +25 -6
- sglang/test/srt/sampling/penaltylib/utils.py +23 -21
- sglang/test/test_utils.py +33 -22
- sglang/version.py +1 -1
- {sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/METADATA +43 -43
- {sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/RECORD +62 -56
- {sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/WHEEL +1 -1
- {sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/LICENSE +0 -0
- {sglang-0.3.5.post1.dist-info → sglang-0.3.6.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -18,6 +18,7 @@ limitations under the License.
 import gc
 import importlib
 import importlib.resources
+import inspect
 import json
 import logging
 import pkgutil
@@ -56,9 +57,11 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
+    crash_on_warnings,
     enable_show_time_cost,
     get_available_gpu_memory,
-
+    is_hip,
+    monkey_patch_vllm_model_config,
     monkey_patch_vllm_p2p_access_check,
 )

@@ -113,7 +116,7 @@ class ModelRunner:
         )

         if self.is_multimodal:
-            logger.
+            logger.info(
                 "Automatically turn off --chunked-prefill-size and adjust --mem-fraction-static for multimodal models."
             )
             server_args.chunked_prefill_size = None
@@ -139,8 +142,8 @@ class ModelRunner:
                 "triton_attention_reduce_in_fp32": server_args.triton_attention_reduce_in_fp32,
                 "disable_mla": server_args.disable_mla,
                 "torchao_config": server_args.torchao_config,
-                "
-                "
+                "enable_nan_detection": server_args.enable_nan_detection,
+                "enable_dp_attention": server_args.enable_dp_attention,
             }
         )

@@ -148,6 +151,15 @@ class ModelRunner:
         min_per_gpu_memory = self.init_torch_distributed()
         self.sampler = Sampler()
         self.load_model()
+
+        # Apply torch TP if model supports it
+        supports_torch_tp = getattr(self.model, "supports_torch_tp", False)
+        if self.tp_size > 1 and supports_torch_tp:
+            self.apply_torch_tp()
+            self.torch_tp_applied = True
+        else:
+            self.torch_tp_applied = False
+
         if server_args.lora_paths is not None:
             self.init_lora_manager()
         self.init_memory_pool(
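The torch TP gate above is an attribute lookup on the loaded model: only models that advertise `supports_torch_tp` are wrapped. A minimal sketch of that gating logic; `ToyModel` and `maybe_apply_torch_tp` are illustrative stand-ins, not sglang APIs.

```python
class ToyModel:
    supports_torch_tp = True  # models without this attribute default to False


def maybe_apply_torch_tp(model, tp_size, apply_fn):
    supports_torch_tp = getattr(model, "supports_torch_tp", False)
    if tp_size > 1 and supports_torch_tp:
        apply_fn(model)
        return True
    return False


print(maybe_apply_torch_tp(ToyModel(), tp_size=2, apply_fn=lambda m: None))  # True
print(maybe_apply_torch_tp(object(), tp_size=2, apply_fn=lambda m: None))    # False
```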
@@ -215,6 +227,47 @@ class ModelRunner:

         return min_per_gpu_memory

+    def setup_model(self):
+        try:
+            from vllm.config import VllmConfig
+
+            vllm_config = VllmConfig()
+            vllm_config.model_config = self.vllm_model_config
+            vllm_config.load_config = self.load_config
+            vllm_config.device_config = DeviceConfig(self.device)
+            vllm_config.quant_config = VllmConfig._get_quantization_config(
+                vllm_config.model_config, vllm_config.load_config
+            )
+            return get_model(vllm_config=vllm_config)
+        except ImportError:
+            return get_model(
+                model_config=self.vllm_model_config,
+                load_config=self.load_config,
+                device_config=DeviceConfig(self.device),
+                parallel_config=None,
+                scheduler_config=None,
+                lora_config=None,
+                cache_config=None,
+            )
+
+    def get_model_config_params(self):
+        sig = inspect.signature(VllmModelConfig.__init__)
+        params = {
+            "model": self.server_args.model_path,
+            "quantization": self.server_args.quantization,
+            "tokenizer": None,
+            "tokenizer_mode": None,
+            "trust_remote_code": self.server_args.trust_remote_code,
+            "dtype": self.server_args.dtype,
+            "seed": self.server_args.random_seed,
+            "skip_tokenizer_init": True,
+        }
+
+        if "task" in sig.parameters:
+            params["task"] = ""
+
+        return params
+
     def load_model(self):
         logger.info(
             f"Load weight begin. avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
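The `inspect.signature` probe in `get_model_config_params` is a version-compatibility trick: a keyword is only passed when the target constructor actually declares it. A small sketch of the same idea with stand-in config classes (not vllm's):

```python
import inspect


class ConfigV1:
    def __init__(self, model, dtype):
        self.model, self.dtype = model, dtype


class ConfigV2:
    def __init__(self, model, dtype, task=""):
        self.model, self.dtype, self.task = model, dtype, task


def build_params(config_cls, model, dtype):
    params = {"model": model, "dtype": dtype}
    # Only add "task" when the constructor accepts it.
    if "task" in inspect.signature(config_cls.__init__).parameters:
        params["task"] = ""
    return params


print(build_params(ConfigV1, "my-model", "auto"))  # no "task" key
print(build_params(ConfigV2, "my-model", "auto"))  # includes "task"
```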
@@ -232,39 +285,25 @@ class ModelRunner:
             raise RuntimeError("SGLang only supports sm75 and above.")

         # Prepare the vllm model config
-
-
-
-            model=self.server_args.model_path,
-            quantization=self.server_args.quantization,
-            tokenizer=None,
-            tokenizer_mode=None,
-            trust_remote_code=self.server_args.trust_remote_code,
-            dtype=self.server_args.dtype,
-            seed=self.server_args.random_seed,
-            skip_tokenizer_init=True,
+        self.load_config = LoadConfig(
+            load_format=self.server_args.load_format,
+            download_dir=self.server_args.download_dir,
         )
+        monkey_patch_vllm_model_config()
+        self.vllm_model_config = VllmModelConfig(**self.get_model_config_params())
         if self.model_config.model_override_args is not None:
             self.vllm_model_config.hf_config.update(
                 self.model_config.model_override_args
             )
-        self.dtype = self.vllm_model_config.dtype

-
-
-            model_config=self.vllm_model_config,
-            load_config=self.load_config,
-            device_config=DeviceConfig(self.device),
-            parallel_config=None,
-            scheduler_config=None,
-            lora_config=None,
-            cache_config=None,
-        )
+        self.model = self.setup_model()
+
         self.sliding_window_size = (
             self.model.get_attention_sliding_window_size()
             if hasattr(self.model, "get_attention_sliding_window_size")
             else None
         )
+        self.dtype = self.vllm_model_config.dtype

         logger.info(
             f"Load weight end. "
@@ -290,17 +329,9 @@ class ModelRunner:
         target_device = torch.device(self.device)

         try:
-
-
-
-                quantization=self.server_args.quantization,
-                tokenizer=None,
-                tokenizer_mode=None,
-                trust_remote_code=self.server_args.trust_remote_code,
-                dtype=self.server_args.dtype,
-                seed=self.server_args.random_seed,
-                skip_tokenizer_init=True,
-            )
+            model_config_params = self.get_model_config_params()
+            model_config_params["model"] = model_path
+            vllm_model_config = VllmModelConfig(**model_config_params)
         except Exception as e:
             message = f"Failed to load model config: {e}."
             return False, message
@@ -409,7 +440,10 @@ class ModelRunner:
         if self.server_args.kv_cache_dtype == "auto":
             self.kv_cache_dtype = self.dtype
         elif self.server_args.kv_cache_dtype == "fp8_e5m2":
-
+            if is_hip():  # Using natively supported format
+                self.kv_cache_dtype = torch.float8_e5m2fnuz
+            else:
+                self.kv_cache_dtype = torch.float8_e5m2
         else:
             raise ValueError(
                 f"Unsupported kv_cache_dtype: {self.server_args.kv_cache_dtype}."
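A sketch of the platform-dependent fp8 choice above, assuming a torch build recent enough to define both float8 dtypes; `is_hip_build` stands in for sglang's `is_hip()` helper.

```python
import torch


def pick_fp8_e5m2_dtype(is_hip_build: bool) -> torch.dtype:
    # ROCm exposes the fnuz variant natively; CUDA uses the OCP e5m2 format.
    return torch.float8_e5m2fnuz if is_hip_build else torch.float8_e5m2


print(pick_fp8_e5m2_dtype(torch.version.hip is not None))
```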
@@ -548,6 +582,13 @@ class ModelRunner:
         logger.info("Capture cuda graph begin. This can take up to several minutes.")
         self.cuda_graph_runner = CudaGraphRunner(self)

+    def apply_torch_tp(self):
+        logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
+        from sglang.srt.model_parallel import tensor_parallel
+
+        device_mesh = torch.distributed.init_device_mesh(self.device, (self.tp_size,))
+        tensor_parallel(self.model, device_mesh)
+
     def forward_decode(self, forward_batch: ForwardBatch):
         if self.cuda_graph_runner and self.cuda_graph_runner.can_run(forward_batch):
             return self.cuda_graph_runner.replay(forward_batch)
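A single-process sketch of the device-mesh setup that `apply_torch_tp` performs, using the gloo backend and world size 1 so it can run on one CPU; the real runner passes its CUDA device string and the actual `tp_size`, and the rendezvous address/port below are assumed values for the demo.

```python
import os

import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29511")
dist.init_process_group("gloo", rank=0, world_size=1)

# Same call shape as apply_torch_tp(): a 1-D mesh over the TP ranks.
device_mesh = torch.distributed.init_device_mesh("cpu", (1,))
print(device_mesh)

dist.destroy_process_group()
```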
@@ -573,21 +614,37 @@ class ModelRunner:
             get_embedding=True,
         )

+    def forward_idle(self, forward_batch: ForwardBatch):
+        if self.cuda_graph_runner and self.cuda_graph_runner.can_run(forward_batch):
+            return self.cuda_graph_runner.replay(forward_batch)
+
+        return self.model.forward(
+            forward_batch.input_ids, forward_batch.positions, forward_batch
+        )
+
     def forward(self, forward_batch: ForwardBatch) -> LogitsProcessorOutput:
         if forward_batch.forward_mode.is_decode():
             return self.forward_decode(forward_batch)
         elif forward_batch.forward_mode.is_extend():
             return self.forward_extend(forward_batch)
+        elif forward_batch.forward_mode.is_idle():
+            return self.forward_idle(forward_batch)
         else:
             raise ValueError(f"Invaid forward mode: {forward_batch.forward_mode}")

     def sample(
         self, logits_output: LogitsProcessorOutput, forward_batch: ForwardBatch
     ) -> torch.Tensor:
-        # Put CPU-heavy tasks here. They will be overlapped with the forward pass.
         sampling_info = forward_batch.sampling_info
-        sampling_info.
-
+        if sampling_info.sampling_info_done:
+            # Overlap mode: the function update_regex_vocab_mask was executed
+            # in process_batch_result of the last batch.
+            if sampling_info.grammars:
+                sampling_info.sampling_info_done.wait()
+        else:
+            # Normal mode: Put CPU-heavy tasks here. They will be overlapped with the forward pass.
+            sampling_info.update_regex_vocab_mask()
+            sampling_info.update_penalties()
         logits = self.apply_logits_bias(logits_output.next_token_logits, sampling_info)

         # Sample the next tokens.
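The two branches in `sample` distinguish overlap scheduling (the vocab mask was prepared by the previous batch's result handler, so the sampler only waits on an event) from normal mode (the mask is computed inline). A stripped-down sketch with stand-in classes, not sglang's actual `SamplingBatchInfo`:

```python
import threading


class ToySamplingInfo:
    """Stand-in keeping only the fields used by the dispatch above."""

    def __init__(self, overlap: bool, has_grammars: bool):
        self.sampling_info_done = threading.Event() if overlap else None
        self.grammars = ["<grammar>"] if has_grammars else None

    def update_regex_vocab_mask(self):
        print("computed vocab mask inline")

    def update_penalties(self):
        print("updated penalties inline")


def pre_sample(info: ToySamplingInfo):
    if info.sampling_info_done:
        # Overlap mode: another thread sets the event once the mask is ready.
        if info.grammars:
            info.sampling_info_done.wait()
    else:
        # Normal mode: do the CPU-heavy work here.
        info.update_regex_vocab_mask()
        info.update_penalties()


overlap = ToySamplingInfo(overlap=True, has_grammars=True)
overlap.sampling_info_done.set()  # pretend the previous batch finished
pre_sample(overlap)
pre_sample(ToySamplingInfo(overlap=False, has_grammars=False))
```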
@@ -613,7 +670,7 @@ class ModelRunner:

         # Apply regex vocab_mask
         if sampling_info.vocab_mask is not None:
-            logits =
+            sampling_info.apply_mask(logits=logits, vocab_mask=sampling_info.vocab_mask)

         return logits

@@ -637,7 +694,9 @@ def import_model_classes():
         try:
             module = importlib.import_module(name)
         except Exception as e:
-            logger.warning(f"Ignore import error when loading {name}.
+            logger.warning(f"Ignore import error when loading {name}. {e}")
+            if crash_on_warnings():
+                raise ValueError(f"Ignore import error when loading {name}. {e}")
             continue
         if hasattr(module, "EntryClass"):
             entry = module.EntryClass
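The warning-or-crash pattern above is a common CI guard: the condition that only logs in production is escalated to an exception when a CI flag is set. A sketch using an illustrative environment variable, not sglang's actual flag name:

```python
import logging
import os

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("import_guard_demo")


def crash_on_warnings() -> bool:
    # Illustrative flag; sglang derives this from its own CI detection helper.
    return os.environ.get("DEMO_CRASH_ON_WARNINGS", "0") == "1"


def try_import(name: str):
    try:
        raise ImportError("missing optional dependency")  # simulate a bad model module
    except Exception as e:
        logger.warning(f"Ignore import error when loading {name}. {e}")
        if crash_on_warnings():
            raise ValueError(f"Ignore import error when loading {name}. {e}")


try_import("toy_model")  # warns; raises only when DEMO_CRASH_ON_WARNINGS=1
```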
sglang/srt/model_parallel.py
ADDED
@@ -0,0 +1,98 @@
+"""
+Common utilities for torch model parallelism.
+"""
+
+from typing import Optional
+
+import torch
+from torch.distributed.device_mesh import DeviceMesh
+
+try:
+    from torch.distributed.tensor import DTensor, Shard
+except ImportError:
+    # torch 2.4 or older
+    from torch.distributed._tensor import DTensor, Shard
+
+from torch.distributed._functional_collectives import AsyncCollectiveTensor
+from torch.distributed.tensor.parallel import (
+    ColwiseParallel,
+    RowwiseParallel,
+    parallelize_module,
+)
+
+
+class ColwiseParallelSharded(ColwiseParallel):
+    """
+    A version of ColwiseParallel where the local weight has been already
+    sharded. This is used for the fused wqkv case, where during loading, we
+    already sharded wq, wk, wv before fusing them.
+    """
+
+    # Override the _partition_linear_fn in ColwiseParallel
+    def _partition_linear_fn(self, name, module, device_mesh):
+        # colwise shard weight/bias to Shard(0), weight be Shard(0)
+        # means Colwise as Linear is input * weight^T + bias, where
+        # weight would become Shard(1)
+        for name, param in module.named_parameters():
+            dtensor = DTensor.from_local(param, device_mesh, [Shard(0)])
+            dist_param = torch.nn.Parameter(dtensor, requires_grad=False)
+            module.register_parameter(name, dist_param)
+
+
+class RowwiseParallelMaybeWait(RowwiseParallel):
+    """
+    A version of RowwiseParallel that waits for the output (establish dependency
+    between comm stream and compute stream in CUDA sense) before going into the
+    next op. This is needed to workaround the current interaction between
+    AsyncCollectiveTensor and custom ops, such as `class RMSNorm(CustomOp)`.
+    """
+
+    @staticmethod
+    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
+        outputs = super(
+            RowwiseParallelMaybeWait, RowwiseParallelMaybeWait
+        )._prepare_output_fn(
+            output_layouts, use_local_output, mod, outputs, device_mesh
+        )
+        # wait for the output to be ready
+        if isinstance(outputs, AsyncCollectiveTensor):
+            return outputs.wait()
+        else:
+            return outputs
+
+
+def tensor_parallel(
+    module: torch.nn.Module,
+    device_mesh: Optional[DeviceMesh] = None,
+):
+    """
+    Tensor parallelize the model across the given device mesh.
+    Args:
+        module (`torch.nn.Module`):
+            The module to tensor parallelize.
+        device_mesh (`torch.distributed.DeviceMesh`):
+            The device mesh to use for tensor parallelism.
+    """
+
+    # Tensor parallelize a nn.Module based on the `_tp_plan` attribute of the module.
+    # No op if `_tp_plan` attribute does not exist under the module.
+    # This is a helper function to be used with `model.apply` to recursively
+    # parallelize a model.
+    def tplize(mod: torch.nn.Module) -> None:
+        tp_plan = getattr(mod, "_tp_plan", None)
+        if tp_plan is None:
+            return
+        for child_name, tp_style in tp_plan.items():
+            submod = mod.get_submodule(child_name)
+            if tp_style == "Colwise":
+                parallelize_module(submod, device_mesh, ColwiseParallel())
+            elif tp_style == "Rowwise":
+                parallelize_module(submod, device_mesh, RowwiseParallelMaybeWait())
+            elif tp_style == "Colwise_Sharded":
+                parallelize_module(submod, device_mesh, ColwiseParallelSharded())
+            else:
+                raise ValueError(f"Unknown TP style {tp_style}")
+
+    # `apply` is a native method of `nn.Module` that recursively applies a
+    # function to every submodule.
+    module.apply(tplize)
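`tensor_parallel` walks the module tree and looks for a `_tp_plan` dict on each submodule; anything without the attribute is left untouched. A toy sketch of the convention (module and child names below are made up, not sglang's):

```python
import torch
from torch import nn


class ToyAttention(nn.Module):
    # Maps child-module names to a TP style string understood by tplize().
    _tp_plan = {
        "qkv_proj": "Colwise_Sharded",
        "o_proj": "Rowwise",
    }

    def __init__(self, hidden: int = 64):
        super().__init__()
        self.qkv_proj = nn.Linear(hidden, 3 * hidden, bias=False)
        self.o_proj = nn.Linear(hidden, hidden, bias=False)


# tensor_parallel(model, mesh) applies tplize() to every submodule; this loop
# only shows which plans it would find and act on.
for name, mod in ToyAttention().named_modules():
    plan = getattr(mod, "_tp_plan", None)
    if plan is not None:
        print(name or "<root>", plan)
```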
sglang/srt/models/deepseek_v2.py
CHANGED
@@ -22,7 +22,9 @@ import torch
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.distributed import (
+    get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
+    get_tp_group,
     tensor_model_parallel_all_reduce,
 )
 from vllm.model_executor.layers.fused_moe import FusedMoE
@@ -338,6 +340,7 @@ class DeepseekV2AttentionMLA(nn.Module):
         cache_config=None,
         quant_config: Optional[QuantizationConfig] = None,
         layer_id=None,
+        use_dp=False,
     ) -> None:
         super().__init__()
         self.layer_id = layer_id
@@ -351,29 +354,80 @@ class DeepseekV2AttentionMLA(nn.Module):
         self.num_heads = num_heads
         tp_size = get_tensor_model_parallel_world_size()
         assert num_heads % tp_size == 0
-        self.num_local_heads = num_heads // tp_size
+        self.num_local_heads = num_heads if use_dp else num_heads // tp_size
         self.scaling = self.qk_head_dim**-0.5
         self.rope_theta = rope_theta
         self.max_position_embeddings = max_position_embeddings

-        if
-
-
-            self.
+        if use_dp:
+            # For data parallel attention
+            if self.q_lora_rank is not None:
+                self.q_a_proj = ReplicatedLinear(
+                    self.hidden_size,
+                    self.q_lora_rank,
+                    bias=False,
+                    quant_config=quant_config,
+                )
+                self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
+                self.q_b_proj = ReplicatedLinear(
+                    q_lora_rank,
+                    self.num_heads * self.qk_head_dim,
+                    bias=False,
+                    quant_config=quant_config,
+                )
+            else:
+                self.q_proj = ReplicatedLinear(
+                    self.hidden_size,
+                    self.num_heads * self.qk_head_dim,
+                    bias=False,
+                    quant_config=quant_config,
+                )
+            self.kv_b_proj = ReplicatedLinear(
+                self.kv_lora_rank,
+                self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
                 bias=False,
                 quant_config=quant_config,
             )
-
-            self.
-
-            self.
+            # O projection.
+            self.o_proj = ReplicatedLinear(
+                self.num_heads * self.v_head_dim,
+                self.hidden_size,
                 bias=False,
                 quant_config=quant_config,
             )
         else:
-
+            # For tensor parallel attention
+            if self.q_lora_rank is not None:
+                self.q_a_proj = ReplicatedLinear(
+                    self.hidden_size,
+                    self.q_lora_rank,
+                    bias=False,
+                    quant_config=quant_config,
+                )
+                self.q_a_layernorm = RMSNorm(self.q_lora_rank, eps=config.rms_norm_eps)
+                self.q_b_proj = ColumnParallelLinear(
+                    q_lora_rank,
+                    self.num_heads * self.qk_head_dim,
+                    bias=False,
+                    quant_config=quant_config,
+                )
+            else:
+                self.q_proj = ColumnParallelLinear(
+                    self.hidden_size,
+                    self.num_heads * self.qk_head_dim,
+                    bias=False,
+                    quant_config=quant_config,
+                )
+            self.kv_b_proj = ColumnParallelLinear(
+                self.kv_lora_rank,
+                self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
+                bias=False,
+                quant_config=quant_config,
+            )
+            # O projection.
+            self.o_proj = RowParallelLinear(
+                self.num_heads * self.v_head_dim,
                 self.hidden_size,
-                self.num_heads * self.qk_head_dim,
                 bias=False,
                 quant_config=quant_config,
             )
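The `num_local_heads` expression captures the key difference between the two modes: with DP attention every rank keeps all heads and works on its own slice of the batch, while with TP attention the heads themselves are split across ranks. In numbers:

```python
def num_local_heads(num_heads: int, tp_size: int, use_dp: bool) -> int:
    # Mirrors the expression in the diff; the assert matches the existing check.
    assert num_heads % tp_size == 0
    return num_heads if use_dp else num_heads // tp_size


print(num_local_heads(128, 8, use_dp=True))   # 128: DP attention keeps all heads per rank
print(num_local_heads(128, 8, use_dp=False))  # 16: TP attention shards heads across ranks
```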
@@ -385,19 +439,6 @@ class DeepseekV2AttentionMLA(nn.Module):
             quant_config=quant_config,
         )
         self.kv_a_layernorm = RMSNorm(self.kv_lora_rank, eps=config.rms_norm_eps)
-        self.kv_b_proj = ColumnParallelLinear(
-            self.kv_lora_rank,
-            self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
-            bias=False,
-            quant_config=quant_config,
-        )
-        # O projection.
-        self.o_proj = RowParallelLinear(
-            self.num_heads * self.v_head_dim,
-            self.hidden_size,
-            bias=False,
-            quant_config=quant_config,
-        )
         rope_scaling["rope_type"] = "deepseek_yarn"
         self.rotary_emb = get_rope(
             qk_rope_head_dim,
@@ -491,6 +532,36 @@ class DeepseekV2AttentionMLA(nn.Module):
         return output


+def all_gather(
+    input_tensor: torch.Tensor, forward_batch: ForwardBatch, rank, world_size, group
+):
+    if world_size == 1:
+        return input_tensor
+
+    all_lens = forward_batch.global_num_tokens
+    max_len = max(forward_batch.global_num_tokens)
+
+    padded_tensor = torch.nn.functional.pad(
+        input_tensor, (0, 0, 0, max_len - input_tensor.shape[0])
+    )
+
+    torch.distributed.all_gather_into_tensor(
+        forward_batch.gathered_buffer, padded_tensor, group=group
+    )
+
+    gathered_tensors = torch.concat(
+        [
+            forward_batch.gathered_buffer[i * max_len : i * max_len + all_lens[i]]
+            for i in range(world_size)
+        ]
+    )
+
+    start_index = 0 if rank == 0 else sum(all_lens[:rank])
+    end_index = start_index + all_lens[rank]
+
+    return gathered_tensors, start_index, end_index
+
+
 class DeepseekV2DecoderLayer(nn.Module):

     def __init__(
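Because each DP rank may hold a different number of tokens, `all_gather` pads every local slice to the global maximum before the collective and strips the padding back out afterwards. A local, single-process sketch of that index arithmetic (no process group involved; the lengths are made up):

```python
import torch

all_lens = [3, 5, 2]  # tokens per DP rank, i.e. forward_batch.global_num_tokens
hidden = 4
max_len = max(all_lens)
world_size = len(all_lens)

local_chunks = [torch.randn(n, hidden) for n in all_lens]
padded = [
    torch.nn.functional.pad(t, (0, 0, 0, max_len - t.shape[0])) for t in local_chunks
]
# What all_gather_into_tensor would leave in forward_batch.gathered_buffer.
gathered_buffer = torch.cat(padded)

# Strip the per-rank padding, exactly like the list comprehension in the diff.
gathered = torch.cat(
    [gathered_buffer[i * max_len : i * max_len + all_lens[i]] for i in range(world_size)]
)

rank = 1
start_index = sum(all_lens[:rank])
end_index = start_index + all_lens[rank]
assert torch.equal(gathered[start_index:end_index], local_chunks[rank])
```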
@@ -505,6 +576,14 @@ class DeepseekV2DecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
+        self.enable_dp_attention = (
+            not global_server_args_dict["disable_mla"]
+            and global_server_args_dict["enable_dp_attention"]
+        )
+        if self.enable_dp_attention:
+            self.tp_rank = get_tensor_model_parallel_rank()
+            self.tp_size = get_tensor_model_parallel_world_size()
+            self.tp_group = get_tp_group().device_group
         if not global_server_args_dict["disable_mla"]:
             self.self_attn = DeepseekV2AttentionMLA(
                 config=config,
@@ -523,6 +602,7 @@ class DeepseekV2DecoderLayer(nn.Module):
                 cache_config=cache_config,
                 quant_config=quant_config,
                 layer_id=layer_id,
+                use_dp=self.enable_dp_attention,
             )
         else:
             self.self_attn = DeepseekV2Attention(
@@ -569,20 +649,32 @@ class DeepseekV2DecoderLayer(nn.Module):
         residual: Optional[torch.Tensor],
     ) -> torch.Tensor:
         # Self Attention
-        if
-            residual
-
-
-
-
-
-            hidden_states=
-
-
+        if not forward_batch.forward_mode.is_idle():
+            if residual is None:
+                residual = hidden_states
+                hidden_states = self.input_layernorm(hidden_states)
+            else:
+                hidden_states, residual = self.input_layernorm(hidden_states, residual)
+
+            hidden_states = self.self_attn(
+                positions=positions,
+                hidden_states=hidden_states,
+                forward_batch=forward_batch,
+            )
+            hidden_states, residual = self.post_attention_layernorm(
+                hidden_states, residual
+            )

         # Fully Connected
-
-
+        if self.enable_dp_attention:
+            hidden_states, start_idx, end_idx = all_gather(
+                hidden_states, forward_batch, self.tp_rank, self.tp_size, self.tp_group
+            )
+            hidden_states = self.mlp(hidden_states)
+            hidden_states = hidden_states[start_idx:end_idx]
+        else:
+            hidden_states = self.mlp(hidden_states)
+
         return hidden_states, residual


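The `is_idle()` checks exist because, with DP attention, a rank that currently has no requests still has to run the layer so it can join the all-gather issued by the busy ranks; only the per-token attention work is skipped. A toy dispatch sketch (the enum below is a stand-in for sglang's ForwardMode, not the real class):

```python
from enum import Enum, auto


class ToyForwardMode(Enum):
    EXTEND = auto()
    DECODE = auto()
    IDLE = auto()  # DP rank with no requests: participate in collectives only

    def is_idle(self) -> bool:
        return self is ToyForwardMode.IDLE


def decoder_layer(mode: ToyForwardMode) -> list:
    steps = []
    if not mode.is_idle():
        steps += ["layernorm", "self_attn", "post_attention_layernorm"]
    steps += ["all_gather", "mlp", "slice_local_tokens"]  # always join the collective
    return steps


print(decoder_layer(ToyForwardMode.DECODE))
print(decoder_layer(ToyForwardMode.IDLE))
```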
@@ -603,6 +695,7 @@ class DeepseekV2Model(nn.Module):
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
+            enable_tp=not global_server_args_dict["enable_dp_attention"],
         )
         self.layers = nn.ModuleList(
             [
@@ -630,7 +723,8 @@ class DeepseekV2Model(nn.Module):
             hidden_states, residual = layer(
                 positions, hidden_states, forward_batch, residual
             )
-
+        if not forward_batch.forward_mode.is_idle():
+            hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states


@@ -646,10 +740,18 @@ class DeepseekV2ForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.model = DeepseekV2Model(config, cache_config, quant_config)
-
-
-
-
+        if global_server_args_dict["enable_dp_attention"]:
+            self.lm_head = ReplicatedLinear(
+                config.hidden_size,
+                config.vocab_size,
+                bias=False,
+            )
+            self.logits_processor = LogitsProcessor(config, skip_all_gather=True)
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size, config.hidden_size, quant_config=quant_config
+            )
+            self.logits_processor = LogitsProcessor(config)

     @torch.no_grad()
     def forward(
@@ -659,9 +761,10 @@ class DeepseekV2ForCausalLM(nn.Module):
         forward_batch: ForwardBatch,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, forward_batch)
-
-
-
+        if not forward_batch.forward_mode.is_idle():
+            return self.logits_processor(
+                input_ids, hidden_states, self.lm_head.weight, forward_batch
+            )

     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         stacked_params_mapping = [