sglang 0.4.1.post3__py3-none-any.whl → 0.4.1.post5__py3-none-any.whl

This diff shows the changes between two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (86)
  1. sglang/bench_one_batch.py +2 -0
  2. sglang/bench_serving.py +18 -1
  3. sglang/lang/interpreter.py +71 -1
  4. sglang/lang/ir.py +2 -0
  5. sglang/srt/configs/__init__.py +4 -0
  6. sglang/srt/configs/chatglm.py +78 -0
  7. sglang/srt/configs/dbrx.py +279 -0
  8. sglang/srt/configs/model_config.py +1 -1
  9. sglang/srt/hf_transformers_utils.py +9 -14
  10. sglang/srt/layers/attention/__init__.py +22 -6
  11. sglang/srt/layers/attention/double_sparsity_backend.py +0 -52
  12. sglang/srt/layers/attention/flashinfer_backend.py +215 -83
  13. sglang/srt/layers/attention/torch_native_backend.py +1 -38
  14. sglang/srt/layers/attention/triton_backend.py +20 -11
  15. sglang/srt/layers/attention/triton_ops/decode_attention.py +4 -0
  16. sglang/srt/layers/linear.py +159 -55
  17. sglang/srt/layers/logits_processor.py +170 -215
  18. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  19. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H200.json +146 -0
  20. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  21. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H200.json +146 -0
  22. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  23. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H200.json +146 -0
  24. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  25. sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H200.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H200.json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H200.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H200.json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H200.json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H200.json +146 -0
  36. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  37. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H200.json +146 -0
  38. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  39. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +198 -29
  40. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -7
  41. sglang/srt/layers/parameter.py +431 -0
  42. sglang/srt/layers/quantization/__init__.py +3 -2
  43. sglang/srt/layers/quantization/fp8.py +3 -3
  44. sglang/srt/layers/quantization/modelopt_quant.py +174 -0
  45. sglang/srt/layers/sampler.py +57 -21
  46. sglang/srt/layers/torchao_utils.py +17 -3
  47. sglang/srt/layers/vocab_parallel_embedding.py +1 -1
  48. sglang/srt/managers/cache_controller.py +307 -0
  49. sglang/srt/managers/data_parallel_controller.py +2 -0
  50. sglang/srt/managers/io_struct.py +1 -2
  51. sglang/srt/managers/schedule_batch.py +33 -3
  52. sglang/srt/managers/schedule_policy.py +159 -90
  53. sglang/srt/managers/scheduler.py +68 -28
  54. sglang/srt/managers/session_controller.py +1 -1
  55. sglang/srt/managers/tokenizer_manager.py +27 -21
  56. sglang/srt/managers/tp_worker.py +16 -4
  57. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  58. sglang/srt/mem_cache/memory_pool.py +206 -1
  59. sglang/srt/metrics/collector.py +22 -30
  60. sglang/srt/model_executor/cuda_graph_runner.py +129 -77
  61. sglang/srt/model_executor/forward_batch_info.py +51 -21
  62. sglang/srt/model_executor/model_runner.py +72 -64
  63. sglang/srt/models/chatglm.py +1 -1
  64. sglang/srt/models/dbrx.py +1 -1
  65. sglang/srt/models/deepseek_v2.py +34 -7
  66. sglang/srt/models/grok.py +109 -29
  67. sglang/srt/models/llama.py +9 -2
  68. sglang/srt/openai_api/adapter.py +0 -17
  69. sglang/srt/openai_api/protocol.py +3 -3
  70. sglang/srt/sampling/sampling_batch_info.py +22 -0
  71. sglang/srt/sampling/sampling_params.py +9 -1
  72. sglang/srt/server.py +20 -13
  73. sglang/srt/server_args.py +120 -58
  74. sglang/srt/speculative/build_eagle_tree.py +347 -0
  75. sglang/srt/speculative/eagle_utils.py +626 -0
  76. sglang/srt/speculative/eagle_worker.py +184 -0
  77. sglang/srt/speculative/spec_info.py +5 -0
  78. sglang/srt/utils.py +47 -7
  79. sglang/test/test_programs.py +23 -1
  80. sglang/test/test_utils.py +36 -7
  81. sglang/version.py +1 -1
  82. {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/METADATA +12 -12
  83. {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/RECORD +86 -57
  84. {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/WHEEL +1 -1
  85. {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/LICENSE +0 -0
  86. {sglang-0.4.1.post3.dist-info → sglang-0.4.1.post5.dist-info}/top_level.txt +0 -0

sglang/srt/layers/attention/triton_backend.py
@@ -1,15 +1,16 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Optional
 
 import torch
 
 from sglang.srt.layers.attention import AttentionBackend
-from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 
 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
     from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.speculative.spec_info import SpecInfo
 
 
 class TritonAttnBackend(AttentionBackend):
@@ -80,11 +81,17 @@ class TritonAttnBackend(AttentionBackend):
     def init_forward_metadata_capture_cuda_graph(
         self,
         bs: int,
+        num_tokens: int,
         req_pool_indices: torch.Tensor,
         seq_lens: torch.Tensor,
-        encoder_lens=None,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInfo],
     ):
-        # NOTE: encoder_lens expected to be zeros or None
+        assert encoder_lens is None, "Not supported"
+        assert forward_mode.is_decode(), "Not supported"
+        assert spec_info is None, "Not supported"
+
         self.forward_metadata = (
             self.cuda_graph_attn_logits,
             None,
@@ -96,7 +103,9 @@ class TritonAttnBackend(AttentionBackend):
         req_pool_indices: torch.Tensor,
         seq_lens: torch.Tensor,
         seq_lens_sum: int,
-        encoder_lens=None,
+        encoder_lens: Optional[torch.Tensor],
+        forward_mode: ForwardMode,
+        spec_info: Optional[SpecInfo],
     ):
         # NOTE: encoder_lens expected to be zeros or None
         self.cuda_graph_start_loc.zero_()
@@ -107,9 +116,9 @@ class TritonAttnBackend(AttentionBackend):
 
     def forward_extend(
         self,
-        q,
-        k,
-        v,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
         layer: RadixAttention,
         forward_batch: ForwardBatch,
         save_kv_cache=True,
@@ -146,9 +155,9 @@ class TritonAttnBackend(AttentionBackend):
 
     def forward_decode(
         self,
-        q,
-        k,
-        v,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
         layer: RadixAttention,
         forward_batch: ForwardBatch,
         save_kv_cache=True,
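
For reference, the CUDA-graph metadata hooks above now take num_tokens, forward_mode, and spec_info in addition to the batch shape. A minimal caller sketch under assumptions (attn_backend, req_pool_indices, and seq_lens are hypothetical objects prepared elsewhere, e.g. by a ModelRunner; they are not defined in this diff):

    # Hypothetical sketch, not part of the wheel: how a CUDA-graph capture path
    # could call the widened hook after this change.
    from sglang.srt.model_executor.forward_batch_info import ForwardMode

    bs = 8  # illustrative capture batch size
    attn_backend.init_forward_metadata_capture_cuda_graph(
        bs=bs,
        num_tokens=bs,                    # one token per request in plain decode
        req_pool_indices=req_pool_indices[:bs],
        seq_lens=seq_lens[:bs],
        encoder_lens=None,                # the Triton backend asserts this is None
        forward_mode=ForwardMode.DECODE,  # must satisfy forward_mode.is_decode()
        spec_info=None,                   # no speculative decoding in this sketch
    )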

sglang/srt/layers/attention/triton_ops/decode_attention.py
@@ -406,6 +406,10 @@ def _decode_grouped_att_m_fwd(
     Lk = k_buffer.shape[-1]
     Lv = v_buffer.shape[-1]
 
+    # [TODO] work around shmem limit on MI3xx
+    if is_hip_ and Lk >= 576:
+        BLOCK = 16
+
     if Lk == 576:
         BLOCK_DMODEL = 512
         BLOCK_DPE = 64
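
The new guard above is gated on a module-level is_hip_ flag. A hedged orientation sketch of how such a ROCm check is typically derived (sglang ships its own is_hip() helper in sglang.srt.utils; this standalone version is only for illustration):

    # Illustration only: detect a ROCm/HIP build of PyTorch.
    import torch

    def is_hip() -> bool:
        # torch.version.hip is a version string on ROCm builds and None on CUDA builds.
        return torch.version.hip is not None

    is_hip_ = is_hip()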

sglang/srt/layers/linear.py
@@ -18,14 +18,15 @@ from vllm.distributed import (
 
 # workaround
 from vllm.model_executor.layers.linear import LinearBase
-from vllm.model_executor.parameter import (
+
+from sglang.srt.layers.parameter import (
     BasevLLMParameter,
     PackedColumnParameter,
     PackedvLLMParameter,
     PerTensorScaleParameter,
     RowvLLMParameter,
+    _ColumnvLLMParameter,
 )
-
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
@@ -44,6 +45,7 @@ WEIGHT_LOADER_V2_SUPPORTED = [
     "MarlinLinearMethod",
     "GPTQLinearMethod",
     "QQQLinearMethod",
+    "ModelOptFp8LinearMethod",
 ]
 
 
@@ -93,6 +95,62 @@ def adjust_scalar_to_fused_array(param, loaded_weight, shard_id):
     return param[shard_id], loaded_weight
 
 
+def load_column_qkv_weight(
+    self, loaded_weight, num_heads, shard_id, shard_offset, shard_size, tp_rank
+):
+    if (
+        isinstance(self, (PackedColumnParameter, PackedvLLMParameter))
+        and self.output_dim == self.packed_dim
+    ):
+        shard_size, shard_offset = self.adjust_shard_indexes_for_packing(
+            shard_offset=shard_offset, shard_size=shard_size
+        )
+
+    param_data = self.data
+    shard_id = tp_rank if shard_id == "q" else tp_rank // num_heads
+    param_data = param_data.narrow(self.output_dim, shard_offset, shard_size)
+    loaded_weight = loaded_weight.narrow(
+        self.output_dim, shard_id * shard_size, shard_size
+    )
+
+    assert param_data.shape == loaded_weight.shape
+    param_data.copy_(loaded_weight)
+
+
+def load_column_parallel_weight(
+    self, loaded_weight: torch.Tensor, tp_rank, use_presharded_weights: bool = False
+):
+    if isinstance(self, _ColumnvLLMParameter):
+        if not use_presharded_weights:
+            shard_size = self.data.shape[self.output_dim]
+            loaded_weight = loaded_weight.narrow(
+                self.output_dim, tp_rank * shard_size, shard_size
+            )
+        assert self.data.shape == loaded_weight.shape
+        self.data.copy_(loaded_weight)
+    else:
+        self.data.copy_(loaded_weight)
+
+
+def load_row_parallel_weight(
+    self, loaded_weight: torch.Tensor, tp_rank, use_presharded_weights: bool = False
+):
+    if isinstance(self, RowvLLMParameter):
+        if not use_presharded_weights:
+            shard_size = self.data.shape[self.input_dim]
+            loaded_weight = loaded_weight.narrow(
+                self.input_dim, tp_rank * shard_size, shard_size
+            )
+
+        if len(loaded_weight.shape) == 0:
+            loaded_weight = loaded_weight.reshape(1)
+
+        assert self.data.shape == loaded_weight.shape
+        self.data.copy_(loaded_weight)
+    else:
+        self.data.copy_(loaded_weight)
+
+
 class LinearMethodBase(QuantizeMethodBase):
     """Base class for different (maybe quantized) linear methods."""
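
These loader helpers take an explicit tp_rank and a use_presharded_weights flag instead of reading the global tensor-parallel group. A small illustrative sketch (shapes and values made up, not part of the package) of what the non-presharded row-parallel path boils down to:

    # Illustration only: the narrow-and-copy that load_row_parallel_weight
    # performs when the checkpoint holds the full (unsharded) tensor.
    import torch

    tp_rank, tp_size = 1, 4
    full_weight = torch.randn(1024, 4096)             # checkpoint tensor, input dim = 1
    local_param = torch.empty(1024, 4096 // tp_size)  # this rank's shard

    shard_size = local_param.shape[1]
    shard = full_weight.narrow(1, tp_rank * shard_size, shard_size)
    assert local_param.shape == shard.shape
    local_param.copy_(shard)

    # With use_presharded_weights=True the checkpoint already stores only this
    # rank's slice, so the narrow() step is skipped and the tensor is copied as is.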
 
@@ -286,6 +344,8 @@ class ColumnParallelLinear(LinearBase):
         quant_config: Optional[QuantizationConfig] = None,
         output_sizes: Optional[List[int]] = None,
         prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
     ):
         super().__init__(
             input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix
@@ -294,7 +354,11 @@ class ColumnParallelLinear(LinearBase):
         self.gather_output = gather_output
 
         # Divide the weight matrix along the last dimension.
-        tp_size = get_tensor_model_parallel_world_size()
+        if tp_rank is None:
+            tp_rank = get_tensor_model_parallel_rank()
+        if tp_size is None:
+            tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank, self.tp_size = tp_rank, tp_size
         assert self.quant_method is not None
         self.output_size_per_partition = divide(self.output_size, tp_size)
         self.output_partition_sizes = [self.output_size_per_partition]
@@ -335,7 +399,6 @@ class ColumnParallelLinear(LinearBase):
             self.register_parameter("bias", None)
 
     def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
-        tp_rank = get_tensor_model_parallel_rank()
         output_dim = getattr(param, "output_dim", None)
 
         # Special case for GGUF
@@ -355,7 +418,7 @@ class ColumnParallelLinear(LinearBase):
         # no need to narrow here
         if output_dim is not None and not use_bitsandbytes_4bit:
             shard_size = param_data.shape[output_dim]
-            start_idx = tp_rank * shard_size
+            start_idx = self.tp_rank * shard_size
             loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
 
         # Special case for loading scales off disk, which often do not
@@ -363,7 +426,9 @@ class ColumnParallelLinear(LinearBase):
         if len(loaded_weight.shape) == 0:
             loaded_weight = loaded_weight.reshape(1)
 
-        assert param_data.shape == loaded_weight.shape
+        assert (
+            param_data.shape == loaded_weight.shape
+        ), f"{param_data.shape=}, {loaded_weight.shape=}"
         param_data.copy_(loaded_weight)
 
     def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor):
@@ -392,7 +457,7 @@ class ColumnParallelLinear(LinearBase):
         s = f"in_features={self.input_size}"
         s += f", output_features={self.output_size_per_partition}"
         s += f", bias={self.bias is not None}"
-        s += f", tp_size={get_tensor_model_parallel_world_size()}"
+        s += f", tp_size={self.tp_size}"
         s += f", gather_output={self.gather_output}"
         return s
 
@@ -430,10 +495,18 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         params_dtype: Optional[torch.dtype] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+        use_presharded_weights: bool = False,
     ):
         self.output_sizes = output_sizes
-        tp_size = get_tensor_model_parallel_world_size()
+        if tp_rank is None:
+            tp_rank = get_tensor_model_parallel_rank()
+        if tp_size is None:
+            tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank, self.tp_size = tp_rank, tp_size
         assert all(output_size % tp_size == 0 for output_size in output_sizes)
+        self.use_presharded_weights = use_presharded_weights
         super().__init__(
             input_size=input_size,
             output_size=sum(output_sizes),
@@ -443,6 +516,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             params_dtype=params_dtype,
             quant_config=quant_config,
             prefix=prefix,
+            tp_rank=tp_rank,
+            tp_size=tp_size,
         )
 
     def weight_loader(
@@ -462,12 +537,9 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             return
 
         if is_gguf_weight:
-            tp_size = get_tensor_model_parallel_world_size()
-            tp_rank = get_tensor_model_parallel_rank()
-
             output_dim = getattr(param, "output_dim", None)
-            shard_size = loaded_weight.size(output_dim) // tp_size
-            start_idx = tp_rank * shard_size
+            shard_size = loaded_weight.size(output_dim) // self.tp_size
+            start_idx = self.tp_rank * shard_size
 
             loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
 
@@ -493,7 +565,9 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                         param_data, loaded_weight, 0
                     )
 
-                assert param_data.shape == loaded_weight.shape
+                assert (
+                    param_data.shape == loaded_weight.shape
+                ), f"{param_data.shape=}, {loaded_weight.shape=}"
                 param_data.copy_(loaded_weight)
                 return
             current_shard_offset = 0
@@ -521,11 +595,9 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             return
 
         assert loaded_shard_id < len(self.output_sizes)
-        tp_rank = get_tensor_model_parallel_rank()
-        tp_size = get_tensor_model_parallel_world_size()
         if output_dim is not None:
-            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
-            shard_size = self.output_sizes[loaded_shard_id] // tp_size
+            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size
+            shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
             # Special case for quantization.
             # If quantized, we need to adjust the offset and size to account
             # for the packing.
@@ -544,10 +616,10 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                 shard_offset = loaded_weight.shape[output_dim] * loaded_shard_id
 
             param_data = param_data.narrow(output_dim, shard_offset, shard_size)
-            start_idx = tp_rank * shard_size
+            start_idx = self.tp_rank * shard_size
             # bitsandbytes loads the weights of the specific portion
             # no need to narrow here
-            if not use_bitsandbytes_4bit:
+            if not use_bitsandbytes_4bit and not self.use_presharded_weights:
                 loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
         # Special case for AQLM codebooks.
         elif is_metadata:
@@ -571,7 +643,9 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
                     "the same for all partitions."
                 )
 
-        assert param_data.shape == loaded_weight.shape
+        assert (
+            param_data.shape == loaded_weight.shape
+        ), f"{param_data.shape=}, {loaded_weight.shape=}"
         param_data.copy_(loaded_weight)
 
     def _load_fused_module_from_checkpoint(
@@ -628,26 +702,27 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
 
         assert loaded_shard_id < len(self.output_sizes)
 
-        tp_size = get_tensor_model_parallel_world_size()
-
         if isinstance(param, BlockQuantScaleParameter):
             weight_block_size = self.quant_method.quant_config.weight_block_size
             block_n, _ = weight_block_size[0], weight_block_size[1]
             shard_offset = (
                 (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // block_n
-            ) // tp_size
+            ) // self.tp_size
             shard_size = (
-                (self.output_sizes[loaded_shard_id] + block_n - 1) // block_n // tp_size
+                (self.output_sizes[loaded_shard_id] + block_n - 1)
+                // block_n
+                // self.tp_size
             )
         else:
-            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // tp_size
-            shard_size = self.output_sizes[loaded_shard_id] // tp_size
+            shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size
+            shard_size = self.output_sizes[loaded_shard_id] // self.tp_size
 
         param.load_merged_column_weight(
             loaded_weight=loaded_weight,
             shard_id=loaded_shard_id,
             shard_offset=shard_offset,
            shard_size=shard_size,
+            use_presharded_weights=self.use_presharded_weights,
         )
 
 
@@ -688,6 +763,8 @@ class QKVParallelLinear(ColumnParallelLinear):
         params_dtype: Optional[torch.dtype] = None,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
     ):
         self.hidden_size = hidden_size
         self.head_size = head_size
@@ -696,7 +773,11 @@ class QKVParallelLinear(ColumnParallelLinear):
             total_num_kv_heads = total_num_heads
         self.total_num_kv_heads = total_num_kv_heads
         # Divide the weight matrix along the last dimension.
-        tp_size = get_tensor_model_parallel_world_size()
+        if tp_rank is None:
+            tp_rank = get_tensor_model_parallel_rank()
+        if tp_size is None:
+            tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank, self.tp_size = tp_rank, tp_size
         self.num_heads = divide(self.total_num_heads, tp_size)
         if tp_size >= self.total_num_kv_heads:
             self.num_kv_heads = 1
@@ -723,6 +804,8 @@ class QKVParallelLinear(ColumnParallelLinear):
             params_dtype=params_dtype,
             quant_config=quant_config,
             prefix=prefix,
+            tp_rank=tp_rank,
+            tp_size=tp_size,
         )
 
     def _get_shard_offset_mapping(self, loaded_shard_id: str):
@@ -813,13 +896,24 @@ class QKVParallelLinear(ColumnParallelLinear):
             shard_offset = (shard_offset + block_n - 1) // block_n
             shard_size = (shard_size + block_n - 1) // block_n
 
-        param.load_qkv_weight(
-            loaded_weight=loaded_weight,
-            num_heads=self.num_kv_head_replicas,
-            shard_id=loaded_shard_id,
-            shard_offset=shard_offset,
-            shard_size=shard_size,
-        )
+        if isinstance(param, _ColumnvLLMParameter):
+            load_column_qkv_weight(
+                param,
+                loaded_weight,
+                num_heads=self.num_kv_head_replicas,
+                shard_id=loaded_shard_id,
+                shard_offset=shard_offset,
+                shard_size=shard_size,
+                tp_rank=self.tp_rank,
+            )
+        else:
+            param.load_qkv_weight(
+                loaded_weight=loaded_weight,
+                num_heads=self.num_kv_head_replicas,
+                shard_id=loaded_shard_id,
+                shard_offset=shard_offset,
+                shard_size=shard_size,
+            )
 
     def weight_loader(
         self,
@@ -839,12 +933,9 @@ class QKVParallelLinear(ColumnParallelLinear):
             return
 
         if is_gguf_weight:
-            tp_size = get_tensor_model_parallel_world_size()
-            tp_rank = get_tensor_model_parallel_rank()
-
             output_dim = getattr(param, "output_dim", None)
-            shard_size = loaded_weight.size(output_dim) // tp_size
-            start_idx = tp_rank * shard_size
+            shard_size = loaded_weight.size(output_dim) // self.tp_size
+            start_idx = self.tp_rank * shard_size
 
             loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
 
@@ -871,7 +962,9 @@ class QKVParallelLinear(ColumnParallelLinear):
                         param_data, loaded_weight, 0
                    )
 
-                assert param_data.shape == loaded_weight.shape
+                assert (
+                    param_data.shape == loaded_weight.shape
+                ), f"{param_data.shape=}, {loaded_weight.shape=}"
                 param_data.copy_(loaded_weight)
                 return
             shard_offsets = [
@@ -933,7 +1026,6 @@ class QKVParallelLinear(ColumnParallelLinear):
                 self.weight_loader(param, loaded_weight_shard, shard_id)
             return
 
-        tp_rank = get_tensor_model_parallel_rank()
         assert loaded_shard_id in ["q", "k", "v"]
 
         # If output dim is defined, use the default loading process.
@@ -983,9 +1075,9 @@
 
             param_data = param_data.narrow(output_dim, shard_offset, shard_size)
             if loaded_shard_id == "q":
-                shard_id = tp_rank
+                shard_id = self.tp_rank
             else:
-                shard_id = tp_rank // self.num_kv_head_replicas
+                shard_id = self.tp_rank // self.num_kv_head_replicas
             start_idx = shard_id * shard_size
 
             # bitsandbytes loads the weights of the specific portion
@@ -1013,7 +1105,9 @@ class QKVParallelLinear(ColumnParallelLinear):
                     "for all partitions."
                 )
 
-        assert param_data.shape == loaded_weight.shape
+        assert (
+            param_data.shape == loaded_weight.shape
+        ), f"{param_data.shape=}, {loaded_weight.shape=}"
         param_data.copy_(loaded_weight)
 
 
@@ -1054,6 +1148,9 @@ class RowParallelLinear(LinearBase):
         reduce_results: bool = True,
         quant_config: Optional[QuantizationConfig] = None,
         prefix: str = "",
+        tp_rank: Optional[int] = None,
+        tp_size: Optional[int] = None,
+        use_presharded_weights: bool = False,
     ):
         super().__init__(
             input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix
@@ -1063,10 +1160,14 @@ class RowParallelLinear(LinearBase):
         self.reduce_results = reduce_results
 
         # Divide the weight matrix along the last dimension.
-        self.tp_rank = get_tensor_model_parallel_rank()
-        self.tp_size = get_tensor_model_parallel_world_size()
+        if tp_rank is None:
+            tp_rank = get_tensor_model_parallel_rank()
+        if tp_size is None:
+            tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank, self.tp_size = tp_rank, tp_size
         self.input_size_per_partition = divide(input_size, self.tp_size)
         assert self.quant_method is not None
+        self.use_presharded_weights = use_presharded_weights
 
         self.quant_method.create_weights(
             layer=self,
@@ -1100,8 +1201,6 @@ class RowParallelLinear(LinearBase):
             self.register_parameter("bias", None)
 
     def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
-        tp_rank = get_tensor_model_parallel_rank()
-        tp_size = get_tensor_model_parallel_world_size()
         input_dim = getattr(param, "input_dim", None)
         use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
 
@@ -1115,15 +1214,19 @@ class RowParallelLinear(LinearBase):
         if is_gguf_weight and isinstance(param, UninitializedParameter):
             weight_shape = list(loaded_weight.shape)
             if input_dim:
-                weight_shape[input_dim] = weight_shape[input_dim] // tp_size
+                weight_shape[input_dim] = weight_shape[input_dim] // self.tp_size
             param.materialize(tuple(weight_shape), dtype=loaded_weight.dtype)
 
         param_data = param.data
         # bitsandbytes loads the weights of the specific portion
         # no need to narrow here
-        if input_dim is not None and not use_bitsandbytes_4bit:
+        if (
+            input_dim is not None
+            and not use_bitsandbytes_4bit
+            and not self.use_presharded_weights
+        ):
             shard_size = param_data.shape[input_dim]
-            start_idx = tp_rank * shard_size
+            start_idx = self.tp_rank * shard_size
             loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size)
 
         # Special case for loading scales off disk, which often do not
@@ -1131,7 +1234,9 @@ class RowParallelLinear(LinearBase):
         if len(loaded_weight.shape) == 0:
             loaded_weight = loaded_weight.reshape(1)
 
-        assert param_data.shape == loaded_weight.shape
+        assert (
+            param_data.shape == loaded_weight.shape
+        ), f"{param_data.shape=}, {loaded_weight.shape=}"
         param_data.copy_(loaded_weight)
 
     def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor):
@@ -1148,11 +1253,10 @@ class RowParallelLinear(LinearBase):
         if self.input_is_parallel:
             input_parallel = input_
         else:
-            tp_rank = get_tensor_model_parallel_rank()
             splitted_input = split_tensor_along_last_dim(
                 input_, num_partitions=self.tp_size
             )
-            input_parallel = splitted_input[tp_rank].contiguous()
+            input_parallel = splitted_input[self.tp_rank].contiguous()
 
         # Matrix multiply.
         assert self.quant_method is not None
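
Taken together, the parallel linear layers in this release accept an explicit tp_rank/tp_size pair (and, for the merged and row variants, use_presharded_weights) instead of always querying the global tensor-parallel group. A hypothetical construction sketch under those assumptions (distributed setup and checkpoint loading are handled elsewhere):

    # Illustration only: overriding the TP topology per layer.
    from sglang.srt.layers.linear import RowParallelLinear

    proj = RowParallelLinear(
        input_size=4096,
        output_size=4096,
        bias=False,
        tp_rank=0,                    # explicit rank/size instead of the global TP group
        tp_size=2,
        use_presharded_weights=True,  # checkpoint already stores per-rank shards
    )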