sglang 0.4.4.post1__py3-none-any.whl → 0.4.4.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +6 -0
- sglang/bench_one_batch.py +1 -1
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +26 -4
- sglang/check_env.py +3 -4
- sglang/lang/backend/openai.py +18 -5
- sglang/lang/chat_template.py +28 -7
- sglang/lang/interpreter.py +7 -3
- sglang/lang/ir.py +10 -0
- sglang/srt/_custom_ops.py +1 -1
- sglang/srt/code_completion_parser.py +174 -0
- sglang/srt/configs/__init__.py +2 -6
- sglang/srt/configs/deepseekvl2.py +676 -0
- sglang/srt/configs/janus_pro.py +3 -4
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +49 -8
- sglang/srt/configs/utils.py +25 -0
- sglang/srt/connector/__init__.py +51 -0
- sglang/srt/connector/base_connector.py +112 -0
- sglang/srt/connector/redis.py +85 -0
- sglang/srt/connector/s3.py +122 -0
- sglang/srt/connector/serde/__init__.py +31 -0
- sglang/srt/connector/serde/safe_serde.py +29 -0
- sglang/srt/connector/serde/serde.py +43 -0
- sglang/srt/connector/utils.py +35 -0
- sglang/srt/conversation.py +88 -0
- sglang/srt/disaggregation/conn.py +81 -0
- sglang/srt/disaggregation/decode.py +495 -0
- sglang/srt/disaggregation/mini_lb.py +285 -0
- sglang/srt/disaggregation/prefill.py +249 -0
- sglang/srt/disaggregation/utils.py +44 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/parallel_state.py +42 -8
- sglang/srt/entrypoints/engine.py +55 -5
- sglang/srt/entrypoints/http_server.py +78 -13
- sglang/srt/entrypoints/verl_engine.py +2 -0
- sglang/srt/function_call_parser.py +133 -55
- sglang/srt/hf_transformers_utils.py +28 -3
- sglang/srt/layers/activation.py +4 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/flashattention_backend.py +434 -0
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashmla_backend.py +284 -0
- sglang/srt/layers/attention/triton_backend.py +171 -38
- sglang/srt/layers/attention/triton_ops/decode_attention.py +94 -31
- sglang/srt/layers/attention/triton_ops/extend_attention.py +14 -5
- sglang/srt/layers/attention/utils.py +53 -0
- sglang/srt/layers/attention/vision.py +9 -28
- sglang/srt/layers/dp_attention.py +41 -19
- sglang/srt/layers/layernorm.py +24 -2
- sglang/srt/layers/linear.py +17 -5
- sglang/srt/layers/logits_processor.py +25 -7
- sglang/srt/layers/moe/ep_moe/kernels.py +110 -11
- sglang/srt/layers/moe/ep_moe/layer.py +273 -1
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +416 -0
- sglang/srt/layers/moe/fused_moe_native.py +2 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +23 -32
- sglang/srt/layers/moe/fused_moe_triton/layer.py +1 -2
- sglang/srt/layers/moe/topk.py +60 -20
- sglang/srt/layers/parameter.py +1 -1
- sglang/srt/layers/quantization/__init__.py +80 -53
- sglang/srt/layers/quantization/awq.py +200 -0
- sglang/srt/layers/quantization/base_config.py +5 -0
- sglang/srt/layers/quantization/blockwise_int8.py +1 -1
- sglang/srt/layers/quantization/compressed_tensors/__init__.py +0 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +652 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +658 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +9 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py +56 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +162 -0
- sglang/srt/layers/quantization/compressed_tensors/utils.py +218 -0
- sglang/srt/layers/quantization/fp8.py +76 -34
- sglang/srt/layers/quantization/fp8_kernel.py +25 -8
- sglang/srt/layers/quantization/fp8_utils.py +284 -28
- sglang/srt/layers/quantization/gptq.py +36 -19
- sglang/srt/layers/quantization/kv_cache.py +98 -0
- sglang/srt/layers/quantization/modelopt_quant.py +9 -7
- sglang/srt/layers/quantization/utils.py +153 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +70 -19
- sglang/srt/layers/rotary_embedding.py +78 -87
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/backend/base_backend.py +4 -4
- sglang/srt/lora/backend/flashinfer_backend.py +12 -9
- sglang/srt/lora/backend/triton_backend.py +5 -8
- sglang/srt/lora/layers.py +87 -33
- sglang/srt/lora/lora.py +2 -22
- sglang/srt/lora/lora_manager.py +67 -30
- sglang/srt/lora/mem_pool.py +117 -52
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +10 -4
- sglang/srt/lora/triton_ops/qkv_lora_b.py +8 -3
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +16 -5
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +11 -6
- sglang/srt/lora/utils.py +18 -1
- sglang/srt/managers/cache_controller.py +2 -5
- sglang/srt/managers/data_parallel_controller.py +30 -8
- sglang/srt/managers/expert_distribution.py +81 -0
- sglang/srt/managers/io_struct.py +43 -5
- sglang/srt/managers/mm_utils.py +373 -0
- sglang/srt/managers/multimodal_processor.py +68 -0
- sglang/srt/managers/multimodal_processors/base_processor.py +275 -0
- sglang/srt/managers/multimodal_processors/clip.py +63 -0
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +119 -0
- sglang/srt/managers/multimodal_processors/gemma3.py +83 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/janus_pro.py +20 -15
- sglang/srt/managers/{image_processors → multimodal_processors}/llava.py +10 -15
- sglang/srt/managers/multimodal_processors/minicpm.py +167 -0
- sglang/srt/managers/{image_processors → multimodal_processors}/mlama.py +7 -8
- sglang/srt/managers/{image_processors → multimodal_processors}/qwen_vl.py +28 -22
- sglang/srt/managers/schedule_batch.py +134 -30
- sglang/srt/managers/scheduler.py +290 -31
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +59 -24
- sglang/srt/managers/tp_worker.py +4 -1
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -3
- sglang/srt/managers/utils.py +6 -1
- sglang/srt/mem_cache/hiradix_cache.py +18 -7
- sglang/srt/mem_cache/memory_pool.py +255 -98
- sglang/srt/mem_cache/paged_allocator.py +2 -2
- sglang/srt/mem_cache/radix_cache.py +4 -4
- sglang/srt/model_executor/cuda_graph_runner.py +36 -21
- sglang/srt/model_executor/forward_batch_info.py +68 -11
- sglang/srt/model_executor/model_runner.py +75 -8
- sglang/srt/model_loader/loader.py +171 -3
- sglang/srt/model_loader/weight_utils.py +51 -3
- sglang/srt/models/clip.py +563 -0
- sglang/srt/models/deepseek_janus_pro.py +31 -88
- sglang/srt/models/deepseek_nextn.py +22 -10
- sglang/srt/models/deepseek_v2.py +329 -73
- sglang/srt/models/deepseek_vl2.py +358 -0
- sglang/srt/models/gemma3_causal.py +694 -0
- sglang/srt/models/gemma3_mm.py +468 -0
- sglang/srt/models/llama.py +47 -7
- sglang/srt/models/llama_eagle.py +1 -0
- sglang/srt/models/llama_eagle3.py +196 -0
- sglang/srt/models/llava.py +3 -3
- sglang/srt/models/llavavid.py +3 -3
- sglang/srt/models/minicpmo.py +1995 -0
- sglang/srt/models/minicpmv.py +62 -137
- sglang/srt/models/mllama.py +4 -4
- sglang/srt/models/phi3_small.py +1 -1
- sglang/srt/models/qwen2.py +3 -0
- sglang/srt/models/qwen2_5_vl.py +68 -146
- sglang/srt/models/qwen2_classification.py +75 -0
- sglang/srt/models/qwen2_moe.py +9 -1
- sglang/srt/models/qwen2_vl.py +25 -63
- sglang/srt/openai_api/adapter.py +201 -104
- sglang/srt/openai_api/protocol.py +33 -7
- sglang/srt/patch_torch.py +71 -0
- sglang/srt/sampling/sampling_batch_info.py +1 -1
- sglang/srt/sampling/sampling_params.py +6 -6
- sglang/srt/server_args.py +114 -14
- sglang/srt/speculative/build_eagle_tree.py +7 -347
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +41 -5
- sglang/srt/speculative/eagle_utils.py +208 -252
- sglang/srt/speculative/eagle_worker.py +140 -54
- sglang/srt/speculative/spec_info.py +6 -1
- sglang/srt/torch_memory_saver_adapter.py +22 -0
- sglang/srt/utils.py +215 -21
- sglang/test/__init__.py +0 -0
- sglang/test/attention/__init__.py +0 -0
- sglang/test/attention/test_flashattn_backend.py +312 -0
- sglang/test/runners.py +29 -2
- sglang/test/test_activation.py +2 -1
- sglang/test/test_block_fp8.py +5 -4
- sglang/test/test_block_fp8_ep.py +2 -1
- sglang/test/test_dynamic_grad_mode.py +58 -0
- sglang/test/test_layernorm.py +3 -2
- sglang/test/test_utils.py +56 -5
- sglang/utils.py +31 -0
- sglang/version.py +1 -1
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/METADATA +16 -8
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/RECORD +180 -132
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/WHEEL +1 -1
- sglang/srt/configs/qwen2_5_vl_config.py +0 -1006
- sglang/srt/managers/image_processor.py +0 -55
- sglang/srt/managers/image_processors/base_image_processor.py +0 -219
- sglang/srt/managers/image_processors/minicpmv.py +0 -86
- sglang/srt/managers/multi_modality_padding.py +0 -134
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info/licenses}/LICENSE +0 -0
- {sglang-0.4.4.post1.dist-info → sglang-0.4.4.post3.dist-info}/top_level.txt +0 -0
sglang/srt/lora/mem_pool.py
CHANGED
@@ -2,9 +2,12 @@ from typing import Dict, List, Optional, Set, Tuple
 
 import torch
 
+from sglang.srt.distributed import divide
 from sglang.srt.hf_transformers_utils import AutoConfig
+from sglang.srt.lora.layers import BaseLayerWithLoRA
 from sglang.srt.lora.lora import LoRAAdapter
 from sglang.srt.lora.utils import (
+    ROW_PARALLELISM_LINEAR_LORA_NAMES,
     LoRAType,
     get_hidden_dim,
     get_stacked_multiply,
@@ -21,6 +24,9 @@ class LoRAMemoryPool:
         max_loras_per_batch: int,
         max_lora_dim: int,
         dtype: torch.dtype,
+        tp_size: int,
+        tp_rank: int,
+        lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]],
     ):
 
         self.base_hf_config: AutoConfig = base_hf_config
@@ -28,6 +34,9 @@ class LoRAMemoryPool:
         self.max_loras_per_batch: int = max_loras_per_batch
         self.max_lora_dim: int = max_lora_dim
         self.dtype: torch.dtype = dtype
+        self.tp_size: int = tp_size
+        self.tp_rank: int = tp_rank
+        self.lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]] = lora_modules
 
         # Both A_buffer and B_buffer maps lora weight names to its buffer space.
         # A_buffer contains num_layer number of row-major tensors with shape
@@ -45,6 +54,41 @@ class LoRAMemoryPool:
         # Here we don't initalize to None since None is a valid uid
         self.buffer_id_to_uid: List[Optional[str]] = [""] * self.max_loras_per_batch
 
+    def get_lora_A_shape(
+        self, module_name: str, base_model: torch.nn.Module
+    ) -> Tuple[int]:
+        """
+        Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
+        """
+        input_dim, _ = get_hidden_dim(module_name, self.base_hf_config, base_model)
+        c = get_stacked_multiply(module_name)
+        if self.tp_size > 1:
+            if module_name in ROW_PARALLELISM_LINEAR_LORA_NAMES:
+                input_dim = divide(input_dim, self.tp_size)
+        return (
+            self.max_loras_per_batch,
+            self.max_lora_dim * c,
+            input_dim,
+        )
+
+    def get_lora_B_shape(
+        self, module_name: str, base_model: torch.nn.Module
+    ) -> Tuple[int]:
+        """
+        Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
+        """
+        _, output_dim = get_hidden_dim(module_name, self.base_hf_config, base_model)
+        c = get_stacked_multiply(module_name)
+        if self.tp_size > 1:
+            if module_name not in ROW_PARALLELISM_LINEAR_LORA_NAMES:
+                output_dim = divide(output_dim, self.tp_size)
+        return (
+            c,
+            self.max_loras_per_batch,
+            output_dim,
+            self.max_lora_dim,
+        )
+
     def init_buffers(
         self,
         lora_weight_names: Set[Tuple[str]],
@@ -54,42 +98,31 @@ class LoRAMemoryPool:
         # lora_weight_names is a set of name pairs indicating each pair of lora modules to load
         # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj"), ("o_proj", "o_proj")}
         self.lora_weight_names: Set[Tuple[str]] = lora_weight_names
-
-        for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            self.
-
-                    (
-                        c,  # stacked lora_b modules might need separation
-                        self.max_loras_per_batch,
-                        output_dim,
-                        self.max_lora_dim,
-                    ),
-                    dtype=self.dtype,
-                    device="cuda",
-                )
-                for i in range(self.num_layer)
-            ]
+        device = next(base_model.parameters()).device
+        lora_module_A_names = set([name[0] for name in lora_weight_names])
+        lora_module_B_names = set([name[1] for name in lora_weight_names])
+        # Init A tensor, column_major=False
+        for module_A in lora_module_A_names:
+            lora_A_shape = self.get_lora_A_shape(module_A, base_model)
+            self.A_buffer[module_A] = [
+                torch.empty(
+                    lora_A_shape,
+                    dtype=self.dtype,
+                    device=device,
+                )
+                for i in range(self.num_layer)
+            ]
+        # Init B tensor, column_major=True
+        for module_B in lora_module_B_names:
+            lora_B_shape = self.get_lora_B_shape(module_B, base_model)
+            self.B_buffer[module_B] = [
+                torch.empty(
+                    lora_B_shape,
+                    dtype=self.dtype,
+                    device=device,
+                )
+                for _ in range(self.num_layer)
+            ]
 
     def prepare_lora_batch(
         self,
@@ -130,36 +163,68 @@ class LoRAMemoryPool:
         if uid is None:
             for i in range(self.num_layer):
                 for k in self.A_buffer.keys():
-                    self.A_buffer[k][i][buffer_id]
+                    self.A_buffer[k][i][buffer_id] = 0
             return
 
         assert lora_adapter is not None
+        lora_rank = lora_adapter.config.hf_config["r"]
        for layer_id in range(self.num_layer):
            layer_weights = lora_adapter.layers[layer_id].weights
+            temp_A_buffer: Dict[str, torch.Tensor] = {}
+            temp_B_buffer: Dict[str, torch.Tensor] = {}
            for name, weights in layer_weights.items():
                if "lora_A" in name:
                    lora_weight_name = get_weight_name(
                        name, self.lora_weight_names, LoRAType.LORA_A
                    )
-
-                    self.A_buffer[lora_weight_name][layer_id][buffer_id].copy_(
-                        weights
-                    )
+                    temp_A_buffer[lora_weight_name] = weights
                else:
                    lora_weight_name = get_weight_name(
                        name, self.lora_weight_names, LoRAType.LORA_B
                    )
-
-
-
-
-
-
-
-
-
-
-
+                    temp_B_buffer[lora_weight_name] = weights
+
+            if self.tp_size > 1:
+                cur_layer_modules = self.lora_modules[layer_id]
+                for module_name, module in cur_layer_modules:
+                    if "qkv_proj" in module_name:
+                        temp_A_buffer["qkv_proj"] = module.slice_lora_a_weights(
+                            temp_A_buffer["qkv_proj"], self.tp_rank
+                        )
+                        temp_B_buffer["q_proj"], temp_B_buffer["kv_proj"] = (
+                            module.slice_lora_b_weights(
+                                [temp_B_buffer["q_proj"], temp_B_buffer["kv_proj"]],
+                                self.tp_rank,
+                            )
+                        )
+                    else:
+                        weight_name = get_weight_name(
+                            module_name, self.lora_weight_names, LoRAType.LORA_A
+                        )
+                        temp_A_buffer[weight_name] = module.slice_lora_a_weights(
+                            temp_A_buffer[weight_name], self.tp_rank
+                        )
+                        temp_B_buffer[weight_name] = module.slice_lora_b_weights(
+                            temp_B_buffer[weight_name], self.tp_rank
+                        )
+
+            for name, weights in temp_A_buffer.items():
+                c = get_stacked_multiply(name)
+                self.A_buffer[name][layer_id][buffer_id][: lora_rank * c, :].copy_(
+                    weights
+                )
+
+            for name, weights in temp_B_buffer.items():
+                c = get_stacked_multiply(name)
+                if c > 1:
+                    for stacked_id in range(c):
+                        self.B_buffer[name][layer_id][stacked_id][buffer_id][
+                            :, :lora_rank
+                        ].copy_(weights[stacked_id])
+                else:
+                    self.B_buffer[name][layer_id][0][buffer_id][:, :lora_rank].copy_(
+                        weights
+                    )
 
     def get_tensor(
         self, weight_name: str, layer_id: int, lora_type: LoRAType
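The two new shape helpers above decide how the pooled LoRA buffers are sized under tensor parallelism: row-parallel linears (o_proj, down_proj) shard the lora_A input dimension across ranks, while the other modules shard the lora_B output dimension. A minimal standalone sketch of that sizing logic, with illustrative constants rather than sglang's real configuration, is shown below.

# Minimal sketch (not the sglang API) of how the new shape helpers size the
# pooled LoRA buffers under tensor parallelism; all constants are illustrative.
ROW_PARALLEL = {"o_proj", "down_proj"}   # lora_A input dim is sharded for these
MAX_LORAS, MAX_RANK, TP_SIZE = 8, 16, 4

def lora_a_shape(module: str, input_dim: int, stack: int = 1):
    # Row-parallel linears split their input dimension across TP ranks,
    # so each rank's A buffer only covers its slice of input_dim.
    if TP_SIZE > 1 and module in ROW_PARALLEL:
        input_dim //= TP_SIZE
    return (MAX_LORAS, MAX_RANK * stack, input_dim)

def lora_b_shape(module: str, output_dim: int, stack: int = 1):
    # Column-parallel linears split their output dimension instead,
    # so each rank's B buffer only covers its slice of output_dim.
    if TP_SIZE > 1 and module not in ROW_PARALLEL:
        output_dim //= TP_SIZE
    return (stack, MAX_LORAS, output_dim, MAX_RANK)

print(lora_a_shape("o_proj", 4096))       # (8, 16, 1024)
print(lora_b_shape("qkv_proj", 4096, 3))  # (3, 8, 1024, 16), hypothetical stacked qkv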
sglang/srt/lora/triton_ops/gate_up_lora_b.py
CHANGED
@@ -22,17 +22,18 @@ def _gate_up_lora_b_kernel(
     w_stride_2,
     output_stride_0,
     output_stride_1,
-    # Information on sequence lengths and weight id
+    # Information on sequence lengths,ranks and weight id
     seg_lens,
     seg_indptr,
     weight_indices,
+    lora_ranks,
     # Meta parameters
     BLOCK_S: tl.constexpr,
     BLOCK_N: tl.constexpr,
     BLOCK_K: tl.constexpr,
     # For fused output scaling and adding
     fuse_scaling_add,
-
+    scalings,
 ):
     # This kernel packs 2 sgemms (gate/up) into a single kernel.
 
@@ -51,6 +52,11 @@ def _gate_up_lora_b_kernel(
     w_index = tl.load(weight_indices + batch_id)
     seg_start = tl.load(seg_indptr + batch_id)
     n_start = gate_up_id * output_dim  # offset on output dim
+    rank = tl.load(lora_ranks + w_index)
+    scaling = tl.load(scalings + w_index)
+
+    # Adjust K (rank) according to the specific LoRA adapter
+    K = tl.minimum(K, rank)
 
     # The tile in output matrix will have (pid_s, pid_n) as id
     num_pid_n = tl.cdiv(output_dim, BLOCK_N)
@@ -109,7 +115,6 @@ def gate_up_lora_b_fwd(
     batch_info: LoRABatchInfo,
     output_dim: int,
     base_output: torch.Tensor = None,
-    scaling: float = 1.0,
 ) -> torch.Tensor:
 
     # x: (s, 2 * r)
@@ -160,11 +165,12 @@ def gate_up_lora_b_fwd(
         batch_info.seg_lens,
         batch_info.seg_indptr,
         batch_info.weight_indices,
+        batch_info.lora_ranks,
         BLOCK_S,
         BLOCK_OUT,
         BLOCK_R,
         fuse_scaling_add,
-
+        batch_info.scalings,
     )
 
     return output
sglang/srt/lora/triton_ops/qkv_lora_b.py
CHANGED
@@ -26,6 +26,7 @@ def _qkv_lora_b_kernel(
     seg_lens,
     seg_indptr,
     weight_indices,
+    lora_ranks,
     # Offsets of q/k/v slice on output dimension
     n_offs,
     # Meta parameters
@@ -34,7 +35,7 @@ def _qkv_lora_b_kernel(
     BLOCK_K: tl.constexpr,
     # For fused output scaling and adding
     fuse_scaling_add,
-
+    scalings,
 ):
     # This kernel packs 3 sgemms (q/k/v) into a single kernel.
 
@@ -54,6 +55,10 @@ def _qkv_lora_b_kernel(
     seg_start = tl.load(seg_indptr + batch_id)
     n_start = tl.load(n_offs + qkv_id)
     n_size = tl.load(n_offs + qkv_id + 1) - n_start
+    rank = tl.load(lora_ranks + w_index)
+    scaling = tl.load(scalings + w_index)
+    # Adjust K (rank) according to the specific LoRA adapter
+    K = tl.minimum(K, rank)
 
     # The tile in output matrix will have (pid_s, pid_n) as id
     num_pid_n = tl.cdiv(max_qkv_out_dim, BLOCK_N)
@@ -112,7 +117,6 @@ def qkv_lora_b_fwd(
     output_offset: torch.Tensor,
     max_qkv_out_dim: int,
     base_output: torch.Tensor = None,
-    scaling: float = 1.0,
 ) -> torch.Tensor:
 
     # x: (s, 3 * r)
@@ -171,12 +175,13 @@ def qkv_lora_b_fwd(
         batch_info.seg_lens,
         batch_info.seg_indptr,
         batch_info.weight_indices,
+        batch_info.lora_ranks,
         output_offset,
         BLOCK_S,
         BLOCK_OUT,
         BLOCK_R,
         fuse_scaling_add,
-
+        batch_info.scalings,
     )
 
     return output
sglang/srt/lora/triton_ops/sgemm_lora_a.py
CHANGED
@@ -12,8 +12,9 @@ def _sgemm_lora_a_kernel(
     weights,
     output,
     # Matrix dimensions
-    N,  # r
+    N,  # stack_num * r
     K,  # input_dim
+    stack_num,
     # Strides
     x_stride_0,
     x_stride_1,
@@ -22,10 +23,11 @@ def _sgemm_lora_a_kernel(
     w_stride_2,
     output_stride_0,
     output_stride_1,
-    # Information on sequence lengths and weight id
+    # Information on sequence lengths,ranks and weight id
     seg_lens,
     seg_indptr,
     weight_indices,
+    lora_ranks,
     # Meta parameters
     BLOCK_S: tl.constexpr,
     BLOCK_N: tl.constexpr,
@@ -43,6 +45,9 @@ def _sgemm_lora_a_kernel(
     seg_len = tl.load(seg_lens + batch_id)
     w_index = tl.load(weight_indices + batch_id)
     seg_start = tl.load(seg_indptr + batch_id)
+    rank = tl.load(lora_ranks + w_index)
+    # Adjust N (stack_num * max_rank) according to the specific LoRA adapter
+    N = tl.minimum(N, rank * stack_num)
 
     # The tile in output matrix will have (pid_s, pid_n) as id
     num_pid_n = tl.cdiv(N, BLOCK_N)
@@ -91,11 +96,15 @@ def _sgemm_lora_a_kernel(
 
 
 def sgemm_lora_a_fwd(
-    x: torch.Tensor,
+    x: torch.Tensor,
+    weights: torch.Tensor,
+    batch_info: LoRABatchInfo,
+    stack_num: int = 1,
 ) -> torch.Tensor:
     # x: (s, input_dim)
-    # weights: (num_lora, r, input_dim)
-    # output: (s, r)
+    # weights: (num_lora, stack_num * r, input_dim)
+    # output: (s, stack_num * r)
+    # stack_num: run_qkv_lora: 3, run_gate_up_lora: 2
     # when called by run_qkv_lora, the weights.shape[-2] will be 3 * r
     # input_dim is much larger than r
 
@@ -126,6 +135,7 @@ def sgemm_lora_a_fwd(
         output,
         R,
         K,
+        stack_num,
         x.stride(0),
         x.stride(1),
         weights.stride(0),
@@ -136,6 +146,7 @@ def sgemm_lora_a_fwd(
         batch_info.seg_lens,
         batch_info.seg_indptr,
         batch_info.weight_indices,
+        batch_info.lora_ranks,
         BLOCK_S,
         BLOCK_R,
         BLOCK_K,
sglang/srt/lora/triton_ops/sgemm_lora_b.py
CHANGED
@@ -26,13 +26,14 @@ def _sgemm_lora_b_kernel(
     seg_lens,
     seg_indptr,
     weight_indices,
+    lora_ranks,
     # Meta parameters
     BLOCK_S: tl.constexpr,
     BLOCK_N: tl.constexpr,
     BLOCK_K: tl.constexpr,
     # For fused output scaling and adding
     fuse_scaling_add,
-
+    scalings,
 ):
     # x: (s, K), s is the sum of sequence lengths
     # weights: (num_lora, N, K)
@@ -45,6 +46,10 @@ def _sgemm_lora_b_kernel(
     seg_len = tl.load(seg_lens + batch_id)
     w_index = tl.load(weight_indices + batch_id)
     seg_start = tl.load(seg_indptr + batch_id)
+    rank = tl.load(lora_ranks + w_index)
+    scaling = tl.load(scalings + w_index)
+    # Adjust K (rank) according to the specific LoRA adapter
+    K = tl.minimum(K, rank)
 
     # The tile in output matrix will have (pid_s, pid_n) as id
     num_pid_n = tl.cdiv(N, BLOCK_N)
@@ -100,12 +105,11 @@ def sgemm_lora_b_fwd(
     weights: torch.Tensor,
     batch_info: LoRABatchInfo,
     base_output: torch.Tensor = None,
-    scaling: float = 1.0,
 ) -> torch.Tensor:
-    # x: (s,
-    # weights: (num_lora, output_dim,
+    # x: (s, max_r)
+    # weights: (num_lora, output_dim, max_r)
     # output: (s, output_dim)
-    # output_dim is much larger than
+    # output_dim is much larger than max_r
 
     assert x.is_contiguous()
     assert weights.is_contiguous()
@@ -150,10 +154,11 @@ def sgemm_lora_b_fwd(
         batch_info.seg_lens,
         batch_info.seg_indptr,
         batch_info.weight_indices,
+        batch_info.lora_ranks,
         BLOCK_S,
         BLOCK_N,
         BLOCK_R,
         fuse_scaling_add,
-
+        batch_info.scalings,
     )
     return output
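Across all four Triton kernels in this release, the per-call scalar scaling argument is replaced by per-adapter lora_ranks and scalings tensors indexed by w_index, and the reduction dimension is clamped with tl.minimum(K, rank), so adapters with different ranks can share one buffer padded to the maximum rank. A rough plain-PyTorch illustration of the same idea follows (not the Triton kernels themselves; shapes and values are made up).

# Illustrative sketch in plain PyTorch of the per-adapter rank/scaling lookup
# that the kernels now do; buffers are padded to max_rank, each adapter only
# reduces over its own rank.
import torch

max_rank, in_dim, out_dim = 16, 64, 32
lora_ranks = torch.tensor([16, 8])      # rank of each loaded adapter
scalings = torch.tensor([1.0, 2.0])     # per-adapter scaling factor
A = torch.randn(2, max_rank, in_dim)    # padded A buffer (num_lora, max_rank, in_dim)
B = torch.randn(2, out_dim, max_rank)   # padded B buffer (num_lora, out_dim, max_rank)

def lora_apply(x: torch.Tensor, adapter: int) -> torch.Tensor:
    r = int(lora_ranks[adapter])        # kernel analogue: K = tl.minimum(K, rank)
    s = float(scalings[adapter])        # kernel analogue: tl.load(scalings + w_index)
    return s * (x @ A[adapter, :r].T) @ B[adapter, :, :r].T

x = torch.randn(4, in_dim)
print(lora_apply(x, 1).shape)           # torch.Size([4, 32])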
sglang/srt/lora/utils.py
CHANGED
@@ -25,6 +25,12 @@ class LoRABatchInfo:
     # The index of lora adapter used by each sequence, in shape (bs,)
     weight_indices: torch.Tensor
 
+    # ranks of each lora adapter, in shape (lora_num,)
+    lora_ranks: torch.Tensor
+
+    # scaling of each lora adapter, in shape (lora_num,)
+    scalings: torch.Tensor
+
 
 class LoRAType(Enum):
     LORA_A = 0
@@ -133,9 +139,20 @@ def get_weight_name(
     target_name is name of a given module,
     lora_weight_names is a set of lora stacked name pairs (see get_stacked_name method above)
     If there is a weight name in lora_weight_names that can match target_name, return this name
-    Else
+    Else raise ValueError.
     """
     idx = 0 if lora_type == LoRAType.LORA_A else 1
     for weight_name_pair in lora_weight_names:
         if weight_name_pair[idx] in target_name:
             return weight_name_pair[idx]
+    raise ValueError(
+        f"Cannot find weight name for {target_name} in {lora_weight_names}"
+    )
+
+
+# TODO: [PR #4274] For future use to simplify the mapping between HF module names and customized module names.
+VOCAB_PARALLELISM_EMBEDDING_NAMES = ["embeddings"]
+COLUMN_PARALLELISM_LINEAR_LORA_NAMES = ["gate_proj", "up_proj"]
+MERGED_COLUMN_PARALLELISM_LINEAR_LORA_NAMES = ["gate_up_proj"]
+QKV_PARALLELISM_LINEAR_LORA_NAMES = ["qkv_proj"]
+ROW_PARALLELISM_LINEAR_LORA_NAMES = ["o_proj", "down_proj"]
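For reference, get_weight_name now raises when a module cannot be matched against the stacked name pairs instead of falling through. A toy re-implementation (standalone, not an import of the sglang helper) shows the matching behavior.

# Standalone toy version of the matching logic shown in the diff above.
from enum import Enum

class LoRAType(Enum):
    LORA_A = 0
    LORA_B = 1

lora_weight_names = {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj"), ("o_proj", "o_proj")}

def get_weight_name(target_name, lora_weight_names, lora_type):
    # Index 0 of each pair names the stacked lora_A module, index 1 the lora_B module.
    idx = 0 if lora_type == LoRAType.LORA_A else 1
    for pair in lora_weight_names:
        if pair[idx] in target_name:
            return pair[idx]
    # An unmatched module is now a hard error rather than a silent miss.
    raise ValueError(f"Cannot find weight name for {target_name} in {lora_weight_names}")

print(get_weight_name("model.layers.0.self_attn.qkv_proj.lora_A", lora_weight_names, LoRAType.LORA_A))
# -> "qkv_proj"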
sglang/srt/managers/cache_controller.py
CHANGED
@@ -22,10 +22,7 @@ from typing import List, Optional
 
 import torch
 
-from sglang.srt.mem_cache.memory_pool import
-    MHATokenToKVPoolHost,
-    TokenToKVPoolAllocator,
-)
+from sglang.srt.mem_cache.memory_pool import HostKVCache, TokenToKVPoolAllocator
 
 logger = logging.getLogger(__name__)
 
@@ -151,7 +148,7 @@ class HiCacheController:
     def __init__(
         self,
         token_to_kv_pool_allocator: TokenToKVPoolAllocator,
-        mem_pool_host:
+        mem_pool_host: HostKVCache,
         load_cache_event: threading.Event = None,
         write_policy: str = "write_through_selective",
     ):
sglang/srt/managers/data_parallel_controller.py
CHANGED
@@ -82,10 +82,12 @@ class DataParallelController:
         self.scheduler_procs = []
         self.workers = [None] * server_args.dp_size
 
-        if
-            dp_port_args = self.launch_dp_schedulers(server_args, port_args)
-        else:
+        if server_args.enable_dp_attention:
             dp_port_args = self.launch_dp_attention_schedulers(server_args, port_args)
+            self.control_message_step = server_args.tp_size
+        else:
+            dp_port_args = self.launch_dp_schedulers(server_args, port_args)
+            self.control_message_step = 1
 
         # Only node rank 0 runs the real data parallel controller that dispatches the requests.
         if server_args.node_rank == 0:
@@ -105,6 +107,7 @@ class DataParallelController:
         threads = []
         sockets = []
         dp_port_args = []
+        ready_events = []
         for dp_rank in range(server_args.dp_size):
             tmp_port_args = PortArgs.init_new(server_args)
             tmp_port_args.tokenizer_ipc_name = port_args.tokenizer_ipc_name
@@ -115,10 +118,13 @@ class DataParallelController:
             # We hold it first so that the next dp worker gets a different port
             sockets.append(bind_port(tmp_port_args.nccl_port))
 
+            ready_event = threading.Event()
+            ready_events.append(ready_event)
+
             # Create a thread for each worker
             thread = threading.Thread(
-                target=self.
-                args=(server_args, tmp_port_args, base_gpu_id, dp_rank),
+                target=self.launch_tensor_parallel_group_thread,
+                args=(server_args, tmp_port_args, base_gpu_id, dp_rank, ready_event),
             )
             threads.append(thread)
             base_gpu_id += server_args.tp_size * server_args.gpu_id_step
@@ -130,11 +136,27 @@ class DataParallelController:
         # Start all threads
         for thread in threads:
             thread.start()
-        for
-
+        for event in ready_events:
+            event.wait()
 
         return dp_port_args
 
+    def launch_tensor_parallel_group_thread(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        base_gpu_id: int,
+        dp_rank: int,
+        ready_event: threading.Event,
+    ):
+        self.launch_tensor_parallel_group(server_args, port_args, base_gpu_id, dp_rank)
+        ready_event.set()
+
+        # This thread cannot be closed because otherwise the `kill_itself_when_parent_died`
+        # function in scheduler.py will kill the scheduler.
+        while True:
+            pass
+
     def launch_dp_attention_schedulers(self, server_args, port_args):
         self.launch_tensor_parallel_group(server_args, port_args, 0, None)
         dp_port_args = []
@@ -223,7 +245,7 @@ class DataParallelController:
                 self.dispatching(recv_req)
             else:
                 # Send other control messages to first worker of tp group
-                for worker in self.workers[:: self.
+                for worker in self.workers[:: self.control_message_step]:
                     worker.send_pyobj(recv_req)
 
 
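The controller change above launches one thread per data-parallel rank, has each thread signal a threading.Event once its tensor-parallel group is up, and then waits on the events instead of joining the threads, which now stay alive on purpose. A minimal generic sketch of that readiness handshake, with placeholder work standing in for launch_tensor_parallel_group, is shown below.

# Generic readiness-handshake sketch (placeholder work, not sglang's scheduler launch).
import threading
import time

def launch_group_thread(dp_rank: int, ready_event: threading.Event):
    time.sleep(0.1 * dp_rank)   # stand-in for the real tensor-parallel group launch
    ready_event.set()           # tell the controller this rank is up
    while True:                 # keep the thread alive, mirroring the diff
        time.sleep(60)

ready_events = []
for dp_rank in range(4):
    ev = threading.Event()
    ready_events.append(ev)
    threading.Thread(target=launch_group_thread, args=(dp_rank, ev), daemon=True).start()

for ev in ready_events:         # wait for every rank instead of joining
    ev.wait()
print("all dp ranks ready")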
sglang/srt/managers/expert_distribution.py
ADDED
@@ -0,0 +1,81 @@
+import json
+import logging
+import time
+from collections import defaultdict
+from typing import Dict, List, Tuple
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+# global expert distribution recording
+class ExpertDistributionRecorder:
+    # This class is a singleton class
+    def __new__(cls):
+        if not hasattr(cls, "instance"):
+            cls.instance = super(ExpertDistributionRecorder, cls).__new__(cls)
+        return cls.instance
+
+    def __init__(self):
+        # the length of the dictionary is the number of layers
+        # the length of the list is the number of tokens
+        # the length of the tuple is topk's k value
+        self._expert_distribution_record: Dict[int, List[Tuple[int]]] = defaultdict(
+            list
+        )
+        self._record = False
+        self._current_layer_id = "UNKNOWN"
+
+    def set_current_layer(self, layer_idx):
+        self._current_layer_id = layer_idx
+
+    def record_new_token(self, topk_ids):
+        if not self._record:
+            return
+        topk_ids_list = topk_ids.to("cpu", non_blocking=True).numpy().tolist()
+        torch.cuda.synchronize()
+        for i in topk_ids_list:
+            self._expert_distribution_record[self._current_layer_id].append(tuple(i))
+
+    def reset(self):
+        """Reset the expert distribution recorder."""
+        logger.info("Resetting expert distribution record...")
+        self._record = False
+        self._expert_distribution_record.clear()
+        self._current_layer_id = "UNKNOWN"
+
+    def start_record(self):
+        """Start recording the expert distribution. Reset the recorder and set the recording flag to True."""
+        if self._record == True:
+            logger.warning(
+                "SGLang server is already recording expert ids. Did you forget to dump the expert ids recorded so far by sending requests to the `/stop_expert_distribution_record` and `/dump_expert_distribution_record` endpoints?"
+            )
+        self.reset()
+        self._record = True
+
+    def stop_record(self):
+        """Stop recording the expert distribution. Set the recording flag to False."""
+        if self._record == False:
+            logger.warning(
+                "SGLang server has not been recording expert ids. Did you forget to start recording by sending request to the `/start_expert_distribution_record` endpoint?"
+            )
+        self._record = False
+
+    def dump_record(self):
+        """Dump the expert distribution record to a file. Reset the recorder after dumping."""
+        results = {}
+        for layer_idx, layer_record in self._expert_distribution_record.items():
+            results[layer_idx] = defaultdict(int)
+            for token_record in layer_record:
+                for expert_idx in token_record:
+                    results[layer_idx][expert_idx] += 1
+        with open(
+            f"expert_distribution_rank{torch.distributed.get_rank()}_timestamp{time.time()}.csv",
+            "w",
+        ) as fd:
+            fd.write("layer_id,expert_id,count\n")
+            for layer_idx, layer_results in results.items():
+                for expert_idx, count in layer_results.items():
+                    fd.write(f"{layer_idx},{expert_idx},{count}\n")
+        self.reset()