sglang 0.5.0rc0__py3-none-any.whl → 0.5.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (170)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +6 -1
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +8 -7
  6. sglang/srt/disaggregation/decode.py +8 -4
  7. sglang/srt/disaggregation/mooncake/conn.py +43 -25
  8. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  9. sglang/srt/distributed/parallel_state.py +4 -2
  10. sglang/srt/entrypoints/context.py +3 -20
  11. sglang/srt/entrypoints/engine.py +13 -8
  12. sglang/srt/entrypoints/harmony_utils.py +2 -0
  13. sglang/srt/entrypoints/http_server.py +68 -5
  14. sglang/srt/entrypoints/openai/protocol.py +2 -9
  15. sglang/srt/entrypoints/openai/serving_chat.py +60 -265
  16. sglang/srt/entrypoints/openai/serving_completions.py +1 -0
  17. sglang/srt/entrypoints/openai/tool_server.py +4 -3
  18. sglang/srt/function_call/ebnf_composer.py +1 -0
  19. sglang/srt/function_call/function_call_parser.py +2 -0
  20. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  21. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  22. sglang/srt/function_call/kimik2_detector.py +3 -3
  23. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  24. sglang/srt/jinja_template_utils.py +6 -0
  25. sglang/srt/layers/attention/aiter_backend.py +370 -107
  26. sglang/srt/layers/attention/ascend_backend.py +3 -0
  27. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  28. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  29. sglang/srt/layers/attention/flashinfer_backend.py +55 -13
  30. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -0
  31. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  32. sglang/srt/layers/attention/triton_backend.py +24 -27
  33. sglang/srt/layers/attention/trtllm_mha_backend.py +8 -6
  34. sglang/srt/layers/attention/trtllm_mla_backend.py +129 -25
  35. sglang/srt/layers/attention/vision.py +9 -1
  36. sglang/srt/layers/attention/wave_backend.py +627 -0
  37. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  38. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  39. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  40. sglang/srt/layers/communicator.py +11 -13
  41. sglang/srt/layers/dp_attention.py +118 -27
  42. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  43. sglang/srt/layers/linear.py +1 -0
  44. sglang/srt/layers/logits_processor.py +12 -18
  45. sglang/srt/layers/moe/cutlass_moe.py +11 -16
  46. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  47. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  48. sglang/srt/layers/moe/ep_moe/layer.py +60 -2
  49. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  50. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +7 -9
  63. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  64. sglang/srt/layers/moe/topk.py +4 -1
  65. sglang/srt/layers/multimodal.py +156 -40
  66. sglang/srt/layers/quantization/__init__.py +10 -35
  67. sglang/srt/layers/quantization/awq.py +15 -16
  68. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +0 -1
  69. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  70. sglang/srt/layers/quantization/fp8_utils.py +22 -10
  71. sglang/srt/layers/quantization/gptq.py +12 -17
  72. sglang/srt/layers/quantization/marlin_utils.py +15 -5
  73. sglang/srt/layers/quantization/modelopt_quant.py +58 -41
  74. sglang/srt/layers/quantization/mxfp4.py +20 -3
  75. sglang/srt/layers/quantization/utils.py +52 -2
  76. sglang/srt/layers/quantization/w4afp8.py +20 -11
  77. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  78. sglang/srt/layers/rotary_embedding.py +281 -2
  79. sglang/srt/layers/sampler.py +5 -2
  80. sglang/srt/lora/backend/base_backend.py +3 -23
  81. sglang/srt/lora/layers.py +66 -116
  82. sglang/srt/lora/lora.py +17 -62
  83. sglang/srt/lora/lora_manager.py +12 -48
  84. sglang/srt/lora/lora_registry.py +20 -9
  85. sglang/srt/lora/mem_pool.py +20 -63
  86. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  87. sglang/srt/lora/utils.py +25 -58
  88. sglang/srt/managers/cache_controller.py +24 -29
  89. sglang/srt/managers/detokenizer_manager.py +1 -1
  90. sglang/srt/managers/io_struct.py +20 -6
  91. sglang/srt/managers/mm_utils.py +1 -2
  92. sglang/srt/managers/multimodal_processor.py +1 -1
  93. sglang/srt/managers/schedule_batch.py +43 -49
  94. sglang/srt/managers/schedule_policy.py +6 -6
  95. sglang/srt/managers/scheduler.py +18 -11
  96. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  97. sglang/srt/managers/tokenizer_manager.py +53 -44
  98. sglang/srt/mem_cache/allocator.py +39 -214
  99. sglang/srt/mem_cache/allocator_ascend.py +158 -0
  100. sglang/srt/mem_cache/chunk_cache.py +1 -1
  101. sglang/srt/mem_cache/hicache_storage.py +1 -1
  102. sglang/srt/mem_cache/hiradix_cache.py +34 -24
  103. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  104. sglang/srt/mem_cache/memory_pool_host.py +33 -35
  105. sglang/srt/mem_cache/radix_cache.py +2 -5
  106. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  107. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  108. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  109. sglang/srt/model_executor/cuda_graph_runner.py +29 -23
  110. sglang/srt/model_executor/forward_batch_info.py +33 -14
  111. sglang/srt/model_executor/model_runner.py +179 -81
  112. sglang/srt/model_loader/loader.py +18 -6
  113. sglang/srt/models/deepseek_nextn.py +2 -1
  114. sglang/srt/models/deepseek_v2.py +79 -38
  115. sglang/srt/models/gemma2.py +0 -34
  116. sglang/srt/models/gemma3n_mm.py +8 -9
  117. sglang/srt/models/glm4.py +6 -0
  118. sglang/srt/models/glm4_moe.py +11 -11
  119. sglang/srt/models/glm4_moe_nextn.py +2 -1
  120. sglang/srt/models/glm4v.py +589 -0
  121. sglang/srt/models/glm4v_moe.py +400 -0
  122. sglang/srt/models/gpt_oss.py +142 -20
  123. sglang/srt/models/granite.py +0 -25
  124. sglang/srt/models/llama.py +10 -27
  125. sglang/srt/models/llama4.py +19 -6
  126. sglang/srt/models/qwen2.py +2 -2
  127. sglang/srt/models/qwen2_5_vl.py +7 -3
  128. sglang/srt/models/qwen2_audio.py +10 -9
  129. sglang/srt/models/qwen2_moe.py +20 -5
  130. sglang/srt/models/qwen3.py +0 -24
  131. sglang/srt/models/qwen3_classification.py +78 -0
  132. sglang/srt/models/qwen3_moe.py +18 -5
  133. sglang/srt/models/registry.py +1 -1
  134. sglang/srt/models/step3_vl.py +6 -2
  135. sglang/srt/models/torch_native_llama.py +0 -24
  136. sglang/srt/multimodal/processors/base_processor.py +23 -13
  137. sglang/srt/multimodal/processors/glm4v.py +132 -0
  138. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  139. sglang/srt/operations.py +17 -2
  140. sglang/srt/reasoning_parser.py +316 -0
  141. sglang/srt/sampling/sampling_batch_info.py +7 -4
  142. sglang/srt/server_args.py +142 -140
  143. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -21
  144. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
  145. sglang/srt/speculative/eagle_worker.py +16 -0
  146. sglang/srt/two_batch_overlap.py +16 -12
  147. sglang/srt/utils.py +3 -3
  148. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  149. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  150. sglang/test/doc_patch.py +59 -0
  151. sglang/test/few_shot_gsm8k.py +1 -1
  152. sglang/test/few_shot_gsm8k_engine.py +1 -1
  153. sglang/test/run_eval.py +4 -1
  154. sglang/test/simple_eval_common.py +6 -0
  155. sglang/test/simple_eval_gpqa.py +2 -0
  156. sglang/test/test_fp4_moe.py +118 -36
  157. sglang/test/test_marlin_moe.py +1 -1
  158. sglang/test/test_marlin_utils.py +1 -1
  159. sglang/utils.py +1 -1
  160. sglang/version.py +1 -1
  161. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/METADATA +27 -31
  162. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/RECORD +166 -142
  163. sglang/lang/backend/__init__.py +0 -0
  164. sglang/srt/function_call/harmony_tool_parser.py +0 -130
  165. sglang/srt/layers/quantization/scalar_type.py +0 -352
  166. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  167. /sglang/{api.py → lang/api.py} +0 -0
  168. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/WHEEL +0 -0
  169. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/licenses/LICENSE +0 -0
  170. {sglang-0.5.0rc0.dist-info → sglang-0.5.0rc2.dist-info}/top_level.txt +0 -0
sglang/srt/lora/lora.py CHANGED
@@ -117,7 +117,6 @@ class LoRAAdapter(nn.Module):
                 q_name = weight_name
                 k_name = weight_name.replace("q_proj", "k_proj")
                 v_name = weight_name.replace("q_proj", "v_proj")
-                kv_name = weight_name.replace("q_proj", "kv_proj")
                 qkv_name = weight_name.replace("q_proj", "qkv_proj")
 
                 # If k_proj doesn't have lora, initialize it to zero
@@ -126,57 +125,27 @@ class LoRAAdapter(nn.Module):
                     if "k_proj" in target_module
                     else torch.zeros_like(weights[v_name])
                 )
-                if "lora_A" in weight_name:
-                    weights[qkv_name] = torch.cat(
-                        (
-                            weights[q_name],
-                            k_proj_weight,
-                            weights[v_name],
-                        ),
-                        0,
-                    )
-                    weights.pop(q_name)
-                    if "k_proj" in target_module:
-                        weights.pop(k_name)
-                    weights.pop(v_name)
-                else:
-                    weights[kv_name] = torch.stack(
-                        [
-                            k_proj_weight,
-                            weights[v_name],
-                        ],
-                        dim=0,
-                    )
-                    if "k_proj" in target_module:
-                        weights.pop(k_name)
-                    weights.pop(v_name)
+                weights[qkv_name] = torch.cat(
+                    (
+                        weights[q_name],
+                        k_proj_weight,
+                        weights[v_name],
+                    ),
+                    0,
+                )
+                weights.pop(q_name)
+                if "k_proj" in target_module:
+                    weights.pop(k_name)
+                weights.pop(v_name)
             elif "qkv_proj" in weight_name:
                 # If qkv_proj is already stacked, we normalize it following the SGL convention.
                 qkv_name = weight_name
                 q_name = weight_name.replace("qkv_proj", "q_proj")
                 k_name = weight_name.replace("qkv_proj", "k_proj")
                 v_name = weight_name.replace("qkv_proj", "v_proj")
-                kv_name = weight_name.replace("qkv_proj", "kv_proj")
                 if "lora_A" in weight_name:
                     weights[qkv_name] = weights[qkv_name].repeat(3, 1)
-                else:
-                    head_size = (
-                        self.base_hf_config.hidden_size
-                        // self.base_hf_config.num_attention_heads
-                    )
-                    weights[q_name], k_proj_weight, v_proj_weight = torch.split(
-                        weights[qkv_name],
-                        [
-                            head_size * self.base_hf_config.num_attention_heads,
-                            head_size * self.base_hf_config.num_key_value_heads,
-                            head_size * self.base_hf_config.num_key_value_heads,
-                        ],
-                        dim=0,
-                    )
-                    weights[kv_name] = torch.stack(
-                        [k_proj_weight, v_proj_weight],
-                        dim=0,
-                    )
+                # else: no-op as LoRA B weight is already stacked.
 
     def normalize_gate_up_proj(
         self, weight_names: List[str], weights: Dict[str, torch.Tensor]
@@ -187,20 +156,14 @@ class LoRAAdapter(nn.Module):
                 gate_up_name = weight_name.replace("gate_proj", "gate_up_proj")
                 if up_name not in weights:
                     weights[up_name] = torch.zeros_like(weights[weight_name])
-                    # FIXME: Add gate-only support for flashinfer in future implementations
                     assert self.lora_backend.name == "triton", (
                         f"LoRA weight initialization currently only supported for 'triton' backend. "
                         f"Received backend: {self.lora_backend.name}. Please verify your backend configuration "
                         f"or consider implementing custom initialization logic for other backends."
                     )
-                if "lora_A" in weight_name:
-                    weights[gate_up_name] = torch.cat(
-                        (weights[weight_name], weights[up_name]), 0
-                    )
-                else:
-                    weights[gate_up_name] = torch.stack(
-                        [weights[weight_name], weights[up_name]], dim=0
-                    )
+                weights[gate_up_name] = torch.cat(
+                    (weights[weight_name], weights[up_name]), 0
+                )
                 weights.pop(weight_name)
                 if up_name in weights:
                     weights.pop(up_name)
@@ -209,12 +172,4 @@ class LoRAAdapter(nn.Module):
                 gate_up_name = weight_name
                 if "lora_A" in weight_name:
                     weights[gate_up_name] = weights[gate_up_name].repeat(2, 1)
-                else:
-                    output_dim = weights[gate_up_name].shape[0] // 2
-                    weights[gate_up_name] = torch.stack(
-                        [
-                            weights[gate_up_name][:output_dim, :],
-                            weights[gate_up_name][output_dim:, :],
-                        ],
-                        dim=0,
-                    )
+                # else: no-op as LoRA B weight is already stacked.
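The hunks above drop the old `kv_proj` stacking path: after normalization, LoRA A weights for q/k/v are concatenated into a single `qkv_proj` tensor and LoRA B weights are left in their loaded layout. A minimal sketch of the new concatenation convention, using hypothetical rank and hidden sizes (not taken from the diff):

```python
# Illustrative sketch of the normalize_qkv_proj convention after this change.
# Shapes are hypothetical; only the torch.cat layout mirrors the diff.
import torch

rank, hidden = 16, 4096
lora_a = {
    "q_proj": torch.randn(rank, hidden),
    "k_proj": torch.randn(rank, hidden),
    "v_proj": torch.randn(rank, hidden),
}

# LoRA A for q/k/v is concatenated along dim 0 into one qkv_proj tensor;
# there is no longer a separately stacked kv_proj entry for LoRA B.
qkv_lora_a = torch.cat((lora_a["q_proj"], lora_a["k_proj"], lora_a["v_proj"]), 0)
print(qkv_lora_a.shape)  # torch.Size([48, 4096]) == (3 * rank, hidden)
```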
sglang/srt/lora/lora_manager.py CHANGED
@@ -31,7 +31,6 @@ from sglang.srt.lora.mem_pool import LoRAMemoryPool
 from sglang.srt.lora.utils import (
     LoRABatchInfo,
     LoRAType,
-    get_customized_names_from_hf_names,
     get_layer_id,
     get_normalized_lora_weight_names,
     get_weight_name,
@@ -345,40 +344,19 @@ class LoRAManager:
         )
         self.lora_backend.set_batch_info(batch_info)
 
-        # TODO (lifuhuang): one potential perf optimization that is worth considering is to see if we can call
-        # this method only when loading/unloading LoRA adapters, instead of calling it for every micro-batch.
-        self.update_lora_info()
-
     def update_lora_info(self):
         """
         Update all LoRA modules to associate them with the latest memory buffer.
         """
         for layer_id, layer_modules in enumerate(self.lora_modules):
             for module_name, module in layer_modules.items():
-                if "qkv_proj" in module_name:
-                    module.set_lora_info(
-                        self.memory_pool.get_tensor(
-                            "qkv_proj", layer_id, LoRAType.LORA_A
-                        ),
-                        self.memory_pool.get_tensor(
-                            "q_proj", layer_id, LoRAType.LORA_B
-                        ),
-                        self.memory_pool.get_tensor(
-                            "kv_proj", layer_id, LoRAType.LORA_B
-                        ),
-                    )
-                else:
-                    weight_name = get_weight_name(
-                        module_name, self.memory_pool.lora_weight_names, LoRAType.LORA_A
-                    )
-                    module.set_lora_info(
-                        self.memory_pool.get_tensor(
-                            weight_name, layer_id, LoRAType.LORA_A
-                        ),
-                        self.memory_pool.get_tensor(
-                            weight_name, layer_id, LoRAType.LORA_B
-                        ),
-                    )
+                weight_name = get_weight_name(
+                    module_name, self.memory_pool.lora_weight_names
+                )
+                module.set_lora_info(
+                    self.memory_pool.get_tensor(weight_name, layer_id, LoRAType.LORA_A),
+                    self.memory_pool.get_tensor(weight_name, layer_id, LoRAType.LORA_B),
+                )
 
     def init_state(
         self,
@@ -405,6 +383,7 @@ class LoRAManager:
         self.init_lora_weight_names()
         self.init_lora_modules()
         self.init_memory_pool()
+        self.update_lora_info()
 
     def init_lora_adapters(self, lora_paths: Optional[Dict[str, LoRARef]] = None):
         # Configs of all active LoRA adapters, indexed by LoRA ID.
@@ -461,9 +440,9 @@ class LoRAManager:
         Add new LoRA weight names if needed based on the current `self.configs`.
         """
 
-        # Target lora weight names for lora_a and lora_b modules respectively.
-        lora_A, lora_B = get_normalized_lora_weight_names(self.target_modules)
-        self.lora_weight_names: Tuple[Set[str]] = (set(lora_A), set(lora_B))
+        self.lora_weight_names: Set[str] = get_normalized_lora_weight_names(
+            self.target_modules
+        )
 
     def load_lora_weights(self, lora_ref: LoRARef):
         """
@@ -479,15 +458,6 @@ class LoRAManager:
         lora_adapter.initialize_weights()
         self.loras[lora_ref.lora_id] = lora_adapter
 
-        # Additional checks for flashinfer backend
-        # FIXME remove the restrictions after supporting multi-rank for flashinfer backend
-        if self.lora_backend == "flashinfer":
-            lora_dims = set(x.r for x in self.configs.values())
-            scalings = set(x.scaling for x in self.loras.values())
-            assert (
-                len(lora_dims) == 1 and len(scalings) == 1
-            ), "Flashinfer backend currently only supports single LoRA rank and scaling across all adapters. "
-
     def init_memory_pool(self):
         """(Re)initialize the LoRA memory pool based on the current configurations."""
         self.memory_pool = LoRAMemoryPool(
@@ -512,12 +482,6 @@ class LoRAManager:
             {} for _ in range(self.base_hf_config.num_hidden_layers)
         ]
 
-        # Target module names of customized layers defined in python/sglang/srt/layers
-        # e.g., {"qkv_proj", "o_proj"}
-        customized_target_names = get_customized_names_from_hf_names(
-            self.target_modules, self.base_model
-        )
-
         for module_name, module in self.base_model.named_modules():
             # TODO (lifuhuang): in the future, we should consider generalizing the
             # should_apply_lora function to support mapping by full module name instead
@@ -530,7 +494,7 @@ class LoRAManager:
                 continue
 
             # The module should be converted if it is included in target_names
-            if module_name.split(".")[-1] in customized_target_names:
+            if module_name.split(".")[-1] in self.lora_weight_names:
                 layer_id = get_layer_id(module_name)
                 self.lora_modules[layer_id][module_name] = self.set_lora_module(
                     module_name, module
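With `get_customized_names_from_hf_names` gone, module discovery in `init_lora_modules` matches the last path component of each module name directly against the flat `lora_weight_names` set. A hedged sketch of that matching logic, with hypothetical module names:

```python
# Hypothetical sketch of the simplified matching in init_lora_modules.
# The set below is an example of what get_normalized_lora_weight_names may return.
lora_weight_names = {"qkv_proj", "o_proj", "gate_up_proj"}

module_names = [
    "model.layers.0.self_attn.qkv_proj",
    "model.layers.0.self_attn.o_proj",
    "model.layers.0.mlp.down_proj",
]
targets = [m for m in module_names if m.split(".")[-1] in lora_weight_names]
print(targets)  # down_proj is skipped because it is not in the normalized set
```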
sglang/srt/lora/lora_registry.py CHANGED
@@ -14,7 +14,6 @@
 
 
 import asyncio
-from collections import defaultdict
 from dataclasses import dataclass, field, fields
 from typing import Dict, List, Optional, Union
 from uuid import uuid4
@@ -106,7 +105,6 @@ class LoRARegistry:
                 f"LoRA with name {lora_name} does not exist. Loaded LoRAs: {self._registry.keys()}"
             )
         del self._registry[lora_name]
-        del self._counters[lora_ref.lora_id]
 
         return lora_ref.lora_id
 
@@ -117,6 +115,9 @@ class LoRARegistry:
         """
 
         def _lookup(name: str) -> str:
+            if name is None:
+                return None
+
             lora_ref = self._registry.get(name, None)
             if lora_ref is None:
                 raise ValueError(
@@ -135,7 +136,11 @@ class LoRARegistry:
 
             # Increment the counters only after all IDs are looked up.
             await asyncio.gather(
-                *[self._counters[id].increment(notify_all=False) for id in lora_ids]
+                *[
+                    self._counters[id].increment(notify_all=False)
+                    for id in lora_ids
+                    if id is not None
+                ]
             )
             return lora_ids
         else:
@@ -153,7 +158,11 @@ class LoRARegistry:
             await self._counters[lora_id].decrement()
         elif isinstance(lora_id, list):
             await asyncio.gather(
-                *[self._counters[id].decrement() for id in lora_id]
+                *[
+                    self._counters[id].decrement()
+                    for id in lora_id
+                    if id is not None
+                ]
             )
         else:
             raise TypeError("lora_id must be either a string or a list of strings.")
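The registry hunks above make `acquire` and `release` tolerate requests that carry no LoRA: a `None` name looks up to a `None` ID, and `None` IDs are filtered out before the reference counters are touched. A small sketch of the filtering, with made-up IDs:

```python
# Hedged illustration of the None filtering added to acquire/release.
lora_ids = ["adapter-a", None, "adapter-b"]  # hypothetical IDs; None means "no LoRA"

counted = [lora_id for lora_id in lora_ids if lora_id is not None]
print(counted)  # only real adapter IDs participate in reference counting
```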
@@ -169,11 +178,13 @@ class LoRARegistry:
         assert (
             lora_id not in self._registry
         ), "wait_for_unload should only be called after the LoRA adapter has been unregistered. "
-        counter = self._counters.get(lora_id)
-        if counter:
-            # Wait until no requests are using this LoRA adapter.
-            await counter.wait_for_zero()
-            del self._counters[lora_id]
+        assert (
+            lora_id in self._counters
+        ), "The LoRA ID should still have a counter if it has been registered before."
+
+        # Wait until no requests are using this LoRA adapter.
+        await self._counters[lora_id].wait_for_zero()
+        del self._counters[lora_id]
 
     def _register_adapter(self, lora_ref: LoRARef):
         """
sglang/srt/lora/mem_pool.py CHANGED
@@ -52,7 +52,7 @@ class LoRAMemoryPool:
         tp_size: int,
         tp_rank: int,
         max_lora_rank: int,
-        lora_weight_names: Tuple[Set[str], Set[str]],
+        lora_weight_names: Set[str],
         base_model: torch.nn.Module,
     ):
         self.base_hf_config: AutoConfig = base_hf_config
@@ -62,9 +62,7 @@ class LoRAMemoryPool:
         self.tp_size: int = tp_size
         self.tp_rank: int = tp_rank
         self.max_lora_rank: int = max_lora_rank
-
-        # lora weight names for LoRA A and B respectively.
-        self.lora_weight_names: Tuple[Set[str], Set[str]] = lora_weight_names
+        self.lora_weight_names: Set[str] = lora_weight_names
 
         # Both A_buffer and B_buffer maps lora weight names to its buffer space.
         # A_buffer contains num_layer number of row-major tensors with shape
@@ -97,12 +95,8 @@ class LoRAMemoryPool:
             """
             if config.r > self.max_lora_rank:
                 return False
-            weights_a, weights_b = get_normalized_lora_weight_names(
-                config.target_modules
-            )
-            return weights_a.issubset(self.lora_weight_names[0]) and weights_b.issubset(
-                self.lora_weight_names[1]
-            )
+            weights = get_normalized_lora_weight_names(config.target_modules)
+            return weights.issubset(self.lora_weight_names)
 
         if isinstance(config, LoRAConfig):
             return _can_support(config)
@@ -132,11 +126,9 @@ class LoRAMemoryPool:
            Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
            """
            _, output_dim = get_hidden_dim(module_name, self.base_hf_config, base_model)
-            c = get_stacked_multiply(module_name)
            if self.tp_size > 1 and module_name not in ROW_PARALLELISM_LINEAR_LORA_NAMES:
                output_dim = divide(output_dim, self.tp_size)
            return (
-                c,
                self.max_loras_per_batch,
                output_dim,
                max_lora_dim,
@@ -165,13 +157,13 @@ class LoRAMemoryPool:
 
         init_buffer(
             self.A_buffer,
-            self.lora_weight_names[0],
+            self.lora_weight_names,
             self.get_lora_A_shape,
         )
 
         init_buffer(
             self.B_buffer,
-            self.lora_weight_names[1],
+            self.lora_weight_names,
             self.get_lora_B_shape,
         )
 
@@ -246,7 +238,7 @@ class LoRAMemoryPool:
             return
 
         assert lora_adapter is not None
-        lora_rank = lora_adapter.config.hf_config["r"]
+        lora_rank = lora_adapter.config.r
        for layer_id in range(self.num_layer):
            layer_weights = lora_adapter.layers[layer_id].weights
            temp_A_buffer: Dict[str, Optional[torch.Tensor]] = {
@@ -256,73 +248,38 @@ class LoRAMemoryPool:
                weight_name: None for weight_name in self.B_buffer
            }
            for name, weights in layer_weights.items():
+                lora_weight_name = get_weight_name(name, self.lora_weight_names)
                if "lora_A" in name:
-                    lora_weight_name = get_weight_name(
-                        name, self.lora_weight_names, LoRAType.LORA_A
-                    )
                    temp_A_buffer[lora_weight_name] = weights
                else:
-                    lora_weight_name = get_weight_name(
-                        name, self.lora_weight_names, LoRAType.LORA_B
-                    )
                    temp_B_buffer[lora_weight_name] = weights
 
            if self.tp_size > 1:
                cur_layer_modules = lora_modules[layer_id]
                for module_name, module in cur_layer_modules.items():
-                    weight_name = get_weight_name(
-                        module_name, self.lora_weight_names, LoRAType.LORA_A
-                    )
+                    weight_name = get_weight_name(module_name, self.lora_weight_names)
 
                    if temp_A_buffer[weight_name] is None:
                        # Skip weight slicing if the weight is not present in the adapter
                        continue
 
-                    if "qkv_proj" in module_name:
-                        temp_A_buffer["qkv_proj"] = module.slice_lora_a_weights(
-                            temp_A_buffer["qkv_proj"], self.tp_rank
-                        )
-                        temp_B_buffer["q_proj"], temp_B_buffer["kv_proj"] = (
-                            module.slice_lora_b_weights(
-                                [temp_B_buffer["q_proj"], temp_B_buffer["kv_proj"]],
-                                self.tp_rank,
-                            )
-                        )
-                    else:
-                        # TODO (lifuhuang): Ideally, we should call `get_weight_name` separately for both A and B.
-                        # Currently, we're reusing A's weight name as a workaround, relying on the fact that A and
-                        # B share the same name except for `qkv_proj`. We should clean this up once we deprecate the
-                        # FlashInfer LoRA backend.
-                        temp_A_buffer[weight_name] = module.slice_lora_a_weights(
-                            temp_A_buffer[weight_name], self.tp_rank
-                        )
-                        temp_B_buffer[weight_name] = module.slice_lora_b_weights(
-                            temp_B_buffer[weight_name], self.tp_rank
-                        )
+                    temp_A_buffer[weight_name] = module.slice_lora_a_weights(
+                        temp_A_buffer[weight_name], self.tp_rank
+                    )
+                    temp_B_buffer[weight_name] = module.slice_lora_b_weights(
+                        temp_B_buffer[weight_name], self.tp_rank
+                    )
 
            for name, weights in temp_A_buffer.items():
                c = get_stacked_multiply(name)
-                buffer_view = self.A_buffer[name][layer_id][buffer_id][
-                    : lora_rank * c, :
-                ]
+                target_buffer = self.A_buffer[name][layer_id]
+                buffer_view = target_buffer[buffer_id, : lora_rank * c, :]
                load_lora_weight_tensor(buffer_view, weights)
 
            for name, weights in temp_B_buffer.items():
-                c = get_stacked_multiply(name)
-                if c > 1:
-                    for stacked_id in range(c):
-                        buffer_view = self.B_buffer[name][layer_id][stacked_id][
-                            buffer_id
-                        ][:, :lora_rank]
-                        weight_slice = (
-                            weights[stacked_id] if weights is not None else None
-                        )
-                        load_lora_weight_tensor(buffer_view, weight_slice)
-                else:
-                    buffer_view = self.B_buffer[name][layer_id][0][buffer_id][
-                        :, :lora_rank
-                    ]
-                    load_lora_weight_tensor(buffer_view, weights)
+                target_buffer = self.B_buffer[name][layer_id]
+                buffer_view = target_buffer[buffer_id, :, :lora_rank]
+                load_lora_weight_tensor(buffer_view, weights)
 
     def get_tensor(
         self, weight_name: str, layer_id: int, lora_type: LoRAType
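With the stacked `kv_proj` slot removed, both buffers are addressed uniformly: the A-buffer view is `[buffer_id, : lora_rank * c, :]` and the B-buffer view is `[buffer_id, :, :lora_rank]`. A sketch of the B-buffer slicing with hypothetical dimensions (not values from the diff):

```python
# Hypothetical sketch of the flattened B_buffer indexing after this change.
import torch

max_loras_per_batch, output_dim, max_lora_dim = 4, 6144, 64
b_buffer_layer = torch.zeros(max_loras_per_batch, output_dim, max_lora_dim)

buffer_id, lora_rank = 0, 16
# One contiguous view per adapter slot, rather than per-stacked-projection views.
buffer_view = b_buffer_layer[buffer_id, :, :lora_rank]
buffer_view.copy_(torch.randn(output_dim, lora_rank))
print(buffer_view.shape)  # torch.Size([6144, 16])
```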
sglang/srt/lora/triton_ops/qkv_lora_b.py CHANGED
@@ -119,7 +119,7 @@ def _qkv_lora_b_kernel(
     output_ptr = (output + seg_start * output_stride_0 + n_start * output_stride_1) + (
         s_offset[:, None] * output_stride_0 + n_offset[None, :] * output_stride_1
     )
-    output_mask = (s_offset[:, None] < seg_len) and (n_offset[None, :] < n_size)
+    output_mask = (s_offset[:, None] < seg_len) & (n_offset[None, :] < n_size)
     partial_sum += tl.load(output_ptr, mask=output_mask)
     tl.store(output_ptr, partial_sum, mask=output_mask)
 
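The one-line kernel fix swaps Python's short-circuiting `and` for the elementwise `&` when combining the row and column bounds masks. The same distinction is easy to see with plain PyTorch booleans (illustrative only, not Triton code):

```python
# Why `&` instead of `and` for combining masks (plain PyTorch illustration).
import torch

rows_ok = torch.tensor([True, True, False])
cols_ok = torch.tensor([True, False, True])

print(rows_ok & cols_ok)  # tensor([ True, False, False]) -> elementwise mask
# `rows_ok and cols_ok` would not combine elementwise; it raises
# "Boolean value of Tensor with more than one element is ambiguous".
```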
sglang/srt/lora/utils.py CHANGED
@@ -47,34 +47,6 @@ def get_layer_id(name: str) -> int:
     return int(match.group(1))
 
 
-def get_customized_names_from_hf_names(
-    hf_module_names: Set[str], base_model: torch.nn.Module
-) -> Set[str]:
-    """
-    This function takes in a set of huggingface style module names:
-    e.g., {"k_proj", "q_proj", "v_proj", "o_proj"}
-    and outputs a set of module names of customized sglang layers:
-    e.g., {"qkv_proj", "o_proj"}
-    """
-    if hasattr(base_model, "get_module_name"):
-        return {base_model.get_module_name(name) for name in hf_module_names}
-    else:
-        """
-        Fallback solution of mapping from config module name to module name in model class.
-        Please check if it aligns with your base model.
-        Please implement the function in the model class if it is not.
-        You can reference this function in llama.py.
-        """
-        params_mapping = {
-            "q_proj": "qkv_proj",
-            "k_proj": "qkv_proj",
-            "v_proj": "qkv_proj",
-            "gate_proj": "gate_up_proj",
-            "up_proj": "gate_up_proj",
-        }
-        return {params_mapping.get(name, name) for name in hf_module_names}
-
-
 def get_hidden_dim(
     module_name: str, config: AutoConfig, base_model: torch.nn.Module
 ) -> Tuple[int]:
@@ -92,14 +64,20 @@ def get_hidden_dim(
         Please implement the function in the model class if it is not.
         You can reference this function in llama.py.
         """
-        if module_name in ["q_proj", "o_proj", "qkv_proj"]:
-            return config.hidden_size, config.hidden_size
-        elif module_name in ["kv_proj"]:
-            return config.hidden_size, config.hidden_size // (
-                config.num_attention_heads // config.num_key_value_heads
+        head_dim = getattr(
+            config, "head_dim", config.hidden_size // config.num_attention_heads
+        )
+        if module_name == "qkv_proj":
+            return config.hidden_size, head_dim * (
+                config.num_attention_heads + config.num_key_value_heads * 2
+            )
+        elif module_name == "o_proj":
+            return (
+                head_dim * config.num_attention_heads,
+                config.hidden_size,
             )
         elif module_name == "gate_up_proj":
-            return config.hidden_size, config.intermediate_size
+            return config.hidden_size, config.intermediate_size * 2
         elif module_name == "down_proj":
             return config.intermediate_size, config.hidden_size
         else:
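The fallback `get_hidden_dim` now derives the stacked output widths from `head_dim` instead of assuming `hidden_size` everywhere, which also covers grouped-query models. A worked example with hypothetical, Llama-3-8B-like config values (not taken from the diff):

```python
# Worked example of the new fallback arithmetic (hypothetical config values).
hidden_size = 4096
num_attention_heads = 32
num_key_value_heads = 8
intermediate_size = 14336

head_dim = hidden_size // num_attention_heads  # 128 when config has no head_dim
qkv_out = head_dim * (num_attention_heads + num_key_value_heads * 2)  # 6144
o_proj_in = head_dim * num_attention_heads  # 4096
gate_up_out = intermediate_size * 2  # 28672

print((hidden_size, qkv_out), (o_proj_in, hidden_size), (hidden_size, gate_up_out))
```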
@@ -108,26 +86,22 @@
 
 def get_normalized_lora_weight_names(
     target_modules: Iterable[str],
-) -> Tuple[set[str], set[str]]:
+) -> set[str]:
     """
     Mapping a list of target module name to names of the normalized LoRA weights.
-    Returned tuple contains (name for Lora A, name for Lora B)
     """
     params_mapping = {
-        "q_proj": (["qkv_proj"], ["q_proj"]),
-        "k_proj": (["qkv_proj"], ["kv_proj"]),
-        "v_proj": (["qkv_proj"], ["kv_proj"]),
-        "gate_proj": (["gate_up_proj"], ["gate_up_proj"]),
-        "up_proj": (["gate_up_proj"], ["gate_up_proj"]),
-        "qkv_proj": (["qkv_proj"], ["q_proj", "kv_proj"]),
-        "gate_up_proj": (["gate_up_proj"], ["gate_up_proj"]),
+        "q_proj": "qkv_proj",
+        "k_proj": "qkv_proj",
+        "v_proj": "qkv_proj",
+        "gate_proj": "gate_up_proj",
+        "up_proj": "gate_up_proj",
     }
 
-    result = (set(), set())
+    result = set()
     for name in target_modules:
-        lora_a, lora_b = params_mapping.get(name, ([name], [name]))
-        result[0].update(lora_a)
-        result[1].update(lora_b)
+        weight_name = params_mapping.get(name, name)
+        result.add(weight_name)
     return result
 
 
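After this hunk, `get_normalized_lora_weight_names` returns a single flat set instead of an (A-names, B-names) pair. A quick example of the new mapping, mirroring the `params_mapping` added in the diff:

```python
# Example of the flattened normalization, following the new params_mapping.
params_mapping = {
    "q_proj": "qkv_proj",
    "k_proj": "qkv_proj",
    "v_proj": "qkv_proj",
    "gate_proj": "gate_up_proj",
    "up_proj": "gate_up_proj",
}

target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj"]
normalized = {params_mapping.get(name, name) for name in target_modules}
print(normalized)  # {'qkv_proj', 'o_proj', 'gate_up_proj'}
```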
@@ -137,23 +111,21 @@ def get_stacked_multiply(module_name: str) -> int:
     """
     stacked_rank = {
         "qkv_proj": 3,
-        "kv_proj": 2,
         "gate_up_proj": 2,
     }
     return stacked_rank[module_name] if module_name in stacked_rank else 1
 
 
 def get_weight_name(
-    target_name: str, lora_weight_names: Tuple[Set[str]], lora_type: LoRAType
+    target_name: str, lora_weight_names: Tuple[Set[str]]
 ) -> Optional[str]:
     """
-    target_name is name of a given module,
-    lora_weight_names is a set of lora stacked name pairs (see get_stacked_name method above)
+    Get the weight name in lora_weight_names that can match target_name.
+
     If there is a weight name in lora_weight_names that can match target_name, return this name
     Else raise ValueError.
     """
-    idx = 0 if lora_type == LoRAType.LORA_A else 1
-    for weight_name in lora_weight_names[idx]:
+    for weight_name in lora_weight_names:
         if weight_name in target_name:
             return weight_name
     raise ValueError(
@@ -161,9 +133,4 @@
     )
 
 
-# TODO: [PR #4274] For future use to simplify the mapping between HF module names and customized module names.
-VOCAB_PARALLELISM_EMBEDDING_NAMES = ["embeddings"]
-COLUMN_PARALLELISM_LINEAR_LORA_NAMES = ["gate_proj", "up_proj"]
-MERGED_COLUMN_PARALLELISM_LINEAR_LORA_NAMES = ["gate_up_proj"]
-QKV_PARALLELISM_LINEAR_LORA_NAMES = ["qkv_proj"]
 ROW_PARALLELISM_LINEAR_LORA_NAMES = ["o_proj", "down_proj"]