sglang 0.4.7__py3-none-any.whl → 0.4.8__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package versions as they appear in their public registry.
Files changed (152)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_one_batch.py +8 -6
  4. sglang/bench_serving.py +1 -1
  5. sglang/lang/interpreter.py +40 -1
  6. sglang/lang/ir.py +27 -0
  7. sglang/math_utils.py +8 -0
  8. sglang/srt/_custom_ops.py +2 -2
  9. sglang/srt/code_completion_parser.py +2 -44
  10. sglang/srt/configs/model_config.py +6 -0
  11. sglang/srt/constants.py +3 -0
  12. sglang/srt/conversation.py +19 -3
  13. sglang/srt/custom_op.py +5 -1
  14. sglang/srt/disaggregation/base/__init__.py +1 -1
  15. sglang/srt/disaggregation/base/conn.py +25 -11
  16. sglang/srt/disaggregation/common/__init__.py +5 -1
  17. sglang/srt/disaggregation/common/utils.py +42 -0
  18. sglang/srt/disaggregation/decode.py +211 -72
  19. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
  20. sglang/srt/disaggregation/fake/__init__.py +1 -1
  21. sglang/srt/disaggregation/fake/conn.py +15 -9
  22. sglang/srt/disaggregation/mini_lb.py +34 -4
  23. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  24. sglang/srt/disaggregation/mooncake/conn.py +30 -29
  25. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  26. sglang/srt/disaggregation/nixl/conn.py +17 -12
  27. sglang/srt/disaggregation/prefill.py +144 -55
  28. sglang/srt/disaggregation/utils.py +155 -123
  29. sglang/srt/distributed/parallel_state.py +12 -4
  30. sglang/srt/entrypoints/engine.py +37 -29
  31. sglang/srt/entrypoints/http_server.py +153 -72
  32. sglang/srt/entrypoints/http_server_engine.py +0 -3
  33. sglang/srt/entrypoints/openai/__init__.py +0 -0
  34. sglang/srt/{openai_api → entrypoints/openai}/protocol.py +84 -10
  35. sglang/srt/entrypoints/openai/serving_base.py +149 -0
  36. sglang/srt/entrypoints/openai/serving_chat.py +921 -0
  37. sglang/srt/entrypoints/openai/serving_completions.py +424 -0
  38. sglang/srt/entrypoints/openai/serving_embedding.py +169 -0
  39. sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
  40. sglang/srt/entrypoints/openai/serving_score.py +61 -0
  41. sglang/srt/entrypoints/openai/usage_processor.py +81 -0
  42. sglang/srt/entrypoints/openai/utils.py +72 -0
  43. sglang/srt/eplb_simulator/__init__.py +1 -0
  44. sglang/srt/eplb_simulator/reader.py +51 -0
  45. sglang/srt/function_call/base_format_detector.py +7 -4
  46. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  47. sglang/srt/function_call/ebnf_composer.py +64 -10
  48. sglang/srt/function_call/function_call_parser.py +6 -6
  49. sglang/srt/function_call/llama32_detector.py +1 -1
  50. sglang/srt/function_call/mistral_detector.py +1 -1
  51. sglang/srt/function_call/pythonic_detector.py +1 -1
  52. sglang/srt/function_call/qwen25_detector.py +1 -1
  53. sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
  54. sglang/srt/layers/activation.py +40 -3
  55. sglang/srt/layers/attention/aiter_backend.py +20 -4
  56. sglang/srt/layers/attention/base_attn_backend.py +1 -1
  57. sglang/srt/layers/attention/cutlass_mla_backend.py +39 -15
  58. sglang/srt/layers/attention/flashattention_backend.py +71 -72
  59. sglang/srt/layers/attention/flashinfer_backend.py +10 -8
  60. sglang/srt/layers/attention/flashinfer_mla_backend.py +29 -28
  61. sglang/srt/layers/attention/flashmla_backend.py +7 -12
  62. sglang/srt/layers/attention/tbo_backend.py +3 -3
  63. sglang/srt/layers/attention/triton_backend.py +138 -130
  64. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  65. sglang/srt/layers/attention/vision.py +51 -24
  66. sglang/srt/layers/communicator.py +28 -10
  67. sglang/srt/layers/dp_attention.py +11 -2
  68. sglang/srt/layers/layernorm.py +29 -2
  69. sglang/srt/layers/linear.py +0 -4
  70. sglang/srt/layers/logits_processor.py +2 -14
  71. sglang/srt/layers/moe/ep_moe/kernels.py +165 -7
  72. sglang/srt/layers/moe/ep_moe/layer.py +249 -33
  73. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +11 -37
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +7 -4
  76. sglang/srt/layers/moe/fused_moe_triton/layer.py +75 -12
  77. sglang/srt/layers/moe/topk.py +107 -12
  78. sglang/srt/layers/pooler.py +56 -0
  79. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
  80. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  81. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +23 -80
  82. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  83. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  84. sglang/srt/layers/quantization/fp8.py +25 -17
  85. sglang/srt/layers/quantization/fp8_kernel.py +44 -15
  86. sglang/srt/layers/quantization/fp8_utils.py +87 -22
  87. sglang/srt/layers/quantization/modelopt_quant.py +62 -8
  88. sglang/srt/layers/quantization/utils.py +5 -2
  89. sglang/srt/layers/radix_attention.py +2 -3
  90. sglang/srt/layers/rotary_embedding.py +42 -2
  91. sglang/srt/layers/sampler.py +1 -1
  92. sglang/srt/lora/lora_manager.py +249 -105
  93. sglang/srt/lora/mem_pool.py +53 -50
  94. sglang/srt/lora/utils.py +1 -1
  95. sglang/srt/managers/cache_controller.py +33 -14
  96. sglang/srt/managers/io_struct.py +31 -10
  97. sglang/srt/managers/multimodal_processors/base_processor.py +2 -2
  98. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  99. sglang/srt/managers/schedule_batch.py +79 -37
  100. sglang/srt/managers/schedule_policy.py +70 -56
  101. sglang/srt/managers/scheduler.py +220 -79
  102. sglang/srt/managers/template_manager.py +226 -0
  103. sglang/srt/managers/tokenizer_manager.py +40 -10
  104. sglang/srt/managers/tp_worker.py +12 -2
  105. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  106. sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
  107. sglang/srt/mem_cache/base_prefix_cache.py +52 -8
  108. sglang/srt/mem_cache/chunk_cache.py +11 -15
  109. sglang/srt/mem_cache/hiradix_cache.py +38 -25
  110. sglang/srt/mem_cache/memory_pool.py +213 -505
  111. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  112. sglang/srt/mem_cache/radix_cache.py +56 -28
  113. sglang/srt/model_executor/cuda_graph_runner.py +198 -100
  114. sglang/srt/model_executor/forward_batch_info.py +32 -10
  115. sglang/srt/model_executor/model_runner.py +28 -12
  116. sglang/srt/model_loader/loader.py +16 -2
  117. sglang/srt/model_loader/weight_utils.py +11 -2
  118. sglang/srt/models/bert.py +113 -13
  119. sglang/srt/models/deepseek_nextn.py +29 -27
  120. sglang/srt/models/deepseek_v2.py +213 -173
  121. sglang/srt/models/glm4.py +312 -0
  122. sglang/srt/models/internvl.py +46 -102
  123. sglang/srt/models/mimo_mtp.py +2 -18
  124. sglang/srt/models/roberta.py +117 -9
  125. sglang/srt/models/vila.py +305 -0
  126. sglang/srt/reasoning_parser.py +21 -11
  127. sglang/srt/sampling/sampling_batch_info.py +24 -0
  128. sglang/srt/sampling/sampling_params.py +2 -0
  129. sglang/srt/server_args.py +351 -238
  130. sglang/srt/speculative/build_eagle_tree.py +1 -1
  131. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -9
  132. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +130 -14
  133. sglang/srt/speculative/eagle_utils.py +468 -116
  134. sglang/srt/speculative/eagle_worker.py +258 -84
  135. sglang/srt/torch_memory_saver_adapter.py +19 -15
  136. sglang/srt/two_batch_overlap.py +4 -2
  137. sglang/srt/utils.py +235 -11
  138. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  139. sglang/test/runners.py +38 -3
  140. sglang/test/test_block_fp8.py +1 -0
  141. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  142. sglang/test/test_block_fp8_ep.py +2 -0
  143. sglang/test/test_utils.py +4 -1
  144. sglang/utils.py +9 -0
  145. sglang/version.py +1 -1
  146. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/METADATA +8 -14
  147. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/RECORD +150 -128
  148. sglang/srt/entrypoints/verl_engine.py +0 -179
  149. sglang/srt/openai_api/adapter.py +0 -1990
  150. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/WHEEL +0 -0
  151. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/licenses/LICENSE +0 -0
  152. {sglang-0.4.7.dist-info → sglang-0.4.8.dist-info}/top_level.txt +0 -0
@@ -6,11 +6,14 @@ from typing import List, Mapping, Tuple, Union
 import torch
 
 from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
-from sglang.srt.utils import is_cuda
+from sglang.srt.utils import cpu_has_amx_support, is_cpu, is_cuda, is_npu
 
 _is_cuda = is_cuda()
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
 
-if not _is_cuda:
+if not (_is_cuda or _is_npu or (_is_cpu and _is_cpu_amx_available)):
     from vllm._custom_ops import scaled_fp8_quant
 
 
@@ -18,7 +18,6 @@ from typing import Optional
 
 from torch import nn
 
-from sglang.srt.layers.linear import UnquantizedLinearMethod
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
@@ -52,9 +51,9 @@ class RadixAttention(nn.Module):
         sliding_window_size: int = -1,
         is_cross_attention: bool = False,
         quant_config: Optional[QuantizationConfig] = None,
-        attn_type=AttentionType.DECODER,
-        prefix: str = "",
+        attn_type: AttentionType = AttentionType.DECODER,
         use_irope: bool = False,
+        prefix: str = "",
     ):
         super().__init__()
         self.tp_q_head_num = num_heads
@@ -8,10 +8,13 @@ import torch
 import torch.nn as nn
 
 from sglang.srt.custom_op import CustomOp
-from sglang.srt.utils import is_cuda, is_hip
+from sglang.srt.utils import cpu_has_amx_support, is_cpu, is_cuda, is_hip, is_npu
 
 _is_cuda = is_cuda()
 _is_hip = is_hip()
+_is_npu = is_npu()
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
 
 if _is_cuda:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
@@ -84,7 +87,9 @@ class RotaryEmbedding(CustomOp):
         if not _is_cuda:
             cache = cache.to(dtype)
 
-        if not _is_cuda or self.head_size not in [64, 128, 256, 512]:
+        if (
+            not (_is_cuda or _is_npu) or self.head_size not in [64, 128, 256, 512]
+        ) and not (_is_cpu and _is_cpu_amx_available):
             from vllm._custom_ops import rotary_embedding
 
             self.vllm_rotary_embedding = rotary_embedding
@@ -147,6 +152,26 @@ class RotaryEmbedding(CustomOp):
         key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
         return query, key
 
+    def forward_cpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        positions = torch.add(positions, offsets) if offsets is not None else positions
+        if _is_cpu_amx_available:
+            return torch.ops.sgl_kernel.rotary_embedding_cpu(
+                positions,
+                query,
+                key,
+                self.head_size,
+                self.cos_sin_cache,
+                self.is_neox_style,
+            )
+        else:
+            return self.forward_native(positions, query, key, offsets)
+
     def forward_cuda(
         self,
         positions: torch.Tensor,
@@ -696,6 +721,21 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         key = key_rot
         return query.to(dtype), key.to(dtype)
 
+    def forward_cpu(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        positions = torch.add(positions, offsets) if offsets is not None else positions
+        if _is_cpu_amx_available:
+            return torch.ops.sgl_kernel.rotary_embedding_cpu(
+                positions, query, key, self.head_size, self.cos_sin_cache, False
+            )
+        else:
+            return self.forward_native(positions, query, key, offsets)
+
 
 class Llama3RotaryEmbedding(RotaryEmbedding):
 
@@ -91,7 +91,7 @@ class Sampler(nn.Module):
                 )
             else:
                 batch_next_token_ids = top_k_top_p_sampling_from_probs(
-                    probs,
+                    probs.contiguous(),
                     sampling_info.top_ks,
                     sampling_info.top_ps,
                     filter_apply_order="joint",
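
The `.contiguous()` call is the only functional change in the hunk above. Below is a minimal standalone illustration of what it does; the assumption that the motivation is a fused sampling kernel expecting a dense row-major `probs` layout is mine, not stated in the diff.

import torch

# A transposed (or otherwise strided) view is not contiguous in memory;
# .contiguous() materializes it into a dense row-major copy and is a
# cheap pass-through when the tensor is already contiguous.
probs = torch.rand(4, 8).t()      # transpose -> non-contiguous view
print(probs.is_contiguous())      # False
dense = probs.contiguous()        # dense row-major copy
print(dense.is_contiguous())      # True
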
@@ -16,7 +16,7 @@
 # and "Punica: Multi-Tenant LoRA Serving"
 
 import logging
-from typing import Dict, List, Set, Tuple
+from typing import Dict, Set, Tuple
 
 import torch
 
@@ -45,7 +45,6 @@ class LoRAManager:
     def __init__(
         self,
         base_model: torch.nn.Module,
-        lora_paths: Dict[str, str],
         base_hf_config: AutoConfig,
         max_loras_per_batch: int,
         load_config: LoadConfig,
@@ -55,7 +54,6 @@ class LoRAManager:
         tp_rank: int = 0,
     ):
         self.base_model: torch.nn.Module = base_model
-        self.lora_paths: Dict[str, str] = lora_paths
         self.base_hf_config: AutoConfig = base_hf_config
         self.max_loras_per_batch: int = max_loras_per_batch
         self.load_config: LoadConfig = load_config
@@ -69,8 +67,8 @@ class LoRAManager:
         backend_type = get_backend_from_name(lora_backend)
         self.lora_backend: BaseLoRABackend = backend_type(lora_backend)
 
-        self.init_loras()
-        self.init_lora_memory_pool()
+        # Initialize mutable internal state of the LoRAManager.
+        self.init_state()
 
     def init_cuda_graph_batch_info(self, max_bs_in_cuda_graph: int):
         self.max_bs_in_cuda_graph = max_bs_in_cuda_graph
@@ -81,7 +79,7 @@ class LoRAManager:
             seg_indptr=torch.zeros(
                 self.max_bs_in_cuda_graph + 1, dtype=torch.int32
             ),
-            max_len=0,
+            max_len=1,
             weight_indices=torch.zeros(
                 self.max_bs_in_cuda_graph, dtype=torch.int32
             ),
@@ -89,76 +87,103 @@ class LoRAManager:
             scalings=torch.zeros(self.max_loras_per_batch, dtype=torch.float),
         )
 
-    def init_loras(self):
-        # Config of each LoRA adapter
-        self.configs: Dict[str, LoRAConfig] = {}
+        # Initialize seg_lens and seg_indptr for CUDA graph as they remain constant
+        # across batches.
+        self.cuda_graph_batch_info.seg_lens[: self.max_bs_in_cuda_graph].fill_(1)
+        torch.cumsum(
+            self.cuda_graph_batch_info.seg_lens[: self.max_bs_in_cuda_graph],
+            dim=0,
+            out=self.cuda_graph_batch_info.seg_indptr[
+                1 : self.max_bs_in_cuda_graph + 1
+            ],
+        )
 
-        # Target module names in huggingface lora configs.
-        # e.g., {"k_proj", "q_proj", "v_proj", "o_proj"}
-        self.hf_target_names: Set[str] = set()
-        for name, path in self.lora_paths.items():
-            self.configs[name] = LoRAConfig(path)
-            self.hf_target_names.update(self.configs[name].target_modules)
+    def load_lora_adapters(self, lora_paths: Dict[str, str]):
+        """
+        Load LoRA adapters from the specified paths.
+        TODO (lifuhuang): This method should be exposed to the server/engine API to support dynamic LoRA loading.
 
-        # Target lora weight names for lora_a and lora_b modules respectively.
-        weights_A: List[str] = []
-        weights_B: List[str] = []
-        for module in self.hf_target_names:
-            lora_A, lora_B = get_normalized_lora_weight_names(module)
-            weights_A += lora_A
-            weights_B += lora_B
-        self.lora_weight_names: Tuple[Set[str]] = set(weights_A), set(weights_B)
+        Args:
+            lora_paths (Dict[str, str]): A dictionary mapping LoRA adapter names to their file paths.
+                If a LoRA adapter is already loaded, it will be skipped with a warning.
+        """
 
-        # load all weights to cpu
-        self.loras: Dict[str, LoRAAdapter] = {}
-        for name in self.lora_paths.keys():
-            lora_adapter = LoRAAdapter(
-                name,
-                self.configs[name],
-                self.base_hf_config,
-                self.load_config,
-                self.lora_backend,
-            )
-            lora_adapter.initialize_weights()
-            self.loras[name] = lora_adapter
+        for lora_name, lora_path in lora_paths.items():
+            if lora_name in self.loras:
+                logger.warning(
+                    f"LoRA adapter {lora_name} is already loaded."
+                    "If you want to reload it, please unload it first."
+                )
+                continue
 
-        # misc lora configs
-        self.max_lora_dim: int = max([x.hf_config["r"] for x in self.configs.values()])
+            self.configs[lora_name] = LoRAConfig(lora_path)
 
-        if self.lora_backend == "flashinfer":
-            # FIXME remove the restrictions after supporting multi-rank for flashinfer backend
-            max_lora_dim = max([x.hf_config["r"] for x in self.configs.values()])
-            scaling = list(self.loras.values())[0].scaling
-            assert all(x.hf_config["r"] == max_lora_dim for x in self.configs.values())
-            assert all(x.scaling == scaling for x in self.loras.values())
+        self.update_state_from_configs()
 
-        # Convert original model layers to layers with LoRA
-        self.convert_to_lora_layers()
+    def unload_lora_adapters(self, lora_names: Set[str]):
+        """
+        Unload LoRA adapters by their names. This will remove the adapters from the memory pool and
+        delete the corresponding LoRA modules.
 
-    def init_lora_memory_pool(self):
-        # Initialize memory pool
-        self.memory_pool = LoRAMemoryPool(
-            self.base_hf_config,
-            self.max_loras_per_batch,
-            self.max_lora_dim,
-            self.dtype,
-            self.tp_size,
-            self.tp_rank,
-            self.lora_modules,
-        )
+        Args:
+            lora_names (Set[str]): A set of LoRA adapter names to unload.
+        """
+        for lora_name in lora_names:
+            if lora_name in self.loras:
+                del self.configs[lora_name]
+            else:
+                logger.warning(f"LoRA adapter {lora_name} is not loaded.")
 
-        # Initialize target lora modules in memory pool
-        self.memory_pool.init_buffers(self.lora_weight_names, self.base_model)
+        self.update_state_from_configs()
 
     def prepare_lora_batch(self, forward_batch: ForwardBatch):
         # load active loras into lora memory pool
         cur_uids = set(forward_batch.lora_paths)
         assert len(cur_uids) <= self.max_loras_per_batch
-        self.memory_pool.prepare_lora_batch(cur_uids, self.loras)
+        self.memory_pool.prepare_lora_batch(cur_uids, self.loras, self.lora_modules)
 
         # set up batch info shared by all lora modules
         bs = forward_batch.batch_size
 
+        def transfer_adapter_info(
+            weight_indices_out: torch.Tensor,
+            lora_ranks_out: torch.Tensor,
+            scalings_out: torch.Tensor,
+        ):
+            """
+            Transfer adapter metadata (weight indices, LoRA rank, scalings) from host
+            to device (CUDA) asynchronously.
+            """
+            weight_indices = [0] * len(forward_batch.lora_paths)
+            lora_ranks = [0] * self.max_loras_per_batch
+            scalings = [0] * self.max_loras_per_batch
+            for i, lora_path in enumerate(forward_batch.lora_paths):
+                weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
+                if lora_path is not None:
+                    lora = self.loras[lora_path]
+                    lora_ranks[weight_indices[i]] = lora.config.hf_config["r"]
+                    scalings[weight_indices[i]] = lora.scaling
+
+            # Use pinned memory to avoid synchronizations during host-to-device transfer
+            weight_indices_tensor = torch.tensor(
+                weight_indices, dtype=torch.int32, pin_memory=True, device="cpu"
+            )
+            lora_ranks_tensor = torch.tensor(
+                lora_ranks, dtype=torch.int32, pin_memory=True, device="cpu"
+            )
+            scalings_tensor = torch.tensor(
+                scalings, dtype=torch.float, pin_memory=True, device="cpu"
+            )
+
+            # Copy to device tensors asynchronously
+            weight_indices_out[:bs].copy_(weight_indices_tensor, non_blocking=True)
+            lora_ranks_out[: self.max_loras_per_batch].copy_(
+                lora_ranks_tensor, non_blocking=True
+            )
+            scalings_out[: self.max_loras_per_batch].copy_(
+                scalings_tensor, non_blocking=True
+            )
+
         if (
             hasattr(self, "max_bs_in_cuda_graph")
             and bs <= self.max_bs_in_cuda_graph
@@ -166,51 +191,46 @@ class LoRAManager:
         ):
             # Do in-place updates when CUDA graph is enabled and the batch forward mode
             # could use CUDA graph.
-            self.cuda_graph_batch_info.bs = bs
-            self.cuda_graph_batch_info.seg_lens[:bs].fill_(1)
-            torch.cumsum(
-                self.cuda_graph_batch_info.seg_lens[:bs],
-                dim=0,
-                out=self.cuda_graph_batch_info.seg_indptr[1 : bs + 1],
+
+            transfer_adapter_info(
+                self.cuda_graph_batch_info.weight_indices,
+                self.cuda_graph_batch_info.lora_ranks,
+                self.cuda_graph_batch_info.scalings,
             )
-            self.cuda_graph_batch_info.max_len = 1
 
-            for i, lora_path in enumerate(forward_batch.lora_paths):
-                self.cuda_graph_batch_info.weight_indices[i] = (
-                    self.memory_pool.get_buffer_id(lora_path)
-                )
-                if lora_path is not None:
-                    lora = self.loras[lora_path]
-                    self.cuda_graph_batch_info.lora_ranks[
-                        self.cuda_graph_batch_info.weight_indices[i]
-                    ] = lora.config.hf_config["r"]
-                    self.cuda_graph_batch_info.scalings[
-                        self.cuda_graph_batch_info.weight_indices[i]
-                    ] = lora.scaling
+            self.cuda_graph_batch_info.bs = bs
+            self.cuda_graph_batch_info.max_len = 1
             batch_info = self.cuda_graph_batch_info
         else:
+            weight_indices = torch.empty((bs,), dtype=torch.int32, device=self.device)
+            lora_ranks = torch.zeros(
+                (self.max_loras_per_batch,), dtype=torch.int64, device=self.device
+            )
+            scalings = torch.zeros(
+                (self.max_loras_per_batch,), dtype=torch.float, device=self.device
+            )
+            transfer_adapter_info(
+                weight_indices,
+                lora_ranks,
+                scalings,
+            )
+
             seg_lens = (
                 forward_batch.extend_seq_lens
                 if forward_batch.forward_mode.is_extend()
                 else torch.ones(bs, device=self.device)
             )
+
+            max_len = (
+                # Calculate max_len from the CPU copy to avoid D2H transfer.
+                max(forward_batch.extend_seq_lens_cpu)
+                if forward_batch.forward_mode.is_extend()
+                else 1
+            )
+
             seg_indptr = torch.zeros((bs + 1,), dtype=torch.int32, device=self.device)
             seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)
-            max_len = int(torch.max(seg_lens))
-            weight_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)
 
-            lora_ranks = torch.zeros(
-                (self.max_loras_per_batch,), dtype=torch.int64, device="cuda"
-            )
-            scalings = torch.zeros(
-                (self.max_loras_per_batch,), dtype=torch.float, device="cuda"
-            )
-            for i, lora_path in enumerate(forward_batch.lora_paths):
-                weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
-                if lora_path is not None:
-                    lora = self.loras[lora_path]
-                    lora_ranks[weight_indices[i]] = lora.config.hf_config["r"]
-                    scalings[weight_indices[i]] = lora.scaling
 
             batch_info = LoRABatchInfo(
                 bs=bs,
                 seg_lens=seg_lens,
@@ -222,9 +242,16 @@ class LoRAManager:
             )
         self.lora_backend.set_batch_info(batch_info)
 
-        # call set_lora_info for each lora modules
-        for layer_id, modules in self.lora_modules.items():
-            for module_name, module in modules:
+        # TODO (lifuhuang): one potential perf optimization that is worth considering is to see if we can call
+        # this method only when loading/unloading LoRA adapters, instead of calling it for every micro-batch.
+        self.update_lora_info()
+
+    def update_lora_info(self):
+        """
+        Update all LoRA modules to associate them with the latest memory buffer.
+        """
+        for layer_id, layer_modules in self.lora_modules.items():
+            for module_name, module in layer_modules.items():
                 if "qkv_proj" in module_name:
                     module.set_lora_info(
                         self.memory_pool.get_tensor(
@@ -250,23 +277,139 @@ class LoRAManager:
                         ),
                     )
 
+    def init_state(self):
+        """
+        Initialize the internal (mutable) state of the LoRAManager.
+
+        These states are mutable via the `update_state_from_configs` as LoRA adapters are loaded and unloaded dynamically.
+        """
+
+        # Configs of all active LoRA adapters.
+        self.configs: Dict[str, LoRAConfig] = {}
+
+        # LoRA adapter weights cached in CPU memory.
+        self.loras: Dict[str, LoRAAdapter] = {}
+
+        # Supported weight names (e.g., qkv_proj) for LoRA A and B respectively.
+        self.lora_weight_names: Tuple[Set[str]] = (set(), set())
+
+        # Look-up table that essentially maps (layer_index, module_name) to the corresponding LoRA module.
+        self.lora_modules: Dict[int, Dict[str, BaseLayerWithLoRA]] = {
+            i: {} for i in range(self.base_hf_config.num_hidden_layers)
+        }
+
+        # Initialize memory pool
+        self.memory_pool = LoRAMemoryPool(
+            self.base_hf_config,
+            self.max_loras_per_batch,
+            self.dtype,
+            self.tp_size,
+            self.tp_rank,
+        )
+
+    def update_state_from_configs(self):
+        """
+        Update the internal state of the LoRAManager based on the current `self.configs`. This method
+        should be called whenever `self.configs` is modified (e.g., when new LoRA adapters are loaded).
+
+        This includes:
+        - Initializing LoRA adapters if they are not already loaded.
+        - Collect all LoRA weight names based on the current loaded adapters.
+        - Lazily monkey-patching the base model to use LoRA layers where applicable.
+        - Preparing the GPU buffer pool for active LoRA weights.
+        """
+
+        # Target module names in huggingface lora configs.
+        # e.g., {"k_proj", "q_proj", "v_proj", "o_proj"}
+        hf_target_module_names: Set[str] = set()
+        for config in self.configs.values():
+            hf_target_module_names.update(config.target_modules)
+        max_lora_dim: int = max([x.hf_config["r"] for x in self.configs.values()])
+
+        # Loads / unloads LoRA adapters based on the latest configs.
+        self.update_lora_adapters()
+
+        # Lazily update states for new LoRA weight name (e.g., qkv_proj) as needed.
+        #
+        # Please note that the following update operations are "monotonic" by design, meaning that we update
+        # multiple places to support the new weight names when the first adapter targeting such weight names
+        # is loaded. However, we never "rollback" the support (e.g., convert LoRA layer back to base layer)
+        # even if the associated adapters are unloaded later for both simplicity and practicality reasons: the
+        # list of LoRA weight names is expected to be extremely finite and stable.
+        self.update_lora_weight_names(hf_target_module_names)
+        self.update_lora_modules(hf_target_module_names)
+        self.update_memory_buffers(max_lora_dim)
+
+    def update_lora_weight_names(self, hf_target_names: Set[str]):
+        """
+        Add new LoRA weight names if needed based on the current `self.configs`.
+        """
+
+        # Target lora weight names for lora_a and lora_b modules respectively.
+        for module in hf_target_names:
+            lora_A, lora_B = get_normalized_lora_weight_names(module)
+            self.lora_weight_names[0].update(lora_A)
+            self.lora_weight_names[1].update(lora_B)
+
+    def update_lora_adapters(self):
+        """
+        Update the LoRA adapters in CPU memory based on the current `self.configs`.
+        It loads any new adapters that are not already loaded, and unloads any adapters
+        that are no longer in `self.configs` (e.g., unloaded).
+        """
+
+        # Load new adapter weights to cpu
+        for name, config in self.configs.items():
+            if name not in self.loras:
+                logger.info(f"Loading weight of LoRA adapter {name} from {config.path}")
+                lora_adapter = LoRAAdapter(
+                    name,
+                    config,
+                    self.base_hf_config,
+                    self.load_config,
+                    self.lora_backend,
+                )
+                lora_adapter.initialize_weights()
+                self.loras[name] = lora_adapter
+
+        # Clean up unused LoRA adapters
+        for name in self.loras:
+            if name not in self.configs:
+                logger.info(f"Unloading LoRA adapter {name}")
+                del self.loras[name]
+
+        # Additional checks for flashinfer backend
+        # FIXME remove the restrictions after supporting multi-rank for flashinfer backend
+        if self.lora_backend == "flashinfer":
+            lora_dims = set(x.hf_config["r"] for x in self.configs.values())
+            scalings = set(x.scaling for x in self.loras.values())
+            assert (
+                len(lora_dims) == 1 and len(scalings) == 1
+            ), "Flashinfer backend currently only supports single LoRA rank and scaling across all adapters. "
+
+    def update_memory_buffers(self, max_lora_dim: int):
+        """
+        Update the LoRA memory pool buffers based on the current LoRA configurations and update
+        LoRA modules to use the new buffers. This method should be called after the LoRA configurations
+        are set or updated.
+        """
+
+        self.memory_pool.init_buffers(
+            self.lora_weight_names, self.base_model, max_lora_dim
+        )
+
     def set_lora_module(self, module_name, module):
         lora_module = get_lora_layer(module, self.lora_backend)
         replace_submodule(self.base_model, module_name, lora_module)
         return lora_module
 
-    def convert_to_lora_layers(self):
+    def update_lora_modules(self, hf_target_names: Set[str]):
         # Target module names of customized layers defined in python/sglang/srt/layers
         # e.g., {"qkv_proj", "o_proj"}
         customized_target_names = get_customized_names_from_hf_names(
-            self.hf_target_names, self.base_model
+            hf_target_names, self.base_model
         )
 
-        # Monkey patch to use the LoRA version layers
-        self.lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]] = {
-            i: [] for i in range(self.base_hf_config.num_hidden_layers)
-        }
-
         for module_name, module in self.base_model.named_modules():
             # TODO (lifuhuang): in the future, we should consider generalizing the
             # should_apply_lora function to support mapping by full module name instead
@@ -281,6 +424,7 @@ class LoRAManager:
             # The module should be converted if it is included in target_names
             if module_name.split(".")[-1] in customized_target_names:
                 layer_id = get_layer_id(module_name)
-                self.lora_modules[layer_id].append(
-                    (module_name, self.set_lora_module(module_name, module))
-                )
+                if module_name not in self.lora_modules[layer_id]:
+                    self.lora_modules[layer_id][module_name] = self.set_lora_module(
+                        module_name, module
+                    )
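
The `transfer_adapter_info` helper introduced in the prepare_lora_batch changes above stages adapter metadata in pinned CPU memory and copies it into preallocated device tensors with `non_blocking=True`. The following self-contained sketch shows that same pattern in isolation; the buffer names and shapes are hypothetical and for illustration only.

import torch

def copy_metadata_to_device(values, out: torch.Tensor) -> None:
    # Stage the host data in pinned (page-locked) memory so the copy below
    # can be issued asynchronously without an implicit synchronization.
    staging = torch.tensor(values, dtype=out.dtype, pin_memory=True, device="cpu")
    # Asynchronous host-to-device copy into a preallocated device buffer.
    out[: len(values)].copy_(staging, non_blocking=True)

if torch.cuda.is_available():
    device_buf = torch.zeros(8, dtype=torch.int32, device="cuda")
    copy_metadata_to_device([3, 1, 4, 1, 5], device_buf)
    torch.cuda.synchronize()  # wait for the async copy before reading the result
    print(device_buf[:5])
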