sglang 0.4.7.post1__py3-none-any.whl → 0.4.8__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
- sglang/bench_one_batch.py +8 -6
- sglang/srt/_custom_ops.py +2 -2
- sglang/srt/code_completion_parser.py +2 -44
- sglang/srt/constants.py +3 -0
- sglang/srt/conversation.py +13 -3
- sglang/srt/custom_op.py +5 -1
- sglang/srt/disaggregation/decode.py +22 -28
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -3
- sglang/srt/disaggregation/mini_lb.py +34 -4
- sglang/srt/disaggregation/mooncake/conn.py +12 -16
- sglang/srt/disaggregation/prefill.py +17 -13
- sglang/srt/disaggregation/utils.py +46 -18
- sglang/srt/distributed/parallel_state.py +12 -4
- sglang/srt/entrypoints/engine.py +22 -28
- sglang/srt/entrypoints/http_server.py +149 -79
- sglang/srt/entrypoints/http_server_engine.py +0 -3
- sglang/srt/entrypoints/openai/__init__.py +0 -0
- sglang/srt/{openai_api → entrypoints/openai}/protocol.py +67 -29
- sglang/srt/entrypoints/openai/serving_base.py +149 -0
- sglang/srt/entrypoints/openai/serving_chat.py +921 -0
- sglang/srt/entrypoints/openai/serving_completions.py +424 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +169 -0
- sglang/srt/entrypoints/openai/serving_rerank.py +102 -0
- sglang/srt/entrypoints/openai/serving_score.py +61 -0
- sglang/srt/entrypoints/openai/usage_processor.py +81 -0
- sglang/srt/entrypoints/openai/utils.py +72 -0
- sglang/srt/function_call/base_format_detector.py +7 -4
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/function_call/ebnf_composer.py +64 -10
- sglang/srt/function_call/function_call_parser.py +6 -6
- sglang/srt/function_call/llama32_detector.py +1 -1
- sglang/srt/function_call/mistral_detector.py +1 -1
- sglang/srt/function_call/pythonic_detector.py +1 -1
- sglang/srt/function_call/qwen25_detector.py +1 -1
- sglang/srt/{openai_api/utils.py → jinja_template_utils.py} +6 -5
- sglang/srt/layers/activation.py +21 -3
- sglang/srt/layers/attention/aiter_backend.py +5 -2
- sglang/srt/layers/attention/base_attn_backend.py +1 -1
- sglang/srt/layers/attention/cutlass_mla_backend.py +1 -0
- sglang/srt/layers/attention/flashattention_backend.py +19 -9
- sglang/srt/layers/attention/flashinfer_backend.py +9 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +7 -4
- sglang/srt/layers/attention/flashmla_backend.py +5 -2
- sglang/srt/layers/attention/tbo_backend.py +3 -3
- sglang/srt/layers/attention/triton_backend.py +19 -11
- sglang/srt/layers/communicator.py +5 -5
- sglang/srt/layers/dp_attention.py +11 -2
- sglang/srt/layers/layernorm.py +29 -2
- sglang/srt/layers/logits_processor.py +2 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +159 -2
- sglang/srt/layers/moe/ep_moe/layer.py +207 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +6 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +75 -12
- sglang/srt/layers/moe/topk.py +91 -4
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +6 -2
- sglang/srt/layers/quantization/fp8.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +62 -8
- sglang/srt/layers/quantization/utils.py +5 -2
- sglang/srt/layers/rotary_embedding.py +42 -2
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/lora_manager.py +173 -74
- sglang/srt/lora/mem_pool.py +49 -45
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/cache_controller.py +33 -15
- sglang/srt/managers/io_struct.py +9 -12
- sglang/srt/managers/schedule_batch.py +40 -31
- sglang/srt/managers/schedule_policy.py +70 -56
- sglang/srt/managers/scheduler.py +147 -62
- sglang/srt/managers/template_manager.py +226 -0
- sglang/srt/managers/tokenizer_manager.py +11 -8
- sglang/srt/managers/tp_worker.py +12 -2
- sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
- sglang/srt/mem_cache/{paged_allocator.py → allocator.py} +125 -34
- sglang/srt/mem_cache/base_prefix_cache.py +52 -8
- sglang/srt/mem_cache/chunk_cache.py +11 -16
- sglang/srt/mem_cache/hiradix_cache.py +34 -23
- sglang/srt/mem_cache/memory_pool.py +118 -114
- sglang/srt/mem_cache/radix_cache.py +20 -16
- sglang/srt/model_executor/cuda_graph_runner.py +76 -45
- sglang/srt/model_executor/forward_batch_info.py +18 -5
- sglang/srt/model_executor/model_runner.py +22 -6
- sglang/srt/model_loader/loader.py +8 -1
- sglang/srt/model_loader/weight_utils.py +11 -2
- sglang/srt/models/deepseek_nextn.py +29 -27
- sglang/srt/models/deepseek_v2.py +108 -26
- sglang/srt/models/glm4.py +312 -0
- sglang/srt/models/mimo_mtp.py +2 -18
- sglang/srt/reasoning_parser.py +21 -11
- sglang/srt/server_args.py +36 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +131 -10
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +125 -12
- sglang/srt/speculative/eagle_utils.py +80 -8
- sglang/srt/speculative/eagle_worker.py +124 -41
- sglang/srt/torch_memory_saver_adapter.py +19 -15
- sglang/srt/utils.py +177 -11
- sglang/test/test_block_fp8_ep.py +1 -0
- sglang/test/test_utils.py +1 -0
- sglang/version.py +1 -1
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/METADATA +4 -10
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/RECORD +104 -93
- sglang/srt/entrypoints/verl_engine.py +0 -179
- sglang/srt/openai_api/adapter.py +0 -2148
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/WHEEL +0 -0
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.7.post1.dist-info → sglang-0.4.8.dist-info}/top_level.txt +0 -0
sglang/srt/lora/lora_manager.py
CHANGED
@@ -16,7 +16,7 @@
 # and "Punica: Multi-Tenant LoRA Serving"

 import logging
-from typing import Dict,
+from typing import Dict, Set, Tuple

 import torch

@@ -45,7 +45,6 @@ class LoRAManager:
     def __init__(
         self,
         base_model: torch.nn.Module,
-        lora_paths: Dict[str, str],
         base_hf_config: AutoConfig,
         max_loras_per_batch: int,
         load_config: LoadConfig,
@@ -55,7 +54,6 @@ class LoRAManager:
         tp_rank: int = 0,
     ):
         self.base_model: torch.nn.Module = base_model
-        self.lora_paths: Dict[str, str] = lora_paths
         self.base_hf_config: AutoConfig = base_hf_config
         self.max_loras_per_batch: int = max_loras_per_batch
         self.load_config: LoadConfig = load_config
@@ -69,8 +67,8 @@ class LoRAManager:
         backend_type = get_backend_from_name(lora_backend)
         self.lora_backend: BaseLoRABackend = backend_type(lora_backend)

-
-        self.
+        # Initialize mutable internal state of the LoRAManager.
+        self.init_state()

     def init_cuda_graph_batch_info(self, max_bs_in_cuda_graph: int):
         self.max_bs_in_cuda_graph = max_bs_in_cuda_graph
@@ -100,72 +98,49 @@ class LoRAManager:
             ],
         )

-    def
-
-
+    def load_lora_adapters(self, lora_paths: Dict[str, str]):
+        """
+        Load LoRA adapters from the specified paths.
+        TODO (lifuhuang): This method should be exposed to the server/engine API to support dynamic LoRA loading.
+
+        Args:
+            lora_paths (Dict[str, str]): A dictionary mapping LoRA adapter names to their file paths.
+                If a LoRA adapter is already loaded, it will be skipped with a warning.
+        """
+
+        for lora_name, lora_path in lora_paths.items():
+            if lora_name in self.loras:
+                logger.warning(
+                    f"LoRA adapter {lora_name} is already loaded."
+                    "If you want to reload it, please unload it first."
+                )
+                continue

-
-        # e.g., {"k_proj", "q_proj", "v_proj", "o_proj"}
-        self.hf_target_names: Set[str] = set()
-        for name, path in self.lora_paths.items():
-            self.configs[name] = LoRAConfig(path)
-            self.hf_target_names.update(self.configs[name].target_modules)
+            self.configs[lora_name] = LoRAConfig(lora_path)

-
-        weights_A: List[str] = []
-        weights_B: List[str] = []
-        for module in self.hf_target_names:
-            lora_A, lora_B = get_normalized_lora_weight_names(module)
-            weights_A += lora_A
-            weights_B += lora_B
-        self.lora_weight_names: Tuple[Set[str]] = set(weights_A), set(weights_B)
+        self.update_state_from_configs()

-
-
-
-
-                name,
-                self.configs[name],
-                self.base_hf_config,
-                self.load_config,
-                self.lora_backend,
-            )
-            lora_adapter.initialize_weights()
-            self.loras[name] = lora_adapter
-
-        # misc lora configs
-        self.max_lora_dim: int = max([x.hf_config["r"] for x in self.configs.values()])
+    def unload_lora_adapters(self, lora_names: Set[str]):
+        """
+        Unload LoRA adapters by their names. This will remove the adapters from the memory pool and
+        delete the corresponding LoRA modules.

-
-
-
-
-
-
+        Args:
+            lora_names (Set[str]): A set of LoRA adapter names to unload.
+        """
+        for lora_name in lora_names:
+            if lora_name in self.loras:
+                del self.configs[lora_name]
+            else:
+                logger.warning(f"LoRA adapter {lora_name} is not loaded.")

-
-        self.convert_to_lora_layers()
-
-    def init_lora_memory_pool(self):
-        # Initialize memory pool
-        self.memory_pool = LoRAMemoryPool(
-            self.base_hf_config,
-            self.max_loras_per_batch,
-            self.max_lora_dim,
-            self.dtype,
-            self.tp_size,
-            self.tp_rank,
-            self.lora_modules,
-        )
-
-        # Initialize target lora modules in memory pool
-        self.memory_pool.init_buffers(self.lora_weight_names, self.base_model)
+        self.update_state_from_configs()

     def prepare_lora_batch(self, forward_batch: ForwardBatch):
         # load active loras into lora memory pool
         cur_uids = set(forward_batch.lora_paths)
         assert len(cur_uids) <= self.max_loras_per_batch
-        self.memory_pool.prepare_lora_batch(cur_uids, self.loras)
+        self.memory_pool.prepare_lora_batch(cur_uids, self.loras, self.lora_modules)

         # set up batch info shared by all lora modules
         bs = forward_batch.batch_size
@@ -267,9 +242,16 @@ class LoRAManager:
             )
         self.lora_backend.set_batch_info(batch_info)

-        #
-
-
+        # TODO (lifuhuang): one potential perf optimization that is worth considering is to see if we can call
+        # this method only when loading/unloading LoRA adapters, instead of calling it for every micro-batch.
+        self.update_lora_info()
+
+    def update_lora_info(self):
+        """
+        Update all LoRA modules to associate them with the latest memory buffer.
+        """
+        for layer_id, layer_modules in self.lora_modules.items():
+            for module_name, module in layer_modules.items():
                 if "qkv_proj" in module_name:
                     module.set_lora_info(
                         self.memory_pool.get_tensor(
@@ -295,23 +277,139 @@ class LoRAManager:
                     ),
                 )

+    def init_state(self):
+        """
+        Initialize the internal (mutable) state of the LoRAManager.
+
+        These states are mutable via the `update_state_from_configs` as LoRA adapters are loaded and unloaded dynamically.
+        """
+
+        # Configs of all active LoRA adapters.
+        self.configs: Dict[str, LoRAConfig] = {}
+
+        # LoRA adapter weights cached in CPU memory.
+        self.loras: Dict[str, LoRAAdapter] = {}
+
+        # Supported weight names (e.g., qkv_proj) for LoRA A and B respectively.
+        self.lora_weight_names: Tuple[Set[str]] = (set(), set())
+
+        # Look-up table that essentially maps (layer_index, module_name) to the corresponding LoRA module.
+        self.lora_modules: Dict[int, Dict[str, BaseLayerWithLoRA]] = {
+            i: {} for i in range(self.base_hf_config.num_hidden_layers)
+        }
+
+        # Initialize memory pool
+        self.memory_pool = LoRAMemoryPool(
+            self.base_hf_config,
+            self.max_loras_per_batch,
+            self.dtype,
+            self.tp_size,
+            self.tp_rank,
+        )
+
+    def update_state_from_configs(self):
+        """
+        Update the internal state of the LoRAManager based on the current `self.configs`. This method
+        should be called whenever `self.configs` is modified (e.g., when new LoRA adapters are loaded).
+
+        This includes:
+        - Initializing LoRA adapters if they are not already loaded.
+        - Collect all LoRA weight names based on the current loaded adapters.
+        - Lazily monkey-patching the base model to use LoRA layers where applicable.
+        - Preparing the GPU buffer pool for active LoRA weights.
+        """
+
+        # Target module names in huggingface lora configs.
+        # e.g., {"k_proj", "q_proj", "v_proj", "o_proj"}
+        hf_target_module_names: Set[str] = set()
+        for config in self.configs.values():
+            hf_target_module_names.update(config.target_modules)
+        max_lora_dim: int = max([x.hf_config["r"] for x in self.configs.values()])
+
+        # Loads / unloads LoRA adapters based on the latest configs.
+        self.update_lora_adapters()
+
+        # Lazily update states for new LoRA weight name (e.g., qkv_proj) as needed.
+        #
+        # Please note that the following update operations are "monotonic" by design, meaning that we update
+        # multiple places to support the new weight names when the first adapter targeting such weight names
+        # is loaded. However, we never "rollback" the support (e.g., convert LoRA layer back to base layer)
+        # even if the associated adapters are unloaded later for both simplicity and practicality reasons: the
+        # list of LoRA weight names is expected to be extremely finite and stable.
+        self.update_lora_weight_names(hf_target_module_names)
+        self.update_lora_modules(hf_target_module_names)
+        self.update_memory_buffers(max_lora_dim)
+
+    def update_lora_weight_names(self, hf_target_names: Set[str]):
+        """
+        Add new LoRA weight names if needed based on the current `self.configs`.
+        """
+
+        # Target lora weight names for lora_a and lora_b modules respectively.
+        for module in hf_target_names:
+            lora_A, lora_B = get_normalized_lora_weight_names(module)
+            self.lora_weight_names[0].update(lora_A)
+            self.lora_weight_names[1].update(lora_B)
+
+    def update_lora_adapters(self):
+        """
+        Update the LoRA adapters in CPU memory based on the current `self.configs`.
+        It loads any new adapters that are not already loaded, and unloads any adapters
+        that are no longer in `self.configs` (e.g., unloaded).
+        """
+
+        # Load new adapter weights to cpu
+        for name, config in self.configs.items():
+            if name not in self.loras:
+                logger.info(f"Loading weight of LoRA adapter {name} from {config.path}")
+                lora_adapter = LoRAAdapter(
+                    name,
+                    config,
+                    self.base_hf_config,
+                    self.load_config,
+                    self.lora_backend,
+                )
+                lora_adapter.initialize_weights()
+                self.loras[name] = lora_adapter
+
+        # Clean up unused LoRA adapters
+        for name in self.loras:
+            if name not in self.configs:
+                logger.info(f"Unloading LoRA adapter {name}")
+                del self.loras[name]
+
+        # Additional checks for flashinfer backend
+        # FIXME remove the restrictions after supporting multi-rank for flashinfer backend
+        if self.lora_backend == "flashinfer":
+            lora_dims = set(x.hf_config["r"] for x in self.configs.values())
+            scalings = set(x.scaling for x in self.loras.values())
+            assert (
+                len(lora_dims) == 1 and len(scalings) == 1
+            ), "Flashinfer backend currently only supports single LoRA rank and scaling across all adapters. "
+
+    def update_memory_buffers(self, max_lora_dim: int):
+        """
+        Update the LoRA memory pool buffers based on the current LoRA configurations and update
+        LoRA modules to use the new buffers. This method should be called after the LoRA configurations
+        are set or updated.
+        """
+
+        self.memory_pool.init_buffers(
+            self.lora_weight_names, self.base_model, max_lora_dim
+        )
+
     def set_lora_module(self, module_name, module):
         lora_module = get_lora_layer(module, self.lora_backend)
         replace_submodule(self.base_model, module_name, lora_module)
         return lora_module

-    def
+    def update_lora_modules(self, hf_target_names: Set[str]):
         # Target module names of customized layers defined in python/sglang/srt/layers
         # e.g., {"qkv_proj", "o_proj"}
         customized_target_names = get_customized_names_from_hf_names(
-
+            hf_target_names, self.base_model
         )

-        # Monkey patch to use the LoRA version layers
-        self.lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]] = {
-            i: [] for i in range(self.base_hf_config.num_hidden_layers)
-        }
-
         for module_name, module in self.base_model.named_modules():
             # TODO (lifuhuang): in the future, we should consider generalizing the
             # should_apply_lora function to support mapping by full module name instead
@@ -326,6 +424,7 @@ class LoRAManager:
             # The module should be converted if it is included in target_names
             if module_name.split(".")[-1] in customized_target_names:
                 layer_id = get_layer_id(module_name)
-                self.lora_modules[layer_id]
-
-
+                if module_name not in self.lora_modules[layer_id]:
+                    self.lora_modules[layer_id][module_name] = self.set_lora_module(
+                        module_name, module
+                    )
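The net effect of these hunks is that LoRAManager no longer takes a fixed `lora_paths` dict at construction: `load_lora_adapters` and `unload_lora_adapters` only edit `self.configs` and then call `update_state_from_configs`, which reconciles the CPU-side adapters, weight names, patched modules, and GPU buffers against that config dict. The toy sketch below illustrates just that reconcile-from-configs control flow; `ToyLoRAManager` and `ToyConfig` are hypothetical stand-ins, not the real sglang classes.

```python
# Minimal, self-contained sketch of the config-driven update pattern used by the new
# LoRAManager methods (both load_lora_adapters and unload_lora_adapters end by calling
# update_state_from_configs). Names here are illustrative only.
from typing import Dict, Set


class ToyConfig:
    def __init__(self, path: str):
        self.path = path


class ToyLoRAManager:
    def __init__(self):
        self.configs: Dict[str, ToyConfig] = {}  # desired state (what should be loaded)
        self.loras: Dict[str, str] = {}          # actual state (weights "in CPU memory")

    def load_lora_adapters(self, lora_paths: Dict[str, str]):
        for name, path in lora_paths.items():
            if name in self.loras:
                print(f"LoRA adapter {name} is already loaded.")
                continue
            self.configs[name] = ToyConfig(path)
        self.update_state_from_configs()

    def unload_lora_adapters(self, lora_names: Set[str]):
        # Simplified: the real method warns when the adapter is not loaded.
        for name in lora_names:
            self.configs.pop(name, None)
        self.update_state_from_configs()

    def update_state_from_configs(self):
        # Load anything that is configured but not yet resident.
        for name, config in self.configs.items():
            if name not in self.loras:
                self.loras[name] = f"weights loaded from {config.path}"
        # Drop anything resident that is no longer configured.
        for name in list(self.loras):
            if name not in self.configs:
                del self.loras[name]


if __name__ == "__main__":
    mgr = ToyLoRAManager()
    mgr.load_lora_adapters({"my_adapter": "/tmp/my_adapter"})
    mgr.unload_lora_adapters({"my_adapter"})
    print(mgr.loras)  # -> {}
```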
sglang/srt/lora/mem_pool.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Dict, List, Optional, Set, Tuple
+from typing import Callable, Dict, List, Optional, Set, Tuple

 import torch

@@ -22,21 +22,16 @@ class LoRAMemoryPool:
         self,
         base_hf_config: AutoConfig,
         max_loras_per_batch: int,
-        max_lora_dim: int,
         dtype: torch.dtype,
         tp_size: int,
         tp_rank: int,
-        lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]],
     ):
-
         self.base_hf_config: AutoConfig = base_hf_config
         self.num_layer: int = base_hf_config.num_hidden_layers
         self.max_loras_per_batch: int = max_loras_per_batch
-        self.max_lora_dim: int = max_lora_dim
         self.dtype: torch.dtype = dtype
         self.tp_size: int = tp_size
         self.tp_rank: int = tp_rank
-        self.lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]] = lora_modules

         # Both A_buffer and B_buffer maps lora weight names to its buffer space.
         # A_buffer contains num_layer number of row-major tensors with shape
@@ -55,79 +50,84 @@ class LoRAMemoryPool:
         self.buffer_id_to_uid: List[Optional[str]] = [""] * self.max_loras_per_batch

     def get_lora_A_shape(
-        self, module_name: str, base_model: torch.nn.Module
+        self, module_name: str, base_model: torch.nn.Module, max_lora_dim: int
     ) -> Tuple[int]:
         """
         Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
         """
         input_dim, _ = get_hidden_dim(module_name, self.base_hf_config, base_model)
         c = get_stacked_multiply(module_name)
-        if self.tp_size > 1:
-
-            input_dim = divide(input_dim, self.tp_size)
+        if self.tp_size > 1 and module_name in ROW_PARALLELISM_LINEAR_LORA_NAMES:
+            input_dim = divide(input_dim, self.tp_size)
         return (
             self.max_loras_per_batch,
-
+            max_lora_dim * c,
             input_dim,
         )

     def get_lora_B_shape(
-        self, module_name: str, base_model: torch.nn.Module
+        self, module_name: str, base_model: torch.nn.Module, max_lora_dim: int
     ) -> Tuple[int]:
         """
         Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
         """
         _, output_dim = get_hidden_dim(module_name, self.base_hf_config, base_model)
         c = get_stacked_multiply(module_name)
-        if self.tp_size > 1:
-
-            output_dim = divide(output_dim, self.tp_size)
+        if self.tp_size > 1 and module_name not in ROW_PARALLELISM_LINEAR_LORA_NAMES:
+            output_dim = divide(output_dim, self.tp_size)
         return (
             c,
             self.max_loras_per_batch,
             output_dim,
-
+            max_lora_dim,
         )

     def init_buffers(
         self,
         lora_weight_names: Tuple[Set[str]],
         base_model: torch.nn.Module,
+        max_lora_dim: int,
     ):
-
         # lora_weight_names is a set of name pairs indicating each pair of lora modules to load
         # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj"), ("o_proj", "o_proj")}
         self.lora_weight_names: Tuple[Set[str]] = lora_weight_names
         device = next(base_model.parameters()).device
-
-
-
-
-
-
-
-
-        )
-
-
-
-
-
-
-
-
-
-
-
-
+
+        def update_buffer(
+            buffer: Dict[str, List[torch.Tensor]],
+            lora_weight_names: Set[str],
+            get_lora_shape_fn: Callable[[str, torch.nn.Module, int], Tuple[int]],
+        ):
+            new_weight_names = lora_weight_names - buffer.keys()
+            for module_name in new_weight_names:
+                lora_shape = get_lora_shape_fn(module_name, base_model, max_lora_dim)
+                buffer[module_name] = [
+                    torch.empty(
+                        lora_shape,
+                        dtype=self.dtype,
+                        device=device,
+                    )
+                    for _ in range(self.num_layer)
+                ]
+
+        update_buffer(
+            self.A_buffer,
+            lora_weight_names[0],
+            self.get_lora_A_shape,
+        )
+
+        update_buffer(
+            self.B_buffer,
+            lora_weight_names[1],
+            self.get_lora_B_shape,
+        )

     def prepare_lora_batch(
         self,
         cur_uids: Set[Optional[str]],
         lora_adapters: Dict[str, LoRAAdapter],
+        lora_modules: Dict[int, Dict[str, BaseLayerWithLoRA]],
     ):
-
         def get_available_buffer_slot():
             for buffer_id in range(self.max_loras_per_batch):
                 # Prioritize empty slots
@@ -147,14 +147,19 @@ class LoRAMemoryPool:
         for uid in cur_uids:
             if uid not in self.uid_to_buffer_id:
                 buffer_id = get_available_buffer_slot()
+                lora_adapter = lora_adapters.get(uid, None)
                 self.load_lora_weight_to_buffer(
-                    uid, buffer_id,
+                    uid, buffer_id, lora_adapter, lora_modules
                 )
                 self.uid_to_buffer_id[uid] = buffer_id
                 self.buffer_id_to_uid[buffer_id] = uid

     def load_lora_weight_to_buffer(
-        self,
+        self,
+        uid: str,
+        buffer_id: int,
+        lora_adapter: LoRAAdapter,
+        lora_modules: Dict[int, Dict[str, BaseLayerWithLoRA]],
     ):
         def check_lora_weight_shape(buffer_view: torch.Tensor, weight: torch.Tensor):
             assert (
@@ -186,8 +191,8 @@ class LoRAMemoryPool:
                 temp_B_buffer[lora_weight_name] = weights

         if self.tp_size > 1:
-            cur_layer_modules =
-            for module_name, module in cur_layer_modules:
+            cur_layer_modules = lora_modules[layer_id]
+            for module_name, module in cur_layer_modules.items():
                 if "qkv_proj" in module_name:
                     temp_A_buffer["qkv_proj"] = module.slice_lora_a_weights(
                         temp_A_buffer["qkv_proj"], self.tp_rank
@@ -236,7 +241,6 @@ class LoRAMemoryPool:
     def get_tensor(
         self, weight_name: str, layer_id: int, lora_type: LoRAType
     ) -> torch.Tensor:
-
         if lora_type == LoRAType.LORA_A:
             return self.A_buffer[weight_name][layer_id]

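To make the new shape logic concrete: `max_lora_dim` is now passed into `get_lora_A_shape` / `get_lora_B_shape` per call rather than stored on the pool, the A buffer is shaped `(max_loras_per_batch, max_lora_dim * c, input_dim)`, the B buffer `(c, max_loras_per_batch, output_dim, max_lora_dim)`, and tensor-parallel sharding now splits `input_dim` only for row-parallel modules (and `output_dim` otherwise). The standalone sketch below reproduces that arithmetic; the `ROW_PARALLEL` set and the example sizes are illustrative placeholders, not the real `ROW_PARALLELISM_LINEAR_LORA_NAMES` constant.

```python
# Sketch of the buffer-shape arithmetic after this change, with placeholder inputs.
from typing import Tuple

ROW_PARALLEL = {"o_proj", "down_proj"}  # assumed row-parallel module names (illustrative)


def lora_a_shape(module_name: str, input_dim: int, stacked: int,
                 max_loras_per_batch: int, max_lora_dim: int, tp_size: int) -> Tuple[int, ...]:
    # Only row-parallel modules shard their input dim across TP ranks.
    if tp_size > 1 and module_name in ROW_PARALLEL:
        input_dim //= tp_size
    return (max_loras_per_batch, max_lora_dim * stacked, input_dim)


def lora_b_shape(module_name: str, output_dim: int, stacked: int,
                 max_loras_per_batch: int, max_lora_dim: int, tp_size: int) -> Tuple[int, ...]:
    # Column-parallel (non-row-parallel) modules shard their output dim instead.
    if tp_size > 1 and module_name not in ROW_PARALLEL:
        output_dim //= tp_size
    return (stacked, max_loras_per_batch, output_dim, max_lora_dim)


if __name__ == "__main__":
    # e.g. a stacked qkv_proj (stacking factor 3), hidden size 4096, rank 16, tp=2
    print(lora_a_shape("qkv_proj", 4096, 3, 8, 16, 2))  # (8, 48, 4096)
    print(lora_b_shape("qkv_proj", 4096, 3, 8, 16, 2))  # (3, 8, 2048, 16)
```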
sglang/srt/lora/utils.py
CHANGED
@@ -108,7 +108,7 @@ def get_hidden_dim(


 def get_normalized_lora_weight_names(name: str) -> Tuple[List[str], List[str]]:
     """
-    Mapping a target module name to names of the
+    Mapping a target module name to names of the normalized LoRA weights.
     Returned tuple contains (name for Lora A, name for Lora B)
     """
     params_mapping = {
sglang/srt/managers/cache_controller.py
CHANGED
@@ -18,34 +18,50 @@ import logging
 import math
 import threading
 from queue import Empty, Full, PriorityQueue, Queue
-from typing import List, Optional
+from typing import TYPE_CHECKING, List, Optional

 import torch

-
-from sglang.srt.mem_cache.
+if TYPE_CHECKING:
+    from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
+    from sglang.srt.mem_cache.memory_pool_host import HostKVCache

 logger = logging.getLogger(__name__)


 class LayerDoneCounter:
     def __init__(self, num_layers):
-        self.
-
+        self.num_layers = num_layers
+        # extra producer and consumer counters for overlap mode
+        self.num_counters = 3
+        self.counters = [num_layers] * self.num_counters
+        self.conditions = [threading.Condition() for _ in range(self.num_counters)]
+        self.producer_index = 0
+        self.consumer_index = 0
+
+    def next_producer(self):
+        return (self.producer_index + 1) % self.num_counters
+
+    def update_producer(self):
+        self.producer_index = self.next_producer()
+        return self.producer_index
+
+    def set_consumer(self, index):
+        self.consumer_index = index

     def increment(self):
-        with self.
-            self.
-            self.
+        with self.conditions[self.producer_index]:
+            self.counters[self.producer_index] += 1
+            self.conditions[self.producer_index].notify_all()

     def wait_until(self, threshold):
-        with self.
-            while self.
-                self.
+        with self.conditions[self.consumer_index]:
+            while self.counters[self.consumer_index] <= threshold:
+                self.conditions[self.consumer_index].wait()

     def reset(self):
-        with self.
-            self.
+        with self.conditions[self.producer_index]:
+            self.counters[self.producer_index] = 0


 class CacheOperation:
@@ -148,7 +164,7 @@ class HiCacheController:

     def __init__(
         self,
-        token_to_kv_pool_allocator:
+        token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
         mem_pool_host: HostKVCache,
         page_size: int,
         load_cache_event: threading.Event = None,
@@ -296,7 +312,6 @@ class HiCacheController:
         while not self.stop_event.is_set():
             try:
                 operation = self.load_queue.get(block=True, timeout=1)
-                # time.sleep(18e-6 * len(operation.host_indices))
                 operation.data = self.mem_pool_host.get_flat_data(
                     operation.host_indices
                 )
@@ -320,6 +335,7 @@ class HiCacheController:
             if not self.load_cache_event.is_set():
                 continue
             self.load_cache_event.clear()
+            self.layer_done_counter.update_producer()

             batch_operation = None
             while self.load_queue.qsize() > 0:
@@ -331,6 +347,7 @@ class HiCacheController:
             if batch_operation is None:
                 continue

+            # start layer-wise KV cache transfer from CPU to GPU
             self.layer_done_counter.reset()
             for i in range(self.mem_pool_host.layer_num):
                 if self.page_size == 1:
@@ -466,6 +483,7 @@ class HiCacheController:
             except Exception as e:
                 logger.error(e)

+    # todo (zhiqiang): double buffering to be deprecated
     def write_thread_func_buffer(self):
         aux_thread = threading.Thread(target=self.write_aux_func, daemon=True)
         aux_thread.start()
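The LayerDoneCounter rewrite replaces the single counter with three rotating counters (the diff's comment attributes this to overlap mode): the cache controller rotates to a fresh counter per transfer batch via `update_producer()` and increments it once per finished layer, while a consumer pins itself to that counter with `set_consumer()` and blocks in `wait_until(layer_id)` before touching that layer's KV cache. The self-contained sketch below reuses the class as it appears in the diff and shows one plausible producer/consumer pairing; the `transfer_kv_cache` helper and timing are illustrative only, not the actual scheduler wiring.

```python
import threading
import time


class LayerDoneCounter:
    """Same structure as the new counter in cache_controller.py (3 rotating counters)."""

    def __init__(self, num_layers):
        self.num_layers = num_layers
        self.num_counters = 3
        self.counters = [num_layers] * self.num_counters
        self.conditions = [threading.Condition() for _ in range(self.num_counters)]
        self.producer_index = 0
        self.consumer_index = 0

    def update_producer(self):
        # Rotate to a fresh counter before starting a new layer-wise transfer batch.
        self.producer_index = (self.producer_index + 1) % self.num_counters
        return self.producer_index

    def set_consumer(self, index):
        self.consumer_index = index

    def reset(self):
        with self.conditions[self.producer_index]:
            self.counters[self.producer_index] = 0

    def increment(self):
        with self.conditions[self.producer_index]:
            self.counters[self.producer_index] += 1
            self.conditions[self.producer_index].notify_all()

    def wait_until(self, threshold):
        with self.conditions[self.consumer_index]:
            while self.counters[self.consumer_index] <= threshold:
                self.conditions[self.consumer_index].wait()


def transfer_kv_cache(counter: LayerDoneCounter, num_layers: int):
    # Producer side: pretend to copy one layer's KV cache host -> device per iteration.
    for _ in range(num_layers):
        time.sleep(0.01)
        counter.increment()


if __name__ == "__main__":
    num_layers = 4
    counter = LayerDoneCounter(num_layers)
    # Consumer pins itself to the counter the producer is about to use.
    counter.set_consumer(counter.update_producer())
    counter.reset()
    threading.Thread(target=transfer_kv_cache, args=(counter, num_layers), daemon=True).start()
    for layer_id in range(num_layers):
        counter.wait_until(layer_id)  # returns once layer `layer_id` has been transferred
        print(f"layer {layer_id} ready")
```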