sglang 0.4.2.post2-py3-none-any.whl → 0.4.2.post4-py3-none-any.whl

This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (49)
  1. sglang/check_env.py +1 -0
  2. sglang/srt/constrained/outlines_backend.py +4 -1
  3. sglang/srt/function_call_parser.py +96 -69
  4. sglang/srt/layers/attention/double_sparsity_backend.py +1 -3
  5. sglang/srt/layers/attention/flashinfer_backend.py +34 -41
  6. sglang/srt/layers/attention/triton_backend.py +64 -16
  7. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +337 -3
  8. sglang/srt/layers/attention/triton_ops/extend_attention.py +70 -42
  9. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +20 -5
  10. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  11. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  12. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  13. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  14. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  15. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  16. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  17. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  18. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  19. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  20. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  21. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  22. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  23. sglang/srt/layers/quantization/fp8_kernel.py +43 -10
  24. sglang/srt/lora/backend/__init__.py +25 -5
  25. sglang/srt/lora/backend/base_backend.py +31 -9
  26. sglang/srt/lora/backend/flashinfer_backend.py +41 -4
  27. sglang/srt/lora/backend/triton_backend.py +34 -4
  28. sglang/srt/lora/layers.py +293 -0
  29. sglang/srt/lora/lora.py +101 -326
  30. sglang/srt/lora/lora_manager.py +101 -269
  31. sglang/srt/lora/mem_pool.py +174 -0
  32. sglang/srt/lora/triton_ops/__init__.py +7 -1
  33. sglang/srt/lora/triton_ops/gate_up_lora_b.py +170 -0
  34. sglang/srt/lora/triton_ops/qkv_lora_b.py +5 -5
  35. sglang/srt/lora/triton_ops/sgemm_lora_a.py +2 -2
  36. sglang/srt/lora/triton_ops/sgemm_lora_b.py +2 -2
  37. sglang/srt/lora/utils.py +141 -0
  38. sglang/srt/model_executor/cuda_graph_runner.py +4 -0
  39. sglang/srt/models/llama.py +8 -3
  40. sglang/srt/speculative/build_eagle_tree.py +482 -102
  41. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
  42. sglang/srt/speculative/eagle_utils.py +134 -61
  43. sglang/srt/speculative/eagle_worker.py +1 -0
  44. sglang/version.py +1 -1
  45. {sglang-0.4.2.post2.dist-info → sglang-0.4.2.post4.dist-info}/METADATA +4 -4
  46. {sglang-0.4.2.post2.dist-info → sglang-0.4.2.post4.dist-info}/RECORD +49 -32
  47. {sglang-0.4.2.post2.dist-info → sglang-0.4.2.post4.dist-info}/LICENSE +0 -0
  48. {sglang-0.4.2.post2.dist-info → sglang-0.4.2.post4.dist-info}/WHEEL +0 -0
  49. {sglang-0.4.2.post2.dist-info → sglang-0.4.2.post4.dist-info}/top_level.txt +0 -0
--- a/sglang/srt/lora/lora_manager.py
+++ b/sglang/srt/lora/lora_manager.py
@@ -16,307 +16,115 @@
  # and "Punica: Multi-Tenant LoRA Serving"

  import logging
- import re
+ from typing import Dict, List, Set, Tuple

  import torch

- from sglang.srt.lora.backend import FlashInferLoraBackend, TritonLoraBackend
- from sglang.srt.lora.lora import LoRAAdapter, LoraBatchInfo, get_lora_layer
+ from sglang.srt.configs.load_config import LoadConfig
+ from sglang.srt.hf_transformers_utils import AutoConfig
+ from sglang.srt.lora.backend import BaseLoRABackend, get_backend_from_name
+ from sglang.srt.lora.layers import get_lora_layer
+ from sglang.srt.lora.lora import LoRAAdapter
  from sglang.srt.lora.lora_config import LoRAConfig
+ from sglang.srt.lora.mem_pool import LoRAMemoryPool
+ from sglang.srt.lora.utils import (
+     LoRABatchInfo,
+     LoRAType,
+     get_customized_names_from_hf_names,
+     get_layer_id,
+     get_stacked_name,
+     get_weight_name,
+ )
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
- from sglang.srt.utils import is_flashinfer_available, replace_submodule
+ from sglang.srt.utils import replace_submodule

  logger = logging.getLogger(__name__)


- def get_module_name(name):
-     # Fallback solution of mapping from config module name to module name in model class.
-     # Please check if it aligns with your base model.
-     # Please implement the function in the model class if it is not.
-     # You can reference this function in llama.py.
-     params_mapping = {
-         "q_proj": "qkv_proj",
-         "k_proj": "qkv_proj",
-         "v_proj": "qkv_proj",
-         "gate_proj": "gate_up_proj",
-         "up_proj": "gate_up_proj",
-     }
-     return params_mapping.get(name, name)
-
-
- def get_hidden_dim(module_name, config):
-     # Fallback solution of get_hidden_dim for different modules
-     # Please check if it aligns with your base model.
-     # Please implement the function in the model class if it is not.
-     # You can reference this function in llama.py.
-     if module_name in ["q_proj", "o_proj", "qkv_proj"]:
-         return config.hidden_size, config.hidden_size
-     elif module_name in ["kv_proj"]:
-         return config.hidden_size, config.hidden_size // (
-             config.num_attention_heads // config.num_key_value_heads
-         )
-     elif module_name == "gate_up_proj":
-         return config.hidden_size, config.intermediate_size
-     elif module_name == "down_proj":
-         return config.intermediate_size, config.hidden_size
-     else:
-         raise NotImplementedError()
-
-
- def get_stacked_name(name):
-     # origin name -> (name for A, name for B)
-     params_mapping = {
-         "q_proj": ("qkv_proj", "q_proj"),
-         "k_proj": ("qkv_proj", "kv_proj"),
-         "v_proj": ("qkv_proj", "kv_proj"),
-         "gate_proj": ("gate_up_proj", "gate_up_proj"),
-         "up_proj": ("gate_up_proj", "gate_up_proj"),
-     }
-     return params_mapping.get(name, (name, name))
-
-
- def get_backend_from_name(name):
-     backend_mapping = {
-         "triton": TritonLoraBackend,
-         "flashinfer": FlashInferLoraBackend,
-     }
-
-     if name in backend_mapping:
-         return backend_mapping[name]
-
-     raise Exception(
-         f"No supported lora backend called {name}. It should be one of {list(backend_mapping.keys())}"
-     )
-
-
- def get_layer_id(name):
-     match = re.search(r"layers\.(\d+)\.", name)
-     if match is None:
-         return None
-     return int(match.group(1))
-
-
  class LoRAManager:
      def __init__(
          self,
-         base_model,
-         lora_paths,
-         base_hf_config,
-         max_loras_per_batch,
-         load_config,
-         dtype,
-         lora_backend,
+         base_model: torch.nn.Module,
+         lora_paths: Dict[str, str],
+         base_hf_config: AutoConfig,
+         max_loras_per_batch: int,
+         load_config: LoadConfig,
+         dtype: torch.dtype,
+         lora_backend: str = "triton",
      ):
-         self.base_model = base_model
-         self.lora_paths = lora_paths
-         self.base_hf_config = base_hf_config
-         self.max_loras_per_batch = max_loras_per_batch
-         self.load_config = load_config
-         self.dtype = dtype
-
-         logger.info(f"Using {lora_backend} as backend of Lora kernels.")
+         self.base_model: torch.nn.Module = base_model
+         self.lora_paths: Dict[str, str] = lora_paths
+         self.base_hf_config: AutoConfig = base_hf_config
+         self.max_loras_per_batch: int = max_loras_per_batch
+         self.load_config: LoadConfig = load_config
+         self.dtype: torch.dtype = dtype
+
+         # LoRA backend for running sgemm kernels
+         logger.info(f"Using {lora_backend} as backend of LoRA kernels.")
          backend_type = get_backend_from_name(lora_backend)
-         self.lora_backend = backend_type(lora_backend)
+         self.lora_backend: BaseLoRABackend = backend_type(lora_backend)

          self.init_loras()
          self.init_lora_memory_pool()
-         self.init_lora_batch()
-
-     def match_target_modules(self, module_name):
-         for target_module in self.target_modules:
-             if module_name.split(".")[-1] == target_module:
-                 return True
-         return False
-
-     def get_target_modules(self):
-         modules = []
-         for module_name, module in self.base_model.named_modules():
-             if self.match_target_modules(module_name):
-                 modules.append((module_name, module))
-         return modules
-
-     def set_lora_module(self, module_name, module):
-         lora_module = get_lora_layer(
-             module, self.max_lora_dim, self.scaling, self.lora_backend
-         )
-         replace_submodule(self.base_model, module_name, lora_module)
-         return lora_module

      def init_loras(self):
-         # get configs and target modules
-         self.configs = {}
-         self.origin_target_modules = set()
+         # Config of each LoRA adapter
+         self.configs: Dict[str, LoRAConfig] = {}
+
+         # Target module names in huggingface lora configs.
+         # e.g., {"k_proj", "q_proj", "v_proj", "o_proj"}
+         self.hf_target_names: Set[str] = set()
          for name, path in self.lora_paths.items():
              self.configs[name] = LoRAConfig(path)
-             self.origin_target_modules = set(self.origin_target_modules) | set(
+             self.hf_target_names = set(self.hf_target_names) | set(
                  self.configs[name].target_modules
              )
-         if hasattr(self.base_model, "get_module_name"):
-             self.target_modules = {
-                 self.base_model.get_module_name(module)
-                 for module in self.origin_target_modules
-             }
-         else:
-             logger.warning(
-                 "WARNING: get_module_name() is not defined, "
-                 "which is used to map config module name to model implementation module name."
-                 "Use the default one, but please check if it is correct for your model."
-             )
-             self.target_modules = {
-                 get_module_name(module) for module in self.origin_target_modules
-             }
-         self.target_weights = set(
-             [get_stacked_name(module) for module in self.origin_target_modules]
+
+         # Target lora weight names for lora_a and lora_b modules repectively.
+         # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj")}
+         self.lora_weight_names: Set[Tuple[str]] = set(
+             [get_stacked_name(module) for module in self.hf_target_names]
          )

          # load all weights to cpu
-         self.loras = []
-         self.lora_id = {}
+         self.loras: Dict[str, LoRAAdapter] = {}
          for name in self.lora_paths.keys():
-             self.lora_id[name] = len(self.loras)
-             self.loras.append(
-                 LoRAAdapter(
-                     name,
-                     self.configs[name],
-                     self.base_hf_config,
-                     self.load_config,
-                     self.lora_backend,
-                 )
+             lora_adapter = LoRAAdapter(
+                 name,
+                 self.configs[name],
+                 self.base_hf_config,
+                 self.load_config,
+                 self.lora_backend,
              )
-             self.loras[-1].initialize_weights()
+             lora_adapter.initialize_weights()
+             self.loras[name] = lora_adapter

          # misc lora configs
-         self.max_lora_dim = max([x.hf_config["r"] for x in self.configs.values()])
-         self.scaling = self.loras[0].scaling
-         # FIXME remove the restrictions
+         # FIXME remove the restrictions after implementing unified paging
+         self.max_lora_dim: int = max([x.hf_config["r"] for x in self.configs.values()])
+         self.scaling: float = list(self.loras.values())[0].scaling
          assert all(x.hf_config["r"] == self.max_lora_dim for x in self.configs.values())
-         assert all(x.scaling == self.scaling for x in self.loras)
+         assert all(x.scaling == self.scaling for x in self.loras.values())

-         # monkey patch to use the LoRA version
-         self.lora_modules = []
-         for module_name, module in self.get_target_modules():
-             self.lora_modules.append(
-                 (module_name, self.set_lora_module(module_name, module))
-             )
+         # Convert original model layers to layers with LoRA
+         self.convert_to_lora_layers()

      def init_lora_memory_pool(self):
-         # preallocate lora memory pool
-         self.A_buffer = {}
-         self.B_buffer = {}
-         num_layer = self.base_hf_config.num_hidden_layers
-         for module_A, module_B in self.target_weights:
-             # init A tensor, column_major=True
-             if hasattr(self.base_model, "get_hidden_dim"):
-                 hidden_dim_A, _ = self.base_model.get_hidden_dim(module_A)
-             else:
-                 logger.warning(
-                     "WARNING: get_hidden_dim() is not defined, "
-                     "which is used to get the hidden dim for different lora modules"
-                     "Use the default one, but please check if it is correct for your model."
-                 )
-                 hidden_dim_A, _ = get_hidden_dim(module_A, self.base_hf_config)
-             c = self.loras[-1].get_stacked_multiply(module_A)
-             if module_A not in self.A_buffer:
-                 self.A_buffer[module_A] = [
-                     torch.empty(
-                         (
-                             self.max_loras_per_batch,
-                             self.max_lora_dim * c,
-                             hidden_dim_A,
-                         ),
-                         dtype=self.dtype,
-                         device="cuda",
-                     )
-                     for i in range(num_layer)
-                 ]
-             # init B tensor, column_major=True
-             if hasattr(self.base_model, "get_hidden_dim"):
-                 _, hidden_dim_B = self.base_model.get_hidden_dim(module_B)
-             else:
-                 logger.warning(
-                     "WARNING: get_hidden_dim() is not defined, "
-                     "which is used to get the hidden dim for different lora modules"
-                     "Use the default one, but please check if it is correct for your model."
-                 )
-                 _, hidden_dim_B = get_hidden_dim(module_B, self.base_hf_config)
-             c = self.loras[-1].get_stacked_multiply(module_B)
-             if module_B not in self.B_buffer:
-                 self.B_buffer[module_B] = [
-                     torch.empty(
-                         (
-                             c,
-                             self.max_loras_per_batch,
-                             hidden_dim_B,
-                             self.max_lora_dim,
-                         ),
-                         dtype=self.dtype,
-                         device="cuda",
-                     )
-                     for i in range(num_layer)
-                 ]
-
-     def init_lora_batch(self):
-         self.active_uids = set()  # set of active loras
-         self.buffer_id = {}  # lora uid -> idx in memory pool
-
-     def get_weight_name(self, name, idx):
-         for target_weight_name in self.target_weights:
-             if target_weight_name[idx] in name:
-                 return target_weight_name[idx]
-
-     def load_lora(self, uid, buffer_id):
-         num_layer = self.base_hf_config.num_hidden_layers
-         if uid is None:
-             for i in range(num_layer):
-                 for k in self.A_buffer.keys():
-                     self.A_buffer[k][i][buffer_id] *= 0
-             return
+         # Initialize memory pool
+         self.memory_pool = LoRAMemoryPool(
+             self.base_hf_config, self.max_loras_per_batch, self.max_lora_dim, self.dtype
+         )

-         for i in range(num_layer):
-             layer_weights = self.loras[self.lora_id[uid]].layers[i].weights
-             for name, weights in layer_weights.items():
-                 if "lora_A" in name:
-                     lora_weight_name = self.get_weight_name(name, 0)
-                     if lora_weight_name:
-                         self.A_buffer[lora_weight_name][i][buffer_id].copy_(weights)
-                 else:
-                     lora_weight_name = self.get_weight_name(name, 1)
-                     if lora_weight_name:
-                         c = self.loras[-1].get_stacked_multiply(lora_weight_name)
-                         if c > 1:
-                             for j in range(c):
-                                 self.B_buffer[lora_weight_name][i][j][buffer_id].copy_(
-                                     weights[j]
-                                 )
-                         else:
-                             self.B_buffer[lora_weight_name][i][0][buffer_id].copy_(
-                                 weights
-                             )
+         # Initialize target lora modules in memory pool
+         self.memory_pool.init_buffers(self.lora_weight_names, self.base_model)

      def prepare_lora_batch(self, forward_batch: ForwardBatch):
          # load active loras into lora memory pool
          cur_uids = set(forward_batch.lora_paths)
          assert len(cur_uids) <= self.max_loras_per_batch
-         i = 0
-         j = len(self.active_uids)
-         evictable_uids = list(self.active_uids)
-         for uid in cur_uids:
-             if uid not in self.active_uids:
-                 if j < self.max_loras_per_batch:
-                     index = j
-                     j += 1
-                 else:
-                     while i < len(evictable_uids) and evictable_uids[i] in cur_uids:
-                         i += 1
-                     assert i < len(evictable_uids)
-                     self.active_uids.remove(evictable_uids[i])
-                     self.buffer_id.pop(evictable_uids[i])
-                     index = i
-                     i += 1
-                 self.load_lora(uid, index)
-                 self.active_uids.add(uid)
-                 self.buffer_id[uid] = index
+         self.memory_pool.prepare_lora_batch(cur_uids, self.loras)

+         # FIXME: Handle lora uid with None more safely
          if cur_uids == set([None]):
              return

@@ -332,9 +140,9 @@ class LoRAManager:
          max_len = int(torch.max(seg_lens))
          weight_indices = torch.empty((bs,), dtype=torch.int64, device="cuda")
          for i, lora_path in enumerate(forward_batch.lora_paths):
-             weight_indices[i] = self.buffer_id[lora_path]
+             weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)

-         batch_info = LoraBatchInfo(
+         batch_info = LoRABatchInfo(
              bs=bs,
              seg_lens=seg_lens,
              seg_indptr=seg_indptr,
@@ -346,16 +154,40 @@ class LoRAManager:
          # call set_lora_info for each lora modules
          for module_name, module in self.lora_modules:
              layer_id = get_layer_id(module_name)
-
              if "qkv_proj" not in module_name:
-                 weight_name = self.get_weight_name(module_name, 0)
+                 weight_name = get_weight_name(
+                     module_name, self.lora_weight_names, LoRAType.LORA_A
+                 )
                  module.set_lora_info(
-                     self.A_buffer[weight_name][layer_id],
-                     self.B_buffer[weight_name][layer_id],
+                     self.memory_pool.get_tensor(weight_name, layer_id, LoRAType.LORA_A),
+                     self.memory_pool.get_tensor(weight_name, layer_id, LoRAType.LORA_B),
                  )
              else:
                  module.set_lora_info(
-                     self.A_buffer["qkv_proj"][layer_id],
-                     self.B_buffer["q_proj"][layer_id],
-                     self.B_buffer["kv_proj"][layer_id],
+                     self.memory_pool.get_tensor("qkv_proj", layer_id, LoRAType.LORA_A),
+                     self.memory_pool.get_tensor("q_proj", layer_id, LoRAType.LORA_B),
+                     self.memory_pool.get_tensor("kv_proj", layer_id, LoRAType.LORA_B),
+                 )
+
+     def set_lora_module(self, module_name, module):
+         lora_module = get_lora_layer(
+             module, self.max_lora_dim, self.scaling, self.lora_backend
+         )
+         replace_submodule(self.base_model, module_name, lora_module)
+         return lora_module
+
+     def convert_to_lora_layers(self):
+         # Target module names of customized layers defined in python/sglang/srt/layers
+         # e.g., {"qkv_proj", "o_proj"}
+         customized_target_names = get_customized_names_from_hf_names(
+             self.hf_target_names, self.base_model
+         )
+
+         # Monkey patch to use the LoRA version layers
+         self.lora_modules: List[Tuple[str, torch.nn.Module]] = []
+         for module_name, module in self.base_model.named_modules():
+             # The module should be converted if it is included in target_names
+             if module_name.split(".")[-1] in customized_target_names:
+                 self.lora_modules.append(
+                     (module_name, self.set_lora_module(module_name, module))
                  )
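
The refactored LoRAManager injects LoRA by walking the base model's modules and swapping each matching submodule for a LoRA-aware layer (see convert_to_lora_layers / set_lora_module above). Below is a minimal, self-contained sketch of that replacement pattern; ToyLoRALinear and the local replace_submodule helper are illustrative stand-ins, not sglang's get_lora_layer or sglang.srt.utils.replace_submodule.

# Sketch of the submodule-swap pattern used by convert_to_lora_layers (illustrative only;
# ToyLoRALinear and replace_submodule here are stand-ins, not sglang APIs).
import torch
import torch.nn as nn


class ToyLoRALinear(nn.Module):
    """Wrap a Linear layer and add a rank-r LoRA path on top of the base weight."""

    def __init__(self, base: nn.Linear, r: int = 8, scaling: float = 1.0):
        super().__init__()
        self.base = base
        self.lora_a = nn.Parameter(torch.zeros(r, base.in_features))
        self.lora_b = nn.Parameter(torch.zeros(base.out_features, r))
        self.scaling = scaling

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + self.scaling * (x @ self.lora_a.T @ self.lora_b.T)


def replace_submodule(model: nn.Module, name: str, new_module: nn.Module) -> nn.Module:
    # Swap a child module in place, addressed by its dotted name from named_modules().
    parent = model.get_submodule(name.rsplit(".", 1)[0]) if "." in name else model
    setattr(parent, name.rsplit(".", 1)[-1], new_module)
    return new_module


model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 16))
target_names = {"0", "2"}  # plays the role of customized_target_names in the diff

lora_modules = []
for module_name, module in list(model.named_modules()):
    if module_name.split(".")[-1] in target_names and isinstance(module, nn.Linear):
        lora_modules.append(
            (module_name, replace_submodule(model, module_name, ToyLoRALinear(module)))
        )

print([name for name, _ in lora_modules])  # ['0', '2']

Because the wrapper keeps the original module as a child, the base weights stay loaded and only the low-rank path changes per adapter.
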
--- /dev/null
+++ b/sglang/srt/lora/mem_pool.py
@@ -0,0 +1,174 @@
+ from typing import Dict, List, Optional, Set, Tuple
+
+ import torch
+
+ from sglang.srt.hf_transformers_utils import AutoConfig
+ from sglang.srt.lora.lora import LoRAAdapter
+ from sglang.srt.lora.utils import (
+     LoRAType,
+     get_hidden_dim,
+     get_stacked_multiply,
+     get_weight_name,
+ )
+
+
+ class LoRAMemoryPool:
+     """Class for memory pool management of lora modules"""
+
+     def __init__(
+         self,
+         base_hf_config: AutoConfig,
+         max_loras_per_batch: int,
+         max_lora_dim: int,
+         dtype: torch.dtype,
+     ):
+
+         self.base_hf_config: AutoConfig = base_hf_config
+         self.num_layer: int = base_hf_config.num_hidden_layers
+         self.max_loras_per_batch: int = max_loras_per_batch
+         self.max_lora_dim: int = max_lora_dim
+         self.dtype: torch.dtype = dtype
+
+         # Both A_buffer and B_buffer maps lora weight names to its buffer space.
+         # A_buffer contains num_layer number of row-major tensors with shape
+         # (max_loras_per_batch, stacked_num * max_lora_dim, input_dim)
+         # B_buffer contains num_layer number of column-major tensors with shape
+         # (stacked_num, max_loras_per_batch, output_dim, max_lora_dim)
+         self.A_buffer: Dict[str, List[torch.Tensor]] = {}
+         self.B_buffer: Dict[str, List[torch.Tensor]] = {}
+
+         # Lora uid -> buffer idx in memory pool
+         self.uid_to_buffer_id: Dict[Optional[str], int] = {}
+
+         # Buffer idx -> lora uid in memory pool
+         # All uids are initalized as empty strings for empty buffer slots
+         # Here we don't initalize to None since None is a valid uid
+         self.buffer_id_to_uid: List[Optional[str]] = [""] * self.max_loras_per_batch
+
+     def init_buffers(
+         self,
+         lora_weight_names: Set[Tuple[str]],
+         base_model: torch.nn.Module,
+     ):
+
+         # lora_weight_names is a set of name pairs indicating each pair of lora modules to load
+         # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj"), ("o_proj", "o_proj")}
+         self.lora_weight_names: Set[Tuple[str]] = lora_weight_names
+
+         for module_A, module_B in lora_weight_names:
+             # Init A tensor, column_major=False
+             input_dim, _ = get_hidden_dim(module_A, self.base_hf_config, base_model)
+             c = get_stacked_multiply(module_A)
+             if module_A not in self.A_buffer:
+                 self.A_buffer[module_A] = [
+                     torch.empty(
+                         (
+                             self.max_loras_per_batch,
+                             self.max_lora_dim * c,
+                             input_dim,
+                         ),
+                         dtype=self.dtype,
+                         device="cuda",
+                     )
+                     for i in range(self.num_layer)
+                 ]
+
+             # Init B tensor, column_major=True
+             _, output_dim = get_hidden_dim(module_B, self.base_hf_config, base_model)
+             c = get_stacked_multiply(module_B)
+             if module_B not in self.B_buffer:
+                 self.B_buffer[module_B] = [
+                     torch.empty(
+                         (
+                             c,  # stacked lora_b modules might need separation
+                             self.max_loras_per_batch,
+                             output_dim,
+                             self.max_lora_dim,
+                         ),
+                         dtype=self.dtype,
+                         device="cuda",
+                     )
+                     for i in range(self.num_layer)
+                 ]
+
+     def prepare_lora_batch(
+         self,
+         cur_uids: Set[Optional[str]],
+         lora_adapters: Dict[str, LoRAAdapter],
+     ):
+
+         def get_available_buffer_slot():
+             for buffer_id in range(self.max_loras_per_batch):
+                 # Prioritize empty slots
+                 if self.buffer_id_to_uid[buffer_id] == "":
+                     return buffer_id, ""
+
+             for buffer_id in range(self.max_loras_per_batch):
+                 # Evict unneeded lora
+                 if self.buffer_id_to_uid[buffer_id] not in cur_uids:
+                     return buffer_id, self.buffer_id_to_uid[buffer_id]
+
+             raise ValueError(
+                 "No available buffer slots found. Please ensure the number of active loras is less than max_loras_per_batch."
+             )
+
+         for uid in cur_uids:
+             if uid not in self.uid_to_buffer_id:
+                 buffer_id, evicted_lora_uid = get_available_buffer_slot()
+                 if evicted_lora_uid != "":
+                     self.uid_to_buffer_id.pop(evicted_lora_uid)
+                 self.load_lora_weight_to_buffer(
+                     uid, buffer_id, lora_adapters.get(uid, None)
+                 )
+                 self.uid_to_buffer_id[uid] = buffer_id
+                 self.buffer_id_to_uid[buffer_id] = uid
+
+     def load_lora_weight_to_buffer(
+         self, uid: str, buffer_id: int, lora_adapter: LoRAAdapter = None
+     ):
+
+         if uid is None:
+             for i in range(self.num_layer):
+                 for k in self.A_buffer.keys():
+                     self.A_buffer[k][i][buffer_id] *= 0
+             return
+
+         assert lora_adapter is not None
+         for layer_id in range(self.num_layer):
+             layer_weights = lora_adapter.layers[layer_id].weights
+             for name, weights in layer_weights.items():
+                 if "lora_A" in name:
+                     lora_weight_name = get_weight_name(
+                         name, self.lora_weight_names, LoRAType.LORA_A
+                     )
+                     if lora_weight_name:
+                         self.A_buffer[lora_weight_name][layer_id][buffer_id].copy_(
+                             weights
+                         )
+                 else:
+                     lora_weight_name = get_weight_name(
+                         name, self.lora_weight_names, LoRAType.LORA_B
+                     )
+                     if lora_weight_name:
+                         c = get_stacked_multiply(lora_weight_name)
+                         if c > 1:
+                             for stacked_id in range(c):
+                                 self.B_buffer[lora_weight_name][layer_id][stacked_id][
+                                     buffer_id
+                                 ].copy_(weights[stacked_id])
+                         else:
+                             self.B_buffer[lora_weight_name][layer_id][0][
+                                 buffer_id
+                             ].copy_(weights)
+
+     def get_tensor(
+         self, weight_name: str, layer_id: int, lora_type: LoRAType
+     ) -> torch.Tensor:
+
+         if lora_type == LoRAType.LORA_A:
+             return self.A_buffer[weight_name][layer_id]
+
+         return self.B_buffer[weight_name][layer_id]
+
+     def get_buffer_id(self, lora_uid: str):
+         return self.uid_to_buffer_id[lora_uid]
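
The new mem_pool.py centralizes what LoRAManager previously did inline: a fixed set of max_loras_per_batch buffer slots, where prepare_lora_batch first fills empty slots (marked by the empty string) and otherwise evicts a slot whose adapter is not needed by the current batch. Here is a standalone sketch of that slot policy, with plain dict/list bookkeeping in place of the CUDA buffers and the weight-loading step simulated; it is an illustration, not the sglang class itself.

# Standalone illustration of the slot-allocation policy in LoRAMemoryPool.prepare_lora_batch.
from typing import Dict, List, Optional, Set

MAX_SLOTS = 2
uid_to_buffer_id: Dict[Optional[str], int] = {}
buffer_id_to_uid: List[Optional[str]] = [""] * MAX_SLOTS  # "" marks an empty slot


def get_available_buffer_slot(cur_uids: Set[Optional[str]]):
    for buffer_id in range(MAX_SLOTS):
        if buffer_id_to_uid[buffer_id] == "":  # prioritize empty slots
            return buffer_id, ""
    for buffer_id in range(MAX_SLOTS):
        if buffer_id_to_uid[buffer_id] not in cur_uids:  # evict an unneeded adapter
            return buffer_id, buffer_id_to_uid[buffer_id]
    raise ValueError("number of active loras must stay below max_loras_per_batch")


def prepare_lora_batch(cur_uids: Set[Optional[str]]):
    for uid in cur_uids:
        if uid not in uid_to_buffer_id:
            buffer_id, evicted_uid = get_available_buffer_slot(cur_uids)
            if evicted_uid != "":
                uid_to_buffer_id.pop(evicted_uid)
            # The real class calls load_lora_weight_to_buffer(uid, buffer_id, adapter) here.
            uid_to_buffer_id[uid] = buffer_id
            buffer_id_to_uid[buffer_id] = uid


prepare_lora_batch({"adapter_a", "adapter_b"})
prepare_lora_batch({"adapter_c"})  # evicts one of the first two adapters
print(uid_to_buffer_id, buffer_id_to_uid)
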
--- a/sglang/srt/lora/triton_ops/__init__.py
+++ b/sglang/srt/lora/triton_ops/__init__.py
@@ -1,5 +1,11 @@
+ from .gate_up_lora_b import gate_up_lora_b_fwd
  from .qkv_lora_b import qkv_lora_b_fwd
  from .sgemm_lora_a import sgemm_lora_a_fwd
  from .sgemm_lora_b import sgemm_lora_b_fwd

- __all__ = ["qkv_lora_b_fwd", "sgemm_lora_a_fwd", "sgemm_lora_b_fwd"]
+ __all__ = [
+     "gate_up_lora_b_fwd",
+     "qkv_lora_b_fwd",
+     "sgemm_lora_a_fwd",
+     "sgemm_lora_b_fwd",
+ ]
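
With this change, the new gate/up LoRA-B kernel is re-exported from the package namespace alongside the existing kernels, so downstream code can import all four from one place (import only; the kernel's call signature is not part of this diff):

from sglang.srt.lora.triton_ops import (
    gate_up_lora_b_fwd,
    qkv_lora_b_fwd,
    sgemm_lora_a_fwd,
    sgemm_lora_b_fwd,
)
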