sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those package versions.
- sglang/bench_one_batch.py +113 -17
- sglang/srt/configs/model_config.py +35 -0
- sglang/srt/conversation.py +9 -5
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +6 -1
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
- sglang/srt/disaggregation/mooncake/conn.py +243 -135
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/distributed/parallel_state.py +11 -9
- sglang/srt/entrypoints/context.py +244 -0
- sglang/srt/entrypoints/engine.py +4 -3
- sglang/srt/entrypoints/harmony_utils.py +370 -0
- sglang/srt/entrypoints/http_server.py +71 -0
- sglang/srt/entrypoints/openai/protocol.py +227 -1
- sglang/srt/entrypoints/openai/serving_chat.py +278 -42
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +174 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/harmony_tool_parser.py +130 -0
- sglang/srt/hf_transformers_utils.py +30 -3
- sglang/srt/jinja_template_utils.py +8 -1
- sglang/srt/layers/attention/aiter_backend.py +5 -8
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/vision.py +13 -5
- sglang/srt/layers/communicator.py +21 -4
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/linear.py +2 -7
- sglang/srt/layers/moe/cutlass_moe.py +20 -6
- sglang/srt/layers/moe/ep_moe/layer.py +77 -73
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +416 -35
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
- sglang/srt/layers/moe/topk.py +12 -3
- sglang/srt/layers/moe/utils.py +16 -0
- sglang/srt/layers/quantization/__init__.py +22 -0
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +3 -6
- sglang/srt/layers/quantization/fp8_utils.py +29 -0
- sglang/srt/layers/quantization/modelopt_quant.py +259 -64
- sglang/srt/layers/quantization/mxfp4.py +651 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/__init__.py +0 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +1 -1
- sglang/srt/layers/rotary_embedding.py +225 -1
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +8 -3
- sglang/srt/lora/lora_manager.py +70 -14
- sglang/srt/lora/lora_registry.py +3 -2
- sglang/srt/lora/mem_pool.py +43 -5
- sglang/srt/managers/cache_controller.py +55 -30
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +15 -3
- sglang/srt/managers/mm_utils.py +5 -11
- sglang/srt/managers/schedule_batch.py +28 -7
- sglang/srt/managers/scheduler.py +26 -12
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +24 -6
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/mem_cache/hiradix_cache.py +53 -5
- sglang/srt/mem_cache/memory_pool_host.py +1 -1
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +7 -6
- sglang/srt/model_executor/forward_batch_info.py +35 -14
- sglang/srt/model_executor/model_runner.py +19 -2
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +72 -33
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma3n_mm.py +39 -0
- sglang/srt/models/glm4_moe.py +24 -12
- sglang/srt/models/gpt_oss.py +1134 -0
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_moe.py +6 -0
- sglang/srt/models/qwen3_moe.py +32 -6
- sglang/srt/models/step3_vl.py +9 -0
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/reasoning_parser.py +18 -39
- sglang/srt/server_args.py +142 -7
- sglang/srt/two_batch_overlap.py +157 -5
- sglang/srt/utils.py +38 -2
- sglang/test/runners.py +2 -2
- sglang/test/test_utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +16 -14
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +105 -84
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/layers/rotary_embedding.py
CHANGED
@@ -1172,6 +1172,202 @@ class MRotaryEmbedding(RotaryEmbedding):
         )
 
 
+class DualChunkRotaryEmbedding(CustomOp):
+    """Rotary positional embedding for Dual Chunk Attention."""
+
+    def __init__(
+        self,
+        head_size: int,
+        rotary_dim: int,
+        max_position_embeddings: int,
+        base: int,
+        is_neox_style: bool,
+        dtype: torch.dtype,
+        chunk_size: int,
+        local_size: int,
+    ) -> None:
+        super().__init__()
+        self.head_size = head_size
+        self.rotary_dim = rotary_dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        self.is_neox_style = is_neox_style
+        self.chunk_size = chunk_size
+        self.local_size = local_size
+        self.dtype = dtype
+        self.device = torch.device(f"cuda:{torch.cuda.current_device()}")
+        (q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache) = (
+            self._compute_cos_sin_cache()
+        )
+
+        self.register_buffer("cos_sin_q_cache", q_cache, persistent=False)
+        self.register_buffer("cos_sin_qc_cache", qc_cache, persistent=False)
+        self.register_buffer("cos_sin_k_cache", k_cache, persistent=False)
+        self.register_buffer(
+            "cos_sin_qc_no_clamp_cache", qc_no_clamp_cache, persistent=False
+        )
+        self.register_buffer("cos_sin_q_inter_cache", q_inter_cache, persistent=False)
+
+    def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor:
+        """Compute the inverse frequency."""
+        # NOTE(woosuk): The HF implementation uses `torch.arange(...).float()`.
+        # However, we use `torch.arange(..., dtype=torch.float)` instead to
+        # avoid numerical issues with large base values (e.g., 10000000).
+        # This may cause a slight numerical difference between the HF
+        # implementation and ours.
+        # NOTE(woosuk): To exactly match the HF implementation, we need to
+        # use CPU to compute the cache and then move it to GPU. However, we
+        # create the cache on GPU for faster initialization. This may cause
+        # a slight numerical difference between the HF implementation and ours.
+        inv_freq = 1.0 / (
+            base
+            ** (
+                torch.arange(0, self.rotary_dim, 2, dtype=torch.float) / self.rotary_dim
+            )
+        )
+        return inv_freq
+
+    def _compute_cos_sin_cache(self) -> torch.Tensor:
+        """Compute the cos and sin cache."""
+        inv_freq = self._compute_inv_freq(self.base)
+        chunk_len = self.chunk_size - self.local_size
+        q_t = torch.arange(chunk_len, dtype=torch.float)
+        qc_t = (torch.arange(chunk_len, dtype=torch.float) + chunk_len).clamp(
+            max=self.chunk_size
+        )
+        k_t = torch.arange(self.max_position_embeddings, dtype=torch.float) % chunk_len
+
+        # count from chunk_len, no clamp(self.chunk_size) restriction
+        qc_no_clamp_t = torch.arange(chunk_len, dtype=torch.float) + chunk_len
+        # count from self.chunk_size for q_inter's rope
+        q_inter_t = torch.arange(chunk_len, dtype=torch.float) + self.chunk_size
+
+        q_freqs = torch.outer(q_t, inv_freq)
+        qc_freqs = torch.outer(qc_t, inv_freq)
+        k_freqs = torch.outer(k_t, inv_freq)
+        qc_no_clamp_freqs = torch.outer(qc_no_clamp_t, inv_freq)
+        q_inter_freqs = torch.outer(q_inter_t, inv_freq)
+
+        q_cos = q_freqs.cos()
+        q_sin = q_freqs.sin()
+        qc_cos = qc_freqs.cos()
+        qc_sin = qc_freqs.sin()
+        k_cos = k_freqs.cos()
+        k_sin = k_freqs.sin()
+
+        qc_no_clamp_cos = qc_no_clamp_freqs.cos()
+        qc_no_clamp_sin = qc_no_clamp_freqs.sin()
+        q_inter_cos = q_inter_freqs.cos()
+        q_inter_sin = q_inter_freqs.sin()
+
+        q_cache = torch.cat((q_cos, q_sin), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        qc_cache = torch.cat((qc_cos, qc_sin), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        k_cache = torch.cat((k_cos, k_sin), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        qc_no_clamp_cache = torch.cat((qc_no_clamp_cos, qc_no_clamp_sin), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        q_inter_cache = torch.cat((q_inter_cos, q_inter_sin), dim=-1).to(
+            dtype=self.dtype, device=self.device
+        )
+        return q_cache, qc_cache, k_cache, qc_no_clamp_cache, q_inter_cache
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        offsets: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        query = query.view(*query.shape[:-1], -1, self.head_size)
+        key = key.view(*key.shape[:-1], -1, self.head_size)
+        query_rot = query[..., : self.rotary_dim]
+        key_rot = key[..., : self.rotary_dim]
+        if self.rotary_dim < self.head_size:
+            query_pass = query[..., self.rotary_dim :]
+            key_pass = key[..., self.rotary_dim :]
+        else:
+            query_pass = None
+            key_pass = None
+
+        positions_with_offsets = (
+            torch.add(positions, offsets) if offsets is not None else positions
+        )
+        key = self._apply_rotary_embedding(
+            self.cos_sin_k_cache[positions_with_offsets], key_rot, key_pass
+        )
+        chunk_len = self.chunk_size - self.local_size
+        query = self._apply_rotary_embedding(
+            self.cos_sin_q_cache[positions_with_offsets % chunk_len],
+            query_rot,
+            query_pass,
+        )
+        query_succ = self._apply_rotary_embedding(
+            self.cos_sin_qc_cache[positions_with_offsets % chunk_len],
+            query_rot,
+            query_pass,
+        )
+        query_inter = self._apply_rotary_embedding(
+            self.cos_sin_qc_cache[chunk_len - 1].repeat(positions.shape[0], 1),
+            query_rot,
+            query_pass,
+        )
+        query_succ_critical = self._apply_rotary_embedding(
+            self.cos_sin_qc_no_clamp_cache[positions_with_offsets % chunk_len],
+            query_rot,
+            query_pass,
+        )
+        query_inter_critical = self._apply_rotary_embedding(
+            self.cos_sin_q_inter_cache[positions_with_offsets % chunk_len],
+            query_rot,
+            query_pass,
+        )
+
+        # merge query into one tensor to simplify the interfaces
+        query = torch.cat(
+            (
+                query,
+                query_succ,
+                query_inter,
+                query_succ_critical,
+                query_inter_critical,
+            ),
+            dim=-1,
+        )
+        return query, key
+
+    def _apply_rotary_embedding(self, cos_sin, hidden_rot, hidden_pass):
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        if self.is_neox_style:
+            # NOTE(woosuk): Here we assume that the positions tensor has the
+            # shape [batch_size, seq_len].
+            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
+            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
+        else:
+            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
+            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
+        rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj
+        hidden_rot = hidden_rot * cos + rotate_fn(hidden_rot) * sin
+
+        if self.rotary_dim < self.head_size:
+            hidden = torch.cat((hidden_rot, hidden_pass), dim=-1)
+        else:
+            hidden = hidden_rot
+        return hidden.flatten(-2).squeeze(0)
+
+    def extra_repr(self) -> str:
+        s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}"
+        s += f", max_position_embeddings={self.max_position_embeddings}"
+        s += f", base={self.base}, is_neox_style={self.is_neox_style}"
+        s += f", chunk_size={self.chunk_size}, local_size={self.local_size}"
+        return s
+
+
 _ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {}
 
 
@@ -1184,6 +1380,7 @@ def get_rope(
     rope_scaling: Optional[Dict[str, Any]] = None,
     dtype: Optional[torch.dtype] = None,
     partial_rotary_factor: float = 1.0,
+    dual_chunk_attention_config: Optional[Dict[str, Any]] = None,
 ) -> RotaryEmbedding:
     if dtype is None:
         dtype = torch.get_default_dtype()
@@ -1195,6 +1392,17 @@ def get_rope(
         rope_scaling_args = tuple(rope_scaling_tuple.items())
     else:
         rope_scaling_args = None
+
+    if dual_chunk_attention_config is not None:
+        dual_chunk_attention_tuple = {
+            k: tuple(v) if isinstance(v, list) else v
+            for k, v in dual_chunk_attention_config.items()
+            if k != "sparse_attention_config"
+        }
+        dual_chunk_attention_args = tuple(dual_chunk_attention_tuple.items())
+    else:
+        dual_chunk_attention_args = None
+
     if partial_rotary_factor < 1.0:
         rotary_dim = int(rotary_dim * partial_rotary_factor)
     key = (
@@ -1204,12 +1412,28 @@ def get_rope(
         base,
         is_neox_style,
         rope_scaling_args,
+        dual_chunk_attention_args,
         dtype,
     )
     if key in _ROPE_DICT:
        return _ROPE_DICT[key]
 
-    if rope_scaling is None:
+    if dual_chunk_attention_config is not None:
+        extra_kwargs = {
+            k: v
+            for k, v in dual_chunk_attention_config.items()
+            if k in ("chunk_size", "local_size")
+        }
+        rotary_emb = DualChunkRotaryEmbedding(
+            head_size,
+            rotary_dim,
+            max_position,
+            base,
+            is_neox_style,
+            dtype,
+            **extra_kwargs,
+        )
+    elif rope_scaling is None:
+        rotary_emb = RotaryEmbedding(
            head_size, rotary_dim, max_position, base, is_neox_style, dtype
        )
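For orientation, here is a minimal usage sketch of the new `dual_chunk_attention_config` path in `get_rope`. The parameter names other than `dual_chunk_attention_config` are inferred from the surrounding code in the hunk above, and every numeric value is an illustrative assumption rather than a value taken from this diff:

    import torch
    from sglang.srt.layers.rotary_embedding import get_rope

    # Illustrative values only; a real model config supplies these.
    rope = get_rope(
        head_size=128,
        rotary_dim=128,
        max_position=131072,
        base=1_000_000,
        is_neox_style=True,
        dtype=torch.bfloat16,
        dual_chunk_attention_config={
            "chunk_size": 8192,   # per-chunk rope window
            "local_size": 1024,   # local region subtracted when computing chunk_len
        },
    )
    # With this config, get_rope returns a DualChunkRotaryEmbedding; its forward()
    # concatenates the intra-chunk, successor, and inter-chunk query variants along dim=-1.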
sglang/srt/layers/utils.py
CHANGED
@@ -1,5 +1,6 @@
 import logging
 import re
+from functools import lru_cache
 
 import torch
 
@@ -35,7 +36,15 @@ class PPMissingLayer(torch.nn.Identity):
         return (input,) if self.return_tuple else input
 
 
+@lru_cache(maxsize=1)
 def is_sm100_supported(device=None) -> bool:
     return (torch.cuda.get_device_capability(device)[0] == 10) and (
         torch.version.cuda >= "12.8"
     )
+
+
+@lru_cache(maxsize=1)
+def is_sm90_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 9) and (
+        torch.version.cuda >= "12.3"
+    )
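A small sketch of how these capability helpers can be used for kernel dispatch; the backend names below are placeholders, not identifiers from this diff:

    from sglang.srt.layers.utils import is_sm90_supported, is_sm100_supported

    def pick_attention_backend() -> str:
        # Placeholder backend names for illustration.
        if is_sm100_supported():   # SM 10.x (Blackwell) and CUDA >= 12.8
            return "sm100-optimized"
        if is_sm90_supported():    # SM 9.x (Hopper) and CUDA >= 12.3
            return "sm90-optimized"
        return "generic"

Because both helpers are now wrapped in `@lru_cache(maxsize=1)`, the device-capability query runs once and repeated calls return the cached result.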
sglang/srt/layers/vocab_parallel_embedding.py
CHANGED
@@ -26,7 +26,12 @@ from sglang.srt.layers.quantization.base_config import (
     method_has_implemented_embedding,
 )
 from sglang.srt.layers.quantization.unquant import UnquantizedEmbeddingMethod
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    cpu_has_amx_support,
+    get_compiler_backend,
+    is_cpu,
+    set_weight_attrs,
+)
 
 DEFAULT_VOCAB_PADDING_SIZE = 64
 
@@ -117,7 +122,7 @@ class VocabParallelEmbeddingShardIndices:
         assert self.num_added_elements <= self.num_added_elements_padded
 
 
-@torch.
+@torch.compile(dynamic=True, backend=get_compiler_backend())
 def get_masked_input_and_mask(
     input_: torch.Tensor,
     org_vocab_start_index: int,
@@ -126,7 +131,7 @@ def get_masked_input_and_mask(
     added_vocab_start_index: int,
     added_vocab_end_index: int,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
-    # torch.
+    # torch.compile will fuse all of the pointwise ops below
     # into a single kernel, making it very fast
     org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index)
     added_vocab_mask = (input_ >= added_vocab_start_index) & (
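The decorator change compiles the masking helper so its element-wise ops fuse into one kernel. A standalone sketch of the same pattern, under the assumption that `get_compiler_backend` (imported above) returns a valid `torch.compile` backend; the function itself is hypothetical:

    import torch

    from sglang.srt.utils import get_compiler_backend

    @torch.compile(dynamic=True, backend=get_compiler_backend())
    def clamp_to_vocab(input_: torch.Tensor, start: int, end: int) -> torch.Tensor:
        # Pointwise ops like these are fused into a single kernel by torch.compile.
        mask = (input_ >= start) & (input_ < end)
        return torch.where(mask, input_ - start, torch.zeros_like(input_))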
sglang/srt/lora/lora_manager.py
CHANGED
@@ -144,6 +144,7 @@ class LoRAManager:
 
             # keep metadata for displayed messages
             self.lora_refs[lora_ref.lora_id] = lora_ref
+            self.num_pinned_loras += int(lora_ref.pinned)
         except Exception as e:
             return self.create_lora_update_result(
                 success=False,
@@ -157,13 +158,22 @@ class LoRAManager:
         Validate if an adapter can be loaded into the current LoRA memory pool and generate error if it is incompatible.
         """
 
+        # Check if the LoRA adapter shape is compatible with the current LoRA memory pool configuration.
         memory_pool = getattr(self, "memory_pool", None)
         incompatible = memory_pool and not memory_pool.can_support(lora_config)
         if incompatible:
             raise ValueError(
-                f"LoRA adapter {lora_ref.lora_name} with rank {lora_config.r} is incompatible with the current
-                "Please ensure that the LoRA adapter's rank is within the configured
-                "included in `--
+                f"LoRA adapter {lora_ref.lora_name} with rank {lora_config.r} is incompatible with the current "
+                "LoRA memory pool configuration. Please ensure that the LoRA adapter's rank is within the configured "
+                "`--max-lora-rank` and that the target modules are included in `--lora-target-modules`."
+            )
+
+        # Ensure pinned LoRA adapters does not exceed maximal limit or cause starvation.
+        if lora_ref.pinned and self.num_pinned_loras >= self.max_loras_per_batch - 1:
+            raise ValueError(
+                f"Failed to load LoRA adapter {lora_ref.lora_name} as a pinned adapter. It is not allowed to pin all slots "
+                "in the LoRA memory pool to avoid starvation for unpinned adapters and base models. Please increase your "
+                "`--max-loras-per-batch` or load it as unpinned LoRA adapters."
             )
 
     def unload_lora_adapter(self, lora_ref: LoRARef) -> LoRAUpdateResult:
@@ -172,15 +182,17 @@ class LoRAManager:
         delete the corresponding LoRA modules.
         """
 
-        adapter = self.configs.get(lora_ref.lora_id
+        adapter = self.configs.get(lora_ref.lora_id)
+        lora_ref = self.lora_refs.get(lora_ref.lora_id)
         assert (
-            adapter is not None
+            adapter is not None and lora_ref is not None
         ), f"LoRA adapter with ID {lora_ref.lora_id} is not loaded. This should have been verified before request is sent to the backend."
 
         try:
             del self.configs[lora_ref.lora_id]
             del self.loras[lora_ref.lora_id]
             del self.lora_refs[lora_ref.lora_id]
+            self.num_pinned_loras -= int(lora_ref.pinned)
         except Exception as e:
             return self.create_lora_update_result(
                 success=False,
@@ -189,15 +201,49 @@ class LoRAManager:
 
         return self.create_lora_update_result(success=True)
 
+    def validate_lora_batch(self, lora_ids: set[str]) -> bool:
+        """
+        Validate if the LoRA IDs in the batch can be loaded into the current LoRA memory pool.
+        """
+        if len(lora_ids) > self.max_loras_per_batch:
+            return False
+
+        # skip pinned LoRA check if no pinned LoRA adapters are loaded.
+        if self.num_pinned_loras == 0:
+            return True
+
+        # counting the number of pinned LoRA adapters in the batch.
+        pinned_loras_in_batch = 0
+        for lora_id in lora_ids:
+            if lora_id is not None:
+                lora_ref = self.lora_refs.get(lora_id)
+                assert (
+                    lora_ref is not None
+                ), f"LoRA ID {lora_id} not found in lora_refs."
+                pinned_loras_in_batch += int(lora_ref.pinned)
+
+        assert pinned_loras_in_batch <= self.num_pinned_loras, (
+            f"Number of pinned LoRA adapters in the batch ({pinned_loras_in_batch}) exceeds the total number of pinned adapters "
+            f"({self.num_pinned_loras}). This indicates a bug in the LoRA loading logic."
+        )
+
+        required_slots = len(lora_ids) - pinned_loras_in_batch
+        mem_pool_vacancy = self.memory_pool.max_loras_per_batch - self.num_pinned_loras
+
+        return required_slots <= mem_pool_vacancy
+
     def prepare_lora_batch(self, forward_batch: ForwardBatch):
+
         # Load active loras into lora memory pool
-
-
-        # should consider (1) renaming the incorrect usage within the system, and (2) deprecating the parameter name in
-        # the current API schema and introducing a better request schema in the future (e.g., use `model_name`).
-        cur_uids = set(forward_batch.lora_paths)
+        cur_uids = set(forward_batch.lora_ids)
+
         assert len(cur_uids) <= self.max_loras_per_batch
-        self.memory_pool.prepare_lora_batch(
+        self.memory_pool.prepare_lora_batch(
+            cur_uids=cur_uids,
+            lora_adapters=self.loras,
+            lora_modules=self.lora_modules,
+            lora_refs=self.lora_refs.copy(),  # copy snapshot of current lora_refs to avoid mutation during the batch preparation.
+        )
 
         # set up batch info shared by all lora modules
         bs = forward_batch.batch_size
@@ -211,10 +257,10 @@ class LoRAManager:
         Transfer adapter metadata (weight indices, LoRA rank, scalings) from host
         to device (CUDA) asynchronously.
         """
-        weight_indices = [0] * len(forward_batch.lora_paths)
+        weight_indices = [0] * len(forward_batch.lora_ids)
         lora_ranks = [0] * self.max_loras_per_batch
         scalings = [0] * self.max_loras_per_batch
-        for i, uid in enumerate(forward_batch.lora_paths):
+        for i, uid in enumerate(forward_batch.lora_ids):
             weight_indices[i] = self.memory_pool.get_buffer_id(uid)
             if uid is not None:
                 lora = self.loras[uid]
@@ -370,6 +416,9 @@ class LoRAManager:
         # Mapping from LoRA ID to LoRARef object.
         self.lora_refs: Dict[str, LoRARef] = {}
 
+        # Count of pinned LoRA adapters.
+        self.num_pinned_loras: int = 0
+
         if lora_paths:
             for lora_ref in lora_paths.values():
                 result = self.load_lora_adapter(lora_ref)
@@ -390,13 +439,20 @@ class LoRAManager:
         else:
             self.target_modules = set()
             for config in self.configs.values():
+                if not isinstance(config.target_modules, list):
+                    raise ValueError(
+                        f"SGLang currently only supports inferring LoRA target modules when a list of "
+                        "suffixes is provided in `target_modules` field of PEFT config. Please explicitly "
+                        "specify `--lora-target-modules` during server startup. You can specify `all` to "
+                        "enable all support modules types. "
+                    )
                 self.target_modules.update(config.target_modules)
 
         if max_lora_rank is not None:
             self.max_lora_rank = max_lora_rank
         else:
             self.max_lora_rank = max(
-                [x.
+                [x.r for x in self.configs.values()],
                 default=0,
             )
 
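The capacity check added in `validate_lora_batch` boils down to one inequality: the unpinned adapters requested by a batch must fit in the slots not reserved by pinned adapters. A standalone restatement with made-up numbers:

    max_loras_per_batch = 8   # total buffer slots in the memory pool
    num_pinned_loras = 3      # adapters pinned at load time (never evicted)

    lora_ids_in_batch = 6     # distinct adapters referenced by this batch
    pinned_in_batch = 2       # of those, already pinned (no new slot needed)

    required_slots = lora_ids_in_batch - pinned_in_batch   # 4
    vacancy = max_loras_per_batch - num_pinned_loras       # 5
    assert required_slots <= vacancy                       # batch is schedulable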
sglang/srt/lora/lora_registry.py
CHANGED
@@ -28,14 +28,15 @@ class LoRARef:
     """
     Reference record for a LoRA model.
 
-    This object guarantees a unique ``lora_id`` and may include ``lora_name`` and ``
-    eliminates conflicts from reused LoRA names or paths and can be used to generate deterministic cache
+    This object guarantees a unique ``lora_id`` and may include ``lora_name``, ``lora_path``, and ``pinned``.
+    The ID eliminates conflicts from reused LoRA names or paths and can be used to generate deterministic cache
     keys (e.g., radix cache).
     """
 
     lora_id: str = field(default_factory=lambda: uuid4().hex)
     lora_name: Optional[str] = None
     lora_path: Optional[str] = None
+    pinned: Optional[bool] = None
 
     def __post_init__(self):
         if self.lora_id is None:
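A minimal sketch of constructing a pinned `LoRARef`; the adapter name and path are placeholders:

    from sglang.srt.lora.lora_registry import LoRARef

    ref = LoRARef(
        lora_name="sql-adapter",          # placeholder name
        lora_path="/models/sql-adapter",  # placeholder path
        pinned=True,                      # keep this adapter resident in the pool
    )
    print(ref.lora_id)  # auto-generated uuid4 hex, usable as a deterministic cache key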
sglang/srt/lora/mem_pool.py
CHANGED
@@ -1,3 +1,4 @@
+import logging
 from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
 
 import torch
@@ -7,6 +8,7 @@ from sglang.srt.hf_transformers_utils import AutoConfig
 from sglang.srt.lora.layers import BaseLayerWithLoRA
 from sglang.srt.lora.lora import LoRAAdapter
 from sglang.srt.lora.lora_config import LoRAConfig
+from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.lora.utils import (
     ROW_PARALLELISM_LINEAR_LORA_NAMES,
     LoRAType,
@@ -16,6 +18,28 @@ from sglang.srt.lora.utils import (
     get_weight_name,
 )
 
+logger = logging.getLogger(__name__)
+
+
+class EmptySlot:
+    """
+    Singleton class to represent an empty slot in the memory pool.
+    This is used to improve readability by not using special str as a placeholder.
+    """
+
+    __slots__ = ()
+
+    def __repr__(self):
+        return "|EMPTY|"
+
+    def __new__(cls):
+        if not hasattr(cls, "_instance"):
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+
+EMPTY_SLOT = EmptySlot()
+
 
 class LoRAMemoryPool:
     """Class for memory pool management of lora modules"""
@@ -54,9 +78,11 @@ class LoRAMemoryPool:
         self.uid_to_buffer_id: Dict[Optional[str], int] = {}
 
         # Buffer idx -> lora uid in memory pool
-        # All uids are initialized as
+        # All uids are initialized as `EmptySlot` for empty buffer slots
         # Here we don't initialize to None since None is a valid uid
-        self.buffer_id_to_uid: List[
+        self.buffer_id_to_uid: List[Union[str, None, EmptySlot]] = [
+            EMPTY_SLOT
+        ] * self.max_loras_per_batch
 
         self.init_buffers(base_model)
 
@@ -154,17 +180,29 @@ class LoRAMemoryPool:
         cur_uids: Set[Optional[str]],
         lora_adapters: Dict[str, LoRAAdapter],
         lora_modules: List[Dict[str, BaseLayerWithLoRA]],
+        lora_refs: Dict[str, LoRARef],
    ):
         def get_available_buffer_slot():
             for buffer_id in range(self.max_loras_per_batch):
                 # Prioritize empty slots
-                if self.buffer_id_to_uid[buffer_id] ==
+                if self.buffer_id_to_uid[buffer_id] == EMPTY_SLOT:
                     return buffer_id
 
             for buffer_id in range(self.max_loras_per_batch):
+                uid = self.buffer_id_to_uid[buffer_id]
+
                 # Evict unneeded lora
-                if
-
+                if uid not in cur_uids:
+                    # Skip pinned LoRAs
+                    # TODO (lifuhuang): we might consider supporting pinning base model (uid == None) in the future.
+                    if uid is not None:
+                        lora_ref = lora_refs.get(uid)
+                        if lora_ref is not None and lora_ref.pinned:
+                            continue
+
+                    self.uid_to_buffer_id.pop(uid)
+                    logger.debug(f"Evicting LoRA {uid} from buffer slot {buffer_id}.")
+                    self.buffer_id_to_uid[buffer_id] = EMPTY_SLOT
                     return buffer_id
 
             raise ValueError(
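Since `EmptySlot.__new__` always returns the same instance, the module-level `EMPTY_SLOT` sentinel can be compared with `==` or `is` interchangeably; a short sketch:

    from sglang.srt.lora.mem_pool import EMPTY_SLOT, EmptySlot

    slot_a = EmptySlot()
    slot_b = EmptySlot()
    assert slot_a is slot_b       # singleton: __new__ caches the first instance
    assert slot_a is EMPTY_SLOT   # the module-level sentinel is that same object
    print(slot_a)                 # "|EMPTY|" via __repr__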