sglang 0.4.3.post1__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
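For reference, a diff like this can be reproduced locally from the two published wheels; a minimal sketch using pip and the Python standard library (paths and the external `diff` tool are illustrative, not part of the registry output):

    import subprocess
    import zipfile
    from pathlib import Path

    for version in ("0.4.3.post1", "0.4.3.post3"):
        dest = Path(f"/tmp/sglang-{version}")
        dest.mkdir(parents=True, exist_ok=True)
        # Download the published wheel without resolving dependencies.
        subprocess.run(
            ["pip", "download", f"sglang=={version}", "--no-deps", "-d", str(dest)],
            check=True,
        )
        wheel = next(dest.glob("sglang-*.whl"))
        zipfile.ZipFile(wheel).extractall(dest / "src")

    # Recursive diff of the two extracted trees (same file list as below).
    subprocess.run(["diff", "-ru", "/tmp/sglang-0.4.3.post1/src", "/tmp/sglang-0.4.3.post3/src"])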
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +208 -295
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +9 -6
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +143 -6
- sglang/srt/managers/schedule_batch.py +238 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +681 -259
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +224 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +44 -18
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +94 -36
- sglang/srt/model_executor/cuda_graph_runner.py +55 -24
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +209 -28
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -29
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +136 -52
- sglang/srt/speculative/build_eagle_tree.py +2 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
- sglang/srt/speculative/eagle_utils.py +92 -58
- sglang/srt/speculative/eagle_worker.py +186 -94
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +214 -166
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/srt/layers/vocab_parallel_embedding.py
CHANGED
@@ -261,26 +261,27 @@ class VocabParallelEmbedding(torch.nn.Module):
         )
         self.embedding_dim = embedding_dim

-
+        quant_method = None
         if quant_config is not None:
-
-
-
+            quant_method = quant_config.get_quant_method(self, prefix=prefix)
+            print("quant_method", quant_method)
+        if quant_method is None:
+            quant_method = UnquantizedEmbeddingMethod()

         # If we are making an embedding layer, then our quantization linear
         # method must implement the embedding operation. If we are another
         # layer type like ParallelLMHead, this is not important.
         is_embedding_layer = type(self.__class__) is VocabParallelEmbedding
-
-            type(
+        quant_method_implements_embedding = method_has_implemented_embedding(
+            type(quant_method)
         )
-        if is_embedding_layer and not
+        if is_embedding_layer and not quant_method_implements_embedding:
             raise NotImplementedError(
-                f"The class {type(
+                f"The class {type(quant_method).__name__} must implement "
                 "the 'embedding' method, see UnquantizedEmbeddingMethod."
             )

-        self.
+        self.quant_method: QuantizeMethodBase = quant_method

         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
@@ -301,7 +302,7 @@ class VocabParallelEmbedding(torch.nn.Module):
             - self.shard_indices.added_vocab_start_index
         )

-        self.
+        self.quant_method.create_weights(
             self,
             self.embedding_dim,
             [self.num_embeddings_per_partition],
@@ -446,7 +447,7 @@ class VocabParallelEmbedding(torch.nn.Module):
             packed_factor = (
                 param.packed_factor
                 if isinstance(param, BasevLLMParameter)
-                else param.
+                else param.packed_factor
             )
             assert loaded_weight.shape[output_dim] == (
                 self.org_vocab_size // param.packed_factor
@@ -457,7 +458,7 @@ class VocabParallelEmbedding(torch.nn.Module):
             assert loaded_weight.shape[output_dim] == (
                 self.org_vocab_size
                 // (self.tp_size if self.use_presharded_weights else 1)
-            )
+            ), f"{self.org_vocab_size=} {self.use_presharded_weights=} {loaded_weight.shape[output_dim]=}"

             # Copy the data.
             if not self.use_presharded_weights:
@@ -479,7 +480,7 @@ class VocabParallelEmbedding(torch.nn.Module):
         else:
             masked_input = input_
         # Get the embeddings.
-        output_parallel = self.
+        output_parallel = self.quant_method.embedding(self, masked_input.long())
         # Mask the output embedding.
         if self.tp_size > 1:
             output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0)
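The hunks above resolve a quantization method with an unquantized fallback, then verify that the resolved class really overrides the base `embedding` method. A minimal standalone sketch of that override check (class names here are hypothetical; the real `method_has_implemented_embedding` may be implemented differently):

    class BaseQuantMethod:
        def embedding(self, layer, input_):
            raise NotImplementedError

    class UnquantizedMethod(BaseQuantMethod):
        def embedding(self, layer, input_):
            return input_  # stand-in for the real embedding lookup

    class WeightOnlyMethod(BaseQuantMethod):
        pass  # provides linear kernels but never overrides embedding()

    def has_implemented_embedding(method_class) -> bool:
        # True only if the class replaced the base method with its own.
        return method_class.embedding is not BaseQuantMethod.embedding

    print(has_implemented_embedding(UnquantizedMethod))  # True
    print(has_implemented_embedding(WeightOnlyMethod))   # False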
sglang/srt/lora/lora.py
CHANGED
@@ -18,6 +18,7 @@
 # LoRA layers class inheritance adapted from:
 # https://github.com/vllm-project/vllm/blob/4abf6336ec65c270343eb895e7b18786e9274176/vllm/lora/layers.py

+import logging
 import re
 from typing import Dict, List

@@ -30,6 +31,8 @@ from sglang.srt.lora.backend import BaseLoRABackend
 from sglang.srt.lora.lora_config import LoRAConfig
 from sglang.srt.model_loader.loader import DefaultModelLoader

+logger = logging.getLogger(__name__)
+

 class LoRALayer(nn.Module):
     def __init__(self, config: LoRAConfig, base_hf_config: AutoConfig):
@@ -173,6 +176,18 @@ class LoRAAdapter(nn.Module):
             if "gate_proj" in weight_name:
                 up_name = weight_name.replace("gate_proj", "up_proj")
                 gate_up_name = weight_name.replace("gate_proj", "gate_up_proj")
+                if up_name not in weights:
+                    logger.warning(
+                        f"Gate projection {weight_name} does not have a corresponding up projection {up_name}. "
+                        f"Initializing up projection to zero."
+                    )
+                    weights[up_name] = torch.zeros_like(weights[weight_name])
+                    # FIXME: Add gate-only support for flashinfer in future implementations
+                    assert self.lora_backend.name == "triton", (
+                        f"LoRA weight initialization currently only supported for 'triton' backend. "
+                        f"Received backend: {self.lora_backend.name}. Please verify your backend configuration "
+                        f"or consider implementing custom initialization logic for other backends."
+                    )
                 if "lora_A" in weight_name:
                     weights[gate_up_name] = torch.cat(
                         (weights[weight_name], weights[up_name]), 0
@@ -182,4 +197,5 @@ class LoRAAdapter(nn.Module):
                         [weights[weight_name], weights[up_name]], dim=0
                     )
                 weights.pop(weight_name)
-                weights
+                if up_name in weights:
+                    weights.pop(up_name)
sglang/srt/lora/lora_config.py
CHANGED
@@ -26,6 +26,11 @@ class LoRAConfig:
         self.path = path
         self.hf_config = self.get_lora_config()
         self.target_modules = self.hf_config["target_modules"]
+
+        # TODO: Support more modules
+        if any(module in self.target_modules for module in ["embed_tokens", "lm_head"]):
+            raise ValueError("Not supported yet")
+
         self.r = self.hf_config["r"]
         self.lora_alpha = self.hf_config["lora_alpha"]

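For illustration, a sketch of what the new guard accepts and rejects (the module lists below are hypothetical examples, not taken from any real adapter):

    supported = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
    unsupported = ["q_proj", "v_proj", "embed_tokens"]

    for target_modules in (supported, unsupported):
        if any(module in target_modules for module in ["embed_tokens", "lm_head"]):
            print(target_modules, "-> rejected (embedding/lm_head LoRA not supported yet)")
        else:
            print(target_modules, "-> accepted")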
sglang/srt/lora/lora_manager.py
CHANGED
@@ -76,9 +76,7 @@ class LoRAManager:
         self.hf_target_names: Set[str] = set()
         for name, path in self.lora_paths.items():
             self.configs[name] = LoRAConfig(path)
-            self.hf_target_names
-                self.configs[name].target_modules
-            )
+            self.hf_target_names.update(self.configs[name].target_modules)

         # Target lora weight names for lora_a and lora_b modules repectively.
         # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj")}
sglang/srt/managers/cache_controller.py
CHANGED
@@ -5,9 +5,7 @@ Copyright 2023-2025 SGLang Team
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -15,14 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """

+import concurrent.futures
 import logging
+import math
 import threading
-from queue import PriorityQueue, Queue
-from typing import Optional
+from queue import Empty, Full, PriorityQueue, Queue
+from typing import List, Optional

 import torch

-from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool,
+from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool, MHATokenToKVPoolHost

 logger = logging.getLogger(__name__)

@@ -55,6 +55,27 @@ class CacheOperation:
         self.priority = min(self.priority, other.priority)
         self.node_ids.extend(other.node_ids)

+    def split(self, factor) -> List["CacheOperation"]:
+        # split an operation into smaller operations to reduce the size of intermediate buffers
+        if factor <= 1:
+            return [self]
+
+        chunk_size = math.ceil(len(self.host_indices) / factor)
+        split_ops = []
+        for i in range(0, len(self.host_indices), chunk_size):
+            split_ops.append(
+                CacheOperation(
+                    host_indices=self.host_indices[i : i + chunk_size],
+                    device_indices=self.device_indices[i : i + chunk_size],
+                    node_id=0,
+                )
+            )
+        # Inherit the node_ids on the final chunk
+        if split_ops:
+            split_ops[-1].node_ids = self.node_ids
+
+        return split_ops
+
     def __lt__(self, other: "CacheOperation"):
         return self.priority < other.priority

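A standalone sketch of the chunking arithmetic in `CacheOperation.split` (the index tensor is hypothetical); note that only the last chunk carries the node ids, so a node is acknowledged only after its final piece finishes transferring:

    import math
    import torch

    host_indices = torch.arange(10)
    factor = 3
    chunk_size = math.ceil(len(host_indices) / factor)  # ceil(10 / 3) = 4
    chunks = [
        host_indices[i : i + chunk_size]
        for i in range(0, len(host_indices), chunk_size)
    ]
    print([len(c) for c in chunks])  # [4, 4, 2] -- at most `factor` chunks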
@@ -64,7 +85,10 @@ class TransferBuffer:
     Overlapping buffer preparation and transfer operations to improve throughput.
     """

-    def __init__(
+    def __init__(
+        self, stop_event, buffer_count: int = 3, max_buffer_size: int = 1000
+    ) -> None:
+        self.stop_event = stop_event
         self.buffers = Queue(maxsize=buffer_count)
         # todo: adjust the buffer size based on throughput profile of the system
         self.max_buffer_size = max_buffer_size
@@ -75,22 +99,36 @@ class TransferBuffer:
     def empty(self) -> bool:
         return self.buffers.empty()

-    def put(self, item, block=True) -> None:
-        self.
+    def put(self, item, block=True, timeout=1) -> None:
+        while not self.stop_event.is_set():
+            try:
+                self.buffers.put(item, block=block, timeout=timeout)
+                break
+            except Full:
+                if not block:
+                    break
+                continue
+            except Exception as e:
+                logger.error(e)

-    def get(self, block=True) -> Optional[CacheOperation]:
+    def get(self, block=True, timeout=1) -> Optional[CacheOperation]:
         try:
-            return self.buffers.get(block=block)
+            return self.buffers.get(block=block, timeout=timeout)
+        except Empty:
+            return None
         except Exception as e:
             logger.error(e)

+    def clear(self):
+        self.buffers.queue.clear()
+

 class HiCacheController:

     def __init__(
         self,
         mem_pool_device: MHATokenToKVPool,
-        mem_pool_host:
+        mem_pool_host: MHATokenToKVPoolHost,
         write_policy: str = "write_through_selective",
     ):

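The timeout-plus-stop-event pattern above is what makes the worker threads joinable; a minimal standalone sketch of the same consumer loop (names hypothetical):

    import threading
    from queue import Empty, Queue

    stop_event = threading.Event()
    work_queue: "Queue[int]" = Queue(maxsize=3)

    def consumer():
        # Wake up at least once per second so stop_event.set() can end the
        # loop even if the queue stays empty (a bare blocking get() would hang).
        while not stop_event.is_set():
            try:
                item = work_queue.get(block=True, timeout=1)
            except Empty:
                continue
            print("processed", item)

    t = threading.Thread(target=consumer, daemon=True)
    t.start()
    work_queue.put(42)
    stop_event.set()
    t.join()  # returns promptly thanks to the timeout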
@@ -111,8 +149,11 @@
         self.ack_write_queue = Queue()
         self.ack_load_queue = Queue()

-        self.
-        self.
+        self.stop_event = threading.Event()
+        self.write_buffer = TransferBuffer(self.stop_event)
+        self.load_buffer = TransferBuffer(
+            self.stop_event, buffer_count=10, max_buffer_size=100
+        )

         self.write_stream = torch.cuda.Stream()
         self.load_stream = torch.cuda.Stream()
@@ -126,6 +167,28 @@
         self.write_thread.start()
         self.load_thread.start()

+    def reset(self):
+        self.stop_event.set()
+        self.write_thread.join()
+        self.load_thread.join()
+
+        self.write_queue.queue.clear()
+        self.load_queue.queue.clear()
+        self.write_buffer.clear()
+        self.load_buffer.clear()
+        self.ack_write_queue.queue.clear()
+        self.ack_load_queue.queue.clear()
+
+        self.write_thread = threading.Thread(
+            target=self.write_thread_func_buffer, daemon=True
+        )
+        self.load_thread = threading.Thread(
+            target=self.load_thread_func_buffer, daemon=True
+        )
+        self.stop_event.clear()
+        self.write_thread.start()
+        self.load_thread.start()
+
     def write(
         self,
         device_indices: torch.Tensor,
@@ -138,10 +201,10 @@
         host_indices = self.mem_pool_host.alloc(len(device_indices))
         if host_indices is None:
             return None
+        self.mem_pool_host.protect_write(host_indices)
         self.write_queue.put(
             CacheOperation(host_indices, device_indices, node_id, priority)
         )
-        self.mem_pool_host.protect_write(host_indices)
         return host_indices

     def load(
@@ -156,10 +219,10 @@
         device_indices = self.mem_pool_device.alloc(len(host_indices))
         if device_indices is None:
             return None
+        self.mem_pool_host.protect_load(host_indices)
         self.load_queue.put(
             CacheOperation(host_indices, device_indices, node_id, priority)
         )
-        self.mem_pool_host.protect_load(host_indices)
         return device_indices

     def write_thread_func_direct(self):
@@ -167,16 +230,19 @@
         Directly write through KV caches to host memory without buffering.
         """
         with torch.cuda.stream(self.write_stream):
-            while
+            while not self.stop_event.is_set():
                 try:
-                    operation = self.write_queue.get(block=True)
+                    operation = self.write_queue.get(block=True, timeout=1)
                     operation.data = self.mem_pool_device.get_flat_data(
                         operation.device_indices
                     )
                     self.mem_pool_host.transfer(operation.host_indices, operation.data)
                     self.mem_pool_host.complete_io(operation.host_indices)
                     for node_id in operation.node_ids:
-
+                        if node_id != 0:
+                            self.ack_write_queue.put(node_id)
+                except Empty:
+                    continue
                 except Exception as e:
                     logger.error(e)

@@ -185,9 +251,10 @@
         Directly load KV caches from host memory to device memory without buffering.
         """
         with torch.cuda.stream(self.load_stream):
-            while
+            while not self.stop_event.is_set():
                 try:
-                    operation = self.load_queue.get(block=True)
+                    operation = self.load_queue.get(block=True, timeout=1)
+                    # time.sleep(18e-6 * len(operation.host_indices))
                     operation.data = self.mem_pool_host.get_flat_data(
                         operation.host_indices
                     )
@@ -196,7 +263,10 @@
                     )
                     self.mem_pool_host.complete_io(operation.host_indices)
                     for node_id in operation.node_ids:
-
+                        if node_id != 0:
+                            self.ack_load_queue.put(node_id)
+                except Empty:
+                    continue
                 except Exception as e:
                     logger.error(e)

@@ -204,39 +274,98 @@
         """
         Auxiliary function to prepare the buffer for write operations.
         """
+
+        def _to_op(op_):
+            assert op_.device_indices.is_cuda, "Device indices should be on GPU"
+            op_.data = self.mem_pool_device.get_flat_data(op_.device_indices).to(
+                self.mem_pool_host.device
+            )
+            self.write_buffer.put(op_)
+            return op_
+
         buffer = None
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with torch.cuda.stream(self.write_stream):
+            while not self.stop_event.is_set():
+                try:
+                    operation = self.write_queue.get(block=True, timeout=1)
+                    factor = (
+                        len(operation.device_indices)
+                        // self.write_buffer.max_buffer_size
+                    )
+
+                    if factor >= 1:
+                        if buffer is not None:
+                            _to_op(buffer)
+                            buffer = None
+
+                        if factor < 2:
+                            _to_op(operation)
+                        else:
+                            split_ops = operation.split(factor)
+                            for op_ in split_ops:
+                                _to_op(op_)
+                        continue
+
+                    if buffer is None:
+                        buffer = operation
+                    else:
+                        buffer.merge(operation)
+                    if (
+                        no_wait
+                        or len(buffer.host_indices) >= self.write_buffer.max_buffer_size
+                        or self.write_queue.empty()
+                        or self.write_buffer.empty()
+                    ):
+                        _to_op(buffer)
+                        buffer = None
+                except Empty:
+                    continue
+                except Exception as e:
+                    logger.error(e)

     def load_aux_func(self):
         """
         Auxiliary function to prepare the buffer for load operations.
         """
+
+        def _pin_op(op_, put=True):
+            op_.data = (
+                self.mem_pool_host.get_flat_data(op_.host_indices)
+                .contiguous()
+                .pin_memory()
+            )
+            if put:
+                self.load_buffer.put(op_)
+            return op_
+
         buffer = None
-        while
+        while not self.stop_event.is_set():
             try:
-                operation = self.load_queue.get(block=True)
+                operation = self.load_queue.get(block=True, timeout=1)
+                factor = len(operation.host_indices) // self.load_buffer.max_buffer_size
+
+                if factor >= 1:
+                    if buffer is not None:
+                        _pin_op(buffer)
+                        buffer = None
+
+                    if factor < 2:
+                        _pin_op(operation)
+                    else:
+                        split_ops = operation.split(factor)
+                        split_args = [(op_, True) for op_ in split_ops[:-1]]
+                        split_args.append((split_ops[-1], False))
+                        # Spawn threads to pin each op concurrently
+                        with concurrent.futures.ThreadPoolExecutor() as executor:
+                            pinned_ops = list(
+                                executor.map(
+                                    lambda x: _pin_op(x[0], put=x[1]), split_args
+                                )
+                            )
+                        # preserve the order of last op to ensure correct ack
+                        self.load_buffer.put(pinned_ops[-1])
+                    continue
+
                 if buffer is None:
                     buffer = operation
                 else:
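A standalone sketch of the concurrent `pin_memory()` staging used in `load_aux_func` (chunk shapes hypothetical; requires a CUDA-capable torch build). Pinning page-locks host memory so the later host-to-device copy can run asynchronously:

    import concurrent.futures
    import torch

    # Hypothetical host-resident chunks standing in for KV-cache slices.
    chunks = [torch.randn(1024, 256) for _ in range(4)]

    def pin(t: torch.Tensor) -> torch.Tensor:
        # contiguous() then pin_memory() stages the chunk in page-locked RAM.
        return t.contiguous().pin_memory()

    with concurrent.futures.ThreadPoolExecutor() as executor:
        pinned = list(executor.map(pin, chunks))

    print(all(t.is_pinned() for t in pinned))  # True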
@@ -246,41 +375,43 @@ class HiCacheController:
                     or self.load_queue.empty()
                     or self.load_buffer.empty()
                 ):
-                    buffer
-                        self.mem_pool_host.get_flat_data(buffer.host_indices)
-                        .contiguous()
-                        .pin_memory()
-                    )
-                    self.load_buffer.put(buffer, block=True)
+                    _pin_op(buffer)
                     buffer = None
+            except Empty:
+                continue
             except Exception as e:
                 logger.error(e)

     def write_thread_func_buffer(self):
         aux_thread = threading.Thread(target=self.write_aux_func, daemon=True)
         aux_thread.start()
-
-
-
-
-
-
-
-
+
+        while not self.stop_event.is_set():
+            operation = self.write_buffer.get()
+            if operation is None:
+                continue
+            self.mem_pool_host.assign_flat_data(operation.host_indices, operation.data)
+            self.mem_pool_host.complete_io(operation.host_indices)
+            for node_id in operation.node_ids:
+                if node_id != 0:
                     self.ack_write_queue.put(node_id)
+        aux_thread.join()

     def load_thread_func_buffer(self):
         aux_thread = threading.Thread(target=self.load_aux_func, daemon=True)
         aux_thread.start()
+
         with torch.cuda.stream(self.load_stream):
-            while
+            while not self.stop_event.is_set():
                 operation = self.load_buffer.get()
                 if operation is None:
                     continue
                 self.mem_pool_device.transfer(operation.device_indices, operation.data)
                 self.mem_pool_host.complete_io(operation.host_indices)
                 for node_id in operation.node_ids:
-
+                    if node_id != 0:
+                        self.ack_load_queue.put(node_id)
+        aux_thread.join()

     def evict_device(
         self, device_indices: torch.Tensor, host_indices: torch.Tensor
sglang/srt/managers/configure_logging.py
CHANGED
@@ -28,6 +28,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--url", type=str, default="http://localhost:30000")
    parser.add_argument("--log-requests", action="store_true")
+    parser.add_argument("--log-requests-level", type=int, default=2)
     parser.add_argument(
         "--dump-requests-folder", type=str, default="/tmp/sglang_request_dump"
     )
@@ -38,7 +39,7 @@ if __name__ == "__main__":
         args.url + "/configure_logging",
         json={
             "log_requests": args.log_requests,
-            "log_requests_level":
+            "log_requests_level": args.log_requests_level,  # Log full requests
             "dump_requests_folder": args.dump_requests_folder,
             "dump_requests_threshold": args.dump_requests_threshold,
         },
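The hunks above show the endpoint and payload keys; a sketch of the equivalent request, assuming the script posts its settings with `requests` (the import is outside this hunk) and using example values only:

    import requests

    requests.post(
        "http://localhost:30000/configure_logging",
        json={
            "log_requests": True,
            "log_requests_level": 2,  # now configurable; 2 logs full requests per the comment above
            "dump_requests_folder": "/tmp/sglang_request_dump",
            "dump_requests_threshold": 1000,  # hypothetical threshold value
        },
    )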
sglang/srt/managers/data_parallel_controller.py
CHANGED
@@ -121,7 +121,7 @@ class DataParallelController:
                 args=(server_args, tmp_port_args, base_gpu_id, dp_rank),
             )
             threads.append(thread)
-            base_gpu_id += server_args.tp_size
+            base_gpu_id += server_args.tp_size * server_args.gpu_id_step

         # Free all sockets before starting the threads to launch TP workers
         for sock in sockets:
@@ -177,7 +177,11 @@ class DataParallelController:
             rank_port_args.nccl_port = port_args.nccl_port

             reader, writer = mp.Pipe(duplex=False)
-            gpu_id =
+            gpu_id = (
+                server_args.base_gpu_id
+                + base_gpu_id
+                + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
+            )
             proc = mp.Process(
                 target=run_scheduler_process,
                 args=(server_args, rank_port_args, gpu_id, tp_rank, dp_rank, writer),
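A worked example of the new GPU id arithmetic (values hypothetical): with `gpu_id_step=2`, tensor-parallel ranks land on every other device, and each data-parallel replica advances the base by `tp_size * gpu_id_step`:

    base_gpu_id = 0      # server_args.base_gpu_id
    tp_size = 4
    tp_size_per_node = 4
    gpu_id_step = 2      # e.g. to leave a gap between used devices

    dp_offset = 0
    for dp_rank in range(2):
        for tp_rank in range(tp_size):
            gpu_id = base_gpu_id + dp_offset + (tp_rank % tp_size_per_node) * gpu_id_step
            print(f"dp={dp_rank} tp={tp_rank} -> GPU {gpu_id}")
        dp_offset += tp_size * gpu_id_step  # the controller's per-replica increment
    # dp=0 -> GPUs 0, 2, 4, 6; dp=1 -> GPUs 8, 10, 12, 14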
|