sglang 0.4.3.post4__py3-none-any.whl → 0.4.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. sglang/bench_serving.py +1 -1
  2. sglang/lang/chat_template.py +29 -0
  3. sglang/srt/_custom_ops.py +19 -17
  4. sglang/srt/configs/__init__.py +2 -0
  5. sglang/srt/configs/janus_pro.py +629 -0
  6. sglang/srt/configs/model_config.py +24 -14
  7. sglang/srt/conversation.py +80 -2
  8. sglang/srt/custom_op.py +64 -3
  9. sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
  10. sglang/srt/distributed/parallel_state.py +10 -1
  11. sglang/srt/entrypoints/engine.py +5 -3
  12. sglang/srt/entrypoints/http_server.py +1 -1
  13. sglang/srt/function_call_parser.py +33 -2
  14. sglang/srt/hf_transformers_utils.py +16 -1
  15. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  16. sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
  17. sglang/srt/layers/attention/triton_backend.py +1 -3
  18. sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
  19. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
  20. sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
  21. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
  22. sglang/srt/layers/attention/vision.py +43 -62
  23. sglang/srt/layers/dp_attention.py +30 -2
  24. sglang/srt/layers/elementwise.py +411 -0
  25. sglang/srt/layers/linear.py +1 -1
  26. sglang/srt/layers/logits_processor.py +1 -0
  27. sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
  28. sglang/srt/layers/moe/ep_moe/layer.py +25 -9
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
  37. sglang/srt/layers/moe/router.py +342 -0
  38. sglang/srt/layers/parameter.py +10 -0
  39. sglang/srt/layers/quantization/__init__.py +90 -68
  40. sglang/srt/layers/quantization/blockwise_int8.py +1 -2
  41. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  46. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  48. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  49. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  50. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  51. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  63. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/quantization/fp8.py +174 -106
  68. sglang/srt/layers/quantization/fp8_kernel.py +210 -38
  69. sglang/srt/layers/quantization/fp8_utils.py +156 -15
  70. sglang/srt/layers/quantization/modelopt_quant.py +5 -1
  71. sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
  72. sglang/srt/layers/quantization/w8a8_int8.py +152 -3
  73. sglang/srt/layers/rotary_embedding.py +5 -3
  74. sglang/srt/layers/sampler.py +29 -35
  75. sglang/srt/layers/vocab_parallel_embedding.py +0 -1
  76. sglang/srt/lora/backend/__init__.py +9 -12
  77. sglang/srt/managers/cache_controller.py +74 -8
  78. sglang/srt/managers/data_parallel_controller.py +1 -1
  79. sglang/srt/managers/image_processor.py +37 -631
  80. sglang/srt/managers/image_processors/base_image_processor.py +219 -0
  81. sglang/srt/managers/image_processors/janus_pro.py +79 -0
  82. sglang/srt/managers/image_processors/llava.py +152 -0
  83. sglang/srt/managers/image_processors/minicpmv.py +86 -0
  84. sglang/srt/managers/image_processors/mlama.py +60 -0
  85. sglang/srt/managers/image_processors/qwen_vl.py +161 -0
  86. sglang/srt/managers/io_struct.py +32 -15
  87. sglang/srt/managers/multi_modality_padding.py +134 -0
  88. sglang/srt/managers/schedule_batch.py +213 -118
  89. sglang/srt/managers/schedule_policy.py +40 -8
  90. sglang/srt/managers/scheduler.py +176 -683
  91. sglang/srt/managers/scheduler_output_processor_mixin.py +614 -0
  92. sglang/srt/managers/tokenizer_manager.py +6 -6
  93. sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
  94. sglang/srt/mem_cache/base_prefix_cache.py +6 -8
  95. sglang/srt/mem_cache/chunk_cache.py +12 -44
  96. sglang/srt/mem_cache/hiradix_cache.py +71 -34
  97. sglang/srt/mem_cache/memory_pool.py +81 -17
  98. sglang/srt/mem_cache/paged_allocator.py +283 -0
  99. sglang/srt/mem_cache/radix_cache.py +117 -36
  100. sglang/srt/model_executor/cuda_graph_runner.py +68 -20
  101. sglang/srt/model_executor/forward_batch_info.py +23 -10
  102. sglang/srt/model_executor/model_runner.py +63 -63
  103. sglang/srt/model_loader/loader.py +2 -1
  104. sglang/srt/model_loader/weight_utils.py +1 -1
  105. sglang/srt/models/deepseek_janus_pro.py +2127 -0
  106. sglang/srt/models/deepseek_nextn.py +23 -3
  107. sglang/srt/models/deepseek_v2.py +200 -191
  108. sglang/srt/models/grok.py +374 -119
  109. sglang/srt/models/minicpmv.py +28 -89
  110. sglang/srt/models/mllama.py +1 -1
  111. sglang/srt/models/qwen2.py +0 -1
  112. sglang/srt/models/qwen2_5_vl.py +25 -50
  113. sglang/srt/models/qwen2_vl.py +33 -49
  114. sglang/srt/openai_api/adapter.py +59 -35
  115. sglang/srt/openai_api/protocol.py +8 -1
  116. sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
  117. sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
  118. sglang/srt/server_args.py +24 -16
  119. sglang/srt/speculative/eagle_worker.py +75 -39
  120. sglang/srt/utils.py +104 -9
  121. sglang/test/runners.py +104 -10
  122. sglang/test/test_block_fp8.py +106 -16
  123. sglang/test/test_custom_ops.py +88 -0
  124. sglang/test/test_utils.py +20 -4
  125. sglang/utils.py +0 -4
  126. sglang/version.py +1 -1
  127. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/METADATA +9 -10
  128. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/RECORD +131 -84
  129. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/WHEEL +1 -1
  130. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/LICENSE +0 -0
  131. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,8 @@
- from typing import Any, Dict, List, Optional
+ from typing import Any, Callable, Dict, List, Optional

  import torch

- from sglang.srt.utils import is_cuda_available
+ from sglang.srt.utils import is_cuda_available, set_weight_attrs

  is_cuda = is_cuda_available()
  if is_cuda:
@@ -10,6 +10,7 @@ if is_cuda:

  from torch.nn.parameter import Parameter

+ from sglang.srt.distributed import get_tensor_model_parallel_world_size
  from sglang.srt.layers.linear import LinearMethodBase
  from sglang.srt.layers.parameter import ChannelQuantScaleParameter, ModelWeightParameter
  from sglang.srt.layers.quantization.base_config import (
@@ -55,9 +56,12 @@ class W8A8Int8Config(QuantizationConfig):
  prefix: str,
  ) -> Optional["QuantizeMethodBase"]:
  from sglang.srt.layers.linear import LinearBase
+ from sglang.srt.layers.moe.fused_moe_triton import FusedMoE

  if isinstance(layer, LinearBase):
  return W8A8Int8LinearMethod(self)
+ elif isinstance(layer, FusedMoE):
+ return W8A8Int8MoEMethod(self)
  return None

  def get_scaled_act_names(self) -> List[str]:
@@ -81,7 +85,7 @@ class W8A8Int8LinearMethod(LinearMethodBase):
  input_size: int,
  output_size: int,
  params_dtype: torch.dtype,
- **extra_weight_attrs
+ **extra_weight_attrs,
  ):

  weight_loader = extra_weight_attrs.get("weight_loader")
@@ -115,3 +119,148 @@ class W8A8Int8LinearMethod(LinearMethodBase):
  return int8_scaled_mm(
  x_q, layer.weight, x_scale, layer.weight_scale, out_dtype=x.dtype, bias=bias
  )
+
+
+ class W8A8Int8MoEMethod:
+ """MoE method for INT8.
+ Supports loading INT8 checkpoints with static weight scale and
+ dynamic/static activation scale.
+ Also supports loading quantized FP16/BF16 model checkpoints with dynamic
+ activation scaling. The weight scaling factor will be initialized after
+ the model weights are loaded.
+ Args:
+ quant_config: The quantization config.
+ """
+
+ def __new__(cls, *args, **kwargs):
+ from sglang.srt.layers.moe.fused_moe_triton import FusedMoEMethodBase
+
+ if not hasattr(cls, "_initialized"):
+ original_init = cls.__init__
+ new_cls = type(
+ cls.__name__,
+ (FusedMoEMethodBase,),
+ {
+ "__init__": original_init,
+ **{k: v for k, v in cls.__dict__.items() if k != "__dict__"},
+ },
+ )
+ obj = super(new_cls, new_cls).__new__(new_cls)
+ obj.__init__(*args, **kwargs)
+ return obj
+ return super().__new__(cls)
+
+ def __init__(self, quant_config):
+ self.quant_config = quant_config
+
+ def create_weights(
+ self,
+ layer: torch.nn.Module,
+ num_experts: int,
+ hidden_size: int,
+ intermediate_size: int,
+ params_dtype: torch.dtype,
+ **extra_weight_attrs,
+ ):
+ from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+ tp_size = get_tensor_model_parallel_world_size()
+
+ # WEIGHTS
+ w13_weight = torch.nn.Parameter(
+ torch.empty(
+ num_experts, 2 * intermediate_size, hidden_size, dtype=torch.int8
+ ),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight", w13_weight)
+ set_weight_attrs(w13_weight, extra_weight_attrs)
+
+ w2_weight = torch.nn.Parameter(
+ torch.empty(num_experts, hidden_size, intermediate_size, dtype=torch.int8),
+ requires_grad=False,
+ )
+ layer.register_parameter("w2_weight", w2_weight)
+ set_weight_attrs(w2_weight, extra_weight_attrs)
+
+ w13_weight_scale = torch.nn.Parameter(
+ torch.ones(num_experts, 2 * intermediate_size, 1, dtype=torch.float32),
+ requires_grad=False,
+ )
+ w2_weight_scale = torch.nn.Parameter(
+ torch.ones(num_experts, hidden_size, 1, dtype=torch.float32),
+ requires_grad=False,
+ )
+ layer.register_parameter("w13_weight_scale", w13_weight_scale)
+ layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+ extra_weight_attrs.update(
+ {"quant_method": FusedMoeWeightScaleSupported.CHANNEL.value}
+ )
+
+ set_weight_attrs(w13_weight_scale, extra_weight_attrs)
+ set_weight_attrs(w2_weight_scale, extra_weight_attrs)
+
+ w13_input_scale = None
+ layer.register_parameter("w13_input_scale", w13_input_scale)
+
+ w2_input_scale = None
+ layer.register_parameter("w2_input_scale", w2_input_scale)
+
+ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+ layer.w13_weight = Parameter(layer.w13_weight, requires_grad=False)
+ layer.w2_weight = Parameter(layer.w2_weight, requires_grad=False)
+ layer.w13_weight_scale = Parameter(
+ layer.w13_weight_scale.data, requires_grad=False
+ )
+ layer.w2_weight_scale = Parameter(
+ layer.w2_weight_scale.data, requires_grad=False
+ )
+
+ def apply(
+ self,
+ layer: torch.nn.Module,
+ x: torch.Tensor,
+ router_logits: torch.Tensor,
+ top_k: int,
+ renormalize: bool,
+ use_grouped_topk: bool,
+ topk_group: Optional[int] = None,
+ num_expert_group: Optional[int] = None,
+ custom_routing_function: Optional[Callable] = None,
+ correction_bias: Optional[torch.Tensor] = None,
+ activation: str = "silu",
+ inplace: bool = True,
+ no_combine: bool = False,
+ ) -> torch.Tensor:
+ from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+ from sglang.srt.layers.moe.topk import select_experts
+
+ # Expert selection
+ topk_weights, topk_ids = select_experts(
+ hidden_states=x,
+ router_logits=router_logits,
+ use_grouped_topk=use_grouped_topk,
+ top_k=top_k,
+ renormalize=renormalize,
+ topk_group=topk_group,
+ num_expert_group=num_expert_group,
+ custom_routing_function=custom_routing_function,
+ correction_bias=correction_bias,
+ )
+
+ return fused_experts(
+ x,
+ layer.w13_weight,
+ layer.w2_weight,
+ topk_weights=topk_weights,
+ topk_ids=topk_ids,
+ inplace=inplace,
+ activation=activation,
+ use_int8_w8a8=True,
+ w1_scale=(layer.w13_weight_scale),
+ w2_scale=(layer.w2_weight_scale),
+ a1_scale=layer.w13_input_scale,
+ a2_scale=layer.w2_input_scale,
+ no_combine=no_combine,
+ )
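
The hunks above are from sglang/srt/layers/quantization/w8a8_int8.py. The unusual part of the new W8A8Int8MoEMethod is its __new__, which rebuilds the class at runtime with FusedMoEMethodBase injected as the parent so the fused-MoE module is only imported when the method is actually constructed. Below is a minimal, self-contained sketch of that dynamic-base trick; the Base and Impl names are made up for illustration and are not sglang classes, and the filter here also drops __weakref__ for safety.

class Base:
    """Stand-in for a base class that lives in a heavyweight module."""

    def hello(self):
        return "base"


class Impl:
    """Declared without the base; the base is attached at instantiation time."""

    def __new__(cls, *args, **kwargs):
        # Build a new type with the same name and attributes, but with Base
        # injected as the parent, then instantiate that type instead of cls.
        new_cls = type(
            cls.__name__,
            (Base,),
            {k: v for k, v in cls.__dict__.items() if k not in ("__dict__", "__weakref__")},
        )
        obj = super(new_cls, new_cls).__new__(new_cls)
        obj.__init__(*args, **kwargs)
        return obj

    def __init__(self, value):
        self.value = value


inst = Impl(42)
assert isinstance(inst, Base) and inst.value == 42 and inst.hello() == "base"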
@@ -403,12 +403,12 @@ def _yarn_find_correction_range(


  def _yarn_linear_ramp_mask(
- low: float, high: float, dim: int, dtype: torch.dtype
+ low: float, high: float, dim: int, dtype: torch.dtype, device: torch.device = None
  ) -> torch.Tensor:
  if low == high:
  high += 0.001  # Prevent singularity

- linear_func = (torch.arange(dim, dtype=dtype) - low) / (high - low)
+ linear_func = (torch.arange(dim, dtype=dtype, device=device) - low) / (high - low)
  ramp_func = torch.clamp(linear_func, 0, 1)
  return ramp_func

@@ -688,7 +688,9 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
  # Get n-d rotational scaling corrected for extrapolation
  inv_freq_mask = (
  1
- - _yarn_linear_ramp_mask(low, high, self.rotary_dim // 2, dtype=torch.float)
+ - _yarn_linear_ramp_mask(
+ low, high, self.rotary_dim // 2, dtype=torch.float, device=self.device
+ )
  ) * self.extrapolation_factor
  inv_freq = (
  inv_freq_interpolation * (1 - inv_freq_mask)
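
These rotary_embedding.py hunks thread a device argument through _yarn_linear_ramp_mask so the YaRN ramp is built directly on the target device rather than on CPU and copied later. A small torch-only sketch of the same computation; the constants and function name here are illustrative:

import torch


def yarn_linear_ramp_mask(low: float, high: float, dim: int, device=None) -> torch.Tensor:
    # A 0..1 ramp over dim positions, clamped outside [low, high]; passing
    # device avoids materializing the arange on CPU first.
    if low == high:
        high += 0.001  # prevent division by zero
    linear = (torch.arange(dim, dtype=torch.float32, device=device) - low) / (high - low)
    return torch.clamp(linear, 0, 1)


device = "cuda" if torch.cuda.is_available() else "cpu"
mask = yarn_linear_ramp_mask(low=4.0, high=28.0, dim=64, device=device)
print(mask.device, mask.min().item(), mask.max().item())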
@@ -1,5 +1,5 @@
  import logging
- from typing import List, Optional
+ from typing import List

  import torch
  import torch.distributed as dist
@@ -42,7 +42,6 @@ class Sampler(nn.Module):
  return_logprob: bool,
  top_logprobs_nums: List[int],
  token_ids_logprobs: List[List[int]],
- batch_next_token_ids: Optional[torch.Tensor] = None,
  ):
  """Run a sampler & compute logprobs and update logits_output accordingly.

@@ -72,8 +71,7 @@

  if sampling_info.is_all_greedy:
  # Use torch.argmax if all requests use greedy sampling
- if batch_next_token_ids is None:
- batch_next_token_ids = torch.argmax(logits, -1)
+ batch_next_token_ids = torch.argmax(logits, -1)
  if return_logprob:
  logprobs = torch.nn.functional.log_softmax(logits, dim=-1)
  else:
@@ -94,43 +92,39 @@
  top_p_normalize_probs_torch(probs, sampling_info.top_ps)
  ).clamp(min=torch.finfo(probs.dtype).min)

- if batch_next_token_ids is None:
- max_top_k_round, batch_size = 32, probs.shape[0]
- uniform_samples = torch.rand(
- (max_top_k_round, batch_size), device=probs.device
+ max_top_k_round, batch_size = 32, probs.shape[0]
+ uniform_samples = torch.rand(
+ (max_top_k_round, batch_size), device=probs.device
+ )
+ if sampling_info.need_min_p_sampling:
+ probs = top_k_renorm_prob(probs, sampling_info.top_ks)
+ probs = top_p_renorm_prob(probs, sampling_info.top_ps)
+ batch_next_token_ids = min_p_sampling_from_probs(
+ probs, uniform_samples, sampling_info.min_ps
  )
- if sampling_info.need_min_p_sampling:
- probs = top_k_renorm_prob(probs, sampling_info.top_ks)
- probs = top_p_renorm_prob(probs, sampling_info.top_ps)
- batch_next_token_ids = min_p_sampling_from_probs(
- probs, uniform_samples, sampling_info.min_ps
- )
- else:
- batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
- probs,
- uniform_samples,
- sampling_info.top_ks,
- sampling_info.top_ps,
- filter_apply_order="joint",
- )
-
- if self.use_nan_detection and not torch.all(success):
- logger.warning("Detected errors during sampling!")
- batch_next_token_ids = torch.zeros_like(
- batch_next_token_ids
- )
-
- elif global_server_args_dict["sampling_backend"] == "pytorch":
- if batch_next_token_ids is None:
- # A slower fallback implementation with torch native operations.
- batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
+ else:
+ batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
  probs,
+ uniform_samples,
  sampling_info.top_ks,
  sampling_info.top_ps,
- sampling_info.min_ps,
- sampling_info.need_min_p_sampling,
+ filter_apply_order="joint",
  )

+ if self.use_nan_detection and not torch.all(success):
+ logger.warning("Detected errors during sampling!")
+ batch_next_token_ids = torch.zeros_like(batch_next_token_ids)
+
+ elif global_server_args_dict["sampling_backend"] == "pytorch":
+ # A slower fallback implementation with torch native operations.
+ batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
+ probs,
+ sampling_info.top_ks,
+ sampling_info.top_ps,
+ sampling_info.min_ps,
+ sampling_info.need_min_p_sampling,
+ )
+
  if return_logprob:
  # clamp to avoid -inf
  logprobs = torch.log(
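
The sampler.py hunks above drop the batch_next_token_ids passthrough, so the sampler always draws tokens itself, either with the flashinfer kernels or with the torch-native fallback. A rough torch-only sketch of joint top-k/top-p sampling in the spirit of that fallback; the function name and scalar top_k/top_p are illustrative (sglang's top_k_top_p_min_p_sampling_from_probs_torch takes per-request tensors and an optional min-p step):

import torch


def sample_top_k_top_p(logits: torch.Tensor, top_k: int, top_p: float) -> torch.Tensor:
    """Sample one token id per row after top-k and top-p filtering."""
    probs = torch.softmax(logits, dim=-1)
    sorted_probs, sorted_idx = probs.sort(dim=-1, descending=True)
    # Drop tokens outside the nucleus: cumulative mass before a token exceeds top_p.
    cumulative = sorted_probs.cumsum(dim=-1)
    nucleus_mask = (cumulative - sorted_probs) > top_p
    # Drop tokens beyond the top_k highest-probability entries.
    rank_mask = torch.arange(probs.shape[-1], device=probs.device) >= top_k
    sorted_probs = sorted_probs.masked_fill(nucleus_mask | rank_mask, 0.0)
    # Renormalize the surviving mass, sample, and map back to vocabulary ids.
    sorted_probs = sorted_probs / sorted_probs.sum(dim=-1, keepdim=True)
    choice = torch.multinomial(sorted_probs, num_samples=1)
    return sorted_idx.gather(-1, choice).squeeze(-1)


logits = torch.randn(4, 32000)
print(sample_top_k_top_p(logits, top_k=20, top_p=0.9))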
@@ -264,7 +264,6 @@ class VocabParallelEmbedding(torch.nn.Module):
  quant_method = None
  if quant_config is not None:
  quant_method = quant_config.get_quant_method(self, prefix=prefix)
- print("quant_method", quant_method)
  if quant_method is None:
  quant_method = UnquantizedEmbeddingMethod()

@@ -1,23 +1,20 @@
- from .base_backend import BaseLoRABackend
- from .flashinfer_backend import FlashInferLoRABackend
- from .triton_backend import TritonLoRABackend
+ from sglang.srt.lora.backend.base_backend import BaseLoRABackend


  def get_backend_from_name(name: str) -> BaseLoRABackend:
  """
  Get corresponding backend class from backend's name
  """
- backend_mapping = {
- "triton": TritonLoRABackend,
- "flashinfer": FlashInferLoRABackend,
- }
+ if name == "triton":
+ from sglang.srt.lora.backend.triton_backend import TritonLoRABackend

- if name in backend_mapping:
- return backend_mapping[name]
+ return TritonLoRABackend
+ elif name == "flashinfer":
+ from sglang.srt.lora.backend.flashinfer_backend import FlashInferLoRABackend

- raise Exception(
- f"No supported lora backend called {name}. It should be one of {list(backend_mapping.keys())}"
- )
+ return FlashInferLoRABackend
+ else:
+ raise ValueError(f"Invalid backend: {name}")


  __all__ = [
@@ -22,11 +22,34 @@ from typing import List, Optional

  import torch

- from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool, MHATokenToKVPoolHost
+ from sglang.srt.mem_cache.memory_pool import (
+ MHATokenToKVPoolHost,
+ TokenToKVPoolAllocator,
+ )

  logger = logging.getLogger(__name__)


+ class LayerDoneCounter:
+ def __init__(self, num_layers):
+ self.counter = num_layers
+ self.condition = threading.Condition()
+
+ def increment(self):
+ with self.condition:
+ self.counter += 1
+ self.condition.notify_all()
+
+ def wait_until(self, threshold):
+ with self.condition:
+ while self.counter <= threshold:
+ self.condition.wait()
+
+ def reset(self):
+ with self.condition:
+ self.counter = 0
+
+
  class CacheOperation:

  counter = 0
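
The LayerDoneCounter added to cache_controller.py above is a condition-variable counter: the load thread increments it after each layer's KV transfer, and the compute side can wait_until(layer_id) before touching that layer. A toy producer/consumer sketch of the same pattern; the layer count and sleep are made up, and the class body mirrors the diff:

import threading
import time


class LayerDoneCounter:
    def __init__(self, num_layers):
        self.counter = num_layers
        self.condition = threading.Condition()

    def increment(self):
        with self.condition:
            self.counter += 1
            self.condition.notify_all()

    def wait_until(self, threshold):
        with self.condition:
            while self.counter <= threshold:
                self.condition.wait()

    def reset(self):
        with self.condition:
            self.counter = 0


counter = LayerDoneCounter(num_layers=4)
counter.reset()


def loader():
    for _ in range(4):
        time.sleep(0.01)  # pretend to copy one layer host -> device
        counter.increment()


threading.Thread(target=loader, daemon=True).start()
for layer_id in range(4):
    counter.wait_until(layer_id)  # block until this layer has been transferred
    print(f"layer {layer_id} ready")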
@@ -127,15 +150,20 @@ class HiCacheController:

  def __init__(
  self,
- mem_pool_device: MHATokenToKVPool,
+ token_to_kv_pool_allocator: TokenToKVPoolAllocator,
  mem_pool_host: MHATokenToKVPoolHost,
+ load_cache_event: threading.Event = None,
  write_policy: str = "write_through_selective",
  ):
-
- self.mem_pool_device = mem_pool_device
+ self.mem_pool_device_allocator = token_to_kv_pool_allocator
+ self.mem_pool_device = token_to_kv_pool_allocator.get_kvcache()
  self.mem_pool_host = mem_pool_host
  self.write_policy = write_policy

+ self.load_cache_event = load_cache_event
+ self.layer_done_counter = LayerDoneCounter(self.mem_pool_device.layer_num)
+ self.mem_pool_device.register_layer_transfer_counter(self.layer_done_counter)
+
  if write_policy not in [
  "write_through",
  "write_through_selective",
@@ -162,7 +190,7 @@ class HiCacheController:
  target=self.write_thread_func_buffer, daemon=True
  )
  self.load_thread = threading.Thread(
- target=self.load_thread_func_buffer, daemon=True
+ target=self.load_thread_func_layer_by_layer, daemon=True
  )
  self.write_thread.start()
  self.load_thread.start()
@@ -183,7 +211,7 @@ class HiCacheController:
  target=self.write_thread_func_buffer, daemon=True
  )
  self.load_thread = threading.Thread(
- target=self.load_thread_func_buffer, daemon=True
+ target=self.load_thread_func_layer_by_layer, daemon=True
  )
  self.stop_event.clear()
  self.write_thread.start()
@@ -216,10 +244,12 @@ class HiCacheController:
  """
  Load KV caches from host memory to device memory.
  """
- device_indices = self.mem_pool_device.alloc(len(host_indices))
+ device_indices = self.mem_pool_device_allocator.alloc(len(host_indices))
  if device_indices is None:
  return None
  self.mem_pool_host.protect_load(host_indices)
+ # to ensure the device indices are ready before accessed by another CUDA stream
+ torch.cuda.current_stream().synchronize()
  self.load_queue.put(
  CacheOperation(host_indices, device_indices, node_id, priority)
  )
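
The extra torch.cuda.current_stream().synchronize() above guards a cross-stream hazard: device_indices is produced on the default stream but consumed by the load thread's side stream. A small sketch of that ordering problem and the coarse fix, assuming a CUDA-capable torch build; the tensor names are illustrative:

import torch

if torch.cuda.is_available():
    side_stream = torch.cuda.Stream()

    # Produced on the default stream (stands in for freshly allocated indices).
    device_indices = torch.randperm(4096, device="cuda")

    # Coarse fix used in the diff: block until the default stream has finished,
    # so the side stream cannot observe half-written data.
    torch.cuda.current_stream().synchronize()

    with torch.cuda.stream(side_stream):
        staged = device_indices.to(torch.int64) + 1  # safe to read on the side stream

    # A finer-grained alternative is event ordering, which does not stall the host:
    ready = torch.cuda.Event()
    ready.record()                 # recorded on the current (default) stream
    side_stream.wait_event(ready)  # the side stream waits only for that event
    side_stream.synchronize()
    print(staged[:4])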
@@ -270,6 +300,42 @@ class HiCacheController:
  except Exception as e:
  logger.error(e)

+ def load_thread_func_layer_by_layer(self):
+ """
+ Load KV caches from host memory to device memory layer by layer.
+ """
+ with torch.cuda.stream(self.load_stream):
+ while not self.stop_event.is_set():
+ self.load_cache_event.wait(timeout=1)
+ if not self.load_cache_event.is_set():
+ continue
+ self.load_cache_event.clear()
+
+ batch_operation = None
+ while self.load_queue.qsize() > 0:
+ op = self.load_queue.get(block=True)
+ if batch_operation is None:
+ batch_operation = op
+ else:
+ batch_operation.merge(op)
+ if batch_operation is None:
+ continue
+
+ self.layer_done_counter.reset()
+ for i in range(self.mem_pool_host.layer_num):
+ flat_data = self.mem_pool_host.get_flat_data_by_layer(
+ batch_operation.host_indices, i
+ )
+ self.mem_pool_device.transfer_per_layer(
+ batch_operation.device_indices, flat_data, i
+ )
+ self.layer_done_counter.increment()
+
+ self.mem_pool_host.complete_io(batch_operation.host_indices)
+ for node_id in batch_operation.node_ids:
+ if node_id != 0:
+ self.ack_load_queue.put(node_id)
+
  def write_aux_func(self, no_wait=False):
  """
  Auxiliary function to prepare the buffer for write operations.
@@ -417,7 +483,7 @@ class HiCacheController:
  self, device_indices: torch.Tensor, host_indices: torch.Tensor
  ) -> int:
  if self.mem_pool_host.is_synced(host_indices):
- self.mem_pool_device.free(device_indices)
+ self.mem_pool_device_allocator.free(device_indices)
  self.mem_pool_host.update_backup(host_indices)
  return len(device_indices)
  else:
@@ -54,7 +54,7 @@ class LoadBalanceMethod(Enum):
  class DataParallelController:
  """A controller that dispatches requests to multiple data parallel workers."""

- def __init__(self, server_args, port_args) -> None:
+ def __init__(self, server_args: ServerArgs, port_args: PortArgs) -> None:
  # Parse args
  self.max_total_num_tokens = None
  self.server_args = server_args