sglang 0.3.6.post3__py3-none-any.whl → 0.4.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116)
  1. sglang/__init__.py +1 -1
  2. sglang/bench_one_batch.py +4 -0
  3. sglang/bench_serving.py +13 -0
  4. sglang/check_env.py +1 -1
  5. sglang/srt/_custom_ops.py +118 -0
  6. sglang/srt/configs/device_config.py +17 -0
  7. sglang/srt/configs/load_config.py +84 -0
  8. sglang/srt/configs/model_config.py +161 -4
  9. sglang/srt/configs/qwen2vl.py +5 -8
  10. sglang/srt/constrained/outlines_backend.py +11 -1
  11. sglang/srt/constrained/outlines_jump_forward.py +8 -1
  12. sglang/srt/constrained/xgrammar_backend.py +5 -5
  13. sglang/srt/distributed/__init__.py +3 -0
  14. sglang/srt/distributed/communication_op.py +34 -0
  15. sglang/srt/distributed/device_communicators/__init__.py +0 -0
  16. sglang/srt/distributed/device_communicators/cuda_wrapper.py +182 -0
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +352 -0
  18. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +291 -0
  19. sglang/srt/distributed/device_communicators/hpu_communicator.py +48 -0
  20. sglang/srt/distributed/device_communicators/pynccl.py +204 -0
  21. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +362 -0
  22. sglang/srt/distributed/device_communicators/shm_broadcast.py +568 -0
  23. sglang/srt/distributed/device_communicators/xpu_communicator.py +47 -0
  24. sglang/srt/distributed/parallel_state.py +1275 -0
  25. sglang/srt/distributed/utils.py +223 -0
  26. sglang/srt/hf_transformers_utils.py +37 -1
  27. sglang/srt/layers/attention/__init__.py +5 -2
  28. sglang/srt/layers/attention/double_sparsity_backend.py +22 -8
  29. sglang/srt/layers/attention/flashinfer_backend.py +33 -20
  30. sglang/srt/layers/attention/torch_native_backend.py +299 -0
  31. sglang/srt/layers/attention/triton_backend.py +22 -8
  32. sglang/srt/layers/attention/triton_ops/extend_attention.py +3 -0
  33. sglang/srt/layers/ep_moe/__init__.py +0 -0
  34. sglang/srt/layers/ep_moe/kernels.py +349 -0
  35. sglang/srt/layers/ep_moe/layer.py +661 -0
  36. sglang/srt/layers/fused_moe_patch.py +20 -11
  37. sglang/srt/layers/linear.py +1 -0
  38. sglang/srt/layers/logits_processor.py +17 -3
  39. sglang/srt/layers/quantization/__init__.py +36 -2
  40. sglang/srt/layers/quantization/fp8.py +559 -0
  41. sglang/srt/layers/quantization/fp8_utils.py +27 -0
  42. sglang/srt/layers/radix_attention.py +4 -2
  43. sglang/srt/layers/sampler.py +2 -0
  44. sglang/srt/layers/torchao_utils.py +23 -45
  45. sglang/srt/layers/vocab_parallel_embedding.py +1 -0
  46. sglang/srt/lora/lora.py +1 -1
  47. sglang/srt/managers/io_struct.py +48 -2
  48. sglang/srt/managers/schedule_batch.py +19 -14
  49. sglang/srt/managers/schedule_policy.py +7 -4
  50. sglang/srt/managers/scheduler.py +145 -85
  51. sglang/srt/managers/tokenizer_manager.py +166 -68
  52. sglang/srt/managers/tp_worker.py +36 -3
  53. sglang/srt/managers/tp_worker_overlap_thread.py +28 -8
  54. sglang/srt/mem_cache/memory_pool.py +5 -1
  55. sglang/srt/model_executor/cuda_graph_runner.py +30 -7
  56. sglang/srt/model_executor/forward_batch_info.py +9 -4
  57. sglang/srt/model_executor/model_runner.py +146 -153
  58. sglang/srt/model_loader/__init__.py +34 -0
  59. sglang/srt/model_loader/loader.py +1139 -0
  60. sglang/srt/model_loader/utils.py +41 -0
  61. sglang/srt/model_loader/weight_utils.py +640 -0
  62. sglang/srt/model_parallel.py +1 -5
  63. sglang/srt/models/baichuan.py +9 -10
  64. sglang/srt/models/chatglm.py +6 -15
  65. sglang/srt/models/commandr.py +4 -5
  66. sglang/srt/models/dbrx.py +2 -3
  67. sglang/srt/models/deepseek.py +4 -11
  68. sglang/srt/models/deepseek_v2.py +90 -18
  69. sglang/srt/models/exaone.py +2 -3
  70. sglang/srt/models/gemma.py +2 -6
  71. sglang/srt/models/gemma2.py +3 -14
  72. sglang/srt/models/gemma2_reward.py +0 -1
  73. sglang/srt/models/gpt2.py +5 -12
  74. sglang/srt/models/gpt_bigcode.py +6 -22
  75. sglang/srt/models/grok.py +3 -8
  76. sglang/srt/models/internlm2.py +2 -3
  77. sglang/srt/models/internlm2_reward.py +0 -1
  78. sglang/srt/models/llama.py +96 -31
  79. sglang/srt/models/llama_classification.py +1 -2
  80. sglang/srt/models/llama_embedding.py +1 -2
  81. sglang/srt/models/llama_reward.py +2 -3
  82. sglang/srt/models/llava.py +1 -4
  83. sglang/srt/models/llavavid.py +1 -2
  84. sglang/srt/models/minicpm.py +4 -7
  85. sglang/srt/models/minicpm3.py +6 -19
  86. sglang/srt/models/mixtral.py +24 -14
  87. sglang/srt/models/mixtral_quant.py +2 -3
  88. sglang/srt/models/mllama.py +3 -7
  89. sglang/srt/models/olmo.py +2 -8
  90. sglang/srt/models/olmo2.py +0 -1
  91. sglang/srt/models/olmoe.py +3 -5
  92. sglang/srt/models/phi3_small.py +8 -13
  93. sglang/srt/models/qwen.py +2 -3
  94. sglang/srt/models/qwen2.py +10 -9
  95. sglang/srt/models/qwen2_moe.py +4 -16
  96. sglang/srt/models/qwen2_vl.py +2 -6
  97. sglang/srt/models/registry.py +99 -0
  98. sglang/srt/models/stablelm.py +2 -3
  99. sglang/srt/models/torch_native_llama.py +6 -17
  100. sglang/srt/models/xverse.py +2 -4
  101. sglang/srt/models/xverse_moe.py +4 -11
  102. sglang/srt/models/yivl.py +2 -3
  103. sglang/srt/openai_api/adapter.py +9 -5
  104. sglang/srt/openai_api/protocol.py +1 -0
  105. sglang/srt/sampling/sampling_batch_info.py +9 -8
  106. sglang/srt/server.py +270 -173
  107. sglang/srt/server_args.py +102 -29
  108. sglang/srt/utils.py +295 -28
  109. sglang/test/test_utils.py +7 -0
  110. sglang/version.py +1 -1
  111. {sglang-0.3.6.post3.dist-info → sglang-0.4.0.post1.dist-info}/METADATA +5 -4
  112. sglang-0.4.0.post1.dist-info/RECORD +189 -0
  113. sglang-0.3.6.post3.dist-info/RECORD +0 -162
  114. {sglang-0.3.6.post3.dist-info → sglang-0.4.0.post1.dist-info}/LICENSE +0 -0
  115. {sglang-0.3.6.post3.dist-info → sglang-0.4.0.post1.dist-info}/WHEEL +0 -0
  116. {sglang-0.3.6.post3.dist-info → sglang-0.4.0.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/fp8_utils.py ADDED
@@ -0,0 +1,27 @@
+ from typing import Optional, Tuple
+
+ import torch
+
+
+ def normalize_e4m3fn_to_e4m3fnuz(
+     weight: torch.Tensor,
+     weight_scale: torch.Tensor,
+     input_scale: Optional[torch.Tensor] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+     assert weight.dtype == torch.float8_e4m3fn
+     # The bits pattern 10000000(-128) represents zero in e4m3fn
+     # but NaN in e4m3fnuz. So here we set it to 0.
+     # https://onnx.ai/onnx/technical/float8.html
+     weight_as_int8 = weight.view(torch.int8)
+     ROCM_FP8_NAN_AS_INT = -128
+     weight_as_int8[weight_as_int8 == ROCM_FP8_NAN_AS_INT] = 0
+     weight = weight_as_int8.view(torch.float8_e4m3fnuz)
+
+     # For the same bits representation, e4m3fnuz value is half of
+     # the e4m3fn value, so we should double the scaling factor to
+     # get the same dequantized value.
+     # https://onnx.ai/onnx/technical/float8.html
+     weight_scale = weight_scale * 2.0
+     if input_scale is not None:
+         input_scale = input_scale * 2.0
+     return weight, weight_scale, input_scale
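This new helper rewrites FP8 weights from the e4m3fn encoding to the e4m3fnuz encoding used on ROCm: the -128 bit pattern (zero in e4m3fn, NaN in e4m3fnuz) is cleared, and the scales are doubled because the same bit pattern represents half the value in e4m3fnuz. A minimal usage sketch, assuming PyTorch 2.1+ float8 dtypes and that the function lives in sglang/srt/layers/quantization/fp8_utils.py as the file list above suggests:

    import torch
    from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz

    # Illustrative fp8 weight and per-tensor scale; real values come from a checkpoint.
    weight = torch.randn(128, 256).to(torch.float8_e4m3fn)
    weight_scale = torch.tensor(0.05)

    weight_fnuz, scale_fnuz, _ = normalize_e4m3fn_to_e4m3fnuz(weight, weight_scale)
    assert weight_fnuz.dtype == torch.float8_e4m3fnuz
    assert torch.isclose(scale_fnuz, weight_scale * 2.0)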
sglang/srt/layers/radix_attention.py CHANGED
@@ -48,11 +48,13 @@ class RadixAttention(nn.Module):
          self.sliding_window_size = sliding_window_size or -1
          self.is_cross_attention = is_cross_attention

-     def forward(self, q, k, v, forward_batch: ForwardBatch):
+     def forward(self, q, k, v, forward_batch: ForwardBatch, save_kv_cache=True):
          if k is not None:
              # For cross-layer sharing, kv can be None
              assert v is not None
              k = k.view(-1, self.tp_k_head_num, self.qk_head_dim)
              v = v.view(-1, self.tp_v_head_num, self.v_head_dim)

-         return forward_batch.attn_backend.forward(q, k, v, self, forward_batch)
+         return forward_batch.attn_backend.forward(
+             q, k, v, self, forward_batch, save_kv_cache
+         )
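RadixAttention.forward now threads a save_kv_cache flag (default True) through to the attention backend, so existing call sites keep writing new k/v into the pool, while callers whose tokens are already cached can skip the write. A hedged sketch of the calling convention; the backend below is a toy stand-in, not the real sglang backend API:

    import torch

    class ToyAttentionBackend:
        # Illustrative stand-in for an attention backend that honors save_kv_cache.
        def __init__(self):
            self.kv_writes = 0

        def forward(self, q, k, v, layer, forward_batch, save_kv_cache=True):
            if k is not None and save_kv_cache:
                self.kv_writes += 1  # a real backend would write k/v into the KV pool here
            return q  # a real backend would run the attention kernel here

    backend = ToyAttentionBackend()
    q = torch.randn(4, 8)
    backend.forward(q, q, q, layer=None, forward_batch=None)                       # cache written
    backend.forward(q, q, q, layer=None, forward_batch=None, save_kv_cache=False)  # write skipped
    assert backend.kv_writes == 1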
sglang/srt/layers/sampler.py CHANGED
@@ -111,5 +111,7 @@ def top_k_top_p_min_p_sampling_from_probs_torch(
      probs_sort[probs_sort < min_p_thresholds.view(-1, 1)] = 0.0
      probs_sort.div_(probs_sort.max(dim=-1, keepdim=True)[0])
      sampled_index = torch.multinomial(probs_sort, num_samples=1)
+     # int32 range is enough to represent the token ids
+     probs_idx = probs_idx.to(torch.int32)
      batch_next_token_ids = torch.gather(probs_idx, dim=1, index=sampled_index).view(-1)
      return batch_next_token_ids
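The added cast narrows probs_idx to int32 before the gather: token ids fit comfortably in int32, and torch.gather only constrains the dtype of the index tensor (sampled_index), not of the source being gathered from. A small illustrative sketch of the pattern:

    import torch

    probs = torch.tensor([[0.1, 0.7, 0.2]])
    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
    sampled_index = torch.multinomial(probs_sort, num_samples=1)  # int64 indices

    # Narrow the token-id table; the gather output inherits the int32 dtype.
    probs_idx = probs_idx.to(torch.int32)
    next_token_ids = torch.gather(probs_idx, dim=1, index=sampled_index).view(-1)
    assert next_token_ids.dtype == torch.int32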
sglang/srt/layers/torchao_utils.py CHANGED
@@ -2,23 +2,24 @@
  Common utilities for torchao.
  """

- from typing import Dict, Set
-
  import torch


- def torchao_quantize_param_data(param: torch.Tensor, torchao_config: str):
-     """Quantize a Tensor with torchao quantization specified by torchao_config
+ def apply_torchao_config_to_model(
+     model: torch.nn.Module, torchao_config: str, filter_fn=None
+ ):
+     """Quantize a modelwith torchao quantization specified by torchao_config

      Args:
-        `param`: weight parameter of the linear module
-        `torchao_config`: type of quantization and their arguments we want to use to
-         quantize the Tensor, e.g. int4wo-128 means int4 weight only quantization with group_size
+        `model`: a model to be quantized based on torchao_config
+        `torchao_config` (str): type of quantization and their arguments we want to use to
+         quantize the model, e.g. int4wo-128 means int4 weight only quantization with group_size
          128
      """
      # Lazy import to suppress some warnings
      from torchao.quantization import (
          float8_dynamic_activation_float8_weight,
+         float8_weight_only,
          int4_weight_only,
          int8_dynamic_activation_int8_weight,
          int8_weight_only,
@@ -26,12 +27,17 @@ def torchao_quantize_param_data(param: torch.Tensor, torchao_config: str):
      )
      from torchao.quantization.observer import PerRow, PerTensor

-     dummy_linear = torch.nn.Linear(param.shape[1], param.shape[0], bias=False)
-     dummy_linear.weight = param
-     if "int8wo" in torchao_config:
-         quantize_(dummy_linear, int8_weight_only())
+     if filter_fn is None:
+
+         def filter_fn(module, fqn):
+             return "proj" in fqn
+
+     if torchao_config == "" or torchao_config is None:
+         return model
+     elif "int8wo" in torchao_config:
+         quantize_(model, int8_weight_only(), filter_fn=filter_fn)
      elif "int8dq" in torchao_config:
-         quantize_(dummy_linear, int8_dynamic_activation_int8_weight())
+         quantize_(model, int8_dynamic_activation_int8_weight(), filter_fn=filter_fn)
      elif "int4wo" in torchao_config:
          group_size = int(torchao_config.split("-")[-1])
          assert group_size in [
@@ -40,13 +46,11 @@ def torchao_quantize_param_data(param: torch.Tensor, torchao_config: str):
              128,
              256,
          ], f"int4wo groupsize needs to be one of [32, 64, 128, 256] but got {group_size}"
-         quantize_(dummy_linear, int4_weight_only(group_size=group_size))
+         quantize_(model, int4_weight_only(group_size=group_size), filter_fn=filter_fn)
      elif "fp8wo" in torchao_config:
-         from torchao.quantization import float8_weight_only
-
          # this requires newer hardware
          # [rank0]: AssertionError: fp8e4nv data type is not supported on CUDA arch < 89
-         quantize_(dummy_linear, float8_weight_only())
+         quantize_(model, float8_weight_only(), filter_fn=filter_fn)
      elif "fp8dq" in torchao_config:
          granularity = torchao_config.split("-")[-1]
          GRANULARITY_MAP = {
@@ -57,39 +61,13 @@ def torchao_quantize_param_data(param: torch.Tensor, torchao_config: str):
              granularity in GRANULARITY_MAP
          ), f"Supported granularity are: {GRANULARITY_MAP.keys()}, got {granularity}"
          quantize_(
-             dummy_linear,
+             model,
              float8_dynamic_activation_float8_weight(
                  granularity=GRANULARITY_MAP[granularity]
              ),
+             filter_fn=filter_fn,
          )
      else:
          raise ValueError(f"Unexpected config: {torchao_config}")

-     return dummy_linear.weight
-
-
- def apply_torchao_config_(
-     self: torch.nn.Module,
-     params_dict: Dict[str, torch.Tensor],
-     param_suffixes: Set[str],
- ) -> None:
-     """A util function used for quantizing the weight parameters after they are loaded if
-     self.torchao_config is specified
-
-     Args:
-         `self`: the model we want to quantize
-         `params_dict`: dictionary mapping from param_name to the parameter Tensor
-         `param_suffixes`: a set of suffixes, we'll quantize the Tensor matching these suffixes
-
-     Returns:
-         None, the `params_dict` is modified inplace and the weights of `self` model are quantized
-     """
-     if self.torchao_config:
-         for param_suffix in param_suffixes:
-             for name in params_dict:
-                 param = params_dict[name]
-                 if param_suffix in name and param.ndim == 2:
-                     params_dict[name] = torchao_quantize_param_data(
-                         param, self.torchao_config
-                     )
-         self.load_state_dict(params_dict, assign=True)
+     return model
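The refactor replaces per-parameter quantization through a throwaway nn.Linear with a single whole-model quantize_ call, applied to modules whose fully qualified name contains "proj" unless a custom filter_fn is passed. A hedged usage sketch, assuming torchao is installed and that the import path matches this diff; the toy module below is illustrative:

    import torch
    from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model

    class ToyBlock(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.up_proj = torch.nn.Linear(64, 256, bias=False)  # matched by the default filter_fn
            self.norm = torch.nn.LayerNorm(64)                   # left in high precision

    model = ToyBlock()
    # int8 weight-only quantization for every module whose name contains "proj".
    apply_torchao_config_to_model(model, "int8wo")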
sglang/srt/layers/vocab_parallel_embedding.py CHANGED
@@ -222,6 +222,7 @@ class VocabParallelEmbedding(torch.nn.Module):
          enable_tp: bool = True,
      ):
          super().__init__()
+         self.quant_config = quant_config

          self.enable_tp = enable_tp
          if self.enable_tp:
sglang/srt/lora/lora.py CHANGED
@@ -31,7 +31,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
      ParallelLMHead,
      VocabParallelEmbedding,
  )
- from vllm.model_executor.model_loader.loader import DefaultModelLoader

  from sglang.srt.layers.linear import (
      ColumnParallelLinear,
@@ -40,6 +39,7 @@ from sglang.srt.layers.linear import (
      RowParallelLinear,
  )
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+ from sglang.srt.model_loader.loader import DefaultModelLoader


  class BaseLayerWithLoRA(nn.Module):
sglang/srt/managers/io_struct.py CHANGED
@@ -352,7 +352,7 @@ class FlushCacheReq:


  @dataclass
- class UpdateWeightReqInput:
+ class UpdateWeightFromDiskReqInput:
      # The model path with the new weights
      model_path: str
      # The format to load the weights
@@ -360,11 +360,57 @@ class UpdateWeightReqInput:


  @dataclass
- class UpdateWeightReqOutput:
+ class UpdateWeightFromDiskReqOutput:
      success: bool
      message: str


+ @dataclass
+ class UpdateWeightsFromDistributedReqInput:
+     name: str
+     dtype: str
+     shape: List[int]
+
+
+ @dataclass
+ class UpdateWeightsFromDistributedReqOutput:
+     success: bool
+     message: str
+
+
+ @dataclass
+ class InitWeightsUpdateGroupReqInput:
+     # The master address
+     master_address: str
+     # The master port
+     master_port: int
+     # The rank offset
+     rank_offset: int
+     # The world size
+     world_size: int
+     # The group name
+     group_name: str = "weight_update_group"
+     # The backend
+     backend: str = "nccl"
+
+
+ @dataclass
+ class InitWeightsUpdateGroupReqOutput:
+     success: bool
+     message: str
+
+
+ @dataclass
+ class GetWeightsByNameReqInput:
+     name: str
+     truncate_size: int = 100
+
+
+ @dataclass
+ class GetWeightsByNameReqOutput:
+     parameter: list
+
+
  @dataclass
  class AbortReq:
      # The request id
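These dataclasses extend the weight-update protocol beyond disk reloads: a trainer process can first ask the server to join a process group, then broadcast tensors by name, and read parameters back for verification. A hedged sketch of how a client might construct the new request objects (field values are illustrative; the RPC plumbing that carries them is not shown in this hunk):

    from sglang.srt.managers.io_struct import (
        GetWeightsByNameReqInput,
        InitWeightsUpdateGroupReqInput,
        UpdateWeightsFromDistributedReqInput,
    )

    # 1. Ask the inference server to join an NCCL group rooted at the trainer.
    init_req = InitWeightsUpdateGroupReqInput(
        master_address="10.0.0.1",   # illustrative address
        master_port=29500,
        rank_offset=1,               # inference ranks start after the trainer rank
        world_size=2,
        group_name="weight_update_group",
        backend="nccl",
    )

    # 2. Announce one tensor to be broadcast over that group.
    update_req = UpdateWeightsFromDistributedReqInput(
        name="model.layers.0.self_attn.q_proj.weight",  # illustrative parameter name
        dtype="bfloat16",
        shape=[4096, 4096],
    )

    # 3. Read back a truncated view of a named parameter for verification.
    get_req = GetWeightsByNameReqInput(name="lm_head.weight", truncate_size=8)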
sglang/srt/managers/schedule_batch.py CHANGED
@@ -58,6 +58,7 @@ global_server_args_dict = {
      "torchao_config": ServerArgs.torchao_config,
      "enable_nan_detection": ServerArgs.enable_nan_detection,
      "enable_dp_attention": ServerArgs.enable_dp_attention,
+     "enable_ep_moe": ServerArgs.enable_ep_moe,
  }


@@ -743,20 +744,24 @@ class ScheduleBatch:
          extend_lens = torch.tensor(self.extend_lens, dtype=torch.int32).to(
              self.device, non_blocking=True
          )
-         write_req_to_token_pool_triton[(bs,)](
-             self.req_to_token_pool.req_to_token,
-             self.req_pool_indices,
-             pre_lens,
-             self.seq_lens,
-             extend_lens,
-             self.out_cache_loc,
-             self.req_to_token_pool.req_to_token.shape[1],
-         )
-         # The triton kernel is equivalent to the following python code.
-         # self.req_to_token_pool.write(
-         #     (req.req_pool_idx, slice(pre_len, seq_len)),
-         #     out_cache_loc[pt : pt + req.extend_input_len],
-         # )
+         if global_server_args_dict["attention_backend"] != "torch_native":
+             write_req_to_token_pool_triton[(bs,)](
+                 self.req_to_token_pool.req_to_token,
+                 self.req_pool_indices,
+                 pre_lens,
+                 self.seq_lens,
+                 extend_lens,
+                 self.out_cache_loc,
+                 self.req_to_token_pool.req_to_token.shape[1],
+             )
+         else:
+             pt = 0
+             for i in range(bs):
+                 self.req_to_token_pool.write(
+                     (self.req_pool_indices[i], slice(pre_lens[i], self.seq_lens[i])),
+                     self.out_cache_loc[pt : pt + self.extend_lens[i]],
+                 )
+                 pt += self.extend_lens[i]
          # TODO: some tensors can be reused for ForwardBatchInfo (e.g., extend_lens, cumsum_start)

          if self.model_config.is_encoder_decoder:
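With the torch_native attention backend there is no Triton kernel, so the token-pool write falls back to a per-request Python loop: for request i, the newly allocated slots in out_cache_loc are written into row req_pool_indices[i] at positions [pre_len, seq_len). A self-contained sketch of that indexing pattern on a toy tensor (names mirror the diff, values are made up):

    import torch

    req_to_token = torch.zeros(4, 16, dtype=torch.int64)  # toy request-to-token pool

    req_pool_indices = [2, 0]               # pool rows owned by the two requests
    pre_lens = [3, 0]                       # tokens already cached per request
    seq_lens = [7, 5]                       # total tokens after this extend step
    extend_lens = [4, 5]                    # seq_len - pre_len
    out_cache_loc = torch.arange(100, 109)  # newly allocated KV slots, concatenated

    pt = 0
    for i in range(len(req_pool_indices)):
        row = req_pool_indices[i]
        req_to_token[row, pre_lens[i]:seq_lens[i]] = out_cache_loc[pt:pt + extend_lens[i]]
        pt += extend_lens[i]

    assert req_to_token[2, 3:7].tolist() == [100, 101, 102, 103]
    assert req_to_token[0, 0:5].tolist() == [104, 105, 106, 107, 108]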
sglang/srt/managers/schedule_policy.py CHANGED
@@ -142,7 +142,7 @@ class PrefillAdder:

          self.req_states = None
          self.can_run_list = []
-         self.new_inflight_req = None
+         self.new_being_chunked_req = None
          self.log_hit_tokens = 0
          self.log_input_tokens = 0

@@ -182,7 +182,7 @@ class PrefillAdder:
          self.log_hit_tokens += prefix_len
          self.log_input_tokens += extend_input_len

-     def add_inflight_req(self, req: Req):
+     def add_being_chunked_req(self, req: Req):
          truncated = req.extend_input_len > self.rem_chunk_tokens
          req.extend_input_len = min(req.extend_input_len, self.rem_chunk_tokens)
          req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
@@ -269,10 +269,13 @@ class PrefillAdder:
          else:
              # Chunked prefill
              trunc_len = self.rem_chunk_tokens
+             if trunc_len == 0:
+                 return AddReqResult.OTHER
+
              req.extend_input_len = trunc_len
              req.fill_ids = req.fill_ids[:trunc_len]
              self.can_run_list.append(req)
-             self.new_inflight_req = req
+             self.new_being_chunked_req = req
              self._prefill_one_req(0, trunc_len, 0)

          return self.budget_state()
@@ -326,7 +329,7 @@ class PrefillAdder:
          req.extend_input_len = trunc_len
          req.fill_ids = req.fill_ids[: len(req.prefix_indices) + trunc_len]
          self.can_run_list.append(req)
-         self.new_inflight_req = req
+         self.new_being_chunked_req = req
          self.tree_cache.inc_lock_ref(req.last_node)
          self._prefill_one_req(prefix_len, trunc_len, 0)