sglang 0.3.6.post2__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +55 -2
- sglang/bench_one_batch.py +7 -6
- sglang/bench_one_batch_server.py +4 -3
- sglang/bench_serving.py +13 -0
- sglang/check_env.py +1 -1
- sglang/launch_server.py +3 -2
- sglang/srt/_custom_ops.py +118 -0
- sglang/srt/configs/device_config.py +17 -0
- sglang/srt/configs/load_config.py +84 -0
- sglang/srt/configs/model_config.py +161 -4
- sglang/srt/configs/qwen2vl.py +5 -8
- sglang/srt/constrained/outlines_backend.py +6 -1
- sglang/srt/constrained/outlines_jump_forward.py +8 -1
- sglang/srt/distributed/__init__.py +3 -0
- sglang/srt/distributed/communication_op.py +34 -0
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +182 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +352 -0
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +291 -0
- sglang/srt/distributed/device_communicators/hpu_communicator.py +48 -0
- sglang/srt/distributed/device_communicators/pynccl.py +204 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +362 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +568 -0
- sglang/srt/distributed/device_communicators/xpu_communicator.py +47 -0
- sglang/srt/distributed/parallel_state.py +1275 -0
- sglang/srt/distributed/utils.py +223 -0
- sglang/srt/hf_transformers_utils.py +37 -1
- sglang/srt/layers/attention/flashinfer_backend.py +13 -15
- sglang/srt/layers/attention/torch_native_backend.py +285 -0
- sglang/srt/layers/fused_moe_patch.py +20 -11
- sglang/srt/layers/linear.py +1 -0
- sglang/srt/layers/logits_processor.py +17 -3
- sglang/srt/layers/quantization/__init__.py +34 -0
- sglang/srt/layers/vocab_parallel_embedding.py +1 -0
- sglang/srt/lora/lora.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +7 -11
- sglang/srt/managers/detokenizer_manager.py +7 -4
- sglang/srt/managers/image_processor.py +1 -1
- sglang/srt/managers/io_struct.py +48 -12
- sglang/srt/managers/schedule_batch.py +42 -36
- sglang/srt/managers/schedule_policy.py +7 -4
- sglang/srt/managers/scheduler.py +111 -46
- sglang/srt/managers/session_controller.py +0 -3
- sglang/srt/managers/tokenizer_manager.py +169 -100
- sglang/srt/managers/tp_worker.py +36 -3
- sglang/srt/managers/tp_worker_overlap_thread.py +32 -5
- sglang/srt/model_executor/cuda_graph_runner.py +16 -7
- sglang/srt/model_executor/forward_batch_info.py +9 -4
- sglang/srt/model_executor/model_runner.py +136 -150
- sglang/srt/model_loader/__init__.py +34 -0
- sglang/srt/model_loader/loader.py +1139 -0
- sglang/srt/model_loader/utils.py +41 -0
- sglang/srt/model_loader/weight_utils.py +640 -0
- sglang/srt/models/baichuan.py +9 -10
- sglang/srt/models/chatglm.py +6 -15
- sglang/srt/models/commandr.py +2 -3
- sglang/srt/models/dbrx.py +2 -3
- sglang/srt/models/deepseek.py +4 -11
- sglang/srt/models/deepseek_v2.py +3 -11
- sglang/srt/models/exaone.py +2 -3
- sglang/srt/models/gemma.py +2 -6
- sglang/srt/models/gemma2.py +3 -14
- sglang/srt/models/gemma2_reward.py +0 -1
- sglang/srt/models/gpt2.py +5 -12
- sglang/srt/models/gpt_bigcode.py +6 -22
- sglang/srt/models/grok.py +14 -51
- sglang/srt/models/internlm2.py +2 -3
- sglang/srt/models/internlm2_reward.py +0 -1
- sglang/srt/models/llama.py +97 -27
- sglang/srt/models/llama_classification.py +1 -2
- sglang/srt/models/llama_embedding.py +1 -2
- sglang/srt/models/llama_reward.py +2 -3
- sglang/srt/models/llava.py +10 -12
- sglang/srt/models/llavavid.py +1 -2
- sglang/srt/models/minicpm.py +4 -7
- sglang/srt/models/minicpm3.py +6 -19
- sglang/srt/models/mixtral.py +12 -5
- sglang/srt/models/mixtral_quant.py +2 -3
- sglang/srt/models/mllama.py +3 -7
- sglang/srt/models/olmo.py +2 -8
- sglang/srt/models/olmo2.py +391 -0
- sglang/srt/models/olmoe.py +3 -5
- sglang/srt/models/phi3_small.py +8 -8
- sglang/srt/models/qwen.py +2 -3
- sglang/srt/models/qwen2.py +10 -9
- sglang/srt/models/qwen2_moe.py +4 -11
- sglang/srt/models/qwen2_vl.py +12 -9
- sglang/srt/models/registry.py +99 -0
- sglang/srt/models/stablelm.py +2 -3
- sglang/srt/models/torch_native_llama.py +6 -12
- sglang/srt/models/xverse.py +2 -4
- sglang/srt/models/xverse_moe.py +4 -11
- sglang/srt/models/yivl.py +2 -3
- sglang/srt/openai_api/adapter.py +10 -6
- sglang/srt/openai_api/protocol.py +1 -0
- sglang/srt/server.py +303 -204
- sglang/srt/server_args.py +65 -31
- sglang/srt/utils.py +253 -48
- sglang/test/test_utils.py +27 -7
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/METADATA +2 -1
- sglang-0.4.0.dist-info/RECORD +184 -0
- sglang/srt/layers/fused_moe_grok/__init__.py +0 -1
- sglang/srt/layers/fused_moe_grok/fused_moe.py +0 -692
- sglang/srt/layers/fused_moe_grok/layer.py +0 -630
- sglang-0.3.6.post2.dist-info/RECORD +0 -164
- {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/LICENSE +0 -0
- {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/WHEEL +0 -0
- {sglang-0.3.6.post2.dist-info → sglang-0.4.0.dist-info}/top_level.txt +0 -0
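Selected hunks from the changed files follow.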
sglang/srt/layers/logits_processor.py
CHANGED
@@ -23,6 +23,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
 )

+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode


@@ -163,7 +164,7 @@ class LogitsProcessor(nn.Module):
         self,
         input_ids,
         hidden_states,
-
+        lm_head: VocabParallelEmbedding,
         logits_metadata: Union[LogitsMetadata, ForwardBatch],
     ):
         if isinstance(logits_metadata, ForwardBatch):
@@ -178,7 +179,7 @@ class LogitsProcessor(nn.Module):
             last_index = torch.cumsum(logits_metadata.extend_seq_lens, dim=0) - 1
             last_hidden = hidden_states[last_index]

-        last_logits =
+        last_logits = self._get_logits(last_hidden, lm_head)
         if self.do_tensor_parallel_all_gather:
             last_logits = tensor_model_parallel_all_gather(last_logits)
         last_logits = last_logits[:, : self.config.vocab_size].float()
@@ -229,7 +230,7 @@ class LogitsProcessor(nn.Module):

             # Compute the logits and logprobs for all required tokens
             states = torch.cat(states, dim=0)
-            all_logits =
+            all_logits = self._get_logits(states, lm_head)
             if self.do_tensor_parallel_all_gather:
                 all_logits = tensor_model_parallel_all_gather(all_logits)
             all_logits = all_logits[:, : self.config.vocab_size].float()
@@ -276,6 +277,19 @@ class LogitsProcessor(nn.Module):
             output_top_logprobs=output_top_logprobs,
         )

+    def _get_logits(
+        self,
+        hidden_states: torch.Tensor,
+        lm_head: VocabParallelEmbedding,
+        embedding_bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        if hasattr(lm_head, "weight"):
+            logits = torch.matmul(hidden_states, lm_head.weight.T)
+        else:
+            # GGUF models
+            logits = lm_head.linear_method.apply(lm_head, hidden_states, embedding_bias)
+        return logits
+

 def test():
     all_logprobs = torch.tensor(
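For orientation, here is a minimal, self-contained sketch of the dispatch the new _get_logits helper performs (DenseLMHead and get_logits below are made-up stand-ins, not sglang types): a head that exposes a weight tensor gets a plain matmul against the hidden states, while a quantized head (e.g. GGUF) is expected to provide a linear_method instead.

import torch

class DenseLMHead:
    # Stand-in for a vocab-parallel head with a materialized weight.
    def __init__(self, vocab_size: int, hidden_size: int):
        self.weight = torch.randn(vocab_size, hidden_size)

def get_logits(hidden_states: torch.Tensor, lm_head) -> torch.Tensor:
    if hasattr(lm_head, "weight"):
        # Dense path: project hidden states onto the vocabulary.
        return torch.matmul(hidden_states, lm_head.weight.T)
    # Quantized (e.g. GGUF) path: the head applies its own linear method.
    return lm_head.linear_method.apply(lm_head, hidden_states, None)

hidden = torch.randn(4, 16)  # [num_tokens, hidden_size]
print(get_logits(hidden, DenseLMHead(vocab_size=32, hidden_size=16)).shape)
# torch.Size([4, 32])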
sglang/srt/layers/quantization/__init__.py
CHANGED
@@ -117,10 +117,44 @@ def fp8_get_quant_method(self, layer, prefix):
     return None


+def gptq_get_quant_method(self, layer, prefix):
+    from vllm.model_executor.layers.linear import LinearBase
+    from vllm.model_executor.layers.quantization.gptq_marlin import (
+        GPTQMarlinLinearMethod,
+        GPTQMarlinMoEMethod,
+    )
+
+    from sglang.srt.layers.fused_moe_triton.layer import FusedMoE
+
+    if isinstance(layer, LinearBase):
+        return GPTQMarlinLinearMethod(self)
+    elif isinstance(layer, FusedMoE):
+        return GPTQMarlinMoEMethod(self)
+    return None
+
+
+def awq_get_quant_method(self, layer, prefix):
+    from vllm.model_executor.layers.linear import LinearBase
+    from vllm.model_executor.layers.quantization.awq_marlin import (
+        AWQMarlinLinearMethod,
+        AWQMoEMethod,
+    )
+
+    from sglang.srt.layers.fused_moe_triton.layer import FusedMoE
+
+    if isinstance(layer, LinearBase):
+        return AWQMarlinLinearMethod(self)
+    elif isinstance(layer, FusedMoE):
+        return AWQMoEMethod(self)
+    return None
+
+
 def apply_monkey_patches():
     """Apply all monkey patches in one place."""
     setattr(Fp8MoEMethod, "apply", fp8_moe_apply)
     setattr(Fp8Config, "get_quant_method", fp8_get_quant_method)
+    setattr(GPTQMarlinConfig, "get_quant_method", gptq_get_quant_method)
+    setattr(AWQMarlinConfig, "get_quant_method", awq_get_quant_method)


 # Apply patches when module is imported
sglang/srt/lora/lora.py
CHANGED
@@ -31,7 +31,6 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
     ParallelLMHead,
     VocabParallelEmbedding,
 )
-from vllm.model_executor.model_loader.loader import DefaultModelLoader

 from sglang.srt.layers.linear import (
     ColumnParallelLinear,
@@ -40,6 +39,7 @@ from sglang.srt.layers.linear import (
     RowParallelLinear,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+from sglang.srt.model_loader.loader import DefaultModelLoader


 class BaseLayerWithLoRA(nn.Module):
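This pairs with the new vendored loader package in the file list above (sglang/srt/model_loader/loader.py, utils.py, weight_utils.py): DefaultModelLoader is now imported from sglang itself rather than from vllm.model_executor.model_loader.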
sglang/srt/managers/data_parallel_controller.py
CHANGED
@@ -15,9 +15,11 @@

 import logging
 import multiprocessing as mp
+import signal
 import threading
 from enum import Enum, auto

+import psutil
 import zmq

 from sglang.srt.managers.io_struct import (
@@ -26,13 +28,7 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import (
-    bind_port,
-    configure_logger,
-    get_zmq_socket,
-    kill_parent_process,
-    suppress_other_loggers,
-)
+from sglang.srt.utils import bind_port, configure_logger, get_zmq_socket
 from sglang.utils import get_exception_traceback

 logger = logging.getLogger(__name__)
@@ -235,7 +231,7 @@ def run_data_parallel_controller_process(
     pipe_writer,
 ):
     configure_logger(server_args)
-
+    parent_process = psutil.Process().parent()

     try:
         controller = DataParallelController(server_args, port_args)
@@ -244,6 +240,6 @@ def run_data_parallel_controller_process(
         )
         controller.event_loop()
     except Exception:
-
-        logger.error(
-
+        traceback = get_exception_traceback()
+        logger.error(f"DataParallelController hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)
sglang/srt/managers/detokenizer_manager.py
CHANGED
@@ -15,9 +15,11 @@

 import dataclasses
 import logging
+import signal
 from collections import OrderedDict
 from typing import List, Union

+import psutil
 import zmq

 from sglang.srt.hf_transformers_utils import get_tokenizer
@@ -28,7 +30,7 @@ from sglang.srt.managers.io_struct import (
 )
 from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR, FINISH_MATCHED_TOKEN
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.srt.utils import configure_logger, get_zmq_socket
+from sglang.srt.utils import configure_logger, get_zmq_socket
 from sglang.utils import find_printable_text, get_exception_traceback

 logger = logging.getLogger(__name__)
@@ -193,11 +195,12 @@ def run_detokenizer_process(
     port_args: PortArgs,
 ):
     configure_logger(server_args)
+    parent_process = psutil.Process().parent()

     try:
         manager = DetokenizerManager(server_args, port_args)
         manager.event_loop()
     except Exception:
-
-        logger.error(
-
+        traceback = get_exception_traceback()
+        logger.error(f"DetokenizerManager hit an exception: {traceback}")
+        parent_process.send_signal(signal.SIGQUIT)
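run_detokenizer_process adopts the same psutil/SIGQUIT failure handling shown for the data parallel controller above.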
sglang/srt/managers/image_processor.py
CHANGED
@@ -338,7 +338,7 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
             "pixel_values": pixel_values,
             "image_hashes": image_hashes,
             "image_sizes": image_sizes,
-            "modalities": request_obj.modalities,
+            "modalities": request_obj.modalities or ["image"],
             "image_grid_thws": image_grid_thws,
         }

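Since `or` tests truthiness, the new fallback applies when request_obj.modalities is None or an empty list, defaulting Qwen2-VL requests to the "image" modality.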
sglang/srt/managers/io_struct.py
CHANGED
@@ -352,7 +352,7 @@ class FlushCacheReq:


 @dataclass
-class UpdateWeightReqInput:
+class UpdateWeightFromDiskReqInput:
     # The model path with the new weights
     model_path: str
     # The format to load the weights
@@ -360,30 +360,66 @@ class UpdateWeightReqInput:


 @dataclass
-class UpdateWeightReqOutput:
+class UpdateWeightFromDiskReqOutput:
     success: bool
     message: str


 @dataclass
-class
-
-
+class UpdateWeightsFromDistributedReqInput:
+    name: str
+    dtype: str
+    shape: List[int]


-
-
-
+@dataclass
+class UpdateWeightsFromDistributedReqOutput:
+    success: bool
+    message: str


 @dataclass
-class
-
+class InitWeightsUpdateGroupReqInput:
+    # The master address
+    master_address: str
+    # The master port
+    master_port: int
+    # The rank offset
+    rank_offset: int
+    # The world size
+    world_size: int
+    # The group name
+    group_name: str = "weight_update_group"
+    # The backend
+    backend: str = "nccl"
+
+
+@dataclass
+class InitWeightsUpdateGroupReqOutput:
+    success: bool
+    message: str


 @dataclass
-class
-
+class GetWeightsByNameReqInput:
+    name: str
+    truncate_size: int = 100
+
+
+@dataclass
+class GetWeightsByNameReqOutput:
+    parameter: list
+
+
+@dataclass
+class AbortReq:
+    # The request id
+    rid: str
+
+
+class ProfileReq(Enum):
+    START_PROFILE = 1
+    STOP_PROFILE = 2


 @dataclass
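A hypothetical usage sketch of the new weight-sync request types; every field value below is illustrative, not a default taken from the source:

from sglang.srt.managers.io_struct import (
    InitWeightsUpdateGroupReqInput,
    UpdateWeightsFromDistributedReqInput,
)

# 1. Describe the process group that will carry the weight broadcasts,
#    e.g. 8 trainer ranks plus one inference rank appended at the end.
init_req = InitWeightsUpdateGroupReqInput(
    master_address="10.0.0.1",
    master_port=29500,
    rank_offset=8,
    world_size=9,
    group_name="weight_update_group",
    backend="nccl",
)

# 2. Announce one tensor to receive over that group.
update_req = UpdateWeightsFromDistributedReqInput(
    name="model.layers.0.self_attn.q_proj.weight",
    dtype="bfloat16",
    shape=[4096, 4096],
)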
sglang/srt/managers/schedule_batch.py
CHANGED
@@ -124,7 +124,7 @@ class FINISH_ABORT(BaseFinishReason):
 class ImageInputs:
     """The image related inputs."""

-    pixel_values: torch.Tensor
+    pixel_values: Union[torch.Tensor, np.array]
     image_hashes: Optional[list] = None
     image_sizes: Optional[list] = None
     image_offsets: Optional[list] = None
@@ -132,7 +132,7 @@ class ImageInputs:
     modalities: Optional[list] = None
     num_image_tokens: Optional[int] = None

-
+    # Llava related
     aspect_ratio_ids: Optional[List[torch.Tensor]] = None
     aspect_ratio_mask: Optional[List[torch.Tensor]] = None

@@ -141,19 +141,17 @@ class ImageInputs:
     mrope_position_delta: Optional[torch.Tensor] = None

     @staticmethod
-    def from_dict(obj
-        # Use image hash as fake token_ids, which is then used for prefix matching
+    def from_dict(obj: dict):
         ret = ImageInputs(
             pixel_values=obj["pixel_values"],
-            image_hashes=
+            image_hashes=obj["image_hashes"],
         )
-
-
-
-
-
-
-        ]
+
+        # Use image hash as fake token_ids. We use this as the key for prefix matching in the radix cache.
+        # Please note that if the `input_ids` is later used in the model forward,
+        # you also need to clamp the values within the range of [0, vocab_size) to avoid out-of-bound
+        # errors in cuda kernels. See also llava.py for example.
+        ret.pad_values = [x % (1 << 30) for x in ret.image_hashes]

         optional_args = [
             "image_sizes",
@@ -168,17 +166,16 @@ class ImageInputs:

         return ret

-    def merge(self, other
+    def merge(self, other):
         assert self.pixel_values.shape[1:] == other.pixel_values.shape[1:]
         self.pixel_values = np.concatenate([self.pixel_values, other.pixel_values])
-        self.image_hashes += other.image_hashes

-
-
-
-
-        ]
+        # Use image hash as fake token_ids. We use this as the key for prefix matching in the radix cache.
+        # Please note that if the `input_ids` is later used in the model forward,
+        # you also need to clamp the values within the range of [0, vocab_size) to avoid out-of-bound
+        # errors in cuda kernels. See also llava.py for example.
+        self.image_hashes += other.image_hashes
+        self.pad_values = [x % (1 << 30) for x in self.image_hashes]

         optional_args = [
             "image_sizes",
@@ -231,6 +228,7 @@ class Req:
         self.tokenizer = None
         self.finished_reason = None
         self.stream = False
+        self.to_abort = False

         # For incremental decoding
         # ----- | --------- read_ids -------|
@@ -290,11 +288,11 @@ class Req:
         # The number of cached tokens, that were already cached in the KV cache
         self.cached_tokens = 0

-    def extend_image_inputs(self, image_inputs
+    def extend_image_inputs(self, image_inputs):
         if self.image_inputs is None:
             self.image_inputs = image_inputs
         else:
-            self.image_inputs.merge(image_inputs
+            self.image_inputs.merge(image_inputs)

     # whether request reached finished condition
     def finished(self) -> bool:
@@ -368,6 +366,10 @@ class Req:
         if self.finished():
             return

+        if self.to_abort:
+            self.finished_reason = FINISH_ABORT()
+            return
+
         if len(self.output_ids) >= self.sampling_params.max_new_tokens:
             self.finished_reason = FINISH_LENGTH(
                 length=self.sampling_params.max_new_tokens
@@ -741,20 +743,24 @@ class ScheduleBatch:
         extend_lens = torch.tensor(self.extend_lens, dtype=torch.int32).to(
             self.device, non_blocking=True
         )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if global_server_args_dict["attention_backend"] != "torch_native":
+            write_req_to_token_pool_triton[(bs,)](
+                self.req_to_token_pool.req_to_token,
+                self.req_pool_indices,
+                pre_lens,
+                self.seq_lens,
+                extend_lens,
+                self.out_cache_loc,
+                self.req_to_token_pool.req_to_token.shape[1],
+            )
+        else:
+            pt = 0
+            for i in range(bs):
+                self.req_to_token_pool.write(
+                    (self.req_pool_indices[i], slice(pre_lens[i], self.seq_lens[i])),
+                    self.out_cache_loc[pt : pt + self.extend_lens[i]],
+                )
+                pt += self.extend_lens[i]
         # TODO: some tensors can be reused for ForwardBatchInfo (e.g., extend_lens, cumsum_start)

         if self.model_config.is_encoder_decoder:
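A small sketch of the padding scheme the new comments describe, using made-up hash values: each image hash is folded into [0, 2**30) so it can serve as a fake token id for radix-cache prefix matching, and must still be clamped to [0, vocab_size) before any real embedding lookup (the comments point to llava.py for this):

image_hashes = [0xDEADBEEFCAFEBABE, 0x1234567890ABCDEF]
pad_values = [h % (1 << 30) for h in image_hashes]  # deterministic per image, < 2**30

vocab_size = 32000
input_ids = [min(v, vocab_size - 1) for v in pad_values]  # clamp before embedding
print(pad_values, input_ids)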
sglang/srt/managers/schedule_policy.py
CHANGED
@@ -142,7 +142,7 @@ class PrefillAdder:

         self.req_states = None
         self.can_run_list = []
-        self.
+        self.new_being_chunked_req = None
         self.log_hit_tokens = 0
         self.log_input_tokens = 0

@@ -182,7 +182,7 @@ class PrefillAdder:
         self.log_hit_tokens += prefix_len
         self.log_input_tokens += extend_input_len

-    def
+    def add_being_chunked_req(self, req: Req):
         truncated = req.extend_input_len > self.rem_chunk_tokens
         req.extend_input_len = min(req.extend_input_len, self.rem_chunk_tokens)
         req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
@@ -269,10 +269,13 @@ class PrefillAdder:
         else:
             # Chunked prefill
             trunc_len = self.rem_chunk_tokens
+            if trunc_len == 0:
+                return AddReqResult.OTHER
+
             req.extend_input_len = trunc_len
             req.fill_ids = req.fill_ids[:trunc_len]
             self.can_run_list.append(req)
-            self.
+            self.new_being_chunked_req = req
             self._prefill_one_req(0, trunc_len, 0)

         return self.budget_state()
@@ -326,7 +329,7 @@ class PrefillAdder:
         req.extend_input_len = trunc_len
         req.fill_ids = req.fill_ids[: len(req.prefix_indices) + trunc_len]
         self.can_run_list.append(req)
-        self.
+        self.new_being_chunked_req = req
         self.tree_cache.inc_lock_ref(req.last_node)
         self._prefill_one_req(prefix_len, trunc_len, 0)
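A toy illustration of why the new trunc_len == 0 guard matters (hypothetical helper, heavily simplified from PrefillAdder): once the chunk budget is exhausted, scheduling a zero-length extend would make no progress, so the adder now answers AddReqResult.OTHER and leaves the request for a later round.

def plan_chunk(extend_input_len: int, rem_chunk_tokens: int):
    trunc_len = rem_chunk_tokens
    if trunc_len == 0:
        return None  # maps to AddReqResult.OTHER in the real code
    return min(extend_input_len, trunc_len)

print(plan_chunk(4096, 2048))  # 2048: schedule the first chunk of a long prefill
print(plan_chunk(4096, 0))     # None: budget exhausted, try again next round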