sglang 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +31 -13
- sglang/bench_server_latency.py +21 -10
- sglang/bench_serving.py +101 -7
- sglang/global_config.py +0 -1
- sglang/srt/conversation.py +11 -2
- sglang/srt/layers/attention/__init__.py +27 -5
- sglang/srt/layers/attention/double_sparsity_backend.py +281 -0
- sglang/srt/layers/attention/flashinfer_backend.py +352 -83
- sglang/srt/layers/attention/triton_backend.py +6 -4
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +772 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -3
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +4 -2
- sglang/srt/layers/sampler.py +6 -2
- sglang/srt/managers/data_parallel_controller.py +177 -0
- sglang/srt/managers/detokenizer_manager.py +31 -10
- sglang/srt/managers/io_struct.py +11 -2
- sglang/srt/managers/schedule_batch.py +126 -43
- sglang/srt/managers/schedule_policy.py +2 -1
- sglang/srt/managers/scheduler.py +245 -142
- sglang/srt/managers/tokenizer_manager.py +14 -1
- sglang/srt/managers/tp_worker.py +111 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -4
- sglang/srt/mem_cache/memory_pool.py +77 -4
- sglang/srt/mem_cache/radix_cache.py +15 -7
- sglang/srt/model_executor/cuda_graph_runner.py +4 -4
- sglang/srt/model_executor/forward_batch_info.py +16 -21
- sglang/srt/model_executor/model_runner.py +100 -36
- sglang/srt/models/baichuan.py +2 -3
- sglang/srt/models/chatglm.py +5 -6
- sglang/srt/models/commandr.py +1 -2
- sglang/srt/models/dbrx.py +1 -2
- sglang/srt/models/deepseek.py +4 -5
- sglang/srt/models/deepseek_v2.py +5 -6
- sglang/srt/models/exaone.py +1 -2
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +5 -5
- sglang/srt/models/gpt_bigcode.py +5 -5
- sglang/srt/models/grok.py +1 -2
- sglang/srt/models/internlm2.py +1 -2
- sglang/srt/models/llama.py +1 -2
- sglang/srt/models/llama_classification.py +1 -2
- sglang/srt/models/llama_reward.py +2 -3
- sglang/srt/models/llava.py +4 -8
- sglang/srt/models/llavavid.py +1 -2
- sglang/srt/models/minicpm.py +1 -2
- sglang/srt/models/minicpm3.py +5 -6
- sglang/srt/models/mixtral.py +1 -2
- sglang/srt/models/mixtral_quant.py +1 -2
- sglang/srt/models/olmo.py +352 -0
- sglang/srt/models/olmoe.py +1 -2
- sglang/srt/models/qwen.py +1 -2
- sglang/srt/models/qwen2.py +1 -2
- sglang/srt/models/qwen2_moe.py +4 -5
- sglang/srt/models/stablelm.py +1 -2
- sglang/srt/models/torch_native_llama.py +1 -2
- sglang/srt/models/xverse.py +1 -2
- sglang/srt/models/xverse_moe.py +4 -5
- sglang/srt/models/yivl.py +1 -2
- sglang/srt/openai_api/adapter.py +97 -52
- sglang/srt/openai_api/protocol.py +10 -2
- sglang/srt/sampling/penaltylib/orchestrator.py +28 -9
- sglang/srt/sampling/sampling_batch_info.py +105 -59
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server.py +171 -37
- sglang/srt/server_args.py +127 -48
- sglang/srt/utils.py +37 -14
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/few_shot_gsm8k_engine.py +144 -0
- sglang/test/srt/sampling/penaltylib/utils.py +16 -12
- sglang/version.py +1 -1
- {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/METADATA +82 -32
- sglang-0.3.4.dist-info/RECORD +143 -0
- {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/WHEEL +1 -1
- sglang/srt/layers/attention/flashinfer_utils.py +0 -237
- sglang-0.3.3.dist-info/RECORD +0 -139
- {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/LICENSE +0 -0
- {sglang-0.3.3.dist-info → sglang-0.3.4.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/triton_ops/extend_attention.py
CHANGED
@@ -26,7 +26,9 @@ from sglang.srt.layers.attention.triton_ops.prefill_attention import (
     context_attention_fwd,
 )
 
-CUDA_CAPABILITY = torch.cuda.get_device_capability()
+is_cuda_available = torch.cuda.is_available()
+if is_cuda_available:
+    CUDA_CAPABILITY = torch.cuda.get_device_capability()
 
 
 @triton.jit
@@ -286,12 +288,12 @@ def extend_attention_fwd(
     BLOCK_DPE = 0
     BLOCK_DV = triton.next_power_of_2(Lv)
 
-    if CUDA_CAPABILITY[0] >= 9:
+    if is_cuda_available and CUDA_CAPABILITY[0] >= 9:
         if Lq <= 256:
             BLOCK_M, BLOCK_N = (128, 64)
         else:
             BLOCK_M, BLOCK_N = (32, 64)
-    elif CUDA_CAPABILITY[0] >= 8:
+    elif is_cuda_available and CUDA_CAPABILITY[0] >= 8:
         if Lq <= 128:
             BLOCK_M, BLOCK_N = (128, 128)
         elif Lq <= 256:
sglang/srt/layers/attention/triton_ops/prefill_attention.py
CHANGED
@@ -24,7 +24,9 @@ import torch
 import triton
 import triton.language as tl
 
-CUDA_CAPABILITY = torch.cuda.get_device_capability()
+is_cuda_available = torch.cuda.is_available()
+if is_cuda_available:
+    CUDA_CAPABILITY = torch.cuda.get_device_capability()
 
 
 @triton.jit
@@ -145,7 +147,7 @@ def _fwd_kernel(
 
 
 def context_attention_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len):
-    if CUDA_CAPABILITY[0] >= 8:
+    if is_cuda_available and CUDA_CAPABILITY[0] >= 8:
         BLOCK = 128
     else:
         BLOCK = 64
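Both Triton files previously queried the device capability unconditionally at import time, which raises on hosts without CUDA; the new guard defers that query and the kernels fall back to the smaller block size when no capability is known. A minimal standalone sketch of the pattern (the block_size_for helper is illustrative and not part of sglang):

import torch

is_cuda_available = torch.cuda.is_available()
if is_cuda_available:
    CUDA_CAPABILITY = torch.cuda.get_device_capability()


def block_size_for(head_dim: int) -> int:
    # Illustrative only: pick a larger tile on SM80+ GPUs, otherwise fall back.
    if is_cuda_available and CUDA_CAPABILITY[0] >= 8:
        return 128
    return 64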
sglang/srt/layers/sampler.py
CHANGED
@@ -21,6 +21,10 @@ logger = logging.getLogger(__name__)
 
 
 class Sampler(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.use_nan_detectioin = not global_server_args_dict["disable_nan_detection"]
+
     def forward(
         self,
         logits: Union[torch.Tensor, LogitsProcessorOutput],
@@ -36,13 +40,13 @@ class Sampler(nn.Module):
         logits = None
         del logits
 
-        if torch.any(torch.isnan(probs)):
+        if self.use_nan_detectioin and torch.any(torch.isnan(probs)):
             logger.warning("Detected errors during sampling! NaN in the probability.")
             probs = torch.where(
                 torch.isnan(probs), torch.full_like(probs, 1e-10), probs
             )
 
-        if sampling_info.
+        if sampling_info.is_all_greedy:
             # Use torch.argmax if all requests use greedy sampling
             batch_next_token_ids = torch.argmax(probs, -1)
         elif global_server_args_dict["sampling_backend"] == "flashinfer":
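The sampler now reads the global disable_nan_detection flag once in its constructor and only scrubs NaNs when detection is enabled, and it short-circuits to argmax when every request in the batch is greedy. A toy sketch of the scrub-then-argmax path on a small made-up tensor, not sglang's actual batch:

import torch

probs = torch.tensor([[0.7, float("nan"), 0.3], [0.2, 0.5, 0.3]])

# Replace NaNs with a tiny probability, mirroring the scrub in Sampler.forward.
probs = torch.where(torch.isnan(probs), torch.full_like(probs, 1e-10), probs)

# When every request is greedy, sampling reduces to an argmax over the vocab dim.
next_token_ids = torch.argmax(probs, -1)
print(next_token_ids)  # tensor([0, 1])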
sglang/srt/managers/data_parallel_controller.py
ADDED
@@ -0,0 +1,177 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""A controller that dispatches requests to multiple data parallel workers."""
+
+import logging
+import multiprocessing as mp
+from enum import Enum, auto
+
+import zmq
+
+from sglang.srt.managers.io_struct import (
+    TokenizedEmbeddingReqInput,
+    TokenizedGenerateReqInput,
+    TokenizedRewardReqInput,
+)
+from sglang.srt.managers.scheduler import run_scheduler_process
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.utils import (
+    configure_logger,
+    kill_parent_process,
+    suppress_other_loggers,
+)
+from sglang.utils import get_exception_traceback
+
+logger = logging.getLogger(__name__)
+
+
+class LoadBalanceMethod(Enum):
+    """Load balance method."""
+
+    ROUND_ROBIN = auto()
+    SHORTEST_QUEUE = auto()
+
+    @classmethod
+    def from_str(cls, method: str):
+        method = method.upper()
+        try:
+            return cls[method]
+        except KeyError as exc:
+            raise ValueError(f"Invalid load balance method: {method}") from exc
+
+
+class DataParallelController:
+    """A controller that dispatches requests to multiple data parallel workers."""
+
+    def __init__(self, server_args, port_args) -> None:
+        # Parse args
+        self.server_args = server_args
+        self.port_args = port_args
+        self.load_balance_method = LoadBalanceMethod.from_str(
+            server_args.load_balance_method
+        )
+
+        # Init inter-process communication
+        self.context = zmq.Context(1 + server_args.dp_size)
+        self.recv_from_tokenizer = self.context.socket(zmq.PULL)
+        self.recv_from_tokenizer.bind(f"ipc://{port_args.scheduler_input_ipc_name}")
+
+        # Dispatch method
+        self.round_robin_counter = 0
+        dispatch_lookup = {
+            LoadBalanceMethod.ROUND_ROBIN: self.round_robin_scheduler,
+            LoadBalanceMethod.SHORTEST_QUEUE: self.shortest_queue_scheduler,
+        }
+        self.dispatching = dispatch_lookup[self.load_balance_method]
+
+        # Start data parallel workers
+        base_gpu_id = 0
+        self.workers = []
+        for dp_rank in range(server_args.dp_size):
+            tmp_port_args = PortArgs.init_new(server_args)
+            tmp_port_args.detokenizer_ipc_name = port_args.detokenizer_ipc_name
+
+            send_to = self.launch_tensor_parallel_group(
+                server_args,
+                tmp_port_args,
+                base_gpu_id,
+                dp_rank,
+            )
+
+            self.workers.append(send_to)
+            base_gpu_id += server_args.tp_size
+
+    def launch_tensor_parallel_group(
+        self,
+        server_args: ServerArgs,
+        port_args: PortArgs,
+        base_gpu_id: int,
+        dp_rank: int,
+    ):
+        # Launch tensor parallel scheduler processes
+        scheduler_procs = []
+        scheduler_pipe_readers = []
+        tp_size_per_node = server_args.tp_size // server_args.nnodes
+        tp_rank_range = range(
+            tp_size_per_node * server_args.node_rank,
+            tp_size_per_node * (server_args.node_rank + 1),
+        )
+        for tp_rank in tp_rank_range:
+            reader, writer = mp.Pipe(duplex=False)
+            gpu_id = base_gpu_id + tp_rank % tp_size_per_node
+            proc = mp.Process(
+                target=run_scheduler_process,
+                args=(server_args, port_args, gpu_id, tp_rank, dp_rank, writer),
+            )
+            proc.start()
+            scheduler_procs.append(proc)
+            scheduler_pipe_readers.append(reader)
+
+        send_to = self.context.socket(zmq.PUSH)
+        send_to.connect(f"ipc://{port_args.scheduler_input_ipc_name}")
+
+        # Wait for model to finish loading
+        for i in range(len(scheduler_pipe_readers)):
+            scheduler_pipe_readers[i].recv()
+
+        return send_to
+
+    def round_robin_scheduler(self, req):
+        self.workers[self.round_robin_counter].send_pyobj(req)
+        self.round_robin_counter = (self.round_robin_counter + 1) % len(self.workers)
+
+    def shortest_queue_scheduler(self, input_requests):
+        raise NotImplementedError()
+
+    def event_loop(self):
+        while True:
+            while True:
+                try:
+                    recv_req = self.recv_from_tokenizer.recv_pyobj(zmq.NOBLOCK)
+                except zmq.ZMQError:
+                    break
+
+                if isinstance(
+                    recv_req,
+                    (
+                        TokenizedGenerateReqInput,
+                        TokenizedEmbeddingReqInput,
+                        TokenizedRewardReqInput,
+                    ),
+                ):
+                    self.dispatching(recv_req)
+                else:
+                    # Send other control messages to all workers
+                    for worker in self.workers:
+                        worker.queue.put(recv_req)
+
+
+def run_data_parallel_controller_process(
+    server_args: ServerArgs,
+    port_args: PortArgs,
+    pipe_writer,
+):
+    configure_logger(server_args)
+    suppress_other_loggers()
+
+    try:
+        controller = DataParallelController(server_args, port_args)
+        pipe_writer.send("ready")
+        controller.event_loop()
+    except Exception:
+        msg = get_exception_traceback()
+        logger.error(msg)
+        kill_parent_process()
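The controller runs as its own process: run_data_parallel_controller_process builds the controller, reports readiness over a pipe, then pulls requests off the tokenizer socket forever, round-robining tokenized requests to one replica and broadcasting everything else. A hedged sketch of how a parent process could launch it; the real wiring lives in sglang/srt/server.py (also changed in this release but not shown here), and the model path below is only a placeholder:

import multiprocessing as mp

from sglang.srt.managers.data_parallel_controller import (
    run_data_parallel_controller_process,
)
from sglang.srt.server_args import PortArgs, ServerArgs

# Sketch only: arguments for two data parallel replicas.
server_args = ServerArgs(model_path="/path/to/model", dp_size=2)
port_args = PortArgs.init_new(server_args)

reader, writer = mp.Pipe(duplex=False)
proc = mp.Process(
    target=run_data_parallel_controller_process,
    args=(server_args, port_args, writer),
)
proc.start()
reader.recv()  # blocks until the controller has launched its replicas and reports "ready"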
sglang/srt/managers/detokenizer_manager.py
CHANGED
@@ -18,7 +18,7 @@ limitations under the License.
 import dataclasses
 import logging
 from collections import OrderedDict
-from typing import List
+from typing import List, Union
 
 import zmq
 
@@ -29,7 +29,7 @@ from sglang.srt.managers.io_struct import (
     BatchTokenIDOut,
     UpdateWeightReqOutput,
 )
-from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR
+from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR, FINISH_MATCHED_TOKEN
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import configure_logger, kill_parent_process
 from sglang.utils import find_printable_text, get_exception_traceback
@@ -75,6 +75,21 @@ class DetokenizerManager:
 
         self.decode_status = LimitedCapacityDict()
 
+    def trim_eos(self, output: Union[str, List[int]], finished_reason, no_stop_trim):
+        if no_stop_trim:
+            return output
+
+        # Trim stop str. TODO(lmzheng): handle the case where multiple stop strs are hit
+        if isinstance(finished_reason, FINISH_MATCHED_STR) and isinstance(output, str):
+            pos = output.find(finished_reason.matched)
+            return output[:pos] if pos != -1 else output
+        if isinstance(finished_reason, FINISH_MATCHED_TOKEN) and isinstance(
+            output, list
+        ):
+            assert len(output) > 0
+            return output[:-1]
+        return output
+
     def event_loop(self):
         """The event loop that handles requests"""
 
@@ -122,7 +137,13 @@ class DetokenizerManager:
                 s = self.decode_status[rid]
                 s.decode_ids = recv_obj.decode_ids[i]
 
-                read_ids.append(
+                read_ids.append(
+                    self.trim_eos(
+                        s.decode_ids[s.surr_offset :],
+                        recv_obj.finished_reason[i],
+                        recv_obj.no_stop_trim[i],
+                    )
+                )
                 surr_ids.append(s.decode_ids[s.surr_offset : s.read_offset])
 
                 # TODO(lmzheng): handle skip_special_tokens/spaces_between_special_tokens per request
@@ -152,13 +173,13 @@ class DetokenizerManager:
                 else:
                     new_text = find_printable_text(new_text)
 
-                output_strs.append(
-
-
-
-
-
-
+                output_strs.append(
+                    self.trim_eos(
+                        s.decoded_text + new_text,
+                        recv_obj.finished_reason[i],
+                        recv_obj.no_stop_trim[i],
+                    )
+                )
 
             self.send_to_tokenizer.send_pyobj(
                 BatchStrOut(
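With trim_eos, the detokenizer now strips the matched stop string from streamed text (or drops the final matched stop token from token-id output) unless the request set no_stop_trim. A small standalone illustration of the two trimming rules, using made-up values in place of sglang's finish-reason objects:

# Stop-string case: everything from the matched stop string onward is dropped.
text = "The answer is 42.</s> extra"
stop_str = "</s>"
pos = text.find(stop_str)
print(text[:pos] if pos != -1 else text)  # "The answer is 42."

# Stop-token case: the last id (the matched stop token) is dropped.
token_ids = [414, 1234, 29889, 2]  # hypothetical ids, 2 standing in for the stop token
print(token_ids[:-1])  # [414, 1234, 29889]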
sglang/srt/managers/io_struct.py
CHANGED
@@ -20,6 +20,7 @@ processes (TokenizerManager, DetokenizerManager, Controller).
 
 import uuid
 from dataclasses import dataclass
+from enum import Enum
 from typing import Dict, List, Optional, Union
 
 from sglang.srt.managers.schedule_batch import BaseFinishReason
@@ -55,6 +56,9 @@ class GenerateReqInput:
     # LoRA related
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
 
+    # Whether it is a single request or a batch request
+    is_single: bool = True
+
     def post_init(self):
         if (self.text is None and self.input_ids is None) or (
             self.text is not None and self.input_ids is not None
@@ -119,8 +123,7 @@ class GenerateReqInput:
             elif not isinstance(self.image_data, list):
                 self.image_data = [self.image_data] * num
             elif isinstance(self.image_data, list):
-
-                self.image_data = self.image_data * num
+                pass
 
             if self.sampling_params is None:
                 self.sampling_params = [{}] * num
@@ -295,6 +298,7 @@ class BatchTokenIDOut:
     spaces_between_special_tokens: List[bool]
     meta_info: List[Dict]
     finished_reason: List[BaseFinishReason]
+    no_stop_trim: List[bool]
 
 
 @dataclass
@@ -344,3 +348,8 @@ class UpdateWeightReqOutput:
 class AbortReq:
     # The request id
     rid: str
+
+
+class ProfileReq(Enum):
+    START_PROFILE = 1
+    STOP_PROFILE = 2
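ProfileReq is a plain control message: since it is not one of the tokenized request types, the data parallel controller's event loop routes it to the broadcast branch rather than the load-balanced dispatch. A small sketch of that routing decision (pick_route is illustrative, not an sglang function):

from sglang.srt.managers.io_struct import (
    ProfileReq,
    TokenizedEmbeddingReqInput,
    TokenizedGenerateReqInput,
    TokenizedRewardReqInput,
)


def pick_route(recv_req) -> str:
    # Mirrors the isinstance check in DataParallelController.event_loop.
    if isinstance(
        recv_req,
        (TokenizedGenerateReqInput, TokenizedEmbeddingReqInput, TokenizedRewardReqInput),
    ):
        return "dispatch to one worker"
    return "broadcast to all workers"


print(pick_route(ProfileReq.START_PROFILE))  # broadcast to all workers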