sglang 0.3.3.post1__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +28 -10
- sglang/bench_server_latency.py +21 -10
- sglang/bench_serving.py +101 -7
- sglang/global_config.py +0 -1
- sglang/srt/layers/attention/__init__.py +27 -5
- sglang/srt/layers/attention/double_sparsity_backend.py +281 -0
- sglang/srt/layers/attention/flashinfer_backend.py +352 -83
- sglang/srt/layers/attention/triton_backend.py +6 -4
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +772 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -3
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +4 -2
- sglang/srt/layers/sampler.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +31 -10
- sglang/srt/managers/io_struct.py +4 -0
- sglang/srt/managers/schedule_batch.py +120 -43
- sglang/srt/managers/schedule_policy.py +2 -1
- sglang/srt/managers/scheduler.py +202 -140
- sglang/srt/managers/tokenizer_manager.py +5 -1
- sglang/srt/managers/tp_worker.py +111 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -4
- sglang/srt/mem_cache/memory_pool.py +77 -4
- sglang/srt/mem_cache/radix_cache.py +15 -7
- sglang/srt/model_executor/cuda_graph_runner.py +4 -4
- sglang/srt/model_executor/forward_batch_info.py +16 -21
- sglang/srt/model_executor/model_runner.py +60 -1
- sglang/srt/models/baichuan.py +2 -3
- sglang/srt/models/chatglm.py +5 -6
- sglang/srt/models/commandr.py +1 -2
- sglang/srt/models/dbrx.py +1 -2
- sglang/srt/models/deepseek.py +4 -5
- sglang/srt/models/deepseek_v2.py +5 -6
- sglang/srt/models/exaone.py +1 -2
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +5 -5
- sglang/srt/models/gpt_bigcode.py +5 -5
- sglang/srt/models/grok.py +1 -2
- sglang/srt/models/internlm2.py +1 -2
- sglang/srt/models/llama.py +1 -2
- sglang/srt/models/llama_classification.py +1 -2
- sglang/srt/models/llama_reward.py +2 -3
- sglang/srt/models/llava.py +4 -8
- sglang/srt/models/llavavid.py +1 -2
- sglang/srt/models/minicpm.py +1 -2
- sglang/srt/models/minicpm3.py +5 -6
- sglang/srt/models/mixtral.py +1 -2
- sglang/srt/models/mixtral_quant.py +1 -2
- sglang/srt/models/olmo.py +352 -0
- sglang/srt/models/olmoe.py +1 -2
- sglang/srt/models/qwen.py +1 -2
- sglang/srt/models/qwen2.py +1 -2
- sglang/srt/models/qwen2_moe.py +4 -5
- sglang/srt/models/stablelm.py +1 -2
- sglang/srt/models/torch_native_llama.py +1 -2
- sglang/srt/models/xverse.py +1 -2
- sglang/srt/models/xverse_moe.py +4 -5
- sglang/srt/models/yivl.py +1 -2
- sglang/srt/openai_api/adapter.py +92 -49
- sglang/srt/openai_api/protocol.py +10 -2
- sglang/srt/sampling/penaltylib/orchestrator.py +28 -9
- sglang/srt/sampling/sampling_batch_info.py +92 -58
- sglang/srt/sampling/sampling_params.py +2 -0
- sglang/srt/server.py +116 -17
- sglang/srt/server_args.py +121 -45
- sglang/srt/utils.py +11 -3
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/few_shot_gsm8k_engine.py +144 -0
- sglang/test/srt/sampling/penaltylib/utils.py +16 -12
- sglang/version.py +1 -1
- {sglang-0.3.3.post1.dist-info → sglang-0.3.4.dist-info}/METADATA +72 -29
- {sglang-0.3.3.post1.dist-info → sglang-0.3.4.dist-info}/RECORD +73 -70
- {sglang-0.3.3.post1.dist-info → sglang-0.3.4.dist-info}/WHEEL +1 -1
- sglang/srt/layers/attention/flashinfer_utils.py +0 -237
- {sglang-0.3.3.post1.dist-info → sglang-0.3.4.dist-info}/LICENSE +0 -0
- {sglang-0.3.3.post1.dist-info → sglang-0.3.4.dist-info}/top_level.txt +0 -0
@@ -26,7 +26,9 @@ from sglang.srt.layers.attention.triton_ops.prefill_attention import (
     context_attention_fwd,
 )
 
-CUDA_CAPABILITY = torch.cuda.get_device_capability()
+is_cuda_available = torch.cuda.is_available()
+if is_cuda_available:
+    CUDA_CAPABILITY = torch.cuda.get_device_capability()
 
 
 @triton.jit
@@ -286,12 +288,12 @@ def extend_attention_fwd(
     BLOCK_DPE = 0
     BLOCK_DV = triton.next_power_of_2(Lv)
 
-    if CUDA_CAPABILITY[0] >= 9:
+    if is_cuda_available and CUDA_CAPABILITY[0] >= 9:
         if Lq <= 256:
             BLOCK_M, BLOCK_N = (128, 64)
         else:
             BLOCK_M, BLOCK_N = (32, 64)
-    elif CUDA_CAPABILITY[0] >= 8:
+    elif is_cuda_available and CUDA_CAPABILITY[0] >= 8:
        if Lq <= 128:
             BLOCK_M, BLOCK_N = (128, 128)
         elif Lq <= 256:
sglang/srt/layers/attention/triton_ops/prefill_attention.py
CHANGED
@@ -24,7 +24,9 @@ import torch
 import triton
 import triton.language as tl
 
-CUDA_CAPABILITY = torch.cuda.get_device_capability()
+is_cuda_available = torch.cuda.is_available()
+if is_cuda_available:
+    CUDA_CAPABILITY = torch.cuda.get_device_capability()
 
 
 @triton.jit
@@ -145,7 +147,7 @@ def _fwd_kernel(
 
 
 def context_attention_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len):
-    if CUDA_CAPABILITY[0] >= 8:
+    if is_cuda_available and CUDA_CAPABILITY[0] >= 8:
         BLOCK = 128
     else:
         BLOCK = 64
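Both Triton kernel modules above now probe the GPU only when one is present, so importing them no longer fails on CPU-only installs. A minimal standalone sketch of the guard pattern (the block-size numbers are illustrative, not sglang's exact tuning):

import torch

# Only query the device when CUDA is actually available;
# torch.cuda.get_device_capability() raises on CPU-only installs.
is_cuda_available = torch.cuda.is_available()
if is_cuda_available:
    CUDA_CAPABILITY = torch.cuda.get_device_capability()


def pick_block_size(head_dim: int) -> int:
    # Illustrative block-size selection: larger tiles on compute capability >= 8,
    # a conservative fallback otherwise. Short-circuiting on is_cuda_available
    # keeps CUDA_CAPABILITY from being referenced when it was never defined.
    if is_cuda_available and CUDA_CAPABILITY[0] >= 8:
        return 128 if head_dim <= 128 else 64
    return 64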
sglang/srt/layers/sampler.py
CHANGED
@@ -21,6 +21,10 @@ logger = logging.getLogger(__name__)
 
 
 class Sampler(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.use_nan_detectioin = not global_server_args_dict["disable_nan_detection"]
+
     def forward(
         self,
         logits: Union[torch.Tensor, LogitsProcessorOutput],
@@ -36,13 +40,13 @@ class Sampler(nn.Module):
         logits = None
         del logits
 
-        if torch.any(torch.isnan(probs)):
+        if self.use_nan_detectioin and torch.any(torch.isnan(probs)):
             logger.warning("Detected errors during sampling! NaN in the probability.")
             probs = torch.where(
                 torch.isnan(probs), torch.full_like(probs, 1e-10), probs
             )
 
-        if sampling_info.
+        if sampling_info.is_all_greedy:
             # Use torch.argmax if all requests use greedy sampling
             batch_next_token_ids = torch.argmax(probs, -1)
         elif global_server_args_dict["sampling_backend"] == "flashinfer":
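The sampler now builds its NaN check from the new disable_nan_detection server argument and branches on a precomputed all-greedy flag instead of inspecting the sampling parameters at sample time. A rough, self-contained sketch of the same idea; SamplingInfo here is a stand-in, not sglang's SamplingBatchInfo:

from dataclasses import dataclass

import torch


@dataclass
class SamplingInfo:  # stand-in for sglang's SamplingBatchInfo
    is_all_greedy: bool


def sample(probs: torch.Tensor, info: SamplingInfo, nan_detection: bool = True) -> torch.Tensor:
    # Optionally scrub NaNs before sampling; skipping the check avoids a
    # host/device synchronization on every step.
    if nan_detection and torch.any(torch.isnan(probs)):
        probs = torch.where(torch.isnan(probs), torch.full_like(probs, 1e-10), probs)

    if info.is_all_greedy:
        # All requests are greedy: argmax is cheaper than multinomial sampling.
        return torch.argmax(probs, dim=-1)
    return torch.multinomial(probs, num_samples=1).squeeze(-1)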
sglang/srt/managers/detokenizer_manager.py
CHANGED
@@ -18,7 +18,7 @@ limitations under the License.
 import dataclasses
 import logging
 from collections import OrderedDict
-from typing import List
+from typing import List, Union
 
 import zmq
 
@@ -29,7 +29,7 @@ from sglang.srt.managers.io_struct import (
     BatchTokenIDOut,
     UpdateWeightReqOutput,
 )
-from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR
+from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR, FINISH_MATCHED_TOKEN
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import configure_logger, kill_parent_process
 from sglang.utils import find_printable_text, get_exception_traceback
@@ -75,6 +75,21 @@ class DetokenizerManager:
 
         self.decode_status = LimitedCapacityDict()
 
+    def trim_eos(self, output: Union[str, List[int]], finished_reason, no_stop_trim):
+        if no_stop_trim:
+            return output
+
+        # Trim stop str. TODO(lmzheng): handle the case where multiple stop strs are hit
+        if isinstance(finished_reason, FINISH_MATCHED_STR) and isinstance(output, str):
+            pos = output.find(finished_reason.matched)
+            return output[:pos] if pos != -1 else output
+        if isinstance(finished_reason, FINISH_MATCHED_TOKEN) and isinstance(
+            output, list
+        ):
+            assert len(output) > 0
+            return output[:-1]
+        return output
+
     def event_loop(self):
         """The event loop that handles requests"""
 
@@ -122,7 +137,13 @@ class DetokenizerManager:
                 s = self.decode_status[rid]
                 s.decode_ids = recv_obj.decode_ids[i]
 
-                read_ids.append(
+                read_ids.append(
+                    self.trim_eos(
+                        s.decode_ids[s.surr_offset :],
+                        recv_obj.finished_reason[i],
+                        recv_obj.no_stop_trim[i],
+                    )
+                )
                 surr_ids.append(s.decode_ids[s.surr_offset : s.read_offset])
 
             # TODO(lmzheng): handle skip_special_tokens/spaces_between_special_tokens per request
@@ -152,13 +173,13 @@ class DetokenizerManager:
                 else:
                     new_text = find_printable_text(new_text)
 
-                output_strs.append(
-
-
-
-
-
-
+                output_strs.append(
+                    self.trim_eos(
+                        s.decoded_text + new_text,
+                        recv_obj.finished_reason[i],
+                        recv_obj.no_stop_trim[i],
+                    )
+                )
 
             self.send_to_tokenizer.send_pyobj(
                 BatchStrOut(
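The new trim_eos helper strips the matched stop string (or the final stop token) from streamed output unless the request set no_stop_trim. A simplified standalone version of that logic, using plain arguments in place of sglang's FINISH_MATCHED_STR / FINISH_MATCHED_TOKEN objects:

from typing import List, Optional, Union


def trim_stop(
    output: Union[str, List[int]],
    matched_stop_str: Optional[str],
    stopped_on_token: bool,
    no_stop_trim: bool,
) -> Union[str, List[int]]:
    """Drop the stop sequence from decoded output, mirroring the diff's trim_eos."""
    if no_stop_trim:
        return output

    # Text output that finished on a stop string: cut at its first occurrence.
    if isinstance(output, str) and matched_stop_str is not None:
        pos = output.find(matched_stop_str)
        return output[:pos] if pos != -1 else output

    # Token-id output that finished on a stop token: drop the last id.
    if isinstance(output, list) and stopped_on_token:
        return output[:-1]

    return output


# Example usage:
print(trim_stop("Answer: 42</s>extra", "</s>", False, False))  # "Answer: 42"
print(trim_stop([11, 22, 2], None, True, False))               # [11, 22]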
sglang/srt/managers/io_struct.py
CHANGED
@@ -56,6 +56,9 @@ class GenerateReqInput:
     # LoRA related
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
 
+    # Whether it is a single request or a batch request
+    is_single: bool = True
+
     def post_init(self):
         if (self.text is None and self.input_ids is None) or (
             self.text is not None and self.input_ids is not None
@@ -295,6 +298,7 @@ class BatchTokenIDOut:
     spaces_between_special_tokens: List[bool]
     meta_info: List[Dict]
     finished_reason: List[BaseFinishReason]
+    no_stop_trim: List[bool]
 
 
 @dataclass
sglang/srt/managers/schedule_batch.py
CHANGED
@@ -53,6 +53,7 @@ global_server_args_dict = {
     "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
     "disable_mla": ServerArgs.disable_mla,
     "torchao_config": ServerArgs.torchao_config,
+    "disable_nan_detection": ServerArgs.disable_nan_detection,
 }
 
 
@@ -196,6 +197,9 @@ class Req:
         # this does not include the jump forward tokens.
         self.completion_tokens_wo_jump_forward = 0
 
+        # The number of cached tokens, that were already cached in the KV store
+        self.cached_tokens = 0
+
         # For vision inputs
         self.image_inputs: Optional[ImageInputs] = None
 
@@ -203,6 +207,7 @@ class Req:
         self.prefix_indices = []
         self.extend_input_len = 0
         self.last_node = None
+        self.is_inflight_req = 0
 
         # Logprobs (arguments)
         self.return_logprob = False
@@ -391,25 +396,30 @@ class Req:
         return f"rid(n={self.rid}, " f"input_ids={self.origin_input_ids}, "
 
 
+bid = 0
+
+
 @dataclass
 class ScheduleBatch:
     """Store all inforamtion of a batch."""
 
     # Request, memory pool, and cache
     reqs: List[Req]
-    req_to_token_pool: ReqToTokenPool
-    token_to_kv_pool: BaseTokenToKVPool
-    tree_cache: BasePrefixCache
+    req_to_token_pool: ReqToTokenPool = None
+    token_to_kv_pool: BaseTokenToKVPool = None
+    tree_cache: BasePrefixCache = None
 
     forward_mode: ForwardMode = None
     sampling_info: SamplingBatchInfo = None
 
     # Batched arguments to model runner
-    input_ids:
-    req_pool_indices:
-    seq_lens:
+    input_ids: torch.Tensor = None
+    req_pool_indices: torch.Tensor = None
+    seq_lens: torch.Tensor = None
     out_cache_loc: torch.Tensor = None
 
+    output_ids: torch.Tensor = None
+
     # For processing logprobs
     return_logprob: bool = False
     top_logprobs_nums: Optional[List[int]] = None
@@ -419,6 +429,7 @@ class ScheduleBatch:
     extend_lens: List[int] = None
     extend_num_tokens: int = None
     running_bs: int = None
+    decoding_reqs: List[Req] = None
 
     # Stream
     has_stream: bool = False
@@ -492,17 +503,24 @@ class ScheduleBatch:
 
         pt = 0
         for i, req in enumerate(reqs):
+            already_computed = (
+                req.extend_logprob_start_len + 1 + req.cached_tokens
+                if req.extend_logprob_start_len > 0
+                else 0
+            )
+            req.cached_tokens += len(req.prefix_indices) - already_computed
+
             req.req_pool_idx = req_pool_indices[i]
             pre_len, seq_len = len(req.prefix_indices), len(req.fill_ids)
             seq_lens.append(seq_len)
             assert seq_len - pre_len == req.extend_input_len
 
             if pre_len > 0:
-                self.req_to_token_pool.req_to_token[req.req_pool_idx]
-
-
+                self.req_to_token_pool.req_to_token[req.req_pool_idx, :pre_len] = (
+                    req.prefix_indices
+                )
 
-            self.req_to_token_pool.req_to_token[req.req_pool_idx
+            self.req_to_token_pool.req_to_token[req.req_pool_idx, pre_len:seq_len] = (
                 out_cache_loc[pt : pt + req.extend_input_len]
             )
 
@@ -518,10 +536,15 @@ class ScheduleBatch:
             pt += req.extend_input_len
 
         # Set fields
-
-        self.
-
-
+        self.input_ids = torch.tensor(sum(input_ids, []), dtype=torch.int32).to(
+            self.device, non_blocking=True
+        )
+        self.req_pool_indices = torch.tensor(req_pool_indices, dtype=torch.int32).to(
+            self.device, non_blocking=True
+        )
+        self.seq_lens = torch.tensor(seq_lens, dtype=torch.int32).to(
+            self.device, non_blocking=True
+        )
 
         self.extend_num_tokens = extend_num_tokens
         self.out_cache_loc = out_cache_loc
@@ -531,7 +554,9 @@ class ScheduleBatch:
         self.extend_lens = [r.extend_input_len for r in reqs]
         self.extend_logprob_start_lens = [r.extend_logprob_start_len for r in reqs]
 
-        self.sampling_info = SamplingBatchInfo.from_schedule_batch(
+        self.sampling_info = SamplingBatchInfo.from_schedule_batch(
+            self, vocab_size, global_server_args_dict["disable_penalizer"]
+        )
 
     def mix_with_running(self, running_batch: "ScheduleBatch"):
         self.forward_mode = ForwardMode.MIXED
@@ -586,9 +611,11 @@ class ScheduleBatch:
 
         retracted_reqs = []
         seq_lens_cpu = self.seq_lens.cpu().numpy()
+        first_iter = True
         while (
             self.token_to_kv_pool.available_size()
             < len(sorted_indices) * global_config.retract_decode_steps
+            or first_iter
         ):
             if len(sorted_indices) == 1:
                 # Corner case: only one request left
@@ -597,6 +624,7 @@ class ScheduleBatch:
                 ), "No space left for only one request"
                 break
 
+            first_iter = False
             idx = sorted_indices.pop()
             req = self.reqs[idx]
             retracted_reqs.append(req)
@@ -637,7 +665,7 @@ class ScheduleBatch:
             req.last_update_decode_tokens = 0
             req.logprob_start_len = 10**9
 
-        self.filter_batch(sorted_indices)
+        self.filter_batch(keep_indices=sorted_indices)
 
         # Reqs in batch are filtered
         total_decoded_tokens = sum(len(r.output_ids) for r in self.reqs)
@@ -652,7 +680,7 @@ class ScheduleBatch:
 
     def check_for_jump_forward(self, pad_input_ids_func):
         jump_forward_reqs = []
-
+        keep_indices = set(i for i in range(len(self.reqs)))
 
         for i, req in enumerate(self.reqs):
             if req.jump_forward_map is not None:
@@ -712,63 +740,71 @@ class ScheduleBatch:
                 )
 
                 jump_forward_reqs.append(req)
-
+                keep_indices.remove(i)
 
-        self.filter_batch(
+        self.filter_batch(keep_indices=list(keep_indices))
 
         return jump_forward_reqs
 
-    def prepare_for_decode(self
+    def prepare_for_decode(self):
         self.forward_mode = ForwardMode.DECODE
 
-
-
-
-
-
-
-        self.input_ids = torch.tensor(
-            input_ids, dtype=torch.int32, device=self.seq_lens.device
-        )
-        self.seq_lens.add_(1)
+        self.input_ids = self.output_ids
+        self.output_ids = None
+        if self.sampling_info.penalizer_orchestrator:
+            self.sampling_info.penalizer_orchestrator.cumulate_output_tokens(
+                self.input_ids
+            )
 
         # Alloc mem
         bs = len(self.reqs)
         self.out_cache_loc = self.alloc_token_slots(bs)
 
-        self.req_to_token_pool.req_to_token[
-            self.
-
+        self.req_to_token_pool.req_to_token[self.req_pool_indices, self.seq_lens] = (
+            self.out_cache_loc
+        )
+        self.seq_lens.add_(1)
 
-    def filter_batch(
-
+    def filter_batch(
+        self,
+        current_inflight_req: Optional[Req] = None,
+        keep_indices: Optional[List[int]] = None,
+    ):
+        if keep_indices is None:
+            keep_indices = [
+                i
+                for i in range(len(self.reqs))
+                if not self.reqs[i].finished()
+                and self.reqs[i] is not current_inflight_req
+            ]
+
+        if keep_indices is None or len(keep_indices) == 0:
             # Filter out all requests
             self.reqs = []
             return
 
-        if len(
+        if len(keep_indices) == len(self.reqs):
             # No need to filter
             return
 
-        self.reqs = [self.reqs[i] for i in
-        new_indices = torch.tensor(
-
+        self.reqs = [self.reqs[i] for i in keep_indices]
+        new_indices = torch.tensor(keep_indices, dtype=torch.int32).to(
+            self.device, non_blocking=True
        )
         self.req_pool_indices = self.req_pool_indices[new_indices]
         self.seq_lens = self.seq_lens[new_indices]
         self.out_cache_loc = None
+        self.output_ids = self.output_ids[new_indices]
         self.return_logprob = any(req.return_logprob for req in self.reqs)
         if self.return_logprob:
-            self.top_logprobs_nums = [
-                self.top_logprobs_nums[i] for i in unfinished_indices
-            ]
+            self.top_logprobs_nums = [self.top_logprobs_nums[i] for i in keep_indices]
         else:
             self.top_logprobs_nums = None
 
         self.has_stream = any(req.stream for req in self.reqs)
         self.has_regex = any(req.regex_fsm for req in self.reqs)
 
-        self.sampling_info.filter_batch(
+        self.sampling_info.filter_batch(keep_indices, new_indices)
 
     def merge_batch(self, other: "ScheduleBatch"):
         # Penalizer orchestrator must be merged before Batch.reqs is merged. This is because
@@ -781,6 +817,8 @@ class ScheduleBatch:
         )
         self.seq_lens = torch.concat([self.seq_lens, other.seq_lens])
         self.out_cache_loc = None
+        if self.output_ids is not None:
+            self.output_ids = torch.concat([self.output_ids, other.output_ids])
         if self.return_logprob and other.return_logprob:
             self.top_logprobs_nums.extend(other.top_logprobs_nums)
         elif self.return_logprob:
@@ -813,7 +851,11 @@ class ScheduleBatch:
         else:
             self.sampling_info.regex_fsms = None
 
+        global bid
+        bid += 1
+
         return ModelWorkerBatch(
+            bid=bid,
             forward_mode=self.forward_mode,
             input_ids=self.input_ids,
             req_pool_indices=self.req_pool_indices,
@@ -829,9 +871,26 @@ class ScheduleBatch:
             sampling_info=self.sampling_info,
         )
 
+    def copy(self):
+        return ScheduleBatch(
+            reqs=self.reqs,
+            forward_mode=self.forward_mode,
+            out_cache_loc=self.out_cache_loc,
+            return_logprob=self.return_logprob,
+            decoding_reqs=self.decoding_reqs,
+        )
+
+    def __str__(self):
+        return (
+            f"ScheduleBatch(forward_mode={self.forward_mode.name}, "
+            f"#req={(len(self.reqs))})"
+        )
+
 
 @dataclass
 class ModelWorkerBatch:
+    # The batch id
+    bid: int
     # The forward mode
     forward_mode: ForwardMode
     # The input ids
@@ -860,3 +919,21 @@ class ModelWorkerBatch:
 
     # Sampling info
     sampling_info: SamplingBatchInfo
+
+    def copy(self):
+        return ModelWorkerBatch(
+            bid=self.bid,
+            forward_mode=self.forward_mode,
+            input_ids=self.input_ids.clone(),
+            req_pool_indices=self.req_pool_indices,
+            seq_lens=self.seq_lens.clone(),
+            out_cache_loc=self.out_cache_loc,
+            return_logprob=self.return_logprob,
+            top_logprobs_nums=self.top_logprobs_nums,
+            extend_seq_lens=self.extend_seq_lens,
+            extend_prefix_lens=self.extend_prefix_lens,
+            extend_logprob_start_lens=self.extend_logprob_start_lens,
+            image_inputs=self.image_inputs,
+            lora_paths=self.lora_paths,
+            sampling_info=self.sampling_info.copy(),
+        )
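filter_batch now derives keep_indices itself (dropping finished requests and the in-flight request) and uses a single index tensor to slice every per-request field, including the new output_ids. A toy sketch of that indexing pattern; Req and MiniBatch below are illustrative stand-ins, not sglang's classes:

from dataclasses import dataclass
from typing import List, Optional

import torch


@dataclass
class Req:  # illustrative stand-in
    rid: int
    finished: bool = False


@dataclass
class MiniBatch:  # illustrative stand-in for a schedule batch
    reqs: List[Req]
    seq_lens: torch.Tensor
    output_ids: torch.Tensor

    def filter_batch(
        self,
        current_inflight_req: Optional[Req] = None,
        keep_indices: Optional[List[int]] = None,
    ):
        if keep_indices is None:
            # Keep unfinished requests, excluding the in-flight one.
            keep_indices = [
                i
                for i, r in enumerate(self.reqs)
                if not r.finished and r is not current_inflight_req
            ]
        if len(keep_indices) == len(self.reqs):
            return  # nothing to drop
        idx = torch.tensor(keep_indices, dtype=torch.long)
        self.reqs = [self.reqs[i] for i in keep_indices]
        self.seq_lens = self.seq_lens[idx]
        self.output_ids = self.output_ids[idx]


batch = MiniBatch(
    reqs=[Req(0), Req(1, finished=True), Req(2)],
    seq_lens=torch.tensor([5, 7, 3]),
    output_ids=torch.tensor([101, 102, 103]),
)
batch.filter_batch()
print([r.rid for r in batch.reqs], batch.seq_lens.tolist())  # [0, 2] [5, 3]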
sglang/srt/managers/schedule_policy.py
CHANGED
@@ -45,12 +45,13 @@ class SchedulePolicy:
     def calc_priority(self, waiting_queue: List[Req]):
         # Compute matched prefix length
         prefix_computed = False
-        if self.policy
+        if self.policy == "lpm" or self.policy == "dfs-weight":
             for r in waiting_queue:
                 # NOTE: the prefix_indices must always be aligned with last_node
                 r.prefix_indices, r.last_node = self.tree_cache.match_prefix(
                     rid=r.rid, key=r.adjust_max_prefix_ids()
                 )
+
             prefix_computed = True
 
         if self.policy == "lpm":
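calc_priority now walks the prefix cache only for the cache-aware policies ("lpm" and "dfs-weight"); arrival-order policies skip the matching pass. A rough sketch of that gating, using an assumed match_prefix callback rather than sglang's tree-cache API:

from typing import Callable, List, Tuple


def compute_priorities(
    policy: str,
    waiting_prompts: List[List[int]],
    match_prefix: Callable[[List[int]], int],
) -> List[Tuple[int, int]]:
    """Return (queue index, matched prefix length) pairs in scheduling order."""
    # Only cache-aware policies pay for the prefix-matching walk.
    cache_aware = policy in ("lpm", "dfs-weight")
    prefix_lens = [
        match_prefix(tokens) if cache_aware else 0 for tokens in waiting_prompts
    ]
    if policy == "lpm":
        # Longest-prefix-match first: a larger cached prefix runs earlier.
        order = sorted(range(len(waiting_prompts)), key=lambda i: -prefix_lens[i])
    else:
        order = list(range(len(waiting_prompts)))  # FCFS-style: keep arrival order
    return [(i, prefix_lens[i]) for i in order]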
|