sglang 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +1 -1
- sglang/backend/runtime_endpoint.py +14 -4
- sglang/bench_latency.py +6 -3
- sglang/global_config.py +22 -16
- sglang/lang/chat_template.py +2 -2
- sglang/lang/ir.py +3 -3
- sglang/srt/layers/radix_attention.py +14 -37
- sglang/srt/layers/token_attention.py +2 -9
- sglang/srt/managers/controller/cuda_graph_runner.py +196 -0
- sglang/srt/managers/controller/infer_batch.py +256 -42
- sglang/srt/managers/controller/manager_multi.py +6 -2
- sglang/srt/managers/controller/manager_single.py +125 -50
- sglang/srt/managers/controller/model_runner.py +69 -284
- sglang/srt/managers/controller/radix_cache.py +4 -3
- sglang/srt/managers/controller/schedule_heuristic.py +4 -0
- sglang/srt/managers/controller/tp_worker.py +44 -44
- sglang/srt/memory_pool.py +52 -50
- sglang/srt/models/minicpm.py +1 -8
- sglang/srt/models/qwen2_moe.py +126 -107
- sglang/srt/server.py +11 -15
- sglang/srt/server_args.py +12 -4
- sglang/srt/utils.py +1 -1
- {sglang-0.1.19.dist-info → sglang-0.1.21.dist-info}/METADATA +9 -1
- {sglang-0.1.19.dist-info → sglang-0.1.21.dist-info}/RECORD +27 -26
- {sglang-0.1.19.dist-info → sglang-0.1.21.dist-info}/WHEEL +1 -1
- {sglang-0.1.19.dist-info → sglang-0.1.21.dist-info}/LICENSE +0 -0
- {sglang-0.1.19.dist-info → sglang-0.1.21.dist-info}/top_level.txt +0 -0
sglang/backend/runtime_endpoint.py
CHANGED
@@ -12,7 +12,6 @@ from sglang.utils import http_request
 
 
 class RuntimeEndpoint(BaseBackend):
-
     def __init__(
         self,
         base_url: str,
@@ -38,7 +37,8 @@ class RuntimeEndpoint(BaseBackend):
         self.model_info = res.json()
 
         self.chat_template = get_chat_template_by_model_path(
-            self.model_info["model_path"])
+            self.model_info["model_path"]
+        )
 
     def get_model_name(self):
         return self.model_info["model_path"]
@@ -124,7 +124,12 @@
         else:
             raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")
 
-        for item in [
+        for item in [
+            "return_logprob",
+            "logprob_start_len",
+            "top_logprobs_num",
+            "return_text_in_logprobs",
+        ]:
             value = getattr(sampling_params, item, None)
             if value is not None:
                 data[item] = value
@@ -171,7 +176,12 @@
         else:
             raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")
 
-        for item in [
+        for item in [
+            "return_logprob",
+            "logprob_start_len",
+            "top_logprobs_num",
+            "return_text_in_logprobs",
+        ]:
             value = getattr(sampling_params, item, None)
             if value is not None:
                 data[item] = value
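The hunks above reformat a single-line list into Black's multi-line layout; the loop itself copies optional logprob settings into the request payload only when the caller set them. A standalone sketch of that pattern (illustrative only, not the RuntimeEndpoint API):

# Hypothetical helper mirroring the getattr-based forwarding seen in the diff.
OPTIONAL_FIELDS = [
    "return_logprob",
    "logprob_start_len",
    "top_logprobs_num",
    "return_text_in_logprobs",
]

def add_optional_fields(sampling_params, data: dict) -> dict:
    for item in OPTIONAL_FIELDS:
        value = getattr(sampling_params, item, None)
        if value is not None:  # unset fields are omitted from the payload
            data[item] = value
    return data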
sglang/bench_latency.py
CHANGED
@@ -70,6 +70,7 @@ class BenchArgs:
 
 def load_model(server_args, tp_rank):
     suppress_other_loggers()
+    rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
     model_config = ModelConfig(path=server_args.model_path)
     model_runner = ModelRunner(
@@ -81,7 +82,7 @@ def load_model(server_args, tp_rank):
         nccl_port=28888,
         server_args=server_args,
     )
-
+    rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
     tokenizer = get_tokenizer(
         server_args.tokenizer_path,
         tokenizer_mode=server_args.tokenizer_mode,
@@ -201,7 +202,7 @@ def correctness_test(
 
     # Print
     for i in range(len(reqs)):
-
+        rank_print(tokenizer.decode(output_ids[i]))
 
 
 def latency_test(
@@ -213,7 +214,7 @@
 
     # Load the model
     model_runner, tokenizer = load_model(server_args, tp_rank)
-
+    rank_print(
         f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}"
     )
 
@@ -299,6 +300,8 @@ def main(server_args, bench_args):
     for proc in workers:
         proc.join()
 
+    proc.terminate()
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
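The bench_latency.py changes route benchmark output through a rank-gated printer so only tensor-parallel rank 0 writes to stdout. The idiom in isolation (illustrative only):

# print on rank 0, no-op everywhere else
def make_rank_print(tp_rank: int):
    return print if tp_rank == 0 else (lambda *args, **kwargs: None)

rank_print = make_rank_print(tp_rank=0)
rank_print("max_total_num_tokens=...")  # emitted only by rank 0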
sglang/global_config.py
CHANGED
@@ -8,36 +8,42 @@ class GlobalConfig:
         # 2: output final text after every run
         self.verbosity = 0
 
+        # Default backend of the language
         self.default_backend = None
 
-        #
+        # Runtime constants: Request dependency time due to network delay
+        self.request_dependency_delay = 0.02
+        self.wait_for_new_request_delay = 0.0006
+
+        # Runtime constants: New generation token ratio estimation
+        self.base_new_token_ratio = 0.4
+        self.base_min_new_token_ratio = 0.2
+        self.new_token_ratio_decay = 0.0001
+        self.new_token_ratio_recovery = 0.05
+
+        # Runtime constants: The threshold (number of tokens) to trigger layer-wise cuda sync.
+        # This can improve the speed for large batch sizes during prefill.
+        self.layer_sync_threshold = 8192
+
+        # Runtime constants: others
+        self.num_continue_decode_steps = 10
+        self.flashinfer_workspace_size = 192 * 1024 * 1024
+
+        # Output tokenization configs
         self.skip_special_tokens_in_output = True
         self.spaces_between_special_tokens_in_out = True
 
-        #
+        # Interpreter optimization configs
         self.eager_fill_image = False
         self.enable_precache_with_tracing = True
         self.enable_parallel_encoding = True
         self.enable_parallel_decoding = True
 
+        # Deprecated
         # Choices: ["no_adjust", "adjust_cache"]
         # no_adjust: Do not adjust the position embedding of KV cache.
         # adjust_cache: Adjust the position embedding of KV cache.
         self.concate_and_append_mode = "no_adjust"
 
-        # Request dependency time due to network delay
-        self.request_dependency_delay = 0.02
-        self.wait_for_new_request_delay = 0.0006
-
-        # New generation token ratio estimation
-        self.base_new_token_ratio = 0.4
-        self.base_min_new_token_ratio = 0.2
-        self.new_token_ratio_decay = 0.0001
-        self.new_token_ratio_recovery = 0.05
-
-        # The threshold (number of tokens) to trigger layer-wise cuda sync.
-        # This can improve the speed for large batch sizes during prefill.
-        self.layer_sync_threshold = 8192
-
 
 global_config = GlobalConfig()
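These runtime constants are read through the module-level global_config instance, using the same import that appears in the attention and cuda-graph files later in this diff; the attribute reads here are only an illustration:

from sglang.global_config import global_config

workspace_bytes = global_config.flashinfer_workspace_size  # 192 * 1024 * 1024 bytes
sync_threshold = global_config.layer_sync_threshold        # 8192 tokens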
sglang/lang/chat_template.py
CHANGED
@@ -84,7 +84,7 @@ register_chat_template(
             "system": ("SYSTEM:", "\n"),
             "user": ("USER:", "\n"),
             "assistant": ("ASSISTANT:", "\n"),
-        }
+        },
     )
 )
 
@@ -177,7 +177,7 @@ register_chat_template(
             "assistant": ("", "<|im_end|>\n"),
         },
         style=ChatTemplateStyle.PLAIN,
-        stop_str=("<|im_end|>",)
+        stop_str=("<|im_end|>",),
     )
 )
 
sglang/lang/ir.py
CHANGED
@@ -24,9 +24,9 @@ class SglSamplingParams:
     presence_penalty: float = 0.0
     ignore_eos: bool = False
     return_logprob: Optional[bool] = None
-    logprob_start_len: Optional[int] = None,
-    top_logprobs_num: Optional[int] = None,
-    return_text_in_logprobs: Optional[bool] = None,
+    logprob_start_len: Optional[int] = (None,)
+    top_logprobs_num: Optional[int] = (None,)
+    return_text_in_logprobs: Optional[bool] = (None,)
 
     # for constrained generation, not included in to_xxx_kwargs
     dtype: Optional[str] = None
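Note that the old fields already ended with a trailing comma, so their defaults were one-element tuples; Black only makes that explicit as (None,). A minimal illustration (plain Python 3.8+, not sglang code):

from typing import Optional


class Example:
    a: Optional[int] = None     # default is None
    b: Optional[int] = None,    # trailing comma: the default is the tuple (None,)
    c: Optional[int] = (None,)  # what Black writes for `b`; same value


assert Example.b == (None,)
assert Example.b == Example.c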
sglang/srt/layers/radix_attention.py
CHANGED
@@ -1,6 +1,5 @@
 """Radix attention."""
 
-import numpy as np
 import torch
 from flashinfer.cascade import merge_state
 from torch import nn
@@ -8,6 +7,7 @@ from torch import nn
 from sglang.global_config import global_config
 from sglang.srt.layers.extend_attention import extend_attention_fwd
 from sglang.srt.layers.token_attention import token_attention_fwd
+from sglang.srt.managers.controller.infer_batch import global_server_args_dict
 from sglang.srt.managers.controller.model_runner import ForwardMode, InputMetadata
 
 
@@ -29,24 +29,14 @@ class RadixAttention(nn.Module):
         self.scaling = scaling
         self.layer_id = layer_id
 
-        from sglang.srt.managers.controller.model_runner import global_server_args_dict
-
         if not global_server_args_dict.get("disable_flashinfer", False):
-            self.
-            self.extend_forward = self.prefill_forward_flashinfer
+            self.extend_forward = self.extend_forward_flashinfer
             self.decode_forward = self.decode_forward_flashinfer
-            # flashinfer now accepts float logit_cap argument
-            self.logit_cap = logit_cap if logit_cap is not None and logit_cap > 0 else 0
         else:
-            self.prefill_forward = self.prefill_forward_triton
             self.extend_forward = self.extend_forward_triton
             self.decode_forward = self.decode_forward_triton
-            self.logit_cap = logit_cap if logit_cap is not None else 0
 
-
-        # In SGLang, we call both the typical "prefill" and "prefill with cache" as "extend".
-        # See the extend_forward_xxx functions.
-        raise NotImplementedError()
+        self.logit_cap = logit_cap if logit_cap is not None and logit_cap > 0 else 0
 
     def extend_forward_triton(self, q, k, v, input_metadata: InputMetadata):
         o = torch.empty_like(q)
@@ -60,13 +50,13 @@
             input_metadata.token_to_kv_pool.get_value_buffer(self.layer_id),
             input_metadata.req_to_token_pool.req_to_token,
             input_metadata.req_pool_indices,
-            input_metadata.
+            input_metadata.triton_start_loc,
             input_metadata.seq_lens,
-            input_metadata.
+            input_metadata.triton_prefix_lens,
             input_metadata.extend_start_loc,
             input_metadata.extend_seq_lens,
-            input_metadata.
-            input_metadata.
+            input_metadata.triton_max_seq_len,
+            input_metadata.triton_max_extend_len,
             sm_scale=self.scaling,
             logit_cap=self.logit_cap,
         )
@@ -84,10 +74,9 @@
             o.view(-1, self.tp_q_head_num, self.head_dim),
             input_metadata.req_to_token_pool.req_to_token,
             input_metadata.req_pool_indices,
-            input_metadata.
+            input_metadata.triton_start_loc,
             input_metadata.seq_lens,
-            input_metadata.
-            input_metadata.other_kv_index,
+            input_metadata.triton_max_seq_len,
             input_metadata.total_num_tokens,
             sm_scale=self.scaling,
             logit_cap=self.logit_cap,
@@ -95,7 +84,7 @@
 
         return o
 
-    def
+    def extend_forward_flashinfer(self, q, k, v, input_metadata: InputMetadata):
         o1, s1 = input_metadata.flashinfer_prefill_wrapper_ragged.forward_return_lse(
             q.contiguous().view(-1, self.tp_q_head_num, self.head_dim),
             k.contiguous().view(-1, self.tp_k_head_num, self.head_dim),
@@ -105,7 +94,7 @@
             logits_soft_cap=self.logit_cap,
         )
 
-        if input_metadata.
+        if input_metadata.extend_no_prefix:
             o = o1
         else:
             o2, s2 = input_metadata.flashinfer_prefill_wrapper_paged.forward_return_lse(
@@ -141,25 +130,13 @@
         k = k.view(-1, self.tp_k_head_num, self.head_dim)
         v = v.view(-1, self.tp_v_head_num, self.head_dim)
 
-        if input_metadata.forward_mode == ForwardMode.
-            return self.prefill_forward(q, k, v, input_metadata)
-        elif input_metadata.forward_mode == ForwardMode.EXTEND:
+        if input_metadata.forward_mode == ForwardMode.EXTEND:
             return self.extend_forward(q, k, v, input_metadata)
         elif input_metadata.forward_mode == ForwardMode.DECODE:
             return self.decode_forward(q, k, v, input_metadata)
 
     def store_kv_cache(self, cache_k, cache_v, input_metadata: InputMetadata):
         key_buffer = input_metadata.token_to_kv_pool.get_key_buffer(self.layer_id)
+        key_buffer[input_metadata.out_cache_loc] = cache_k
         value_buffer = input_metadata.token_to_kv_pool.get_value_buffer(self.layer_id)
-
-            key_buffer[input_metadata.out_cache_loc] = cache_k
-            value_buffer[input_metadata.out_cache_loc] = cache_v
-        elif input_metadata.out_cache_cont_start is not None:
-            key_buffer[
-                input_metadata.out_cache_cont_start : input_metadata.out_cache_cont_end
-            ] = cache_k
-            value_buffer[
-                input_metadata.out_cache_cont_start : input_metadata.out_cache_cont_end
-            ] = cache_v
-        else:
-            raise RuntimeError()
+        value_buffer[input_metadata.out_cache_loc] = cache_v
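The constructor above binds backend-specific implementations once, so forward() only dispatches on the forward mode. A simplified sketch of that pattern (not the real RadixAttention class):

class AttentionDispatch:
    def __init__(self, use_flashinfer: bool):
        # Bind the kernel implementations up front instead of branching per call.
        if use_flashinfer:
            self.extend_forward = self._extend_flashinfer
            self.decode_forward = self._decode_flashinfer
        else:
            self.extend_forward = self._extend_triton
            self.decode_forward = self._decode_triton

    def forward(self, q, k, v, mode: str):
        if mode == "extend":
            return self.extend_forward(q, k, v)
        return self.decode_forward(q, k, v)

    def _extend_flashinfer(self, q, k, v): ...
    def _decode_flashinfer(self, q, k, v): ...
    def _extend_triton(self, q, k, v): ...
    def _decode_triton(self, q, k, v): ...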
sglang/srt/layers/token_attention.py
CHANGED
@@ -107,7 +107,6 @@ def _fwd_kernel_stage2(
     stride_obs,
     stride_oh,
     stride_req_to_token_b,
-    other_kv_index,  # To fix a NAN issue
     kv_group_num: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
     BLOCK_N: tl.constexpr,
@@ -138,7 +137,7 @@
         + cur_batch_req_idx * stride_req_to_token_b
         + (start_n + offs_n),
         mask=(start_n + offs_n) < cur_batch_seq_len,
-        other=
+        other=0,
     )
 
     qk = tl.load(
@@ -250,7 +249,6 @@ def _token_softmax_reducev_fwd(
     b_req_idx,
     b_start_loc,
     b_seq_len,
-    other_kv_index,
 ):
     BLOCK = 64
     batch, head = b_seq_len.shape[0], logics.shape[0]
@@ -277,7 +275,6 @@ def _token_softmax_reducev_fwd(
             o.stride(0),
             o.stride(1),
             req_to_tokens.stride(0),
-            other_kv_index,
         )
         return
 
@@ -295,7 +292,6 @@
         o.stride(0),
         o.stride(1),
         req_to_tokens.stride(0),
-        other_kv_index,
         kv_group_num=kv_group_num,
         BLOCK_DMODEL=v_buffer.shape[-1],
         BLOCK_N=BLOCK,
@@ -315,9 +311,8 @@ def token_attention_fwd(
     b_start_loc,
     b_seq_len,
     max_len_in_batch,
-    other_kv_index,
     total_num_tokens,
-    sm_scale
+    sm_scale,
     logit_cap=-1,
     att_m=None,
 ):
@@ -325,7 +320,6 @@
         att_m = torch.empty(
             (q.shape[-2], total_num_tokens), dtype=REDUCE_TORCH_TYPE, device="cuda"
        )
-    sm_scale = 1.0 / (Lq**0.5) if sm_scale is None else sm_scale
 
     _token_att_m_fwd(
         q,
@@ -347,5 +341,4 @@
         b_req_idx,
         b_start_loc,
         b_seq_len,
-        other_kv_index,
     )
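The kernel change above drops the dummy other_kv_index and uses other=0 as the fill value of a masked load. How other= behaves in Triton, shown on a toy kernel (not sglang code; requires a CUDA device):

import torch
import triton
import triton.language as tl


@triton.jit
def masked_copy(src_ptr, dst_ptr, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    # Lanes with mask=False never touch memory; they read `other` instead,
    # so 0 is a safe fill value and no valid dummy index is needed.
    x = tl.load(src_ptr + offs, mask=offs < n, other=0)
    tl.store(dst_ptr + offs, x, mask=offs < n)


src = torch.arange(10, device="cuda", dtype=torch.float32)
dst = torch.empty_like(src)
masked_copy[(1,)](src, dst, src.numel(), BLOCK=16)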
sglang/srt/managers/controller/cuda_graph_runner.py
ADDED
@@ -0,0 +1,196 @@
+"""Run the model with cuda graph."""
+
+import bisect
+
+import torch
+from vllm.distributed.parallel_state import graph_capture
+
+from sglang.global_config import global_config
+from sglang.srt.layers.logits_processor import LogitProcessorOutput
+from sglang.srt.managers.controller.infer_batch import (
+    Batch,
+    ForwardMode,
+    InputMetadata,
+    init_flashinfer_args,
+)
+
+
+class CudaGraphRunner:
+    def __init__(self, model_runner, max_batch_size_to_capture):
+        self.model_runner = model_runner
+        self.graphs = {}
+        self.input_buffers = {}
+        self.output_buffers = {}
+        self.flashinfer_handlers = {}
+        self.graph_memory_pool = None
+
+        # Common inputs
+        self.max_bs = max_batch_size_to_capture
+        self.input_ids = torch.zeros((self.max_bs,), dtype=torch.int32, device="cuda")
+        self.req_pool_indices = torch.zeros(
+            (self.max_bs,), dtype=torch.int32, device="cuda"
+        )
+        self.seq_lens = torch.ones((self.max_bs,), dtype=torch.int32, device="cuda")
+        self.position_ids_offsets = torch.zeros(
+            (self.max_bs,), dtype=torch.int32, device="cuda"
+        )
+        self.out_cache_loc = torch.zeros(
+            (self.max_bs,), dtype=torch.int32, device="cuda"
+        )
+
+        # FlashInfer inputs
+        self.flashinfer_workspace_buffer = (
+            self.model_runner.flashinfer_workspace_buffers[0]
+        )
+        self.flashinfer_kv_indptr = torch.zeros(
+            (self.max_bs + 1,), dtype=torch.int32, device="cuda"
+        )
+        self.flashinfer_kv_indices = torch.zeros(
+            (self.max_bs * model_runner.model_config.context_len,),
+            dtype=torch.int32,
+            device="cuda",
+        )
+        self.flashinfer_kv_last_page_len = torch.ones(
+            (self.max_bs,), dtype=torch.int32, device="cuda"
+        )
+
+    def can_run(self, batch_size):
+        return batch_size < self.max_bs
+
+    def capture(self, batch_size_list):
+        self.batch_size_list = batch_size_list
+        with graph_capture() as graph_capture_context:
+            self.stream = graph_capture_context.stream
+            for bs in batch_size_list:
+                (
+                    graph,
+                    input_buffers,
+                    output_buffers,
+                    flashinfer_handler,
+                ) = self.capture_one_batch_size(bs)
+                self.graphs[bs] = graph
+                self.input_buffers[bs] = input_buffers
+                self.output_buffers[bs] = output_buffers
+                self.flashinfer_handlers[bs] = flashinfer_handler
+
+    def capture_one_batch_size(self, bs):
+        from flashinfer import BatchDecodeWithPagedKVCacheWrapper
+        from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
+
+        graph = torch.cuda.CUDAGraph()
+        stream = self.stream
+
+        # Common inputs
+        input_ids = self.input_ids[:bs]
+        req_pool_indices = self.req_pool_indices[:bs]
+        seq_lens = self.seq_lens[:bs]
+        position_ids_offsets = self.position_ids_offsets[:bs]
+        out_cache_loc = self.out_cache_loc[:bs]
+
+        # FlashInfer inputs
+        if not _grouped_size_compiled_for_decode_kernels(
+            self.model_runner.model_config.num_attention_heads
+            // self.model_runner.tp_size,
+            self.model_runner.model_config.get_num_kv_heads(self.model_runner.tp_size),
+        ):
+            use_tensor_cores = True
+        else:
+            use_tensor_cores = False
+        flashinfer_decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
+            self.flashinfer_workspace_buffer,
+            "NHD",
+            use_cuda_graph=True,
+            use_tensor_cores=use_tensor_cores,
+            paged_kv_indptr_buffer=self.flashinfer_kv_indptr[: bs + 1],
+            paged_kv_indices_buffer=self.flashinfer_kv_indices,
+            paged_kv_last_page_len_buffer=self.flashinfer_kv_last_page_len[:bs],
+        )
+        init_flashinfer_args(
+            ForwardMode.DECODE,
+            self.model_runner,
+            req_pool_indices,
+            seq_lens,
+            None,
+            flashinfer_decode_wrapper,
+        )
+
+        # Run and capture
+        def run_once():
+            input_metadata = InputMetadata.create(
+                self.model_runner,
+                forward_mode=ForwardMode.DECODE,
+                req_pool_indices=req_pool_indices,
+                seq_lens=seq_lens,
+                prefix_lens=None,
+                position_ids_offsets=position_ids_offsets,
+                out_cache_loc=out_cache_loc,
+                return_logprob=False,
+                top_logprobs_nums=0,
+                skip_flashinfer_init=True,
+            )
+            input_metadata.flashinfer_decode_wrapper = flashinfer_decode_wrapper
+            return self.model_runner.model.forward(
+                input_ids, input_metadata.positions, input_metadata
+            )
+
+        for _ in range(2):
+            run_once()
+
+        torch.cuda.synchronize()
+        with torch.cuda.graph(graph, pool=self.graph_memory_pool, stream=stream):
+            out = run_once()
+        torch.cuda.synchronize()
+        self.graph_memory_pool = graph.pool()
+        return graph, None, out, flashinfer_decode_wrapper
+
+    def replay(self, batch: Batch):
+        assert batch.out_cache_loc is not None
+        assert not batch.return_logprob
+        raw_bs = len(batch.reqs)
+
+        # Pad
+        index = bisect.bisect_left(self.batch_size_list, raw_bs)
+        bs = self.batch_size_list[index]
+        if bs != raw_bs:
+            self.seq_lens.zero_()
+            self.position_ids_offsets.fill_(1)
+            self.out_cache_loc.zero_()
+
+        # Common inputs
+        self.input_ids[:raw_bs] = batch.input_ids
+        self.req_pool_indices[:raw_bs] = batch.req_pool_indices
+        self.seq_lens[:raw_bs] = batch.seq_lens
+        self.position_ids_offsets[:raw_bs] = batch.position_ids_offsets
+        self.out_cache_loc[:raw_bs] = batch.out_cache_loc
+
+        # FlashInfer inputs
+        init_flashinfer_args(
+            ForwardMode.DECODE,
+            self.model_runner,
+            self.req_pool_indices[:bs],
+            self.seq_lens[:bs],
+            None,
+            self.flashinfer_handlers[bs],
+        )
+
+        # Replay
+        self.graphs[bs].replay()
+        output = self.output_buffers[bs]
+
+        # Unpad
+        if bs == raw_bs:
+            return output
+        else:
+            output = LogitProcessorOutput(
+                next_token_logits=output.next_token_logits[:raw_bs],
+                next_token_logprobs=output.next_token_logprobs[:raw_bs]
+                if output.next_token_logprobs is not None
+                else None,
+                normalized_prompt_logprobs=None,
+                prefill_token_logprobs=None,
+                prefill_top_logprobs=None,
+                decode_top_logprobs=output.decode_top_logprobs[:raw_bs]
+                if output.decode_top_logprobs is not None
+                else None,
+            )
+            return output