sglang 0.2.15__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. sglang/bench_latency.py +10 -6
  2. sglang/bench_serving.py +33 -38
  3. sglang/global_config.py +0 -4
  4. sglang/lang/backend/runtime_endpoint.py +13 -6
  5. sglang/lang/interpreter.py +1 -1
  6. sglang/launch_server.py +3 -6
  7. sglang/launch_server_llavavid.py +7 -8
  8. sglang/srt/{model_config.py → configs/model_config.py} +5 -0
  9. sglang/srt/constrained/__init__.py +2 -0
  10. sglang/srt/constrained/fsm_cache.py +29 -38
  11. sglang/srt/constrained/jump_forward.py +0 -1
  12. sglang/srt/conversation.py +4 -1
  13. sglang/srt/hf_transformers_utils.py +2 -4
  14. sglang/srt/layers/attention_backend.py +480 -0
  15. sglang/srt/layers/flashinfer_utils.py +235 -0
  16. sglang/srt/layers/logits_processor.py +64 -77
  17. sglang/srt/layers/radix_attention.py +11 -161
  18. sglang/srt/layers/sampler.py +40 -35
  19. sglang/srt/layers/torchao_utils.py +75 -0
  20. sglang/srt/layers/{decode_attention.py → triton_attention/decode_attention.py} +67 -63
  21. sglang/srt/layers/{extend_attention.py → triton_attention/extend_attention.py} +40 -132
  22. sglang/srt/layers/{prefill_attention.py → triton_attention/prefill_attention.py} +13 -7
  23. sglang/srt/lora/lora.py +403 -0
  24. sglang/srt/lora/lora_config.py +43 -0
  25. sglang/srt/lora/lora_manager.py +256 -0
  26. sglang/srt/managers/controller_multi.py +1 -5
  27. sglang/srt/managers/controller_single.py +0 -5
  28. sglang/srt/managers/io_struct.py +16 -1
  29. sglang/srt/managers/policy_scheduler.py +122 -5
  30. sglang/srt/managers/schedule_batch.py +110 -74
  31. sglang/srt/managers/tokenizer_manager.py +24 -15
  32. sglang/srt/managers/tp_worker.py +181 -115
  33. sglang/srt/model_executor/cuda_graph_runner.py +60 -133
  34. sglang/srt/model_executor/forward_batch_info.py +35 -312
  35. sglang/srt/model_executor/model_runner.py +118 -141
  36. sglang/srt/models/baichuan.py +416 -0
  37. sglang/srt/models/chatglm.py +6 -8
  38. sglang/srt/models/commandr.py +1 -5
  39. sglang/srt/models/dbrx.py +1 -5
  40. sglang/srt/models/deepseek.py +1 -5
  41. sglang/srt/models/deepseek_v2.py +1 -5
  42. sglang/srt/models/exaone.py +8 -43
  43. sglang/srt/models/gemma.py +1 -5
  44. sglang/srt/models/gemma2.py +1 -5
  45. sglang/srt/models/gpt_bigcode.py +1 -5
  46. sglang/srt/models/grok.py +1 -5
  47. sglang/srt/models/internlm2.py +1 -5
  48. sglang/srt/models/{llama2.py → llama.py} +48 -26
  49. sglang/srt/models/llama_classification.py +14 -40
  50. sglang/srt/models/llama_embedding.py +7 -6
  51. sglang/srt/models/llava.py +38 -16
  52. sglang/srt/models/llavavid.py +7 -8
  53. sglang/srt/models/minicpm.py +1 -5
  54. sglang/srt/models/minicpm3.py +665 -0
  55. sglang/srt/models/mistral.py +2 -3
  56. sglang/srt/models/mixtral.py +6 -5
  57. sglang/srt/models/mixtral_quant.py +1 -5
  58. sglang/srt/models/qwen.py +1 -5
  59. sglang/srt/models/qwen2.py +1 -5
  60. sglang/srt/models/qwen2_moe.py +6 -5
  61. sglang/srt/models/stablelm.py +1 -5
  62. sglang/srt/models/xverse.py +375 -0
  63. sglang/srt/models/xverse_moe.py +445 -0
  64. sglang/srt/openai_api/adapter.py +65 -46
  65. sglang/srt/openai_api/protocol.py +11 -3
  66. sglang/srt/sampling/sampling_batch_info.py +67 -58
  67. sglang/srt/server.py +24 -14
  68. sglang/srt/server_args.py +130 -28
  69. sglang/srt/utils.py +12 -0
  70. sglang/test/few_shot_gsm8k.py +132 -0
  71. sglang/test/runners.py +114 -22
  72. sglang/test/test_programs.py +70 -0
  73. sglang/test/test_utils.py +89 -1
  74. sglang/utils.py +38 -4
  75. sglang/version.py +1 -1
  76. {sglang-0.2.15.dist-info → sglang-0.3.1.dist-info}/METADATA +31 -18
  77. sglang-0.3.1.dist-info/RECORD +129 -0
  78. {sglang-0.2.15.dist-info → sglang-0.3.1.dist-info}/WHEEL +1 -1
  79. sglang-0.2.15.dist-info/RECORD +0 -118
  80. {sglang-0.2.15.dist-info → sglang-0.3.1.dist-info}/LICENSE +0 -0
  81. {sglang-0.2.15.dist-info → sglang-0.3.1.dist-info}/top_level.txt +0 -0
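
Of the 81 changed files, the diff reproduced below is for sglang/srt/model_executor/cuda_graph_runner.py (item 33, +60 -133). Its main structural change is that CudaGraphRunner no longer builds FlashInfer decode wrappers and sampling buffers itself; it delegates all CUDA-graph bookkeeping to the model runner's attention backend, introduced in the new sglang/srt/layers/attention_backend.py. The following is a minimal, hypothetical sketch of the hook interface implied by the call sites in the hunks below; the method names come from the diff, but the signatures and docstrings are inferred from usage rather than copied from the released file.

from abc import ABC, abstractmethod

import torch


class AttentionBackend(ABC):
    """Sketch of the cuda-graph hooks CudaGraphRunner relies on (inferred, not copied)."""

    @abstractmethod
    def init_cuda_graph_state(self, max_bs: int) -> None:
        """Allocate fixed-size buffers shared by all captured graphs."""

    @abstractmethod
    def get_cuda_graph_seq_len_fill_value(self) -> int:
        """Value used to pad seq_lens when a batch is smaller than the graph."""

    @abstractmethod
    def init_forward_metadata_capture_cuda_graph(
        self, bs: int, req_pool_indices: torch.Tensor, seq_lens: torch.Tensor
    ) -> None:
        """Prepare per-batch-size metadata once, right before graph capture."""

    @abstractmethod
    def init_forward_metadata_replay_cuda_graph(
        self, bs: int, req_pool_indices: torch.Tensor, seq_lens: torch.Tensor
    ) -> None:
        """Refresh metadata in place before each graph replay."""
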
sglang/srt/model_executor/cuda_graph_runner.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 """
 Copyright 2023-2024 SGLang Team
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,15 +15,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-"""Run the model with cuda graph."""
+"""Run the model with cuda graph and torch.compile."""
 
 import bisect
 from contextlib import contextmanager
-from typing import Callable, List
+from typing import TYPE_CHECKING, Callable
 
 import torch
-from flashinfer import BatchDecodeWithPagedKVCacheWrapper
-from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
 from vllm.distributed.parallel_state import graph_capture
 from vllm.model_executor.custom_op import CustomOp
 
@@ -30,24 +30,23 @@ from sglang.srt.layers.logits_processor import (
     LogitsProcessor,
     LogitsProcessorOutput,
 )
-from sglang.srt.layers.sampler import SampleOutput
 from sglang.srt.managers.schedule_batch import ScheduleBatch
-from sglang.srt.model_executor.forward_batch_info import (
-    ForwardMode,
-    InputMetadata,
-    update_flashinfer_indices,
-)
-from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
 from sglang.srt.utils import monkey_patch_vllm_all_gather
 
+if TYPE_CHECKING:
+    from sglang.srt.model_executor.model_runner import ModelRunner
+
 
 def _to_torch(model: torch.nn.Module, reverse: bool = False):
     for sub in model._modules.values():
         if isinstance(sub, CustomOp):
             if reverse:
                 sub._forward_method = sub.forward_cuda
+                setattr(sub, "is_torch_compile", False)
             else:
                 sub._forward_method = sub.forward_native
+                setattr(sub, "is_torch_compile", True)
         if isinstance(sub, torch.nn.Module):
             _to_torch(sub, reverse)
 
@@ -56,6 +55,7 @@ def _to_torch(model: torch.nn.Module, reverse: bool = False):
 def patch_model(
     model: torch.nn.Module, enable_compile: bool, tp_group: "GroupCoordinator"
 ):
+    """Patch the model to make it compatible with with torch.compile"""
     backup_ca_comm = None
 
     try:
@@ -87,28 +87,33 @@ def set_torch_compile_config():
 
 
 class CudaGraphRunner:
-    def __init__(
-        self,
-        model_runner: "ModelRunner",
-        max_batch_size_to_capture: int,
-        use_torch_compile: bool,
-        disable_padding: bool,
-    ):
+    """A CudaGraphRunner runs the forward pass of a model with cuda graph and torch.compile."""
+
+    def __init__(self, model_runner: "ModelRunner"):
+        # Parse args
         self.model_runner = model_runner
         self.graphs = {}
         self.input_buffers = {}
         self.output_buffers = {}
         self.flashinfer_handlers = {}
         self.graph_memory_pool = None
-        self.disable_padding = disable_padding
+        self.use_torch_compile = model_runner.server_args.enable_torch_compile
+        self.disable_padding = model_runner.server_args.disable_cuda_graph_padding
+
+        # Batch sizes to capture
+        if self.model_runner.server_args.disable_cuda_graph_padding:
+            self.capture_bs = list(range(1, 32)) + [64, 128]
+        else:
+            self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
+        self.compile_bs = [1, 2, 4, 8, 16, 24, 32] if self.use_torch_compile else []
 
         # Common inputs
-        self.max_bs = max_batch_size_to_capture
+        self.max_bs = max(self.capture_bs)
         self.input_ids = torch.zeros((self.max_bs,), dtype=torch.int32, device="cuda")
         self.req_pool_indices = torch.zeros(
             (self.max_bs,), dtype=torch.int32, device="cuda"
         )
-        self.seq_lens = torch.zeros((self.max_bs,), dtype=torch.int32, device="cuda")
+        self.seq_lens = torch.ones((self.max_bs,), dtype=torch.int32, device="cuda")
         self.position_ids_offsets = torch.ones(
             (self.max_bs,), dtype=torch.int32, device="cuda"
         )
@@ -116,56 +121,38 @@ class CudaGraphRunner:
             (self.max_bs,), dtype=torch.int32, device="cuda"
         )
 
-        # FlashInfer inputs
-        self.flashinfer_kv_indptr = torch.zeros(
-            (self.max_bs + 1,), dtype=torch.int32, device="cuda"
-        )
-        self.flashinfer_kv_indices = torch.zeros(
-            (self.max_bs * model_runner.model_config.context_len,),
-            dtype=torch.int32,
-            device="cuda",
+        # Attention backend
+        self.model_runner.attn_backend.init_cuda_graph_state(self.max_bs)
+        self.seq_len_fill_value = (
+            self.model_runner.attn_backend.get_cuda_graph_seq_len_fill_value()
         )
-        self.flashinfer_kv_last_page_len = torch.ones(
-            (self.max_bs,), dtype=torch.int32, device="cuda"
-        )
-        if model_runner.sliding_window_size is None:
-            self.flashinfer_workspace_buffer = (
-                self.model_runner.flashinfer_workspace_buffer
-            )
-        else:
-            self.flashinfer_workspace_buffer = (
-                self.model_runner.flashinfer_workspace_buffer
-            )
-
-            self.flashinfer_kv_indptr = [
-                self.flashinfer_kv_indptr,
-                self.flashinfer_kv_indptr.clone(),
-            ]
-            self.flashinfer_kv_indices = [
-                self.flashinfer_kv_indices,
-                self.flashinfer_kv_indices.clone(),
-            ]
 
-        # Sampling inputs
-        vocab_size = model_runner.model_config.vocab_size
-        self.sampling_info = SamplingBatchInfo.dummy_one(self.max_bs, vocab_size)
-
-        self.compile_bs = [1, 2, 4, 8, 16, 24, 32] if use_torch_compile else []
-
-        if use_torch_compile:
+        if self.use_torch_compile:
            set_torch_compile_config()
 
+        # Capture
+        try:
+            self.capture()
+        except RuntimeError as e:
+            raise Exception(
+                f"Capture cuda graph failed: {e}\n"
+                "Possible solutions:\n"
+                "1. disable cuda graph by --disable-cuda-graph\n"
+                "2. set --mem-fraction-static to a smaller value\n"
+                "3. disable torch compile by not using --enable-torch-compile\n"
+                "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
+            )
+
     def can_run(self, batch_size: int):
         if self.disable_padding:
             return batch_size in self.graphs
         else:
             return batch_size <= self.max_bs
 
-    def capture(self, batch_size_list: List[int]):
-        self.batch_size_list = batch_size_list
+    def capture(self):
         with graph_capture() as graph_capture_context:
             self.stream = graph_capture_context.stream
-            for bs in batch_size_list:
+            for bs in self.capture_bs:
                 with patch_model(
                     self.model_runner.model,
                     bs in self.compile_bs,
@@ -173,14 +160,10 @@ class CudaGraphRunner:
                 ) as forward:
                     (
                         graph,
-                        input_buffers,
                         output_buffers,
-                        flashinfer_handler,
                     ) = self.capture_one_batch_size(bs, forward)
                     self.graphs[bs] = graph
-                    self.input_buffers[bs] = input_buffers
                     self.output_buffers[bs] = output_buffers
-                    self.flashinfer_handlers[bs] = flashinfer_handler
 
     def capture_one_batch_size(self, bs: int, forward: Callable):
         graph = torch.cuda.CUDAGraph()
@@ -193,67 +176,26 @@ class CudaGraphRunner:
         position_ids_offsets = self.position_ids_offsets[:bs]
         out_cache_loc = self.out_cache_loc[:bs]
 
-        # FlashInfer inputs
-        if not _grouped_size_compiled_for_decode_kernels(
-            self.model_runner.model_config.num_attention_heads
-            // self.model_runner.tp_size,
-            self.model_runner.model_config.get_num_kv_heads(self.model_runner.tp_size),
-        ):
-            use_tensor_cores = True
-        else:
-            use_tensor_cores = False
-        if self.model_runner.sliding_window_size is None:
-            flashinfer_decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
-                self.flashinfer_workspace_buffer,
-                "NHD",
-                use_cuda_graph=True,
-                use_tensor_cores=use_tensor_cores,
-                paged_kv_indptr_buffer=self.flashinfer_kv_indptr[: bs + 1],
-                paged_kv_indices_buffer=self.flashinfer_kv_indices,
-                paged_kv_last_page_len_buffer=self.flashinfer_kv_last_page_len[:bs],
-            )
-        else:
-            flashinfer_decode_wrapper = []
-            for i in range(2):
-                flashinfer_decode_wrapper.append(
-                    BatchDecodeWithPagedKVCacheWrapper(
-                        self.flashinfer_workspace_buffer,
-                        "NHD",
-                        use_cuda_graph=True,
-                        use_tensor_cores=use_tensor_cores,
-                        paged_kv_indptr_buffer=self.flashinfer_kv_indptr[i][: bs + 1],
-                        paged_kv_indices_buffer=self.flashinfer_kv_indices[i],
-                        paged_kv_last_page_len_buffer=self.flashinfer_kv_last_page_len[
-                            :bs
-                        ],
-                    )
-                )
-        update_flashinfer_indices(
-            ForwardMode.DECODE,
-            self.model_runner,
-            req_pool_indices,
-            seq_lens,
-            None,
-            flashinfer_decode_wrapper,
+        # Attention backend
+        self.model_runner.attn_backend.init_forward_metadata_capture_cuda_graph(
+            bs, req_pool_indices, seq_lens
         )
 
         # Run and capture
         def run_once():
             input_metadata = InputMetadata(
                 forward_mode=ForwardMode.DECODE,
-                sampling_info=self.sampling_info[:bs],
                 batch_size=bs,
                 req_pool_indices=req_pool_indices,
                 seq_lens=seq_lens,
                 req_to_token_pool=self.model_runner.req_to_token_pool,
                 token_to_kv_pool=self.model_runner.token_to_kv_pool,
+                attn_backend=self.model_runner.attn_backend,
                 out_cache_loc=out_cache_loc,
                 return_logprob=False,
-                top_logprobs_nums=0,
+                top_logprobs_nums=[0] * bs,
                 positions=(seq_lens - 1 + position_ids_offsets).to(torch.int64),
-                flashinfer_decode_wrapper=flashinfer_decode_wrapper,
             )
-
             return forward(input_ids, input_metadata.positions, input_metadata)
 
         for _ in range(2):
@@ -275,17 +217,17 @@ class CudaGraphRunner:
             self.model_runner.tp_group.barrier()
 
         self.graph_memory_pool = graph.pool()
-        return graph, None, out, flashinfer_decode_wrapper
+        return graph, out
 
     def replay(self, batch: ScheduleBatch):
         assert batch.out_cache_loc is not None
         raw_bs = len(batch.reqs)
 
         # Pad
-        index = bisect.bisect_left(self.batch_size_list, raw_bs)
-        bs = self.batch_size_list[index]
+        index = bisect.bisect_left(self.capture_bs, raw_bs)
+        bs = self.capture_bs[index]
         if bs != raw_bs:
-            self.seq_lens.zero_()
+            self.seq_lens.fill_(self.seq_len_fill_value)
             self.position_ids_offsets.fill_(1)
             self.out_cache_loc.zero_()
 
@@ -296,24 +238,14 @@ class CudaGraphRunner:
         self.position_ids_offsets[:raw_bs] = batch.position_ids_offsets
         self.out_cache_loc[:raw_bs] = batch.out_cache_loc
 
-        # FlashInfer inputs
-        update_flashinfer_indices(
-            ForwardMode.DECODE,
-            self.model_runner,
-            self.req_pool_indices[:bs],
-            self.seq_lens[:bs],
-            None,
-            self.flashinfer_handlers[bs],
+        # Attention backend
+        self.model_runner.attn_backend.init_forward_metadata_replay_cuda_graph(
+            bs, self.req_pool_indices, self.seq_lens
         )
 
-        # Sampling inputs
-        self.sampling_info.inplace_assign(raw_bs, batch.sampling_info)
-
         # Replay
-        torch.cuda.synchronize()
         self.graphs[bs].replay()
-        torch.cuda.synchronize()
-        sample_output, logits_output = self.output_buffers[bs]
+        logits_output = self.output_buffers[bs]
 
         # Unpad
         if bs != raw_bs:
@@ -325,11 +257,6 @@ class CudaGraphRunner:
                 input_top_logprobs=None,
                 output_top_logprobs=None,
             )
-            sample_output = SampleOutput(
-                sample_output.success[:raw_bs],
-                sample_output.probs[:raw_bs],
-                sample_output.batch_next_token_ids[:raw_bs],
-            )
 
         # Extract logprobs
         if batch.return_logprob:
@@ -346,4 +273,4 @@ class CudaGraphRunner:
             logits_output.next_token_logprobs, logits_metadata
         )[1]
 
-        return sample_output, logits_output
+        return logits_output
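
The replay path above pads each incoming batch up to the nearest captured batch size, fills the padded seq_lens slots with seq_len_fill_value, and trims the outputs back to the real size afterwards. A small, self-contained illustration of that bisect-based selection (plain Python, independent of the sglang classes; the helper name is ours, not the library's):

import bisect

# Capture list used when cuda-graph padding is enabled (from __init__ above).
capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]  # 1, 2, 4, 8, 16, ..., 160


def padded_batch_size(raw_bs: int) -> int:
    """Smallest captured batch size that can hold raw_bs requests."""
    # bisect_left returns the index of the first captured size >= raw_bs,
    # mirroring the lookup in CudaGraphRunner.replay().
    return capture_bs[bisect.bisect_left(capture_bs, raw_bs)]


assert padded_batch_size(3) == 4
assert padded_batch_size(9) == 16
assert padded_batch_size(160) == 160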