sglang 0.2.13__py3-none-any.whl → 0.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +6 -0
- sglang/bench_latency.py +7 -3
- sglang/bench_serving.py +50 -26
- sglang/check_env.py +15 -0
- sglang/lang/chat_template.py +10 -5
- sglang/lang/compiler.py +4 -0
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +9 -0
- sglang/launch_server.py +8 -1
- sglang/srt/conversation.py +50 -1
- sglang/srt/hf_transformers_utils.py +22 -23
- sglang/srt/layers/activation.py +24 -1
- sglang/srt/layers/decode_attention.py +338 -50
- sglang/srt/layers/fused_moe/layer.py +2 -2
- sglang/srt/layers/layernorm.py +3 -0
- sglang/srt/layers/logits_processor.py +60 -23
- sglang/srt/layers/radix_attention.py +3 -4
- sglang/srt/layers/sampler.py +154 -0
- sglang/srt/managers/controller_multi.py +2 -8
- sglang/srt/managers/controller_single.py +7 -10
- sglang/srt/managers/detokenizer_manager.py +20 -9
- sglang/srt/managers/io_struct.py +44 -11
- sglang/srt/managers/policy_scheduler.py +5 -2
- sglang/srt/managers/schedule_batch.py +52 -167
- sglang/srt/managers/tokenizer_manager.py +192 -83
- sglang/srt/managers/tp_worker.py +130 -43
- sglang/srt/mem_cache/memory_pool.py +82 -8
- sglang/srt/mm_utils.py +79 -7
- sglang/srt/model_executor/cuda_graph_runner.py +49 -11
- sglang/srt/model_executor/forward_batch_info.py +59 -27
- sglang/srt/model_executor/model_runner.py +210 -61
- sglang/srt/models/chatglm.py +4 -12
- sglang/srt/models/commandr.py +5 -1
- sglang/srt/models/dbrx.py +5 -1
- sglang/srt/models/deepseek.py +5 -1
- sglang/srt/models/deepseek_v2.py +5 -1
- sglang/srt/models/gemma.py +5 -1
- sglang/srt/models/gemma2.py +15 -7
- sglang/srt/models/gpt_bigcode.py +5 -1
- sglang/srt/models/grok.py +16 -2
- sglang/srt/models/internlm2.py +5 -1
- sglang/srt/models/llama2.py +7 -3
- sglang/srt/models/llama_classification.py +2 -2
- sglang/srt/models/llama_embedding.py +4 -0
- sglang/srt/models/llava.py +176 -59
- sglang/srt/models/minicpm.py +5 -1
- sglang/srt/models/mixtral.py +5 -1
- sglang/srt/models/mixtral_quant.py +5 -1
- sglang/srt/models/qwen.py +5 -2
- sglang/srt/models/qwen2.py +13 -3
- sglang/srt/models/qwen2_moe.py +5 -14
- sglang/srt/models/stablelm.py +5 -1
- sglang/srt/openai_api/adapter.py +117 -37
- sglang/srt/sampling/sampling_batch_info.py +209 -0
- sglang/srt/{sampling_params.py → sampling/sampling_params.py} +18 -0
- sglang/srt/server.py +84 -56
- sglang/srt/server_args.py +43 -15
- sglang/srt/utils.py +26 -16
- sglang/test/runners.py +23 -31
- sglang/test/simple_eval_common.py +9 -10
- sglang/test/simple_eval_gpqa.py +2 -1
- sglang/test/simple_eval_humaneval.py +2 -2
- sglang/test/simple_eval_math.py +2 -1
- sglang/test/simple_eval_mmlu.py +2 -1
- sglang/test/test_activation.py +55 -0
- sglang/test/test_utils.py +36 -53
- sglang/version.py +1 -1
- {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/METADATA +92 -25
- sglang-0.2.14.dist-info/RECORD +114 -0
- {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/WHEEL +1 -1
- sglang/launch_server_llavavid.py +0 -29
- sglang-0.2.13.dist-info/RECORD +0 -112
- {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/LICENSE +0 -0
- {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/cuda_graph_runner.py
@@ -25,16 +25,18 @@ from vllm.distributed.parallel_state import graph_capture
 from vllm.model_executor.custom_op import CustomOp

 from sglang.srt.layers.logits_processor import (
-    LogitProcessorOutput,
     LogitsMetadata,
     LogitsProcessor,
+    LogitsProcessorOutput,
 )
+from sglang.srt.layers.sampler import SampleOutput
 from sglang.srt.managers.schedule_batch import ScheduleBatch
 from sglang.srt.model_executor.forward_batch_info import (
     ForwardMode,
     InputMetadata,
     update_flashinfer_indices,
 )
+from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.utils import monkey_patch_vllm_all_gather

@@ -84,13 +86,20 @@ def set_torch_compile_config():


 class CudaGraphRunner:
-    def __init__(
+    def __init__(
+        self,
+        model_runner,
+        max_batch_size_to_capture: int,
+        use_torch_compile: bool,
+        disable_padding: bool,
+    ):
         self.model_runner = model_runner
         self.graphs = {}
         self.input_buffers = {}
         self.output_buffers = {}
         self.flashinfer_handlers = {}
         self.graph_memory_pool = None
+        self.disable_padding = disable_padding

         # Common inputs
         self.max_bs = max_batch_size_to_capture
@@ -136,13 +145,20 @@ class CudaGraphRunner:
             self.flashinfer_kv_indices.clone(),
         ]

+        # Sampling inputs
+        vocab_size = model_runner.model_config.vocab_size
+        self.sampling_info = SamplingBatchInfo.dummy_one(self.max_bs, vocab_size)
+
         self.compile_bs = [1, 2, 4, 8, 16, 24, 32] if use_torch_compile else []

         if use_torch_compile:
             set_torch_compile_config()

     def can_run(self, batch_size):
-
+        if self.disable_padding:
+            return batch_size in self.graphs
+        else:
+            return batch_size <= self.max_bs

     def capture(self, batch_size_list):
         self.batch_size_list = batch_size_list
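The two additions above change how the runner decides whether a decode batch can go through a captured CUDA graph: with padding allowed, any batch no larger than max_bs can be padded up to a captured size; with the new disable_padding switch, only exactly-captured sizes qualify. A minimal sketch of that selection rule, using a hypothetical pick_graph_bs helper that is not part of sglang:

    def pick_graph_bs(batch_size, captured_sizes, max_bs, disable_padding):
        # Hedged sketch of the eligibility rule implied by can_run() above.
        if disable_padding:
            # only replay graphs captured for exactly this batch size
            return batch_size if batch_size in captured_sizes else None
        if batch_size > max_bs:
            return None
        # otherwise pad the batch up to the smallest captured size that fits
        return min(s for s in captured_sizes if s >= batch_size)

    assert pick_graph_bs(3, [1, 2, 4, 8], 8, disable_padding=False) == 4
    assert pick_graph_bs(3, [1, 2, 4, 8], 8, disable_padding=True) is None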
@@ -224,6 +240,7 @@ class CudaGraphRunner:
         def run_once():
             input_metadata = InputMetadata(
                 forward_mode=ForwardMode.DECODE,
+                sampling_info=self.sampling_info[:bs],
                 batch_size=bs,
                 req_pool_indices=req_pool_indices,
                 seq_lens=seq_lens,
@@ -239,12 +256,23 @@
             return forward(input_ids, input_metadata.positions, input_metadata)

         for _ in range(2):
+            torch.cuda.synchronize()
+            self.model_runner.tp_group.barrier()
+
             run_once()

+        torch.cuda.synchronize()
+        self.model_runner.tp_group.barrier()
+
         torch.cuda.synchronize()
+        self.model_runner.tp_group.barrier()
+
         with torch.cuda.graph(graph, pool=self.graph_memory_pool, stream=stream):
             out = run_once()
+
         torch.cuda.synchronize()
+        self.model_runner.tp_group.barrier()
+
         self.graph_memory_pool = graph.pool()
         return graph, None, out, flashinfer_decode_wrapper

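The synchronize/barrier pairs added around warmup and capture keep all tensor-parallel ranks in lockstep, so no rank starts recording its graph while another is still finishing eager warmup work. A minimal single-GPU sketch of the same warmup-then-capture pattern (torch API only, no TP barrier), assuming a callable run_once that performs one decode step:

    import torch

    def capture_decode_graph(run_once, warmup_iters=2):
        graph = torch.cuda.CUDAGraph()
        for _ in range(warmup_iters):
            run_once()                   # warm up allocator and lazy initializations
        torch.cuda.synchronize()         # nothing may still be in flight before capture
        with torch.cuda.graph(graph):
            out = run_once()             # recorded into the graph, not executed eagerly
        torch.cuda.synchronize()
        return graph, out                # later: graph.replay() reruns the recorded work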
@@ -277,25 +305,35 @@ class CudaGraphRunner:
             self.flashinfer_handlers[bs],
         )

+        # Sampling inputs
+        self.sampling_info.inplace_assign(raw_bs, batch.sampling_info)
+
         # Replay
+        torch.cuda.synchronize()
         self.graphs[bs].replay()
-
+        torch.cuda.synchronize()
+        sample_output, logits_output = self.output_buffers[bs]

         # Unpad
         if bs != raw_bs:
-
-            next_token_logits=
+            logits_output = LogitsProcessorOutput(
+                next_token_logits=logits_output.next_token_logits[:raw_bs],
                 next_token_logprobs=None,
                 normalized_prompt_logprobs=None,
                 input_token_logprobs=None,
                 input_top_logprobs=None,
                 output_top_logprobs=None,
             )
+            sample_output = SampleOutput(
+                sample_output.success[:raw_bs],
+                sample_output.probs[:raw_bs],
+                sample_output.batch_next_token_ids[:raw_bs],
+            )

         # Extract logprobs
         if batch.return_logprob:
-
-
+            logits_output.next_token_logprobs = torch.nn.functional.log_softmax(
+                logits_output.next_token_logits, dim=-1
             )
             return_top_logprob = any(x > 0 for x in batch.top_logprobs_nums)
             if return_top_logprob:
@@ -303,8 +341,8 @@ class CudaGraphRunner:
                     forward_mode=ForwardMode.DECODE,
                     top_logprobs_nums=batch.top_logprobs_nums,
                 )
-
-
+                logits_output.output_top_logprobs = LogitsProcessor.get_top_logprobs(
+                    logits_output.next_token_logprobs, logits_metadata
                 )[1]

-        return
+        return sample_output, logits_output
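With sampling fused into the captured graph, replay now hands back both the sampled token ids and the logits bundle, and both are sliced from the padded graph batch size back to the real one. A small sketch of that un-padding step, mirroring the SampleOutput fields used in the hunks above (the NamedTuple here is a stand-in, not the actual class definition):

    from typing import NamedTuple
    import torch

    class SampleOutput(NamedTuple):      # stand-in mirroring the fields used above
        success: torch.Tensor
        probs: torch.Tensor
        batch_next_token_ids: torch.Tensor

    def unpad_sample_output(out: SampleOutput, raw_bs: int) -> SampleOutput:
        # outputs come back at the padded batch size; keep only the real requests
        return SampleOutput(
            out.success[:raw_bs],
            out.probs[:raw_bs],
            out.batch_next_token_ids[:raw_bs],
        )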
sglang/srt/model_executor/forward_batch_info.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 """
 Copyright 2023-2024 SGLang Team
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,7 +18,7 @@ limitations under the License.
 """ModelRunner runs the forward passes of the models."""
 from dataclasses import dataclass
 from enum import IntEnum, auto
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING, List

 import numpy as np
 import torch
@@ -26,6 +28,7 @@ from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool

 if TYPE_CHECKING:
     from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo


 class ForwardMode(IntEnum):
@@ -42,6 +45,7 @@ class InputMetadata:
     """Store all inforamtion of a forward pass."""

     forward_mode: ForwardMode
+    sampling_info: SamplingBatchInfo
     batch_size: int
     req_pool_indices: torch.Tensor
     seq_lens: torch.Tensor
@@ -61,9 +65,11 @@
     extend_start_loc: torch.Tensor = None
     extend_no_prefix: bool = None

-    #
+    # For logprob
     return_logprob: bool = False
     top_logprobs_nums: List[int] = None
+    extend_seq_lens_cpu: List[int] = None
+    logprob_start_lens_cpu: List[int] = None

     # For multimodal
     pixel_values: List[torch.Tensor] = None
@@ -86,14 +92,19 @@ class InputMetadata:
         reqs = batch.reqs
         self.pixel_values = [r.pixel_values for r in reqs]
         self.image_sizes = [r.image_size for r in reqs]
-        self.image_offsets = [
-
-
-
-
-
-
-
+        self.image_offsets = []
+        for r in reqs:
+            if isinstance(r.image_offset, list):
+                self.image_offsets.append(
+                    [
+                        (image_offset - len(r.prefix_indices))
+                        for image_offset in r.image_offset
+                    ]
+                )
+            elif isinstance(r.image_offset, int):
+                self.image_offsets.append(r.image_offset - len(r.prefix_indices))
+            elif r.image_offset is None:
+                self.image_offsets.append(0)

     def compute_positions(self, batch: ScheduleBatch):
         position_ids_offsets = batch.position_ids_offsets
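The rewritten image-offset handling above accepts offsets stored as a single int, a list of ints, or None, and shifts each by the length of the cached prefix so the offsets stay valid inside the extend region. The same logic as a standalone sketch (hypothetical helper name, not sglang code):

    def shift_image_offset(image_offset, prefix_len):
        # offsets were recorded against the full prompt; rebase them to the
        # uncached (extend) part by subtracting the prefix length
        if isinstance(image_offset, list):
            return [off - prefix_len for off in image_offset]
        if isinstance(image_offset, int):
            return image_offset - prefix_len
        return 0  # request has no image

    assert shift_image_offset([10, 30], 8) == [2, 22]
    assert shift_image_offset(10, 8) == 2
    assert shift_image_offset(None, 8) == 0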
@@ -109,8 +120,8 @@ class InputMetadata:
             self.positions = torch.tensor(
                 np.concatenate(
                     [
-                        np.arange(
-                        for req in batch.reqs
+                        np.arange(batch.prefix_lens_cpu[i], len(req.fill_ids))
+                        for i, req in enumerate(batch.reqs)
                     ],
                     axis=0,
                 ),
@@ -123,7 +134,7 @@ class InputMetadata:
                 np.concatenate(
                     [
                         np.arange(
-
+                            batch.prefix_lens_cpu[i] + position_ids_offsets_cpu[i],
                             len(req.fill_ids) + position_ids_offsets_cpu[i],
                         )
                         for i, req in enumerate(batch.reqs)
@@ -139,14 +150,29 @@ class InputMetadata:
     def compute_extend_infos(self, batch: ScheduleBatch):
         if self.forward_mode == ForwardMode.DECODE:
             self.extend_seq_lens = self.extend_start_loc = self.extend_no_prefix = None
+            self.extend_seq_lens_cpu = self.logprob_start_lens_cpu = None
         else:
             extend_lens_cpu = [
-                len(r.fill_ids) -
+                len(r.fill_ids) - batch.prefix_lens_cpu[i]
+                for i, r in enumerate(batch.reqs)
             ]
             self.extend_seq_lens = torch.tensor(extend_lens_cpu, device="cuda")
             self.extend_start_loc = torch.zeros_like(self.seq_lens)
             self.extend_start_loc[1:] = torch.cumsum(self.extend_seq_lens[:-1], dim=0)
-            self.extend_no_prefix = all(
+            self.extend_no_prefix = all(l == 0 for l in batch.prefix_lens_cpu)
+
+            self.extend_seq_lens_cpu = extend_lens_cpu
+            self.logprob_start_lens_cpu = [
+                (
+                    min(
+                        req.logprob_start_len - batch.prefix_lens_cpu[i],
+                        extend_lens_cpu[i] - 1,
+                    )
+                    if req.logprob_start_len >= batch.prefix_lens_cpu[i]
+                    else extend_lens_cpu[i] - 1  # Fake extend, actually decode
+                )
+                for i, req in enumerate(batch.reqs)
+            ]

     @classmethod
     def from_schedule_batch(
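compute_extend_infos now also keeps CPU-side copies of the extend lengths and derives, per request, where logprob extraction starts inside the extend region: the requested start is rebased by the prefix length and clamped to the last extend position, and when it falls inside the cached prefix (a "fake extend" that is effectively a decode step) it falls back to that last position. The same rule as a standalone sketch (hypothetical helper name):

    def logprob_start_in_extend(logprob_start_len, prefix_len, extend_len):
        # rebase the absolute start position into the extend region, clamped so it
        # never points past the last extend token
        if logprob_start_len >= prefix_len:
            return min(logprob_start_len - prefix_len, extend_len - 1)
        # start falls inside the cached prefix: fake extend, actually decode
        return extend_len - 1

    assert logprob_start_in_extend(12, 8, 6) == 4
    assert logprob_start_in_extend(30, 8, 6) == 5
    assert logprob_start_in_extend(3, 8, 6) == 5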
@@ -157,6 +183,7 @@ class InputMetadata:
     ):
         ret = cls(
             forward_mode=forward_mode,
+            sampling_info=batch.sampling_info,
             batch_size=batch.batch_size(),
             req_pool_indices=batch.req_pool_indices,
             seq_lens=batch.seq_lens,
@@ -167,6 +194,8 @@ class InputMetadata:
             top_logprobs_nums=batch.top_logprobs_nums,
         )

+        ret.sampling_info.prepare_penalties()
+
         ret.compute_positions(batch)

         ret.compute_extend_infos(batch)
@@ -180,14 +209,8 @@ class InputMetadata:
         if forward_mode != ForwardMode.DECODE:
             ret.init_multimuldal_info(batch)

-        prefix_lens = None
-        if forward_mode != ForwardMode.DECODE:
-            prefix_lens = torch.tensor(
-                [len(r.prefix_indices) for r in batch.reqs], device="cuda"
-            )
-
         if model_runner.server_args.disable_flashinfer:
-            ret.init_triton_args(batch
+            ret.init_triton_args(batch)

         flashinfer_use_ragged = False
         if not model_runner.server_args.disable_flashinfer:
@@ -198,30 +221,35 @@ class InputMetadata:
         ):
             flashinfer_use_ragged = True
             ret.init_flashinfer_handlers(
-                model_runner,
+                model_runner, batch.prefix_lens_cpu, flashinfer_use_ragged
             )

         return ret

-    def init_triton_args(self, batch: ScheduleBatch
+    def init_triton_args(self, batch: ScheduleBatch):
         """Init auxiliary variables for triton attention backend."""
         self.triton_max_seq_len = int(torch.max(self.seq_lens))
-        self.triton_prefix_lens = prefix_lens
         self.triton_start_loc = torch.zeros_like(self.seq_lens, dtype=torch.int32)
         self.triton_start_loc[1:] = torch.cumsum(self.seq_lens[:-1], dim=0)

         if self.forward_mode == ForwardMode.DECODE:
             self.triton_max_extend_len = None
         else:
-
+            self.triton_prefix_lens = torch.tensor(batch.prefix_lens_cpu, device="cuda")
+            extend_seq_lens = self.seq_lens - self.triton_prefix_lens
             self.triton_max_extend_len = int(torch.max(extend_seq_lens))

     def init_flashinfer_handlers(
         self,
         model_runner,
-
+        prefix_lens_cpu,
         flashinfer_use_ragged,
     ):
+        if self.forward_mode != ForwardMode.DECODE:
+            prefix_lens = torch.tensor(prefix_lens_cpu, device="cuda")
+        else:
+            prefix_lens = None
+
         update_flashinfer_indices(
             self.forward_mode,
             model_runner,
@@ -294,6 +322,8 @@ def update_flashinfer_indices(
             num_kv_heads,
             head_dim,
             1,
+            data_type=model_runner.kv_cache_dtype,
+            q_data_type=model_runner.dtype,
         )
     else:
         # extend part
@@ -372,6 +402,8 @@ def update_flashinfer_indices(
             num_kv_heads,
             head_dim,
             1,
+            data_type=model_runner.kv_cache_dtype,
+            q_data_type=model_runner.dtype,
         )
     else:
         # extend part