sglang 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +23 -1
  3. sglang/bench_latency.py +46 -25
  4. sglang/bench_serving.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +14 -1
  6. sglang/lang/interpreter.py +16 -6
  7. sglang/lang/ir.py +20 -4
  8. sglang/srt/configs/model_config.py +11 -9
  9. sglang/srt/constrained/fsm_cache.py +9 -1
  10. sglang/srt/constrained/jump_forward.py +15 -2
  11. sglang/srt/layers/activation.py +4 -4
  12. sglang/srt/layers/attention/__init__.py +49 -0
  13. sglang/srt/layers/attention/flashinfer_backend.py +277 -0
  14. sglang/srt/layers/{flashinfer_utils.py → attention/flashinfer_utils.py} +82 -80
  15. sglang/srt/layers/attention/triton_backend.py +161 -0
  16. sglang/srt/layers/{triton_attention → attention/triton_ops}/extend_attention.py +3 -1
  17. sglang/srt/layers/layernorm.py +4 -4
  18. sglang/srt/layers/logits_processor.py +19 -15
  19. sglang/srt/layers/pooler.py +3 -3
  20. sglang/srt/layers/quantization/__init__.py +0 -2
  21. sglang/srt/layers/radix_attention.py +6 -4
  22. sglang/srt/layers/sampler.py +6 -4
  23. sglang/srt/layers/torchao_utils.py +18 -0
  24. sglang/srt/lora/lora.py +20 -21
  25. sglang/srt/lora/lora_manager.py +97 -25
  26. sglang/srt/managers/detokenizer_manager.py +31 -18
  27. sglang/srt/managers/image_processor.py +187 -0
  28. sglang/srt/managers/io_struct.py +99 -75
  29. sglang/srt/managers/schedule_batch.py +184 -63
  30. sglang/srt/managers/{policy_scheduler.py → schedule_policy.py} +31 -21
  31. sglang/srt/managers/scheduler.py +1021 -0
  32. sglang/srt/managers/tokenizer_manager.py +120 -248
  33. sglang/srt/managers/tp_worker.py +28 -925
  34. sglang/srt/mem_cache/memory_pool.py +34 -52
  35. sglang/srt/model_executor/cuda_graph_runner.py +15 -19
  36. sglang/srt/model_executor/forward_batch_info.py +94 -95
  37. sglang/srt/model_executor/model_runner.py +76 -75
  38. sglang/srt/models/baichuan.py +10 -10
  39. sglang/srt/models/chatglm.py +12 -12
  40. sglang/srt/models/commandr.py +10 -10
  41. sglang/srt/models/dbrx.py +12 -12
  42. sglang/srt/models/deepseek.py +10 -10
  43. sglang/srt/models/deepseek_v2.py +14 -15
  44. sglang/srt/models/exaone.py +10 -10
  45. sglang/srt/models/gemma.py +10 -10
  46. sglang/srt/models/gemma2.py +11 -11
  47. sglang/srt/models/gpt_bigcode.py +10 -10
  48. sglang/srt/models/grok.py +10 -10
  49. sglang/srt/models/internlm2.py +10 -10
  50. sglang/srt/models/llama.py +14 -10
  51. sglang/srt/models/llama_classification.py +5 -5
  52. sglang/srt/models/llama_embedding.py +4 -4
  53. sglang/srt/models/llama_reward.py +142 -0
  54. sglang/srt/models/llava.py +39 -33
  55. sglang/srt/models/llavavid.py +31 -28
  56. sglang/srt/models/minicpm.py +10 -10
  57. sglang/srt/models/minicpm3.py +14 -15
  58. sglang/srt/models/mixtral.py +10 -10
  59. sglang/srt/models/mixtral_quant.py +10 -10
  60. sglang/srt/models/olmoe.py +10 -10
  61. sglang/srt/models/qwen.py +10 -10
  62. sglang/srt/models/qwen2.py +11 -11
  63. sglang/srt/models/qwen2_moe.py +10 -10
  64. sglang/srt/models/stablelm.py +10 -10
  65. sglang/srt/models/torch_native_llama.py +506 -0
  66. sglang/srt/models/xverse.py +10 -10
  67. sglang/srt/models/xverse_moe.py +10 -10
  68. sglang/srt/sampling/sampling_batch_info.py +36 -27
  69. sglang/srt/sampling/sampling_params.py +3 -1
  70. sglang/srt/server.py +170 -119
  71. sglang/srt/server_args.py +54 -27
  72. sglang/srt/utils.py +101 -128
  73. sglang/test/runners.py +71 -26
  74. sglang/test/test_programs.py +38 -5
  75. sglang/test/test_utils.py +18 -9
  76. sglang/version.py +1 -1
  77. {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/METADATA +37 -19
  78. sglang-0.3.3.dist-info/RECORD +139 -0
  79. sglang/srt/layers/attention_backend.py +0 -474
  80. sglang/srt/managers/controller_multi.py +0 -207
  81. sglang/srt/managers/controller_single.py +0 -164
  82. sglang-0.3.2.dist-info/RECORD +0 -135
  83. /sglang/srt/layers/{triton_attention → attention/triton_ops}/decode_attention.py +0 -0
  84. /sglang/srt/layers/{triton_attention → attention/triton_ops}/prefill_attention.py +0 -0
  85. {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/LICENSE +0 -0
  86. {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/WHEEL +0 -0
  87. {sglang-0.3.2.dist-info → sglang-0.3.3.dist-info}/top_level.txt +0 -0
sglang/srt/managers/io_struct.py

@@ -18,7 +18,6 @@ The definition of objects transfered between different
 processes (TokenizerManager, DetokenizerManager, Controller).
 """
 
-import copy
 import uuid
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Union
@@ -37,7 +36,7 @@ class GenerateReqInput:
     # See also python/sglang/srt/utils.py:load_image.
     image_data: Optional[Union[List[str], str]] = None
     # The sampling_params. See descriptions below.
-    sampling_params: Union[List[Dict], Dict] = None
+    sampling_params: Optional[Union[List[Dict], Dict]] = None
     # The request id.
     rid: Optional[Union[List[str], str]] = None
     # Whether to return logprobs.
@@ -53,9 +52,6 @@ class GenerateReqInput:
     stream: bool = False
     # The modalities of the image data [image, multi-images, video]
     modalities: Optional[List[str]] = None
-
-    is_single: bool = True
-
     # LoRA related
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
 
@@ -65,19 +61,41 @@
         ):
             raise ValueError("Either text or input_ids should be provided.")
 
-        if (
-            isinstance(self.sampling_params, dict)
-            and self.sampling_params.get("n", 1) != 1
-        ):
-            is_single = False
+        self.is_single = False
+        if self.text is not None:
+            if isinstance(self.text, str):
+                self.is_single = True
+                self.batch_size = 1
+            else:
+                self.batch_size = len(self.text)
         else:
-            if self.text is not None:
-                is_single = isinstance(self.text, str)
+            if isinstance(self.input_ids[0], int):
+                self.is_single = True
+                self.batch_size = 1
             else:
-                is_single = isinstance(self.input_ids[0], int)
-        self.is_single = is_single
-
-        if is_single:
+                self.batch_size = len(self.input_ids)
+
+        if self.sampling_params is None:
+            self.parallel_sample_num = 1
+        elif isinstance(self.sampling_params, dict):
+            self.parallel_sample_num = self.sampling_params.get("n", 1)
+        else:  # isinstance(self.sampling_params, list):
+            self.parallel_sample_num = self.sampling_params[0].get("n", 1)
+            for sp in self.sampling_params:
+                # TODO cope with the case that the parallel_sample_num is different for different samples
+                assert self.parallel_sample_num == sp.get(
+                    "n", 1
+                ), "The parallel_sample_num should be the same for all samples in sample params."
+
+        if self.parallel_sample_num > 1:
+            if self.is_single:
+                self.is_single = False
+                if self.text is not None:
+                    self.text = [self.text]
+                if self.input_ids is not None:
+                    self.input_ids = [self.input_ids]
+
+        if self.is_single:
             if self.sampling_params is None:
                 self.sampling_params = {}
             if self.rid is None:
@@ -89,79 +107,54 @@ class GenerateReqInput:
             if self.top_logprobs_num is None:
                 self.top_logprobs_num = 0
         else:
-            parallel_sample_num_list = []
-            if isinstance(self.sampling_params, dict):
-                parallel_sample_num = self.sampling_params.get("n", 1)
-            elif isinstance(self.sampling_params, list):
-                for sp in self.sampling_params:
-                    parallel_sample_num = sp.get("n", 1)
-                    parallel_sample_num_list.append(parallel_sample_num)
-                parallel_sample_num = max(parallel_sample_num_list)
-                all_equal = all(
-                    element == parallel_sample_num
-                    for element in parallel_sample_num_list
-                )
-                if parallel_sample_num > 1 and (not all_equal):
-                    # TODO cope with the case that the parallel_sample_num is different for different samples
-                    raise ValueError(
-                        "The parallel_sample_num should be the same for all samples in sample params."
-                    )
+            if self.parallel_sample_num == 1:
+                num = self.batch_size
             else:
-                parallel_sample_num = 1
-            self.parallel_sample_num = parallel_sample_num
-
-            if parallel_sample_num != 1:
-                # parallel sampling +1 represents the original prefill stage
-                num = parallel_sample_num + 1
-                if isinstance(self.text, list):
-                    # suppot batch operation
-                    self.batch_size = len(self.text)
-                    num = num * len(self.text)
-                elif isinstance(self.input_ids, list) and isinstance(
-                    self.input_ids[0], list
-                ):
-                    self.batch_size = len(self.input_ids)
-                    num = num * len(self.input_ids)
-                else:
-                    self.batch_size = 1
-            else:
-                # support select operation
-                num = len(self.text) if self.text is not None else len(self.input_ids)
-                self.batch_size = num
+                # FIXME support cascade inference
+                # first bs samples are used for caching the prefix for parallel sampling
+                num = self.batch_size + self.parallel_sample_num * self.batch_size
 
             if self.image_data is None:
                 self.image_data = [None] * num
             elif not isinstance(self.image_data, list):
                 self.image_data = [self.image_data] * num
             elif isinstance(self.image_data, list):
-                # multi-image with n > 1
+                # FIXME incorrect order for duplication
                 self.image_data = self.image_data * num
 
             if self.sampling_params is None:
                 self.sampling_params = [{}] * num
             elif not isinstance(self.sampling_params, list):
                 self.sampling_params = [self.sampling_params] * num
+            else:
+                assert self.parallel_sample_num == 1
 
             if self.rid is None:
                 self.rid = [uuid.uuid4().hex for _ in range(num)]
             else:
-                if not isinstance(self.rid, list):
-                    raise ValueError("The rid should be a list.")
+                assert isinstance(self.rid, list), "The rid should be a list."
+                assert self.parallel_sample_num == 1
 
             if self.return_logprob is None:
                 self.return_logprob = [False] * num
             elif not isinstance(self.return_logprob, list):
                 self.return_logprob = [self.return_logprob] * num
+            else:
+                assert self.parallel_sample_num == 1
 
             if self.logprob_start_len is None:
                 self.logprob_start_len = [-1] * num
             elif not isinstance(self.logprob_start_len, list):
                 self.logprob_start_len = [self.logprob_start_len] * num
+            else:
+                assert self.parallel_sample_num == 1
 
             if self.top_logprobs_num is None:
                 self.top_logprobs_num = [0] * num
             elif not isinstance(self.top_logprobs_num, list):
                 self.top_logprobs_num = [self.top_logprobs_num] * num
+            else:
+                assert self.parallel_sample_num == 1
 
 
 @dataclass
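
For reference, a minimal standalone sketch of the request-expansion arithmetic introduced in the new `post_init` above (the function name here is illustrative, not part of the sglang API): with a batch of `bs` prompts and `n > 1` parallel samples, per-request fields such as `rid`, `image_data`, and `sampling_params` are replicated to `bs + n * bs` entries, where the first `bs` entries only cache the shared prefix.

```python
def expanded_request_count(batch_size: int, parallel_sample_num: int) -> int:
    """Mirror of the arithmetic in GenerateReqInput.post_init (illustrative only)."""
    if parallel_sample_num == 1:
        return batch_size
    # The first `batch_size` requests cache the shared prefix; the remaining
    # `parallel_sample_num * batch_size` requests produce the parallel samples.
    return batch_size + parallel_sample_num * batch_size


# Example: 2 prompts with sampling_params={"n": 3} expand into 2 + 3 * 2 = 8 entries.
assert expanded_request_count(2, 3) == 8
assert expanded_request_count(2, 1) == 2
```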
@@ -172,12 +165,8 @@ class TokenizedGenerateReqInput:
     input_text: str
     # The input token ids
     input_ids: List[int]
-    # The pixel values for input images
-    pixel_values: List[float]
-    # The hash values of input images
-    image_hashes: List[int]
-    # The image sizes
-    image_sizes: List[List[int]]
+    # The image input
+    image_inputs: dict
     # The sampling parameters
     sampling_params: SamplingParams
     # Whether to return the logprobs
@@ -188,8 +177,6 @@ class TokenizedGenerateReqInput:
     top_logprobs_num: int
     # Whether to stream output
     stream: bool
-    # Modalities of the input images
-    modalites: Optional[List[str]] = None
 
     # LoRA related
     lora_path: Optional[str] = None  # None means just use the base model
@@ -206,8 +193,6 @@ class EmbeddingReqInput:
     # Dummy sampling params for compatibility
     sampling_params: Union[List[Dict], Dict] = None
 
-    is_single: bool = True
-
     def post_init(self):
         if (self.text is None and self.input_ids is None) or (
             self.text is not None and self.input_ids is not None
@@ -215,12 +200,11 @@
             raise ValueError("Either text or input_ids should be provided.")
 
         if self.text is not None:
-            is_single = isinstance(self.text, str)
+            self.is_single = isinstance(self.text, str)
         else:
-            is_single = isinstance(self.input_ids[0], int)
-        self.is_single = is_single
+            self.is_single = isinstance(self.input_ids[0], int)
 
-        if is_single:
+        if self.is_single:
             if self.rid is None:
                 self.rid = uuid.uuid4().hex
             if self.sampling_params is None:
@@ -254,6 +238,50 @@ class TokenizedEmbeddingReqInput:
     sampling_params: SamplingParams
 
 
+@dataclass
+class RewardReqInput:
+    # The input prompt in the chat format. It can be a single prompt or a batch of prompts.
+    conv: Union[List[List[Dict]], List[Dict]]
+    # The request id.
+    rid: Optional[Union[List[str], str]] = None
+    # Dummy sampling params for compatibility
+    sampling_params: Union[List[Dict], Dict] = None
+
+    def post_init(self):
+        self.is_single = isinstance(self.conv[0], dict)
+
+        if self.is_single:
+            if self.rid is None:
+                self.rid = uuid.uuid4().hex
+            if self.sampling_params is None:
+                self.sampling_params = {}
+            self.sampling_params["max_new_tokens"] = 1
+        else:
+            # support select operation
+            self.batch_size = len(self.conv)
+            if self.rid is None:
+                self.rid = [uuid.uuid4().hex for _ in range(self.batch_size)]
+            else:
+                if not isinstance(self.rid, list):
+                    raise ValueError("The rid should be a list.")
+            if self.sampling_params is None:
+                self.sampling_params = [{}] * self.batch_size
+            for i in range(self.batch_size):
+                self.sampling_params[i]["max_new_tokens"] = 1
+
+
+@dataclass
+class TokenizedRewardReqInput:
+    # The request id
+    rid: str
+    # The input text
+    input_text: str
+    # The input token ids
+    input_ids: List[int]
+    # Dummy sampling params for compatibility
+    sampling_params: SamplingParams
+
+
 @dataclass
 class BatchTokenIDOut:
     # The request id
@@ -268,10 +296,6 @@ class BatchTokenIDOut:
     meta_info: List[Dict]
     finished_reason: List[BaseFinishReason]
 
-    def __post_init__(self):
-        # deepcopy meta_info to avoid modification in place
-        self.meta_info = copy.deepcopy(self.meta_info)
-
 
 @dataclass
 class BatchStrOut:
sglang/srt/managers/schedule_batch.py

@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 """
 Copyright 2023-2024 SGLang Team
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,7 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-"""Meta data for requests and batches"""
+"""
+Store information about requests and batches.
+
+The following is the flow of data structures for a batch:
+
+ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
+
+- ScheduleBatch is managed by `scheduler.py::Scheduler`.
+  It contains high-level scheduling data. Most of the data is on the CPU.
+- ModelWorkerBatch is managed by `tp_worker.py::TpModelWorker`.
+- ForwardBatch is managed by `model_runner.py::ModelRunner`.
+  It contains low-level tensor data. Most of the data consists of GPU tensors.
+"""
 
 import logging
 from dataclasses import dataclass
@@ -31,6 +41,7 @@ from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
 
 INIT_INCREMENTAL_DETOKENIZATION_OFFSET = 5
@@ -102,14 +113,50 @@ class FINISH_ABORT(BaseFinishReason):
         }
 
 
+@dataclass
+class ImageInputs:
+    """The image related inputs."""
+
+    pixel_values: torch.Tensor
+    image_hash: int
+    image_sizes: Optional[list] = None
+    image_offsets: Optional[list] = None
+    pad_values: Optional[list] = None
+    modalities: Optional[list] = None
+
+    image_embeds: Optional[List[torch.Tensor]] = None
+    aspect_ratio_ids: Optional[List[torch.Tensor]] = None
+    aspect_ratio_mask: Optional[List[torch.Tensor]] = None
+
+    @staticmethod
+    def from_dict(obj, vocab_size):
+        # Use image hash as fake token_ids, which is then used for prefix matching
+        ret = ImageInputs(
+            pixel_values=obj["pixel_values"],
+            image_hash=hash(tuple(obj["image_hashes"])),
+        )
+        image_hash = ret.image_hash
+        ret.pad_values = [
+            (image_hash) % vocab_size,
+            (image_hash >> 16) % vocab_size,
+            (image_hash >> 32) % vocab_size,
+            (image_hash >> 64) % vocab_size,
+        ]
+        ret.image_sizes = obj["image_sizes"]
+        # Only when pixel values is not None we have modalities
+        ret.modalities = obj["modalities"] or ["image"]
+        return ret
+
+
 class Req:
-    """Store all inforamtion of a request."""
+    """The input and output status of a request."""
 
     def __init__(
         self,
         rid: str,
         origin_input_text: str,
         origin_input_ids: Tuple[int],
+        sampling_params: SamplingParams,
         lora_path: Optional[str] = None,
     ):
         # Input and output info
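
The new `ImageInputs.from_dict` above folds all per-image hashes into a single `image_hash` and derives vocabulary-sized pad values from it, so padded image regions behave like ordinary token ids for prefix matching. A small self-contained illustration of that derivation (standalone sketch, not importing sglang; the 32000 vocabulary size is an arbitrary example value):

```python
def pad_values_from_image_hashes(image_hashes, vocab_size=32000):
    # Combine all per-image hashes into one value, as ImageInputs.from_dict does.
    image_hash = hash(tuple(image_hashes))
    # Slice the hash into four vocabulary-sized "fake token ids" used for padding
    # the prompt and therefore for prefix matching in the radix cache.
    return [
        image_hash % vocab_size,
        (image_hash >> 16) % vocab_size,
        (image_hash >> 32) % vocab_size,
        (image_hash >> 64) % vocab_size,
    ]


print(pad_values_from_image_hashes([123456789, 987654321]))
```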
@@ -119,6 +166,8 @@ class Req:
         self.origin_input_ids = origin_input_ids
         self.output_ids = []  # Each decode stage's output ids
         self.fill_ids = None  # fill_ids = origin_input_ids + output_ids
+
+        self.sampling_params = sampling_params
         self.lora_path = lora_path
 
         # Memory info
@@ -127,6 +176,7 @@ class Req:
         # Check finish
         self.tokenizer = None
         self.finished_reason = None
+        self.stream = False
 
         # For incremental decoding
         # ----- | --------- read_ids -------|
@@ -147,21 +197,13 @@ class Req:
         self.completion_tokens_wo_jump_forward = 0
 
         # For vision inputs
-        self.pixel_values = None
-        self.image_sizes = None
-        self.image_offsets = None
-        self.pad_value = None
-        self.modalities = None
+        self.image_inputs: Optional[ImageInputs] = None
 
         # Prefix info
         self.prefix_indices = []
        self.extend_input_len = 0
         self.last_node = None
 
-        # Sampling parameters
-        self.sampling_params = None
-        self.stream = False
-
         # Logprobs (arguments)
         self.return_logprob = False
         self.logprob_start_len = 0
@@ -363,28 +405,32 @@ class ScheduleBatch:
     sampling_info: SamplingBatchInfo = None
 
     # Batched arguments to model runner
-    input_ids: torch.Tensor = None
-    req_pool_indices: torch.Tensor = None
-    seq_lens: torch.Tensor = None
-    position_ids_offsets: torch.Tensor = None
+    input_ids: List[int] = None
+    req_pool_indices: List[int] = None
+    seq_lens: List[int] = None
     out_cache_loc: torch.Tensor = None
-    extend_num_tokens: int = None
-
-    # For mixed chunekd prefill
-    prefix_lens_cpu: List[int] = None
-    running_bs: int = None
 
     # For processing logprobs
     return_logprob: bool = False
-    top_logprobs_nums: List[int] = None
+    top_logprobs_nums: Optional[List[int]] = None
+
+    # For extend and mixed chunekd prefill
+    prefix_lens: List[int] = None
+    extend_lens: List[int] = None
+    extend_num_tokens: int = None
+    running_bs: int = None
 
     # Stream
     has_stream: bool = False
 
+    # Has regex
+    has_regex: bool = False
+
     @classmethod
     def init_new(cls, reqs, req_to_token_pool, token_to_kv_pool, tree_cache):
         return_logprob = any(req.return_logprob for req in reqs)
         has_stream = any(req.stream for req in reqs)
+        has_regex = any(req.regex_fsm for req in reqs)
 
         return cls(
             reqs=reqs,
@@ -393,6 +439,7 @@ class ScheduleBatch:
             tree_cache=tree_cache,
             return_logprob=return_logprob,
             has_stream=has_stream,
+            has_regex=has_regex,
         )
 
     def batch_size(self):
@@ -436,12 +483,12 @@ class ScheduleBatch:
         seq_lens = []
 
         # Allocate memory
-        req_pool_indices_cpu = self.alloc_req_slots(bs)
+        req_pool_indices = self.alloc_req_slots(bs)
         out_cache_loc = self.alloc_token_slots(extend_num_tokens)
 
         pt = 0
         for i, req in enumerate(reqs):
-            req.req_pool_idx = req_pool_indices_cpu[i]
+            req.req_pool_idx = req_pool_indices[i]
             pre_len, seq_len = len(req.prefix_indices), len(req.fill_ids)
             seq_lens.append(seq_len)
             assert seq_len - pre_len == req.extend_input_len
@@ -467,18 +514,19 @@ class ScheduleBatch:
             pt += req.extend_input_len
 
         # Set fields
-        with torch.device("cuda"):
+        with out_cache_loc.device:
             self.input_ids = torch.tensor(sum(input_ids, []), dtype=torch.int32)
-            self.req_pool_indices = torch.tensor(req_pool_indices_cpu)
-            self.seq_lens = torch.tensor(seq_lens, dtype=torch.int32)
-            self.position_ids_offsets = torch.zeros((bs,), dtype=torch.int64)
+            self.req_pool_indices = torch.tensor(req_pool_indices)
+            self.seq_lens = torch.tensor(seq_lens)
 
         self.extend_num_tokens = extend_num_tokens
         self.out_cache_loc = out_cache_loc
-        self.top_logprobs_nums = [r.top_logprobs_num for r in reqs]
-        self.prefix_lens_cpu = [len(r.prefix_indices) for r in reqs]
-        self.extend_lens_cpu = [r.extend_input_len for r in reqs]
-        self.extend_logprob_start_lens_cpu = [r.extend_logprob_start_len for r in reqs]
+        if self.return_logprob:
+            self.top_logprobs_nums = [r.top_logprobs_num for r in reqs]
+        self.prefix_lens = [len(r.prefix_indices) for r in reqs]
+        self.extend_lens = [r.extend_input_len for r in reqs]
+        self.extend_logprob_start_lens = [r.extend_logprob_start_len for r in reqs]
+
         self.sampling_info = SamplingBatchInfo.from_schedule_batch(self, vocab_size)
 
     def mix_with_running(self, running_batch: "ScheduleBatch"):
@@ -493,20 +541,20 @@ class ScheduleBatch:
         out_cache_loc = torch.cat([self.out_cache_loc, running_batch.out_cache_loc])
         extend_num_tokens = self.extend_num_tokens + running_bs
 
-        self.merge(running_batch)
+        self.merge_batch(running_batch)
         self.input_ids = input_ids
         self.out_cache_loc = out_cache_loc
         self.extend_num_tokens = extend_num_tokens
 
         # NOTE: prefix_indices is what has been cached, but we don't cache each decode step
-        self.prefix_lens_cpu.extend(
+        self.prefix_lens.extend(
             [
                 len(r.origin_input_ids) + len(r.output_ids) - 1
                 for r in running_batch.reqs
             ]
         )
-        self.extend_lens_cpu.extend([1] * running_bs)
-        self.extend_logprob_start_lens_cpu.extend([0] * running_bs)
+        self.extend_lens.extend([1] * running_bs)
+        self.extend_logprob_start_lens.extend([0] * running_bs)
 
     def check_decode_mem(self):
         bs = len(self.reqs)
@@ -598,7 +646,7 @@ class ScheduleBatch:
 
         return retracted_reqs, new_estimate_ratio
 
-    def check_for_jump_forward(self, model_runner):
+    def check_for_jump_forward(self, pad_input_ids_func):
         jump_forward_reqs = []
         filter_indices = [i for i in range(len(self.reqs))]
 
@@ -654,15 +702,9 @@ class ScheduleBatch:
                 self.tree_cache.cache_finished_req(req, cur_all_ids)
 
                 # re-applying image padding
-                if req.pixel_values is not None:
-                    (
-                        req.origin_input_ids,
-                        req.image_offsets,
-                    ) = model_runner.model.pad_input_ids(
-                        req.origin_input_ids_unpadded,
-                        req.pad_value,
-                        req.pixel_values,
-                        req.image_sizes,
+                if req.image_inputs is not None:
+                    req.origin_input_ids = pad_input_ids_func(
+                        req.origin_input_ids_unpadded, req.image_inputs
                     )
 
                 jump_forward_reqs.append(req)
@@ -681,7 +723,9 @@ class ScheduleBatch:
             for r in self.reqs
         ]
 
-        self.input_ids = torch.tensor(input_ids, dtype=torch.int32, device="cuda")
+        self.input_ids = torch.tensor(
+            input_ids, dtype=torch.int32, device=self.seq_lens.device
+        )
         self.seq_lens.add_(1)
 
         # Alloc mem
@@ -703,33 +747,110 @@ class ScheduleBatch:
             return
 
         self.reqs = [self.reqs[i] for i in unfinished_indices]
-        new_indices = torch.tensor(unfinished_indices, dtype=torch.int32, device="cuda")
-        self.seq_lens = self.seq_lens[new_indices]
-        self.input_ids = None
+        new_indices = torch.tensor(
+            unfinished_indices, dtype=torch.int32, device=self.seq_lens.device
+        )
         self.req_pool_indices = self.req_pool_indices[new_indices]
-        self.position_ids_offsets = self.position_ids_offsets[new_indices]
+        self.seq_lens = self.seq_lens[new_indices]
         self.out_cache_loc = None
-        self.top_logprobs_nums = [self.top_logprobs_nums[i] for i in unfinished_indices]
         self.return_logprob = any(req.return_logprob for req in self.reqs)
+        if self.return_logprob:
+            self.top_logprobs_nums = [
+                self.top_logprobs_nums[i] for i in unfinished_indices
+            ]
+        else:
+            self.top_logprobs_nums = None
+
         self.has_stream = any(req.stream for req in self.reqs)
+        self.has_regex = any(req.regex_fsm for req in self.reqs)
 
-        self.sampling_info.filter(unfinished_indices, new_indices)
+        self.sampling_info.filter_batch(unfinished_indices, new_indices)
 
-    def merge(self, other: "ScheduleBatch"):
+    def merge_batch(self, other: "ScheduleBatch"):
         # Penalizer orchestrator must be merged before Batch.reqs is merged. This is because
         # orchestrator.merge() depends on Batch.reqs during preparation of each penalizers, so it
         # needs to be called with pre-merged Batch.reqs.
-        self.sampling_info.merge(other.sampling_info)
+        self.sampling_info.merge_batch(other.sampling_info)
 
-        self.reqs.extend(other.reqs)
         self.req_pool_indices = torch.concat(
             [self.req_pool_indices, other.req_pool_indices]
         )
         self.seq_lens = torch.concat([self.seq_lens, other.seq_lens])
-        self.position_ids_offsets = torch.concat(
-            [self.position_ids_offsets, other.position_ids_offsets]
-        )
        self.out_cache_loc = None
-        self.top_logprobs_nums.extend(other.top_logprobs_nums)
-        self.return_logprob = any(req.return_logprob for req in self.reqs)
-        self.has_stream = any(req.stream for req in self.reqs)
+        if self.return_logprob and other.return_logprob:
+            self.top_logprobs_nums.extend(other.top_logprobs_nums)
+        elif self.return_logprob:
+            self.top_logprobs_nums.extend([0] * len(other.reqs))
+        elif other.return_logprob:
+            self.top_logprobs_nums = [0] * len(self.reqs) + other.top_logprobs_nums
+        self.reqs.extend(other.reqs)
+
+        self.return_logprob = self.return_logprob or other.return_logprob
+        self.has_stream = self.has_stream or other.has_stream
+        self.has_regex = self.has_regex or other.has_regex
+
+    def get_model_worker_batch(self):
+        if self.forward_mode.is_decode():
+            extend_seq_lens = extend_prefix_lens = extend_logprob_start_lens = (
+                image_inputs
+            ) = None
+        else:
+            extend_seq_lens = self.extend_lens
+            extend_prefix_lens = self.prefix_lens
+            extend_logprob_start_lens = self.extend_logprob_start_lens
+            image_inputs = [r.image_inputs for r in self.reqs]
+
+        lora_paths = [req.lora_path for req in self.reqs]
+        if self.has_regex:
+            self.sampling_info.regex_fsms = [req.regex_fsm for req in self.reqs]
+            self.sampling_info.regex_fsm_states = [
+                req.regex_fsm_state for req in self.reqs
+            ]
+
+        return ModelWorkerBatch(
+            forward_mode=self.forward_mode,
+            input_ids=self.input_ids,
+            req_pool_indices=self.req_pool_indices,
+            seq_lens=self.seq_lens,
+            out_cache_loc=self.out_cache_loc,
+            return_logprob=self.return_logprob,
+            top_logprobs_nums=self.top_logprobs_nums,
+            extend_seq_lens=extend_seq_lens,
+            extend_prefix_lens=extend_prefix_lens,
+            extend_logprob_start_lens=extend_logprob_start_lens,
+            image_inputs=image_inputs,
+            lora_paths=lora_paths,
+            sampling_info=self.sampling_info,
+        )
+
+
+@dataclass
+class ModelWorkerBatch:
+    # The forward mode
+    forward_mode: ForwardMode
+    # The input ids
+    input_ids: torch.Tensor
+    # The indices of requests in the req_to_token_pool
+    req_pool_indices: torch.Tensor
+    # The sequence length
+    seq_lens: torch.Tensor
+    # The indices of output tokens in the token_to_kv_pool
+    out_cache_loc: torch.Tensor
+
+    # For logprob
+    return_logprob: bool
+    top_logprobs_nums: Optional[List[int]]
+
+    # For extend
+    extend_seq_lens: Optional[List[int]]
+    extend_prefix_lens: Optional[List[int]]
+    extend_logprob_start_lens: Optional[List[int]]
+
+    # For multimodal
+    image_inputs: Optional[List[ImageInputs]]
+
+    # For LoRA
+    lora_paths: Optional[List[str]]
+
+    # Sampling info
+    sampling_info: SamplingBatchInfo
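
Taken together with the new module docstring, the flow is: the scheduler keeps a CPU-side `ScheduleBatch`, strips it down to a `ModelWorkerBatch` via `get_model_worker_batch()`, and the TP worker turns that into the GPU-side `ForwardBatch` consumed by the model runner. A schematic, self-contained sketch of that handoff pattern (simplified stand-in classes, not the actual sglang implementations):

```python
from dataclasses import dataclass
from typing import List

import torch


@dataclass
class ScheduleBatchSketch:
    # CPU-side scheduling view (stand-in for ScheduleBatch).
    input_ids: List[int]
    seq_lens: List[int]

    def get_model_worker_batch(self) -> "ModelWorkerBatchSketch":
        # Keep only what the model forward needs.
        return ModelWorkerBatchSketch(self.input_ids, self.seq_lens)


@dataclass
class ModelWorkerBatchSketch:
    # Handed from the scheduler to the TP worker (stand-in for ModelWorkerBatch).
    input_ids: List[int]
    seq_lens: List[int]


@dataclass
class ForwardBatchSketch:
    # Device tensors built for the model runner (stand-in for ForwardBatch).
    input_ids: torch.Tensor
    seq_lens: torch.Tensor

    @classmethod
    def from_worker_batch(cls, wb: ModelWorkerBatchSketch, device: str = "cpu"):
        return cls(
            input_ids=torch.tensor(wb.input_ids, dtype=torch.int32, device=device),
            seq_lens=torch.tensor(wb.seq_lens, dtype=torch.int32, device=device),
        )


# Scheduler -> worker -> model runner handoff:
sb = ScheduleBatchSketch(input_ids=[1, 2, 3], seq_lens=[3])
fb = ForwardBatchSketch.from_worker_batch(sb.get_model_worker_batch())
```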