sglang 0.3.1.post3__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -0
- sglang/api.py +23 -1
- sglang/bench_latency.py +48 -33
- sglang/bench_server_latency.py +0 -6
- sglang/bench_serving.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +14 -1
- sglang/lang/interpreter.py +16 -6
- sglang/lang/ir.py +20 -4
- sglang/srt/configs/model_config.py +11 -9
- sglang/srt/constrained/fsm_cache.py +9 -1
- sglang/srt/constrained/jump_forward.py +15 -2
- sglang/srt/hf_transformers_utils.py +1 -0
- sglang/srt/layers/activation.py +4 -4
- sglang/srt/layers/attention/__init__.py +49 -0
- sglang/srt/layers/attention/flashinfer_backend.py +277 -0
- sglang/srt/layers/{flashinfer_utils.py → attention/flashinfer_utils.py} +82 -80
- sglang/srt/layers/attention/triton_backend.py +161 -0
- sglang/srt/layers/{triton_attention → attention/triton_ops}/extend_attention.py +3 -1
- sglang/srt/layers/fused_moe/patch.py +117 -0
- sglang/srt/layers/layernorm.py +4 -4
- sglang/srt/layers/logits_processor.py +19 -15
- sglang/srt/layers/pooler.py +3 -3
- sglang/srt/layers/quantization/__init__.py +0 -2
- sglang/srt/layers/radix_attention.py +6 -4
- sglang/srt/layers/sampler.py +6 -4
- sglang/srt/layers/torchao_utils.py +18 -0
- sglang/srt/lora/lora.py +20 -21
- sglang/srt/lora/lora_manager.py +97 -25
- sglang/srt/managers/detokenizer_manager.py +31 -18
- sglang/srt/managers/image_processor.py +187 -0
- sglang/srt/managers/io_struct.py +99 -75
- sglang/srt/managers/schedule_batch.py +187 -68
- sglang/srt/managers/{policy_scheduler.py → schedule_policy.py} +31 -21
- sglang/srt/managers/scheduler.py +1021 -0
- sglang/srt/managers/tokenizer_manager.py +120 -247
- sglang/srt/managers/tp_worker.py +28 -925
- sglang/srt/mem_cache/memory_pool.py +34 -52
- sglang/srt/mem_cache/radix_cache.py +5 -5
- sglang/srt/model_executor/cuda_graph_runner.py +25 -25
- sglang/srt/model_executor/forward_batch_info.py +94 -97
- sglang/srt/model_executor/model_runner.py +76 -78
- sglang/srt/models/baichuan.py +10 -10
- sglang/srt/models/chatglm.py +12 -12
- sglang/srt/models/commandr.py +10 -10
- sglang/srt/models/dbrx.py +12 -12
- sglang/srt/models/deepseek.py +10 -10
- sglang/srt/models/deepseek_v2.py +14 -15
- sglang/srt/models/exaone.py +10 -10
- sglang/srt/models/gemma.py +10 -10
- sglang/srt/models/gemma2.py +11 -11
- sglang/srt/models/gpt_bigcode.py +10 -10
- sglang/srt/models/grok.py +10 -10
- sglang/srt/models/internlm2.py +10 -10
- sglang/srt/models/llama.py +22 -10
- sglang/srt/models/llama_classification.py +5 -5
- sglang/srt/models/llama_embedding.py +4 -4
- sglang/srt/models/llama_reward.py +142 -0
- sglang/srt/models/llava.py +39 -33
- sglang/srt/models/llavavid.py +31 -28
- sglang/srt/models/minicpm.py +10 -10
- sglang/srt/models/minicpm3.py +14 -15
- sglang/srt/models/mixtral.py +10 -10
- sglang/srt/models/mixtral_quant.py +10 -10
- sglang/srt/models/olmoe.py +10 -10
- sglang/srt/models/qwen.py +10 -10
- sglang/srt/models/qwen2.py +11 -11
- sglang/srt/models/qwen2_moe.py +10 -10
- sglang/srt/models/stablelm.py +10 -10
- sglang/srt/models/torch_native_llama.py +506 -0
- sglang/srt/models/xverse.py +10 -10
- sglang/srt/models/xverse_moe.py +10 -10
- sglang/srt/openai_api/adapter.py +7 -0
- sglang/srt/sampling/sampling_batch_info.py +36 -27
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +170 -119
- sglang/srt/server_args.py +54 -27
- sglang/srt/utils.py +101 -128
- sglang/test/runners.py +76 -33
- sglang/test/test_programs.py +38 -5
- sglang/test/test_utils.py +53 -9
- sglang/version.py +1 -1
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/METADATA +42 -23
- sglang-0.3.3.dist-info/RECORD +139 -0
- sglang/srt/layers/attention_backend.py +0 -482
- sglang/srt/managers/controller_multi.py +0 -207
- sglang/srt/managers/controller_single.py +0 -164
- sglang-0.3.1.post3.dist-info/RECORD +0 -134
- /sglang/srt/layers/{triton_attention → attention/triton_ops}/decode_attention.py +0 -0
- /sglang/srt/layers/{triton_attention → attention/triton_ops}/prefill_attention.py +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/LICENSE +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/WHEEL +0 -0
- {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/top_level.txt +0 -0

sglang/srt/mem_cache/memory_pool.py

@@ -16,9 +16,9 @@ limitations under the License.
 """Memory pool."""
 
 import logging
-from abc import ABC, abstractmethod
 from typing import List, Tuple, Union
 
+import numpy as np
 import torch
 
 logger = logging.getLogger(__name__)
@@ -27,12 +27,17 @@ logger = logging.getLogger(__name__)
 class ReqToTokenPool:
     """A memory pool that maps a request to its token locations."""
 
-    def __init__(self, size: int, max_context_len: int):
+    def __init__(self, size: int, max_context_len: int, device: str):
         self.size = size
-        self.
+        self.max_context_len = max_context_len
+        self.device = device
         self.req_to_token = torch.empty(
-            (size, max_context_len), dtype=torch.int32, device=
+            (size, max_context_len), dtype=torch.int32, device=device
         )
+        self.free_slots = list(range(size))
+
+    def available_size(self):
+        return len(self.free_slots)
 
     def alloc(self, need_size: int) -> List[int]:
         if need_size > len(self.free_slots):
@@ -53,86 +58,55 @@ class ReqToTokenPool:
         self.free_slots = list(range(self.size))
 
 
-class BaseTokenToKVPool
+class BaseTokenToKVPool:
     """A memory pool that maps a token to its kv cache locations"""
 
     def __init__(
         self,
         size: int,
         dtype: torch.dtype,
+        device: str,
     ):
         self.size = size
         self.dtype = dtype
+        self.device = device
         if dtype == torch.float8_e5m2:
             # NOTE: Store as torch.uint8 because Tensor index_put is not implemented for torch.float8_e5m2
             self.store_dtype = torch.uint8
         else:
             self.store_dtype = dtype
 
-
-        self.mem_state = torch.ones((self.size + 1,), dtype=torch.bool, device="cuda")
-
-        # Prefetch buffer
-        self.prefetch_buffer = torch.empty(0, device="cuda", dtype=torch.int32)
-        self.prefetch_chunk_size = 512
-
-        self.can_use_mem_size = self.size
+        self.free_slots = None
         self.clear()
 
     def available_size(self):
-        return
+        return len(self.free_slots)
 
     def alloc(self, need_size: int):
-
-        if need_size <= buffer_len:
-            select_index = self.prefetch_buffer[:need_size]
-            self.prefetch_buffer = self.prefetch_buffer[need_size:]
-            return select_index
-
-        addition_size = need_size - buffer_len
-        alloc_size = max(addition_size, self.prefetch_chunk_size)
-        select_index = (
-            torch.nonzero(self.mem_state).squeeze(1)[:alloc_size].to(torch.int32)
-        )
-
-        if select_index.shape[0] < addition_size:
+        if need_size > len(self.free_slots):
             return None
 
-        self.
-        self.
-
-        self.prefetch_buffer = torch.cat((self.prefetch_buffer, select_index))
-        ret_index = self.prefetch_buffer[:need_size]
-        self.prefetch_buffer = self.prefetch_buffer[need_size:]
+        select_index = self.free_slots[:need_size]
+        self.free_slots = self.free_slots[need_size:]
 
-        return
+        return torch.tensor(select_index, dtype=torch.int32, device=self.device)
 
     def free(self, free_index: torch.Tensor):
-        self.
-        self.can_use_mem_size += len(free_index)
+        self.free_slots = np.concatenate((self.free_slots, free_index.cpu().numpy()))
 
     def clear(self):
-
+        # The padded slot 0 is used for writing dummy outputs from padded tokens.
+        self.free_slots = np.arange(1, self.size + 1)
 
-        self.mem_state.fill_(True)
-        self.can_use_mem_size = self.size
-
-        # We also add one slot. This slot is used for writing dummy output from padded tokens.
-        self.mem_state[0] = False
-
-    @abstractmethod
     def get_key_buffer(self, layer_id: int) -> torch.Tensor:
         raise NotImplementedError()
 
-    @abstractmethod
     def get_value_buffer(self, layer_id: int) -> torch.Tensor:
         raise NotImplementedError()
 
-    @abstractmethod
     def get_kv_buffer(self, layer_id: int) -> Tuple[torch.Tensor, torch.Tensor]:
         raise NotImplementedError()
 
-    @abstractmethod
     def set_kv_buffer(
         self,
         layer_id: int,
@@ -152,19 +126,25 @@ class MHATokenToKVPool(BaseTokenToKVPool):
         head_num: int,
         head_dim: int,
         layer_num: int,
+        device: str,
     ):
-        super().__init__(size, dtype)
+        super().__init__(size, dtype, device)
 
         # [size, head_num, head_dim] for each layer
+        # The padded slot 0 is used for writing dummy outputs from padded tokens.
         self.k_buffer = [
             torch.empty(
-                (size + 1, head_num, head_dim),
+                (size + 1, head_num, head_dim),
+                dtype=self.store_dtype,
+                device=device,
             )
             for _ in range(layer_num)
         ]
         self.v_buffer = [
             torch.empty(
-                (size + 1, head_num, head_dim),
+                (size + 1, head_num, head_dim),
+                dtype=self.store_dtype,
+                device=device,
             )
             for _ in range(layer_num)
         ]
@@ -210,15 +190,17 @@ class MLATokenToKVPool(BaseTokenToKVPool):
         kv_lora_rank: int,
         qk_rope_head_dim: int,
         layer_num: int,
+        device: str,
     ):
-        super().__init__(size, dtype)
+        super().__init__(size, dtype, device)
 
         self.kv_lora_rank = kv_lora_rank
+        # The padded slot 0 is used for writing dummy outputs from padded tokens.
         self.kv_buffer = [
             torch.empty(
                 (size + 1, 1, kv_lora_rank + qk_rope_head_dim),
                 dtype=self.store_dtype,
-                device=
+                device=device,
             )
             for _ in range(layer_num)
         ]
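
The memory_pool.py hunks above replace the boolean mem_state bitmap and prefetch buffer with a simple free-slot list (a Python list in ReqToTokenPool, a NumPy array in BaseTokenToKVPool), keeping slot 0 reserved for dummy writes from padded tokens. The sketch below is a minimal standalone illustration of that allocation pattern, not the sglang classes themselves; the class name and toy pool size are made up.

import numpy as np
import torch


class FreeSlotPool:
    """Toy slot allocator mirroring the free-list pattern used above."""

    def __init__(self, size: int, device: str = "cpu"):
        self.size = size
        self.device = device
        self.clear()

    def available_size(self) -> int:
        return len(self.free_slots)

    def alloc(self, need_size: int):
        if need_size > len(self.free_slots):
            return None  # not enough free slots
        select_index = self.free_slots[:need_size]
        self.free_slots = self.free_slots[need_size:]
        return torch.tensor(select_index, dtype=torch.int32, device=self.device)

    def free(self, free_index: torch.Tensor):
        # Returned slots go back onto the free list.
        self.free_slots = np.concatenate((self.free_slots, free_index.cpu().numpy()))

    def clear(self):
        # Slot 0 stays reserved for dummy writes from padded tokens.
        self.free_slots = np.arange(1, self.size + 1)


pool = FreeSlotPool(size=8)
idx = pool.alloc(3)           # tensor([1, 2, 3], dtype=torch.int32)
print(pool.available_size())  # 5
pool.free(idx)
print(pool.available_size())  # 8
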
sglang/srt/mem_cache/radix_cache.py

@@ -291,15 +291,15 @@ class RadixCache(BasePrefixCache):
 
     def _collect_leaves(self):
         ret_list = []
+        stack = [self.root_node]
 
-
+        while stack:
+            cur_node = stack.pop()
             if len(cur_node.children) == 0:
                 ret_list.append(cur_node)
+            else:
+                stack.extend(cur_node.children.values())
 
-            for x in cur_node.children.values():
-                dfs_(x)
-
-        dfs_(self.root_node)
         return ret_list
 
 
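
The radix_cache.py hunk swaps the recursive dfs_ helper for an explicit stack, so collecting leaves no longer risks hitting Python's recursion limit on deep trees. Below is a self-contained sketch of the same iterative traversal on a generic dict-of-children tree; the Node class is illustrative, not the sglang tree node.

from typing import Dict, List


class Node:
    def __init__(self) -> None:
        self.children: Dict[str, "Node"] = {}


def collect_leaves(root: Node) -> List[Node]:
    """Iterative DFS: push children onto a stack instead of recursing."""
    ret_list: List[Node] = []
    stack = [root]
    while stack:
        cur_node = stack.pop()
        if len(cur_node.children) == 0:
            ret_list.append(cur_node)
        else:
            stack.extend(cur_node.children.values())
    return ret_list


# Usage: a root with two children, one of which has a child of its own.
root = Node()
root.children["a"] = Node()
root.children["b"] = Node()
root.children["b"].children["c"] = Node()
print(len(collect_leaves(root)))  # 2 leaves: "a" and "c"
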
sglang/srt/model_executor/cuda_graph_runner.py

@@ -25,13 +25,13 @@ import torch
 from vllm.distributed.parallel_state import graph_capture
 from vllm.model_executor.custom_op import CustomOp
 
+from sglang.srt.layers.fused_moe.patch import fused_moe_forward_native
 from sglang.srt.layers.logits_processor import (
     LogitsMetadata,
     LogitsProcessor,
     LogitsProcessorOutput,
 )
-from sglang.srt.
-from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.utils import monkey_patch_vllm_all_gather
 
 if TYPE_CHECKING:
@@ -41,14 +41,15 @@ if TYPE_CHECKING:
 def _to_torch(model: torch.nn.Module, reverse: bool = False):
     for sub in model._modules.values():
         if isinstance(sub, CustomOp):
-            # NOTE: FusedMoE torch native implementaiton is not efficient
-            if "FusedMoE" in sub.__class__.__name__:
-                continue
             if reverse:
                 sub._forward_method = sub.forward_cuda
                 setattr(sub, "is_torch_compile", False)
             else:
-
+                # NOTE: Temporarily workaround MoE
+                if "FusedMoE" in sub.__class__.__name__:
+                    sub._forward_method = fused_moe_forward_native
+                else:
+                    sub._forward_method = sub.forward_native
                 setattr(sub, "is_torch_compile", True)
         if isinstance(sub, torch.nn.Module):
             _to_torch(sub, reverse)
@@ -67,7 +68,9 @@ def patch_model(
             monkey_patch_vllm_all_gather()
             backup_ca_comm = tp_group.ca_comm
             tp_group.ca_comm = None
-            yield torch.compile(
+            yield torch.compile(
+                torch.no_grad()(model.forward), mode="max-autotune-no-cudagraphs"
+            )
         else:
             yield model.forward
     finally:
@@ -139,7 +142,6 @@ class CudaGraphRunner:
         self.seq_lens = torch.full(
             (self.max_bs,), self.seq_len_fill_value, dtype=torch.int32
         )
-        self.position_ids_offsets = torch.ones((self.max_bs,), dtype=torch.int32)
         self.out_cache_loc = torch.zeros((self.max_bs,), dtype=torch.int32)
 
         # Capture
@@ -150,7 +152,7 @@ class CudaGraphRunner:
                 f"Capture cuda graph failed: {e}\n"
                 "Possible solutions:\n"
                 "1. disable cuda graph by --disable-cuda-graph\n"
-                "2. set --mem-fraction-static to a smaller value\n"
+                "2. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
                 "3. disable torch compile by not using --enable-torch-compile\n"
                 "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
             )
@@ -185,7 +187,6 @@ class CudaGraphRunner:
         input_ids = self.input_ids[:bs]
         req_pool_indices = self.req_pool_indices[:bs]
         seq_lens = self.seq_lens[:bs]
-        position_ids_offsets = self.position_ids_offsets[:bs]
         out_cache_loc = self.out_cache_loc[:bs]
 
         # Attention backend
@@ -195,9 +196,10 @@ class CudaGraphRunner:
 
         # Run and capture
         def run_once():
-
+            forward_batch = ForwardBatch(
                 forward_mode=ForwardMode.DECODE,
                 batch_size=bs,
+                input_ids=input_ids,
                 req_pool_indices=req_pool_indices,
                 seq_lens=seq_lens,
                 req_to_token_pool=self.model_runner.req_to_token_pool,
@@ -206,9 +208,9 @@ class CudaGraphRunner:
                 out_cache_loc=out_cache_loc,
                 return_logprob=False,
                 top_logprobs_nums=[0] * bs,
-                positions=(seq_lens - 1
+                positions=torch.clamp((seq_lens - 1), min=0).to(torch.int64),
             )
-            return forward(input_ids,
+            return forward(input_ids, forward_batch.positions, forward_batch)
 
         for _ in range(2):
             torch.cuda.synchronize()
@@ -231,24 +233,22 @@ class CudaGraphRunner:
        self.graph_memory_pool = graph.pool()
        return graph, out
 
-    def replay(self,
-        assert
-        raw_bs =
+    def replay(self, forward_batch: ForwardBatch):
+        assert forward_batch.out_cache_loc is not None
+        raw_bs = forward_batch.batch_size
 
         # Pad
         index = bisect.bisect_left(self.capture_bs, raw_bs)
         bs = self.capture_bs[index]
         if bs != raw_bs:
             self.seq_lens.fill_(self.seq_len_fill_value)
-            self.position_ids_offsets.fill_(1)
             self.out_cache_loc.zero_()
 
         # Common inputs
-        self.input_ids[:raw_bs] =
-        self.req_pool_indices[:raw_bs] =
-        self.seq_lens[:raw_bs] =
-        self.
-        self.out_cache_loc[:raw_bs] = batch.out_cache_loc
+        self.input_ids[:raw_bs] = forward_batch.input_ids
+        self.req_pool_indices[:raw_bs] = forward_batch.req_pool_indices
+        self.seq_lens[:raw_bs] = forward_batch.seq_lens
+        self.out_cache_loc[:raw_bs] = forward_batch.out_cache_loc
 
         # Attention backend
         self.model_runner.attn_backend.init_forward_metadata_replay_cuda_graph(
@@ -271,15 +271,15 @@ class CudaGraphRunner:
         )
 
         # Extract logprobs
-        if
+        if forward_batch.return_logprob:
             logits_output.next_token_logprobs = torch.nn.functional.log_softmax(
                 logits_output.next_token_logits, dim=-1
             )
-            return_top_logprob = any(x > 0 for x in
+            return_top_logprob = any(x > 0 for x in forward_batch.top_logprobs_nums)
             if return_top_logprob:
                 logits_metadata = LogitsMetadata(
                     forward_mode=ForwardMode.DECODE,
-                    top_logprobs_nums=
+                    top_logprobs_nums=forward_batch.top_logprobs_nums,
                 )
                 logits_output.output_top_logprobs = LogitsProcessor.get_top_logprobs(
                     logits_output.next_token_logprobs, logits_metadata
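
In the new replay path, the runner looks up the nearest captured batch size with bisect.bisect_left and pads the real batch up to it, reusing the statically allocated input buffers. A small standalone illustration of that lookup (the capture sizes below are made up for the example):

import bisect

# Hypothetical list of batch sizes for which CUDA graphs were captured.
capture_bs = [1, 2, 4, 8, 16, 32]


def padded_batch_size(raw_bs: int) -> int:
    """Return the smallest captured batch size >= raw_bs."""
    index = bisect.bisect_left(capture_bs, raw_bs)
    return capture_bs[index]


print(padded_batch_size(3))   # 4
print(padded_batch_size(8))   # 8
print(padded_batch_size(17))  # 32
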
sglang/srt/model_executor/forward_batch_info.py

@@ -15,19 +15,33 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
-"""
+"""
+Store information about a forward batch.
+
+The following is the flow of data structures for a batch:
+
+ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
+
+- ScheduleBatch is managed by `scheduler.py::Scheduler`.
+  It contains high-level scheduling data. Most of the data is on the CPU.
+- ModelWorkerBatch is managed by `tp_worker.py::TpModelWorker`.
+- ForwardBatch is managed by `model_runner.py::ModelRunner`.
+  It contains low-level tensor data. Most of the data consists of GPU tensors.
+"""
+
 from dataclasses import dataclass
 from enum import IntEnum, auto
-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING, List, Optional
 
 import numpy as np
 import torch
 
 if TYPE_CHECKING:
-    from sglang.srt.layers.
-    from sglang.srt.managers.schedule_batch import
+    from sglang.srt.layers.attention import AttentionBackend
+    from sglang.srt.managers.schedule_batch import ImageInputs, ModelWorkerBatch
     from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
     from sglang.srt.model_executor.model_runner import ModelRunner
+    from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 
 
 class ForwardMode(IntEnum):
@@ -37,7 +51,7 @@ class ForwardMode(IntEnum):
     EXTEND = auto()
     # Decode one token.
     DECODE = auto()
-    # Contains both
+    # Contains both EXTEND and DECODE.
     MIXED = auto()
 
     def is_prefill(self):
@@ -54,123 +68,106 @@ class ForwardMode(IntEnum):
 
 
 @dataclass
-class
-    """Store all
+class ForwardBatch:
+    """Store all inputs of a forward pass."""
 
+    # The forward mode
     forward_mode: ForwardMode
+    # The batch size
     batch_size: int
+    # The input ids
+    input_ids: torch.Tensor
+    # The indices of requests in the req_to_token_pool
     req_pool_indices: torch.Tensor
+    # The sequence length
     seq_lens: torch.Tensor
-
-    token_to_kv_pool: BaseTokenToKVPool
-    attn_backend: AttentionBackend
-
-    # Output location of the KV cache
+    # The indices of output tokens in the token_to_kv_pool
     out_cache_loc: torch.Tensor
 
+    # For logprob
+    return_logprob: bool = False
+    top_logprobs_nums: Optional[List[int]] = None
+
     # Position information
     positions: torch.Tensor = None
 
     # For extend
-    extend_seq_lens: torch.Tensor = None
-    extend_prefix_lens: torch.Tensor = None
-    extend_start_loc: torch.Tensor = None
-
-
-    # For logprob
-    return_logprob: bool = False
-    top_logprobs_nums: List[int] = None
-    extend_seq_lens_cpu: List[int] = None
-    extend_logprob_start_lens_cpu: List[int] = None
+    extend_seq_lens: Optional[torch.Tensor] = None
+    extend_prefix_lens: Optional[torch.Tensor] = None
+    extend_start_loc: Optional[torch.Tensor] = None
+    extend_seq_lens_cpu: Optional[List[int]] = None
+    extend_logprob_start_lens_cpu: Optional[List[int]] = None
 
     # For multimodal
-
-
-
-
-
-
-
-
-
-
-
-
-    def compute_positions(self, batch: ScheduleBatch):
-        position_ids_offsets = batch.position_ids_offsets
-
-        if self.forward_mode.is_decode():
-            if True:
-                self.positions = self.seq_lens - 1
-            else:
-                # Deprecated
-                self.positions = (self.seq_lens - 1) + position_ids_offsets
-        else:
-            if True:
-                self.positions = torch.tensor(
-                    np.concatenate(
-                        [
-                            np.arange(batch.prefix_lens_cpu[i], len(req.fill_ids))
-                            for i, req in enumerate(batch.reqs)
-                        ],
-                        axis=0,
-                    ),
-                    device="cuda",
-                )
-            else:
-                # Deprecated
-                position_ids_offsets_cpu = position_ids_offsets.cpu().numpy()
-                self.positions = torch.tensor(
-                    np.concatenate(
-                        [
-                            np.arange(
-                                batch.prefix_lens_cpu[i] + position_ids_offsets_cpu[i],
-                                len(req.fill_ids) + position_ids_offsets_cpu[i],
-                            )
-                            for i, req in enumerate(batch.reqs)
-                        ],
-                        axis=0,
-                    ),
-                    device="cuda",
-                )
-
-        # Positions should be in long type
-        self.positions = self.positions.to(torch.int64)
-
-    def compute_extend_infos(self, batch: ScheduleBatch):
-        self.extend_seq_lens = torch.tensor(batch.extend_lens_cpu, device="cuda")
-        self.extend_prefix_lens = torch.tensor(batch.prefix_lens_cpu, device="cuda")
-        self.extend_start_loc = torch.zeros_like(self.extend_seq_lens)
-        self.extend_start_loc[1:] = torch.cumsum(self.extend_seq_lens[:-1], dim=0)
-        self.extend_no_prefix = all(x == 0 for x in batch.prefix_lens_cpu)
-        self.extend_seq_lens_cpu = batch.extend_lens_cpu
-        self.extend_logprob_start_lens_cpu = batch.extend_logprob_start_lens_cpu
+    image_inputs: Optional[List[ImageInputs]] = None
+
+    # For LoRA
+    lora_paths: Optional[List[str]] = None
+
+    # Sampling info
+    sampling_info: SamplingBatchInfo = None
+
+    # Attention backend
+    req_to_token_pool: ReqToTokenPool = None
+    token_to_kv_pool: BaseTokenToKVPool = None
+    attn_backend: AttentionBackend = None
 
     @classmethod
-    def
+    def init_new(
         cls,
-
-
+        batch: ModelWorkerBatch,
+        model_runner: ModelRunner,
     ):
+        device = "cuda"
+
         ret = cls(
             forward_mode=batch.forward_mode,
-            batch_size=batch.
+            batch_size=len(batch.seq_lens),
+            input_ids=batch.input_ids,
             req_pool_indices=batch.req_pool_indices,
             seq_lens=batch.seq_lens,
-            req_to_token_pool=model_runner.req_to_token_pool,
-            token_to_kv_pool=model_runner.token_to_kv_pool,
-            attn_backend=model_runner.attn_backend,
             out_cache_loc=batch.out_cache_loc,
             return_logprob=batch.return_logprob,
             top_logprobs_nums=batch.top_logprobs_nums,
+            lora_paths=batch.lora_paths,
+            sampling_info=batch.sampling_info,
         )
 
-
-
-
-
-        ret.
-
-
+        # Init position information
+        if ret.forward_mode.is_decode():
+            ret.positions = (ret.seq_lens - 1).to(torch.int64)
+        else:
+            ret.positions = torch.tensor(
+                np.concatenate(
+                    [
+                        np.arange(prefix_len, prefix_len + extend_len)
+                        for prefix_len, extend_len in zip(
+                            batch.extend_prefix_lens, batch.extend_seq_lens
+                        )
+                    ],
+                    axis=0,
+                ),
+                device=device,
+            ).to(torch.int64)
+
+            ret.image_inputs = batch.image_inputs
+            ret.extend_seq_lens = torch.tensor(batch.extend_seq_lens, device=device)
+            ret.extend_prefix_lens = torch.tensor(
+                batch.extend_prefix_lens, device=device
+            )
+            ret.extend_start_loc = torch.zeros_like(ret.extend_seq_lens)
+            ret.extend_start_loc[1:] = torch.cumsum(ret.extend_seq_lens[:-1], dim=0)
+            ret.extend_seq_lens_cpu = batch.extend_seq_lens
+            ret.extend_logprob_start_lens_cpu = batch.extend_logprob_start_lens
+
+        # Init attention information
+        ret.req_to_token_pool = model_runner.req_to_token_pool
+        ret.token_to_kv_pool = model_runner.token_to_kv_pool
+        ret.attn_backend = model_runner.attn_backend
+        model_runner.attn_backend.init_forward_metadata(ret)
+
+        # Init lora information
+        if model_runner.server_args.lora_paths is not None:
+            model_runner.lora_manager.prepare_lora_batch(ret)
 
         return ret
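
The position logic in ForwardBatch.init_new distinguishes decode (one token per request, placed at seq_len - 1) from extend (a contiguous range starting at each request's prefix length). Below is a standalone sketch of the same computation on plain tensors and arrays; the helper names and example values are illustrative, not part of the package.

import numpy as np
import torch


def decode_positions(seq_lens: torch.Tensor) -> torch.Tensor:
    # One token per request, placed right after the existing sequence.
    return (seq_lens - 1).to(torch.int64)


def extend_positions(extend_prefix_lens, extend_seq_lens) -> torch.Tensor:
    # Each request contributes positions [prefix_len, prefix_len + extend_len).
    return torch.tensor(
        np.concatenate(
            [
                np.arange(prefix_len, prefix_len + extend_len)
                for prefix_len, extend_len in zip(extend_prefix_lens, extend_seq_lens)
            ],
            axis=0,
        )
    ).to(torch.int64)


print(decode_positions(torch.tensor([5, 9])))  # tensor([4, 8])
print(extend_positions([0, 3], [4, 2]))        # tensor([0, 1, 2, 3, 3, 4])
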