sglang 0.2.10__py3-none-any.whl → 0.2.11__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only.
- sglang/__init__.py +8 -0
- sglang/api.py +10 -2
- sglang/bench_latency.py +145 -36
- sglang/check_env.py +24 -2
- sglang/global_config.py +0 -1
- sglang/lang/backend/base_backend.py +3 -1
- sglang/lang/backend/openai.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +46 -29
- sglang/lang/choices.py +164 -0
- sglang/lang/interpreter.py +6 -13
- sglang/lang/ir.py +11 -2
- sglang/srt/layers/logits_processor.py +1 -1
- sglang/srt/layers/radix_attention.py +2 -5
- sglang/srt/managers/schedule_batch.py +95 -324
- sglang/srt/managers/tokenizer_manager.py +6 -3
- sglang/srt/managers/tp_worker.py +20 -22
- sglang/srt/mem_cache/memory_pool.py +9 -14
- sglang/srt/model_executor/cuda_graph_runner.py +3 -3
- sglang/srt/model_executor/forward_batch_info.py +256 -0
- sglang/srt/model_executor/model_runner.py +6 -10
- sglang/srt/models/chatglm.py +1 -1
- sglang/srt/models/commandr.py +1 -1
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_v2.py +1 -1
- sglang/srt/models/gemma.py +1 -1
- sglang/srt/models/gemma2.py +1 -1
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/grok.py +1 -1
- sglang/srt/models/internlm2.py +1 -1
- sglang/srt/models/llama2.py +1 -1
- sglang/srt/models/llama_classification.py +1 -1
- sglang/srt/models/llava.py +1 -2
- sglang/srt/models/llavavid.py +1 -2
- sglang/srt/models/minicpm.py +1 -1
- sglang/srt/models/mixtral.py +1 -1
- sglang/srt/models/mixtral_quant.py +1 -1
- sglang/srt/models/qwen.py +1 -1
- sglang/srt/models/qwen2.py +1 -1
- sglang/srt/models/qwen2_moe.py +1 -1
- sglang/srt/models/stablelm.py +1 -1
- sglang/srt/openai_api/adapter.py +34 -12
- sglang/srt/openai_api/protocol.py +6 -0
- sglang/srt/server.py +24 -6
- sglang/srt/server_args.py +4 -0
- sglang/test/test_utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/METADATA +34 -24
- {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/RECORD +52 -50
- {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/LICENSE +0 -0
- {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/WHEEL +0 -0
- {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/top_level.txt +0 -0
sglang/srt/managers/tp_worker.py
CHANGED
@@ -39,13 +39,13 @@ from sglang.srt.managers.policy_scheduler import PolicyScheduler
 from sglang.srt.managers.schedule_batch import (
     FINISH_ABORT,
     BaseFinishReason,
-    Batch,
-    ForwardMode,
     Req,
+    ScheduleBatch,
 )
 from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.radix_cache import RadixCache
 from sglang.srt.model_config import ModelConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
@@ -172,7 +172,7 @@ class ModelTpServer:
 
         # Init running status
         self.waiting_queue: List[Req] = []
-        self.running_batch: Batch = None
+        self.running_batch: ScheduleBatch = None
         self.out_pyobjs = []
         self.decode_forward_ct = 0
         self.stream_interval = server_args.stream_interval
@@ -200,7 +200,6 @@ class ModelTpServer:
         )
         self.new_token_ratio = self.min_new_token_ratio
         self.new_token_ratio_decay = global_config.new_token_ratio_decay
-        self.new_token_ratio_recovery = global_config.new_token_ratio_recovery
 
     def exposed_step(self, recv_reqs):
         try:
@@ -290,10 +289,10 @@ class ModelTpServer:
                 "KV cache pool leak detected!"
             )
 
-        if self.req_to_token_pool.can_use_mem_size != self.req_to_token_pool.size:
+        if len(self.req_to_token_pool.free_slots) != self.req_to_token_pool.size:
             warnings.warn(
                 "Warning: "
-                f"available req slots={self.req_to_token_pool.can_use_mem_size}, "
+                f"available req slots={len(self.req_to_token_pool.free_slots)}, "
                 f"total slots={self.req_to_token_pool.size}\n"
                 "Memory pool leak detected!"
             )
@@ -353,7 +352,7 @@ class ModelTpServer:
         )
         self.waiting_queue.append(req)
 
-    def get_new_prefill_batch(self) -> Optional[Batch]:
+    def get_new_prefill_batch(self) -> Optional[ScheduleBatch]:
         # TODO(lsyin): organize this function
         running_bs = (
             len(self.running_batch.reqs) if self.running_batch is not None else 0
@@ -364,12 +363,13 @@ class ModelTpServer:
         # Compute matched prefix length
         for req in self.waiting_queue:
             req.input_ids = req.origin_input_ids + req.output_ids
+            try_match_ids = req.input_ids
+            if req.return_logprob:
+                try_match_ids = req.input_ids[: req.logprob_start_len]
+            # NOTE: the prefix_indices must always be aligned with last_node
             prefix_indices, last_node = self.tree_cache.match_prefix(
-                rid=req.rid,
-                key=req.input_ids,
+                rid=req.rid, key=try_match_ids
             )
-            if req.return_logprob:
-                prefix_indices = prefix_indices[: req.logprob_start_len]
             req.extend_input_len = len(req.input_ids) - len(prefix_indices)
             req.prefix_indices = prefix_indices
             req.last_node = last_node
@@ -525,7 +525,7 @@ class ModelTpServer:
         )
 
         # Return the new batch
-        new_batch = Batch.init_new(
+        new_batch = ScheduleBatch.init_new(
            can_run_list,
            self.req_to_token_pool,
            self.token_to_kv_pool,
@@ -534,7 +534,7 @@ class ModelTpServer:
         self.waiting_queue = [x for x in self.waiting_queue if x not in can_run_list]
         return new_batch
 
-    def forward_prefill_batch(self, batch: Batch):
+    def forward_prefill_batch(self, batch: ScheduleBatch):
         # Build batch tensors
         batch.prepare_for_extend(
             self.model_config.vocab_size, self.int_token_logit_bias
@@ -623,14 +623,13 @@ class ModelTpServer:
                 )
                 req.output_top_logprobs.append(output.output_top_logprobs[i])
 
-    def cache_filled_batch(self, batch: Batch):
-        req_pool_indices_cpu = batch.req_pool_indices.cpu().numpy()
+    def cache_filled_batch(self, batch: ScheduleBatch):
         for i, req in enumerate(batch.reqs):
             new_prefix_indices, new_last_node = self.tree_cache.cache_req(
                 rid=req.rid,
                 token_ids=tuple(req.input_ids),
                 last_uncached_pos=len(req.prefix_indices),
-                req_pool_idx=req_pool_indices_cpu[i],
+                req_pool_idx=req.req_pool_idx,
                 del_in_memory_pool=False,
                 old_last_node=req.last_node,
             )
@@ -638,9 +637,9 @@ class ModelTpServer:
 
             if req is self.current_inflight_req:
                 # inflight request would get a new req idx
-                self.req_to_token_pool.free(req_pool_indices_cpu[i])
+                self.req_to_token_pool.free(req.req_pool_idx)
 
-    def forward_decode_batch(self, batch: Batch):
+    def forward_decode_batch(self, batch: ScheduleBatch):
         # Check if decode out of memory
         if not batch.check_decode_mem():
             old_ratio = self.new_token_ratio
@@ -699,7 +698,7 @@ class ModelTpServer:
 
         self.handle_finished_requests(batch)
 
-    def handle_finished_requests(self, batch: Batch):
+    def handle_finished_requests(self, batch: ScheduleBatch):
         output_rids = []
         output_vids = []
         decoded_texts = []
@@ -781,14 +780,13 @@ class ModelTpServer:
         # Remove finished reqs
         if finished_indices:
             # Update radix cache
-            req_pool_indices_cpu = batch.req_pool_indices.tolist()
            for i in finished_indices:
                req = batch.reqs[i]
                self.tree_cache.cache_req(
                    rid=req.rid,
                    token_ids=tuple(req.origin_input_ids + req.output_ids)[:-1],
                    last_uncached_pos=len(req.prefix_indices),
-                    req_pool_idx=req_pool_indices_cpu[i],
+                    req_pool_idx=req.req_pool_idx,
                )
 
                self.tree_cache.dec_lock_ref(req.last_node)
@@ -799,7 +797,7 @@ class ModelTpServer:
         else:
             batch.reqs = []
 
-    def filter_out_inflight(self, batch: Batch):
+    def filter_out_inflight(self, batch: ScheduleBatch):
         # TODO(lsyin): reduce the overhead, make a special version for this
         if self.current_inflight_req is None:
             return

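Most of the tp_worker.py hunks are a mechanical Batch → ScheduleBatch rename; the clearest behavioral change is the prefix-matching block in get_new_prefill_batch: when a request asks for logprobs, the radix-cache key is now truncated to logprob_start_len before the lookup instead of truncating the returned prefix_indices afterwards, which could leave prefix_indices out of step with last_node (hence the new NOTE comment). A minimal sketch of that key selection; choose_match_key is a hypothetical helper, not part of sglang:

```python
def choose_match_key(input_ids, return_logprob, logprob_start_len):
    # Tokens at or after logprob_start_len must be recomputed so their logprobs
    # can be returned, so only the earlier tokens are offered to the radix
    # cache for prefix matching.
    if return_logprob:
        return input_ids[:logprob_start_len]
    return input_ids


ids = [101, 7, 8, 9, 10]
print(choose_match_key(ids, return_logprob=True, logprob_start_len=3))   # [101, 7, 8]
print(choose_match_key(ids, return_logprob=False, logprob_start_len=3))  # [101, 7, 8, 9, 10]
```
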
sglang/srt/mem_cache/memory_pool.py
CHANGED
@@ -16,6 +16,7 @@ limitations under the License.
 """Memory pool."""
 
 import logging
+from typing import List
 
 import torch
 
@@ -27,34 +28,28 @@ class ReqToTokenPool:
 
     def __init__(self, size: int, max_context_len: int):
         self.size = size
-        self.
+        self.free_slots = list(range(size))
         self.req_to_token = torch.empty(
             (size, max_context_len), dtype=torch.int32, device="cuda"
         )
-        self.can_use_mem_size = size
 
-    def alloc(self, need_size: int):
-        if need_size > self.can_use_mem_size:
+    def alloc(self, need_size: int) -> List[int]:
+        if need_size > len(self.free_slots):
             return None
 
-        select_index =
-
-        )
-        self.mem_state[select_index] = False
-        self.can_use_mem_size -= need_size
+        select_index = self.free_slots[:need_size]
+        self.free_slots = self.free_slots[need_size:]
 
         return select_index
 
     def free(self, free_index):
-        self.mem_state[free_index] = True
         if isinstance(free_index, (int,)):
-            self.can_use_mem_size += 1
+            self.free_slots.append(free_index)
         else:
-            self.can_use_mem_size += len(free_index)
+            self.free_slots.extend(free_index)
 
     def clear(self):
-        self.
-        self.can_use_mem_size = len(self.mem_state)
+        self.free_slots = list(range(self.size))
 
 
 class BaseTokenToKVPool:

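The ReqToTokenPool hunk above replaces the mem_state flag tensor and its can_use_mem_size counter with a plain Python free list. A CPU-only sketch of the new bookkeeping (the class name FreeSlotPool is ours, and the req_to_token tensor the real class also owns is omitted):

```python
from typing import List, Optional


class FreeSlotPool:
    """Free-list slot allocator mirroring the new ReqToTokenPool bookkeeping."""

    def __init__(self, size: int):
        self.size = size
        self.free_slots = list(range(size))

    def alloc(self, need_size: int) -> Optional[List[int]]:
        if need_size > len(self.free_slots):
            return None  # not enough request slots left
        select_index = self.free_slots[:need_size]
        self.free_slots = self.free_slots[need_size:]
        return select_index

    def free(self, free_index) -> None:
        # Accept either a single slot index or an iterable of indices,
        # matching the real free() signature.
        if isinstance(free_index, int):
            self.free_slots.append(free_index)
        else:
            self.free_slots.extend(free_index)

    def clear(self) -> None:
        self.free_slots = list(range(self.size))


pool = FreeSlotPool(4)
slots = pool.alloc(3)      # [0, 1, 2]
pool.free(slots[0])        # slot 0 becomes reusable
print(pool.alloc(2))       # [3, 0]
```

Allocation and release are now plain Python list operations on the host side.
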
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
@@ -29,8 +29,8 @@ from sglang.srt.layers.logits_processor import (
     LogitsMetadata,
     LogitsProcessor,
 )
-from sglang.srt.managers.schedule_batch import (
-    Batch,
+from sglang.srt.managers.schedule_batch import ScheduleBatch
+from sglang.srt.model_executor.forward_batch_info import (
     ForwardMode,
     InputMetadata,
     init_flashinfer_args,
@@ -202,7 +202,7 @@ class CudaGraphRunner:
         self.graph_memory_pool = graph.pool()
         return graph, None, out, flashinfer_decode_wrapper
 
-    def replay(self, batch: Batch):
+    def replay(self, batch: ScheduleBatch):
         assert batch.out_cache_loc is not None
         raw_bs = len(batch.reqs)
 

sglang/srt/model_executor/forward_batch_info.py
ADDED
@@ -0,0 +1,256 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""ModelRunner runs the forward passes of the models."""
+from dataclasses import dataclass
+from enum import IntEnum, auto
+from typing import List
+
+import numpy as np
+import torch
+
+from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
+
+
+class ForwardMode(IntEnum):
+    # Prefill a new sequence. This is deprecated now. "EXTEND" covers this case.
+    PREFILL = auto()
+    # Extend a sequence. The KV cache of the first part of the sequence is already computed (e.g., system prompt).
+    EXTEND = auto()
+    # Decode one token.
+    DECODE = auto()
+
+
+@dataclass
+class InputMetadata:
+    """Store all inforamtion of a forward pass."""
+
+    forward_mode: ForwardMode
+    batch_size: int
+    total_num_tokens: int
+    req_pool_indices: torch.Tensor
+    seq_lens: torch.Tensor
+    positions: torch.Tensor
+    req_to_token_pool: ReqToTokenPool
+    token_to_kv_pool: BaseTokenToKVPool
+
+    # For extend
+    extend_seq_lens: torch.Tensor
+    extend_start_loc: torch.Tensor
+    extend_no_prefix: bool
+
+    # Output location of the KV cache
+    out_cache_loc: torch.Tensor = None
+
+    # Output options
+    return_logprob: bool = False
+    top_logprobs_nums: List[int] = None
+
+    # Trition attention backend
+    triton_max_seq_len: int = 0
+    triton_max_extend_len: int = 0
+    triton_start_loc: torch.Tensor = None
+    triton_prefix_lens: torch.Tensor = None
+
+    # FlashInfer attention backend
+    flashinfer_prefill_wrapper_ragged: "BatchPrefillWithRaggedKVCacheWrapper" = None
+    flashinfer_prefill_wrapper_paged: "BatchPrefillWithPagedKVCacheWrapper" = None
+    flashinfer_decode_wrapper: "BatchDecodeWithPagedKVCacheWrapper" = None
+    flashinfer_use_ragged: bool = False
+
+    @classmethod
+    def create(
+        cls,
+        model_runner,
+        forward_mode,
+        req_pool_indices,
+        seq_lens,
+        prefix_lens,
+        position_ids_offsets,
+        out_cache_loc,
+        top_logprobs_nums=None,
+        return_logprob=False,
+        skip_flashinfer_init=False,
+    ):
+        flashinfer_use_ragged = False
+        if not skip_flashinfer_init and not model_runner.server_args.disable_flashinfer:
+            if forward_mode != ForwardMode.DECODE and int(torch.sum(seq_lens)) > 4096:
+                flashinfer_use_ragged = True
+            init_flashinfer_args(
+                forward_mode,
+                model_runner,
+                req_pool_indices,
+                seq_lens,
+                prefix_lens,
+                model_runner.flashinfer_decode_wrapper,
+                flashinfer_use_ragged,
+            )
+
+        batch_size = len(req_pool_indices)
+
+        if forward_mode == ForwardMode.DECODE:
+            positions = ((seq_lens - 1) + position_ids_offsets).to(torch.int64)
+            extend_seq_lens = extend_start_loc = extend_no_prefix = None
+            if not model_runner.server_args.disable_flashinfer:
+                # This variable is not needed in this case,
+                # we do not compute it to make it compatbile with cuda graph.
+                total_num_tokens = None
+            else:
+                total_num_tokens = int(torch.sum(seq_lens))
+        else:
+            seq_lens_cpu = seq_lens.cpu().numpy()
+            prefix_lens_cpu = prefix_lens.cpu().numpy()
+            position_ids_offsets_cpu = position_ids_offsets.cpu().numpy()
+            positions = torch.tensor(
+                np.concatenate(
+                    [
+                        np.arange(
+                            prefix_lens_cpu[i] + position_ids_offsets_cpu[i],
+                            seq_lens_cpu[i] + position_ids_offsets_cpu[i],
+                        )
+                        for i in range(batch_size)
+                    ],
+                    axis=0,
+                ),
+                device="cuda",
+            )
+            extend_seq_lens = seq_lens - prefix_lens
+            extend_start_loc = torch.zeros_like(seq_lens)
+            extend_start_loc[1:] = torch.cumsum(extend_seq_lens[:-1], dim=0)
+            extend_no_prefix = torch.all(prefix_lens == 0)
+            total_num_tokens = int(torch.sum(seq_lens))
+
+        ret = cls(
+            forward_mode=forward_mode,
+            batch_size=batch_size,
+            total_num_tokens=total_num_tokens,
+            req_pool_indices=req_pool_indices,
+            seq_lens=seq_lens,
+            positions=positions,
+            req_to_token_pool=model_runner.req_to_token_pool,
+            token_to_kv_pool=model_runner.token_to_kv_pool,
+            out_cache_loc=out_cache_loc,
+            extend_seq_lens=extend_seq_lens,
+            extend_start_loc=extend_start_loc,
+            extend_no_prefix=extend_no_prefix,
+            return_logprob=return_logprob,
+            top_logprobs_nums=top_logprobs_nums,
+            flashinfer_prefill_wrapper_ragged=model_runner.flashinfer_prefill_wrapper_ragged,
+            flashinfer_prefill_wrapper_paged=model_runner.flashinfer_prefill_wrapper_paged,
+            flashinfer_decode_wrapper=model_runner.flashinfer_decode_wrapper,
+            flashinfer_use_ragged=flashinfer_use_ragged,
+        )
+
+        if model_runner.server_args.disable_flashinfer:
+            (
+                ret.triton_max_seq_len,
+                ret.triton_max_extend_len,
+                ret.triton_start_loc,
+                ret.triton_prefix_lens,
+            ) = init_triton_args(forward_mode, seq_lens, prefix_lens)
+
+        return ret
+
+
+def init_flashinfer_args(
+    forward_mode,
+    model_runner,
+    req_pool_indices,
+    seq_lens,
+    prefix_lens,
+    flashinfer_decode_wrapper,
+    flashinfer_use_ragged=False,
+):
+    """Init auxiliary variables for FlashInfer attention backend."""
+    num_qo_heads = model_runner.model_config.num_attention_heads // model_runner.tp_size
+    num_kv_heads = model_runner.model_config.get_num_kv_heads(model_runner.tp_size)
+    head_dim = model_runner.model_config.head_dim
+    batch_size = len(req_pool_indices)
+    total_num_tokens = int(torch.sum(seq_lens))
+
+    if flashinfer_use_ragged:
+        paged_kernel_lens = prefix_lens
+    else:
+        paged_kernel_lens = seq_lens
+
+    kv_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32, device="cuda")
+    kv_indptr[1:] = torch.cumsum(paged_kernel_lens, dim=0)
+    req_pool_indices_cpu = req_pool_indices.cpu().numpy()
+    paged_kernel_lens_cpu = paged_kernel_lens.cpu().numpy()
+    kv_indices = torch.cat(
+        [
+            model_runner.req_to_token_pool.req_to_token[
+                req_pool_indices_cpu[i], : paged_kernel_lens_cpu[i]
+            ]
+            for i in range(batch_size)
+        ],
+        dim=0,
+    ).contiguous()
+    kv_last_page_len = torch.ones((batch_size,), dtype=torch.int32, device="cuda")
+
+    if forward_mode == ForwardMode.DECODE:
+        flashinfer_decode_wrapper.end_forward()
+        flashinfer_decode_wrapper.begin_forward(
+            kv_indptr,
+            kv_indices,
+            kv_last_page_len,
+            num_qo_heads,
+            num_kv_heads,
+            head_dim,
+            1,
+        )
+    else:
+        # extend part
+        qo_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32, device="cuda")
+        qo_indptr[1:] = torch.cumsum(seq_lens - prefix_lens, dim=0)
+
+        if flashinfer_use_ragged:
+            model_runner.flashinfer_prefill_wrapper_ragged.end_forward()
+            model_runner.flashinfer_prefill_wrapper_ragged.begin_forward(
+                qo_indptr,
+                qo_indptr,
+                num_qo_heads,
+                num_kv_heads,
+                head_dim,
+            )
+
+        # cached part
+        model_runner.flashinfer_prefill_wrapper_paged.end_forward()
+        model_runner.flashinfer_prefill_wrapper_paged.begin_forward(
+            qo_indptr,
+            kv_indptr,
+            kv_indices,
+            kv_last_page_len,
+            num_qo_heads,
+            num_kv_heads,
+            head_dim,
+            1,
+        )
+
+
+def init_triton_args(forward_mode, seq_lens, prefix_lens):
+    """Init auxiliary variables for triton attention backend."""
+    batch_size = len(seq_lens)
+    max_seq_len = int(torch.max(seq_lens))
+    start_loc = torch.zeros((batch_size,), dtype=torch.int32, device="cuda")
+    start_loc[1:] = torch.cumsum(seq_lens[:-1], dim=0)
+
+    if forward_mode == ForwardMode.DECODE:
+        max_extend_len = None
+    else:
+        extend_seq_lens = seq_lens - prefix_lens
+        max_extend_len = int(torch.max(extend_seq_lens))
+
+    return max_seq_len, max_extend_len, start_loc, prefix_lens

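Much of init_flashinfer_args above is CSR-style bookkeeping: per-request KV lengths are turned into a (kv_indptr, kv_indices) pair over the req_to_token table before being handed to the FlashInfer wrappers. A CPU sketch of just that step (build_kv_csr is a hypothetical helper; the real code keeps these tensors on cuda):

```python
import torch


def build_kv_csr(req_to_token: torch.Tensor, req_pool_indices, kv_lens):
    """Turn per-request KV lengths into CSR-style (indptr, indices) tensors."""
    batch_size = len(req_pool_indices)
    kv_indptr = torch.zeros((batch_size + 1,), dtype=torch.int32)
    kv_indptr[1:] = torch.cumsum(torch.tensor(kv_lens, dtype=torch.int32), dim=0)
    kv_indices = torch.cat(
        [req_to_token[req_pool_indices[i], : kv_lens[i]] for i in range(batch_size)]
    )
    return kv_indptr, kv_indices


# Two requests sitting in slots 1 and 0 of a toy req_to_token table,
# with 3 and 2 cached tokens respectively.
req_to_token = torch.arange(20, dtype=torch.int32).reshape(4, 5)
kv_indptr, kv_indices = build_kv_csr(req_to_token, [1, 0], [3, 2])
print(kv_indptr.tolist())   # [0, 3, 5]
print(kv_indices.tolist())  # [5, 6, 7, 0, 1]
```
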
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -41,18 +41,14 @@ from vllm.distributed import (
 from vllm.model_executor.models import ModelRegistry
 
 from sglang.global_config import global_config
-from sglang.srt.managers.schedule_batch import (
-    Batch,
-    ForwardMode,
-    InputMetadata,
-    global_server_args_dict,
-)
+from sglang.srt.managers.schedule_batch import ScheduleBatch, global_server_args_dict
 from sglang.srt.mem_cache.memory_pool import (
     MHATokenToKVPool,
     MLATokenToKVPool,
     ReqToTokenPool,
 )
 from sglang.srt.model_config import AttentionArch
+from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
     get_available_gpu_memory,
@@ -350,7 +346,7 @@ class ModelRunner:
         )
 
     @torch.inference_mode()
-    def forward_decode(self, batch: Batch):
+    def forward_decode(self, batch: ScheduleBatch):
         if self.cuda_graph_runner and self.cuda_graph_runner.can_run(len(batch.reqs)):
             return self.cuda_graph_runner.replay(batch)
 
@@ -370,7 +366,7 @@ class ModelRunner:
         )
 
     @torch.inference_mode()
-    def forward_extend(self, batch: Batch):
+    def forward_extend(self, batch: ScheduleBatch):
         input_metadata = InputMetadata.create(
             self,
             forward_mode=ForwardMode.EXTEND,
@@ -387,7 +383,7 @@ class ModelRunner:
         )
 
     @torch.inference_mode()
-    def forward_extend_multi_modal(self, batch: Batch):
+    def forward_extend_multi_modal(self, batch: ScheduleBatch):
         input_metadata = InputMetadata.create(
             self,
             forward_mode=ForwardMode.EXTEND,
@@ -408,7 +404,7 @@ class ModelRunner:
             batch.image_offsets,
         )
 
-    def forward(self, batch: Batch, forward_mode: ForwardMode):
+    def forward(self, batch: ScheduleBatch, forward_mode: ForwardMode):
         if self.is_multimodal_model and forward_mode == ForwardMode.EXTEND:
             return self.forward_extend_multi_modal(batch)
         elif forward_mode == ForwardMode.DECODE:

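forward_extend above delegates to InputMetadata.create with ForwardMode.EXTEND, whose extend branch builds the per-token position ids by concatenating one arange per request, from the cached prefix length up to the full sequence length plus any position offset. A plain-numpy sketch of that computation (extend_positions is a hypothetical helper; the real code wraps the result in a CUDA tensor):

```python
import numpy as np


def extend_positions(prefix_lens, seq_lens, position_ids_offsets):
    """Concatenate one position range per request, as in the EXTEND branch."""
    return np.concatenate(
        [
            np.arange(p + off, s + off)
            for p, s, off in zip(prefix_lens, seq_lens, position_ids_offsets)
        ],
        axis=0,
    )


# Request 0: 2 of its 5 tokens are already cached; request 1: 3 new tokens, no cache.
print(extend_positions([2, 0], [5, 3], [0, 0]).tolist())  # [2, 3, 4, 0, 1, 2]
```
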
sglang/srt/models/chatglm.py
CHANGED
@@ -45,7 +45,7 @@ from vllm.transformers_utils.configs import ChatGLMConfig
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.model_executor.model_runner import InputMetadata
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 LoraConfig = None
 

sglang/srt/models/commandr.py
CHANGED
@@ -64,7 +64,7 @@ from vllm.model_executor.utils import set_weight_attrs
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.model_executor.model_runner import InputMetadata
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
 @torch.compile

sglang/srt/models/dbrx.py
CHANGED
@@ -45,7 +45,7 @@ from vllm.transformers_utils.configs.dbrx import DbrxConfig
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.model_executor.model_runner import InputMetadata
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
 class DbrxRouter(nn.Module):

sglang/srt/models/deepseek.py
CHANGED
@@ -46,7 +46,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.managers.schedule_batch import InputMetadata
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
 class DeepseekMLP(nn.Module):

sglang/srt/models/deepseek_v2.py
CHANGED
@@ -46,7 +46,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.managers.schedule_batch import global_server_args_dict
-from sglang.srt.model_executor.model_runner import InputMetadata
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
 class DeepseekV2MLP(nn.Module):

sglang/srt/models/gemma.py
CHANGED
@@ -37,7 +37,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.model_executor.model_runner import InputMetadata
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
 class GemmaMLP(nn.Module):

sglang/srt/models/gemma2.py
CHANGED
@@ -42,7 +42,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.model_executor.model_runner import InputMetadata
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
 class GemmaRMSNorm(CustomOp):

sglang/srt/models/gpt_bigcode.py
CHANGED
@@ -35,7 +35,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.managers.schedule_batch import InputMetadata
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
 class GPTBigCodeAttention(nn.Module):

sglang/srt/models/grok.py
CHANGED
@@ -52,7 +52,7 @@ from vllm.utils import print_warning_once
 from sglang.srt.layers.fused_moe import fused_moe
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.model_executor.model_runner import InputMetadata
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 use_fused = True
 

sglang/srt/models/internlm2.py
CHANGED
@@ -40,7 +40,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.model_executor.model_runner import InputMetadata
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
 class InternLM2MLP(nn.Module):

sglang/srt/models/llama2.py
CHANGED
@@ -41,7 +41,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
-from sglang.srt.model_executor.model_runner import InputMetadata
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 
 
 class LlamaMLP(nn.Module):

sglang/srt/models/llama_classification.py
CHANGED
@@ -25,7 +25,7 @@ from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
 from sglang.srt.layers.logits_processor import LogitProcessorOutput
-from sglang.srt.model_executor.model_runner import InputMetadata
+from sglang.srt.model_executor.forward_batch_info import InputMetadata
 from sglang.srt.models.llama2 import LlamaModel
 
 

sglang/srt/models/llava.py
CHANGED
@@ -32,13 +32,12 @@ from vllm.config import CacheConfig
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 
-from sglang.srt.managers.schedule_batch import ForwardMode
 from sglang.srt.mm_utils import (
     get_anyres_image_grid_shape,
     unpad_image,
     unpad_image_shape,
 )
-from sglang.srt.model_executor.model_runner import InputMetadata
+from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
 from sglang.srt.models.llama2 import LlamaForCausalLM
 from sglang.srt.models.mistral import MistralForCausalLM
 from sglang.srt.models.qwen2 import Qwen2ForCausalLM