sglang 0.3.4__py3-none-any.whl → 0.3.4.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +2 -1
- sglang/lang/chat_template.py +17 -0
- sglang/launch_server_llavavid.py +1 -1
- sglang/srt/configs/__init__.py +3 -0
- sglang/srt/configs/model_config.py +27 -2
- sglang/srt/configs/qwen2vl.py +133 -0
- sglang/srt/constrained/fsm_cache.py +10 -3
- sglang/srt/conversation.py +27 -0
- sglang/srt/hf_transformers_utils.py +16 -1
- sglang/srt/layers/attention/__init__.py +16 -5
- sglang/srt/layers/attention/double_sparsity_backend.py +22 -6
- sglang/srt/layers/attention/flashinfer_backend.py +174 -54
- sglang/srt/layers/attention/triton_backend.py +22 -6
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +26 -4
- sglang/srt/layers/linear.py +89 -63
- sglang/srt/layers/logits_processor.py +5 -5
- sglang/srt/layers/rotary_embedding.py +112 -0
- sglang/srt/layers/sampler.py +51 -39
- sglang/srt/lora/lora.py +3 -1
- sglang/srt/managers/data_parallel_controller.py +1 -1
- sglang/srt/managers/detokenizer_manager.py +4 -0
- sglang/srt/managers/image_processor.py +186 -13
- sglang/srt/managers/io_struct.py +10 -0
- sglang/srt/managers/schedule_batch.py +238 -68
- sglang/srt/managers/scheduler.py +69 -50
- sglang/srt/managers/tokenizer_manager.py +24 -4
- sglang/srt/managers/tp_worker.py +26 -111
- sglang/srt/managers/tp_worker_overlap_thread.py +209 -0
- sglang/srt/mem_cache/memory_pool.py +56 -10
- sglang/srt/mem_cache/radix_cache.py +4 -3
- sglang/srt/model_executor/cuda_graph_runner.py +87 -28
- sglang/srt/model_executor/forward_batch_info.py +83 -3
- sglang/srt/model_executor/model_runner.py +32 -11
- sglang/srt/models/chatglm.py +3 -3
- sglang/srt/models/deepseek_v2.py +2 -2
- sglang/srt/models/mllama.py +1004 -0
- sglang/srt/models/qwen2_vl.py +724 -0
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
- sglang/srt/sampling/sampling_batch_info.py +13 -3
- sglang/srt/sampling/sampling_params.py +5 -7
- sglang/srt/server.py +12 -0
- sglang/srt/server_args.py +10 -0
- sglang/srt/utils.py +22 -0
- sglang/test/run_eval.py +2 -0
- sglang/test/runners.py +20 -1
- sglang/test/srt/sampling/penaltylib/utils.py +1 -0
- sglang/test/test_utils.py +100 -3
- sglang/version.py +1 -1
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/METADATA +17 -18
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/RECORD +53 -48
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/LICENSE +0 -0
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/WHEEL +0 -0
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/managers/schedule_batch.py

@@ -23,17 +23,20 @@ ScheduleBatch -> ModelWorkerBatch -> ForwardBatch
 - ScheduleBatch is managed by `scheduler.py::Scheduler`.
   It contains high-level scheduling data. Most of the data is on the CPU.
 - ModelWorkerBatch is managed by `tp_worker.py::TpModelWorker`.
+  It is a subset of `ScheduleBatch` that only contains data related to the model forward on GPU.
+  It will be transformed from CPU scheduler to GPU model runner.
 - ForwardBatch is managed by `model_runner.py::ModelRunner`.
   It contains low-level tensor data. Most of the data consists of GPU tensors.
 """

+import dataclasses
 import logging
-from dataclasses import dataclass
 from typing import List, Optional, Tuple, Union

 import torch

 from sglang.global_config import global_config
+from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.constrained import RegexGuide
 from sglang.srt.constrained.jump_forward import JumpForwardMap
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
@@ -114,38 +117,50 @@ class FINISH_ABORT(BaseFinishReason):
     }


-@dataclass
+@dataclasses.dataclass
 class ImageInputs:
     """The image related inputs."""

     pixel_values: torch.Tensor
-
+    image_hashes: Optional[list] = None
     image_sizes: Optional[list] = None
     image_offsets: Optional[list] = None
     pad_values: Optional[list] = None
     modalities: Optional[list] = None
+    num_image_tokens: Optional[int] = None

     image_embeds: Optional[List[torch.Tensor]] = None
     aspect_ratio_ids: Optional[List[torch.Tensor]] = None
     aspect_ratio_mask: Optional[List[torch.Tensor]] = None
+    # QWen2-VL related
+    image_grid_thws: List[Tuple[int, int, int]] = None

     @staticmethod
     def from_dict(obj, vocab_size):
         # Use image hash as fake token_ids, which is then used for prefix matching
         ret = ImageInputs(
             pixel_values=obj["pixel_values"],
-
+            image_hashes=hash(tuple(obj["image_hashes"])),
         )
-        image_hash = ret.
+        image_hash = ret.image_hashes
         ret.pad_values = [
             (image_hash) % vocab_size,
             (image_hash >> 16) % vocab_size,
             (image_hash >> 32) % vocab_size,
             (image_hash >> 64) % vocab_size,
         ]
-
-
-
+
+        optional_args = [
+            "image_sizes",
+            "modalities",
+            "aspect_ratio_ids",
+            "aspect_ratio_mask",
+            "image_grid_thws",
+        ]
+        for arg in optional_args:
+            if arg in obj:
+                setattr(ret, arg, obj[arg])
+
         return ret


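The `from_dict` hunk above folds the image hash into `pad_values` by shifting the 64-bit hash and reducing each chunk modulo the vocabulary size, so image placeholders can be treated like ordinary token ids during prefix matching. A minimal, self-contained sketch of that shift-and-modulo pattern (the vocab size and hashed object are made up for illustration; this is not sglang code):

# Illustrative only: derive fake "pad token ids" from an image hash,
# mirroring the shift-and-modulo pattern in ImageInputs.from_dict.
vocab_size = 32000  # assumed value for illustration
image_hash = hash(("example-image-bytes",))

pad_values = [
    (image_hash) % vocab_size,
    (image_hash >> 16) % vocab_size,
    (image_hash >> 32) % vocab_size,
    (image_hash >> 64) % vocab_size,
]
print(pad_values)  # four ids in [0, vocab_size); the same hash maps to the same ids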
@@ -236,6 +251,9 @@ class Req:
         self.regex_fsm_state: int = 0
         self.jump_forward_map: JumpForwardMap = None

+        # For Qwen2-VL
+        self.mrope_position_delta = []  # use mutable object
+
     # whether request reached finished condition
     def finished(self) -> bool:
         return self.finished_reason is not None
@@ -316,15 +334,20 @@ class Req:

         last_token_id = self.output_ids[-1]

-        matched_eos =
+        matched_eos = False

+        # Check stop token ids
+        if self.sampling_params.stop_token_ids:
+            matched_eos = last_token_id in self.sampling_params.stop_token_ids
         if self.tokenizer is not None:
             matched_eos |= last_token_id == self.tokenizer.eos_token_id
-
+            if self.tokenizer.additional_stop_token_ids:
+                matched_eos |= last_token_id in self.tokenizer.additional_stop_token_ids
         if matched_eos and not self.sampling_params.ignore_eos:
             self.finished_reason = FINISH_MATCHED_TOKEN(matched=last_token_id)
             return

+        # Check stop strings
         if len(self.sampling_params.stop_strs) > 0:
             tail_str = self.tokenizer.decode(
                 self.output_ids[-(self.sampling_params.stop_str_max_len + 1) :]
@@ -399,7 +422,7 @@ class Req:
 bid = 0


-@dataclass
+@dataclasses.dataclass
 class ScheduleBatch:
     """Store all inforamtion of a batch."""

@@ -409,6 +432,9 @@ class ScheduleBatch:
     token_to_kv_pool: BaseTokenToKVPool = None
     tree_cache: BasePrefixCache = None

+    # For utility
+    model_config: ModelConfig = None
+
     forward_mode: ForwardMode = None
     sampling_info: SamplingBatchInfo = None

@@ -416,10 +442,13 @@
     input_ids: torch.Tensor = None
     req_pool_indices: torch.Tensor = None
     seq_lens: torch.Tensor = None
+    # The output locations of the KV cache
     out_cache_loc: torch.Tensor = None
-
     output_ids: torch.Tensor = None

+    # The sum of all sequence lengths
+    seq_lens_sum: int = None
+
     # For processing logprobs
     return_logprob: bool = False
     top_logprobs_nums: Optional[List[int]] = None
@@ -428,33 +457,42 @@
     prefix_lens: List[int] = None
     extend_lens: List[int] = None
     extend_num_tokens: int = None
-    running_bs: int = None
     decoding_reqs: List[Req] = None

+    # For encoder-decoder
+    encoder_cached: Optional[List[bool]] = None
+    encoder_lens: Optional[torch.Tensor] = None
+    encoder_lens_cpu: Optional[List[int]] = None
+    encoder_out_cache_loc: Optional[torch.Tensor] = None
+
     # Stream
     has_stream: bool = False

-    # device
-    device: str = "cuda"
-
     # Has regex
     has_regex: bool = False

-
-
-        return_logprob = any(req.return_logprob for req in reqs)
-        has_stream = any(req.stream for req in reqs)
-        has_regex = any(req.regex_fsm for req in reqs)
+    # device
+    device: str = "cuda"

+    @classmethod
+    def init_new(
+        cls,
+        reqs,
+        req_to_token_pool,
+        token_to_kv_pool,
+        tree_cache,
+        model_config,
+    ):
         return cls(
             reqs=reqs,
             req_to_token_pool=req_to_token_pool,
             token_to_kv_pool=token_to_kv_pool,
             tree_cache=tree_cache,
-
-
+            model_config=model_config,
+            return_logprob=any(req.return_logprob for req in reqs),
+            has_stream=any(req.stream for req in reqs),
+            has_regex=any(req.regex_fsm for req in reqs),
             device=req_to_token_pool.device,
-            has_regex=has_regex,
         )

     def batch_size(self):
@@ -481,14 +519,90 @@
         out_cache_loc = self.token_to_kv_pool.alloc(num_tokens)

         if out_cache_loc is None:
-
+            phase_str = "Prefill" if self.forward_mode.is_extend() else "Decode"
+            logger.error(
+                f"{phase_str} out of memory. Try to lower your batch size.\n"
+                f"Try to allocate {num_tokens} tokens.\n"
+                f"Avaliable tokens: {self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()}\n"
+            )
             if self.tree_cache is not None:
                 self.tree_cache.pretty_print()
             exit(1)

         return out_cache_loc

-    def
+    def prepare_encoder_info_extend(self, input_ids: List[int], seq_lens: List[int]):
+        self.encoder_lens_cpu = []
+        self.encoder_cached = []
+
+        for req in self.reqs:
+            im = req.image_inputs
+            if im is None or im.num_image_tokens is None:
+                # No image input
+                self.encoder_lens_cpu.append(0)
+                self.encoder_cached.append(True)
+            else:
+                self.encoder_lens_cpu.append(im.num_image_tokens)
+                self.encoder_cached.append(
+                    self.forward_mode.is_decode()
+                    or len(req.prefix_indices) >= im.num_image_tokens
+                )
+
+        self.encoder_lens = torch.tensor(self.encoder_lens_cpu, dtype=torch.int32).to(
+            self.device, non_blocking=True
+        )
+
+        # Strip encoder infos
+        pt = 0
+        decoder_out_cache_loc = []
+        encoder_out_cache_loc = []
+        for i, req in enumerate(self.reqs):
+            encoder_len = self.encoder_lens_cpu[i]
+            seq_lens[i] -= encoder_len
+
+            if len(req.prefix_indices) < encoder_len:
+                # NOTE: the encoder part should considered as a whole
+                assert len(req.prefix_indices) == 0
+                input_ids[i] = input_ids[i][encoder_len:]
+                encoder_out_cache_loc.append(self.out_cache_loc[pt : pt + encoder_len])
+                decoder_out_cache_loc.append(
+                    self.out_cache_loc[pt + encoder_len : pt + req.extend_input_len]
+                )
+                self.extend_lens[i] -= encoder_len
+                self.extend_num_tokens -= encoder_len
+            else:
+                decoder_out_cache_loc.append(
+                    self.out_cache_loc[pt : pt + req.extend_input_len]
+                )
+                self.prefix_lens[i] -= encoder_len
+
+            pt += req.extend_input_len
+
+        # Reassign
+        self.input_ids = torch.tensor(sum(input_ids, []), dtype=torch.int32).to(
+            self.device, non_blocking=True
+        )
+        self.seq_lens = torch.tensor(seq_lens, dtype=torch.int32).to(
+            self.device, non_blocking=True
+        )
+
+        if not decoder_out_cache_loc:
+            self.out_cache_loc = torch.empty(0, dtype=torch.int32).to(
+                self.device, non_blocking=True
+            )
+        else:
+            self.out_cache_loc = torch.cat(decoder_out_cache_loc)
+
+        if not encoder_out_cache_loc:
+            self.encoder_out_cache_loc = torch.empty(0, dtype=torch.int32).to(
+                self.device, non_blocking=True
+            )
+        else:
+            self.encoder_out_cache_loc = torch.cat(encoder_out_cache_loc)
+
+        assert len(self.out_cache_loc) == self.extend_num_tokens
+
+    def prepare_for_extend(self):
         self.forward_mode = ForwardMode.EXTEND

         bs = len(self.reqs)
@@ -516,12 +630,12 @@
             assert seq_len - pre_len == req.extend_input_len

             if pre_len > 0:
-                self.req_to_token_pool.
-                    req.prefix_indices
+                self.req_to_token_pool.write(
+                    (req.req_pool_idx, slice(0, pre_len)), req.prefix_indices
                 )
-
-
-                out_cache_loc[pt : pt + req.extend_input_len]
+            self.req_to_token_pool.write(
+                (req.req_pool_idx, slice(pre_len, seq_len)),
+                out_cache_loc[pt : pt + req.extend_input_len],
             )

             # Compute the relative logprob_start_len in an extend batch
@@ -546,16 +660,23 @@
             self.device, non_blocking=True
         )

-        self.extend_num_tokens = extend_num_tokens
         self.out_cache_loc = out_cache_loc
+
+        self.seq_lens_sum = sum(seq_lens)
         if self.return_logprob:
             self.top_logprobs_nums = [r.top_logprobs_num for r in reqs]
+        self.extend_num_tokens = extend_num_tokens
         self.prefix_lens = [len(r.prefix_indices) for r in reqs]
         self.extend_lens = [r.extend_input_len for r in reqs]
         self.extend_logprob_start_lens = [r.extend_logprob_start_len for r in reqs]

+        if self.model_config.is_encoder_decoder:
+            self.prepare_encoder_info_extend(input_ids, seq_lens)
+
         self.sampling_info = SamplingBatchInfo.from_schedule_batch(
-            self,
+            self,
+            self.model_config.vocab_size,
+            global_server_args_dict["disable_penalizer"],
         )

     def mix_with_running(self, running_batch: "ScheduleBatch"):
@@ -568,12 +689,11 @@

         input_ids = torch.cat([self.input_ids, running_batch.input_ids])
         out_cache_loc = torch.cat([self.out_cache_loc, running_batch.out_cache_loc])
-        extend_num_tokens = self.extend_num_tokens + running_bs

         self.merge_batch(running_batch)
         self.input_ids = input_ids
         self.out_cache_loc = out_cache_loc
-        self.extend_num_tokens
+        self.extend_num_tokens += running_bs

         # NOTE: prefix_indices is what has been cached, but we don't cache each decode step
         self.prefix_lens.extend(
@@ -631,8 +751,8 @@

             if isinstance(self.tree_cache, ChunkCache):
                 # ChunkCache does not have eviction
-                token_indices = self.req_to_token_pool.req_to_token[
-                    : seq_lens_cpu[idx]
+                token_indices = self.req_to_token_pool.req_to_token[
+                    req.req_pool_idx, : seq_lens_cpu[idx]
                 ]
                 self.token_to_kv_pool.free(token_indices)
                 self.req_to_token_pool.free(req.req_pool_idx)
@@ -640,8 +760,8 @@
             else:
                 # TODO: apply more fine-grained retraction
                 last_uncached_pos = len(req.prefix_indices)
-                token_indices = self.req_to_token_pool.req_to_token[
-                    last_uncached_pos : seq_lens_cpu[idx]
+                token_indices = self.req_to_token_pool.req_to_token[
+                    req.req_pool_idx, last_uncached_pos : seq_lens_cpu[idx]
                 ]
                 self.token_to_kv_pool.free(token_indices)
                 self.req_to_token_pool.free(req.req_pool_idx)
@@ -746,7 +866,11 @@

         return jump_forward_reqs

-    def
+    def prepare_encoder_info_decode(self):
+        # Reset the encoder cached status
+        self.encoder_cached = [True] * len(self.reqs)
+
+    def prepare_for_decode(self, enable_overlap: bool = False):
         self.forward_mode = ForwardMode.DECODE

         self.input_ids = self.output_ids
@@ -760,10 +884,25 @@
         bs = len(self.reqs)
         self.out_cache_loc = self.alloc_token_slots(bs)

-        self.
-        self.
-
-
+        if self.model_config.is_encoder_decoder:
+            locs = self.encoder_lens + self.seq_lens
+            self.prepare_encoder_info_decode()
+        else:
+            locs = self.seq_lens
+
+        if enable_overlap:
+            # Do not use in-place operations in the overlap mode
+            self.req_to_token_pool.write(
+                (self.req_pool_indices, locs), self.out_cache_loc
+            )
+            self.seq_lens = self.seq_lens + 1
+        else:
+            # A faster in-place version
+            self.req_to_token_pool.write(
+                (self.req_pool_indices, locs), self.out_cache_loc
+            )
+            self.seq_lens.add_(1)
+        self.seq_lens_sum += bs

     def filter_batch(
         self,
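The `enable_overlap` branch above exists because, with the overlap scheduler, the CPU prepares the next decode step while the GPU may still be reading the current `seq_lens` tensor; building a new tensor avoids mutating data another consumer still holds, while the in-place `add_(1)` saves an allocation when nothing else references it. A small stand-alone torch illustration of the difference (not sglang code):

import torch

seq_lens = torch.tensor([5, 9, 3])
view_held_elsewhere = seq_lens  # e.g. still referenced by an in-flight batch

# Overlap-safe: allocate a new tensor; the old one is left untouched.
new_seq_lens = seq_lens + 1

# In-place: faster (no allocation), but every holder of the old tensor
# now sees the incremented values as well.
seq_lens.add_(1)

print(new_seq_lens.tolist())         # [6, 10, 4]
print(view_held_elsewhere.tolist())  # [6, 10, 4] -- mutated by add_()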
@@ -787,6 +926,10 @@
             # No need to filter
             return

+        if self.model_config.is_encoder_decoder:
+            self.encoder_lens = self.encoder_lens[keep_indices]
+            self.encoder_lens_cpu = [self.encoder_lens_cpu[i] for i in keep_indices]
+
         self.reqs = [self.reqs[i] for i in keep_indices]
         new_indices = torch.tensor(keep_indices, dtype=torch.int32).to(
             self.device, non_blocking=True
@@ -794,6 +937,7 @@
         self.req_pool_indices = self.req_pool_indices[new_indices]
         self.seq_lens = self.seq_lens[new_indices]
         self.out_cache_loc = None
+        self.seq_lens_sum = self.seq_lens.sum().item()
        self.output_ids = self.output_ids[new_indices]
        self.return_logprob = any(req.return_logprob for req in self.reqs)
        if self.return_logprob:
@@ -812,11 +956,17 @@
         # needs to be called with pre-merged Batch.reqs.
         self.sampling_info.merge_batch(other.sampling_info)

+        # Encoder-decoder infos
+        if self.model_config.is_encoder_decoder:
+            self.encoder_lens = torch.cat([self.encoder_lens, other.encoder_lens])
+            self.encoder_lens_cpu.extend(other.encoder_lens_cpu)
+
         self.req_pool_indices = torch.concat(
             [self.req_pool_indices, other.req_pool_indices]
         )
         self.seq_lens = torch.concat([self.seq_lens, other.seq_lens])
         self.out_cache_loc = None
+        self.seq_lens_sum += other.seq_lens_sum
         if self.output_ids is not None:
             self.output_ids = torch.concat([self.output_ids, other.output_ids])
         if self.return_logprob and other.return_logprob:
@@ -833,16 +983,12 @@

     def get_model_worker_batch(self):
         if self.forward_mode.is_decode():
-            extend_seq_lens = extend_prefix_lens = extend_logprob_start_lens =
-                image_inputs
-            ) = None
+            extend_seq_lens = extend_prefix_lens = extend_logprob_start_lens = None
         else:
             extend_seq_lens = self.extend_lens
             extend_prefix_lens = self.prefix_lens
             extend_logprob_start_lens = self.extend_logprob_start_lens
-            image_inputs = [r.image_inputs for r in self.reqs]

-        lora_paths = [req.lora_path for req in self.reqs]
         if self.has_regex:
             self.sampling_info.regex_fsms = [req.regex_fsm for req in self.reqs]
             self.sampling_info.regex_fsm_states = [
@@ -854,6 +1000,8 @@
         global bid
         bid += 1

+        mrope_positions_delta = [req.mrope_position_delta for req in self.reqs]
+
         return ModelWorkerBatch(
             bid=bid,
             forward_mode=self.forward_mode,
@@ -861,19 +1009,29 @@
             req_pool_indices=self.req_pool_indices,
             seq_lens=self.seq_lens,
             out_cache_loc=self.out_cache_loc,
+            seq_lens_sum=self.seq_lens_sum,
+            req_to_token_pool_records=self.req_to_token_pool.get_write_records(),
             return_logprob=self.return_logprob,
             top_logprobs_nums=self.top_logprobs_nums,
+            extend_num_tokens=self.extend_num_tokens,
             extend_seq_lens=extend_seq_lens,
             extend_prefix_lens=extend_prefix_lens,
             extend_logprob_start_lens=extend_logprob_start_lens,
-            image_inputs=image_inputs,
-
+            image_inputs=[r.image_inputs for r in self.reqs],
+            encoder_cached=self.encoder_cached,
+            encoder_lens=self.encoder_lens,
+            encoder_lens_cpu=self.encoder_lens_cpu,
+            encoder_out_cache_loc=self.encoder_out_cache_loc,
+            lora_paths=[req.lora_path for req in self.reqs],
             sampling_info=self.sampling_info,
+            mrope_positions_delta=mrope_positions_delta,
         )

     def copy(self):
+        # Only contain fields that will be used by process_batch_result
         return ScheduleBatch(
             reqs=self.reqs,
+            model_config=self.model_config,
             forward_mode=self.forward_mode,
             out_cache_loc=self.out_cache_loc,
             return_logprob=self.return_logprob,
|
|
887
1045
|
)
|
888
1046
|
|
889
1047
|
|
890
|
-
@dataclass
|
1048
|
+
@dataclasses.dataclass
|
891
1049
|
class ModelWorkerBatch:
|
892
1050
|
# The batch id
|
893
1051
|
bid: int
|
@@ -902,11 +1060,18 @@
     # The indices of output tokens in the token_to_kv_pool
     out_cache_loc: torch.Tensor

+    # The sum of all sequence lengths
+    seq_lens_sum: int
+
+    # The memory pool operation records
+    req_to_token_pool_records: Optional[List[Tuple[Tuple, torch.Tensor]]]
+
     # For logprob
     return_logprob: bool
     top_logprobs_nums: Optional[List[int]]

     # For extend
+    extend_num_tokens: Optional[int]
     extend_seq_lens: Optional[List[int]]
     extend_prefix_lens: Optional[List[int]]
     extend_logprob_start_lens: Optional[List[int]]
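`req_to_token_pool_records` presumably carries the writes applied to the request-to-token mapping as (index, value) pairs so the overlapped TP worker can replay them on its own copy; `get_write_records()` in `get_model_worker_batch` is the producer and `ModelWorkerBatch.to()` below moves the recorded tensors between devices. A simplified, self-contained sketch of such a record-and-replay pool (class and method names here are illustrative, not the actual memory-pool API):

import torch

class RecordingPool:
    """Toy pool that records every write so it can be replayed elsewhere."""

    def __init__(self, size, width):
        self.req_to_token = torch.zeros((size, width), dtype=torch.int32)
        self.records = []

    def write(self, indices, values):
        self.req_to_token[indices] = values
        self.records.append((indices, values))

    def get_write_records(self):
        records, self.records = self.records, []
        return records

pool = RecordingPool(size=4, width=8)
pool.write((0, slice(0, 3)), torch.tensor([10, 11, 12], dtype=torch.int32))

# Replay the same writes into an independent copy of the mapping.
replica = torch.zeros((4, 8), dtype=torch.int32)
for indices, values in pool.get_write_records():
    replica[indices] = values
assert torch.equal(replica, pool.req_to_token)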
@@ -914,26 +1079,31 @@
     # For multimodal
     image_inputs: Optional[List[ImageInputs]]

+    # For encoder-decoder
+    encoder_cached: Optional[List[bool]]
+    encoder_lens: Optional[torch.Tensor]
+    encoder_lens_cpu: Optional[List[int]]
+    encoder_out_cache_loc: Optional[torch.Tensor]
+
     # For LoRA
     lora_paths: Optional[List[str]]

     # Sampling info
     sampling_info: SamplingBatchInfo

+    # For Qwen2-VL
+    mrope_positions_delta: List[List[int]]
+
     def copy(self):
-        return
-
-
-
-
-
-
-
-
-
-
-
-            image_inputs=self.image_inputs,
-            lora_paths=self.lora_paths,
-            sampling_info=self.sampling_info.copy(),
-        )
+        return dataclasses.replace(self, sampling_info=self.sampling_info.copy())
+
+    def to(self, device: str):
+        self.input_ids = self.input_ids.to(device, non_blocking=True)
+        self.req_pool_indices = self.req_pool_indices.to(device, non_blocking=True)
+        self.seq_lens = self.seq_lens.to(device, non_blocking=True)
+        self.out_cache_loc = self.out_cache_loc.to(device, non_blocking=True)
+        self.req_to_token_pool_records = [
+            (x, y.to(device, non_blocking=True))
+            for x, y in self.req_to_token_pool_records
+        ]
+        self.sampling_info.to(device)