sglang 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +55 -2
- sglang/api.py +3 -5
- sglang/backend/anthropic.py +33 -13
- sglang/backend/openai.py +2 -1
- sglang/backend/runtime_endpoint.py +18 -5
- sglang/backend/vertexai.py +1 -0
- sglang/global_config.py +1 -0
- sglang/lang/chat_template.py +74 -0
- sglang/lang/interpreter.py +40 -16
- sglang/lang/ir.py +1 -1
- sglang/lang/tracer.py +6 -4
- sglang/launch_server.py +2 -1
- sglang/srt/constrained/fsm_cache.py +15 -3
- sglang/srt/constrained/jump_forward.py +1 -0
- sglang/srt/conversation.py +2 -2
- sglang/srt/hf_transformers_utils.py +2 -1
- sglang/srt/layers/context_flashattention_nopad.py +1 -0
- sglang/srt/layers/extend_attention.py +1 -0
- sglang/srt/layers/logits_processor.py +114 -54
- sglang/srt/layers/radix_attention.py +2 -1
- sglang/srt/layers/token_attention.py +1 -0
- sglang/srt/managers/detokenizer_manager.py +5 -1
- sglang/srt/managers/io_struct.py +12 -0
- sglang/srt/managers/router/infer_batch.py +70 -33
- sglang/srt/managers/router/manager.py +7 -2
- sglang/srt/managers/router/model_rpc.py +116 -73
- sglang/srt/managers/router/model_runner.py +121 -155
- sglang/srt/managers/router/radix_cache.py +46 -38
- sglang/srt/managers/tokenizer_manager.py +56 -11
- sglang/srt/memory_pool.py +5 -14
- sglang/srt/model_config.py +7 -0
- sglang/srt/models/commandr.py +376 -0
- sglang/srt/models/dbrx.py +413 -0
- sglang/srt/models/dbrx_config.py +281 -0
- sglang/srt/models/gemma.py +22 -20
- sglang/srt/models/llama2.py +23 -21
- sglang/srt/models/llava.py +12 -10
- sglang/srt/models/mixtral.py +27 -25
- sglang/srt/models/qwen.py +23 -21
- sglang/srt/models/qwen2.py +23 -21
- sglang/srt/models/stablelm.py +292 -0
- sglang/srt/models/yivl.py +6 -5
- sglang/srt/openai_api_adapter.py +356 -0
- sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +36 -20
- sglang/srt/sampling_params.py +2 -0
- sglang/srt/server.py +68 -439
- sglang/srt/server_args.py +76 -49
- sglang/srt/utils.py +88 -32
- sglang/srt/weight_utils.py +402 -0
- sglang/test/test_programs.py +8 -7
- sglang/test/test_utils.py +196 -8
- {sglang-0.1.13.dist-info → sglang-0.1.15.dist-info}/METADATA +13 -15
- sglang-0.1.15.dist-info/RECORD +69 -0
- {sglang-0.1.13.dist-info → sglang-0.1.15.dist-info}/WHEEL +1 -1
- sglang-0.1.13.dist-info/RECORD +0 -63
- {sglang-0.1.13.dist-info → sglang-0.1.15.dist-info}/LICENSE +0 -0
- {sglang-0.1.13.dist-info → sglang-0.1.15.dist-info}/top_level.txt +0 -0

sglang/srt/layers/logits_processor.py
CHANGED
@@ -1,11 +1,12 @@
 import torch
-from sglang.srt.managers.router.model_runner import ForwardMode, InputMetadata
 from torch import nn
-from vllm.
+from vllm.distributed import (
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_gather,
 )
 
+from sglang.srt.managers.router.model_runner import ForwardMode, InputMetadata
+
 
 class LogitsProcessor(nn.Module):
     def __init__(self, config):
@@ -13,76 +14,136 @@ class LogitsProcessor(nn.Module):
         self.config = config
         self.tp_size = get_tensor_model_parallel_world_size()
 
-    def 
-
+    def _get_normalized_prompt_logprobs(
+        self, prefill_token_logprobs, input_metadata: InputMetadata
+    ):
+        logprobs_cumsum = torch.cumsum(
+            prefill_token_logprobs, dim=0, dtype=torch.float32
+        )
 
-
-
+        start = input_metadata.extend_start_loc.clone()
+        end = start + input_metadata.extend_seq_lens - 2
+        start.clamp_(min=0, max=prefill_token_logprobs.shape[0] - 1)
+        end.clamp_(min=0, max=prefill_token_logprobs.shape[0] - 1)
+        sum_logp = (
+            logprobs_cumsum[end]
+            - logprobs_cumsum[start]
+            + prefill_token_logprobs[start]
+        )
+        normalized_prompt_logprobs = sum_logp / (
+            (input_metadata.extend_seq_lens - 1).clamp(min=1)
+        )
+
+        return normalized_prompt_logprobs
+
+    def _get_top_logprobs(self, all_logprobs, input_metadata: InputMetadata):
+        if input_metadata.forward_mode == ForwardMode.DECODE:
+            decode_top_logprobs = []
+            for i in range(all_logprobs.shape[0]):
+                k = input_metadata.top_logprobs_nums[i]
+                t = all_logprobs[i].topk(k)
+                v_cpu = t.values.tolist()
+                p_cpu = t.indices.tolist()
+                decode_top_logprobs.append(list(zip(v_cpu, p_cpu)))
+            return None, decode_top_logprobs
+        else:
+            prefill_top_logprobs, decode_top_logprobs = [], []
+            pt = 0
+            # NOTE: the GPU-CPU overhead can be reduced
+            extend_seq_lens_cpu = input_metadata.extend_seq_lens.cpu().numpy()
+            for i in range(len(extend_seq_lens_cpu)):
+                if extend_seq_lens_cpu[i] == 0:
+                    prefill_top_logprobs.append([])
+                    decode_top_logprobs.append([])
+                    continue
+                k = input_metadata.top_logprobs_nums[i]
+                t = all_logprobs[pt : pt + extend_seq_lens_cpu[i]].topk(k)
+                vs_cpu = t.values.tolist()
+                ps_cpu = t.indices.tolist()
+                prefill_top_logprobs.append(
+                    [list(zip(vs_cpu[j], ps_cpu[j])) for j in range(len(vs_cpu) - 1)]
+                )
+                decode_top_logprobs.append(list(zip(vs_cpu[-1], ps_cpu[-1])))
+                pt += extend_seq_lens_cpu[i]
+            return prefill_top_logprobs, decode_top_logprobs
+
+    def forward(self, input_ids, hidden_states, weight, input_metadata: InputMetadata):
+        # Get last index for next token prediction, except for DECODE mode.
+        last_index = None
         if input_metadata.forward_mode != ForwardMode.DECODE:
             last_index = (
-                torch.cumsum(
-                    input_metadata.seq_lens - input_metadata.prefix_lens,
-                    dim=0,
-                    dtype=torch.long,
-                )
+                torch.cumsum(input_metadata.extend_seq_lens, dim=0, dtype=torch.long)
                 - 1
             )
 
+        # Get the last hidden states and last logits
+        if input_metadata.forward_mode == ForwardMode.DECODE:
+            last_hidden = hidden_states
+        else:
+            last_hidden = hidden_states[last_index]
+
+        last_logits = torch.matmul(last_hidden, weight.T)
+        if self.tp_size > 1:
+            last_logits = tensor_model_parallel_all_gather(last_logits)
+        last_logits = last_logits[:, : self.config.vocab_size]
+
+        # Return only last_logits if logprob is not requested
         if not input_metadata.return_logprob:
-
-
-                last_hidden = hidden_states
-            else:
-                last_hidden = hidden_states[last_index]
-            hidden_states = None
-
-            last_logits = torch.matmul(last_hidden, weight.T)
-            if self.tp_size > 1:
-                last_logits = tensor_model_parallel_all_gather(last_logits)
-            last_logits = last_logits[:, : self.config.vocab_size]
-            return last_logits, (None, None, None)
+            hidden_states = None
+            return last_logits, (None, None, None, None, None)
         else:
             # When logprob is requested, compute the logits for all tokens.
-
-
-
-
-
+            if input_metadata.forward_mode == ForwardMode.DECODE:
+                all_logits = last_logits
+            else:
+                all_logits = torch.matmul(hidden_states, weight.T)
+                if self.tp_size > 1:
+                    all_logits = tensor_model_parallel_all_gather(all_logits)
+                all_logits = all_logits[:, : self.config.vocab_size]
+
+            all_logprobs = all_logits.float()
+            del all_logits
+            all_logprobs[:] = torch.nn.functional.log_softmax(all_logprobs, dim=-1)
+
+            return_top_logprob = any(x > 0 for x in input_metadata.top_logprobs_nums)
+            if return_top_logprob:
+                prefill_top_logprobs, decode_top_logprobs = self._get_top_logprobs(
+                    all_logprobs, input_metadata
+                )
+            else:
+                prefill_top_logprobs = decode_top_logprobs = None
 
             if input_metadata.forward_mode == ForwardMode.DECODE:
-                last_logits = logits
                 last_logprobs = all_logprobs
-
+                return last_logits, (
+                    None,
+                    None,
+                    None,
+                    decode_top_logprobs,
+                    last_logprobs,
+                )
             else:
                 # Compute the logprobs for the last token of each request.
-                last_logits = logits[last_index]
                 last_logprobs = all_logprobs[last_index]
 
                 # Compute the logprobs and normalized logprobs for the prefill tokens.
                 # Note that we pad a zero at the end of each sequence for easy computation.
-
+                prefill_token_logprobs = all_logprobs[
                     torch.arange(all_logprobs.shape[0], device="cuda"),
                     torch.cat([input_ids[1:], torch.tensor([0], device="cuda")]),
                 ]
-                logprobs_cumsum = torch.cumsum(
-                    prefill_logprobs, dim=0, dtype=torch.float32
-                )
 
-
-
-                start.clamp_(min=0, max=prefill_logprobs.shape[0] - 1)
-                end.clamp_(min=0, max=prefill_logprobs.shape[0] - 1)
-                sum_logp = (
-                    logprobs_cumsum[end]
-                    - logprobs_cumsum[start]
-                    + prefill_logprobs[start]
+                normalized_prompt_logprobs = self._get_normalized_prompt_logprobs(
+                    prefill_token_logprobs, input_metadata
                 )
-
-
+                return last_logits, (
+                    prefill_token_logprobs,
+                    normalized_prompt_logprobs,
+                    prefill_top_logprobs,
+                    decode_top_logprobs,
+                    last_logprobs,
                 )
 
-        return last_logits, (prefill_logprobs, normalized_logprobs, last_logprobs)
-
 
 if __name__ == "__main__":
     all_logprobs = torch.tensor(
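The new _get_normalized_prompt_logprobs helper replaces the cumulative-sum code that was previously inlined in forward. A minimal standalone sketch of the same trick, using toy values instead of sglang's InputMetadata, shows how per-request sums of token logprobs are read off a single cumsum without a Python loop:

import torch

token_logprobs = torch.tensor([-0.5, -1.0, -0.2, -0.3, -0.7])  # flattened over 2 requests
extend_seq_lens = torch.tensor([2, 3])   # prompt tokens per request
start = torch.tensor([0, 2])             # toy analogue of extend_start_loc

cumsum = torch.cumsum(token_logprobs, dim=0, dtype=torch.float32)
end = (start + extend_seq_lens - 2).clamp(0, token_logprobs.numel() - 1)
start = start.clamp(0, token_logprobs.numel() - 1)

# per-request sum of the first (len - 1) token logprobs, read from the cumsum
sum_logp = cumsum[end] - cumsum[start] + token_logprobs[start]
normalized = sum_logp / (extend_seq_lens - 1).clamp(min=1)
print(normalized)  # tensor([-0.5000, -0.2500])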
@@ -93,23 +154,22 @@ if __name__ == "__main__":
|
|
93
154
|
)
|
94
155
|
seq_lens = torch.tensor([2, 0, 3, 0], dtype=torch.int32, device="cuda")
|
95
156
|
input_ids = torch.tensor([1, 2, 3, 0, 1], dtype=torch.int32, device="cuda")
|
96
|
-
logprobs = torch.zeros(5, dtype=torch.float32, device="cuda")
|
97
157
|
|
98
|
-
|
158
|
+
token_logprobs = all_logprobs[
|
99
159
|
torch.arange(all_logprobs.shape[0], device="cuda"),
|
100
160
|
torch.cat([input_ids[1:], torch.tensor([0], device="cuda")]),
|
101
161
|
]
|
102
|
-
logprobs_cumsum = torch.cumsum(
|
162
|
+
logprobs_cumsum = torch.cumsum(token_logprobs, dim=0, dtype=torch.float32)
|
103
163
|
|
104
164
|
len_cumsum = torch.cumsum(seq_lens, dim=0)
|
105
165
|
start = torch.cat((torch.tensor([0], device="cuda"), len_cumsum[:-1]), 0)
|
106
166
|
end = start + seq_lens - 2
|
107
|
-
start.clamp_(min=0, max=
|
108
|
-
end.clamp_(min=0, max=
|
109
|
-
sum_logp = logprobs_cumsum[end] - logprobs_cumsum[start] +
|
167
|
+
start.clamp_(min=0, max=token_logprobs.shape[0] - 1)
|
168
|
+
end.clamp_(min=0, max=token_logprobs.shape[0] - 1)
|
169
|
+
sum_logp = logprobs_cumsum[end] - logprobs_cumsum[start] + token_logprobs[start]
|
110
170
|
|
111
171
|
# assert logprobs == [2, _, 2, 4, _]
|
112
|
-
print("logprobs",
|
172
|
+
print("token logprobs", token_logprobs)
|
113
173
|
print("start", start)
|
114
174
|
print("end", end)
|
115
175
|
print("sum_logp", sum_logp)
|
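The _get_top_logprobs helper added above is essentially torch.topk plus zip, applied per request. A small sketch with toy tensors (the k values are assumed to be at least 1 here, and the vocabulary size is made up):

import torch

logprobs = torch.log_softmax(torch.randn(4, 100), dim=-1)  # 4 decode rows, toy vocab of 100
top_logprobs_nums = [2, 1, 3, 1]  # k requested per row

decode_top_logprobs = []
for i in range(logprobs.shape[0]):
    t = logprobs[i].topk(top_logprobs_nums[i])
    # pair each top logprob with its token id, mirroring list(zip(v_cpu, p_cpu)) above
    decode_top_logprobs.append(list(zip(t.values.tolist(), t.indices.tolist())))
print(decode_top_logprobs[0])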

sglang/srt/layers/radix_attention.py
CHANGED
@@ -1,9 +1,10 @@
 import torch
+from torch import nn
+
 from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
 from sglang.srt.layers.extend_attention import extend_attention_fwd
 from sglang.srt.layers.token_attention import token_attention_fwd
 from sglang.srt.managers.router.model_runner import ForwardMode, InputMetadata
-from torch import nn
 
 
 class RadixAttention(nn.Module):

sglang/srt/managers/detokenizer_manager.py
CHANGED
@@ -3,6 +3,7 @@ import asyncio
 import uvloop
 import zmq
 import zmq.asyncio
+
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.io_struct import BatchStrOut, BatchTokenIDOut
 from sglang.srt.server_args import PortArgs, ServerArgs
@@ -37,10 +38,13 @@ class DetokenizerManager:
             if isinstance(recv_obj, BatchTokenIDOut):
                 output_tokens = recv_obj.output_tokens
 
-                # TODO(lmzheng): handle skip_special_tokens per request
+                # TODO(lmzheng): handle skip_special_tokens/spaces_between_special_tokens per request
                 output_strs = self.tokenizer.batch_decode(
                     output_tokens,
                     skip_special_tokens=recv_obj.skip_special_tokens[0],
+                    spaces_between_special_tokens=recv_obj.spaces_between_special_tokens[
+                        0
+                    ],
                 )
 
                 # Trim stop str
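For reference, a hedged usage sketch of the flag being threaded through here. spaces_between_special_tokens is simply forwarded to the tokenizer's batch_decode, so it only takes effect for tokenizers that honor that keyword (e.g., Llama-style sentencepiece tokenizers); the model id below is purely illustrative:

from transformers import AutoTokenizer

# Illustrative only: requires access to the named checkpoint.
tok = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
ids = [tok("hello world")["input_ids"]]
print(tok.batch_decode(ids, skip_special_tokens=False, spaces_between_special_tokens=False))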
sglang/srt/managers/io_struct.py
CHANGED
@@ -19,10 +19,13 @@ class GenerateReqInput:
     return_logprob: Optional[Union[List[bool], bool]] = None
     # The start location of the prompt for return_logprob
     logprob_start_len: Optional[Union[List[int], int]] = None
+    # The number of top logprobs to return
+    top_logprobs_num: Optional[Union[List[int], int]] = None
     # Whether to detokenize tokens in logprobs
     return_text_in_logprobs: bool = False
     # Whether to stream output
     stream: bool = False
+    # TODO: make all parameters a Union[List[T], T] to allow for batched requests
 
     def post_init(self):
         is_single = isinstance(self.text, str)
@@ -36,6 +39,8 @@ class GenerateReqInput:
                 self.return_logprob = False
             if self.logprob_start_len is None:
                 self.logprob_start_len = 0
+            if self.top_logprobs_num is None:
+                self.top_logprobs_num = 0
         else:
             num = len(self.text)
 
@@ -64,6 +69,11 @@ class GenerateReqInput:
             elif not isinstance(self.logprob_start_len, list):
                 self.logprob_start_len = [self.logprob_start_len] * num
 
+            if self.top_logprobs_num is None:
+                self.top_logprobs_num = [0] * num
+            elif not isinstance(self.top_logprobs_num, list):
+                self.top_logprobs_num = [self.top_logprobs_num] * num
+
 
 @dataclass
 class TokenizedGenerateReqInput:
@@ -76,6 +86,7 @@ class TokenizedGenerateReqInput:
     sampling_params: SamplingParams
     return_logprob: bool
     logprob_start_len: int
+    top_logprobs_num: int
     stream: bool
 
 
@@ -86,6 +97,7 @@ class BatchTokenIDOut:
     output_and_jump_forward_strs: List[str]
     hit_stop_str: List[Optional[str]]
     skip_special_tokens: List[bool]
+    spaces_between_special_tokens: List[bool]
     meta_info: List[Dict]
     finished: List[bool]
 
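The post_init changes broadcast a scalar top_logprobs_num to one entry per request, the same pattern already used for return_logprob and logprob_start_len. A standalone sketch of that normalization (the helper name is hypothetical, not part of sglang):

from typing import List, Optional, Union

def normalize(value: Optional[Union[List[int], int]], num: int, default: int = 0) -> List[int]:
    # None -> default for every request; scalar -> repeated; list -> kept as-is
    if value is None:
        return [default] * num
    if not isinstance(value, list):
        return [value] * num
    return value

print(normalize(None, 3))        # [0, 0, 0]
print(normalize(5, 3))           # [5, 5, 5]
print(normalize([1, 2, 3], 3))   # [1, 2, 3]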

sglang/srt/managers/router/infer_batch.py
CHANGED
@@ -1,22 +1,23 @@
 from dataclasses import dataclass
-from enum import 
+from enum import IntEnum, auto
 from typing import List
 
 import numpy as np
 import torch
+
 from sglang.srt.managers.router.radix_cache import RadixCache
 from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool
 
 
-class ForwardMode(
+class ForwardMode(IntEnum):
     PREFILL = auto()
     EXTEND = auto()
     DECODE = auto()
 
 
-class FinishReason(
-    LENGTH = auto()
+class FinishReason(IntEnum):
     EOS_TOKEN = auto()
+    LENGTH = auto()
     STOP_STR = auto()
 
 
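The diff does not state why ForwardMode and FinishReason move to IntEnum, but one plausible motivation (an assumption, not documented here) is that IntEnum members behave like plain ints, so they compare and serialize without special handling:

from enum import Enum, IntEnum, auto

class ModeAsEnum(Enum):
    PREFILL = auto()

class ModeAsIntEnum(IntEnum):
    PREFILL = auto()

# IntEnum members compare equal to the underlying int; plain Enum members do not.
print(ModeAsIntEnum.PREFILL == 1)  # True
print(ModeAsEnum.PREFILL == 1)     # False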
@@ -30,6 +31,7 @@ class Req:
         # Since jump forward may retokenize the prompt with partial outputs,
         # we maintain the original prompt length to report the correct usage.
         self.prompt_tokens = len(input_ids)
+
         # The number of decoded tokens for token usage report. Note that
         # this does not include the jump forward tokens.
         self.completion_tokens_wo_jump_forward = 0
@@ -40,11 +42,11 @@ class Req:
         self.image_offset = 0
         self.pad_value = None
 
+        # Sampling parameters
         self.sampling_params = None
-        self.return_logprob = False
-        self.logprob_start_len = 0
         self.stream = False
 
+        # Check finish
         self.tokenizer = None
         self.finished = False
         self.finish_reason = None
@@ -54,11 +56,17 @@ class Req:
         self.prefix_indices = []
         self.last_node = None
 
-
-        self.
-        self.
-
-
+        # Logprobs
+        self.return_logprob = False
+        self.logprob_start_len = 0
+        self.top_logprobs_num = 0
+        self.normalized_prompt_logprob = None
+        self.prefill_token_logprobs = None
+        self.decode_token_logprobs = None
+        self.prefill_top_logprobs = None
+        self.decode_top_logprobs = None
+
+        # Constrained decoding
         self.regex_fsm = None
         self.regex_fsm_state = 0
         self.jump_forward_map = None
@@ -159,7 +167,10 @@ class Batch:
     out_cache_loc: torch.Tensor = None
     out_cache_cont_start: torch.Tensor = None
     out_cache_cont_end: torch.Tensor = None
+
+    # for processing logprobs
     return_logprob: bool = False
+    top_logprobs_nums: List[int] = None
 
     # for multimodal
     pixel_values: List[torch.Tensor] = None
@@ -229,12 +240,11 @@ class Batch:
         extend_num_tokens = seq_lens.sum() - prefix_lens.sum()
         out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)
         if out_cache_loc is None:
-
-
-            out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)
+            self.tree_cache.evict(extend_num_tokens, self.token_to_kv_pool.dec_refs)
+            out_cache_loc = self.token_to_kv_pool.alloc(extend_num_tokens)
 
             if out_cache_loc is None:
-                print("Prefill out of memory. This should 
+                print("Prefill out of memory. This should never happen.")
                 self.tree_cache.pretty_print()
                 exit()
 
@@ -245,10 +255,14 @@ class Batch:
             ] = out_cache_loc[pt : pt + extend_lens[i]]
             pt += extend_lens[i]
 
-        # Handle logit bias
-        logit_bias = 
+        # Handle logit bias but only allocate when needed
+        logit_bias = None
         for i in range(bs):
             if reqs[i].sampling_params.dtype == "int":
+                if logit_bias is None:
+                    logit_bias = torch.zeros(
+                        (bs, vocab_size), dtype=torch.float32, device=device
+                    )
                 logit_bias[i] = int_token_logit_bias
 
         # Set fields
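The logit_bias change above allocates the (batch, vocab) tensor only when at least one request actually needs a bias, instead of always materializing it. A self-contained sketch of that lazy-allocation pattern with toy sizes (names and values here are illustrative, not sglang's):

import torch

def build_logit_bias(needs_bias, vocab_size=128, device="cpu"):
    logit_bias = None
    for i, need in enumerate(needs_bias):
        if need:
            if logit_bias is None:
                # allocate lazily, on first request that needs it
                logit_bias = torch.zeros(
                    (len(needs_bias), vocab_size), dtype=torch.float32, device=device
                )
            logit_bias[i, :10] = -1e4  # e.g., forbid some token ids for this request
    return logit_bias

print(build_logit_bias([False, False]))        # None: nothing allocated
print(build_logit_bias([False, True]).shape)   # torch.Size([2, 128])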
@@ -266,6 +280,7 @@ class Batch:
         self.position_ids_offsets = position_ids_offsets
         self.extend_num_tokens = extend_num_tokens
         self.out_cache_loc = out_cache_loc
+        self.top_logprobs_nums = [r.top_logprobs_num for r in reqs]
 
         self.temperatures = torch.tensor(
             [r.sampling_params.temperature for r in reqs],
@@ -295,8 +310,8 @@ class Batch:
         if self.token_to_kv_pool.available_size() >= bs:
             return True
 
-
-
+        self.tree_cache.evict(bs, self.token_to_kv_pool.dec_refs)
+
         if self.token_to_kv_pool.available_size() >= bs:
             return True
 
@@ -310,8 +325,8 @@ class Batch:
         )
 
         retracted_reqs = []
-
-
+        seq_lens_cpu = self.seq_lens.cpu().numpy()
+        req_pool_indices_cpu = self.req_pool_indices.cpu().numpy()
         while self.token_to_kv_pool.available_size() < len(self.reqs):
             idx = sorted_indices.pop()
             req = self.reqs[idx]
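check_decode_mem and prepare_for_extend now follow an evict-then-retry pattern: if the KV pool cannot serve an allocation, cached prefixes are evicted from the radix tree cache (their slots returned via dec_refs) and the allocation is retried once. A toy, self-contained sketch of the pattern; the classes below are stand-ins, not sglang's pool or cache:

class ToyPool:
    def __init__(self, size):
        self.free_slots = size
    def alloc(self, need):
        if self.free_slots < need:
            return None
        self.free_slots -= need
        return list(range(need))
    def dec_refs(self, slots):
        self.free_slots += len(slots)

class ToyTreeCache:
    def __init__(self, cached_slots):
        self.cached_slots = cached_slots
    def evict(self, need, free_fn):
        evicted, self.cached_slots = self.cached_slots[:need], self.cached_slots[need:]
        free_fn(evicted)

def alloc_with_eviction(pool, cache, need):
    loc = pool.alloc(need)
    if loc is None:
        cache.evict(need, pool.dec_refs)  # give cached slots back to the pool
        loc = pool.alloc(need)
    return loc

pool, cache = ToyPool(2), ToyTreeCache(list(range(100, 108)))
print(alloc_with_eviction(pool, cache, 6))  # succeeds only after eviction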
@@ -327,9 +342,9 @@ class Batch:
             # TODO: apply more fine-grained retraction
 
             token_indices = self.req_to_token_pool.req_to_token[
-
-            ][:
-            self.token_to_kv_pool.
+                req_pool_indices_cpu[idx]
+            ][: seq_lens_cpu[idx]]
+            self.token_to_kv_pool.dec_refs(token_indices)
 
         self.filter_batch(sorted_indices)
 
@@ -352,7 +367,7 @@ class Batch:
             # insert the old request into tree_cache
             token_ids_in_memory = tuple(req.input_ids + req.output_ids)[:-1]
             if req_pool_indices_cpu is None:
-                req_pool_indices_cpu = self.req_pool_indices.
+                req_pool_indices_cpu = self.req_pool_indices.tolist()
             req_pool_idx = req_pool_indices_cpu[i]
             indices = self.req_to_token_pool.req_to_token[
                 req_pool_idx, : len(token_ids_in_memory)
@@ -360,7 +375,7 @@ class Batch:
             prefix_len = self.tree_cache.insert(
                 token_ids_in_memory, indices.clone()
             )
-            self.token_to_kv_pool.
+            self.token_to_kv_pool.dec_refs(indices[:prefix_len])
             self.req_to_token_pool.free(req_pool_idx)
             self.tree_cache.dec_ref_counter(req.last_node)
 
@@ -391,7 +406,7 @@ class Batch:
         self.out_cache_loc = self.token_to_kv_pool.alloc(bs)
 
         if self.out_cache_loc is None:
-            print("Decode out of memory. This should 
+            print("Decode out of memory. This should never happen.")
            self.tree_cache.pretty_print()
             exit()
 
@@ -415,6 +430,7 @@ class Batch:
         self.prefix_lens = None
         self.position_ids_offsets = self.position_ids_offsets[new_indices]
         self.out_cache_loc = self.out_cache_cont_start = self.out_cache_cont_end = None
+        self.top_logprobs_nums = [self.top_logprobs_nums[i] for i in unfinished_indices]
         self.return_logprob = any(req.return_logprob for req in self.reqs)
 
         for item in [
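retract_decode now pulls seq_lens and req_pool_indices to the host once (.cpu().numpy() / .tolist()) before entering the Python loop, instead of indexing GPU tensors element by element inside it. A small illustration with toy tensors:

import torch

seq_lens = torch.tensor([5, 3, 7])          # stand-in for self.seq_lens
req_pool_indices = torch.tensor([2, 0, 1])  # stand-in for self.req_pool_indices

seq_lens_cpu = seq_lens.cpu().numpy()
req_pool_indices_cpu = req_pool_indices.tolist()
for idx in range(len(seq_lens_cpu)):
    # each iteration now works on plain host-side values, not per-element tensor reads
    print(req_pool_indices_cpu[idx], int(seq_lens_cpu[idx]))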
@@ -425,9 +441,12 @@ class Batch:
             "presence_penalties",
             "logit_bias",
         ]:
-
+            self_val = getattr(self, item, None)
+            # logit_bias can be None
+            if self_val is not None:
+                setattr(self, item, self_val[new_indices])
 
-    def merge(self, other):
+    def merge(self, other: "Batch"):
         self.reqs.extend(other.reqs)
 
         self.req_pool_indices = torch.concat(
@@ -439,6 +458,7 @@ class Batch:
             [self.position_ids_offsets, other.position_ids_offsets]
         )
         self.out_cache_loc = self.out_cache_cont_start = self.out_cache_cont_end = None
+        self.top_logprobs_nums.extend(other.top_logprobs_nums)
         self.return_logprob = any(req.return_logprob for req in self.reqs)
 
         for item in [
@@ -447,17 +467,34 @@ class Batch:
             "top_ks",
             "frequency_penalties",
             "presence_penalties",
-            "logit_bias",
         ]:
-
-
+            self_val = getattr(self, item, None)
+            other_val = getattr(other, item, None)
+            setattr(self, item, torch.concat([self_val, other_val]))
+
+        # logit_bias can be None
+        if self.logit_bias is not None or other.logit_bias is not None:
+            vocab_size = (
+                self.logit_bias.shape[1]
+                if self.logit_bias is not None
+                else other.logit_bias.shape[1]
             )
+            if self.logit_bias is None:
+                self.logit_bias = torch.zeros(
+                    (len(self.reqs), vocab_size), dtype=torch.float32, device="cuda"
+                )
+            if other.logit_bias is None:
+                other.logit_bias = torch.zeros(
+                    (len(other.reqs), vocab_size), dtype=torch.float32, device="cuda"
+                )
+            self.logit_bias = torch.concat([self.logit_bias, other.logit_bias])
 
     def sample(self, logits: torch.Tensor):
         # Post process logits
         logits = logits.contiguous()
         logits.div_(self.temperatures)
-
+        if self.logit_bias is not None:
+            logits.add_(self.logit_bias)
 
         has_regex = any(req.regex_fsm is not None for req in self.reqs)
         if has_regex:
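Batch.merge now has to cope with logit_bias being None on either side. A minimal sketch of that merge logic using CPU tensors (the function name is just for illustration):

import torch

def merge_logit_bias(a_bias, a_bs, b_bias, b_bs):
    # mirror the None handling added to Batch.merge: keep None only if both are None
    if a_bias is None and b_bias is None:
        return None
    vocab = a_bias.shape[1] if a_bias is not None else b_bias.shape[1]
    if a_bias is None:
        a_bias = torch.zeros((a_bs, vocab))
    if b_bias is None:
        b_bias = torch.zeros((b_bs, vocab))
    return torch.concat([a_bias, b_bias])

print(merge_logit_bias(None, 2, torch.ones(3, 8), 3).shape)  # torch.Size([5, 8])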

sglang/srt/managers/router/manager.py
CHANGED
@@ -4,6 +4,7 @@ import logging
 import uvloop
 import zmq
 import zmq.asyncio
+
 from sglang.srt.backend_config import GLOBAL_BACKEND_CONFIG
 from sglang.srt.managers.router.model_rpc import ModelRpcClient
 from sglang.srt.server_args import PortArgs, ServerArgs
@@ -41,12 +42,16 @@ class RouterManager:
             self.send_to_detokenizer.send_pyobj(obj)
 
         # async sleep for receiving the subsequent request and avoiding cache miss
+        slept = False
         if len(out_pyobjs) != 0:
             has_finished = any([obj.finished for obj in out_pyobjs])
             if has_finished:
-
+                if self.extend_dependency_time > 0:
+                    slept = True
+                    await asyncio.sleep(self.extend_dependency_time)
 
-
+        if not slept:
+            await asyncio.sleep(0.0006)
 
     async def loop_for_recv_requests(self):
         while True: