sglang 0.3.4.post1__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to its public registry. It is provided for informational purposes only.
Files changed (91)
  1. sglang/api.py +1 -1
  2. sglang/bench_latency.py +3 -3
  3. sglang/bench_server_latency.py +2 -3
  4. sglang/bench_serving.py +92 -0
  5. sglang/global_config.py +9 -3
  6. sglang/lang/chat_template.py +50 -25
  7. sglang/lang/interpreter.py +9 -1
  8. sglang/lang/ir.py +11 -2
  9. sglang/launch_server.py +1 -1
  10. sglang/srt/configs/model_config.py +76 -15
  11. sglang/srt/constrained/__init__.py +18 -0
  12. sglang/srt/constrained/bnf_cache.py +61 -0
  13. sglang/srt/constrained/fsm_cache.py +10 -3
  14. sglang/srt/constrained/grammar.py +190 -0
  15. sglang/srt/hf_transformers_utils.py +20 -5
  16. sglang/srt/layers/attention/flashinfer_backend.py +5 -5
  17. sglang/srt/layers/attention/triton_ops/decode_attention.py +110 -30
  18. sglang/srt/layers/attention/triton_ops/prefill_attention.py +1 -1
  19. sglang/srt/layers/fused_moe/fused_moe.py +4 -3
  20. sglang/srt/layers/fused_moe/layer.py +28 -0
  21. sglang/srt/layers/logits_processor.py +5 -5
  22. sglang/srt/layers/quantization/base_config.py +16 -1
  23. sglang/srt/layers/rotary_embedding.py +15 -48
  24. sglang/srt/layers/sampler.py +51 -39
  25. sglang/srt/layers/vocab_parallel_embedding.py +486 -0
  26. sglang/srt/managers/data_parallel_controller.py +8 -7
  27. sglang/srt/managers/detokenizer_manager.py +11 -9
  28. sglang/srt/managers/image_processor.py +4 -3
  29. sglang/srt/managers/io_struct.py +80 -78
  30. sglang/srt/managers/schedule_batch.py +46 -52
  31. sglang/srt/managers/schedule_policy.py +24 -13
  32. sglang/srt/managers/scheduler.py +145 -82
  33. sglang/srt/managers/tokenizer_manager.py +236 -334
  34. sglang/srt/managers/tp_worker.py +5 -5
  35. sglang/srt/managers/tp_worker_overlap_thread.py +58 -21
  36. sglang/srt/mem_cache/flush_cache.py +1 -1
  37. sglang/srt/mem_cache/memory_pool.py +10 -3
  38. sglang/srt/model_executor/cuda_graph_runner.py +34 -23
  39. sglang/srt/model_executor/forward_batch_info.py +6 -9
  40. sglang/srt/model_executor/model_runner.py +10 -19
  41. sglang/srt/models/baichuan.py +4 -4
  42. sglang/srt/models/chatglm.py +4 -4
  43. sglang/srt/models/commandr.py +1 -1
  44. sglang/srt/models/dbrx.py +5 -5
  45. sglang/srt/models/deepseek.py +4 -4
  46. sglang/srt/models/deepseek_v2.py +4 -4
  47. sglang/srt/models/exaone.py +4 -4
  48. sglang/srt/models/gemma.py +1 -1
  49. sglang/srt/models/gemma2.py +1 -1
  50. sglang/srt/models/gpt2.py +287 -0
  51. sglang/srt/models/gpt_bigcode.py +1 -1
  52. sglang/srt/models/grok.py +4 -4
  53. sglang/srt/models/internlm2.py +4 -4
  54. sglang/srt/models/llama.py +15 -7
  55. sglang/srt/models/llama_embedding.py +2 -10
  56. sglang/srt/models/llama_reward.py +5 -0
  57. sglang/srt/models/minicpm.py +4 -4
  58. sglang/srt/models/minicpm3.py +4 -4
  59. sglang/srt/models/mixtral.py +7 -5
  60. sglang/srt/models/mixtral_quant.py +4 -4
  61. sglang/srt/models/mllama.py +5 -5
  62. sglang/srt/models/olmo.py +4 -4
  63. sglang/srt/models/olmoe.py +4 -4
  64. sglang/srt/models/qwen.py +4 -4
  65. sglang/srt/models/qwen2.py +4 -4
  66. sglang/srt/models/qwen2_moe.py +4 -4
  67. sglang/srt/models/qwen2_vl.py +4 -8
  68. sglang/srt/models/stablelm.py +4 -4
  69. sglang/srt/models/torch_native_llama.py +4 -4
  70. sglang/srt/models/xverse.py +4 -4
  71. sglang/srt/models/xverse_moe.py +4 -4
  72. sglang/srt/openai_api/adapter.py +52 -66
  73. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
  74. sglang/srt/sampling/sampling_batch_info.py +7 -13
  75. sglang/srt/sampling/sampling_params.py +5 -7
  76. sglang/srt/server.py +41 -33
  77. sglang/srt/server_args.py +34 -5
  78. sglang/srt/utils.py +40 -56
  79. sglang/test/run_eval.py +2 -0
  80. sglang/test/runners.py +2 -1
  81. sglang/test/srt/sampling/penaltylib/utils.py +1 -0
  82. sglang/test/test_utils.py +151 -6
  83. sglang/utils.py +62 -1
  84. sglang/version.py +1 -1
  85. sglang-0.3.5.dist-info/METADATA +344 -0
  86. sglang-0.3.5.dist-info/RECORD +152 -0
  87. {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/WHEEL +1 -1
  88. sglang-0.3.4.post1.dist-info/METADATA +0 -900
  89. sglang-0.3.4.post1.dist-info/RECORD +0 -148
  90. {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/LICENSE +0 -0
  91. {sglang-0.3.4.post1.dist-info → sglang-0.3.5.dist-info}/top_level.txt +0 -0
@@ -56,49 +56,47 @@ class GenerateReqInput:
     # LoRA related
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
 
-    # Whether it is a single request or a batch request
-    is_single: bool = True
-
-    def post_init(self):
+    def normalize_batch_and_arguments(self):
         if (self.text is None and self.input_ids is None) or (
             self.text is not None and self.input_ids is not None
         ):
             raise ValueError("Either text or input_ids should be provided.")
 
-        self.is_single = False
+        # Derive the batch size
         if self.text is not None:
             if isinstance(self.text, str):
                 self.is_single = True
                 self.batch_size = 1
             else:
+                self.is_single = False
                 self.batch_size = len(self.text)
         else:
             if isinstance(self.input_ids[0], int):
                 self.is_single = True
                 self.batch_size = 1
             else:
+                self.is_single = False
                 self.batch_size = len(self.input_ids)
 
+        # Handle parallel sampling
+        # When parallel sampling is used, we always treat the input as a batch.
         if self.sampling_params is None:
            self.parallel_sample_num = 1
         elif isinstance(self.sampling_params, dict):
             self.parallel_sample_num = self.sampling_params.get("n", 1)
         else:  # isinstance(self.sampling_params, list):
             self.parallel_sample_num = self.sampling_params[0].get("n", 1)
-            for sp in self.sampling_params:
-                # TODO cope with the case that the parallel_sample_num is different for different samples
-                assert self.parallel_sample_num == sp.get(
-                    "n", 1
-                ), "The parallel_sample_num should be the same for all samples in sample params."
-
-        if self.parallel_sample_num > 1:
-            if self.is_single:
-                self.is_single = False
-                if self.text is not None:
-                    self.text = [self.text]
-                if self.input_ids is not None:
-                    self.input_ids = [self.input_ids]
+            assert all(self.parallel_sample_num == sampling_params.get("n", 1) for sampling_params in self.sampling_params), (
+                "The parallel_sample_num should be the same for all samples in sample params.")
 
+        if self.parallel_sample_num > 1 and self.is_single:
+            self.is_single = False
+            if self.text is not None:
+                self.text = [self.text]
+            if self.input_ids is not None:
+                self.input_ids = [self.input_ids]
+
+        # Fill in default arguments
         if self.is_single:
             if self.sampling_params is None:
                 self.sampling_params = {}
@@ -114,9 +112,8 @@ class GenerateReqInput:
             if self.parallel_sample_num == 1:
                 num = self.batch_size
             else:
-                # FIXME support cascade inference
-                # first bs samples are used for caching the prefix for parallel sampling
-                num = self.batch_size + self.parallel_sample_num * self.batch_size
+                # Expand parallel_sample_num
+                num = self.batch_size * self.parallel_sample_num
 
             if self.image_data is None:
                 self.image_data = [None] * num
@@ -129,14 +126,11 @@ class GenerateReqInput:
                 self.sampling_params = [{}] * num
             elif not isinstance(self.sampling_params, list):
                 self.sampling_params = [self.sampling_params] * num
-            else:
-                assert self.parallel_sample_num == 1
 
             if self.rid is None:
                 self.rid = [uuid.uuid4().hex for _ in range(num)]
             else:
                 assert isinstance(self.rid, list), "The rid should be a list."
-                assert self.parallel_sample_num == 1
 
             if self.return_logprob is None:
                 self.return_logprob = [False] * num
@@ -159,6 +153,26 @@ class GenerateReqInput:
             else:
                 assert self.parallel_sample_num == 1
 
+    def regenerate_rid(self):
+        self.rid = uuid.uuid4().hex
+        return self.rid
+
+    def __getitem__(self, i):
+        return GenerateReqInput(
+            text=self.text[i] if self.text is not None else None,
+            input_ids=self.input_ids[i] if self.input_ids is not None else None,
+            image_data=self.image_data[i],
+            sampling_params=self.sampling_params[i],
+            rid=self.rid[i],
+            return_logprob=self.return_logprob[i],
+            logprob_start_len=self.logprob_start_len[i],
+            top_logprobs_num=self.top_logprobs_num[i],
+            return_text_in_logprobs=self.return_text_in_logprobs,
+            stream=self.stream,
+            modalities=self.modalities[i] if self.modalities else None,
+            lora_path=self.lora_path[i] if self.lora_path is not None else None,
+        )
+
 
 @dataclass
 class TokenizedGenerateReqInput:
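The hunks above replace `post_init` with `normalize_batch_and_arguments`, derive `is_single`/`batch_size` instead of storing a default field, and change how parallel sampling expands a request (previously `batch_size + batch_size * n` slots to cache the prefix for cascade inference, now simply `batch_size * n`). Below is a minimal standalone sketch of that expansion rule, distilled from the diff; the helper names are illustrative and are not part of sglang.

```python
# Distilled sketch of the 0.3.5 batching rule in GenerateReqInput.normalize_batch_and_arguments.
# Standalone illustration only, not the sglang implementation.

def expanded_request_count(batch_size: int, parallel_sample_num: int) -> int:
    # 0.3.4.post1: batch_size + batch_size * parallel_sample_num
    #              (extra per-batch copies were reserved for prefix caching / cascade inference)
    # 0.3.5:       batch_size * parallel_sample_num
    if parallel_sample_num == 1:
        return batch_size
    return batch_size * parallel_sample_num


def normalize(text, sampling_params: dict):
    # A single prompt with n > 1 is always promoted to a batch.
    is_single = isinstance(text, str)
    batch_size = 1 if is_single else len(text)
    n = sampling_params.get("n", 1)
    if n > 1 and is_single:
        is_single, text = False, [text]
    num = expanded_request_count(batch_size, n)
    # Per-request defaults (rid, image_data, logprob flags, ...) are then filled to length `num`.
    return text, is_single, num


if __name__ == "__main__":
    print(normalize("Hello", {"n": 2}))     # (['Hello'], False, 2)
    print(normalize(["a", "b"], {"n": 3}))  # (['a', 'b'], False, 6)
```

The new `__getitem__` then lets the tokenizer manager slice one of those `num` expanded slots back out as a standalone single request.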
@@ -196,85 +210,61 @@ class EmbeddingReqInput:
     # Dummy sampling params for compatibility
     sampling_params: Union[List[Dict], Dict] = None
 
-    def post_init(self):
+    def normalize_batch_and_arguments(self):
         if (self.text is None and self.input_ids is None) or (
             self.text is not None and self.input_ids is not None
         ):
             raise ValueError("Either text or input_ids should be provided.")
 
+        # Derive the batch size
         if self.text is not None:
-            self.is_single = isinstance(self.text, str)
+            if isinstance(self.text, str):
+                self.is_single = True
+                self.batch_size = 1
+            else:
+                self.is_single = False
+                self.batch_size = len(self.text)
         else:
-            self.is_single = isinstance(self.input_ids[0], int)
+            if isinstance(self.input_ids[0], int):
+                self.is_single = True
+                self.batch_size = 1
+            else:
+                self.is_single = False
+                self.batch_size = len(self.input_ids)
 
+        # Fill in default arguments
         if self.is_single:
             if self.rid is None:
                 self.rid = uuid.uuid4().hex
             if self.sampling_params is None:
                 self.sampling_params = {}
-            self.sampling_params["max_new_tokens"] = 1
+            self.sampling_params["max_new_tokens"] = 0
         else:
-            # support select operation
-            self.batch_size = (
-                len(self.text) if self.text is not None else len(self.input_ids)
-            )
             if self.rid is None:
                 self.rid = [uuid.uuid4().hex for _ in range(self.batch_size)]
             else:
-                if not isinstance(self.rid, list):
-                    raise ValueError("The rid should be a list.")
+                assert isinstance(self.rid, list), "The rid should be a list."
+
             if self.sampling_params is None:
                 self.sampling_params = [{}] * self.batch_size
             for i in range(self.batch_size):
-                self.sampling_params[i]["max_new_tokens"] = 1
-
-
-@dataclass
-class TokenizedEmbeddingReqInput:
-    # The request id
-    rid: str
-    # The input text
-    input_text: str
-    # The input token ids
-    input_ids: List[int]
-    # Dummy sampling params for compatibility
-    sampling_params: SamplingParams
+                self.sampling_params[i]["max_new_tokens"] = 0
 
+    def regenerate_rid(self):
+        self.rid = uuid.uuid4().hex
+        return self.rid
 
-@dataclass
-class RewardReqInput:
-    # The input prompt in the chat format. It can be a single prompt or a batch of prompts.
-    conv: Union[List[List[Dict]], List[Dict]]
-    # The request id.
-    rid: Optional[Union[List[str], str]] = None
-    # Dummy sampling params for compatibility
-    sampling_params: Union[List[Dict], Dict] = None
-
-    def post_init(self):
-        self.is_single = isinstance(self.conv[0], dict)
-
-        if self.is_single:
-            if self.rid is None:
-                self.rid = uuid.uuid4().hex
-            if self.sampling_params is None:
-                self.sampling_params = {}
-            self.sampling_params["max_new_tokens"] = 1
-        else:
-            # support select operation
-            self.batch_size = len(self.conv)
-            if self.rid is None:
-                self.rid = [uuid.uuid4().hex for _ in range(self.batch_size)]
-            else:
-                if not isinstance(self.rid, list):
-                    raise ValueError("The rid should be a list.")
-            if self.sampling_params is None:
-                self.sampling_params = [{}] * self.batch_size
-            for i in range(self.batch_size):
-                self.sampling_params[i]["max_new_tokens"] = 1
+    def __getitem__(self, i):
+        return EmbeddingReqInput(
+            text=self.text[i] if self.text is not None else None,
+            input_ids=self.input_ids[i] if self.input_ids is not None else None,
+            sampling_params=self.sampling_params[i],
+            rid=self.rid[i],
+        )
 
 
 @dataclass
-class TokenizedRewardReqInput:
+class TokenizedEmbeddingReqInput:
     # The request id
     rid: str
     # The input text
@@ -294,6 +284,8 @@ class BatchTokenIDOut:
     decoded_texts: List[str]
     decode_ids: List[int]
     read_offsets: List[int]
+    # Only used when `--skip-tokenizer-init`
+    output_ids: Optional[List[int]]
     skip_special_tokens: List[bool]
     spaces_between_special_tokens: List[bool]
     meta_info: List[Dict]
@@ -353,3 +345,13 @@ class AbortReq:
 class ProfileReq(Enum):
     START_PROFILE = 1
     STOP_PROFILE = 2
+
+
+@dataclass
+class GetMemPoolSizeReq:
+    pass
+
+
+@dataclass
+class GetMemPoolSizeReqOutput:
+    size: int
@@ -37,8 +37,7 @@ import torch
 
 from sglang.global_config import global_config
 from sglang.srt.configs.model_config import ModelConfig
-from sglang.srt.constrained import RegexGuide
-from sglang.srt.constrained.jump_forward import JumpForwardMap
+from sglang.srt.constrained.grammar import Grammar
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.chunk_cache import ChunkCache
 from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
@@ -212,9 +211,6 @@ class Req:
         # this does not include the jump forward tokens.
         self.completion_tokens_wo_jump_forward = 0
 
-        # The number of cached tokens, that were already cached in the KV store
-        self.cached_tokens = 0
-
         # For vision inputs
         self.image_inputs: Optional[ImageInputs] = None
 
@@ -222,7 +218,10 @@ class Req:
         self.prefix_indices = []
         self.extend_input_len = 0
         self.last_node = None
-        self.is_inflight_req = 0
+        self.is_being_chunked = 0
+
+        # For retraction
+        self.is_retracted = False
 
         # Logprobs (arguments)
         self.return_logprob = False
@@ -243,13 +242,14 @@ class Req:
         # The relative logprob_start_len in an extend batch
         self.extend_logprob_start_len = 0
 
-        # Embedding
+        # Embedding (return values)
         self.embedding = None
 
         # Constrained decoding
-        self.regex_fsm: RegexGuide = None
-        self.regex_fsm_state: int = 0
-        self.jump_forward_map: JumpForwardMap = None
+        self.grammar: Optional[Grammar] = None
+
+        # The number of cached tokens, that were already cached in the KV cache
+        self.cached_tokens = 0
 
         # For Qwen2-VL
         self.mrope_position_delta = []  # use mutable object
@@ -334,15 +334,20 @@ class Req:
 
         last_token_id = self.output_ids[-1]
 
-        matched_eos = last_token_id in self.sampling_params.stop_token_ids
+        matched_eos = False
 
+        # Check stop token ids
+        if self.sampling_params.stop_token_ids:
+            matched_eos = last_token_id in self.sampling_params.stop_token_ids
         if self.tokenizer is not None:
             matched_eos |= last_token_id == self.tokenizer.eos_token_id
-
+            if self.tokenizer.additional_stop_token_ids:
+                matched_eos |= last_token_id in self.tokenizer.additional_stop_token_ids
         if matched_eos and not self.sampling_params.ignore_eos:
             self.finished_reason = FINISH_MATCHED_TOKEN(matched=last_token_id)
             return
 
+        # Check stop strings
         if len(self.sampling_params.stop_strs) > 0:
             tail_str = self.tokenizer.decode(
                 self.output_ids[-(self.sampling_params.stop_str_max_len + 1) :]
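The reworked stop check above guards `stop_token_ids` before membership testing and also consults the tokenizer's `additional_stop_token_ids`. Below is a standalone sketch of the combined check, distilled from the hunk; the `_Params` and `_Tokenizer` classes are hypothetical stand-ins, not sglang types.

```python
# Distilled sketch of the reordered stop-token check in Req.check_finished (0.3.5).
from dataclasses import dataclass, field
from typing import List, Optional, Set


@dataclass
class _Params:  # stand-in for SamplingParams
    stop_token_ids: Set[int] = field(default_factory=set)
    ignore_eos: bool = False


@dataclass
class _Tokenizer:  # stand-in for the HF tokenizer wrapper
    eos_token_id: int = 2
    additional_stop_token_ids: Optional[List[int]] = None


def matched_eos(last_token_id: int, params: _Params, tokenizer: Optional[_Tokenizer]) -> bool:
    matched = False
    # 1) explicit stop_token_ids from the request (now guarded: may be empty)
    if params.stop_token_ids:
        matched = last_token_id in params.stop_token_ids
    # 2) the tokenizer's eos token, plus any additional stop tokens it declares
    if tokenizer is not None:
        matched |= last_token_id == tokenizer.eos_token_id
        if tokenizer.additional_stop_token_ids:
            matched |= last_token_id in tokenizer.additional_stop_token_ids
    return matched and not params.ignore_eos


if __name__ == "__main__":
    tok = _Tokenizer(eos_token_id=2, additional_stop_token_ids=[128009])
    print(matched_eos(128009, _Params(), tok))            # True
    print(matched_eos(2, _Params(ignore_eos=True), tok))  # False
```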
@@ -354,6 +359,8 @@ class Req:
                 return
 
     def jump_forward_and_retokenize(self, jump_forward_str, next_state):
+        assert self.grammar is not None and self.tokenizer is not None
+
         if self.origin_input_text is None:
             # Recovering text can only use unpadded ids
             self.origin_input_text = self.tokenizer.decode(
@@ -393,7 +400,8 @@ class Req:
                 self.surr_offset = self.read_offset - i
                 break
 
-        self.regex_fsm_state = next_state
+        # update the inner state of the grammar
+        self.grammar.jump_and_retokenize(old_output_ids, self.output_ids, next_state)
 
         if self.return_logprob:
             # For fast-forward part's logprobs
@@ -463,8 +471,8 @@ class ScheduleBatch:
     # Stream
     has_stream: bool = False
 
-    # Has regex
-    has_regex: bool = False
+    # Has grammar
+    has_grammar: bool = False
 
     # device
     device: str = "cuda"
@@ -472,7 +480,7 @@ class ScheduleBatch:
     @classmethod
     def init_new(
         cls,
-        reqs,
+        reqs: List[Req],
         req_to_token_pool,
         token_to_kv_pool,
         tree_cache,
@@ -486,7 +494,7 @@ class ScheduleBatch:
             model_config=model_config,
             return_logprob=any(req.return_logprob for req in reqs),
             has_stream=any(req.stream for req in reqs),
-            has_regex=any(req.regex_fsm for req in reqs),
+            has_grammar=any(req.grammar for req in reqs),
             device=req_to_token_pool.device,
         )
 
@@ -514,7 +522,12 @@ class ScheduleBatch:
         out_cache_loc = self.token_to_kv_pool.alloc(num_tokens)
 
         if out_cache_loc is None:
-            logger.error("Prefill out of memory. Try to lower your batch size.")
+            phase_str = "Prefill" if self.forward_mode.is_extend() else "Decode"
+            logger.error(
+                f"{phase_str} out of memory. Try to lower your batch size.\n"
+                f"Try to allocate {num_tokens} tokens.\n"
+                f"Avaliable tokens: {self.token_to_kv_pool.available_size() + self.tree_cache.evictable_size()}\n"
+            )
             if self.tree_cache is not None:
                 self.tree_cache.pretty_print()
             exit(1)
@@ -551,7 +564,7 @@ class ScheduleBatch:
                 seq_lens[i] -= encoder_len
 
                 if len(req.prefix_indices) < encoder_len:
-                    # NOTE: the encoder part should considered as a whole
+                    # NOTE: the encoder part should be considered as a whole
                     assert len(req.prefix_indices) == 0
                     input_ids[i] = input_ids[i][encoder_len:]
                     encoder_out_cache_loc.append(self.out_cache_loc[pt : pt + encoder_len])
@@ -638,6 +651,7 @@ class ScheduleBatch:
 
             req.extend_logprob_start_len = extend_logprob_start_len
             pt += req.extend_input_len
+            req.is_retracted = False
 
         # Set fields
         self.input_ids = torch.tensor(sum(input_ids, []), dtype=torch.int32).to(
@@ -770,6 +784,7 @@ class ScheduleBatch:
             req.prefix_indices = []
             req.last_node = None
             req.extend_input_len = 0
+            req.is_retracted = True
 
             # For incremental logprobs
             req.last_update_decode_tokens = 0
@@ -793,26 +808,10 @@ class ScheduleBatch:
         keep_indices = set(i for i in range(len(self.reqs)))
 
         for i, req in enumerate(self.reqs):
-            if req.jump_forward_map is not None:
-                jump_forward_bytes = req.jump_forward_map.jump_forward_byte(
-                    req.regex_fsm_state
-                )
-                if jump_forward_bytes is not None and len(jump_forward_bytes) > 1:
-                    suffix_bytes = []
-                    continuation_range = range(0x80, 0xC0)
-                    cur_state = req.regex_fsm_state
-                    while (
-                        len(jump_forward_bytes)
-                        and jump_forward_bytes[0][0] in continuation_range
-                    ):
-                        # continuation bytes
-                        byte_edge = jump_forward_bytes.pop(0)
-                        suffix_bytes.append(byte_edge[0])
-                        cur_state = byte_edge[1]
-
-                    suffix_tokens = [f"<0x{hex(b)[2:].upper()}>" for b in suffix_bytes]
-                    suffix_ids = req.tokenizer.convert_tokens_to_ids(suffix_tokens)
-
+            if req.grammar is not None:
+                jump_helper = req.grammar.try_jump(req.tokenizer)
+                if jump_helper.can_jump():
+                    suffix_ids = jump_helper.suffix_ids
                     # Current ids, for cache and revert
                     cur_all_ids = tuple(req.origin_input_ids + req.output_ids)[:-1]
                     cur_output_ids = req.output_ids
@@ -826,10 +825,8 @@ class ScheduleBatch:
                     (
                         jump_forward_str,
                         next_state,
-                    ) = req.jump_forward_map.jump_forward_symbol(cur_state)
+                    ) = req.grammar.jump_forward_str_state(jump_helper)
 
-                    # Make the incrementally decoded text part of jump_forward_str
-                    # so that the UTF-8 will not corrupt
                     jump_forward_str = new_text + jump_forward_str
                     if not req.jump_forward_and_retokenize(
                         jump_forward_str, next_state
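The jump-forward path now goes through the new `Grammar` abstraction instead of manipulating `regex_fsm_state` and `jump_forward_map` directly. Inferring only from the call sites in these hunks, the interface looks roughly like the Protocol sketch below; the actual definitions live in the new `sglang/srt/constrained/grammar.py` and may differ in detail.

```python
# Interface sketch inferred from the call sites above; illustrative, not the sglang source.
from typing import List, Protocol, Tuple


class JumpHelper(Protocol):
    suffix_ids: List[int]

    def can_jump(self) -> bool: ...


class Grammar(Protocol):
    def try_jump(self, tokenizer) -> "JumpHelper":
        """Probe whether the constrained-decoding state allows a deterministic jump."""
        ...

    def jump_forward_str_state(self, helper: "JumpHelper") -> Tuple[str, int]:
        """Return the string to append and the grammar state after the jump."""
        ...

    def jump_and_retokenize(
        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
    ) -> None:
        """Sync the grammar's internal state after the output ids were re-tokenized."""
        ...
```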
@@ -896,7 +893,7 @@ class ScheduleBatch:
 
     def filter_batch(
         self,
-        current_inflight_req: Optional[Req] = None,
+        being_chunked_req: Optional[Req] = None,
         keep_indices: Optional[List[int]] = None,
     ):
         if keep_indices is None:
@@ -904,7 +901,7 @@ class ScheduleBatch:
                 i
                 for i in range(len(self.reqs))
                 if not self.reqs[i].finished()
-                and self.reqs[i] is not current_inflight_req
+                and self.reqs[i] is not being_chunked_req
             ]
 
         if keep_indices is None or len(keep_indices) == 0:
@@ -936,7 +933,7 @@ class ScheduleBatch:
             self.top_logprobs_nums = None
 
         self.has_stream = any(req.stream for req in self.reqs)
-        self.has_regex = any(req.regex_fsm for req in self.reqs)
+        self.has_grammar = any(req.grammar for req in self.reqs)
 
         self.sampling_info.filter_batch(keep_indices, new_indices)
 
@@ -969,7 +966,7 @@ class ScheduleBatch:
 
         self.return_logprob = self.return_logprob or other.return_logprob
         self.has_stream = self.has_stream or other.has_stream
-        self.has_regex = self.has_regex or other.has_regex
+        self.has_grammar = self.has_grammar or other.has_grammar
 
     def get_model_worker_batch(self):
         if self.forward_mode.is_decode():
@@ -979,13 +976,10 @@ class ScheduleBatch:
             extend_prefix_lens = self.prefix_lens
             extend_logprob_start_lens = self.extend_logprob_start_lens
 
-        if self.has_regex:
-            self.sampling_info.regex_fsms = [req.regex_fsm for req in self.reqs]
-            self.sampling_info.regex_fsm_states = [
-                req.regex_fsm_state for req in self.reqs
-            ]
+        if self.has_grammar:
+            self.sampling_info.grammars = [req.grammar for req in self.reqs]
         else:
-            self.sampling_info.regex_fsms = None
+            self.sampling_info.grammars = None
 
         global bid
         bid += 1
@@ -30,7 +30,9 @@ from sglang.srt.mem_cache.radix_cache import TreeNode
 # This can prevent the server from being too conservative.
 # Note that this only clips the estimation in the scheduler but does not change the stop
 # condition. The request can still generate tokens until it hits the unclipped max_new_tokens.
-CLIP_MAX_NEW_TOKENS = int(os.environ.get("SGLANG_CLIP_MAX_NEW_TOKENS", "4096"))
+CLIP_MAX_NEW_TOKENS_ESTIMATION = int(
+    os.environ.get("SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION", "4096")
+)
 
 
 class SchedulePolicy:
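Note that both the module constant and its environment variable were renamed here, so the scheduler in 0.3.5 no longer reads the old `SGLANG_CLIP_MAX_NEW_TOKENS` name. A small illustrative snippet, assuming the variable is set before the scheduler module is imported:

```python
import os

# 0.3.4.post1 (no longer read after this change):
# os.environ["SGLANG_CLIP_MAX_NEW_TOKENS"] = "2048"

# 0.3.5: the renamed variable clips the scheduler's max_new_tokens *estimation* only;
# it does not change the actual stop condition of a request.
os.environ["SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION"] = "2048"
```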
@@ -43,9 +45,15 @@ class SchedulePolicy:
         self.tree_cache = tree_cache
 
     def calc_priority(self, waiting_queue: List[Req]):
+        if len(waiting_queue) > 128 and self.policy == "lpm":
+            # Turn off the expensive prefix matching and sorting when the #queue is large.
+            policy = "fcfs"
+        else:
+            policy = self.policy
+
         # Compute matched prefix length
         prefix_computed = False
-        if self.policy == "lpm" or self.policy == "dfs-weight":
+        if policy == "lpm" or policy == "dfs-weight":
             for r in waiting_queue:
                 # NOTE: the prefix_indices must always be aligned with last_node
                 r.prefix_indices, r.last_node = self.tree_cache.match_prefix(
@@ -54,18 +62,18 @@ class SchedulePolicy:
 
             prefix_computed = True
 
-        if self.policy == "lpm":
+        if policy == "lpm":
             # Longest Prefix Match
             waiting_queue.sort(key=lambda x: -len(x.prefix_indices))
-        elif self.policy == "fcfs":
+        elif policy == "fcfs":
             # first come first serve
             pass
-        elif self.policy == "lof":
+        elif policy == "lof":
             # longest output first
             waiting_queue.sort(key=lambda x: -x.sampling_params.max_new_tokens)
-        elif self.policy == "random":
+        elif policy == "random":
             random.shuffle(waiting_queue)
-        elif self.policy == "dfs-weight":
+        elif policy == "dfs-weight":
             last_node_to_reqs = defaultdict(list)
             for req in waiting_queue:
                 last_node_to_reqs[req.last_node].append(req)
@@ -83,7 +91,7 @@ class SchedulePolicy:
                 waiting_queue,
             )
         else:
-            raise ValueError(f"Unknown schedule_policy: {self.policy}")
+            raise ValueError(f"Unknown schedule_policy: {policy=}")
 
         return prefix_computed
 
@@ -146,7 +154,7 @@ class PrefillAdder:
                 [
                     min(
                         (r.sampling_params.max_new_tokens - len(r.output_ids)),
-                        CLIP_MAX_NEW_TOKENS,
+                        CLIP_MAX_NEW_TOKENS_ESTIMATION,
                     )
                     * self.new_token_ratio
                     for r in running_batch.reqs
@@ -186,7 +194,7 @@ class PrefillAdder:
             len(req.prefix_indices),
             req.extend_input_len,
             (
-                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS)
+                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION)
                 if not truncated
                 else 0
             ),
@@ -258,7 +266,7 @@ class PrefillAdder:
             self._prefill_one_req(
                 0,
                 req.extend_input_len,
-                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS),
+                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION),
             )
         else:
             # Chunked prefill
@@ -276,7 +284,7 @@ class PrefillAdder:
             return self.add_one_req_ignore_eos(req)
 
         total_tokens = req.extend_input_len + min(
-            req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS
+            req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS_ESTIMATION
        )
        input_tokens = req.extend_input_len
        prefix_len = len(req.prefix_indices)
@@ -302,7 +310,10 @@ class PrefillAdder:
             self._prefill_one_req(
                 prefix_len,
                 input_tokens,
-                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS),
+                min(
+                    req.sampling_params.max_new_tokens,
+                    CLIP_MAX_NEW_TOKENS_ESTIMATION,
+                ),
             )
         else:
             # Chunked prefill
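The `calc_priority` hunks earlier in this file add a per-call fallback: when the waiting queue grows beyond 128 requests and the configured policy is `lpm`, the expensive prefix matching and sorting are skipped for that call by falling back to `fcfs` locally, without mutating `self.policy`. Below is a distilled standalone sketch of that behavior; the `Req` stand-in and function names are illustrative, not the sglang classes, and the `dfs-weight` branch is omitted.

```python
# Distilled sketch of the 0.3.5 per-call policy fallback in SchedulePolicy.calc_priority.
import random
from dataclasses import dataclass, field
from typing import List


@dataclass
class Req:  # hypothetical stand-in with only the fields used for ordering
    prefix_indices: List[int] = field(default_factory=list)  # matched prefix tokens
    max_new_tokens: int = 0


def effective_policy(configured: str, queue_len: int) -> str:
    # Prefix matching plus an O(n log n) sort with an expensive key; skip it for big queues.
    if queue_len > 128 and configured == "lpm":
        return "fcfs"
    return configured


def order_queue(configured_policy: str, waiting_queue: List[Req]) -> None:
    policy = effective_policy(configured_policy, len(waiting_queue))
    if policy == "lpm":        # longest prefix match first
        waiting_queue.sort(key=lambda r: -len(r.prefix_indices))
    elif policy == "fcfs":     # keep arrival order
        pass
    elif policy == "lof":      # longest output first
        waiting_queue.sort(key=lambda r: -r.max_new_tokens)
    elif policy == "random":
        random.shuffle(waiting_queue)
    else:
        raise ValueError(f"Unknown schedule_policy: {policy=}")
```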